Source code for mmdet.models.dense_heads.deformable_detr_head

import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Linear, bias_init_with_prob, constant_init
from mmcv.runner import force_fp32

from mmdet.core import multi_apply
from mmdet.models.utils.transformer import inverse_sigmoid
from ..builder import HEADS
from .detr_head import DETRHead


@HEADS.register_module()
class DeformableDETRHead(DETRHead):
    """Head of DeformDETR: Deformable DETR: Deformable Transformers for
    End-to-End Object Detection.

    Code is modified from the `official github repo
    <https://github.com/fundamentalvision/Deformable-DETR>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2010.04159>`_ .

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool): Whether to generate proposals from the
            outputs of the encoder. Defaults to False.
        transformer (obj:`ConfigDict`): ConfigDict is used for building
            the Encoder and Decoder.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 **kwargs):
        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        if self.as_two_stage:
            transformer['as_two_stage'] = self.as_two_stage

        super(DeformableDETRHead, self).__init__(
            *args, transformer=transformer, **kwargs)

    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""
        fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, 4))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

        # The last reg_branch is used to generate proposals from the
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        if not self.as_two_stage:
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)
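The branch-sharing logic in `_init_layers` is worth noting: without box refinement, every decoder layer indexes the same branch module, so the parameters stay tied across layers; with refinement, each layer gets an independent deepcopy. A minimal standalone sketch of this behaviour (hypothetical dimensions, not part of this module):

# Standalone sketch of the sharing-vs-cloning behaviour above.
# embed_dims, the class count and num_pred are hypothetical values.
import copy

import torch.nn as nn

embed_dims, num_pred = 256, 6
fc_cls = nn.Linear(embed_dims, 80)

shared = nn.ModuleList([fc_cls for _ in range(num_pred)])
cloned = nn.ModuleList([copy.deepcopy(fc_cls) for _ in range(num_pred)])

assert shared[0] is shared[-1]      # one Linear, updated jointly by all layers
assert cloned[0] is not cloned[-1]  # independent copies, refined separately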
    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m.bias, bias_init)
        for m in self.reg_branches:
            constant_init(m[-1], 0, bias=0)
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
        if self.as_two_stage:
            for m in self.reg_branches:
                nn.init.constant_(m[-1].bias.data[2:], 0.0)
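`bias_init_with_prob(0.01)` computes the classification bias so that the initial sigmoid activation equals the given prior probability, i.e. b = -log((1 - p) / p). A quick check (a sketch of the formula, not the mmcv source):

import math

p = 0.01
bias = -math.log((1 - p) / p)  # what bias_init_with_prob(0.01) returns
print(round(bias, 4))          # -4.5951; sigmoid(-4.5951) ~= 0.01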
    def forward(self, mlvl_feats, img_metas):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 4D-tensor with shape
                (N, C, H, W).
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head, \
                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should include background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
                head with normalized coordinate format (cx, cy, w, h). \
                Shape [nb_dec, bs, num_query, 4].
            enc_outputs_class (Tensor): The score of each point on the \
                encode feature map, has shape (N, h*w, num_class). It is \
                only returned when as_two_stage is True, otherwise \
                `None` is returned.
            enc_outputs_coord (Tensor): The proposals generated from the \
                encode feature map, has shape (N, h*w, 4). It is only \
                returned when as_two_stage is True, otherwise `None` \
                is returned.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        if not self.as_two_stage:
            query_embeds = self.query_embedding.weight
        hs, init_reference, inter_references, \
            enc_outputs_class, enc_outputs_coord = self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)
        if self.as_two_stage:
            return outputs_classes, outputs_coords, \
                enc_outputs_class, enc_outputs_coord.sigmoid()
        else:
            return outputs_classes, outputs_coords, \
                None, None
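The loop over decoder levels implements iterative box refinement: each regression branch predicts an offset in unbounded logit space, which is added to the inverse-sigmoid of the previous reference point and squashed back through sigmoid, so coordinates always stay normalized. A standalone sketch with hypothetical numbers (this local inverse_sigmoid is a simplified stand-in for the mmdet utility imported above):

import torch

def inverse_sigmoid(x, eps=1e-5):
    # Simplified stand-in for mmdet.models.utils.transformer.inverse_sigmoid.
    x = x.clamp(min=eps, max=1 - eps)
    return torch.log(x / (1 - x))

reference = torch.tensor([[0.5, 0.5, 0.2, 0.2]])  # normalized (cx, cy, w, h)
offset = torch.tensor([[0.4, -0.4, 0.1, 0.1]])    # hypothetical reg-branch output
refined = (offset + inverse_sigmoid(reference)).sigmoid()
print(refined)  # coordinates stay inside (0, 1) after the update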
    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_cls_scores,
             enc_bbox_preds,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """Loss function.

        Args:
            all_cls_scores (Tensor): Classification scores of all decoder
                layers, has shape [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression outputs of all
                decoder layers. Each is a 4D-tensor with normalized
                coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of points on the
                encode feature map, has shape (N, h*w, num_classes). Only
                passed when as_two_stage is True, otherwise it is None.
            enc_bbox_preds (Tensor): Regression results of each point on the
                encode feature map, has shape (N, h*w, 4). Only passed when
                as_two_stage is True, otherwise it is None.
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for
                each image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'gt_bboxes_ignore being set to None.'

        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss of proposals generated from the encode feature map.
        if enc_cls_scores is not None:
            binary_labels_list = [
                torch.zeros_like(gt_labels_list[i])
                for i in range(len(img_metas))
            ]
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_bboxes_list, binary_labels_list,
                                 img_metas, gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # losses from the other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict
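Two details above deserve a note. The encoder-proposal loss is class-agnostic, so all ground-truth labels are zeroed before matching; and the auxiliary decoder losses are logged per layer, with the last layer under plain names and earlier layers under a d{i}. prefix. A small sketch with hypothetical labels and stand-in loss values:

import torch

# Class-agnostic labels for the encoder proposal loss (hypothetical gts).
gt_labels_list = [torch.tensor([3, 17, 56])]
binary_labels_list = [torch.zeros_like(l) for l in gt_labels_list]
print(binary_labels_list[0])  # tensor([0, 0, 0])

# Key layout for 3 hypothetical decoder layers: 'loss_cls' holds the last
# layer's loss, 'd0.loss_cls' and 'd1.loss_cls' the auxiliary ones.
losses_cls = ['cls0', 'cls1', 'cls2']  # stand-ins for loss tensors
loss_dict = {'loss_cls': losses_cls[-1]}
for i, loss_cls_i in enumerate(losses_cls[:-1]):
    loss_dict[f'd{i}.loss_cls'] = loss_cls_i
print(loss_dict)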
    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def get_bboxes(self,
                   all_cls_scores,
                   all_bbox_preds,
                   enc_cls_scores,
                   enc_bbox_preds,
                   img_metas,
                   rescale=False):
        """Transform network outputs for a batch into bbox predictions.

        Args:
            all_cls_scores (Tensor): Classification scores of all decoder
                layers, has shape [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression outputs of all
                decoder layers. Each is a 4D-tensor with normalized
                coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of points on the
                encode feature map, has shape (N, h*w, num_classes). Only
                passed when as_two_stage is True, otherwise it is None.
            enc_bbox_preds (Tensor): Regression results of each point on the
                encode feature map, has shape (N, h*w, 4). Only passed when
                as_two_stage is True, otherwise it is None.
            img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If True, return boxes in original
                image space. Default False.

        Returns:
            list[list[Tensor, Tensor]]: Each item in result_list is a \
                2-tuple. The first item is an (n, 5) tensor, where the \
                first 4 columns are bounding box positions \
                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score \
                between 0 and 1. The second item is an (n,) tensor where \
                each item is the predicted class label of the \
                corresponding box.
        """
        cls_scores = all_cls_scores[-1]
        bbox_preds = all_bbox_preds[-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list
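A short sketch of consuming the result_list format described above (hypothetical tensors, not output from a real model):

import torch

det_bboxes = torch.tensor([[10., 20., 110., 220., 0.9],
                           [15., 25., 90., 180., 0.2]])  # (n, 5): boxes + score
det_labels = torch.tensor([7, 7])                        # (n,): class indices
result_list = [(det_bboxes, det_labels)]

for bboxes, labels in result_list:  # one entry per image
    keep = bboxes[:, -1] > 0.3      # simple score threshold
    print(bboxes[keep], labels[keep])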