| import torch |
| from .heads import DetrOvdHead |
| from .vlmbackbone import InternVL3_5_Backbone |
| from torch import nn |
|
|
|
|
class InternVL3_5_OvdModel(nn.Module):
    """Detection model that chains an InternVL3.5 backbone with a ``DetrOvdHead``.

    The backbone is supplied by the caller; the head is built here from
    ``model_config`` and cast to the backbone's dtype.
    """

    def __init__(
        self,
        backbone: InternVL3_5_Backbone,
        model_config: object,
    ) -> None:
        """Store the backbone and create the detection head.

        Args:
            backbone: Pre-built backbone. It must expose a ``dtype`` attribute
                and a ``forward_fused`` method.
            model_config: Configuration object handed unchanged to
                ``DetrOvdHead``.
        """
        super().__init__()
        self.backbone = backbone
        self.ovd_head = DetrOvdHead(model_config)

        # Keep the head's parameters in the same dtype as the backbone so the
        # backbone output can be fed to the head without a cast.
        backbone_dtype = self.backbone.dtype
        self.ovd_head.to(dtype=backbone_dtype)

    def forward(
        self,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        patch_mask: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run the backbone's fused forward pass, then the detection head.

        Args:
            pixel_values: Image tensor.
            input_ids: Tokenized prompt.
            attention_mask: Attention mask for the prompt.
            patch_mask: Optional mask forwarded as-is to
                ``backbone.forward_fused``; its exact semantics are defined
                by the backbone (not visible here).

        Returns:
            The ``(pred_boxes, pred_scores)`` pair produced by the head.
        """
        fused_memory, fused_padding_mask = self.backbone.forward_fused(
            pixel_values,
            input_ids,
            attention_mask,
            patch_mask=patch_mask,
        )
        boxes, scores = self.ovd_head(fused_memory, fused_padding_mask)
        return boxes, scores
|
|
|
|
def build_internvl_ovd(
    model_config: object,
    device: str,
    dtype: torch.dtype,
) -> InternVL3_5_OvdModel:
    """Build an ``InternVL3_5_OvdModel`` from a configuration object.

    Args:
        model_config: Configuration providing ``vlm_model_name``,
            ``use_token_fpn``, ``token_fpn_levels`` and
            ``token_fpn_include_text``; it is also passed on to the model,
            which uses it to build the detection head.
        device: Device identifier handed to the backbone constructor.
        dtype: Dtype handed to the backbone constructor.

    Returns:
        The assembled model wrapping the freshly created backbone.
    """
    vlm_name = model_config.vlm_model_name
    token_fpn_options = {
        "use_token_fpn": model_config.use_token_fpn,
        "token_fpn_levels": model_config.token_fpn_levels,
        "token_fpn_include_text": model_config.token_fpn_include_text,
    }
    vlm_backbone = InternVL3_5_Backbone(
        vlm_name,
        device,
        dtype,
        **token_fpn_options,
    )
    # NOTE(review): only the backbone receives `device` here; the head is not
    # moved by this function -- confirm the caller places the model on the
    # intended device.
    return InternVL3_5_OvdModel(vlm_backbone, model_config)
|
|