xpuenabler's picture
Upload folder using huggingface_hub
c31a5fe verified
import torch
from .heads import DetrOvdHead
from .vlmbackbone import InternVL3_5_Backbone
from torch import nn
class InternVL3_5_OvdModel(nn.Module):
    """Compose an ``InternVL3_5_Backbone`` with a ``DetrOvdHead``.

    The backbone turns the image and prompt inputs into a fused memory tensor
    plus a padding mask; the head consumes both and yields box and score
    predictions.
    """

    def __init__(
        self,
        backbone: InternVL3_5_Backbone,
        model_config: object,
    ) -> None:
        """Store the backbone and build the head from ``model_config``.

        Args:
            backbone: Already-constructed backbone; must expose ``dtype`` and
                ``forward_fused``.
            model_config: Configuration object handed to ``DetrOvdHead``.
        """
        super().__init__()
        self.backbone = backbone

        head = DetrOvdHead(model_config)
        # Cast the head to the backbone's dtype so that inference paths which
        # do not run under autocast still see matching dtypes.
        head.to(dtype=self.backbone.dtype)
        self.ovd_head = head

    def forward(
        self,
        pixel_values: torch.Tensor,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        patch_mask: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run the backbone followed by the head.

        Args:
            pixel_values: Image tensor.
            input_ids: Tokenized prompt.
            attention_mask: Attention mask for the prompt.
            patch_mask: Optional mask passed through to
                ``backbone.forward_fused`` as ``patch_mask``; its meaning is
                defined by the backbone.

        Returns:
            The ``(pred_boxes, pred_scores)`` pair produced by the head.
        """
        fused_output = self.backbone.forward_fused(
            pixel_values,
            input_ids,
            attention_mask,
            patch_mask=patch_mask,
        )
        memory, padding_mask = fused_output

        boxes, scores = self.ovd_head(memory, padding_mask)
        return boxes, scores
def build_internvl_ovd(
    model_config: object,
    device: str,
    dtype: torch.dtype,
) -> InternVL3_5_OvdModel:
    """Construct an ``InternVL3_5_OvdModel`` from a configuration object.

    Args:
        model_config: Configuration providing ``vlm_model_name``,
            ``use_token_fpn``, ``token_fpn_levels`` and
            ``token_fpn_include_text``; it is also passed on to the model
            so the head can be built from it.
        device: Device argument forwarded to the backbone constructor.
        dtype: Dtype argument forwarded to the backbone constructor.

    Returns:
        The assembled model wrapping the newly created backbone.
    """
    vlm_name = model_config.vlm_model_name
    # Token-FPN settings are passed to the backbone by keyword.
    token_fpn_options = {
        "use_token_fpn": model_config.use_token_fpn,
        "token_fpn_levels": model_config.token_fpn_levels,
        "token_fpn_include_text": model_config.token_fpn_include_text,
    }
    vlm_backbone = InternVL3_5_Backbone(
        vlm_name,
        device,
        dtype,
        **token_fpn_options,
    )
    return InternVL3_5_OvdModel(vlm_backbone, model_config)