"""
Oculus Configuration

HuggingFace-compatible configuration for the unified Oculus model.
"""

from typing import Any, Dict

from transformers import PretrainedConfig


class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Args:
        dinov3_model_id: Hub ID of the DINO vision encoder
        siglip_model_id: Hub ID of the SigLIP vision encoder
        dinov3_hidden_size: Feature dimension of the DINO encoder
        siglip_hidden_size: Feature dimension of the SigLIP encoder
        projector_hidden_dim: Hidden width of the vision-to-language projector
        num_vision_tokens: Number of vision tokens passed to the language model
        text_model_id: Hub ID of the language-model backbone
        reasoning_enabled: Whether to enable thinking traces
        output_mode: Default output mode ("text", "point", "box", "polygon")
        enable_focus: Whether to enable <focus> region tokens
    """

    model_type = "oculus"

    def __init__(
        self,
        # Vision encoders
        dinov3_model_id: str = "facebook/dinov2-large",
        siglip_model_id: str = "google/siglip-base-patch16-224",
        dinov3_hidden_size: int = 1280,
        siglip_hidden_size: int = 768,
        # Vision-to-language projector
        projector_hidden_dim: int = 2048,
        num_vision_tokens: int = 64,
        # Language model
        text_model_id: str = "Salesforce/blip-image-captioning-base",
        lm_hidden_size: int = 1536,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,
        # Reasoning / thinking traces
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,
        # Output modes
        output_mode: str = "text",
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,
        # Generation defaults
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,
        # Focus mechanism
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Vision encoders
        self.dinov3_model_id = dinov3_model_id
        self.siglip_model_id = siglip_model_id
        self.dinov3_hidden_size = dinov3_hidden_size
        self.siglip_hidden_size = siglip_hidden_size
        # Combined feature width of the two encoders (concatenated channel-wise)
        self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size

        # Projector
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens

        # Language model
        self.text_model_id = text_model_id
        self.lm_hidden_size = lm_hidden_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Reasoning
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens

        # Output modes
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes

        # Generation defaults
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        # Focus mechanism
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load config from pretrained path."""
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        return cls.from_dict(config_dict, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize config to dictionary."""
        return super().to_dict()
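
# The dimensions above imply a two-stage projection path:
# fused_vision_dim -> projector_hidden_dim -> lm_hidden_size
# (2048 -> 2048 -> 1536 with the defaults). A minimal sketch of a module
# consuming these fields is shown below; `OculusProjector` and its layer
# layout are illustrative assumptions, not the actual Oculus implementation.
#
#     import torch.nn as nn
#
#     class OculusProjector(nn.Module):
#         def __init__(self, config: OculusConfig):
#             super().__init__()
#             self.proj = nn.Sequential(
#                 nn.Linear(config.fused_vision_dim, config.projector_hidden_dim),
#                 nn.GELU(),
#                 nn.Linear(config.projector_hidden_dim, config.lm_hidden_size),
#             )
#
#         def forward(self, vision_features):
#             # (batch, num_vision_tokens, fused_vision_dim)
#             # -> (batch, num_vision_tokens, lm_hidden_size)
#             return self.proj(vision_features)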

# Register so AutoConfig can resolve this custom config when loading with
# trust_remote_code=True (an auto_map entry is written on save_pretrained).
OculusConfig.register_for_auto_class()
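
if __name__ == "__main__":
    # Minimal usage sketch: build a config with a couple of overrides and
    # round-trip it through save_pretrained / from_pretrained. The override
    # values are illustrative, not recommended settings.
    import tempfile

    config = OculusConfig(output_mode="box", num_vision_tokens=128)
    assert config.fused_vision_dim == config.dinov3_hidden_size + config.siglip_hidden_size

    with tempfile.TemporaryDirectory() as tmp_dir:
        config.save_pretrained(tmp_dir)
        reloaded = OculusConfig.from_pretrained(tmp_dir)

    assert reloaded.output_mode == "box"
    assert reloaded.num_vision_tokens == 128
    print(f"round-trip OK: model_type={reloaded.model_type}")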