File size: 4,001 Bytes
af9b0dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Oculus Configuration

HuggingFace-compatible configuration for the unified Oculus model.
"""

from typing import Optional, Dict, Any, List
from transformers import PretrainedConfig


class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Stores the hyperparameters for every sub-system of the unified model:
    the dual vision encoders (DINO-family + SigLIP), the vision-to-language
    projector, the language model, optional reasoning ("thinking") traces,
    structured output modes, default generation settings, and the
    focus/tool-calling system.

    Args:
        dinov3_model_id: HF hub id of the DINO-family vision encoder.
        siglip_model_id: HF hub id of the SigLIP vision encoder.
        dinov3_hidden_size: Output embedding dim of the DINO encoder.
        siglip_hidden_size: Output embedding dim of the SigLIP encoder.
        projector_hidden_dim: Hidden width of the vision-to-language projector.
        num_vision_tokens: Number of vision tokens fed to the language model.
        text_model_id: HF hub id of the backbone language model.
        lm_hidden_size: Hidden size of the language model.
        vocab_size: Language-model vocabulary size.
        max_position_embeddings: Maximum sequence length of the language model.
        reasoning_enabled: Whether to emit thinking traces during generation.
        thinking_token: Token that opens a thinking trace.
        thinking_end_token: Token that closes a thinking trace.
        max_thinking_tokens: Cap on the length of a thinking trace.
        output_mode: Default structured output mode
            ("text", "point", "box", "polygon").
        num_detection_classes: Class count for the detection head.
        num_segmentation_classes: Class count for the segmentation head.
        max_new_tokens: Default generation length cap.
        temperature: Default sampling temperature.
        top_p: Default nucleus-sampling threshold.
        enable_focus: Whether the focus (tool-calling) system is active.
        focus_token: Token that opens a focus span.
        focus_end_token: Token that closes a focus span.
        **kwargs: Forwarded to ``PretrainedConfig`` (e.g. ``name_or_path``).
    """

    model_type = "oculus"

    def __init__(
        self,
        # Vision encoder settings
        # NOTE(review): default id is a DINOv2 checkpoint whose hidden size is
        # 1024, yet dinov3_hidden_size defaults to 1280 (DINOv3 ViT-H/16+) —
        # confirm the intended checkpoint/dim pairing before training.
        dinov3_model_id: str = "facebook/dinov2-large",
        siglip_model_id: str = "google/siglip-base-patch16-224",
        dinov3_hidden_size: int = 1280,  # DINOv3 ViT-H/16+ output dim
        siglip_hidden_size: int = 768,   # SigLIP2 base output dim

        # Projector settings
        projector_hidden_dim: int = 2048,
        num_vision_tokens: int = 64,

        # Language model settings
        text_model_id: str = "Salesforce/blip-image-captioning-base",
        lm_hidden_size: int = 1536,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,

        # Reasoning settings
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,

        # Output mode settings
        output_mode: str = "text",  # "text", "point", "box", "polygon"
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,

        # Generation settings
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,

        # Tool calling / Focus system
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",

        **kwargs,
    ):
        super().__init__(**kwargs)

        # Vision
        self.dinov3_model_id = dinov3_model_id
        self.siglip_model_id = siglip_model_id
        self.dinov3_hidden_size = dinov3_hidden_size
        self.siglip_hidden_size = siglip_hidden_size
        # The two encoder streams are concatenated feature-wise before the
        # projector, hence the fused dim is the sum of the individual dims.
        self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size

        # Projector
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens

        # Language model
        self.text_model_id = text_model_id
        self.lm_hidden_size = lm_hidden_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Reasoning
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens

        # Output modes
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes

        # Generation
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        # Focus system
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token

    # The previous `from_pretrained` and `to_dict` overrides were removed:
    # both duplicated the inherited `PretrainedConfig` implementations
    # verbatim, so they added no behavior while risking silent divergence
    # from upstream transformers (e.g. auth-token handling and model_type
    # consistency checks performed by the base `from_pretrained`). The
    # inherited methods remain callable, so the public interface is
    # unchanged.


# Register this config for auto-class loading so that
# `AutoConfig.from_pretrained(...)` can resolve the custom "oculus"
# model_type when this module ships alongside saved checkpoints
# (trust_remote_code workflow).
OculusConfig.register_for_auto_class()