kobiakor15 committed on
Commit
af9b0dd
·
verified ·
1 Parent(s): 4b92f99

Upload oculus_unified_model/configuration_oculus.py with huggingface_hub

Browse files
oculus_unified_model/configuration_oculus.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oculus Configuration
3
+
4
+ HuggingFace-compatible configuration for the unified Oculus model.
5
+ """
6
+
7
+ from typing import Optional, Dict, Any, List
8
+ from transformers import PretrainedConfig
9
+
10
+
11
class OculusConfig(PretrainedConfig):
    """
    Configuration class for the Oculus vision-language model.

    Every argument is stored on the instance under the same name. Any extra
    keyword arguments are forwarded to ``PretrainedConfig.__init__``.

    Args:
        dinov3_model_id: Model id of the first vision encoder.
        siglip_model_id: Model id of the second vision encoder.
        dinov3_hidden_size: Output dimension of the first vision encoder.
        siglip_hidden_size: Output dimension of the second vision encoder.
        projector_hidden_dim: Hidden dimension of the vision-to-language
            projector.
        num_vision_tokens: Number of vision tokens.
        text_model_id: Model id of the language model.
        lm_hidden_size: Hidden size of the language model.
        vocab_size: Vocabulary size of the language model.
        max_position_embeddings: Maximum number of position embeddings.
        reasoning_enabled: Whether to enable thinking traces.
        thinking_token: Token that opens a thinking trace.
        thinking_end_token: Token that closes a thinking trace.
        max_thinking_tokens: Maximum number of thinking tokens.
        output_mode: Default output mode ("text", "point", "box", "polygon").
            The value is stored as given; it is not validated here.
        num_detection_classes: Number of detection classes.
        num_segmentation_classes: Number of segmentation classes.
        max_new_tokens: Default generation length limit.
        temperature: Default sampling temperature.
        top_p: Default nucleus-sampling threshold.
        enable_focus: Whether the focus (tool calling) system is enabled.
        focus_token: Token that opens a focus span.
        focus_end_token: Token that closes a focus span.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.

    Attributes:
        fused_vision_dim: ``dinov3_hidden_size + siglip_hidden_size``. It is
            derived in ``__init__`` on every construction, so a value passed
            in (e.g. from a serialized config) is overwritten.
    """

    model_type = "oculus"

    def __init__(
        self,
        # Vision encoder settings
        # NOTE(review): the default id below names a DINOv2 checkpoint while the
        # parameter name and the 1280 width are labelled DINOv3 - confirm that
        # the id and the hidden size describe the same encoder.
        dinov3_model_id: str = "facebook/dinov2-large",
        # NOTE(review): the id says SigLIP, the size comment says SigLIP2 - confirm.
        siglip_model_id: str = "google/siglip-base-patch16-224",
        dinov3_hidden_size: int = 1280,  # DINOv3 ViT-H/16+ output dim
        siglip_hidden_size: int = 768,  # SigLIP2 base output dim

        # Projector settings
        projector_hidden_dim: int = 2048,
        num_vision_tokens: int = 64,

        # Language model settings
        # NOTE(review): confirm lm_hidden_size / vocab_size /
        # max_position_embeddings match the checkpoint named by text_model_id.
        text_model_id: str = "Salesforce/blip-image-captioning-base",
        lm_hidden_size: int = 1536,
        vocab_size: int = 131072,
        max_position_embeddings: int = 32768,

        # Reasoning settings
        reasoning_enabled: bool = True,
        thinking_token: str = "<think>",
        thinking_end_token: str = "</think>",
        max_thinking_tokens: int = 256,

        # Output mode settings
        output_mode: str = "text",  # "text", "point", "box", "polygon"
        num_detection_classes: int = 80,
        num_segmentation_classes: int = 150,

        # Generation settings
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.95,

        # Tool calling / Focus system
        enable_focus: bool = True,
        focus_token: str = "<focus>",
        focus_end_token: str = "</focus>",

        **kwargs
    ):
        # The base initializer runs first; the explicit assignments below then
        # take precedence over anything it set from **kwargs.
        super().__init__(**kwargs)

        # Vision
        self.dinov3_model_id = dinov3_model_id
        self.siglip_model_id = siglip_model_id
        self.dinov3_hidden_size = dinov3_hidden_size
        self.siglip_hidden_size = siglip_hidden_size
        # Derived: sum of the two encoder output dimensions.
        self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size

        # Projector
        self.projector_hidden_dim = projector_hidden_dim
        self.num_vision_tokens = num_vision_tokens

        # Language model
        self.text_model_id = text_model_id
        self.lm_hidden_size = lm_hidden_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Reasoning
        self.reasoning_enabled = reasoning_enabled
        self.thinking_token = thinking_token
        self.thinking_end_token = thinking_end_token
        self.max_thinking_tokens = max_thinking_tokens

        # Output modes
        self.output_mode = output_mode
        self.num_detection_classes = num_detection_classes
        self.num_segmentation_classes = num_segmentation_classes

        # Generation
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        # Focus system
        self.enable_focus = enable_focus
        self.focus_token = focus_token
        self.focus_end_token = focus_end_token

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load config from pretrained path.

        Delegates to ``PretrainedConfig.from_pretrained`` instead of
        re-implementing it, so whatever the installed ``transformers`` version
        does while loading is kept rather than bypassed.

        Args:
            pretrained_model_name_or_path: Model id or path to load from.
            **kwargs: Forwarded unchanged to the base implementation.

        Returns:
            Whatever the base implementation returns for this class.
        """
        return super().from_pretrained(pretrained_model_name_or_path, **kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize config to dictionary (the base-class result, unmodified)."""
        return super().to_dict()
116
+
117
+
118
# Register OculusConfig with the auto-class machinery at import time
# (the target auto class is spelled out; it is the documented default).
OculusConfig.register_for_auto_class(auto_class="AutoConfig")