kobiakor15 commited on
Commit
0ff66ab
·
verified ·
1 Parent(s): af9b0dd

Upload oculus_unified_model/processing_oculus.py with huggingface_hub

Browse files
oculus_unified_model/processing_oculus.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oculus Processor
3
+
4
+ Handles image and text preprocessing for the Oculus model.
5
+ """
6
+
7
+ from typing import Optional, Union, List, Dict, Any
8
+ from PIL import Image
9
+ import numpy as np
10
+
11
+ from transformers import ProcessorMixin, BatchFeature
12
+ from transformers.image_utils import ImageInput
13
+
14
+
15
class OculusProcessor(ProcessorMixin):
    """
    Processor for the Oculus model.

    Combines image processing and text tokenization, and injects the
    Oculus-specific control tokens (output mode, optional reasoning trace).

    Usage:
    ```python
    processor = OculusProcessor.from_pretrained("OceanirAI/oculus-0.2")

    # Process inputs
    inputs = processor(
        images=image,
        text="What is in this image?",
        mode="text",
        return_tensors="pt"
    )
    ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        **kwargs
    ):
        """
        Args:
            image_processor: Optional HF image processor; when None, images
                are passed through as raw PIL images.
            tokenizer: Optional HF tokenizer; when None, text is returned
                un-tokenized.
            **kwargs: Optional overrides for the special tokens
                (``thinking_token``, ``thinking_end_token``, ``focus_token``,
                ``focus_end_token``).
        """
        super().__init__(image_processor, tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Delimiters for the optional reasoning ("thinking") trace and for
        # focus regions; overridable via kwargs.
        self.thinking_token = kwargs.get("thinking_token", "<think>")
        self.thinking_end_token = kwargs.get("thinking_end_token", "</think>")
        self.focus_token = kwargs.get("focus_token", "<focus>")
        self.focus_end_token = kwargs.get("focus_end_token", "</focus>")

        # Token prepended to the prompt to select the output modality.
        self.mode_tokens = {
            "text": "<text>",
            "point": "<point>",
            "box": "<box>",
            "polygon": "<polygon>",
        }

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[str, List[str]] = None,
        mode: str = "text",
        think: bool = False,
        return_tensors: Optional[str] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process images and text for the Oculus model.

        Args:
            images: Input image(s); skipped when None.
            text: Input text prompt(s); skipped when None.
            mode: Output mode ("text", "point", "box", "polygon").
            think: Enable reasoning mode (prepends the thinking token).
            return_tensors: Tensor format ("pt", "np", etc.).

        Returns:
            BatchFeature with the processed inputs, plus the raw "mode" and
            "think" values attached as metadata.
        """
        # Process images; fall back to the raw PIL image(s) when no image
        # processor was configured.
        if images is not None:
            if self.image_processor is not None:
                image_features = self.image_processor(images, return_tensors=return_tensors)
            else:
                if isinstance(images, Image.Image):
                    images = [images]
                image_features = {"pixel_values": images}
        else:
            image_features = {}

        # Process text after injecting the mode/thinking control tokens.
        if text is not None:
            processed_text = self._format_prompt(text, mode, think)

            if self.tokenizer is not None:
                text_features = self.tokenizer(
                    processed_text,
                    return_tensors=return_tensors,
                    padding=True,
                    truncation=True,
                    **kwargs
                )
            else:
                text_features = {"text": processed_text}
        else:
            text_features = {}

        # Convert only the real features to tensors. "mode" (str) and
        # "think" (bool) must be attached AFTER conversion: BatchFeature
        # raises ValueError when tensor_type is set and a value (e.g. a
        # string under "pt") cannot be converted to a framework tensor.
        features = BatchFeature(
            data={
                **image_features,
                **text_features,
            },
            tensor_type=return_tensors
        )
        features["mode"] = mode
        features["think"] = think
        return features

    def _format_prompt(
        self,
        text: Union[str, List[str]],
        mode: str,
        think: bool
    ) -> Union[str, List[str]]:
        """
        Format prompt(s) with special tokens.

        Prepends the mode token (when the mode is known) and, if `think` is
        set, the thinking token, joined by single spaces. Accepts a single
        string or a list of strings and returns the same shape.
        """

        def format_single(t: str) -> str:
            parts = []

            # Add mode token (unknown modes get no prefix).
            if mode in self.mode_tokens:
                parts.append(self.mode_tokens[mode])

            # Add thinking token if enabled
            if think:
                parts.append(self.thinking_token)

            # Add prompt
            parts.append(t)

            return " ".join(parts)

        if isinstance(text, str):
            return format_single(text)
        else:
            return [format_single(t) for t in text]

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> "tuple[str, Optional[str]]":
        """
        Decode token IDs to text and split out the reasoning trace.

        Args:
            token_ids: Token IDs to decode (forwarded to the tokenizer;
                stringified when no tokenizer is configured).
            skip_special_tokens: Forwarded to the tokenizer.

        Returns:
            Tuple ``(text, thinking_trace)`` where ``thinking_trace`` is the
            content between the thinking delimiters, or None when absent.
        """
        if self.tokenizer is not None:
            text = self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
        else:
            text = str(token_ids)

        # Extract the "<think>...</think>" span, if present, and keep only
        # the remainder as the answer text.
        thinking_trace = None
        if self.thinking_token in text and self.thinking_end_token in text:
            start = text.find(self.thinking_token) + len(self.thinking_token)
            end = text.find(self.thinking_end_token)
            thinking_trace = text[start:end].strip()
            text = text[end + len(self.thinking_end_token):].strip()

        return text, thinking_trace

    def batch_decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> "list[tuple[str, Optional[str]]]":
        """
        Decode a batch of token IDs.

        Returns:
            One ``(text, thinking_trace)`` tuple per sequence (see `decode`).
        """
        return [
            self.decode(ids, skip_special_tokens=skip_special_tokens, **kwargs)
            for ids in token_ids
        ]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """
        Load processor from a pretrained checkpoint.

        Falls back to a bare processor (no image processor / tokenizer)
        when the HF components cannot be loaded.
        """
        try:
            from transformers import AutoImageProcessor, AutoTokenizer

            image_processor = AutoImageProcessor.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate. Deliberate best-effort fallback: return a
            # basic processor without HF components.
            return cls(**kwargs)

    def save_pretrained(self, save_directory: str, **kwargs):
        """Save the wrapped image processor and tokenizer to `save_directory`."""
        if self.image_processor is not None:
            self.image_processor.save_pretrained(save_directory)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(save_directory)