kobiakor15 commited on
Commit
ff21a21
·
verified ·
1 Parent(s): ba0eff7

Upload folder using huggingface_hub

Browse files
Files changed (46) hide show
  1. LICENSE +75 -0
  2. README.md +99 -0
  3. config.json +59 -0
  4. language_model/blip-captioning/.gitattributes +34 -0
  5. language_model/blip-captioning/README.md +156 -0
  6. language_model/blip-captioning/config.json +169 -0
  7. language_model/blip-captioning/preprocessor_config.json +17 -0
  8. language_model/blip-captioning/pytorch_model.bin +3 -0
  9. language_model/blip-captioning/special_tokens_map.json +7 -0
  10. language_model/blip-captioning/tf_model.h5 +3 -0
  11. language_model/blip-captioning/tokenizer.json +0 -0
  12. language_model/blip-captioning/tokenizer_config.json +21 -0
  13. language_model/blip-captioning/vocab.txt +0 -0
  14. language_model/blip-vqa-finetuned/config.json +58 -0
  15. language_model/blip-vqa-finetuned/generation_config.json +4 -0
  16. language_model/blip-vqa-finetuned/model.safetensors +3 -0
  17. language_model/blip-vqa-finetuned/processor_config.json +29 -0
  18. language_model/blip-vqa-finetuned/tokenizer.json +0 -0
  19. language_model/blip-vqa-finetuned/tokenizer_config.json +22 -0
  20. oculus_unified_model/README.md +220 -0
  21. oculus_unified_model/__init__.py +35 -0
  22. oculus_unified_model/__pycache__/__init__.cpython-312.pyc +0 -0
  23. oculus_unified_model/__pycache__/configuration_oculus.cpython-312.pyc +0 -0
  24. oculus_unified_model/__pycache__/modeling_oculus.cpython-312.pyc +0 -0
  25. oculus_unified_model/__pycache__/processing_oculus.cpython-312.pyc +0 -0
  26. oculus_unified_model/configuration_oculus.py +119 -0
  27. oculus_unified_model/modeling_oculus.py +842 -0
  28. oculus_unified_model/processing_oculus.py +211 -0
  29. trained_components/heads.pth +3 -0
  30. trained_components/projector.npz +3 -0
  31. vision_encoders/dinov2-large/.gitattributes +35 -0
  32. vision_encoders/dinov2-large/README.md +60 -0
  33. vision_encoders/dinov2-large/config.json +24 -0
  34. vision_encoders/dinov2-large/model.safetensors +3 -0
  35. vision_encoders/dinov2-large/preprocessor_config.json +27 -0
  36. vision_encoders/dinov2-large/pytorch_model.bin +3 -0
  37. vision_encoders/siglip-base/.gitattributes +35 -0
  38. vision_encoders/siglip-base/README.md +110 -0
  39. vision_encoders/siglip-base/config.json +20 -0
  40. vision_encoders/siglip-base/model.safetensors +3 -0
  41. vision_encoders/siglip-base/preprocessor_config.json +23 -0
  42. vision_encoders/siglip-base/pytorch_model.bin +3 -0
  43. vision_encoders/siglip-base/special_tokens_map.json +23 -0
  44. vision_encoders/siglip-base/spiece.model +3 -0
  45. vision_encoders/siglip-base/tokenizer.json +0 -0
  46. vision_encoders/siglip-base/tokenizer_config.json +33 -0
LICENSE ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ OCEANIR RESEARCH LICENSE
2
+ Version 1.0, January 2026
3
+
4
+ Copyright (c) 2026 OceanirAI
5
+
6
+ TERMS AND CONDITIONS
7
+
8
+ 1. DEFINITIONS
9
+
10
+ "Software" refers to the Oculus model weights, code, and associated materials
11
+ distributed under this license.
12
+
13
+ "Research Use" means non-commercial academic research, educational purposes,
14
+ and personal experimentation for learning.
15
+
16
+ "Commercial Use" means any use intended for or directed toward commercial
17
+ advantage or monetary compensation.
18
+
19
+ 2. GRANT OF LICENSE
20
+
21
+ Subject to the terms of this License, OceanirAI grants you a non-exclusive,
22
+ worldwide, royalty-free license to use, copy, and modify the Software for
23
+ Research Use only.
24
+
25
+ 3. PERMITTED USES
26
+
27
+ You MAY:
28
+ - Use the Software for academic research
29
+ - Use the Software for educational purposes
30
+ - Publish research papers using results obtained from the Software
31
+ - Modify the Software for Research Use
32
+ - Share modifications under this same license
33
+ - Use the Software in academic courses and tutorials
34
+
35
+ 4. PROHIBITED USES
36
+
37
+ You MAY NOT:
38
+ - Use the Software for any Commercial Use
39
+ - Sell, license, or sublicense the Software
40
+ - Use the Software to train models for commercial deployment
41
+ - Integrate the Software into commercial products or services
42
+ - Use the Software to provide commercial services
43
+ - Remove or alter any license notices or attributions
44
+
45
+ 5. ATTRIBUTION
46
+
47
+ Any publication, presentation, or distribution of work using this Software
48
+ must include the following citation:
49
+
50
+ "Oculus Vision-Language Model, OceanirAI, 2026"
51
+
52
+ 6. NO WARRANTY
53
+
54
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
55
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
56
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
57
+ OCEANIR AI OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
58
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
59
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
60
+ DEALINGS IN THE SOFTWARE.
61
+
62
+ 7. TERMINATION
63
+
64
+ This License and the rights granted hereunder will terminate automatically
65
+ upon any breach by you of the terms of this License.
66
+
67
+ 8. COMMERCIAL LICENSING
68
+
69
+ For commercial licensing inquiries, please contact: licensing@oceanir.ai
70
+
71
+ 9. GOVERNING LAW
72
+
73
+ This License shall be governed by and construed in accordance with the laws
74
+ of the State of California, United States, without regard to its conflict
75
+ of law provisions.
README.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: oceanir-research-license
4
+ license_link: LICENSE
5
+ language:
6
+ - en
7
+ library_name: oceanir
8
+ pipeline_tag: image-text-to-text
9
+ tags:
10
+ - vision
11
+ - multimodal
12
+ - vision-language
13
+ - vqa
14
+ - reasoning
15
+ - chain-of-thought
16
+ - instruction-following
17
+ - oculus
18
+ - standalone
19
+ ---
20
+
21
+ # Oculus 0.1 (Unified ~8GB)
22
+
23
+ **Complete standalone vision-language model with both instruction-following and chain-of-thought reasoning.**
24
+
25
+ Oculus 0.1 combines the best of both worlds:
26
+ - **Instruct**: Natural instruction following, image captioning, VQA
27
+ - **Reasoning**: Chain-of-thought thinking with `<think>...</think>` tokens
28
+
29
+ This package includes ALL model weights bundled together:
30
+ - DINOv2-Large vision encoder (~2.3GB)
31
+ - SigLIP vision encoder (~1.1GB)
32
+ - BLIP language models (~3GB)
33
+ - Trained projector & heads (~835MB)
34
+ - Unified VQA model (~1.5GB)
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ pip install oceanir
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ from oceanir import Oculus
46
+
47
+ # Load unified model
48
+ model = Oculus.from_pretrained("OceanirAI/Oculus-0.1")
49
+
50
+ # Instruction following
51
+ answer = model.ask("photo.jpg", "Describe what's happening in this image")
52
+
53
+ # Chain-of-thought reasoning
54
+ answer = model.ask(
55
+ "complex_scene.jpg",
56
+ "How many red cars are on the left side?",
57
+ think=True # Enable reasoning
58
+ )
59
+
60
+ # Captioning
61
+ caption = model.caption("image.jpg")
62
+
63
+ # Detection
64
+ results = model.detect("image.jpg")
65
+ ```
66
+
67
+ ## Capabilities
68
+
69
+ | Task | Method | Description |
70
+ |------|--------|-------------|
71
+ | VQA | `model.ask(image, question)` | Answer questions about images |
72
+ | Reasoning | `model.ask(image, question, think=True)` | Chain-of-thought reasoning |
73
+ | Captioning | `model.caption(image)` | Generate image descriptions |
74
+ | Detection | `model.detect(image)` | Object detection (80 COCO classes) |
75
+
76
+ ## Model Structure
77
+
78
+ ```
79
+ Oculus-0.1/
80
+ ├── config.json
81
+ ├── vision_encoders/
82
+ │ ├── dinov2-large/ # DINOv2 ViT-L (~2.3GB)
83
+ │ └── siglip-base/ # SigLIP (~1.1GB)
84
+ ├── language_model/
85
+ │ ├── blip-captioning/ # BLIP captioning
86
+ │ └── blip-vqa-finetuned/ # Unified VQA (~1.5GB)
87
+ ├── trained_components/
88
+ │ ├── projector.npz # Vision projector (~800MB)
89
+ │ └── heads.pth # Detection heads (~35MB)
90
+ └── oculus_unified_model/ # Model code
91
+ ```
92
+
93
+ ## Total Size: ~8GB
94
+
95
+ ## License
96
+
97
+ Oceanir Research License - Non-commercial research only.
98
+
99
+ For commercial licensing: licensing@oceanir.ai
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "oculus",
3
+ "architectures": ["OculusForConditionalGeneration"],
4
+ "variant": "Unified",
5
+ "version": "0.1",
6
+
7
+ "vision_encoders": {
8
+ "dinov2": {
9
+ "path": "vision_encoders/dinov2-large",
10
+ "model_id": "facebook/dinov2-large",
11
+ "hidden_size": 1024,
12
+ "num_layers": 24,
13
+ "num_heads": 16
14
+ },
15
+ "siglip": {
16
+ "path": "vision_encoders/siglip-base",
17
+ "model_id": "google/siglip-base-patch16-224",
18
+ "hidden_size": 768,
19
+ "num_layers": 12,
20
+ "num_heads": 12
21
+ }
22
+ },
23
+
24
+ "language_model": {
25
+ "captioning": {
26
+ "path": "language_model/blip-captioning",
27
+ "model_id": "Salesforce/blip-image-captioning-base"
28
+ },
29
+ "vqa": {
30
+ "path": "language_model/blip-vqa-finetuned",
31
+ "base_model_id": "Salesforce/blip-vqa-base",
32
+ "finetuned": true
33
+ }
34
+ },
35
+
36
+ "trained_components": {
37
+ "projector": "trained_components/projector.npz",
38
+ "heads": "trained_components/heads.pth"
39
+ },
40
+
41
+ "projector_config": {
42
+ "fused_vision_dim": 1792,
43
+ "hidden_dim": 2048,
44
+ "num_tokens": 64,
45
+ "output_dim": 768
46
+ },
47
+
48
+ "task_heads": {
49
+ "detection_classes": 80,
50
+ "segmentation_classes": 150
51
+ },
52
+
53
+ "instruct_enabled": true,
54
+ "reasoning_enabled": true,
55
+ "thinking_token": "<think>",
56
+ "thinking_end_token": "</think>",
57
+ "max_thinking_tokens": 256,
58
+ "standalone": true
59
+ }
language_model/blip-captioning/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
language_model/blip-captioning/README.md ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: image-to-text
3
+ tags:
4
+ - image-captioning
5
+ languages:
6
+ - en
7
+ license: bsd-3-clause
8
+ ---
9
+
10
+ # BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
11
+
12
+ Model card for image captioning pretrained on COCO dataset - base architecture (with ViT base backbone).
13
+
14
+ | ![BLIP.gif](https://cdn-uploads.huggingface.co/production/uploads/1670928184033-62441d1d9fdefb55a0b7d12c.gif) |
15
+ |:--:|
16
+ | <b>Pull figure from the BLIP official repo · Image source: https://github.com/salesforce/BLIP</b> |
17
+
18
+ ## TL;DR
19
+
20
+ Authors from the [paper](https://arxiv.org/abs/2201.12086) write in the abstract:
21
+
22
+ *Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to video-language tasks in a zero-shot manner. Code, models, and datasets are released.*
23
+
24
+ ## Usage
25
+
26
+ You can use this model for conditional and un-conditional image captioning
27
+
28
+ ### Using the Pytorch model
29
+
30
+ #### Running the model on CPU
31
+
32
+ <details>
33
+ <summary> Click to expand </summary>
34
+
35
+ ```python
36
+ import requests
37
+ from PIL import Image
38
+ from transformers import BlipProcessor, BlipForConditionalGeneration
39
+
40
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
41
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
42
+
43
+ img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
44
+ raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
45
+
46
+ # conditional image captioning
47
+ text = "a photography of"
48
+ inputs = processor(raw_image, text, return_tensors="pt")
49
+
50
+ out = model.generate(**inputs)
51
+ print(processor.decode(out[0], skip_special_tokens=True))
52
+ # >>> a photography of a woman and her dog
53
+
54
+ # unconditional image captioning
55
+ inputs = processor(raw_image, return_tensors="pt")
56
+
57
+ out = model.generate(**inputs)
58
+ print(processor.decode(out[0], skip_special_tokens=True))
59
+ # >>> a woman sitting on the beach with her dog
60
+ ```
61
+ </details>
62
+
63
+ #### Running the model on GPU
64
+
65
+ ##### In full precision
66
+
67
+ <details>
68
+ <summary> Click to expand </summary>
69
+
70
+ ```python
71
+ import requests
72
+ from PIL import Image
73
+ from transformers import BlipProcessor, BlipForConditionalGeneration
74
+
75
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
76
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")
77
+
78
+ img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
79
+ raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
80
+
81
+ # conditional image captioning
82
+ text = "a photography of"
83
+ inputs = processor(raw_image, text, return_tensors="pt").to("cuda")
84
+
85
+ out = model.generate(**inputs)
86
+ print(processor.decode(out[0], skip_special_tokens=True))
87
+ # >>> a photography of a woman and her dog
88
+
89
+ # unconditional image captioning
90
+ inputs = processor(raw_image, return_tensors="pt").to("cuda")
91
+
92
+ out = model.generate(**inputs)
93
+ print(processor.decode(out[0], skip_special_tokens=True))
94
+ # >>> a woman sitting on the beach with her dog
95
+ ```
96
+ </details>
97
+
98
+ ##### In half precision (`float16`)
99
+
100
+ <details>
101
+ <summary> Click to expand </summary>
102
+
103
+ ```python
104
+ import torch
105
+ import requests
106
+ from PIL import Image
107
+ from transformers import BlipProcessor, BlipForConditionalGeneration
108
+
109
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
110
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to("cuda")
111
+
112
+ img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
113
+ raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
114
+
115
+ # conditional image captioning
116
+ text = "a photography of"
117
+ inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16)
118
+
119
+ out = model.generate(**inputs)
120
+ print(processor.decode(out[0], skip_special_tokens=True))
121
+ # >>> a photography of a woman and her dog
122
+
123
+ # unconditional image captioning
124
+ inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
125
+
126
+ out = model.generate(**inputs)
127
+ print(processor.decode(out[0], skip_special_tokens=True))
128
+ # >>> a woman sitting on the beach with her dog
129
+ ```
130
+ </details>
131
+
132
+ ## Ethical Considerations
133
+ This release is for research purposes only in support of an academic paper. Our models, datasets, and code are not specifically designed or evaluated for all downstream purposes. We strongly recommend users evaluate and address potential concerns related to accuracy, safety, and fairness before deploying this model. We encourage users to consider the common limitations of AI, comply with applicable laws, and leverage best practices when selecting use cases, particularly for high-risk scenarios where errors or misuse could significantly impact people’s lives, rights, or safety. For further guidance on use cases, refer to our AUP and AI AUP.
134
+
135
+
136
+ ## BibTex and citation info
137
+
138
+ ```
139
+ @misc{https://doi.org/10.48550/arxiv.2201.12086,
140
+ doi = {10.48550/ARXIV.2201.12086},
141
+
142
+ url = {https://arxiv.org/abs/2201.12086},
143
+
144
+ author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
145
+
146
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
147
+
148
+ title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
149
+
150
+ publisher = {arXiv},
151
+
152
+ year = {2022},
153
+
154
+ copyright = {Creative Commons Attribution 4.0 International}
155
+ }
156
+ ```
language_model/blip-captioning/config.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "BlipForConditionalGeneration"
5
+ ],
6
+ "image_text_hidden_size": 256,
7
+ "initializer_factor": 1.0,
8
+ "logit_scale_init_value": 2.6592,
9
+ "model_type": "blip",
10
+ "projection_dim": 512,
11
+ "text_config": {
12
+ "_name_or_path": "",
13
+ "add_cross_attention": false,
14
+ "architectures": null,
15
+ "attention_probs_dropout_prob": 0.0,
16
+ "bad_words_ids": null,
17
+ "begin_suppress_tokens": null,
18
+ "bos_token_id": 30522,
19
+ "chunk_size_feed_forward": 0,
20
+ "cross_attention_hidden_size": null,
21
+ "decoder_start_token_id": null,
22
+ "diversity_penalty": 0.0,
23
+ "do_sample": false,
24
+ "early_stopping": false,
25
+ "encoder_no_repeat_ngram_size": 0,
26
+ "eos_token_id": 2,
27
+ "exponential_decay_length_penalty": null,
28
+ "finetuning_task": null,
29
+ "forced_bos_token_id": null,
30
+ "forced_eos_token_id": null,
31
+ "hidden_act": "gelu",
32
+ "hidden_dropout_prob": 0.0,
33
+ "hidden_size": 768,
34
+ "id2label": {
35
+ "0": "LABEL_0",
36
+ "1": "LABEL_1"
37
+ },
38
+ "initializer_factor": 1.0,
39
+ "initializer_range": 0.02,
40
+ "intermediate_size": 3072,
41
+ "is_decoder": true,
42
+ "is_encoder_decoder": false,
43
+ "label2id": {
44
+ "LABEL_0": 0,
45
+ "LABEL_1": 1
46
+ },
47
+ "layer_norm_eps": 1e-12,
48
+ "length_penalty": 1.0,
49
+ "max_length": 20,
50
+ "max_position_embeddings": 512,
51
+ "min_length": 0,
52
+ "model_type": "blip_text_model",
53
+ "no_repeat_ngram_size": 0,
54
+ "num_attention_heads": 12,
55
+ "num_beam_groups": 1,
56
+ "num_beams": 1,
57
+ "num_hidden_layers": 12,
58
+ "num_return_sequences": 1,
59
+ "output_attentions": false,
60
+ "output_hidden_states": false,
61
+ "output_scores": false,
62
+ "pad_token_id": 0,
63
+ "prefix": null,
64
+ "problem_type": null,
65
+ "projection_dim": 768,
66
+ "pruned_heads": {},
67
+ "remove_invalid_values": false,
68
+ "repetition_penalty": 1.0,
69
+ "return_dict": true,
70
+ "return_dict_in_generate": false,
71
+ "sep_token_id": 102,
72
+ "suppress_tokens": null,
73
+ "task_specific_params": null,
74
+ "temperature": 1.0,
75
+ "tf_legacy_loss": false,
76
+ "tie_encoder_decoder": false,
77
+ "tie_word_embeddings": true,
78
+ "tokenizer_class": null,
79
+ "top_k": 50,
80
+ "top_p": 1.0,
81
+ "torch_dtype": null,
82
+ "torchscript": false,
83
+ "transformers_version": "4.26.0.dev0",
84
+ "typical_p": 1.0,
85
+ "use_bfloat16": false,
86
+ "use_cache": true,
87
+ "vocab_size": 30524
88
+ },
89
+ "torch_dtype": "float32",
90
+ "transformers_version": null,
91
+ "vision_config": {
92
+ "_name_or_path": "",
93
+ "add_cross_attention": false,
94
+ "architectures": null,
95
+ "attention_dropout": 0.0,
96
+ "bad_words_ids": null,
97
+ "begin_suppress_tokens": null,
98
+ "bos_token_id": null,
99
+ "chunk_size_feed_forward": 0,
100
+ "cross_attention_hidden_size": null,
101
+ "decoder_start_token_id": null,
102
+ "diversity_penalty": 0.0,
103
+ "do_sample": false,
104
+ "dropout": 0.0,
105
+ "early_stopping": false,
106
+ "encoder_no_repeat_ngram_size": 0,
107
+ "eos_token_id": null,
108
+ "exponential_decay_length_penalty": null,
109
+ "finetuning_task": null,
110
+ "forced_bos_token_id": null,
111
+ "forced_eos_token_id": null,
112
+ "hidden_act": "gelu",
113
+ "hidden_size": 768,
114
+ "id2label": {
115
+ "0": "LABEL_0",
116
+ "1": "LABEL_1"
117
+ },
118
+ "image_size": 384,
119
+ "initializer_factor": 1.0,
120
+ "initializer_range": 0.02,
121
+ "intermediate_size": 3072,
122
+ "is_decoder": false,
123
+ "is_encoder_decoder": false,
124
+ "label2id": {
125
+ "LABEL_0": 0,
126
+ "LABEL_1": 1
127
+ },
128
+ "layer_norm_eps": 1e-05,
129
+ "length_penalty": 1.0,
130
+ "max_length": 20,
131
+ "min_length": 0,
132
+ "model_type": "blip_vision_model",
133
+ "no_repeat_ngram_size": 0,
134
+ "num_attention_heads": 12,
135
+ "num_beam_groups": 1,
136
+ "num_beams": 1,
137
+ "num_channels": 3,
138
+ "num_hidden_layers": 12,
139
+ "num_return_sequences": 1,
140
+ "output_attentions": false,
141
+ "output_hidden_states": false,
142
+ "output_scores": false,
143
+ "pad_token_id": null,
144
+ "patch_size": 16,
145
+ "prefix": null,
146
+ "problem_type": null,
147
+ "projection_dim": 512,
148
+ "pruned_heads": {},
149
+ "remove_invalid_values": false,
150
+ "repetition_penalty": 1.0,
151
+ "return_dict": true,
152
+ "return_dict_in_generate": false,
153
+ "sep_token_id": null,
154
+ "suppress_tokens": null,
155
+ "task_specific_params": null,
156
+ "temperature": 1.0,
157
+ "tf_legacy_loss": false,
158
+ "tie_encoder_decoder": false,
159
+ "tie_word_embeddings": true,
160
+ "tokenizer_class": null,
161
+ "top_k": 50,
162
+ "top_p": 1.0,
163
+ "torch_dtype": null,
164
+ "torchscript": false,
165
+ "transformers_version": "4.26.0.dev0",
166
+ "typical_p": 1.0,
167
+ "use_bfloat16": false
168
+ }
169
+ }
language_model/blip-captioning/preprocessor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_resize": true,
4
+ "image_mean": [
5
+ 0.48145466,
6
+ 0.4578275,
7
+ 0.40821073
8
+ ],
9
+ "image_processor_type": "BlipImageProcessor",
10
+ "image_std": [
11
+ 0.26862954,
12
+ 0.26130258,
13
+ 0.27577711
14
+ ],
15
+ "processor_class": "BlipProcessor",
16
+ "size": 384
17
+ }
language_model/blip-captioning/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6638651a5526cc2ede56f2b5104d6851b0755816d220e5e046870430180c767
3
+ size 989820849
language_model/blip-captioning/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
language_model/blip-captioning/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0aaa4c0e003f599d8baa53a9dee85af14eef20554cf2f8113a2673e25a59f8c
3
+ size 990275136
language_model/blip-captioning/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
language_model/blip-captioning/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_basic_tokenize": true,
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "name_or_path": "bert-base-uncased",
8
+ "never_split": null,
9
+ "pad_token": "[PAD]",
10
+ "processor_class": "BlipProcessor",
11
+ "sep_token": "[SEP]",
12
+ "special_tokens_map_file": null,
13
+ "strip_accents": null,
14
+ "tokenize_chinese_chars": true,
15
+ "tokenizer_class": "BertTokenizer",
16
+ "unk_token": "[UNK]",
17
+ "model_input_names": [
18
+ "input_ids",
19
+ "attention_mask"
20
+ ]
21
+ }
language_model/blip-captioning/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
language_model/blip-vqa-finetuned/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BlipForQuestionAnswering"
4
+ ],
5
+ "dtype": "float32",
6
+ "image_text_hidden_size": 256,
7
+ "initializer_factor": 1.0,
8
+ "initializer_range": 0.02,
9
+ "label_smoothing": 0.0,
10
+ "logit_scale_init_value": 2.6592,
11
+ "model_type": "blip",
12
+ "projection_dim": 512,
13
+ "text_config": {
14
+ "attention_probs_dropout_prob": 0.0,
15
+ "encoder_hidden_size": 768,
16
+ "hidden_act": "gelu",
17
+ "hidden_dropout_prob": 0.0,
18
+ "hidden_size": 768,
19
+ "initializer_factor": 1.0,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 3072,
22
+ "label_smoothing": 0.0,
23
+ "layer_norm_eps": 1e-12,
24
+ "max_position_embeddings": 512,
25
+ "model_type": "blip_text_model",
26
+ "num_attention_heads": 12,
27
+ "num_hidden_layers": 12,
28
+ "projection_dim": 768,
29
+ "pruned_heads": {},
30
+ "tf_legacy_loss": false,
31
+ "torchscript": false,
32
+ "use_bfloat16": false,
33
+ "use_cache": true,
34
+ "vocab_size": 30524
35
+ },
36
+ "transformers_version": "5.0.0rc1",
37
+ "vision_config": {
38
+ "attention_dropout": 0.0,
39
+ "dropout": 0.0,
40
+ "hidden_act": "gelu",
41
+ "hidden_size": 768,
42
+ "image_size": 384,
43
+ "initializer_factor": 1.0,
44
+ "initializer_range": 0.02,
45
+ "intermediate_size": 3072,
46
+ "layer_norm_eps": 1e-05,
47
+ "model_type": "blip_vision_model",
48
+ "num_attention_heads": 12,
49
+ "num_channels": 3,
50
+ "num_hidden_layers": 12,
51
+ "patch_size": 16,
52
+ "projection_dim": 512,
53
+ "pruned_heads": {},
54
+ "tf_legacy_loss": false,
55
+ "torchscript": false,
56
+ "use_bfloat16": false
57
+ }
58
+ }
language_model/blip-vqa-finetuned/generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "5.0.0rc1"
4
+ }
language_model/blip-vqa-finetuned/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e694b40dc9205491c9aa3b7a49ca93d79d780767a3ed578a0f6d8e8436b7ee56
3
+ size 1538792112
language_model/blip-vqa-finetuned/processor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "do_convert_rgb": true,
4
+ "do_normalize": true,
5
+ "do_pad": true,
6
+ "do_rescale": true,
7
+ "do_resize": true,
8
+ "image_mean": [
9
+ 0.48145466,
10
+ 0.4578275,
11
+ 0.40821073
12
+ ],
13
+ "image_processor_type": "BlipImageProcessor",
14
+ "image_std": [
15
+ 0.26862954,
16
+ 0.26130258,
17
+ 0.27577711
18
+ ],
19
+ "processor_class": "BlipProcessor",
20
+ "resample": 3,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "height": 384,
24
+ "width": 384
25
+ },
26
+ "size_divisor": 32
27
+ },
28
+ "processor_class": "BlipProcessor"
29
+ }
language_model/blip-vqa-finetuned/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
language_model/blip-vqa-finetuned/tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "backend": "tokenizers",
4
+ "cls_token": "[CLS]",
5
+ "do_basic_tokenize": true,
6
+ "do_lower_case": true,
7
+ "is_local": false,
8
+ "mask_token": "[MASK]",
9
+ "model_input_names": [
10
+ "input_ids",
11
+ "attention_mask"
12
+ ],
13
+ "model_max_length": 512,
14
+ "never_split": null,
15
+ "pad_token": "[PAD]",
16
+ "processor_class": "BlipProcessor",
17
+ "sep_token": "[SEP]",
18
+ "strip_accents": null,
19
+ "tokenize_chinese_chars": true,
20
+ "tokenizer_class": "BertTokenizer",
21
+ "unk_token": "[UNK]"
22
+ }
oculus_unified_model/README.md ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ language:
4
+ - en
5
+ pipeline_tag: image-text-to-text
6
+ library_name: transformers
7
+ tags:
8
+ - vision
9
+ - multimodal
10
+ - vision-language
11
+ - reasoning
12
+ - detection
13
+ - segmentation
14
+ - ocr
15
+ - vqa
16
+ - captioning
17
+ base_model:
18
+ - facebook/dinov2-large
19
+ - google/siglip-base-patch16-224
20
+ - Salesforce/blip-image-captioning-base
21
+ ---
22
+
23
+ # Oculus 0.2
24
+
25
+ **A unified vision-language model with multi-modal reasoning capabilities.**
26
+
27
+ Oculus 0.2 is a hybrid-reasoning vision-language model that combines:
28
+ - **DINOv3** for semantic visual understanding
29
+ - **SigLIP2** for vision-language alignment
30
+ - **Trained Projector** for vision-to-language mapping
31
+ - **Optional Reasoning** via thinking traces
32
+
33
+ ## 🚀 What's New in Oculus 0.2
34
+
35
+ | Feature | Description |
36
+ |---------|-------------|
37
+ | **🧠 Reasoning via Thinking Traces** | Short, structured reasoning traces improve multi-step decisions and ambiguous spatial tasks |
38
+ | **🔍 Focus System (Zoom & Crop)** | Automatically focus on smaller regions for fine-grained perception |
39
+ | **📦 Multiple Output Modes** | Text, Point, Box, and Polygon outputs for different tasks |
40
+ | **📝 Improved Captioning** | Better descriptions with context awareness |
41
+ | **❓ Enhanced VQA** | More accurate answers to visual questions |
42
+
43
+ ## Output Modes
44
+
45
+ | Mode | Description | Use Case |
46
+ |------|-------------|----------|
47
+ | **📝 Text** | Natural language output | Captioning, VQA, descriptions |
48
+ | **📍 Point** | (x, y) coordinates + labels | Object counting, localization |
49
+ | **📦 Box** | Bounding boxes + labels | Object detection |
50
+ | **🔷 Polygon** | Segmentation masks | Semantic/instance segmentation |
51
+
52
+ ## Quick Start
53
+
54
+ ```python
55
+ from oculus_unified_model import OculusForConditionalGeneration
56
+ from PIL import Image
57
+
58
+ # Load model
59
+ model = OculusForConditionalGeneration.from_pretrained("OceanirAI/oculus-0.2")
60
+
61
+ # Load image
62
+ image = Image.open("your_image.jpg")
63
+
64
+ # Caption mode
65
+ output = model.generate(image, mode="text", prompt="Describe this image")
66
+ print(output.text)
67
+
68
+ # VQA mode
69
+ output = model.generate(image, mode="text", prompt="What color is the car?")
70
+ print(output.text)
71
+
72
+ # With reasoning traces
73
+ output = model.generate(image, mode="text", prompt="Count the people", think=True)
74
+ print(f"Thinking: {output.thinking_trace}")
75
+ print(f"Answer: {output.text}")
76
+
77
+ # Detection mode (bounding boxes)
78
+ output = model.generate(image, mode="box", prompt="Find all vehicles")
79
+ for box, label, conf in zip(output.boxes, output.labels, output.confidences):
80
+ print(f" {label}: {box} (conf={conf:.2f})")
81
+
82
+ # Point mode (counting)
83
+ output = model.generate(image, mode="point", prompt="Count the birds")
84
+ print(f"Found {len(output.points)} points")
85
+
86
+ # Segmentation mode
87
+ output = model.generate(image, mode="polygon", prompt="Segment the road")
88
+ print(f"Mask shape: {output.mask.shape}")
89
+ ```
90
+
91
+ ## Reasoning Mode
92
+
93
+ Enable thinking traces for complex reasoning tasks:
94
+
95
+ ```python
96
+ output = model.generate(
97
+ image,
98
+ mode="text",
99
+ prompt="How many people are sitting vs standing?",
100
+ think=True # Enable reasoning
101
+ )
102
+
103
+ print(f"💭 Thinking: {output.thinking_trace}")
104
+ print(f"📝 Answer: {output.text}")
105
+ ```
106
+
107
+ ## Focus System
108
+
109
+ The Focus system enables zoom-and-crop for fine-grained perception:
110
+
111
+ ```python
112
+ output = model.generate(
113
+ image,
114
+ mode="text",
115
+ prompt="What does the small text say?",
116
+ focus=True # Enable focus/zoom
117
+ )
118
+ ```
119
+
120
+ ## Architecture
121
+
122
+ ```
123
+ Image → DINOv3 ────┐
124
+ ├→ Fusion → Projector → 64 tokens × 1536D ───┐
125
+ Image → SigLIP2 ──┘ │
126
+
127
+ ┌─────────────────────────────────┐
128
+ │ │
129
+ ↓ ↓
130
+ LM Head Task Heads
131
+ │ │
132
+ ↓ ↓
133
+ Text/Caption/VQA Point/Box/Polygon
134
+ ```
135
+
136
+ ## Model Details
137
+
138
+ | Component | Size | Description |
139
+ |-----------|------|-------------|
140
+ | DINOv3 Encoder | 1.0B | Semantic visual features |
141
+ | SigLIP2 Encoder | 400M | Vision-language aligned features |
142
+ | Projector | 160M | Vision-to-language bridge |
143
+ | Detection Head | 12M | Bounding box prediction |
144
+ | Point Head | 8M | Point localization |
145
+ | Segmentation Head | 24M | Mask prediction |
146
+ | **Total** | **~1.6B** | Full model |
147
+
148
+ ## Training
149
+
150
+ The model components were trained in stages:
151
+ 1. **Projector**: Trained on COCO Captions (5k paired images) for 3 epochs.
152
+ 2. **Detection Heads**: Trained on COCO Detection for 5+ epochs using GIoU and Focal Loss.
153
+
154
+ ## Benchmarks & Evaluation
155
+
156
+ We use a comprehensive benchmark suite `eval_benchmarks.py` covering:
157
+ - **COCO Detection**: mAP evaluation
158
+ - **Car Part Damage**: Specialized evaluation on HuggingFace `moondream/car_part_damage` dataset
159
+ - **Counting**: Accuracy on Pixmo-style counting tasks
160
+ - **VQA**: Open-ended question answering accuracy
161
+
162
+ To run benchmarks:
163
+ ```bash
164
+ python eval_benchmarks.py --model checkpoints/oculus_detection_v2/final
165
+ ```
166
+
167
+ ## 🔌 Python API Usage
168
+
169
+ To use Oculus in your own applications, simply import the `OculusPredictor`:
170
+
171
+ ```python
172
+ from oculus_inference import OculusPredictor
173
+
174
+ # Initialize (automatically loads best checkpoint)
175
+ model = OculusPredictor()
176
+
177
+ # 1. Object Detection
178
+ results = model.detect("image.jpg")
179
+ print(f"Found {len(results['boxes'])} objects")
180
+
181
+ # 2. Visual Question Answering (Reasoning)
182
+ answer = model.ask("image.jpg", "What is the person holding?")
183
+ print(f"Answer: {answer}")
184
+
185
+ # 3. Captioning
186
+ caption = model.caption("image.jpg")
187
+ print(f"Caption: {caption}")
188
+ ```
189
+
190
+ ## Requirements
191
+
192
+ ```bash
193
+ pip install transformers torch pillow numpy
194
+ ```
195
+
196
+ For Apple Silicon:
197
+ ```bash
198
+ pip install mlx
199
+ ```
200
+
201
+ ## Citation
202
+
203
+ ```bibtex
204
+ @misc{oculus2025,
205
+ title={Oculus: Unified Vision-Language Model with Multi-Modal Reasoning},
206
+ author={OceanirAI},
207
+ year={2025},
208
+ publisher={Hugging Face},
209
+ url={https://huggingface.co/OceanirAI/oculus-0.2}
210
+ }
211
+ ```
212
+
213
+ ## License
214
+
215
+ CC-BY-NC-4.0
216
+
217
+ ## Contact
218
+
219
+ - **Organization**: OceanirAI
220
+ - **GitHub**: [github.com/Oceanir](https://github.com/Oceanir)
oculus_unified_model/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oculus Unified Vision-Language Model
3
+
4
+ A HuggingFace-compatible multimodal model combining:
5
+ - DINOv3 (vision encoder)
6
+ - SigLIP2 (vision encoder)
7
+ - Trained Projector (vision-to-language bridge)
8
+ - LLM (language generation)
9
+
10
+ Supports:
11
+ - Image captioning
12
+ - Visual question answering
13
+ - Object detection (Box mode)
14
+ - Point detection (counting)
15
+ - Polygon segmentation
16
+ - Optional reasoning with thinking traces
17
+ """
18
+
19
+ from .modeling_oculus import (
20
+ OculusForConditionalGeneration,
21
+ OculusVisionEncoder,
22
+ OculusProjector,
23
+ )
24
+ from .configuration_oculus import OculusConfig
25
+ from .processing_oculus import OculusProcessor
26
+
27
+ __all__ = [
28
+ "OculusForConditionalGeneration",
29
+ "OculusVisionEncoder",
30
+ "OculusProjector",
31
+ "OculusConfig",
32
+ "OculusProcessor",
33
+ ]
34
+
35
+ __version__ = "0.2.0"
oculus_unified_model/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (941 Bytes). View file
 
oculus_unified_model/__pycache__/configuration_oculus.cpython-312.pyc ADDED
Binary file (4.04 kB). View file
 
oculus_unified_model/__pycache__/modeling_oculus.cpython-312.pyc ADDED
Binary file (39.1 kB). View file
 
oculus_unified_model/__pycache__/processing_oculus.cpython-312.pyc ADDED
Binary file (7.19 kB). View file
 
oculus_unified_model/configuration_oculus.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oculus Configuration
3
+
4
+ HuggingFace-compatible configuration for the unified Oculus model.
5
+ """
6
+
7
+ from typing import Optional, Dict, Any, List
8
+ from transformers import PretrainedConfig
9
+
10
+
11
+ class OculusConfig(PretrainedConfig):
12
+ """
13
+ Configuration class for Oculus vision-language model.
14
+
15
+ Args:
16
+ vision_config: Configuration for vision encoders
17
+ projector_config: Configuration for vision-to-language projector
18
+ text_config: Configuration for language model
19
+ reasoning_enabled: Whether to enable thinking traces
20
+ output_mode: Default output mode ("text", "point", "box", "polygon")
21
+ """
22
+
23
+ model_type = "oculus"
24
+
25
+ def __init__(
26
+ self,
27
+ # Vision encoder settings
28
+ dinov3_model_id: str = "facebook/dinov2-large",
29
+ siglip_model_id: str = "google/siglip-base-patch16-224",
30
+ dinov3_hidden_size: int = 1280, # DINOv3 ViT-H/16+ output dim
31
+ siglip_hidden_size: int = 768, # SigLIP2 base output dim
32
+
33
+ # Projector settings
34
+ projector_hidden_dim: int = 2048,
35
+ num_vision_tokens: int = 64,
36
+
37
+ # Language model settings
38
+ text_model_id: str = "Salesforce/blip-image-captioning-base",
39
+ lm_hidden_size: int = 1536,
40
+ vocab_size: int = 131072,
41
+ max_position_embeddings: int = 32768,
42
+
43
+ # Reasoning settings
44
+ reasoning_enabled: bool = True,
45
+ thinking_token: str = "<think>",
46
+ thinking_end_token: str = "</think>",
47
+ max_thinking_tokens: int = 256,
48
+
49
+ # Output mode settings
50
+ output_mode: str = "text", # "text", "point", "box", "polygon"
51
+ num_detection_classes: int = 80,
52
+ num_segmentation_classes: int = 150,
53
+
54
+ # Generation settings
55
+ max_new_tokens: int = 512,
56
+ temperature: float = 0.7,
57
+ top_p: float = 0.95,
58
+
59
+ # Tool calling / Focus system
60
+ enable_focus: bool = True,
61
+ focus_token: str = "<focus>",
62
+ focus_end_token: str = "</focus>",
63
+
64
+ **kwargs
65
+ ):
66
+ super().__init__(**kwargs)
67
+
68
+ # Vision
69
+ self.dinov3_model_id = dinov3_model_id
70
+ self.siglip_model_id = siglip_model_id
71
+ self.dinov3_hidden_size = dinov3_hidden_size
72
+ self.siglip_hidden_size = siglip_hidden_size
73
+ self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
74
+
75
+ # Projector
76
+ self.projector_hidden_dim = projector_hidden_dim
77
+ self.num_vision_tokens = num_vision_tokens
78
+
79
+ # Language model
80
+ self.text_model_id = text_model_id
81
+ self.lm_hidden_size = lm_hidden_size
82
+ self.vocab_size = vocab_size
83
+ self.max_position_embeddings = max_position_embeddings
84
+
85
+ # Reasoning
86
+ self.reasoning_enabled = reasoning_enabled
87
+ self.thinking_token = thinking_token
88
+ self.thinking_end_token = thinking_end_token
89
+ self.max_thinking_tokens = max_thinking_tokens
90
+
91
+ # Output modes
92
+ self.output_mode = output_mode
93
+ self.num_detection_classes = num_detection_classes
94
+ self.num_segmentation_classes = num_segmentation_classes
95
+
96
+ # Generation
97
+ self.max_new_tokens = max_new_tokens
98
+ self.temperature = temperature
99
+ self.top_p = top_p
100
+
101
+ # Focus system
102
+ self.enable_focus = enable_focus
103
+ self.focus_token = focus_token
104
+ self.focus_end_token = focus_end_token
105
+
106
+ @classmethod
107
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
108
+ """Load config from pretrained path."""
109
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
110
+ return cls.from_dict(config_dict, **kwargs)
111
+
112
+ def to_dict(self) -> Dict[str, Any]:
113
+ """Serialize config to dictionary."""
114
+ output = super().to_dict()
115
+ return output
116
+
117
+
118
+ # Register for auto-loading
119
+ OculusConfig.register_for_auto_class()
oculus_unified_model/modeling_oculus.py ADDED
@@ -0,0 +1,842 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oculus Unified Model
3
+
4
+ HuggingFace-compatible vision-language model with:
5
+ - Multi-encoder vision (DINOv3 + SigLIP2)
6
+ - Trained projector for vision-to-language
7
+ - Optional reasoning with thinking traces
8
+ - Multiple output modes (Text, Point, Box, Polygon)
9
+ - Focus/Zoom tool calling for fine-grained perception
10
+ """
11
+
12
+ import os
13
+ import json
14
+ import warnings
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Optional, Tuple, List, Dict, Any, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from transformers import (
24
+ PreTrainedModel,
25
+ PretrainedConfig,
26
+ AutoImageProcessor,
27
+ AutoModel,
28
+ AutoTokenizer,
29
+ AutoModelForCausalLM,
30
+ GenerationConfig,
31
+ )
32
+ from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
33
+ from PIL import Image
34
+
35
+ from .configuration_oculus import OculusConfig
36
+
37
+
38
+ # ============================================================================
39
+ # Output Data Classes
40
+ # ============================================================================
41
+
42
+ @dataclass
43
+ class OculusOutput:
44
+ """Base output class for Oculus model."""
45
+ text: Optional[str] = None
46
+ thinking_trace: Optional[str] = None
47
+ logits: Optional[torch.Tensor] = None
48
+ hidden_states: Optional[torch.Tensor] = None
49
+ vision_tokens: Optional[torch.Tensor] = None
50
+
51
+
52
+ @dataclass
53
+ class OculusTextOutput(OculusOutput):
54
+ """Output for text/caption mode."""
55
+ pass
56
+
57
+
58
+ @dataclass
59
+ class OculusPointOutput(OculusOutput):
60
+ """Output for point detection mode (counting objects)."""
61
+ points: Optional[List[Tuple[float, float]]] = None
62
+ labels: Optional[List[str]] = None
63
+ confidences: Optional[List[float]] = None
64
+
65
+
66
+ @dataclass
67
+ class OculusBoxOutput(OculusOutput):
68
+ """Output for bounding box detection mode."""
69
+ boxes: Optional[List[Tuple[float, float, float, float]]] = None # x1, y1, x2, y2
70
+ labels: Optional[List[str]] = None
71
+ confidences: Optional[List[float]] = None
72
+
73
+
74
+ @dataclass
75
+ class OculusPolygonOutput(OculusOutput):
76
+ """Output for polygon/segmentation mode."""
77
+ polygons: Optional[List[List[Tuple[float, float]]]] = None
78
+ labels: Optional[List[str]] = None
79
+ mask: Optional[np.ndarray] = None
80
+
81
+
82
+ # ============================================================================
83
+ # Vision Encoder (DINOv3 + SigLIP2)
84
+ # ============================================================================
85
+
86
+ class OculusVisionEncoder(nn.Module):
87
+ """
88
+ Dual vision encoder combining DINOv3 and SigLIP2.
89
+
90
+ DINOv3: Excellent at semantic understanding, object boundaries
91
+ SigLIP2: Strong at text/language alignment
92
+ """
93
+
94
+ def __init__(self, config: OculusConfig):
95
+ super().__init__()
96
+ self.config = config
97
+
98
+ # Will be loaded lazily
99
+ self.dinov3 = None
100
+ self.dinov3_processor = None
101
+ self.siglip = None
102
+ self.siglip_processor = None
103
+
104
+ self._loaded = False
105
+
106
+ def load_encoders(self, device: str = "cpu"):
107
+ """Load vision encoders from HuggingFace."""
108
+ if self._loaded:
109
+ return
110
+
111
+ print("[Oculus] Loading vision encoders...")
112
+
113
+ # DINOv3
114
+ try:
115
+ self.dinov3_processor = AutoImageProcessor.from_pretrained(
116
+ self.config.dinov3_model_id
117
+ )
118
+ self.dinov3 = AutoModel.from_pretrained(
119
+ self.config.dinov3_model_id
120
+ ).eval().to(device)
121
+ print(f" ✓ DINOv3: {self.config.dinov3_model_id}")
122
+ except Exception as e:
123
+ warnings.warn(f"Failed to load DINOv3: {e}")
124
+ self.dinov3_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
125
+ self.dinov3 = AutoModel.from_pretrained("facebook/dinov2-base").eval().to(device)
126
+ print(" ✓ DINOv2-base (fallback)")
127
+
128
+ # SigLIP2
129
+ try:
130
+ self.siglip_processor = AutoImageProcessor.from_pretrained(
131
+ self.config.siglip_model_id
132
+ )
133
+ self.siglip = AutoModel.from_pretrained(
134
+ self.config.siglip_model_id
135
+ ).eval().to(device)
136
+ print(f" ✓ SigLIP: {self.config.siglip_model_id}")
137
+ except Exception as e:
138
+ warnings.warn(f"Failed to load SigLIP: {e}")
139
+ from transformers import SiglipVisionModel
140
+ self.siglip_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
141
+ self.siglip = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224").eval().to(device)
142
+ print(" ✓ SigLIP-base (fallback)")
143
+
144
+ self._loaded = True
145
+
146
+ @torch.no_grad()
147
+ def forward(self, image: Union[Image.Image, torch.Tensor, np.ndarray]) -> torch.Tensor:
148
+ """
149
+ Encode image with both vision encoders and fuse features.
150
+
151
+ Returns:
152
+ Fused vision features [batch, fused_dim]
153
+ """
154
+ if not self._loaded:
155
+ self.load_encoders()
156
+
157
+ # Handle different input types
158
+ if isinstance(image, np.ndarray):
159
+ image = Image.fromarray(image)
160
+ elif isinstance(image, torch.Tensor):
161
+ image = Image.fromarray(image.cpu().numpy().astype(np.uint8))
162
+
163
+ if isinstance(image, Image.Image):
164
+ image = image.convert('RGB')
165
+
166
+ device = next(self.dinov3.parameters()).device
167
+
168
+ # DINOv3 encoding
169
+ d_inputs = self.dinov3_processor(images=image, return_tensors="pt")
170
+ d_inputs = {k: v.to(device) for k, v in d_inputs.items()}
171
+ d_out = self.dinov3(**d_inputs)
172
+ d_pooled = d_out.pooler_output if hasattr(d_out, 'pooler_output') and d_out.pooler_output is not None else d_out.last_hidden_state[:, 0]
173
+
174
+ # SigLIP encoding
175
+ s_inputs = self.siglip_processor(images=image, return_tensors="pt")
176
+ s_inputs = {k: v.to(device) for k, v in s_inputs.items()}
177
+
178
+ if hasattr(self.siglip, 'vision_model'):
179
+ s_hidden = self.siglip.vision_model.embeddings(s_inputs['pixel_values'])
180
+ s_pooled = s_hidden.mean(dim=1)
181
+ else:
182
+ s_out = self.siglip(**s_inputs)
183
+ s_pooled = s_out.pooler_output if hasattr(s_out, 'pooler_output') else s_out.last_hidden_state[:, 0]
184
+
185
+ # Fuse features
186
+ fused = torch.cat([d_pooled, s_pooled], dim=-1)
187
+
188
+ return fused
189
+
190
+
191
+ # ============================================================================
192
+ # Vision Projector
193
+ # ============================================================================
194
+
195
+ class OculusProjector(nn.Module):
196
+ """
197
+ Projects fused vision features to language model token space.
198
+
199
+ Converts [batch, fused_dim] → [batch, num_tokens, lm_hidden_size]
200
+ """
201
+
202
+ def __init__(self, config: OculusConfig):
203
+ super().__init__()
204
+ self.config = config
205
+
206
+ fused_dim = config.fused_vision_dim
207
+ hidden_dim = config.projector_hidden_dim
208
+ num_tokens = config.num_vision_tokens
209
+ embed_dim = config.lm_hidden_size
210
+
211
+ self.fc1 = nn.Linear(fused_dim, hidden_dim)
212
+ self.act1 = nn.GELU()
213
+ self.fc2 = nn.Linear(hidden_dim, hidden_dim)
214
+ self.act2 = nn.GELU()
215
+ self.fc3 = nn.Linear(hidden_dim, num_tokens * embed_dim)
216
+ self.norm = nn.LayerNorm(embed_dim)
217
+
218
+ self.num_tokens = num_tokens
219
+ self.embed_dim = embed_dim
220
+
221
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
222
+ """
223
+ Project vision features to token embeddings.
224
+
225
+ Args:
226
+ x: Vision features [batch, fused_dim]
227
+
228
+ Returns:
229
+ Vision tokens [batch, num_tokens, embed_dim]
230
+ """
231
+ batch_size = x.shape[0]
232
+
233
+ h = self.fc1(x)
234
+ h = self.act1(h)
235
+ h = self.fc2(h)
236
+ h = self.act2(h)
237
+ h = self.fc3(h)
238
+
239
+ h = h.reshape(batch_size, self.num_tokens, self.embed_dim)
240
+ h = self.norm(h)
241
+
242
+ return h
243
+
244
+ @classmethod
245
+ def from_pretrained(cls, path: str, config: OculusConfig):
246
+ """Load projector from saved weights."""
247
+ projector = cls(config)
248
+
249
+ weights_path = Path(path) / "projector.npz"
250
+ if weights_path.exists():
251
+ import numpy as np
252
+ weights = np.load(weights_path, allow_pickle=True)
253
+
254
+ state_dict = {}
255
+ for key in weights.files:
256
+ layer_dict = weights[key].item()
257
+ for param_name, param_val in layer_dict.items():
258
+ full_key = f"{key}.{param_name}"
259
+ # Convert from MLX array if needed
260
+ if hasattr(param_val, 'tolist'):
261
+ param_val = np.array(param_val.tolist())
262
+ state_dict[full_key] = torch.from_numpy(np.array(param_val))
263
+
264
+ projector.load_state_dict(state_dict, strict=False)
265
+ print(f" ✓ Loaded projector from {path}")
266
+
267
+ return projector
268
+
269
+
270
+ # ============================================================================
271
+ # Detection/Segmentation Heads
272
+ # ============================================================================
273
+
274
+ class OculusDetectionHead(nn.Module):
275
+ """Head for bounding box detection."""
276
+
277
+ def __init__(self, config: OculusConfig):
278
+ super().__init__()
279
+ hidden_dim = config.lm_hidden_size
280
+ num_classes = config.num_detection_classes
281
+
282
+ self.cls_head = nn.Sequential(
283
+ nn.Linear(hidden_dim, hidden_dim // 2),
284
+ nn.GELU(),
285
+ nn.Linear(hidden_dim // 2, num_classes)
286
+ )
287
+
288
+ self.box_head = nn.Sequential(
289
+ nn.Linear(hidden_dim, hidden_dim // 2),
290
+ nn.GELU(),
291
+ nn.Linear(hidden_dim // 2, 4) # x1, y1, x2, y2
292
+ )
293
+
294
+ def forward(self, vision_tokens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
295
+ """
296
+ Predict boxes and classes from vision tokens.
297
+
298
+ Returns:
299
+ cls_logits: [batch, num_tokens, num_classes]
300
+ box_coords: [batch, num_tokens, 4]
301
+ """
302
+ cls_logits = self.cls_head(vision_tokens)
303
+ box_coords = self.box_head(vision_tokens).sigmoid() # Normalize to [0, 1]
304
+ return cls_logits, box_coords
305
+
306
+
307
+ class OculusPointHead(nn.Module):
308
+ """Head for point detection (object counting)."""
309
+
310
+ def __init__(self, config: OculusConfig):
311
+ super().__init__()
312
+ hidden_dim = config.lm_hidden_size
313
+ num_classes = config.num_detection_classes
314
+
315
+ self.point_head = nn.Sequential(
316
+ nn.Linear(hidden_dim, hidden_dim // 2),
317
+ nn.GELU(),
318
+ nn.Linear(hidden_dim // 2, 2) # x, y
319
+ )
320
+
321
+ self.cls_head = nn.Sequential(
322
+ nn.Linear(hidden_dim, hidden_dim // 2),
323
+ nn.GELU(),
324
+ nn.Linear(hidden_dim // 2, num_classes)
325
+ )
326
+
327
+ self.conf_head = nn.Sequential(
328
+ nn.Linear(hidden_dim, hidden_dim // 4),
329
+ nn.GELU(),
330
+ nn.Linear(hidden_dim // 4, 1)
331
+ )
332
+
333
+ def forward(self, vision_tokens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
334
+ points = self.point_head(vision_tokens).sigmoid()
335
+ cls_logits = self.cls_head(vision_tokens)
336
+ confidence = self.conf_head(vision_tokens).sigmoid()
337
+ return points, cls_logits, confidence
338
+
339
+
340
+ class OculusSegmentationHead(nn.Module):
341
+ """Head for polygon/mask segmentation."""
342
+
343
+ def __init__(self, config: OculusConfig):
344
+ super().__init__()
345
+ hidden_dim = config.lm_hidden_size
346
+ num_classes = config.num_segmentation_classes
347
+
348
+ # Predict mask logits
349
+ self.mask_head = nn.Sequential(
350
+ nn.Linear(hidden_dim, hidden_dim),
351
+ nn.GELU(),
352
+ nn.Linear(hidden_dim, 14 * 14 * num_classes) # Output spatial mask
353
+ )
354
+
355
+ self.num_classes = num_classes
356
+
357
+ def forward(self, vision_tokens: torch.Tensor) -> torch.Tensor:
358
+ batch_size = vision_tokens.shape[0]
359
+ pooled = vision_tokens.mean(dim=1)
360
+ mask_logits = self.mask_head(pooled)
361
+ mask_logits = mask_logits.reshape(batch_size, self.num_classes, 14, 14)
362
+ return mask_logits
363
+
364
+
365
+ # ============================================================================
366
+ # Main Model
367
+ # ============================================================================
368
+
369
+ class OculusForConditionalGeneration(PreTrainedModel):
370
+ """
371
+ Oculus: Unified Vision-Language Model
372
+
373
+ Features:
374
+ - Multi-encoder vision (DINOv3 + SigLIP2)
375
+ - Optional reasoning with thinking traces
376
+ - Multiple output modes: Text, Point, Box, Polygon
377
+ - Focus/Zoom tool calling for fine-grained perception
378
+
379
+ Usage:
380
+ ```python
381
+ from oculus_unified_model import OculusForConditionalGeneration
382
+
383
+ model = OculusForConditionalGeneration.from_pretrained("OceanirAI/oculus-0.2")
384
+
385
+ # Caption mode
386
+ output = model.generate(image, mode="text", prompt="Describe this image")
387
+
388
+ # VQA mode
389
+ output = model.generate(image, mode="text", prompt="What color is the cat?")
390
+
391
+ # With reasoning
392
+ output = model.generate(image, mode="text", prompt="Count the people", think=True)
393
+
394
+ # Detection mode
395
+ output = model.generate(image, mode="box", prompt="Find all cars")
396
+
397
+ # Point mode (counting)
398
+ output = model.generate(image, mode="point", prompt="Count the birds")
399
+
400
+ # Segmentation mode
401
+ output = model.generate(image, mode="polygon", prompt="Segment the road")
402
+ ```
403
+ """
404
+
405
+ config_class = OculusConfig
406
+ base_model_prefix = "oculus"
407
+
408
+ def __init__(self, config: OculusConfig):
409
+ super().__init__(config)
410
+ self.config = config
411
+
412
+ # Vision encoder
413
+ self.vision_encoder = OculusVisionEncoder(config)
414
+
415
+ # Vision adapter (handles dimension mismatch if needed)
416
+ self.vision_adapter = None
417
+ self._actual_vision_dim = None
418
+
419
+ # Projector
420
+ self.projector = OculusProjector(config)
421
+
422
+ # Task-specific heads
423
+ self.detection_head = OculusDetectionHead(config)
424
+ self.point_head = OculusPointHead(config)
425
+ self.segmentation_head = OculusSegmentationHead(config)
426
+
427
+ # Language model (loaded lazily)
428
+ self.lm_tokenizer = None
429
+ self.lm_model = None
430
+ self._lm_loaded = False
431
+
432
+ # Special tokens for reasoning
433
+ self.thinking_token = config.thinking_token
434
+ self.thinking_end_token = config.thinking_end_token
435
+ self.focus_token = config.focus_token
436
+ self.focus_end_token = config.focus_end_token
437
+
438
+ def load_language_model(self, device: str = "cpu"):
439
+ """Load language model for text generation."""
440
+ if self._lm_loaded:
441
+ return
442
+
443
+ print("[Oculus] Loading language model...")
444
+
445
+ try:
446
+ # Try BLIP for now (works well for captioning/VQA)
447
+ from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
448
+
449
+ self.lm_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
450
+ self.lm_caption_model = BlipForConditionalGeneration.from_pretrained(
451
+ "Salesforce/blip-image-captioning-base"
452
+ ).to(device)
453
+
454
+ self.lm_vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
455
+ self.lm_vqa_model = BlipForQuestionAnswering.from_pretrained(
456
+ "Salesforce/blip-vqa-base"
457
+ ).to(device)
458
+
459
+ print(" ✓ BLIP (captioning + VQA)")
460
+ self._lm_loaded = True
461
+
462
+ except Exception as e:
463
+ warnings.warn(f"Failed to load language model: {e}")
464
+
465
+ def encode_image(self, image: Union[Image.Image, str, np.ndarray]) -> torch.Tensor:
466
+ """
467
+ Encode image to vision tokens.
468
+
469
+ Args:
470
+ image: PIL Image, file path, or numpy array
471
+
472
+ Returns:
473
+ Vision tokens [1, num_tokens, embed_dim]
474
+ """
475
+ # Load image if path
476
+ if isinstance(image, str):
477
+ image = Image.open(image)
478
+
479
+ # Encode with vision encoders
480
+ vision_features = self.vision_encoder(image)
481
+
482
+ # Check if we need an adapter for dimension mismatch
483
+ actual_dim = vision_features.shape[-1]
484
+ expected_dim = self.config.fused_vision_dim
485
+
486
+ if actual_dim != expected_dim:
487
+ if self.vision_adapter is None or self._actual_vision_dim != actual_dim:
488
+ # Create adapter layer
489
+ print(f" [Adapter] Creating vision adapter: {actual_dim} -> {expected_dim}")
490
+ self.vision_adapter = nn.Linear(actual_dim, expected_dim)
491
+ self._actual_vision_dim = actual_dim
492
+ # Initialize with small weights
493
+ nn.init.xavier_uniform_(self.vision_adapter.weight)
494
+ nn.init.zeros_(self.vision_adapter.bias)
495
+
496
+ vision_features = self.vision_adapter(vision_features)
497
+
498
+ # Project to language space
499
+ vision_tokens = self.projector(vision_features)
500
+
501
+ return vision_tokens
502
+
503
+ def _generate_thinking_trace(
504
+ self,
505
+ image: Image.Image,
506
+ prompt: str,
507
+ max_tokens: int = 256
508
+ ) -> str:
509
+ """
510
+ Generate a thinking/reasoning trace before answering.
511
+
512
+ This enables multi-step reasoning for complex tasks.
513
+ """
514
+ thinking_prompt = f"""Let me think about this step by step:
515
+ 1. First, I'll analyze what I see in the image.
516
+ 2. Then, I'll consider the question: "{prompt}"
517
+ 3. Finally, I'll formulate my answer.
518
+
519
+ Observation: """
520
+
521
+ # Generate reasoning (simplified for now)
522
+ if self._lm_loaded and hasattr(self, 'lm_caption_model'):
523
+ inputs = self.lm_processor(image, thinking_prompt, return_tensors="pt")
524
+ inputs = {k: v.to(self.lm_caption_model.device) for k, v in inputs.items()}
525
+
526
+ with torch.no_grad():
527
+ out = self.lm_caption_model.generate(
528
+ **inputs,
529
+ max_new_tokens=max_tokens,
530
+ do_sample=True,
531
+ temperature=0.7
532
+ )
533
+ thinking = self.lm_processor.decode(out[0], skip_special_tokens=True)
534
+ else:
535
+ thinking = "I observe the image and analyze its contents."
536
+
537
+ return thinking
538
+
539
+ def _detect_focus_regions(
540
+ self,
541
+ image: Image.Image,
542
+ prompt: str
543
+ ) -> List[Tuple[int, int, int, int]]:
544
+ """
545
+ Detect regions that need closer inspection (Focus/Zoom system).
546
+
547
+ Returns list of (x1, y1, x2, y2) crop regions.
548
+ """
549
+ # Simplified: return full image as single region
550
+ # In full implementation, would use attention maps to find regions of interest
551
+ w, h = image.size
552
+ return [(0, 0, w, h)]
553
+
554
+ def generate(
555
+ self,
556
+ image: Union[Image.Image, str, np.ndarray],
557
+ prompt: str = "Describe this image",
558
+ mode: str = "text",
559
+ think: bool = False,
560
+ focus: bool = False,
561
+ max_new_tokens: Optional[int] = None,
562
+ temperature: float = 0.7,
563
+ return_thinking: bool = True,
564
+ **kwargs
565
+ ) -> Union[OculusTextOutput, OculusPointOutput, OculusBoxOutput, OculusPolygonOutput]:
566
+ """
567
+ Generate output from image.
568
+
569
+ Args:
570
+ image: Input image (PIL, path, or array)
571
+ prompt: Text prompt/question
572
+ mode: Output mode ("text", "point", "box", "polygon")
573
+ think: Enable reasoning traces
574
+ focus: Enable zoom/crop for fine-grained perception
575
+ max_new_tokens: Maximum tokens to generate
576
+ temperature: Sampling temperature
577
+ return_thinking: Include thinking trace in output
578
+
579
+ Returns:
580
+ Mode-specific output dataclass
581
+ """
582
+ # Load models if needed
583
+ self.vision_encoder.load_encoders()
584
+ if mode == "text":
585
+ self.load_language_model()
586
+
587
+ # Load image
588
+ if isinstance(image, str):
589
+ image = Image.open(image).convert('RGB')
590
+ elif isinstance(image, np.ndarray):
591
+ image = Image.fromarray(image).convert('RGB')
592
+
593
+ # Encode image
594
+ vision_tokens = self.encode_image(image)
595
+
596
+ # Generate thinking trace if enabled
597
+ thinking_trace = None
598
+ if think and self.config.reasoning_enabled:
599
+ thinking_trace = self._generate_thinking_trace(image, prompt)
600
+
601
+ # Focus system: zoom/crop if needed
602
+ if focus and self.config.enable_focus:
603
+ focus_regions = self._detect_focus_regions(image, prompt)
604
+ # Could re-encode cropped regions here
605
+
606
+ # Mode-specific generation
607
+ if mode == "text":
608
+ return self._generate_text(image, prompt, vision_tokens, thinking_trace, max_new_tokens, **kwargs)
609
+ elif mode == "point":
610
+ return self._generate_points(vision_tokens, thinking_trace, **kwargs)
611
+ elif mode == "box":
612
+ return self._generate_boxes(vision_tokens, thinking_trace, **kwargs)
613
+ elif mode == "polygon":
614
+ return self._generate_polygons(vision_tokens, thinking_trace, **kwargs)
615
+ else:
616
+ raise ValueError(f"Unknown mode: {mode}")
617
+
618
+ def _generate_text(
619
+ self,
620
+ image: Image.Image,
621
+ prompt: str,
622
+ vision_tokens: torch.Tensor,
623
+ thinking_trace: Optional[str],
624
+ max_new_tokens: Optional[int],
625
+ **kwargs
626
+ ) -> OculusTextOutput:
627
+ """Generate text output (caption or VQA)."""
628
+
629
+ device = vision_tokens.device if vision_tokens.is_cuda else "cpu"
630
+ max_tokens = max_new_tokens or self.config.max_new_tokens
631
+
632
+ # Determine if this is a question
633
+ is_question = any(q in prompt.lower() for q in ["what", "where", "who", "how", "why", "is", "are", "does", "do", "can", "?"])
634
+
635
+ if is_question and hasattr(self, 'lm_vqa_model'):
636
+ # VQA mode
637
+ inputs = self.lm_vqa_processor(image, prompt, return_tensors="pt")
638
+ inputs = {k: v.to(device) for k, v in inputs.items()}
639
+
640
+ with torch.no_grad():
641
+ out = self.lm_vqa_model.generate(**inputs, max_new_tokens=50)
642
+ text = self.lm_vqa_processor.decode(out[0], skip_special_tokens=True)
643
+ else:
644
+ # Caption mode
645
+ inputs = self.lm_processor(image, prompt, return_tensors="pt")
646
+ inputs = {k: v.to(device) for k, v in inputs.items()}
647
+
648
+ with torch.no_grad():
649
+ out = self.lm_caption_model.generate(**inputs, max_new_tokens=max_tokens)
650
+ text = self.lm_processor.decode(out[0], skip_special_tokens=True)
651
+
652
+ return OculusTextOutput(
653
+ text=text,
654
+ thinking_trace=thinking_trace,
655
+ vision_tokens=vision_tokens
656
+ )
657
+
658
+ def _generate_points(
659
+ self,
660
+ vision_tokens: torch.Tensor,
661
+ thinking_trace: Optional[str],
662
+ threshold: float = 0.5,
663
+ **kwargs
664
+ ) -> OculusPointOutput:
665
+ """Generate point detections."""
666
+
667
+ points, cls_logits, confidence = self.point_head(vision_tokens)
668
+
669
+ # Filter by confidence
670
+ mask = confidence.squeeze(-1) > threshold
671
+
672
+ filtered_points = []
673
+ filtered_labels = []
674
+ filtered_conf = []
675
+
676
+ for i in range(vision_tokens.shape[0]):
677
+ token_mask = mask[i]
678
+ pts = points[i][token_mask].detach().cpu().numpy().tolist()
679
+ confs = confidence[i][token_mask].squeeze(-1).detach().cpu().numpy().tolist()
680
+ cls_ids = cls_logits[i][token_mask].argmax(dim=-1).detach().cpu().numpy().tolist()
681
+
682
+ filtered_points.extend([tuple(p) for p in pts])
683
+ filtered_conf.extend(confs)
684
+ filtered_labels.extend([str(c) for c in cls_ids])
685
+
686
+ return OculusPointOutput(
687
+ points=filtered_points,
688
+ labels=filtered_labels,
689
+ confidences=filtered_conf,
690
+ thinking_trace=thinking_trace,
691
+ vision_tokens=vision_tokens
692
+ )
693
+
694
+ def _generate_boxes(
695
+ self,
696
+ vision_tokens: torch.Tensor,
697
+ thinking_trace: Optional[str],
698
+ threshold: float = 0.3,
699
+ **kwargs
700
+ ) -> OculusBoxOutput:
701
+ """Generate bounding box detections."""
702
+
703
+ cls_logits, box_coords = self.detection_head(vision_tokens)
704
+
705
+ # Get confidence from class logits
706
+ confidence = F.softmax(cls_logits, dim=-1).max(dim=-1).values
707
+
708
+ filtered_boxes = []
709
+ filtered_labels = []
710
+ filtered_conf = []
711
+
712
+ for i in range(vision_tokens.shape[0]):
713
+ mask = confidence[i] > threshold
714
+ boxes = box_coords[i][mask].detach().cpu().numpy()
715
+ confs = confidence[i][mask].detach().cpu().numpy().tolist()
716
+ cls_ids = cls_logits[i][mask].argmax(dim=-1).detach().cpu().numpy().tolist()
717
+
718
+ filtered_boxes.extend([tuple(b) for b in boxes])
719
+ filtered_conf.extend(confs)
720
+ filtered_labels.extend([str(c) for c in cls_ids])
721
+
722
+ return OculusBoxOutput(
723
+ boxes=filtered_boxes,
724
+ labels=filtered_labels,
725
+ confidences=filtered_conf,
726
+ thinking_trace=thinking_trace,
727
+ vision_tokens=vision_tokens
728
+ )
729
+
730
+ def _generate_polygons(
731
+ self,
732
+ vision_tokens: torch.Tensor,
733
+ thinking_trace: Optional[str],
734
+ **kwargs
735
+ ) -> OculusPolygonOutput:
736
+ """Generate polygon/mask segmentation."""
737
+
738
+ mask_logits = self.segmentation_head(vision_tokens)
739
+
740
+ # Get predicted mask
741
+ mask = mask_logits.argmax(dim=1).detach().cpu().numpy()
742
+
743
+ # Convert to polygons (simplified)
744
+ # In full implementation, would use cv2.findContours
745
+ polygons = []
746
+ labels = []
747
+
748
+ unique_classes = np.unique(mask[0])
749
+ for cls_id in unique_classes:
750
+ if cls_id == 0: # Skip background
751
+ continue
752
+ labels.append(str(cls_id))
753
+ # Placeholder polygon
754
+ polygons.append([(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)])
755
+
756
+ return OculusPolygonOutput(
757
+ polygons=polygons,
758
+ labels=labels,
759
+ mask=mask[0],
760
+ thinking_trace=thinking_trace,
761
+ vision_tokens=vision_tokens
762
+ )
763
+
764
+ @classmethod
765
+ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
766
+ """
767
+ Load model from pretrained weights.
768
+
769
+ Args:
770
+ pretrained_model_name_or_path: HuggingFace repo ID or local path
771
+ """
772
+ path = Path(pretrained_model_name_or_path)
773
+
774
+ # Load config
775
+ config_path = path / "config.json"
776
+ if config_path.exists():
777
+ import json
778
+ with open(config_path) as f:
779
+ proj_config = json.load(f)
780
+
781
+ # Create config with correct dimensions from projector
782
+ config = OculusConfig(
783
+ dinov3_hidden_size=proj_config.get("fused_dim", 2048) - 768, # Infer from fused
784
+ siglip_hidden_size=768,
785
+ projector_hidden_dim=proj_config.get("hidden_dim", 2048),
786
+ num_vision_tokens=proj_config.get("num_tokens", 64),
787
+ lm_hidden_size=proj_config.get("embed_dim", 1536),
788
+ )
789
+ else:
790
+ config = OculusConfig()
791
+
792
+ # Create model
793
+ model = cls(config)
794
+
795
+ # Load projector weights
796
+ projector_path = path / "projector.npz"
797
+ if projector_path.exists():
798
+ model.projector = OculusProjector.from_pretrained(path, config)
799
+
800
+ # Load detection/segmentation heads if available
801
+ heads_path = path / "heads.pth"
802
+ if heads_path.exists():
803
+ heads_state = torch.load(heads_path, map_location="cpu")
804
+ model.detection_head.load_state_dict(heads_state.get("detection", {}), strict=False)
805
+ model.point_head.load_state_dict(heads_state.get("point", {}), strict=False)
806
+ model.segmentation_head.load_state_dict(heads_state.get("segmentation", {}), strict=False)
807
+
808
+ return model
809
+
810
+ def save_pretrained(self, save_directory: str):
811
+ """Save model to directory."""
812
+ path = Path(save_directory)
813
+ path.mkdir(parents=True, exist_ok=True)
814
+
815
+ # Save config
816
+ self.config.save_pretrained(path)
817
+
818
+ # Save projector
819
+ projector_state = self.projector.state_dict()
820
+ # Convert to numpy for MLX compatibility
821
+ np_weights = {}
822
+ for k, v in projector_state.items():
823
+ parts = k.split(".")
824
+ layer = parts[0]
825
+ param = ".".join(parts[1:])
826
+ if layer not in np_weights:
827
+ np_weights[layer] = {}
828
+ np_weights[layer][param] = v.cpu().numpy()
829
+ np.savez(path / "projector.npz", **{k: v for k, v in np_weights.items()})
830
+
831
+ # Save heads
832
+ torch.save({
833
+ "detection": self.detection_head.state_dict(),
834
+ "point": self.point_head.state_dict(),
835
+ "segmentation": self.segmentation_head.state_dict(),
836
+ }, path / "heads.pth")
837
+
838
+ print(f"✓ Saved model to {path}")
839
+
840
+
841
+ # Register for auto-loading
842
+ OculusForConditionalGeneration.register_for_auto_class("AutoModelForVision2Seq")
oculus_unified_model/processing_oculus.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oculus Processor
3
+
4
+ Handles image and text preprocessing for the Oculus model.
5
+ """
6
+
7
+ from typing import Optional, Union, List, Dict, Any
8
+ from PIL import Image
9
+ import numpy as np
10
+
11
+ from transformers import ProcessorMixin, BatchFeature
12
+ from transformers.image_utils import ImageInput
13
+
14
+
15
+ class OculusProcessor(ProcessorMixin):
16
+ """
17
+ Processor for Oculus model.
18
+
19
+ Combines image processing and text tokenization.
20
+
21
+ Usage:
22
+ ```python
23
+ processor = OculusProcessor.from_pretrained("OceanirAI/oculus-0.2")
24
+
25
+ # Process inputs
26
+ inputs = processor(
27
+ images=image,
28
+ text="What is in this image?",
29
+ mode="text",
30
+ return_tensors="pt"
31
+ )
32
+ ```
33
+ """
34
+
35
+ attributes = ["image_processor", "tokenizer"]
36
+ image_processor_class = "AutoImageProcessor"
37
+ tokenizer_class = "AutoTokenizer"
38
+
39
+ def __init__(
40
+ self,
41
+ image_processor=None,
42
+ tokenizer=None,
43
+ **kwargs
44
+ ):
45
+ super().__init__(image_processor, tokenizer)
46
+ self.image_processor = image_processor
47
+ self.tokenizer = tokenizer
48
+
49
+ # Special tokens
50
+ self.thinking_token = kwargs.get("thinking_token", "<think>")
51
+ self.thinking_end_token = kwargs.get("thinking_end_token", "</think>")
52
+ self.focus_token = kwargs.get("focus_token", "<focus>")
53
+ self.focus_end_token = kwargs.get("focus_end_token", "</focus>")
54
+
55
+ # Output mode tokens
56
+ self.mode_tokens = {
57
+ "text": "<text>",
58
+ "point": "<point>",
59
+ "box": "<box>",
60
+ "polygon": "<polygon>",
61
+ }
62
+
63
+ def __call__(
64
+ self,
65
+ images: ImageInput = None,
66
+ text: Union[str, List[str]] = None,
67
+ mode: str = "text",
68
+ think: bool = False,
69
+ return_tensors: Optional[str] = None,
70
+ **kwargs
71
+ ) -> BatchFeature:
72
+ """
73
+ Process images and text for Oculus model.
74
+
75
+ Args:
76
+ images: Input image(s)
77
+ text: Input text prompt(s)
78
+ mode: Output mode ("text", "point", "box", "polygon")
79
+ think: Enable reasoning mode
80
+ return_tensors: Tensor format ("pt", "np", etc.)
81
+
82
+ Returns:
83
+ BatchFeature with processed inputs
84
+ """
85
+ # Process images
86
+ if images is not None:
87
+ if self.image_processor is not None:
88
+ image_features = self.image_processor(images, return_tensors=return_tensors)
89
+ else:
90
+ # Basic processing
91
+ if isinstance(images, Image.Image):
92
+ images = [images]
93
+ image_features = {"pixel_values": images}
94
+ else:
95
+ image_features = {}
96
+
97
+ # Process text
98
+ if text is not None:
99
+ # Add mode and thinking tokens
100
+ processed_text = self._format_prompt(text, mode, think)
101
+
102
+ if self.tokenizer is not None:
103
+ text_features = self.tokenizer(
104
+ processed_text,
105
+ return_tensors=return_tensors,
106
+ padding=True,
107
+ truncation=True,
108
+ **kwargs
109
+ )
110
+ else:
111
+ text_features = {"text": processed_text}
112
+ else:
113
+ text_features = {}
114
+
115
+ # Combine features
116
+ return BatchFeature(
117
+ data={
118
+ **image_features,
119
+ **text_features,
120
+ "mode": mode,
121
+ "think": think,
122
+ },
123
+ tensor_type=return_tensors
124
+ )
125
+
126
+ def _format_prompt(
127
+ self,
128
+ text: Union[str, List[str]],
129
+ mode: str,
130
+ think: bool
131
+ ) -> Union[str, List[str]]:
132
+ """Format prompt with special tokens."""
133
+
134
+ def format_single(t: str) -> str:
135
+ parts = []
136
+
137
+ # Add mode token
138
+ if mode in self.mode_tokens:
139
+ parts.append(self.mode_tokens[mode])
140
+
141
+ # Add thinking token if enabled
142
+ if think:
143
+ parts.append(self.thinking_token)
144
+
145
+ # Add prompt
146
+ parts.append(t)
147
+
148
+ return " ".join(parts)
149
+
150
+ if isinstance(text, str):
151
+ return format_single(text)
152
+ else:
153
+ return [format_single(t) for t in text]
154
+
155
+ def decode(
156
+ self,
157
+ token_ids,
158
+ skip_special_tokens: bool = True,
159
+ **kwargs
160
+ ) -> str:
161
+ """Decode token IDs to text."""
162
+ if self.tokenizer is not None:
163
+ text = self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
164
+ else:
165
+ text = str(token_ids)
166
+
167
+ # Parse thinking trace if present
168
+ thinking_trace = None
169
+ if self.thinking_token in text and self.thinking_end_token in text:
170
+ start = text.find(self.thinking_token) + len(self.thinking_token)
171
+ end = text.find(self.thinking_end_token)
172
+ thinking_trace = text[start:end].strip()
173
+ text = text[end + len(self.thinking_end_token):].strip()
174
+
175
+ return text, thinking_trace
176
+
177
+ def batch_decode(
178
+ self,
179
+ token_ids,
180
+ skip_special_tokens: bool = True,
181
+ **kwargs
182
+ ) -> List[str]:
183
+ """Decode batch of token IDs."""
184
+ return [
185
+ self.decode(ids, skip_special_tokens=skip_special_tokens, **kwargs)
186
+ for ids in token_ids
187
+ ]
188
+
189
+ @classmethod
190
+ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
191
+ """Load processor from pretrained."""
192
+ try:
193
+ from transformers import AutoImageProcessor, AutoTokenizer
194
+
195
+ image_processor = AutoImageProcessor.from_pretrained(
196
+ pretrained_model_name_or_path, **kwargs
197
+ )
198
+ tokenizer = AutoTokenizer.from_pretrained(
199
+ pretrained_model_name_or_path, **kwargs
200
+ )
201
+ return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
202
+ except:
203
+ # Return basic processor without HF components
204
+ return cls(**kwargs)
205
+
206
+ def save_pretrained(self, save_directory: str, **kwargs):
207
+ """Save processor to directory."""
208
+ if self.image_processor is not None:
209
+ self.image_processor.save_pretrained(save_directory)
210
+ if self.tokenizer is not None:
211
+ self.tokenizer.save_pretrained(save_directory)
trained_components/heads.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6227a8bdb1d7037a9667cdec18061af9c4f3771fd4f62b0afbe68c5e44bdf3d1
3
+ size 36454441
trained_components/projector.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94ed66d364bfdb636d28d537802ef16dfcd3407ed750b30688628c28f7684562
3
+ size 839285719
vision_encoders/dinov2-large/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vision_encoders/dinov2-large/README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - dino
5
+ - vision
6
+ ---
7
+
8
+ # Vision Transformer (large-sized model) trained using DINOv2
9
+
10
+ Vision Transformer (ViT) model trained using the DINOv2 method. It was introduced in the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Oquab et al. and first released in [this repository](https://github.com/facebookresearch/dinov2).
11
+
12
+ Disclaimer: The team releasing DINOv2 did not write a model card for this model so this model card has been written by the Hugging Face team.
13
+
14
+ ## Model description
15
+
16
+ The Vision Transformer (ViT) is a transformer encoder model (BERT-like) pretrained on a large collection of images in a self-supervised fashion.
17
+
18
+ Images are presented to the model as a sequence of fixed-size patches, which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder.
19
+
20
+ Note that this model does not include any fine-tuned heads.
21
+
22
+ By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image.
23
+
24
+ ## Intended uses & limitations
25
+
26
+ You can use the raw model for feature extraction. See the [model hub](https://huggingface.co/models?search=facebook/dinov2) to look for
27
+ fine-tuned versions on a task that interests you.
28
+
29
+ ### How to use
30
+
31
+ Here is how to use this model:
32
+
33
+ ```python
34
+ from transformers import AutoImageProcessor, AutoModel
35
+ from PIL import Image
36
+ import requests
37
+
38
+ url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
39
+ image = Image.open(requests.get(url, stream=True).raw)
40
+
41
+ processor = AutoImageProcessor.from_pretrained('facebook/dinov2-large')
42
+ model = AutoModel.from_pretrained('facebook/dinov2-large')
43
+
44
+ inputs = processor(images=image, return_tensors="pt")
45
+ outputs = model(**inputs)
46
+ last_hidden_states = outputs.last_hidden_state
47
+ ```
48
+
49
+ ### BibTeX entry and citation info
50
+
51
+ ```bibtex
52
+ misc{oquab2023dinov2,
53
+ title={DINOv2: Learning Robust Visual Features without Supervision},
54
+ author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
55
+ year={2023},
56
+ eprint={2304.07193},
57
+ archivePrefix={arXiv},
58
+ primaryClass={cs.CV}
59
+ }
60
+ ```
vision_encoders/dinov2-large/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Dinov2Model"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "drop_path_rate": 0.0,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.0,
9
+ "hidden_size": 1024,
10
+ "image_size": 518,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_eps": 1e-06,
13
+ "layerscale_value": 1.0,
14
+ "mlp_ratio": 4,
15
+ "model_type": "dinov2",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 24,
19
+ "patch_size": 14,
20
+ "qkv_bias": true,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.31.0.dev0",
23
+ "use_swiglu_ffn": false
24
+ }
vision_encoders/dinov2-large/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:399fba97a95f22c36834418bc69373364a99af3a1153da1c0fb31db567c92e23
3
+ size 1217522888
vision_encoders/dinov2-large/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.485,
13
+ 0.456,
14
+ 0.406
15
+ ],
16
+ "image_processor_type": "BitImageProcessor",
17
+ "image_std": [
18
+ 0.229,
19
+ 0.224,
20
+ 0.225
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 256
26
+ }
27
+ }
vision_encoders/dinov2-large/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8320e4778a7f8850d10f30d97e9138438e1851af1576fea789c43746140cc655
3
+ size 1217614569
vision_encoders/siglip-base/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vision_encoders/siglip-base/README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - vision
5
+ widget:
6
+ - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
7
+ candidate_labels: playing music, playing sports
8
+ example_title: Cat & Dog
9
+ ---
10
+
11
+ # SigLIP (base-sized model)
12
+
13
+ SigLIP model pre-trained on WebLi at resolution 224x224. It was introduced in the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Zhai et al. and first released in [this repository](https://github.com/google-research/big_vision).
14
+
15
+ Disclaimer: The team releasing SigLIP did not write a model card for this model so this model card has been written by the Hugging Face team.
16
+
17
+ ## Model description
18
+
19
+ SigLIP is [CLIP](https://huggingface.co/docs/transformers/model_doc/clip), a multimodal model, with a better loss function. The sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. This allows further scaling up the batch size, while also performing better at smaller batch sizes.
20
+
21
+ A TLDR of SigLIP by one of the authors can be found [here](https://twitter.com/giffmana/status/1692641733459267713).
22
+
23
+ ## Intended uses & limitations
24
+
25
+ You can use the raw model for tasks like zero-shot image classification and image-text retrieval. See the [model hub](https://huggingface.co/models?search=google/siglip) to look for
26
+ other versions on a task that interests you.
27
+
28
+ ### How to use
29
+
30
+ Here is how to use this model to perform zero-shot image classification:
31
+
32
+ ```python
33
+ from PIL import Image
34
+ import requests
35
+ from transformers import AutoProcessor, AutoModel
36
+ import torch
37
+
38
+ model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
39
+ processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
40
+
41
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
42
+ image = Image.open(requests.get(url, stream=True).raw)
43
+
44
+ texts = ["a photo of 2 cats", "a photo of 2 dogs"]
45
+ inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
46
+
47
+ with torch.no_grad():
48
+ outputs = model(**inputs)
49
+
50
+ logits_per_image = outputs.logits_per_image
51
+ probs = torch.sigmoid(logits_per_image) # these are the probabilities
52
+ print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
53
+ ```
54
+
55
+ Alternatively, one can leverage the pipeline API which abstracts away the complexity for the user:
56
+
57
+ ```python
58
+ from transformers import pipeline
59
+ from PIL import Image
60
+ import requests
61
+
62
+ # load pipe
63
+ image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224")
64
+
65
+ # load image
66
+ url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
67
+ image = Image.open(requests.get(url, stream=True).raw)
68
+
69
+ # inference
70
+ outputs = image_classifier(image, candidate_labels=["2 cats", "a plane", "a remote"])
71
+ outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
72
+ print(outputs)
73
+ ```
74
+ For more code examples, we refer to the [documentation](https://huggingface.co/transformers/main/model_doc/siglip.html#).
75
+
76
+ ## Training procedure
77
+
78
+ ### Training data
79
+
80
+ SigLIP is pre-trained on the English image-text pairs of the WebLI dataset [(Chen et al., 2023)](https://arxiv.org/abs/2209.06794).
81
+
82
+ ### Preprocessing
83
+
84
+ Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5).
85
+
86
+ Texts are tokenized and padded to the same length (64 tokens).
87
+
88
+ ### Compute
89
+
90
+ The model was trained on 16 TPU-v4 chips for three days.
91
+
92
+ ## Evaluation results
93
+
94
+ Evaluation of SigLIP compared to CLIP is shown below (taken from the paper).
95
+
96
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip_table.jpeg"
97
+ alt="drawing" width="600"/>
98
+
99
+ ### BibTeX entry and citation info
100
+
101
+ ```bibtex
102
+ @misc{zhai2023sigmoid,
103
+ title={Sigmoid Loss for Language Image Pre-Training},
104
+ author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
105
+ year={2023},
106
+ eprint={2303.15343},
107
+ archivePrefix={arXiv},
108
+ primaryClass={cs.CV}
109
+ }
110
+ ```
vision_encoders/siglip-base/config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SiglipModel"
4
+ ],
5
+ "initializer_factor": 1.0,
6
+ "model_type": "siglip",
7
+ "text_config": {
8
+ "hidden_size": 768,
9
+ "intermediate_size": 3072,
10
+ "model_type": "siglip_text_model",
11
+ "num_attention_heads": 12,
12
+ "vocab_size": 32000
13
+ },
14
+ "torch_dtype": "float32",
15
+ "transformers_version": "4.37.0.dev0",
16
+ "vision_config": {
17
+ "model_type": "siglip_vision_model",
18
+ "patch_size": 16
19
+ }
20
+ }
vision_encoders/siglip-base/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c63cb7d1f2e95ba501893cbb8faeb4ea9a3af295498d35097126228659c2af8
3
+ size 812672320
vision_encoders/siglip-base/preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "SiglipImageProcessor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "processor_class": "SiglipProcessor",
17
+ "resample": 3,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "height": 224,
21
+ "width": 224
22
+ }
23
+ }
vision_encoders/siglip-base/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb93f7f526b0a1b0e5f0612630f142bc5b6c05d329edff70478ff0a83e2bcd6e
3
+ size 812762989
vision_encoders/siglip-base/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": {
3
+ "content": "</s>",
4
+ "lstrip": true,
5
+ "normalized": false,
6
+ "rstrip": true,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "</s>",
11
+ "lstrip": true,
12
+ "normalized": false,
13
+ "rstrip": true,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": true,
19
+ "normalized": false,
20
+ "rstrip": true,
21
+ "single_word": false
22
+ }
23
+ }
vision_encoders/siglip-base/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e5036bed065526c3c212dfbe288752391797c4bb1a284aa18c9a0b23fcaf8ec
3
+ size 798330
vision_encoders/siglip-base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vision_encoders/siglip-base/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "1": {
4
+ "content": "</s>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "2": {
12
+ "content": "<unk>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": true
18
+ }
19
+ },
20
+ "additional_special_tokens": [],
21
+ "clean_up_tokenization_spaces": true,
22
+ "do_lower_case": true,
23
+ "eos_token": "</s>",
24
+ "model_input_names": [
25
+ "input_ids"
26
+ ],
27
+ "model_max_length": 64,
28
+ "pad_token": "</s>",
29
+ "processor_class": "SiglipProcessor",
30
+ "sp_model_kwargs": {},
31
+ "tokenizer_class": "SiglipTokenizer",
32
+ "unk_token": "<unk>"
33
+ }