Upload folder using huggingface_hub
Browse files- LICENSE +75 -0
- README.md +99 -0
- config.json +59 -0
- language_model/blip-captioning/.gitattributes +34 -0
- language_model/blip-captioning/README.md +156 -0
- language_model/blip-captioning/config.json +169 -0
- language_model/blip-captioning/preprocessor_config.json +17 -0
- language_model/blip-captioning/pytorch_model.bin +3 -0
- language_model/blip-captioning/special_tokens_map.json +7 -0
- language_model/blip-captioning/tf_model.h5 +3 -0
- language_model/blip-captioning/tokenizer.json +0 -0
- language_model/blip-captioning/tokenizer_config.json +21 -0
- language_model/blip-captioning/vocab.txt +0 -0
- language_model/blip-vqa-finetuned/config.json +58 -0
- language_model/blip-vqa-finetuned/generation_config.json +4 -0
- language_model/blip-vqa-finetuned/model.safetensors +3 -0
- language_model/blip-vqa-finetuned/processor_config.json +29 -0
- language_model/blip-vqa-finetuned/tokenizer.json +0 -0
- language_model/blip-vqa-finetuned/tokenizer_config.json +22 -0
- oculus_unified_model/README.md +220 -0
- oculus_unified_model/__init__.py +35 -0
- oculus_unified_model/__pycache__/__init__.cpython-312.pyc +0 -0
- oculus_unified_model/__pycache__/configuration_oculus.cpython-312.pyc +0 -0
- oculus_unified_model/__pycache__/modeling_oculus.cpython-312.pyc +0 -0
- oculus_unified_model/__pycache__/processing_oculus.cpython-312.pyc +0 -0
- oculus_unified_model/configuration_oculus.py +119 -0
- oculus_unified_model/modeling_oculus.py +842 -0
- oculus_unified_model/processing_oculus.py +211 -0
- trained_components/heads.pth +3 -0
- trained_components/projector.npz +3 -0
- vision_encoders/dinov2-large/.gitattributes +35 -0
- vision_encoders/dinov2-large/README.md +60 -0
- vision_encoders/dinov2-large/config.json +24 -0
- vision_encoders/dinov2-large/model.safetensors +3 -0
- vision_encoders/dinov2-large/preprocessor_config.json +27 -0
- vision_encoders/dinov2-large/pytorch_model.bin +3 -0
- vision_encoders/siglip-base/.gitattributes +35 -0
- vision_encoders/siglip-base/README.md +110 -0
- vision_encoders/siglip-base/config.json +20 -0
- vision_encoders/siglip-base/model.safetensors +3 -0
- vision_encoders/siglip-base/preprocessor_config.json +23 -0
- vision_encoders/siglip-base/pytorch_model.bin +3 -0
- vision_encoders/siglip-base/special_tokens_map.json +23 -0
- vision_encoders/siglip-base/spiece.model +3 -0
- vision_encoders/siglip-base/tokenizer.json +0 -0
- vision_encoders/siglip-base/tokenizer_config.json +33 -0
LICENSE
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
OCEANIR RESEARCH LICENSE
|
| 2 |
+
Version 1.0, January 2026
|
| 3 |
+
|
| 4 |
+
Copyright (c) 2026 OceanirAI
|
| 5 |
+
|
| 6 |
+
TERMS AND CONDITIONS
|
| 7 |
+
|
| 8 |
+
1. DEFINITIONS
|
| 9 |
+
|
| 10 |
+
"Software" refers to the Oculus model weights, code, and associated materials
|
| 11 |
+
distributed under this license.
|
| 12 |
+
|
| 13 |
+
"Research Use" means non-commercial academic research, educational purposes,
|
| 14 |
+
and personal experimentation for learning.
|
| 15 |
+
|
| 16 |
+
"Commercial Use" means any use intended for or directed toward commercial
|
| 17 |
+
advantage or monetary compensation.
|
| 18 |
+
|
| 19 |
+
2. GRANT OF LICENSE
|
| 20 |
+
|
| 21 |
+
Subject to the terms of this License, OceanirAI grants you a non-exclusive,
|
| 22 |
+
worldwide, royalty-free license to use, copy, and modify the Software for
|
| 23 |
+
Research Use only.
|
| 24 |
+
|
| 25 |
+
3. PERMITTED USES
|
| 26 |
+
|
| 27 |
+
You MAY:
|
| 28 |
+
- Use the Software for academic research
|
| 29 |
+
- Use the Software for educational purposes
|
| 30 |
+
- Publish research papers using results obtained from the Software
|
| 31 |
+
- Modify the Software for Research Use
|
| 32 |
+
- Share modifications under this same license
|
| 33 |
+
- Use the Software in academic courses and tutorials
|
| 34 |
+
|
| 35 |
+
4. PROHIBITED USES
|
| 36 |
+
|
| 37 |
+
You MAY NOT:
|
| 38 |
+
- Use the Software for any Commercial Use
|
| 39 |
+
- Sell, license, or sublicense the Software
|
| 40 |
+
- Use the Software to train models for commercial deployment
|
| 41 |
+
- Integrate the Software into commercial products or services
|
| 42 |
+
- Use the Software to provide commercial services
|
| 43 |
+
- Remove or alter any license notices or attributions
|
| 44 |
+
|
| 45 |
+
5. ATTRIBUTION
|
| 46 |
+
|
| 47 |
+
Any publication, presentation, or distribution of work using this Software
|
| 48 |
+
must include the following citation:
|
| 49 |
+
|
| 50 |
+
"Oculus Vision-Language Model, OceanirAI, 2026"
|
| 51 |
+
|
| 52 |
+
6. NO WARRANTY
|
| 53 |
+
|
| 54 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 55 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 56 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
| 57 |
+
OCEANIR AI OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 58 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
| 59 |
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
| 60 |
+
DEALINGS IN THE SOFTWARE.
|
| 61 |
+
|
| 62 |
+
7. TERMINATION
|
| 63 |
+
|
| 64 |
+
This License and the rights granted hereunder will terminate automatically
|
| 65 |
+
upon any breach by you of the terms of this License.
|
| 66 |
+
|
| 67 |
+
8. COMMERCIAL LICENSING
|
| 68 |
+
|
| 69 |
+
For commercial licensing inquiries, please contact: licensing@oceanir.ai
|
| 70 |
+
|
| 71 |
+
9. GOVERNING LAW
|
| 72 |
+
|
| 73 |
+
This License shall be governed by and construed in accordance with the laws
|
| 74 |
+
of the State of California, United States, without regard to its conflict
|
| 75 |
+
of law provisions.
|
README.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: other
|
| 3 |
+
license_name: oceanir-research-license
|
| 4 |
+
license_link: LICENSE
|
| 5 |
+
language:
|
| 6 |
+
- en
|
| 7 |
+
library_name: oceanir
|
| 8 |
+
pipeline_tag: image-text-to-text
|
| 9 |
+
tags:
|
| 10 |
+
- vision
|
| 11 |
+
- multimodal
|
| 12 |
+
- vision-language
|
| 13 |
+
- vqa
|
| 14 |
+
- reasoning
|
| 15 |
+
- chain-of-thought
|
| 16 |
+
- instruction-following
|
| 17 |
+
- oculus
|
| 18 |
+
- standalone
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
# Oculus 0.1 (Unified ~8GB)
|
| 22 |
+
|
| 23 |
+
**Complete standalone vision-language model with both instruction-following and chain-of-thought reasoning.**
|
| 24 |
+
|
| 25 |
+
Oculus 0.1 combines the best of both worlds:
|
| 26 |
+
- **Instruct**: Natural instruction following, image captioning, VQA
|
| 27 |
+
- **Reasoning**: Chain-of-thought thinking with `<think>...</think>` tokens
|
| 28 |
+
|
| 29 |
+
This package includes ALL model weights bundled together:
|
| 30 |
+
- DINOv2-Large vision encoder (~2.3GB)
|
| 31 |
+
- SigLIP vision encoder (~1.1GB)
|
| 32 |
+
- BLIP language models (~3GB)
|
| 33 |
+
- Trained projector & heads (~835MB)
|
| 34 |
+
- Unified VQA model (~1.5GB)
|
| 35 |
+
|
| 36 |
+
## Installation
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
pip install oceanir
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
## Usage
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
from oceanir import Oculus
|
| 46 |
+
|
| 47 |
+
# Load unified model
|
| 48 |
+
model = Oculus.from_pretrained("OceanirAI/Oculus-0.1")
|
| 49 |
+
|
| 50 |
+
# Instruction following
|
| 51 |
+
answer = model.ask("photo.jpg", "Describe what's happening in this image")
|
| 52 |
+
|
| 53 |
+
# Chain-of-thought reasoning
|
| 54 |
+
answer = model.ask(
|
| 55 |
+
"complex_scene.jpg",
|
| 56 |
+
"How many red cars are on the left side?",
|
| 57 |
+
think=True # Enable reasoning
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
# Captioning
|
| 61 |
+
caption = model.caption("image.jpg")
|
| 62 |
+
|
| 63 |
+
# Detection
|
| 64 |
+
results = model.detect("image.jpg")
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Capabilities
|
| 68 |
+
|
| 69 |
+
| Task | Method | Description |
|
| 70 |
+
|------|--------|-------------|
|
| 71 |
+
| VQA | `model.ask(image, question)` | Answer questions about images |
|
| 72 |
+
| Reasoning | `model.ask(image, question, think=True)` | Chain-of-thought reasoning |
|
| 73 |
+
| Captioning | `model.caption(image)` | Generate image descriptions |
|
| 74 |
+
| Detection | `model.detect(image)` | Object detection (80 COCO classes) |
|
| 75 |
+
|
| 76 |
+
## Model Structure
|
| 77 |
+
|
| 78 |
+
```
|
| 79 |
+
Oculus-0.1/
|
| 80 |
+
├── config.json
|
| 81 |
+
├── vision_encoders/
|
| 82 |
+
│ ├── dinov2-large/ # DINOv2 ViT-L (~2.3GB)
|
| 83 |
+
│ └── siglip-base/ # SigLIP (~1.1GB)
|
| 84 |
+
├── language_model/
|
| 85 |
+
│ ├── blip-captioning/ # BLIP captioning
|
| 86 |
+
│ └── blip-vqa-finetuned/ # Unified VQA (~1.5GB)
|
| 87 |
+
├── trained_components/
|
| 88 |
+
│ ├── projector.npz # Vision projector (~800MB)
|
| 89 |
+
│ └── heads.pth # Detection heads (~35MB)
|
| 90 |
+
└── oculus_unified_model/ # Model code
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
## Total Size: ~8GB
|
| 94 |
+
|
| 95 |
+
## License
|
| 96 |
+
|
| 97 |
+
Oceanir Research License - Non-commercial research only.
|
| 98 |
+
|
| 99 |
+
For commercial licensing: licensing@oceanir.ai
|
config.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "oculus",
|
| 3 |
+
"architectures": ["OculusForConditionalGeneration"],
|
| 4 |
+
"variant": "Unified",
|
| 5 |
+
"version": "0.1",
|
| 6 |
+
|
| 7 |
+
"vision_encoders": {
|
| 8 |
+
"dinov2": {
|
| 9 |
+
"path": "vision_encoders/dinov2-large",
|
| 10 |
+
"model_id": "facebook/dinov2-large",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"num_layers": 24,
|
| 13 |
+
"num_heads": 16
|
| 14 |
+
},
|
| 15 |
+
"siglip": {
|
| 16 |
+
"path": "vision_encoders/siglip-base",
|
| 17 |
+
"model_id": "google/siglip-base-patch16-224",
|
| 18 |
+
"hidden_size": 768,
|
| 19 |
+
"num_layers": 12,
|
| 20 |
+
"num_heads": 12
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
|
| 24 |
+
"language_model": {
|
| 25 |
+
"captioning": {
|
| 26 |
+
"path": "language_model/blip-captioning",
|
| 27 |
+
"model_id": "Salesforce/blip-image-captioning-base"
|
| 28 |
+
},
|
| 29 |
+
"vqa": {
|
| 30 |
+
"path": "language_model/blip-vqa-finetuned",
|
| 31 |
+
"base_model_id": "Salesforce/blip-vqa-base",
|
| 32 |
+
"finetuned": true
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
|
| 36 |
+
"trained_components": {
|
| 37 |
+
"projector": "trained_components/projector.npz",
|
| 38 |
+
"heads": "trained_components/heads.pth"
|
| 39 |
+
},
|
| 40 |
+
|
| 41 |
+
"projector_config": {
|
| 42 |
+
"fused_vision_dim": 1792,
|
| 43 |
+
"hidden_dim": 2048,
|
| 44 |
+
"num_tokens": 64,
|
| 45 |
+
"output_dim": 768
|
| 46 |
+
},
|
| 47 |
+
|
| 48 |
+
"task_heads": {
|
| 49 |
+
"detection_classes": 80,
|
| 50 |
+
"segmentation_classes": 150
|
| 51 |
+
},
|
| 52 |
+
|
| 53 |
+
"instruct_enabled": true,
|
| 54 |
+
"reasoning_enabled": true,
|
| 55 |
+
"thinking_token": "<think>",
|
| 56 |
+
"thinking_end_token": "</think>",
|
| 57 |
+
"max_thinking_tokens": 256,
|
| 58 |
+
"standalone": true
|
| 59 |
+
}
|
language_model/blip-captioning/.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
language_model/blip-captioning/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
pipeline_tag: image-to-text
|
| 3 |
+
tags:
|
| 4 |
+
- image-captioning
|
| 5 |
+
languages:
|
| 6 |
+
- en
|
| 7 |
+
license: bsd-3-clause
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
|
| 11 |
+
|
| 12 |
+
Model card for image captioning pretrained on COCO dataset - base architecture (with ViT base backbone).
|
| 13 |
+
|
| 14 |
+
|  |
|
| 15 |
+
|:--:|
|
| 16 |
+
| <b> Pull figure from BLIP official repo | Image source: https://github.com/salesforce/BLIP </b>|
|
| 17 |
+
|
| 18 |
+
## TL;DR
|
| 19 |
+
|
| 20 |
+
Authors from the [paper](https://arxiv.org/abs/2201.12086) write in the abstract:
|
| 21 |
+
|
| 22 |
+
*Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to videolanguage tasks in a zero-shot manner. Code, models, and datasets are released.*
|
| 23 |
+
|
| 24 |
+
## Usage
|
| 25 |
+
|
| 26 |
+
You can use this model for conditional and un-conditional image captioning
|
| 27 |
+
|
| 28 |
+
### Using the Pytorch model
|
| 29 |
+
|
| 30 |
+
#### Running the model on CPU
|
| 31 |
+
|
| 32 |
+
<details>
|
| 33 |
+
<summary> Click to expand </summary>
|
| 34 |
+
|
| 35 |
+
```python
|
| 36 |
+
import requests
|
| 37 |
+
from PIL import Image
|
| 38 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 39 |
+
|
| 40 |
+
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 41 |
+
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 42 |
+
|
| 43 |
+
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
|
| 44 |
+
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
|
| 45 |
+
|
| 46 |
+
# conditional image captioning
|
| 47 |
+
text = "a photography of"
|
| 48 |
+
inputs = processor(raw_image, text, return_tensors="pt")
|
| 49 |
+
|
| 50 |
+
out = model.generate(**inputs)
|
| 51 |
+
print(processor.decode(out[0], skip_special_tokens=True))
|
| 52 |
+
# >>> a photography of a woman and her dog
|
| 53 |
+
|
| 54 |
+
# unconditional image captioning
|
| 55 |
+
inputs = processor(raw_image, return_tensors="pt")
|
| 56 |
+
|
| 57 |
+
out = model.generate(**inputs)
|
| 58 |
+
print(processor.decode(out[0], skip_special_tokens=True))
|
| 59 |
+
>>> a woman sitting on the beach with her dog
|
| 60 |
+
```
|
| 61 |
+
</details>
|
| 62 |
+
|
| 63 |
+
#### Running the model on GPU
|
| 64 |
+
|
| 65 |
+
##### In full precision
|
| 66 |
+
|
| 67 |
+
<details>
|
| 68 |
+
<summary> Click to expand </summary>
|
| 69 |
+
|
| 70 |
+
```python
|
| 71 |
+
import requests
|
| 72 |
+
from PIL import Image
|
| 73 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 74 |
+
|
| 75 |
+
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 76 |
+
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")
|
| 77 |
+
|
| 78 |
+
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
|
| 79 |
+
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
|
| 80 |
+
|
| 81 |
+
# conditional image captioning
|
| 82 |
+
text = "a photography of"
|
| 83 |
+
inputs = processor(raw_image, text, return_tensors="pt").to("cuda")
|
| 84 |
+
|
| 85 |
+
out = model.generate(**inputs)
|
| 86 |
+
print(processor.decode(out[0], skip_special_tokens=True))
|
| 87 |
+
# >>> a photography of a woman and her dog
|
| 88 |
+
|
| 89 |
+
# unconditional image captioning
|
| 90 |
+
inputs = processor(raw_image, return_tensors="pt").to("cuda")
|
| 91 |
+
|
| 92 |
+
out = model.generate(**inputs)
|
| 93 |
+
print(processor.decode(out[0], skip_special_tokens=True))
|
| 94 |
+
>>> a woman sitting on the beach with her dog
|
| 95 |
+
```
|
| 96 |
+
</details>
|
| 97 |
+
|
| 98 |
+
##### In half precision (`float16`)
|
| 99 |
+
|
| 100 |
+
<details>
|
| 101 |
+
<summary> Click to expand </summary>
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
import torch
|
| 105 |
+
import requests
|
| 106 |
+
from PIL import Image
|
| 107 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 108 |
+
|
| 109 |
+
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 110 |
+
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to("cuda")
|
| 111 |
+
|
| 112 |
+
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
|
| 113 |
+
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
|
| 114 |
+
|
| 115 |
+
# conditional image captioning
|
| 116 |
+
text = "a photography of"
|
| 117 |
+
inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16)
|
| 118 |
+
|
| 119 |
+
out = model.generate(**inputs)
|
| 120 |
+
print(processor.decode(out[0], skip_special_tokens=True))
|
| 121 |
+
# >>> a photography of a woman and her dog
|
| 122 |
+
|
| 123 |
+
# unconditional image captioning
|
| 124 |
+
inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
|
| 125 |
+
|
| 126 |
+
out = model.generate(**inputs)
|
| 127 |
+
print(processor.decode(out[0], skip_special_tokens=True))
|
| 128 |
+
>>> a woman sitting on the beach with her dog
|
| 129 |
+
```
|
| 130 |
+
</details>
|
| 131 |
+
|
| 132 |
+
## Ethical Considerations
|
| 133 |
+
This release is for research purposes only in support of an academic paper. Our models, datasets, and code are not specifically designed or evaluated for all downstream purposes. We strongly recommend users evaluate and address potential concerns related to accuracy, safety, and fairness before deploying this model. We encourage users to consider the common limitations of AI, comply with applicable laws, and leverage best practices when selecting use cases, particularly for high-risk scenarios where errors or misuse could significantly impact people’s lives, rights, or safety. For further guidance on use cases, refer to our AUP and AI AUP.
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
## BibTex and citation info
|
| 137 |
+
|
| 138 |
+
```
|
| 139 |
+
@misc{https://doi.org/10.48550/arxiv.2201.12086,
|
| 140 |
+
doi = {10.48550/ARXIV.2201.12086},
|
| 141 |
+
|
| 142 |
+
url = {https://arxiv.org/abs/2201.12086},
|
| 143 |
+
|
| 144 |
+
author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
|
| 145 |
+
|
| 146 |
+
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
|
| 147 |
+
|
| 148 |
+
title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
|
| 149 |
+
|
| 150 |
+
publisher = {arXiv},
|
| 151 |
+
|
| 152 |
+
year = {2022},
|
| 153 |
+
|
| 154 |
+
copyright = {Creative Commons Attribution 4.0 International}
|
| 155 |
+
}
|
| 156 |
+
```
|
language_model/blip-captioning/config.json
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_commit_hash": null,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BlipForConditionalGeneration"
|
| 5 |
+
],
|
| 6 |
+
"image_text_hidden_size": 256,
|
| 7 |
+
"initializer_factor": 1.0,
|
| 8 |
+
"logit_scale_init_value": 2.6592,
|
| 9 |
+
"model_type": "blip",
|
| 10 |
+
"projection_dim": 512,
|
| 11 |
+
"text_config": {
|
| 12 |
+
"_name_or_path": "",
|
| 13 |
+
"add_cross_attention": false,
|
| 14 |
+
"architectures": null,
|
| 15 |
+
"attention_probs_dropout_prob": 0.0,
|
| 16 |
+
"bad_words_ids": null,
|
| 17 |
+
"begin_suppress_tokens": null,
|
| 18 |
+
"bos_token_id": 30522,
|
| 19 |
+
"chunk_size_feed_forward": 0,
|
| 20 |
+
"cross_attention_hidden_size": null,
|
| 21 |
+
"decoder_start_token_id": null,
|
| 22 |
+
"diversity_penalty": 0.0,
|
| 23 |
+
"do_sample": false,
|
| 24 |
+
"early_stopping": false,
|
| 25 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 26 |
+
"eos_token_id": 2,
|
| 27 |
+
"exponential_decay_length_penalty": null,
|
| 28 |
+
"finetuning_task": null,
|
| 29 |
+
"forced_bos_token_id": null,
|
| 30 |
+
"forced_eos_token_id": null,
|
| 31 |
+
"hidden_act": "gelu",
|
| 32 |
+
"hidden_dropout_prob": 0.0,
|
| 33 |
+
"hidden_size": 768,
|
| 34 |
+
"id2label": {
|
| 35 |
+
"0": "LABEL_0",
|
| 36 |
+
"1": "LABEL_1"
|
| 37 |
+
},
|
| 38 |
+
"initializer_factor": 1.0,
|
| 39 |
+
"initializer_range": 0.02,
|
| 40 |
+
"intermediate_size": 3072,
|
| 41 |
+
"is_decoder": true,
|
| 42 |
+
"is_encoder_decoder": false,
|
| 43 |
+
"label2id": {
|
| 44 |
+
"LABEL_0": 0,
|
| 45 |
+
"LABEL_1": 1
|
| 46 |
+
},
|
| 47 |
+
"layer_norm_eps": 1e-12,
|
| 48 |
+
"length_penalty": 1.0,
|
| 49 |
+
"max_length": 20,
|
| 50 |
+
"max_position_embeddings": 512,
|
| 51 |
+
"min_length": 0,
|
| 52 |
+
"model_type": "blip_text_model",
|
| 53 |
+
"no_repeat_ngram_size": 0,
|
| 54 |
+
"num_attention_heads": 12,
|
| 55 |
+
"num_beam_groups": 1,
|
| 56 |
+
"num_beams": 1,
|
| 57 |
+
"num_hidden_layers": 12,
|
| 58 |
+
"num_return_sequences": 1,
|
| 59 |
+
"output_attentions": false,
|
| 60 |
+
"output_hidden_states": false,
|
| 61 |
+
"output_scores": false,
|
| 62 |
+
"pad_token_id": 0,
|
| 63 |
+
"prefix": null,
|
| 64 |
+
"problem_type": null,
|
| 65 |
+
"projection_dim": 768,
|
| 66 |
+
"pruned_heads": {},
|
| 67 |
+
"remove_invalid_values": false,
|
| 68 |
+
"repetition_penalty": 1.0,
|
| 69 |
+
"return_dict": true,
|
| 70 |
+
"return_dict_in_generate": false,
|
| 71 |
+
"sep_token_id": 102,
|
| 72 |
+
"suppress_tokens": null,
|
| 73 |
+
"task_specific_params": null,
|
| 74 |
+
"temperature": 1.0,
|
| 75 |
+
"tf_legacy_loss": false,
|
| 76 |
+
"tie_encoder_decoder": false,
|
| 77 |
+
"tie_word_embeddings": true,
|
| 78 |
+
"tokenizer_class": null,
|
| 79 |
+
"top_k": 50,
|
| 80 |
+
"top_p": 1.0,
|
| 81 |
+
"torch_dtype": null,
|
| 82 |
+
"torchscript": false,
|
| 83 |
+
"transformers_version": "4.26.0.dev0",
|
| 84 |
+
"typical_p": 1.0,
|
| 85 |
+
"use_bfloat16": false,
|
| 86 |
+
"use_cache": true,
|
| 87 |
+
"vocab_size": 30524
|
| 88 |
+
},
|
| 89 |
+
"torch_dtype": "float32",
|
| 90 |
+
"transformers_version": null,
|
| 91 |
+
"vision_config": {
|
| 92 |
+
"_name_or_path": "",
|
| 93 |
+
"add_cross_attention": false,
|
| 94 |
+
"architectures": null,
|
| 95 |
+
"attention_dropout": 0.0,
|
| 96 |
+
"bad_words_ids": null,
|
| 97 |
+
"begin_suppress_tokens": null,
|
| 98 |
+
"bos_token_id": null,
|
| 99 |
+
"chunk_size_feed_forward": 0,
|
| 100 |
+
"cross_attention_hidden_size": null,
|
| 101 |
+
"decoder_start_token_id": null,
|
| 102 |
+
"diversity_penalty": 0.0,
|
| 103 |
+
"do_sample": false,
|
| 104 |
+
"dropout": 0.0,
|
| 105 |
+
"early_stopping": false,
|
| 106 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 107 |
+
"eos_token_id": null,
|
| 108 |
+
"exponential_decay_length_penalty": null,
|
| 109 |
+
"finetuning_task": null,
|
| 110 |
+
"forced_bos_token_id": null,
|
| 111 |
+
"forced_eos_token_id": null,
|
| 112 |
+
"hidden_act": "gelu",
|
| 113 |
+
"hidden_size": 768,
|
| 114 |
+
"id2label": {
|
| 115 |
+
"0": "LABEL_0",
|
| 116 |
+
"1": "LABEL_1"
|
| 117 |
+
},
|
| 118 |
+
"image_size": 384,
|
| 119 |
+
"initializer_factor": 1.0,
|
| 120 |
+
"initializer_range": 0.02,
|
| 121 |
+
"intermediate_size": 3072,
|
| 122 |
+
"is_decoder": false,
|
| 123 |
+
"is_encoder_decoder": false,
|
| 124 |
+
"label2id": {
|
| 125 |
+
"LABEL_0": 0,
|
| 126 |
+
"LABEL_1": 1
|
| 127 |
+
},
|
| 128 |
+
"layer_norm_eps": 1e-05,
|
| 129 |
+
"length_penalty": 1.0,
|
| 130 |
+
"max_length": 20,
|
| 131 |
+
"min_length": 0,
|
| 132 |
+
"model_type": "blip_vision_model",
|
| 133 |
+
"no_repeat_ngram_size": 0,
|
| 134 |
+
"num_attention_heads": 12,
|
| 135 |
+
"num_beam_groups": 1,
|
| 136 |
+
"num_beams": 1,
|
| 137 |
+
"num_channels": 3,
|
| 138 |
+
"num_hidden_layers": 12,
|
| 139 |
+
"num_return_sequences": 1,
|
| 140 |
+
"output_attentions": false,
|
| 141 |
+
"output_hidden_states": false,
|
| 142 |
+
"output_scores": false,
|
| 143 |
+
"pad_token_id": null,
|
| 144 |
+
"patch_size": 16,
|
| 145 |
+
"prefix": null,
|
| 146 |
+
"problem_type": null,
|
| 147 |
+
"projection_dim": 512,
|
| 148 |
+
"pruned_heads": {},
|
| 149 |
+
"remove_invalid_values": false,
|
| 150 |
+
"repetition_penalty": 1.0,
|
| 151 |
+
"return_dict": true,
|
| 152 |
+
"return_dict_in_generate": false,
|
| 153 |
+
"sep_token_id": null,
|
| 154 |
+
"suppress_tokens": null,
|
| 155 |
+
"task_specific_params": null,
|
| 156 |
+
"temperature": 1.0,
|
| 157 |
+
"tf_legacy_loss": false,
|
| 158 |
+
"tie_encoder_decoder": false,
|
| 159 |
+
"tie_word_embeddings": true,
|
| 160 |
+
"tokenizer_class": null,
|
| 161 |
+
"top_k": 50,
|
| 162 |
+
"top_p": 1.0,
|
| 163 |
+
"torch_dtype": null,
|
| 164 |
+
"torchscript": false,
|
| 165 |
+
"transformers_version": "4.26.0.dev0",
|
| 166 |
+
"typical_p": 1.0,
|
| 167 |
+
"use_bfloat16": false
|
| 168 |
+
}
|
| 169 |
+
}
|
language_model/blip-captioning/preprocessor_config.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"do_resize": true,
|
| 4 |
+
"image_mean": [
|
| 5 |
+
0.48145466,
|
| 6 |
+
0.4578275,
|
| 7 |
+
0.40821073
|
| 8 |
+
],
|
| 9 |
+
"image_processor_type": "BlipImageProcessor",
|
| 10 |
+
"image_std": [
|
| 11 |
+
0.26862954,
|
| 12 |
+
0.26130258,
|
| 13 |
+
0.27577711
|
| 14 |
+
],
|
| 15 |
+
"processor_class": "BlipProcessor",
|
| 16 |
+
"size": 384
|
| 17 |
+
}
|
language_model/blip-captioning/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6638651a5526cc2ede56f2b5104d6851b0755816d220e5e046870430180c767
|
| 3 |
+
size 989820849
|
language_model/blip-captioning/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
language_model/blip-captioning/tf_model.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0aaa4c0e003f599d8baa53a9dee85af14eef20554cf2f8113a2673e25a59f8c
|
| 3 |
+
size 990275136
|
language_model/blip-captioning/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
language_model/blip-captioning/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"do_basic_tokenize": true,
|
| 4 |
+
"do_lower_case": true,
|
| 5 |
+
"mask_token": "[MASK]",
|
| 6 |
+
"model_max_length": 512,
|
| 7 |
+
"name_or_path": "bert-base-uncased",
|
| 8 |
+
"never_split": null,
|
| 9 |
+
"pad_token": "[PAD]",
|
| 10 |
+
"processor_class": "BlipProcessor",
|
| 11 |
+
"sep_token": "[SEP]",
|
| 12 |
+
"special_tokens_map_file": null,
|
| 13 |
+
"strip_accents": null,
|
| 14 |
+
"tokenize_chinese_chars": true,
|
| 15 |
+
"tokenizer_class": "BertTokenizer",
|
| 16 |
+
"unk_token": "[UNK]",
|
| 17 |
+
"model_input_names": [
|
| 18 |
+
"input_ids",
|
| 19 |
+
"attention_mask"
|
| 20 |
+
]
|
| 21 |
+
}
|
language_model/blip-captioning/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
language_model/blip-vqa-finetuned/config.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BlipForQuestionAnswering"
|
| 4 |
+
],
|
| 5 |
+
"dtype": "float32",
|
| 6 |
+
"image_text_hidden_size": 256,
|
| 7 |
+
"initializer_factor": 1.0,
|
| 8 |
+
"initializer_range": 0.02,
|
| 9 |
+
"label_smoothing": 0.0,
|
| 10 |
+
"logit_scale_init_value": 2.6592,
|
| 11 |
+
"model_type": "blip",
|
| 12 |
+
"projection_dim": 512,
|
| 13 |
+
"text_config": {
|
| 14 |
+
"attention_probs_dropout_prob": 0.0,
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"hidden_act": "gelu",
|
| 17 |
+
"hidden_dropout_prob": 0.0,
|
| 18 |
+
"hidden_size": 768,
|
| 19 |
+
"initializer_factor": 1.0,
|
| 20 |
+
"initializer_range": 0.02,
|
| 21 |
+
"intermediate_size": 3072,
|
| 22 |
+
"label_smoothing": 0.0,
|
| 23 |
+
"layer_norm_eps": 1e-12,
|
| 24 |
+
"max_position_embeddings": 512,
|
| 25 |
+
"model_type": "blip_text_model",
|
| 26 |
+
"num_attention_heads": 12,
|
| 27 |
+
"num_hidden_layers": 12,
|
| 28 |
+
"projection_dim": 768,
|
| 29 |
+
"pruned_heads": {},
|
| 30 |
+
"tf_legacy_loss": false,
|
| 31 |
+
"torchscript": false,
|
| 32 |
+
"use_bfloat16": false,
|
| 33 |
+
"use_cache": true,
|
| 34 |
+
"vocab_size": 30524
|
| 35 |
+
},
|
| 36 |
+
"transformers_version": "5.0.0rc1",
|
| 37 |
+
"vision_config": {
|
| 38 |
+
"attention_dropout": 0.0,
|
| 39 |
+
"dropout": 0.0,
|
| 40 |
+
"hidden_act": "gelu",
|
| 41 |
+
"hidden_size": 768,
|
| 42 |
+
"image_size": 384,
|
| 43 |
+
"initializer_factor": 1.0,
|
| 44 |
+
"initializer_range": 0.02,
|
| 45 |
+
"intermediate_size": 3072,
|
| 46 |
+
"layer_norm_eps": 1e-05,
|
| 47 |
+
"model_type": "blip_vision_model",
|
| 48 |
+
"num_attention_heads": 12,
|
| 49 |
+
"num_channels": 3,
|
| 50 |
+
"num_hidden_layers": 12,
|
| 51 |
+
"patch_size": 16,
|
| 52 |
+
"projection_dim": 512,
|
| 53 |
+
"pruned_heads": {},
|
| 54 |
+
"tf_legacy_loss": false,
|
| 55 |
+
"torchscript": false,
|
| 56 |
+
"use_bfloat16": false
|
| 57 |
+
}
|
| 58 |
+
}
|
language_model/blip-vqa-finetuned/generation_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"transformers_version": "5.0.0rc1"
|
| 4 |
+
}
|
language_model/blip-vqa-finetuned/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e694b40dc9205491c9aa3b7a49ca93d79d780767a3ed578a0f6d8e8436b7ee56
|
| 3 |
+
size 1538792112
|
language_model/blip-vqa-finetuned/processor_config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"image_processor": {
|
| 3 |
+
"do_convert_rgb": true,
|
| 4 |
+
"do_normalize": true,
|
| 5 |
+
"do_pad": true,
|
| 6 |
+
"do_rescale": true,
|
| 7 |
+
"do_resize": true,
|
| 8 |
+
"image_mean": [
|
| 9 |
+
0.48145466,
|
| 10 |
+
0.4578275,
|
| 11 |
+
0.40821073
|
| 12 |
+
],
|
| 13 |
+
"image_processor_type": "BlipImageProcessor",
|
| 14 |
+
"image_std": [
|
| 15 |
+
0.26862954,
|
| 16 |
+
0.26130258,
|
| 17 |
+
0.27577711
|
| 18 |
+
],
|
| 19 |
+
"processor_class": "BlipProcessor",
|
| 20 |
+
"resample": 3,
|
| 21 |
+
"rescale_factor": 0.00392156862745098,
|
| 22 |
+
"size": {
|
| 23 |
+
"height": 384,
|
| 24 |
+
"width": 384
|
| 25 |
+
},
|
| 26 |
+
"size_divisor": 32
|
| 27 |
+
},
|
| 28 |
+
"processor_class": "BlipProcessor"
|
| 29 |
+
}
|
language_model/blip-vqa-finetuned/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
language_model/blip-vqa-finetuned/tokenizer_config.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": null,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"cls_token": "[CLS]",
|
| 5 |
+
"do_basic_tokenize": true,
|
| 6 |
+
"do_lower_case": true,
|
| 7 |
+
"is_local": false,
|
| 8 |
+
"mask_token": "[MASK]",
|
| 9 |
+
"model_input_names": [
|
| 10 |
+
"input_ids",
|
| 11 |
+
"attention_mask"
|
| 12 |
+
],
|
| 13 |
+
"model_max_length": 512,
|
| 14 |
+
"never_split": null,
|
| 15 |
+
"pad_token": "[PAD]",
|
| 16 |
+
"processor_class": "BlipProcessor",
|
| 17 |
+
"sep_token": "[SEP]",
|
| 18 |
+
"strip_accents": null,
|
| 19 |
+
"tokenize_chinese_chars": true,
|
| 20 |
+
"tokenizer_class": "BertTokenizer",
|
| 21 |
+
"unk_token": "[UNK]"
|
| 22 |
+
}
|
oculus_unified_model/README.md
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
pipeline_tag: image-text-to-text
|
| 6 |
+
library_name: transformers
|
| 7 |
+
tags:
|
| 8 |
+
- vision
|
| 9 |
+
- multimodal
|
| 10 |
+
- vision-language
|
| 11 |
+
- reasoning
|
| 12 |
+
- detection
|
| 13 |
+
- segmentation
|
| 14 |
+
- ocr
|
| 15 |
+
- vqa
|
| 16 |
+
- captioning
|
| 17 |
+
base_model:
|
| 18 |
+
- facebook/dinov2-large
|
| 19 |
+
- google/siglip-base-patch16-224
|
| 20 |
+
- Salesforce/blip-image-captioning-base
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
# Oculus 0.2
|
| 24 |
+
|
| 25 |
+
**A unified vision-language model with multi-modal reasoning capabilities.**
|
| 26 |
+
|
| 27 |
+
Oculus 0.2 is a hybrid-reasoning vision-language model that combines:
|
| 28 |
+
- **DINOv3** for semantic visual understanding
|
| 29 |
+
- **SigLIP2** for vision-language alignment
|
| 30 |
+
- **Trained Projector** for vision-to-language mapping
|
| 31 |
+
- **Optional Reasoning** via thinking traces
|
| 32 |
+
|
| 33 |
+
## 🚀 What's New in Oculus 0.2
|
| 34 |
+
|
| 35 |
+
| Feature | Description |
|
| 36 |
+
|---------|-------------|
|
| 37 |
+
| **🧠 Reasoning via Thinking Traces** | Short, structured reasoning traces improve multi-step decisions and ambiguous spatial tasks |
|
| 38 |
+
| **🔍 Focus System (Zoom & Crop)** | Automatically focus on smaller regions for fine-grained perception |
|
| 39 |
+
| **📦 Multiple Output Modes** | Text, Point, Box, and Polygon outputs for different tasks |
|
| 40 |
+
| **📝 Improved Captioning** | Better descriptions with context awareness |
|
| 41 |
+
| **❓ Enhanced VQA** | More accurate answers to visual questions |
|
| 42 |
+
|
| 43 |
+
## Output Modes
|
| 44 |
+
|
| 45 |
+
| Mode | Description | Use Case |
|
| 46 |
+
|------|-------------|----------|
|
| 47 |
+
| **📝 Text** | Natural language output | Captioning, VQA, descriptions |
|
| 48 |
+
| **📍 Point** | (x, y) coordinates + labels | Object counting, localization |
|
| 49 |
+
| **📦 Box** | Bounding boxes + labels | Object detection |
|
| 50 |
+
| **🔷 Polygon** | Segmentation masks | Semantic/instance segmentation |
|
| 51 |
+
|
| 52 |
+
## Quick Start
|
| 53 |
+
|
| 54 |
+
```python
|
| 55 |
+
from oculus_unified_model import OculusForConditionalGeneration
|
| 56 |
+
from PIL import Image
|
| 57 |
+
|
| 58 |
+
# Load model
|
| 59 |
+
model = OculusForConditionalGeneration.from_pretrained("OceanirAI/oculus-0.2")
|
| 60 |
+
|
| 61 |
+
# Load image
|
| 62 |
+
image = Image.open("your_image.jpg")
|
| 63 |
+
|
| 64 |
+
# Caption mode
|
| 65 |
+
output = model.generate(image, mode="text", prompt="Describe this image")
|
| 66 |
+
print(output.text)
|
| 67 |
+
|
| 68 |
+
# VQA mode
|
| 69 |
+
output = model.generate(image, mode="text", prompt="What color is the car?")
|
| 70 |
+
print(output.text)
|
| 71 |
+
|
| 72 |
+
# With reasoning traces
|
| 73 |
+
output = model.generate(image, mode="text", prompt="Count the people", think=True)
|
| 74 |
+
print(f"Thinking: {output.thinking_trace}")
|
| 75 |
+
print(f"Answer: {output.text}")
|
| 76 |
+
|
| 77 |
+
# Detection mode (bounding boxes)
|
| 78 |
+
output = model.generate(image, mode="box", prompt="Find all vehicles")
|
| 79 |
+
for box, label, conf in zip(output.boxes, output.labels, output.confidences):
|
| 80 |
+
print(f" {label}: {box} (conf={conf:.2f})")
|
| 81 |
+
|
| 82 |
+
# Point mode (counting)
|
| 83 |
+
output = model.generate(image, mode="point", prompt="Count the birds")
|
| 84 |
+
print(f"Found {len(output.points)} points")
|
| 85 |
+
|
| 86 |
+
# Segmentation mode
|
| 87 |
+
output = model.generate(image, mode="polygon", prompt="Segment the road")
|
| 88 |
+
print(f"Mask shape: {output.mask.shape}")
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## Reasoning Mode
|
| 92 |
+
|
| 93 |
+
Enable thinking traces for complex reasoning tasks:
|
| 94 |
+
|
| 95 |
+
```python
|
| 96 |
+
output = model.generate(
|
| 97 |
+
image,
|
| 98 |
+
mode="text",
|
| 99 |
+
prompt="How many people are sitting vs standing?",
|
| 100 |
+
think=True # Enable reasoning
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
print(f"💭 Thinking: {output.thinking_trace}")
|
| 104 |
+
print(f"📝 Answer: {output.text}")
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
## Focus System
|
| 108 |
+
|
| 109 |
+
The Focus system enables zoom-and-crop for fine-grained perception:
|
| 110 |
+
|
| 111 |
+
```python
|
| 112 |
+
output = model.generate(
|
| 113 |
+
image,
|
| 114 |
+
mode="text",
|
| 115 |
+
prompt="What does the small text say?",
|
| 116 |
+
focus=True # Enable focus/zoom
|
| 117 |
+
)
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
## Architecture
|
| 121 |
+
|
| 122 |
+
```
|
| 123 |
+
Image → DINOv3 ────┐
|
| 124 |
+
├→ Fusion → Projector → 64 tokens × 1536D ───┐
|
| 125 |
+
Image → SigLIP2 ──┘ │
|
| 126 |
+
↓
|
| 127 |
+
┌─────────────────────────────────┐
|
| 128 |
+
│ │
|
| 129 |
+
↓ ↓
|
| 130 |
+
LM Head Task Heads
|
| 131 |
+
│ │
|
| 132 |
+
↓ ↓
|
| 133 |
+
Text/Caption/VQA Point/Box/Polygon
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
## Model Details
|
| 137 |
+
|
| 138 |
+
| Component | Size | Description |
|
| 139 |
+
|-----------|------|-------------|
|
| 140 |
+
| DINOv3 Encoder | 1.0B | Semantic visual features |
|
| 141 |
+
| SigLIP2 Encoder | 400M | Vision-language aligned features |
|
| 142 |
+
| Projector | 160M | Vision-to-language bridge |
|
| 143 |
+
| Detection Head | 12M | Bounding box prediction |
|
| 144 |
+
| Point Head | 8M | Point localization |
|
| 145 |
+
| Segmentation Head | 24M | Mask prediction |
|
| 146 |
+
| **Total** | **~1.6B** | Full model |
|
| 147 |
+
|
| 148 |
+
## Training
|
| 149 |
+
|
| 150 |
+
The model components were trained in stages:
|
| 151 |
+
1. **Projector**: Trained on COCO Captions (5k paired images) for 3 epochs.
|
| 152 |
+
2. **Detection Heads**: Trained on COCO Detection for 5+ epochs using GIoU and Focal Loss.
|
| 153 |
+
|
| 154 |
+
## Benchmarks & Evaluation
|
| 155 |
+
|
| 156 |
+
We use a comprehensive benchmark suite `eval_benchmarks.py` covering:
|
| 157 |
+
- **COCO Detection**: mAP evaluation
|
| 158 |
+
- **Car Part Damage**: Specialized evaluation on HuggingFace `moondream/car_part_damage` dataset
|
| 159 |
+
- **Counting**: Accuracy on Pixmo-style counting tasks
|
| 160 |
+
- **VQA**: Open-ended question answering accuracy
|
| 161 |
+
|
| 162 |
+
To run benchmarks:
|
| 163 |
+
```bash
|
| 164 |
+
python eval_benchmarks.py --model checkpoints/oculus_detection_v2/final
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
## 🔌 Python API Usage
|
| 168 |
+
|
| 169 |
+
To use Oculus in your own applications, simply import the `OculusPredictor`:
|
| 170 |
+
|
| 171 |
+
```python
|
| 172 |
+
from oculus_inference import OculusPredictor
|
| 173 |
+
|
| 174 |
+
# Initialize (automatically loads best checkpoint)
|
| 175 |
+
model = OculusPredictor()
|
| 176 |
+
|
| 177 |
+
# 1. Object Detection
|
| 178 |
+
results = model.detect("image.jpg")
|
| 179 |
+
print(f"Found {len(results['boxes'])} objects")
|
| 180 |
+
|
| 181 |
+
# 2. Visual Question Answering (Reasoning)
|
| 182 |
+
answer = model.ask("image.jpg", "What is the person holding?")
|
| 183 |
+
print(f"Answer: {answer}")
|
| 184 |
+
|
| 185 |
+
# 3. Captioning
|
| 186 |
+
caption = model.caption("image.jpg")
|
| 187 |
+
print(f"Caption: {caption}")
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
## Requirements
|
| 191 |
+
|
| 192 |
+
```bash
|
| 193 |
+
pip install transformers torch pillow numpy
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
For Apple Silicon:
|
| 197 |
+
```bash
|
| 198 |
+
pip install mlx
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
## Citation
|
| 202 |
+
|
| 203 |
+
```bibtex
|
| 204 |
+
@misc{oculus2025,
|
| 205 |
+
title={Oculus: Unified Vision-Language Model with Multi-Modal Reasoning},
|
| 206 |
+
author={OceanirAI},
|
| 207 |
+
year={2025},
|
| 208 |
+
publisher={Hugging Face},
|
| 209 |
+
url={https://huggingface.co/OceanirAI/oculus-0.2}
|
| 210 |
+
}
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
## License
|
| 214 |
+
|
| 215 |
+
CC-BY-NC-4.0
|
| 216 |
+
|
| 217 |
+
## Contact
|
| 218 |
+
|
| 219 |
+
- **Organization**: OceanirAI
|
| 220 |
+
- **GitHub**: [github.com/Oceanir](https://github.com/Oceanir)
|
oculus_unified_model/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Oculus Unified Vision-Language Model
|
| 3 |
+
|
| 4 |
+
A HuggingFace-compatible multimodal model combining:
|
| 5 |
+
- DINOv3 (vision encoder)
|
| 6 |
+
- SigLIP2 (vision encoder)
|
| 7 |
+
- Trained Projector (vision-to-language bridge)
|
| 8 |
+
- LLM (language generation)
|
| 9 |
+
|
| 10 |
+
Supports:
|
| 11 |
+
- Image captioning
|
| 12 |
+
- Visual question answering
|
| 13 |
+
- Object detection (Box mode)
|
| 14 |
+
- Point detection (counting)
|
| 15 |
+
- Polygon segmentation
|
| 16 |
+
- Optional reasoning with thinking traces
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from .modeling_oculus import (
|
| 20 |
+
OculusForConditionalGeneration,
|
| 21 |
+
OculusVisionEncoder,
|
| 22 |
+
OculusProjector,
|
| 23 |
+
)
|
| 24 |
+
from .configuration_oculus import OculusConfig
|
| 25 |
+
from .processing_oculus import OculusProcessor
|
| 26 |
+
|
| 27 |
+
__all__ = [
|
| 28 |
+
"OculusForConditionalGeneration",
|
| 29 |
+
"OculusVisionEncoder",
|
| 30 |
+
"OculusProjector",
|
| 31 |
+
"OculusConfig",
|
| 32 |
+
"OculusProcessor",
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
__version__ = "0.2.0"
|
oculus_unified_model/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (941 Bytes). View file
|
|
|
oculus_unified_model/__pycache__/configuration_oculus.cpython-312.pyc
ADDED
|
Binary file (4.04 kB). View file
|
|
|
oculus_unified_model/__pycache__/modeling_oculus.cpython-312.pyc
ADDED
|
Binary file (39.1 kB). View file
|
|
|
oculus_unified_model/__pycache__/processing_oculus.cpython-312.pyc
ADDED
|
Binary file (7.19 kB). View file
|
|
|
oculus_unified_model/configuration_oculus.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Oculus Configuration
|
| 3 |
+
|
| 4 |
+
HuggingFace-compatible configuration for the unified Oculus model.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Optional, Dict, Any, List
|
| 8 |
+
from transformers import PretrainedConfig
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class OculusConfig(PretrainedConfig):
|
| 12 |
+
"""
|
| 13 |
+
Configuration class for Oculus vision-language model.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
vision_config: Configuration for vision encoders
|
| 17 |
+
projector_config: Configuration for vision-to-language projector
|
| 18 |
+
text_config: Configuration for language model
|
| 19 |
+
reasoning_enabled: Whether to enable thinking traces
|
| 20 |
+
output_mode: Default output mode ("text", "point", "box", "polygon")
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
model_type = "oculus"
|
| 24 |
+
|
| 25 |
+
def __init__(
|
| 26 |
+
self,
|
| 27 |
+
# Vision encoder settings
|
| 28 |
+
dinov3_model_id: str = "facebook/dinov2-large",
|
| 29 |
+
siglip_model_id: str = "google/siglip-base-patch16-224",
|
| 30 |
+
dinov3_hidden_size: int = 1280, # DINOv3 ViT-H/16+ output dim
|
| 31 |
+
siglip_hidden_size: int = 768, # SigLIP2 base output dim
|
| 32 |
+
|
| 33 |
+
# Projector settings
|
| 34 |
+
projector_hidden_dim: int = 2048,
|
| 35 |
+
num_vision_tokens: int = 64,
|
| 36 |
+
|
| 37 |
+
# Language model settings
|
| 38 |
+
text_model_id: str = "Salesforce/blip-image-captioning-base",
|
| 39 |
+
lm_hidden_size: int = 1536,
|
| 40 |
+
vocab_size: int = 131072,
|
| 41 |
+
max_position_embeddings: int = 32768,
|
| 42 |
+
|
| 43 |
+
# Reasoning settings
|
| 44 |
+
reasoning_enabled: bool = True,
|
| 45 |
+
thinking_token: str = "<think>",
|
| 46 |
+
thinking_end_token: str = "</think>",
|
| 47 |
+
max_thinking_tokens: int = 256,
|
| 48 |
+
|
| 49 |
+
# Output mode settings
|
| 50 |
+
output_mode: str = "text", # "text", "point", "box", "polygon"
|
| 51 |
+
num_detection_classes: int = 80,
|
| 52 |
+
num_segmentation_classes: int = 150,
|
| 53 |
+
|
| 54 |
+
# Generation settings
|
| 55 |
+
max_new_tokens: int = 512,
|
| 56 |
+
temperature: float = 0.7,
|
| 57 |
+
top_p: float = 0.95,
|
| 58 |
+
|
| 59 |
+
# Tool calling / Focus system
|
| 60 |
+
enable_focus: bool = True,
|
| 61 |
+
focus_token: str = "<focus>",
|
| 62 |
+
focus_end_token: str = "</focus>",
|
| 63 |
+
|
| 64 |
+
**kwargs
|
| 65 |
+
):
|
| 66 |
+
super().__init__(**kwargs)
|
| 67 |
+
|
| 68 |
+
# Vision
|
| 69 |
+
self.dinov3_model_id = dinov3_model_id
|
| 70 |
+
self.siglip_model_id = siglip_model_id
|
| 71 |
+
self.dinov3_hidden_size = dinov3_hidden_size
|
| 72 |
+
self.siglip_hidden_size = siglip_hidden_size
|
| 73 |
+
self.fused_vision_dim = dinov3_hidden_size + siglip_hidden_size
|
| 74 |
+
|
| 75 |
+
# Projector
|
| 76 |
+
self.projector_hidden_dim = projector_hidden_dim
|
| 77 |
+
self.num_vision_tokens = num_vision_tokens
|
| 78 |
+
|
| 79 |
+
# Language model
|
| 80 |
+
self.text_model_id = text_model_id
|
| 81 |
+
self.lm_hidden_size = lm_hidden_size
|
| 82 |
+
self.vocab_size = vocab_size
|
| 83 |
+
self.max_position_embeddings = max_position_embeddings
|
| 84 |
+
|
| 85 |
+
# Reasoning
|
| 86 |
+
self.reasoning_enabled = reasoning_enabled
|
| 87 |
+
self.thinking_token = thinking_token
|
| 88 |
+
self.thinking_end_token = thinking_end_token
|
| 89 |
+
self.max_thinking_tokens = max_thinking_tokens
|
| 90 |
+
|
| 91 |
+
# Output modes
|
| 92 |
+
self.output_mode = output_mode
|
| 93 |
+
self.num_detection_classes = num_detection_classes
|
| 94 |
+
self.num_segmentation_classes = num_segmentation_classes
|
| 95 |
+
|
| 96 |
+
# Generation
|
| 97 |
+
self.max_new_tokens = max_new_tokens
|
| 98 |
+
self.temperature = temperature
|
| 99 |
+
self.top_p = top_p
|
| 100 |
+
|
| 101 |
+
# Focus system
|
| 102 |
+
self.enable_focus = enable_focus
|
| 103 |
+
self.focus_token = focus_token
|
| 104 |
+
self.focus_end_token = focus_end_token
|
| 105 |
+
|
| 106 |
+
@classmethod
|
| 107 |
+
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
| 108 |
+
"""Load config from pretrained path."""
|
| 109 |
+
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
| 110 |
+
return cls.from_dict(config_dict, **kwargs)
|
| 111 |
+
|
| 112 |
+
def to_dict(self) -> Dict[str, Any]:
|
| 113 |
+
"""Serialize config to dictionary."""
|
| 114 |
+
output = super().to_dict()
|
| 115 |
+
return output
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# Register for auto-loading
|
| 119 |
+
OculusConfig.register_for_auto_class()
|
oculus_unified_model/modeling_oculus.py
ADDED
|
@@ -0,0 +1,842 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Oculus Unified Model
|
| 3 |
+
|
| 4 |
+
HuggingFace-compatible vision-language model with:
|
| 5 |
+
- Multi-encoder vision (DINOv3 + SigLIP2)
|
| 6 |
+
- Trained projector for vision-to-language
|
| 7 |
+
- Optional reasoning with thinking traces
|
| 8 |
+
- Multiple output modes (Text, Point, Box, Polygon)
|
| 9 |
+
- Focus/Zoom tool calling for fine-grained perception
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import json
|
| 14 |
+
import warnings
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Optional, Tuple, List, Dict, Any, Union
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
import torch.nn.functional as F
|
| 23 |
+
from transformers import (
|
| 24 |
+
PreTrainedModel,
|
| 25 |
+
PretrainedConfig,
|
| 26 |
+
AutoImageProcessor,
|
| 27 |
+
AutoModel,
|
| 28 |
+
AutoTokenizer,
|
| 29 |
+
AutoModelForCausalLM,
|
| 30 |
+
GenerationConfig,
|
| 31 |
+
)
|
| 32 |
+
from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
|
| 33 |
+
from PIL import Image
|
| 34 |
+
|
| 35 |
+
from .configuration_oculus import OculusConfig
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ============================================================================
|
| 39 |
+
# Output Data Classes
|
| 40 |
+
# ============================================================================
|
| 41 |
+
|
| 42 |
+
@dataclass
|
| 43 |
+
class OculusOutput:
|
| 44 |
+
"""Base output class for Oculus model."""
|
| 45 |
+
text: Optional[str] = None
|
| 46 |
+
thinking_trace: Optional[str] = None
|
| 47 |
+
logits: Optional[torch.Tensor] = None
|
| 48 |
+
hidden_states: Optional[torch.Tensor] = None
|
| 49 |
+
vision_tokens: Optional[torch.Tensor] = None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@dataclass
|
| 53 |
+
class OculusTextOutput(OculusOutput):
|
| 54 |
+
"""Output for text/caption mode."""
|
| 55 |
+
pass
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
|
| 59 |
+
class OculusPointOutput(OculusOutput):
|
| 60 |
+
"""Output for point detection mode (counting objects)."""
|
| 61 |
+
points: Optional[List[Tuple[float, float]]] = None
|
| 62 |
+
labels: Optional[List[str]] = None
|
| 63 |
+
confidences: Optional[List[float]] = None
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass
|
| 67 |
+
class OculusBoxOutput(OculusOutput):
|
| 68 |
+
"""Output for bounding box detection mode."""
|
| 69 |
+
boxes: Optional[List[Tuple[float, float, float, float]]] = None # x1, y1, x2, y2
|
| 70 |
+
labels: Optional[List[str]] = None
|
| 71 |
+
confidences: Optional[List[float]] = None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@dataclass
|
| 75 |
+
class OculusPolygonOutput(OculusOutput):
|
| 76 |
+
"""Output for polygon/segmentation mode."""
|
| 77 |
+
polygons: Optional[List[List[Tuple[float, float]]]] = None
|
| 78 |
+
labels: Optional[List[str]] = None
|
| 79 |
+
mask: Optional[np.ndarray] = None
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ============================================================================
|
| 83 |
+
# Vision Encoder (DINOv3 + SigLIP2)
|
| 84 |
+
# ============================================================================
|
| 85 |
+
|
| 86 |
+
class OculusVisionEncoder(nn.Module):
    """
    Dual vision encoder combining DINOv3 and SigLIP2.

    DINOv3: excellent at semantic understanding and object boundaries.
    SigLIP2: strong at text/language alignment.

    Both encoders (and their image processors) are loaded lazily via
    ``load_encoders``; ``forward`` concatenates the two pooled feature
    vectors along the channel axis.
    """

    def __init__(self, config: "OculusConfig"):
        super().__init__()
        self.config = config

        # Populated lazily by load_encoders().
        self.dinov3 = None
        self.dinov3_processor = None
        self.siglip = None
        self.siglip_processor = None

        self._loaded = False

    def load_encoders(self, device: str = "cpu"):
        """Load vision encoders from HuggingFace (idempotent, with fallbacks)."""
        if self._loaded:
            return

        print("[Oculus] Loading vision encoders...")

        # DINOv3 — fall back to the public DINOv2-base checkpoint on failure.
        try:
            self.dinov3_processor = AutoImageProcessor.from_pretrained(
                self.config.dinov3_model_id
            )
            self.dinov3 = AutoModel.from_pretrained(
                self.config.dinov3_model_id
            ).eval().to(device)
            print(f" ✓ DINOv3: {self.config.dinov3_model_id}")
        except Exception as e:
            warnings.warn(f"Failed to load DINOv3: {e}")
            self.dinov3_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
            self.dinov3 = AutoModel.from_pretrained("facebook/dinov2-base").eval().to(device)
            print(" ✓ DINOv2-base (fallback)")

        # SigLIP2 — fall back to SigLIP-base on failure.
        try:
            self.siglip_processor = AutoImageProcessor.from_pretrained(
                self.config.siglip_model_id
            )
            self.siglip = AutoModel.from_pretrained(
                self.config.siglip_model_id
            ).eval().to(device)
            print(f" ✓ SigLIP: {self.config.siglip_model_id}")
        except Exception as e:
            warnings.warn(f"Failed to load SigLIP: {e}")
            from transformers import SiglipVisionModel
            self.siglip_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
            self.siglip = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224").eval().to(device)
            print(" ✓ SigLIP-base (fallback)")

        self._loaded = True

    @torch.no_grad()
    def forward(self, image: Union["Image.Image", torch.Tensor, np.ndarray]) -> torch.Tensor:
        """
        Encode image with both vision encoders and fuse features.

        Args:
            image: PIL image, numpy array, or tensor
                (tensor path assumes an HxWxC uint8-compatible layout —
                NOTE(review): confirm against callers).

        Returns:
            Fused vision features [batch, dinov3_dim + siglip_dim].
        """
        if not self._loaded:
            self.load_encoders()

        # Normalize whatever we received into an RGB PIL image.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        elif isinstance(image, torch.Tensor):
            image = Image.fromarray(image.cpu().numpy().astype(np.uint8))

        if isinstance(image, Image.Image):
            image = image.convert('RGB')

        device = next(self.dinov3.parameters()).device

        # --- DINOv3 branch: pooler output if present, else the CLS token.
        d_inputs = self.dinov3_processor(images=image, return_tensors="pt")
        d_inputs = {name: t.to(device) for name, t in d_inputs.items()}
        d_out = self.dinov3(**d_inputs)
        if hasattr(d_out, 'pooler_output') and d_out.pooler_output is not None:
            d_pooled = d_out.pooler_output
        else:
            d_pooled = d_out.last_hidden_state[:, 0]

        # --- SigLIP branch.
        s_inputs = self.siglip_processor(images=image, return_tensors="pt")
        s_inputs = {name: t.to(device) for name, t in s_inputs.items()}

        if hasattr(self.siglip, 'vision_model'):
            # NOTE(review): only the patch-embedding layer is run here (then
            # mean-pooled), not the full SigLIP vision tower. Presumably the
            # shipped projector weights were trained against exactly these
            # features — confirm before "fixing" this to a full forward pass.
            s_hidden = self.siglip.vision_model.embeddings(s_inputs['pixel_values'])
            s_pooled = s_hidden.mean(dim=1)
        else:
            s_out = self.siglip(**s_inputs)
            if hasattr(s_out, 'pooler_output'):
                s_pooled = s_out.pooler_output
            else:
                s_pooled = s_out.last_hidden_state[:, 0]

        # Channel-wise concatenation of the two pooled vectors.
        return torch.cat([d_pooled, s_pooled], dim=-1)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ============================================================================
|
| 192 |
+
# Vision Projector
|
| 193 |
+
# ============================================================================
|
| 194 |
+
|
| 195 |
+
class OculusProjector(nn.Module):
    """
    Projects fused vision features to language model token space.

    Maps [batch, fused_dim] -> [batch, num_tokens, lm_hidden_size] through a
    three-layer GELU MLP, followed by a LayerNorm over the embedding axis.
    """

    def __init__(self, config: "OculusConfig"):
        super().__init__()
        self.config = config

        in_dim = config.fused_vision_dim
        mid_dim = config.projector_hidden_dim
        self.num_tokens = config.num_vision_tokens
        self.embed_dim = config.lm_hidden_size

        self.fc1 = nn.Linear(in_dim, mid_dim)
        self.act1 = nn.GELU()
        self.fc2 = nn.Linear(mid_dim, mid_dim)
        self.act2 = nn.GELU()
        self.fc3 = nn.Linear(mid_dim, self.num_tokens * self.embed_dim)
        self.norm = nn.LayerNorm(self.embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Project vision features to token embeddings.

        Args:
            x: Vision features [batch, fused_dim]

        Returns:
            Vision tokens [batch, num_tokens, embed_dim]
        """
        hidden = self.act2(self.fc2(self.act1(self.fc1(x))))
        tokens = self.fc3(hidden).reshape(x.shape[0], self.num_tokens, self.embed_dim)
        return self.norm(tokens)

    @classmethod
    def from_pretrained(cls, path: str, config: "OculusConfig"):
        """Build a projector and, when present, load weights from <path>/projector.npz."""
        projector = cls(config)

        weights_path = Path(path) / "projector.npz"
        if weights_path.exists():
            archive = np.load(weights_path, allow_pickle=True)

            state_dict = {}
            for layer_name in archive.files:
                # Each archive entry is a dict: parameter name -> array
                # (possibly an MLX array, detected via its .tolist()).
                params = archive[layer_name].item()
                for param_name, value in params.items():
                    if hasattr(value, 'tolist'):
                        value = np.array(value.tolist())
                    state_dict[f"{layer_name}.{param_name}"] = torch.from_numpy(np.array(value))

            # strict=False: tolerate partial / extra keys in the archive.
            projector.load_state_dict(state_dict, strict=False)
            print(f" ✓ Loaded projector from {path}")

        return projector
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# ============================================================================
|
| 271 |
+
# Detection/Segmentation Heads
|
| 272 |
+
# ============================================================================
|
| 273 |
+
|
| 274 |
+
class OculusDetectionHead(nn.Module):
    """Head for bounding box detection: per-token class logits and boxes."""

    def __init__(self, config: "OculusConfig"):
        super().__init__()
        width = config.lm_hidden_size
        n_cls = config.num_detection_classes

        def two_layer(out_features):
            # Small MLP: width -> width//2 -> out_features with a GELU between.
            return nn.Sequential(
                nn.Linear(width, width // 2),
                nn.GELU(),
                nn.Linear(width // 2, out_features)
            )

        self.cls_head = two_layer(n_cls)
        self.box_head = two_layer(4)  # x1, y1, x2, y2

    def forward(self, vision_tokens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Predict boxes and classes from vision tokens.

        Returns:
            cls_logits: [batch, num_tokens, num_classes]
            box_coords: [batch, num_tokens, 4], sigmoid-normalized to [0, 1]
        """
        return self.cls_head(vision_tokens), self.box_head(vision_tokens).sigmoid()
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
class OculusPointHead(nn.Module):
    """Head for point detection (object counting): location, class, confidence."""

    def __init__(self, config: "OculusConfig"):
        super().__init__()
        width = config.lm_hidden_size

        # (x, y) location; squashed to [0, 1] by the sigmoid in forward().
        self.point_head = nn.Sequential(
            nn.Linear(width, width // 2),
            nn.GELU(),
            nn.Linear(width // 2, 2)
        )

        # Per-token class scores.
        self.cls_head = nn.Sequential(
            nn.Linear(width, width // 2),
            nn.GELU(),
            nn.Linear(width // 2, config.num_detection_classes)
        )

        # Scalar keep/drop confidence per token (narrower MLP).
        self.conf_head = nn.Sequential(
            nn.Linear(width, width // 4),
            nn.GELU(),
            nn.Linear(width // 4, 1)
        )

    def forward(self, vision_tokens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return (points, cls_logits, confidence); points and confidence are sigmoid outputs."""
        return (
            self.point_head(vision_tokens).sigmoid(),
            self.cls_head(vision_tokens),
            self.conf_head(vision_tokens).sigmoid(),
        )
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
class OculusSegmentationHead(nn.Module):
    """Head for polygon/mask segmentation: coarse 14x14 per-class mask logits."""

    def __init__(self, config: "OculusConfig"):
        super().__init__()
        width = config.lm_hidden_size
        self.num_classes = config.num_segmentation_classes

        # Maps the mean-pooled token vector to a flattened spatial mask.
        self.mask_head = nn.Sequential(
            nn.Linear(width, width),
            nn.GELU(),
            nn.Linear(width, 14 * 14 * self.num_classes)
        )

    def forward(self, vision_tokens: torch.Tensor) -> torch.Tensor:
        """vision_tokens [batch, num_tokens, hidden] -> mask logits [batch, num_classes, 14, 14]."""
        pooled = vision_tokens.mean(dim=1)  # average over the token axis
        logits = self.mask_head(pooled)
        return logits.reshape(vision_tokens.shape[0], self.num_classes, 14, 14)
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# ============================================================================
|
| 366 |
+
# Main Model
|
| 367 |
+
# ============================================================================
|
| 368 |
+
|
| 369 |
+
class OculusForConditionalGeneration(PreTrainedModel):
    """
    Oculus: Unified Vision-Language Model

    Features:
    - Multi-encoder vision (DINOv3 + SigLIP2)
    - Optional reasoning with thinking traces
    - Multiple output modes: Text, Point, Box, Polygon
    - Focus/Zoom tool calling for fine-grained perception

    Usage:
        ```python
        from oculus_unified_model import OculusForConditionalGeneration

        model = OculusForConditionalGeneration.from_pretrained("OceanirAI/oculus-0.2")

        # Caption mode
        output = model.generate(image, mode="text", prompt="Describe this image")

        # VQA mode
        output = model.generate(image, mode="text", prompt="What color is the cat?")

        # With reasoning
        output = model.generate(image, mode="text", prompt="Count the people", think=True)

        # Detection mode
        output = model.generate(image, mode="box", prompt="Find all cars")

        # Point mode (counting)
        output = model.generate(image, mode="point", prompt="Count the birds")

        # Segmentation mode
        output = model.generate(image, mode="polygon", prompt="Segment the road")
        ```
    """

    config_class = OculusConfig
    base_model_prefix = "oculus"

    def __init__(self, config: "OculusConfig"):
        super().__init__(config)
        self.config = config

        # Dual vision encoder (DINOv3 + SigLIP); weights load lazily.
        self.vision_encoder = OculusVisionEncoder(config)

        # Lazily-created linear adapter, used when the encoders' actual fused
        # feature width differs from config.fused_vision_dim (see encode_image).
        self.vision_adapter = None
        self._actual_vision_dim = None

        # Maps fused vision features to LM-space tokens.
        self.projector = OculusProjector(config)

        # Task-specific output heads.
        self.detection_head = OculusDetectionHead(config)
        self.point_head = OculusPointHead(config)
        self.segmentation_head = OculusSegmentationHead(config)

        # Language model components; populated by load_language_model().
        self.lm_tokenizer = None
        self.lm_model = None
        self._lm_loaded = False

        # Special tokens for the reasoning / focus systems.
        self.thinking_token = config.thinking_token
        self.thinking_end_token = config.thinking_end_token
        self.focus_token = config.focus_token
        self.focus_end_token = config.focus_end_token

    def load_language_model(self, device: str = "cpu"):
        """Load the BLIP captioning and VQA backends (idempotent, best-effort)."""
        if self._lm_loaded:
            return

        print("[Oculus] Loading language model...")

        try:
            # BLIP works well for captioning/VQA without a fine-tuned LM.
            from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering

            self.lm_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
            self.lm_caption_model = BlipForConditionalGeneration.from_pretrained(
                "Salesforce/blip-image-captioning-base"
            ).to(device)

            self.lm_vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
            self.lm_vqa_model = BlipForQuestionAnswering.from_pretrained(
                "Salesforce/blip-vqa-base"
            ).to(device)

            print(" ✓ BLIP (captioning + VQA)")
            self._lm_loaded = True

        except Exception as e:
            # Deliberately non-fatal: point/box/polygon modes still work.
            warnings.warn(f"Failed to load language model: {e}")

    def encode_image(self, image: Union["Image.Image", str, np.ndarray]) -> torch.Tensor:
        """
        Encode image to vision tokens.

        Args:
            image: PIL Image, file path, or numpy array

        Returns:
            Vision tokens [1, num_tokens, embed_dim]
        """
        # Load image if a path was given.
        if isinstance(image, str):
            image = Image.open(image)

        # Encode with both vision encoders (fused channel-wise).
        vision_features = self.vision_encoder(image)

        # Insert an adapter layer when the actual fused width does not match
        # the configured one (e.g. fallback encoders with different sizes).
        actual_dim = vision_features.shape[-1]
        expected_dim = self.config.fused_vision_dim

        if actual_dim != expected_dim:
            if self.vision_adapter is None or self._actual_vision_dim != actual_dim:
                print(f" [Adapter] Creating vision adapter: {actual_dim} -> {expected_dim}")
                adapter = nn.Linear(actual_dim, expected_dim)
                # Initialize with small weights; zero bias keeps the initial
                # projection an unbiased linear map.
                nn.init.xavier_uniform_(adapter.weight)
                nn.init.zeros_(adapter.bias)
                # Fix: keep the adapter on the same device as the features it
                # transforms (it was previously always created on CPU).
                self.vision_adapter = adapter.to(vision_features.device)
                self._actual_vision_dim = actual_dim

            vision_features = self.vision_adapter(vision_features)

        # Project to language token space.
        return self.projector(vision_features)

    def _generate_thinking_trace(
        self,
        image: "Image.Image",
        prompt: str,
        max_tokens: int = 256
    ) -> str:
        """
        Generate a thinking/reasoning trace before answering.

        This enables multi-step reasoning for complex tasks. Falls back to a
        canned sentence when no language model is loaded.
        """
        thinking_prompt = f"""Let me think about this step by step:
1. First, I'll analyze what I see in the image.
2. Then, I'll consider the question: "{prompt}"
3. Finally, I'll formulate my answer.

Observation: """

        # Generate reasoning (simplified for now).
        if self._lm_loaded and hasattr(self, 'lm_caption_model'):
            inputs = self.lm_processor(image, thinking_prompt, return_tensors="pt")
            inputs = {k: v.to(self.lm_caption_model.device) for k, v in inputs.items()}

            with torch.no_grad():
                out = self.lm_caption_model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    do_sample=True,
                    temperature=0.7
                )
            thinking = self.lm_processor.decode(out[0], skip_special_tokens=True)
        else:
            thinking = "I observe the image and analyze its contents."

        return thinking

    def _detect_focus_regions(
        self,
        image: "Image.Image",
        prompt: str
    ) -> List[Tuple[int, int, int, int]]:
        """
        Detect regions that need closer inspection (Focus/Zoom system).

        Returns list of (x1, y1, x2, y2) crop regions.
        Simplified placeholder: returns the full image as a single region.
        A full implementation would use attention maps to find regions of
        interest.
        """
        w, h = image.size
        return [(0, 0, w, h)]

    def generate(
        self,
        image: Union["Image.Image", str, np.ndarray],
        prompt: str = "Describe this image",
        mode: str = "text",
        think: bool = False,
        focus: bool = False,
        max_new_tokens: Optional[int] = None,
        temperature: float = 0.7,
        return_thinking: bool = True,
        **kwargs
    ) -> Union["OculusTextOutput", "OculusPointOutput", "OculusBoxOutput", "OculusPolygonOutput"]:
        """
        Generate output from image.

        Args:
            image: Input image (PIL, path, or array)
            prompt: Text prompt/question
            mode: Output mode ("text", "point", "box", "polygon")
            think: Enable reasoning traces
            focus: Enable zoom/crop for fine-grained perception
            max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            return_thinking: Include thinking trace in output

        Returns:
            Mode-specific output dataclass

        Raises:
            ValueError: if `mode` is not one of the four supported modes.
        """
        # Lazy-load backends as needed for the requested mode.
        self.vision_encoder.load_encoders()
        if mode == "text":
            self.load_language_model()

        # Normalize the image input to an RGB PIL image.
        if isinstance(image, str):
            image = Image.open(image).convert('RGB')
        elif isinstance(image, np.ndarray):
            image = Image.fromarray(image).convert('RGB')

        # Encode image to vision tokens.
        vision_tokens = self.encode_image(image)

        # Generate thinking trace if enabled.
        thinking_trace = None
        if think and self.config.reasoning_enabled:
            thinking_trace = self._generate_thinking_trace(image, prompt)

        # Focus system: placeholder — regions are detected but not yet
        # re-encoded as zoomed crops.
        if focus and self.config.enable_focus:
            focus_regions = self._detect_focus_regions(image, prompt)

        # Mode-specific generation.
        if mode == "text":
            return self._generate_text(image, prompt, vision_tokens, thinking_trace, max_new_tokens, **kwargs)
        elif mode == "point":
            return self._generate_points(vision_tokens, thinking_trace, **kwargs)
        elif mode == "box":
            return self._generate_boxes(vision_tokens, thinking_trace, **kwargs)
        elif mode == "polygon":
            return self._generate_polygons(vision_tokens, thinking_trace, **kwargs)
        else:
            raise ValueError(f"Unknown mode: {mode}")

    def _generate_text(
        self,
        image: "Image.Image",
        prompt: str,
        vision_tokens: torch.Tensor,
        thinking_trace: Optional[str],
        max_new_tokens: Optional[int],
        **kwargs
    ) -> "OculusTextOutput":
        """Generate text output (caption or VQA) via the BLIP backends."""

        max_tokens = max_new_tokens or self.config.max_new_tokens

        # Crude heuristic to route between VQA and captioning.
        is_question = any(q in prompt.lower() for q in ["what", "where", "who", "how", "why", "is", "are", "does", "do", "can", "?"])

        if is_question and hasattr(self, 'lm_vqa_model'):
            # VQA mode. Fix: move inputs to the VQA model's own device; the old
            # code used the vision tokens' device, which can differ from where
            # the BLIP model lives and caused device-mismatch errors.
            device = self.lm_vqa_model.device
            inputs = self.lm_vqa_processor(image, prompt, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                out = self.lm_vqa_model.generate(**inputs, max_new_tokens=50)
            text = self.lm_vqa_processor.decode(out[0], skip_special_tokens=True)
        else:
            # Caption mode. Same device fix as above.
            device = self.lm_caption_model.device
            inputs = self.lm_processor(image, prompt, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                out = self.lm_caption_model.generate(**inputs, max_new_tokens=max_tokens)
            text = self.lm_processor.decode(out[0], skip_special_tokens=True)

        return OculusTextOutput(
            text=text,
            thinking_trace=thinking_trace,
            vision_tokens=vision_tokens
        )

    def _generate_points(
        self,
        vision_tokens: torch.Tensor,
        thinking_trace: Optional[str],
        threshold: float = 0.5,
        **kwargs
    ) -> "OculusPointOutput":
        """Generate point detections, keeping tokens above `threshold` confidence."""

        points, cls_logits, confidence = self.point_head(vision_tokens)

        # Boolean keep-mask per token, from the confidence head.
        mask = confidence.squeeze(-1) > threshold

        filtered_points = []
        filtered_labels = []
        filtered_conf = []

        for i in range(vision_tokens.shape[0]):
            token_mask = mask[i]
            pts = points[i][token_mask].detach().cpu().numpy().tolist()
            confs = confidence[i][token_mask].squeeze(-1).detach().cpu().numpy().tolist()
            cls_ids = cls_logits[i][token_mask].argmax(dim=-1).detach().cpu().numpy().tolist()

            filtered_points.extend([tuple(p) for p in pts])
            filtered_conf.extend(confs)
            filtered_labels.extend([str(c) for c in cls_ids])

        return OculusPointOutput(
            points=filtered_points,
            labels=filtered_labels,
            confidences=filtered_conf,
            thinking_trace=thinking_trace,
            vision_tokens=vision_tokens
        )

    def _generate_boxes(
        self,
        vision_tokens: torch.Tensor,
        thinking_trace: Optional[str],
        threshold: float = 0.3,
        **kwargs
    ) -> "OculusBoxOutput":
        """Generate bounding box detections above `threshold` class confidence."""

        cls_logits, box_coords = self.detection_head(vision_tokens)

        # Use the max softmax probability as the per-token confidence.
        confidence = F.softmax(cls_logits, dim=-1).max(dim=-1).values

        filtered_boxes = []
        filtered_labels = []
        filtered_conf = []

        for i in range(vision_tokens.shape[0]):
            mask = confidence[i] > threshold
            boxes = box_coords[i][mask].detach().cpu().numpy()
            confs = confidence[i][mask].detach().cpu().numpy().tolist()
            cls_ids = cls_logits[i][mask].argmax(dim=-1).detach().cpu().numpy().tolist()

            filtered_boxes.extend([tuple(b) for b in boxes])
            filtered_conf.extend(confs)
            filtered_labels.extend([str(c) for c in cls_ids])

        return OculusBoxOutput(
            boxes=filtered_boxes,
            labels=filtered_labels,
            confidences=filtered_conf,
            thinking_trace=thinking_trace,
            vision_tokens=vision_tokens
        )

    def _generate_polygons(
        self,
        vision_tokens: torch.Tensor,
        thinking_trace: Optional[str],
        **kwargs
    ) -> "OculusPolygonOutput":
        """Generate polygon/mask segmentation (placeholder polygon extraction)."""

        mask_logits = self.segmentation_head(vision_tokens)

        # Dense class-id mask via per-pixel argmax.
        mask = mask_logits.argmax(dim=1).detach().cpu().numpy()

        # Convert to polygons (simplified). A full implementation would use
        # cv2.findContours; here every non-background class gets a unit square.
        polygons = []
        labels = []

        unique_classes = np.unique(mask[0])
        for cls_id in unique_classes:
            if cls_id == 0:  # Skip background
                continue
            labels.append(str(cls_id))
            polygons.append([(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)])

        return OculusPolygonOutput(
            polygons=polygons,
            labels=labels,
            mask=mask[0],
            thinking_trace=thinking_trace,
            vision_tokens=vision_tokens
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """
        Load model from pretrained weights.

        Args:
            pretrained_model_name_or_path: HuggingFace repo ID or local path.
                NOTE(review): only a local directory is actually read here —
                no hub download is attempted; confirm intended behavior.
        """
        path = Path(pretrained_model_name_or_path)

        # Load config; the saved JSON carries the projector's geometry.
        config_path = path / "config.json"
        if config_path.exists():
            import json
            with open(config_path) as f:
                proj_config = json.load(f)

            config = OculusConfig(
                dinov3_hidden_size=proj_config.get("fused_dim", 2048) - 768,  # Infer from fused
                siglip_hidden_size=768,
                projector_hidden_dim=proj_config.get("hidden_dim", 2048),
                num_vision_tokens=proj_config.get("num_tokens", 64),
                lm_hidden_size=proj_config.get("embed_dim", 1536),
            )
        else:
            config = OculusConfig()

        model = cls(config)

        # Projector weights (numpy archive, MLX-compatible layout).
        projector_path = path / "projector.npz"
        if projector_path.exists():
            model.projector = OculusProjector.from_pretrained(path, config)

        # Detection/point/segmentation heads, if available.
        heads_path = path / "heads.pth"
        if heads_path.exists():
            # Fix: weights_only=True — the file holds plain tensor state dicts
            # and may come from an untrusted source; avoid arbitrary-pickle
            # code execution on load.
            heads_state = torch.load(heads_path, map_location="cpu", weights_only=True)
            model.detection_head.load_state_dict(heads_state.get("detection", {}), strict=False)
            model.point_head.load_state_dict(heads_state.get("point", {}), strict=False)
            model.segmentation_head.load_state_dict(heads_state.get("segmentation", {}), strict=False)

        return model

    def save_pretrained(self, save_directory: str):
        """Save config, projector (as .npz for MLX compat) and heads (as .pth)."""
        path = Path(save_directory)
        path.mkdir(parents=True, exist_ok=True)

        # Save config.
        self.config.save_pretrained(path)

        # Save projector as a numpy archive: one entry per layer, each a dict
        # of parameter name -> ndarray (mirrors OculusProjector.from_pretrained).
        projector_state = self.projector.state_dict()
        np_weights = {}
        for k, v in projector_state.items():
            parts = k.split(".")
            layer = parts[0]
            param = ".".join(parts[1:])
            if layer not in np_weights:
                np_weights[layer] = {}
            np_weights[layer][param] = v.cpu().numpy()
        np.savez(path / "projector.npz", **np_weights)

        # Save heads.
        torch.save({
            "detection": self.detection_head.state_dict(),
            "point": self.point_head.state_dict(),
            "segmentation": self.segmentation_head.state_dict(),
        }, path / "heads.pth")

        print(f"✓ Saved model to {path}")
|
| 839 |
+
|
| 840 |
+
|
| 841 |
+
# Register for auto-loading through the transformers Auto* machinery, so the
# model can be resolved as an AutoModelForVision2Seq custom class.
OculusForConditionalGeneration.register_for_auto_class("AutoModelForVision2Seq")
|
oculus_unified_model/processing_oculus.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Oculus Processor
|
| 3 |
+
|
| 4 |
+
Handles image and text preprocessing for the Oculus model.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Optional, Union, List, Dict, Any
|
| 8 |
+
from PIL import Image
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
from transformers import ProcessorMixin, BatchFeature
|
| 12 |
+
from transformers.image_utils import ImageInput
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class OculusProcessor(ProcessorMixin):
    """
    Processor for Oculus model.

    Combines image processing and text tokenization.

    Usage:
    ```python
    processor = OculusProcessor.from_pretrained("OceanirAI/oculus-0.2")

    # Process inputs
    inputs = processor(
        images=image,
        text="What is in this image?",
        mode="text",
        return_tensors="pt"
    )
    ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        **kwargs
    ):
        """
        Args:
            image_processor: HF image processor (or None for raw passthrough).
            tokenizer: HF tokenizer (or None for raw-text passthrough).
            **kwargs: Optional overrides for the special tokens below
                (``thinking_token``, ``thinking_end_token``, ``focus_token``,
                ``focus_end_token``).
        """
        super().__init__(image_processor, tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Special tokens delimiting the optional reasoning trace and focus span.
        self.thinking_token = kwargs.get("thinking_token", "<think>")
        self.thinking_end_token = kwargs.get("thinking_end_token", "</think>")
        self.focus_token = kwargs.get("focus_token", "<focus>")
        self.focus_end_token = kwargs.get("focus_end_token", "</focus>")

        # Output mode tokens prepended to the prompt by _format_prompt().
        self.mode_tokens = {
            "text": "<text>",
            "point": "<point>",
            "box": "<box>",
            "polygon": "<polygon>",
        }

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[str, List[str]] = None,
        mode: str = "text",
        think: bool = False,
        return_tensors: Optional[str] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process images and text for Oculus model.

        Args:
            images: Input image(s)
            text: Input text prompt(s)
            mode: Output mode ("text", "point", "box", "polygon")
            think: Enable reasoning mode
            return_tensors: Tensor format ("pt", "np", etc.)
            **kwargs: Forwarded to the tokenizer call.

        Returns:
            BatchFeature with processed inputs; also carries the plain
            ``"mode"`` (str) and ``"think"`` (bool) entries.
        """
        # --- Images ---
        if images is not None:
            if self.image_processor is not None:
                image_features = self.image_processor(images, return_tensors=return_tensors)
            else:
                # No HF image processor: pass the PIL image(s) through as-is.
                if isinstance(images, Image.Image):
                    images = [images]
                image_features = {"pixel_values": images}
        else:
            image_features = {}

        # --- Text ---
        if text is not None:
            # Prepend mode / thinking special tokens to the raw prompt.
            processed_text = self._format_prompt(text, mode, think)

            if self.tokenizer is not None:
                text_features = self.tokenizer(
                    processed_text,
                    return_tensors=return_tensors,
                    padding=True,
                    truncation=True,
                    **kwargs
                )
            else:
                text_features = {"text": processed_text}
        else:
            text_features = {}

        # Build the BatchFeature from model inputs only, THEN attach the
        # plain-Python "mode"/"think" entries.  Including them in `data`
        # with tensor_type set would make BatchFeature try (and fail) to
        # convert a str/bool into a tensor when return_tensors="pt".
        features = BatchFeature(
            data={**image_features, **text_features},
            tensor_type=return_tensors
        )
        features["mode"] = mode
        features["think"] = think
        return features

    def _format_prompt(
        self,
        text: Union[str, List[str]],
        mode: str,
        think: bool
    ) -> Union[str, List[str]]:
        """Format prompt with special tokens (mode token, optional <think>)."""

        def format_single(t: str) -> str:
            parts = []

            # Add mode token (unknown modes get no token, matching lookup).
            if mode in self.mode_tokens:
                parts.append(self.mode_tokens[mode])

            # Add thinking token if enabled
            if think:
                parts.append(self.thinking_token)

            # Add prompt
            parts.append(t)

            return " ".join(parts)

        if isinstance(text, str):
            return format_single(text)
        else:
            return [format_single(t) for t in text]

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> Tuple[str, Optional[str]]:
        """
        Decode token IDs to text, splitting out an optional thinking trace.

        Args:
            token_ids: Token IDs to decode (any format the tokenizer accepts).
            skip_special_tokens: Forwarded to the tokenizer.
            **kwargs: Forwarded to the tokenizer.

        Returns:
            ``(text, thinking_trace)`` — ``thinking_trace`` is ``None`` when
            the decoded text contains no ``<think>...</think>`` span;
            otherwise the trace is stripped out of ``text``.
        """
        if self.tokenizer is not None:
            text = self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens, **kwargs)
        else:
            text = str(token_ids)

        # Parse thinking trace if present
        thinking_trace = None
        if self.thinking_token in text and self.thinking_end_token in text:
            start = text.find(self.thinking_token) + len(self.thinking_token)
            end = text.find(self.thinking_end_token)
            thinking_trace = text[start:end].strip()
            text = text[end + len(self.thinking_end_token):].strip()

        return text, thinking_trace

    def batch_decode(
        self,
        token_ids,
        skip_special_tokens: bool = True,
        **kwargs
    ) -> List[Tuple[str, Optional[str]]]:
        """
        Decode a batch of token ID sequences.

        Returns:
            One ``(text, thinking_trace)`` pair per sequence (see ``decode``).
        """
        return [
            self.decode(ids, skip_special_tokens=skip_special_tokens, **kwargs)
            for ids in token_ids
        ]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """
        Load processor from pretrained.

        Falls back to a bare processor (no HF components) when the image
        processor or tokenizer cannot be loaded.
        """
        try:
            from transformers import AutoImageProcessor, AutoTokenizer

            image_processor = AutoImageProcessor.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs
            )
            return cls(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
        except Exception:
            # `except Exception` (not bare `except:`) so KeyboardInterrupt /
            # SystemExit still propagate.
            # Return basic processor without HF components
            return cls(**kwargs)

    def save_pretrained(self, save_directory: str, **kwargs):
        """Save processor components (if present) to ``save_directory``."""
        if self.image_processor is not None:
            self.image_processor.save_pretrained(save_directory)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(save_directory)
trained_components/heads.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6227a8bdb1d7037a9667cdec18061af9c4f3771fd4f62b0afbe68c5e44bdf3d1
|
| 3 |
+
size 36454441
|
trained_components/projector.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94ed66d364bfdb636d28d537802ef16dfcd3407ed750b30688628c28f7684562
|
| 3 |
+
size 839285719
|
vision_encoders/dinov2-large/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vision_encoders/dinov2-large/README.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- dino
|
| 5 |
+
- vision
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Vision Transformer (large-sized model) trained using DINOv2
|
| 9 |
+
|
| 10 |
+
Vision Transformer (ViT) model trained using the DINOv2 method. It was introduced in the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Oquab et al. and first released in [this repository](https://github.com/facebookresearch/dinov2).
|
| 11 |
+
|
| 12 |
+
Disclaimer: The team releasing DINOv2 did not write a model card for this model so this model card has been written by the Hugging Face team.
|
| 13 |
+
|
| 14 |
+
## Model description
|
| 15 |
+
|
| 16 |
+
The Vision Transformer (ViT) is a transformer encoder model (BERT-like) pretrained on a large collection of images in a self-supervised fashion.
|
| 17 |
+
|
| 18 |
+
Images are presented to the model as a sequence of fixed-size patches, which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder.
|
| 19 |
+
|
| 20 |
+
Note that this model does not include any fine-tuned heads.
|
| 21 |
+
|
| 22 |
+
By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image.
|
| 23 |
+
|
| 24 |
+
## Intended uses & limitations
|
| 25 |
+
|
| 26 |
+
You can use the raw model for feature extraction. See the [model hub](https://huggingface.co/models?search=facebook/dinov2) to look for
|
| 27 |
+
fine-tuned versions on a task that interests you.
|
| 28 |
+
|
| 29 |
+
### How to use
|
| 30 |
+
|
| 31 |
+
Here is how to use this model:
|
| 32 |
+
|
| 33 |
+
```python
|
| 34 |
+
from transformers import AutoImageProcessor, AutoModel
|
| 35 |
+
from PIL import Image
|
| 36 |
+
import requests
|
| 37 |
+
|
| 38 |
+
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
|
| 39 |
+
image = Image.open(requests.get(url, stream=True).raw)
|
| 40 |
+
|
| 41 |
+
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-large')
|
| 42 |
+
model = AutoModel.from_pretrained('facebook/dinov2-large')
|
| 43 |
+
|
| 44 |
+
inputs = processor(images=image, return_tensors="pt")
|
| 45 |
+
outputs = model(**inputs)
|
| 46 |
+
last_hidden_states = outputs.last_hidden_state
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
### BibTeX entry and citation info
|
| 50 |
+
|
| 51 |
+
```bibtex
|
| 52 |
+
@misc{oquab2023dinov2,
|
| 53 |
+
title={DINOv2: Learning Robust Visual Features without Supervision},
|
| 54 |
+
author={Maxime Oquab and Timothée Darcet and Théo Moutakanni and Huy Vo and Marc Szafraniec and Vasil Khalidov and Pierre Fernandez and Daniel Haziza and Francisco Massa and Alaaeldin El-Nouby and Mahmoud Assran and Nicolas Ballas and Wojciech Galuba and Russell Howes and Po-Yao Huang and Shang-Wen Li and Ishan Misra and Michael Rabbat and Vasu Sharma and Gabriel Synnaeve and Hu Xu and Hervé Jegou and Julien Mairal and Patrick Labatut and Armand Joulin and Piotr Bojanowski},
|
| 55 |
+
year={2023},
|
| 56 |
+
eprint={2304.07193},
|
| 57 |
+
archivePrefix={arXiv},
|
| 58 |
+
primaryClass={cs.CV}
|
| 59 |
+
}
|
| 60 |
+
```
|
vision_encoders/dinov2-large/config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Dinov2Model"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.0,
|
| 6 |
+
"drop_path_rate": 0.0,
|
| 7 |
+
"hidden_act": "gelu",
|
| 8 |
+
"hidden_dropout_prob": 0.0,
|
| 9 |
+
"hidden_size": 1024,
|
| 10 |
+
"image_size": 518,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"layer_norm_eps": 1e-06,
|
| 13 |
+
"layerscale_value": 1.0,
|
| 14 |
+
"mlp_ratio": 4,
|
| 15 |
+
"model_type": "dinov2",
|
| 16 |
+
"num_attention_heads": 16,
|
| 17 |
+
"num_channels": 3,
|
| 18 |
+
"num_hidden_layers": 24,
|
| 19 |
+
"patch_size": 14,
|
| 20 |
+
"qkv_bias": true,
|
| 21 |
+
"torch_dtype": "float32",
|
| 22 |
+
"transformers_version": "4.31.0.dev0",
|
| 23 |
+
"use_swiglu_ffn": false
|
| 24 |
+
}
|
vision_encoders/dinov2-large/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:399fba97a95f22c36834418bc69373364a99af3a1153da1c0fb31db567c92e23
|
| 3 |
+
size 1217522888
|
vision_encoders/dinov2-large/preprocessor_config.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": {
|
| 3 |
+
"height": 224,
|
| 4 |
+
"width": 224
|
| 5 |
+
},
|
| 6 |
+
"do_center_crop": true,
|
| 7 |
+
"do_convert_rgb": true,
|
| 8 |
+
"do_normalize": true,
|
| 9 |
+
"do_rescale": true,
|
| 10 |
+
"do_resize": true,
|
| 11 |
+
"image_mean": [
|
| 12 |
+
0.485,
|
| 13 |
+
0.456,
|
| 14 |
+
0.406
|
| 15 |
+
],
|
| 16 |
+
"image_processor_type": "BitImageProcessor",
|
| 17 |
+
"image_std": [
|
| 18 |
+
0.229,
|
| 19 |
+
0.224,
|
| 20 |
+
0.225
|
| 21 |
+
],
|
| 22 |
+
"resample": 3,
|
| 23 |
+
"rescale_factor": 0.00392156862745098,
|
| 24 |
+
"size": {
|
| 25 |
+
"shortest_edge": 256
|
| 26 |
+
}
|
| 27 |
+
}
|
vision_encoders/dinov2-large/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8320e4778a7f8850d10f30d97e9138438e1851af1576fea789c43746140cc655
|
| 3 |
+
size 1217614569
|
vision_encoders/siglip-base/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vision_encoders/siglip-base/README.md
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- vision
|
| 5 |
+
widget:
|
| 6 |
+
- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
|
| 7 |
+
candidate_labels: playing music, playing sports
|
| 8 |
+
example_title: Cat & Dog
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# SigLIP (base-sized model)
|
| 12 |
+
|
| 13 |
+
SigLIP model pre-trained on WebLi at resolution 224x224. It was introduced in the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Zhai et al. and first released in [this repository](https://github.com/google-research/big_vision).
|
| 14 |
+
|
| 15 |
+
Disclaimer: The team releasing SigLIP did not write a model card for this model so this model card has been written by the Hugging Face team.
|
| 16 |
+
|
| 17 |
+
## Model description
|
| 18 |
+
|
| 19 |
+
SigLIP is [CLIP](https://huggingface.co/docs/transformers/model_doc/clip), a multimodal model, with a better loss function. The sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. This allows further scaling up the batch size, while also performing better at smaller batch sizes.
|
| 20 |
+
|
| 21 |
+
A TLDR of SigLIP by one of the authors can be found [here](https://twitter.com/giffmana/status/1692641733459267713).
|
| 22 |
+
|
| 23 |
+
## Intended uses & limitations
|
| 24 |
+
|
| 25 |
+
You can use the raw model for tasks like zero-shot image classification and image-text retrieval. See the [model hub](https://huggingface.co/models?search=google/siglip) to look for
|
| 26 |
+
other versions on a task that interests you.
|
| 27 |
+
|
| 28 |
+
### How to use
|
| 29 |
+
|
| 30 |
+
Here is how to use this model to perform zero-shot image classification:
|
| 31 |
+
|
| 32 |
+
```python
|
| 33 |
+
from PIL import Image
|
| 34 |
+
import requests
|
| 35 |
+
from transformers import AutoProcessor, AutoModel
|
| 36 |
+
import torch
|
| 37 |
+
|
| 38 |
+
model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
|
| 39 |
+
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
|
| 40 |
+
|
| 41 |
+
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 42 |
+
image = Image.open(requests.get(url, stream=True).raw)
|
| 43 |
+
|
| 44 |
+
texts = ["a photo of 2 cats", "a photo of 2 dogs"]
|
| 45 |
+
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
|
| 46 |
+
|
| 47 |
+
with torch.no_grad():
|
| 48 |
+
outputs = model(**inputs)
|
| 49 |
+
|
| 50 |
+
logits_per_image = outputs.logits_per_image
|
| 51 |
+
probs = torch.sigmoid(logits_per_image) # these are the probabilities
|
| 52 |
+
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Alternatively, one can leverage the pipeline API which abstracts away the complexity for the user:
|
| 56 |
+
|
| 57 |
+
```python
|
| 58 |
+
from transformers import pipeline
|
| 59 |
+
from PIL import Image
|
| 60 |
+
import requests
|
| 61 |
+
|
| 62 |
+
# load pipe
|
| 63 |
+
image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224")
|
| 64 |
+
|
| 65 |
+
# load image
|
| 66 |
+
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
|
| 67 |
+
image = Image.open(requests.get(url, stream=True).raw)
|
| 68 |
+
|
| 69 |
+
# inference
|
| 70 |
+
outputs = image_classifier(image, candidate_labels=["2 cats", "a plane", "a remote"])
|
| 71 |
+
outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
|
| 72 |
+
print(outputs)
|
| 73 |
+
```
|
| 74 |
+
For more code examples, we refer to the [documentation](https://huggingface.co/transformers/main/model_doc/siglip.html#).
|
| 75 |
+
|
| 76 |
+
## Training procedure
|
| 77 |
+
|
| 78 |
+
### Training data
|
| 79 |
+
|
| 80 |
+
SigLIP is pre-trained on the English image-text pairs of the WebLI dataset [(Chen et al., 2023)](https://arxiv.org/abs/2209.06794).
|
| 81 |
+
|
| 82 |
+
### Preprocessing
|
| 83 |
+
|
| 84 |
+
Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5).
|
| 85 |
+
|
| 86 |
+
Texts are tokenized and padded to the same length (64 tokens).
|
| 87 |
+
|
| 88 |
+
### Compute
|
| 89 |
+
|
| 90 |
+
The model was trained on 16 TPU-v4 chips for three days.
|
| 91 |
+
|
| 92 |
+
## Evaluation results
|
| 93 |
+
|
| 94 |
+
Evaluation of SigLIP compared to CLIP is shown below (taken from the paper).
|
| 95 |
+
|
| 96 |
+
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip_table.jpeg"
|
| 97 |
+
alt="drawing" width="600"/>
|
| 98 |
+
|
| 99 |
+
### BibTeX entry and citation info
|
| 100 |
+
|
| 101 |
+
```bibtex
|
| 102 |
+
@misc{zhai2023sigmoid,
|
| 103 |
+
title={Sigmoid Loss for Language Image Pre-Training},
|
| 104 |
+
author={Xiaohua Zhai and Basil Mustafa and Alexander Kolesnikov and Lucas Beyer},
|
| 105 |
+
year={2023},
|
| 106 |
+
eprint={2303.15343},
|
| 107 |
+
archivePrefix={arXiv},
|
| 108 |
+
primaryClass={cs.CV}
|
| 109 |
+
}
|
| 110 |
+
```
|
vision_encoders/siglip-base/config.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"SiglipModel"
|
| 4 |
+
],
|
| 5 |
+
"initializer_factor": 1.0,
|
| 6 |
+
"model_type": "siglip",
|
| 7 |
+
"text_config": {
|
| 8 |
+
"hidden_size": 768,
|
| 9 |
+
"intermediate_size": 3072,
|
| 10 |
+
"model_type": "siglip_text_model",
|
| 11 |
+
"num_attention_heads": 12,
|
| 12 |
+
"vocab_size": 32000
|
| 13 |
+
},
|
| 14 |
+
"torch_dtype": "float32",
|
| 15 |
+
"transformers_version": "4.37.0.dev0",
|
| 16 |
+
"vision_config": {
|
| 17 |
+
"model_type": "siglip_vision_model",
|
| 18 |
+
"patch_size": 16
|
| 19 |
+
}
|
| 20 |
+
}
|
vision_encoders/siglip-base/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c63cb7d1f2e95ba501893cbb8faeb4ea9a3af295498d35097126228659c2af8
|
| 3 |
+
size 812672320
|
vision_encoders/siglip-base/preprocessor_config.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"do_rescale": true,
|
| 4 |
+
"do_resize": true,
|
| 5 |
+
"image_mean": [
|
| 6 |
+
0.5,
|
| 7 |
+
0.5,
|
| 8 |
+
0.5
|
| 9 |
+
],
|
| 10 |
+
"image_processor_type": "SiglipImageProcessor",
|
| 11 |
+
"image_std": [
|
| 12 |
+
0.5,
|
| 13 |
+
0.5,
|
| 14 |
+
0.5
|
| 15 |
+
],
|
| 16 |
+
"processor_class": "SiglipProcessor",
|
| 17 |
+
"resample": 3,
|
| 18 |
+
"rescale_factor": 0.00392156862745098,
|
| 19 |
+
"size": {
|
| 20 |
+
"height": 224,
|
| 21 |
+
"width": 224
|
| 22 |
+
}
|
| 23 |
+
}
|
vision_encoders/siglip-base/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb93f7f526b0a1b0e5f0612630f142bc5b6c05d329edff70478ff0a83e2bcd6e
|
| 3 |
+
size 812762989
|
vision_encoders/siglip-base/special_tokens_map.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eos_token": {
|
| 3 |
+
"content": "</s>",
|
| 4 |
+
"lstrip": true,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": true,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"pad_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": true,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": true,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"unk_token": {
|
| 17 |
+
"content": "<unk>",
|
| 18 |
+
"lstrip": true,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": true,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
+
}
|
vision_encoders/siglip-base/spiece.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e5036bed065526c3c212dfbe288752391797c4bb1a284aa18c9a0b23fcaf8ec
|
| 3 |
+
size 798330
|
vision_encoders/siglip-base/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vision_encoders/siglip-base/tokenizer_config.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"1": {
|
| 4 |
+
"content": "</s>",
|
| 5 |
+
"lstrip": true,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": true,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"2": {
|
| 12 |
+
"content": "<unk>",
|
| 13 |
+
"lstrip": true,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": true,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"additional_special_tokens": [],
|
| 21 |
+
"clean_up_tokenization_spaces": true,
|
| 22 |
+
"do_lower_case": true,
|
| 23 |
+
"eos_token": "</s>",
|
| 24 |
+
"model_input_names": [
|
| 25 |
+
"input_ids"
|
| 26 |
+
],
|
| 27 |
+
"model_max_length": 64,
|
| 28 |
+
"pad_token": "</s>",
|
| 29 |
+
"processor_class": "SiglipProcessor",
|
| 30 |
+
"sp_model_kwargs": {},
|
| 31 |
+
"tokenizer_class": "SiglipTokenizer",
|
| 32 |
+
"unk_token": "<unk>"
|
| 33 |
+
}
|