"""Oculus Unified Vision-Language Model.

A HuggingFace-compatible multimodal model combining:

- DINOv3 (vision encoder)
- SigLIP2 (vision encoder)
- Trained Projector (vision-to-language bridge)
- LLM (language generation)

Supports:

- Image captioning
- Visual question answering
- Object detection (Box mode)
- Point detection (counting)
- Polygon segmentation
- Optional reasoning with thinking traces
"""

# Import order is kept as in the original file; the submodules are not visible
# here, so any ordering dependency between them cannot be ruled out.
from .modeling_oculus import (
    OculusForConditionalGeneration,
    OculusVisionEncoder,
    OculusProjector,
)
from .configuration_oculus import OculusConfig
from .processing_oculus import OculusProcessor

# Names exposed by a star-import of this package; mirrors the imports above.
__all__ = [
    "OculusForConditionalGeneration",
    "OculusVisionEncoder",
    "OculusProjector",
    "OculusConfig",
    "OculusProcessor",
]

# Package version string.
__version__ = "0.2.0"