"""Oculus unified vision-language model package.

A HuggingFace-compatible multimodal model that combines:

- DINOv3 (vision encoder)
- SigLIP2 (vision encoder)
- Trained Projector (vision-to-language bridge)
- LLM (language generation)

Supported tasks:

- Image captioning
- Visual question answering
- Object detection (Box mode)
- Point detection (counting)
- Polygon segmentation
- Optional reasoning with thinking traces
"""

# Re-export the public classes from their defining submodules so callers can
# import them straight from the package root.
from .modeling_oculus import (
    OculusForConditionalGeneration,
    OculusVisionEncoder,
    OculusProjector,
)
from .configuration_oculus import OculusConfig
from .processing_oculus import OculusProcessor

# Explicit public API: exactly the names re-exported above.
__all__ = [
    "OculusForConditionalGeneration",
    "OculusVisionEncoder",
    "OculusProjector",
    "OculusConfig",
    "OculusProcessor",
]

# Package version string.
__version__ = "0.2.0"