Oculus / oculus_inference.py
import sys
from io import BytesIO
from pathlib import Path
from typing import Union, List, Dict, Any, Optional

import requests
import torch
from PIL import Image

# Ensure the Oculus root is on the import path
OCULUS_ROOT = Path(__file__).parent
sys.path.insert(0, str(OCULUS_ROOT))

try:
    from oculus_unified_model import OculusForConditionalGeneration
except ImportError:
    # Attempt absolute import if the relative import fails
    from Oculus.oculus_unified_model import OculusForConditionalGeneration
class OculusPredictor:
    """
    Easy-to-use interface for the Oculus Unified Model.
    Supports Object Detection, VQA, and Captioning.
    """

    def __init__(self, model_path: Optional[str] = None, device: str = "cpu"):
        self.device = device

        # Auto-discover the latest model if no path is provided
        if model_path is None:
            base_dir = OCULUS_ROOT / "checkpoints" / "oculus_detection_v2"
            if (base_dir / "final").exists():
                model_path = str(base_dir / "final")
            else:
                # Fall back to V1
                model_path = str(OCULUS_ROOT / "checkpoints" / "oculus_detection" / "final")

        print(f"Loading Oculus model from: {model_path}")
        self.model = OculusForConditionalGeneration.from_pretrained(model_path)

        # Load detection heads
        heads_path = Path(model_path) / "heads.pth"
        if heads_path.exists():
            heads = torch.load(heads_path, map_location=device)
            self.model.detection_head.load_state_dict(heads['detection'])
            print("✓ Detection heads loaded")

        # Load instruction-tuned VQA model if available
        instruct_path = OCULUS_ROOT / "checkpoints" / "oculus_instruct_v1" / "vqa_model"
        if instruct_path.exists():
            from transformers import BlipForQuestionAnswering
            self.model.lm_vqa_model = BlipForQuestionAnswering.from_pretrained(instruct_path)
            print("✓ Instruction-tuned VQA model loaded")

        print("✓ Model loaded successfully")
    def load_image(self, image_source: Union[str, Image.Image]) -> Image.Image:
        """Load an image from a path, URL, or PIL object."""
        if isinstance(image_source, Image.Image):
            return image_source.convert("RGB")
        if image_source.startswith("http"):
            response = requests.get(image_source, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
            return Image.open(BytesIO(response.content)).convert("RGB")
        return Image.open(image_source).convert("RGB")
    def detect(self, image_source: Union[str, Image.Image], prompt: str = "Detect objects", threshold: float = 0.2) -> Dict[str, Any]:
        """
        Run object detection.
        Returns: {'boxes': [[x1,y1,x2,y2], ...], 'labels': [...], 'confidences': [...]}
        """
        image = self.load_image(image_source)
        output = self.model.generate(image, mode="box", prompt=prompt, threshold=threshold)

        # Convert to a Python-friendly format
        return {
            'boxes': output.boxes,  # Normalized [0-1]
            'labels': output.labels,
            'confidences': output.confidences,
            'image_size': image.size
        }
    def ask(self, image_source: Union[str, Image.Image], question: str) -> str:
        """Ask a question about the image (VQA)."""
        image = self.load_image(image_source)
        output = self.model.generate(image, mode="text", prompt=question)
        return output.text
    def caption(self, image_source: Union[str, Image.Image]) -> str:
        """Generate a caption for the image."""
        return self.ask(image_source, "A photo of")