Spaces:

build-small-hackathon
/

ObjectverseDiary

Running on Zero

App Files Files Community

ObjectverseDiary / src /models /vision_runner.py

qqyule

Sync runtime diagnostics and smoke helpers

d30bd8e verified 4 days ago

raw

history blame contribute delete

10.1 kB

	"""Object understanding runtime for mock and MiniCPM-V backends."""

	from __future__ import annotations

	import re
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any

	from src.config import RuntimeSettings, get_runtime_settings
	from src.models.schema import ObjectInfo, ObjectUnderstanding
	from src.utils.json_repair import parse_json_object


	# Keyword -> canonical object name used by the mock backend.
	# _infer_object_name scans these keys in insertion order and returns the name
	# of the first key found as a substring of the lowercased description, so the
	# order of entries decides which name wins when several keywords match.
	KNOWN_OBJECTS = {
	"mug": "coffee mug",
	"cup": "coffee mug",
	"keyboard": "keyboard",
	"shoe": "shoe",
	"book": "book",
	"phone": "phone",
	"lamp": "desk lamp",
	"bottle": "water bottle",
	"bag": "bag",
	}

	# Model id used when the runtime settings do not name a vision model.
	MINICPM_DEFAULT_MODEL_ID = "openbmb/MiniCPM-V-2_6"
	# Accepted spellings of the MiniCPM-V backend (compared after strip + lower).
	MINICPM_BACKENDS = {"minicpm-v", "minicpm_v", "minicpmv"}
	# Substrings that _sanitize_probe_text replaces with "[redacted]".
	SENSITIVE_PROBE_MARKERS = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "hf_", ".env")

	# Process-wide cache filled by _load_minicpm_components; the id records which
	# model the cached model/tokenizer pair belongs to.
	_MINICPM_MODEL: Any | None = None
	_MINICPM_TOKENIZER: Any | None = None
	_MINICPM_MODEL_ID: str | None = None


	@dataclass(frozen=True)
	class VisionRunResult:
	    """Immutable pairing of an object understanding with its fallback tags."""

	    # Structured description of the object.
	    object_understanding: ObjectUnderstanding
	    # Tags naming each fallback taken; empty when the backend answered directly.
	    fallbacks: list[str]


	def understand_object(image_path: str | None, description: str) -> ObjectUnderstanding:
	    """Describe the object and drop the fallback bookkeeping.

	    Delegates to ``understand_object_with_metadata`` and returns only the
	    ``object_understanding`` field of its result.
	    """
	    run_result = understand_object_with_metadata(image_path, description)
	    return run_result.object_understanding


	def probe_vision_runtime(
	    *,
	    settings: RuntimeSettings | None = None,
	    load_model: bool = True,
	) -> dict[str, Any]:
	    """Return non-secret runtime diagnostics for hosted MiniCPM-V debugging.

	    Args:
	        settings: Runtime settings to inspect; when omitted (or falsy) they are
	            obtained from ``get_runtime_settings()``.
	        load_model: When true and the configured backend is one of
	            ``MINICPM_BACKENDS``, also attempt to load the model and tokenizer.

	    Returns:
	        A dict of import/device flags plus an ``errors`` list with one entry
	        per failed stage. The dict is passed through
	        ``_sanitize_probe_payload`` before it is returned.
	    """
	    current = settings or get_runtime_settings()
	    backend = current.vision_backend.strip().lower()
	    model_id = current.vision_model_id or MINICPM_DEFAULT_MODEL_ID
	    probe: dict[str, Any] = {
	        "backend": backend,
	        # Only report the default model id when a MiniCPM backend would use it.
	        "vision_model_id": model_id if backend in MINICPM_BACKENDS else current.vision_model_id,
	        "torch_import": False,
	        "transformers_import": False,
	        "cuda_available": False,
	        "device_count": 0,
	        "device_name": "",
	        "mps_available": False,
	        "minicpm_load_attempted": False,
	        "minicpm_load_ok": False,
	        "errors": [],
	    }

	    # Each stage is isolated so one failure is recorded without hiding the rest.
	    try:
	        import torch

	        probe["torch_import"] = True
	        probe["cuda_available"] = torch.cuda.is_available()
	        probe["device_count"] = torch.cuda.device_count()
	        if probe["cuda_available"] and probe["device_count"]:
	            probe["device_name"] = torch.cuda.get_device_name(0)
	        probe["mps_available"] = bool(
	            getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
	        )
	    except Exception as exc:
	        _add_probe_error(probe, "torch", exc)

	    try:
	        # Imported only to check availability; the names are intentionally unused.
	        from transformers import AutoModel as _AutoModel  # noqa: F401
	        from transformers import AutoTokenizer as _AutoTokenizer  # noqa: F401

	        probe["transformers_import"] = True
	    except Exception as exc:
	        _add_probe_error(probe, "transformers", exc)

	    if backend in MINICPM_BACKENDS and load_model:
	        probe["minicpm_load_attempted"] = True
	        try:
	            _load_minicpm_components(model_id)
	            probe["minicpm_load_ok"] = True
	        except Exception as exc:
	            _add_probe_error(probe, "minicpm_load", exc)

	    return _sanitize_probe_payload(probe)


	def understand_object_with_metadata(
	    image_path: str | None,
	    description: str,
	    *,
	    settings: RuntimeSettings | None = None,
	) -> VisionRunResult:
	    """Run the configured vision backend and report any fallback taken.

	    The backend name is read from ``settings`` (or ``get_runtime_settings()``)
	    and compared after stripping and lowercasing. A MiniCPM-V failure or an
	    unrecognised backend name both end in the mock implementation, with a tag
	    describing the fallback added to the result.
	    """
	    active = settings or get_runtime_settings()
	    backend = active.vision_backend.strip().lower()
	    fallbacks: list[str] = []

	    if backend in MINICPM_BACKENDS:
	        try:
	            understanding = _understand_object_minicpm(image_path, description, active)
	        except Exception as exc:
	            _log_vision_fallback("minicpm-v", exc)
	            fallbacks.append("vision-fallback-to-mock")
	        else:
	            return VisionRunResult(understanding, fallbacks)
	    elif backend != "mock":
	        fallbacks.append(f"unknown-vision-backend-{backend}-fallback-to-mock")

	    # Reached for the mock backend itself and for every fallback case.
	    return VisionRunResult(_understand_object_mock(image_path, description), fallbacks)


	def _understand_object_mock(image_path: str | None, description: str) -> ObjectUnderstanding:
	    """Build a deterministic object understanding from the text and file name.

	    Confidence is 0.42 when a non-blank description is supplied and 0.32
	    otherwise.
	    """
	    text = description.strip()
	    info = ObjectInfo(
	        name=_infer_object_name(text, image_path),
	        visible_features=_infer_features(text, image_path),
	        likely_context=_infer_context(text),
	        confidence=0.42 if text else 0.32,
	    )
	    return ObjectUnderstanding(object=info)


	def _understand_object_minicpm(
	    image_path: str | None,
	    description: str,
	    settings: RuntimeSettings,
	) -> ObjectUnderstanding:
	    """Ask MiniCPM-V to describe the image and validate its JSON reply.

	    Raises:
	        ValueError: If no image path was supplied.
	    """
	    if not image_path:
	        raise ValueError("MiniCPM-V requires an uploaded image.")

	    model, tokenizer = _load_minicpm_components(
	        settings.vision_model_id or MINICPM_DEFAULT_MODEL_ID
	    )
	    user_turn = {
	        "role": "user",
	        "content": [_load_rgb_image(image_path), _object_understanding_prompt(description)],
	    }
	    reply = model.chat(image=None, msgs=[user_turn], tokenizer=tokenizer)
	    # When chat() returns a tuple, only its first element is used as the reply.
	    reply_text = reply[0] if isinstance(reply, tuple) else reply

	    return ObjectUnderstanding.model_validate(parse_json_object(str(reply_text)))


	def _load_minicpm_components(model_id: str) -> tuple[Any, Any]:
	    """Return the ``(model, tokenizer)`` pair for ``model_id``, loading on demand.

	    The pair is cached in module globals and reused while the requested id
	    matches the cached one. The cache is only updated after both the model and
	    the tokenizer have loaded.
	    """
	    global _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID

	    cached = (
	        _MINICPM_MODEL is not None
	        and _MINICPM_TOKENIZER is not None
	        and _MINICPM_MODEL_ID == model_id
	    )
	    if cached:
	        return _MINICPM_MODEL, _MINICPM_TOKENIZER

	    import torch
	    from transformers import AutoModel, AutoTokenizer

	    load_kwargs: dict[str, Any] = {
	        "trust_remote_code": True,
	        "torch_dtype": torch.bfloat16,
	        "attn_implementation": "sdpa",
	    }
	    try:
	        model = AutoModel.from_pretrained(model_id, **load_kwargs)
	    except TypeError:
	        # Retry without the attention hint when the loader rejects it.
	        del load_kwargs["attn_implementation"]
	        model = AutoModel.from_pretrained(model_id, **load_kwargs)

	    use_cuda = torch.cuda.is_available()
	    mps_backend = None if use_cuda else getattr(torch.backends, "mps", None)
	    use_mps = bool(mps_backend) and mps_backend.is_available()

	    model = model.eval()
	    if use_cuda:
	        model = model.cuda()
	    elif use_mps:
	        model = model.to(device="mps", dtype=torch.float16)

	    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

	    _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID = model, tokenizer, model_id
	    return model, tokenizer


	def _load_rgb_image(image_path: str) -> Any:
	    """Open ``image_path`` with Pillow and return an RGB copy of it.

	    The source image is opened in a ``with`` block so its file handle is
	    released on return instead of whenever the image object is collected.
	    The value returned is the result of ``convert("RGB")``, not the opened
	    source image.
	    """
	    from PIL import Image

	    with Image.open(image_path) as source:
	        return source.convert("RGB")


	def _object_understanding_prompt(description: str) -> str:
	context = description.strip() or "No user description was provided."
	return (
	"You are the vision module for Objectverse Diary. Inspect the uploaded everyday object photo. "
	"Return only valid JSON with exactly this shape: "
	'{"object":{"name":"short object name","visible_features":["feature 1","feature 2","feature 3"],'
	'"likely_context":"where this object probably is","confidence":0.0}}. '
	"Use 3 to 5 concrete visible_features. confidence must be a number from 0 to 1. "
	f"Optional user context: {context}"
	)


	def _log_vision_fallback(backend: str, exc: Exception) -> None:
	print(
	f"[Objectverse Diary] Vision backend '{backend}' fell back to mock: {type(exc).__name__}",
	flush=True,
	)


	def _add_probe_error(probe: dict[str, Any], stage: str, exc: Exception) -> None:
	    """Append a sanitized record of ``exc`` to ``probe["errors"]``.

	    The summary is the exception message, or the exception class name when the
	    message is empty, passed through ``_sanitize_probe_text``.
	    """
	    errors = probe["errors"]
	    error_type = type(exc).__name__
	    entry = {
	        "stage": stage,
	        "type": error_type,
	        "summary": _sanitize_probe_text(str(exc) or error_type),
	    }
	    errors.append(entry)


	def _sanitize_probe_payload(value: Any) -> Any:
	    """Recursively sanitize every string value inside ``value``.

	    Strings go through ``_sanitize_probe_text``; dicts and lists are rebuilt
	    with sanitized contents (dict keys are converted with ``str`` but not
	    sanitized); any other value is returned unchanged.
	    """
	    if isinstance(value, str):
	        return _sanitize_probe_text(value)
	    if isinstance(value, list):
	        return [_sanitize_probe_payload(element) for element in value]
	    if isinstance(value, dict):
	        sanitized: dict[str, Any] = {}
	        for name, element in value.items():
	            sanitized[str(name)] = _sanitize_probe_payload(element)
	        return sanitized
	    return value


	def _sanitize_probe_text(value: str, *, max_length: int = 240) -> str:
	    """Redact paths and token-like text from ``value`` and cap its length.

	    Args:
	        value: Text to sanitize.
	        max_length: Maximum length of the returned text. Longer results are cut
	            and end in ``"..."`` when there is room for the ellipsis.

	    Returns:
	        ``value`` with the home directory replaced by ``[home]``, ``hf_``-style
	        tokens replaced by ``[redacted-token]``, every entry of
	        ``SENSITIVE_PROBE_MARKERS`` replaced by ``[redacted]``, truncated to at
	        most ``max_length`` characters.
	    """
	    # This runs while errors are being reported, so it must not raise itself:
	    # Path.home() raises RuntimeError when no home directory can be resolved.
	    try:
	        home = str(Path.home())
	    except RuntimeError:
	        home = ""

	    clean = value
	    # A one-character home ("/" or ".") would rewrite every such character.
	    if len(home) > 1:
	        clean = clean.replace(home, "[home]")

	    clean = re.sub(r"hf_[A-Za-z0-9_-]+", "[redacted-token]", clean)
	    for marker in SENSITIVE_PROBE_MARKERS:
	        clean = clean.replace(marker, "[redacted]")

	    if len(clean) <= max_length:
	        return clean
	    if max_length < 3:
	        # No room for the ellipsis; a negative slice bound would keep too much.
	        return clean[: max(max_length, 0)]
	    return clean[: max_length - 3] + "..."


	def _infer_object_name(description: str, image_path: str | None) -> str:
	    """Guess an object name from the description, then from the file name.

	    The first ``KNOWN_OBJECTS`` keyword found as a substring of the lowercased
	    description wins. Otherwise the image file stem is used, with underscores
	    and hyphens turned into spaces and cut to 40 characters. A fixed
	    placeholder name is returned when neither yields anything.
	    """
	    haystack = description.lower()
	    matched = next(
	        (label for needle, label in KNOWN_OBJECTS.items() if needle in haystack),
	        None,
	    )
	    if matched is not None:
	        return matched

	    if image_path:
	        stem = Path(image_path).stem
	        for separator in ("_", "-"):
	            stem = stem.replace(separator, " ")
	        stem = stem.strip()
	        if stem:
	            return stem[:40]

	    return "mysterious everyday object"


	def _infer_features(description: str, image_path: str \| None) -> list[str]:
	features: list[str] = []
	lowered = description.lower()

	for word in ["old", "new", "cracked", "white", "black", "dusty", "metal", "ceramic", "plastic"]:
	if word in lowered:
	features.append(word)

	if image_path:
	features.append("uploaded photo provided")

	if description:
	features.append("user-supplied description")

	return features[:5] or ["ordinary surface", "unknown material", "quietly suspicious"]


	def _infer_context(description: str) -> str:
	lowered = description.lower()
	if "desk" in lowered:
	return "developer desk"
	if "kitchen" in lowered:
	return "kitchen counter"
	if "bedroom" in lowered:
	return "bedroom shelf"
	if "office" in lowered:
	return "office corner"
	return "everyday human environment"