"""Object understanding runtime for mock and MiniCPM-V backends."""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from src.config import RuntimeSettings, get_runtime_settings
from src.models.schema import ObjectInfo, ObjectUnderstanding
from src.utils.json_repair import parse_json_object

# Keyword -> canonical object name used by the mock backend. Matching is a
# plain substring test against the lowercased description, in dict order.
KNOWN_OBJECTS = {
    "mug": "coffee mug",
    "cup": "coffee mug",
    "keyboard": "keyboard",
    "shoe": "shoe",
    "book": "book",
    "phone": "phone",
    "lamp": "desk lamp",
    "bottle": "water bottle",
    "bag": "bag",
}

MINICPM_DEFAULT_MODEL_ID = "openbmb/MiniCPM-V-2_6"
# Accepted spellings of the MiniCPM-V backend name (compared after lowercasing).
MINICPM_BACKENDS = {"minicpm-v", "minicpm_v", "minicpmv"}
# Substrings that are blanked out of any text returned by the runtime probe.
SENSITIVE_PROBE_MARKERS = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "hf_", ".env")

# Process-wide cache for the loaded MiniCPM-V model/tokenizer and the model id
# they were loaded for; populated by ``_load_minicpm_components``.
_MINICPM_MODEL: Any | None = None
_MINICPM_TOKENIZER: Any | None = None
_MINICPM_MODEL_ID: str | None = None


@dataclass(frozen=True)
class VisionRunResult:
    """Object understanding plus the fallback markers recorded while producing it."""

    object_understanding: ObjectUnderstanding
    # Empty when the configured backend answered; otherwise holds the marker
    # naming the fallback that was taken.
    fallbacks: list[str]


def understand_object(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Return object understanding without exposing runtime metadata."""
    return understand_object_with_metadata(image_path, description).object_understanding


def probe_vision_runtime(
    *,
    settings: RuntimeSettings | None = None,
    load_model: bool = True,
) -> dict[str, Any]:
    """Return non-secret runtime diagnostics for hosted MiniCPM-V debugging.

    Args:
        settings: Runtime settings to inspect; ``get_runtime_settings()`` is
            used when omitted.
        load_model: When true and the backend is a MiniCPM-V variant, also
            attempt to load the model and report whether that succeeded.

    Returns:
        A dict of import/device flags plus an ``errors`` list. Every string in
        it has been passed through ``_sanitize_probe_text``.
    """
    current = settings or get_runtime_settings()
    backend = current.vision_backend.strip().lower()
    model_id = current.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    probe: dict[str, Any] = {
        "backend": backend,
        # Report the effective (defaulted) id only when MiniCPM-V is selected.
        "vision_model_id": model_id if backend in MINICPM_BACKENDS else current.vision_model_id,
        "torch_import": False,
        "transformers_import": False,
        "cuda_available": False,
        "device_count": 0,
        "device_name": "",
        "mps_available": False,
        "minicpm_load_attempted": False,
        "minicpm_load_ok": False,
        "errors": [],
    }

    # Each stage is probed independently so one failure does not hide the rest;
    # failures are recorded in probe["errors"] instead of being raised.
    try:
        import torch

        probe["torch_import"] = True
        probe["cuda_available"] = torch.cuda.is_available()
        probe["device_count"] = torch.cuda.device_count()
        if probe["cuda_available"] and probe["device_count"]:
            probe["device_name"] = torch.cuda.get_device_name(0)
        probe["mps_available"] = bool(
            getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
        )
    except Exception as exc:
        _add_probe_error(probe, "torch", exc)

    try:
        from transformers import AutoModel as _AutoModel  # noqa: F401
        from transformers import AutoTokenizer as _AutoTokenizer  # noqa: F401

        probe["transformers_import"] = True
    except Exception as exc:
        _add_probe_error(probe, "transformers", exc)

    if backend in MINICPM_BACKENDS and load_model:
        probe["minicpm_load_attempted"] = True
        try:
            _load_minicpm_components(model_id)
            probe["minicpm_load_ok"] = True
        except Exception as exc:
            _add_probe_error(probe, "minicpm_load", exc)

    return _sanitize_probe_payload(probe)


def understand_object_with_metadata(
    image_path: str | None,
    description: str,
    *,
    settings: RuntimeSettings | None = None,
) -> VisionRunResult:
    """Run the configured vision backend, falling back to the mock on failure.

    Args:
        image_path: Path of the uploaded image, or ``None`` when there is none.
        description: Free-text description supplied by the user.
        settings: Runtime settings; ``get_runtime_settings()`` is used when
            omitted.

    Returns:
        The object understanding together with a list of fallback markers:
        empty for a successful ``mock`` or MiniCPM-V run,
        ``"vision-fallback-to-mock"`` when MiniCPM-V raised, and an
        ``unknown-vision-backend-...`` marker for unrecognised backend names.
    """
    current = settings or get_runtime_settings()
    backend = current.vision_backend.strip().lower()

    if backend == "mock":
        return VisionRunResult(_understand_object_mock(image_path, description), [])

    if backend in MINICPM_BACKENDS:
        try:
            return VisionRunResult(_understand_object_minicpm(image_path, description, current), [])
        except Exception as exc:
            # Deliberate best-effort boundary: any model failure degrades to the mock.
            _log_vision_fallback("minicpm-v", exc)
            return VisionRunResult(
                _understand_object_mock(image_path, description),
                ["vision-fallback-to-mock"],
            )

    return VisionRunResult(
        _understand_object_mock(image_path, description),
        [f"unknown-vision-backend-{backend}-fallback-to-mock"],
    )


def _understand_object_mock(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Return deterministic mock object understanding for fallback-safe demos."""
    clean_description = description.strip()
    object_name = _infer_object_name(clean_description, image_path)
    features = _infer_features(clean_description, image_path)
    return ObjectUnderstanding(
        object=ObjectInfo(
            name=object_name,
            visible_features=features,
            likely_context=_infer_context(clean_description),
            # Slightly higher confidence when a description was available.
            confidence=0.42 if clean_description else 0.32,
        )
    )


def _understand_object_minicpm(
    image_path: str | None,
    description: str,
    settings: RuntimeSettings,
) -> ObjectUnderstanding:
    """Ask MiniCPM-V to describe the image and validate its JSON reply.

    Raises:
        ValueError: If ``image_path`` is empty or ``None``.
        Exception: Anything raised while loading the model or image, running
            the chat call, parsing the reply or validating the payload
            propagates to the caller.
    """
    if not image_path:
        raise ValueError("MiniCPM-V requires an uploaded image.")

    model_id = settings.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    model, tokenizer = _load_minicpm_components(model_id)
    image = _load_rgb_image(image_path)
    prompt = _object_understanding_prompt(description)
    # The image travels inside the message content, so ``image`` itself is None.
    messages = [{"role": "user", "content": [image, prompt]}]
    raw = model.chat(image=None, msgs=messages, tokenizer=tokenizer)
    if isinstance(raw, tuple):
        # When chat returns a tuple, only its first element is used as the reply.
        raw = raw[0]
    payload = parse_json_object(str(raw))
    return ObjectUnderstanding.model_validate(payload)


def _load_minicpm_components(model_id: str) -> tuple[Any, Any]:
    """Load the model and tokenizer for ``model_id``, reusing the module cache.

    The cached pair is returned when it was loaded for the same ``model_id``;
    otherwise both are loaded again and the cache is overwritten.
    """
    global _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID

    if (
        _MINICPM_MODEL is not None
        and _MINICPM_TOKENIZER is not None
        and _MINICPM_MODEL_ID == model_id
    ):
        return _MINICPM_MODEL, _MINICPM_TOKENIZER

    # Imported lazily so the module stays importable without torch/transformers.
    import torch
    from transformers import AutoModel, AutoTokenizer

    model_kwargs: dict[str, Any] = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
    }
    try:
        model_kwargs["attn_implementation"] = "sdpa"
        model = AutoModel.from_pretrained(model_id, **model_kwargs)
    except TypeError:
        # Retry without the attention hint when it is rejected with a TypeError.
        model_kwargs.pop("attn_implementation", None)
        model = AutoModel.from_pretrained(model_id, **model_kwargs)

    if torch.cuda.is_available():
        model = model.eval().cuda()
    elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        model = model.eval().to(device="mps", dtype=torch.float16)
    else:
        model = model.eval()

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Cache only after both components loaded successfully.
    _MINICPM_MODEL = model
    _MINICPM_TOKENIZER = tokenizer
    _MINICPM_MODEL_ID = model_id
    return model, tokenizer


def _load_rgb_image(image_path: str) -> Any:
    """Open ``image_path`` with Pillow and return an RGB copy of it."""
    from PIL import Image

    # The context manager closes the underlying file; ``convert`` returns a
    # separate image object that remains usable afterwards.
    with Image.open(image_path) as source:
        return source.convert("RGB")


def _object_understanding_prompt(description: str) -> str:
    """Build the JSON-only instruction prompt, embedding the user's description."""
    context = description.strip() or "No user description was provided."
    return (
        "You are the vision module for Objectverse Diary. Inspect the uploaded everyday object photo. "
        "Return only valid JSON with exactly this shape: "
        '{"object":{"name":"short object name","visible_features":["feature 1","feature 2","feature 3"],'
        '"likely_context":"where this object probably is","confidence":0.0}}. '
        "Use 3 to 5 concrete visible_features. confidence must be a number from 0 to 1. "
        f"Optional user context: {context}"
    )


def _log_vision_fallback(backend: str, exc: Exception) -> None:
    """Print a one-line notice that ``backend`` fell back to the mock.

    Only the exception's type name is printed, not its message.
    """
    print(
        f"[Objectverse Diary] Vision backend '{backend}' fell back to mock: {type(exc).__name__}",
        flush=True,
    )


def _add_probe_error(probe: dict[str, Any], stage: str, exc: Exception) -> None:
    """Append a sanitized record of ``exc`` for ``stage`` to ``probe["errors"]``."""
    probe["errors"].append(
        {
            "stage": stage,
            "type": type(exc).__name__,
            # Fall back to the type name when the exception has no message.
            "summary": _sanitize_probe_text(str(exc) or type(exc).__name__),
        }
    )


def _sanitize_probe_payload(value: Any) -> Any:
    """Recursively sanitize every string inside nested dicts and lists.

    Dict keys are converted with ``str``; values that are not a dict, list or
    string are returned unchanged.
    """
    if isinstance(value, dict):
        return {str(key): _sanitize_probe_payload(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_sanitize_probe_payload(item) for item in value]
    if isinstance(value, str):
        return _sanitize_probe_text(value)
    return value


def _sanitize_probe_text(value: str, *, max_length: int = 240) -> str:
    """Redact the home path, token-like strings and sensitive markers from ``value``.

    The result is truncated to ``max_length`` characters (ending in ``...``)
    when it is longer than that.
    """
    try:
        home = str(Path.home())
    except RuntimeError:
        # No resolvable home directory: there is nothing to redact for it.
        home = ""

    clean = value
    # A home of "/" would turn every path separator into "[home]", so only
    # substitute a home path that is longer than a bare root.
    if len(home) > 1:
        clean = clean.replace(home, "[home]")

    # Redact whole tokens first; the marker loop then blanks any bare prefix left.
    clean = re.sub(r"hf_[A-Za-z0-9_-]+", "[redacted-token]", clean)
    for marker in SENSITIVE_PROBE_MARKERS:
        clean = clean.replace(marker, "[redacted]")

    if len(clean) > max_length:
        return clean[: max_length - 3] + "..."
    return clean


def _infer_object_name(description: str, image_path: str | None) -> str:
    """Guess an object name from the description, else from the image file name.

    The first ``KNOWN_OBJECTS`` keyword found as a substring of the lowercased
    description wins. Otherwise the image file stem is used, with ``_`` and
    ``-`` turned into spaces and capped at 40 characters. A fixed placeholder
    is returned when neither yields a name.
    """
    lowered = description.lower()
    for keyword, name in KNOWN_OBJECTS.items():
        if keyword in lowered:
            return name

    if image_path:
        stem = Path(image_path).stem.replace("_", " ").replace("-", " ").strip()
        if stem:
            return stem[:40]

    return "mysterious everyday object"


def _infer_features(description: str, image_path: str | None) -> list[str]:
    """Collect up to five mock features from the description and image presence.

    Adjectives found as substrings of the lowercased description come first,
    followed by markers for an uploaded photo and a non-empty description.
    Three placeholder features are returned when nothing was collected.
    """
    lowered = description.lower()
    adjectives = ("old", "new", "cracked", "white", "black", "dusty", "metal", "ceramic", "plastic")
    features = [word for word in adjectives if word in lowered]

    if image_path:
        features.append("uploaded photo provided")
    if description:
        features.append("user-supplied description")

    return features[:5] or ["ordinary surface", "unknown material", "quietly suspicious"]


def _infer_context(description: str) -> str:
    """Map a location keyword in the description to a likely context string.

    Keywords are checked in a fixed order (desk, kitchen, bedroom, office) and
    the first substring match wins; a generic context is returned otherwise.
    """
    lowered = description.lower()
    if "desk" in lowered:
        return "developer desk"
    if "kitchen" in lowered:
        return "kitchen counter"
    if "bedroom" in lowered:
        return "bedroom shelf"
    if "office" in lowered:
        return "office corner"
    return "everyday human environment"