# Source path: ObjectverseDiary/src/models/vision_runner.py
# Committer (from the hosting page's avatar caption): qqyule
# Commit d30bd8e (verified): Sync runtime diagnostics and smoke helpers
"""Object understanding runtime for mock and MiniCPM-V backends."""
from __future__ import annotations
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from src.config import RuntimeSettings, get_runtime_settings
from src.models.schema import ObjectInfo, ObjectUnderstanding
from src.utils.json_repair import parse_json_object
# Keyword -> canonical object name used by the mock backend. Insertion order
# matters: the first keyword found in the lowered description wins.
KNOWN_OBJECTS = {
    "mug": "coffee mug",
    "cup": "coffee mug",
    "keyboard": "keyboard",
    "shoe": "shoe",
    "book": "book",
    "phone": "phone",
    "lamp": "desk lamp",
    "bottle": "water bottle",
    "bag": "bag",
}

# Model id used when the runtime settings do not name one.
MINICPM_DEFAULT_MODEL_ID = "openbmb/MiniCPM-V-2_6"

# Accepted spellings of the MiniCPM-V backend (compared after strip + lower).
MINICPM_BACKENDS = {"minicpm-v", "minicpm_v", "minicpmv"}

# Substrings replaced with "[redacted]" in probe text before it is returned.
SENSITIVE_PROBE_MARKERS = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "hf_", ".env")

# Module-level cache filled by _load_minicpm_components so the model and
# tokenizer are only loaded again when a different model id is requested.
_MINICPM_MODEL: Any | None = None
_MINICPM_TOKENIZER: Any | None = None
_MINICPM_MODEL_ID: str | None = None
@dataclass(frozen=True)
class VisionRunResult:
    """Object understanding paired with the fallback markers hit while producing it."""

    # Structured description of the object produced by the backend that answered.
    object_understanding: ObjectUnderstanding
    # Fallback marker strings; empty when the requested backend answered directly.
    fallbacks: list[str]
def understand_object(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Run the configured vision backend and return only the object understanding.

    The fallback markers collected by ``understand_object_with_metadata`` are
    discarded; call that function directly when they are needed.
    """
    run_result = understand_object_with_metadata(image_path, description)
    return run_result.object_understanding
def probe_vision_runtime(
    *,
    settings: RuntimeSettings | None = None,
    load_model: bool = True,
) -> dict[str, Any]:
    """Return non-secret runtime diagnostics for hosted MiniCPM-V debugging.

    Args:
        settings: Runtime settings to inspect; resolved through
            ``get_runtime_settings()`` when omitted.
        load_model: When true and the configured backend is one of the
            MiniCPM-V aliases, also try to load the model and tokenizer.

    Returns:
        A dict with the normalized backend name, the model id, torch and
        transformers import status, CUDA/MPS availability, whether a model
        load was attempted and succeeded, and a list of per-stage errors.
        The whole payload is passed through ``_sanitize_probe_payload``
        before it is returned.
    """
    current = settings or get_runtime_settings()
    backend = current.vision_backend.strip().lower()
    model_id = current.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    probe: dict[str, Any] = {
        "backend": backend,
        # Report the default-resolved id only for MiniCPM-V backends; other
        # backends show exactly what was configured.
        "vision_model_id": model_id if backend in MINICPM_BACKENDS else current.vision_model_id,
        "torch_import": False,
        "transformers_import": False,
        "cuda_available": False,
        "device_count": 0,
        "device_name": "",
        "mps_available": False,
        "minicpm_load_attempted": False,
        "minicpm_load_ok": False,
        "errors": [],
    }

    # Stage 1: torch import and accelerator discovery. Any failure is recorded
    # as a probe error instead of aborting the remaining stages.
    try:
        import torch

        probe["torch_import"] = True
        probe["cuda_available"] = torch.cuda.is_available()
        probe["device_count"] = torch.cuda.device_count()
        if probe["cuda_available"] and probe["device_count"]:
            probe["device_name"] = torch.cuda.get_device_name(0)
        probe["mps_available"] = bool(
            getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
        )
    except Exception as exc:
        _add_probe_error(probe, "torch", exc)

    # Stage 2: only checks that the transformers entry points can be imported.
    try:
        from transformers import AutoModel as _AutoModel  # noqa: F401
        from transformers import AutoTokenizer as _AutoTokenizer  # noqa: F401

        probe["transformers_import"] = True
    except Exception as exc:
        _add_probe_error(probe, "transformers", exc)

    # Stage 3: optional full model load, only for MiniCPM-V backends.
    if backend in MINICPM_BACKENDS and load_model:
        probe["minicpm_load_attempted"] = True
        try:
            _load_minicpm_components(model_id)
            probe["minicpm_load_ok"] = True
        except Exception as exc:
            _add_probe_error(probe, "minicpm_load", exc)

    return _sanitize_probe_payload(probe)
def understand_object_with_metadata(
    image_path: str | None,
    description: str,
    *,
    settings: RuntimeSettings | None = None,
) -> VisionRunResult:
    """Dispatch to the configured vision backend and report any fallback taken.

    Args:
        image_path: Path of the uploaded image, or ``None`` when there is none.
        description: Free-text description supplied by the user.
        settings: Runtime settings; resolved through ``get_runtime_settings()``
            when omitted.

    Returns:
        A ``VisionRunResult`` whose ``fallbacks`` list is empty when the
        requested backend produced the answer, and otherwise names the
        fallback to the mock backend that was taken.
    """
    active = settings or get_runtime_settings()
    backend_name = active.vision_backend.strip().lower()

    if backend_name in MINICPM_BACKENDS:
        try:
            understanding = _understand_object_minicpm(image_path, description, active)
            return VisionRunResult(understanding, [])
        except Exception as exc:
            # Any MiniCPM-V failure degrades to the mock backend.
            _log_vision_fallback("minicpm-v", exc)
            fallback_markers = ["vision-fallback-to-mock"]
    elif backend_name == "mock":
        fallback_markers = []
    else:
        fallback_markers = [f"unknown-vision-backend-{backend_name}-fallback-to-mock"]

    mock_understanding = _understand_object_mock(image_path, description)
    return VisionRunResult(mock_understanding, fallback_markers)
def _understand_object_mock(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Build an object understanding from keyword heuristics, without any model.

    The name, features and context come from the ``_infer_*`` helpers applied
    to the stripped description; confidence is 0.42 when a description is
    present and 0.32 otherwise.
    """
    text = description.strip()
    if text:
        confidence = 0.42
    else:
        confidence = 0.32
    info = ObjectInfo(
        name=_infer_object_name(text, image_path),
        visible_features=_infer_features(text, image_path),
        likely_context=_infer_context(text),
        confidence=confidence,
    )
    return ObjectUnderstanding(object=info)
def _understand_object_minicpm(
    image_path: str | None,
    description: str,
    settings: RuntimeSettings,
) -> ObjectUnderstanding:
    """Ask the MiniCPM-V model to describe the uploaded image as structured JSON.

    Args:
        image_path: Path of the uploaded image; required for this backend.
        description: Optional user context that is appended to the prompt.
        settings: Runtime settings supplying ``vision_model_id``.

    Returns:
        The model reply parsed with ``parse_json_object`` and validated into
        an ``ObjectUnderstanding``.

    Raises:
        ValueError: If ``image_path`` is empty or ``None``. Errors raised by
            loading, inference, parsing or validation are not caught here.
    """
    if not image_path:
        raise ValueError("MiniCPM-V requires an uploaded image.")

    resolved_model_id = settings.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    model, tokenizer = _load_minicpm_components(resolved_model_id)
    picture = _load_rgb_image(image_path)
    question = _object_understanding_prompt(description)

    # The image travels inside the message content, so ``image`` itself is None.
    reply = model.chat(
        image=None,
        msgs=[{"role": "user", "content": [picture, question]}],
        tokenizer=tokenizer,
    )
    # Some replies arrive as a tuple; only the first element is used.
    answer = reply[0] if isinstance(reply, tuple) else reply
    return ObjectUnderstanding.model_validate(parse_json_object(str(answer)))
def _load_minicpm_components(model_id: str) -> tuple[Any, Any]:
    """Load, or reuse from the module cache, the model and tokenizer for ``model_id``.

    The cached pair is returned when both are present and were loaded for the
    same ``model_id``. Otherwise the model is loaded in bfloat16, switched to
    eval mode, moved to CUDA or MPS when available, and cached together with
    its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    global _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID

    cache_is_warm = (
        _MINICPM_MODEL is not None
        and _MINICPM_TOKENIZER is not None
        and _MINICPM_MODEL_ID == model_id
    )
    if cache_is_warm:
        return _MINICPM_MODEL, _MINICPM_TOKENIZER

    # Heavy dependencies are imported lazily, only when a load is needed.
    import torch
    from transformers import AutoModel, AutoTokenizer

    load_kwargs: dict[str, Any] = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "sdpa",
    }
    try:
        model = AutoModel.from_pretrained(model_id, **load_kwargs)
    except TypeError:
        # Retry without the attention hint (presumably for model code that
        # does not accept this keyword; confirm against the model version).
        del load_kwargs["attn_implementation"]
        model = AutoModel.from_pretrained(model_id, **load_kwargs)

    model = model.eval()
    if torch.cuda.is_available():
        model = model.cuda()
    elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        # On MPS the model is additionally cast to float16.
        model = model.to(device="mps", dtype=torch.float16)

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Publish to the cache only after both components loaded successfully.
    _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID = model, tokenizer, model_id
    return model, tokenizer
def _load_rgb_image(image_path: str) -> Any:
    """Open ``image_path`` with Pillow and return an RGB copy of the image.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        A new Pillow image in ``"RGB"`` mode that no longer depends on the
        source file being open.
    """
    from PIL import Image

    # ``convert`` returns a separate image, so the source handle can be
    # released as soon as the conversion is done instead of being left open.
    with Image.open(image_path) as source:
        return source.convert("RGB")
def _object_understanding_prompt(description: str) -> str:
    """Compose the instruction sent to the vision model.

    The prompt asks for a JSON object of a fixed shape and ends with the
    stripped user description, or a placeholder sentence when it is blank.
    """
    user_context = description.strip()
    if not user_context:
        user_context = "No user description was provided."
    segments = [
        "You are the vision module for Objectverse Diary. Inspect the uploaded everyday object photo. ",
        "Return only valid JSON with exactly this shape: ",
        '{"object":{"name":"short object name","visible_features":["feature 1","feature 2","feature 3"],',
        '"likely_context":"where this object probably is","confidence":0.0}}. ',
        "Use 3 to 5 concrete visible_features. confidence must be a number from 0 to 1. ",
        f"Optional user context: {user_context}",
    ]
    return "".join(segments)
def _log_vision_fallback(backend: str, exc: Exception) -> None:
    """Print a one-line notice that ``backend`` fell back to the mock backend.

    Only the exception class name is printed, not its message.
    """
    error_kind = type(exc).__name__
    notice = f"[Objectverse Diary] Vision backend '{backend}' fell back to mock: {error_kind}"
    print(notice, flush=True)
def _add_probe_error(probe: dict[str, Any], stage: str, exc: Exception) -> None:
    """Append a sanitized record of ``exc`` to ``probe["errors"]``.

    The record holds the probe stage, the exception class name and a
    sanitized summary; the class name stands in when the message is empty.
    """
    error_log = probe["errors"]
    exc_name = type(exc).__name__
    summary = _sanitize_probe_text(str(exc) or exc_name)
    error_log.append({"stage": stage, "type": exc_name, "summary": summary})
def _sanitize_probe_payload(value: Any) -> Any:
    """Recursively sanitize every string value inside a probe payload.

    Dicts are rebuilt with stringified keys, lists are rebuilt element by
    element, strings go through ``_sanitize_probe_text``, and any other
    value is returned unchanged.
    """
    if isinstance(value, str):
        return _sanitize_probe_text(value)
    if isinstance(value, list):
        return list(map(_sanitize_probe_payload, value))
    if isinstance(value, dict):
        cleaned: dict[str, Any] = {}
        for key, item in value.items():
            cleaned[str(key)] = _sanitize_probe_payload(item)
        return cleaned
    return value
def _sanitize_probe_text(value: str, *, max_length: int = 240) -> str:
    """Scrub home paths, Hugging Face tokens and sensitive markers from ``value``.

    Args:
        value: Text to sanitize, typically an exception message.
        max_length: Longest result returned; longer text is cut to
            ``max_length - 3`` characters followed by ``"..."``.

    Returns:
        The sanitized, possibly truncated text.
    """
    clean = value

    # Path.home() raises RuntimeError when the home directory cannot be
    # resolved; the sanitizer runs while reporting errors, so it must not
    # fail itself. In that case there is simply no home path to mask.
    try:
        home = Path.home()
    except RuntimeError:
        home = None
    # Skip a home that is only the filesystem root (or a relative path such
    # as "."): replacing it would rewrite every separator in the text.
    if home is not None and home != Path(home.anchor):
        clean = clean.replace(str(home), "[home]")

    # Redact whole token-shaped strings first, then any leftover markers.
    clean = re.sub(r"hf_[A-Za-z0-9_-]+", "[redacted-token]", clean)
    for marker in SENSITIVE_PROBE_MARKERS:
        clean = clean.replace(marker, "[redacted]")

    if len(clean) > max_length:
        return clean[: max_length - 3] + "..."
    return clean
def _infer_object_name(description: str, image_path: str | None) -> str:
    """Guess an object name from the description, then from the file name.

    The first ``KNOWN_OBJECTS`` keyword contained in the lowered description
    wins. Otherwise the image file stem, with underscores and hyphens turned
    into spaces and cut to 40 characters, is used. A generic placeholder is
    returned when neither source yields a name.
    """
    text = description.lower()
    matched = next(
        (label for token, label in KNOWN_OBJECTS.items() if token in text),
        None,
    )
    if matched is not None:
        return matched

    if image_path:
        readable_stem = Path(image_path).stem.translate(str.maketrans("_-", "  ")).strip()
        if readable_stem:
            return readable_stem[:40]

    return "mysterious everyday object"
def _infer_features(description: str, image_path: str | None) -> list[str]:
    """Collect up to five feature strings from the description and inputs.

    Known adjectives found in the lowered description come first, in a fixed
    order, followed by markers for an uploaded photo and for a non-empty
    description. Three placeholder features are returned when nothing applies.
    """
    text = description.lower()
    vocabulary = ("old", "new", "cracked", "white", "black", "dusty", "metal", "ceramic", "plastic")
    found = [term for term in vocabulary if term in text]

    if image_path:
        found.append("uploaded photo provided")
    if description:
        found.append("user-supplied description")

    if not found:
        return ["ordinary surface", "unknown material", "quietly suspicious"]
    return found[:5]
def _infer_context(description: str) -> str:
    """Map the first location keyword found in the description to a setting.

    Keywords are checked in a fixed order against the lowered description; a
    generic setting is returned when none of them appears.
    """
    text = description.lower()
    rules = (
        ("desk", "developer desk"),
        ("kitchen", "kitchen counter"),
        ("bedroom", "bedroom shelf"),
        ("office", "office corner"),
    )
    for keyword, place in rules:
        if keyword in text:
            return place
    return "everyday human environment"