Spaces:
Running on Zero
Running on Zero
| """Object understanding runtime for mock and MiniCPM-V backends.""" | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Any | |
| from src.config import RuntimeSettings, get_runtime_settings | |
| from src.models.schema import ObjectInfo, ObjectUnderstanding | |
| from src.utils.json_repair import parse_json_object | |
# Keyword -> object name table used by the mock backend. Each keyword is
# looked for as a substring of the lowercased user description; entries are
# checked in insertion order and the first match wins.
KNOWN_OBJECTS = {
    "mug": "coffee mug",
    "cup": "coffee mug",
    "keyboard": "keyboard",
    "shoe": "shoe",
    "book": "book",
    "phone": "phone",
    "lamp": "desk lamp",
    "bottle": "water bottle",
    "bag": "bag",
}
# Model id used when the runtime settings leave ``vision_model_id`` empty.
MINICPM_DEFAULT_MODEL_ID = "openbmb/MiniCPM-V-2_6"
# Accepted spellings of the MiniCPM-V backend name; compared against the
# stripped, lowercased ``vision_backend`` setting.
MINICPM_BACKENDS = {"minicpm-v", "minicpm_v", "minicpmv"}
# Substrings replaced with "[redacted]" when probe text is sanitized.
SENSITIVE_PROBE_MARKERS = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "hf_", ".env")
# Module-level cache written by _load_minicpm_components: the loaded model,
# its tokenizer, and the model id they were loaded for.
_MINICPM_MODEL: Any | None = None
_MINICPM_TOKENIZER: Any | None = None
_MINICPM_MODEL_ID: str | None = None
@dataclass
class VisionRunResult:
    """Outcome of one vision run: the understanding plus any fallback markers.

    Declared as a dataclass so the generated ``__init__`` accepts the two
    fields positionally, which is how this module constructs it
    (``VisionRunResult(understanding, [])``).
    """

    # Structured description of the object produced by the backend that ran.
    object_understanding: ObjectUnderstanding
    # Marker strings describing fallbacks taken during the run; empty when the
    # requested backend produced the result itself.
    fallbacks: list[str]
def understand_object(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Describe the object and return only the understanding, dropping run metadata."""
    run = understand_object_with_metadata(image_path, description)
    return run.object_understanding
def probe_vision_runtime(
    *,
    settings: RuntimeSettings | None = None,
    load_model: bool = True,
) -> dict[str, Any]:
    """Return non-secret runtime diagnostics for hosted MiniCPM-V debugging.

    Each probing stage (torch, transformers, optional model load) runs in its
    own ``try`` so that a failure in one stage is recorded under ``errors``
    and the remaining stages still run.

    Args:
        settings: Runtime settings to inspect; resolved through
            ``get_runtime_settings()`` when omitted.
        load_model: When true and the configured backend is one of
            ``MINICPM_BACKENDS``, also attempt to load the model and tokenizer.

    Returns:
        A dict of import/device flags, load-attempt flags and an ``errors``
        list, passed through ``_sanitize_probe_payload`` before returning.
    """
    current = settings or get_runtime_settings()
    backend = current.vision_backend.strip().lower()
    model_id = current.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    probe: dict[str, Any] = {
        "backend": backend,
        # Only report the default model id when a MiniCPM-V backend is selected;
        # otherwise echo the configured value unchanged.
        "vision_model_id": model_id if backend in MINICPM_BACKENDS else current.vision_model_id,
        "torch_import": False,
        "transformers_import": False,
        "cuda_available": False,
        "device_count": 0,
        "device_name": "",
        "mps_available": False,
        "minicpm_load_attempted": False,
        "minicpm_load_ok": False,
        "errors": [],
    }

    try:
        import torch

        probe["torch_import"] = True
        probe["cuda_available"] = torch.cuda.is_available()
        probe["device_count"] = torch.cuda.device_count()
        if probe["cuda_available"] and probe["device_count"]:
            probe["device_name"] = torch.cuda.get_device_name(0)
        # Guard with getattr because the ``mps`` attribute may be missing.
        probe["mps_available"] = bool(
            getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
        )
    except Exception as exc:
        _add_probe_error(probe, "torch", exc)

    try:
        # Imported only to verify availability; the names are not used here.
        from transformers import AutoModel as _AutoModel  # noqa: F401
        from transformers import AutoTokenizer as _AutoTokenizer  # noqa: F401

        probe["transformers_import"] = True
    except Exception as exc:
        _add_probe_error(probe, "transformers", exc)

    if backend in MINICPM_BACKENDS and load_model:
        probe["minicpm_load_attempted"] = True
        try:
            _load_minicpm_components(model_id)
            probe["minicpm_load_ok"] = True
        except Exception as exc:
            _add_probe_error(probe, "minicpm_load", exc)

    return _sanitize_probe_payload(probe)
def understand_object_with_metadata(
    image_path: str | None,
    description: str,
    *,
    settings: RuntimeSettings | None = None,
) -> VisionRunResult:
    """Run the configured vision backend and report which fallbacks were taken.

    The mock backend is used directly when selected, and also as the fallback
    when the MiniCPM-V path raises or when the backend name is not recognised.
    """
    active = settings or get_runtime_settings()
    backend = active.vision_backend.strip().lower()

    fallbacks: list[str] = []
    if backend == "mock":
        pass  # Mock requested explicitly: nothing to record.
    elif backend in MINICPM_BACKENDS:
        try:
            understanding = _understand_object_minicpm(image_path, description, active)
            return VisionRunResult(understanding, [])
        except Exception as exc:
            # Any failure in the real backend degrades to the mock result.
            _log_vision_fallback("minicpm-v", exc)
            fallbacks.append("vision-fallback-to-mock")
    else:
        fallbacks.append(f"unknown-vision-backend-{backend}-fallback-to-mock")

    return VisionRunResult(_understand_object_mock(image_path, description), fallbacks)
def _understand_object_mock(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Build a deterministic mock understanding from the description and path.

    Confidence is 0.42 when a non-blank description was given and 0.32 otherwise.
    """
    cleaned = description.strip()
    info = ObjectInfo(
        name=_infer_object_name(cleaned, image_path),
        visible_features=_infer_features(cleaned, image_path),
        likely_context=_infer_context(cleaned),
        confidence=0.42 if cleaned else 0.32,
    )
    return ObjectUnderstanding(object=info)
def _understand_object_minicpm(
    image_path: str | None,
    description: str,
    settings: RuntimeSettings,
) -> ObjectUnderstanding:
    """Ask MiniCPM-V to describe the image and validate its JSON reply.

    Raises:
        ValueError: If no image path was supplied.
    """
    if not image_path:
        raise ValueError("MiniCPM-V requires an uploaded image.")

    model, tokenizer = _load_minicpm_components(
        settings.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    )
    user_turn = {
        "role": "user",
        "content": [_load_rgb_image(image_path), _object_understanding_prompt(description)],
    }
    reply = model.chat(image=None, msgs=[user_turn], tokenizer=tokenizer)
    # When chat() returns a tuple, only its first element is used as the answer.
    answer = reply[0] if isinstance(reply, tuple) else reply
    return ObjectUnderstanding.model_validate(parse_json_object(str(answer)))
def _load_minicpm_components(model_id: str) -> tuple[Any, Any]:
    """Return the ``(model, tokenizer)`` pair for *model_id*, loading it on a cache miss.

    The pair is stored in module globals and reused while the same model id is
    requested again.
    """
    global _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID

    cache_hit = (
        _MINICPM_MODEL is not None
        and _MINICPM_TOKENIZER is not None
        and _MINICPM_MODEL_ID == model_id
    )
    if cache_hit:
        return _MINICPM_MODEL, _MINICPM_TOKENIZER

    import torch
    from transformers import AutoModel, AutoTokenizer

    base_kwargs: dict[str, Any] = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
    }
    try:
        loaded = AutoModel.from_pretrained(model_id, **base_kwargs, attn_implementation="sdpa")
    except TypeError:
        # Retry without the attention hint when the loader rejects that keyword.
        loaded = AutoModel.from_pretrained(model_id, **base_kwargs)

    # Device placement preference: CUDA, then MPS (as float16), else stay put.
    if torch.cuda.is_available():
        loaded = loaded.eval().cuda()
    elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        loaded = loaded.eval().to(device="mps", dtype=torch.float16)
    else:
        loaded = loaded.eval()

    text_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID = loaded, text_tokenizer, model_id
    return loaded, text_tokenizer
def _load_rgb_image(image_path: str) -> Any:
    """Open the image at *image_path* and return an RGB copy of it.

    The source image is opened in a ``with`` block so its file handle is
    closed as soon as the converted copy exists, instead of staying open
    until the source object is garbage-collected.
    """
    from PIL import Image

    with Image.open(image_path) as source:
        # convert() reads the pixel data and returns a new image, so the
        # result remains usable after the source file is closed.
        return source.convert("RGB")
| def _object_understanding_prompt(description: str) -> str: | |
| context = description.strip() or "No user description was provided." | |
| return ( | |
| "You are the vision module for Objectverse Diary. Inspect the uploaded everyday object photo. " | |
| "Return only valid JSON with exactly this shape: " | |
| '{"object":{"name":"short object name","visible_features":["feature 1","feature 2","feature 3"],' | |
| '"likely_context":"where this object probably is","confidence":0.0}}. ' | |
| "Use 3 to 5 concrete visible_features. confidence must be a number from 0 to 1. " | |
| f"Optional user context: {context}" | |
| ) | |
| def _log_vision_fallback(backend: str, exc: Exception) -> None: | |
| print( | |
| f"[Objectverse Diary] Vision backend '{backend}' fell back to mock: {type(exc).__name__}", | |
| flush=True, | |
| ) | |
def _add_probe_error(probe: dict[str, Any], stage: str, exc: Exception) -> None:
    """Append a sanitized record of *exc* to ``probe["errors"]``.

    The summary is the exception message, or the class name when the message
    is empty, passed through ``_sanitize_probe_text``.
    """
    bucket = probe["errors"]
    error_type = type(exc).__name__
    detail = str(exc) or error_type
    entry = {
        "stage": stage,
        "type": error_type,
        "summary": _sanitize_probe_text(detail),
    }
    bucket.append(entry)
| def _sanitize_probe_payload(value: Any) -> Any: | |
| if isinstance(value, dict): | |
| return {str(key): _sanitize_probe_payload(item) for key, item in value.items()} | |
| if isinstance(value, list): | |
| return [_sanitize_probe_payload(item) for item in value] | |
| if isinstance(value, str): | |
| return _sanitize_probe_text(value) | |
| return value | |
def _sanitize_probe_text(value: str, *, max_length: int = 240) -> str:
    """Mask sensitive fragments in *value* and cap its length.

    Steps, in order: the current user's home directory path becomes
    ``[home]``; anything matching ``hf_`` followed by token characters
    becomes ``[redacted-token]``; every entry of ``SENSITIVE_PROBE_MARKERS``
    becomes ``[redacted]``; finally the text is truncated.

    Args:
        value: Text to sanitize.
        max_length: Maximum length of the returned text. Longer text is cut
            and ends with ``...``; negative values are treated as 0.

    Returns:
        The sanitized text, never longer than ``max_length`` characters.
    """
    clean = value
    try:
        home = str(Path.home())
    except RuntimeError:
        # Path.home() raises when no home directory can be resolved. This
        # function runs while reporting errors, so it must not fail itself.
        home = ""
    if home:
        clean = clean.replace(home, "[home]")

    clean = re.sub(r"hf_[A-Za-z0-9_-]+", "[redacted-token]", clean)
    for marker in SENSITIVE_PROBE_MARKERS:
        clean = clean.replace(marker, "[redacted]")

    limit = max(max_length, 0)
    if len(clean) > limit:
        # Clamp the kept prefix at 0 so limits below 3 cannot turn into a
        # negative slice; the final slice keeps the result within the limit.
        return (clean[: max(limit - 3, 0)] + "...")[:limit]
    return clean
def _infer_object_name(description: str, image_path: str | None) -> str:
    """Guess an object name from the description, else from the image file name.

    The first ``KNOWN_OBJECTS`` keyword found in the lowercased description
    wins. Otherwise the file stem (underscores and hyphens turned into spaces,
    capped at 40 characters) is used, and a generic name is the last resort.
    """
    haystack = description.lower()
    matched = next(
        (label for keyword, label in KNOWN_OBJECTS.items() if keyword in haystack),
        None,
    )
    if matched is not None:
        return matched

    if image_path:
        stem = Path(image_path).stem
        for separator in ("_", "-"):
            stem = stem.replace(separator, " ")
        stem = stem.strip()
        if stem:
            return stem[:40]

    return "mysterious everyday object"
| def _infer_features(description: str, image_path: str | None) -> list[str]: | |
| features: list[str] = [] | |
| lowered = description.lower() | |
| for word in ["old", "new", "cracked", "white", "black", "dusty", "metal", "ceramic", "plastic"]: | |
| if word in lowered: | |
| features.append(word) | |
| if image_path: | |
| features.append("uploaded photo provided") | |
| if description: | |
| features.append("user-supplied description") | |
| return features[:5] or ["ordinary surface", "unknown material", "quietly suspicious"] | |
| def _infer_context(description: str) -> str: | |
| lowered = description.lower() | |
| if "desk" in lowered: | |
| return "developer desk" | |
| if "kitchen" in lowered: | |
| return "kitchen counter" | |
| if "bedroom" in lowered: | |
| return "bedroom shelf" | |
| if "office" in lowered: | |
| return "office corner" | |
| return "everyday human environment" | |