Spaces:

build-small-hackathon
/

ObjectverseDiary

Running on Zero

File size: 10,099 Bytes

"""Object understanding runtime for mock and MiniCPM-V backends."""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from src.config import RuntimeSettings, get_runtime_settings
from src.models.schema import ObjectInfo, ObjectUnderstanding
from src.utils.json_repair import parse_json_object


# Keyword -> canonical object name. _infer_object_name scans the lowercased
# user description for each keyword (in insertion order) and returns the
# mapped name for the first one found.
KNOWN_OBJECTS = {
    "mug": "coffee mug",
    "cup": "coffee mug",
    "keyboard": "keyboard",
    "shoe": "shoe",
    "book": "book",
    "phone": "phone",
    "lamp": "desk lamp",
    "bottle": "water bottle",
    "bag": "bag",
}

# Model id used whenever the runtime settings do not provide vision_model_id.
MINICPM_DEFAULT_MODEL_ID = "openbmb/MiniCPM-V-2_6"
# Accepted spellings of the MiniCPM-V backend; compared against the stripped,
# lowercased vision_backend setting.
MINICPM_BACKENDS = {"minicpm-v", "minicpm_v", "minicpmv"}
# Substrings that _sanitize_probe_text replaces with "[redacted]" in probe output.
SENSITIVE_PROBE_MARKERS = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "hf_", ".env")

# Module-level cache populated by _load_minicpm_components so the model and
# tokenizer are reused across calls that request the same model id.
_MINICPM_MODEL: Any | None = None
_MINICPM_TOKENIZER: Any | None = None
_MINICPM_MODEL_ID: str | None = None


@dataclass(frozen=True)
class VisionRunResult:
    """Result of one vision run: the object understanding plus fallback notes."""

    # Structured understanding returned by whichever backend produced the
    # answer (the mock backend when a fallback happened).
    object_understanding: ObjectUnderstanding
    # Fallback labels recorded by understand_object_with_metadata; an empty
    # list means the requested backend was used without falling back.
    fallbacks: list[str]


def understand_object(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Run object understanding and return only the understanding itself.

    Delegates to ``understand_object_with_metadata`` with default settings
    and drops the fallback metadata from its result.
    """
    run_result = understand_object_with_metadata(image_path, description)
    return run_result.object_understanding


def probe_vision_runtime(
    *,
    settings: RuntimeSettings | None = None,
    load_model: bool = True,
) -> dict[str, Any]:
    """Return non-secret runtime diagnostics for hosted MiniCPM-V debugging.

    Args:
        settings: Runtime settings to inspect. When omitted, the value of
            ``get_runtime_settings()`` is used.
        load_model: When true and the configured backend is one of
            ``MINICPM_BACKENDS``, also try to load the model and tokenizer.

    Returns:
        A dict with the backend name, model id, import status for torch and
        transformers, CUDA/MPS availability, the model-load outcome and a list
        of collected errors. The whole payload is passed through
        ``_sanitize_probe_payload`` before being returned.

    Every stage is best-effort: an exception is recorded under ``"errors"``
    via ``_add_probe_error`` and the probe continues with the next stage.
    """
    current = settings or get_runtime_settings()
    backend = current.vision_backend.strip().lower()
    model_id = current.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    probe: dict[str, Any] = {
        "backend": backend,
        # Only report the default model id when the MiniCPM backend is selected;
        # otherwise echo whatever the settings contain.
        "vision_model_id": model_id if backend in MINICPM_BACKENDS else current.vision_model_id,
        "torch_import": False,
        "transformers_import": False,
        "cuda_available": False,
        "device_count": 0,
        "device_name": "",
        "mps_available": False,
        "minicpm_load_attempted": False,
        "minicpm_load_ok": False,
        "errors": [],
    }

    # Stage 1: torch import and accelerator discovery.
    try:
        import torch

        probe["torch_import"] = True
        probe["cuda_available"] = torch.cuda.is_available()
        probe["device_count"] = torch.cuda.device_count()
        if probe["cuda_available"] and probe["device_count"]:
            probe["device_name"] = torch.cuda.get_device_name(0)
        # torch.backends.mps may be absent, hence the getattr guard.
        probe["mps_available"] = bool(
            getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
        )
    except Exception as exc:
        _add_probe_error(probe, "torch", exc)

    # Stage 2: confirm the transformers classes used by the loader can be imported.
    try:
        from transformers import AutoModel as _AutoModel  # noqa: F401
        from transformers import AutoTokenizer as _AutoTokenizer  # noqa: F401

        probe["transformers_import"] = True
    except Exception as exc:
        _add_probe_error(probe, "transformers", exc)

    # Stage 3: optional model load, only for the MiniCPM-V backend.
    if backend in MINICPM_BACKENDS and load_model:
        probe["minicpm_load_attempted"] = True
        try:
            _load_minicpm_components(model_id)
            probe["minicpm_load_ok"] = True
        except Exception as exc:
            _add_probe_error(probe, "minicpm_load", exc)

    return _sanitize_probe_payload(probe)


def understand_object_with_metadata(
    image_path: str | None,
    description: str,
    *,
    settings: RuntimeSettings | None = None,
) -> VisionRunResult:
    """Run the configured vision backend and report any fallback that occurred.

    The ``mock`` backend is used directly. A MiniCPM-V backend is attempted
    first and, on any exception, logged and replaced by the mock result with
    a ``vision-fallback-to-mock`` note. Any other backend name yields the
    mock result with an ``unknown-vision-backend-...`` note.
    """
    runtime = settings or get_runtime_settings()
    backend_key = runtime.vision_backend.strip().lower()

    def mock_result(notes: list[str]) -> VisionRunResult:
        # Shared construction of the deterministic mock answer.
        return VisionRunResult(_understand_object_mock(image_path, description), notes)

    if backend_key == "mock":
        return mock_result([])

    if backend_key not in MINICPM_BACKENDS:
        return mock_result([f"unknown-vision-backend-{backend_key}-fallback-to-mock"])

    try:
        understanding = _understand_object_minicpm(image_path, description, runtime)
        return VisionRunResult(understanding, [])
    except Exception as exc:
        _log_vision_fallback("minicpm-v", exc)
        return mock_result(["vision-fallback-to-mock"])


def _understand_object_mock(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Build a deterministic object understanding from the description and file name.

    No model is involved: name, features and context come from the
    ``_infer_*`` helpers, and the confidence is a fixed value that is
    slightly higher when a non-blank description was supplied.
    """
    text = description.strip()
    inferred_name = _infer_object_name(text, image_path)
    inferred_features = _infer_features(text, image_path)
    inferred_context = _infer_context(text)

    if text:
        confidence = 0.42
    else:
        confidence = 0.32

    info = ObjectInfo(
        name=inferred_name,
        visible_features=inferred_features,
        likely_context=inferred_context,
        confidence=confidence,
    )
    return ObjectUnderstanding(object=info)


def _understand_object_minicpm(
    image_path: str | None,
    description: str,
    settings: RuntimeSettings,
) -> ObjectUnderstanding:
    """Ask MiniCPM-V to describe the image and validate its JSON answer.

    Raises:
        ValueError: If no image path was supplied.

    Any error from model loading, image loading, chat, JSON parsing or
    schema validation propagates to the caller.
    """
    if not image_path:
        raise ValueError("MiniCPM-V requires an uploaded image.")

    resolved_model_id = settings.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    model, tokenizer = _load_minicpm_components(resolved_model_id)

    # A single user turn whose content is the image followed by the text prompt.
    user_turn = {
        "role": "user",
        "content": [_load_rgb_image(image_path), _object_understanding_prompt(description)],
    }
    response = model.chat(image=None, msgs=[user_turn], tokenizer=tokenizer)

    # When chat returns a tuple, only its first element is treated as the answer.
    answer = response[0] if isinstance(response, tuple) else response

    return ObjectUnderstanding.model_validate(parse_json_object(str(answer)))


def _load_minicpm_components(model_id: str) -> tuple[Any, Any]:
    """Return the ``(model, tokenizer)`` pair for ``model_id``, loading it if needed.

    The pair is cached in module globals and reused while the requested
    model id matches the cached one. The model is loaded in bfloat16, put in
    eval mode, and moved to CUDA when available, otherwise to MPS (as
    float16) when available, otherwise left where it was loaded.
    """
    global _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID

    cache_hit = (
        _MINICPM_MODEL is not None
        and _MINICPM_TOKENIZER is not None
        and _MINICPM_MODEL_ID == model_id
    )
    if cache_hit:
        return _MINICPM_MODEL, _MINICPM_TOKENIZER

    # Heavy dependencies are imported lazily so the module imports without them.
    import torch
    from transformers import AutoModel, AutoTokenizer

    base_kwargs: dict[str, Any] = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
    }
    try:
        loaded = AutoModel.from_pretrained(model_id, **base_kwargs, attn_implementation="sdpa")
    except TypeError:
        # Retry without attn_implementation when the loader rejects that keyword.
        loaded = AutoModel.from_pretrained(model_id, **base_kwargs)

    if torch.cuda.is_available():
        loaded = loaded.eval().cuda()
    else:
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend and torch.backends.mps.is_available():
            loaded = loaded.eval().to(device="mps", dtype=torch.float16)
        else:
            loaded = loaded.eval()

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Publish to the cache only after both pieces loaded successfully.
    _MINICPM_MODEL = loaded
    _MINICPM_TOKENIZER = loaded_tokenizer
    _MINICPM_MODEL_ID = model_id
    return loaded, loaded_tokenizer


def _load_rgb_image(image_path: str) -> Any:
    """Open the image at ``image_path`` and return it converted to RGB.

    The source image is opened in a ``with`` block so its file handle is
    released as soon as the converted image has been produced, instead of
    being left to garbage collection. The value returned is the result of
    ``convert("RGB")``, not the image object that is closed on exit.

    Errors raised by Pillow while opening or decoding the file propagate.
    """
    from PIL import Image

    with Image.open(image_path) as source:
        return source.convert("RGB")


def _object_understanding_prompt(description: str) -> str:
    context = description.strip() or "No user description was provided."
    return (
        "You are the vision module for Objectverse Diary. Inspect the uploaded everyday object photo. "
        "Return only valid JSON with exactly this shape: "
        '{"object":{"name":"short object name","visible_features":["feature 1","feature 2","feature 3"],'
        '"likely_context":"where this object probably is","confidence":0.0}}. '
        "Use 3 to 5 concrete visible_features. confidence must be a number from 0 to 1. "
        f"Optional user context: {context}"
    )


def _log_vision_fallback(backend: str, exc: Exception) -> None:
    print(
        f"[Objectverse Diary] Vision backend '{backend}' fell back to mock: {type(exc).__name__}",
        flush=True,
    )


def _add_probe_error(probe: dict[str, Any], stage: str, exc: Exception) -> None:
    """Append a sanitized record of ``exc`` to ``probe["errors"]``.

    The summary is the exception message, or the exception class name when
    the message is empty, passed through ``_sanitize_probe_text``.
    """
    error_type = type(exc).__name__
    message = str(exc)
    if not message:
        message = error_type

    record = {
        "stage": stage,
        "type": error_type,
        "summary": _sanitize_probe_text(message),
    }
    probe["errors"].append(record)


def _sanitize_probe_payload(value: Any) -> Any:
    """Recursively sanitize every string inside a probe payload.

    Dicts are rebuilt with stringified keys and sanitized values, lists are
    rebuilt element by element, strings go through ``_sanitize_probe_text``,
    and any other value is returned unchanged.
    """
    if isinstance(value, str):
        return _sanitize_probe_text(value)

    if isinstance(value, list):
        return list(map(_sanitize_probe_payload, value))

    if isinstance(value, dict):
        cleaned: dict[str, Any] = {}
        for key, item in value.items():
            cleaned[str(key)] = _sanitize_probe_payload(item)
        return cleaned

    return value


def _sanitize_probe_text(value: str, *, max_length: int = 240) -> str:
    """Redact sensitive fragments from ``value`` and cap its length.

    Steps, in order: the current user's home directory becomes ``[home]``,
    anything shaped like ``hf_<token chars>`` becomes ``[redacted-token]``,
    each entry of ``SENSITIVE_PROBE_MARKERS`` becomes ``[redacted]``, and
    text longer than ``max_length`` is cut and suffixed with ``...``.
    """
    home_dir = str(Path.home())
    scrubbed = value.replace(home_dir, "[home]")
    scrubbed = re.sub(r"hf_[A-Za-z0-9_-]+", "[redacted-token]", scrubbed)

    for sensitive in SENSITIVE_PROBE_MARKERS:
        scrubbed = scrubbed.replace(sensitive, "[redacted]")

    if len(scrubbed) <= max_length:
        return scrubbed

    # Leave room for the three-character ellipsis.
    keep = max_length - 3
    return scrubbed[:keep] + "..."


def _infer_object_name(description: str, image_path: str | None) -> str:
    """Guess an object name from the description, else from the image file name.

    The first ``KNOWN_OBJECTS`` keyword contained in the lowercased
    description wins. Otherwise the image file stem (underscores and hyphens
    turned into spaces, at most 40 characters) is used. A generic
    placeholder name is the last resort.
    """
    text = description.lower()
    known_name = next(
        (label for keyword, label in KNOWN_OBJECTS.items() if keyword in text),
        None,
    )
    if known_name is not None:
        return known_name

    if image_path:
        raw_stem = Path(image_path).stem
        readable = raw_stem.replace("_", " ").replace("-", " ").strip()
        if readable:
            return readable[:40]

    return "mysterious everyday object"


def _infer_features(description: str, image_path: str | None) -> list[str]:
    features: list[str] = []
    lowered = description.lower()

    for word in ["old", "new", "cracked", "white", "black", "dusty", "metal", "ceramic", "plastic"]:
        if word in lowered:
            features.append(word)

    if image_path:
        features.append("uploaded photo provided")

    if description:
        features.append("user-supplied description")

    return features[:5] or ["ordinary surface", "unknown material", "quietly suspicious"]


def _infer_context(description: str) -> str:
    lowered = description.lower()
    if "desk" in lowered:
        return "developer desk"
    if "kitchen" in lowered:
        return "kitchen counter"
    if "bedroom" in lowered:
        return "bedroom shelf"
    if "office" in lowered:
        return "office corner"
    return "everyday human environment"