File size: 10,099 Bytes
e20e3d9
bc02199
 
 
d30bd8e
e20e3d9
bc02199
e20e3d9
bc02199
e20e3d9
bc02199
e20e3d9
bc02199
 
 
 
 
 
 
 
 
 
 
 
 
 
e20e3d9
 
d30bd8e
e20e3d9
 
 
 
 
 
 
 
 
 
 
bc02199
 
e20e3d9
 
 
 
d30bd8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e20e3d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc02199
 
 
 
 
 
 
 
 
 
 
 
 
 
e20e3d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d30bd8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bc02199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
"""Object understanding runtime for mock and MiniCPM-V backends."""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from src.config import RuntimeSettings, get_runtime_settings
from src.models.schema import ObjectInfo, ObjectUnderstanding
from src.utils.json_repair import parse_json_object


# Keyword -> display name table used by the mock backend. _infer_object_name
# returns the name of the first keyword (in insertion order) that occurs as a
# substring of the lowercased description.
KNOWN_OBJECTS = {
    "mug": "coffee mug",
    "cup": "coffee mug",
    "keyboard": "keyboard",
    "shoe": "shoe",
    "book": "book",
    "phone": "phone",
    "lamp": "desk lamp",
    "bottle": "water bottle",
    "bag": "bag",
}

# Model id used whenever the runtime settings do not provide vision_model_id.
MINICPM_DEFAULT_MODEL_ID = "openbmb/MiniCPM-V-2_6"
# Accepted spellings of the MiniCPM-V backend; compared against the stripped,
# lowercased vision_backend setting.
MINICPM_BACKENDS = {"minicpm-v", "minicpm_v", "minicpmv"}
# Substrings that _sanitize_probe_text replaces with "[redacted]" in probe output.
SENSITIVE_PROBE_MARKERS = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "hf_", ".env")

# Module-level cache written by _load_minicpm_components. The cached model and
# tokenizer are reused only while the requested model id equals _MINICPM_MODEL_ID.
_MINICPM_MODEL: Any | None = None
_MINICPM_TOKENIZER: Any | None = None
_MINICPM_MODEL_ID: str | None = None


@dataclass(frozen=True)
class VisionRunResult:
    """Object understanding paired with the fallback markers recorded while producing it."""

    # Understanding produced by the selected backend or by the mock fallback.
    object_understanding: ObjectUnderstanding
    # Fallback marker strings; understand_object_with_metadata passes an empty
    # list when no fallback happened.
    fallbacks: list[str]


def understand_object(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Run the configured vision backend and return only the object understanding.

    The fallback markers collected by ``understand_object_with_metadata`` are
    discarded; callers that need them should call that function directly.
    """
    run_result = understand_object_with_metadata(image_path, description)
    return run_result.object_understanding


def probe_vision_runtime(
    *,
    settings: RuntimeSettings | None = None,
    load_model: bool = True,
) -> dict[str, Any]:
    """Return non-secret runtime diagnostics for hosted MiniCPM-V debugging.

    Args:
        settings: Runtime settings to inspect. When omitted (or falsy),
            ``get_runtime_settings()`` supplies them.
        load_model: When true and the configured backend is one of
            ``MINICPM_BACKENDS``, attempt to load the MiniCPM-V model and
            tokenizer and record whether that succeeded.

    Returns:
        A dict with the normalized backend name, the model id, torch /
        transformers import flags, CUDA and MPS availability, the MiniCPM load
        flags, and an ``errors`` list of ``{"stage", "type", "summary"}``
        entries. The whole payload is passed through
        ``_sanitize_probe_payload`` before it is returned.
    """
    current = settings or get_runtime_settings()
    backend = current.vision_backend.strip().lower()
    model_id = current.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    probe: dict[str, Any] = {
        "backend": backend,
        # Only report the default model id when a MiniCPM backend would use it.
        "vision_model_id": model_id if backend in MINICPM_BACKENDS else current.vision_model_id,
        "torch_import": False,
        "transformers_import": False,
        "cuda_available": False,
        "device_count": 0,
        "device_name": "",
        "mps_available": False,
        "minicpm_load_attempted": False,
        "minicpm_load_ok": False,
        "errors": [],
    }

    # Every stage is best-effort: a failure is recorded in probe["errors"]
    # and the remaining stages still run.
    try:
        import torch

        probe["torch_import"] = True
        probe["cuda_available"] = torch.cuda.is_available()
        probe["device_count"] = torch.cuda.device_count()
        if probe["cuda_available"] and probe["device_count"]:
            probe["device_name"] = torch.cuda.get_device_name(0)
        probe["mps_available"] = bool(
            getattr(torch.backends, "mps", None) and torch.backends.mps.is_available()
        )
    except Exception as exc:
        _add_probe_error(probe, "torch", exc)

    try:
        # Imported only to confirm that the names resolve.
        from transformers import AutoModel as _AutoModel  # noqa: F401
        from transformers import AutoTokenizer as _AutoTokenizer  # noqa: F401

        probe["transformers_import"] = True
    except Exception as exc:
        _add_probe_error(probe, "transformers", exc)

    if backend in MINICPM_BACKENDS and load_model:
        probe["minicpm_load_attempted"] = True
        try:
            _load_minicpm_components(model_id)
            probe["minicpm_load_ok"] = True
        except Exception as exc:
            _add_probe_error(probe, "minicpm_load", exc)

    return _sanitize_probe_payload(probe)


def understand_object_with_metadata(
    image_path: str | None,
    description: str,
    *,
    settings: RuntimeSettings | None = None,
) -> VisionRunResult:
    """Run the configured vision backend and report any fallback that occurred.

    The ``mock`` backend returns the mock understanding with no markers. A
    MiniCPM-V backend that raises is logged and replaced by the mock result
    tagged ``vision-fallback-to-mock``. Any other backend name yields the mock
    result tagged with an ``unknown-vision-backend-...`` marker.
    """
    active = settings or get_runtime_settings()
    backend_name = active.vision_backend.strip().lower()

    if backend_name in MINICPM_BACKENDS:
        try:
            understanding = _understand_object_minicpm(image_path, description, active)
            markers: list[str] = []
        except Exception as exc:
            _log_vision_fallback("minicpm-v", exc)
            understanding = _understand_object_mock(image_path, description)
            markers = ["vision-fallback-to-mock"]
        return VisionRunResult(understanding, markers)

    mock_understanding = _understand_object_mock(image_path, description)
    if backend_name == "mock":
        return VisionRunResult(mock_understanding, [])
    return VisionRunResult(
        mock_understanding,
        [f"unknown-vision-backend-{backend_name}-fallback-to-mock"],
    )


def _understand_object_mock(image_path: str | None, description: str) -> ObjectUnderstanding:
    """Build a deterministic object understanding from the description and file name.

    Confidence is 0.42 when the stripped description is non-empty, otherwise 0.32.
    """
    text = description.strip()
    if text:
        confidence = 0.42
    else:
        confidence = 0.32

    info = ObjectInfo(
        name=_infer_object_name(text, image_path),
        visible_features=_infer_features(text, image_path),
        likely_context=_infer_context(text),
        confidence=confidence,
    )
    return ObjectUnderstanding(object=info)


def _understand_object_minicpm(
    image_path: str | None,
    description: str,
    settings: RuntimeSettings,
) -> ObjectUnderstanding:
    """Ask MiniCPM-V to describe the image and validate its JSON reply.

    Raises:
        ValueError: If no image path was supplied.
    """
    if not image_path:
        raise ValueError("MiniCPM-V requires an uploaded image.")

    model, tokenizer = _load_minicpm_components(
        settings.vision_model_id or MINICPM_DEFAULT_MODEL_ID
    )
    user_turn = {
        "role": "user",
        "content": [_load_rgb_image(image_path), _object_understanding_prompt(description)],
    }
    reply = model.chat(image=None, msgs=[user_turn], tokenizer=tokenizer)
    # When chat() returns a tuple, only its first element is used as the reply.
    reply_text = str(reply[0] if isinstance(reply, tuple) else reply)

    return ObjectUnderstanding.model_validate(parse_json_object(reply_text))


def _load_minicpm_components(model_id: str) -> tuple[Any, Any]:
    """Return the ``(model, tokenizer)`` pair for ``model_id``, loading it if needed.

    The pair is kept in module globals and reused while the requested id
    matches the cached one. The model is put in eval mode and moved to CUDA
    when available, otherwise to MPS (as float16) when available, otherwise
    left where ``from_pretrained`` put it.
    """
    global _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID

    cache_ready = _MINICPM_MODEL is not None and _MINICPM_TOKENIZER is not None
    if cache_ready and _MINICPM_MODEL_ID == model_id:
        return _MINICPM_MODEL, _MINICPM_TOKENIZER

    import torch
    from transformers import AutoModel, AutoTokenizer

    base_kwargs: dict[str, Any] = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
    }
    try:
        loaded = AutoModel.from_pretrained(model_id, **base_kwargs, attn_implementation="sdpa")
    except TypeError:
        # Retry without the attention hint when the loader rejects the argument.
        loaded = AutoModel.from_pretrained(model_id, **base_kwargs)

    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        loaded = loaded.eval().cuda()
    elif mps_backend and torch.backends.mps.is_available():
        loaded = loaded.eval().to(device="mps", dtype=torch.float16)
    else:
        loaded = loaded.eval()

    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Publish to the cache only after both components loaded successfully.
    _MINICPM_MODEL, _MINICPM_TOKENIZER, _MINICPM_MODEL_ID = loaded, loaded_tokenizer, model_id
    return loaded, loaded_tokenizer


def _load_rgb_image(image_path: str) -> Any:
    """Open the image at ``image_path`` and return an RGB copy of it.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        A new PIL image in ``"RGB"`` mode.
    """
    from PIL import Image

    # Close the source image deterministically instead of relying on garbage
    # collection; convert() returns a separate image that stays usable after
    # the source is closed.
    with Image.open(image_path) as source:
        return source.convert("RGB")


def _object_understanding_prompt(description: str) -> str:
    context = description.strip() or "No user description was provided."
    return (
        "You are the vision module for Objectverse Diary. Inspect the uploaded everyday object photo. "
        "Return only valid JSON with exactly this shape: "
        '{"object":{"name":"short object name","visible_features":["feature 1","feature 2","feature 3"],'
        '"likely_context":"where this object probably is","confidence":0.0}}. '
        "Use 3 to 5 concrete visible_features. confidence must be a number from 0 to 1. "
        f"Optional user context: {context}"
    )


def _log_vision_fallback(backend: str, exc: Exception) -> None:
    print(
        f"[Objectverse Diary] Vision backend '{backend}' fell back to mock: {type(exc).__name__}",
        flush=True,
    )


def _add_probe_error(probe: dict[str, Any], stage: str, exc: Exception) -> None:
    """Append a sanitized record of ``exc`` to ``probe["errors"]``.

    The summary is the exception message, or the exception class name when
    the message is empty.
    """
    error_type = type(exc).__name__
    detail = str(exc) or error_type
    entry = {
        "stage": stage,
        "type": error_type,
        "summary": _sanitize_probe_text(detail),
    }
    probe["errors"].append(entry)


def _sanitize_probe_payload(value: Any) -> Any:
    if isinstance(value, dict):
        return {str(key): _sanitize_probe_payload(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_sanitize_probe_payload(item) for item in value]
    if isinstance(value, str):
        return _sanitize_probe_text(value)
    return value


def _sanitize_probe_text(value: str, *, max_length: int = 240) -> str:
    """Redact the home directory, ``hf_`` tokens and sensitive markers, then truncate.

    Text longer than ``max_length`` is cut so that the result, including a
    trailing ``...``, is ``max_length`` characters long.
    """
    home_dir = str(Path.home())
    scrubbed = value.replace(home_dir, "[home]")
    # Redact whole tokens first; the marker pass below would otherwise only
    # strip the "hf_" prefix and leave the rest of the token in place.
    scrubbed = re.sub(r"hf_[A-Za-z0-9_-]+", "[redacted-token]", scrubbed)
    for marker in SENSITIVE_PROBE_MARKERS:
        scrubbed = scrubbed.replace(marker, "[redacted]")

    if len(scrubbed) <= max_length:
        return scrubbed
    return f"{scrubbed[: max_length - 3]}..."


def _infer_object_name(description: str, image_path: str | None) -> str:
    """Guess an object name from the description, falling back to the image file name.

    The first ``KNOWN_OBJECTS`` keyword found in the lowercased description
    wins. Otherwise the image file stem (underscores and hyphens turned into
    spaces, at most 40 characters) is used, and finally a generic placeholder.
    """
    text = description.lower()
    matched = next(
        (label for keyword, label in KNOWN_OBJECTS.items() if keyword in text),
        None,
    )
    if matched is not None:
        return matched

    if image_path:
        separators_to_spaces = str.maketrans("_-", "  ")
        readable_stem = Path(image_path).stem.translate(separators_to_spaces).strip()
        if readable_stem:
            return readable_stem[:40]

    return "mysterious everyday object"


def _infer_features(description: str, image_path: str | None) -> list[str]:
    features: list[str] = []
    lowered = description.lower()

    for word in ["old", "new", "cracked", "white", "black", "dusty", "metal", "ceramic", "plastic"]:
        if word in lowered:
            features.append(word)

    if image_path:
        features.append("uploaded photo provided")

    if description:
        features.append("user-supplied description")

    return features[:5] or ["ordinary surface", "unknown material", "quietly suspicious"]


def _infer_context(description: str) -> str:
    lowered = description.lower()
    if "desk" in lowered:
        return "developer desk"
    if "kitchen" in lowered:
        return "kitchen counter"
    if "bedroom" in lowered:
        return "bedroom shelf"
    if "office" in lowered:
        return "office corner"
    return "everyday human environment"