"""Vision LLM OCR for mnemonic schema analysis. Sends the full schema image to an OpenAI-compatible vision API and receives structured JSON with all detected elements. Configure via environment variables: VISION_API_URL — base URL (e.g. http://localhost:8000/v1) VISION_API_KEY — API key (optional) VISION_MODEL — model name (e.g. llava-v1.6-mistral-7b) """ from __future__ import annotations import base64 import json import logging import re import urllib.error import urllib.request from typing import Any import cv2 import numpy as np logger = logging.getLogger(__name__) _SYSTEM_PROMPT = ( "Ты — эксперт по анализу промышленных мнемосхем (SCADA/HMI). " "Тебе дано изображение мнемосхемы. Твоя задача — найти и описать ВСЕ элементы на схеме." ) _USER_PROMPT = """\ Проанализируй это изображение промышленной мнемосхемы и верни JSON со ВСЕМИ элементами. Верни ТОЛЬКО валидный JSON (без markdown, без ```): { "title": "название схемы из шапки", "elements": [ { "type": "один из: widget, circle_uid, text, static_equipment, table, button, group_frame, arrow_pipe", "uid": "число — номер параметра из красного кружка или красные цифры в углу ячейки. Пустая строка если нет", "text": "текст внутри элемента", "x": 0, "y": 0, "width": 0, "height": 0, "description": "краткое описание элемента" } ] } Правила определения типов: 1. "circle_uid" — красный кружок с чёрными цифрами внутри (номер параметра) 2. "widget" — прямоугольник с числовым значением (показания датчика), часто с красной рамкой. Красные мелкие цифры в правом верхнем углу = uid 3. "text" — текстовая подпись (название оборудования, единицы измерения, заголовки секций) 4. "static_equipment" — изображение оборудования (насос, задвижка, вентилятор, резервуар, компрессор, циклон) 5. "table" — таблица с несколькими параметрами в строках/столбцах 6. "button" — кнопка интерфейса (Главный экран, Легенда и т.д.) 7. "group_frame" — рамка группы объектов (пунктирная или сплошная рамка вокруг секции) 8. "arrow_pipe" — стрелка или труба (линия потока материала/газа) Координаты x, y, width, height — в пикселях изображения. Найди ВСЕ элементы, особенно мелкие красные кружки с номерами и ячейки значений.""" def _encode_image_base64(image_bgr: np.ndarray) -> str: """Encode BGR numpy image to base64 PNG string.""" success, buffer = cv2.imencode(".png", image_bgr) if not success: raise ValueError("Failed to encode image to PNG") return base64.b64encode(buffer.tobytes()).decode("ascii") def _extract_json_from_response(text: str) -> dict[str, Any]: """Extract JSON object from LLM response text. Handles: markdown fences, truncated output (missing closing braces), trailing commas before closing brackets. """ cleaned = text.strip() fence_match = re.search(r"```(?:json)?\s*\n?(.*?)```", cleaned, re.DOTALL) if fence_match: cleaned = fence_match.group(1).strip() brace_start = cleaned.find("{") if brace_start < 0: raise json.JSONDecodeError("No JSON object found", cleaned, 0) brace_end = cleaned.rfind("}") if brace_end > brace_start: cleaned = cleaned[brace_start:brace_end + 1] else: # Truncated — try to repair by closing open structures cleaned = cleaned[brace_start:] # Fix trailing commas: ,] or ,} cleaned = re.sub(r",\s*([}\]])", r"\1", cleaned) try: return json.loads(cleaned) except json.JSONDecodeError: pass # Truncated JSON — find last complete element in "elements" array # Try progressively shorter substrings last_complete = cleaned.rfind("}") while last_complete > 0: attempt = cleaned[:last_complete + 1] # Count open/close braces and brackets open_braces = attempt.count("{") - attempt.count("}") open_brackets = attempt.count("[") - attempt.count("]") # Close everything attempt += "]" * open_brackets + "}" * open_braces attempt = re.sub(r",\s*([}\]])", r"\1", attempt) try: return json.loads(attempt) except json.JSONDecodeError: pass last_complete = cleaned.rfind("}", 0, last_complete) raise json.JSONDecodeError("Could not parse truncated JSON", cleaned[:200], 0) def analyze_schema_with_vision( image_bgr: np.ndarray, *, api_url: str, api_key: str = "", model: str = "", max_tokens: int = 16384, timeout_seconds: float = 300.0, ) -> dict[str, Any]: """Send schema image to vision LLM and get structured element list. Args: image_bgr: BGR numpy image of the schema. api_url: OpenAI-compatible API base URL (e.g. http://localhost:8000/v1). api_key: API key (empty string if not required). model: Model name. max_tokens: Maximum response tokens. timeout_seconds: Request timeout. Returns: Parsed dict with 'title' and 'elements' list. """ b64_image = _encode_image_base64(image_bgr) img_h, img_w = image_bgr.shape[:2] endpoint = api_url.rstrip("/") + "/chat/completions" payload = { "model": model, "messages": [ { "role": "system", "content": _SYSTEM_PROMPT, }, { "role": "user", "content": [ {"type": "text", "text": _USER_PROMPT}, { "type": "image_url", "image_url": { "url": f"data:image/png;base64,{b64_image}", }, }, ], }, ], "max_tokens": max_tokens, "temperature": 0, } headers = { "Content-Type": "application/json", "Accept": "application/json", } if api_key: headers["Authorization"] = f"Bearer {api_key}" body = json.dumps(payload).encode("utf-8") request = urllib.request.Request( endpoint, data=body, headers=headers, method="POST", ) logger.info( "Vision OCR: sending %dx%d image to %s (model=%s)", img_w, img_h, endpoint, model, ) try: with urllib.request.urlopen(request, timeout=timeout_seconds) as response: raw = response.read() except urllib.error.HTTPError as exc: detail = exc.read().decode("utf-8", errors="ignore")[:500] raise RuntimeError(f"Vision API returned {exc.code}: {detail}") from exc except urllib.error.URLError as exc: raise RuntimeError(f"Vision API unavailable: {exc.reason}") from exc except Exception as exc: raise RuntimeError(f"Vision API call failed: {exc}") from exc try: api_result = json.loads(raw.decode("utf-8")) except Exception as exc: raise RuntimeError("Vision API returned invalid JSON response") from exc choices = api_result.get("choices") or [] if not choices: raise RuntimeError("Vision API returned no choices") message_content = str( choices[0].get("message", {}).get("content", "") ).strip() if not message_content: raise RuntimeError("Vision API returned empty content") try: result = _extract_json_from_response(message_content) except json.JSONDecodeError as exc: logger.warning("Failed to parse vision response as JSON: %s", exc) logger.debug("Raw response: %s", message_content[:1000]) raise RuntimeError( f"Vision API returned non-JSON response: {message_content[:200]}" ) from exc elements = result.get("elements") or [] logger.info( "Vision OCR: received %d elements, title='%s'", len(elements), str(result.get("title", ""))[:50], ) return { "title": str(result.get("title") or "").strip(), "elements": [ _normalize_element(elem, img_w, img_h) for elem in elements if isinstance(elem, dict) ], "imageWidth": img_w, "imageHeight": img_h, } def _normalize_element( elem: dict[str, Any], img_w: int, img_h: int, ) -> dict[str, Any]: """Normalize and validate element coordinates.""" x = max(0, min(img_w, int(float(elem.get("x") or 0)))) y = max(0, min(img_h, int(float(elem.get("y") or 0)))) w = max(1, min(img_w - x, int(float(elem.get("width") or 24)))) h = max(1, min(img_h - y, int(float(elem.get("height") or 24)))) return { "type": str(elem.get("type") or "text").strip(), "uid": str(elem.get("uid") or "").strip(), "text": str(elem.get("text") or "").strip(), "x": x, "y": y, "width": w, "height": h, "description": str(elem.get("description") or "").strip(), }