Spaces:
Paused
Paused
| """Vision LLM OCR for mnemonic schema analysis. | |
| Sends the full schema image to an OpenAI-compatible vision API | |
| and receives structured JSON with all detected elements. | |
| Configure via environment variables: | |
| VISION_API_URL — base URL (e.g. http://localhost:8000/v1) | |
| VISION_API_KEY — API key (optional) | |
| VISION_MODEL — model name (e.g. llava-v1.6-mistral-7b) | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| import json | |
| import logging | |
| import re | |
| import urllib.error | |
| import urllib.request | |
| from typing import Any | |
| import cv2 | |
| import numpy as np | |
| logger = logging.getLogger(__name__) | |
# System message sent with every request (Russian text): it casts the model as
# an expert in industrial SCADA/HMI mnemonic schemas and states that its task
# is to find and describe ALL elements in the supplied schema image.
_SYSTEM_PROMPT = (
    "Ты — эксперт по анализу промышленных мнемосхем (SCADA/HMI). "
    "Тебе дано изображение мнемосхемы. Твоя задача — найти и описать ВСЕ элементы на схеме."
)
# User message sent alongside the image (Russian text). It asks the model to
# return ONLY a JSON object (no markdown fences) with a "title" and an
# "elements" array, where each element carries: type, uid, text, x, y, width,
# height and description. It then lists the eight allowed element types
# (circle_uid, widget, text, static_equipment, table, button, group_frame,
# arrow_pipe) and states that coordinates are expressed in image pixels.
_USER_PROMPT = """\
Проанализируй это изображение промышленной мнемосхемы и верни JSON со ВСЕМИ элементами.
Верни ТОЛЬКО валидный JSON (без markdown, без ```):
{
"title": "название схемы из шапки",
"elements": [
{
"type": "один из: widget, circle_uid, text, static_equipment, table, button, group_frame, arrow_pipe",
"uid": "число — номер параметра из красного кружка или красные цифры в углу ячейки. Пустая строка если нет",
"text": "текст внутри элемента",
"x": 0,
"y": 0,
"width": 0,
"height": 0,
"description": "краткое описание элемента"
}
]
}
Правила определения типов:
1. "circle_uid" — красный кружок с чёрными цифрами внутри (номер параметра)
2. "widget" — прямоугольник с числовым значением (показания датчика), часто с красной рамкой. Красные мелкие цифры в правом верхнем углу = uid
3. "text" — текстовая подпись (название оборудования, единицы измерения, заголовки секций)
4. "static_equipment" — изображение оборудования (насос, задвижка, вентилятор, резервуар, компрессор, циклон)
5. "table" — таблица с несколькими параметрами в строках/столбцах
6. "button" — кнопка интерфейса (Главный экран, Легенда и т.д.)
7. "group_frame" — рамка группы объектов (пунктирная или сплошная рамка вокруг секции)
8. "arrow_pipe" — стрелка или труба (линия потока материала/газа)
Координаты x, y, width, height — в пикселях изображения.
Найди ВСЕ элементы, особенно мелкие красные кружки с номерами и ячейки значений."""
def _encode_image_base64(image_bgr: np.ndarray) -> str:
    """Return the BGR numpy image as a base64 (ASCII) string of its PNG encoding.

    Raises:
        ValueError: If OpenCV reports that the PNG encoding failed.
    """
    encoded_ok, png_buffer = cv2.imencode(".png", image_bgr)
    if encoded_ok:
        png_bytes = png_buffer.tobytes()
        return base64.b64encode(png_bytes).decode("ascii")
    raise ValueError("Failed to encode image to PNG")
# Matches either a complete JSON string literal (group 1, kept verbatim) or a
# comma that directly precedes a closing bracket/brace (group 2 = the closer).
# Consuming whole string literals first keeps commas inside text untouched.
_TRAILING_COMMA_RE = re.compile(r'("(?:\\.|[^"\\])*")|,\s*([}\]])', re.DOTALL)


def _keep_string_or_closer(match: re.Match[str]) -> str:
    """Return the matched string literal unchanged, or the closer without its comma."""
    literal = match.group(1)
    return literal if literal is not None else match.group(2)


def _closable_prefixes(payload: str) -> list[tuple[int, str]]:
    """List every prefix of *payload* that ends on a ``}`` outside a string.

    Each entry is ``(end, closers)``: ``payload[:end]`` ends right after such a
    ``}`` and ``closers`` is the sequence of ``]``/``}`` needed to close the
    structures still open at that point, innermost first. Scanning stops once
    the top-level object is closed; anything after it is trailing text.
    """
    closer_for = {"{": "}", "[": "]"}
    expected: list[str] = []  # closers still owed, innermost last
    prefixes: list[tuple[int, str]] = []
    in_string = False
    escaped = False
    for index, char in enumerate(payload):
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
        elif char in closer_for:
            expected.append(closer_for[char])
        elif char == "}" or char == "]":
            if expected and expected[-1] == char:
                expected.pop()
            if char == "}":
                prefixes.append((index + 1, "".join(reversed(expected))))
                if not expected:
                    break
    return prefixes


def _extract_json_from_response(text: str) -> dict[str, Any]:
    """Extract the JSON object contained in an LLM response.

    Handles markdown code fences, text around the object, trailing commas
    before a closing bracket/brace, and truncated output. For truncated
    output the longest prefix that ends on a complete ``{...}`` is kept and
    the structures still open are closed in nesting order, so every element
    that was completely received is preserved.

    Delimiters and commas inside string values are ignored during the repair,
    so text such as ``"a[b"`` or ``"x,]"`` neither breaks it nor gets altered.

    Args:
        text: Raw message text returned by the model.

    Returns:
        The parsed JSON object.

    Raises:
        json.JSONDecodeError: If the text contains no ``{`` or no prefix of it
            can be turned into valid JSON.
    """
    cleaned = text.strip()
    fence_match = re.search(r"```(?:json)?\s*\n?(.*?)```", cleaned, re.DOTALL)
    if fence_match:
        cleaned = fence_match.group(1).strip()

    brace_start = cleaned.find("{")
    if brace_start < 0:
        raise json.JSONDecodeError("No JSON object found", cleaned, 0)

    payload = _TRAILING_COMMA_RE.sub(_keep_string_or_closer, cleaned[brace_start:])

    # Longest candidate first: a complete object parses on the first attempt,
    # a truncated one falls back to progressively shorter prefixes.
    for end, closers in reversed(_closable_prefixes(payload)):
        try:
            return json.loads(payload[:end] + closers)
        except json.JSONDecodeError:
            continue

    raise json.JSONDecodeError("Could not parse truncated JSON", payload[:200], 0)
def _extract_message_content(api_result: Any) -> str:
    """Return the stripped text of the first choice of a chat-completions reply.

    Raises:
        RuntimeError: If the reply is not a JSON object, has no choices, or
            the first choice carries no text content.
    """
    if not isinstance(api_result, dict):
        raise RuntimeError("Vision API returned an unexpected response shape")
    choices = api_result.get("choices")
    if not isinstance(choices, list) or not choices:
        raise RuntimeError("Vision API returned no choices")
    first_choice = choices[0] if isinstance(choices[0], dict) else {}
    message = first_choice.get("message")
    content = message.get("content") if isinstance(message, dict) else None
    # A null content must not be stringified into the literal "None".
    message_content = "" if content is None else str(content).strip()
    if not message_content:
        raise RuntimeError("Vision API returned empty content")
    return message_content


def analyze_schema_with_vision(
    image_bgr: np.ndarray,
    *,
    api_url: str,
    api_key: str = "",
    model: str = "",
    max_tokens: int = 16384,
    timeout_seconds: float = 300.0,
) -> dict[str, Any]:
    """Send a schema image to a vision LLM and return its structured element list.

    The image is PNG-encoded, embedded as a base64 data URL and posted to
    ``<api_url>/chat/completions`` together with the system and user prompts.
    The model's reply is parsed as JSON and every element is normalized.

    Args:
        image_bgr: BGR numpy image of the schema.
        api_url: OpenAI-compatible API base URL (e.g. http://localhost:8000/v1).
        api_key: API key; the Authorization header is omitted when empty.
        model: Model name.
        max_tokens: Maximum response tokens.
        timeout_seconds: Request timeout in seconds.

    Returns:
        Dict with 'title', 'elements' (list of normalized element dicts),
        'imageWidth' and 'imageHeight'.

    Raises:
        RuntimeError: If the request fails, the API answers with an HTTP
            error, the reply is not valid JSON, it has no choices or no text
            content, or the model's text cannot be parsed as JSON.
        ValueError: If the image cannot be encoded to PNG.
    """
    b64_image = _encode_image_base64(image_bgr)
    img_h, img_w = image_bgr.shape[:2]
    endpoint = api_url.rstrip("/") + "/chat/completions"

    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": _SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _USER_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{b64_image}",
                        },
                    },
                ],
            },
        ],
        "max_tokens": max_tokens,
        "temperature": 0,
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    body = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(
        endpoint, data=body, headers=headers, method="POST",
    )
    logger.info(
        "Vision OCR: sending %dx%d image to %s (model=%s)",
        img_w, img_h, endpoint, model,
    )

    # HTTPError is a subclass of URLError, so it has to be handled first.
    try:
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            raw = response.read()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="ignore")[:500]
        raise RuntimeError(f"Vision API returned {exc.code}: {detail}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"Vision API unavailable: {exc.reason}") from exc
    except Exception as exc:
        raise RuntimeError(f"Vision API call failed: {exc}") from exc

    try:
        api_result = json.loads(raw.decode("utf-8"))
    except Exception as exc:
        raise RuntimeError("Vision API returned invalid JSON response") from exc

    message_content = _extract_message_content(api_result)

    try:
        result = _extract_json_from_response(message_content)
    except json.JSONDecodeError as exc:
        logger.warning("Failed to parse vision response as JSON: %s", exc)
        logger.debug("Raw response: %s", message_content[:1000])
        raise RuntimeError(
            f"Vision API returned non-JSON response: {message_content[:200]}"
        ) from exc

    elements = result.get("elements")
    if not isinstance(elements, list):
        # The model may return null or a scalar here; treat it as "no elements".
        elements = []
    logger.info(
        "Vision OCR: received %d elements, title='%s'",
        len(elements), str(result.get("title", ""))[:50],
    )
    return {
        "title": str(result.get("title") or "").strip(),
        "elements": [
            _normalize_element(elem, img_w, img_h)
            for elem in elements
            if isinstance(elem, dict)
        ],
        "imageWidth": img_w,
        "imageHeight": img_h,
    }
def _coerce_int(value: Any, default: int) -> int:
    """Convert a model-supplied value to int, falling back to *default*.

    Falsy values (None, 0, "") and anything that cannot be converted
    (non-numeric text, lists, NaN, infinity) yield *default*.
    """
    try:
        return int(float(value or default))
    except (TypeError, ValueError, OverflowError):
        return default


def _normalize_element(
    elem: dict[str, Any],
    img_w: int,
    img_h: int,
) -> dict[str, Any]:
    """Normalize one element returned by the model and clamp its box to the image.

    Coordinates are truncated to integers; unusable values fall back to 0 for
    the position and 24 for the size. The origin is clamped inside the image
    and the size is limited so the box does not extend past the right or
    bottom edge. Text fields are converted to stripped strings, with "text"
    as the default type.

    Args:
        elem: Raw element dict from the model response.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        Dict with keys type, uid, text, x, y, width, height, description.
    """
    # Clamp the origin to the last pixel so that the minimum size of 1 still
    # fits inside the image.
    x = max(0, min(img_w - 1, _coerce_int(elem.get("x"), 0)))
    y = max(0, min(img_h - 1, _coerce_int(elem.get("y"), 0)))
    w = max(1, min(img_w - x, _coerce_int(elem.get("width"), 24)))
    h = max(1, min(img_h - y, _coerce_int(elem.get("height"), 24)))
    return {
        "type": str(elem.get("type") or "text").strip(),
        "uid": str(elem.get("uid") or "").strip(),
        "text": str(elem.get("text") or "").strip(),
        "x": x,
        "y": y,
        "width": w,
        "height": h,
        "description": str(elem.get("description") or "").strip(),
    }