# Source: mnemo-ocr-core / src/vision_ocr.py
# Author: MABobrov
# Commit message: Deploy updated core backend pipeline
# Commit: 7fb79e4
"""Vision LLM OCR for mnemonic schema analysis.
Sends the full schema image to an OpenAI-compatible vision API
and receives structured JSON with all detected elements.
Configure via environment variables:
VISION_API_URL — base URL (e.g. http://localhost:8000/v1)
VISION_API_KEY — API key (optional)
VISION_MODEL — model name (e.g. llava-v1.6-mistral-7b)
"""
from __future__ import annotations
import base64
import json
import logging
import re
import urllib.error
import urllib.request
from typing import Any
import cv2
import numpy as np
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# System message for the vision model (runtime text, kept in Russian): it casts
# the model as an expert in analysing industrial mnemonic diagrams (SCADA/HMI)
# and instructs it to find and describe ALL elements on the supplied image.
_SYSTEM_PROMPT = (
"Ты — эксперт по анализу промышленных мнемосхем (SCADA/HMI). "
"Тебе дано изображение мнемосхемы. Твоя задача — найти и описать ВСЕ элементы на схеме."
)
# User message sent together with the image (runtime text, kept in Russian).
# It asks for a bare JSON object (no markdown, no code fences) holding a
# "title" and an "elements" array, shows the expected per-element fields
# (type, uid, text, x, y, width, height, description), defines the eight
# allowed element types, and states that coordinates are in image pixels.
# The wording is program data sent to the model: edit it deliberately.
_USER_PROMPT = """\
Проанализируй это изображение промышленной мнемосхемы и верни JSON со ВСЕМИ элементами.
Верни ТОЛЬКО валидный JSON (без markdown, без ```):
{
"title": "название схемы из шапки",
"elements": [
{
"type": "один из: widget, circle_uid, text, static_equipment, table, button, group_frame, arrow_pipe",
"uid": "число — номер параметра из красного кружка или красные цифры в углу ячейки. Пустая строка если нет",
"text": "текст внутри элемента",
"x": 0,
"y": 0,
"width": 0,
"height": 0,
"description": "краткое описание элемента"
}
]
}
Правила определения типов:
1. "circle_uid" — красный кружок с чёрными цифрами внутри (номер параметра)
2. "widget" — прямоугольник с числовым значением (показания датчика), часто с красной рамкой. Красные мелкие цифры в правом верхнем углу = uid
3. "text" — текстовая подпись (название оборудования, единицы измерения, заголовки секций)
4. "static_equipment" — изображение оборудования (насос, задвижка, вентилятор, резервуар, компрессор, циклон)
5. "table" — таблица с несколькими параметрами в строках/столбцах
6. "button" — кнопка интерфейса (Главный экран, Легенда и т.д.)
7. "group_frame" — рамка группы объектов (пунктирная или сплошная рамка вокруг секции)
8. "arrow_pipe" — стрелка или труба (линия потока материала/газа)
Координаты x, y, width, height — в пикселях изображения.
Найди ВСЕ элементы, особенно мелкие красные кружки с номерами и ячейки значений."""
def _encode_image_base64(image_bgr: np.ndarray) -> str:
    """Serialize an image as PNG and return the bytes as base64 ASCII text.

    Args:
        image_bgr: Image array handed to ``cv2.imencode`` (BGR by this
            module's convention).

    Returns:
        The PNG file contents, base64-encoded, as a ``str``.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    ok, png_buffer = cv2.imencode(".png", image_bgr)
    if ok:
        png_bytes = png_buffer.tobytes()
        return base64.b64encode(png_bytes).decode("ascii")
    raise ValueError("Failed to encode image to PNG")
def _strip_trailing_commas(raw: str) -> str:
    """Drop commas that directly precede a closing ``}`` or ``]`` outside strings."""
    out: list[str] = []
    in_string = False
    escaped = False
    for ch in raw:
        if in_string:
            # Inside a string literal everything is copied verbatim.
            out.append(ch)
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch in "}]":
            # Walk back over whitespace. A comma found there is structural,
            # because string content always ends with a quote character.
            idx = len(out) - 1
            while idx >= 0 and out[idx].isspace():
                idx -= 1
            if idx >= 0 and out[idx] == ",":
                del out[idx:]
        out.append(ch)
    return "".join(out)


def _object_end_closers(raw: str) -> list[tuple[int, str]]:
    """List ``(index, closers)`` for every ``}`` that is outside a string literal.

    ``closers`` is the text that has to follow ``raw[:index + 1]`` to close the
    objects/arrays still open at that point, innermost first.
    """
    pending: list[str] = []  # closers owed for the currently open structures
    ends: list[tuple[int, str]] = []
    in_string = False
    escaped = False
    for index, ch in enumerate(raw):
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            pending.append("}")
        elif ch == "[":
            pending.append("]")
        elif ch in "}]":
            # A mismatched closer is left alone; that prefix simply will not
            # parse and the caller moves on to an earlier cut point.
            if pending and pending[-1] == ch:
                pending.pop()
            if ch == "}":
                ends.append((index, "".join(reversed(pending))))
    return ends


def _extract_json_from_response(text: str) -> dict[str, Any]:
    """Extract the JSON object contained in an LLM response.

    Handles markdown code fences, trailing commas before ``}``/``]`` and
    truncated output (missing closing brackets/braces). All repairs ignore
    characters inside JSON string literals, so text values that happen to
    contain ``,]``, ``{`` or ``}`` are neither modified nor miscounted.

    Args:
        text: Raw message content returned by the model.

    Returns:
        The parsed JSON object. For truncated input this is the longest
        prefix ending at a complete object, with the structures still open
        at that point closed in nesting order.

    Raises:
        json.JSONDecodeError: If no ``{`` is present or no prefix of the
            text can be repaired into valid JSON.
    """
    cleaned = text.strip()
    fence_match = re.search(r"```(?:json)?\s*\n?(.*?)```", cleaned, re.DOTALL)
    if fence_match:
        cleaned = fence_match.group(1).strip()

    brace_start = cleaned.find("{")
    if brace_start < 0:
        raise json.JSONDecodeError("No JSON object found", cleaned, 0)
    brace_end = cleaned.rfind("}")
    if brace_end > brace_start:
        cleaned = cleaned[brace_start:brace_end + 1]
    else:
        # No closing brace at all: keep everything from the first "{".
        cleaned = cleaned[brace_start:]

    # String-aware, so it is a no-op on already valid JSON.
    cleaned = _strip_trailing_commas(cleaned)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Truncated or trailing junk: cut after each complete object, longest
    # prefix first, and close whatever is still open at that point.
    for end_index, closers in reversed(_object_end_closers(cleaned)):
        try:
            return json.loads(cleaned[:end_index + 1] + closers)
        except json.JSONDecodeError:
            continue
    raise json.JSONDecodeError("Could not parse truncated JSON", cleaned[:200], 0)
def _message_content_to_text(content: Any) -> str:
    """Flatten a chat-completion ``content`` value into stripped plain text."""
    if content is None:
        # A JSON null must not turn into the literal string "None".
        return ""
    if isinstance(content, list):
        # NOTE(review): assumes response parts carry their text under "text",
        # mirroring the request payload format — confirm for the target server.
        fragments = []
        for part in content:
            if isinstance(part, str):
                fragments.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                fragments.append(part["text"])
        return "".join(fragments).strip()
    return str(content).strip()


def analyze_schema_with_vision(
    image_bgr: np.ndarray,
    *,
    api_url: str,
    api_key: str = "",
    model: str = "",
    max_tokens: int = 16384,
    timeout_seconds: float = 300.0,
) -> dict[str, Any]:
    """Send schema image to vision LLM and get structured element list.

    The image is PNG-encoded, embedded as a base64 data URL and POSTed to
    ``<api_url>/chat/completions`` together with the module's system and
    user prompts. The first choice's message content is parsed as JSON and
    every dict entry of its ``elements`` list is normalized.

    Args:
        image_bgr: BGR numpy image of the schema.
        api_url: OpenAI-compatible API base URL (e.g. http://localhost:8000/v1).
        api_key: API key (empty string if not required); sent as a Bearer
            token when non-empty.
        model: Model name.
        max_tokens: Maximum response tokens.
        timeout_seconds: Request timeout.

    Returns:
        Dict with ``title`` (str), ``elements`` (list of normalized element
        dicts), ``imageWidth`` and ``imageHeight``.

    Raises:
        RuntimeError: If the request fails, the HTTP status is an error, or
            the response is not a usable chat-completion payload with
            JSON content.
        ValueError: If the image cannot be encoded to PNG.
    """
    b64_image = _encode_image_base64(image_bgr)
    img_h, img_w = image_bgr.shape[:2]
    endpoint = api_url.rstrip("/") + "/chat/completions"
    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": _SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _USER_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{b64_image}",
                        },
                    },
                ],
            },
        ],
        "max_tokens": max_tokens,
        "temperature": 0,
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    body = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(
        endpoint, data=body, headers=headers, method="POST",
    )
    logger.info(
        "Vision OCR: sending %dx%d image to %s (model=%s)",
        img_w, img_h, endpoint, model,
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            raw = response.read()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="ignore")[:500]
        raise RuntimeError(f"Vision API returned {exc.code}: {detail}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"Vision API unavailable: {exc.reason}") from exc
    except Exception as exc:
        # Boundary catch-all (e.g. socket timeouts) so callers only ever
        # have to handle RuntimeError for transport problems.
        raise RuntimeError(f"Vision API call failed: {exc}") from exc

    try:
        api_result = json.loads(raw.decode("utf-8"))
    except Exception as exc:
        raise RuntimeError("Vision API returned invalid JSON response") from exc
    if not isinstance(api_result, dict):
        # Valid JSON but not an object: report it instead of failing later
        # with an AttributeError on ``.get``.
        raise RuntimeError("Vision API returned unexpected JSON payload")

    choices = api_result.get("choices")
    if not isinstance(choices, list) or not choices:
        raise RuntimeError("Vision API returned no choices")
    first_choice = choices[0] if isinstance(choices[0], dict) else {}
    message = first_choice.get("message")
    content = message.get("content") if isinstance(message, dict) else None
    message_content = _message_content_to_text(content)
    if not message_content:
        raise RuntimeError("Vision API returned empty content")
    if first_choice.get("finish_reason") == "length":
        # The parser below salvages truncated JSON, so make the data loss visible.
        logger.warning(
            "Vision OCR: response hit max_tokens=%d; element list may be incomplete",
            max_tokens,
        )

    try:
        result = _extract_json_from_response(message_content)
    except json.JSONDecodeError as exc:
        logger.warning("Failed to parse vision response as JSON: %s", exc)
        logger.debug("Raw response: %s", message_content[:1000])
        raise RuntimeError(
            f"Vision API returned non-JSON response: {message_content[:200]}"
        ) from exc

    elements = result.get("elements")
    if not isinstance(elements, list):
        # Missing, null or wrongly typed "elements" is treated as "none found".
        elements = []
    title = str(result.get("title") or "").strip()
    logger.info(
        "Vision OCR: received %d elements, title='%s'",
        len(elements), title[:50],
    )
    return {
        "title": title,
        "elements": [
            _normalize_element(elem, img_w, img_h)
            for elem in elements
            if isinstance(elem, dict)
        ],
        "imageWidth": img_w,
        "imageHeight": img_h,
    }
def _coerce_int(value: Any, default: int) -> int:
    """Convert ``value`` to int, using ``default`` for falsy or non-numeric input."""
    if not value:
        return default
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Covers junk such as "12px", lists/dicts, NaN and infinities.
        return default


def _normalize_element(
    elem: dict[str, Any],
    img_w: int,
    img_h: int,
) -> dict[str, Any]:
    """Normalize and validate one element returned by the vision model.

    Coordinates are coerced to integers and clamped so the resulting box
    lies inside the image. Malformed coordinate values fall back to the
    defaults (0 for position, 24 for size) instead of raising, so one bad
    element cannot abort the whole analysis.

    Args:
        elem: Raw element dict from the model response.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        Dict with string fields ``type`` (default ``"text"``), ``uid``,
        ``text``, ``description`` and integer fields ``x``, ``y``,
        ``width``, ``height``.
    """
    # Clamp the origin to the last pixel (not to the image size) so the
    # minimum 1-pixel box below still fits inside the image.
    x = max(0, min(img_w - 1, _coerce_int(elem.get("x"), 0)))
    y = max(0, min(img_h - 1, _coerce_int(elem.get("y"), 0)))
    w = max(1, min(img_w - x, _coerce_int(elem.get("width"), 24)))
    h = max(1, min(img_h - y, _coerce_int(elem.get("height"), 24)))

    raw_text = elem.get("text")
    if isinstance(raw_text, (int, float)) and not isinstance(raw_text, bool):
        # A numeric 0 is a legitimate displayed value and must not be
        # collapsed to an empty string.
        text = str(raw_text)
    else:
        text = str(raw_text or "").strip()

    return {
        "type": str(elem.get("type") or "text").strip(),
        "uid": str(elem.get("uid") or "").strip(),
        "text": text,
        "x": x,
        "y": y,
        "width": w,
        "height": h,
        "description": str(elem.get("description") or "").strip(),
    }