Spaces:

MakcukBobrov
/

mnemo-ocr-core

Paused

App Files Files Community

mnemo-ocr-core / src /vision_ocr.py

MABobrov

Deploy updated core backend pipeline

7fb79e4 about 2 months ago

raw

history blame contribute delete

9.83 kB

	"""Vision LLM OCR for mnemonic schema analysis.

	Sends the full schema image to an OpenAI-compatible vision API
	and receives structured JSON with all detected elements.

	Configure via environment variables:
	VISION_API_URL — base URL (e.g. http://localhost:8000/v1)
	VISION_API_KEY — API key (optional)
	VISION_MODEL — model name (e.g. llava-v1.6-mistral-7b)
	"""

	from __future__ import annotations

	import base64
	import json
	import logging
	import re
	import urllib.error
	import urllib.request
	from typing import Any

	import cv2
	import numpy as np

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# System message sent with every request (see analyze_schema_with_vision).
	# The text is Russian and is part of the request payload: do not edit or
	# translate it without re-validating the model output.
	_SYSTEM_PROMPT = (
	"Ты — эксперт по анализу промышленных мнемосхем (SCADA/HMI). "
	"Тебе дано изображение мнемосхемы. Твоя задача — найти и описать ВСЕ элементы на схеме."
	)

	# User message text sent alongside the image. It asks the model for a bare
	# JSON object with a "title" and an "elements" list; each element carries
	# the keys type, uid, text, x, y, width, height and description, which are
	# the keys _normalize_element reads back from the response.
	# The leading backslash after the opening quotes suppresses the first newline.
	_USER_PROMPT = """\
	Проанализируй это изображение промышленной мнемосхемы и верни JSON со ВСЕМИ элементами.

	Верни ТОЛЬКО валидный JSON (без markdown, без ```):

	{
	"title": "название схемы из шапки",
	"elements": [
	{
	"type": "один из: widget, circle_uid, text, static_equipment, table, button, group_frame, arrow_pipe",
	"uid": "число — номер параметра из красного кружка или красные цифры в углу ячейки. Пустая строка если нет",
	"text": "текст внутри элемента",
	"x": 0,
	"y": 0,
	"width": 0,
	"height": 0,
	"description": "краткое описание элемента"
	}
	]
	}

	Правила определения типов:
	1. "circle_uid" — красный кружок с чёрными цифрами внутри (номер параметра)
	2. "widget" — прямоугольник с числовым значением (показания датчика), часто с красной рамкой. Красные мелкие цифры в правом верхнем углу = uid
	3. "text" — текстовая подпись (название оборудования, единицы измерения, заголовки секций)
	4. "static_equipment" — изображение оборудования (насос, задвижка, вентилятор, резервуар, компрессор, циклон)
	5. "table" — таблица с несколькими параметрами в строках/столбцах
	6. "button" — кнопка интерфейса (Главный экран, Легенда и т.д.)
	7. "group_frame" — рамка группы объектов (пунктирная или сплошная рамка вокруг секции)
	8. "arrow_pipe" — стрелка или труба (линия потока материала/газа)

	Координаты x, y, width, height — в пикселях изображения.
	Найди ВСЕ элементы, особенно мелкие красные кружки с номерами и ячейки значений."""


	def _encode_image_base64(image_bgr: np.ndarray) -> str:
	    """Serialize an image as PNG and return the bytes as base64 text.

	    Args:
	        image_bgr: Image array handed to ``cv2.imencode``.

	    Returns:
	        ASCII string holding the base64-encoded PNG data.

	    Raises:
	        ValueError: If OpenCV reports that PNG encoding failed.
	    """
	    ok, png_buffer = cv2.imencode(".png", image_bgr)
	    if ok:
	        png_bytes = png_buffer.tobytes()
	        return base64.b64encode(png_bytes).decode("ascii")
	    raise ValueError("Failed to encode image to PNG")


	def _extract_json_from_response(text: str) -> dict[str, Any]:
	"""Extract JSON object from LLM response text.

	Handles: markdown fences, truncated output (missing closing braces),
	trailing commas before closing brackets.
	"""
	cleaned = text.strip()

	fence_match = re.search(r"```(?:json)?\s\n?(.?)```", cleaned, re.DOTALL)
	if fence_match:
	cleaned = fence_match.group(1).strip()

	brace_start = cleaned.find("{")
	if brace_start < 0:
	raise json.JSONDecodeError("No JSON object found", cleaned, 0)

	brace_end = cleaned.rfind("}")
	if brace_end > brace_start:
	cleaned = cleaned[brace_start:brace_end + 1]
	else:
	# Truncated — try to repair by closing open structures
	cleaned = cleaned[brace_start:]

	# Fix trailing commas: ,] or ,}
	cleaned = re.sub(r",\s*([}\]])", r"\1", cleaned)

	try:
	return json.loads(cleaned)
	except json.JSONDecodeError:
	pass

	# Truncated JSON — find last complete element in "elements" array
	# Try progressively shorter substrings
	last_complete = cleaned.rfind("}")
	while last_complete > 0:
	attempt = cleaned[:last_complete + 1]
	# Count open/close braces and brackets
	open_braces = attempt.count("{") - attempt.count("}")
	open_brackets = attempt.count("[") - attempt.count("]")
	# Close everything
	attempt += "]" * open_brackets + "}" * open_braces
	attempt = re.sub(r",\s*([}\]])", r"\1", attempt)
	try:
	return json.loads(attempt)
	except json.JSONDecodeError:
	pass
	last_complete = cleaned.rfind("}", 0, last_complete)

	raise json.JSONDecodeError("Could not parse truncated JSON", cleaned[:200], 0)


	def analyze_schema_with_vision(
	    image_bgr: np.ndarray,
	    *,
	    api_url: str,
	    api_key: str = "",
	    model: str = "",
	    max_tokens: int = 16384,
	    timeout_seconds: float = 300.0,
	) -> dict[str, Any]:
	    """Send schema image to vision LLM and get structured element list.

	    Args:
	        image_bgr: BGR numpy image of the schema.
	        api_url: OpenAI-compatible API base URL (e.g. http://localhost:8000/v1).
	        api_key: API key (empty string if not required).
	        model: Model name.
	        max_tokens: Maximum response tokens.
	        timeout_seconds: Request timeout.

	    Returns:
	        Dict with 'title', the normalized 'elements' list, and the source
	        image size as 'imageWidth' / 'imageHeight'.

	    Raises:
	        RuntimeError: If the request fails, the API answers with an HTTP
	            error, or the response carries no parseable JSON payload.
	        ValueError: If the image cannot be encoded to PNG.
	    """
	    b64_image = _encode_image_base64(image_bgr)
	    img_h, img_w = image_bgr.shape[:2]

	    endpoint = api_url.rstrip("/") + "/chat/completions"

	    payload = {
	        "model": model,
	        "messages": [
	            {
	                "role": "system",
	                "content": _SYSTEM_PROMPT,
	            },
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": _USER_PROMPT},
	                    {
	                        "type": "image_url",
	                        "image_url": {
	                            "url": f"data:image/png;base64,{b64_image}",
	                        },
	                    },
	                ],
	            },
	        ],
	        "max_tokens": max_tokens,
	        "temperature": 0,
	    }

	    headers = {
	        "Content-Type": "application/json",
	        "Accept": "application/json",
	    }
	    if api_key:
	        headers["Authorization"] = f"Bearer {api_key}"

	    body = json.dumps(payload).encode("utf-8")
	    request = urllib.request.Request(
	        endpoint, data=body, headers=headers, method="POST",
	    )

	    logger.info(
	        "Vision OCR: sending %dx%d image to %s (model=%s)",
	        img_w, img_h, endpoint, model,
	    )

	    try:
	        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
	            raw = response.read()
	    except urllib.error.HTTPError as exc:
	        # HTTPError must be handled before URLError (it is a subclass).
	        detail = exc.read().decode("utf-8", errors="ignore")[:500]
	        raise RuntimeError(f"Vision API returned {exc.code}: {detail}") from exc
	    except urllib.error.URLError as exc:
	        raise RuntimeError(f"Vision API unavailable: {exc.reason}") from exc
	    except Exception as exc:
	        # Boundary catch-all (e.g. socket timeouts) so callers only ever
	        # see RuntimeError from the network step.
	        raise RuntimeError(f"Vision API call failed: {exc}") from exc

	    try:
	        api_result = json.loads(raw.decode("utf-8"))
	    except ValueError as exc:
	        # Covers both UnicodeDecodeError and json.JSONDecodeError.
	        raise RuntimeError("Vision API returned invalid JSON response") from exc

	    choices = api_result.get("choices") if isinstance(api_result, dict) else None
	    if not isinstance(choices, list) or not choices:
	        raise RuntimeError("Vision API returned no choices")

	    message_content = _first_choice_text(choices)
	    if not message_content:
	        raise RuntimeError("Vision API returned empty content")

	    try:
	        result = _extract_json_from_response(message_content)
	    except json.JSONDecodeError as exc:
	        logger.warning("Failed to parse vision response as JSON: %s", exc)
	        logger.debug("Raw response: %s", message_content[:1000])
	        raise RuntimeError(
	            f"Vision API returned non-JSON response: {message_content[:200]}"
	        ) from exc

	    elements = result.get("elements")
	    if not isinstance(elements, list):
	        # Missing or malformed "elements" is treated as "nothing detected".
	        elements = []

	    logger.info(
	        "Vision OCR: received %d elements, title='%s'",
	        len(elements), str(result.get("title", ""))[:50],
	    )

	    return {
	        "title": str(result.get("title") or "").strip(),
	        "elements": [
	            _normalize_element(elem, img_w, img_h)
	            for elem in elements
	            if isinstance(elem, dict)
	        ],
	        "imageWidth": img_w,
	        "imageHeight": img_h,
	    }


	def _first_choice_text(choices: list[Any]) -> str:
	    """Return the stripped text of the first choice, or "" if it has none.

	    A null or missing ``message`` / ``content`` yields "" rather than the
	    literal string "None". Content given as a list of parts is reduced to
	    the concatenation of the parts' string ``text`` fields.
	    """
	    first = choices[0]
	    message = first.get("message") if isinstance(first, dict) else None
	    content = message.get("content") if isinstance(message, dict) else None

	    if isinstance(content, list):
	        content = "".join(
	            part["text"]
	            for part in content
	            if isinstance(part, dict) and isinstance(part.get("text"), str)
	        )

	    return content.strip() if isinstance(content, str) else ""


	def _normalize_element(
	elem: dict[str, Any],
	img_w: int,
	img_h: int,
	) -> dict[str, Any]:
	"""Normalize and validate element coordinates."""
	x = max(0, min(img_w, int(float(elem.get("x") or 0))))
	y = max(0, min(img_h, int(float(elem.get("y") or 0))))
	w = max(1, min(img_w - x, int(float(elem.get("width") or 24))))
	h = max(1, min(img_h - y, int(float(elem.get("height") or 24))))

	return {
	"type": str(elem.get("type") or "text").strip(),
	"uid": str(elem.get("uid") or "").strip(),
	"text": str(elem.get("text") or "").strip(),
	"x": x,
	"y": y,
	"width": w,
	"height": h,
	"description": str(elem.get("description") or "").strip(),
	}