Spaces:
Running on Zero
| import gradio as gr | |
| from transformers import AutoModel, AutoTokenizer | |
| import torch | |
| import spaces | |
| import os | |
| import sys | |
| import tempfile | |
| import shutil | |
| import inspect | |
| from PIL import Image, ImageDraw, ImageFont, ImageOps | |
| import fitz | |
| import re | |
| import ast | |
| import numpy as np | |
| import base64 | |
| import html as html_lib | |
| import markdown as md_lib | |
| import latex2mathml.converter | |
| from collections import deque | |
| from io import StringIO, BytesIO | |
# Feature detection for optional Gradio components: older Gradio releases lack
# ImageEditor/Paint/Brush/Eraser, so the UI degrades gracefully when absent.
HAS_IMAGE_EDITOR = hasattr(gr, "ImageEditor")
HAS_PAINT = hasattr(gr, "Paint")
HAS_BRUSH = hasattr(gr, "Brush")
HAS_ERASER = hasattr(gr, "Eraser")
# The region-selection workspace needs at least one drawable image component.
HAS_REGION_WORKSPACE = HAS_PAINT or HAS_IMAGE_EDITOR
# Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
#
# Full precision BF16 (~8GB VRAM) — original, highest accuracy
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
#
# FP8 dynamic quantization (~3.5GB VRAM) — ~50% VRAM reduction, 3750 downloads/mo
# Requires Ampere GPU or newer (RTX 3070 is supported)
# MODEL_NAME = 'richarddavison/DeepSeek-OCR-2-FP8'
#
# 8-bit quantization (~4GB VRAM) — same stack (torch 2.6, flash-attn 2.7.3, py3.12)
# Explicitly supports dynamic resolution (0-6 patches), 140 downloads/mo
# MODEL_NAME = 'mzbac/DeepSeek-OCR-2-8bit'
# NOTE(review): trust_remote_code executes code from the model repo at load
# time — keep MODEL_NAME pinned to trusted repos only.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# flash_attention_2 requires a CUDA device at init time — not available on ZeroGPU at
# module load. DeepseekOCR2 only supports 'flash_attention_2' and 'eager'; sdpa is not
# implemented for this model class. Fall back to 'eager' when no GPU is present.
# Locally with CUDA, flash_attention_2 is used for maximum throughput.
_attn_impl = 'flash_attention_2' if torch.cuda.is_available() else 'eager'
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation=_attn_impl, torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True).eval()
# .cuda() is NOT called here — on ZeroGPU, GPU is only available inside @spaces.GPU
# functions. Locally, model.cuda() is called inside process_image on first run.
# Inference resolution: base canvas size, per-patch size, and whether large
# pages may be tiled ("crop mode") into patches.
BASE_SIZE = 1024
IMAGE_SIZE = 768
CROP_MODE = True
# Region-selection workspace layout defaults (pixels; scale is a percentage).
WORKSPACE_EDITOR_HEIGHT = 640
WORKSPACE_EDITOR_WIDTH_EST = 980
WORKSPACE_DEFAULT_SCALE = 89
# Matches well-formed <|ref|>label<|/ref|><|det|>coords<|/det|> tag pairs in raw output.
GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
# Substrings that identify model.infer() debug/progress lines to drop from captured stdout.
INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
# Prompts and thresholds for the two-pass equation zoom pipeline. All box
# coordinates live on the model's 0-999 normalized grid; areas/dimensions
# below are fractions of the full page.
EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
EQUATION_LINE_OCR_PROMPT = "<image>\nRead the math expression exactly as written. Return only the equation text."
EQUATION_ZOOM_MAX_CANDIDATES = 6
EQUATION_ZOOM_MIN_AREA = 0.05
EQUATION_ZOOM_MIN_DIM = 0.24
EQUATION_ZOOM_PADDING = 0.025
EQUATION_ZOOM_MAX_ASPECT = 12.0
EQUATION_DETAIL_MAX_BOXES = 24
EQUATION_DETAIL_IOU_DEDUPE = 0.7
EQUATION_LINE_IOU_DEDUPE = 0.55
EQUATION_LINE_MIN_AREA = 0.0008
EQUATION_LINE_MIN_W = 0.03
EQUATION_LINE_MIN_H = 0.01
EQUATION_LINE_MAX_ASPECT = 30.0
# Label substrings and LaTeX markers used by the math-likelihood heuristics.
MATH_LABEL_HINTS = ("formula", "equation", "math")
MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
# Built-in task presets; has_grounding marks prompts whose output carries
# <|ref|>/<|det|> tags that downstream code must parse or strip.
TASK_PROMPTS = {
    "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
    "📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
    "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
    "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
    "✏️ Custom": {"prompt": "", "has_grounding": False}
}
def extract_grounding_references(text):
    """Rebuild deduplicated (raw_tag, label, coord_repr) triples from raw output.

    Duplicates are detected on a key of the lowercased label plus the box
    coordinates rounded to one decimal, so near-identical re-detections of the
    same region collapse to a single reference.
    """
    results = []
    seen_keys = set()
    for entry in _extract_grounding_entries(text):
        rounded_boxes = tuple(
            tuple(round(v, 1) for v in box)
            for box in entry["coords"]
        )
        key = (entry["label"].strip().lower(), rounded_boxes)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        coord_text = repr(entry["coords"])
        raw_tag = f'<|ref|>{entry["label"]}<|/ref|><|det|>{coord_text}<|/det|>'
        results.append((raw_tag, entry["label"], coord_text))
    return results
| def _parse_coord_payload(payload): | |
| if isinstance(payload, str): | |
| try: | |
| coords = ast.literal_eval(payload.strip()) | |
| except (SyntaxError, ValueError): | |
| return [] | |
| else: | |
| coords = payload | |
| if isinstance(coords, (tuple, list)) and coords and isinstance(coords[0], (int, float)): | |
| coords = [coords] | |
| if not isinstance(coords, list): | |
| return [] | |
| out = [] | |
| for c in coords: | |
| if not isinstance(c, (list, tuple)) or len(c) < 4: | |
| continue | |
| x1, y1, x2, y2 = [float(v) for v in c[:4]] | |
| x1, x2 = sorted((max(0.0, min(999.0, x1)), max(0.0, min(999.0, x2)))) | |
| y1, y2 = sorted((max(0.0, min(999.0, y1)), max(0.0, min(999.0, y2)))) | |
| if x2 <= x1 or y2 <= y1: | |
| continue | |
| out.append([x1, y1, x2, y2]) | |
| return out | |
def _extract_grounding_entries(raw_text: str):
    """Walk every <|ref|>…<|det|> pair in *raw_text*.

    Returns dicts with ``label`` (defaults to "text" when blank), ``coords``
    (parsed boxes) and ``text`` (the raw text since the previous kept tag).
    Tags whose coordinates fail to parse are skipped without advancing the
    text cursor, matching the original accumulation behavior.
    """
    if not raw_text:
        return []
    results = []
    cursor = 0
    for match in GROUNDING_PATTERN.finditer(raw_text):
        boxes = _parse_coord_payload(match.group(2))
        if not boxes:
            continue
        results.append({
            "label": match.group(1).strip() or "text",
            "coords": boxes,
            "text": raw_text[cursor:match.start()].strip(),
        })
        cursor = match.end()
    return results
def _math_marker_score(text_chunk: str) -> int:
    """Heuristic 'math-likeness' score for a text chunk.

    Each strong LaTeX marker present contributes 3 points; each weak marker
    (bare operators that also occur in prose/code) contributes 1.
    """
    strong = sum(3 for marker in MATH_STRONG_MARKERS if marker in text_chunk)
    weak = sum(1 for marker in MATH_WEAK_MARKERS if marker in text_chunk)
    return strong + weak
| def _box_iou(a, b): | |
| ax1, ay1, ax2, ay2 = a | |
| bx1, by1, bx2, by2 = b | |
| inter_x1 = max(ax1, bx1) | |
| inter_y1 = max(ay1, by1) | |
| inter_x2 = min(ax2, bx2) | |
| inter_y2 = min(ay2, by2) | |
| if inter_x2 <= inter_x1 or inter_y2 <= inter_y1: | |
| return 0.0 | |
| inter = (inter_x2 - inter_x1) * (inter_y2 - inter_y1) | |
| area_a = max(1e-9, (ax2 - ax1) * (ay2 - ay1)) | |
| area_b = max(1e-9, (bx2 - bx1) * (by2 - by1)) | |
| union = area_a + area_b - inter | |
| return inter / union if union > 0 else 0.0 | |
def _dedupe_boxes(boxes, iou_threshold):
    """Greedy IoU dedupe, visiting boxes smallest-area first so a larger box
    overlapping an already-kept small one gets dropped."""
    survivors = []
    for candidate in sorted(boxes, key=lambda b: (b[2] - b[0]) * (b[3] - b[1])):
        overlaps_kept = any(
            _box_iou(candidate, kept) >= iou_threshold for kept in survivors
        )
        if not overlaps_kept:
            survivors.append(candidate)
    return survivors
def _is_math_candidate(label: str, text_chunk: str, box):
    """Decide whether a grounded region is worth a zoomed equation pass.

    A candidate must look mathematical (by label hint or strong text markers),
    be large enough on the 0-999 grid, and not be an extreme sliver.
    """
    w = (box[2] - box[0]) / 999.0
    h = (box[3] - box[1]) / 999.0
    aspect = max(w / max(1e-9, h), h / max(1e-9, w))
    if aspect > EQUATION_ZOOM_MAX_ASPECT:
        return False
    labeled_math = any(hint in label.lower() for hint in MATH_LABEL_HINTS)
    textual_math = _math_marker_score(text_chunk) >= 3
    big_enough = (
        w * h >= EQUATION_ZOOM_MIN_AREA
        or w >= EQUATION_ZOOM_MIN_DIM
        or h >= EQUATION_ZOOM_MIN_DIM
    )
    return (labeled_math or textual_math) and big_enough
def _map_crop_box_to_page(sub_box, crop_px, img_w, img_h):
    """Project a 0-999 box detected inside a pixel crop back onto the full
    page's 0-999 coordinate grid."""
    cx1, cy1, cx2, cy2 = crop_px
    cw = max(1, cx2 - cx1)
    ch = max(1, cy2 - cy1)

    def to_page(v, origin, span, total):
        # crop-normalized -> crop pixels -> page pixels -> page-normalized
        return ((origin + (v / 999.0) * span) / total) * 999.0

    page_box = [
        to_page(sub_box[0], cx1, cw, img_w),
        to_page(sub_box[1], cy1, ch, img_h),
        to_page(sub_box[2], cx1, cw, img_w),
        to_page(sub_box[3], cy1, ch, img_h),
    ]
    # Round-trip through the payload parser so clamping/sorting matches every
    # other box in the pipeline.
    return _parse_coord_payload([page_box])[0]
def draw_bounding_boxes(image, refs, extract_images=False):
    """Draw labeled grounding boxes on a copy of *image*.

    Args:
        image: PIL image the refs' 0-999 coordinates refer to.
        refs: (raw, label, coord_text) triples from extract_grounding_references.
        extract_images: when True, also collect crops of boxes labeled 'image'.
    Returns:
        (annotated_image, crops) — crops is empty unless extract_images is True.
    """
    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 15)
    except OSError:
        # Bug fix: the hard-coded DejaVu path exists on HF Spaces images but
        # not on every host; fall back to PIL's bundled default font instead
        # of crashing the whole render.
        font = ImageFont.load_default()
    crops = []
    color_map = {}
    # Fixed seed so each label keeps the same color across runs.
    np.random.seed(42)
    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))
        color = color_map[label]
        coords = _parse_coord_payload(ref[2])
        color_a = color + (60,)  # translucent fill for the overlay layer
        for box in coords:
            x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)
            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))
            width = 5 if label == 'title' else 3  # titles get a heavier outline
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            draw2.rectangle([x1, y1, x2, y2], fill=color_a)
            text_bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            ty = max(0, y1 - 20)  # keep the label tag inside the image at the top edge
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops
def _extract_labeled_crops_from_refs(image, refs, max_items=24):
    """Crop every grounded region out of *image*.

    Skips boxes narrower/shorter than 8px and exact duplicates (same label and
    pixel box). Returns at most *max_items* (crop, caption) pairs, where the
    caption is the label plus the crop's pixel dimensions.
    """
    img_w, img_h = image.size
    collected = []
    visited = set()
    for ref in refs:
        label = str(ref[1])
        for box in _parse_coord_payload(ref[2]):
            px = (
                int(box[0] / 999.0 * img_w),
                int(box[1] / 999.0 * img_h),
                int(box[2] / 999.0 * img_w),
                int(box[3] / 999.0 * img_h),
            )
            x1, y1, x2, y2 = px
            if x2 - x1 < 8 or y2 - y1 < 8:
                continue
            key = (label.lower(),) + px
            if key in visited:
                continue
            visited.add(key)
            crop = image.crop(px)
            collected.append((crop, f"{label} ({crop.width}x{crop.height})"))
            if len(collected) >= max_items:
                return collected
    return collected
def clean_output(text, include_images=False):
    """Strip grounding tags from raw model output for display.

    Image refs become '**[Figure N]**' placeholders (later swapped for real
    images by embed_images) when include_images is True, otherwise they are
    removed. Every other grounded line is deleted wholesale, then leftover
    malformed tags and repeated math blocks are cleaned up.
    """
    if not text:
        return ""
    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    matches = re.findall(pattern, text, re.DOTALL)
    img_num = 0
    for match in matches:
        if '<|ref|>image<|/ref|>' in match[0]:
            if include_images:
                text = text.replace(match[0], f'\n\n**[Figure {img_num + 1}]**\n\n', 1)
                img_num += 1
            else:
                text = text.replace(match[0], '', 1)
        else:
            # Non-image refs: drop the entire line containing the tag.
            text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
    text = _strip_malformed_grounding(text)
    text = _dedupe_repeated_math_blocks(text)
    return text.strip()
| def _strip_malformed_grounding(text: str) -> str: | |
| """Remove incomplete grounding tags that can leak into OCR markdown/text.""" | |
| if not text: | |
| return "" | |
| line_patterns = [ | |
| r'(?m)^[^\n]*<\|ref\|>.*?<\|/ref\|><\|det\|>.*?(?:<\|/det\|>)?[^\n]*\n?', | |
| r'(?m)^[^\n]*<\|det\|>.*?(?:<\|/det\|>)?[^\n]*\n?', | |
| r'(?m)^[^\n]*<\|/?ref\|>[^\n]*\n?', | |
| ] | |
| for p in line_patterns: | |
| text = re.sub(p, '', text) | |
| text = re.sub(r'<\|/?ref\|>', '', text) | |
| text = re.sub(r'<\|/?det\|>', '', text) | |
| return text | |
| def _equation_text_key(text: str) -> str: | |
| if not text: | |
| return "" | |
| key = text.strip() | |
| key = re.sub(r'\\\[(.+?)\\\]', r'\1', key, flags=re.DOTALL) | |
| key = re.sub(r'\\\((.+?)\\\)', r'\1', key, flags=re.DOTALL) | |
| key = re.sub(r'\$\$(.+?)\$\$', r'\1', key, flags=re.DOTALL) | |
| key = re.sub(r'\^\{([A-Za-z0-9])\}', r'^\1', key) | |
| key = re.sub(r'_\{([A-Za-z0-9])\}', r'_\1', key) | |
| key = re.sub(r'\s+', '', key) | |
| return key.lower() | |
def _dedupe_repeated_math_blocks(text: str) -> str:
    """Drop repeated display/inline math blocks (the model sometimes emits the
    same equation twice) and collapse the blank-line runs left behind."""
    if not text:
        return ""
    math_re = re.compile(r'\\\[(.+?)\\\]|\\\((.+?)\\\)|\$\$(.+?)\$\$', re.DOTALL)
    kept_keys = set()
    pieces = []
    cursor = 0
    dropped = False
    for m in math_re.finditer(text):
        pieces.append(text[cursor:m.start()])
        cursor = m.end()
        key = _equation_text_key(m.group(1) or m.group(2) or m.group(3) or "")
        if key and key in kept_keys:
            dropped = True
            continue
        if key:
            kept_keys.add(key)
        pieces.append(m.group(0))
    pieces.append(text[cursor:])
    result = ''.join(pieces)
    if dropped:
        # Only normalize whitespace when something was actually removed.
        result = re.sub(r'\n{3,}', '\n\n', result)
    return result
# Shared <style>/<script> payload injected alongside preview HTML:
#  - CSS for the markdown/math preview panes
#  - a MathJax loader plus a MutationObserver that re-typesets new previews
#  - an auto-zoom helper that nudges the Gradio image-editor workspace toward
#    ~88% zoom whenever a freshly loaded image fits too small
PREVIEW_CSS = """
<style>
.math-preview {
padding: 1.5em;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
font-size: 15px;
line-height: 1.8;
color: #1a1a1a;
max-width: 100%;
overflow-x: auto;
}
.math-display {
text-align: center;
overflow-x: auto;
margin: 1em 0;
padding: 0.5em 0;
}
math[display="block"] { display: block; overflow-x: auto; max-width: 100%; }
.math-preview h1 { font-size: 1.8em; font-weight: 700; margin: 1em 0 0.4em; border-bottom: 2px solid #e0e0e0; padding-bottom: 0.3em; }
.math-preview h2 { font-size: 1.4em; font-weight: 600; margin: 1em 0 0.4em; border-bottom: 1px solid #e0e0e0; padding-bottom: 0.2em; }
.math-preview h3 { font-size: 1.15em; font-weight: 600; margin: 0.9em 0 0.3em; }
.math-preview p { margin: 0.6em 0; }
.math-preview ul, .math-preview ol { padding-left: 1.8em; margin: 0.5em 0; }
.math-preview li { margin: 0.25em 0; }
.math-preview table { border-collapse: collapse; width: 100%; margin: 1em 0; font-size: 0.95em; }
.math-preview th, .math-preview td { border: 1px solid #ccc; padding: 0.45em 0.75em; text-align: left; }
.math-preview th { background: #f2f2f2; font-weight: 600; }
.math-preview tr:nth-child(even) { background: #fafafa; }
.math-preview code { background: #f4f4f4; padding: 0.15em 0.4em; border-radius: 3px; font-family: 'Courier New', monospace; font-size: 0.88em; }
.math-preview pre { background: #f4f4f4; padding: 1em; border-radius: 5px; overflow-x: auto; margin: 0.8em 0; }
.math-preview pre code { background: none; padding: 0; }
.math-preview blockquote { border-left: 4px solid #ccc; margin: 0.8em 0; padding: 0.4em 1em; color: #555; background: #fafafa; }
.math-preview img { max-width: 100%; height: auto; display: block; margin: 0.8em 0; }
.math-preview .ocr-gap, .mathjax-preview .ocr-gap { width: 100%; }
.math-fallback { color: #888; font-style: italic; }
</style>
<script>
(() => {
if (window.__ocrMathJaxInit) return;
window.__ocrMathJaxInit = true;
if (!window.MathJax) {
window.MathJax = {
tex: {
inlineMath: [['\\\\(', '\\\\)'], ['$', '$']],
displayMath: [['\\\\[', '\\\\]'], ['$$', '$$']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
}
};
}
const typeset = () => {
if (window.MathJax?.typesetPromise) {
const nodes = Array.from(document.querySelectorAll('.mathjax-preview, .spatial-preview'));
if (nodes.length) window.MathJax.typesetPromise(nodes).catch(() => {});
}
};
window.__typesetOcrMath = typeset;
const ensureScript = () => {
if (document.getElementById('mathjax-ocr-preview')) return;
const script = document.createElement('script');
script.id = 'mathjax-ocr-preview';
script.async = true;
script.src = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js';
script.onload = () => setTimeout(typeset, 20);
document.head.appendChild(script);
};
ensureScript();
setTimeout(typeset, 100);
const observer = new MutationObserver((mutations) => {
for (const m of mutations) {
for (const n of m.addedNodes) {
if (n.nodeType !== 1) continue;
if (n.matches?.('.mathjax-preview, .spatial-preview') || n.querySelector?.('.mathjax-preview, .spatial-preview')) {
setTimeout(typeset, 30);
return;
}
}
}
});
observer.observe(document.body, { childList: true, subtree: true });
})();
(() => {
if (window.__ocrWorkspaceZoomInit) return;
window.__ocrWorkspaceZoomInit = true;
const stateByRoot = new WeakMap();
const targetZoomPct = 88;
const nearTargetTolerancePct = 3;
const tinyFitThresholdPct = 45;
const getState = (root) => {
let state = stateByRoot.get(root);
if (!state) {
state = { busy: false, applied: false, lastSeenZoom: null, lastAutoAt: 0 };
stateByRoot.set(root, state);
}
return state;
};
const parseZoomPct = (root) => {
const zoomNode = root.querySelector(".zoom-number span[role='button']");
if (!zoomNode) return null;
const m = (zoomNode.textContent || "").match(/([0-9]+(?:\\.[0-9]+)?)\\s*%/);
return m ? parseFloat(m[1]) : null;
};
const getZoomInBtn = (root) =>
root.querySelector("button[aria-label='Zoom in'], button[title='Zoom in']");
const isWorkspaceRoot = (root) =>
!!root.querySelector(".pixi-target") && !!root.querySelector(".zoom-number");
const maybeAutoZoom = (root) => {
if (!isWorkspaceRoot(root)) return;
const state = getState(root);
const now = Date.now();
const zoomPct = parseZoomPct(root);
if (zoomPct == null) return;
// A drop from high zoom to low zoom usually means a new image was loaded.
if (state.lastSeenZoom != null && state.lastSeenZoom > 70 && zoomPct < 35) {
state.applied = false;
}
state.lastSeenZoom = zoomPct;
if (state.busy || state.applied) return;
if (zoomPct > tinyFitThresholdPct) return;
if (now - state.lastAutoAt < 1200) return;
const zoomInBtn = getZoomInBtn(root);
if (!zoomInBtn) return;
state.busy = true;
state.lastAutoAt = now;
let steps = 0;
const step = () => {
const current = parseZoomPct(root);
if (
current == null ||
current >= (targetZoomPct - nearTargetTolerancePct) ||
steps >= 20
) {
state.busy = false;
state.applied = true;
return;
}
zoomInBtn.click();
steps += 1;
setTimeout(step, 80);
};
setTimeout(step, 90);
};
const attachRootObserver = (root) => {
if (root.dataset.ocrZoomObserved === "1") return;
root.dataset.ocrZoomObserved = "1";
const obs = new MutationObserver(() => maybeAutoZoom(root));
obs.observe(root, { childList: true, subtree: true, characterData: true });
setTimeout(() => maybeAutoZoom(root), 200);
setTimeout(() => maybeAutoZoom(root), 800);
};
const scan = () => {
document.querySelectorAll("[data-testid='image']").forEach((root) => {
if (isWorkspaceRoot(root)) attachRootObserver(root);
});
};
scan();
const pageObs = new MutationObserver(scan);
pageObs.observe(document.body, { childList: true, subtree: true });
})();
</script>
"""
| def _inject_spatial_gap_placeholders(text: str): | |
| """Preserve runs of blank lines so OCR spacing is visible in preview.""" | |
| gaps: dict[str, int] = {} | |
| counter = [0] | |
| def repl(m): | |
| key = f'ZZOCRGAP{counter[0]}ZZ' | |
| counter[0] += 1 | |
| # Two newlines are a normal paragraph break; extras represent vertical spacing. | |
| gaps[key] = max(1, len(m.group(0)) - 2) | |
| return f'\n\n{key}\n\n' | |
| return re.sub(r'\n{3,}', repl, text), gaps | |
| def _restore_spatial_gap_placeholders(html: str, gaps: dict[str, int]) -> str: | |
| if not gaps: | |
| return html | |
| for key, extra_lines in gaps.items(): | |
| gap_em = min(10.0, 0.9 * extra_lines) | |
| block = f'<div class="ocr-gap" style="height:{gap_em:.2f}em"></div>' | |
| html = html.replace(f'<p>{key}</p>', block) | |
| html = html.replace(key, block) | |
| return html | |
def _to_mathml(latex: str, display: bool) -> str:
    """Convert a LaTeX string to MathML. Falls back to a code block on error."""
    # Fix OCR error: \frac{n/m} (single-argument fraction) -> \frac{n}{m}
    latex = re.sub(r'\\frac\{(\d+)/(\d+)\}(?!\s*\{)', r'\\frac{\1}{\2}', latex)
    try:
        rendered = latex2mathml.converter.convert(latex)
    except Exception:
        # Conversion failed — show the raw LaTeX, HTML-escaped, instead.
        escaped = html_lib.escape(latex)
        if display:
            return f'<pre class="math-fallback"><code>{escaped}</code></pre>'
        return f'<code class="math-fallback">{escaped}</code>'
    if display:
        rendered = re.sub(r'<math\b', '<math display="block"', rendered, count=1)
    return rendered
def to_math_html(text: str) -> str:
    """Convert model markdown output to HTML with server-side MathML rendering.
    Uses a placeholder approach: math is extracted and replaced with unique
    tokens before the markdown pass, then swapped back afterwards. This avoids
    Python-Markdown mishandling multi-line <div> blocks that contain blank lines.
    """
    if not text:
        return ""
    # blocks: token -> rendered MathML; literals: token -> original LaTeX
    # (used to restore math that ends up inside code/pre verbatim).
    blocks: dict[str, str] = {}
    literals: dict[str, str] = {}
    counter = [0]
    def display_block(m):
        key = f'ZZDISPLAYMATH{counter[0]}ZZ'
        counter[0] += 1
        expr = m.group(1).strip()
        blocks[key] = f'<div class="math-display">{_to_mathml(expr, display=True)}</div>'
        literals[key] = f'\\[{expr}\\]'
        return f'\n\n{key}\n\n'
    def inline_math(m):
        key = f'ZZINLINEMATH{counter[0]}ZZ'
        counter[0] += 1
        expr = m.group(1).strip()
        blocks[key] = _to_mathml(expr, display=False)
        literals[key] = f'\\({expr}\\)'
        return key
    # Replace display math \[...\] with placeholder tokens
    text = re.sub(r'\\\[(.+?)\\\]', display_block, text, flags=re.DOTALL)
    # Remove orphaned \[ with no matching \] (truncated model output)
    text = re.sub(r'\\\[.*', '', text, flags=re.DOTALL)
    # Replace inline math \(...\) with placeholder tokens
    text = re.sub(r'\\\((.+?)\\\)', inline_math, text)
    text, gaps = _inject_spatial_gap_placeholders(text)
    # Run markdown on text that now contains only safe placeholder tokens
    html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
    # Protect rendered code/pre blocks so placeholder swap never mutates literal code.
    protected_blocks: dict[str, str] = {}
    protected_counter = [0]
    def _protect_code_html(m):
        token = f'ZZCODEHTML{protected_counter[0]}ZZ'
        protected_counter[0] += 1
        protected_blocks[token] = m.group(0)
        return token
    html = re.sub(r'<pre\b[^>]*>.*?</pre>', _protect_code_html, html, flags=re.DOTALL)
    html = re.sub(r'<code\b[^>]*>.*?</code>', _protect_code_html, html, flags=re.DOTALL)
    # Swap placeholders back for MathML/HTML (handle <p>KEY</p> wrapping too)
    for key, value in blocks.items():
        html = html.replace(f'<p>{key}</p>', value)
        html = html.replace(key, value)
    # Restore protected literal code/pre blocks unchanged.
    for token, original in protected_blocks.items():
        html = html.replace(token, original)
    # Placeholders left at this stage occur inside code/pre; keep them literal.
    for key, literal in literals.items():
        html = html.replace(key, html_lib.escape(literal))
    html = _restore_spatial_gap_placeholders(html, gaps)
    return f'<div class="math-preview">{html}</div>'
def to_mathjax_html(text: str) -> str:
    """Render markdown to HTML and typeset math client-side with MathJax."""
    if not text:
        return ""
    tokenized, gaps = _inject_spatial_gap_placeholders(text)
    rendered = md_lib.markdown(
        tokenized, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br']
    )
    rendered = _restore_spatial_gap_placeholders(rendered, gaps)
    # The .mathjax-preview class is what the injected MathJax observer targets.
    return f'<div class="mathjax-preview">{rendered}</div>'
def _grounding_blocks_from_raw(raw_text: str):
    """Flatten grounding entries into one block dict per box.

    The entry's text is attached only to its first box so multi-box entries
    don't duplicate content in the spatial rendering.
    """
    flattened = []
    for entry in _extract_grounding_entries(raw_text):
        stripped = entry["text"].strip()
        for idx, (x1, y1, x2, y2) in enumerate(entry["coords"]):
            flattened.append({
                "label": entry["label"],
                "text": stripped if idx == 0 else "",
                "x1": x1,
                "y1": y1,
                "x2": x2,
                "y2": y2,
            })
    return flattened
def to_spatial_html(raw_text: str, markdown_text: str) -> str:
    """Render OCR content using grounding boxes for spatially-positioned blocks.

    Each grounded block becomes an absolutely-positioned <article> whose
    left/top/width are percentages derived from the 0-999 grid. Falls back to
    plain linear rendering when no grounding is present, and appends a
    collapsible linear view when the grounded text covers too little of the
    markdown (under 40% of its length, with a 120-char floor).
    """
    blocks = _grounding_blocks_from_raw(raw_text)
    if not blocks:
        return to_mathjax_html(markdown_text)
    used_text = 0
    rendered = []
    palette = {
        "title": "#8b5cf6",
        "text": "#2563eb",
        "image": "#059669",
        "table": "#d97706",
        "formula": "#dc2626",
    }
    # Reading order: top-to-bottom, then left-to-right.
    for i, b in enumerate(sorted(blocks, key=lambda x: (x["y1"], x["x1"]))):
        label = b["label"]
        color = palette.get(label.lower(), "#4b5563")
        body = b["text"].strip()
        if body:
            used_text += len(body)
            body_text, gaps = _inject_spatial_gap_placeholders(body)
            body_html = md_lib.markdown(body_text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
            body_html = _restore_spatial_gap_placeholders(body_html, gaps)
        else:
            body_html = ""
        if not body_html:
            # Text-less blocks (e.g. images) still show their label.
            body_html = f"<p><em>{html_lib.escape(label)}</em></p>"
        left = b["x1"] / 999.0 * 100.0
        top = b["y1"] / 999.0 * 100.0
        width = max(1.0, (b["x2"] - b["x1"]) / 999.0 * 100.0)
        height = max(1.2, (b["y2"] - b["y1"]) / 999.0 * 100.0)
        rendered.append(
            f"""
<article class="spatial-block" style="left:{left:.2f}%; top:{top:.2f}%; width:{width:.2f}%; min-height:{height:.2f}%; --block-color:{color};">
<header>{html_lib.escape(label)}</header>
<section>{body_html}</section>
</article>
"""
        )
    fallback = ""
    if markdown_text and used_text < max(120, int(len(markdown_text) * 0.4)):
        fallback_html = to_mathjax_html(markdown_text)
        fallback = f"""
<details class="spatial-fallback">
<summary>Show full linear markdown rendering</summary>
{fallback_html}
</details>
"""
    return f"""
<style>
.spatial-preview {{
padding: 1rem;
}}
.spatial-canvas {{
position: relative;
width: 100%;
min-height: 72vh;
aspect-ratio: 1 / 1.35;
background: linear-gradient(180deg, #fcfdff 0%, #f7f9fc 100%);
border: 1px solid #d8dee9;
border-radius: 8px;
overflow: auto;
}}
.spatial-block {{
position: absolute;
box-sizing: border-box;
border: 1px solid var(--block-color);
background: color-mix(in srgb, var(--block-color) 7%, white);
border-radius: 6px;
padding: 0.35rem 0.5rem;
overflow: hidden;
}}
.spatial-block > header {{
font-size: 11px;
font-weight: 700;
letter-spacing: 0.03em;
text-transform: uppercase;
color: var(--block-color);
margin-bottom: 0.25rem;
}}
.spatial-block > section {{
font-size: 13px;
line-height: 1.35;
}}
.spatial-block p {{ margin: 0.2rem 0; }}
.spatial-fallback {{
margin-top: 1rem;
padding-top: 0.5rem;
border-top: 1px solid #d8dee9;
}}
</style>
<div class="spatial-preview mathjax-preview">
<div class="spatial-canvas">
{''.join(rendered)}
</div>
{fallback}
</div>
"""
def embed_images(markdown, crops):
    """Replace each '**[Figure N]**' placeholder (emitted by clean_output)
    with the matching crop inlined as a base64 PNG data-URI markdown image.

    Args:
        markdown: markdown text containing figure placeholders.
        crops: image objects exposing .save(buf, format=...); crops[i] maps
            to '**[Figure i+1]**'.
    Returns:
        The markdown with images embedded; unchanged when crops is empty.
    """
    if not crops:
        return markdown
    for i, img in enumerate(crops):
        buf = BytesIO()
        img.save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode()
        # Bug fix: the encoded image was previously discarded and the
        # placeholder replaced with blank lines only — embed it instead.
        markdown = markdown.replace(
            f'**[Figure {i + 1}]**',
            f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n',
            1,
        )
    return markdown
def _infer_with_prompt(image, prompt, crop_mode=None):
    """Run model.infer on a single PIL image and return its cleaned stdout.

    model.infer prints its result to stdout rather than returning it, so
    stdout is temporarily redirected to a StringIO and debug/progress lines
    (matched via INFER_DEBUG_FILTERS) are filtered out afterwards. The temp
    JPEG and output dir are always removed, even if inference raises.

    Args:
        image: PIL image to OCR (saved as JPEG quality 95 for the model).
        prompt: full prompt string including the <image> token.
        crop_mode: tiling override; defaults to the module-level CROP_MODE.
    Returns:
        The filtered, newline-joined inference output.
    """
    if crop_mode is None:
        crop_mode = CROP_MODE
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    image.save(tmp.name, 'JPEG', quality=95)
    tmp.close()
    out_dir = tempfile.mkdtemp()
    stdout = sys.stdout
    capture = StringIO()
    sys.stdout = capture
    try:
        model.infer(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=tmp.name,
            output_path=out_dir,
            base_size=BASE_SIZE,
            image_size=IMAGE_SIZE,
            crop_mode=crop_mode,
            save_results=False
        )
    finally:
        # Restore stdout first so any cleanup errors are visible.
        sys.stdout = stdout
        os.unlink(tmp.name)
        shutil.rmtree(out_dir, ignore_errors=True)
    lines = [
        l for l in capture.getvalue().split('\n')
        if l.strip() and not any(s in l for s in INFER_DEBUG_FILTERS)
    ]
    return '\n'.join(lines).strip()
def _refine_equation_refs(image, raw_text):
    """Second-pass equation refinement.

    Crops each large math-looking region found in *raw_text*'s grounding and
    re-runs the equation-grounding prompt on the crop so individual equation
    lines can be recovered. Sub-boxes are mapped back onto the page's 0-999
    grid, deduped, and returned as refs shaped like
    extract_grounding_references output, labeled 'equation_detail'. Crops
    that yield fewer than two sub-boxes add no detail and are skipped.
    """
    entries = _extract_grounding_entries(raw_text)
    if not entries:
        return []
    img_w, img_h = image.size
    candidates = []
    for entry in entries:
        for box in entry["coords"]:
            if _is_math_candidate(entry["label"], entry["text"], box):
                area = (box[2] - box[0]) * (box[3] - box[1])
                candidates.append((area, entry, box))
    if not candidates:
        return []
    # Largest regions first; cap the number of expensive zoom inferences.
    candidates.sort(key=lambda x: x[0], reverse=True)
    refined_refs = []
    for _, entry, box in candidates[:EQUATION_ZOOM_MAX_CANDIDATES]:
        x1 = int(box[0] / 999.0 * img_w)
        y1 = int(box[1] / 999.0 * img_h)
        x2 = int(box[2] / 999.0 * img_w)
        y2 = int(box[3] / 999.0 * img_h)
        box_w = max(1, x2 - x1)
        box_h = max(1, y2 - y1)
        # Pad the crop slightly so glyphs at the region edge aren't clipped.
        pad_x = max(8, int(box_w * EQUATION_ZOOM_PADDING))
        pad_y = max(8, int(box_h * EQUATION_ZOOM_PADDING))
        crop_x1 = max(0, x1 - pad_x)
        crop_y1 = max(0, y1 - pad_y)
        crop_x2 = min(img_w, x2 + pad_x)
        crop_y2 = min(img_h, y2 + pad_y)
        if crop_x2 - crop_x1 < 32 or crop_y2 - crop_y1 < 32:
            continue
        crop = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
        sub_result = _infer_with_prompt(crop, EQUATION_ZOOM_PROMPT)
        sub_entries = _extract_grounding_entries(sub_result)
        if not sub_entries:
            continue
        mapped_boxes = []
        for sub in sub_entries:
            sub_label = sub["label"].lower()
            sub_text = sub["text"]
            is_math_sub = any(hint in sub_label for hint in MATH_LABEL_HINTS) or _math_marker_score(sub_text) >= 3
            if sub_label in ("image", "table") or not is_math_sub:
                continue
            for sub_box in sub["coords"]:
                mapped = _map_crop_box_to_page(sub_box, (crop_x1, crop_y1, crop_x2, crop_y2), img_w, img_h)
                w = (mapped[2] - mapped[0]) / 999.0
                h = (mapped[3] - mapped[1]) / 999.0
                # Discard specks (< 0.04% of the page).
                if w * h < 0.0004:
                    continue
                mapped_boxes.append(mapped)
        if not mapped_boxes:
            continue
        mapped_boxes = _dedupe_boxes(mapped_boxes, EQUATION_DETAIL_IOU_DEDUPE)
        mapped_boxes = sorted(mapped_boxes, key=lambda b: (b[1], b[0]))[:EQUATION_DETAIL_MAX_BOXES]
        if len(mapped_boxes) < 2:
            continue
        merged_text = repr(mapped_boxes)
        label = "equation_detail"
        raw = f'<|ref|>{label}<|/ref|><|det|>{merged_text}<|/det|>'
        refined_refs.append((raw, label, merged_text))
    return refined_refs
| def _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.0): | |
| x1 = int(box[0] / 999.0 * img_w) | |
| y1 = int(box[1] / 999.0 * img_h) | |
| x2 = int(box[2] / 999.0 * img_w) | |
| y2 = int(box[3] / 999.0 * img_h) | |
| if pad_ratio > 0: | |
| pad_x = max(1, int((x2 - x1) * pad_ratio)) | |
| pad_y = max(1, int((y2 - y1) * pad_ratio)) | |
| x1 -= pad_x | |
| y1 -= pad_y | |
| x2 += pad_x | |
| y2 += pad_y | |
| x1 = max(0, min(img_w - 1, x1)) | |
| y1 = max(0, min(img_h - 1, y1)) | |
| x2 = max(x1 + 1, min(img_w, x2)) | |
| y2 = max(y1 + 1, min(img_h, y2)) | |
| return (x1, y1, x2, y2) | |
def _detect_equation_line_boxes(image, infer_crop_mode=None):
    """Run a grounding detection pass and return candidate equation-line boxes.

    Returns (boxes, detect_raw): boxes are 0-999 normalized [x1, y1, x2, y2]
    coords, deduped by IoU and sorted top-to-bottom; detect_raw is the raw
    model output of the detection pass for downstream display.
    """
    detect_raw = _infer_with_prompt(image, EQUATION_ZOOM_PROMPT, crop_mode=infer_crop_mode)
    entries = _extract_grounding_entries(detect_raw)
    if not entries:
        return [], detect_raw
    boxes = []
    for entry in entries:
        label_l = entry["label"].lower()
        text_chunk = entry["text"]
        # Figures and tables are never equation lines.
        if label_l in ("image", "table"):
            continue
        for box in entry["coords"]:
            # Width/height as fractions of the page (model space is 0-999).
            w = (box[2] - box[0]) / 999.0
            h = (box[3] - box[1]) / 999.0
            area = w * h
            aspect = max(w / max(1e-9, h), h / max(1e-9, w))
            # "Math" either via the grounding label or LaTeX-like markers in text.
            looks_math = any(hint in label_l for hint in MATH_LABEL_HINTS) or _math_marker_score(text_chunk) >= 2
            if area < EQUATION_LINE_MIN_AREA or w < EQUATION_LINE_MIN_W or h < EQUATION_LINE_MIN_H:
                continue
            if aspect > EQUATION_LINE_MAX_ASPECT:
                continue
            # Boxes that don't look like math must be fairly large to survive.
            if not looks_math and area < 0.004:
                continue
            boxes.append(box)
    boxes = _dedupe_boxes(boxes, EQUATION_LINE_IOU_DEDUPE)
    # Sort by row (y rounded to absorb jitter), then by column.
    boxes = sorted(boxes, key=lambda b: (round(b[1], 3), b[0]))
    return boxes, detect_raw
def _process_equation_lines_separately(image, infer_crop_mode=None):
    """OCR each detected equation line as its own tight crop.

    Returns (cleaned, markdown, raw, annotated_image, crops) matching the
    process_image output shape, or None when no usable lines were found.
    """
    boxes, detect_raw = _detect_equation_line_boxes(image, infer_crop_mode=infer_crop_mode)
    if not boxes:
        return None
    img_w, img_h = image.size
    cleaned_parts = []
    markdown_parts = []
    raw_parts = [f"## Detection\n\n{detect_raw}".strip()]
    refs = []
    crops = []
    # Normalized equation texts already emitted; used to drop duplicates.
    seen_line_keys = set()
    for i, box in enumerate(boxes, 1):
        x1, y1, x2, y2 = _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.01)
        crop = image.crop((x1, y1, x2, y2))
        # Per-line OCR pass; crop_mode=False because the crop is already tight.
        line_raw = _infer_with_prompt(crop, EQUATION_LINE_OCR_PROMPT, crop_mode=False)
        line_clean = clean_output(line_raw, False).strip()
        if not line_clean:
            continue
        line_key = _equation_text_key(line_clean)
        if line_key and line_key in seen_line_keys:
            continue
        if line_key:
            seen_line_keys.add(line_key)
        line_label = f"Eq {i}"
        line_markdown = line_clean
        # Wrap bare LaTeX in display-math delimiters if the model emitted none.
        if "$$" not in line_markdown and "\\[" not in line_markdown and "\\(" not in line_markdown:
            line_markdown = f"$$\n{line_markdown}\n$$"
        cleaned_parts.append(f"{line_label}: {line_clean}")
        markdown_parts.append(f"### {line_label}\n\n{line_markdown}")
        raw_parts.append(f"## {line_label}\n\n{line_raw}")
        # Synthetic grounding ref so the line also shows in the Boxes tab.
        coord_text = repr([box])
        raw_ref = f'<|ref|>eq_line_{i}<|/ref|><|det|>{coord_text}<|/det|>'
        refs.append((raw_ref, line_label, coord_text))
        crops.append((crop, line_label))
    if not cleaned_parts:
        return None
    img_out, _ = draw_bounding_boxes(image, refs, extract_images=False)
    cleaned = "\n".join(cleaned_parts).strip()
    markdown = "\n\n".join(markdown_parts).strip()
    raw = "\n\n".join(raw_parts).strip()
    return cleaned, markdown, raw, img_out, crops
def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
    """Run the selected OCR task on a PIL image.

    Returns a 5-tuple (cleaned_text, markdown, raw_model_output,
    annotated_image, crops); every error/early-exit path returns the
    same shape so UI bindings stay valid.
    """
    model.cuda()  # GPU is available here — works on ZeroGPU and locally
    if image is None:
        return "Error: Upload an image", "", "", None, []
    # Custom/Locate tasks need user text unless equation-line mode bypasses them.
    if not separate_equation_lines and task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
        return "Please enter a prompt", "", "", None, []
    if image.mode in ('RGBA', 'LA', 'P'):
        image = image.convert('RGB')
    # Honor camera EXIF orientation before inference.
    image = ImageOps.exif_transpose(image)
    if separate_equation_lines:
        # Dedicated per-line equation pipeline; falls through only on failure.
        separate_result = _process_equation_lines_separately(image, infer_crop_mode=infer_crop_mode)
        if separate_result is not None:
            return separate_result
        msg = "No separate equation lines detected. Try Selected Region + freehand highlight around the equation steps."
        return msg, msg, msg, None, []
    if task == "✏️ Custom":
        prompt = f"<image>\n{custom_prompt.strip()}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "📍 Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]
    result = _infer_with_prompt(image, prompt, crop_mode=infer_crop_mode)
    if not result:
        return "No text detected", "", "", None, []
    cleaned = clean_output(result, False)
    markdown = clean_output(result, True)
    img_out = None
    crops = []
    figure_crops = []
    result_for_layout = result
    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        # Optional second pass that re-OCRs equation regions at higher zoom.
        if task == "📋 Markdown" and enable_equation_zoom:
            refs.extend(_refine_equation_refs(image, result))
        if refs:
            img_out, figure_crops = draw_bounding_boxes(image, refs, True)
            crops = _extract_labeled_crops_from_refs(image, refs)
            # Append synthetic equation_detail refs to the raw output so the
            # layout view can display them alongside the model's own refs.
            synthetic = [r[0] for r in refs if r[1] == "equation_detail"]
            if synthetic:
                result_for_layout = result + "\n" + "\n".join(synthetic)
        markdown = embed_images(markdown, figure_crops)
        if not crops and figure_crops:
            crops = _label_gallery_items(figure_crops, prefix="Figure")
    return cleaned, markdown, result_for_layout, img_out, crops
def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
    """Render one PDF page at 300 DPI and feed it through process_image."""
    doc = fitz.open(path)
    total_pages = len(doc)
    if not (1 <= page_num <= total_pages):
        doc.close()
        return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
    # 300/72 scales from PDF points (72 per inch) to a 300 DPI raster.
    pix = doc.load_page(page_num - 1).get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
    page_image = Image.open(BytesIO(pix.tobytes("png")))
    doc.close()
    return process_image(
        page_image,
        task,
        custom_prompt,
        enable_equation_zoom=enable_equation_zoom,
        infer_crop_mode=infer_crop_mode,
        separate_equation_lines=separate_equation_lines,
    )
def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
    """Dispatch an uploaded file to the PDF or image pipeline by extension."""
    if not path:
        return "Error: Upload a file", "", "", None, []
    shared_kwargs = dict(
        enable_equation_zoom=enable_equation_zoom,
        infer_crop_mode=infer_crop_mode,
        separate_equation_lines=separate_equation_lines,
    )
    if path.lower().endswith('.pdf'):
        return process_pdf(path, task, custom_prompt, page_num, **shared_kwargs)
    return process_image(Image.open(path), task, custom_prompt, **shared_kwargs)
| def _extract_editor_background(editor_value): | |
| if editor_value is None: | |
| return None | |
| if isinstance(editor_value, Image.Image): | |
| return editor_value | |
| if isinstance(editor_value, dict): | |
| background = editor_value.get("background") | |
| if isinstance(background, Image.Image): | |
| return background | |
| composite = editor_value.get("composite") | |
| if isinstance(composite, Image.Image): | |
| return composite | |
| return None | |
| def _to_rgba_image(obj): | |
| if isinstance(obj, dict): | |
| for k in ("image", "layer", "composite", "background", "mask"): | |
| if k in obj: | |
| return _to_rgba_image(obj[k]) | |
| return None | |
| if isinstance(obj, Image.Image): | |
| return obj.convert("RGBA") | |
| if isinstance(obj, np.ndarray): | |
| arr = obj | |
| if arr.ndim == 2: | |
| arr = np.stack([arr, arr, arr, np.full_like(arr, 255)], axis=-1) | |
| elif arr.ndim == 3 and arr.shape[2] == 3: | |
| alpha = np.full((arr.shape[0], arr.shape[1], 1), 255, dtype=arr.dtype) | |
| arr = np.concatenate([arr, alpha], axis=2) | |
| elif arr.ndim != 3 or arr.shape[2] != 4: | |
| return None | |
| return Image.fromarray(arr.astype(np.uint8), mode="RGBA") | |
| return None | |
| def _to_mask_array(obj): | |
| if obj is None: | |
| return None | |
| if isinstance(obj, dict): | |
| for k in ("mask", "image", "layer", "composite", "background"): | |
| if k in obj: | |
| arr = _to_mask_array(obj[k]) | |
| if arr is not None: | |
| return arr | |
| return None | |
| if isinstance(obj, Image.Image): | |
| arr = np.asarray(obj) | |
| elif isinstance(obj, np.ndarray): | |
| arr = obj | |
| else: | |
| return None | |
| if arr.ndim == 2: | |
| return arr > 0 | |
| if arr.ndim == 3: | |
| if arr.shape[2] >= 4: | |
| return arr[:, :, 3] > 0 | |
| return np.max(arr[:, :, :3], axis=2) > 0 | |
| return None | |
def _locate_patch_bbox(base_image: Image.Image, patch_image: Image.Image):
    """Approximate patch location in base image using downscaled SSD search."""
    if base_image is None or patch_image is None:
        return None
    # Grayscale float32 copies of both images for the SSD comparison.
    base = np.asarray(base_image.convert("L"), dtype=np.float32)
    patch = np.asarray(patch_image.convert("L"), dtype=np.float32)
    bh, bw = base.shape[:2]
    ph, pw = patch.shape[:2]
    # The patch must fit inside the base image.
    if ph <= 0 or pw <= 0 or ph > bh or pw > bw:
        return None
    # Downscale both so the longest base side is <= 320px, keeping search cheap.
    max_dim = max(bh, bw)
    scale = min(1.0, 320.0 / max_dim) if max_dim > 0 else 1.0
    if scale < 1.0:
        new_bw = max(1, int(round(bw * scale)))
        new_bh = max(1, int(round(bh * scale)))
        new_pw = max(1, int(round(pw * scale)))
        new_ph = max(1, int(round(ph * scale)))
        base_small = np.asarray(Image.fromarray(base.astype(np.uint8)).resize((new_bw, new_bh), Image.Resampling.BILINEAR), dtype=np.float32)
        patch_small = np.asarray(Image.fromarray(patch.astype(np.uint8)).resize((new_pw, new_ph), Image.Resampling.BILINEAR), dtype=np.float32)
    else:
        base_small = base
        patch_small = patch
    sbh, sbw = base_small.shape
    sph, spw = patch_small.shape
    # Independent rounding can make the scaled patch exceed the scaled base.
    if sph > sbh or spw > sbw:
        return None
    best_score = float("inf")
    best_x = 0
    best_y = 0
    # Exhaustive search, one row of vertical offsets at a time; horizontal
    # offsets are vectorized via a sliding window over each row strip.
    for y in range(sbh - sph + 1):
        row = base_small[y:y + sph, :]
        windows = np.lib.stride_tricks.sliding_window_view(row, spw, axis=1)
        # windows: (sph, sbw-spw+1, spw)
        diff = windows - patch_small[:, None, :]
        scores = np.mean(diff * diff, axis=(0, 2))
        x = int(np.argmin(scores))
        score = float(scores[x])
        if score < best_score:
            best_score = score
            best_x = x
            best_y = y
    # Map the best match back to full-resolution pixel coordinates.
    if scale < 1.0:
        x1 = int(round(best_x / scale))
        y1 = int(round(best_y / scale))
        x2 = int(round((best_x + spw) / scale))
        y2 = int(round((best_y + sph) / scale))
    else:
        x1, y1, x2, y2 = best_x, best_y, best_x + spw, best_y + sph
    # Clamp to the base image, guaranteeing a box of at least 1x1.
    x1 = max(0, min(bw - 1, x1))
    y1 = max(0, min(bh - 1, y1))
    x2 = max(x1 + 1, min(bw, x2))
    y2 = max(y1 + 1, min(bh, y2))
    return (x1, y1, x2, y2)
| def _component_boxes(binary_mask, min_pixels=24): | |
| h, w = binary_mask.shape | |
| visited = np.zeros((h, w), dtype=bool) | |
| boxes = [] | |
| neighbors = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)] | |
| ys, xs = np.where(binary_mask) | |
| for sy, sx in zip(ys.tolist(), xs.tolist()): | |
| if visited[sy, sx]: | |
| continue | |
| q = deque([(sy, sx)]) | |
| visited[sy, sx] = True | |
| min_x = max_x = sx | |
| min_y = max_y = sy | |
| count = 0 | |
| while q: | |
| y, x = q.popleft() | |
| count += 1 | |
| if x < min_x: | |
| min_x = x | |
| if x > max_x: | |
| max_x = x | |
| if y < min_y: | |
| min_y = y | |
| if y > max_y: | |
| max_y = y | |
| for dy, dx in neighbors: | |
| ny, nx = y + dy, x + dx | |
| if ny < 0 or ny >= h or nx < 0 or nx >= w: | |
| continue | |
| if visited[ny, nx] or not binary_mask[ny, nx]: | |
| continue | |
| visited[ny, nx] = True | |
| q.append((ny, nx)) | |
| if count >= min_pixels: | |
| boxes.append((min_x, min_y, max_x + 1, max_y + 1, count)) | |
| return boxes | |
def _extract_regions_from_mask(background, mask):
    """Crop a padded RGB region for every sizable painted component in mask.

    Returns [(crop, (x1, y1, x2, y2)), ...] sorted by box area, largest first.
    """
    regions = []
    for x1, y1, x2, y2, _count in _component_boxes(mask, min_pixels=24):
        # ~2% padding per side (at least 2px), clamped to the image bounds.
        pad_x = max(2, int((x2 - x1) * 0.02))
        pad_y = max(2, int((y2 - y1) * 0.02))
        bx1 = max(0, x1 - pad_x)
        by1 = max(0, y1 - pad_y)
        bx2 = min(background.width, x2 + pad_x)
        by2 = min(background.height, y2 + pad_y)
        if bx2 > bx1 and by2 > by1:
            crop = background.crop((bx1, by1, bx2, by2)).convert("RGB")
            regions.append((crop, (bx1, by1, bx2, by2)))
    regions.sort(
        key=lambda item: (item[1][2] - item[1][0]) * (item[1][3] - item[1][1]),
        reverse=True,
    )
    return regions
def _editor_background_and_mask(editor_value):
    """Resolve an ImageEditor-style dict into (RGBA background, boolean mask).

    The mask marks drawn pixels. An explicit 'mask' entry wins; otherwise the
    union of the layers' alpha channels is used. Returns (None, None) when no
    usable background exists, and (background, None) when there is a
    background but no drawing data.
    """
    if not isinstance(editor_value, dict):
        return None, None
    background = _to_rgba_image(editor_value.get("background"))
    if background is None:
        background = _to_rgba_image(editor_value.get("image"))
    composite = _to_rgba_image(editor_value.get("composite"))
    layers = editor_value.get("layers") or []
    if background is None:
        if composite is None:
            return None, None
        # No raw background available; fall back to the flattened composite.
        background = composite
    mask = _to_mask_array(editor_value.get("mask"))
    if mask is not None:
        if mask.shape[:2] != (background.height, background.width):
            # Resize to the background with nearest-neighbor so boolean values
            # survive without interpolation artifacts.
            mask_img = Image.fromarray(mask.astype(np.uint8) * 255, mode="L")
            nearest = Image.Resampling.NEAREST if hasattr(Image, "Resampling") else Image.NEAREST
            mask = np.asarray(mask_img.resize((background.width, background.height), nearest)) > 0
        return background, mask
    if not isinstance(layers, list) or not layers:
        return background, None
    # No explicit mask: union the alpha channel of every drawable layer.
    alpha_acc = np.zeros((background.height, background.width), dtype=np.uint8)
    for layer in layers:
        layer_img = _to_rgba_image(layer)
        if layer_img is None:
            continue
        if layer_img.size != background.size:
            nearest = Image.Resampling.NEAREST if hasattr(Image, "Resampling") else Image.NEAREST
            layer_img = layer_img.resize(background.size, nearest)
        layer_alpha = np.asarray(layer_img, dtype=np.uint8)[:, :, 3]
        alpha_acc = np.maximum(alpha_acc, layer_alpha)
    return background, (alpha_acc > 0)
| def _extract_selected_regions(editor_value, base_size=None, base_image=None): | |
| if editor_value is None: | |
| return [] | |
| if isinstance(editor_value, Image.Image): | |
| if base_size and tuple(editor_value.size) == tuple(base_size): | |
| return [] | |
| bbox = _locate_patch_bbox(base_image, editor_value) if base_image is not None else None | |
| return [(editor_value, bbox)] | |
| if not isinstance(editor_value, dict): | |
| return [] | |
| background, mask = _editor_background_and_mask(editor_value) | |
| layers = editor_value.get("layers") or [] | |
| if background is None: | |
| return [] | |
| if not isinstance(layers, list) or not layers: | |
| # No annotation layers; treat as explicit crop only if size changed from base. | |
| if base_size and tuple(background.size) == tuple(base_size): | |
| return [] | |
| patch = background.convert("RGB") | |
| bbox = _locate_patch_bbox(base_image, patch) if base_image is not None else None | |
| return [(patch, bbox)] | |
| if mask is None: | |
| return [] | |
| return _extract_regions_from_mask(background, mask) | |
def _extract_new_drawn_regions(editor_value, base_size=None, base_image=None, consumed_mask=None):
    """Like _extract_selected_regions, but only for strokes drawn since
    consumed_mask was captured; returns (regions, updated_consumed_mask)."""
    if isinstance(editor_value, Image.Image):
        # Explicit cropped image: classic extraction, mask bookkeeping unchanged.
        regions = _extract_selected_regions(editor_value, base_size=base_size, base_image=base_image)
        return regions, consumed_mask
    if not isinstance(editor_value, dict):
        return [], consumed_mask
    background, mask = _editor_background_and_mask(editor_value)
    if background is None:
        return [], consumed_mask
    layers = editor_value.get("layers") or []
    has_draw_data = mask is not None or (isinstance(layers, list) and len(layers) > 0)
    if not has_draw_data:
        # Nothing drawn: fall back to explicit-crop handling.
        regions = _extract_selected_regions(editor_value, base_size=base_size, base_image=base_image)
        return regions, consumed_mask
    if mask is None:
        return [], consumed_mask
    # Only process pixels not already consumed by a previous Add Region click.
    if isinstance(consumed_mask, np.ndarray) and consumed_mask.shape == mask.shape:
        fresh = np.logical_and(mask, np.logical_not(consumed_mask))
    else:
        fresh = mask
    return _extract_regions_from_mask(background, fresh), mask
def _extract_selected_region(editor_value, base_size=None, base_image=None):
    """Return the first (largest) selected region as (crop, bbox), or (None, None)."""
    regions = _extract_selected_regions(editor_value, base_size=base_size, base_image=base_image)
    return regions[0] if regions else (None, None)
| def _bbox_overlap_ratio(a, b): | |
| ax1, ay1, ax2, ay2 = a | |
| bx1, by1, bx2, by2 = b | |
| ix1 = max(ax1, bx1) | |
| iy1 = max(ay1, by1) | |
| ix2 = min(ax2, bx2) | |
| iy2 = min(ay2, by2) | |
| if ix2 <= ix1 or iy2 <= iy1: | |
| return 0.0, 0.0 | |
| inter = float((ix2 - ix1) * (iy2 - iy1)) | |
| area_a = float(max(1, (ax2 - ax1) * (ay2 - ay1))) | |
| area_b = float(max(1, (bx2 - bx1) * (by2 - by1))) | |
| return inter / area_a, inter / area_b | |
def _is_duplicate_bbox(candidate_bbox, existing_bbox):
    """True when two boxes are near-identical or one nearly covers the other."""
    if _box_iou(candidate_bbox, existing_bbox) >= 0.85:
        return True
    cand_cover, exist_cover = _bbox_overlap_ratio(candidate_bbox, existing_bbox)
    return cand_cover >= 0.92 or exist_cover >= 0.97
| def _draw_selected_region_boxes(image, boxes): | |
| if image is None or not boxes: | |
| return None | |
| refs = [] | |
| w, h = image.size | |
| for i, b in enumerate(boxes, 1): | |
| x1, y1, x2, y2 = b | |
| nx1 = max(0.0, min(999.0, x1 / max(1, w) * 999.0)) | |
| ny1 = max(0.0, min(999.0, y1 / max(1, h) * 999.0)) | |
| nx2 = max(0.0, min(999.0, x2 / max(1, w) * 999.0)) | |
| ny2 = max(0.0, min(999.0, y2 / max(1, h) * 999.0)) | |
| label = f"Region {i}" | |
| coord_text = repr([[nx1, ny1, nx2, ny2]]) | |
| raw = f'<|ref|>region_{i}<|/ref|><|det|>{coord_text}<|/det|>' | |
| refs.append((raw, label, coord_text)) | |
| img_out, _ = draw_bounding_boxes(image, refs, extract_images=False) | |
| return img_out | |
| def _region_gallery_items(regions): | |
| items = [] | |
| for i, r in enumerate(regions, 1): | |
| img = r["image"] | |
| label = f"Region {i}" | |
| if isinstance(img, Image.Image): | |
| label = f"{label} ({img.width}x{img.height})" | |
| items.append((img, label)) | |
| return items | |
| def _label_gallery_items(items, prefix=None): | |
| labeled = [] | |
| for i, item in enumerate(items, 1): | |
| if isinstance(item, tuple) and len(item) >= 2: | |
| img, label = item[0], str(item[1]) | |
| else: | |
| img, label = item, f"Item {i}" | |
| if prefix: | |
| label = f"{prefix} - {label}" | |
| if isinstance(img, Image.Image): | |
| label = f"{label} ({img.width}x{img.height})" | |
| labeled.append((img, label)) | |
| return labeled | |
| def _reset_selected_regions(): | |
| return [], [], "No saved regions." | |
| def _reset_drawn_mask(): | |
| return None | |
def add_selected_region(editor_value, base_size, base_image, selected_regions, consumed_mask):
    """Append newly drawn regions to the saved list, skipping near-duplicates.

    Returns (regions, gallery_items, status_message, updated_consumed_mask).
    """
    candidates, updated_mask = _extract_new_drawn_regions(
        editor_value,
        base_size=base_size,
        base_image=base_image,
        consumed_mask=consumed_mask,
    )
    regions = list(selected_regions or [])
    if not candidates:
        status = "No region detected. Use Crop or draw/highlight a region first."
        return regions, _region_gallery_items(regions), status, updated_mask
    known_boxes = [r.get("bbox") for r in regions if r.get("bbox") is not None]
    added = 0
    for region_img, bbox in candidates:
        if bbox is not None and any(_is_duplicate_bbox(bbox, kb) for kb in known_boxes):
            continue  # Near-duplicate of an already-saved region.
        regions.append({"image": region_img, "bbox": bbox})
        if bbox is not None:
            known_boxes.append(bbox)
        added += 1
    if added:
        status = f"Added {added} region(s). {len(regions)} total. Zoom/pan is preserved."
    else:
        status = "No new region added. Draw one region, click Add Region, then draw the next region."
    return regions, _region_gallery_items(regions), status, updated_mask
def clear_selected_regions():
    """Reset saved regions, gallery, and status text to their initial state."""
    return _reset_selected_regions()
def clear_regions_preserve_view(editor_value):
    """Clear saved regions but mark current strokes as consumed so the
    existing drawing (and the user's zoom/pan) does not re-trigger adds."""
    regions, gallery_items, status = _reset_selected_regions()
    _background, mask = _editor_background_and_mask(editor_value)
    return regions, gallery_items, status, mask
def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
    """Map pipeline outputs onto the UI widgets.

    Converts LaTeX delimiters for the text tab (\\[...\\] -> $$...$$,
    \\(...\\) -> $...$), writes the cleaned text to a temp .md file for the
    download button, and renders the markdown to math-aware HTML.
    """
    text_display = re.sub(
        r'\\\[(.+?)\\\]',
        lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
        cleaned,
        flags=re.DOTALL,
    )
    text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)
    # delete=False: Gradio serves the file after this function returns.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8') as dl_tmp:
        dl_tmp.write(cleaned)
    return (
        text_display,
        cleaned,
        to_math_html(markdown),
        raw,
        img_out,
        gallery_items,
        gr.DownloadButton(value=dl_tmp.name, visible=True),
    )
def toggle_prompt(task):
    """Show the prompt textbox only for tasks that need free-text input."""
    if task == "✏️ Custom":
        return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
    if task == "📍 Locate":
        return gr.update(visible=True, label="Text to Locate", placeholder="Enter text to locate")
    return gr.update(visible=False)
def select_boxes(task):
    """Jump to the Boxes tab when the task produces bounding boxes by default."""
    return gr.update(selected="tab_boxes") if task == "📍 Locate" else gr.update()
def toggle_scope_ui(scope):
    """Update the workspace hint and show/hide the region-selection controls."""
    region_mode = scope == "Selected Region"
    if region_mode:
        hint = (
            "**Selected Region mode:** Draw/highlight on the workspace, click **Add Region** "
            "for each target area, then click **Extract**."
        )
    else:
        hint = "**Entire Page mode:** No drawing needed. Click **Extract** to process the full page."
    # (hint markdown, selection_controls row, status box, regions gallery)
    return (
        gr.update(value=hint),
        gr.update(visible=region_mode),
        gr.update(visible=region_mode),
        gr.update(visible=region_mode),
    )
def select_post_extract_tab(task, scope):
    """After extraction, land on Boxes for region/locate flows, else Text."""
    wants_boxes = scope == "Selected Region" or task == "📍 Locate"
    return gr.update(selected="tab_boxes" if wants_boxes else "tab_text")
def get_pdf_page_count(file_path):
    """Number of pages in a PDF; 1 for anything that is not a PDF path."""
    if not file_path or not file_path.lower().endswith('.pdf'):
        return 1
    doc = fitz.open(file_path)
    page_count = len(doc)
    doc.close()
    return page_count
def load_image(file_path, page_num=1):
    """Open an image file, or render one page of a PDF at 300 DPI.

    Returns a PIL image, or None when no path is given.
    """
    if not file_path:
        return None
    if not file_path.lower().endswith('.pdf'):
        return Image.open(file_path)
    doc = fitz.open(file_path)
    # Clamp the requested 1-based page into the document's valid range.
    page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
    pix = doc.load_page(page_idx).get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
    img = Image.open(BytesIO(pix.tobytes("png")))
    doc.close()
    return img
| def _scale_workspace_image(img, workspace_scale): | |
| if img is None: | |
| return None | |
| # Keep native pixels for workspace quality. Gradio's in-canvas zoom controls | |
| # visual scale; pre-resampling here causes blurry math when users zoom in. | |
| return img | |
def _prepare_workspace_image(img, workspace_scale=WORKSPACE_DEFAULT_SCALE):
    """Package an image for the workspace editor.

    Returns (display_image, (width, height), base_image) — the display and
    base image are the same object since no pre-scaling is applied.
    """
    if img is None:
        return None, None, None
    display_img = _scale_workspace_image(img, workspace_scale)
    size = (int(display_img.width), int(display_img.height))
    return display_img, size, display_img
def load_image_with_size(file_path, page_num=1, workspace_scale=WORKSPACE_DEFAULT_SCALE):
    """Load a file/page and package it for the workspace editor."""
    return _prepare_workspace_image(load_image(file_path, page_num), workspace_scale)
def _example_path_value(value):
    """Best-effort extraction of a filesystem path from one payload element."""
    if isinstance(value, os.PathLike):
        return os.fspath(value)
    if isinstance(value, str):
        return value
    return None

def load_example_into_workspace(example_value):
    """Resolve a gr.Examples payload (path, dict, list/tuple, or image) and
    load it into the workspace; returns (display_img, base_size, base_img)."""
    if example_value is None:
        return None, None, None
    file_path = _example_path_value(example_value)
    if file_path is None and isinstance(example_value, dict):
        file_path = _example_path_value(example_value.get("path") or example_value.get("name"))
    if file_path is None and isinstance(example_value, (list, tuple)) and example_value:
        file_path = _example_path_value(example_value[0])
    if file_path:
        return _prepare_workspace_image(load_image(file_path, 1), WORKSPACE_DEFAULT_SCALE)
    # No usable path: try to interpret the payload as image data directly.
    if isinstance(example_value, Image.Image):
        img = example_value
    else:
        maybe_rgba = _to_rgba_image(example_value)
        if maybe_rgba is None:
            return None, None, None
        img = maybe_rgba.convert("RGB")
    return _prepare_workspace_image(img, WORKSPACE_DEFAULT_SCALE)
def load_example_into_workspace_and_reset(example_value):
    """Load an example into the workspace and reset region-selection state
    (regions list, gallery, status text, consumed mask)."""
    workspace = load_example_into_workspace(example_value)
    return (*workspace, [], [], "No saved regions.", None)
def sync_workspace_state(editor_value, current_base_image):
    """Refresh the (base_size, base_image) state from the editor, falling back
    to the previously stored base image when the editor has none."""
    for candidate in (_extract_editor_background(editor_value), current_base_image):
        if isinstance(candidate, Image.Image):
            return (int(candidate.width), int(candidate.height)), candidate
    return None, None
def update_page_selector(file_path):
    """Show a 1..N page picker for PDFs; hide it for images or no file."""
    if file_path and file_path.lower().endswith('.pdf'):
        page_count = get_pdf_page_count(file_path)
        return gr.update(
            visible=True,
            maximum=page_count,
            value=1,
            minimum=1,
            label=f"Select Page (1-{page_count})",
        )
    return gr.update(visible=False)
# gr.Blocks() kwargs: apply the Soft theme only when this Gradio version
# exposes it, so the app still launches on older Gradio builds.
blocks_kwargs = {"title": "DeepSeek-OCR-2"}
if hasattr(gr, "themes") and hasattr(gr.themes, "Soft"):
    try:
        blocks_kwargs["theme"] = gr.themes.Soft()
    except Exception:
        # Best-effort theming: fall back to the default theme on any failure.
        pass
| with gr.Blocks(**blocks_kwargs) as demo: | |
| gr.Markdown(""" | |
| # 🧮 DeepSeek-OCR-2 — Math Rendering Edition | |
| **Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.** | |
| **Model uses DeepEncoder v2 and achieves 91.09% on OmniDocBench (+3.73% over v1).** | |
| Built on the original [DeepSeek-OCR-2 Demo](https://huggingface.co/spaces/merterbak/DeepSeek-OCR-2) by **Mert Erbak** — thank you for the excellent foundation. | |
| This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation. | |
| """) | |
| region_editor = None | |
| workspace_base_size = gr.State(None) | |
| workspace_base_image = gr.State(None) | |
| selected_regions_state = gr.State([]) | |
| drawn_mask_state = gr.State(None) | |
| with gr.Row(): | |
| with gr.Column(scale=3): | |
| file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath") | |
| with gr.Column(scale=1): | |
| page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False) | |
| with gr.Row(): | |
| with gr.Column(scale=3): | |
| workspace_hint = gr.Markdown("**Entire Page mode:** No drawing needed. Click **Extract** to process the full page.") | |
| gr.Markdown("**Image Workspace (full page + region selection)**") | |
| if HAS_REGION_WORKSPACE: | |
| editor_kwargs = {} | |
| if HAS_BRUSH: | |
| try: | |
| highlight = ("#2563eb", 0.35) | |
| editor_kwargs["brush"] = gr.Brush( | |
| colors=[highlight], | |
| default_color=highlight, | |
| color_mode="fixed", | |
| default_size=22, | |
| ) | |
| except TypeError: | |
| try: | |
| editor_kwargs["brush"] = gr.Brush( | |
| colors=["rgba(37,99,235,0.35)"], | |
| default_color="rgba(37,99,235,0.35)", | |
| color_mode="fixed", | |
| default_size=22, | |
| ) | |
| except TypeError: | |
| editor_kwargs["brush"] = gr.Brush() | |
| if HAS_ERASER: | |
| try: | |
| editor_kwargs["eraser"] = gr.Eraser(default_size=26) | |
| except TypeError: | |
| editor_kwargs["eraser"] = gr.Eraser() | |
| if HAS_IMAGE_EDITOR: | |
| try: | |
| region_editor = gr.ImageEditor( | |
| label="Image Workspace", | |
| show_label=False, | |
| type="pil", | |
| height=WORKSPACE_EDITOR_HEIGHT, | |
| **editor_kwargs, | |
| ) | |
| except TypeError: | |
| try: | |
| region_editor = gr.ImageEditor( | |
| label="Image Workspace", | |
| show_label=False, | |
| height=WORKSPACE_EDITOR_HEIGHT, | |
| **editor_kwargs, | |
| ) | |
| except TypeError: | |
| region_editor = gr.ImageEditor( | |
| label="Image Workspace", | |
| show_label=False, | |
| height=WORKSPACE_EDITOR_HEIGHT, | |
| ) | |
| else: | |
| region_editor = gr.Paint( | |
| label="Image Workspace", | |
| show_label=False, | |
| type="pil", | |
| height=WORKSPACE_EDITOR_HEIGHT, | |
| **editor_kwargs, | |
| ) | |
| else: | |
| gr.Markdown("Region drawing requires a newer Gradio version with `Paint` or `ImageEditor` support.") | |
| region_editor = gr.State(None) | |
| with gr.Column(scale=1): | |
| gr.Markdown("### OCR Workflow") | |
| task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task") | |
| input_scope = gr.Radio(["Entire Page", "Selected Region"], value="Entire Page", label="Input Scope") | |
| selection_controls = gr.Row(visible=False) | |
| with selection_controls: | |
| add_region_btn = gr.Button("Add Region", variant="secondary") | |
| clear_regions_btn = gr.Button("Clear Regions") | |
| selection_status = gr.Textbox(label="Region Selection Status", value="No saved regions.", interactive=False, visible=False) | |
| selected_regions_gallery = gr.Gallery( | |
| label="Selected Regions", | |
| show_label=True, | |
| columns=2, | |
| height=190, | |
| visible=False, | |
| object_fit="contain", | |
| ) | |
| with gr.Accordion("Advanced Options", open=False): | |
| equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False) | |
| separate_eq_lines = gr.Checkbox(label="Detect Equation Lines Separately", value=False) | |
| prompt = gr.Textbox(label="Prompt", lines=2, visible=False) | |
| btn = gr.Button("Extract", variant="primary", size="lg") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| with gr.Tabs() as tabs: | |
| with gr.Tab("Text", id="tab_text"): | |
| text_out = gr.Textbox(lines=20, show_label=False) | |
| with gr.Tab("LaTeX", id="tab_text_latex"): | |
| latex_out = gr.Textbox(lines=20, show_label=False) | |
| with gr.Tab("Preview", id="tab_markdown"): | |
| md_out = gr.HTML("") | |
| with gr.Tab("Boxes", id="tab_boxes"): | |
| img_out = gr.Image(type="pil", height=560, show_label=False) | |
| with gr.Tab("Crops", id="tab_crops"): | |
| gallery = gr.Gallery(show_label=False, columns=3, height=420, object_fit="contain") | |
| with gr.Tab("Raw", id="tab_raw"): | |
| raw_out = gr.Textbox(lines=20, show_label=False) | |
| download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary") | |
# Example pickers. Image examples route through different components depending
# on whether the region workspace (Paint/ImageEditor) is available.
gr.Markdown("### Examples")
with gr.Row():
with gr.Column(scale=2):
image_examples = [
"examples/2022-0922 Section 13 Notes.png",
"examples/2022-0922 Section 14 Notes.png",
"examples/2022-0922 Section 15 Notes.png",
]
if HAS_REGION_WORKSPACE and region_editor is not None:
# Hidden Image serves as the Examples input so a thumbnail click runs
# load_example_into_workspace_and_reset, which populates the editor and
# resets saved regions / drawn-mask state (see outputs list below).
image_examples_input = gr.Image(
label="Example Loader",
type="filepath",
visible=False,
show_label=False,
)
gr.Examples(
label="Image Examples (click thumbnail to load into workspace)",
examples=image_examples,
inputs=[image_examples_input],
outputs=[region_editor, workspace_base_size, workspace_base_image, selected_regions_state, selected_regions_gallery, selection_status, drawn_mask_state],
fn=load_example_into_workspace_and_reset,
run_on_click=True,
cache_examples=False,
)
else:
# No workspace: example clicks just populate the plain file input.
gr.Examples(
label="Image Examples",
examples=[[p] for p in image_examples],
inputs=[file_in],
cache_examples=False,
)
with gr.Column(scale=1):
gr.Examples(
label="PDF Examples",
examples=[["examples/Gursoy Class Notes_ Accessibility Sandbox.pdf"]],
inputs=[file_in],
cache_examples=False,
)
# Collapsible end-user help. The triple-quoted text below is user-facing
# markdown rendered at runtime — it is data, not a docstring; do not edit it
# as if it were a comment.
with gr.Accordion("ℹ️ Info", open=False):
gr.Markdown("""
### Configuration
1024 base + 768 patches with dynamic cropping (2-6 patches). 144 tokens per patch + 256 base tokens.
### Faculty Quick Workflow
1. Upload a page/image, then confirm **Task**.
2. Choose **Input Scope**:
- `Entire Page` for the full page.
- `Selected Region` for a specific area.
2a. Workspace keeps native image resolution for clarity. For very tall pages, it auto-boosts from tiny fit view toward ~88% width-friendly zoom.
3. For `Selected Region`, use the **Image Workspace**:
- Recommended: freehand selection (draw/highlight target); app uses an automatic bounding box around your marks.
- Optional rectangle selection: use the **Crop** tool.
- Freehand/highlight ink is semi-transparent so underlying content stays visible.
- Current known behavior: after zooming in/out, freehand stroke display may appear fully on mouse release (selection is still captured correctly).
- Optional multi-select: click **Add Region** after each selection.
- **Add Region** snapshots only newly drawn pixels so zoom/pan stays in place while you continue selecting.
Then click **Extract**.
4. Use **Clear Regions** to reset multi-select state.
5. Review **Cropped Images** and **Boxes**: both are labeled `Region 1`, `Region 2`, etc.
6. Use **Advanced Options** only when needed (Equation Zoom / line-by-line equation OCR).
### Tasks
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
- **Free OCR**: Read all visible text from the full page/image (no boxes, no targeting)
- **Locate**: Find and highlight where specific text appears (grounding ✅)
- **Describe**: General image description
- **Custom**: Your own prompt
- **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
- **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
- **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
- **Detect Equation Lines Separately**: Detects likely equation-line boxes and OCRs each line independently to reduce merged multi-step equations.
### Free OCR vs Locate (important)
- **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
- If you want OCR for one area only, crop that area first, then run **Free OCR** on the cropped image.
- If you want to keep the full page but highlight where text appears, use **Locate** and enter the text to search.
- For advanced region workflows, use **Custom** with `<|grounding|>` in the prompt.
### Special Tokens
- `<image>` - Placeholder where visual tokens are inserted
- `<|grounding|>` - Enables layout detection with bounding boxes
- `<|ref|>text<|/ref|>` - Reference text to locate in the image
""")
# --- Event wiring -----------------------------------------------------------
# UI-only reactions: refresh the page selector on new files, and show/hide the
# prompt box, box tab, and region controls as task/scope change.
file_in.change(update_page_selector, [file_in], [page_selector])
task.change(toggle_prompt, [task], [prompt])
task.change(select_boxes, [task], [tabs])
input_scope.change(toggle_scope_ui, [input_scope], [workspace_hint, selection_controls, selection_status, selected_regions_gallery])
# Region-workspace wiring only exists when an editor component was created.
if HAS_REGION_WORKSPACE and region_editor is not None:
# Load the chosen file/page into the editor and cache its base size/image.
file_in.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
page_selector.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
# Keep the cached base size/image in sync with edits made in the workspace.
region_editor.change(sync_workspace_state, [region_editor, workspace_base_image], [workspace_base_size, workspace_base_image])
# A new file or page invalidates previously saved regions and the drawn mask.
file_in.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
page_selector.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
file_in.change(_reset_drawn_mask, outputs=[drawn_mask_state])
page_selector.change(_reset_drawn_mask, outputs=[drawn_mask_state])
# Snapshot the current drawing as a saved region / clear all saved regions.
add_region_btn.click(
add_selected_region,
[region_editor, workspace_base_size, workspace_base_image, selected_regions_state, drawn_mask_state],
[selected_regions_state, selected_regions_gallery, selection_status, drawn_mask_state],
)
clear_regions_btn.click(
clear_regions_preserve_view,
inputs=[region_editor],
outputs=[selected_regions_state, selected_regions_gallery, selection_status, drawn_mask_state],
)
def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, detect_eq_lines, scope, region_value, base_size, base_image, selected_regions):
    """Handle a click on the Extract button.

    Picks an input source in priority order — saved/drawn regions when the
    scope is "Selected Region", otherwise the uploaded file, then the editor
    background, then the cached base image — runs OCR on it, and returns the
    7-tuple consumed by (text_out, latex_out, md_out, raw_out, img_out,
    gallery, download_btn).
    """
    if scope == "Selected Region":
        region_list = list(selected_regions or [])
        if not region_list:
            # No saved multi-select regions: fall back to whatever is
            # currently cropped/annotated in the workspace editor.
            crop_img, crop_bbox = _extract_selected_region(region_value, base_size=base_size, base_image=base_image)
            if crop_img is None:
                msg = "Select Input Scope=Selected Region, then crop or annotate a target area in the Image Workspace first."
                return (msg, "", "", "", None, [], gr.DownloadButton(visible=False))
            region_list = [{"image": crop_img, "bbox": crop_bbox}]
        multi = len(region_list) > 1
        cleaned_chunks = []
        markdown_chunks = []
        raw_chunks = []
        eq_line_crops = []
        for idx, region in enumerate(region_list, 1):
            cleaned_i, markdown_i, raw_i, _, crops_i = process_image(
                region["image"],
                task,
                custom_prompt,
                enable_equation_zoom=enable_equation_zoom,
                infer_crop_mode=False,
                separate_equation_lines=detect_eq_lines,
            )
            if multi:
                # Prefix per-region output so the merged document stays navigable.
                cleaned_i = f"## Region {idx}\n\n{cleaned_i}"
                markdown_i = f"## Region {idx}\n\n{markdown_i}"
                raw_i = f"## Region {idx}\n\n{raw_i}"
            cleaned_chunks.append(cleaned_i)
            markdown_chunks.append(markdown_i)
            raw_chunks.append(raw_i)
            if detect_eq_lines and crops_i:
                eq_line_crops.extend(_label_gallery_items(crops_i, prefix=f"Region {idx}" if multi else None))
        cleaned = "\n\n".join(cleaned_chunks).strip()
        markdown = "\n\n".join(markdown_chunks).strip()
        raw = "\n\n".join(raw_chunks).strip()
        crops = eq_line_crops if eq_line_crops else _region_gallery_items(region_list)
        # Draw every region's bounding box on the full page for the Boxes tab.
        full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
        region_boxes = [r["bbox"] for r in region_list if r.get("bbox") is not None]
        boxes_img = _draw_selected_region_boxes(full_img, region_boxes)
        return _compose_ui_outputs(cleaned, markdown, raw, boxes_img, crops)
    if file_path:
        cleaned, markdown, raw, boxes_img, crops = process_file(
            file_path,
            task,
            custom_prompt,
            int(page_num),
            enable_equation_zoom=enable_equation_zoom,
            separate_equation_lines=detect_eq_lines,
        )
        return _compose_ui_outputs(cleaned, markdown, raw, boxes_img, crops)
    editor_bg = _extract_editor_background(region_value)
    if editor_bg is not None:
        cleaned, markdown, raw, boxes_img, crops = process_image(
            editor_bg,
            task,
            custom_prompt,
            enable_equation_zoom=enable_equation_zoom,
            separate_equation_lines=detect_eq_lines,
        )
        return _compose_ui_outputs(cleaned, markdown, raw, boxes_img, crops)
    if isinstance(base_image, Image.Image):
        # Example clicks can briefly race editor-value hydration on first load.
        cleaned, markdown, raw, boxes_img, crops = process_image(
            base_image,
            task,
            custom_prompt,
            enable_equation_zoom=enable_equation_zoom,
            separate_equation_lines=detect_eq_lines,
        )
        return _compose_ui_outputs(cleaned, markdown, raw, boxes_img, crops)
    return ("Error: Upload a file or image", "", "", "", None, [], gr.DownloadButton(visible=False))
# Main pipeline: run OCR on Extract, then switch the results Tabs to the most
# relevant tab for the chosen task/scope once outputs are in place.
submit_event = btn.click(
run,
[file_in, task, prompt, page_selector, equation_zoom, separate_eq_lines, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
[text_out, latex_out, md_out, raw_out, img_out, gallery, download_btn]
)
submit_event.then(select_post_extract_tab, [task, input_scope], [tabs])
if __name__ == "__main__":
    # server_name="0.0.0.0" is needed locally (WSL2 → Windows access)
    # On HuggingFace Spaces, SPACE_ID is set and Gradio handles binding automatically
    running_locally = not os.environ.get("SPACE_ID")
    app = demo.queue(max_size=20)
    # Feature-detect launch() kwargs so the same script runs across Gradio versions.
    supported_params = inspect.signature(app.launch).parameters
    launch_kwargs = {}
    if "server_name" in supported_params:
        launch_kwargs["server_name"] = "0.0.0.0" if running_locally else None
    if "head" in supported_params:
        launch_kwargs["head"] = PREVIEW_CSS
    if "ssr_mode" in supported_params:
        # SSR breaks HF Spaces routing in Gradio 6
        launch_kwargs["ssr_mode"] = False
    app.launch(**launch_kwargs)