ricklon's picture
Use workspace base image as first-load extract fallback
9a36dca
import gradio as gr
from transformers import AutoModel, AutoTokenizer
import torch
import spaces
import os
import sys
import tempfile
import shutil
import inspect
from PIL import Image, ImageDraw, ImageFont, ImageOps
import fitz
import re
import ast
import numpy as np
import base64
import html as html_lib
import markdown as md_lib
import latex2mathml.converter
from collections import deque
from io import StringIO, BytesIO
HAS_IMAGE_EDITOR = hasattr(gr, "ImageEditor")
HAS_PAINT = hasattr(gr, "Paint")
HAS_BRUSH = hasattr(gr, "Brush")
HAS_ERASER = hasattr(gr, "Eraser")
HAS_REGION_WORKSPACE = HAS_PAINT or HAS_IMAGE_EDITOR
# Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
#
# Full precision BF16 (~8GB VRAM) — original, highest accuracy
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
#
# FP8 dynamic quantization (~3.5GB VRAM) — ~50% VRAM reduction, 3750 downloads/mo
# Requires Ampere GPU or newer (RTX 3070 is supported)
# MODEL_NAME = 'richarddavison/DeepSeek-OCR-2-FP8'
#
# 8-bit quantization (~4GB VRAM) — same stack (torch 2.6, flash-attn 2.7.3, py3.12)
# Explicitly supports dynamic resolution (0-6 patches), 140 downloads/mo
# MODEL_NAME = 'mzbac/DeepSeek-OCR-2-8bit'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# flash_attention_2 requires a CUDA device at init time — not available on ZeroGPU at
# module load. DeepseekOCR2 only supports 'flash_attention_2' and 'eager'; sdpa is not
# implemented for this model class. Fall back to 'eager' when no GPU is present.
# Locally with CUDA, flash_attention_2 is used for maximum throughput.
_attn_impl = 'flash_attention_2' if torch.cuda.is_available() else 'eager'
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation=_attn_impl, torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True).eval()
# .cuda() is NOT called here — on ZeroGPU, GPU is only available inside @spaces.GPU
# functions. Locally, model.cuda() is called inside process_image on first run.
BASE_SIZE = 1024
IMAGE_SIZE = 768
CROP_MODE = True
WORKSPACE_EDITOR_HEIGHT = 640
WORKSPACE_EDITOR_WIDTH_EST = 980
WORKSPACE_DEFAULT_SCALE = 89
GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
EQUATION_LINE_OCR_PROMPT = "<image>\nRead the math expression exactly as written. Return only the equation text."
EQUATION_ZOOM_MAX_CANDIDATES = 6
EQUATION_ZOOM_MIN_AREA = 0.05
EQUATION_ZOOM_MIN_DIM = 0.24
EQUATION_ZOOM_PADDING = 0.025
EQUATION_ZOOM_MAX_ASPECT = 12.0
EQUATION_DETAIL_MAX_BOXES = 24
EQUATION_DETAIL_IOU_DEDUPE = 0.7
EQUATION_LINE_IOU_DEDUPE = 0.55
EQUATION_LINE_MIN_AREA = 0.0008
EQUATION_LINE_MIN_W = 0.03
EQUATION_LINE_MIN_H = 0.01
EQUATION_LINE_MAX_ASPECT = 30.0
MATH_LABEL_HINTS = ("formula", "equation", "math")
MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
TASK_PROMPTS = {
"📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
"📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
"📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
"🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
"✏️ Custom": {"prompt": "", "has_grounding": False}
}
def extract_grounding_references(text):
refs = []
seen = set()
for entry in _extract_grounding_entries(text):
coord_text = repr(entry["coords"])
key = (
entry["label"].strip().lower(),
tuple(
(round(c[0], 1), round(c[1], 1), round(c[2], 1), round(c[3], 1))
for c in entry["coords"]
),
)
if key in seen:
continue
seen.add(key)
raw = f'<|ref|>{entry["label"]}<|/ref|><|det|>{coord_text}<|/det|>'
refs.append((raw, entry["label"], coord_text))
return refs
def _parse_coord_payload(payload):
if isinstance(payload, str):
try:
coords = ast.literal_eval(payload.strip())
except (SyntaxError, ValueError):
return []
else:
coords = payload
if isinstance(coords, (tuple, list)) and coords and isinstance(coords[0], (int, float)):
coords = [coords]
if not isinstance(coords, list):
return []
out = []
for c in coords:
if not isinstance(c, (list, tuple)) or len(c) < 4:
continue
x1, y1, x2, y2 = [float(v) for v in c[:4]]
x1, x2 = sorted((max(0.0, min(999.0, x1)), max(0.0, min(999.0, x2))))
y1, y2 = sorted((max(0.0, min(999.0, y1)), max(0.0, min(999.0, y2))))
if x2 <= x1 or y2 <= y1:
continue
out.append([x1, y1, x2, y2])
return out
def _extract_grounding_entries(raw_text: str):
if not raw_text:
return []
entries = []
last_end = 0
for m in GROUNDING_PATTERN.finditer(raw_text):
label = m.group(1).strip() or "text"
coords = _parse_coord_payload(m.group(2))
if not coords:
continue
text_chunk = raw_text[last_end:m.start()].strip()
entries.append({
"label": label,
"coords": coords,
"text": text_chunk,
})
last_end = m.end()
return entries
def _math_marker_score(text_chunk: str) -> int:
score = 0
for marker in MATH_STRONG_MARKERS:
if marker in text_chunk:
score += 3
for marker in MATH_WEAK_MARKERS:
if marker in text_chunk:
score += 1
return score
def _box_iou(a, b):
ax1, ay1, ax2, ay2 = a
bx1, by1, bx2, by2 = b
inter_x1 = max(ax1, bx1)
inter_y1 = max(ay1, by1)
inter_x2 = min(ax2, bx2)
inter_y2 = min(ay2, by2)
if inter_x2 <= inter_x1 or inter_y2 <= inter_y1:
return 0.0
inter = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
area_a = max(1e-9, (ax2 - ax1) * (ay2 - ay1))
area_b = max(1e-9, (bx2 - bx1) * (by2 - by1))
union = area_a + area_b - inter
return inter / union if union > 0 else 0.0
def _dedupe_boxes(boxes, iou_threshold):
kept = []
for box in sorted(boxes, key=lambda b: ((b[2] - b[0]) * (b[3] - b[1]))):
if any(_box_iou(box, other) >= iou_threshold for other in kept):
continue
kept.append(box)
return kept
def _is_math_candidate(label: str, text_chunk: str, box):
label_l = label.lower()
box_w = (box[2] - box[0]) / 999.0
box_h = (box[3] - box[1]) / 999.0
area = box_w * box_h
aspect = max(box_w / max(1e-9, box_h), box_h / max(1e-9, box_w))
has_math_label = any(hint in label_l for hint in MATH_LABEL_HINTS)
has_math_text = _math_marker_score(text_chunk) >= 3
is_large = area >= EQUATION_ZOOM_MIN_AREA or box_w >= EQUATION_ZOOM_MIN_DIM or box_h >= EQUATION_ZOOM_MIN_DIM
return (has_math_label or has_math_text) and is_large and aspect <= EQUATION_ZOOM_MAX_ASPECT
def _map_crop_box_to_page(sub_box, crop_px, img_w, img_h):
crop_x1, crop_y1, crop_x2, crop_y2 = crop_px
crop_w = max(1, crop_x2 - crop_x1)
crop_h = max(1, crop_y2 - crop_y1)
page_x1 = ((crop_x1 + (sub_box[0] / 999.0) * crop_w) / img_w) * 999.0
page_y1 = ((crop_y1 + (sub_box[1] / 999.0) * crop_h) / img_h) * 999.0
page_x2 = ((crop_x1 + (sub_box[2] / 999.0) * crop_w) / img_w) * 999.0
page_y2 = ((crop_y1 + (sub_box[3] / 999.0) * crop_h) / img_h) * 999.0
return _parse_coord_payload([[page_x1, page_y1, page_x2, page_y2]])[0]
def draw_bounding_boxes(image, refs, extract_images=False):
img_w, img_h = image.size
img_draw = image.copy()
draw = ImageDraw.Draw(img_draw)
overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
draw2 = ImageDraw.Draw(overlay)
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 15)
crops = []
color_map = {}
np.random.seed(42)
for ref in refs:
label = ref[1]
if label not in color_map:
color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))
color = color_map[label]
coords = _parse_coord_payload(ref[2])
color_a = color + (60,)
for box in coords:
x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)
if extract_images and label == 'image':
crops.append(image.crop((x1, y1, x2, y2)))
width = 5 if label == 'title' else 3
draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
draw2.rectangle([x1, y1, x2, y2], fill=color_a)
text_bbox = draw.textbbox((0, 0), label, font=font)
tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
ty = max(0, y1 - 20)
draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
img_draw.paste(overlay, (0, 0), overlay)
return img_draw, crops
def _extract_labeled_crops_from_refs(image, refs, max_items=24):
img_w, img_h = image.size
items = []
seen = set()
for ref in refs:
label = str(ref[1])
coords = _parse_coord_payload(ref[2])
for box in coords:
x1 = int(box[0] / 999.0 * img_w)
y1 = int(box[1] / 999.0 * img_h)
x2 = int(box[2] / 999.0 * img_w)
y2 = int(box[3] / 999.0 * img_h)
if x2 - x1 < 8 or y2 - y1 < 8:
continue
key = (label.lower(), x1, y1, x2, y2)
if key in seen:
continue
seen.add(key)
crop = image.crop((x1, y1, x2, y2))
caption = f"{label} ({crop.width}x{crop.height})"
items.append((crop, caption))
if len(items) >= max_items:
return items
return items
def clean_output(text, include_images=False):
if not text:
return ""
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
matches = re.findall(pattern, text, re.DOTALL)
img_num = 0
for match in matches:
if '<|ref|>image<|/ref|>' in match[0]:
if include_images:
text = text.replace(match[0], f'\n\n**[Figure {img_num + 1}]**\n\n', 1)
img_num += 1
else:
text = text.replace(match[0], '', 1)
else:
text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
text = _strip_malformed_grounding(text)
text = _dedupe_repeated_math_blocks(text)
return text.strip()
def _strip_malformed_grounding(text: str) -> str:
"""Remove incomplete grounding tags that can leak into OCR markdown/text."""
if not text:
return ""
line_patterns = [
r'(?m)^[^\n]*<\|ref\|>.*?<\|/ref\|><\|det\|>.*?(?:<\|/det\|>)?[^\n]*\n?',
r'(?m)^[^\n]*<\|det\|>.*?(?:<\|/det\|>)?[^\n]*\n?',
r'(?m)^[^\n]*<\|/?ref\|>[^\n]*\n?',
]
for p in line_patterns:
text = re.sub(p, '', text)
text = re.sub(r'<\|/?ref\|>', '', text)
text = re.sub(r'<\|/?det\|>', '', text)
return text
def _equation_text_key(text: str) -> str:
if not text:
return ""
key = text.strip()
key = re.sub(r'\\\[(.+?)\\\]', r'\1', key, flags=re.DOTALL)
key = re.sub(r'\\\((.+?)\\\)', r'\1', key, flags=re.DOTALL)
key = re.sub(r'\$\$(.+?)\$\$', r'\1', key, flags=re.DOTALL)
key = re.sub(r'\^\{([A-Za-z0-9])\}', r'^\1', key)
key = re.sub(r'_\{([A-Za-z0-9])\}', r'_\1', key)
key = re.sub(r'\s+', '', key)
return key.lower()
def _dedupe_repeated_math_blocks(text: str) -> str:
if not text:
return ""
pattern = re.compile(r'\\\[(.+?)\\\]|\\\((.+?)\\\)|\$\$(.+?)\$\$', re.DOTALL)
seen = set()
out = []
last = 0
removed_any = False
for m in pattern.finditer(text):
out.append(text[last:m.start()])
expr = m.group(1) or m.group(2) or m.group(3) or ""
key = _equation_text_key(expr)
if key and key in seen:
removed_any = True
else:
if key:
seen.add(key)
out.append(m.group(0))
last = m.end()
out.append(text[last:])
merged = ''.join(out)
if removed_any:
merged = re.sub(r'\n{3,}', '\n\n', merged)
return merged
PREVIEW_CSS = """
<style>
.math-preview {
padding: 1.5em;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
font-size: 15px;
line-height: 1.8;
color: #1a1a1a;
max-width: 100%;
overflow-x: auto;
}
.math-display {
text-align: center;
overflow-x: auto;
margin: 1em 0;
padding: 0.5em 0;
}
math[display="block"] { display: block; overflow-x: auto; max-width: 100%; }
.math-preview h1 { font-size: 1.8em; font-weight: 700; margin: 1em 0 0.4em; border-bottom: 2px solid #e0e0e0; padding-bottom: 0.3em; }
.math-preview h2 { font-size: 1.4em; font-weight: 600; margin: 1em 0 0.4em; border-bottom: 1px solid #e0e0e0; padding-bottom: 0.2em; }
.math-preview h3 { font-size: 1.15em; font-weight: 600; margin: 0.9em 0 0.3em; }
.math-preview p { margin: 0.6em 0; }
.math-preview ul, .math-preview ol { padding-left: 1.8em; margin: 0.5em 0; }
.math-preview li { margin: 0.25em 0; }
.math-preview table { border-collapse: collapse; width: 100%; margin: 1em 0; font-size: 0.95em; }
.math-preview th, .math-preview td { border: 1px solid #ccc; padding: 0.45em 0.75em; text-align: left; }
.math-preview th { background: #f2f2f2; font-weight: 600; }
.math-preview tr:nth-child(even) { background: #fafafa; }
.math-preview code { background: #f4f4f4; padding: 0.15em 0.4em; border-radius: 3px; font-family: 'Courier New', monospace; font-size: 0.88em; }
.math-preview pre { background: #f4f4f4; padding: 1em; border-radius: 5px; overflow-x: auto; margin: 0.8em 0; }
.math-preview pre code { background: none; padding: 0; }
.math-preview blockquote { border-left: 4px solid #ccc; margin: 0.8em 0; padding: 0.4em 1em; color: #555; background: #fafafa; }
.math-preview img { max-width: 100%; height: auto; display: block; margin: 0.8em 0; }
.math-preview .ocr-gap, .mathjax-preview .ocr-gap { width: 100%; }
.math-fallback { color: #888; font-style: italic; }
</style>
<script>
(() => {
if (window.__ocrMathJaxInit) return;
window.__ocrMathJaxInit = true;
if (!window.MathJax) {
window.MathJax = {
tex: {
inlineMath: [['\\\\(', '\\\\)'], ['$', '$']],
displayMath: [['\\\\[', '\\\\]'], ['$$', '$$']]
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
}
};
}
const typeset = () => {
if (window.MathJax?.typesetPromise) {
const nodes = Array.from(document.querySelectorAll('.mathjax-preview, .spatial-preview'));
if (nodes.length) window.MathJax.typesetPromise(nodes).catch(() => {});
}
};
window.__typesetOcrMath = typeset;
const ensureScript = () => {
if (document.getElementById('mathjax-ocr-preview')) return;
const script = document.createElement('script');
script.id = 'mathjax-ocr-preview';
script.async = true;
script.src = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js';
script.onload = () => setTimeout(typeset, 20);
document.head.appendChild(script);
};
ensureScript();
setTimeout(typeset, 100);
const observer = new MutationObserver((mutations) => {
for (const m of mutations) {
for (const n of m.addedNodes) {
if (n.nodeType !== 1) continue;
if (n.matches?.('.mathjax-preview, .spatial-preview') || n.querySelector?.('.mathjax-preview, .spatial-preview')) {
setTimeout(typeset, 30);
return;
}
}
}
});
observer.observe(document.body, { childList: true, subtree: true });
})();
(() => {
if (window.__ocrWorkspaceZoomInit) return;
window.__ocrWorkspaceZoomInit = true;
const stateByRoot = new WeakMap();
const targetZoomPct = 88;
const nearTargetTolerancePct = 3;
const tinyFitThresholdPct = 45;
const getState = (root) => {
let state = stateByRoot.get(root);
if (!state) {
state = { busy: false, applied: false, lastSeenZoom: null, lastAutoAt: 0 };
stateByRoot.set(root, state);
}
return state;
};
const parseZoomPct = (root) => {
const zoomNode = root.querySelector(".zoom-number span[role='button']");
if (!zoomNode) return null;
const m = (zoomNode.textContent || "").match(/([0-9]+(?:\\.[0-9]+)?)\\s*%/);
return m ? parseFloat(m[1]) : null;
};
const getZoomInBtn = (root) =>
root.querySelector("button[aria-label='Zoom in'], button[title='Zoom in']");
const isWorkspaceRoot = (root) =>
!!root.querySelector(".pixi-target") && !!root.querySelector(".zoom-number");
const maybeAutoZoom = (root) => {
if (!isWorkspaceRoot(root)) return;
const state = getState(root);
const now = Date.now();
const zoomPct = parseZoomPct(root);
if (zoomPct == null) return;
// A drop from high zoom to low zoom usually means a new image was loaded.
if (state.lastSeenZoom != null && state.lastSeenZoom > 70 && zoomPct < 35) {
state.applied = false;
}
state.lastSeenZoom = zoomPct;
if (state.busy || state.applied) return;
if (zoomPct > tinyFitThresholdPct) return;
if (now - state.lastAutoAt < 1200) return;
const zoomInBtn = getZoomInBtn(root);
if (!zoomInBtn) return;
state.busy = true;
state.lastAutoAt = now;
let steps = 0;
const step = () => {
const current = parseZoomPct(root);
if (
current == null ||
current >= (targetZoomPct - nearTargetTolerancePct) ||
steps >= 20
) {
state.busy = false;
state.applied = true;
return;
}
zoomInBtn.click();
steps += 1;
setTimeout(step, 80);
};
setTimeout(step, 90);
};
const attachRootObserver = (root) => {
if (root.dataset.ocrZoomObserved === "1") return;
root.dataset.ocrZoomObserved = "1";
const obs = new MutationObserver(() => maybeAutoZoom(root));
obs.observe(root, { childList: true, subtree: true, characterData: true });
setTimeout(() => maybeAutoZoom(root), 200);
setTimeout(() => maybeAutoZoom(root), 800);
};
const scan = () => {
document.querySelectorAll("[data-testid='image']").forEach((root) => {
if (isWorkspaceRoot(root)) attachRootObserver(root);
});
};
scan();
const pageObs = new MutationObserver(scan);
pageObs.observe(document.body, { childList: true, subtree: true });
})();
</script>
"""
def _inject_spatial_gap_placeholders(text: str):
"""Preserve runs of blank lines so OCR spacing is visible in preview."""
gaps: dict[str, int] = {}
counter = [0]
def repl(m):
key = f'ZZOCRGAP{counter[0]}ZZ'
counter[0] += 1
# Two newlines are a normal paragraph break; extras represent vertical spacing.
gaps[key] = max(1, len(m.group(0)) - 2)
return f'\n\n{key}\n\n'
return re.sub(r'\n{3,}', repl, text), gaps
def _restore_spatial_gap_placeholders(html: str, gaps: dict[str, int]) -> str:
if not gaps:
return html
for key, extra_lines in gaps.items():
gap_em = min(10.0, 0.9 * extra_lines)
block = f'<div class="ocr-gap" style="height:{gap_em:.2f}em"></div>'
html = html.replace(f'<p>{key}</p>', block)
html = html.replace(key, block)
return html
def _to_mathml(latex: str, display: bool) -> str:
"""Convert a LaTeX string to MathML. Falls back to a code block on error."""
# Fix OCR error: \frac{n/m} (single-argument fraction) → \frac{n}{m}
latex = re.sub(r'\\frac\{(\d+)/(\d+)\}(?!\s*\{)', r'\\frac{\1}{\2}', latex)
try:
mathml = latex2mathml.converter.convert(latex)
if display:
mathml = re.sub(r'<math\b', '<math display="block"', mathml, count=1)
return mathml
except Exception:
escaped = html_lib.escape(latex)
if display:
return f'<pre class="math-fallback"><code>{escaped}</code></pre>'
return f'<code class="math-fallback">{escaped}</code>'
def to_math_html(text: str) -> str:
"""Convert model markdown output to HTML with server-side MathML rendering.
Uses a placeholder approach: math is extracted and replaced with unique
tokens before the markdown pass, then swapped back afterwards. This avoids
Python-Markdown mishandling multi-line <div> blocks that contain blank lines.
"""
if not text:
return ""
blocks: dict[str, str] = {}
literals: dict[str, str] = {}
counter = [0]
def display_block(m):
key = f'ZZDISPLAYMATH{counter[0]}ZZ'
counter[0] += 1
expr = m.group(1).strip()
blocks[key] = f'<div class="math-display">{_to_mathml(expr, display=True)}</div>'
literals[key] = f'\\[{expr}\\]'
return f'\n\n{key}\n\n'
def inline_math(m):
key = f'ZZINLINEMATH{counter[0]}ZZ'
counter[0] += 1
expr = m.group(1).strip()
blocks[key] = _to_mathml(expr, display=False)
literals[key] = f'\\({expr}\\)'
return key
# Replace display math \[...\] with placeholder tokens
text = re.sub(r'\\\[(.+?)\\\]', display_block, text, flags=re.DOTALL)
# Remove orphaned \[ with no matching \] (truncated model output)
text = re.sub(r'\\\[.*', '', text, flags=re.DOTALL)
# Replace inline math \(...\) with placeholder tokens
text = re.sub(r'\\\((.+?)\\\)', inline_math, text)
text, gaps = _inject_spatial_gap_placeholders(text)
# Run markdown on text that now contains only safe placeholder tokens
html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
# Protect rendered code/pre blocks so placeholder swap never mutates literal code.
protected_blocks: dict[str, str] = {}
protected_counter = [0]
def _protect_code_html(m):
token = f'ZZCODEHTML{protected_counter[0]}ZZ'
protected_counter[0] += 1
protected_blocks[token] = m.group(0)
return token
html = re.sub(r'<pre\b[^>]*>.*?</pre>', _protect_code_html, html, flags=re.DOTALL)
html = re.sub(r'<code\b[^>]*>.*?</code>', _protect_code_html, html, flags=re.DOTALL)
# Swap placeholders back for MathML/HTML (handle <p>KEY</p> wrapping too)
for key, value in blocks.items():
html = html.replace(f'<p>{key}</p>', value)
html = html.replace(key, value)
# Restore protected literal code/pre blocks unchanged.
for token, original in protected_blocks.items():
html = html.replace(token, original)
# Placeholders left at this stage occur inside code/pre; keep them literal.
for key, literal in literals.items():
html = html.replace(key, html_lib.escape(literal))
html = _restore_spatial_gap_placeholders(html, gaps)
return f'<div class="math-preview">{html}</div>'
def to_mathjax_html(text: str) -> str:
"""Render markdown to HTML and typeset math client-side with MathJax."""
if not text:
return ""
text, gaps = _inject_spatial_gap_placeholders(text)
html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
html = _restore_spatial_gap_placeholders(html, gaps)
return f'<div class="mathjax-preview">{html}</div>'
def _grounding_blocks_from_raw(raw_text: str):
blocks = []
for entry in _extract_grounding_entries(raw_text):
label = entry["label"]
text = entry["text"].strip()
coords = entry["coords"]
for idx, c in enumerate(coords):
blocks.append({
"label": label,
"text": text if idx == 0 else "",
"x1": c[0],
"y1": c[1],
"x2": c[2],
"y2": c[3],
})
return blocks
def to_spatial_html(raw_text: str, markdown_text: str) -> str:
"""Render OCR content using grounding boxes for spatially-positioned blocks."""
blocks = _grounding_blocks_from_raw(raw_text)
if not blocks:
return to_mathjax_html(markdown_text)
used_text = 0
rendered = []
palette = {
"title": "#8b5cf6",
"text": "#2563eb",
"image": "#059669",
"table": "#d97706",
"formula": "#dc2626",
}
for i, b in enumerate(sorted(blocks, key=lambda x: (x["y1"], x["x1"]))):
label = b["label"]
color = palette.get(label.lower(), "#4b5563")
body = b["text"].strip()
if body:
used_text += len(body)
body_text, gaps = _inject_spatial_gap_placeholders(body)
body_html = md_lib.markdown(body_text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
body_html = _restore_spatial_gap_placeholders(body_html, gaps)
else:
body_html = ""
if not body_html:
body_html = f"<p><em>{html_lib.escape(label)}</em></p>"
left = b["x1"] / 999.0 * 100.0
top = b["y1"] / 999.0 * 100.0
width = max(1.0, (b["x2"] - b["x1"]) / 999.0 * 100.0)
height = max(1.2, (b["y2"] - b["y1"]) / 999.0 * 100.0)
rendered.append(
f"""
<article class="spatial-block" style="left:{left:.2f}%; top:{top:.2f}%; width:{width:.2f}%; min-height:{height:.2f}%; --block-color:{color};">
<header>{html_lib.escape(label)}</header>
<section>{body_html}</section>
</article>
"""
)
fallback = ""
if markdown_text and used_text < max(120, int(len(markdown_text) * 0.4)):
fallback_html = to_mathjax_html(markdown_text)
fallback = f"""
<details class="spatial-fallback">
<summary>Show full linear markdown rendering</summary>
{fallback_html}
</details>
"""
return f"""
<style>
.spatial-preview {{
padding: 1rem;
}}
.spatial-canvas {{
position: relative;
width: 100%;
min-height: 72vh;
aspect-ratio: 1 / 1.35;
background: linear-gradient(180deg, #fcfdff 0%, #f7f9fc 100%);
border: 1px solid #d8dee9;
border-radius: 8px;
overflow: auto;
}}
.spatial-block {{
position: absolute;
box-sizing: border-box;
border: 1px solid var(--block-color);
background: color-mix(in srgb, var(--block-color) 7%, white);
border-radius: 6px;
padding: 0.35rem 0.5rem;
overflow: hidden;
}}
.spatial-block > header {{
font-size: 11px;
font-weight: 700;
letter-spacing: 0.03em;
text-transform: uppercase;
color: var(--block-color);
margin-bottom: 0.25rem;
}}
.spatial-block > section {{
font-size: 13px;
line-height: 1.35;
}}
.spatial-block p {{ margin: 0.2rem 0; }}
.spatial-fallback {{
margin-top: 1rem;
padding-top: 0.5rem;
border-top: 1px solid #d8dee9;
}}
</style>
<div class="spatial-preview mathjax-preview">
<div class="spatial-canvas">
{''.join(rendered)}
</div>
{fallback}
</div>
"""
def embed_images(markdown, crops):
if not crops:
return markdown
for i, img in enumerate(crops):
buf = BytesIO()
img.save(buf, format="PNG")
b64 = base64.b64encode(buf.getvalue()).decode()
markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
return markdown
def _infer_with_prompt(image, prompt, crop_mode=None):
if crop_mode is None:
crop_mode = CROP_MODE
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
image.save(tmp.name, 'JPEG', quality=95)
tmp.close()
out_dir = tempfile.mkdtemp()
stdout = sys.stdout
capture = StringIO()
sys.stdout = capture
try:
model.infer(
tokenizer=tokenizer,
prompt=prompt,
image_file=tmp.name,
output_path=out_dir,
base_size=BASE_SIZE,
image_size=IMAGE_SIZE,
crop_mode=crop_mode,
save_results=False
)
finally:
sys.stdout = stdout
os.unlink(tmp.name)
shutil.rmtree(out_dir, ignore_errors=True)
lines = [
l for l in capture.getvalue().split('\n')
if l.strip() and not any(s in l for s in INFER_DEBUG_FILTERS)
]
return '\n'.join(lines).strip()
def _refine_equation_refs(image, raw_text):
entries = _extract_grounding_entries(raw_text)
if not entries:
return []
img_w, img_h = image.size
candidates = []
for entry in entries:
for box in entry["coords"]:
if _is_math_candidate(entry["label"], entry["text"], box):
area = (box[2] - box[0]) * (box[3] - box[1])
candidates.append((area, entry, box))
if not candidates:
return []
candidates.sort(key=lambda x: x[0], reverse=True)
refined_refs = []
for _, entry, box in candidates[:EQUATION_ZOOM_MAX_CANDIDATES]:
x1 = int(box[0] / 999.0 * img_w)
y1 = int(box[1] / 999.0 * img_h)
x2 = int(box[2] / 999.0 * img_w)
y2 = int(box[3] / 999.0 * img_h)
box_w = max(1, x2 - x1)
box_h = max(1, y2 - y1)
pad_x = max(8, int(box_w * EQUATION_ZOOM_PADDING))
pad_y = max(8, int(box_h * EQUATION_ZOOM_PADDING))
crop_x1 = max(0, x1 - pad_x)
crop_y1 = max(0, y1 - pad_y)
crop_x2 = min(img_w, x2 + pad_x)
crop_y2 = min(img_h, y2 + pad_y)
if crop_x2 - crop_x1 < 32 or crop_y2 - crop_y1 < 32:
continue
crop = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
sub_result = _infer_with_prompt(crop, EQUATION_ZOOM_PROMPT)
sub_entries = _extract_grounding_entries(sub_result)
if not sub_entries:
continue
mapped_boxes = []
for sub in sub_entries:
sub_label = sub["label"].lower()
sub_text = sub["text"]
is_math_sub = any(hint in sub_label for hint in MATH_LABEL_HINTS) or _math_marker_score(sub_text) >= 3
if sub_label in ("image", "table") or not is_math_sub:
continue
for sub_box in sub["coords"]:
mapped = _map_crop_box_to_page(sub_box, (crop_x1, crop_y1, crop_x2, crop_y2), img_w, img_h)
w = (mapped[2] - mapped[0]) / 999.0
h = (mapped[3] - mapped[1]) / 999.0
if w * h < 0.0004:
continue
mapped_boxes.append(mapped)
if not mapped_boxes:
continue
mapped_boxes = _dedupe_boxes(mapped_boxes, EQUATION_DETAIL_IOU_DEDUPE)
mapped_boxes = sorted(mapped_boxes, key=lambda b: (b[1], b[0]))[:EQUATION_DETAIL_MAX_BOXES]
if len(mapped_boxes) < 2:
continue
merged_text = repr(mapped_boxes)
label = "equation_detail"
raw = f'<|ref|>{label}<|/ref|><|det|>{merged_text}<|/det|>'
refined_refs.append((raw, label, merged_text))
return refined_refs
def _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.0):
x1 = int(box[0] / 999.0 * img_w)
y1 = int(box[1] / 999.0 * img_h)
x2 = int(box[2] / 999.0 * img_w)
y2 = int(box[3] / 999.0 * img_h)
if pad_ratio > 0:
pad_x = max(1, int((x2 - x1) * pad_ratio))
pad_y = max(1, int((y2 - y1) * pad_ratio))
x1 -= pad_x
y1 -= pad_y
x2 += pad_x
y2 += pad_y
x1 = max(0, min(img_w - 1, x1))
y1 = max(0, min(img_h - 1, y1))
x2 = max(x1 + 1, min(img_w, x2))
y2 = max(y1 + 1, min(img_h, y2))
return (x1, y1, x2, y2)
def _detect_equation_line_boxes(image, infer_crop_mode=None):
detect_raw = _infer_with_prompt(image, EQUATION_ZOOM_PROMPT, crop_mode=infer_crop_mode)
entries = _extract_grounding_entries(detect_raw)
if not entries:
return [], detect_raw
boxes = []
for entry in entries:
label_l = entry["label"].lower()
text_chunk = entry["text"]
if label_l in ("image", "table"):
continue
for box in entry["coords"]:
w = (box[2] - box[0]) / 999.0
h = (box[3] - box[1]) / 999.0
area = w * h
aspect = max(w / max(1e-9, h), h / max(1e-9, w))
looks_math = any(hint in label_l for hint in MATH_LABEL_HINTS) or _math_marker_score(text_chunk) >= 2
if area < EQUATION_LINE_MIN_AREA or w < EQUATION_LINE_MIN_W or h < EQUATION_LINE_MIN_H:
continue
if aspect > EQUATION_LINE_MAX_ASPECT:
continue
if not looks_math and area < 0.004:
continue
boxes.append(box)
boxes = _dedupe_boxes(boxes, EQUATION_LINE_IOU_DEDUPE)
boxes = sorted(boxes, key=lambda b: (round(b[1], 3), b[0]))
return boxes, detect_raw
def _process_equation_lines_separately(image, infer_crop_mode=None):
boxes, detect_raw = _detect_equation_line_boxes(image, infer_crop_mode=infer_crop_mode)
if not boxes:
return None
img_w, img_h = image.size
cleaned_parts = []
markdown_parts = []
raw_parts = [f"## Detection\n\n{detect_raw}".strip()]
refs = []
crops = []
seen_line_keys = set()
for i, box in enumerate(boxes, 1):
x1, y1, x2, y2 = _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.01)
crop = image.crop((x1, y1, x2, y2))
line_raw = _infer_with_prompt(crop, EQUATION_LINE_OCR_PROMPT, crop_mode=False)
line_clean = clean_output(line_raw, False).strip()
if not line_clean:
continue
line_key = _equation_text_key(line_clean)
if line_key and line_key in seen_line_keys:
continue
if line_key:
seen_line_keys.add(line_key)
line_label = f"Eq {i}"
line_markdown = line_clean
if "$$" not in line_markdown and "\\[" not in line_markdown and "\\(" not in line_markdown:
line_markdown = f"$$\n{line_markdown}\n$$"
cleaned_parts.append(f"{line_label}: {line_clean}")
markdown_parts.append(f"### {line_label}\n\n{line_markdown}")
raw_parts.append(f"## {line_label}\n\n{line_raw}")
coord_text = repr([box])
raw_ref = f'<|ref|>eq_line_{i}<|/ref|><|det|>{coord_text}<|/det|>'
refs.append((raw_ref, line_label, coord_text))
crops.append((crop, line_label))
if not cleaned_parts:
return None
img_out, _ = draw_bounding_boxes(image, refs, extract_images=False)
cleaned = "\n".join(cleaned_parts).strip()
markdown = "\n\n".join(markdown_parts).strip()
raw = "\n\n".join(raw_parts).strip()
return cleaned, markdown, raw, img_out, crops
@spaces.GPU(duration=90)
def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
model.cuda() # GPU is available here — works on ZeroGPU and locally
if image is None:
return "Error: Upload an image", "", "", None, []
if not separate_equation_lines and task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
return "Please enter a prompt", "", "", None, []
if image.mode in ('RGBA', 'LA', 'P'):
image = image.convert('RGB')
image = ImageOps.exif_transpose(image)
if separate_equation_lines:
separate_result = _process_equation_lines_separately(image, infer_crop_mode=infer_crop_mode)
if separate_result is not None:
return separate_result
msg = "No separate equation lines detected. Try Selected Region + freehand highlight around the equation steps."
return msg, msg, msg, None, []
if task == "✏️ Custom":
prompt = f"<image>\n{custom_prompt.strip()}"
has_grounding = '<|grounding|>' in custom_prompt
elif task == "📍 Locate":
prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
has_grounding = True
else:
prompt = TASK_PROMPTS[task]["prompt"]
has_grounding = TASK_PROMPTS[task]["has_grounding"]
result = _infer_with_prompt(image, prompt, crop_mode=infer_crop_mode)
if not result:
return "No text detected", "", "", None, []
cleaned = clean_output(result, False)
markdown = clean_output(result, True)
img_out = None
crops = []
figure_crops = []
result_for_layout = result
if has_grounding and '<|ref|>' in result:
refs = extract_grounding_references(result)
if task == "📋 Markdown" and enable_equation_zoom:
refs.extend(_refine_equation_refs(image, result))
if refs:
img_out, figure_crops = draw_bounding_boxes(image, refs, True)
crops = _extract_labeled_crops_from_refs(image, refs)
synthetic = [r[0] for r in refs if r[1] == "equation_detail"]
if synthetic:
result_for_layout = result + "\n" + "\n".join(synthetic)
markdown = embed_images(markdown, figure_crops)
if not crops and figure_crops:
crops = _label_gallery_items(figure_crops, prefix="Figure")
return cleaned, markdown, result_for_layout, img_out, crops
@spaces.GPU(duration=90)
def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
doc = fitz.open(path)
total_pages = len(doc)
if page_num < 1 or page_num > total_pages:
doc.close()
return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
page = doc.load_page(page_num - 1)
pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
img = Image.open(BytesIO(pix.tobytes("png")))
doc.close()
return process_image(
img,
task,
custom_prompt,
enable_equation_zoom=enable_equation_zoom,
infer_crop_mode=infer_crop_mode,
separate_equation_lines=separate_equation_lines,
)
def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
if not path:
return "Error: Upload a file", "", "", None, []
if path.lower().endswith('.pdf'):
return process_pdf(
path,
task,
custom_prompt,
page_num,
enable_equation_zoom=enable_equation_zoom,
infer_crop_mode=infer_crop_mode,
separate_equation_lines=separate_equation_lines,
)
else:
return process_image(
Image.open(path),
task,
custom_prompt,
enable_equation_zoom=enable_equation_zoom,
infer_crop_mode=infer_crop_mode,
separate_equation_lines=separate_equation_lines,
)
def _extract_editor_background(editor_value):
if editor_value is None:
return None
if isinstance(editor_value, Image.Image):
return editor_value
if isinstance(editor_value, dict):
background = editor_value.get("background")
if isinstance(background, Image.Image):
return background
composite = editor_value.get("composite")
if isinstance(composite, Image.Image):
return composite
return None
def _to_rgba_image(obj):
if isinstance(obj, dict):
for k in ("image", "layer", "composite", "background", "mask"):
if k in obj:
return _to_rgba_image(obj[k])
return None
if isinstance(obj, Image.Image):
return obj.convert("RGBA")
if isinstance(obj, np.ndarray):
arr = obj
if arr.ndim == 2:
arr = np.stack([arr, arr, arr, np.full_like(arr, 255)], axis=-1)
elif arr.ndim == 3 and arr.shape[2] == 3:
alpha = np.full((arr.shape[0], arr.shape[1], 1), 255, dtype=arr.dtype)
arr = np.concatenate([arr, alpha], axis=2)
elif arr.ndim != 3 or arr.shape[2] != 4:
return None
return Image.fromarray(arr.astype(np.uint8), mode="RGBA")
return None
def _to_mask_array(obj):
if obj is None:
return None
if isinstance(obj, dict):
for k in ("mask", "image", "layer", "composite", "background"):
if k in obj:
arr = _to_mask_array(obj[k])
if arr is not None:
return arr
return None
if isinstance(obj, Image.Image):
arr = np.asarray(obj)
elif isinstance(obj, np.ndarray):
arr = obj
else:
return None
if arr.ndim == 2:
return arr > 0
if arr.ndim == 3:
if arr.shape[2] >= 4:
return arr[:, :, 3] > 0
return np.max(arr[:, :, :3], axis=2) > 0
return None
def _locate_patch_bbox(base_image: Image.Image, patch_image: Image.Image):
"""Approximate patch location in base image using downscaled SSD search."""
if base_image is None or patch_image is None:
return None
base = np.asarray(base_image.convert("L"), dtype=np.float32)
patch = np.asarray(patch_image.convert("L"), dtype=np.float32)
bh, bw = base.shape[:2]
ph, pw = patch.shape[:2]
if ph <= 0 or pw <= 0 or ph > bh or pw > bw:
return None
max_dim = max(bh, bw)
scale = min(1.0, 320.0 / max_dim) if max_dim > 0 else 1.0
if scale < 1.0:
new_bw = max(1, int(round(bw * scale)))
new_bh = max(1, int(round(bh * scale)))
new_pw = max(1, int(round(pw * scale)))
new_ph = max(1, int(round(ph * scale)))
base_small = np.asarray(Image.fromarray(base.astype(np.uint8)).resize((new_bw, new_bh), Image.Resampling.BILINEAR), dtype=np.float32)
patch_small = np.asarray(Image.fromarray(patch.astype(np.uint8)).resize((new_pw, new_ph), Image.Resampling.BILINEAR), dtype=np.float32)
else:
base_small = base
patch_small = patch
sbh, sbw = base_small.shape
sph, spw = patch_small.shape
if sph > sbh or spw > sbw:
return None
best_score = float("inf")
best_x = 0
best_y = 0
for y in range(sbh - sph + 1):
row = base_small[y:y + sph, :]
windows = np.lib.stride_tricks.sliding_window_view(row, spw, axis=1)
# windows: (sph, sbw-spw+1, spw)
diff = windows - patch_small[:, None, :]
scores = np.mean(diff * diff, axis=(0, 2))
x = int(np.argmin(scores))
score = float(scores[x])
if score < best_score:
best_score = score
best_x = x
best_y = y
if scale < 1.0:
x1 = int(round(best_x / scale))
y1 = int(round(best_y / scale))
x2 = int(round((best_x + spw) / scale))
y2 = int(round((best_y + sph) / scale))
else:
x1, y1, x2, y2 = best_x, best_y, best_x + spw, best_y + sph
x1 = max(0, min(bw - 1, x1))
y1 = max(0, min(bh - 1, y1))
x2 = max(x1 + 1, min(bw, x2))
y2 = max(y1 + 1, min(bh, y2))
return (x1, y1, x2, y2)
def _component_boxes(binary_mask, min_pixels=24):
h, w = binary_mask.shape
visited = np.zeros((h, w), dtype=bool)
boxes = []
neighbors = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]
ys, xs = np.where(binary_mask)
for sy, sx in zip(ys.tolist(), xs.tolist()):
if visited[sy, sx]:
continue
q = deque([(sy, sx)])
visited[sy, sx] = True
min_x = max_x = sx
min_y = max_y = sy
count = 0
while q:
y, x = q.popleft()
count += 1
if x < min_x:
min_x = x
if x > max_x:
max_x = x
if y < min_y:
min_y = y
if y > max_y:
max_y = y
for dy, dx in neighbors:
ny, nx = y + dy, x + dx
if ny < 0 or ny >= h or nx < 0 or nx >= w:
continue
if visited[ny, nx] or not binary_mask[ny, nx]:
continue
visited[ny, nx] = True
q.append((ny, nx))
if count >= min_pixels:
boxes.append((min_x, min_y, max_x + 1, max_y + 1, count))
return boxes
def _extract_regions_from_mask(background, mask):
components = _component_boxes(mask, min_pixels=24)
if not components:
return []
regions = []
for x1, y1, x2, y2, _ in components:
pad_x = max(2, int((x2 - x1) * 0.02))
pad_y = max(2, int((y2 - y1) * 0.02))
px1 = max(0, x1 - pad_x)
py1 = max(0, y1 - pad_y)
px2 = min(background.width, x2 + pad_x)
py2 = min(background.height, y2 + pad_y)
if px2 <= px1 or py2 <= py1:
continue
crop = background.crop((px1, py1, px2, py2)).convert("RGB")
regions.append((crop, (px1, py1, px2, py2)))
regions.sort(
key=lambda item: (item[1][2] - item[1][0]) * (item[1][3] - item[1][1]),
reverse=True,
)
return regions
def _editor_background_and_mask(editor_value):
if not isinstance(editor_value, dict):
return None, None
background = _to_rgba_image(editor_value.get("background"))
if background is None:
background = _to_rgba_image(editor_value.get("image"))
composite = _to_rgba_image(editor_value.get("composite"))
layers = editor_value.get("layers") or []
if background is None:
if composite is None:
return None, None
background = composite
mask = _to_mask_array(editor_value.get("mask"))
if mask is not None:
if mask.shape[:2] != (background.height, background.width):
mask_img = Image.fromarray(mask.astype(np.uint8) * 255, mode="L")
nearest = Image.Resampling.NEAREST if hasattr(Image, "Resampling") else Image.NEAREST
mask = np.asarray(mask_img.resize((background.width, background.height), nearest)) > 0
return background, mask
if not isinstance(layers, list) or not layers:
return background, None
alpha_acc = np.zeros((background.height, background.width), dtype=np.uint8)
for layer in layers:
layer_img = _to_rgba_image(layer)
if layer_img is None:
continue
if layer_img.size != background.size:
nearest = Image.Resampling.NEAREST if hasattr(Image, "Resampling") else Image.NEAREST
layer_img = layer_img.resize(background.size, nearest)
layer_alpha = np.asarray(layer_img, dtype=np.uint8)[:, :, 3]
alpha_acc = np.maximum(alpha_acc, layer_alpha)
return background, (alpha_acc > 0)
def _extract_selected_regions(editor_value, base_size=None, base_image=None):
if editor_value is None:
return []
if isinstance(editor_value, Image.Image):
if base_size and tuple(editor_value.size) == tuple(base_size):
return []
bbox = _locate_patch_bbox(base_image, editor_value) if base_image is not None else None
return [(editor_value, bbox)]
if not isinstance(editor_value, dict):
return []
background, mask = _editor_background_and_mask(editor_value)
layers = editor_value.get("layers") or []
if background is None:
return []
if not isinstance(layers, list) or not layers:
# No annotation layers; treat as explicit crop only if size changed from base.
if base_size and tuple(background.size) == tuple(base_size):
return []
patch = background.convert("RGB")
bbox = _locate_patch_bbox(base_image, patch) if base_image is not None else None
return [(patch, bbox)]
if mask is None:
return []
return _extract_regions_from_mask(background, mask)
def _extract_new_drawn_regions(editor_value, base_size=None, base_image=None, consumed_mask=None):
# For crop mode / explicit cropped image, fall back to classic extraction.
if isinstance(editor_value, Image.Image):
regions = _extract_selected_regions(editor_value, base_size=base_size, base_image=base_image)
return regions, consumed_mask
if not isinstance(editor_value, dict):
return [], consumed_mask
background, mask = _editor_background_and_mask(editor_value)
layers = editor_value.get("layers") or []
if background is None:
return [], consumed_mask
has_layer_data = isinstance(layers, list) and len(layers) > 0
has_draw_data = (mask is not None) or has_layer_data
# If there are no draw layers/mask, treat as explicit crop mode.
if not has_draw_data:
regions = _extract_selected_regions(editor_value, base_size=base_size, base_image=base_image)
return regions, consumed_mask
if mask is None:
return [], consumed_mask
if consumed_mask is None or not isinstance(consumed_mask, np.ndarray) or consumed_mask.shape != mask.shape:
delta_mask = mask
else:
delta_mask = np.logical_and(mask, np.logical_not(consumed_mask))
regions = _extract_regions_from_mask(background, delta_mask)
return regions, mask
def _extract_selected_region(editor_value, base_size=None, base_image=None):
regions = _extract_selected_regions(editor_value, base_size=base_size, base_image=base_image)
if not regions:
return None, None
return regions[0]
def _bbox_overlap_ratio(a, b):
ax1, ay1, ax2, ay2 = a
bx1, by1, bx2, by2 = b
ix1 = max(ax1, bx1)
iy1 = max(ay1, by1)
ix2 = min(ax2, bx2)
iy2 = min(ay2, by2)
if ix2 <= ix1 or iy2 <= iy1:
return 0.0, 0.0
inter = float((ix2 - ix1) * (iy2 - iy1))
area_a = float(max(1, (ax2 - ax1) * (ay2 - ay1)))
area_b = float(max(1, (bx2 - bx1) * (by2 - by1)))
return inter / area_a, inter / area_b
def _is_duplicate_bbox(candidate_bbox, existing_bbox):
iou = _box_iou(candidate_bbox, existing_bbox)
cover_cand, cover_exist = _bbox_overlap_ratio(candidate_bbox, existing_bbox)
return iou >= 0.85 or cover_cand >= 0.92 or cover_exist >= 0.97
def _draw_selected_region_boxes(image, boxes):
if image is None or not boxes:
return None
refs = []
w, h = image.size
for i, b in enumerate(boxes, 1):
x1, y1, x2, y2 = b
nx1 = max(0.0, min(999.0, x1 / max(1, w) * 999.0))
ny1 = max(0.0, min(999.0, y1 / max(1, h) * 999.0))
nx2 = max(0.0, min(999.0, x2 / max(1, w) * 999.0))
ny2 = max(0.0, min(999.0, y2 / max(1, h) * 999.0))
label = f"Region {i}"
coord_text = repr([[nx1, ny1, nx2, ny2]])
raw = f'<|ref|>region_{i}<|/ref|><|det|>{coord_text}<|/det|>'
refs.append((raw, label, coord_text))
img_out, _ = draw_bounding_boxes(image, refs, extract_images=False)
return img_out
def _region_gallery_items(regions):
items = []
for i, r in enumerate(regions, 1):
img = r["image"]
label = f"Region {i}"
if isinstance(img, Image.Image):
label = f"{label} ({img.width}x{img.height})"
items.append((img, label))
return items
def _label_gallery_items(items, prefix=None):
labeled = []
for i, item in enumerate(items, 1):
if isinstance(item, tuple) and len(item) >= 2:
img, label = item[0], str(item[1])
else:
img, label = item, f"Item {i}"
if prefix:
label = f"{prefix} - {label}"
if isinstance(img, Image.Image):
label = f"{label} ({img.width}x{img.height})"
labeled.append((img, label))
return labeled
def _reset_selected_regions():
return [], [], "No saved regions."
def _reset_drawn_mask():
return None
def add_selected_region(editor_value, base_size, base_image, selected_regions, consumed_mask):
candidates, updated_mask = _extract_new_drawn_regions(
editor_value,
base_size=base_size,
base_image=base_image,
consumed_mask=consumed_mask,
)
regions = list(selected_regions or [])
if not candidates:
msg = "No region detected. Use Crop or draw/highlight a region first."
return regions, _region_gallery_items(regions), msg, updated_mask
existing_boxes = [r.get("bbox") for r in regions if r.get("bbox") is not None]
added = 0
for region_img, bbox in candidates:
if bbox is not None and any(_is_duplicate_bbox(bbox, eb) for eb in existing_boxes):
continue
regions.append({"image": region_img, "bbox": bbox})
if bbox is not None:
existing_boxes.append(bbox)
added += 1
if added == 0:
msg = "No new region added. Draw one region, click Add Region, then draw the next region."
return regions, _region_gallery_items(regions), msg, updated_mask
msg = f"Added {added} region(s). {len(regions)} total. Zoom/pan is preserved."
return regions, _region_gallery_items(regions), msg, updated_mask
def clear_selected_regions():
return _reset_selected_regions()
def clear_regions_preserve_view(editor_value):
regions, gallery_items, msg = _reset_selected_regions()
_, mask = _editor_background_and_mask(editor_value)
return regions, gallery_items, msg, mask
def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
text_display = re.sub(
r'\\\[(.+?)\\\]',
lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
cleaned,
flags=re.DOTALL
)
text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)
dl_tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8')
dl_tmp.write(cleaned)
dl_tmp.close()
markdown_html = to_math_html(markdown)
return (
text_display,
cleaned,
markdown_html,
raw,
img_out,
gallery_items,
gr.DownloadButton(value=dl_tmp.name, visible=True),
)
def toggle_prompt(task):
if task == "✏️ Custom":
return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
elif task == "📍 Locate":
return gr.update(visible=True, label="Text to Locate", placeholder="Enter text to locate")
return gr.update(visible=False)
def select_boxes(task):
if task == "📍 Locate":
return gr.update(selected="tab_boxes")
return gr.update()
def toggle_scope_ui(scope):
if scope == "Selected Region":
hint = (
"**Selected Region mode:** Draw/highlight on the workspace, click **Add Region** "
"for each target area, then click **Extract**."
)
return (
gr.update(value=hint),
gr.update(visible=True),
gr.update(visible=True),
gr.update(visible=True),
)
hint = "**Entire Page mode:** No drawing needed. Click **Extract** to process the full page."
return (
gr.update(value=hint),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
)
def select_post_extract_tab(task, scope):
if scope == "Selected Region" or task == "📍 Locate":
return gr.update(selected="tab_boxes")
return gr.update(selected="tab_text")
def get_pdf_page_count(file_path):
if not file_path or not file_path.lower().endswith('.pdf'):
return 1
doc = fitz.open(file_path)
count = len(doc)
doc.close()
return count
def load_image(file_path, page_num=1):
if not file_path:
return None
if file_path.lower().endswith('.pdf'):
doc = fitz.open(file_path)
page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
page = doc.load_page(page_idx)
pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
img = Image.open(BytesIO(pix.tobytes("png")))
doc.close()
return img
else:
return Image.open(file_path)
def _scale_workspace_image(img, workspace_scale):
if img is None:
return None
# Keep native pixels for workspace quality. Gradio's in-canvas zoom controls
# visual scale; pre-resampling here causes blurry math when users zoom in.
return img
def _prepare_workspace_image(img, workspace_scale=WORKSPACE_DEFAULT_SCALE):
if img is None:
return None, None, None
display_img = _scale_workspace_image(img, workspace_scale)
return display_img, (int(display_img.width), int(display_img.height)), display_img
def load_image_with_size(file_path, page_num=1, workspace_scale=WORKSPACE_DEFAULT_SCALE):
img = load_image(file_path, page_num)
return _prepare_workspace_image(img, workspace_scale)
def load_example_into_workspace(example_value):
if example_value is None:
return None, None, None
file_path = None
if isinstance(example_value, os.PathLike):
file_path = os.fspath(example_value)
elif isinstance(example_value, str):
file_path = example_value
elif isinstance(example_value, dict):
path_candidate = example_value.get("path") or example_value.get("name")
if isinstance(path_candidate, os.PathLike):
file_path = os.fspath(path_candidate)
elif isinstance(path_candidate, str):
file_path = path_candidate
elif isinstance(example_value, (list, tuple)) and example_value:
first = example_value[0]
if isinstance(first, os.PathLike):
file_path = os.fspath(first)
elif isinstance(first, str):
file_path = first
if file_path:
img = load_image(file_path, 1)
return _prepare_workspace_image(img, WORKSPACE_DEFAULT_SCALE)
if isinstance(example_value, Image.Image):
img = example_value
else:
maybe_rgba = _to_rgba_image(example_value)
if maybe_rgba is None:
return None, None, None
img = maybe_rgba.convert("RGB")
return _prepare_workspace_image(img, WORKSPACE_DEFAULT_SCALE)
def load_example_into_workspace_and_reset(example_value):
display_img, base_size, base_img = load_example_into_workspace(example_value)
return display_img, base_size, base_img, [], [], "No saved regions.", None
def sync_workspace_state(editor_value, current_base_image):
background = _extract_editor_background(editor_value)
if isinstance(background, Image.Image):
return (int(background.width), int(background.height)), background
if isinstance(current_base_image, Image.Image):
return (int(current_base_image.width), int(current_base_image.height)), current_base_image
return None, None
def update_page_selector(file_path):
if not file_path:
return gr.update(visible=False)
if file_path.lower().endswith('.pdf'):
page_count = get_pdf_page_count(file_path)
return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
label=f"Select Page (1-{page_count})")
return gr.update(visible=False)
blocks_kwargs = {"title": "DeepSeek-OCR-2"}
if hasattr(gr, "themes") and hasattr(gr.themes, "Soft"):
try:
blocks_kwargs["theme"] = gr.themes.Soft()
except Exception:
pass
with gr.Blocks(**blocks_kwargs) as demo:
gr.Markdown("""
# 🧮 DeepSeek-OCR-2 — Math Rendering Edition
**Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.**
**Model uses DeepEncoder v2 and achieves 91.09% on OmniDocBench (+3.73% over v1).**
Built on the original [DeepSeek-OCR-2 Demo](https://huggingface.co/spaces/merterbak/DeepSeek-OCR-2) by **Mert Erbak** — thank you for the excellent foundation.
This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation.
""")
region_editor = None
workspace_base_size = gr.State(None)
workspace_base_image = gr.State(None)
selected_regions_state = gr.State([])
drawn_mask_state = gr.State(None)
with gr.Row():
with gr.Column(scale=3):
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
with gr.Column(scale=1):
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
with gr.Row():
with gr.Column(scale=3):
workspace_hint = gr.Markdown("**Entire Page mode:** No drawing needed. Click **Extract** to process the full page.")
gr.Markdown("**Image Workspace (full page + region selection)**")
if HAS_REGION_WORKSPACE:
editor_kwargs = {}
if HAS_BRUSH:
try:
highlight = ("#2563eb", 0.35)
editor_kwargs["brush"] = gr.Brush(
colors=[highlight],
default_color=highlight,
color_mode="fixed",
default_size=22,
)
except TypeError:
try:
editor_kwargs["brush"] = gr.Brush(
colors=["rgba(37,99,235,0.35)"],
default_color="rgba(37,99,235,0.35)",
color_mode="fixed",
default_size=22,
)
except TypeError:
editor_kwargs["brush"] = gr.Brush()
if HAS_ERASER:
try:
editor_kwargs["eraser"] = gr.Eraser(default_size=26)
except TypeError:
editor_kwargs["eraser"] = gr.Eraser()
if HAS_IMAGE_EDITOR:
try:
region_editor = gr.ImageEditor(
label="Image Workspace",
show_label=False,
type="pil",
height=WORKSPACE_EDITOR_HEIGHT,
**editor_kwargs,
)
except TypeError:
try:
region_editor = gr.ImageEditor(
label="Image Workspace",
show_label=False,
height=WORKSPACE_EDITOR_HEIGHT,
**editor_kwargs,
)
except TypeError:
region_editor = gr.ImageEditor(
label="Image Workspace",
show_label=False,
height=WORKSPACE_EDITOR_HEIGHT,
)
else:
region_editor = gr.Paint(
label="Image Workspace",
show_label=False,
type="pil",
height=WORKSPACE_EDITOR_HEIGHT,
**editor_kwargs,
)
else:
gr.Markdown("Region drawing requires a newer Gradio version with `Paint` or `ImageEditor` support.")
region_editor = gr.State(None)
with gr.Column(scale=1):
gr.Markdown("### OCR Workflow")
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
input_scope = gr.Radio(["Entire Page", "Selected Region"], value="Entire Page", label="Input Scope")
selection_controls = gr.Row(visible=False)
with selection_controls:
add_region_btn = gr.Button("Add Region", variant="secondary")
clear_regions_btn = gr.Button("Clear Regions")
selection_status = gr.Textbox(label="Region Selection Status", value="No saved regions.", interactive=False, visible=False)
selected_regions_gallery = gr.Gallery(
label="Selected Regions",
show_label=True,
columns=2,
height=190,
visible=False,
object_fit="contain",
)
with gr.Accordion("Advanced Options", open=False):
equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
separate_eq_lines = gr.Checkbox(label="Detect Equation Lines Separately", value=False)
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
btn = gr.Button("Extract", variant="primary", size="lg")
with gr.Row():
with gr.Column(scale=1):
with gr.Tabs() as tabs:
with gr.Tab("Text", id="tab_text"):
text_out = gr.Textbox(lines=20, show_label=False)
with gr.Tab("LaTeX", id="tab_text_latex"):
latex_out = gr.Textbox(lines=20, show_label=False)
with gr.Tab("Preview", id="tab_markdown"):
md_out = gr.HTML("")
with gr.Tab("Boxes", id="tab_boxes"):
img_out = gr.Image(type="pil", height=560, show_label=False)
with gr.Tab("Crops", id="tab_crops"):
gallery = gr.Gallery(show_label=False, columns=3, height=420, object_fit="contain")
with gr.Tab("Raw", id="tab_raw"):
raw_out = gr.Textbox(lines=20, show_label=False)
download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
gr.Markdown("### Examples")
with gr.Row():
with gr.Column(scale=2):
image_examples = [
"examples/2022-0922 Section 13 Notes.png",
"examples/2022-0922 Section 14 Notes.png",
"examples/2022-0922 Section 15 Notes.png",
]
if HAS_REGION_WORKSPACE and region_editor is not None:
image_examples_input = gr.Image(
label="Example Loader",
type="filepath",
visible=False,
show_label=False,
)
gr.Examples(
label="Image Examples (click thumbnail to load into workspace)",
examples=image_examples,
inputs=[image_examples_input],
outputs=[region_editor, workspace_base_size, workspace_base_image, selected_regions_state, selected_regions_gallery, selection_status, drawn_mask_state],
fn=load_example_into_workspace_and_reset,
run_on_click=True,
cache_examples=False,
)
else:
gr.Examples(
label="Image Examples",
examples=[[p] for p in image_examples],
inputs=[file_in],
cache_examples=False,
)
with gr.Column(scale=1):
gr.Examples(
label="PDF Examples",
examples=[["examples/Gursoy Class Notes_ Accessibility Sandbox.pdf"]],
inputs=[file_in],
cache_examples=False,
)
with gr.Accordion("ℹ️ Info", open=False):
gr.Markdown("""
### Configuration
1024 base + 768 patches with dynamic cropping (2-6 patches). 144 tokens per patch + 256 base tokens.
### Faculty Quick Workflow
1. Upload a page/image, then confirm **Task**.
2. Choose **Input Scope**:
- `Entire Page` for the full page.
- `Selected Region` for a specific area.
2a. Workspace keeps native image resolution for clarity. For very tall pages, it auto-boosts from tiny fit view toward ~88% width-friendly zoom.
3. For `Selected Region`, use the **Image Workspace**:
- Recommended: freehand selection (draw/highlight target); app uses an automatic bounding box around your marks.
- Optional rectangle selection: use the **Crop** tool.
- Freehand/highlight ink is semi-transparent so underlying content stays visible.
- Current known behavior: after zooming in/out, freehand stroke display may appear fully on mouse release (selection is still captured correctly).
- Optional multi-select: click **Add Region** after each selection.
- **Add Region** snapshots only newly drawn pixels so zoom/pan stays in place while you continue selecting.
Then click **Extract**.
4. Use **Clear Regions** to reset multi-select state.
5. Review **Cropped Images** and **Boxes**: both are labeled `Region 1`, `Region 2`, etc.
6. Use **Advanced Options** only when needed (Equation Zoom / line-by-line equation OCR).
### Tasks
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
- **Free OCR**: Read all visible text from the full page/image (no boxes, no targeting)
- **Locate**: Find and highlight where specific text appears (grounding ✅)
- **Describe**: General image description
- **Custom**: Your own prompt
- **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
- **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
- **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
- **Detect Equation Lines Separately**: Detects likely equation-line boxes and OCRs each line independently to reduce merged multi-step equations.
### Free OCR vs Locate (important)
- **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
- If you want OCR for one area only, crop that area first, then run **Free OCR** on the cropped image.
- If you want to keep the full page but highlight where text appears, use **Locate** and enter the text to search.
- For advanced region workflows, use **Custom** with `<|grounding|>` in the prompt.
### Special Tokens
- `<image>` - Placeholder where visual tokens are inserted
- `<|grounding|>` - Enables layout detection with bounding boxes
- `<|ref|>text<|/ref|>` - Reference text to locate in the image
""")
file_in.change(update_page_selector, [file_in], [page_selector])
task.change(toggle_prompt, [task], [prompt])
task.change(select_boxes, [task], [tabs])
input_scope.change(toggle_scope_ui, [input_scope], [workspace_hint, selection_controls, selection_status, selected_regions_gallery])
if HAS_REGION_WORKSPACE and region_editor is not None:
file_in.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
page_selector.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
region_editor.change(sync_workspace_state, [region_editor, workspace_base_image], [workspace_base_size, workspace_base_image])
file_in.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
page_selector.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
file_in.change(_reset_drawn_mask, outputs=[drawn_mask_state])
page_selector.change(_reset_drawn_mask, outputs=[drawn_mask_state])
add_region_btn.click(
add_selected_region,
[region_editor, workspace_base_size, workspace_base_image, selected_regions_state, drawn_mask_state],
[selected_regions_state, selected_regions_gallery, selection_status, drawn_mask_state],
)
clear_regions_btn.click(
clear_regions_preserve_view,
inputs=[region_editor],
outputs=[selected_regions_state, selected_regions_gallery, selection_status, drawn_mask_state],
)
def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, detect_eq_lines, scope, region_value, base_size, base_image, selected_regions):
if scope == "Selected Region":
regions = list(selected_regions or [])
if not regions:
selected_region, selected_bbox = _extract_selected_region(region_value, base_size=base_size, base_image=base_image)
if selected_region is None:
msg = "Select Input Scope=Selected Region, then crop or annotate a target area in the Image Workspace first."
return (msg, "", "", "", None, [], gr.DownloadButton(visible=False))
regions = [{"image": selected_region, "bbox": selected_bbox}]
cleaned_parts = []
markdown_parts = []
raw_parts = []
line_crops = []
for i, r in enumerate(regions, 1):
cleaned_i, markdown_i, raw_i, _, crops_i = process_image(
r["image"],
task,
custom_prompt,
enable_equation_zoom=enable_equation_zoom,
infer_crop_mode=False,
separate_equation_lines=detect_eq_lines,
)
if len(regions) > 1:
cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
markdown_parts.append(f"## Region {i}\n\n{markdown_i}")
raw_parts.append(f"## Region {i}\n\n{raw_i}")
else:
cleaned_parts.append(cleaned_i)
markdown_parts.append(markdown_i)
raw_parts.append(raw_i)
if detect_eq_lines and crops_i:
line_crops.extend(_label_gallery_items(crops_i, prefix=f"Region {i}" if len(regions) > 1 else None))
cleaned = "\n\n".join(cleaned_parts).strip()
markdown = "\n\n".join(markdown_parts).strip()
raw = "\n\n".join(raw_parts).strip()
crops = line_crops if line_crops else _region_gallery_items(regions)
full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
img_out = _draw_selected_region_boxes(full_img, region_boxes)
elif file_path:
cleaned, markdown, raw, img_out, crops = process_file(
file_path,
task,
custom_prompt,
int(page_num),
enable_equation_zoom=enable_equation_zoom,
separate_equation_lines=detect_eq_lines,
)
elif (full_image := _extract_editor_background(region_value)) is not None:
cleaned, markdown, raw, img_out, crops = process_image(
full_image,
task,
custom_prompt,
enable_equation_zoom=enable_equation_zoom,
separate_equation_lines=detect_eq_lines,
)
elif isinstance(base_image, Image.Image):
# Example clicks can briefly race editor-value hydration on first load.
cleaned, markdown, raw, img_out, crops = process_image(
base_image,
task,
custom_prompt,
enable_equation_zoom=enable_equation_zoom,
separate_equation_lines=detect_eq_lines,
)
else:
msg = "Error: Upload a file or image"
return (msg, "", "", "", None, [], gr.DownloadButton(visible=False))
return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
submit_event = btn.click(
run,
[file_in, task, prompt, page_selector, equation_zoom, separate_eq_lines, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
[text_out, latex_out, md_out, raw_out, img_out, gallery, download_btn]
)
submit_event.then(select_post_extract_tab, [task, input_scope], [tabs])
if __name__ == "__main__":
# server_name="0.0.0.0" is needed locally (WSL2 → Windows access)
# On HuggingFace Spaces, SPACE_ID is set and Gradio handles binding automatically
local = not os.environ.get("SPACE_ID")
queued = demo.queue(max_size=20)
launch_sig = inspect.signature(queued.launch)
launch_kwargs = {}
if "server_name" in launch_sig.parameters:
launch_kwargs["server_name"] = "0.0.0.0" if local else None
if "head" in launch_sig.parameters:
launch_kwargs["head"] = PREVIEW_CSS
if "ssr_mode" in launch_sig.parameters:
launch_kwargs["ssr_mode"] = False # SSR breaks HF Spaces routing in Gradio 6
queued.launch(**launch_kwargs)