# NuExtract3 / app.py
# Alexandre-Numind's picture
# Update app.py
# a19c568 verified
import argparse
import base64
import io
import json
import os
import re
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Tuple
import gradio as gr
from openai import OpenAI
from PIL import Image
# ---------------- Paths ----------------
# Absolute directory that contains this file. Relative example/asset
# directories are resolved against it (see resolve_dir).
APP_DIR = Path(__file__).resolve().parent
# ---------------- CLI / environment configuration ----------------
def parse_args() -> argparse.Namespace:
    """Build the demo's CLI parser and return the recognised options.

    Every option falls back to an environment variable and then to a
    hard-coded default. Arguments the parser does not know are ignored
    (``parse_known_args``) instead of causing an error.
    """
    env = os.environ.get

    # (flag, add_argument keyword arguments), in registration order.
    option_table = (
        (
            "--model-name",
            {
                "default": env("MODEL_NAME", "numind/NuExtract3"),
                "help": "Model name served by the OpenAI-compatible endpoint.",
            },
        ),
        (
            "--api-base",
            {
                "default": env("OPENAI_API_BASE", "http://127.0.0.1:8000/v1"),
                "help": "OpenAI-compatible base URL.",
            },
        ),
        (
            "--api-key",
            {
                "default": env("OPENAI_API_KEY", "EMPTY"),
                "help": "API key for the OpenAI-compatible endpoint.",
            },
        ),
        (
            "--server-name",
            {
                "default": env("GRADIO_SERVER_NAME", "0.0.0.0"),
                "help": "Gradio server host.",
            },
        ),
        (
            "--server-port",
            {
                "type": int,
                "default": int(env("GRADIO_SERVER_PORT", "7860")),
                "help": "Gradio server port.",
            },
        ),
        (
            "--share",
            {
                "action": "store_true",
                "default": env("GRADIO_SHARE", "false").lower() in {"1", "true", "yes"},
                "help": "Create a public Gradio share link.",
            },
        ),
        (
            "--max-tokens",
            {
                "type": int,
                "default": int(env("NUEXTRACT_MAX_TOKENS", "10000")),
                "help": "Maximum tokens for model generation. Hidden from the UI.",
            },
        ),
        (
            "--example-dir",
            {
                "default": env("NUEXTRACT_EXAMPLE_DIR", str(APP_DIR / "examples")),
                "help": "Directory containing image examples.",
            },
        ),
        (
            "--assets-dir",
            {
                "default": env("NUEXTRACT_ASSETS_DIR", str(APP_DIR / "assets")),
                "help": "Directory containing static assets such as the NuExtract logo.",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="NuExtract Gradio demo")
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    recognised, _unrecognised = cli.parse_known_args()
    return recognised
def resolve_dir(path_like: str) -> Path:
    """Return ``path_like`` as a resolved absolute path.

    ``~`` is expanded first; a relative path is anchored at ``APP_DIR``.
    """
    candidate = Path(path_like).expanduser()
    anchored = candidate if candidate.is_absolute() else APP_DIR / candidate
    return anchored.resolve()
# Parse CLI/environment configuration once at import time; the constants
# below are read by the rest of the module.
ARGS = parse_args()
DEFAULT_MODEL = ARGS.model_name
DEFAULT_API_BASE = ARGS.api_base
DEFAULT_API_KEY = ARGS.api_key
DEFAULT_MAX_TOKENS = ARGS.max_tokens
EXAMPLE_DIR = resolve_dir(ARGS.example_dir)
ASSETS_DIR = resolve_dir(ARGS.assets_dir)
LOGO_PATH = ASSETS_DIR / "logo_numind_picto.svg"
if LOGO_PATH.exists():
    # URL under which the logo file is referenced from the CSS and header HTML.
    LOGO_URL = f"/gradio_api/file={LOGO_PATH}"
    # Registers the assets directory with Gradio as a static path
    # (presumably required for the URL above to be served - confirm against
    # the Gradio version in use).
    gr.set_static_paths(paths=[ASSETS_DIR])
else:
    # A missing logo is not fatal: log it and fall back to an empty URL.
    print(f"[assets] Missing logo: {LOGO_PATH}", flush=True)
    LOGO_URL = ""
# Default system prompt text.
SYSTEM_PROMPT_DEFAULT = (
    "You are a precise information extraction assistant. "
    "Return faithful, source-grounded results only."
)
# ---------------- Structured extraction examples ----------------
# These examples populate: Image + Template + Instructions.
# Maps an example image filename (resolved against EXAMPLE_DIR by
# resolve_example_path) to the extraction template shown for that example.
# Each template is serialised with json.dumps in build_structured_examples.
# Leaf strings are type hints; lists of plain strings are presumably
# enumerations of allowed values - confirm against the NuExtract template
# documentation.
STRUCTURED_EXAMPLE_TEMPLATES: Dict[str, Dict[str, Any]] = {
    # Movie fields: title, rating, companies, cast, staff, reviews, technologies.
    "1.jpg": {
        "movie_name": "verbatim-string",
        "tagline": "verbatim-string",
        "language": "string",
        "motion_picture_association_rating": [
            "G - General Audiences",
            "PG - Parental Guidance Suggested",
            "PG-13 – Parents Strongly Cautioned",
            "R – Restricted",
            "NC-17 – Adults Only",
            "not provided"
        ],
        "movie_distribution_company": "verbatim-string",
        "movie_production_company": "verbatim-string",
        "theatre_release_date": "date-time",
        "movie_website_address": "verbatim-string",
        "movie_director_name": "verbatim-string",
        "actors_names": [
            "verbatim-string"
        ],
        "staff": [
            {
                "staff_name": "verbatim-string",
                "staff_role": [
                    "director",
                    "co-director",
                    "screenwriter",
                    "author",
                    "cinematographer",
                    "costume designer",
                    "production designer",
                    "set designer",
                    "animator",
                    "color designer",
                    "art director",
                    "animation director",
                    "vfx director",
                    "voice actor",
                    "composer",
                    "songwriter",
                    "music performer",
                    "music supervisor",
                    "choreographer",
                    "casting director",
                    "editor",
                    "producer",
                    "co-producer",
                    "associate producer",
                    "executive producer",
                    "co-executive producer",
                    "line producer"
                ]
            }
        ],
        "reviews": [
            {
                "critic_name": "verbatim-string",
                "review_comment": "verbatim-string"
            }
        ],
        "technologies": [
            [
                "Dolby Stereo",
                "Dolby Digital",
                "Dolby Stereo Digital",
                "Dolby Atmos",
                "Dolby Vision",
                "Dolby Cinema",
                "DTS",
                "SDDS",
                "IMAX",
                "4DX"
            ]
        ]
    },
    # Housing layout fields: rooms, bedrooms, amenities, parking.
    "2.png": {
        "number_of_bathrooms": "integer",
        "number_of_fireplaces": "integer",
        "distance_unit": ["meter", "foot"],
        "rooms_that_are_not_bedrooms_or_corridors_or_toilets": [
            {
                "room_name": "verbatim-string",
                "surface_area": "number",
            }
        ],
        "bedrooms": [
            {
                "bedroom_name": "verbatim-string",
                "surface_area": "number",
                "number_of_windows": "integer",
                "has_private_bathroom": "boolean",
            }
        ],
        "has_laundry_room": "boolean",
        "has_terrace": "boolean",
        "has_balcony": "boolean",
        "number_of_parking_spaces_in_garage": "integer",
        "number_of_parking_spaces_exterior": "integer",
    },
    # Invoice fields: parties, dates, line items, totals.
    "8.png" : {
        "invoice_number": "verbatim-string",
        "issuer_name": "verbatim-string",
        "recipient_name": "verbatim-string",
        "issuer_location": {
            "street_number": "verbatim-string",
            "street_name": "verbatim-string",
            "city": "verbatim-string",
            "zip_code": "string",
            "country": "string"
        },
        "date_of_issue": "date-time",
        "date_due": "date-time",
        "currency_code_iso4217": "string",
        "items": [
            {
                "item_name": "verbatim-string",
                "item_quantity": "number",
                "item_price_per_unit": "number",
                "item_total_price": "number"
            }
        ],
        "total_discount_amount": "number",
        "total_fee_amount": "number",
        "total_tax_amount": "number",
        "total_price_net": "number",
        "total_price_gross": "number"
    },
    # Festival fields: location, dates, line-up, sponsors.
    "18.jpg":{
        "festival_name": "verbatim-string",
        "website_url": "url",
        "location": {
            "city": "string",
            "zip_code": "string",
            "country": "country"
        },
        "date_first_day": "date",
        "date_last_day": "date",
        "lineup_entry": [
            {
                "artist_or_group_name": "string",
                "artist_entity_type": ["individual", "band", "ensemble", "b2b", "project_or_collaboration", "other"],
                "performing_stage_name": "verbatim-string",
                "is_headliner": "boolean",
                "day_playing": "date"
            }
        ],
        "sponsors": [
            {
                "name": "verbatim-string",
                "type": ["press", "tv", "bank", "insurance", "beverage company", "car company", "technology company", "clothing company", "transportation", "public institution", "other"]
            }
        ]
    },
    # Flat list of parts (name, id, details).
    "17.png":{
        "parts": [
            {
                "name": "verbatim-string",
                "id": "verbatim-string",
                "details": "verbatim-string"
            }
        ]
    },
    # Application form fields: applicant, bank reference, trade references.
    # NOTE(review): "Holding compagny" looks like a misspelling of "company";
    # it is left unchanged here because keys are runtime data shown in the
    # template - confirm before renaming.
    "16.jpeg":{
        "Applicant": {
            "Name": "verbatim-string",
            "Registration no": "verbatim-string",
            "Holding compagny": "verbatim-string",
            "VAT Registration no": "verbatim-string",
            "Date of creation": "date",
            "Type of entity": "verbatim-string",
            "Location": {
                "Street number": "verbatim-string",
                "Street name": "verbatim-string",
                "City": "verbatim-string",
                "Zip code": "string",
                "Country": "country"
            },
            "Website": "url",
            "Phone": "phone-number",
            "Email": "email-address"
        },
        "Bank Reference": {
            "Bank name": "verbatim-string",
            "Account name": "verbatim-string",
            "Account no": "integer",
            "Importer code": "verbatim-string"
        },
        "Trades references": [
            {
                "Company name": "verbatim-string",
                "Account opened since": "date",
                "Tel": "phone-number",
                "Email": "email-address",
                "Location": {
                    "Street number": "verbatim-string",
                    "Street name": "verbatim-string",
                    "City": "verbatim-string",
                    "Zip code": "string",
                    "Country": "country"
                },
                "Credit limit": "string"
            }
        ],
        "Is document signed": "boolean",
        "Date of signature": "date"
    },
}
# Optional per-example instruction text, keyed by the same filenames as
# STRUCTURED_EXAMPLE_TEMPLATES. build_structured_examples falls back to ""
# for filenames missing from this mapping; all entries are currently empty.
STRUCTURED_EXAMPLE_INSTRUCTIONS: Dict[str, str] = {
    "1.jpg": "",
    "2.png": "",
    "8.png": "",
    "18.jpg": "",
    "17.png": "",
    "16.jpeg": ""
}
# ---------------- Markdown/OCR examples ----------------
# Put Markdown example image paths here.
# These examples populate only the Image input and are meant for the
# “Convert to Markdown” button.
# Image paths for the Markdown examples. Relative entries are resolved
# against EXAMPLE_DIR by resolve_example_path; entries whose file does not
# exist are skipped (with a log line) by build_markdown_examples.
MARKDOWN_EXAMPLE_IMAGE_PATHS: List[str] = [
    "3.jpg",
    "4.jpg",
    "5.jpg",
    "6.png",
    "7.jpg",
    "9.jpg",
    "10.png",
    "11.png",
    "12.jpg",
    "14.jpg",
    "15.jpg"
]
def resolve_example_path(path_like: str) -> Path:
    """Return ``path_like`` as a resolved absolute path.

    ``~`` is expanded first; a relative path is anchored at ``EXAMPLE_DIR``.
    """
    candidate = Path(path_like).expanduser()
    anchored = candidate if candidate.is_absolute() else EXAMPLE_DIR / candidate
    return anchored.resolve()
def build_structured_examples() -> List[List[Any]]:
    """Assemble ``[image_path, template_json, instructions]`` example rows.

    One row is produced per entry of ``STRUCTURED_EXAMPLE_TEMPLATES`` whose
    image exists on disk; missing images are reported on stdout and skipped.
    """
    rows: List[List[Any]] = []
    for name in STRUCTURED_EXAMPLE_TEMPLATES:
        located = resolve_example_path(name)
        if located.exists():
            template_text = json.dumps(
                STRUCTURED_EXAMPLE_TEMPLATES[name],
                indent=4,
                ensure_ascii=False,
            )
            instructions = STRUCTURED_EXAMPLE_INSTRUCTIONS.get(name, "")
            rows.append([str(located), template_text, instructions])
        else:
            print(f"[structured examples] Missing image: {located}", flush=True)
    return rows
def build_markdown_examples() -> List[List[Any]]:
    """Assemble single-column ``[image_path]`` rows for the Markdown examples.

    Paths listed in ``MARKDOWN_EXAMPLE_IMAGE_PATHS`` that do not exist on
    disk are reported on stdout and skipped.
    """
    rows: List[List[Any]] = []
    for entry in MARKDOWN_EXAMPLE_IMAGE_PATHS:
        located = resolve_example_path(entry)
        if located.exists():
            rows.append([str(located)])
        else:
            print(f"[markdown examples] Missing image: {located}", flush=True)
    return rows
# Example rows are computed once at import time; rows whose image file is
# missing have already been dropped by the builders.
STRUCTURED_EXAMPLES = build_structured_examples()
MARKDOWN_EXAMPLES = build_markdown_examples()
# ---------------- Utility helpers ----------------
def image_bytes_to_base64(b: bytes) -> str:
    """Return the standard Base64 encoding of ``b`` as text."""
    encoded = base64.b64encode(b)
    return str(encoded, "utf-8")
def ensure_rgb_image(image_bytes: bytes) -> Image.Image:
    """Decode ``image_bytes`` and return the picture in RGB mode.

    Images that carry transparency (RGBA, LA, PA, or a palette image with a
    ``transparency`` entry) are composited onto a white background before
    the alpha channel is dropped. A plain ``convert("RGB")`` would expose
    whatever colour is stored under the transparent pixels, which is
    typically black and can make a transparent document unreadable.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.

    Returns:
        A PIL image whose mode is ``"RGB"``.

    Raises:
        PIL.UnidentifiedImageError: If the bytes are not a readable image.
    """
    img = Image.open(io.BytesIO(image_bytes))
    if img.mode == "RGB":
        return img

    has_alpha = img.mode in {"RGBA", "LA", "PA"} or (
        img.mode == "P" and "transparency" in img.info
    )
    if not has_alpha:
        return img.convert("RGB")

    # Flatten onto white, using the alpha channel as the paste mask.
    rgba = img.convert("RGBA")
    flattened = Image.new("RGB", rgba.size, (255, 255, 255))
    flattened.paste(rgba, mask=rgba.getchannel("A"))
    return flattened
def file_path_to_bytes(path: str) -> bytes:
    """Read the file at ``path`` in binary mode and return its full content."""
    return Path(path).read_bytes()
# ---------------- Response parsing ----------------
def strip_code_fence(payload: str) -> str:
    """Remove one enclosing Markdown code fence from ``payload``.

    Only the outermost fence is touched: an opening fence at the very start
    of the text (optionally tagged ``json``, ``markdown`` or ``text``,
    case-insensitive) and a closing fence at the very end. Fences that
    appear inside the payload, such as a code block nested in a Markdown
    answer, are preserved. The pattern is anchored with ``\\A``/``\\Z``
    rather than ``^``/``$`` under ``re.MULTILINE``, which would match at
    every line and strip those inner fences as well.

    Args:
        payload: Raw model output, possibly wrapped in a code fence.

    Returns:
        The text without its enclosing fence, stripped of surrounding
        whitespace.
    """
    return re.sub(
        r"\A```(?:json|markdown|text)?\s*|\s*```\Z",
        "",
        payload.strip(),
        flags=re.IGNORECASE,
    ).strip()
def pretty_json_or_text(payload: str) -> str:
    """Pretty-print ``payload`` as JSON when it parses, else return it as text.

    The payload is first passed through ``strip_code_fence``. Valid JSON is
    re-serialised with a 4-space indent and non-ASCII characters kept as-is;
    anything else is returned in its fence-stripped form. Empty input
    yields ``""``.
    """
    if not payload:
        return ""
    unfenced = strip_code_fence(payload)
    try:
        parsed = json.loads(unfenced)
        formatted = json.dumps(parsed, indent=4, ensure_ascii=False)
    except Exception:
        # Not JSON (or not serialisable): show the text unchanged.
        return unfenced
    return formatted
def _parse_bare_json_array(text: str) -> Any:
    """Return the parsed list when ``text`` is just a JSON array, else ``None``.

    The array may be wrapped in an optional code fence, but no other text
    may surround it, so bracketed fragments inside prose are never mistaken
    for the answer.
    """
    start = text.find("[")
    end = text.rfind("]")
    if start == -1 or end < start:
        return None
    prefix = text[:start].strip().lower()
    suffix = text[end + 1:].strip()
    if prefix not in ("", "```", "```json") or suffix not in ("", "```"):
        return None
    try:
        return json.loads(text[start:end + 1])
    except (ValueError, RecursionError):
        return None


def extract_answer_block(text: str) -> str:
    """Pull the answer payload out of raw model output.

    Resolution order:

    1. The content of an ``<answer>...</answer>`` block (case-insensitive),
       pretty-printed when it is JSON.
    2. A top-level JSON array that makes up the whole text, optionally
       fenced. Without this step an array of objects would lose its
       brackets, because step 3 only spans the braces.
    3. The span from the first ``{`` to the last ``}``, pretty-printed when
       it parses and returned as text otherwise (for example while the JSON
       is still streaming).
    4. The stripped text itself.

    Args:
        text: Model output, possibly partial.

    Returns:
        The extracted answer, or ``""`` for empty input.
    """
    if not text:
        return ""

    tagged = re.search(
        r"<answer>\s*(.*?)\s*</answer>",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if tagged:
        return pretty_json_or_text(tagged.group(1).strip())

    array_value = _parse_bare_json_array(text)
    if array_value is not None:
        return json.dumps(array_value, indent=4, ensure_ascii=False)

    # The pattern is greedy, so a single search already yields the widest
    # brace-delimited span.
    brace_span = re.search(r"\{[\s\S]*\}", text)
    if brace_span:
        return pretty_json_or_text(brace_span.group(0))
    return text.strip()
def split_reasoning_and_output(text: str, reasoning_enabled: bool) -> Tuple[str, str]:
    """Split model output into ``(reasoning, output)`` at the first ``</think>``.

    The tag is matched case-insensitively directly on ``text``. Searching a
    lower-cased copy and reusing the index is unsafe because ``str.lower()``
    can change the length of a string (for example ``"İ"`` becomes two code
    points), which would misplace the split.

    Args:
        text: Raw model output, possibly partial while streaming.
        reasoning_enabled: Whether the model was asked to emit reasoning.

    Returns:
        ``("", "")`` for empty text; ``("", text)`` when reasoning is
        disabled; ``(reasoning, output)`` when the closing tag is present;
        otherwise ``(text, "")`` because the reasoning is still in progress.
        All returned parts are stripped.
    """
    if not text:
        return "", ""
    if not reasoning_enabled:
        return "", text.strip()

    closing_tag = re.search(r"</think>", text, flags=re.IGNORECASE)
    if closing_tag is None:
        return text.strip(), ""
    return text[:closing_tag.start()].strip(), text[closing_tag.end():].strip()
# ---------------- Message building ----------------
def make_text_content(text: str) -> List[Dict[str, Any]]:
    """Wrap ``text`` in a single-element chat content list (empty if falsy)."""
    text_part: Dict[str, Any] = {"type": "text", "text": text if text else ""}
    return [text_part]
def make_image_content(
    image_bytes: bytes,
    extra_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Build chat content parts for an image plus optional trailing text.

    The image is converted to RGB, re-encoded as JPEG (quality 95) and
    embedded as a Base64 ``data:`` URL with ``detail`` set to ``"high"``.
    A text part is appended only when ``extra_text`` is non-blank.
    """
    jpeg_buffer = io.BytesIO()
    ensure_rgb_image(image_bytes).save(jpeg_buffer, format="JPEG", quality=95)
    encoded = image_bytes_to_base64(jpeg_buffer.getvalue())

    image_part: Dict[str, Any] = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded}",
            "detail": "high",
        },
    }
    parts: List[Dict[str, Any]] = [image_part]

    caption = extra_text.strip() if extra_text else ""
    if caption:
        parts.append({"type": "text", "text": caption})
    return parts
def normalize_template(template: str) -> str:
    """Return ``template`` as canonical, 4-space-indented JSON text.

    Blank or missing input becomes ``"{}"``. Text that is not valid JSON is
    returned stripped but otherwise unchanged.
    """
    stripped = (template or "").strip()
    if stripped == "":
        return "{}"
    try:
        parsed = json.loads(stripped)
        canonical = json.dumps(parsed, indent=4, ensure_ascii=False)
    except Exception:
        # Leave an unparsable template as the user wrote it.
        return stripped
    return canonical
def collate_single_input(
    *,
    text_or_image: Any,
    template: str,
    system_prompt: Optional[str],
    instruction: Optional[str],
) -> Tuple[List[Dict[str, Any]], str]:
    """Build the chat messages for one extraction request.

    An optional system message comes first, then one user message that
    holds either the image (a dict with a ``"bytes"`` key) or the text.
    Non-blank instructions and a non-empty template are appended to the
    user message as extra text.

    Returns:
        ``(messages, template_json)`` where ``template_json`` is the
        normalised template text.
    """
    messages: List[Dict[str, Any]] = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    template_json = normalize_template(template)

    sections: List[str] = []
    note = instruction.strip() if instruction else ""
    if note:
        sections.append(f"Instructions:\n{note}")
    if template_json and template_json.strip() not in {"{}", ""}:
        sections.append(f"Extraction template:\n```json\n{template_json}\n```")
    extra_text_for_user = "\n\n".join(sections) or None

    if isinstance(text_or_image, dict) and "bytes" in text_or_image:
        user_content = make_image_content(
            image_bytes=text_or_image["bytes"],
            extra_text=extra_text_for_user,
        )
    else:
        body = str(text_or_image or "")
        if extra_text_for_user:
            body = f"{body}\n\n{extra_text_for_user}".strip()
        user_content = make_text_content(body)

    messages.append({"role": "user", "content": user_content})
    return messages, template_json
def collate_for_template_generation(
    *,
    context_text: str,
    context_image_path: Optional[str],
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build the chat messages that ask the model for an extraction template.

    When an image path is given, the image is sent with the guidance text
    attached; otherwise the stripped context text is sent, followed by the
    guidance. A system message is prepended when ``system_prompt`` is truthy.
    """
    guidance = (
        "Generate a concise JSON extraction template for this document. "
        "Use descriptive field names and simple type hints like string, number, YYYY-MM-DD, "
        "boolean, or arrays of objects. Return only the JSON template."
    )

    if context_image_path:
        user_content = make_image_content(
            image_bytes=file_path_to_bytes(context_image_path),
            extra_text=guidance,
        )
    else:
        document = (context_text or "").strip()
        user_content = make_text_content(f"{document}\n\n{guidance}".strip())

    messages: List[Dict[str, Any]] = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    messages.append({"role": "user", "content": user_content})
    return messages
def collate_markdown_image_only(
    *,
    image_bytes: bytes,
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build the chat messages for Markdown conversion of a single image.

    The user message contains only the image; a system message is
    prepended when ``system_prompt`` is truthy.
    """
    user_message: Dict[str, Any] = {
        "role": "user",
        "content": make_image_content(image_bytes=image_bytes),
    }
    if system_prompt:
        return [{"role": "system", "content": system_prompt}, user_message]
    return [user_message]
# ---------------- Model calls ----------------
def chunk_to_text(chunk: Any) -> str:
    """Return the text carried by one streamed chat-completion chunk.

    Reads ``chunk.choices[0].delta.content``. String content is returned
    unchanged; list content is flattened by concatenating the ``text`` of
    each part (dict key or attribute). Any other shape, or any error while
    inspecting the chunk, yields ``""``.
    """
    try:
        choices = getattr(chunk, "choices", None) if chunk else None
        if not choices:
            return ""

        delta = getattr(choices[0], "delta", None)
        payload = None if delta is None else getattr(delta, "content", None)

        if isinstance(payload, str):
            return payload
        if isinstance(payload, list):
            fragments: List[str] = []
            for part in payload:
                # Prefer a dict's "text" entry, then fall back to a .text attribute.
                fragment = (
                    isinstance(part, dict) and part.get("text")
                ) or getattr(part, "text", None)
                if fragment:
                    fragments.append(fragment)
            return "".join(fragments)
    except Exception:
        # Best effort: a malformed chunk contributes no text.
        return ""
    return ""
def build_chat_template_kwargs(
    *,
    template_json: str,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Dict[str, Any]:
    """Build the ``chat_template_kwargs`` payload for a request.

    The mode is ``"markdown"`` when ``markdown_mode`` is set, ``"structured"``
    when a non-empty template (anything other than blank or ``"{}"``) is
    given, and ``"content"`` otherwise. The template and non-blank
    instructions are attached in structured mode only.
    """
    thinking = bool(reasoning)
    if markdown_mode:
        return {"mode": "markdown", "enable_thinking": thinking}

    trimmed_template = template_json.strip() if template_json else ""
    if trimmed_template in ("", "{}"):
        return {"mode": "content", "enable_thinking": thinking}

    kwargs: Dict[str, Any] = {
        "mode": "structured",
        "enable_thinking": thinking,
        "template": template_json,
    }
    note = instruction.strip() if instruction else ""
    if note:
        kwargs["instructions"] = note
    return kwargs
def call_model_stream(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    template_json: str,
    temperature: float,
    max_tokens: int,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Iterator[str]:
    """Stream a chat completion and yield the text accumulated so far.

    Each yielded value is the full text received up to that point, not just
    the latest delta. Chunks that carry no text are skipped.

    The streaming response is closed in a ``finally`` block, so the
    connection is also released when the consumer stops iterating early
    (for example when the run is cancelled) or when an error interrupts
    the stream.

    Args:
        api_base: Base URL of the OpenAI-compatible endpoint.
        api_key: API key for that endpoint.
        model_name: Model identifier to request.
        messages: Chat messages to send.
        template_json: Extraction template text forwarded to the chat template.
        temperature: Sampling temperature.
        max_tokens: Generation limit.
        reasoning: Whether to enable thinking in the chat template.
        instruction: Optional extraction instructions.
        markdown_mode: Whether to request Markdown conversion.

    Yields:
        The accumulated response text after each non-empty chunk.
    """
    client = OpenAI(base_url=api_base, api_key=api_key)
    chat_kwargs = build_chat_template_kwargs(
        template_json=template_json,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    )
    stream = client.chat.completions.create(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        stream=True,
        extra_body={"chat_template_kwargs": chat_kwargs},
    )
    accumulated = ""
    try:
        for chunk in stream:
            delta_text = chunk_to_text(chunk)
            if delta_text:
                accumulated += delta_text
                yield accumulated
    finally:
        # Guarded lookup: tolerate stream objects that expose no close().
        close_stream = getattr(stream, "close", None)
        if callable(close_stream):
            close_stream()
def call_model_once(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    mode: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Run one non-streaming chat completion and return its text.

    Thinking is always disabled for this call. The result is always a
    string: ``""`` is returned when the response has no choices or when the
    first choice carries no content (``message.content`` may be ``None``).

    Args:
        api_base: Base URL of the OpenAI-compatible endpoint.
        api_key: API key for that endpoint.
        model_name: Model identifier to request.
        messages: Chat messages to send.
        mode: Value passed as ``mode`` in ``chat_template_kwargs``.
        temperature: Sampling temperature.
        max_tokens: Generation limit.

    Returns:
        The content of the first choice, or ``""``.
    """
    client = OpenAI(base_url=api_base, api_key=api_key)
    chat = client.chat.completions.create(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        extra_body={
            "chat_template_kwargs": {
                "mode": mode,
                "enable_thinking": False,
            }
        },
    )
    if not chat.choices:
        return ""
    return chat.choices[0].message.content or ""
# ---------------- Inference orchestration ----------------
def prepare_input(context_text: str, context_image_path: Optional[str]) -> Any:
    """Choose the request input, with the image taking precedence over text.

    Returns ``{"bytes": <file content>}`` when an image path is given,
    otherwise the text (``""`` when the text is missing).
    """
    if not context_image_path:
        return context_text or ""
    return {"bytes": file_path_to_bytes(context_image_path)}
def infer_stream(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    template: str,
    instruction: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
    reasoning: bool,
    markdown_mode: bool,
):
    """Run one streaming inference and yield display updates.

    Each update is a dict with ``mode`` (``"markdown"`` or ``"structured"``),
    ``output`` (the text to display, or a placeholder while nothing is
    available yet) and ``think`` (the reasoning trace, empty when reasoning
    is disabled). In structured mode, output that starts with ``{`` or ``[``
    is pretty-printed and wrapped in a ``json`` code fence; other output has
    literal ``\\n`` sequences turned into real newlines.

    Raises:
        ValueError: If Markdown conversion is requested without an image.
    """
    waiting_note = "_(Waiting for output after `</think>`.)_"

    single_input = prepare_input(context_text, context_image_path)
    has_image = isinstance(single_input, dict) and "bytes" in single_input

    if not markdown_mode:
        messages, template_json = collate_single_input(
            text_or_image=single_input,
            template=template,
            system_prompt=system_prompt,
            instruction=instruction,
        )
    elif has_image:
        messages = collate_markdown_image_only(
            image_bytes=single_input["bytes"],
            system_prompt=system_prompt,
        )
        template_json = ""
    else:
        raise ValueError("Markdown conversion requires an image input.")

    partial_texts = call_model_stream(
        api_base=api_base,
        api_key=api_key,
        model_name=DEFAULT_MODEL,
        messages=messages,
        template_json=template_json,
        temperature=temperature,
        max_tokens=DEFAULT_MAX_TOKENS,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    )

    for partial_text in partial_texts:
        trace, output_text = split_reasoning_and_output(
            partial_text,
            reasoning_enabled=reasoning,
        )
        think = trace if reasoning else ""

        if markdown_mode:
            placeholder = waiting_note if reasoning else "_(Empty output.)_"
            yield {
                "mode": "markdown",
                "output": output_text or placeholder,
                "think": think,
            }
            continue

        # Without reasoning the whole partial text is the answer candidate.
        answer_source = output_text if reasoning else (partial_text or "")
        placeholder = waiting_note if reasoning else "_(No output found yet.)_"
        shown = extract_answer_block(answer_source) or placeholder

        if shown.strip().startswith(("{", "[")):
            shown = f"```json\n{pretty_json_or_text(shown)}\n```"
        else:
            shown = shown.replace("\\n", "\n")

        yield {
            "mode": "structured",
            "output": shown,
            "think": think,
        }
def infer_template_generation(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
) -> str:
    """Ask the model for an extraction template and return it formatted.

    The request uses the ``"template-generation"`` mode with the module's
    default model and token limit. The reply is pretty-printed when it is
    valid JSON and returned as text otherwise.
    """
    request_messages = collate_for_template_generation(
        context_text=context_text,
        context_image_path=context_image_path,
        system_prompt=system_prompt,
    )
    raw_reply = call_model_once(
        api_base=api_base,
        api_key=api_key,
        model_name=DEFAULT_MODEL,
        messages=request_messages,
        mode="template-generation",
        temperature=temperature,
        max_tokens=DEFAULT_MAX_TOKENS,
    )
    return pretty_json_or_text(raw_reply)
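# ---------------- UI styling ----------------
# CSS holds the stylesheet passed to gr.Blocks. It defines light and dark
# colour variables and styles the header, cards, inputs and output boxes.
# The "__LOGO_URL__" placeholder is replaced with LOGO_URL (an empty string
# when the logo file is missing) right after the literal is built.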
CSS = """
:root {
color-scheme: light;
--bg: #f6f2eb;
--panel: #ffffff;
--panel-rgb: 255, 255, 255;
--panel-strong-rgb: 255, 252, 246;
--input-rgb: 255, 255, 255;
--border-blue: rgba(67, 111, 148, 0.30);
--border-blue-soft: rgba(67, 111, 148, 0.18);
--border-input: rgba(67, 111, 148, 0.22);
--border-orange-soft: rgba(190, 103, 36, 0.26);
--text: #23252b;
--text-strong: #101318;
--text-on-accent: #101318;
--muted: #5f6673;
--muted-2: #7d8490;
--logo-blue: #5d9bcf;
--logo-orange: #d6742f;
--green: #178f66;
--card-alpha: 0.88;
--header-alpha: 0.82;
--input-alpha: 0.94;
--shadow: rgba(54, 46, 35, 0.14);
--inset-highlight: rgba(255, 255, 255, 0.85);
--logo-opacity: 0.18;
--focus-ring: rgba(67, 111, 148, 0.26);
--code-bg: #fdfaf5;
--dropzone-bg: #fbf8f2;
}
html.dark,
body.dark,
.dark,
[data-theme="dark"] {
color-scheme: dark;
--bg: #242529;
--panel: #1d1f26;
--panel-rgb: 29, 31, 38;
--panel-strong-rgb: 21, 22, 26;
--input-rgb: 12, 14, 19;
--border-blue: rgba(135, 183, 224, 0.24);
--border-blue-soft: rgba(135, 183, 224, 0.16);
--border-input: rgba(135, 183, 224, 0.14);
--border-orange-soft: rgba(228, 132, 58, 0.22);
--text: #eef0f4;
--text-strong: #ffffff;
--text-on-accent: #101318;
--muted: #969baa;
--muted-2: #737988;
--logo-blue: #87b7e0;
--logo-orange: #e4843a;
--green: #31c48d;
--card-alpha: 0.66;
--header-alpha: 0.42;
--input-alpha: 0.78;
--shadow: rgba(0, 0, 0, 0.28);
--inset-highlight: rgba(255, 255, 255, 0.055);
--logo-opacity: 0.88;
--focus-ring: rgba(135, 183, 224, 0.32);
--code-bg: rgba(12, 14, 19, 0.78);
--dropzone-bg: rgba(12, 14, 19, 0.78);
}
@media (prefers-color-scheme: dark) {
:root:not([data-theme="light"]) {
color-scheme: dark;
--bg: #242529;
--panel: #1d1f26;
--panel-rgb: 29, 31, 38;
--panel-strong-rgb: 21, 22, 26;
--input-rgb: 12, 14, 19;
--border-blue: rgba(135, 183, 224, 0.24);
--border-blue-soft: rgba(135, 183, 224, 0.16);
--border-input: rgba(135, 183, 224, 0.14);
--border-orange-soft: rgba(228, 132, 58, 0.22);
--text: #eef0f4;
--text-strong: #ffffff;
--text-on-accent: #101318;
--muted: #969baa;
--muted-2: #737988;
--logo-blue: #87b7e0;
--logo-orange: #e4843a;
--green: #31c48d;
--card-alpha: 0.66;
--header-alpha: 0.42;
--input-alpha: 0.78;
--shadow: rgba(0, 0, 0, 0.28);
--inset-highlight: rgba(255, 255, 255, 0.055);
--logo-opacity: 0.88;
--focus-ring: rgba(135, 183, 224, 0.32);
--code-bg: rgba(12, 14, 19, 0.78);
--dropzone-bg: rgba(12, 14, 19, 0.78);
}
}
html,
body,
footer,
.gradio-container {
color: var(--text) !important;
}
html,
body {
min-height: 100vh !important;
width: 100% !important;
margin: 0 !important;
overflow-x: hidden !important;
}
body {
background: var(--bg) !important;
background-attachment: fixed !important;
}
footer {
background: transparent !important;
}
.gradio-container {
position: relative !important;
isolation: isolate !important;
max-width: none !important;
width: 100% !important;
min-height: 100vh !important;
padding: 10px 18px 18px 18px !important;
background: transparent !important;
box-sizing: border-box !important;
}
.gradio-container::before {
content: "";
position: fixed;
inset: 0;
z-index: -2;
pointer-events: none;
background-image: url("__LOGO_URL__");
background-repeat: no-repeat;
background-size: min(86vw, 980px) min(86vw, 980px);
background-position: calc(100% + 230px) 34px;
opacity: var(--logo-opacity);
filter: saturate(1.2) drop-shadow(0 0 28px rgba(135, 183, 224, 0.14));
}
.with-gap,
.gradio-row {
gap: 18px !important;
}
.gradio-row {
width: 100% !important;
}
.app-header {
position: relative;
display: flex;
align-items: center;
justify-content: space-between;
gap: 16px;
padding: 10px 12px 14px 12px;
margin-bottom: 10px;
border-bottom: 1px solid var(--border-blue-soft);
background: rgba(var(--panel-strong-rgb), var(--header-alpha));
border-radius: 14px;
backdrop-filter: blur(8px);
box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight);
}
.brand {
display: flex;
align-items: center;
gap: 10px;
}
.brand-mark {
width: 28px;
height: 28px;
flex: 0 0 auto;
object-fit: contain;
}
.brand-title {
display: flex;
align-items: baseline;
gap: 8px;
}
.brand-name {
font-size: 23px;
line-height: 1;
font-weight: 750;
letter-spacing: -0.045em;
color: var(--text-strong) !important;
}
.brand-name span {
color: var(--muted) !important;
}
.model-chip {
display: inline-flex;
align-items: center;
max-width: 520px;
padding: 5px 9px;
border-radius: 999px;
background: rgba(var(--panel-rgb), 0.88);
border: 1px solid var(--border-blue-soft);
color: var(--muted) !important;
font-size: 12px;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
text-decoration: none !important;
cursor: pointer;
transition: border-color 0.15s ease, transform 0.15s ease, background 0.15s ease;
}
.model-chip:hover {
border-color: var(--logo-blue);
background: rgba(var(--panel-rgb), 1);
transform: translateY(-1px);
text-decoration: none !important;
}
.model-chip:focus-visible {
outline: none;
box-shadow: 0 0 0 3px var(--focus-ring);
}
.model-chip code {
color: var(--text-strong) !important;
background: transparent !important;
}
.header-actions {
display: flex;
align-items: center;
gap: 10px;
color: var(--muted) !important;
font-size: 13px;
}
.status-dot {
width: 8px;
height: 8px;
border-radius: 99px;
background: var(--green);
box-shadow: 0 0 14px rgba(49, 196, 141, 0.65);
}
.intro-card {
margin: 0 0 16px 0;
padding: 14px 16px;
border-radius: 14px;
background: rgba(var(--panel-rgb), var(--card-alpha));
border: 1px solid var(--border-blue-soft);
box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight);
backdrop-filter: blur(8px);
}
.intro-card p {
margin: 0 0 8px 0;
line-height: 1.5;
}
.section-title {
margin: 0 0 8px 0;
color: var(--text-strong) !important;
font-size: 13px;
font-weight: 750;
letter-spacing: 0.01em;
}
.main-card,
.output-card,
.gradio-group {
background: rgba(var(--panel-rgb), var(--card-alpha)) !important;
border: 1px solid var(--border-blue) !important;
border-radius: 14px !important;
box-shadow: 0 22px 70px var(--shadow), inset 0 1px 0 var(--inset-highlight) !important;
backdrop-filter: blur(10px) saturate(1.18);
}
.main-card {
width: 100% !important;
box-sizing: border-box !important;
}
.output-card {
min-height: 720px !important;
max-height: 860px !important;
border-color: var(--border-orange-soft) !important;
width: 100% !important;
box-sizing: border-box !important;
overflow: hidden !important;
}
label,
.markdown,
.prose,
h1,
h2,
h3,
h4,
h5,
h6,
p,
span,
div {
color: var(--text) !important;
}
.section-title,
label > span,
.gradio-container label {
color: var(--text-strong) !important;
}
.secondary-note {
color: var(--muted) !important;
font-size: 12px;
line-height: 1.35;
}
textarea,
input[type="text"],
input[type="password"],
input[type="number"],
input[type="email"],
.cm-editor {
background: rgba(var(--input-rgb), var(--input-alpha)) !important;
color: var(--text) !important;
border-color: var(--border-input) !important;
}
textarea::placeholder,
input::placeholder {
color: var(--muted-2) !important;
}
textarea:focus,
input:focus,
.cm-editor.cm-focused {
border-color: var(--logo-blue) !important;
box-shadow: 0 0 0 3px var(--focus-ring) !important;
}
input[type="checkbox"] {
accent-color: var(--logo-blue) !important;
}
#schema-box .cm-editor {
min-height: 410px !important;
max-height: 480px !important;
background: var(--code-bg) !important;
}
.cm-editor,
.cm-scroller,
.cm-content,
.cm-line,
.cm-gutters,
.cm-activeLine,
.cm-activeLineGutter {
background: var(--code-bg) !important;
color: var(--text) !important;
}
.cm-gutters {
border-color: var(--border-blue-soft) !important;
color: var(--muted-2) !important;
}
.cm-cursor {
border-left-color: var(--text-strong) !important;
}
#image-box {
min-height: 335px !important;
background: var(--dropzone-bg) !important;
border-color: var(--border-blue-soft) !important;
}
#image-box,
#image-box *,
.upload-container,
.upload-container *,
.file-preview,
.file-preview * {
color: var(--text) !important;
}
#image-box button,
#image-box .icon-wrap,
#image-box .wrap {
background: transparent !important;
}
#reasoning-box {
min-height: 180px !important;
max-height: 240px !important;
overflow: auto !important;
padding: 8px;
border-radius: 8px;
background: rgba(var(--input-rgb), var(--input-alpha)) !important;
border: 1px solid var(--border-blue-soft);
white-space: pre-wrap !important;
overflow-wrap: anywhere !important;
word-break: break-word !important;
}
#output-box {
min-height: 360px !important;
max-height: 520px !important;
overflow: auto !important;
padding: 8px;
border-radius: 8px;
background: rgba(var(--input-rgb), var(--input-alpha)) !important;
border: 1px solid var(--border-blue-soft);
white-space: pre-wrap !important;
overflow-wrap: anywhere !important;
word-break: break-word !important;
}
#reasoning-box pre,
#reasoning-box code,
#output-box pre,
#output-box code {
white-space: pre-wrap !important;
overflow-wrap: anywhere !important;
word-break: break-word !important;
color: var(--text) !important;
background: transparent !important;
}
button {
border-radius: 9px !important;
min-height: 34px !important;
}
button.primary-button,
.primary-button button,
.primary-button {
background: var(--logo-blue) !important;
background-color: var(--logo-blue) !important;
color: var(--text-on-accent) !important;
border: none !important;
font-weight: 750 !important;
}
button.markdown-button,
.markdown-button button,
.markdown-button {
background: var(--logo-orange) !important;
background-color: var(--logo-orange) !important;
color: var(--text-on-accent) !important;
border: none !important;
font-weight: 750 !important;
}
.clear-button button,
button.clear-button,
.clear-button {
background: transparent !important;
background-color: transparent !important;
color: var(--muted) !important;
border: 1px solid var(--border-blue-soft) !important;
}
.gradio-container .wrap,
.gradio-container .block,
.gradio-container .form,
.gradio-container .panel,
.gradio-container .tabs,
.gradio-container .tabitem {
background: transparent !important;
color: var(--text) !important;
}
.gradio-accordion {
border-color: var(--border-blue-soft) !important;
}
.gradio-container table,
.gradio-container th,
.gradio-container td {
color: var(--text) !important;
}
.gradio-container label,
.gradio-container label span,
.gradio-container .label-wrap,
.gradio-container .label-wrap span {
color: var(--text-strong) !important;
}
@media (max-width: 1100px) {
.gradio-container {
width: 100% !important;
padding: 10px 12px 18px 12px !important;
}
.app-header {
align-items: flex-start;
flex-direction: column;
}
.brand-title {
align-items: flex-start;
flex-direction: column;
}
.model-chip {
max-width: 100%;
}
.output-card {
min-height: 520px !important;
max-height: none !important;
}
#reasoning-box {
min-height: 160px !important;
max-height: 220px !important;
}
#output-box {
min-height: 320px !important;
max-height: 480px !important;
}
}
""".replace("__LOGO_URL__", LOGO_URL or "")
# ---------------- Gradio app ----------------
with gr.Blocks(
title="NuExtract3",
css=CSS,
theme=gr.themes.Base(
primary_hue="blue",
secondary_hue="orange",
neutral_hue="slate",
),
) as demo:
logo_html = (
f'<img class="brand-mark" src="{LOGO_URL}" alt="NuExtract logo" />'
if LOGO_URL
else '<div class="brand-mark"></div>'
)
gr.HTML(
f"""
<header class="app-header">
<div class="brand">
{logo_html}
<div class="brand-title">
<div class="brand-name">NuExtract3</span></div>
<a
class="model-chip"
href="https://huggingface.co/numind/NuExtract3"
target="_blank"
rel="noopener noreferrer">
Model&nbsp;<code>{DEFAULT_MODEL}</code>
</a>
</div>
</div>
<div class="header-actions">
<span class="status-dot"></span>
<span>OpenAI-compatible endpoint</span>
</div>
</header>
"""
)
gr.Markdown(
"""
<div style="padding: 0.25rem 0 1rem 0;">
<h1 style="margin-top: 0; margin-bottom: 0.25rem;">NuExtract3</h1>
<p style="font-size: 1.05rem; line-height: 1.6;">
<strong>NuExtract3</strong> is a unified <strong>4B vision-language reasoning model</strong>
for document understanding.
</p>
<p style="line-height: 1.6;">
It combines <strong>structured information extraction</strong> with high-quality
<strong>image-to-Markdown conversion</strong>, making it useful for OCR, RAG preprocessing,
and extraction pipelines across scans, receipts, forms, invoices, contracts, tables, and more.
</p>
<h3>Features</h3>
<ul>
<li><strong>Structured extraction:</strong> text/images + JSON template + instructions → JSON output</li>
<li><strong>Markdown conversion:</strong> text/images → Markdown</li>
<li><strong>Multimodal inputs:</strong> text, images, or text + images</li>
<li><strong>Multilingual documents</strong></li>
<li><strong>Reasoning and non-reasoning inference modes</strong></li>
<li><strong>Template generation</strong> from natural language or input documents</li>
</ul>
<p align="center">
🤗 <a href="https://huggingface.co/numind/NuExtract3">Model</a>&nbsp;&nbsp; | &nbsp;&nbsp;
🖥️ <a href="https://nuextract.ai/">API / Platform</a>&nbsp;&nbsp; | &nbsp;&nbsp;
📑 <a href="https://numind.ai/blog">Blog</a>&nbsp;&nbsp; | &nbsp;&nbsp;
🗣️ <a href="https://discord.gg/3tsEtJNCDe">Discord</a>&nbsp;&nbsp; | &nbsp;&nbsp;
🛠️ <a href="https://github.com/numindai/nuextract">GitHub</a>
</p>
</div>
""",
elem_classes=["intro-card"],
)
# Main workspace: a two-column row. The left column collects the document,
# the schema/instructions and the run controls; the right column shows the
# model's reasoning and its final output.
with gr.Row(equal_height=True):
# Left: input, schema, controls
with gr.Column(scale=1, min_width=520):
with gr.Group(elem_classes="main-card"):
gr.HTML("<div class='section-title'>Input</div>")
# type="filepath": the handlers receive this value and forward it as
# `context_image_path` (presumably a path string -- confirm against Gradio docs).
context_image = gr.Image(
label="Image",
type="filepath",
height=340,
sources=["upload", "clipboard"],
elem_id="image-box",
)
# Optional pasted text; the handlers accept text, an image, or both.
context_text = gr.Textbox(
label="Text",
placeholder="Optional: paste document text.",
lines=3,
max_lines=5,
)
with gr.Group(elem_classes="main-card"):
gr.HTML("<div class='section-title'>Schema & instructions</div>")
instruction = gr.Textbox(
label="Instructions",
placeholder="Optional extraction instructions.",
lines=2,
max_lines=3,
)
with gr.Row(equal_height=True):
# JSON template editor, pre-filled with a small starter template that is
# serialized with json.dumps so the initial content is valid JSON.
template = gr.Code(
label="Template",
language="json",
value=json.dumps(
{
"title": "string",
"entities": ["string"],
"dates": ["YYYY-MM-DD"],
"amounts": [
{
"value": "number",
"currency": "string",
}
],
},
indent=4,
),
lines=16,
scale=5,
elem_id="schema-box",
)
with gr.Column(scale=2, min_width=150):
generate_template_btn = gr.Button(
"Generate template",
variant="secondary",
)
# NOTE(review): the user-facing note below contains the typo "formated"
# and mentions only images, although the template handler also accepts text.
gr.HTML(
"<div class='secondary-note'>"
"Use Extract for JSON. Use Markdown to convert an image document. Use generate template to generate a well formated template from an input image."
"</div>"
)
with gr.Group(elem_classes="main-card"):
gr.HTML("<div class='section-title'>Run</div>")
# First button row: the two inference actions.
with gr.Row():
extract_btn = gr.Button(
"Extract JSON",
variant="secondary",
elem_classes=["primary-button"],
)
markdown_btn = gr.Button(
"Convert to Markdown",
variant="secondary",
elem_classes=["markdown-button"],
)
# Second button row: cancel a running request / reset the result panes.
with gr.Row():
stop_btn = gr.Button("Stop", variant="stop")
clear_btn = gr.Button(
"Clear results",
variant="secondary",
elem_classes=["clear-button"],
)
# Reasoning is switched on by default.
reasoning_checkbox = gr.Checkbox(
label="Reasoning",
value=True,
interactive=True,
info="If enabled, reasoning is everything before </think>.",
)
# Sampling temperature: range 0.0 to 1 in steps of 0.05, starting at 0.0.
temperature = gr.Slider(
0.0,
1,
value=0.0,
step=0.05,
label="Temperature",
info="Higher values make the output less deterministic but can improve reasoning performance (around 0.4-0.6)",
)
# Structured examples fill the image, template and instruction fields.
# When the list is empty, setup instructions are shown instead.
with gr.Accordion("Structured examples", open=False):
if STRUCTURED_EXAMPLES:
gr.Examples(
examples=STRUCTURED_EXAMPLES,
inputs=[context_image, template, instruction],
label="Load structured example",
examples_per_page=8,
cache_examples=False,
)
else:
gr.Markdown(
f"""
No structured examples found.
Add files referenced in `STRUCTURED_EXAMPLE_TEMPLATES`, for example:
```text
{EXAMPLE_DIR}/1.jpg
{EXAMPLE_DIR}/2.png
```
"""
)
# Markdown examples only fill the image field.
# When the list is empty, setup instructions are shown instead.
with gr.Accordion("Markdown examples", open=False):
if MARKDOWN_EXAMPLES:
gr.Examples(
examples=MARKDOWN_EXAMPLES,
inputs=[context_image],
label="Load Markdown example",
examples_per_page=8,
cache_examples=False,
)
else:
gr.Markdown(
f"""
No Markdown examples found.
Add image paths to `MARKDOWN_EXAMPLE_IMAGE_PATHS`, for example:
```python
MARKDOWN_EXAMPLE_IMAGE_PATHS = [
"markdown_1.png",
"markdown_2.jpg",
"/home/user/app/examples/report.png",
]
```
Relative paths are resolved from:
```text
{EXAMPLE_DIR}
```
"""
)
# Endpoint settings are intentionally hidden from the UI.
# They are kept in gr.State so they can still be passed to the handlers as inputs.
api_base = gr.State(DEFAULT_API_BASE)
api_key = gr.State(DEFAULT_API_KEY)
system_prompt = gr.State(SYSTEM_PROMPT_DEFAULT)
# Right: reasoning + output
with gr.Column(scale=1, min_width=520):
with gr.Group(elem_classes="output-card"):
gr.HTML("<div class='section-title'>Reasoning</div>")
reasoning_md = gr.Markdown(
label="Reasoning",
elem_id="reasoning-box",
)
gr.HTML("<div class='section-title' style='margin-top: 12px;'>Output</div>")
output_md = gr.Markdown(
label="Output",
elem_id="output-box",
)
# Starts hidden; the handlers make it visible when they report an error.
error_box = gr.Markdown(visible=False)
def run_model_click(
    api_key_val,
    api_base_val,
    system_prompt_val,
    instruction_val,
    template_val,
    context_text_val,
    context_image_val,
    temperature_val,
    reasoning_val,
    markdown_mode_val,
):
    """Validate the request, then stream model results to the result panes.

    Shared generator behind ``on_extract_click`` and ``on_markdown_click``.
    Each yielded value is a 3-tuple of ``gr.update`` objects in the order
    (reasoning pane, output pane, error box).

    Args:
        api_key_val: Forwarded to ``infer_stream`` as ``api_key``.
        api_base_val: Forwarded to ``infer_stream`` as ``api_base``.
        system_prompt_val: Forwarded to ``infer_stream`` as ``system_prompt``.
        instruction_val: Forwarded to ``infer_stream`` as ``instruction``.
        template_val: Forwarded to ``infer_stream`` as ``template``.
        context_text_val: Pasted document text; may be empty or ``None``.
        context_image_val: Document image value; may be empty or ``None``.
        temperature_val: Forwarded to ``infer_stream`` as ``temperature``.
        reasoning_val: Coerced to ``bool`` and forwarded as ``reasoning``.
        markdown_mode_val: Truthy for Markdown conversion, falsy for
            structured extraction. Markdown conversion requires an image.

    Yields:
        One tuple per streamed result, or a single error tuple when the
        input is invalid, the stream is empty, or an exception is raised.
        Exceptions are not propagated: the traceback is printed and shown
        in the error box instead.
    """

    def fenced(text):
        """Wrap *text* in a ``text`` code fence that the text cannot close."""
        text = str(text)
        # A fixed three-backtick fence is terminated early by any fence the
        # model (or an error message) emits itself; the remainder would then
        # render as live Markdown. Use a fence longer than every backtick run.
        longest_run = max((len(run) for run in re.findall(r"`+", text)), default=0)
        fence = "`" * max(3, longest_run + 1)
        return f"{fence}text\n{text}\n{fence}"

    def error_updates(body):
        """Build the tuple that blanks both panes and shows *body* as an error."""
        return (
            gr.update(value=""),
            gr.update(value=""),
            gr.update(visible=True, value=f"### Error\n{body}"),
        )

    mode_name = "Markdown" if markdown_mode_val else "Extract"
    print(f"[button] {mode_name} clicked", flush=True)
    print(f"[button] image={context_image_val}", flush=True)
    print(f"[button] text_len={len(context_text_val or '')}", flush=True)
    print(f"[button] reasoning={bool(reasoning_val)}", flush=True)

    # Input validation: report the problem in the UI and stop before any
    # request is sent to the model endpoint.
    if markdown_mode_val and not context_image_val:
        yield error_updates("Markdown conversion requires a document image.")
        return

    if not context_image_val and not (context_text_val or "").strip():
        yield error_updates("Please provide a document image or paste document text.")
        return

    try:
        yielded_anything = False
        for res in infer_stream(
            api_key=api_key_val,
            api_base=api_base_val,
            system_prompt=system_prompt_val,
            template=template_val,
            instruction=instruction_val,
            context_text=context_text_val,
            context_image_path=context_image_val,
            temperature=temperature_val,
            reasoning=bool(reasoning_val),
            markdown_mode=bool(markdown_mode_val),
        ):
            yielded_anything = True
            think = res.get("think") or ""
            output = res.get("output") or "_(Empty output.)_"
            yield (
                gr.update(value=fenced(think) if think else ""),
                gr.update(value=output),
                gr.update(visible=False, value=""),
            )

        # An empty stream would otherwise leave the UI unchanged with no
        # explanation, so surface it as an explicit error.
        if not yielded_anything:
            yield error_updates("The model returned no streamed output.")
    except Exception:
        # UI boundary: log the full traceback server-side and show it in the
        # error box rather than letting the event fail.
        import traceback

        tb = traceback.format_exc()
        print(tb, flush=True)
        yield error_updates(fenced(tb))
def on_extract_click(
    api_key_val,
    api_base_val,
    system_prompt_val,
    instruction_val,
    template_val,
    context_text_val,
    context_image_val,
    temperature_val,
    reasoning_val,
):
    """Handle the extract button: run the shared pipeline with Markdown mode off.

    All arguments are handed to ``run_model_click`` unchanged, and every
    update tuple it produces is re-yielded as-is.
    """
    yield from run_model_click(
        api_key_val=api_key_val,
        api_base_val=api_base_val,
        system_prompt_val=system_prompt_val,
        instruction_val=instruction_val,
        template_val=template_val,
        context_text_val=context_text_val,
        context_image_val=context_image_val,
        temperature_val=temperature_val,
        reasoning_val=reasoning_val,
        markdown_mode_val=False,
    )
def on_markdown_click(
    api_key_val,
    api_base_val,
    system_prompt_val,
    instruction_val,
    template_val,
    context_text_val,
    context_image_val,
    temperature_val,
    reasoning_val,
):
    """Handle the Markdown button: run the shared pipeline with Markdown mode on.

    All arguments are handed to ``run_model_click`` unchanged, and every
    update tuple it produces is re-yielded as-is.
    """
    yield from run_model_click(
        api_key_val=api_key_val,
        api_base_val=api_base_val,
        system_prompt_val=system_prompt_val,
        instruction_val=instruction_val,
        template_val=template_val,
        context_text_val=context_text_val,
        context_image_val=context_image_val,
        temperature_val=temperature_val,
        reasoning_val=reasoning_val,
        markdown_mode_val=True,
    )
def on_click_generate_template(
    api_key_val,
    api_base_val,
    system_prompt_val,
    context_text_val,
    context_image_val,
    temperature_val,
):
    """Handle the generate-template button.

    Returns a pair of ``gr.update`` objects: the first targets the template
    editor, the second the error box. On success the editor receives the
    generated template and the error box is hidden. On missing input or on
    any exception the editor is left untouched and the error box is shown.
    """
    print("[button] Generate template clicked", flush=True)

    # At least one of image / non-blank text is required.
    if not (context_image_val or (context_text_val or "").strip()):
        missing_input = gr.update(
            visible=True,
            value="### Error\nPlease provide a document image or paste document text.",
        )
        return gr.update(), missing_input

    request = {
        "api_key": api_key_val,
        "api_base": api_base_val,
        "system_prompt": system_prompt_val,
        "context_text": context_text_val,
        "context_image_path": context_image_val,
        "temperature": temperature_val,
    }
    try:
        generated = infer_template_generation(**request)
        return gr.update(value=generated), gr.update(visible=False, value="")
    except Exception:
        # UI boundary: log the traceback and show it instead of raising.
        import traceback

        trace_text = traceback.format_exc()
        print(trace_text, flush=True)
        failure = gr.update(
            visible=True,
            value=f"### Error\n```text\n{trace_text}\n```",
        )
        return gr.update(), failure
def on_clear():
    """Reset the result area: blank both panes and hide the error box."""
    blank_reasoning = gr.update(value="")
    blank_output = gr.update(value="")
    hidden_error = gr.update(visible=False, value="")
    return blank_reasoning, blank_output, hidden_error
# Inputs shared by the extract and Markdown handlers. The order matches the
# positional parameters of on_extract_click / on_markdown_click.
common_inputs = [
api_key,
api_base,
system_prompt,
instruction,
template,
context_text,
context_image,
temperature,
reasoning_checkbox,
]
# Targets of the 3-tuples produced by the handlers: reasoning, output, error.
common_outputs = [
reasoning_md,
output_md,
error_box,
]
# The two inference events are kept in variables so the Stop button can cancel them.
extract_event = extract_btn.click(
fn=on_extract_click,
inputs=common_inputs,
outputs=common_outputs,
show_progress=True,
)
markdown_event = markdown_btn.click(
fn=on_markdown_click,
inputs=common_inputs,
outputs=common_outputs,
show_progress=True,
)
# Stop runs no callback of its own; it only cancels the two events above.
# The template-generation event is not in the cancel list.
stop_btn.click(
fn=None,
inputs=None,
outputs=None,
cancels=[extract_event, markdown_event],
)
# Clear takes no inputs and resets the same three result components.
clear_btn.click(
fn=on_clear,
inputs=None,
outputs=common_outputs,
)
# Template generation writes its result into the template editor and reports
# failures through the shared error box. Input order matches the positional
# parameters of on_click_generate_template.
generate_template_btn.click(
fn=on_click_generate_template,
inputs=[
api_key,
api_base,
system_prompt,
context_text,
context_image,
temperature,
],
outputs=[
template,
error_box,
],
show_progress=True,
)
if __name__ == "__main__":
    # Collect the asset and example directories that exist on disk; they are
    # handed to launch() as allowed_paths (None when neither exists).
    allowed_paths = [
        str(directory)
        for directory in (ASSETS_DIR, EXAMPLE_DIR)
        if directory.exists()
    ]
    launch_options = {
        "share": ARGS.share,
        "server_name": ARGS.server_name,
        "server_port": ARGS.server_port,
        "show_error": True,
        "allowed_paths": allowed_paths or None,
    }
    demo.queue().launch(**launch_options)