import argparse
import base64
import io
import json
import os
import re
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Tuple

import gradio as gr
from openai import OpenAI
from PIL import Image

# ---------------- Paths ----------------
APP_DIR = Path(__file__).resolve().parent


# ---------------- CLI / environment configuration ----------------
def _env_flag(name: str, default: str = "false") -> bool:
    """Return True when environment variable *name* is "1", "true" or "yes" (case-insensitive)."""
    return os.environ.get(name, default).lower() in {"1", "true", "yes"}


def parse_args() -> argparse.Namespace:
    """Parse the demo's command-line options, using environment variables as defaults.

    Every option falls back to an environment variable and then to a built-in
    default. Unknown command-line arguments are ignored (``parse_known_args``).

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description="NuExtract Gradio demo")
    parser.add_argument(
        "--model-name",
        default=os.environ.get("MODEL_NAME", "numind/NuExtract3"),
        help="Model name served by the OpenAI-compatible endpoint.",
    )
    parser.add_argument(
        "--api-base",
        default=os.environ.get("OPENAI_API_BASE", "http://127.0.0.1:8000/v1"),
        help="OpenAI-compatible base URL.",
    )
    parser.add_argument(
        "--api-key",
        default=os.environ.get("OPENAI_API_KEY", "EMPTY"),
        help="API key for the OpenAI-compatible endpoint.",
    )
    parser.add_argument(
        "--server-name",
        default=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
        help="Gradio server host.",
    )
    # The integer defaults are handed to argparse as raw strings: argparse runs
    # string defaults through ``type``, so a malformed environment value is
    # reported as a normal usage error instead of a ValueError traceback raised
    # while the parser is still being built.
    parser.add_argument(
        "--server-port",
        type=int,
        default=os.environ.get("GRADIO_SERVER_PORT", "7860"),
        help="Gradio server port.",
    )
    parser.add_argument(
        "--share",
        action="store_true",
        default=_env_flag("GRADIO_SHARE"),
        help="Create a public Gradio share link.",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=os.environ.get("NUEXTRACT_MAX_TOKENS", "10000"),
        help="Maximum tokens for model generation. Hidden from the UI.",
    )
    parser.add_argument(
        "--example-dir",
        default=os.environ.get("NUEXTRACT_EXAMPLE_DIR", str(APP_DIR / "examples")),
        help="Directory containing image examples.",
    )
    parser.add_argument(
        "--assets-dir",
        default=os.environ.get("NUEXTRACT_ASSETS_DIR", str(APP_DIR / "assets")),
        help="Directory containing static assets such as the NuExtract logo.",
    )
    args, _ = parser.parse_known_args()
    return args


def resolve_dir(path_like: str) -> Path:
    """Return *path_like* as a resolved path; relative paths are anchored at ``APP_DIR``."""
    path = Path(path_like).expanduser()
    if path.is_absolute():
        return path.resolve()
    return (APP_DIR / path).resolve()


ARGS = parse_args()

DEFAULT_MODEL = ARGS.model_name
DEFAULT_API_BASE = ARGS.api_base
DEFAULT_API_KEY = ARGS.api_key
DEFAULT_MAX_TOKENS = ARGS.max_tokens

EXAMPLE_DIR = resolve_dir(ARGS.example_dir)
ASSETS_DIR = resolve_dir(ARGS.assets_dir)
LOGO_PATH = ASSETS_DIR / "logo_numind_picto.svg"

if LOGO_PATH.exists():
    LOGO_URL = f"/gradio_api/file={LOGO_PATH}"
    # The assets directory is registered with Gradio so the logo URL above can be served.
    gr.set_static_paths(paths=[ASSETS_DIR])
else:
    print(f"[assets] Missing logo: {LOGO_PATH}", flush=True)
    LOGO_URL = ""

SYSTEM_PROMPT_DEFAULT = (
    "You are a precise information extraction assistant. "
    "Return faithful, source-grounded results only."
)

# ---------------- Structured extraction examples ----------------
# These examples populate: Image + Template + Instructions.
STRUCTURED_EXAMPLE_TEMPLATES: Dict[str, Dict[str, Any]] = {
    # Movie poster.
    "1.jpg": {
        "movie_name": "verbatim-string",
        "tagline": "verbatim-string",
        "language": "string",
        "motion_picture_association_rating": [
            "G - General Audiences",
            "PG - Parental Guidance Suggested",
            "PG-13 – Parents Strongly Cautioned",
            "R – Restricted",
            "NC-17 – Adults Only",
            "not provided",
        ],
        "movie_distribution_company": "verbatim-string",
        "movie_production_company": "verbatim-string",
        "theatre_release_date": "date-time",
        "movie_website_address": "verbatim-string",
        "movie_director_name": "verbatim-string",
        "actors_names": ["verbatim-string"],
        "staff": [
            {
                "staff_name": "verbatim-string",
                "staff_role": [
                    "director",
                    "co-director",
                    "screenwriter",
                    "author",
                    "cinematographer",
                    "costume designer",
                    "production designer",
                    "set designer",
                    "animator",
                    "color designer",
                    "art director",
                    "animation director",
                    "vfx director",
                    "voice actor",
                    "composer",
                    "songwriter",
                    "music performer",
                    "music supervisor",
                    "choreographer",
                    "casting director",
                    "editor",
                    "producer",
                    "co-producer",
                    "associate producer",
                    "executive producer",
                    "co-executive producer",
                    "line producer",
                ],
            },
        ],
        "reviews": [
            {
                "critic_name": "verbatim-string",
                "review_comment": "verbatim-string",
            },
        ],
        "technologies": [
            [
                "Dolby Stereo",
                "Dolby Digital",
                "Dolby Stereo Digital",
                "Dolby Atmos",
                "Dolby Vision",
                "Dolby Cinema",
                "DTS",
                "SDDS",
                "IMAX",
                "4DX",
            ],
        ],
    },
    # Floor plan.
    "2.png": {
        "number_of_bathrooms": "integer",
        "number_of_fireplaces": "integer",
        "distance_unit": ["meter", "foot"],
        "rooms_that_are_not_bedrooms_or_corridors_or_toilets": [
            {
                "room_name": "verbatim-string",
                "surface_area": "number",
            },
        ],
        "bedrooms": [
            {
                "bedroom_name": "verbatim-string",
                "surface_area": "number",
                "number_of_windows": "integer",
                "has_private_bathroom": "boolean",
            },
        ],
        "has_laundry_room": "boolean",
        "has_terrace": "boolean",
        "has_balcony": "boolean",
        "number_of_parking_spaces_in_garage": "integer",
        "number_of_parking_spaces_exterior": "integer",
    },
    # Invoice.
    "8.png": {
        "invoice_number": "verbatim-string",
        "issuer_name": "verbatim-string",
        "recipient_name": "verbatim-string",
        "issuer_location": {
            "street_number": "verbatim-string",
            "street_name": "verbatim-string",
            "city": "verbatim-string",
            "zip_code": "string",
            "country": "string",
        },
        "date_of_issue": "date-time",
        "date_due": "date-time",
        "currency_code_iso4217": "string",
        "items": [
            {
                "item_name": "verbatim-string",
                "item_quantity": "number",
                "item_price_per_unit": "number",
                "item_total_price": "number",
            },
        ],
        "total_discount_amount": "number",
        "total_fee_amount": "number",
        "total_tax_amount": "number",
        "total_price_net": "number",
        "total_price_gross": "number",
    },
    # Festival poster.
    "18.jpg": {
        "festival_name": "verbatim-string",
        "website_url": "url",
        "location": {
            "city": "string",
            "zip_code": "string",
            "country": "country",
        },
        "date_first_day": "date",
        "date_last_day": "date",
        "lineup_entry": [
            {
                "artist_or_group_name": "string",
                "artist_entity_type": [
                    "individual",
                    "band",
                    "ensemble",
                    "b2b",
                    "project_or_collaboration",
                    "other",
                ],
                "performing_stage_name": "verbatim-string",
                "is_headliner": "boolean",
                "day_playing": "date",
            },
        ],
        "sponsors": [
            {
                "name": "verbatim-string",
                "type": [
                    "press",
                    "tv",
                    "bank",
                    "insurance",
                    "beverage company",
                    "car company",
                    "technology company",
                    "clothing company",
                    "transportation",
                    "public institution",
                    "other",
                ],
            },
        ],
    },
    # Parts list.
    "17.png": {
        "parts": [
            {
                "name": "verbatim-string",
                "id": "verbatim-string",
                "details": "verbatim-string",
            },
        ],
    },
    # Application form with bank and trade references.
    "16.jpeg": {
        "Applicant": {
            "Name": "verbatim-string",
            "Registration no": "verbatim-string",
            "Holding compagny": "verbatim-string",
            "VAT Registration no": "verbatim-string",
            "Date of creation": "date",
            "Type of entity": "verbatim-string",
            "Location": {
                "Street number": "verbatim-string",
                "Street name": "verbatim-string",
                "City": "verbatim-string",
                "Zip code": "string",
                "Country": "country",
            },
            "Website": "url",
            "Phone": "phone-number",
            "Email": "email-address",
        },
        "Bank Reference": {
            "Bank name": "verbatim-string",
            "Account name": "verbatim-string",
            "Account no": "integer",
            "Importer code": "verbatim-string",
        },
        "Trades references": [
            {
                "Company name": "verbatim-string",
                "Account opened since": "date",
                "Tel": "phone-number",
                "Email": "email-address",
                "Location": {
                    "Street number": "verbatim-string",
                    "Street name": "verbatim-string",
                    "City": "verbatim-string",
                    "Zip code": "string",
                    "Country": "country",
                },
                "Credit limit": "string",
            },
        ],
        "Is document signed": "boolean",
        "Date of signature": "date",
    },
}

# Optional per-example instruction text, keyed by the same file names as the
# templates above; every entry is currently empty.
STRUCTURED_EXAMPLE_INSTRUCTIONS: Dict[str, str] = {
    "1.jpg": "",
    "2.png": "",
    "8.png": "",
    "18.jpg": "",
    "17.png": "",
    "16.jpeg": "",
}

# ---------------- Markdown/OCR examples ----------------
# Put Markdown example image paths here.
# These examples populate only the Image input and are meant for the
# “Convert to Markdown” button.
MARKDOWN_EXAMPLE_IMAGE_PATHS: List[str] = [
    "3.jpg",
    "4.jpg",
    "5.jpg",
    "6.png",
    "7.jpg",
    "9.jpg",
    "10.png",
    "11.png",
    "12.jpg",
    "14.jpg",
    "15.jpg",
]


def resolve_example_path(path_like: str) -> Path:
    """Resolve an example image path; relative paths are looked up under ``EXAMPLE_DIR``."""
    candidate = Path(path_like).expanduser()
    anchored = candidate if candidate.is_absolute() else EXAMPLE_DIR / candidate
    return anchored.resolve()


def build_structured_examples() -> List[List[Any]]:
    """Build ``[image path, template JSON, instructions]`` rows for the structured examples.

    Templates whose image file does not exist are reported on stdout and left out.
    """
    rows: List[List[Any]] = []
    for name, schema in STRUCTURED_EXAMPLE_TEMPLATES.items():
        location = resolve_example_path(name)
        if location.exists():
            template_text = json.dumps(schema, indent=4, ensure_ascii=False)
            instructions = STRUCTURED_EXAMPLE_INSTRUCTIONS.get(name, "")
            rows.append([str(location), template_text, instructions])
        else:
            print(f"[structured examples] Missing image: {location}", flush=True)
    return rows


def build_markdown_examples() -> List[List[Any]]:
    """Build one-element ``[image path]`` rows for the Markdown examples.

    Paths that do not exist are reported on stdout and left out.
    """
    rows: List[List[Any]] = []
    for entry in MARKDOWN_EXAMPLE_IMAGE_PATHS:
        location = resolve_example_path(entry)
        if location.exists():
            rows.append([str(location)])
        else:
            print(f"[markdown examples] Missing image: {location}", flush=True)
    return rows


STRUCTURED_EXAMPLES = build_structured_examples()
MARKDOWN_EXAMPLES = build_markdown_examples()
# ---------------- Utility helpers ----------------
def image_bytes_to_base64(b: bytes) -> str:
    """Return *b* encoded as a base64 text string."""
    return base64.b64encode(b).decode("utf-8")


def ensure_rgb_image(image_bytes: bytes) -> "Image.Image":
    """Decode *image_bytes* and return the picture in RGB mode.

    Images that carry transparency are flattened onto a white background
    first: a plain ``convert("RGB")`` just drops the alpha channel, which
    typically turns transparent page backgrounds black once the image is
    re-encoded as JPEG.
    """
    img = Image.open(io.BytesIO(image_bytes))
    if img.mode == "RGB":
        return img
    has_alpha = img.mode in {"RGBA", "LA", "PA"} or "transparency" in img.info
    if has_alpha:
        rgba = img.convert("RGBA")
        background = Image.new("RGB", rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.getchannel("A"))
        return background
    return img.convert("RGB")


def file_path_to_bytes(path: str) -> bytes:
    """Read the file at *path* and return its raw bytes."""
    with open(path, "rb") as f:
        return f.read()


# ---------------- Response parsing ----------------
# Tag that separates the model's reasoning from its final output.
# NOTE(review): "</think>" is assumed from the `enable_thinking` chat-template
# flag used below; confirm against the served model's chat template.
THINK_END_TAG = "</think>"

_CODE_FENCE_RE = re.compile(
    r"^```(?:json|markdown|text)?\s*|\s*```$",
    flags=re.IGNORECASE | re.MULTILINE,
)
# NOTE(review): the "<answer>" tag name is assumed from the function name
# `extract_answer_block`; confirm against real model output. When the tags are
# absent the JSON-object fallback below is used instead.
_ANSWER_BLOCK_RE = re.compile(
    r"<answer>\s*(.*?)\s*</answer>",
    flags=re.DOTALL | re.IGNORECASE,
)
# Greedy on purpose: spans from the first "{" to the last "}" in the text.
_JSON_OBJECT_RE = re.compile(r"\{[\s\S]*\}")


def strip_code_fence(payload: str) -> str:
    """Remove Markdown code-fence markers (```json / ```markdown / ```text / ```) from *payload*."""
    return _CODE_FENCE_RE.sub("", payload.strip()).strip()


def pretty_json_or_text(payload: str) -> str:
    """Return *payload* pretty-printed as JSON, or fence-stripped text if it is not JSON.

    An empty or ``None`` payload yields ``""``.
    """
    if not payload:
        return ""
    cleaned = strip_code_fence(payload)
    try:
        return json.dumps(json.loads(cleaned), indent=4, ensure_ascii=False)
    except (ValueError, RecursionError):
        # Not parseable as JSON (or absurdly nested): show the text as-is.
        return cleaned


def extract_answer_block(text: str) -> str:
    """Extract the answer payload from a model response.

    Preference order:
      1. the content of an ``<answer>...</answer>`` block,
      2. the span from the first ``{`` to the last ``}``,
      3. the stripped text itself.

    The first two are passed through :func:`pretty_json_or_text`.
    """
    if not text:
        return ""
    tagged = _ANSWER_BLOCK_RE.search(text)
    if tagged:
        return pretty_json_or_text(tagged.group(1).strip())
    json_match = _JSON_OBJECT_RE.search(text)
    if json_match:
        return pretty_json_or_text(json_match.group(0))
    return text.strip()


def split_reasoning_and_output(
    text: str,
    reasoning_enabled: bool,
    end_tag: str = THINK_END_TAG,
) -> Tuple[str, str]:
    """Split a (possibly partial) response into ``(reasoning, output)``.

    Args:
        text: Response text accumulated so far.
        reasoning_enabled: When False the whole text is treated as output.
        end_tag: Marker that closes the reasoning section; matched
            case-insensitively.

    Returns:
        ``(reasoning, output)``, both stripped. While *end_tag* has not been
        seen yet, everything is reasoning and the output is ``""``.
    """
    if not text:
        return "", ""
    if not reasoning_enabled:
        return "", text.strip()
    if not end_tag:
        # An empty marker would "match" at index 0 and hide all reasoning.
        return text.strip(), ""
    # Search the original text directly: indices taken from ``text.lower()``
    # are unreliable because lowercasing can change the string length.
    found = re.search(re.escape(end_tag), text, flags=re.IGNORECASE)
    if found is None:
        return text.strip(), ""
    return text[: found.start()].strip(), text[found.end():].strip()


# ---------------- Message building ----------------
def make_text_content(text: str) -> List[Dict[str, Any]]:
    """Wrap *text* (``None`` becomes ``""``) in a single text content part."""
    return [{"type": "text", "text": text or ""}]


def make_image_content(
    image_bytes: bytes,
    extra_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Build message content holding the image and, optionally, trailing text.

    The image is converted to RGB, re-encoded as JPEG (quality 95) and embedded
    as a base64 ``data:`` URL. Non-blank *extra_text* is appended, stripped, as
    a second text part.
    """
    img = ensure_rgb_image(image_bytes)
    buffer = io.BytesIO()
    img.save(buffer, format="JPEG", quality=95)
    img_b64 = image_bytes_to_base64(buffer.getvalue())
    content: List[Dict[str, Any]] = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{img_b64}",
                "detail": "high",
            },
        }
    ]
    if extra_text and extra_text.strip():
        content.append({"type": "text", "text": extra_text.strip()})
    return content


def normalize_template(template: str) -> str:
    """Return *template* as indented JSON text.

    Blank input becomes ``"{}"``; text that is not valid JSON is returned
    stripped but otherwise unchanged.
    """
    tpl = (template or "").strip()
    if not tpl:
        return "{}"
    try:
        return json.dumps(json.loads(tpl), indent=4, ensure_ascii=False)
    except (ValueError, RecursionError):
        return tpl


def collate_single_input(
    *,
    text_or_image: Any,
    template: str,
    system_prompt: Optional[str],
    instruction: Optional[str],
) -> Tuple[List[Dict[str, Any]], str]:
    """Build the chat messages for one extraction request.

    Args:
        text_or_image: Either a dict with a ``"bytes"`` key (image input) or
            anything else, which is converted to text.
        template: Extraction template as JSON text; may be blank.
        system_prompt: Optional system message.
        instruction: Optional free-form extraction instructions.

    Returns:
        ``(messages, template_json)`` where *template_json* is the normalized
        template text.
    """
    is_image_input = isinstance(text_or_image, dict) and "bytes" in text_or_image

    messages: List[Dict[str, Any]] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    template_json = normalize_template(template)

    # Instructions and a non-empty template are also spelled out in the user turn.
    extra_parts = []
    if instruction and instruction.strip():
        extra_parts.append(f"Instructions:\n{instruction.strip()}")
    if template_json and template_json.strip() not in {"{}", ""}:
        extra_parts.append(f"Extraction template:\n```json\n{template_json}\n```")
    extra_text_for_user = "\n\n".join(extra_parts) if extra_parts else None

    if is_image_input:
        user_content = make_image_content(
            image_bytes=text_or_image["bytes"],
            extra_text=extra_text_for_user,
        )
    else:
        text = str(text_or_image or "")
        if extra_text_for_user:
            text = f"{text}\n\n{extra_text_for_user}".strip()
        user_content = make_text_content(text)

    messages.append({"role": "user", "content": user_content})
    return messages, template_json


def collate_for_template_generation(
    *,
    context_text: str,
    context_image_path: Optional[str],
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build the chat messages asking the model to propose an extraction template.

    The image at *context_image_path* is used when given; otherwise
    *context_text* is sent. Either way the fixed guidance text is appended.
    """
    messages: List[Dict[str, Any]] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    guidance = (
        "Generate a concise JSON extraction template for this document. "
        "Use descriptive field names and simple type hints like string, number, YYYY-MM-DD, "
        "boolean, or arrays of objects. Return only the JSON template."
    )

    if context_image_path:
        user_content = make_image_content(
            image_bytes=file_path_to_bytes(context_image_path),
            extra_text=guidance,
        )
    else:
        text = (context_text or "").strip()
        user_content = make_text_content(f"{text}\n\n{guidance}".strip())

    messages.append({"role": "user", "content": user_content})
    return messages


def collate_markdown_image_only(
    *,
    image_bytes: bytes,
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build the chat messages for Markdown conversion: the image alone, no extra text."""
    messages: List[Dict[str, Any]] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append(
        {
            "role": "user",
            "content": make_image_content(image_bytes=image_bytes),
        }
    )
    return messages


# ---------------- Model calls ----------------
def chunk_to_text(chunk: Any) -> str:
    """Return the text carried by one streaming chunk, or ``""`` if there is none.

    Handles ``delta.content`` given as a plain string or as a list of parts
    (dicts or objects exposing ``text``).
    """
    try:
        if not chunk or not getattr(chunk, "choices", None):
            return ""
        delta = getattr(chunk.choices[0], "delta", None)
        if delta is None:
            return ""
        content = getattr(delta, "content", None)
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts: List[str] = []
            for item in content:
                if isinstance(item, dict) and item.get("text"):
                    parts.append(item["text"])
                elif getattr(item, "text", None):
                    parts.append(item.text)
            return "".join(parts)
    except Exception:
        # Best-effort: a malformed chunk contributes no text rather than
        # aborting the whole stream.
        return ""
    return ""


def build_chat_template_kwargs(
    *,
    template_json: str,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Dict[str, Any]:
    """Build the ``chat_template_kwargs`` payload for a request.

    Mode is ``"markdown"`` in Markdown mode, ``"structured"`` when a non-empty
    template is given, and ``"content"`` otherwise. The template and stripped
    instructions are included only outside Markdown mode.
    """
    if markdown_mode:
        return {
            "mode": "markdown",
            "enable_thinking": bool(reasoning),
        }

    use_structured = bool(
        template_json and template_json.strip() and template_json.strip() != "{}"
    )
    chat_kwargs: Dict[str, Any] = {
        "mode": "structured" if use_structured else "content",
        "enable_thinking": bool(reasoning),
    }
    if use_structured:
        chat_kwargs["template"] = template_json
    if instruction and instruction.strip():
        chat_kwargs["instructions"] = instruction.strip()
    return chat_kwargs


def call_model_stream(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    template_json: str,
    temperature: float,
    max_tokens: int,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Iterator[str]:
    """Stream a chat completion, yielding the accumulated text after every non-empty chunk.

    The chat-template options are sent via ``extra_body["chat_template_kwargs"]``.
    """
    client = OpenAI(base_url=api_base, api_key=api_key)
    chat_kwargs = build_chat_template_kwargs(
        template_json=template_json,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    )
    stream = client.chat.completions.create(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        stream=True,
        extra_body={"chat_template_kwargs": chat_kwargs},
    )
    accumulated = ""
    try:
        for chunk in stream:
            delta_text = chunk_to_text(chunk)
            if delta_text:
                accumulated += delta_text
                yield accumulated
    finally:
        # Also runs when the consumer abandons the generator early, so the
        # underlying HTTP response is released promptly.
        close = getattr(stream, "close", None)
        if callable(close):
            close()


def call_model_once(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    mode: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Run one non-streaming chat completion with thinking disabled and return its text.

    Returns ``""`` when the response has no choices or no message content.
    """
    client = OpenAI(base_url=api_base, api_key=api_key)
    chat = client.chat.completions.create(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        extra_body={
            "chat_template_kwargs": {
                "mode": mode,
                "enable_thinking": False,
            }
        },
    )
    if not chat.choices:
        return ""
    # ``message.content`` may be None; keep the declared ``str`` return type.
    return chat.choices[0].message.content or ""


# ---------------- Inference orchestration ----------------
def prepare_input(context_text: str, context_image_path: Optional[str]) -> Any:
    """Return ``{"bytes": ...}`` for an image path, otherwise the text (``None`` becomes ``""``).

    When both are supplied the image takes precedence.
    """
    if context_image_path:
        return {"bytes": file_path_to_bytes(context_image_path)}
    return context_text or ""


def _waiting_placeholder(reasoning: bool, idle_message: str) -> str:
    """Return the placeholder shown while no output is available yet."""
    if reasoning:
        return f"_(Waiting for output after `{THINK_END_TAG}`.)_"
    return idle_message


def _format_structured_display(output_text: str, reasoning: bool) -> str:
    """Render structured-mode output: fenced JSON when it looks like JSON, else plain text."""
    answer = extract_answer_block(output_text)
    display = answer or _waiting_placeholder(reasoning, "_(No output found yet.)_")
    if display.strip().startswith(("{", "[")):
        return f"```json\n{pretty_json_or_text(display)}\n```"
    # Turn literal backslash-n sequences into real line breaks for display.
    return display.replace("\\n", "\n")


def infer_stream(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    template: str,
    instruction: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
    reasoning: bool,
    markdown_mode: bool,
) -> Iterator[Dict[str, str]]:
    """Run an extraction or Markdown conversion and stream display-ready updates.

    Yields:
        Dicts with keys ``"mode"`` (``"markdown"`` or ``"structured"``),
        ``"output"`` (text to display) and ``"think"`` (reasoning so far, empty
        when reasoning is disabled).

    Raises:
        ValueError: If *markdown_mode* is set but no image was provided.
    """
    single_input = prepare_input(context_text, context_image_path)
    is_image = isinstance(single_input, dict) and "bytes" in single_input

    if markdown_mode:
        if not is_image:
            raise ValueError("Markdown conversion requires an image input.")
        messages = collate_markdown_image_only(
            image_bytes=single_input["bytes"],
            system_prompt=system_prompt,
        )
        template_json = ""
    else:
        messages, template_json = collate_single_input(
            text_or_image=single_input,
            template=template,
            system_prompt=system_prompt,
            instruction=instruction,
        )

    for partial_text in call_model_stream(
        api_base=api_base,
        api_key=api_key,
        model_name=DEFAULT_MODEL,
        messages=messages,
        template_json=template_json,
        temperature=temperature,
        max_tokens=DEFAULT_MAX_TOKENS,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    ):
        trace, output_text = split_reasoning_and_output(
            partial_text,
            reasoning_enabled=reasoning,
        )
        think = trace if reasoning else ""

        if markdown_mode:
            yield {
                "mode": "markdown",
                "output": output_text
                or _waiting_placeholder(reasoning, "_(Empty output.)_"),
                "think": think,
            }
            continue

        if not reasoning:
            output_text = partial_text or ""

        yield {
            "mode": "structured",
            "output": _format_structured_display(output_text, reasoning),
            "think": think,
        }


def infer_template_generation(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
) -> str:
    """Ask the model for an extraction template and return it pretty-printed when it is JSON."""
    messages = collate_for_template_generation(
        context_text=context_text,
        context_image_path=context_image_path,
        system_prompt=system_prompt,
    )
    result = call_model_once(
        api_base=api_base,
        api_key=api_key,
        model_name=DEFAULT_MODEL,
        messages=messages,
        mode="template-generation",
        temperature=temperature,
        max_tokens=DEFAULT_MAX_TOKENS,
    )
    return pretty_json_or_text(result)
---------------- UI styling ---------------- CSS = """ :root { color-scheme: light; --bg: #f6f2eb; --panel: #ffffff; --panel-rgb: 255, 255, 255; --panel-strong-rgb: 255, 252, 246; --input-rgb: 255, 255, 255; --border-blue: rgba(67, 111, 148, 0.30); --border-blue-soft: rgba(67, 111, 148, 0.18); --border-input: rgba(67, 111, 148, 0.22); --border-orange-soft: rgba(190, 103, 36, 0.26); --text: #23252b; --text-strong: #101318; --text-on-accent: #101318; --muted: #5f6673; --muted-2: #7d8490; --logo-blue: #5d9bcf; --logo-orange: #d6742f; --green: #178f66; --card-alpha: 0.88; --header-alpha: 0.82; --input-alpha: 0.94; --shadow: rgba(54, 46, 35, 0.14); --inset-highlight: rgba(255, 255, 255, 0.85); --logo-opacity: 0.18; --focus-ring: rgba(67, 111, 148, 0.26); --code-bg: #fdfaf5; --dropzone-bg: #fbf8f2; } html.dark, body.dark, .dark, [data-theme="dark"] { color-scheme: dark; --bg: #242529; --panel: #1d1f26; --panel-rgb: 29, 31, 38; --panel-strong-rgb: 21, 22, 26; --input-rgb: 12, 14, 19; --border-blue: rgba(135, 183, 224, 0.24); --border-blue-soft: rgba(135, 183, 224, 0.16); --border-input: rgba(135, 183, 224, 0.14); --border-orange-soft: rgba(228, 132, 58, 0.22); --text: #eef0f4; --text-strong: #ffffff; --text-on-accent: #101318; --muted: #969baa; --muted-2: #737988; --logo-blue: #87b7e0; --logo-orange: #e4843a; --green: #31c48d; --card-alpha: 0.66; --header-alpha: 0.42; --input-alpha: 0.78; --shadow: rgba(0, 0, 0, 0.28); --inset-highlight: rgba(255, 255, 255, 0.055); --logo-opacity: 0.88; --focus-ring: rgba(135, 183, 224, 0.32); --code-bg: rgba(12, 14, 19, 0.78); --dropzone-bg: rgba(12, 14, 19, 0.78); } @media (prefers-color-scheme: dark) { :root:not([data-theme="light"]) { color-scheme: dark; --bg: #242529; --panel: #1d1f26; --panel-rgb: 29, 31, 38; --panel-strong-rgb: 21, 22, 26; --input-rgb: 12, 14, 19; --border-blue: rgba(135, 183, 224, 0.24); --border-blue-soft: rgba(135, 183, 224, 0.16); --border-input: rgba(135, 183, 224, 0.14); --border-orange-soft: rgba(228, 132, 
58, 0.22); --text: #eef0f4; --text-strong: #ffffff; --text-on-accent: #101318; --muted: #969baa; --muted-2: #737988; --logo-blue: #87b7e0; --logo-orange: #e4843a; --green: #31c48d; --card-alpha: 0.66; --header-alpha: 0.42; --input-alpha: 0.78; --shadow: rgba(0, 0, 0, 0.28); --inset-highlight: rgba(255, 255, 255, 0.055); --logo-opacity: 0.88; --focus-ring: rgba(135, 183, 224, 0.32); --code-bg: rgba(12, 14, 19, 0.78); --dropzone-bg: rgba(12, 14, 19, 0.78); } } html, body, footer, .gradio-container { color: var(--text) !important; } html, body { min-height: 100vh !important; width: 100% !important; margin: 0 !important; overflow-x: hidden !important; } body { background: var(--bg) !important; background-attachment: fixed !important; } footer { background: transparent !important; } .gradio-container { position: relative !important; isolation: isolate !important; max-width: none !important; width: 100% !important; min-height: 100vh !important; padding: 10px 18px 18px 18px !important; background: transparent !important; box-sizing: border-box !important; } .gradio-container::before { content: ""; position: fixed; inset: 0; z-index: -2; pointer-events: none; background-image: url("__LOGO_URL__"); background-repeat: no-repeat; background-size: min(86vw, 980px) min(86vw, 980px); background-position: calc(100% + 230px) 34px; opacity: var(--logo-opacity); filter: saturate(1.2) drop-shadow(0 0 28px rgba(135, 183, 224, 0.14)); } .with-gap, .gradio-row { gap: 18px !important; } .gradio-row { width: 100% !important; } .app-header { position: relative; display: flex; align-items: center; justify-content: space-between; gap: 16px; padding: 10px 12px 14px 12px; margin-bottom: 10px; border-bottom: 1px solid var(--border-blue-soft); background: rgba(var(--panel-strong-rgb), var(--header-alpha)); border-radius: 14px; backdrop-filter: blur(8px); box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight); } .brand { display: flex; align-items: center; gap: 10px; } 
.brand-mark { width: 28px; height: 28px; flex: 0 0 auto; object-fit: contain; } .brand-title { display: flex; align-items: baseline; gap: 8px; } .brand-name { font-size: 23px; line-height: 1; font-weight: 750; letter-spacing: -0.045em; color: var(--text-strong) !important; } .brand-name span { color: var(--muted) !important; } .model-chip { display: inline-flex; align-items: center; max-width: 520px; padding: 5px 9px; border-radius: 999px; background: rgba(var(--panel-rgb), 0.88); border: 1px solid var(--border-blue-soft); color: var(--muted) !important; font-size: 12px; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; text-decoration: none !important; cursor: pointer; transition: border-color 0.15s ease, transform 0.15s ease, background 0.15s ease; } .model-chip:hover { border-color: var(--logo-blue); background: rgba(var(--panel-rgb), 1); transform: translateY(-1px); text-decoration: none !important; } .model-chip:focus-visible { outline: none; box-shadow: 0 0 0 3px var(--focus-ring); } .model-chip code { color: var(--text-strong) !important; background: transparent !important; } .header-actions { display: flex; align-items: center; gap: 10px; color: var(--muted) !important; font-size: 13px; } .status-dot { width: 8px; height: 8px; border-radius: 99px; background: var(--green); box-shadow: 0 0 14px rgba(49, 196, 141, 0.65); } .intro-card { margin: 0 0 16px 0; padding: 14px 16px; border-radius: 14px; background: rgba(var(--panel-rgb), var(--card-alpha)); border: 1px solid var(--border-blue-soft); box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight); backdrop-filter: blur(8px); } .intro-card p { margin: 0 0 8px 0; line-height: 1.5; } .section-title { margin: 0 0 8px 0; color: var(--text-strong) !important; font-size: 13px; font-weight: 750; letter-spacing: 0.01em; } .main-card, .output-card, .gradio-group { background: rgba(var(--panel-rgb), var(--card-alpha)) !important; border: 1px solid var(--border-blue) !important; 
border-radius: 14px !important; box-shadow: 0 22px 70px var(--shadow), inset 0 1px 0 var(--inset-highlight) !important; backdrop-filter: blur(10px) saturate(1.18); } .main-card { width: 100% !important; box-sizing: border-box !important; } .output-card { min-height: 720px !important; max-height: 860px !important; border-color: var(--border-orange-soft) !important; width: 100% !important; box-sizing: border-box !important; overflow: hidden !important; } label, .markdown, .prose, h1, h2, h3, h4, h5, h6, p, span, div { color: var(--text) !important; } .section-title, label > span, .gradio-container label { color: var(--text-strong) !important; } .secondary-note { color: var(--muted) !important; font-size: 12px; line-height: 1.35; } textarea, input[type="text"], input[type="password"], input[type="number"], input[type="email"], .cm-editor { background: rgba(var(--input-rgb), var(--input-alpha)) !important; color: var(--text) !important; border-color: var(--border-input) !important; } textarea::placeholder, input::placeholder { color: var(--muted-2) !important; } textarea:focus, input:focus, .cm-editor.cm-focused { border-color: var(--logo-blue) !important; box-shadow: 0 0 0 3px var(--focus-ring) !important; } input[type="checkbox"] { accent-color: var(--logo-blue) !important; } #schema-box .cm-editor { min-height: 410px !important; max-height: 480px !important; background: var(--code-bg) !important; } .cm-editor, .cm-scroller, .cm-content, .cm-line, .cm-gutters, .cm-activeLine, .cm-activeLineGutter { background: var(--code-bg) !important; color: var(--text) !important; } .cm-gutters { border-color: var(--border-blue-soft) !important; color: var(--muted-2) !important; } .cm-cursor { border-left-color: var(--text-strong) !important; } #image-box { min-height: 335px !important; background: var(--dropzone-bg) !important; border-color: var(--border-blue-soft) !important; } #image-box, #image-box *, .upload-container, .upload-container *, .file-preview, .file-preview * { 
color: var(--text) !important; } #image-box button, #image-box .icon-wrap, #image-box .wrap { background: transparent !important; } #reasoning-box { min-height: 180px !important; max-height: 240px !important; overflow: auto !important; padding: 8px; border-radius: 8px; background: rgba(var(--input-rgb), var(--input-alpha)) !important; border: 1px solid var(--border-blue-soft); white-space: pre-wrap !important; overflow-wrap: anywhere !important; word-break: break-word !important; } #output-box { min-height: 360px !important; max-height: 520px !important; overflow: auto !important; padding: 8px; border-radius: 8px; background: rgba(var(--input-rgb), var(--input-alpha)) !important; border: 1px solid var(--border-blue-soft); white-space: pre-wrap !important; overflow-wrap: anywhere !important; word-break: break-word !important; } #reasoning-box pre, #reasoning-box code, #output-box pre, #output-box code { white-space: pre-wrap !important; overflow-wrap: anywhere !important; word-break: break-word !important; color: var(--text) !important; background: transparent !important; } button { border-radius: 9px !important; min-height: 34px !important; } button.primary-button, .primary-button button, .primary-button { background: var(--logo-blue) !important; background-color: var(--logo-blue) !important; color: var(--text-on-accent) !important; border: none !important; font-weight: 750 !important; } button.markdown-button, .markdown-button button, .markdown-button { background: var(--logo-orange) !important; background-color: var(--logo-orange) !important; color: var(--text-on-accent) !important; border: none !important; font-weight: 750 !important; } .clear-button button, button.clear-button, .clear-button { background: transparent !important; background-color: transparent !important; color: var(--muted) !important; border: 1px solid var(--border-blue-soft) !important; } .gradio-container .wrap, .gradio-container .block, .gradio-container .form, .gradio-container .panel, 
.gradio-container .tabs, .gradio-container .tabitem { background: transparent !important; color: var(--text) !important; } .gradio-accordion { border-color: var(--border-blue-soft) !important; } .gradio-container table, .gradio-container th, .gradio-container td { color: var(--text) !important; } .gradio-container label, .gradio-container label span, .gradio-container .label-wrap, .gradio-container .label-wrap span { color: var(--text-strong) !important; } @media (max-width: 1100px) { .gradio-container { width: 100% !important; padding: 10px 12px 18px 12px !important; } .app-header { align-items: flex-start; flex-direction: column; } .brand-title { align-items: flex-start; flex-direction: column; } .model-chip { max-width: 100%; } .output-card { min-height: 520px !important; max-height: none !important; } #reasoning-box { min-height: 160px !important; max-height: 220px !important; } #output-box { min-height: 320px !important; max-height: 480px !important; } } """.replace("__LOGO_URL__", LOGO_URL or "") # ---------------- Gradio app ---------------- with gr.Blocks( title="NuExtract3", css=CSS, theme=gr.themes.Base( primary_hue="blue", secondary_hue="orange", neutral_hue="slate", ), ) as demo: logo_html = ( f'NuExtract logo' if LOGO_URL else '
' ) gr.HTML( f"""
{logo_html}
OpenAI-compatible endpoint
""" ) gr.Markdown( """

NuExtract3

NuExtract3 is a unified 4B vision-language reasoning model for document understanding.

It combines structured information extraction with high-quality image-to-Markdown conversion, making it useful for OCR, RAG preprocessing, and extraction pipelines across scans, receipts, forms, invoices, contracts, tables, and more.

Features

🤗 Model   |    🖥️ API / Platform   |    📑 Blog   |    🗣️ Discord   |    🛠️ GitHub

""", elem_classes=["intro-card"], ) with gr.Row(equal_height=True): # Left: input, schema, controls with gr.Column(scale=1, min_width=520): with gr.Group(elem_classes="main-card"): gr.HTML("
Input
") context_image = gr.Image( label="Image", type="filepath", height=340, sources=["upload", "clipboard"], elem_id="image-box", ) context_text = gr.Textbox( label="Text", placeholder="Optional: paste document text.", lines=3, max_lines=5, ) with gr.Group(elem_classes="main-card"): gr.HTML("
Schema & instructions
") instruction = gr.Textbox( label="Instructions", placeholder="Optional extraction instructions.", lines=2, max_lines=3, ) with gr.Row(equal_height=True): template = gr.Code( label="Template", language="json", value=json.dumps( { "title": "string", "entities": ["string"], "dates": ["YYYY-MM-DD"], "amounts": [ { "value": "number", "currency": "string", } ], }, indent=4, ), lines=16, scale=5, elem_id="schema-box", ) with gr.Column(scale=2, min_width=150): generate_template_btn = gr.Button( "Generate template", variant="secondary", ) gr.HTML( "
" "Use Extract for JSON. Use Markdown to convert an image document. Use generate template to generate a well formated template from an input image." "
" ) with gr.Group(elem_classes="main-card"): gr.HTML("
Run
") with gr.Row(): extract_btn = gr.Button( "Extract JSON", variant="secondary", elem_classes=["primary-button"], ) markdown_btn = gr.Button( "Convert to Markdown", variant="secondary", elem_classes=["markdown-button"], ) with gr.Row(): stop_btn = gr.Button("Stop", variant="stop") clear_btn = gr.Button( "Clear results", variant="secondary", elem_classes=["clear-button"], ) reasoning_checkbox = gr.Checkbox( label="Reasoning", value=True, interactive=True, info="If enabled, reasoning is everything before .", ) temperature = gr.Slider( 0.0, 1, value=0.0, step=0.05, label="Temperature", info="Higher values make the output less deterministic but can improve reasoning performance (around 0.4-0.6)", ) with gr.Accordion("Structured examples", open=False): if STRUCTURED_EXAMPLES: gr.Examples( examples=STRUCTURED_EXAMPLES, inputs=[context_image, template, instruction], label="Load structured example", examples_per_page=8, cache_examples=False, ) else: gr.Markdown( f""" No structured examples found. Add files referenced in `STRUCTURED_EXAMPLE_TEMPLATES`, for example: ```text {EXAMPLE_DIR}/1.jpg {EXAMPLE_DIR}/2.png ``` """ ) with gr.Accordion("Markdown examples", open=False): if MARKDOWN_EXAMPLES: gr.Examples( examples=MARKDOWN_EXAMPLES, inputs=[context_image], label="Load Markdown example", examples_per_page=8, cache_examples=False, ) else: gr.Markdown( f""" No Markdown examples found. Add image paths to `MARKDOWN_EXAMPLE_IMAGE_PATHS`, for example: ```python MARKDOWN_EXAMPLE_IMAGE_PATHS = [ "markdown_1.png", "markdown_2.jpg", "/home/user/app/examples/report.png", ] ``` Relative paths are resolved from: ```text {EXAMPLE_DIR} ``` """ ) # Endpoint settings are intentionally hidden from the UI. api_base = gr.State(DEFAULT_API_BASE) api_key = gr.State(DEFAULT_API_KEY) system_prompt = gr.State(SYSTEM_PROMPT_DEFAULT) # Right: reasoning + output with gr.Column(scale=1, min_width=520): with gr.Group(elem_classes="output-card"): gr.HTML("
Reasoning
") reasoning_md = gr.Markdown( label="Reasoning", elem_id="reasoning-box", ) gr.HTML("
Output
") output_md = gr.Markdown( label="Output", elem_id="output-box", ) error_box = gr.Markdown(visible=False) def run_model_click( api_key_val, api_base_val, system_prompt_val, instruction_val, template_val, context_text_val, context_image_val, temperature_val, reasoning_val, markdown_mode_val, ): mode_name = "Markdown" if markdown_mode_val else "Extract" print(f"[button] {mode_name} clicked", flush=True) print(f"[button] image={context_image_val}", flush=True) print(f"[button] text_len={len(context_text_val or '')}", flush=True) print(f"[button] reasoning={bool(reasoning_val)}", flush=True) if markdown_mode_val and not context_image_val: msg = "Markdown conversion requires a document image." yield ( gr.update(value=""), gr.update(value=""), gr.update(visible=True, value=f"### Error\n{msg}"), ) return if not context_image_val and not (context_text_val or "").strip(): msg = "Please provide a document image or paste document text." yield ( gr.update(value=""), gr.update(value=""), gr.update(visible=True, value=f"### Error\n{msg}"), ) return try: yielded_anything = False for res in infer_stream( api_key=api_key_val, api_base=api_base_val, system_prompt=system_prompt_val, template=template_val, instruction=instruction_val, context_text=context_text_val, context_image_path=context_image_val, temperature=temperature_val, reasoning=bool(reasoning_val), markdown_mode=bool(markdown_mode_val), ): yielded_anything = True think = res.get("think") or "" output = res.get("output") or "_(Empty output.)_" yield ( gr.update(value=f"```text\n{think}\n```" if think else ""), gr.update(value=output), gr.update(visible=False, value=""), ) if not yielded_anything: yield ( gr.update(value=""), gr.update(value=""), gr.update( visible=True, value="### Error\nThe model returned no streamed output.", ), ) except Exception: import traceback tb = traceback.format_exc() print(tb, flush=True) yield ( gr.update(value=""), gr.update(value=""), gr.update(visible=True, value=f"### 
Error\n```text\n{tb}\n```"), ) def on_extract_click( api_key_val, api_base_val, system_prompt_val, instruction_val, template_val, context_text_val, context_image_val, temperature_val, reasoning_val, ): yield from run_model_click( api_key_val, api_base_val, system_prompt_val, instruction_val, template_val, context_text_val, context_image_val, temperature_val, reasoning_val, False, ) def on_markdown_click( api_key_val, api_base_val, system_prompt_val, instruction_val, template_val, context_text_val, context_image_val, temperature_val, reasoning_val, ): yield from run_model_click( api_key_val, api_base_val, system_prompt_val, instruction_val, template_val, context_text_val, context_image_val, temperature_val, reasoning_val, True, ) def on_click_generate_template( api_key_val, api_base_val, system_prompt_val, context_text_val, context_image_val, temperature_val, ): print("[button] Generate template clicked", flush=True) if not context_image_val and not (context_text_val or "").strip(): return ( gr.update(), gr.update( visible=True, value="### Error\nPlease provide a document image or paste document text.", ), ) try: template_text = infer_template_generation( api_key=api_key_val, api_base=api_base_val, system_prompt=system_prompt_val, context_text=context_text_val, context_image_path=context_image_val, temperature=temperature_val, ) return gr.update(value=template_text), gr.update(visible=False, value="") except Exception: import traceback tb = traceback.format_exc() print(tb, flush=True) return ( gr.update(), gr.update(visible=True, value=f"### Error\n```text\n{tb}\n```"), ) def on_clear(): return ( gr.update(value=""), gr.update(value=""), gr.update(visible=False, value=""), ) common_inputs = [ api_key, api_base, system_prompt, instruction, template, context_text, context_image, temperature, reasoning_checkbox, ] common_outputs = [ reasoning_md, output_md, error_box, ] extract_event = extract_btn.click( fn=on_extract_click, inputs=common_inputs, 
outputs=common_outputs, show_progress=True, ) markdown_event = markdown_btn.click( fn=on_markdown_click, inputs=common_inputs, outputs=common_outputs, show_progress=True, ) stop_btn.click( fn=None, inputs=None, outputs=None, cancels=[extract_event, markdown_event], ) clear_btn.click( fn=on_clear, inputs=None, outputs=common_outputs, ) generate_template_btn.click( fn=on_click_generate_template, inputs=[ api_key, api_base, system_prompt, context_text, context_image, temperature, ], outputs=[ template, error_box, ], show_progress=True, ) if __name__ == "__main__": allowed_paths = [] if ASSETS_DIR.exists(): allowed_paths.append(str(ASSETS_DIR)) if EXAMPLE_DIR.exists(): allowed_paths.append(str(EXAMPLE_DIR)) demo.queue().launch( share=ARGS.share, server_name=ARGS.server_name, server_port=ARGS.server_port, show_error=True, allowed_paths=allowed_paths or None, )