Spaces:
Running on A100
Running on A100
| import argparse | |
| import base64 | |
| import io | |
| import json | |
| import os | |
| import re | |
| from pathlib import Path | |
| from typing import Any, Dict, Iterator, List, Optional, Tuple | |
| import gradio as gr | |
| from openai import OpenAI | |
| from PIL import Image | |
| # ---------------- Paths ---------------- | |
# Absolute directory containing this script; used as the base for relative
# example/asset paths.
APP_DIR = Path(__file__).resolve().parent
| # ---------------- CLI / environment configuration ---------------- | |
def parse_args() -> argparse.Namespace:
    """Parse the demo's command-line options.

    Each option defaults to an environment variable and, failing that, to a
    built-in value. Unrecognised command-line arguments are discarded
    (``parse_known_args``) instead of aborting the program.

    Returns:
        The namespace of recognised options.
    """
    env = os.environ.get
    # GRADIO_SHARE counts as enabled for "1", "true" or "yes" (any letter case).
    share_default = env("GRADIO_SHARE", "false").lower() in {"1", "true", "yes"}

    cli = argparse.ArgumentParser(description="NuExtract Gradio demo")
    add = cli.add_argument

    add(
        "--model-name",
        default=env("MODEL_NAME", "numind/NuExtract3"),
        help="Model name served by the OpenAI-compatible endpoint.",
    )
    add(
        "--api-base",
        default=env("OPENAI_API_BASE", "http://127.0.0.1:8000/v1"),
        help="OpenAI-compatible base URL.",
    )
    add(
        "--api-key",
        default=env("OPENAI_API_KEY", "EMPTY"),
        help="API key for the OpenAI-compatible endpoint.",
    )
    add(
        "--server-name",
        default=env("GRADIO_SERVER_NAME", "0.0.0.0"),
        help="Gradio server host.",
    )
    add(
        "--server-port",
        type=int,
        default=int(env("GRADIO_SERVER_PORT", "7860")),
        help="Gradio server port.",
    )
    add(
        "--share",
        action="store_true",
        default=share_default,
        help="Create a public Gradio share link.",
    )
    add(
        "--max-tokens",
        type=int,
        default=int(env("NUEXTRACT_MAX_TOKENS", "10000")),
        help="Maximum tokens for model generation. Hidden from the UI.",
    )
    add(
        "--example-dir",
        default=env("NUEXTRACT_EXAMPLE_DIR", str(APP_DIR / "examples")),
        help="Directory containing image examples.",
    )
    add(
        "--assets-dir",
        default=env("NUEXTRACT_ASSETS_DIR", str(APP_DIR / "assets")),
        help="Directory containing static assets such as the NuExtract logo.",
    )

    recognised, _unrecognised = cli.parse_known_args()
    return recognised
def resolve_dir(path_like: str) -> Path:
    """Return *path_like* as a resolved path.

    ``~`` is expanded first. Absolute paths are resolved as-is; relative
    paths are interpreted relative to ``APP_DIR``.
    """
    candidate = Path(path_like).expanduser()
    anchored = candidate if candidate.is_absolute() else APP_DIR / candidate
    return anchored.resolve()
# Options are parsed once at import time; the constants below are derived
# from them.
ARGS = parse_args()

DEFAULT_MODEL = ARGS.model_name
DEFAULT_API_BASE = ARGS.api_base
DEFAULT_API_KEY = ARGS.api_key
DEFAULT_MAX_TOKENS = ARGS.max_tokens

EXAMPLE_DIR = resolve_dir(ARGS.example_dir)
ASSETS_DIR = resolve_dir(ARGS.assets_dir)
LOGO_PATH = ASSETS_DIR / "logo_numind_picto.svg"

if not LOGO_PATH.exists():
    # No logo on disk: report it and fall back to an empty URL.
    print(f"[assets] Missing logo: {LOGO_PATH}", flush=True)
    LOGO_URL = ""
else:
    LOGO_URL = f"/gradio_api/file={LOGO_PATH}"
    # Register the assets directory with Gradio as a static path.
    gr.set_static_paths(paths=[ASSETS_DIR])

SYSTEM_PROMPT_DEFAULT = (
    "You are a precise information extraction assistant. "
    "Return faithful, source-grounded results only."
)
| # ---------------- Structured extraction examples ---------------- | |
| # These examples populate: Image + Template + Instructions. | |
# Maps an example image filename to the extraction template shown with it.
# A list of strings is a set of allowed choices; a list wrapping a dict (or a
# nested list) describes repeated entries.
STRUCTURED_EXAMPLE_TEMPLATES: Dict[str, Dict[str, Any]] = {
    # Movie-related fields: title, rating, companies, cast, staff, reviews.
    "1.jpg": {
        "movie_name": "verbatim-string",
        "tagline": "verbatim-string",
        "language": "string",
        "motion_picture_association_rating": [
            "G - General Audiences",
            "PG - Parental Guidance Suggested",
            "PG-13 – Parents Strongly Cautioned",
            "R – Restricted",
            "NC-17 – Adults Only",
            "not provided",
        ],
        "movie_distribution_company": "verbatim-string",
        "movie_production_company": "verbatim-string",
        "theatre_release_date": "date-time",
        "movie_website_address": "verbatim-string",
        "movie_director_name": "verbatim-string",
        "actors_names": ["verbatim-string"],
        "staff": [
            {
                "staff_name": "verbatim-string",
                "staff_role": [
                    "director", "co-director", "screenwriter", "author",
                    "cinematographer", "costume designer",
                    "production designer", "set designer", "animator",
                    "color designer", "art director", "animation director",
                    "vfx director", "voice actor", "composer", "songwriter",
                    "music performer", "music supervisor", "choreographer",
                    "casting director", "editor", "producer", "co-producer",
                    "associate producer", "executive producer",
                    "co-executive producer", "line producer",
                ],
            },
        ],
        "reviews": [
            {
                "critic_name": "verbatim-string",
                "review_comment": "verbatim-string",
            },
        ],
        "technologies": [
            [
                "Dolby Stereo", "Dolby Digital", "Dolby Stereo Digital",
                "Dolby Atmos", "Dolby Vision", "Dolby Cinema",
                "DTS", "SDDS", "IMAX", "4DX",
            ],
        ],
    },
    # Room and amenity fields.
    "2.png": {
        "number_of_bathrooms": "integer",
        "number_of_fireplaces": "integer",
        "distance_unit": ["meter", "foot"],
        "rooms_that_are_not_bedrooms_or_corridors_or_toilets": [
            {"room_name": "verbatim-string", "surface_area": "number"},
        ],
        "bedrooms": [
            {
                "bedroom_name": "verbatim-string",
                "surface_area": "number",
                "number_of_windows": "integer",
                "has_private_bathroom": "boolean",
            },
        ],
        "has_laundry_room": "boolean",
        "has_terrace": "boolean",
        "has_balcony": "boolean",
        "number_of_parking_spaces_in_garage": "integer",
        "number_of_parking_spaces_exterior": "integer",
    },
    # Invoice fields: parties, dates, line items, totals.
    "8.png": {
        "invoice_number": "verbatim-string",
        "issuer_name": "verbatim-string",
        "recipient_name": "verbatim-string",
        "issuer_location": {
            "street_number": "verbatim-string",
            "street_name": "verbatim-string",
            "city": "verbatim-string",
            "zip_code": "string",
            "country": "string",
        },
        "date_of_issue": "date-time",
        "date_due": "date-time",
        "currency_code_iso4217": "string",
        "items": [
            {
                "item_name": "verbatim-string",
                "item_quantity": "number",
                "item_price_per_unit": "number",
                "item_total_price": "number",
            },
        ],
        "total_discount_amount": "number",
        "total_fee_amount": "number",
        "total_tax_amount": "number",
        "total_price_net": "number",
        "total_price_gross": "number",
    },
    # Festival fields: location, dates, lineup, sponsors.
    "18.jpg": {
        "festival_name": "verbatim-string",
        "website_url": "url",
        "location": {
            "city": "string",
            "zip_code": "string",
            "country": "country",
        },
        "date_first_day": "date",
        "date_last_day": "date",
        "lineup_entry": [
            {
                "artist_or_group_name": "string",
                "artist_entity_type": [
                    "individual", "band", "ensemble", "b2b",
                    "project_or_collaboration", "other",
                ],
                "performing_stage_name": "verbatim-string",
                "is_headliner": "boolean",
                "day_playing": "date",
            },
        ],
        "sponsors": [
            {
                "name": "verbatim-string",
                "type": [
                    "press", "tv", "bank", "insurance", "beverage company",
                    "car company", "technology company", "clothing company",
                    "transportation", "public institution", "other",
                ],
            },
        ],
    },
    # Flat list of parts.
    "17.png": {
        "parts": [
            {
                "name": "verbatim-string",
                "id": "verbatim-string",
                "details": "verbatim-string",
            },
        ],
    },
    # Applicant, bank reference and trade reference fields.
    "16.jpeg": {
        "Applicant": {
            "Name": "verbatim-string",
            "Registration no": "verbatim-string",
            "Holding compagny": "verbatim-string",
            "VAT Registration no": "verbatim-string",
            "Date of creation": "date",
            "Type of entity": "verbatim-string",
            "Location": {
                "Street number": "verbatim-string",
                "Street name": "verbatim-string",
                "City": "verbatim-string",
                "Zip code": "string",
                "Country": "country",
            },
            "Website": "url",
            "Phone": "phone-number",
            "Email": "email-address",
        },
        "Bank Reference": {
            "Bank name": "verbatim-string",
            "Account name": "verbatim-string",
            "Account no": "integer",
            "Importer code": "verbatim-string",
        },
        "Trades references": [
            {
                "Company name": "verbatim-string",
                "Account opened since": "date",
                "Tel": "phone-number",
                "Email": "email-address",
                "Location": {
                    "Street number": "verbatim-string",
                    "Street name": "verbatim-string",
                    "City": "verbatim-string",
                    "Zip code": "string",
                    "Country": "country",
                },
                "Credit limit": "string",
            },
        ],
        "Is document signed": "boolean",
        "Date of signature": "date",
    },
}
# Free-text instructions shown with each structured example, keyed by image
# filename. All are currently empty.
STRUCTURED_EXAMPLE_INSTRUCTIONS: Dict[str, str] = dict.fromkeys(
    ("1.jpg", "2.png", "8.png", "18.jpg", "17.png", "16.jpeg"),
    "",
)
| # ---------------- Markdown/OCR examples ---------------- | |
| # Put Markdown example image paths here. | |
| # These examples populate only the Image input and are meant for the | |
| # “Convert to Markdown” button. | |
# Image-only examples; each entry is passed through resolve_example_path().
MARKDOWN_EXAMPLE_IMAGE_PATHS: List[str] = [
    "3.jpg", "4.jpg", "5.jpg", "6.png",
    "7.jpg", "9.jpg", "10.png", "11.png",
    "12.jpg", "14.jpg", "15.jpg",
]
def resolve_example_path(path_like: str) -> Path:
    """Return the resolved location of an example image.

    ``~`` is expanded first. Absolute paths are resolved as-is; relative
    paths are looked up under ``EXAMPLE_DIR``.
    """
    candidate = Path(path_like).expanduser()
    anchored = candidate if candidate.is_absolute() else EXAMPLE_DIR / candidate
    return anchored.resolve()
def build_structured_examples() -> List[List[Any]]:
    """Build ``[image path, template JSON, instructions]`` example rows.

    Iterates ``STRUCTURED_EXAMPLE_TEMPLATES``. Entries whose image does not
    exist are reported on stdout and left out.
    """
    rows: List[List[Any]] = []
    for name, schema in STRUCTURED_EXAMPLE_TEMPLATES.items():
        resolved = resolve_example_path(name)
        if resolved.exists():
            rows.append(
                [
                    str(resolved),
                    json.dumps(schema, indent=4, ensure_ascii=False),
                    STRUCTURED_EXAMPLE_INSTRUCTIONS.get(name, ""),
                ]
            )
        else:
            print(f"[structured examples] Missing image: {resolved}", flush=True)
    return rows
def build_markdown_examples() -> List[List[Any]]:
    """Build single-column ``[image path]`` example rows.

    Iterates ``MARKDOWN_EXAMPLE_IMAGE_PATHS``. Entries whose image does not
    exist are reported on stdout and left out.
    """
    rows: List[List[Any]] = []
    for entry in MARKDOWN_EXAMPLE_IMAGE_PATHS:
        resolved = resolve_example_path(entry)
        if resolved.exists():
            rows.append([str(resolved)])
        else:
            print(f"[markdown examples] Missing image: {resolved}", flush=True)
    return rows
# Example rows are computed once, at import time.
STRUCTURED_EXAMPLES = build_structured_examples()
MARKDOWN_EXAMPLES = build_markdown_examples()
| # ---------------- Utility helpers ---------------- | |
def image_bytes_to_base64(b: bytes) -> str:
    """Return the standard Base64 encoding of *b* as text."""
    encoded = base64.b64encode(b)
    return str(encoded, "utf-8")
def ensure_rgb_image(image_bytes: bytes) -> Image.Image:
    """Open encoded image bytes with PIL and return the image in RGB mode.

    An image that is already RGB is returned as opened; any other mode is
    converted.
    """
    opened = Image.open(io.BytesIO(image_bytes))
    return opened if opened.mode == "RGB" else opened.convert("RGB")
def file_path_to_bytes(path: str) -> bytes:
    """Read the file at *path* in binary mode and return its full contents."""
    with open(path, mode="rb") as handle:
        data = handle.read()
    return data
| # ---------------- Response parsing ---------------- | |
def strip_code_fence(payload: str) -> str:
    """Remove Markdown code-fence markers from *payload*.

    An opening fence (optionally tagged ``json``, ``markdown`` or ``text``,
    case-insensitive) at the start of a line is removed together with the
    whitespace after it, and a closing fence at the end of a line together
    with the whitespace before it. Because the pattern is multi-line, this
    applies to every matching line, not only the first and last. The result
    is stripped of surrounding whitespace.
    """
    fence = re.compile(
        r"^```(?:json|markdown|text)?\s*|\s*```$",
        flags=re.IGNORECASE | re.MULTILINE,
    )
    without_fences = fence.sub("", payload.strip())
    return without_fences.strip()
def pretty_json_or_text(payload: str) -> str:
    """Return *payload* as indented JSON when it parses, else as cleaned text.

    Code fences are stripped first. A falsy payload yields ``""``. If the
    cleaned text is not valid JSON it is returned unchanged.
    """
    if not payload:
        return ""

    body = strip_code_fence(payload)
    try:
        parsed = json.loads(body)
        formatted = json.dumps(parsed, indent=4, ensure_ascii=False)
    except Exception:
        # Best effort: anything that fails to round-trip is shown as text.
        return body
    return formatted
def extract_answer_block(text: str) -> str:
    """Extract the answer payload from raw model output.

    Lookup order:
      1. The contents of the first ``<answer>...</answer>`` block
         (tag match is case-insensitive and may span lines).
      2. The span from the first ``{`` to the last ``}`` in the text.
      3. The whole text, stripped.

    Results from steps 1 and 2 go through ``pretty_json_or_text``.

    Args:
        text: Raw model output; may be empty or ``None``.

    Returns:
        The extracted payload, or ``""`` for falsy input.
    """
    if not text:
        return ""

    tagged = re.search(
        r"<answer>\s*(.*?)\s*</answer>",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if tagged:
        return pretty_json_or_text(tagged.group(1).strip())

    # The pattern is greedy, so it matches at most once: from the first "{"
    # to the last "}". A single search therefore finds the longest candidate.
    braced = re.search(r"\{[\s\S]*\}", text)
    if braced:
        return pretty_json_or_text(braced.group(0))

    return text.strip()
def split_reasoning_and_output(text: str, reasoning_enabled: bool) -> Tuple[str, str]:
    """Split model output into ``(reasoning, output)`` at the first ``</think>``.

    Args:
        text: Raw (possibly partial) model output.
        reasoning_enabled: When false, the whole text is treated as output.

    Returns:
        ``("", "")`` for empty text; ``("", text)`` when reasoning is
        disabled; ``(before_tag, after_tag)`` when a closing tag is present;
        otherwise ``(text, "")``, i.e. everything so far counts as reasoning.
        All returned strings are stripped.
    """
    if not text:
        return "", ""
    if not reasoning_enabled:
        return "", text.strip()

    # Search the original string rather than a lowercased copy: lowercasing
    # can change the length of some Unicode text (e.g. "İ"), which would make
    # an index found in the copy point at the wrong place in the original.
    closing = re.search(r"</think>", text, flags=re.IGNORECASE | re.ASCII)
    if closing is None:
        return text.strip(), ""
    return text[:closing.start()].strip(), text[closing.end():].strip()
| # ---------------- Message building ---------------- | |
def make_text_content(text: str) -> List[Dict[str, Any]]:
    """Wrap *text* in a one-element list holding a ``text`` content part.

    A falsy *text* becomes the empty string.
    """
    part: Dict[str, Any] = {"type": "text", "text": text if text else ""}
    return [part]
def make_image_content(
    image_bytes: bytes,
    extra_text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Build message content parts for an image, optionally followed by text.

    The image is converted to RGB, re-encoded as JPEG (quality 95) and
    embedded as a Base64 ``data:`` URL with ``detail`` set to ``"high"``.
    When *extra_text* contains non-whitespace characters, a stripped text
    part is appended after the image part.
    """
    jpeg_buffer = io.BytesIO()
    ensure_rgb_image(image_bytes).save(jpeg_buffer, format="JPEG", quality=95)
    encoded = image_bytes_to_base64(jpeg_buffer.getvalue())

    parts: List[Dict[str, Any]] = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded}",
                "detail": "high",
            },
        }
    ]

    caption = extra_text.strip() if extra_text else ""
    if caption:
        parts.append({"type": "text", "text": caption})
    return parts
def normalize_template(template: str) -> str:
    """Return *template* as 4-space-indented JSON where possible.

    A blank or falsy template becomes ``"{}"``. Text that does not parse as
    JSON is returned stripped but otherwise unchanged.
    """
    raw = (template or "").strip()
    if raw == "":
        return "{}"
    try:
        parsed = json.loads(raw)
        return json.dumps(parsed, indent=4, ensure_ascii=False)
    except Exception:
        # Best effort: keep whatever the user typed if it is not valid JSON.
        return raw
def collate_single_input(
    *,
    text_or_image: Any,
    template: str,
    system_prompt: Optional[str],
    instruction: Optional[str],
) -> Tuple[List[Dict[str, Any]], str]:
    """Build the chat messages for one extraction request.

    Args:
        text_or_image: A dict with a ``"bytes"`` key for image input;
            anything else is converted to text.
        template: Extraction template as typed by the user.
        system_prompt: Optional system message; omitted when falsy.
        instruction: Optional free-text instructions.

    Returns:
        ``(messages, template_json)`` where ``template_json`` is the
        normalized template. Non-blank instructions and a non-empty template
        are appended to the user message as extra text.
    """
    template_json = normalize_template(template)

    messages: List[Dict[str, Any]] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    sections: List[str] = []
    if instruction and instruction.strip():
        sections.append(f"Instructions:\n{instruction.strip()}")
    if template_json and template_json.strip() not in {"{}", ""}:
        sections.append(f"Extraction template:\n```json\n{template_json}\n```")
    suffix = "\n\n".join(sections) or None

    if isinstance(text_or_image, dict) and "bytes" in text_or_image:
        user_content = make_image_content(
            image_bytes=text_or_image["bytes"],
            extra_text=suffix,
        )
    else:
        plain = str(text_or_image or "")
        if suffix:
            plain = f"{plain}\n\n{suffix}".strip()
        user_content = make_text_content(plain)

    messages.append({"role": "user", "content": user_content})
    return messages, template_json
def collate_for_template_generation(
    *,
    context_text: str,
    context_image_path: Optional[str],
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build the chat messages asking the model to propose a template.

    When *context_image_path* is set, the image file is read and sent with
    the guidance text; otherwise the stripped *context_text* is sent,
    followed by the guidance. A system message is prepended when
    *system_prompt* is truthy.
    """
    guidance = (
        "Generate a concise JSON extraction template for this document. "
        "Use descriptive field names and simple type hints like string, number, YYYY-MM-DD, "
        "boolean, or arrays of objects. Return only the JSON template."
    )

    conversation: List[Dict[str, Any]] = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})

    if context_image_path:
        user_content = make_image_content(
            image_bytes=file_path_to_bytes(context_image_path),
            extra_text=guidance,
        )
    else:
        document = (context_text or "").strip()
        user_content = make_text_content(f"{document}\n\n{guidance}".strip())

    conversation.append({"role": "user", "content": user_content})
    return conversation
def collate_markdown_image_only(
    *,
    image_bytes: bytes,
    system_prompt: Optional[str],
) -> List[Dict[str, Any]]:
    """Build chat messages holding only an image, with no accompanying text.

    A system message is prepended when *system_prompt* is truthy.
    """
    conversation: List[Dict[str, Any]] = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    conversation.append(
        {"role": "user", "content": make_image_content(image_bytes=image_bytes)}
    )
    return conversation
| # ---------------- Model calls ---------------- | |
def chunk_to_text(chunk: Any) -> str:
    """Return the text carried by one streamed chunk, or ``""``.

    Reads ``chunk.choices[0].delta.content``. String content is returned
    as-is; list content is flattened by joining each item's ``text`` (dict
    key or attribute). Missing pieces, other content types and any error
    raised along the way all yield ``""``.
    """
    try:
        choices = getattr(chunk, "choices", None) if chunk else None
        if not choices:
            return ""

        delta = getattr(choices[0], "delta", None)
        if delta is None:
            return ""

        content = getattr(delta, "content", None)
        if isinstance(content, str):
            return content
        if not isinstance(content, list):
            return ""

        pieces: List[str] = []
        for entry in content:
            if isinstance(entry, dict) and entry.get("text"):
                pieces.append(entry["text"])
            elif getattr(entry, "text", None):
                pieces.append(entry.text)
        return "".join(pieces)
    except Exception:
        # Deliberately tolerant: a malformed chunk must not break the stream.
        return ""
def build_chat_template_kwargs(
    *,
    template_json: str,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Dict[str, Any]:
    """Build the ``chat_template_kwargs`` dict for a request.

    Markdown mode yields only ``mode`` and ``enable_thinking``. Otherwise
    ``mode`` is ``"structured"`` when the template is non-blank and not
    ``"{}"`` (the template is then included as given), else ``"content"``.
    Non-blank instructions are included stripped.
    """
    thinking = bool(reasoning)
    if markdown_mode:
        return {"mode": "markdown", "enable_thinking": thinking}

    trimmed_template = template_json.strip() if template_json else ""
    has_template = trimmed_template not in ("", "{}")

    kwargs: Dict[str, Any] = {
        "mode": "structured" if has_template else "content",
        "enable_thinking": thinking,
    }
    if has_template:
        kwargs["template"] = template_json

    trimmed_instruction = instruction.strip() if instruction else ""
    if trimmed_instruction:
        kwargs["instructions"] = trimmed_instruction
    return kwargs
def call_model_stream(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    template_json: str,
    temperature: float,
    max_tokens: int,
    reasoning: bool,
    instruction: Optional[str],
    markdown_mode: bool,
) -> Iterator[str]:
    """Stream a chat completion, yielding the text accumulated so far.

    Every yielded value is the full text received up to that point, not
    just the newest delta. Chunks that carry no text produce no yield. The
    chat-template options are sent through ``extra_body``.
    """
    client = OpenAI(base_url=api_base, api_key=api_key)
    template_kwargs = build_chat_template_kwargs(
        template_json=template_json,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    )

    request: Dict[str, Any] = dict(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        stream=True,
        extra_body={"chat_template_kwargs": template_kwargs},
    )

    so_far = ""
    for event in client.chat.completions.create(**request):
        piece = chunk_to_text(event)
        if not piece:
            continue
        so_far += piece
        yield so_far
def call_model_once(
    *,
    api_base: str,
    api_key: str,
    model_name: str,
    messages: List[Dict[str, Any]],
    mode: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Run one non-streaming chat completion and return its text.

    Thinking is always disabled for this call; *mode* is forwarded as the
    chat-template mode through ``extra_body``.

    Returns:
        The first choice's message content, or ``""`` when the response has
        no choices or the content is missing.
    """
    client = OpenAI(base_url=api_base, api_key=api_key)
    chat = client.chat.completions.create(
        model=model_name,
        temperature=float(temperature),
        max_tokens=int(max_tokens),
        messages=messages,
        extra_body={
            "chat_template_kwargs": {
                "mode": mode,
                "enable_thinking": False,
            }
        },
    )
    if not chat.choices:
        return ""
    # The message content may be None; normalise it so the declared ``str``
    # return type holds for callers.
    return chat.choices[0].message.content or ""
| # ---------------- Inference orchestration ---------------- | |
def prepare_input(context_text: str, context_image_path: Optional[str]) -> Any:
    """Pick the model input: image bytes when a path is given, else text.

    Returns ``{"bytes": <file contents>}`` for a truthy image path,
    otherwise *context_text* (or ``""`` when it is falsy).
    """
    if not context_image_path:
        return context_text or ""
    return {"bytes": file_path_to_bytes(context_image_path)}
def infer_stream(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    template: str,
    instruction: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
    reasoning: bool,
    markdown_mode: bool,
):
    """Run a streaming inference and yield display-ready updates.

    Each yielded dict has ``mode`` (``"markdown"`` or ``"structured"``),
    ``output`` (text to display, or a placeholder while nothing is
    available) and ``think`` (the reasoning trace, empty when reasoning is
    off).

    Raises:
        ValueError: In markdown mode without an image input. As this is a
            generator, the error surfaces on first iteration.
    """
    waiting_note = "_(Waiting for output after `</think>`.)_"
    mode_label = "markdown" if markdown_mode else "structured"

    payload = prepare_input(context_text, context_image_path)
    has_image = isinstance(payload, dict) and "bytes" in payload

    if markdown_mode:
        if not has_image:
            raise ValueError("Markdown conversion requires an image input.")
        messages = collate_markdown_image_only(
            image_bytes=payload["bytes"],
            system_prompt=system_prompt,
        )
        template_json = ""
    else:
        messages, template_json = collate_single_input(
            text_or_image=payload,
            template=template,
            system_prompt=system_prompt,
            instruction=instruction,
        )

    snapshots = call_model_stream(
        api_base=api_base,
        api_key=api_key,
        model_name=DEFAULT_MODEL,
        messages=messages,
        template_json=template_json,
        temperature=temperature,
        max_tokens=DEFAULT_MAX_TOKENS,
        reasoning=reasoning,
        instruction=instruction,
        markdown_mode=markdown_mode,
    )

    for partial_text in snapshots:
        trace, output_text = split_reasoning_and_output(
            partial_text,
            reasoning_enabled=reasoning,
        )
        think = trace if reasoning else ""

        if markdown_mode:
            shown = output_text or (
                waiting_note if reasoning else "_(Empty output.)_"
            )
        else:
            if not reasoning:
                # Without reasoning, the whole (unstripped) text is the output.
                output_text = partial_text or ""
            shown = extract_answer_block(output_text) or (
                waiting_note if reasoning else "_(No output found yet.)_"
            )
            if shown.strip().startswith(("{", "[")):
                # Looks like JSON: pretty-print and wrap in a json code fence.
                shown = f"```json\n{pretty_json_or_text(shown)}\n```"
            else:
                # Turn literal backslash-n sequences into real line breaks.
                shown = shown.replace("\\n", "\n")

        yield {
            "mode": mode_label,
            "output": shown,
            "think": think,
        }
def infer_template_generation(
    *,
    api_key: str,
    api_base: str,
    system_prompt: str,
    context_text: str,
    context_image_path: Optional[str],
    temperature: float,
) -> str:
    """Ask the model for an extraction template and return it formatted.

    The request uses the ``"template-generation"`` mode with the default
    model and token limit. The reply is pretty-printed when it is valid
    JSON and returned as cleaned text otherwise.
    """
    prompt_messages = collate_for_template_generation(
        context_text=context_text,
        context_image_path=context_image_path,
        system_prompt=system_prompt,
    )
    raw_template = call_model_once(
        api_base=api_base,
        api_key=api_key,
        model_name=DEFAULT_MODEL,
        messages=prompt_messages,
        mode="template-generation",
        temperature=temperature,
        max_tokens=DEFAULT_MAX_TOKENS,
    )
    return pretty_json_or_text(raw_template)
| # ---------------- UI styling ---------------- | |
| CSS = """ | |
| :root { | |
| color-scheme: light; | |
| --bg: #f6f2eb; | |
| --panel: #ffffff; | |
| --panel-rgb: 255, 255, 255; | |
| --panel-strong-rgb: 255, 252, 246; | |
| --input-rgb: 255, 255, 255; | |
| --border-blue: rgba(67, 111, 148, 0.30); | |
| --border-blue-soft: rgba(67, 111, 148, 0.18); | |
| --border-input: rgba(67, 111, 148, 0.22); | |
| --border-orange-soft: rgba(190, 103, 36, 0.26); | |
| --text: #23252b; | |
| --text-strong: #101318; | |
| --text-on-accent: #101318; | |
| --muted: #5f6673; | |
| --muted-2: #7d8490; | |
| --logo-blue: #5d9bcf; | |
| --logo-orange: #d6742f; | |
| --green: #178f66; | |
| --card-alpha: 0.88; | |
| --header-alpha: 0.82; | |
| --input-alpha: 0.94; | |
| --shadow: rgba(54, 46, 35, 0.14); | |
| --inset-highlight: rgba(255, 255, 255, 0.85); | |
| --logo-opacity: 0.18; | |
| --focus-ring: rgba(67, 111, 148, 0.26); | |
| --code-bg: #fdfaf5; | |
| --dropzone-bg: #fbf8f2; | |
| } | |
| html.dark, | |
| body.dark, | |
| .dark, | |
| [data-theme="dark"] { | |
| color-scheme: dark; | |
| --bg: #242529; | |
| --panel: #1d1f26; | |
| --panel-rgb: 29, 31, 38; | |
| --panel-strong-rgb: 21, 22, 26; | |
| --input-rgb: 12, 14, 19; | |
| --border-blue: rgba(135, 183, 224, 0.24); | |
| --border-blue-soft: rgba(135, 183, 224, 0.16); | |
| --border-input: rgba(135, 183, 224, 0.14); | |
| --border-orange-soft: rgba(228, 132, 58, 0.22); | |
| --text: #eef0f4; | |
| --text-strong: #ffffff; | |
| --text-on-accent: #101318; | |
| --muted: #969baa; | |
| --muted-2: #737988; | |
| --logo-blue: #87b7e0; | |
| --logo-orange: #e4843a; | |
| --green: #31c48d; | |
| --card-alpha: 0.66; | |
| --header-alpha: 0.42; | |
| --input-alpha: 0.78; | |
| --shadow: rgba(0, 0, 0, 0.28); | |
| --inset-highlight: rgba(255, 255, 255, 0.055); | |
| --logo-opacity: 0.88; | |
| --focus-ring: rgba(135, 183, 224, 0.32); | |
| --code-bg: rgba(12, 14, 19, 0.78); | |
| --dropzone-bg: rgba(12, 14, 19, 0.78); | |
| } | |
| @media (prefers-color-scheme: dark) { | |
| :root:not([data-theme="light"]) { | |
| color-scheme: dark; | |
| --bg: #242529; | |
| --panel: #1d1f26; | |
| --panel-rgb: 29, 31, 38; | |
| --panel-strong-rgb: 21, 22, 26; | |
| --input-rgb: 12, 14, 19; | |
| --border-blue: rgba(135, 183, 224, 0.24); | |
| --border-blue-soft: rgba(135, 183, 224, 0.16); | |
| --border-input: rgba(135, 183, 224, 0.14); | |
| --border-orange-soft: rgba(228, 132, 58, 0.22); | |
| --text: #eef0f4; | |
| --text-strong: #ffffff; | |
| --text-on-accent: #101318; | |
| --muted: #969baa; | |
| --muted-2: #737988; | |
| --logo-blue: #87b7e0; | |
| --logo-orange: #e4843a; | |
| --green: #31c48d; | |
| --card-alpha: 0.66; | |
| --header-alpha: 0.42; | |
| --input-alpha: 0.78; | |
| --shadow: rgba(0, 0, 0, 0.28); | |
| --inset-highlight: rgba(255, 255, 255, 0.055); | |
| --logo-opacity: 0.88; | |
| --focus-ring: rgba(135, 183, 224, 0.32); | |
| --code-bg: rgba(12, 14, 19, 0.78); | |
| --dropzone-bg: rgba(12, 14, 19, 0.78); | |
| } | |
| } | |
| html, | |
| body, | |
| footer, | |
| .gradio-container { | |
| color: var(--text) !important; | |
| } | |
| html, | |
| body { | |
| min-height: 100vh !important; | |
| width: 100% !important; | |
| margin: 0 !important; | |
| overflow-x: hidden !important; | |
| } | |
| body { | |
| background: var(--bg) !important; | |
| background-attachment: fixed !important; | |
| } | |
| footer { | |
| background: transparent !important; | |
| } | |
| .gradio-container { | |
| position: relative !important; | |
| isolation: isolate !important; | |
| max-width: none !important; | |
| width: 100% !important; | |
| min-height: 100vh !important; | |
| padding: 10px 18px 18px 18px !important; | |
| background: transparent !important; | |
| box-sizing: border-box !important; | |
| } | |
| .gradio-container::before { | |
| content: ""; | |
| position: fixed; | |
| inset: 0; | |
| z-index: -2; | |
| pointer-events: none; | |
| background-image: url("__LOGO_URL__"); | |
| background-repeat: no-repeat; | |
| background-size: min(86vw, 980px) min(86vw, 980px); | |
| background-position: calc(100% + 230px) 34px; | |
| opacity: var(--logo-opacity); | |
| filter: saturate(1.2) drop-shadow(0 0 28px rgba(135, 183, 224, 0.14)); | |
| } | |
| .with-gap, | |
| .gradio-row { | |
| gap: 18px !important; | |
| } | |
| .gradio-row { | |
| width: 100% !important; | |
| } | |
| .app-header { | |
| position: relative; | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| gap: 16px; | |
| padding: 10px 12px 14px 12px; | |
| margin-bottom: 10px; | |
| border-bottom: 1px solid var(--border-blue-soft); | |
| background: rgba(var(--panel-strong-rgb), var(--header-alpha)); | |
| border-radius: 14px; | |
| backdrop-filter: blur(8px); | |
| box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight); | |
| } | |
| .brand { | |
| display: flex; | |
| align-items: center; | |
| gap: 10px; | |
| } | |
| .brand-mark { | |
| width: 28px; | |
| height: 28px; | |
| flex: 0 0 auto; | |
| object-fit: contain; | |
| } | |
| .brand-title { | |
| display: flex; | |
| align-items: baseline; | |
| gap: 8px; | |
| } | |
| .brand-name { | |
| font-size: 23px; | |
| line-height: 1; | |
| font-weight: 750; | |
| letter-spacing: -0.045em; | |
| color: var(--text-strong) !important; | |
| } | |
| .brand-name span { | |
| color: var(--muted) !important; | |
| } | |
| .model-chip { | |
| display: inline-flex; | |
| align-items: center; | |
| max-width: 520px; | |
| padding: 5px 9px; | |
| border-radius: 999px; | |
| background: rgba(var(--panel-rgb), 0.88); | |
| border: 1px solid var(--border-blue-soft); | |
| color: var(--muted) !important; | |
| font-size: 12px; | |
| white-space: nowrap; | |
| overflow: hidden; | |
| text-overflow: ellipsis; | |
| text-decoration: none !important; | |
| cursor: pointer; | |
| transition: border-color 0.15s ease, transform 0.15s ease, background 0.15s ease; | |
| } | |
| .model-chip:hover { | |
| border-color: var(--logo-blue); | |
| background: rgba(var(--panel-rgb), 1); | |
| transform: translateY(-1px); | |
| text-decoration: none !important; | |
| } | |
| .model-chip:focus-visible { | |
| outline: none; | |
| box-shadow: 0 0 0 3px var(--focus-ring); | |
| } | |
| .model-chip code { | |
| color: var(--text-strong) !important; | |
| background: transparent !important; | |
| } | |
| .header-actions { | |
| display: flex; | |
| align-items: center; | |
| gap: 10px; | |
| color: var(--muted) !important; | |
| font-size: 13px; | |
| } | |
| .status-dot { | |
| width: 8px; | |
| height: 8px; | |
| border-radius: 99px; | |
| background: var(--green); | |
| box-shadow: 0 0 14px rgba(49, 196, 141, 0.65); | |
| } | |
| .intro-card { | |
| margin: 0 0 16px 0; | |
| padding: 14px 16px; | |
| border-radius: 14px; | |
| background: rgba(var(--panel-rgb), var(--card-alpha)); | |
| border: 1px solid var(--border-blue-soft); | |
| box-shadow: 0 12px 42px var(--shadow), inset 0 1px 0 var(--inset-highlight); | |
| backdrop-filter: blur(8px); | |
| } | |
| .intro-card p { | |
| margin: 0 0 8px 0; | |
| line-height: 1.5; | |
| } | |
| .section-title { | |
| margin: 0 0 8px 0; | |
| color: var(--text-strong) !important; | |
| font-size: 13px; | |
| font-weight: 750; | |
| letter-spacing: 0.01em; | |
| } | |
| .main-card, | |
| .output-card, | |
| .gradio-group { | |
| background: rgba(var(--panel-rgb), var(--card-alpha)) !important; | |
| border: 1px solid var(--border-blue) !important; | |
| border-radius: 14px !important; | |
| box-shadow: 0 22px 70px var(--shadow), inset 0 1px 0 var(--inset-highlight) !important; | |
| backdrop-filter: blur(10px) saturate(1.18); | |
| } | |
| .main-card { | |
| width: 100% !important; | |
| box-sizing: border-box !important; | |
| } | |
| .output-card { | |
| min-height: 720px !important; | |
| max-height: 860px !important; | |
| border-color: var(--border-orange-soft) !important; | |
| width: 100% !important; | |
| box-sizing: border-box !important; | |
| overflow: hidden !important; | |
| } | |
| label, | |
| .markdown, | |
| .prose, | |
| h1, | |
| h2, | |
| h3, | |
| h4, | |
| h5, | |
| h6, | |
| p, | |
| span, | |
| div { | |
| color: var(--text) !important; | |
| } | |
| .section-title, | |
| label > span, | |
| .gradio-container label { | |
| color: var(--text-strong) !important; | |
| } | |
| .secondary-note { | |
| color: var(--muted) !important; | |
| font-size: 12px; | |
| line-height: 1.35; | |
| } | |
| textarea, | |
| input[type="text"], | |
| input[type="password"], | |
| input[type="number"], | |
| input[type="email"], | |
| .cm-editor { | |
| background: rgba(var(--input-rgb), var(--input-alpha)) !important; | |
| color: var(--text) !important; | |
| border-color: var(--border-input) !important; | |
| } | |
| textarea::placeholder, | |
| input::placeholder { | |
| color: var(--muted-2) !important; | |
| } | |
| textarea:focus, | |
| input:focus, | |
| .cm-editor.cm-focused { | |
| border-color: var(--logo-blue) !important; | |
| box-shadow: 0 0 0 3px var(--focus-ring) !important; | |
| } | |
| input[type="checkbox"] { | |
| accent-color: var(--logo-blue) !important; | |
| } | |
| #schema-box .cm-editor { | |
| min-height: 410px !important; | |
| max-height: 480px !important; | |
| background: var(--code-bg) !important; | |
| } | |
| .cm-editor, | |
| .cm-scroller, | |
| .cm-content, | |
| .cm-line, | |
| .cm-gutters, | |
| .cm-activeLine, | |
| .cm-activeLineGutter { | |
| background: var(--code-bg) !important; | |
| color: var(--text) !important; | |
| } | |
| .cm-gutters { | |
| border-color: var(--border-blue-soft) !important; | |
| color: var(--muted-2) !important; | |
| } | |
| .cm-cursor { | |
| border-left-color: var(--text-strong) !important; | |
| } | |
| #image-box { | |
| min-height: 335px !important; | |
| background: var(--dropzone-bg) !important; | |
| border-color: var(--border-blue-soft) !important; | |
| } | |
| #image-box, | |
| #image-box *, | |
| .upload-container, | |
| .upload-container *, | |
| .file-preview, | |
| .file-preview * { | |
| color: var(--text) !important; | |
| } | |
| #image-box button, | |
| #image-box .icon-wrap, | |
| #image-box .wrap { | |
| background: transparent !important; | |
| } | |
| #reasoning-box { | |
| min-height: 180px !important; | |
| max-height: 240px !important; | |
| overflow: auto !important; | |
| padding: 8px; | |
| border-radius: 8px; | |
| background: rgba(var(--input-rgb), var(--input-alpha)) !important; | |
| border: 1px solid var(--border-blue-soft); | |
| white-space: pre-wrap !important; | |
| overflow-wrap: anywhere !important; | |
| word-break: break-word !important; | |
| } | |
| #output-box { | |
| min-height: 360px !important; | |
| max-height: 520px !important; | |
| overflow: auto !important; | |
| padding: 8px; | |
| border-radius: 8px; | |
| background: rgba(var(--input-rgb), var(--input-alpha)) !important; | |
| border: 1px solid var(--border-blue-soft); | |
| white-space: pre-wrap !important; | |
| overflow-wrap: anywhere !important; | |
| word-break: break-word !important; | |
| } | |
| #reasoning-box pre, | |
| #reasoning-box code, | |
| #output-box pre, | |
| #output-box code { | |
| white-space: pre-wrap !important; | |
| overflow-wrap: anywhere !important; | |
| word-break: break-word !important; | |
| color: var(--text) !important; | |
| background: transparent !important; | |
| } | |
| button { | |
| border-radius: 9px !important; | |
| min-height: 34px !important; | |
| } | |
| button.primary-button, | |
| .primary-button button, | |
| .primary-button { | |
| background: var(--logo-blue) !important; | |
| background-color: var(--logo-blue) !important; | |
| color: var(--text-on-accent) !important; | |
| border: none !important; | |
| font-weight: 750 !important; | |
| } | |
| button.markdown-button, | |
| .markdown-button button, | |
| .markdown-button { | |
| background: var(--logo-orange) !important; | |
| background-color: var(--logo-orange) !important; | |
| color: var(--text-on-accent) !important; | |
| border: none !important; | |
| font-weight: 750 !important; | |
| } | |
| .clear-button button, | |
| button.clear-button, | |
| .clear-button { | |
| background: transparent !important; | |
| background-color: transparent !important; | |
| color: var(--muted) !important; | |
| border: 1px solid var(--border-blue-soft) !important; | |
| } | |
| .gradio-container .wrap, | |
| .gradio-container .block, | |
| .gradio-container .form, | |
| .gradio-container .panel, | |
| .gradio-container .tabs, | |
| .gradio-container .tabitem { | |
| background: transparent !important; | |
| color: var(--text) !important; | |
| } | |
| .gradio-accordion { | |
| border-color: var(--border-blue-soft) !important; | |
| } | |
| .gradio-container table, | |
| .gradio-container th, | |
| .gradio-container td { | |
| color: var(--text) !important; | |
| } | |
| .gradio-container label, | |
| .gradio-container label span, | |
| .gradio-container .label-wrap, | |
| .gradio-container .label-wrap span { | |
| color: var(--text-strong) !important; | |
| } | |
| @media (max-width: 1100px) { | |
| .gradio-container { | |
| width: 100% !important; | |
| padding: 10px 12px 18px 12px !important; | |
| } | |
| .app-header { | |
| align-items: flex-start; | |
| flex-direction: column; | |
| } | |
| .brand-title { | |
| align-items: flex-start; | |
| flex-direction: column; | |
| } | |
| .model-chip { | |
| max-width: 100%; | |
| } | |
| .output-card { | |
| min-height: 520px !important; | |
| max-height: none !important; | |
| } | |
| #reasoning-box { | |
| min-height: 160px !important; | |
| max-height: 220px !important; | |
| } | |
| #output-box { | |
| min-height: 320px !important; | |
| max-height: 480px !important; | |
| } | |
| } | |
| """.replace("__LOGO_URL__", LOGO_URL or "") | |
| # ---------------- Gradio app ---------------- | |
| with gr.Blocks( | |
| title="NuExtract3", | |
| css=CSS, | |
| theme=gr.themes.Base( | |
| primary_hue="blue", | |
| secondary_hue="orange", | |
| neutral_hue="slate", | |
| ), | |
| ) as demo: | |
# Brand mark: the logo image when LOGO_URL is set, otherwise an empty
# placeholder element that carries the same CSS class.
logo_html = (
    f'<img class="brand-mark" src="{LOGO_URL}" alt="NuExtract logo" />'
    if LOGO_URL
    else '<div class="brand-mark"></div>'
)
# Page banner: brand mark, product name, a chip linking to the model card,
# and a static endpoint label. The brand-name <div> is closed by </div> only;
# an unmatched </span> inside it would make the markup invalid.
gr.HTML(
    f"""
<header class="app-header">
<div class="brand">
{logo_html}
<div class="brand-title">
<div class="brand-name">NuExtract3</div>
<a
class="model-chip"
href="https://huggingface.co/numind/NuExtract3"
target="_blank"
rel="noopener noreferrer">
Model <code>{DEFAULT_MODEL}</code>
</a>
</div>
</div>
<div class="header-actions">
<span class="status-dot"></span>
<span>OpenAI-compatible endpoint</span>
</div>
</header>
"""
)
# Introductory card: model summary, feature list and external links.
# The content is raw HTML handed to a Markdown component styled by the
# "intro-card" CSS class.
gr.Markdown(
    elem_classes=["intro-card"],
    value="""
<div style="padding: 0.25rem 0 1rem 0;">
<h1 style="margin-top: 0; margin-bottom: 0.25rem;">NuExtract3</h1>
<p style="font-size: 1.05rem; line-height: 1.6;">
<strong>NuExtract3</strong> is a unified <strong>4B vision-language reasoning model</strong>
for document understanding.
</p>
<p style="line-height: 1.6;">
It combines <strong>structured information extraction</strong> with high-quality
<strong>image-to-Markdown conversion</strong>, making it useful for OCR, RAG preprocessing,
and extraction pipelines across scans, receipts, forms, invoices, contracts, tables, and more.
</p>
<h3>Features</h3>
<ul>
<li><strong>Structured extraction:</strong> text/images + JSON template + instructions → JSON output</li>
<li><strong>Markdown conversion:</strong> text/images → Markdown</li>
<li><strong>Multimodal inputs:</strong> text, images, or text + images</li>
<li><strong>Multilingual documents</strong></li>
<li><strong>Reasoning and non-reasoning inference modes</strong></li>
<li><strong>Template generation</strong> from natural language or input documents</li>
</ul>
<p align="center">
🤗 <a href="https://huggingface.co/numind/NuExtract3">Model</a> |
🖥️ <a href="https://nuextract.ai/">API / Platform</a> |
📑 <a href="https://numind.ai/blog">Blog</a> |
🗣️ <a href="https://discord.gg/3tsEtJNCDe">Discord</a> |
🛠️ <a href="https://github.com/numindai/nuextract">GitHub</a>
</p>
</div>
""",
)
| with gr.Row(equal_height=True): | |
| # Left: input, schema, controls | |
| with gr.Column(scale=1, min_width=520): | |
# Input card: the document to process, supplied as an image and/or pasted text.
with gr.Group(elem_classes="main-card"):
    gr.HTML("<div class='section-title'>Input</div>")
    # type="filepath": the click handlers forward this value as
    # `context_image_path`.
    context_image = gr.Image(
        elem_id="image-box",
        label="Image",
        type="filepath",
        sources=["upload", "clipboard"],
        height=340,
    )
    context_text = gr.Textbox(
        label="Text",
        lines=3,
        max_lines=5,
        placeholder="Optional: paste document text.",
    )
# Schema card: free-form instructions, the JSON extraction template, and the
# button that asks the model to propose a template.
with gr.Group(elem_classes="main-card"):
    gr.HTML("<div class='section-title'>Schema & instructions</div>")
    instruction = gr.Textbox(
        label="Instructions",
        placeholder="Optional extraction instructions.",
        lines=2,
        max_lines=3,
    )
    with gr.Row(equal_height=True):
        # Starter template shown on load; users are expected to edit it.
        template = gr.Code(
            label="Template",
            language="json",
            value=json.dumps(
                {
                    "title": "string",
                    "entities": ["string"],
                    "dates": ["YYYY-MM-DD"],
                    "amounts": [
                        {
                            "value": "number",
                            "currency": "string",
                        }
                    ],
                },
                indent=4,
            ),
            lines=16,
            scale=5,
            elem_id="schema-box",
        )
        with gr.Column(scale=2, min_width=150):
            generate_template_btn = gr.Button(
                "Generate template",
                variant="secondary",
            )
            # Help text. The template handler accepts an image or pasted
            # text, so the note mentions both.
            gr.HTML(
                "<div class='secondary-note'>"
                "Use Extract for JSON. Use Markdown to convert an image document. "
                "Use Generate template to generate a well-formatted template "
                "from an input image or text."
                "</div>"
            )
# Run card: the two inference actions on the first row, stop/clear on the second.
with gr.Group(elem_classes="main-card"):
    gr.HTML("<div class='section-title'>Run</div>")
    with gr.Row():
        # Both actions use the "secondary" variant; their colours come from
        # the custom CSS classes attached below.
        extract_btn = gr.Button(
            "Extract JSON",
            elem_classes=["primary-button"],
            variant="secondary",
        )
        markdown_btn = gr.Button(
            "Convert to Markdown",
            elem_classes=["markdown-button"],
            variant="secondary",
        )
    with gr.Row():
        stop_btn = gr.Button("Stop", variant="stop")
        clear_btn = gr.Button(
            "Clear results",
            elem_classes=["clear-button"],
            variant="secondary",
        )
# Generation controls forwarded to the inference handlers.
reasoning_checkbox = gr.Checkbox(
    value=True,
    label="Reasoning",
    info="If enabled, reasoning is everything before </think>.",
    interactive=True,
)
# Sampling temperature in [0, 1]; starts at 0.0 and moves in 0.05 steps.
temperature = gr.Slider(
    minimum=0.0,
    maximum=1,
    step=0.05,
    value=0.0,
    label="Temperature",
    info="Higher values make the output less deterministic but can improve reasoning performance (around 0.4-0.6)",
)
# Collapsed panel of structured-extraction examples. When none are available
# it shows setup instructions instead.
with gr.Accordion("Structured examples", open=False):
    if not STRUCTURED_EXAMPLES:
        gr.Markdown(
            f"""
No structured examples found.
Add files referenced in `STRUCTURED_EXAMPLE_TEMPLATES`, for example:
```text
{EXAMPLE_DIR}/1.jpg
{EXAMPLE_DIR}/2.png
```
"""
        )
    else:
        # Selecting an example fills the image, template and instruction
        # inputs; nothing is pre-computed (cache_examples=False).
        gr.Examples(
            label="Load structured example",
            examples=STRUCTURED_EXAMPLES,
            inputs=[context_image, template, instruction],
            cache_examples=False,
            examples_per_page=8,
        )
# Collapsed panel of image-to-Markdown examples. When none are available it
# shows setup instructions instead.
with gr.Accordion("Markdown examples", open=False):
    if not MARKDOWN_EXAMPLES:
        gr.Markdown(
            f"""
No Markdown examples found.
Add image paths to `MARKDOWN_EXAMPLE_IMAGE_PATHS`, for example:
```python
MARKDOWN_EXAMPLE_IMAGE_PATHS = [
"markdown_1.png",
"markdown_2.jpg",
"/home/user/app/examples/report.png",
]
```
Relative paths are resolved from:
```text
{EXAMPLE_DIR}
```
"""
        )
    else:
        # Markdown examples only populate the image input.
        gr.Examples(
            label="Load Markdown example",
            examples=MARKDOWN_EXAMPLES,
            inputs=[context_image],
            cache_examples=False,
            examples_per_page=8,
        )
# Endpoint settings are intentionally hidden from the UI: they are held in
# gr.State so the handlers still receive them as ordinary inputs.
api_base = gr.State(value=DEFAULT_API_BASE)
api_key = gr.State(value=DEFAULT_API_KEY)
system_prompt = gr.State(value=SYSTEM_PROMPT_DEFAULT)
| # Right: reasoning + output | |
| with gr.Column(scale=1, min_width=520): | |
# Output card: streamed reasoning on top, final model output underneath.
# The element ids are the hooks used by the #reasoning-box / #output-box CSS.
with gr.Group(elem_classes="output-card"):
    gr.HTML("<div class='section-title'>Reasoning</div>")
    reasoning_md = gr.Markdown(
        elem_id="reasoning-box",
        label="Reasoning",
    )
    gr.HTML("<div class='section-title' style='margin-top: 12px;'>Output</div>")
    output_md = gr.Markdown(
        elem_id="output-box",
        label="Output",
    )
# Error panel: hidden until a handler reveals it with an error message.
error_box = gr.Markdown(
    visible=False,
)
def run_model_click(
    api_key_val,
    api_base_val,
    system_prompt_val,
    instruction_val,
    template_val,
    context_text_val,
    context_image_val,
    temperature_val,
    reasoning_val,
    markdown_mode_val,
):
    """Validate the inputs, then stream model results into the result panes.

    Shared generator behind the Extract and Markdown buttons. Every yielded
    value is a 3-tuple of ``gr.update`` objects for the reasoning pane, the
    output pane and the error panel, in that order.

    Markdown mode requires an image; extract mode requires an image or
    non-blank text. A failed check yields a single error update and stops.
    Any exception raised while streaming is printed and shown in the error
    panel as a traceback.
    """

    def _failure(error_markdown):
        # Blank both result panes and reveal the error panel.
        return (
            gr.update(value=""),
            gr.update(value=""),
            gr.update(visible=True, value=error_markdown),
        )

    if markdown_mode_val:
        mode_name = "Markdown"
    else:
        mode_name = "Extract"
    print(f"[button] {mode_name} clicked", flush=True)
    print(f"[button] image={context_image_val}", flush=True)
    print(f"[button] text_len={len(context_text_val or '')}", flush=True)
    print(f"[button] reasoning={bool(reasoning_val)}", flush=True)

    # Input validation: the first failing rule wins.
    problem = None
    if markdown_mode_val and not context_image_val:
        problem = "Markdown conversion requires a document image."
    elif not context_image_val and not (context_text_val or "").strip():
        problem = "Please provide a document image or paste document text."
    if problem is not None:
        yield _failure(f"### Error\n{problem}")
        return

    try:
        received_chunk = False
        for chunk in infer_stream(
            api_key=api_key_val,
            api_base=api_base_val,
            system_prompt=system_prompt_val,
            template=template_val,
            instruction=instruction_val,
            context_text=context_text_val,
            context_image_path=context_image_val,
            temperature=temperature_val,
            reasoning=bool(reasoning_val),
            markdown_mode=bool(markdown_mode_val),
        ):
            received_chunk = True
            thinking = chunk.get("think") or ""
            answer = chunk.get("output") or "_(Empty output.)_"
            # Reasoning goes into a fenced text block so it renders verbatim.
            if thinking:
                reasoning_view = f"```text\n{thinking}\n```"
            else:
                reasoning_view = ""
            yield (
                gr.update(value=reasoning_view),
                gr.update(value=answer),
                gr.update(visible=False, value=""),
            )
        if not received_chunk:
            yield _failure("### Error\nThe model returned no streamed output.")
    except Exception:
        import traceback

        trace_text = traceback.format_exc()
        print(trace_text, flush=True)
        yield _failure(f"### Error\n```text\n{trace_text}\n```")
def on_extract_click(
    api_key_val,
    api_base_val,
    system_prompt_val,
    instruction_val,
    template_val,
    context_text_val,
    context_image_val,
    temperature_val,
    reasoning_val,
):
    """Handle the "Extract JSON" button.

    Delegates to ``run_model_click`` with Markdown mode switched off and
    re-yields everything it produces.
    """
    forwarded = (
        api_key_val,
        api_base_val,
        system_prompt_val,
        instruction_val,
        template_val,
        context_text_val,
        context_image_val,
        temperature_val,
        reasoning_val,
    )
    # The trailing False is run_model_click's markdown_mode_val.
    yield from run_model_click(*forwarded, False)
def on_markdown_click(
    api_key_val,
    api_base_val,
    system_prompt_val,
    instruction_val,
    template_val,
    context_text_val,
    context_image_val,
    temperature_val,
    reasoning_val,
):
    """Handle the "Convert to Markdown" button.

    Delegates to ``run_model_click`` with Markdown mode switched on and
    re-yields everything it produces.
    """
    forwarded = (
        api_key_val,
        api_base_val,
        system_prompt_val,
        instruction_val,
        template_val,
        context_text_val,
        context_image_val,
        temperature_val,
        reasoning_val,
    )
    # The trailing True is run_model_click's markdown_mode_val.
    yield from run_model_click(*forwarded, True)
def on_click_generate_template(
    api_key_val,
    api_base_val,
    system_prompt_val,
    context_text_val,
    context_image_val,
    temperature_val,
):
    """Handle the "Generate template" button.

    Returns a pair of ``gr.update`` objects for the template editor and the
    error panel. On success the editor receives the text returned by
    ``infer_template_generation`` and the error panel is hidden. With no
    image and only blank text, or when generation raises, the editor is left
    unchanged and the error panel is shown.
    """
    print("[button] Generate template clicked", flush=True)

    has_image = bool(context_image_val)
    if not has_image and not (context_text_val or "").strip():
        untouched_template = gr.update()
        missing_input = gr.update(
            visible=True,
            value="### Error\nPlease provide a document image or paste document text.",
        )
        return (untouched_template, missing_input)

    try:
        generated = infer_template_generation(
            api_key=api_key_val,
            api_base=api_base_val,
            system_prompt=system_prompt_val,
            context_text=context_text_val,
            context_image_path=context_image_val,
            temperature=temperature_val,
        )
        return gr.update(value=generated), gr.update(visible=False, value="")
    except Exception:
        import traceback

        trace_text = traceback.format_exc()
        print(trace_text, flush=True)
        untouched_template = gr.update()
        failure = gr.update(visible=True, value=f"### Error\n```text\n{trace_text}\n```")
        return (untouched_template, failure)
def on_clear():
    """Reset the result area: blank reasoning and output, hide the error panel."""
    blank_reasoning = gr.update(value="")
    blank_output = gr.update(value="")
    hidden_error = gr.update(visible=False, value="")
    return (blank_reasoning, blank_output, hidden_error)
# Components read by both streaming handlers. The order must match the
# positional parameters of on_extract_click / on_markdown_click.
common_inputs = [
    api_key,
    api_base,
    system_prompt,
    instruction,
    template,
    context_text,
    context_image,
    temperature,
    reasoning_checkbox,
]
# Components written by the streaming handlers and by on_clear, in the order
# of the tuples they produce.
common_outputs = [reasoning_md, output_md, error_box]

# The two streaming actions keep their event handles so Stop can cancel them.
extract_event = extract_btn.click(
    fn=on_extract_click,
    inputs=common_inputs,
    outputs=common_outputs,
    show_progress=True,
)
markdown_event = markdown_btn.click(
    fn=on_markdown_click,
    inputs=common_inputs,
    outputs=common_outputs,
    show_progress=True,
)

# Stop runs no function of its own; it only cancels the streaming events.
stop_btn.click(
    fn=None,
    inputs=None,
    outputs=None,
    cancels=[extract_event, markdown_event],
)

clear_btn.click(fn=on_clear, inputs=None, outputs=common_outputs)

# Template generation takes a subset of the inputs and writes the template
# editor plus the shared error panel.
generate_template_btn.click(
    fn=on_click_generate_template,
    inputs=[
        api_key,
        api_base,
        system_prompt,
        context_text,
        context_image,
        temperature,
    ],
    outputs=[template, error_box],
    show_progress=True,
)
if __name__ == "__main__":
    # Pass only directories that exist to launch(allowed_paths=...).
    # If neither exists, None is passed instead of an empty list.
    allowed_paths = [
        str(directory)
        for directory in (ASSETS_DIR, EXAMPLE_DIR)
        if directory.exists()
    ]
    demo.queue().launch(
        server_name=ARGS.server_name,
        server_port=ARGS.server_port,
        share=ARGS.share,
        show_error=True,
        allowed_paths=allowed_paths or None,
    )