Spaces:
Sleeping
Sleeping
| import os | |
| import gc | |
| import json | |
| import base64 | |
| import time | |
| from pathlib import Path | |
| from io import BytesIO | |
| from threading import Thread | |
| from urllib.parse import urlparse | |
| from urllib.request import urlopen | |
| import gradio as gr | |
| import requests | |
| import spaces | |
| import torch | |
| from PIL import Image | |
| from huggingface_hub import snapshot_download | |
| from transformers import ( | |
| LightOnOcrForConditionalGeneration, | |
| LightOnOcrProcessor, | |
| Qwen2VLForConditionalGeneration, | |
| Qwen3VLForConditionalGeneration, | |
| Qwen2_5_VLForConditionalGeneration, | |
| AutoModelForCausalLM, | |
| AutoProcessor, | |
| AutoModel, | |
| AutoTokenizer, | |
| TextIteratorStreamer, | |
| ) | |
# Force synchronous CUDA kernel launches so CUDA errors surface at the
# offending call site instead of asynchronously later (debug aid).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Generation-length limits (in tokens); comments preserve prior values.
MAX_MAX_NEW_TOKENS = 8192  # was 4096
DEFAULT_MAX_NEW_TOKENS = 4096  # was 1024
# Rough cap on prompt size, overridable via the environment.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "10000"))
# Prefer the first CUDA device when available, else fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Startup diagnostics written to the Space logs.
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("current device:", torch.cuda.current_device())
    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
print("Using device:", device)
| def get_attention_fallbacks() -> list[str | None]: | |
| fallbacks = [] | |
| if torch.cuda.is_available() and os.getenv("USE_FLASH_ATTN", "0") == "1": | |
| fallbacks.append("kernels-community/flash-attn2") | |
| if torch.cuda.is_available(): | |
| fallbacks.append("sdpa") | |
| fallbacks.append("eager") | |
| fallbacks.append(None) | |
| return fallbacks | |
def load_model_with_attention_fallback(model_cls, model_id, **kwargs):
    """Load *model_id* with the first attention backend that succeeds.

    Iterates over get_attention_fallbacks(); a backend of None removes any
    attn_implementation override so the library default is used. If every
    backend fails, the last loading error is re-raised.
    """
    last_error = None
    for backend in get_attention_fallbacks():
        attempt_kwargs = dict(kwargs)
        label = backend or "default"
        if backend is None:
            attempt_kwargs.pop("attn_implementation", None)
        else:
            attempt_kwargs["attn_implementation"] = backend
        try:
            print(f"Loading {model_id} with attention backend: {label}")
            return model_cls.from_pretrained(model_id, **attempt_kwargs)
        except Exception as err:
            last_error = err
            print(f"Failed loading {model_id} with attention backend {label}: {err}")
    raise last_error
def patch_dots_ocr_configuration(repo_path: str) -> None:
    """Patch dots.OCR's bundled configuration_dots.py for newer transformers.

    Applies up to three one-shot text substitutions to DotsVLProcessor:
    declare the processor `attributes` list, accept a `video_processor`
    argument in __init__, and forward it to the superclass constructor.
    Idempotent: each replacement is guarded by a containment check and runs
    at most once; the function is a no-op when the file is absent.
    """
    config_path = Path(repo_path) / "configuration_dots.py"
    if not config_path.exists():
        return
    source = config_path.read_text(encoding="utf-8")
    updated = source
    # 1) Declare processor attributes on the class.
    # NOTE(review): the injected line uses a single-space indent as extracted
    # here — confirm it matches the indentation of the upstream file.
    if 'attributes = ["image_processor", "tokenizer"]' not in updated:
        updated = updated.replace(
            "class DotsVLProcessor(Qwen2_5_VLProcessor):\n",
            'class DotsVLProcessor(Qwen2_5_VLProcessor):\n attributes = ["image_processor", "tokenizer"]\n',
            1,
        )
    # 2) Accept video_processor in __init__ (newer transformers pass it).
    if "def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):" in updated:
        updated = updated.replace(
            "def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):",
            "def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):",
            1,
        )
    # 3) Forward video_processor to the superclass constructor.
    if "super().__init__(image_processor, tokenizer, chat_template=chat_template)" in updated:
        updated = updated.replace(
            "super().__init__(image_processor, tokenizer, chat_template=chat_template)",
            "super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)",
            1,
        )
    # Only rewrite the file when something actually changed.
    if updated != source:
        config_path.write_text(updated, encoding="utf-8")
        print(f"Patched dots.OCR processor config: {config_path}")
def resolve_dots_ocr_model_path(repo_id: str) -> str:
    """Return a loadable model path for dots.OCR, patching locally if needed.

    First tries to instantiate the processor straight from the Hub repo; on
    success the repo id itself is returned. If instantiation fails with the
    known `video_processor` TypeError (transformers API drift), the repo is
    snapshotted under /tmp and patched in place; any other TypeError is
    re-raised unchanged.
    """
    try:
        AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
        return repo_id
    except TypeError as exc:
        # Only the known incompatibility is handled here.
        if "video_processor" not in str(exc):
            raise
        print("dots.OCR processor compatibility issue detected, applying local patch...")
        local_path = snapshot_download(repo_id=repo_id, local_dir="/tmp/dots_ocr_model", local_dir_use_symlinks=False)
        patch_dots_ocr_configuration(local_path)
        return local_path
# --- Model registry -------------------------------------------------------
# All local models are loaded eagerly at import time onto `device` in eval
# mode. The Albert-backed model is remote and has no local weights.

# Nanonets OCR (Qwen2.5-VL architecture), fp16.
MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
model_v = load_model_with_attention_fallback(
    Qwen2_5_VLForConditionalGeneration,
    MODEL_ID_V,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
# LightOnOCR: loaded directly (no attention fallback); bf16 on GPU, fp32 on CPU.
MODEL_ID_Y = "lightonai/LightOnOCR-2-1B"
LIGHTON_DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
processor_y = LightOnOcrProcessor.from_pretrained(MODEL_ID_Y)
model_y = LightOnOcrForConditionalGeneration.from_pretrained(
    MODEL_ID_Y,
    torch_dtype=LIGHTON_DTYPE,
).to(device).eval()
# Qwen2-VL OCR fine-tune, fp16.
MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = load_model_with_attention_fallback(
    Qwen2VLForConditionalGeneration,
    MODEL_ID_X,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
# olmOCR (Qwen2.5-VL architecture), fp16.
MODEL_ID_W = "allenai/olmOCR-7B-0725"
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
model_w = load_model_with_attention_fallback(
    Qwen2_5_VLForConditionalGeneration,
    MODEL_ID_W,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
# Qwen3-VL general-purpose VLM; dtype chosen by the library ("auto").
MODEL_ID_M = "Qwen/Qwen3-VL-4B-Instruct"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M)
model_m = load_model_with_attention_fallback(
    Qwen3VLForConditionalGeneration,
    MODEL_ID_M,
    torch_dtype="auto"
).to(device).eval()
# Remote model served through the Albert (Etalab) OpenAI-compatible API.
MODEL_ID_A = "mistralai/Ministral-3-8B-Instruct-2512"
ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1/chat/completions"
# UI name -> (processor, model). The Albert entry stores (None, remote id)
# instead of local objects; generate_image special-cases it.
MODEL_MAP = {
    "Nanonets-OCR2-3B": (processor_v, model_v),
    "LightOnOCR-2-1B": (processor_y, model_y),
    "olmOCR-7B-0725": (processor_w, model_w),
    "Qwen3-VL-4B-Instruct": (processor_m, model_m),
    "Qwen2-VL-OCR-2B": (processor_x, model_x),
    "Ministral-3-8B-Instruct-2512": (None, MODEL_ID_A),
}
MODEL_CHOICES = list(MODEL_MAP.keys())
# Preset prompt catalogue shown in the UI. Each entry carries a display
# name, a short description, the actual prompt text sent to the model, and
# an icon for the card.
# Fix: "Etraction" -> "Extraction" in the first preset's display name.
PROMPTS = {
    "OCR_GENERAL": {
        "name": "Simple Text Extraction",
        "description": "Extract the text including inside images and logos",
        "prompt": "Extract the text including inside images and logos",
        "icon": "📝"
    },
    "OCR_MARKDOWN": {
        "name": "OCR -> Markdown",
        "description": "Perform OCR and convert to Markdown",
        "prompt": "Perform OCR including inside images and logos and convert to Markdown.",
        "icon": "🔍"
    },
    "STRUCTURED_EXTRACTION": {
        "name": "Json Metadata Extraction",
        "description": "Extract metadata",
        "prompt": """Extract the document title from this cover page.
Output ONLY valid JSON:
{
"title": ""
}""",
        "icon": "🏷️"
    },
    "STRUCTURED_LOCATED_EXTRACTION": {
        "name": "Located Json Metadata Extraction",
        "description": "Extract located metadata",
        "prompt": """Extract the document title from the middle central block of this cover page.
Output ONLY valid JSON:
{
"title": ""
}""",
        "icon": "📍"
    },
    "STRUCTURED_GROUNDED_EXTRACTION": {
        "name": "Grounded Json Metadata Extraction",
        "description": "Extract grounded metadata",
        "prompt": """Extract the document title usually located around (x=0.5015, y=0.442) from this cover page.
Output ONLY valid JSON:
{
"title": ""
}""",
        "icon": "📍"
    },
    "FULL_SCHEMA_JSON": {
        "name": "Full Json Schema",
        "description": "Extract full document metadata content as JSON with schema",
        "prompt": """Analyze this thesis/dissertation cover image and extract ONLY visible information.
CRITICAL: Only extract information that is CLEARLY VISIBLE on the page.
DO NOT invent, guess, or hallucinate any data. If a field is not visible, use null.
Return ONLY valid JSON with this exact structure:
{
"title": "Main title of the thesis or dissertation as it appears on the title page",
"subtitle": "Subtitle or remainder of the title, usually following a colon; null if not present",
"author": "Full name of the author (student) who wrote the thesis or dissertation",
"degree_type": "Academic degree sought by the author (e.g. PhD, Doctorate, Master's degree, Master's thesis)",
"discipline": "Academic field or discipline of the thesis if explicitly stated; null if not present. Possible values: Mathématiques|Physics|Biology|others",
"granting_institution": "Institution where the thesis was submitted and the degree is granted (degree-granting institution)",
"doctoral_school": "Doctoral school or graduate program, if explicitly mentioned; null if not present",
"co_tutelle_institutions": "List of institutions involved in a joint supervision or co-tutelle agreement; empty list if none",
"partner_institutions": "List of partner institutions associated with the thesis but not granting the degree; empty list if none",
"defense_year": "Year the thesis or dissertation was defended, in YYYY format; null if not visible",
"defense_place": "City or place where the defense took place, if stated; null if not present",
"thesis_advisor": "Main thesis advisor or supervisor (director of thesis); full name; null if not present",
"co_advisors": "List of co-advisors or co-supervisors if explicitly mentioned; full names; empty list if none",
"jury_president": "President or chair of the thesis examination committee, if specified; null if not present",
"reviewers": "List of reviewers or rapporteurs of the thesis, if specified; full names; empty list if none",
"committee_members": "List of other thesis committee or jury members, excluding advisor and reviewers; full names; empty list if none",
"language": "Language in which the thesis is written, if explicitly stated; null if not present",
"confidence": "Confidence score between 0.0 and 1.0 indicating reliability of the extracted metadata"
}
IMPORTANT: Return null for any field where information is NOT clearly visible.
Return ONLY the JSON, no explanation.""",
        "icon": "📄"
    },
    "NUEXTRACT_SCHEMA_JSON": {
        "name": "NuExtract Json Schema",
        "description": "Strict data extraction following deterministic JSON schema",
        "prompt": """{
"title": "verbatim-string",
"subtitle": "verbatim-string",
"author": "verbatim-string",
"degree_type": "verbatim-string",
"discipline": [["Mathématiques", "Physique", "Autres"]],
"granting_institution": ["verbatim-string"],
"doctoral_school": ["verbatim-string"],
"co_tutelle_institutions": ["verbatim-string"],
"partner_institutions": ["verbatim-string"],
"defense_year": "integer",
"thesis_advisor": ["verbatim-string"],
"co_advisors": ["verbatim-string"],
"jury_president": "verbatim-string",
"reviewers": ["verbatim-string"],
"other_jury_members": ["verbatim-string"],
"language": "verbatim-string",
"confidence": "float"
}""",
        "icon": "📄"
    }
}
# Example gallery entries: preset query, local example image, and the model
# pre-selected for that card. The final entry has an empty query because
# LightOnOCR runs image-only (see model_requires_text_prompt).
image_examples = [
    {"query": PROMPTS["OCR_GENERAL"]["prompt"], "image": "examples/dumas_01382452.png", "model": "Nanonets-OCR2-3B"},
    {"query": PROMPTS["OCR_MARKDOWN"]["prompt"], "image": "examples/dumas_01646440.png", "model": "olmOCR-7B-0725"},
    {"query": PROMPTS["STRUCTURED_EXTRACTION"]["prompt"], "image": "examples/ephesvt_theses_doc13.jpg", "model": "Qwen3-VL-4B-Instruct"},
    {"query": PROMPTS["STRUCTURED_LOCATED_EXTRACTION"]["prompt"], "image": "examples/memoires_cridaf_doc07.jpg", "model": "Qwen2-VL-OCR-2B"},
    {"query": PROMPTS["STRUCTURED_GROUNDED_EXTRACTION"]["prompt"], "image": "examples/thesefr_2015PA010690.png", "model": "LightOnOCR-2-1B"},
    {"query": "", "image": "examples/thesefr_2015PA010690.png", "model": "LightOnOCR-2-1B"},
]
def select_model(model_name: str):
    """Look up the (processor, model) pair registered under *model_name*.

    Raises ValueError when the name is not a known MODEL_MAP key.
    """
    if model_name in MODEL_MAP:
        return MODEL_MAP[model_name]
    raise ValueError("Invalid model selected.")
def pil_to_data_url(img, fmt="PNG"):
    """Serialize a PIL image to a base64 ``data:`` URL.

    Args:
        img: Image object (anything exposing a PIL-style ``save(buf, format=)``).
        fmt: Pillow format name, e.g. "PNG", "JPEG", "WEBP".

    Returns:
        A ``data:image/<type>;base64,<payload>`` string.
    """
    buf = BytesIO()
    img.save(buf, format=fmt)
    data = base64.b64encode(buf.getvalue()).decode()
    # Generalized MIME guess: the original hard-coded PNG-or-JPEG, which
    # mislabelled any other format as image/jpeg. JPG/JPEG both map to
    # image/jpeg; every other format maps to image/<lowercased format>.
    fmt_upper = fmt.upper()
    mime = "image/jpeg" if fmt_upper in ("JPG", "JPEG") else f"image/{fmt_upper.lower()}"
    return f"data:{mime};base64,{data}"
def file_to_data_url(path):
    """Read a local image file and return it as a base64 ``data:`` URL.

    The MIME type is guessed from the file extension (jpg/jpeg/png/webp),
    defaulting to image/jpeg. Returns "" when the file does not exist.
    """
    if not os.path.exists(path):
        return ""
    extension = path.rsplit(".", 1)[-1].lower()
    known_mimes = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "webp": "image/webp",
    }
    mime = known_mimes.get(extension, "image/jpeg")
    with open(path, "rb") as fh:
        encoded = base64.b64encode(fh.read()).decode()
    return f"data:{mime};base64,{encoded}"
def image_url_to_data_url(url):
    """Fetch an image over HTTP(S) and return it as a base64 ``data:`` URL.

    Best-effort: any network or decoding error is logged to stdout and an
    empty string is returned instead of raising.
    """
    try:
        with urlopen(url, timeout=15) as response:
            # Fall back to JPEG when the server sends no usable Content-Type.
            content_type = response.headers.get_content_type() or "image/jpeg"
            data = base64.b64encode(response.read()).decode()
            return f"data:{content_type};base64,{data}"
    except Exception as e:
        print("Image URL load error:", e)
        return ""
def is_image_url(value):
    """Return True when *value* parses as an http(s) URL with a network host."""
    try:
        parts = urlparse(str(value))
    except Exception:
        return False
    if parts.scheme not in ("http", "https"):
        return False
    return bool(parts.netloc)
def make_thumb_b64(path, max_dim=240):
    """Build a small JPEG thumbnail (as a data URL) for an example card.

    *path* may be a local file path or an http(s) URL. The image is bounded
    to max_dim on its longest side. Returns "" on any failure (logged).
    """
    try:
        if is_image_url(path):
            with urlopen(path, timeout=15) as response:
                img = Image.open(BytesIO(response.read())).convert("RGB")
        else:
            img = Image.open(path).convert("RGB")
        # thumbnail() resizes in place and preserves aspect ratio.
        img.thumbnail((max_dim, max_dim))
        return pil_to_data_url(img, "JPEG")
    except Exception as e:
        print("Thumbnail error:", e)
        return ""
def build_example_cards_html():
    """Render the example gallery as a string of clickable HTML cards.

    One card per image_examples entry; the data-idx attribute carries the
    example index that the frontend passes back through load_example_data.
    """
    cards = ""
    for i, ex in enumerate(image_examples):
        thumb = make_thumb_b64(ex["image"])
        # Truncated prompt preview shown under the thumbnail.
        prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
        # NOTE(review): prompt/model text is interpolated into HTML without
        # escaping — acceptable for the trusted constants above, but would
        # need html.escape() if examples ever became user-supplied.
        cards += f"""
<div class="example-card" data-idx="{i}">
<div class="example-thumb-wrap">
{"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
</div>
<div class="example-meta-row">
<span class="example-badge">{ex["model"]}</span>
</div>
<div class="example-prompt-text">{prompt_short}</div>
</div>
"""
    return cards
# Rendered once at import time; the example list does not change at runtime.
EXAMPLE_CARDS_HTML = build_example_cards_html()
def load_example_data(idx_str):
    """Resolve an example-card index into a JSON payload for the frontend.

    Returns a gr.update whose value is a JSON string: either
    {"status": "ok", query, image, model, name} on success or
    {"status": "error", "message": ...} on any failure.
    """
    def _error(message):
        # All error paths share the same JSON envelope.
        return gr.update(value=json.dumps({"status": "error", "message": message}))

    try:
        idx = int(str(idx_str).strip())
    except Exception:
        return _error("Invalid example index")
    if not (0 <= idx < len(image_examples)):
        return _error("Example index out of range")
    example = image_examples[idx]
    encoded = file_to_data_url(example["image"])
    if not encoded:
        return _error("Could not load example image")
    payload = {
        "status": "ok",
        "query": example["query"],
        "image": encoded,
        "model": example["model"],
        "name": os.path.basename(example["image"]),
    }
    return gr.update(value=json.dumps(payload))
def b64_to_pil(b64_str):
    """Convert a data URL, raw base64 string, or http(s) URL into an RGB PIL image.

    Returns None for empty input or on any fetch/decode error (silent).
    """
    if not b64_str:
        return None
    try:
        if is_image_url(b64_str):
            with urlopen(b64_str, timeout=15) as response:
                return Image.open(BytesIO(response.read())).convert("RGB")
        if b64_str.startswith("data:"):
            # Strip the "data:<mime>;base64," prefix, keep the payload.
            _, data = b64_str.split(",", 1)
        else:
            data = b64_str
        image_data = base64.b64decode(data)
        return Image.open(BytesIO(image_data)).convert("RGB")
    except Exception:
        return None
def calc_timeout_duration(*args, **kwargs):
    """Best-effort extraction of a GPU timeout (in seconds) from call args.

    Prefers an explicit ``gpu_timeout`` keyword; otherwise falls back to the
    last positional argument. Returns 60 when nothing usable is found.
    """
    candidate = kwargs.get("gpu_timeout")
    if candidate is None and args:
        candidate = args[-1]
    try:
        return int(candidate)
    except Exception:
        return 60
def align_inputs_to_model_dtype(inputs, model):
    """Cast floating-point tensors in *inputs* to the model's dtype, in place.

    Integer tensors and non-tensor values are left untouched. When the model
    exposes no ``dtype`` attribute and has no parameters, *inputs* is
    returned unchanged.
    """
    target_dtype = getattr(model, "dtype", None)
    if target_dtype is None:
        try:
            target_dtype = next(model.parameters()).dtype
        except StopIteration:
            target_dtype = None
    if target_dtype is None:
        return inputs
    for name in list(inputs):
        tensor = inputs[name]
        if torch.is_tensor(tensor) and tensor.is_floating_point():
            inputs[name] = tensor.to(dtype=target_dtype)
    return inputs
def model_requires_text_prompt(model_name: str) -> bool:
    """LightOnOCR runs image-only; every other model expects a text prompt."""
    if model_name == "LightOnOCR-2-1B":
        return False
    return True
def call_albert_api(model_id, text, image, max_new_tokens, temperature, top_p):
    """Send a single text+image chat completion request to the Albert API.

    Args:
        model_id: Remote model identifier (e.g. MODEL_ID_A).
        text: User prompt text.
        image: PIL image, sent inline as a PNG data URL.
        max_new_tokens, temperature, top_p: sampling parameters.

    Returns:
        The assistant reply as plain text (typed content parts are joined
        with newlines).

    Raises:
        RuntimeError: when ALBERT_API_KEY is unset or the API returns no choices.
        requests.HTTPError: on non-2xx HTTP responses.
    """
    api_key = os.getenv("ALBERT_API_KEY")
    if not api_key:
        raise RuntimeError("ALBERT_API_KEY environment variable is not set.")
    image_data_url = pil_to_data_url(image, "PNG")
    # OpenAI-compatible chat payload: one user turn with text + inline image.
    payload = {
        "model": model_id,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text},
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            }
        ],
        "max_tokens": int(max_new_tokens),
        "temperature": float(temperature),
        "top_p": float(top_p),
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    response = requests.post(ALBERT_API_URL, headers=headers, json=payload, timeout=180)
    response.raise_for_status()
    data = response.json()
    choices = data.get("choices") or []
    if not choices:
        raise RuntimeError(f"Albert API returned no choices: {json.dumps(data)}")
    message = choices[0].get("message") or {}
    content = message.get("content", "")
    # Content may be a plain string or a list of typed parts (OpenAI-style).
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        text_parts = [
            item.get("text", "")
            for item in content
            if isinstance(item, dict) and item.get("type") == "text"
        ]
        return "\n".join(part for part in text_parts if part).strip()
    return str(content).strip()
def generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
    """Run OCR/VQA on *image* with the selected model, yielding text output.

    Generator. Local Qwen-family models stream partial output (each yield is
    the accumulated buffer so far); LightOnOCR and the remote Albert model
    yield one final string. All failures are reported as yielded
    "[ERROR] ..." strings rather than raised, so the Gradio UI always
    receives something displayable.
    """
    try:
        # --- Input validation -------------------------------------------
        if not model_name or model_name not in MODEL_MAP:
            yield "[ERROR] Please select a valid model."
            return
        if image is None:
            yield "[ERROR] Please upload an image."
            return
        text = str(text or "").strip()
        # LightOnOCR ignores text prompts entirely.
        if not model_requires_text_prompt(model_name):
            text = ""
        if model_requires_text_prompt(model_name) and not text:
            yield "[ERROR] Please enter your OCR/query instruction."
            return
        # Coarse length guard (characters, ~8 chars/token heuristic).
        if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
            yield "[ERROR] Query is too long. Please shorten your input."
            return
        # --- Remote model: single blocking API call ----------------------
        if model_name == "Ministral-3-8B-Instruct-2512":
            output_text = call_albert_api(
                model_id=MODEL_ID_A,
                text=text,
                image=image,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            if output_text.strip():
                yield output_text
            else:
                yield "[ERROR] No output was generated."
            return
        processor, model = select_model(model_name)
        # --- LightOnOCR: image-only, non-streaming generation ------------
        if model_name == "LightOnOCR-2-1B":
            conversation = [{"role": "user", "content": [{"type": "image", "image": image}]}]
            inputs = processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            )
            # Float tensors go to the LightOn dtype; everything moves to device.
            inputs = {
                k: v.to(device=device, dtype=LIGHTON_DTYPE) if torch.is_tensor(v) and v.is_floating_point() else v.to(device)
                for k, v in inputs.items()
            }
            output_ids = model.generate(**inputs, max_new_tokens=int(max_new_tokens))
            # Drop the prompt tokens; decode only the newly generated tail.
            generated_ids = output_ids[0, inputs["input_ids"].shape[1]:]
            output_text = processor.decode(generated_ids, skip_special_tokens=True)
            if output_text.strip():
                yield output_text
            else:
                yield "[ERROR] No output was generated."
            return
        # --- Streaming path for the remaining local models ---------------
        streamer = TextIteratorStreamer(
            processor.tokenizer if hasattr(processor, "tokenizer") else processor,
            skip_prompt=True,
            skip_special_tokens=True
        )
        # Mutable cell so the worker thread can report its failure.
        generation_error = {"error": None}
        if model_name == "Qwen3-VL-4B-Instruct":
            # Qwen3-VL: tokenize straight from the chat template.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": text},
                    ],
                }
            ]
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            # token_type_ids is not accepted by this model's generate().
            inputs.pop("token_type_ids", None)
            inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}
            generation_kwargs = {
                **inputs,
                "streamer": streamer,
                "max_new_tokens": int(max_new_tokens),
                "do_sample": True,
                "temperature": float(temperature),
                "top_p": float(top_p),
                "top_k": int(top_k),
                "repetition_penalty": float(repetition_penalty),
            }
        else:
            # Qwen2/2.5-VL style: render the template to text, then run the
            # processor over text+image together.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": text},
                ]
            }]
            prompt_full = processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            inputs = processor(
                text=[prompt_full],
                images=[image],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_INPUT_TOKEN_LENGTH
            ).to(device)
            # Keep float inputs (e.g. pixel values) in the model's dtype.
            inputs = align_inputs_to_model_dtype(inputs, model)
            generation_kwargs = {
                **inputs,
                "streamer": streamer,
                "max_new_tokens": int(max_new_tokens),
                "do_sample": True,
                "temperature": float(temperature),
                "top_p": float(top_p),
                "top_k": int(top_k),
                "repetition_penalty": float(repetition_penalty),
            }
        def _run_generation():
            # Worker: record any failure and unblock the streamer consumer.
            try:
                model.generate(**generation_kwargs)
            except Exception as e:
                generation_error["error"] = e
                try:
                    streamer.end()
                except Exception:
                    pass
        thread = Thread(target=_run_generation, daemon=True)
        thread.start()
        buffer = ""
        for new_text in streamer:
            # Strip stray end-of-turn markers; re-yield the whole buffer so
            # the UI textbox always shows the full accumulated output.
            buffer += new_text.replace("<|im_end|>", "")
            time.sleep(0.01)
            yield buffer
        thread.join(timeout=1.0)
        if generation_error["error"] is not None:
            err_msg = f"[ERROR] Inference failed: {str(generation_error['error'])}"
            # Keep whatever partial text was produced before the failure.
            if buffer.strip():
                yield buffer + "\n\n" + err_msg
            else:
                yield err_msg
            return
        if not buffer.strip():
            yield "[ERROR] No output was generated."
    except Exception as e:
        yield f"[ERROR] {str(e)}"
    finally:
        # Best-effort memory cleanup after every request.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
def run_ocr(model_name, text, image_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
    """Decode the incoming base64/URL image and stream OCR output.

    Thin generator wrapper around generate_image: converts the frontend's
    base64 payload to a PIL image and forwards all sampling parameters.
    Any unexpected failure is yielded as an "[ERROR] ..." string.
    """
    try:
        pil_image = b64_to_pil(image_b64)
        yield from generate_image(
            model_name=model_name,
            text=text,
            image=pil_image,
            max_new_tokens=max_new_tokens_v,
            temperature=temperature_v,
            top_p=top_p_v,
            top_k=top_k_v,
            repetition_penalty=repetition_penalty_v,
            gpu_timeout=gpu_timeout_v,
        )
    except Exception as exc:
        yield f"[ERROR] {str(exc)}"
def noop():
    """Placeholder Gradio callback that intentionally does nothing."""
    return None
| css = r""" | |
| @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap'); | |
| *{box-sizing:border-box;margin:0;padding:0} | |
| html,body{height:100%;overflow-x:hidden} | |
| body,.gradio-container{ | |
| background:#0f0f13!important; | |
| font-family:'Inter',system-ui,-apple-system,sans-serif!important; | |
| font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden; | |
| } | |
| .dark body,.dark .gradio-container{background:#0f0f13!important;color:#e4e4e7!important} | |
| footer{display:none!important} | |
| .hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important} | |
| #gradio-run-btn,#example-load-btn{ | |
| position:absolute!important;left:-9999px!important;top:-9999px!important; | |
| width:1px!important;height:1px!important;opacity:0.01!important; | |
| pointer-events:none!important;overflow:hidden!important; | |
| } | |
| .app-shell{ | |
| background:#18181b;border:1px solid #27272a;border-radius:16px; | |
| margin:12px auto;max-width:1580px;overflow:hidden; | |
| box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03); | |
| } | |
| .app-header{ | |
| background:linear-gradient(135deg,#18181b,#1e1e24);border-bottom:1px solid #27272a; | |
| padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px; | |
| } | |
| .app-header-left{display:flex;align-items:center;gap:12px} | |
| .app-logo{ | |
| width:38px;height:38px;background:linear-gradient(135deg,#ADFF2F,#C6FF66,#D8FF8A); | |
| border-radius:10px;display:flex;align-items:center;justify-content:center; | |
| box-shadow:0 4px 12px rgba(173,255,47,.28); | |
| } | |
| .app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0} | |
| .app-title{ | |
| font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#bdbdbd); | |
| -webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px; | |
| } | |
| .app-badge{ | |
| font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px; | |
| background:rgba(173,255,47,.12);color:#D6FF8C;border:1px solid rgba(173,255,47,.28);letter-spacing:.3px; | |
| } | |
| .app-badge.fast{background:rgba(173,255,47,.08);color:#C6FF66;border:1px solid rgba(173,255,47,.22)} | |
| .model-tabs-bar{ | |
| background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px; | |
| display:flex;gap:8px;align-items:center;flex-wrap:wrap; | |
| } | |
| .model-tab{ | |
| display:inline-flex;align-items:center;justify-content:center;gap:6px; | |
| min-width:32px;height:34px;background:transparent;border:1px solid #27272a; | |
| border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px; | |
| color:#ffffff!important;transition:all .15s ease; | |
| } | |
| .model-tab:hover{background:rgba(173,255,47,.10);border-color:rgba(173,255,47,.35)} | |
| .model-tab.active{background:rgba(173,255,47,.18);border-color:#ADFF2F;color:#fff!important;box-shadow:0 0 0 2px rgba(173,255,47,.08)} | |
| .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600} | |
| .app-main-row{display:flex;gap:0;flex:1;overflow:hidden} | |
| .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a} | |
| .app-main-right{ | |
| width:min(48vw,700px);min-width:620px;display:flex;flex-direction:column; | |
| flex-shrink:0;background:#18181b; | |
| } | |
| #image-drop-zone{ | |
| position:relative;background:#09090b;height:440px;min-height:440px;max-height:440px; | |
| overflow:hidden; | |
| } | |
| #image-drop-zone.drag-over{outline:2px solid #ADFF2F;outline-offset:-2px;background:rgba(173,255,47,.04)} | |
| .upload-prompt-modern{ | |
| position:absolute;inset:0;display:flex;align-items:center;justify-content:center; | |
| padding:20px;z-index:20;overflow:hidden; | |
| } | |
| .upload-click-area{ | |
| display:flex;flex-direction:column;align-items:center;justify-content:center; | |
| cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%; | |
| border:2px dashed #3f3f46;border-radius:16px; | |
| background:rgba(173,255,47,.03);transition:all .2s ease;gap:8px;text-align:center; | |
| overflow:hidden; | |
| } | |
| .upload-prompt-stack{ | |
| display:flex;flex-direction:column;align-items:center;justify-content:center; | |
| gap:14px;width:min(560px,100%);max-width:100%; | |
| } | |
| .upload-click-area:hover{background:rgba(173,255,47,.08);border-color:#ADFF2F;transform:scale(1.02)} | |
| .upload-click-area:active{background:rgba(173,255,47,.12);transform:scale(.99)} | |
| .upload-click-area svg{width:86px;height:86px;max-width:100%;flex-shrink:0} | |
| .upload-main-text{color:#a1a1aa;font-size:14px;font-weight:600;margin-top:4px} | |
| .upload-sub-text{color:#71717a;font-size:12px} | |
| .upload-url-box{ | |
| display:flex;gap:8px;width:min(520px,100%);max-width:100%; | |
| align-items:center;justify-content:center;flex-wrap:wrap; | |
| } | |
| .upload-url-input{ | |
| flex:1;min-width:240px;background:#09090b;border:1px solid #27272a;border-radius:10px; | |
| color:#e4e4e7;padding:10px 12px;font-size:13px;outline:none; | |
| } | |
| .upload-url-input:focus{border-color:#ADFF2F;box-shadow:0 0 0 3px rgba(173,255,47,.14)} | |
| .upload-url-input::placeholder{color:#52525b} | |
| .upload-url-btn{ | |
| min-width:96px;height:40px;padding:0 14px;background:rgba(173,255,47,.12);border:1px solid rgba(173,255,47,.28); | |
| border-radius:10px;cursor:pointer;color:#D6FF8C;font-size:12px;font-weight:700;transition:all .15s ease; | |
| } | |
| .upload-url-btn:hover{background:#ADFF2F;border-color:#ADFF2F;color:#111} | |
| .single-preview-wrap{ | |
| width:100%;height:100%;display:none;align-items:center;justify-content:center;padding:16px; | |
| overflow:hidden; | |
| } | |
| .single-preview-card{ | |
| width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px; | |
| overflow:hidden;border:1px solid #27272a;background:#111114; | |
| display:flex;align-items:center;justify-content:center;position:relative; | |
| } | |
| .single-preview-card img{ | |
| width:100%;height:100%;max-width:100%;max-height:100%; | |
| object-fit:contain;display:block; | |
| } | |
| .preview-overlay-actions{ | |
| position:absolute;top:12px;right:12px;display:flex;gap:8px;z-index:5; | |
| } | |
| .preview-action-btn{ | |
| display:inline-flex;align-items:center;justify-content:center; | |
| min-width:34px;height:34px;padding:0 12px;background:rgba(0,0,0,.65); | |
| border:1px solid rgba(255,255,255,.14);border-radius:10px;cursor:pointer; | |
| color:#fff!important;font-size:12px;font-weight:600;transition:all .15s ease; | |
| } | |
| .preview-action-btn:hover{background:#ADFF2F;border-color:#ADFF2F;color:#111!important} | |
| .hint-bar{ | |
| background:rgba(173,255,47,.05);border-top:1px solid #27272a;border-bottom:1px solid #27272a; | |
| padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7; | |
| } | |
| .hint-bar b{color:#D6FF8C;font-weight:600} | |
| .hint-bar kbd{ | |
| display:inline-block;padding:1px 6px;background:#27272a;border:1px solid #3f3f46; | |
| border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa; | |
| } | |
| .examples-section{border-top:1px solid #27272a;padding:12px 16px} | |
| .examples-title{ | |
| font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase; | |
| letter-spacing:.8px;margin-bottom:10px; | |
| } | |
| .examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px} | |
| .examples-scroll::-webkit-scrollbar{height:6px} | |
| .examples-scroll::-webkit-scrollbar-track{background:#09090b;border-radius:3px} | |
| .examples-scroll::-webkit-scrollbar-thumb{background:#27272a;border-radius:3px} | |
| .examples-scroll::-webkit-scrollbar-thumb:hover{background:#3f3f46} | |
| .example-card{ | |
| flex-shrink:0;width:220px;background:#09090b;border:1px solid #27272a; | |
| border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease; | |
| } | |
| .example-card:hover{border-color:#ADFF2F;transform:translateY(-2px);box-shadow:0 4px 12px rgba(173,255,47,.12)} | |
| .example-card.loading{opacity:.5;pointer-events:none} | |
| .example-thumb-wrap{height:120px;overflow:hidden;background:#18181b} | |
| .example-thumb-wrap img{width:100%;height:100%;object-fit:cover} | |
| .example-thumb-placeholder{ | |
| width:100%;height:100%;display:flex;align-items:center;justify-content:center; | |
| background:#18181b;color:#3f3f46;font-size:11px; | |
| } | |
| .example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px} | |
| .example-badge{ | |
| display:inline-flex;padding:2px 7px;background:rgba(173,255,47,.12);border-radius:4px; | |
| font-size:10px;font-weight:600;color:#D6FF8C;font-family:'JetBrains Mono',monospace;white-space:nowrap; | |
| } | |
| .example-prompt-text{ | |
| padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4; | |
| display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden; | |
| } | |
| .panel-card{border-bottom:1px solid #27272a} | |
| .panel-card-title{ | |
| padding:12px 20px;font-size:12px;font-weight:600;color:#71717a; | |
| text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6); | |
| } | |
| .panel-card-body{padding:18px 22px 20px;display:flex;flex-direction:column;gap:10px} | |
| .modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block} | |
| .modern-textarea{ | |
| width:100%;background:#09090b;border:1px solid #27272a;border-radius:8px; | |
| padding:14px 16px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7; | |
| resize:vertical;outline:none;min-height:220px;transition:border-color .2s; | |
| } | |
| .is-hidden{display:none!important} | |
| .modern-textarea:focus{border-color:#ADFF2F;box-shadow:0 0 0 3px rgba(173,255,47,.14)} | |
| .modern-textarea::placeholder{color:#3f3f46} | |
| .modern-textarea.error-flash{ | |
| border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease; | |
| } | |
| @keyframes shake{0%,100%{transform:translateX(0)}20%,60%{transform:translateX(-4px)}40%,80%{transform:translateX(4px)}} | |
| .toast-notification{ | |
| position:fixed;top:24px;left:50%;transform:translateX(-50%) translateY(-120%); | |
| z-index:9999;padding:10px 24px;border-radius:10px;font-family:'Inter',sans-serif; | |
| font-size:14px;font-weight:600;display:flex;align-items:center;gap:8px; | |
| box-shadow:0 8px 24px rgba(0,0,0,.5); | |
| transition:transform .35s cubic-bezier(.34,1.56,.64,1),opacity .35s ease;opacity:0;pointer-events:none; | |
| } | |
| .toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto} | |
| .toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)} | |
| .toast-notification.warning{background:linear-gradient(135deg,#84cc16,#65a30d);color:#111;border:1px solid rgba(255,255,255,.08)} | |
| .toast-notification.info{background:linear-gradient(135deg,#a3e635,#84cc16);color:#111;border:1px solid rgba(255,255,255,.08)} | |
| .toast-notification .toast-icon{font-size:16px;line-height:1} | |
| .toast-notification .toast-text{line-height:1.3} | |
| .btn-run{ | |
| display:flex;align-items:center;justify-content:center;gap:8px;width:100%; | |
| background:linear-gradient(135deg,#ADFF2F,#8FD61F);border:none;border-radius:10px; | |
| padding:12px 24px;cursor:pointer;font-size:15px;font-weight:700;font-family:'Inter',sans-serif; | |
| color:#ffffff!important;-webkit-text-fill-color:#ffffff!important; | |
| transition:all .2s ease;letter-spacing:-.2px; | |
| box-shadow:0 4px 16px rgba(173,255,47,.25),inset 0 1px 0 rgba(255,255,255,.25); | |
| } | |
| .btn-run:hover{ | |
| background:linear-gradient(135deg,#C6FF66,#ADFF2F);transform:translateY(-1px); | |
| box-shadow:0 6px 24px rgba(173,255,47,.35),inset 0 1px 0 rgba(255,255,255,.25); | |
| } | |
| .btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(173,255,47,.25)} | |
| #custom-run-btn,#custom-run-btn *,#run-btn-label,.btn-run,.btn-run *{ | |
| color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important; | |
| } | |
| body:not(.dark) .btn-run,body:not(.dark) .btn-run *, | |
| .dark .btn-run,.dark .btn-run *, | |
| .gradio-container .btn-run,.gradio-container .btn-run *, | |
| .gradio-container #custom-run-btn,.gradio-container #custom-run-btn *{ | |
| color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important; | |
| } | |
| .output-frame{border-bottom:1px solid #27272a;display:flex;flex-direction:column;position:relative} | |
| .output-frame .out-title, | |
| .output-frame .out-title *, | |
| #output-title-label{ | |
| color:#ffffff!important; | |
| -webkit-text-fill-color:#ffffff!important; | |
| } | |
| .output-frame .out-title{ | |
| padding:10px 20px;font-size:13px;font-weight:700; | |
| text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6); | |
| display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap; | |
| } | |
| .out-title-right{display:flex;gap:8px;align-items:center} | |
| .out-action-btn{ | |
| display:inline-flex;align-items:center;justify-content:center;background:rgba(173,255,47,.10); | |
| border:1px solid rgba(173,255,47,.2);border-radius:6px;cursor:pointer;padding:3px 10px; | |
| font-size:11px;font-weight:500;color:#D6FF8C!important;gap:4px;height:24px;transition:all .15s; | |
| } | |
| .out-action-btn:hover{background:rgba(173,255,47,.2);border-color:rgba(173,255,47,.35);color:#111!important} | |
| .out-action-btn svg{width:12px;height:12px;fill:#D6FF8C} | |
| .output-frame .out-body{ | |
| flex:1;background:#09090b;display:flex;align-items:stretch;justify-content:stretch; | |
| overflow:hidden;min-height:520px;position:relative; | |
| } | |
| .output-scroll-wrap{ | |
| width:100%;height:100%;padding:0;overflow:hidden; | |
| } | |
| .output-textarea{ | |
| width:100%;height:520px;min-height:520px;max-height:none;background:#09090b;color:#e4e4e7; | |
| border:none;outline:none;padding:18px 20px;font-size:13px;line-height:1.65; | |
| font-family:'JetBrains Mono',monospace;overflow:auto;resize:vertical;white-space:pre-wrap; | |
| } | |
| .output-textarea::placeholder{color:#52525b} | |
| .output-textarea.error-flash{ | |
| box-shadow:inset 0 0 0 2px rgba(239,68,68,.6); | |
| } | |
| .modern-loader{ | |
| display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(9,9,11,.92); | |
| z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px); | |
| } | |
| .modern-loader.active{display:flex} | |
| .modern-loader .loader-spinner{ | |
| width:36px;height:36px;border:3px solid #27272a;border-top-color:#ADFF2F; | |
| border-radius:50%;animation:spin .8s linear infinite; | |
| } | |
| @keyframes spin{to{transform:rotate(360deg)}} | |
| .modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500} | |
| .loader-bar-track{width:200px;height:4px;background:#27272a;border-radius:2px;overflow:hidden} | |
| .loader-bar-fill{ | |
| height:100%;background:linear-gradient(90deg,#ADFF2F,#C6FF66,#ADFF2F); | |
| background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px; | |
| } | |
| @keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}} | |
| .settings-group{border:1px solid #27272a;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden} | |
| .settings-group-title{ | |
| font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px; | |
| padding:10px 16px;border-bottom:1px solid #27272a;background:rgba(24,24,27,.5); | |
| } | |
| .settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px} | |
| .slider-row{display:flex;align-items:center;gap:10px;min-height:28px} | |
| .slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0} | |
| .slider-row input[type="range"]{ | |
| flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#27272a; | |
| border-radius:3px;outline:none;min-width:0; | |
| } | |
| .slider-row input[type="range"]::-webkit-slider-thumb{ | |
| -webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#ADFF2F,#8FD61F); | |
| border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(173,255,47,.35);transition:transform .15s; | |
| } | |
| .slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)} | |
| .slider-row input[type="range"]::-moz-range-thumb{ | |
| width:16px;height:16px;background:linear-gradient(135deg,#ADFF2F,#8FD61F); | |
| border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(173,255,47,.35); | |
| } | |
| .slider-row .slider-val{ | |
| min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px; | |
| font-weight:500;padding:3px 8px;background:#09090b;border:1px solid #27272a; | |
| border-radius:6px;color:#a1a1aa;flex-shrink:0; | |
| } | |
| .app-statusbar{ | |
| background:#18181b;border-top:1px solid #27272a;padding:6px 20px; | |
| display:flex;gap:12px;height:34px;align-items:center;font-size:12px; | |
| } | |
| .app-statusbar .sb-section{ | |
| padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace; | |
| font-size:12px;color:#52525b;overflow:hidden;white-space:nowrap; | |
| } | |
| .app-statusbar .sb-section.sb-fixed{ | |
| flex:0 0 auto;min-width:110px;text-align:center;justify-content:center; | |
| padding:3px 12px;background:rgba(173,255,47,.08);border-radius:6px;color:#D6FF8C;font-weight:500; | |
| } | |
| .exp-note{padding:10px 20px;font-size:12px;color:#52525b;border-top:1px solid #27272a;text-align:center} | |
| .exp-note a{color:#D6FF8C;text-decoration:none} | |
| .exp-note a:hover{text-decoration:underline} | |
| ::-webkit-scrollbar{width:8px;height:8px} | |
| ::-webkit-scrollbar-track{background:#09090b} | |
| ::-webkit-scrollbar-thumb{background:#27272a;border-radius:4px} | |
| ::-webkit-scrollbar-thumb:hover{background:#3f3f46} | |
| @media(max-width:980px){ | |
| .app-main-row{flex-direction:column} | |
| .app-main-right{width:100%;min-width:0} | |
| .app-main-left{border-right:none;border-bottom:1px solid #27272a} | |
| .modern-textarea{min-height:180px} | |
| .output-frame .out-body{min-height:420px} | |
| .output-textarea{height:420px;min-height:420px} | |
| } | |
| """ | |
# JavaScript injected into the Gradio page (via Blocks' js hook). It wires the
# custom HTML UI — drop zone, image-URL loader, model/prompt tabs, sliders,
# run button, toasts/loader, example cards — to hidden Gradio components by
# element id. init() re-schedules itself with setTimeout until the DOM nodes
# exist, and a window flag guards against attaching handlers twice.
# NOTE(review): the JS below is runtime payload; only Python-side comments are
# added here, the script text itself is unchanged.
gallery_js = r"""
() => {
  function init() {
    if (window.__ocr2GreenInitDone) return;
    const dropZone = document.getElementById('image-drop-zone');
    const uploadPrompt = document.getElementById('upload-prompt');
    const uploadClick = document.getElementById('upload-click-area');
    const fileInput = document.getElementById('custom-file-input');
    const previewWrap = document.getElementById('single-preview-wrap');
    const previewImg = document.getElementById('single-preview-img');
    const btnUpload = document.getElementById('preview-upload-btn');
    const btnClear = document.getElementById('preview-clear-btn');
    const urlInput = document.getElementById('image-url-input');
    const urlBtn = document.getElementById('image-url-btn');
    const promptInput = document.getElementById('custom-query-input');
    const promptPanel = document.getElementById('prompt-panel');
    const promptTabsBar = document.getElementById('prompt-tabs-bar');
    const runBtnEl = document.getElementById('custom-run-btn');
    const outputArea = document.getElementById('custom-output-textarea');
    const imgStatus = document.getElementById('sb-image-status');
    if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !urlInput || !urlBtn) {
      setTimeout(init, 250);
      return;
    }
    window.__ocr2GreenInitDone = true;
    let imageState = null;
    let toastTimer = null;
    let examplePoller = null;
    let lastSeenExamplePayload = null;
    const promptTemplatesNode = document.getElementById('prompt-templates-data');
    let promptTemplates = {};
    try {
      promptTemplates = promptTemplatesNode ? JSON.parse(promptTemplatesNode.textContent || '{}') : {};
    } catch (e) {
      promptTemplates = {};
    }
    function showToast(message, type) {
      let toast = document.getElementById('app-toast');
      if (!toast) {
        toast = document.createElement('div');
        toast.id = 'app-toast';
        toast.className = 'toast-notification';
        toast.innerHTML = '<span class="toast-icon"></span><span class="toast-text"></span>';
        document.body.appendChild(toast);
      }
      const icon = toast.querySelector('.toast-icon');
      const text = toast.querySelector('.toast-text');
      toast.className = 'toast-notification ' + (type || 'error');
      if (type === 'warning') icon.textContent = '\u26A0';
      else if (type === 'info') icon.textContent = '\u2139';
      else icon.textContent = '\u2717';
      text.textContent = message;
      if (toastTimer) clearTimeout(toastTimer);
      void toast.offsetWidth;
      toast.classList.add('visible');
      toastTimer = setTimeout(() => toast.classList.remove('visible'), 3500);
    }
    function showLoader() {
      const l = document.getElementById('output-loader');
      if (l) l.classList.add('active');
      const sb = document.getElementById('sb-run-state');
      if (sb) sb.textContent = 'Processing...';
    }
    function hideLoader() {
      const l = document.getElementById('output-loader');
      if (l) l.classList.remove('active');
      const sb = document.getElementById('sb-run-state');
      if (sb) sb.textContent = 'Done';
    }
    function setRunErrorState() {
      const l = document.getElementById('output-loader');
      if (l) l.classList.remove('active');
      const sb = document.getElementById('sb-run-state');
      if (sb) sb.textContent = 'Error';
    }
    window.__showToast = showToast;
    window.__showLoader = showLoader;
    window.__hideLoader = hideLoader;
    window.__setRunErrorState = setRunErrorState;
    function flashPromptError() {
      promptInput.classList.add('error-flash');
      promptInput.focus();
      setTimeout(() => promptInput.classList.remove('error-flash'), 800);
    }
    function flashOutputError() {
      if (!outputArea) return;
      outputArea.classList.add('error-flash');
      setTimeout(() => outputArea.classList.remove('error-flash'), 800);
    }
    function getValueFromContainer(containerId) {
      const container = document.getElementById(containerId);
      if (!container) return '';
      const el = container.querySelector('textarea, input');
      return el ? (el.value || '') : '';
    }
    function setGradioValue(containerId, value) {
      const container = document.getElementById(containerId);
      if (!container) return false;
      const el = container.querySelector('textarea, input');
      if (!el) return false;
      const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
      const ns = Object.getOwnPropertyDescriptor(proto, 'value');
      if (ns && ns.set) {
        ns.set.call(el, value);
        el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
        el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
        return true;
      }
      return false;
    }
    function syncImageToGradio() {
      setGradioValue('hidden-image-b64', imageState ? imageState.value : '');
      const txt = imageState ? ('Image ready: ' + (imageState.name || 'image')) : 'No image uploaded';
      if (imgStatus) imgStatus.textContent = txt;
    }
    function syncPromptToGradio() {
      const activeModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
      setGradioValue('prompt-gradio-input', activeModel === 'LightOnOCR-2-1B' ? '' : promptInput.value);
    }
    function syncModelToGradio(name) {
      setGradioValue('hidden-model-name', name);
    }
    function setPreview(src, name, value) {
      imageState = {src, name: name || 'image', value: value || src};
      previewImg.src = src;
      previewWrap.style.display = 'flex';
      if (uploadPrompt) uploadPrompt.style.display = 'none';
      if (urlInput) urlInput.value = '';
      syncImageToGradio();
    }
    window.__setPreview = setPreview;
    function clearPreview() {
      imageState = null;
      previewImg.src = '';
      previewWrap.style.display = 'none';
      if (uploadPrompt) uploadPrompt.style.display = 'flex';
      syncImageToGradio();
    }
    window.__clearPreview = clearPreview;
    function processFile(file) {
      if (!file) return;
      if (!file.type.startsWith('image/')) {
        showToast('Only image files are supported', 'error');
        return;
      }
      const reader = new FileReader();
      reader.onload = (e) => setPreview(e.target.result, file.name, e.target.result);
      reader.readAsDataURL(file);
    }
    function applyImageUrl() {
      const raw = urlInput.value.trim();
      if (!raw) {
        showToast('Enter an image URL', 'warning');
        return;
      }
      let parsed;
      try {
        parsed = new URL(raw);
      } catch (e) {
        showToast('Invalid image URL', 'error');
        return;
      }
      if (!['http:', 'https:'].includes(parsed.protocol)) {
        showToast('Only http/https image URLs are supported', 'error');
        return;
      }
      const testImg = new Image();
      testImg.onload = () => {
        const name = parsed.pathname.split('/').filter(Boolean).pop() || parsed.hostname;
        setPreview(raw, name, raw);
        showToast('Image URL loaded', 'info');
      };
      testImg.onerror = () => showToast('Could not load image from URL', 'error');
      testImg.src = raw;
    }
    fileInput.addEventListener('change', (e) => {
      const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
      if (file) processFile(file);
      e.target.value = '';
    });
    urlBtn.addEventListener('click', applyImageUrl);
    urlInput.addEventListener('keydown', (e) => {
      if (e.key === 'Enter') {
        e.preventDefault();
        applyImageUrl();
      }
    });
    if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
    if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
    if (btnClear) btnClear.addEventListener('click', clearPreview);
    dropZone.addEventListener('dragover', (e) => {
      e.preventDefault();
      dropZone.classList.add('drag-over');
    });
    dropZone.addEventListener('dragleave', (e) => {
      e.preventDefault();
      dropZone.classList.remove('drag-over');
    });
    dropZone.addEventListener('drop', (e) => {
      e.preventDefault();
      dropZone.classList.remove('drag-over');
      if (e.dataTransfer.files && e.dataTransfer.files.length) processFile(e.dataTransfer.files[0]);
    });
    promptInput.addEventListener('input', syncPromptToGradio);
    function activateModelTab(name) {
      document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
        btn.classList.toggle('active', btn.getAttribute('data-model') === name);
      });
      const hidePrompt = name === 'LightOnOCR-2-1B';
      if (promptPanel) promptPanel.classList.toggle('is-hidden', hidePrompt);
      if (promptTabsBar) promptTabsBar.classList.toggle('is-hidden', hidePrompt);
      syncModelToGradio(name);
      syncPromptToGradio();
    }
    function findPromptKeyByValue(value) {
      return Object.keys(promptTemplates).find(key => promptTemplates[key] === value) || null;
    }
    function activatePromptTab(key) {
      document.querySelectorAll('.prompt-tab[data-prompt-key]').forEach(btn => {
        btn.classList.toggle('active', btn.getAttribute('data-prompt-key') === key);
      });
      if (!key || !promptTemplates[key]) return;
      promptInput.value = promptTemplates[key];
      syncPromptToGradio();
    }
    window.__activateModelTab = activateModelTab;
    window.__activatePromptTab = activatePromptTab;
    document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
      btn.addEventListener('click', () => {
        const model = btn.getAttribute('data-model');
        activateModelTab(model);
      });
    });
    document.querySelectorAll('.prompt-tab[data-prompt-key]').forEach(btn => {
      btn.addEventListener('click', () => {
        const promptKey = btn.getAttribute('data-prompt-key');
        activatePromptTab(promptKey);
      });
    });
    activateModelTab('Nanonets-OCR2-3B');
    const initialPromptKey = findPromptKeyByValue(promptInput.value) || 'GENERAL';
    if (!promptInput.value.trim() && promptTemplates[initialPromptKey]) {
      promptInput.value = promptTemplates[initialPromptKey];
    }
    activatePromptTab(findPromptKeyByValue(promptInput.value) || initialPromptKey);
    promptInput.addEventListener('input', () => {
      const matchingKey = findPromptKeyByValue(promptInput.value);
      document.querySelectorAll('.prompt-tab[data-prompt-key]').forEach(btn => {
        btn.classList.toggle('active', btn.getAttribute('data-prompt-key') === matchingKey);
      });
    });
    function syncSlider(customId, gradioId) {
      const slider = document.getElementById(customId);
      const valSpan = document.getElementById(customId + '-val');
      if (!slider) return;
      slider.addEventListener('input', () => {
        if (valSpan) valSpan.textContent = slider.value;
        const container = document.getElementById(gradioId);
        if (!container) return;
        container.querySelectorAll('input[type="range"],input[type="number"]').forEach(el => {
          const ns = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
          if (ns && ns.set) {
            ns.set.call(el, slider.value);
            el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
            el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
          }
        });
      });
    }
    syncSlider('custom-max-new-tokens', 'gradio-max-new-tokens');
    syncSlider('custom-temperature', 'gradio-temperature');
    syncSlider('custom-top-p', 'gradio-top-p');
    syncSlider('custom-top-k', 'gradio-top-k');
    syncSlider('custom-repetition-penalty', 'gradio-repetition-penalty');
    syncSlider('custom-gpu-duration', 'gradio-gpu-duration');
    function validateBeforeRun() {
      const promptVal = promptInput.value.trim();
      const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
      const requiresPrompt = currentModel !== 'LightOnOCR-2-1B';
      if (!imageState && !promptVal && requiresPrompt) {
        showToast('Please upload an image and enter your OCR instruction', 'error');
        flashPromptError();
        return false;
      }
      if (!imageState) {
        showToast('Please upload an image', 'error');
        return false;
      }
      if (requiresPrompt && !promptVal) {
        showToast('Please enter your OCR/query instruction', 'warning');
        flashPromptError();
        return false;
      }
      if (!currentModel) {
        showToast('Please select a model', 'error');
        return false;
      }
      return true;
    }
    window.__clickGradioRunBtn = function() {
      if (!validateBeforeRun()) return;
      syncPromptToGradio();
      syncImageToGradio();
      const active = document.querySelector('.model-tab.active');
      if (active) syncModelToGradio(active.getAttribute('data-model'));
      if (outputArea) outputArea.value = '';
      showLoader();
      setTimeout(() => {
        const gradioBtn = document.getElementById('gradio-run-btn');
        if (!gradioBtn) {
          setRunErrorState();
          if (outputArea) outputArea.value = '[ERROR] Run button not found.';
          showToast('Run button not found', 'error');
          return;
        }
        const btn = gradioBtn.querySelector('button');
        if (btn) btn.click(); else gradioBtn.click();
      }, 180);
    };
    if (runBtnEl) runBtnEl.addEventListener('click', () => window.__clickGradioRunBtn());
    const copyBtn = document.getElementById('copy-output-btn');
    if (copyBtn) {
      copyBtn.addEventListener('click', async () => {
        try {
          const text = outputArea ? outputArea.value : '';
          if (!text.trim()) {
            showToast('No output to copy', 'warning');
            flashOutputError();
            return;
          }
          await navigator.clipboard.writeText(text);
          showToast('Output copied to clipboard', 'info');
        } catch(e) {
          showToast('Copy failed', 'error');
        }
      });
    }
    const saveBtn = document.getElementById('save-output-btn');
    if (saveBtn) {
      saveBtn.addEventListener('click', () => {
        const text = outputArea ? outputArea.value : '';
        if (!text.trim()) {
          showToast('No output to save', 'warning');
          flashOutputError();
          return;
        }
        const blob = new Blob([text], {type: 'text/plain;charset=utf-8'});
        const a = document.createElement('a');
        a.href = URL.createObjectURL(blob);
        a.download = 'multimodal_ocr_output.txt';
        document.body.appendChild(a);
        a.click();
        setTimeout(() => {
          URL.revokeObjectURL(a.href);
          document.body.removeChild(a);
        }, 200);
        showToast('Output saved', 'info');
      });
    }
    function applyExamplePayload(raw) {
      try {
        const data = JSON.parse(raw);
        if (data.status === 'ok') {
          if (data.image) setPreview(data.image, data.name || 'example.jpg', data.image);
          if (data.query) {
            promptInput.value = data.query;
            syncPromptToGradio();
          }
          if (data.model) activateModelTab(data.model);
          document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
          showToast('Example loaded', 'info');
        } else if (data.status === 'error') {
          document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
          showToast(data.message || 'Failed to load example', 'error');
        }
      } catch (e) {
        document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
      }
    }
    function startExamplePolling() {
      if (examplePoller) clearInterval(examplePoller);
      let attempts = 0;
      examplePoller = setInterval(() => {
        attempts += 1;
        const current = getValueFromContainer('example-result-data');
        if (current && current !== lastSeenExamplePayload) {
          lastSeenExamplePayload = current;
          clearInterval(examplePoller);
          examplePoller = null;
          applyExamplePayload(current);
          return;
        }
        if (attempts >= 100) {
          clearInterval(examplePoller);
          examplePoller = null;
          document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
          showToast('Example load timed out', 'error');
        }
      }, 120);
    }
    function triggerExampleLoad(idx) {
      const btnWrap = document.getElementById('example-load-btn');
      const btn = btnWrap ? (btnWrap.querySelector('button') || btnWrap) : null;
      if (!btn) return;
      let attempts = 0;
      function writeIdxAndClick() {
        attempts += 1;
        const ok1 = setGradioValue('example-idx-input', String(idx));
        setGradioValue('example-result-data', '');
        const currentVal = getValueFromContainer('example-idx-input');
        if (ok1 && currentVal === String(idx)) {
          btn.click();
          startExamplePolling();
          return;
        }
        if (attempts < 30) {
          setTimeout(writeIdxAndClick, 100);
        } else {
          document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
          showToast('Failed to initialize example loader', 'error');
        }
      }
      writeIdxAndClick();
    }
    document.querySelectorAll('.example-card[data-idx]').forEach(card => {
      card.addEventListener('click', () => {
        const idx = card.getAttribute('data-idx');
        if (idx === null || idx === undefined || idx === '') return;
        document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
        card.classList.add('loading');
        showToast('Loading example...', 'info');
        triggerExampleLoad(idx);
      });
    });
    const observerTarget = document.getElementById('example-result-data');
    if (observerTarget) {
      const obs = new MutationObserver(() => {
        const current = getValueFromContainer('example-result-data');
        if (!current || current === lastSeenExamplePayload) return;
        lastSeenExamplePayload = current;
        if (examplePoller) {
          clearInterval(examplePoller);
          examplePoller = null;
        }
        applyExamplePayload(current);
      });
      obs.observe(observerTarget, {childList:true, subtree:true, characterData:true, attributes:true});
    }
    if (outputArea) outputArea.value = '';
    const sb = document.getElementById('sb-run-state');
    if (sb) sb.textContent = 'Ready';
    if (imgStatus) imgStatus.textContent = 'No image uploaded';
  }
  init();
}
"""
# JavaScript that mirrors the hidden Gradio result textbox (#gradio-result)
# into the custom output textarea. It watches via a MutationObserver plus a
# 500 ms polling fallback; text starting with '[ERROR]' flips the status bar
# to the error state, any other non-empty text hides the loader.
# NOTE(review): runtime payload — only this Python-side comment is added.
wire_outputs_js = r"""
() => {
  function watchOutputs() {
    const resultContainer = document.getElementById('gradio-result');
    const outArea = document.getElementById('custom-output-textarea');
    if (!resultContainer || !outArea) { setTimeout(watchOutputs, 500); return; }
    let lastText = '';
    function isErrorText(val) {
      return typeof val === 'string' && val.trim().startsWith('[ERROR]');
    }
    function syncOutput() {
      const el = resultContainer.querySelector('textarea') || resultContainer.querySelector('input');
      if (!el) return;
      const val = el.value || '';
      if (val !== lastText) {
        lastText = val;
        outArea.value = val;
        outArea.scrollTop = outArea.scrollHeight;
        if (val.trim()) {
          if (isErrorText(val)) {
            if (window.__setRunErrorState) window.__setRunErrorState();
            if (window.__showToast) window.__showToast('Inference failed', 'error');
          } else {
            if (window.__hideLoader) window.__hideLoader();
          }
        }
      }
    }
    const observer = new MutationObserver(syncOutput);
    observer.observe(resultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
    setInterval(syncOutput, 500);
  }
  watchOutputs();
}
"""
# Inline SVG fragments embedded into the custom HTML UI. Kept as plain
# strings so they can be interpolated into the page markup verbatim.

# App logo: four corner brackets framing three text lines (scan/OCR motif).
OCR_LOGO_SVG = """
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
<path d="M4 5.5A2.5 2.5 0 0 1 6.5 3H11v2H6.5a.5.5 0 0 0-.5.5V10H4V5.5Z"/>
<path d="M20 10h-2V5.5a.5.5 0 0 0-.5-.5H13V3h4.5A2.5 2.5 0 0 1 20 5.5V10Z"/>
<path d="M4 14h2v4.5a.5.5 0 0 0 .5.5H11v2H6.5A2.5 2.5 0 0 1 4 18.5V14Z"/>
<path d="M20 14v4.5A2.5 2.5 0 0 1 17.5 21H13v-2h4.5a.5.5 0 0 0 .5-.5V14h2Z"/>
<path d="M8 8h8v2H8V8Zm0 3h8v2H8v-2Zm0 3h5v2H8v-2Z"/>
</svg>
"""
# Placeholder artwork shown in the empty drop zone (dashed frame + mountains).
UPLOAD_PREVIEW_SVG = """
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="8" y="14" width="64" height="52" rx="6" fill="none" stroke="#ADFF2F" stroke-width="2" stroke-dasharray="4 3"/>
<polygon points="12,62 30,40 42,50 54,34 68,62" fill="rgba(173,255,47,0.15)" stroke="#ADFF2F" stroke-width="1.5"/>
<circle cx="28" cy="30" r="6" fill="rgba(173,255,47,0.22)" stroke="#ADFF2F" stroke-width="1.5"/>
</svg>
"""
# Icon for the "copy output" action button.
COPY_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16 1H4C2.9 1 2 1.9 2 3v12h2V3h12V1zm3 4H8C6.9 5 6 5.9 6 7v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"/></svg>"""
# Icon for the "save output to file" action button.
SAVE_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V7l-4-4zM7 5h8v4H7V5zm12 14H5v-6h14v6z"/></svg>"""
# Header tab buttons, one per available model; "Nanonets-OCR2-3B" starts active
# (must match the default value of the hidden-model-name textbox below).
MODEL_TABS_HTML = "".join(
    f'<button class="model-tab{" active" if model_name == "Nanonets-OCR2-3B" else ""}" data-model="{model_name}"><span class="model-tab-label">{model_name}</span></button>'
    for model_name in MODEL_CHOICES
)
# Prompt-template tab buttons, one per entry in PROMPTS; "GENERAL" starts active.
PROMPT_TABS_HTML = "".join(
    f'<button class="model-tab prompt-tab{" active" if prompt_key == "GENERAL" else ""}" data-prompt-key="{prompt_key}"><span class="model-tab-label">{spec["name"]}</span></button>'
    for prompt_key, spec in PROMPTS.items()
)
# key -> prompt-text mapping, serialized for the client-side <script> data tag.
PROMPT_TEMPLATES_JSON = json.dumps({prompt_key: spec["prompt"] for prompt_key, spec in PROMPTS.items()})
with gr.Blocks() as demo:
    # --- Hidden bridge components -------------------------------------------
    # The visible UI is the custom HTML/JS shell rendered via gr.HTML below.
    # These hidden Gradio components carry state between the custom DOM and
    # the Python backend: client-side JS writes into them (matched by elem_id)
    # and the run_btn.click handler reads them as inputs to run_ocr.
    hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
    prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
    # Default model must agree with the tab marked "active" in MODEL_TABS_HTML.
    hidden_model_name = gr.Textbox(value="Nanonets-OCR2-3B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
    # Generation hyperparameters, mirrored by the custom range sliders in the
    # "Advanced Settings" panel of the HTML shell.
    max_new_tokens = gr.Slider(minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, elem_id="gradio-max-new-tokens", elem_classes="hidden-input", container=False)
    temperature = gr.Slider(minimum=0.1, maximum=4.0, step=0.1, value=0.7, elem_id="gradio-temperature", elem_classes="hidden-input", container=False)
    top_p = gr.Slider(minimum=0.05, maximum=1.0, step=0.05, value=0.9, elem_id="gradio-top-p", elem_classes="hidden-input", container=False)
    top_k = gr.Slider(minimum=1, maximum=1000, step=1, value=50, elem_id="gradio-top-k", elem_classes="hidden-input", container=False)
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.1, elem_id="gradio-repetition-penalty", elem_classes="hidden-input", container=False)
    gpu_duration_state = gr.Number(value=60, elem_id="gradio-gpu-duration", elem_classes="hidden-input", container=False)
    # run_ocr streams its output into `result`; a client-side MutationObserver
    # (wired earlier in the file) mirrors it into the custom output textarea.
    result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)
    # Example loading: JS puts the clicked example's index into example_idx,
    # clicks example_load_btn, and reads the payload back from example_result.
    example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
    example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
    example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")
    # --- Custom UI shell ----------------------------------------------------
    # The entire visible interface is one HTML fragment; elem ids inside it
    # are targeted by the JS loaded via demo.load below. Interpolated values:
    # SVG assets, tab bars, example cards, slider defaults, and the JSON
    # prompt-template payload in the trailing <script> data tag.
    gr.HTML(f"""
<div class="app-shell">
<div class="app-header">
<div class="app-header-left">
<div class="app-logo">{OCR_LOGO_SVG}</div>
<span class="app-title">Multimodal OCR</span>
<span class="app-badge">vision enabled</span>
<span class="app-badge fast">OCR Suite</span>
</div>
</div>
<div class="model-tabs-bar">
{MODEL_TABS_HTML}
</div>
<div id="prompt-tabs-bar" class="model-tabs-bar">
{PROMPT_TABS_HTML}
</div>
<div class="app-main-row">
<div class="app-main-left">
<div id="image-drop-zone">
<div id="upload-prompt" class="upload-prompt-modern">
<div class="upload-prompt-stack">
<div id="upload-click-area" class="upload-click-area">
{UPLOAD_PREVIEW_SVG}
<span class="upload-main-text">Click or drag an image here</span>
<span class="upload-sub-text">Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks</span>
</div>
<div class="upload-url-box">
<input id="image-url-input" class="upload-url-input" type="url" placeholder="...or paste an image URL">
<button id="image-url-btn" class="upload-url-btn" type="button">Use URL</button>
</div>
</div>
</div>
<input id="custom-file-input" type="file" accept="image/*" style="display:none;" />
<div id="single-preview-wrap" class="single-preview-wrap">
<div class="single-preview-card">
<img id="single-preview-img" src="" alt="Preview">
<div class="preview-overlay-actions">
<button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
<button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
</div>
</div>
</div>
</div>
<div class="hint-bar">
<b>Upload:</b> Click, drag, or paste an image URL ·
<b>Model:</b> Switch model tabs from the header ·
<kbd>Clear</kbd> removes the current image
</div>
<div class="examples-section">
<div class="examples-title">Quick Examples</div>
<div class="examples-scroll">
{EXAMPLE_CARDS_HTML}
</div>
</div>
</div>
<div class="app-main-right">
<div id="prompt-panel" class="panel-card">
<div class="panel-card-title">OCR / Vision Instruction</div>
<div class="panel-card-body">
<label class="modern-label" for="custom-query-input">Query Input</label>
<textarea id="custom-query-input" class="modern-textarea" rows="8" placeholder="e.g., perform OCR on the image precisely, extract all text, describe the visual scene, summarize visible content..."></textarea>
</div>
</div>
<div style="padding:12px 20px;">
<button id="custom-run-btn" class="btn-run">
<span id="run-btn-label">Run OCR</span>
</button>
</div>
<div class="output-frame">
<div class="out-title">
<span id="output-title-label">Raw Output Stream</span>
<div class="out-title-right">
<button id="copy-output-btn" class="out-action-btn" title="Copy">{COPY_SVG} Copy</button>
<button id="save-output-btn" class="out-action-btn" title="Save">{SAVE_SVG} Save File</button>
</div>
</div>
<div class="out-body">
<div class="modern-loader" id="output-loader">
<div class="loader-spinner"></div>
<div class="loader-text">Running OCR...</div>
<div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
</div>
<div class="output-scroll-wrap">
<textarea id="custom-output-textarea" class="output-textarea" placeholder="Raw output will appear here..." readonly></textarea>
</div>
</div>
</div>
<div class="settings-group">
<div class="settings-group-title">Advanced Settings</div>
<div class="settings-group-body">
<div class="slider-row">
<label>Max new tokens</label>
<input type="range" id="custom-max-new-tokens" min="1" max="{MAX_MAX_NEW_TOKENS}" step="1" value="{DEFAULT_MAX_NEW_TOKENS}">
<span class="slider-val" id="custom-max-new-tokens-val">{DEFAULT_MAX_NEW_TOKENS}</span>
</div>
<div class="slider-row">
<label>Temperature</label>
<input type="range" id="custom-temperature" min="0.1" max="4.0" step="0.1" value="0.7">
<span class="slider-val" id="custom-temperature-val">0.7</span>
</div>
<div class="slider-row">
<label>Top-p</label>
<input type="range" id="custom-top-p" min="0.05" max="1.0" step="0.05" value="0.9">
<span class="slider-val" id="custom-top-p-val">0.9</span>
</div>
<div class="slider-row">
<label>Top-k</label>
<input type="range" id="custom-top-k" min="1" max="1000" step="1" value="50">
<span class="slider-val" id="custom-top-k-val">50</span>
</div>
<div class="slider-row">
<label>Repetition penalty</label>
<input type="range" id="custom-repetition-penalty" min="1.0" max="2.0" step="0.05" value="1.1">
<span class="slider-val" id="custom-repetition-penalty-val">1.1</span>
</div>
<div class="slider-row">
<label>GPU Duration (seconds)</label>
<input type="range" id="custom-gpu-duration" min="60" max="240" step="30" value="60">
<span class="slider-val" id="custom-gpu-duration-val">60</span>
</div>
</div>
</div>
</div>
</div>
<div class="exp-note">
Experimental OCR Suite · Open on <a href="https://github.com/PRITHIVSAKTHIUR/Multimodal-OCR" target="_blank">GitHub</a>
</div>
<div class="app-statusbar">
<div class="sb-section" id="sb-image-status">No image uploaded</div>
<div class="sb-section sb-fixed" id="sb-run-state">Ready</div>
</div>
<script id="prompt-templates-data" type="application/json">{PROMPT_TEMPLATES_JSON}</script>
</div>
""")
    # Hidden trigger that the custom Run button clicks programmatically.
    run_btn = gr.Button("Run", elem_id="gradio-run-btn")
    # Install the client-side glue on page load (gallery/upload handling and
    # the output-mirroring observer are defined earlier in the file).
    demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
    demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)
    run_btn.click(
        fn=run_ocr,
        inputs=[
            hidden_model_name,
            prompt,
            hidden_image_b64,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
            repetition_penalty,
            gpu_duration_state,
        ],
        outputs=[result],
        # Runs in the browser before run_ocr: reads the live values from the
        # custom DOM widgets (active model tab, query textarea, hidden image
        # field) and substitutes them for the corresponding Gradio inputs,
        # passing the slider values through unchanged.
        js=r"""(m, p, img, mnt, t, tp, tk, rp, gd) => {
const modelEl = document.querySelector('.model-tab.active');
const model = modelEl ? modelEl.getAttribute('data-model') : m;
const promptEl = document.getElementById('custom-query-input');
const promptVal = promptEl ? promptEl.value : p;
const imgContainer = document.getElementById('hidden-image-b64');
let imgVal = img;
if (imgContainer) {
const inner = imgContainer.querySelector('textarea, input');
if (inner) imgVal = inner.value;
}
return [model, promptVal, imgVal, mnt, t, tp, tk, rp, gd];
}""",
    )
    # Example cards: resolve the clicked index into its payload. queue=False
    # so loading an example is not serialized behind pending OCR jobs.
    example_load_btn.click(
        fn=load_example_data,
        inputs=[example_idx],
        outputs=[example_result],
        queue=False,
    )
if __name__ == "__main__":
    # Cap the request queue at 50 pending jobs, then serve the app with the
    # custom stylesheet; "examples" must be allowed so example assets resolve.
    queued_app = demo.queue(max_size=50)
    queued_app.launch(
        css=css,
        mcp_server=False,
        ssr_mode=False,
        show_error=True,
        allowed_paths=["examples"],
    )