#!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 """Nemotron OCR v2 — HuggingFace Space (ZeroGPU).""" import logging import os import subprocess import sys logging.basicConfig(level=logging.INFO, format="%(name)s %(levelname)s: %(message)s") try: import nemotron_ocr except ImportError: print("Installing nemotron_ocr wheel...", flush=True) subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "nemotron_ocr-1.0.0-cp312-cp312-linux_x86_64.whl"]) import json from pathlib import Path import spaces import gradio as gr import numpy as np from huggingface_hub import hf_hub_download from PIL import Image, ImageDraw from nemotron_ocr.inference.pipeline_v2 import NemotronOCRV2 MODELS = { "Multilingual (en, zh, ja, ko, ru, …)": "multi", "English-only": "en", "v1 (legacy, English-only)": "v1", } _pipelines: dict[str, NemotronOCRV2] = {} GROUP_COLORS = [ (76, 175, 80), (33, 150, 243), (255, 152, 0), (156, 39, 176), (0, 188, 212), (244, 67, 54), (139, 195, 74), (121, 85, 72), (255, 235, 59), (63, 81, 181), (233, 30, 99), (0, 150, 136), (255, 87, 34), (103, 58, 183), (205, 220, 57), (96, 125, 139), ] V1_DEFAULT_CONFIG = { "num_tokens": 858, "max_width": 32, "sequence_length": 32, "scope": 512, "coordinate_mode": "RBOX", "backbone": "regnet_y_8gf", "charset_size": 855, } def _ensure_v1_model_dir() -> str: """Download v1 checkpoints and write a model_config.json if missing.""" v1_files = ["detector.pth", "recognizer.pth", "relational.pth", "charset.txt"] model_dir = None for f in v1_files: path = hf_hub_download(repo_id="nvidia/nemotron-ocr-v1", filename=f"checkpoints/{f}") model_dir = str(Path(path).parent) config_path = Path(model_dir) / "model_config.json" if not config_path.exists(): with open(config_path, "w") as fh: json.dump(V1_DEFAULT_CONFIG, fh) return model_dir def _get_pipeline(lang_key: str) -> NemotronOCRV2: if lang_key not in _pipelines: if lang_key in ("v1", "legacy"): model_dir = _ensure_v1_model_dir() _pipelines[lang_key] = NemotronOCRV2(model_dir=model_dir) else: _pipelines[lang_key] = NemotronOCRV2(lang=lang_key) return _pipelines[lang_key] def draw_boxes(image: Image.Image, predictions: list[dict]) -> Image.Image: """Draw detected text regions colored by group index.""" overlay = image.copy().convert("RGBA") box_layer = Image.new("RGBA", overlay.size, (0, 0, 0, 0)) draw = ImageDraw.Draw(box_layer) w, h = image.size for i, pred in enumerate(predictions): x0 = pred["left"] * w y0 = pred["upper"] * h x1 = pred["right"] * w y1 = pred["lower"] * h left = min(x0, x1) upper = min(y0, y1) right = max(x0, x1) lower = max(y0, y1) if right - left < 1 or lower - upper < 1: continue r, g, b = GROUP_COLORS[i % len(GROUP_COLORS)] draw.rectangle([left, upper, right, lower], fill=(r, g, b, 40), outline=(r, g, b, 220), width=2) result = Image.alpha_composite(overlay, box_layer) return result.convert("RGB") # --------------------------------------------------------------------------- # Layout-aware spatial text reconstruction # Adapted from https://github.com/h2oai/h2ogpt/blob/main/src/image_doctr.py # --------------------------------------------------------------------------- def _is_same_line(box_a: dict, box_b: dict) -> bool: """Two word boxes are on the same line if their vertical midpoints overlap.""" a_mid = (box_a["upper"] + box_a["lower"]) / 2 b_mid = (box_b["upper"] + box_b["lower"]) / 2 a_top, a_bot = min(box_a["upper"], box_a["lower"]), max(box_a["upper"], box_a["lower"]) b_top, b_bot = min(box_b["upper"], box_b["lower"]), max(box_b["upper"], box_b["lower"]) return a_top < b_mid < a_bot and b_top < a_mid < b_bot def space_layout(words: list[dict]) -> str: """Reconstruct the page layout using monospace-style spacing. Groups words into lines by vertical overlap, sorts left-to-right within each line, then pads with spaces so horizontal positions are preserved. """ if not words: return "(no text detected)" texts = [w.get("text", "") for w in words] boxes = np.array([ [min(w["left"], w["right"]), min(w["upper"], w["lower"]), max(w["left"], w["right"]), max(w["upper"], w["lower"])] for w in words ]) sorted_ids = sorted(range(len(boxes)), key=lambda i: boxes[i][1]) remaining = set(sorted_ids) line_texts: list[list[str]] = [] line_boxes: list[np.ndarray] = [] max_line_char_num = 0 max_line_width = 0.0 while remaining: anchor = min(remaining) anchor_box = boxes[anchor] inline = [] for idx in list(remaining): mid_y = (boxes[idx][1] + boxes[idx][3]) / 2 if anchor_box[1] <= mid_y <= anchor_box[3]: inline.append(idx) for idx in inline: remaining.discard(idx) inline.sort(key=lambda i: boxes[i][0]) lt = [texts[i] for i in inline] lb = boxes[np.array(inline)] line_texts.append(lt) line_boxes.append(lb) joined = " ".join(lt) if len(joined) > max_line_char_num: max_line_char_num = len(joined) max_line_width = float(lb[:, 2].max() - lb[:, 0].min()) char_width = (max_line_width / max_line_char_num) if max_line_char_num > 0 else 0.02 char_width = max(char_width, 0.005) output_lines = [] for lt, lb in zip(line_texts, line_boxes): line = "" for j, (text, box) in enumerate(zip(lt, lb)): target_col = int(box[0] / char_width) pad = max(target_col - len(line), 1 if j > 0 else 0) line += " " * pad + text output_lines.append(line) return "\n".join(output_lines) def format_text(predictions: list[dict], merge_level: str) -> str: texts = [p.get("text", "") for p in predictions if p.get("text")] if not texts: return "(no text detected)" if merge_level == "paragraph": return "\n\n".join(texts) elif merge_level == "sentence": return "\n".join(texts) else: return " ".join(texts) @spaces.GPU(duration=120) def run_ocr(image: Image.Image, model_name: str, merge_level: str): if image is None: return None, "Please upload an image." lang_key = MODELS[model_name] ocr = _get_pipeline(lang_key) img_array = np.array(image.convert("RGB")) if merge_level == "layout": words = ocr(img_array, merge_level="word") annotated = draw_boxes(image, words) result_text = space_layout(words) else: display_preds = ocr(img_array, merge_level=merge_level) annotated = draw_boxes(image, display_preds) result_text = format_text(display_preds, merge_level) return annotated, result_text with gr.Blocks( title="Nemotron OCR v2", theme=gr.themes.Default(), ) as demo: gr.Markdown( "# Nemotron OCR v2\n" "State-of-the-art multilingual OCR by NVIDIA. " "Upload an image to extract text with bounding boxes.\n\n" "*Powered by ZeroGPU — GPU allocated on demand.*" ) gr.HTML( """