remove_weights_from_python_wheel
#6
by
jdye64
- opened
example.py
CHANGED
|
@@ -8,7 +8,7 @@ from nemotron_ocr.inference.pipeline import NemotronOCR
|
|
| 8 |
|
| 9 |
|
| 10 |
def main(image_path, merge_level, no_visualize, model_dir):
|
| 11 |
-
ocr_pipeline = NemotronOCR()
|
| 12 |
|
| 13 |
predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)
|
| 14 |
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def main(image_path, merge_level, no_visualize, model_dir):
|
| 11 |
+
ocr_pipeline = NemotronOCR(model_dir=model_dir)
|
| 12 |
|
| 13 |
predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)
|
| 14 |
|
nemotron-ocr/pyproject.toml
CHANGED
|
@@ -5,6 +5,7 @@ description = "Nemoton OCR"
|
|
| 5 |
authors = [{ name = "NVIDIA Nemotron" }]
|
| 6 |
requires-python = ">=3.12,<3.13"
|
| 7 |
dependencies = [
|
|
|
|
| 8 |
"pandas>=2.3.3",
|
| 9 |
"pillow>=12.0.0",
|
| 10 |
"scikit-learn>=1.7.2",
|
|
|
|
| 5 |
authors = [{ name = "NVIDIA Nemotron" }]
|
| 6 |
requires-python = ">=3.12,<3.13"
|
| 7 |
dependencies = [
|
| 8 |
+
"huggingface_hub>=0.20.0",
|
| 9 |
"pandas>=2.3.3",
|
| 10 |
"pillow>=12.0.0",
|
| 11 |
"scikit-learn>=1.7.2",
|
nemotron-ocr/src/nemotron_ocr/inference/pipeline.py
CHANGED
|
@@ -6,6 +6,7 @@ import io
|
|
| 6 |
import json
|
| 7 |
import os
|
| 8 |
from pathlib import Path
|
|
|
|
| 9 |
|
| 10 |
import numpy as np
|
| 11 |
import torch
|
|
@@ -20,6 +21,7 @@ from nemotron_ocr.inference.post_processing.data.text_region import TextBlock
|
|
| 20 |
from nemotron_ocr.inference.post_processing.quad_rectify import QuadRectify
|
| 21 |
from nemotron_ocr.inference.post_processing.research_ops import parse_relational_results, reorder_boxes
|
| 22 |
from nemotron_ocr.inference.pre_processing import interpolate_and_pad, pad_to_square
|
|
|
|
| 23 |
from nemotron_ocr_cpp import quad_non_maximal_suppression, region_counts_to_indices, rrect_to_quads
|
| 24 |
from PIL import Image, ImageDraw, ImageFont
|
| 25 |
from torch import amp
|
|
@@ -37,25 +39,57 @@ MERGE_LEVELS = {"word", "sentence", "paragraph"}
|
|
| 37 |
DEFAULT_MERGE_LEVEL = "paragraph"
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
class NemotronOCR:
|
| 41 |
"""
|
| 42 |
A high-level pipeline for performing OCR on images.
|
|
|
|
|
|
|
|
|
|
| 43 |
"""
|
| 44 |
|
| 45 |
-
def __init__(self, model_dir=
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
self._load_models()
|
| 49 |
self._load_charset()
|
| 50 |
self._initialize_processors()
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def _load_models(self):
|
| 53 |
"""Loads all necessary models into memory."""
|
| 54 |
self.detector = FOTSDetector(coordinate_mode="RBOX", backbone="regnet_y_8gf", verbose=False)
|
| 55 |
-
self.detector.load_state_dict(
|
|
|
|
|
|
|
| 56 |
|
| 57 |
self.recognizer = TransformerRecognizer(nic=self.detector.num_features[-1], num_tokens=858, max_width=32)
|
| 58 |
-
self.recognizer.load_state_dict(
|
|
|
|
|
|
|
| 59 |
|
| 60 |
self.relational = GlobalRelationalModel(
|
| 61 |
num_input_channels=self.detector.num_features,
|
|
@@ -64,7 +98,9 @@ class NemotronOCR:
|
|
| 64 |
k=16,
|
| 65 |
num_layers=4,
|
| 66 |
)
|
| 67 |
-
self.relational.load_state_dict(
|
|
|
|
|
|
|
| 68 |
|
| 69 |
for model in (self.detector, self.recognizer, self.relational):
|
| 70 |
model = model.cuda()
|
|
|
|
| 6 |
import json
|
| 7 |
import os
|
| 8 |
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
|
| 11 |
import numpy as np
|
| 12 |
import torch
|
|
|
|
| 21 |
from nemotron_ocr.inference.post_processing.quad_rectify import QuadRectify
|
| 22 |
from nemotron_ocr.inference.post_processing.research_ops import parse_relational_results, reorder_boxes
|
| 23 |
from nemotron_ocr.inference.pre_processing import interpolate_and_pad, pad_to_square
|
| 24 |
+
from huggingface_hub import hf_hub_download
|
| 25 |
from nemotron_ocr_cpp import quad_non_maximal_suppression, region_counts_to_indices, rrect_to_quads
|
| 26 |
from PIL import Image, ImageDraw, ImageFont
|
| 27 |
from torch import amp
|
|
|
|
| 39 |
DEFAULT_MERGE_LEVEL = "paragraph"
|
| 40 |
|
| 41 |
|
| 42 |
+
# HuggingFace repository for downloading model weights
HF_REPO_ID = "nvidia/nemotron-ocr-v1"
# Every file required at inference time; __init__ checks for all of them
# before trusting a caller-supplied model_dir. NOTE(review): charset.txt is
# presumably consumed by _load_charset() — its body is not shown here, confirm.
CHECKPOINT_FILES = ["detector.pth", "recognizer.pth", "relational.pth", "charset.txt"]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
class NemotronOCR:
|
| 48 |
"""
|
| 49 |
A high-level pipeline for performing OCR on images.
|
| 50 |
+
|
| 51 |
+
Model weights are automatically downloaded from Hugging Face Hub
|
| 52 |
+
(nvidia/nemotron-ocr-v1) if not found locally.
|
| 53 |
"""
|
| 54 |
|
| 55 |
+
def __init__(self, model_dir: Optional[str] = None):
    """Build the OCR pipeline, resolving where the model weights live.

    Args:
        model_dir: Optional directory expected to contain every file in
            ``CHECKPOINT_FILES``. When omitted, or when any required file
            is missing from it, the weights are fetched from HuggingFace
            Hub instead (cached locally after the first download).
    """
    candidate = Path(model_dir) if model_dir is not None else None
    # Trust the caller-supplied directory only if it is complete;
    # anything less falls back to the Hub download/cache.
    local_ok = candidate is not None and all(
        (candidate / name).is_file() for name in CHECKPOINT_FILES
    )
    self._model_dir = candidate if local_ok else self._download_checkpoints()

    self._load_models()
    self._load_charset()
    self._initialize_processors()
|
| 69 |
|
| 70 |
+
@staticmethod
def _download_checkpoints() -> Path:
    """Download model checkpoints from HuggingFace Hub.

    huggingface_hub caches each file locally, so only the first call
    actually hits the network.

    Returns:
        Path: the local directory that contains all downloaded
        checkpoint files.
    """
    downloaded_path = None
    for filename in CHECKPOINT_FILES:
        # BUG FIX: interpolate the loop variable into the remote path.
        # The previous literal never used `filename`, so every iteration
        # requested the same (nonexistent) file instead of the four
        # distinct checkpoints.
        downloaded_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            filename=f"checkpoints/{filename}",
        )
    # hf_hub_download returns the cached local path of the file it fetched;
    # all files of one repo revision land in the same snapshot directory,
    # so the parent of the last download is the model directory.
    return Path(downloaded_path).parent
|
| 81 |
+
|
| 82 |
def _load_models(self):
|
| 83 |
"""Loads all necessary models into memory."""
|
| 84 |
self.detector = FOTSDetector(coordinate_mode="RBOX", backbone="regnet_y_8gf", verbose=False)
|
| 85 |
+
self.detector.load_state_dict(
|
| 86 |
+
torch.load(self._model_dir / "detector.pth", weights_only=True), strict=True
|
| 87 |
+
)
|
| 88 |
|
| 89 |
self.recognizer = TransformerRecognizer(nic=self.detector.num_features[-1], num_tokens=858, max_width=32)
|
| 90 |
+
self.recognizer.load_state_dict(
|
| 91 |
+
torch.load(self._model_dir / "recognizer.pth", weights_only=True), strict=True
|
| 92 |
+
)
|
| 93 |
|
| 94 |
self.relational = GlobalRelationalModel(
|
| 95 |
num_input_channels=self.detector.num_features,
|
|
|
|
| 98 |
k=16,
|
| 99 |
num_layers=4,
|
| 100 |
)
|
| 101 |
+
self.relational.load_state_dict(
|
| 102 |
+
torch.load(self._model_dir / "relational.pth", weights_only=True), strict=True
|
| 103 |
+
)
|
| 104 |
|
| 105 |
for model in (self.detector, self.recognizer, self.relational):
|
| 106 |
model = model.cuda()
|