emelryan committed on
Commit
b28505d
·
0 Parent(s):

Duplicate from nvidia/nemotron-ocr-v2-multilingual

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +37 -0
  2. Dockerfile +21 -0
  3. LICENSE +243 -0
  4. README.md +400 -0
  5. THIRD_PARTY_NOTICES.md +519 -0
  6. checkpoints/charset.txt +0 -0
  7. checkpoints/detector.pth +3 -0
  8. checkpoints/model_config.json +18 -0
  9. checkpoints/recognizer.pth +3 -0
  10. checkpoints/relational.pth +3 -0
  11. config.json +0 -0
  12. docker-compose.yaml +21 -0
  13. example.py +61 -0
  14. nemotron-ocr/.gitignore +9 -0
  15. nemotron-ocr/cpp/.gitattributes +1 -0
  16. nemotron-ocr/cpp/.gitignore +6 -0
  17. nemotron-ocr/cpp/.gitmodules +3 -0
  18. nemotron-ocr/cpp/README.md +15 -0
  19. nemotron-ocr/cpp/beam_decode/beam_decode.cpp +459 -0
  20. nemotron-ocr/cpp/beam_decode/beam_decode.h +17 -0
  21. nemotron-ocr/cpp/beam_decode/kn_lm.cpp +85 -0
  22. nemotron-ocr/cpp/beam_decode/kn_lm.h +26 -0
  23. nemotron-ocr/cpp/beam_decode/language_model.cpp +146 -0
  24. nemotron-ocr/cpp/beam_decode/language_model.h +65 -0
  25. nemotron-ocr/cpp/beam_decode/log_sum_exp.cpp +6 -0
  26. nemotron-ocr/cpp/beam_decode/log_sum_exp.h +53 -0
  27. nemotron-ocr/cpp/beam_decode/ngram_lm_base.cpp +329 -0
  28. nemotron-ocr/cpp/beam_decode/ngram_lm_base.h +79 -0
  29. nemotron-ocr/cpp/beam_decode/prefix.cpp +22 -0
  30. nemotron-ocr/cpp/beam_decode/prefix.h +157 -0
  31. nemotron-ocr/cpp/beam_decode/sbo_lm.cpp +46 -0
  32. nemotron-ocr/cpp/beam_decode/sbo_lm.h +20 -0
  33. nemotron-ocr/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp +93 -0
  34. nemotron-ocr/cpp/better_grid_sample/gpu_grid_sample_utils.cuh +41 -0
  35. nemotron-ocr/cpp/better_grid_sample/gpu_indirect_grid_sample.cu +327 -0
  36. nemotron-ocr/cpp/better_grid_sample/grid_sample.h +66 -0
  37. nemotron-ocr/cpp/common.cpp +12 -0
  38. nemotron-ocr/cpp/common.h +57 -0
  39. nemotron-ocr/cpp/cuda_intellisense.cuh +50 -0
  40. nemotron-ocr/cpp/geometry.h +1100 -0
  41. nemotron-ocr/cpp/geometry_api/calc_poly_min_rrect.cpp +164 -0
  42. nemotron-ocr/cpp/geometry_api/geometry_api.cpp +100 -0
  43. nemotron-ocr/cpp/geometry_api/geometry_api.h +15 -0
  44. nemotron-ocr/cpp/geometry_api/geometry_api_common.h +120 -0
  45. nemotron-ocr/cpp/geometry_api/geometry_api_gpu.cu +141 -0
  46. nemotron-ocr/cpp/geometry_api/get_rel_continuation_cos.cpp +59 -0
  47. nemotron-ocr/cpp/geometry_api/matrix2x2.h +92 -0
  48. nemotron-ocr/cpp/geometry_api/poly_bounds_quad.cpp +60 -0
  49. nemotron-ocr/cpp/graph_detection/encode_util.cpp +271 -0
  50. nemotron-ocr/cpp/graph_detection/encode_util.h +183 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.ipynb filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvcr.io/nvidia/pytorch:25.09-py3
2
+
3
+ ARG TARGETARCH
4
+
5
+ ARG TORCH_CUDA_ARCH_LIST
6
+ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
7
+
8
+ RUN --mount=type=cache,target=/root/.cache/pip \
9
+ pip install -U pip hatchling "setuptools>=68" --root-user-action ignore
10
+
11
+ COPY nemotron-ocr /workspace/nemotron-ocr
12
+ WORKDIR /workspace/nemotron-ocr
13
+
14
+ # Ensure no prebuilt binaries/artifacts from the host are present
15
+ RUN rm -f src/nemotron_ocr_cpp/*.so || true \
16
+ && rm -rf build/ dist/
17
+
18
+ RUN --mount=type=cache,target=/root/.cache/pip \
19
+ BUILD_CPP_FORCE=1 ARCH=${TARGETARCH} pip install -v . --no-build-isolation --root-user-action ignore
20
+
21
+ WORKDIR /workspace
LICENSE ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ All binary model files are licensed under NVIDIA Open Model License Agreement.
2
+ All source code files are licensed under the Apache 2.0 License.
3
+
4
+ ------------
5
+ NVIDIA Open Model License Agreement
6
+ Last Modified: October 24, 2025
7
+ https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/
8
+
9
+ This NVIDIA Open Model License Agreement (the “Agreement”) is a legal agreement between the Legal Entity You represent, or if no entity is identified, You and NVIDIA Corporation and its Affiliates (“NVIDIA”) and governs Your use of the Models that NVIDIA provides to You under this Agreement. NVIDIA and You are each a “party” and collectively the “parties.”
10
+
11
+ NVIDIA models released under this Agreement are intended to be used permissively and enable the further development of AI technologies. Subject to the terms of this Agreement, NVIDIA confirms that:
12
+
13
+ - Models are commercially usable.
14
+ - You are free to create and distribute Derivative Models.
15
+ - NVIDIA does not claim ownership to any outputs generated using the Models or Derivative Models.
16
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Model, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement.
17
+
18
+ 1. Definitions. The following definitions apply to this Agreement:
19
+
20
+ 1.1 "Derivative Model" means all (a) modifications to the Model, (b) works based on the Model, and (c) any other derivative works of the Model. An output is not a Derivative Model.
21
+
22
+ 1.2 "Legal Entity" means the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares, or (c) beneficial ownership of such entity.
23
+
24
+ 1.3 “Model” means the machine learning model, software, checkpoints, learnt weights, algorithms, parameters, configuration files and documentation shared under this Agreement.
25
+
26
+ 1.4 "NVIDIA Cosmos Model" means a multimodal Model shared under this Agreement
27
+
28
+ 1.5 "Special-Purpose Model" means a Model that is only competent in a narrow set of purpose-specific tasks and should not be used for unintended or general-purpose applications
29
+
30
+ 1.6 “You” or “Your” means an individual or Legal Entity exercising permissions granted by this Agreement.
31
+
32
+ 2. Conditions for Use, License Grant, AI Ethics and IP Ownership.
33
+
34
+ 2.1 Conditions for Use. The Model and any Derivative Model are subject to additional terms as described in Section 2 and Section 3 of this Agreement and govern Your use. If You institute copyright or patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model or a Derivative Model constitutes direct or contributory copyright or patent infringement, then any licenses granted to You under this Agreement for that Model or Derivative Model will terminate as of the date such litigation is filed. If You bypass, disable, reduce the efficacy of, or circumvent any technical limitation, safety guardrail or associated safety guardrail hyperparameter, encryption, security, digital rights management, or authentication mechanism (collectively “Guardrail”) contained in the Model without a substantially similar Guardrail appropriate for your use case, your rights under this Agreement will automatically terminate. NVIDIA may indicate in relevant documentation that a Model is a Special-Purpose Model. NVIDIA may update this Agreement to comply with legal and regulatory requirements at any time and You agree to either comply with any updated license or cease Your copying, use, and distribution of the Model and any Derivative Model.
35
+
36
+ 2.2 License Grant. The rights granted herein are explicitly conditioned on Your full compliance with the terms of this Agreement. Subject to the terms and conditions of this Agreement, NVIDIA hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, revocable (as stated in Section 2.1) license to publicly perform, publicly display, reproduce, use, create derivative works of, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution) and import the Model.
37
+
38
+ 2.3 AI Ethics. Use of the Models under the Agreement must be consistent with NVIDIA’s Trustworthy AI terms found at https://www.nvidia.com/en-us/agreements/trustworthy-ai/terms/.
39
+
40
+ 2.4 NVIDIA owns the Model and any Derivative Models created by NVIDIA. Subject to NVIDIA’s underlying ownership rights in the Model or its Derivative Models, You are and will be the owner of Your Derivative Models. NVIDIA claims no ownership rights in outputs. You are responsible for outputs and their subsequent uses. Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Model and (b) no other license or right is granted to you by implication, estoppel or otherwise.
41
+
42
+ 3. Redistribution. You may reproduce and distribute copies of the Model or Derivative Models thereof in any medium, with or without modifications, provided that You meet the following conditions:
43
+
44
+ 3.1 If you distribute the Model, You must give any other recipients of the Model a copy of this Agreement and include the following attribution notice within a “Notice” text file with such copies: “Licensed by NVIDIA Corporation under the NVIDIA Open Model License”;
45
+
46
+ 3.2 If you distribute or make available a NVIDIA Cosmos Model, or a product or service (including an AI model) that contains or uses a NVIDIA Cosmos Model, use a NVIDIA Cosmos Model to create a Derivative Model, or use a NVIDIA Cosmos Model or its outputs to create, train, fine tune, or otherwise improve an AI model, you will include “Built on NVIDIA Cosmos” on a related website, user interface, blogpost, about page, or product documentation; and
47
+
48
+ 3.3 You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Models as a whole, provided Your use, reproduction, and distribution of the Model otherwise complies with the conditions stated in this Agreement.
49
+
50
+ 4. Separate Components. The Models may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as an Open Source Software License or other third-party license. The components are subject to the applicable other licenses, including any proprietary notices, disclaimers, requirements and extended use rights; except that this Agreement will prevail regarding the use of third-party Open Source Software License, unless a third-party Open Source Software License requires its license terms to prevail. “Open Source Software License” means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (https://opensource.org), Free Software Foundation (https://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (https://www.spdx.org).
51
+
52
+ 5. Trademarks. This Agreement does not grant permission to use the trade names, trademarks, service marks, or product names of NVIDIA, except as required for reasonable and customary use in describing the origin of the Model and reproducing the content of the “Notice” text file.
53
+
54
+ 6. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, NVIDIA provides the Model on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for reviewing Model documentation, including any Special-Purpose Model limitations, and determining the appropriateness of using or redistributing the Model, Derivative Models and outputs. You assume any risks associated with Your exercise of permissions under this Agreement.
55
+
56
+ 7. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, will NVIDIA be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this Agreement or out of the use or inability to use the Model, Derivative Models or outputs (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if NVIDIA has been advised of the possibility of such damages.
57
+
58
+ 8. Indemnity. You will indemnify and hold harmless NVIDIA from and against any claim by any third party arising out of or related to your use or distribution of the Model, Derivative Models or outputs.
59
+
60
+ 9. Feedback. NVIDIA appreciates your feedback, and You agree that NVIDIA may use it without restriction or compensation to You.
61
+
62
+ 10. Governing Law. This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that, either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.
63
+
64
+ 11. Trade and Compliance. You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use.
65
+
66
+ Version Release Date: October 24, 2025
67
+
68
+ -----------------
69
+ Apache License
70
+ Version 2.0, January 2004
71
+ http://www.apache.org/licenses/
72
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
73
+ 1. Definitions.
74
+ "License" shall mean the terms and conditions for use, reproduction,
75
+ and distribution as defined by Sections 1 through 9 of this document.
76
+ "Licensor" shall mean the copyright owner or entity authorized by
77
+ the copyright owner that is granting the License.
78
+ "Legal Entity" shall mean the union of the acting entity and all
79
+ other entities that control, are controlled by, or are under common
80
+ control with that entity. For the purposes of this definition,
81
+ "control" means (i) the power, direct or indirect, to cause the
82
+ direction or management of such entity, whether by contract or
83
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
84
+ outstanding shares, or (iii) beneficial ownership of such entity.
85
+ "You" (or "Your") shall mean an individual or Legal Entity
86
+ exercising permissions granted by this License.
87
+ "Source" form shall mean the preferred form for making modifications,
88
+ including but not limited to software source code, documentation
89
+ source, and configuration files.
90
+ "Object" form shall mean any form resulting from mechanical
91
+ transformation or translation of a Source form, including but
92
+ not limited to compiled object code, generated documentation,
93
+ and conversions to other media types.
94
+ "Work" shall mean the work of authorship, whether in Source or
95
+ Object form, made available under the License, as indicated by a
96
+ copyright notice that is included in or attached to the work
97
+ (an example is provided in the Appendix below).
98
+ "Derivative Works" shall mean any work, whether in Source or Object
99
+ form, that is based on (or derived from) the Work and for which the
100
+ editorial revisions, annotations, elaborations, or other modifications
101
+ represent, as a whole, an original work of authorship. For the purposes
102
+ of this License, Derivative Works shall not include works that remain
103
+ separable from, or merely link (or bind by name) to the interfaces of,
104
+ the Work and Derivative Works thereof.
105
+ "Contribution" shall mean any work of authorship, including
106
+ the original version of the Work and any modifications or additions
107
+ to that Work or Derivative Works thereof, that is intentionally
108
+ submitted to Licensor for inclusion in the Work by the copyright owner
109
+ or by an individual or Legal Entity authorized to submit on behalf of
110
+ the copyright owner. For the purposes of this definition, "submitted"
111
+ means any form of electronic, verbal, or written communication sent
112
+ to the Licensor or its representatives, including but not limited to
113
+ communication on electronic mailing lists, source code control systems,
114
+ and issue tracking systems that are managed by, or on behalf of, the
115
+ Licensor for the purpose of discussing and improving the Work, but
116
+ excluding communication that is conspicuously marked or otherwise
117
+ designated in writing by the copyright owner as "Not a Contribution."
118
+ "Contributor" shall mean Licensor and any individual or Legal Entity
119
+ on behalf of whom a Contribution has been received by Licensor and
120
+ subsequently incorporated within the Work.
121
+ 2. Grant of Copyright License. Subject to the terms and conditions of
122
+ this License, each Contributor hereby grants to You a perpetual,
123
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
124
+ copyright license to reproduce, prepare Derivative Works of,
125
+ publicly display, publicly perform, sublicense, and distribute the
126
+ Work and such Derivative Works in Source or Object form.
127
+ 3. Grant of Patent License. Subject to the terms and conditions of
128
+ this License, each Contributor hereby grants to You a perpetual,
129
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
130
+ (except as stated in this section) patent license to make, have made,
131
+ use, offer to sell, sell, import, and otherwise transfer the Work,
132
+ where such license applies only to those patent claims licensable
133
+ by such Contributor that are necessarily infringed by their
134
+ Contribution(s) alone or by combination of their Contribution(s)
135
+ with the Work to which such Contribution(s) was submitted. If You
136
+ institute patent litigation against any entity (including a
137
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
138
+ or a Contribution incorporated within the Work constitutes direct
139
+ or contributory patent infringement, then any patent licenses
140
+ granted to You under this License for that Work shall terminate
141
+ as of the date such litigation is filed.
142
+ 4. Redistribution. You may reproduce and distribute copies of the
143
+ Work or Derivative Works thereof in any medium, with or without
144
+ modifications, and in Source or Object form, provided that You
145
+ meet the following conditions:
146
+ (a) You must give any other recipients of the Work or
147
+ Derivative Works a copy of this License; and
148
+ (b) You must cause any modified files to carry prominent notices
149
+ stating that You changed the files; and
150
+ (c) You must retain, in the Source form of any Derivative Works
151
+ that You distribute, all copyright, patent, trademark, and
152
+ attribution notices from the Source form of the Work,
153
+ excluding those notices that do not pertain to any part of
154
+ the Derivative Works; and
155
+ (d) If the Work includes a "NOTICE" text file as part of its
156
+ distribution, then any Derivative Works that You distribute must
157
+ include a readable copy of the attribution notices contained
158
+ within such NOTICE file, excluding those notices that do not
159
+ pertain to any part of the Derivative Works, in at least one
160
+ of the following places: within a NOTICE text file distributed
161
+ as part of the Derivative Works; within the Source form or
162
+ documentation, if provided along with the Derivative Works; or,
163
+ within a display generated by the Derivative Works, if and
164
+ wherever such third-party notices normally appear. The contents
165
+ of the NOTICE file are for informational purposes only and
166
+ do not modify the License. You may add Your own attribution
167
+ notices within Derivative Works that You distribute, alongside
168
+ or as an addendum to the NOTICE text from the Work, provided
169
+ that such additional attribution notices cannot be construed
170
+ as modifying the License.
171
+ You may add Your own copyright statement to Your modifications and
172
+ may provide additional or different license terms and conditions
173
+ for use, reproduction, or distribution of Your modifications, or
174
+ for any such Derivative Works as a whole, provided Your use,
175
+ reproduction, and distribution of the Work otherwise complies with
176
+ the conditions stated in this License.
177
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
178
+ any Contribution intentionally submitted for inclusion in the Work
179
+ by You to the Licensor shall be under the terms and conditions of
180
+ this License, without any additional terms or conditions.
181
+ Notwithstanding the above, nothing herein shall supersede or modify
182
+ the terms of any separate license agreement you may have executed
183
+ with Licensor regarding such Contributions.
184
+ 6. Trademarks. This License does not grant permission to use the trade
185
+ names, trademarks, service marks, or product names of the Licensor,
186
+ except as required for reasonable and customary use in describing the
187
+ origin of the Work and reproducing the content of the NOTICE file.
188
+ 7. Disclaimer of Warranty. Unless required by applicable law or
189
+ agreed to in writing, Licensor provides the Work (and each
190
+ Contributor provides its Contributions) on an "AS IS" BASIS,
191
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
192
+ implied, including, without limitation, any warranties or conditions
193
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
194
+ PARTICULAR PURPOSE. You are solely responsible for determining the
195
+ appropriateness of using or redistributing the Work and assume any
196
+ risks associated with Your exercise of permissions under this License.
197
+ 8. Limitation of Liability. In no event and under no legal theory,
198
+ whether in tort (including negligence), contract, or otherwise,
199
+ unless required by applicable law (such as deliberate and grossly
200
+ negligent acts) or agreed to in writing, shall any Contributor be
201
+ liable to You for damages, including any direct, indirect, special,
202
+ incidental, or consequential damages of any character arising as a
203
+ result of this License or out of the use or inability to use the
204
+ Work (including but not limited to damages for loss of goodwill,
205
+ work stoppage, computer failure or malfunction, or any and all
206
+ other commercial damages or losses), even if such Contributor
207
+ has been advised of the possibility of such damages.
208
+ 9. Accepting Warranty or Additional Liability. While redistributing
209
+ the Work or Derivative Works thereof, You may choose to offer,
210
+ and charge a fee for, acceptance of support, warranty, indemnity,
211
+ or other liability obligations and/or rights consistent with this
212
+ License. However, in accepting such obligations, You may act only
213
+ on Your own behalf and on Your sole responsibility, not on behalf
214
+ of any other Contributor, and only if You agree to indemnify,
215
+ defend, and hold each Contributor harmless for any liability
216
+ incurred by, or claims asserted against, such Contributor by reason
217
+ of your accepting any such warranty or additional liability.
218
+ END OF TERMS AND CONDITIONS
219
+ APPENDIX: How to apply the Apache License to your work.
220
+ To apply the Apache License to your work, attach the following
221
+ boilerplate notice, with the fields enclosed by brackets "[]"
222
+ replaced with your own identifying information. (Don't include
223
+ the brackets!) The text should be enclosed in the appropriate
224
+ comment syntax for the file format. We also recommend that a
225
+ file or class name and description of purpose be included on the
226
+ same "printed page" as the copyright notice for easier
227
+ identification within third-party archives.
228
+
229
+ Copyright [yyyy] [name of copyright owner]
230
+
231
+ Licensed under the Apache License, Version 2.0 (the "License");
232
+ you may not use this file except in compliance with the License.
233
+ You may obtain a copy of the License at
234
+
235
+ http://www.apache.org/licenses/LICENSE-2.0
236
+
237
+ Unless required by applicable law or agreed to in writing, software
238
+ distributed under the License is distributed on an "AS IS" BASIS,
239
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
240
+ See the License for the specific language governing permissions and
241
+ limitations under the License.
242
+
243
+
README.md ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: nvidia-open-model-license
4
+ license_link: >-
5
+ https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/
6
+ language:
7
+ - en
8
+ - zh
9
+ - ja
10
+ - ko
11
+ - ru
12
+ pipeline_tag: image-to-text
13
+ arxiv: None
14
+ tags:
15
+ - image
16
+ - ocr
17
+ - object recognition
18
+ - text recognition
19
+ - layout analysis
20
+ - ingestion
21
+ - multilingual
22
+ ---
23
+
24
+ # Nemotron OCR v2 (multilingual)
25
+
26
+ ## **Model Overview**
27
+
28
+ ### **Description**
29
+
30
+ Nemotron OCR v2 is a state-of-the-art multilingual text recognition model designed for robust end-to-end optical character recognition (OCR) on complex real-world images. It integrates three core neural network modules: a detector for text region localization, a recognizer for transcription of detected regions, and a relational model for layout and structure analysis.
31
+
32
+ This model is optimized for a wide variety of OCR tasks, including multi-line, multi-block, and natural scene text, and it supports advanced reading order analysis via its relational model component. Nemotron OCR v2 supports multiple languages and has been developed to be production-ready and commercially usable, with a focus on speed and accuracy on both document and natural scene images.
33
+
34
+ Nemotron OCR v2 is part of the NVIDIA NeMo Retriever collection, which provides state-of-the-art, commercially-ready models and microservices optimized for the lowest latency and highest throughput. It features a production-ready information retrieval pipeline with enterprise support. The models that form the core of this solution have been trained using responsibly selected, auditable data sources. With multiple pre-trained models available as starting points, developers can readily customize them for domain-specific use cases, such as information technology, human resource help assistants, and research and development assistants.
35
+
36
+ This model is ready for commercial use.
37
+
38
+ ### **License/Terms of use**
39
+
40
+ The use of this model is governed by the [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).
41
+
42
+ ### Release Date: <br>
43
+ Hugging Face (this repo) [nvidia/nemotron-ocr-v2-multilingual](https://huggingface.co/nvidia/nemotron-ocr-v2-multilingual) <br>
44
+ Collection / variant hub: [nvidia/nemotron-ocr-v2](https://huggingface.co/nvidia/nemotron-ocr-v2) <br>
45
+ Build.Nvidia.com 04/15/2026 via [https://build.nvidia.com/nvidia/nemotron-ocr-v2](https://build.nvidia.com/nvidia/nemotron-ocr-v2) <br>
46
+ NGC 04/15/2026 via [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo-microservices/containers/nemoretriever-ocr-v2](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo-microservices/containers/nemoretriever-ocr-v2) <br>
47
+
48
+ ### Deployment Geography
49
+
50
+ Global
51
+
52
+ ### Use Case
53
+
54
+ **Nemotron OCR v2** is designed for high-accuracy and high-speed extraction of textual information from images across multiple languages, making it ideal for powering multimodal retrieval systems, Retrieval-Augmented Generation (RAG) pipelines, and agentic applications that require seamless integration of visual and language understanding. Its robust multilingual performance and efficiency make it an excellent choice for next-generation AI systems that demand both precision and scalability across diverse real-world content.
55
+
56
+ ### **Model Architecture**
57
+
58
+ **Architecture Type:** Hybrid detector-recognizer with document-level relational modeling
59
+
60
+ Nemotron OCR v2 is available in two variants:
61
+
62
+ - **v2_english** — Optimized for English-language OCR with a compact recognizer for lower latency.
63
+ - **v2_multilingual** — Supports English, Chinese (Simplified and Traditional), Japanese, Korean, and Russian with a larger recognizer to accommodate the expanded character set.
64
+
65
+ Both variants share the same three-component architecture:
66
+
67
+ - **Text Detector:** Utilizes a RegNetX-8GF convolutional backbone for high-accuracy localization of text regions within images.
68
+ - **Text Recognizer:** Employs a pre-norm Transformer-based sequence recognizer to transcribe text from detected regions, supporting variable word and line lengths.
69
+ - **Relational Model:** Applies a multi-layer global relational module to predict logical groupings, reading order, and layout relationships across detected text elements.
70
+
71
+ All components are trained jointly in an end-to-end fashion, providing robust, scalable, and production-ready OCR for diverse document and scene images.
72
+
73
+ **Network Architecture**: RegNetX-8GF
74
+
75
+ #### Recognizer Comparison
76
+
77
+ The two variants share an identical detector and relational architecture but differ in recognizer capacity:
78
+
79
+ | Spec | v2_english | v2_multilingual |
80
+ |------|-----------|----------------|
81
+ | Transformer layers | 3 | 6 |
82
+ | Hidden dimension (`d_model`) | 256 | 512 |
83
+ | FFN width (`dim_feedforward`) | 1024 | 2048 |
84
+ | Attention heads | 8 | 8 |
85
+ | Max sequence length | 32 | 128 |
86
+ | Character set size | 855 | 14,244 |
87
+
88
+ #### Parameter Counts
89
+
90
+ **v2_english** (from `v2_english/`):
91
+
92
+ | Component | Parameters |
93
+ |-------------------|-------------|
94
+ | Detector | 45,445,259 |
95
+ | Recognizer | 6,130,657 |
96
+ | Relational model | 2,255,419 |
97
+ | **Total** | **53,831,335** |
98
+
99
+ **v2_multilingual** (this repository: `checkpoints/`):
100
+
101
+ | Component | Parameters |
102
+ |-------------------|-------------|
103
+ | Detector | 45,445,259 |
104
+ | Recognizer | 36,119,598 |
105
+ | Relational model | 2,288,187 |
106
+ | **Total** | **83,853,044** |
107
+
108
+ ### **Input**
109
+
110
+ | Property | Value |
111
+ |------------------|-------------------|
112
+ | Input Type & Format | Image (RGB, PNG/JPEG, float32/uint8), aggregation level (word, sentence, or paragraph) |
113
+ | Input Parameters (Two-Dimensional) | 3 x H x W (single image) or B x 3 x H x W (batch) |
114
+ | Input Range | [0, 1] (float32) or [0, 255] (uint8, auto-converted) |
115
+ | Other Properties | Handles both single images and batches. Automatic multi-scale resizing for best accuracy. |
116
+
117
+ ### **Output**
118
+
119
+ | Property | Value |
120
+ |-----------------|-------------------|
121
+ | Output Type | Structured OCR results: a list of detected text regions (bounding boxes), recognized text, and confidence scores |
122
+ | Output Format | Bounding boxes: tuple of floats, recognized text: string, confidence score: float |
123
+ | Output Parameters | Bounding boxes: One-Dimensional (1D) list of bounding box coordinates, recognized text: One-Dimensional (1D) list of strings, confidence score: One-Dimensional (1D) list of floats |
124
+ | Other Properties | Please see the sample output for an example of the model output |
125
+
126
+ ### Sample output
127
+
128
+ ```
129
+ ocr_boxes = [[[15.552736282348633, 43.141815185546875],
130
+ [150.00149536132812, 43.141815185546875],
131
+ [150.00149536132812, 56.845645904541016],
132
+ [15.552736282348633, 56.845645904541016]],
133
+ [[298.3145751953125, 44.43315124511719],
134
+ [356.93585205078125, 44.43315124511719],
135
+ [356.93585205078125, 57.34814453125],
136
+ [298.3145751953125, 57.34814453125]],
137
+ [[15.44686508178711, 13.67985725402832],
138
+ [233.15859985351562, 13.67985725402832],
139
+ [233.15859985351562, 27.376562118530273],
140
+ [15.44686508178711, 27.376562118530273]],
141
+ [[298.51727294921875, 14.268900871276855],
142
+ [356.9850769042969, 14.268900871276855],
143
+ [356.9850769042969, 27.790447235107422],
144
+ [298.51727294921875, 27.790447235107422]]]
145
+
146
+ ocr_txts = ['The previous notice was dated',
147
+ '22 April 2016',
148
+ 'The previous notice was given to the company on',
149
+ '22 April 2016']
150
+
151
+ ocr_confs = [0.97730815, 0.98834222, 0.96804602, 0.98499225]
152
+ ```
153
+
154
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
155
+
156
+
157
+ ### Usage
158
+
159
+ #### Prerequisites
160
+
161
+ - **OS**: Linux amd64 with NVIDIA GPU
162
+ - **CUDA**: CUDA Toolkit 12.8 and compatible NVIDIA driver installed (for PyTorch CUDA). Verify with `nvidia-smi`.
163
+ - **Python**: 3.12 (both subpackages require `python = ~3.12`)
164
+ - **Build tools (when building the C++ extension)**:
165
+ - GCC/G++ with C++17 support
166
+ - CUDA toolkit headers (for building CUDA kernels)
167
+ - OpenMP (used by the C++ extension)
168
+
169
+
170
+ #### Installation
171
+ The model requires torch, and the custom code available in this repository.
172
+
173
+ 1. Clone the repository
174
+
175
+ - Make sure git-lfs is installed (https://git-lfs.com)
176
+ ```
177
+ git lfs install
178
+ ```
179
+
180
+ 2. Installation
181
+
182
+ ##### With pip
183
+
184
+ - Create and activate a Python 3.12 environment (optional)
185
+
186
+ - Run the following command to install the package:
187
+
188
+ ```bash
189
+ cd nemotron-ocr
190
+ pip install hatchling
191
+ pip install -v .
192
+ ```
193
+
194
+ ##### With docker
195
+
196
+ Run the example end-to-end without installing anything on the host (besides Docker, docker compose, and NVIDIA Container Toolkit):
197
+
198
+ - Ensure Docker can see your GPU:
199
+
200
+ ```bash
201
+ docker run --rm --gpus all nvcr.io/nvidia/pytorch:25.09-py3 nvidia-smi
202
+ ```
203
+
204
+ - From the repo root, bring up the service to run the example (sample image `ocr-example-input-1.png` when present):
205
+
206
+ ```bash
207
+ docker compose run --rm nemotron-ocr \
208
+ bash -lc "python example.py ocr-example-input-1.png --merge-level paragraph"
209
+ ```
210
+
211
+ This will:
212
+ - Build an image from the provided `Dockerfile` (based on `nvcr.io/nvidia/pytorch`)
213
+ - Mount the repo at `/workspace`
214
+ - Run `example.py` (downloads **v2 multilingual** from Hugging Face on first run unless you pass `--model-dir`)
215
+
216
+ Output is saved next to your input image as `<name>-annotated.<ext>` on the host.
217
+
218
+
219
+ 3. Run the model using the following code.
220
+
221
+ Use `nemotron_ocr.inference.pipeline.NemotronOCR`. With no arguments, checkpoints are downloaded from Hugging Face: **by default** the **v2 multilingual** bundle ([`nvidia/nemotron-ocr-v2-multilingual`](https://huggingface.co/nvidia/nemotron-ocr-v2-multilingual), `checkpoints/`). Use `lang="en"` for the English-optimized v2 build (`nvidia/nemotron-ocr-v2` / `v2_english/`), or pass `model_dir` to load from disk (any complete checkpoint folder; `lang` is then ignored).
222
+
223
+ ```python
224
+ from nemotron_ocr.inference.pipeline import NemotronOCR
225
+
226
+ # Default: Hugging Face v2 multilingual
227
+ ocr = NemotronOCR()
228
+
229
+ # English-optimized v2 (Hub)
230
+ ocr_en = NemotronOCR(lang="en")
231
+
232
+ # Multilingual v2 explicitly (same default as NemotronOCR())
233
+ ocr_multi = NemotronOCR(lang="multi")
234
+
235
+ # Local directory with detector.pth, recognizer.pth, relational.pth, charset.txt (this repo: ./checkpoints)
236
+ ocr_local = NemotronOCR(model_dir="./checkpoints")
237
+
238
+ # Legacy v1 weights from Hub (optional)
239
+ ocr_v1 = NemotronOCR(lang="v1")
240
+
241
+ predictions = ocr("ocr-example-input-1.png")
242
+
243
+ for pred in predictions:
244
+ print(
245
+ f" - Text: '{pred['text']}', "
246
+ f"Confidence: {pred['confidence']:.2f}, "
247
+ f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
248
+ )
249
+ ```
250
+
251
+ **Constructor rules**
252
+
253
+ - **`model_dir`**: If it contains all four checkpoint files, that directory is used and **`lang` is ignored**.
254
+ - **`lang`** (keyword only): When weights are fetched from the Hub — `None` or `"multi"` / `"multilingual"` → [nvidia/nemotron-ocr-v2-multilingual](https://huggingface.co/nvidia/nemotron-ocr-v2-multilingual) `checkpoints/` (default); `"en"` / `"english"` → `nvidia/nemotron-ocr-v2` / `v2_english/`; `"v1"` / `"legacy"` → original v1 layout on `nvidia/nemotron-ocr-v1`.
255
+ - If `model_dir` is set but incomplete, the client falls back to a Hub download using **`lang`** (defaulting to v2 multilingual when `lang` is `None`).
256
+
257
+ ### Software Integration
258
+
259
+ **Runtime Engine(s):**
260
+ - PyTorch
261
+
262
+ **Supported Hardware Microarchitecture Compatibility:**
263
+ - NVIDIA Ampere
264
+ - NVIDIA Blackwell
265
+ - NVIDIA Hopper
266
+ - NVIDIA Lovelace
267
+
268
+ **Preferred/Supported Operating System(s):**
269
+ - Linux
270
+
271
+ ## Model Version(s)
272
+
273
+ * **This repository:** Nemotron OCR **v2 multilingual** (`checkpoints/`).
274
+ * **Related:** [nvidia/nemotron-ocr-v2](https://huggingface.co/nvidia/nemotron-ocr-v2) hosts the **v2 English** variant (`v2_english/`) and collection metadata.
275
+
276
+ ## **Training and Evaluation Datasets:**
277
+
278
+ ### **Training Dataset**
279
+
280
+ **Data Modality**
281
+ * Image
282
+
283
+ **Image Training Data Size**
284
+ * Approximately 12 million images
285
+
286
+ The model is trained on a large-scale, curated mix of real-world and synthetic OCR datasets spanning multiple languages, scripts, and document types.
287
+
288
+ **Real-world datasets (~680K images):** Natural scene text, multilingual scene text, arbitrary-shaped text, chart and infographic text, table images with bilingual annotations, and handwritten document pages. These cover diverse layouts, languages, and document types.
289
+
290
+ **Synthetic datasets (~11M+ images):** Rendered multilingual document pages in six languages (English, Japanese, Korean, Russian, Chinese Simplified, and Chinese Traditional) and synthetic historical document crops covering archaic characters with degradation effects.
291
+
292
+ **Data Collection Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
293
+ **Labeling Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
294
+ **Properties:** Includes scanned documents, natural scene images, charts, tables, infographics, handwritten documents, and synthetic rendered pages in multiple languages and scripts.
295
+
296
+ ### **Evaluation Datasets**
297
+
298
+ Nemotron OCR v2 is evaluated on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), a comprehensive document OCR benchmark covering English, Chinese, and mixed-language content across diverse document categories.
299
+
300
+ **Data Collection Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
301
+ **Labeling Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
302
+ **Properties:** Benchmarks include challenging scene images, documents with varied layouts, and multi-language data.
303
+
304
+ ### **Evaluation Results**
305
+
306
+ Tables below are **reference metrics** from NVIDIA’s benchmark runs (OmniDocBench, SynthDoG). Reproducing them requires datasets and scripts that are **not** checked into this Hugging Face repository.
307
+
308
+ #### OmniDocBench
309
+
310
+ Normalized Edit Distance (NED) sample_avg on OmniDocBench (lower = better). Results follow OmniDocBench methodology (empty predictions skipped). All models evaluated in crop mode. Speed measured on a single A100 GPU.
311
+
312
+ | Model | crops/s | pages/s | EN | ZH | Mixed | White | Single | Multi | Normal | Rotate90 | Rotate270 | Horizontal |
313
+ | :--- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
314
+ | PaddleOCR v5 (server) | 20.6 | 1.2 | 0.027 | 0.037 | 0.041 | 0.031 | 0.035 | 0.064 | 0.031 | 0.116 | 0.897 | 0.027 |
315
+ | OpenOCR (server) | 17.4 | 1.5 | 0.024 | 0.033 | 0.049 | 0.027 | 0.034 | 0.061 | 0.028 | 0.042 | 0.761 | 0.034 |
316
+ | **Nemotron OCR v2 (Multilingual)** | **68.1** | **21.8** | **0.048** | **0.072** | **0.142** | **0.061** | **0.049** | **0.117** | **0.062** | **0.109** | **0.332** | **0.372** |
317
+ | *Nemotron OCR v2 (EN)* | *74.6* | *19.9* | *0.038* | *0.830* | *0.437* | *0.348* | *0.282* | *0.572* | *0.353* | *0.232* | *0.827* | *0.893* |
318
+ | EasyOCR | 10.3 | 0.4 | 0.095 | 0.117 | 0.326 | 0.095 | 0.179 | 0.322 | 0.110 | 0.987 | 0.979 | 0.809 |
319
+ | Tesseract-OCR | | | 0.096 | 0.551 | 0.250 | 0.439 | 0.328 | 0.331 | 0.426 | 0.117 | 0.969 | 0.984 |
320
+ | *Nemotron OCR v1* | *61.1* | *21.4* | *0.038* | *0.876* | *0.436* | *0.472* | *0.434* | *0.715* | *0.482* | *0.358* | *0.871* | *0.979* |
321
+
322
+ Column key: **crops/s** and **pages/s** are throughput using the v2 batched pipeline where measured; **EN** = English, **ZH** = Simplified Chinese, **Mixed** = English/Chinese mixed, **White/Single/Multi** = background type, **Normal/Rotate90/Rotate270/Horizontal** = text orientation.
323
+
324
+ #### [SynthDoG](https://github.com/clovaai/donut/tree/master/synthdog) Generated Benchmark Data
325
+
326
+ Normalized Edit Distance (NED) page_avg on [SynthDoG](https://github.com/clovaai/donut/tree/master/synthdog) generated benchmark data (lower = better):
327
+
328
+ | Language | PaddleOCR (base) | PaddleOCR (specialized) | OpenOCR (server) | Nemotron OCR v1 | *Nemotron OCR v2 (EN)* | **Nemotron OCR v2** |
329
+ | :--- | ---: | ---: | ---: | ---: | ---: | ---: |
330
+ | English | 0.117 | 0.096 | 0.105 | 0.078 | *0.079* | **0.069** |
331
+ | Japanese | 0.201 | 0.201 | 0.586 | 0.723 | *0.765* | **0.046** |
332
+ | Korean | 0.943 | 0.133 | 0.837 | 0.923 | *0.924* | **0.047** |
333
+ | Russian | 0.959 | 0.163 | 0.950 | 0.564 | *0.632* | **0.043** |
334
+ | Chinese (Simplified) | 0.054 | 0.054 | 0.061 | 0.784 | *0.819* | **0.035** |
335
+ | Chinese (Traditional) | 0.094 | 0.094 | 0.127 | 0.700 | *0.756* | **0.065** |
336
+
337
+ ### **Detailed Performance Analysis**
338
+
339
+ The model demonstrates robust multilingual performance on complex layouts, noisy backgrounds, and challenging real-world scenes. Reading order and block detection are powered by the relational module, supporting downstream applications such as chart-to-text, table-to-text, and infographic-to-text extraction.
340
+
341
+ **Inference**<br>
342
+ **Acceleration Engine:** PyTorch<br>
343
+ **Supported Hardware:** H100 PCIe/SXM, A100 PCIe/SXM, L40S, L4, A10G, H200 NVL, B200, RTX PRO 6000 Blackwell Server Edition<br>
344
+
345
+ ## Ethical Considerations
346
+
347
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. <br>
348
+ The integration of foundation and fine-tuned models into AI systems requires additional testing using use-case-specific data to ensure safe and effective deployment. Following the V-model methodology, iterative testing and validation at both unit and system levels are essential to mitigate risks, meet technical and functional requirements, and ensure compliance with safety and ethical standards before deployment. <br>
349
+ Please make sure you have proper rights and permissions for all input image and video content; if image or video includes people, personal health information, or intellectual property, the image or video generated will not blur or maintain proportions of image subjects included. <br>
350
+ For more detailed information on ethical considerations for this model, please see the [Explainability](#explainability), [Bias](#bias), [Safety](#safety) & Security, and [Privacy](#privacy) sections below. <br>
351
+ Please report security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
352
+
353
+ ## Bias
354
+
355
+ | Field | Response |
356
+ | ----- | ----- |
357
+ | Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing | None |
358
+ | Measures taken to mitigate against unwanted bias | None |
359
+
360
+
361
+ ## Explainability
362
+
363
+ | Field | Response |
364
+ | ----- | ----- |
365
+ | Intended Task/Domain: | Optical Character Recognition (OCR) with a focus on retrieval application and documents. |
366
+ | Model Type: | Hybrid neural network with convolutional detector, transformer recognizer, and document structure modeling. |
367
+ | Intended Users: | Developers and teams building AI-driven search applications, retrieval-augmented generation (RAG) workflows, multimodal agents, or document intelligence applications. It is ideal for those working with large collections of scanned or photographed documents, including PDFs, forms, and reports. |
368
+ | Output: | Structured OCR results, including detected bounding boxes, recognized text, and confidence scores. |
369
+ | Describe how the model works: | The model first detects text regions in the image, then transcribes recognized text, and finally analyzes document structure and reading order. Outputs structured, machine-readable results suitable for downstream search and analysis. |
370
+ | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable |
371
+ | Technical Limitations & Mitigation: | Performance may vary across languages and scripts. |
372
+ | Verified to have met prescribed NVIDIA quality standards: | Yes |
373
+ | Performance Metrics: | Accuracy (e.g., character error rate), throughput, and latency. |
374
+ | Potential Known Risks: | The model may not always extract or transcribe all text with perfect accuracy, particularly in cases of poor image quality or highly stylized fonts. |
375
+ | Licensing & Terms of Use: | Use of this model is governed by [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). |
376
+
377
+
378
+ ## Privacy
379
+
380
+ | Field | Response |
381
+ | ----- | ----- |
382
+ | Generatable or reverse engineerable personal data? | No |
383
+ | Personal data used to create this model? | None Known |
384
+ | How often is dataset reviewed? | The dataset is initially reviewed when added, and subsequent reviews are conducted as needed or in response to change requests. |
385
+ | Is there provenance for all datasets used in training? | Yes |
386
+ | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
387
+ | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
388
+ | Applicable Privacy Policy | https://www.nvidia.com/en-us/about-nvidia/privacy-policy/ |
389
+ | Was consent obtained for any personal data used? | Not Applicable |
390
+ | Was data from user interactions with the AI model (e.g. user input and prompts) used to train the model? | No |
391
+
392
+
393
+ ## Safety
394
+
395
+ | Field | Response |
396
+ | ----- | ----- |
397
+ | Model Application Field(s): | Text recognition and structured OCR for multimodal retrieval. Inputs can include natural scene images, scanned documents, charts, tables, and infographics. |
398
+ | Use Case Restrictions: | Abide by [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). |
399
+ | Model and dataset restrictions: | The principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions enforce dataset access only during training, and all dataset license constraints are adhered to. |
400
+ | Describe the life critical impact (if present): | Not applicable. |
THIRD_PARTY_NOTICES.md ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright "Angus Johnson" - Boost Software License 1.0
2
+ License Text([https://sourceforge.net/p/polyclipping/code/HEAD/tree/tags/6.2.0/License.txt](https://sourceforge.net/p/polyclipping/code/HEAD/tree/tags/6.2.0/License.txt))
3
+
4
+ This notice applies to **clipper**.
5
+
6
+ Copyright (c) 2010-2014 Angus Johnson
7
+
8
+ Boost Software License - Version 1.0 - August 17th, 2003
9
+
10
+ Permission is hereby granted, free of charge, to any person or organization
11
+ obtaining a copy of the software and accompanying documentation covered by
12
+ this license (the "Software") to use, reproduce, display, distribute,
13
+ execute, and transmit the Software, and to prepare derivative works of the
14
+ Software, and to permit third-parties to whom the Software is furnished to
15
+ do so, all subject to the following:
16
+
17
+ The copyright notices in the Software and this entire statement, including
18
+ the above license grant, this restriction and the following disclaimer,
19
+ must be included in all copies of the Software, in whole or in part, and
20
+ all derivative works of the Software, unless such copies or derivative
21
+ works are solely in the form of machine-executable object code generated by
22
+ a source language processor.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
27
+ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
28
+ FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
29
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
30
+ DEALINGS IN THE SOFTWARE.
31
+
32
+ -----
33
+
34
+ Copyright "Ofek Lev" - MIT License
35
+ License Text([https://github.com/pypa/hatch/blob/master/LICENSE.txt](https://github.com/pypa/hatch/blob/master/LICENSE.txt))
36
+
37
+ This notice applies to **hatchling**.
38
+
39
+ Copyright (c) 2017-present Ofek Lev <ofekmeister@gmail.com>
40
+
41
+ Permission is hereby granted, free of charge, to any person obtaining a copy
42
+ of this software and associated documentation files (the "Software"), to deal
43
+ in the Software without restriction, including without limitation the rights
44
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
45
+ copies of the Software, and to permit persons to whom the Software is
46
+ furnished to do so, subject to the following conditions:
47
+
48
+ The above copyright notice and this permission notice shall be included in all
49
+ copies or substantial portions of the Software.
50
+
51
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
53
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
54
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
55
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
56
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
57
+ SOFTWARE.
58
+
59
+ -----
60
+
61
+ Copyright "NumPy Developers" - BSD 3-Clause License
62
+ License Text([https://github.com/numpy/numpy/blob/main/LICENSE.txt](https://github.com/numpy/numpy/blob/main/LICENSE.txt))
63
+
64
+ This notice applies to **numpy**.
65
+
66
+ Copyright (c) 2005-2023, NumPy Developers.
67
+ All rights reserved.
68
+
69
+ Redistribution and use in source and binary forms, with or without
70
+ modification, are permitted provided that the following conditions are
71
+ met:
72
+
73
+ * Redistributions of source code must retain the above copyright
74
+ notice, this list of conditions and the following disclaimer.
75
+
76
+ * Redistributions in binary form must reproduce the above
77
+ copyright notice, this list of conditions and the following
78
+ disclaimer in the documentation and/or other materials provided
79
+ with the distribution.
80
+
81
+ * Neither the name of the NumPy Developers nor the names of any
82
+ contributors may be used to endorse or promote products derived
83
+ from this software without specific prior written permission.
84
+
85
+
86
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+
98
+ -----
99
+
100
+ Copyright "pandas Developers" - BSD 3-Clause License
101
+ License Text([https://github.com/pandas-dev/pandas/blob/main/LICENSE](https://github.com/pandas-dev/pandas/blob/main/LICENSE))
102
+
103
+ This notice applies to **pandas**.
104
+
105
+ Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
106
+ All rights reserved.
107
+
108
+ Copyright (c) 2011-2023, The PyData Development Team
109
+ All rights reserved.
110
+
111
+ Redistribution and use in source and binary forms, with or without
112
+ modification, are permitted provided that the following conditions are
113
+ met:
114
+
115
+
116
+ * Redistributions of source code must retain the above copyright
117
+ notice, this list of conditions and the following disclaimer.
118
+
119
+ * Redistributions in binary form must reproduce the above
120
+ copyright notice, this list of conditions and the following
121
+ disclaimer in the documentation and/or other materials provided
122
+ with the distribution.
123
+
124
+ * Neither the name of the pandas development team nor the names of
125
+ any contributors may be used to endorse or promote products
126
+ derived from this software without specific prior written
127
+ permission.
128
+
129
+
130
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
131
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
132
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
133
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
134
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
135
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
136
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
137
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
138
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
139
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
140
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
141
+
142
+ -----
143
+
144
+ Copyright "Secret Labs AB, Fredrik Lundh, Alex Clark and contributors" - Pillow License
145
+ License Text([https://github.com/python-pillow/Pillow/blob/main/LICENSE](https://github.com/python-pillow/Pillow/blob/main/LICENSE))
146
+
147
+ This notice applies to **PIL (Pillow)**.
148
+
149
+ The Python Imaging Library (PIL) is
150
+ Copyright (c) 1997-2011 by Secret Labs AB
151
+ Copyright (c) 1995-2011 by Fredrik Lundh
152
+ Copyright (c) 2010-2023 by Alex Clark and contributors
153
+
154
+ Like PIL, Pillow is licensed under the open source HPND License:
155
+
156
+ By obtaining, using, and/or copying this software and/or its
157
+ associated documentation, you agree that you have read, understood,
158
+ and will comply with the following terms and conditions:
159
+
160
+ Permission to use, copy, modify, and distribute this software and
161
+ its associated documentation for any purpose and without fee is
162
+ hereby granted, provided that the above copyright notice appears in
163
+ all copies, and that both that copyright notice and this permission
164
+ notice appear in supporting documentation, and that the name of
165
+ Secret Labs AB or the author not be used in advertising or publicity
166
+ pertaining to distribution of the software without specific, written
167
+ prior permission.
168
+
169
+ SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
170
+ TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
171
+ ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
172
+ BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
173
+ DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
174
+ WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
175
+ ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
176
+ OF THIS SOFTWARE.
177
+
178
+ -----
179
+
180
+ Copyright "The scikit-learn developers" - BSD 3-Clause License
181
+ License Text([https://github.com/scikit-learn/scikit-learn/blob/main/COPYING](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING))
182
+
183
+ This notice applies to **scikit-learn**.
184
+
185
+ Copyright (c) 2007-2024 The scikit-learn developers.
186
+ All rights reserved.
187
+
188
+ Redistribution and use in source and binary forms, with or without
189
+ modification, are permitted provided that the following conditions are
190
+ met:
191
+
192
+
193
+ * Redistributions of source code must retain the above copyright
194
+ notice, this list of conditions and the following disclaimer.
195
+
196
+ * Redistributions in binary form must reproduce the above
197
+ copyright notice, this list of conditions and the following
198
+ disclaimer in the documentation and/or other materials provided
199
+ with the distribution.
200
+
201
+ * Neither the name of the scikit-learn developers nor the names of
202
+ any contributors may be used to endorse or promote products
203
+ derived from this software without specific prior written
204
+ permission.
205
+
206
+
207
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
208
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
209
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
210
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
211
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
212
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
213
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
214
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
215
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
216
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
217
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
218
+
219
+ -----
220
+
221
+ Copyright "Jason R. Coombs" - MIT License
222
+ License Text([https://github.com/pypa/setuptools/blob/main/LICENSE](https://github.com/pypa/setuptools/blob/main/LICENSE))
223
+
224
+ This notice applies to **setuptools**.
225
+
226
+ Copyright (c) 2016 Jason R. Coombs <jaraco@jaraco.com>
227
+
228
+ Permission is hereby granted, free of charge, to any person obtaining a copy
229
+ of this software and associated documentation files (the "Software"), to deal
230
+ in the Software without restriction, including without limitation the rights
231
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
232
+ copies of the Software, and to permit persons to whom the Software is
233
+ furnished to do so, subject to the following conditions:
234
+
235
+ The above copyright notice and this permission notice shall be included in all
236
+ copies or substantial portions of the Software.
237
+
238
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
239
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
240
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
241
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
242
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
243
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
244
+ SOFTWARE.
245
+
246
+ -----
247
+
248
+ Copyright "Sean Gillies" - BSD 3-Clause License
249
+ License Text([https://github.com/shapely/shapely/blob/main/LICENSE.txt](https://github.com/shapely/shapely/blob/main/LICENSE.txt))
250
+
251
+ This notice applies to **Shapely**.
252
+
253
+ Copyright (c) 2007, Sean Gillies.
254
+ All rights reserved.
255
+
256
+ Redistribution and use in source and binary forms, with or without
257
+ modification, are permitted provided that the following conditions are met:
258
+
259
+
260
+ * Redistributions of source code must retain the above copyright
261
+ notice, this list of conditions and the following disclaimer.
262
+
263
+ * Redistributions in binary form must reproduce the above copyright
264
+ notice, this list of conditions and the following disclaimer in the
265
+ documentation and/or other materials provided with the distribution.
266
+
267
+ * Neither the name of Sean Gillies nor the names of
268
+ its contributors may be used to endorse or promote products derived from
269
+ this software without specific prior written permission.
270
+
271
+
272
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
273
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
274
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
275
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
276
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
277
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
278
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
279
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
280
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
281
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
282
+ POSSIBILITY OF SUCH DAMAGE.
283
+
284
+ -----
285
+
286
+ Copyright "PyTorch Contributors" - BSD-style License
287
+ License Text([https://github.com/pytorch/pytorch/blob/main/LICENSE](https://github.com/pytorch/pytorch/blob/main/LICENSE))
288
+
289
+ This notice applies to **torch** and **torchvision**.
290
+
291
+ Copyright (c) 2016- Facebook, Inc. (Adam Paszke)
292
+ Copyright (c) 2014- Facebook, Inc. (Soumith Chintala)
293
+ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
294
+ Copyright (c) 2012-2014 DeepMind Technologies (Koray Kavukcuoglu)
295
+ Copyright (c) 2011-2012 NEC Laboratories America (Clement Farabet)
296
+ Copyright (c) 2011-2013 New York University (Antoine Bordes)
297
+ Copyright (c) 2012-2013 University of Montreal (Pascal Vincent)
298
+ Copyright (c) 2014- Google Inc.
299
+ Copyright (c) 2015- Twitter, Inc.
300
+ Copyright (c) 2015- Intel Corporation
301
+ Copyright (c) 2015- AMD Inc.
302
+ Copyright (c) 2016- Baidu, Inc.
303
+ Copyright (c) 2016- Microsoft Corporation
304
+ Copyright (c) 2017- Amazon.com, Inc.
305
+ Copyright (c) 2018- Facebook AI Research
306
+ Copyright (c) 2019- fast.ai, Inc.
307
+ Copyright (c) 2022- PyTorch Contributors
308
+ All rights reserved.
309
+
310
+ Redistribution and use in source and binary forms, with or without
311
+ modification, are permitted provided that the following conditions are met:
312
+
313
+ * Redistributions of source code must retain the above copyright notice, this
314
+ list of conditions and the following disclaimer.
315
+
316
+ * Redistributions in binary form must reproduce the above copyright notice,
317
+ this list of conditions and the following disclaimer in the documentation
318
+ and/or other materials provided with the distribution.
319
+
320
+ * Neither the name of Facebook Inc. nor the names of its contributors may be
321
+ used to endorse or promote products derived from this software without
322
+ specific prior written permission.
323
+
324
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
325
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
326
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
327
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
328
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
329
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
330
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
331
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
332
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
333
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
334
+
335
+ -----
336
+
337
+ Copyright "Baidu USA LLC" - Apache License 2.0
338
+ License Text([https://github.com/bryancatanzaro/trove/blob/master/LICENSE](https://github.com/bryancatanzaro/trove/blob/master/LICENSE))
339
+
340
+ This notice applies to **trove**.
341
+
342
+ Copyright 2015-2016 Baidu USA LLC. All rights reserved.
343
+
344
+ Apache License
345
+ Version 2.0, January 2004
346
+ http://www.apache.org/licenses/
347
+
348
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
349
+
350
+ 1. Definitions.
351
+
352
+ "License" shall mean the terms and conditions for use, reproduction,
353
+ and distribution as defined by Sections 1 through 9 of this document.
354
+
355
+ "Licensor" shall mean the copyright owner or entity authorized by
356
+ the copyright owner that is granting the License.
357
+
358
+ "Legal Entity" shall mean the union of the acting entity and all
359
+ other entities that control, are controlled by, or are under common
360
+ control with that entity. For the purposes of this definition,
361
+ "control" means (i) the power, direct or indirect, to cause the
362
+ direction or management of such entity, whether by contract or
363
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
364
+ outstanding shares, or (iii) beneficial ownership of such entity.
365
+
366
+ "You" (or "Your") shall mean an individual or Legal Entity
367
+ exercising permissions granted by this License.
368
+
369
+ "Source" form shall mean the preferred form for making modifications,
370
+ including but not limited to software source code, documentation
371
+ source, and configuration files.
372
+
373
+ "Object" form shall mean any form resulting from mechanical
374
+ transformation or translation of a Source form, including but
375
+ not limited to compiled object code, generated documentation,
376
+ and conversions to other media types.
377
+
378
+ "Work" shall mean the work of authorship, whether in Source or
379
+ Object form, made available under the License, as indicated by a
380
+ copyright notice that is included in or attached to the work
381
+ (an example is provided in the Appendix below).
382
+
383
+ "Derivative Works" shall mean any work, whether in Source or Object
384
+ form, that is based on (or derived from) the Work and for which the
385
+ editorial revisions, annotations, elaborations, or other modifications
386
+ represent, as a whole, an original work of authorship. For the purposes
387
+ of this License, Derivative Works shall not include works that remain
388
+ separable from, or merely link (or bind by name) to the interfaces of,
389
+ the Work and Derivative Works thereof.
390
+
391
+ "Contribution" shall mean any work of authorship, including
392
+ the original version of the Work and any modifications or additions
393
+ to that Work or Derivative Works thereof, that is intentionally
394
+ submitted to Licensor for inclusion in the Work by the copyright owner
395
+ or by an individual or Legal Entity authorized to submit on behalf of
396
+ the copyright owner. For the purposes of this definition, "submitted"
397
+ means any form of electronic, verbal, or written communication sent
398
+ to the Licensor or its representatives, including but not limited to
399
+ communication on electronic mailing lists, source code control systems,
400
+ and issue tracking systems that are managed by, or on behalf of, the
401
+ Licensor for the purpose of discussing and improving the Work, but
402
+ excluding communication that is conspicuously marked or otherwise
403
+ designated in writing by the copyright owner as "Not a Contribution."
404
+
405
+ "Contributor" shall mean Licensor and any individual or Legal Entity
406
+ on behalf of whom a Contribution has been received by Licensor and
407
+ subsequently incorporated within the Work.
408
+
409
+ 2. Grant of Copyright License. Subject to the terms and conditions of
410
+ this License, each Contributor hereby grants to You a perpetual,
411
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
412
+ copyright license to reproduce, prepare Derivative Works of,
413
+ publicly display, publicly perform, sublicense, and distribute the
414
+ Work and such Derivative Works in Source or Object form.
415
+
416
+ 3. Grant of Patent License. Subject to the terms and conditions of
417
+ this License, each Contributor hereby grants to You a perpetual,
418
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
419
+ (except as stated in this section) patent license to make, have made,
420
+ use, offer to sell, sell, import, and otherwise transfer the Work,
421
+ where such license applies only to those patent claims licensable
422
+ by such Contributor that are necessarily infringed by their
423
+ Contribution(s) alone or by combination of their Contribution(s)
424
+ with the Work to which such Contribution(s) was submitted. If You
425
+ institute patent litigation against any entity (including a
426
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
427
+ or a Contribution incorporated within the Work constitutes direct
428
+ or contributory patent infringement, then any patent licenses
429
+ granted to You under this License for that Work shall terminate
430
+ as of the date such litigation is filed.
431
+
432
+ 4. Redistribution. You may reproduce and distribute copies of the
433
+ Work or Derivative Works thereof in any medium, with or without
434
+ modifications, and in Source or Object form, provided that You
435
+ meet the following conditions:
436
+
437
+ (a) You must give any other recipients of the Work or
438
+ Derivative Works a copy of this License; and
439
+
440
+ (b) You must cause any modified files to carry prominent notices
441
+ stating that You changed the files; and
442
+
443
+ (c) You must retain, in the Source form of any Derivative Works
444
+ that You distribute, all copyright, patent, trademark, and
445
+ attribution notices from the Source form of the Work,
446
+ excluding those notices that do not pertain to any part of
447
+ the Derivative Works; and
448
+
449
+ (d) If the Work includes a "NOTICE" text file as part of its
450
+ distribution, then any Derivative Works that You distribute must
451
+ include a readable copy of the attribution notices contained
452
+ within such NOTICE file, excluding those notices that do not
453
+ pertain to any part of the Derivative Works, in at least one
454
+ of the following places: within a NOTICE text file distributed
455
+ as part of the Derivative Works; within the Source form or
456
+ documentation, if provided along with the Derivative Works; or,
457
+ within a display generated by the Derivative Works, if and
458
+ wherever such third-party notices normally appear. The contents
459
+ of the NOTICE file are for informational purposes only and
460
+ do not modify the License. You may add Your own attribution
461
+ notices within Derivative Works that You distribute, alongside
462
+ or as an addendum to the NOTICE text from the Work, provided
463
+ that such additional attribution notices cannot be construed
464
+ as modifying the License.
465
+
466
+ You may add Your own copyright statement to Your modifications and
467
+ may provide additional or different license terms and conditions
468
+ for use, reproduction, or distribution of Your modifications, or
469
+ for any such Derivative Works as a whole, provided Your use,
470
+ reproduction, and distribution of the Work otherwise complies with
471
+ the conditions stated in this License.
472
+
473
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
474
+ any Contribution intentionally submitted for inclusion in the Work
475
+ by You to the Licensor shall be under the terms and conditions of
476
+ this License, without any additional terms or conditions.
477
+ Notwithstanding the above, nothing herein shall supersede or modify
478
+ the terms of any separate license agreement you may have executed
479
+ with Licensor regarding such Contributions.
480
+
481
+ 6. Trademarks. This License does not grant permission to use the trade
482
+ names, trademarks, service marks, or product names of the Licensor,
483
+ except as required for reasonable and customary use in describing the
484
+ origin of the Work and reproducing the content of the NOTICE file.
485
+
486
+ 7. Disclaimer of Warranty. Unless required by applicable law or
487
+ agreed to in writing, Licensor provides the Work (and each
488
+ Contributor provides its Contributions) on an "AS IS" BASIS,
489
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
490
+ implied, including, without limitation, any warranties or conditions
491
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
492
+ PARTICULAR PURPOSE. You are solely responsible for determining the
493
+ appropriateness of using or redistributing the Work and assume any
494
+ risks associated with Your exercise of permissions under this License.
495
+
496
+ 8. Limitation of Liability. In no event and under no legal theory,
497
+ whether in tort (including negligence), contract, or otherwise,
498
+ unless required by applicable law (such as deliberate and grossly
499
+ negligent acts) or agreed to in writing, shall any Contributor be
500
+ liable to You for damages, including any direct, indirect, special,
501
+ incidental, or consequential damages of any character arising as a
502
+ result of this License or out of the use or inability to use the
503
+ Work (including but not limited to damages for loss of goodwill,
504
+ work stoppage, computer failure or malfunction, or any and all
505
+ other commercial damages or losses), even if such Contributor
506
+ has been advised of the possibility of such damages.
507
+
508
+ 9. Accepting Warranty or Additional Liability. While redistributing
509
+ the Work or Derivative Works thereof, You may choose to offer,
510
+ and charge a fee for, acceptance of support, warranty, indemnity,
511
+ or other liability obligations and/or rights consistent with this
512
+ License. However, in accepting such obligations, You may act only
513
+ on Your own behalf and on Your sole responsibility, not on behalf
514
+ of any other Contributor, and only if You agree to indemnify,
515
+ defend, and hold each Contributor harmless for any liability
516
+ incurred by, or claims asserted against, such Contributor by reason
517
+ of your accepting any such warranty or additional liability.
518
+
519
+ END OF TERMS AND CONDITIONS
checkpoints/charset.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/detector.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d54398ec39156c6a8a17c89588271c84a976195bae4227dc2643c4635c6442e9
3
+ size 181974624
checkpoints/model_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_tokens": 14247,
3
+ "max_width": 128,
4
+ "sequence_length": 128,
5
+ "scope": 2048,
6
+ "coordinate_mode": "RBOX",
7
+ "backbone": "regnet_x_8gf",
8
+ "charset_size": 14244,
9
+ "recognizer_variant": "prenorm",
10
+ "has_pre_norm": false,
11
+ "has_tx_norm": true,
12
+ "norm_first": true,
13
+ "depth": 256,
14
+ "num_layers": 6,
15
+ "nhead": 8,
16
+ "dim_feedforward": 2048,
17
+ "feature_depth": 512
18
+ }
checkpoints/recognizer.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20bf070ab5d9a9e85edbaa140aaa3e2c518ad94fafbf2fa856c8773f1594647c
3
+ size 144516943
checkpoints/relational.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:509701e97de006bb060aa4e7a6937dcfe4222d1717dcc0d447bf396090a1e10b
3
+ size 9175733
config.json ADDED
File without changes
docker-compose.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ nemotron-ocr:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ deploy:
7
+ resources:
8
+ reservations:
9
+ devices:
10
+ - capabilities: [gpu]
11
+ working_dir: /workspace
12
+ volumes:
13
+ - .:/workspace:rw
14
+ - ${XDG_CACHE_HOME:-~/cache}:/root/.cache:rw
15
+ command: bash -lc "python example.py ocr-example-input-1.png --merge-level paragraph"
16
+ ipc: host
17
+ ulimits:
18
+ memlock:
19
+ soft: -1
20
+ hard: -1
21
+ stack: 6710886
example.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import argparse
6
+
7
+ from nemotron_ocr.inference.pipeline import NemotronOCR
8
+
9
+
10
def main(image_path, merge_level, no_visualize, model_dir, lang):
    """Run the Nemotron OCR pipeline on one image and print every detected region.

    Args:
        image_path: Path to the input image.
        merge_level: Output granularity ("word", "sentence", or "paragraph").
        no_visualize: When True, do not save the annotated image.
        model_dir: Optional local checkpoint directory; takes precedence over ``lang``.
        lang: Hub checkpoint selector, used only when ``model_dir`` is None.
    """
    # A local checkpoint directory wins over the hub-download path.
    if model_dir is None:
        ocr_pipeline = NemotronOCR(lang=lang)
    else:
        ocr_pipeline = NemotronOCR(model_dir=model_dir)

    predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)

    # Report each prediction with its confidence and normalized bounding box.
    print(f"Found {len(predictions)} text regions.")
    for pred in predictions:
        print(
            f" - Text: '{pred['text']}', "
            f"Confidence: {pred['confidence']:.2f}, "
            f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
            f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
        )
26
+
27
+
28
if __name__ == "__main__":
    # Command-line entry point: collect options, then delegate to main().
    cli = argparse.ArgumentParser(description="Run OCR inference and annotate image.")
    cli.add_argument("image_path", type=str, help="Path to the input image.")
    cli.add_argument(
        "--merge-level",
        type=str,
        choices=["word", "sentence", "paragraph"],
        default="paragraph",
        help="Merge level for OCR output (word, sentence, paragraph).",
    )
    cli.add_argument("--no-visualize", action="store_true", help="Do not save the annotated image.")
    cli.add_argument(
        "--model-dir",
        type=str,
        default=None,
        help="Path to a directory with detector.pth, recognizer.pth, relational.pth, charset.txt. "
        "If omitted, weights are downloaded from Hugging Face (default: v2 multilingual).",
    )
    cli.add_argument(
        "--lang",
        type=str,
        choices=["en", "multi", "v1"],
        default=None,
        help="Hub checkpoint when --model-dir is omitted: en=v2 English, multi=v2 multilingual (default), v1=legacy.",
    )
    opts = cli.parse_args()

    main(
        opts.image_path,
        merge_level=opts.merge_level,
        no_visualize=opts.no_visualize,
        model_dir=opts.model_dir,
        lang=opts.lang,
    )
nemotron-ocr/.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Built C++/CUDA extension (produced by: pip install -v .)
2
+ src/nemotron_ocr_cpp/*.so
3
+ src/nemotron_ocr_cpp/*.pyd
4
+ __pycache__/
5
+ *.py[cod]
6
+ .pytest_cache/
7
+ .venv/
8
+ build/
9
+ *.egg-info/
nemotron-ocr/cpp/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ load_png/wuffs-v0.3.c filter=lfs diff=lfs merge=lfs -text
nemotron-ocr/cpp/.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ __pycache__
2
+ .vscode
3
+ build
4
+ *.egg-info
5
+ dist
6
+ .vs
nemotron-ocr/cpp/.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "trove"]
2
+ path = trove
3
+ url = https://github.com/bryancatanzaro/trove.git
nemotron-ocr/cpp/README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Optimized Image Operations for PyTorch
2
+
3
+ ## Installation
4
+
5
+ ```
6
+ python setup.py install
7
+ ```
8
+
9
+ ## Usage
10
+
11
+ ```
12
+ # It's important that you do this first
13
+ import torch
14
+ from pytorch_image_ops import color_transform, spatial_transform
15
+ ```
nemotron-ocr/cpp/beam_decode/beam_decode.cpp ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "beam_decode.h"
5
+
6
+ #include <vector>
7
+ #include <deque>
8
+ #include <limits>
9
+ #include <memory>
10
+ #include <unordered_set>
11
+ #include <set>
12
+ #include <algorithm>
13
+ #include <chrono>
14
+
15
+ #include "../common.h"
16
+ #include "prefix.h"
17
+ #include "log_sum_exp.h"
18
+ #include "sbo_lm.h"
19
+
20
+ using namespace std;
21
+
22
+ // 2-D accessor over a prediction sequence, indexed as [t][token] with
+ // size(0) = timesteps and size(1) = vocabulary size (see get_vision_confidence).
+ template<typename scalar_t>
23
+ using pred_seq_t = torch::TensorAccessor<scalar_t, 2>;
24
+
25
+ struct PrefixScore
26
+ {
27
+ float_t lProbBlank;
28
+ float_t lProbChar;
29
+ // float_t raw_lProbBlank;
30
+ // float_t raw_lProbChar;
31
+ mutable float_t _lProb;
32
+
33
+ PrefixScore(float_t lProbBlank = NEG_INF /* log P(0) */, float_t lProbChar = NEG_INF /* log P(0) */)
34
+ : lProbBlank(lProbBlank), lProbChar(lProbChar), _lProb(NEG_INF)
35
+ // , raw_lProbBlank(lProbBlank), raw_lProbChar(lProbChar)
36
+ {}
37
+
38
+ float_t get_lScore() const {
39
+ if (_lProb == NEG_INF) {
40
+ _lProb = log_sum_exp(lProbBlank, lProbChar);
41
+ }
42
+ return _lProb;
43
+ }
44
+
45
+ // float_t get_raw_lScore() const {
46
+ // return log_sum_exp(raw_lProbBlank, raw_lProbChar);
47
+ // }
48
+ };
49
+
50
+ typedef std::unordered_map<Prefix*, PrefixScore> PrefixMap;
51
+ typedef std::pair<Prefix*, PrefixScore> BeamItem;
52
+ typedef std::vector<BeamItem> Beam;
53
+
54
+ /*
55
+ Allows us to get an estimate of the vision model confidence, irrespective of how the language
56
+ model guided the decoding. NOTE: This scoring could follow an entirely different path than
57
+ the returned decoded sequence.
58
+ */
59
+ template<typename scalar_t>
60
+ scalar_t get_vision_confidence(const pred_seq_t<scalar_t> &logProbs, scalar_t minProb)
61
+ {
62
+ const int64_t T = logProbs.size(0);
63
+ const int64_t S = logProbs.size(1);
64
+
65
+ scalar_t ret = 0; // log(1)
66
+
67
+ for (size_t t = 0; t < T; ++t) {
68
+ float_t maxP = logProbs[t][0];
69
+ int64_t maxC = 0;
70
+ for (int64_t c = 1; c < S; ++c) {
71
+ float_t p = logProbs[t][c];
72
+ if (p > maxP) {
73
+ maxP = p;
74
+ maxC = c;
75
+ }
76
+ }
77
+ ret += maxP;
78
+ // Ignore everything past the sequence terminator
79
+ if (maxC == 1) {
80
+ break;
81
+ }
82
+
83
+ if (ret < minProb) {
84
+ break;
85
+ }
86
+ }
87
+
88
+ return ret;
89
+ }
90
+
91
+
92
+ template<typename scalar_t>
93
+ pair<vector<token_t>, float_t>
94
+ ctc_beam_decode_impl(const pred_seq_t<scalar_t> &probs, const int64_t beamSize,
95
+ const int64_t blank, scalar_t minProb,
96
+ const LanguageModel &langModel, scalar_t lmWeight)
97
+ {
98
+ if (blank != 0) {
99
+ throw runtime_error("Currently, only ordinal 0 supported for the blank prediction");
100
+ }
101
+
102
+ const int64_t T = probs.size(0);
103
+ const int64_t S = probs.size(1);
104
+
105
+ // NOTE: In log space, the following is true:
106
+ // 1. Adding two probabilities: log_sum_exp(l_p_a, l_p_b)
107
+ // 2. Multiplying two probabilities: l_p_a + l_p_b
108
+ // 3. log P(0) = -inf
109
+ // 4. log P(1) = 0
110
+
111
+ // Convert to log-space
112
+ if (minProb > 0) {
113
+ minProb = log(minProb);
114
+ } else {
115
+ minProb = NEG_INF;
116
+ }
117
+
118
+ auto retScore = get_vision_confidence(probs, minProb);
119
+
120
+ if (retScore < minProb) {
121
+ return { {}, NEG_INF };
122
+ }
123
+
124
+ PrefixAllocator prefixAlloc;
125
+
126
+ Beam beam;
127
+ beam.emplace_back(prefixAlloc.GetPrefix(), PrefixScore{0, NEG_INF}); // Add a dummy first node
128
+
129
+ Beam terminated;
130
+
131
+ typedef tuple<Prefix*, token_t> lm_cache_key_t;
132
+ unordered_map<lm_cache_key_t, float_t> lmScoreCache;
133
+
134
+ for (int64_t t = 0; t < T; ++t) {
135
+ PrefixMap nextBeam;
136
+
137
+ // Add all of the completed paths to the next beam.
138
+ // This allows us to accumulate new paths into these,
139
+ // but otherwise not process them
140
+ for (const BeamItem &prevNode : beam) {
141
+ if (prevNode.first->Token == 1) {
142
+ nextBeam.insert(prevNode);
143
+ }
144
+ }
145
+
146
+ // Loop over vocab
147
+ for (int64_t s = 0; s < S; ++s) {
148
+ float_t lpEmit = probs[t][s];
149
+
150
+ if (lpEmit < minProb) {
151
+ continue;
152
+ }
153
+
154
+ for (const BeamItem &prevNode : beam) {
155
+ Prefix *prevPrefix = prevNode.first;
156
+ const PrefixScore &prevScore = prevNode.second;
157
+
158
+ // Ignore already completed paths
159
+ if (prevPrefix->Token == 1) {
160
+ continue;
161
+ }
162
+
163
+ // Ignore impossible paths
164
+ if (prevScore.lProbBlank == NEG_INF && prevScore.lProbChar == NEG_INF) {
165
+ continue;
166
+ }
167
+
168
+ // If we propose a blank the prefix doesn't change.
169
+ // Only the probability of ending in blank gets updated.
170
+ if (s == blank) {
171
+ PrefixScore &score = nextBeam[prevPrefix];
172
+ score.lProbBlank = log_sum_exp(score.lProbBlank , prevScore.lProbBlank + lpEmit, prevScore.lProbChar + lpEmit);
173
+ // score.raw_lProbBlank = log_sum_exp(score.raw_lProbBlank, prevScore.raw_lProbBlank + lpEmit, prevScore.raw_lProbChar + lpEmit);
174
+ continue;
175
+ }
176
+
177
+ // Extend the prefix by the new character s and add it to the beam.
178
+ // Only the probability of not ending in blank gets updated.
179
+ token_t prevToken = prevPrefix->Token;
180
+
181
+ // NOTE: We always create a new prefix regardless of duplication because the PrefixScore
182
+ // is simultaneously tracking prefixes that do and don't end in a blank. And it's those
183
+ // that end in a blank that would cause the prefix to be extended.
184
+ auto extendPrefix = prefixAlloc.GetPrefix(s, prevPrefix);
185
+
186
+ // Evaluate the language model, but use the cache if we've already considered this string before
187
+ auto lmCacheItem = make_tuple(prevPrefix, s);
188
+ auto lmCacheIter = lmScoreCache.find(lmCacheItem);
189
+ float_t lpLang = 0;
190
+ if (lmCacheIter == lmScoreCache.end()) {
191
+ lpLang = langModel.ScoreTransition(prevPrefix, s);
192
+ lpLang *= lmWeight;
193
+ lmCacheIter = lmScoreCache.emplace(lmCacheItem, lpLang).first;
194
+ }
195
+ lpLang = lmCacheIter->second;
196
+
197
+ PrefixScore &extendScore = nextBeam[extendPrefix];
198
+ // Remember, adding two log probabilities is equivalent to multiplying two probabilities
199
+ if (s != prevToken) {
200
+ extendScore.lProbChar = log_sum_exp(extendScore.lProbChar, prevScore.lProbBlank + lpEmit + lpLang, prevScore.lProbChar + lpEmit + lpLang);
201
+ // extendScore.raw_lProbChar = log_sum_exp(extendScore.raw_lProbChar, prevScore.raw_lProbBlank + lpEmit , prevScore.raw_lProbChar + lpEmit );
202
+ } else {
203
+ // We don't include the previous probability of not ending in blank if s is repeated at the end. The CTC
204
+ // algorithm merges characters not separated by a blank.
205
+ extendScore.lProbChar = log_sum_exp(extendScore.lProbChar , prevScore.lProbBlank + lpEmit + lpLang);
206
+ // extendScore.raw_lProbChar = log_sum_exp(extendScore.raw_lProbChar, prevScore.raw_lProbBlank + lpEmit );
207
+ }
208
+
209
+ // If the token is repeated, we also have to deal with the unchanged prefix since repeated characters are collapsed
210
+ if (s == prevToken) {
211
+ PrefixScore &collapseScore = nextBeam[prevPrefix];
212
+ collapseScore.lProbChar = log_sum_exp(collapseScore.lProbChar , prevScore.lProbChar + lpEmit);
213
+ // collapseScore.raw_lProbChar = log_sum_exp(collapseScore.raw_lProbChar, prevScore.raw_lProbChar + lpEmit);
214
+ }
215
+
216
+ }
217
+ }
218
+
219
+ Beam vecNextBeam(begin(nextBeam), end(nextBeam));
220
+
221
+ if (vecNextBeam.size() > beamSize) {
222
+ partial_sort(begin(vecNextBeam), begin(vecNextBeam) + beamSize, end(vecNextBeam),
223
+ [] (const BeamItem &a, const BeamItem &b) {
224
+ return a.second.get_lScore() > b.second.get_lScore();
225
+ }
226
+ );
227
+ vecNextBeam.resize(beamSize);
228
+ }
229
+
230
+ beam = move(vecNextBeam);
231
+ }
232
+
233
+ // Find the best raw score
234
+ const BeamItem *bestItem = nullptr;
235
+ // for (const BeamItem &b : beam) {
236
+ // if (bestItem == nullptr or b.second.get_raw_lScore() > bestItem->second.get_raw_lScore()) {
237
+ // bestItem = &b;
238
+ // }
239
+ // }
240
+ if (! beam.empty()) {
241
+ bestItem = &beam[0];
242
+ }
243
+
244
+ if (bestItem != nullptr) {
245
+ auto retList = bestItem->first->ToList();
246
+
247
+ return { move(retList), retScore };
248
+ } else {
249
+ return { {}, NEG_INF };
250
+ }
251
+ }
252
+
253
+ typedef std::pair<Prefix*, float_t> RegBeamItem;
254
+
255
+ bool operator<(const RegBeamItem &a, const RegBeamItem &b) {
256
+ return a.second > b.second;
257
+ }
258
+
259
+ template<typename scalar_t>
260
+ pair<vector<token_t>, float_t>
261
+ reg_beam_decode_impl(const pred_seq_t<scalar_t> &logProbs, const int64_t beamSize,
262
+ scalar_t minProb,
263
+ const LanguageModel &langModel, scalar_t lmWeight)
264
+ {
265
+ const int64_t T = logProbs.size(0);
266
+ const int64_t S = logProbs.size(1);
267
+
268
+ // NOTE: In log space, the following is true:
269
+ // 1. Adding two probabilities: log_sum_exp(l_p_a, l_p_b)
270
+ // 2. Multiplying two probabilities: l_p_a + l_p_b
271
+ // 3. log P(0) = -inf
272
+ // 4. log P(1) = 0
273
+
274
+ // Convert to log-space
275
+ if (minProb > 0) {
276
+ minProb = log(minProb);
277
+ } else {
278
+ minProb = NEG_INF;
279
+ }
280
+
281
+ auto retScore = get_vision_confidence(logProbs, minProb);
282
+
283
+ if (retScore < minProb) {
284
+ return { {}, NEG_INF };
285
+ }
286
+
287
+ PrefixAllocator prefixAlloc;
288
+
289
+ vector<RegBeamItem> beam, nextBeam;
290
+ beam.emplace_back(prefixAlloc.GetPrefix(), 0); // log(1) = 0
291
+
292
+ for (int64_t t = 0; t < T && !beam.empty(); ++t) {
293
+ nextBeam.clear();
294
+
295
+ auto addToBeam = [&nextBeam, beamSize] (const RegBeamItem &rbi) {
296
+ nextBeam.push_back(rbi);
297
+ };
298
+
299
+ // Expand each path in the beam
300
+ for (const RegBeamItem &prevNode : beam) {
301
+ if (prevNode.first->Token == 1) {
302
+ // Move completed paths along without processing further
303
+ addToBeam(prevNode);
304
+ continue;
305
+ }
306
+
307
+ Prefix *prevPrefix = prevNode.first;
308
+ float_t prevScore = prevNode.second;
309
+
310
+ // Loop over vocab
311
+ for (int64_t s = 0; s < S; ++s) {
312
+ float_t lpEmit = logProbs[t][s];
313
+
314
+ if (lpEmit < minProb) {
315
+ // The probability dropped below threshold, so stop processing this path
316
+ continue;
317
+ }
318
+
319
+ auto extendPrefix = prefixAlloc.GetPrefix(s, prevPrefix);
320
+
321
+ float_t lpLang = langModel.ScoreTransition(prevPrefix, s);
322
+
323
+ float_t lpNext = prevScore + lpLang + lpEmit;
324
+
325
+ addToBeam({extendPrefix, lpNext});
326
+ }
327
+ }
328
+
329
+ if (nextBeam.size() > beamSize) {
330
+ // Find the top-k items, and then truncate the rest
331
+ partial_sort(begin(nextBeam), begin(nextBeam) + beamSize, end(nextBeam));
332
+ nextBeam.resize(beamSize);
333
+ }
334
+
335
+ std::swap(beam, nextBeam);
336
+ }
337
+
338
+ if (!beam.empty()) {
339
+ // The highest probability element will always be in the back
340
+ RegBeamItem rbi{ nullptr, NEG_INF };
341
+ for (auto &rb : beam) {
342
+ if (rbi.first == nullptr || rb.second > rbi.second) {
343
+ rbi = rb;
344
+ }
345
+ }
346
+
347
+ auto retList = rbi.first->ToList();
348
+
349
+ return { move(retList), retScore };
350
+ } else {
351
+ return { {}, NEG_INF };
352
+ }
353
+ }
354
+
355
+
356
+
357
// Decodes a whole batch of (T, S) log-probability sequences, dispatching each
// item either to the CTC-collapsing or the plain beam decoder. Batch items are
// independent, so they are processed in parallel with OpenMP.
//
// probsAccess - (N, T, S) log-probabilities
// retAccess   - output token ids, one row per batch item (pre-filled by caller)
// confAccess  - output per-item confidence, converted out of log-space here
template<typename scalar_t>
void dp_beam_decode_impl(const torch::TensorAccessor<scalar_t, 3> &probsAccess,
                         torch::TensorAccessor<int64_t, 2> retAccess,
                         torch::TensorAccessor<scalar_t, 1> confAccess,
                         int64_t beamSize, int64_t blank,
                         scalar_t minProb,
                         const LanguageModel *langModel,
                         scalar_t lmWeight,
                         bool combineDuplicates)
{
    const int64_t N = probsAccess.size(0);

    // Each batch item writes disjoint output rows, so the loop is safely parallel
    #pragma omp parallel for num_threads(8)
    for (int64_t i = 0; i < N; ++i) {
        vector<token_t> seq;
        float_t lConf;
        if (combineDuplicates) {
            tie(seq, lConf) = ctc_beam_decode_impl(probsAccess[i], beamSize, blank,
                                                   minProb,
                                                   *langModel, lmWeight);
        } else {
            tie(seq, lConf) = reg_beam_decode_impl(probsAccess[i], beamSize,
                                                   minProb,
                                                   *langModel, lmWeight);
        }

        // Truncate if the decoded sequence is longer than the output buffer
        int64_t sz = min<int64_t>(seq.size(), retAccess.size(1));

        for (int64_t k = 0; k < sz; ++k) {
            retAccess[i][k] = seq[k];
        }

        // lConf is a log-probability; store it as a plain probability
        confAccess[i] = exp(lConf);
    }
}
392
+
393
// Entry point for batched beam decoding.
//
// probs - plain (non-log) probabilities, either (N, T, C) or a single (T, C)
//         sequence; a 2-D input is unsqueezed here and the batch dimension is
//         collapsed again before returning.
// Returns { token ids (int64; initialized to 1, the terminator), confidences }.
std::tuple<torch::Tensor, torch::Tensor>
beam_decode(torch::Tensor probs, int64_t beamSize, int64_t blank,
            float minProb,
            const LanguageModel *langModel,
            float lmWeight,
            bool combineDuplicates)
{
    // Fall back to the neutral model so the decoders never see a null pointer
    if (langModel == nullptr) {
        langModel = &NullLanguageModel;
    }

    auto tStart = chrono::high_resolution_clock::now();

    // Accessors below require contiguous memory
    probs = probs.contiguous();

    bool collapse = false;
    if (probs.dim() == 2) {
        // N,T,C
        probs = probs.unsqueeze(0);
        collapse = true;
    }

    // The decoders work entirely in log-space
    probs = probs.log();

    // Output starts as all-1s (the terminator token), so untouched tail
    // positions read as end-of-sequence
    torch::Tensor ret = torch::ones({ probs.size(0), probs.size(1) }, torch::kInt64);
    torch::Tensor conf = torch::zeros({ probs.size(0) }, probs.options());

    auto retAccess = ret.accessor<int64_t, 2>();

    AT_DISPATCH_FLOATING_TYPES(
        probs.scalar_type(),
        "cpu_beam_decode",
        ([&] {
            dp_beam_decode_impl(
                probs.accessor<scalar_t, 3>(),
                retAccess,
                conf.accessor<scalar_t, 1>(),
                beamSize, blank,
                static_cast<scalar_t>(minProb),
                langModel,
                static_cast<scalar_t>(lmWeight),
                combineDuplicates
            );
        })
    );

    // Restore the caller's original (unbatched) shape
    if (collapse) {
        ret = ret.squeeze(0);
        conf = conf[0];
    }

    auto tEnd = chrono::high_resolution_clock::now();

    typedef chrono::duration<double, std::milli> tp_t;
    tp_t totalElapsed = tEnd - tStart;

    // Timing diagnostic written to stdout on every call
    cout << "Beam Decode " << probs.size(0) << " - "
         << "Total: " << totalElapsed.count() << "ms"
         << endl;

    return { ret, conf };
}
455
+
456
+ std::unique_ptr<LanguageModel> create_sbo_lm(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoffWeight)
457
+ {
458
+ return make_unique<SBO_LanguageModel>(dataFilePath, move(tokenMapping), backoffWeight);
459
+ }
nemotron-ocr/cpp/beam_decode/beam_decode.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <torch/torch.h>
7
+
8
+ #include "language_model.h"
9
+
10
+ std::tuple<torch::Tensor, torch::Tensor>
11
+ beam_decode(torch::Tensor probs, int64_t beamSize, int64_t blank,
12
+ float minProb,
13
+ const LanguageModel *langModel,
14
+ float lmWeight,
15
+ bool combineDuplicates);
16
+
17
+ std::unique_ptr<LanguageModel> create_sbo_lm(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoffWeight);
nemotron-ocr/cpp/beam_decode/kn_lm.cpp ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "kn_lm.h"
5
+
6
+ using namespace std;
7
+
8
+
9
// Loads the serialized n-gram tables (via the NGramLMBase constructor) and
// stores the Kneser-Ney discount subtracted from raw counts in ScoreTransition.
KN_LanguageModel::KN_LanguageModel(const string &dataFilePath, token_mapping_t tokenMapping, float_t knDelta)
    : NGramLMBase(dataFilePath, move(tokenMapping)), m_knDelta(knDelta)
{
}
13
+
14
+ float KN_LanguageModel::ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const
15
+ {
16
+ if (prefix.empty()) {
17
+ return ScoreUnigram(suffix);
18
+ } else {
19
+ return ScoreTransition(prefix, suffix);
20
+ }
21
+ }
22
+
23
// Unigram probability: count(token) / total-count, where unigram counts live
// in the order-1 table under the empty-string prefix.
float_t KN_LanguageModel::ScoreUnigram(const std::wstring &uni) const
{
    auto lIter = m_lookup[1].find(L""s);
    if (lIter == m_lookup[1].end()) {
        throw std::runtime_error("Unigrams not supported by this model!");
    }

    auto uniIter = lIter->second.find(uni);
    // Tiny floor so an unseen token never scores exactly zero
    float_t ctUni = 1e-8;
    if (uniIter != lIter->second.end()) {
        ctUni = uniIter->second;
    }

    // Denominator: total count of all unigrams
    float_t ctSuffixes = GetPrefixSum(L""s);

    return ctUni / ctSuffixes;
}
40
+
41
// Interpolated Kneser-Ney estimate of P(suffix | prefix):
//   max(count - delta, 0) / total  +  (delta * distinct / total) * P(suffix | shorter prefix)
// recursing on successively shorter prefixes down to the empty-prefix base case,
// which uses the continuation probability (distinct bigram contexts).
float_t KN_LanguageModel::ScoreTransition(const std::wstring &prefix, const std::wstring &suffix) const
{
    if (prefix.empty()) {
        // Base case: Kneser-Ney continuation probability.
        // The number of distinct bigrams that end with this token
        auto rlIter = m_reverseLookup.find(suffix);

        float_t ctEndingBigrams = 0;
        if (rlIter != m_reverseLookup.end()) {
            ctEndingBigrams = rlIter->second[2].size();
        }

        float_t ctAllBigrams = m_lookup[2].size();

        return ctEndingBigrams / ctAllBigrams;
    }

    // Counts for this prefix at order len(prefix)+1
    auto lIter = m_lookup[prefix.size() + 1].find(prefix);
    float_t ctUqSuffixes = 0;   // distinct suffixes seen after this prefix
    float_t ctSuffixes = 0;     // total count over all suffixes of this prefix
    float_t ctSuffix = 0;       // count of this specific (prefix, suffix) pair
    if (lIter != m_lookup[prefix.size() + 1].end()) {
        ctUqSuffixes = lIter->second.size();

        ctSuffixes = GetPrefixSum(prefix);

        auto sIter = lIter->second.find(suffix);
        if (sIter != lIter->second.end()) {
            ctSuffix = sIter->second;
        }
    }

    float_t factor = 0;  // back-off weight * lower-order probability
    float_t main = 0;    // discounted higher-order term
    if (ctSuffixes != 0) {
        factor = m_knDelta * ctUqSuffixes / ctSuffixes;
        // TODO: Figure out how to make this call without copying the string!
        // Recurse with the first character dropped from the context
        factor *= ScoreTransition({begin(prefix) + 1, end(prefix)}, suffix);

        main = max<float_t>(ctSuffix - m_knDelta, 0) / ctSuffixes;
    }

    float_t total = main + factor;

    return total;
}
nemotron-ocr/cpp/beam_decode/kn_lm.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <unordered_map>
7
+ #include <vector>
8
+
9
+ #include "ngram_lm_base.h"
10
+
11
+
12
// Character-level n-gram language model with Kneser-Ney smoothing.
// The count tables are loaded from a serialized data file by NGramLMBase;
// knDelta is the absolute discount applied to raw counts.
class KN_LanguageModel
    : public NGramLMBase
{
public:
    KN_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t knDelta);

protected:
    // Dispatches to the unigram or full KN estimate based on context length
    virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const override;

private:
    // Unigram probability with a small floor for unseen tokens
    float_t ScoreUnigram(const std::wstring &uni) const;
    // Recursive interpolated Kneser-Ney probability of suffix given prefix
    float_t ScoreTransition(const std::wstring &prefix, const std::wstring &suffix) const;

    // Absolute discount subtracted from each raw count
    float_t m_knDelta;
};
nemotron-ocr/cpp/beam_decode/language_model.cpp ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "language_model.h"
5
+
6
+ #include <locale>
7
+ #include <codecvt>
8
+
9
+ using namespace std;
10
+
11
// Shared neutral language-model instance used when no LM is configured;
// its ScoreTransition always returns log P(1) = 0.
const NullLanguageModel_t NullLanguageModel;

NullLanguageModel_t::NullLanguageModel_t()
    : LanguageModel({})  // no token mapping needed: nothing is ever looked up
{
}
17
+
18
+ TokenMappingWrapper::TokenMappingWrapper(token_mapping_t mapping)
19
+ : token_mapping(move(mapping))
20
+ {
21
+ for (const auto &mp : token_mapping) {
22
+ if (mp.second.size() == 1) {
23
+ wchar_t c = mp.second.front();
24
+ reverse_token_mapping.emplace(c, mp.first);
25
+ }
26
+ }
27
+ }
28
+
29
+ TokenMappingWrapper::Ptr create_token_mapping(token_mapping_t tokenMapping)
30
+ {
31
+ return make_shared<TokenMappingWrapper>(move(tokenMapping));
32
+ }
33
+
34
+
35
// Converts a (B, T) tensor of token ids into wide strings via the supplied
// token mapping, multiplying per-step probabilities (when given) into a single
// confidence per sequence.
// Token conventions: 0 = CTC blank (skipped), 1 = end-of-sequence (stops the
// row), 2 = rendered as '^'; everything else is looked up in the mapping.
template<typename token_t>
vector<tuple<wstring, float>>
decode_sequences_impl(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
                      c10::optional<torch::Tensor> probs)
{
    const token_mapping_t &mapping = tokenMapping->token_mapping;

    auto tokensAccess = tokens.accessor<token_t, 2>();

    // Default to probability 1 per sequence when no distribution was supplied;
    // a (B,) tensor is promoted to (B, 1) so the accessor below is uniform
    torch::Tensor pTens = probs.value_or(torch::ones({ tokens.size(0) }, torch::kFloat32));
    if (pTens.dim() == 1) {
        pTens = pTens.unsqueeze(1);
    }

    auto probsAccess = pTens.accessor<float, 2>();

    const int64_t B = tokens.size(0);
    const int64_t T = tokens.size(1);

    vector<tuple<wstring, float>> ret;

    for (int64_t b = 0; b < B; ++b) {
        wstring buff;

        float logProb = 0.0f; // log 1
        bool done = false;
        for (int64_t t = 0; t < T && ! done; ++t) {
            typename token_mapping_t::key_type tokIdx = tokensAccess[b][t];

            // Accumulate in log-space to avoid underflow on long sequences
            if (t < probsAccess.size(1)) {
                logProb += log(probsAccess[b][t]);
            }

            switch (tokIdx) {
                case 0:
                    // Blank char
                    continue;
                case 1:
                    // End of sequence char
                    done = true;
                    break;
                case 2:
                    buff.push_back('^');
                    break;
                default:
                    auto iter = mapping.find(tokIdx);
                    if (iter == mapping.end()) {
                        throw std::runtime_error("The token mapping doesn't contain an entry for index " + to_string(tokIdx));
                    }
                    buff += iter->second;
                    break;
            }
        }

        ret.emplace_back(move(buff), exp(logProb));
    }

    return ret;
}
94
+
95
// Validates inputs and dispatches decode_sequences_impl on the integral dtype
// of `tokens`. `probs`, when given, may be per-sequence (B,) or per-step (B,T).
vector<tuple<wstring, float>>
decode_sequences(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
                 c10::optional<torch::Tensor> probs)
{
    if (tokens.dim() != 2) {
        throw std::runtime_error("`tokens` must be 2-dimensions of type B,T!");
    }

    if (tokenMapping == nullptr) {
        throw std::runtime_error("Cannot supply a null token mapping!");
    }

    const token_mapping_t &mapping = tokenMapping->token_mapping;

    if (mapping.empty()) {
        throw std::runtime_error("The token mapping hasn't been initialized!");
    }

    if (probs.has_value()) {
        // The accessor in the impl is hard-coded to float32
        if (probs.value().scalar_type() != torch::kFloat32) {
            throw std::runtime_error("If the probability distribution is specified, then it must be of type `torch.float32`");
        }
        if (probs.value().size(0) != tokens.size(0)) {
            throw std::runtime_error("The probability distribution batch size doesn't match the tokens batch size!");
        }
        if (probs.value().dim() == 2 && probs.value().size(1) != tokens.size(1)) {
            throw std::runtime_error("Invalid probability distribution shape!");
        }
    }

    vector<tuple<wstring, float>> ret;

    AT_DISPATCH_INTEGRAL_TYPES(
        tokens.scalar_type(),
        "decode_sequences_impl",
        ([&] {
            ret = decode_sequences_impl<scalar_t>(tokens, tokenMapping, probs);
        })
    );

    return ret;
}
137
+
138
+
139
// Narrows a wide string to a UTF-8 encoded std::string.
// NOTE(review): std::codecvt_utf8 / std::wstring_convert are deprecated as of
// C++17; if the toolchain moves beyond C++17 this will need a replacement.
std::string ws2s(const std::wstring& wstr)
{
    using convert_typeX = std::codecvt_utf8<wchar_t>;
    std::wstring_convert<convert_typeX, wchar_t> converterX;

    return converterX.to_bytes(wstr);
}
146
+
nemotron-ocr/cpp/beam_decode/language_model.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <memory>
7
+
8
+ #include <torch/torch.h>
9
+
10
+ #include "prefix.h"
11
+ #include "log_sum_exp.h"
12
+
13
+ typedef std::unordered_map<int64_t, std::wstring> token_mapping_t;
14
+ typedef std::unordered_map<wchar_t, int64_t> reverse_token_mapping_t;
15
+
16
+
17
// Abstract scorer used by the beam decoders: given the prefix decoded so far
// and a candidate next token, returns a log-probability adjustment.
class LanguageModel
{
public:
    virtual ~LanguageModel() {}

    // Returns the log-score for extending prefix p with nextToken
    // (NEG_INF marks an impossible transition in concrete models).
    virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const = 0;

    const token_mapping_t &TokenMapping() const { return m_tokenMapping; }

protected:
    LanguageModel(token_mapping_t tokenMapping)
        : m_tokenMapping(std::move(tokenMapping))
    {}

    // Maps token ordinals to the (possibly multi-character) strings they render as
    token_mapping_t m_tokenMapping;
};


// Neutral scorer used when no language model is configured.
class NullLanguageModel_t
    : public LanguageModel
{
public:
    NullLanguageModel_t();

    virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const override
    {
        // log P(1)
        // Which means the probability is unchanged
        return 0;
    }
};

// Shared immutable instance of the neutral model (defined in language_model.cpp)
extern const NullLanguageModel_t NullLanguageModel;

// Pairs the forward token mapping with its char->token inverse (built only
// from single-character entries) so both directions travel together.
struct TokenMappingWrapper
{
    typedef std::shared_ptr<TokenMappingWrapper> Ptr;

    TokenMappingWrapper(token_mapping_t mapping);

    token_mapping_t token_mapping;
    reverse_token_mapping_t reverse_token_mapping;
};
60
+
61
+ TokenMappingWrapper::Ptr create_token_mapping(token_mapping_t tokenMapping);
62
+
63
+ std::vector<std::tuple<std::wstring, float>>
64
+ decode_sequences(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
65
+ c10::optional<torch::Tensor> probs = torch::nullopt);
nemotron-ocr/cpp/beam_decode/log_sum_exp.cpp ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "log_sum_exp.h"
5
+
6
+ const float_t NEG_INF = -std::numeric_limits<float_t>::infinity();
nemotron-ocr/cpp/beam_decode/log_sum_exp.h ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <cmath>
7
+ #include <limits>
8
+ #include <algorithm>
9
+
10
+ typedef float float_t;
11
+ extern const float_t NEG_INF;
12
+
13
// ---- Variadic helpers for numerically stable log-sum-exp ----

// Maximum of a single value: identity (recursion/fold terminator).
template<typename T>
inline T max_val(T v)
{
    return v;
}

// Maximum over an arbitrary argument list, computed with a C++17 fold.
template<typename T, typename ...Args>
inline T max_val(T v, Args... rest)
{
    T best = v;
    ((best = std::max(best, rest)), ...);
    return best;
}

// exp(v - maxVal) for one value; subtracting the max keeps exp() in range.
template<typename T>
inline T sum_exp(T maxVal, T v)
{
    return std::exp(v - maxVal);
}

// Sum of exp(x - maxVal) over every argument, via a C++17 fold.
template<typename T, typename ...Args>
inline T sum_exp(T maxVal, T v, Args... rest)
{
    return std::exp(v - maxVal) + (std::exp(rest - maxVal) + ...);
}

// log(sum_i exp(x_i)) computed without overflow by factoring out the maximum.
// Returns -inf when every input is -inf (i.e. total probability zero).
template<typename T, typename ...Args>
inline T log_sum_exp(T v, Args ...args)
{
    const T peak = max_val(v, args...);

    if (peak == -std::numeric_limits<T>::infinity()) {
        return peak;
    }

    return peak + std::log(sum_exp(peak, v, args...));
}
nemotron-ocr/cpp/beam_decode/ngram_lm_base.cpp ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "ngram_lm_base.h"
5
+
6
+ #include <iostream>
7
+ #include <fstream>
8
+
9
+ #if defined( USE_BOOST )
10
+
11
+ #include <boost/archive/binary_oarchive.hpp>
12
+ #include <boost/archive/binary_iarchive.hpp>
13
+ #include <boost/serialization/vector.hpp>
14
+ #include <boost/serialization/string.hpp>
15
+ #include <boost/serialization/unordered_map.hpp>
16
+
17
+ #endif // USE_BOOST
18
+
19
+ using namespace std;
20
+
21
+ const std::wstring WORD_END(1, 2);
22
+ const std::wstring NUMERIC(1, 3);
23
+ const std::wstring UNMODELED(1, 4);
24
+
25
// Bundles the forward and reverse n-gram lookup tables so they can be
// serialized together (the serialize member is used by the USE_BOOST build).
struct LMStorage
{
    lookup_t Lookup;
    reverse_lookup_t ReverseLookup;

    template<class Archive>
    void serialize(Archive &ar, const unsigned int version) {
        ar & Lookup;
        ar & ReverseLookup;
    }
};
36
+
37
// Writes one suffix->count map: element count, then for each entry the key
// length, the raw wchar_t key bytes, and the uint32 count.
// NOTE: this raw-byte layout depends on sizeof(wchar_t), sizeof(size_t) and
// endianness, so data files are only portable between identical ABIs.
void save_suffix_map(std::fstream& fs, const suffix_map_t& suffix_map)
{
    // write out number of elements for Lookup
    std::size_t suffix_map_count = suffix_map.size();
    fs.write((char*)(&suffix_map_count), sizeof(suffix_map_count));
    for (suffix_map_t::const_iterator reverse_lookup_it = suffix_map.begin(); reverse_lookup_it != suffix_map.end(); ++reverse_lookup_it)
    {
        // write out the key
        size_t key_len = reverse_lookup_it->first.length();
        fs.write((char*)(&key_len), sizeof(key_len));
        fs.write((char*)(reverse_lookup_it->first.data()), key_len * sizeof(wchar_t));

        // write out value
        fs.write((char*)(&reverse_lookup_it->second), sizeof(reverse_lookup_it->second));
    }
}

// Serializes the forward table: for each n-gram order in sequence, the map
// size followed by every (prefix, suffix-map) pair. Mirrored by load_lookup.
void save_lookup(std::fstream& fs, const lookup_t& lookup)
{
    // write out number of elements for Lookup
    std::size_t lookup_count = lookup.size();
    fs.write((char*)(&lookup_count), sizeof(lookup_count));
    for (lookup_t::const_iterator lookup_it = lookup.begin(); lookup_it != lookup.end(); ++lookup_it)
    {
        // write out element map size
        std::size_t map_elem_count = lookup_it->size();
        fs.write((char*)(&map_elem_count), sizeof(map_elem_count));

        for (string_suffix_map_t::const_iterator str_sfx_it = lookup_it->begin(); str_sfx_it != lookup_it->end(); ++str_sfx_it)
        {
            // write out key
            size_t key_len = str_sfx_it->first.length();
            fs.write((char*)(&key_len), sizeof(key_len));
            fs.write((char*)(str_sfx_it->first.data()), key_len * sizeof(wchar_t));
            save_suffix_map(fs, str_sfx_it->second);
        }
    }
}

// Serializes the reverse table: for each suffix key, the key bytes followed by
// its per-order vector of suffix maps. Mirrored by load_reverse_lookup.
void save_reverse_lookup(std::fstream& fs, const reverse_lookup_t& reverse_lookup)
{
    // write out number of elements for Lookup
    std::size_t reverse_lookup_count = reverse_lookup.size();
    fs.write((char*)(&reverse_lookup_count), sizeof(reverse_lookup_count));
    for (reverse_lookup_t::const_iterator reverse_lookup_it = reverse_lookup.begin(); reverse_lookup_it != reverse_lookup.end(); ++reverse_lookup_it)
    {
        // write out the key
        size_t key_len = reverse_lookup_it->first.length();
        fs.write((char*)(&key_len), sizeof(key_len));
        fs.write((char*)(reverse_lookup_it->first.data()), key_len * sizeof(wchar_t));

        // write out value vector length
        size_t val_vec_len = reverse_lookup_it->second.size();
        fs.write((char*)(&val_vec_len), sizeof(val_vec_len));

        for (suffix_map_vec_t::const_iterator val_vec_it = reverse_lookup_it->second.begin();
             val_vec_it != reverse_lookup_it->second.end();
             ++val_vec_it)
        {
            save_suffix_map(fs, *val_vec_it);
        }
    }
}
100
+
101
// Reads one suffix->count map in the exact layout written by save_suffix_map.
void load_suffix_map(std::fstream& fs, suffix_map_t& suffix_map)
{
    // read in number of elements
    std::size_t suffix_map_count = 0;
    fs.read((char*)(&suffix_map_count), sizeof(suffix_map_count));
    for (size_t suffix_map_index = 0; suffix_map_index < suffix_map_count; ++suffix_map_index )
    {
        // read in key
        std::size_t key_len = 0;
        fs.read((char*)(&key_len), sizeof(key_len));

        std::wstring wkey(key_len, 0);
        fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
        uint32_t value = 0;
        fs.read((char*)(&value), sizeof(value));

        suffix_map.insert(std::make_pair(wkey, value));
    }
}

// Reads the forward table in the exact layout written by save_lookup,
// appending one string_suffix_map_t per n-gram order.
void load_lookup(std::fstream& fs, lookup_t& lookup)
{
    // read in number of elements
    std::size_t lookup_count = 0;
    fs.read((char*)(&lookup_count), sizeof(lookup_count));
    for (size_t lookup_index = 0; lookup_index < lookup_count; ++lookup_index)
    {
        std::size_t map_elem_count = 0;
        fs.read((char*)(&map_elem_count), sizeof(map_elem_count));

        lookup.push_back(string_suffix_map_t());
        string_suffix_map_t& str_sfx_map = lookup.back();

        for (size_t str_sfx_map_index = 0; str_sfx_map_index < map_elem_count; ++str_sfx_map_index)
        {
            std::size_t key_len = 0;
            fs.read((char*)(&key_len), sizeof(key_len));

            std::wstring wkey(key_len, 0);
            fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
            // Insert an empty map first, then fill it in place
            str_sfx_map.insert(std::make_pair<wstring, suffix_map_t>(std::wstring(wkey), suffix_map_t()));
            suffix_map_t& suffix_map = str_sfx_map[wkey];

            load_suffix_map(fs, suffix_map);
        }
    }
}

// Reads the reverse table in the exact layout written by save_reverse_lookup.
void load_reverse_lookup(std::fstream& fs, reverse_lookup_t& reverse_lookup)
{
    // read in number of elements
    std::size_t reverse_lookup_count = 0;
    fs.read((char*)(&reverse_lookup_count), sizeof(reverse_lookup_count));
    for (size_t rev_lookup_index = 0; rev_lookup_index < reverse_lookup_count; ++rev_lookup_index )
    {
        // read in the key
        std::size_t key_len = 0;
        fs.read((char*)(&key_len), sizeof(key_len));

        std::wstring wkey(key_len, 0);
        fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
        // Insert an empty vector first, then fill it in place
        reverse_lookup.insert(std::make_pair(wkey, suffix_map_vec_t()));
        suffix_map_vec_t& val_vec = reverse_lookup[wkey];

        std::size_t val_vec_len = 0;
        fs.read((char*)(&val_vec_len), sizeof(val_vec_len));

        for (size_t val_vec_index = 0; val_vec_index < val_vec_len; ++val_vec_index)
        {
            val_vec.push_back(suffix_map_t());
            suffix_map_t& suffix_map = val_vec.back();
            load_suffix_map(fs, suffix_map);
        }
    }
}
176
+
177
+ #if ! defined( USE_BOOST )
178
+
179
// Loads the lookup tables written by save_ngram_data_file (raw binary,
// non-boost build) and precomputes, for every prefix, the total count over all
// of its suffixes — the denominator used when turning counts into probabilities.
NGramLMBase::NGramLMBase(const string &dataFilePath, token_mapping_t tokenMapping)
    : LanguageModel(move(tokenMapping))
{
    std::fstream in(dataFilePath, std::ios::in | std::ios::binary);
    load_lookup(in, m_lookup);
    load_reverse_lookup(in, m_reverseLookup);

    // ConvertToString walks the context with a fixed 10-slot stack
    if (m_lookup.size() >= 10) {
        throw runtime_error("Only N-Grams of 9 or less are supported!");
    }

    // Sum the suffix counts under each prefix, across all n-gram orders
    for (auto &ngLevel : m_lookup) {
        for (auto &kvPrefixLevel : ngLevel) {
            uint32_t ct = 0;
            for (auto &kvSfx : kvPrefixLevel.second) {
                ct += kvSfx.second;
            }
            m_prefixSumLookup.emplace(kvPrefixLevel.first, ct);
        }
    }
}
200
+
201
// Writes both lookup tables to outputPath in the raw binary layout that the
// (non-boost) NGramLMBase constructor reads back.
void save_ngram_data_file(const lookup_t& lookup, const reverse_lookup_t& reverseLookup, const std::string &outputPath)
{
    std::fstream out(outputPath, std::ios::out | std::ios::binary);

    save_lookup(out, lookup);
    save_reverse_lookup(out, reverseLookup);
}
208
+
209
+ #else // USE_BOOST
210
+
211
// Boost-serialization variant of the loader: the whole LMStorage bundle is
// read with a binary_iarchive instead of the hand-rolled format above.
NGramLMBase::NGramLMBase(const string &dataFilePath, token_mapping_t tokenMapping)
    : LanguageModel(move(tokenMapping))
{
    {
        ifstream dfStr(dataFilePath, ios_base::in | ios_base::binary);
        boost::archive::binary_iarchive ia(dfStr);

        LMStorage s;
        ia >> s;


        m_lookup = move(s.Lookup);

        m_reverseLookup = move(s.ReverseLookup);
    }

    // ConvertToString walks the context with a fixed 10-slot stack
    if (m_lookup.size() >= 10) {
        throw runtime_error("Only N-Grams of 9 or less are supported!");
    }

    // Sum the suffix counts under each prefix, across all n-gram orders
    for (auto &ngLevel : m_lookup) {
        for (auto &kvPrefixLevel : ngLevel) {
            uint32_t ct = 0;
            for (auto &kvSfx : kvPrefixLevel.second) {
                ct += kvSfx.second;
            }
            m_prefixSumLookup.emplace(kvPrefixLevel.first, ct);
        }
    }
}

// Boost-serialization variant of the writer.
// NOTE(review): this overload takes the tables by value, unlike the non-boost
// overload which takes const refs — confirm which signature callers expect.
void save_ngram_data_file(lookup_t lookup, reverse_lookup_t reverseLookup, const std::string &outputPath)
{
    ofstream ofs(outputPath, ios_base::out | ios_base::binary);

    LMStorage s;
    s.Lookup = move(lookup);
    s.ReverseLookup = move(reverseLookup);

    boost::archive::binary_oarchive oa(ofs);
    oa << s;
}
253
+
254
+ #endif // USE_BOOST
255
+
256
// Adapts the beam decoder's (Prefix*, token) interface to the string-based
// n-gram scorer: renders the decoded prefix into the model's internal
// alphabet, maps the candidate token to its surrogate character class, then
// returns the log of the smoothed probability (NEG_INF when impossible).
float_t NGramLMBase::ScoreTransition(const Prefix *p, token_t nextToken) const
{
    std::wstring prefix;
    if (! ConvertToString(p, prefix)) {
        return NEG_INF;
    }

    const std::wstring *pSuffix = nullptr;

    if (nextToken != 1) {
        auto iter = m_tokenMapping.find(nextToken);
        if (iter == m_tokenMapping.end()) {
            // Token never seen by the model; score it as the generic unknown class
            pSuffix = &UNMODELED;
        } else {
            pSuffix = &iter->second;

            // All digits collapse into a single NUMERIC class
            if (iswdigit(pSuffix->at(0))) {
                pSuffix = &NUMERIC;
            }
        }

    } else {
        // Token 1 is the sequence terminator, modeled as the end-of-word mark
        pSuffix = &WORD_END;
    }

    float_t ret = ScoreTransitionImpl(prefix, *pSuffix);

    // Convert the probability to log-space; zero probability becomes NEG_INF
    if (ret > 0) {
        return log(ret);
    } else {
        return NEG_INF;
    }
}
289
+
290
// Renders the linked-list prefix into a wstring in oldest-to-newest order,
// keeping at most (n-gram order - 1) context entries. Digits collapse to
// NUMERIC, unmapped tokens to UNMODELED, token 1 to WORD_END, and blanks (0)
// produce nothing. Always returns true in the current implementation.
bool NGramLMBase::ConvertToString(const Prefix *p, std::wstring &prefix) const
{
    // Walk the parent chain newest-first onto a fixed stack; capacity 10
    // matches the order-<10 check enforced in the constructor
    const Prefix *stk[10];
    int32_t sz = -1;
    const Prefix *curr = p;
    decltype(sz) mlSz{(int)m_lookup.size() - 2};
    while (curr && sz < mlSz) {
        stk[++sz] = curr;
        curr = curr->Parent;
    }

    // Either blank or empty prefix
    if (sz < 1) { return true; }

    // NOTE(review): the deepest collected entry is deliberately skipped here —
    // presumably the root sentinel node created by PrefixAllocator::GetPrefix();
    // confirm against the Prefix allocator before relying on this.
    --sz;
    for (; sz >= 0; --sz) {
        token_t tok = stk[sz]->Token;
        // End of word token, which maps to the null character
        if (tok == 1) {
            prefix.push_back(WORD_END[0]);
        } else if (tok == 0) {
            // Do nothing
        } else {
            auto iter = m_tokenMapping.find(tok);
            if (iter == m_tokenMapping.end()) {
                prefix += UNMODELED;
            } else {
                const std::wstring &wChar = iter->second;

                if (iswdigit(wChar[0])) {
                    prefix += NUMERIC;
                } else {
                    prefix += wChar;
                }
            }
        }
    }

    return true;
}
nemotron-ocr/cpp/beam_decode/ngram_lm_base.h ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <unordered_map>
#include <vector>

#include "language_model.h"

// #define USE_BOOST 1

// Maps a suffix string to its observed count.
typedef std::unordered_map<std::wstring, uint32_t> suffix_map_t;

/* Tells us the number of suffixes for a given ngram of order K
   Keys:
     1. NGram Order (index into the lookup_t vector)
     2. Prefix
     3. Suffix
   Value:
     Count
*/
typedef std::unordered_map<std::wstring, suffix_map_t> string_suffix_map_t;
typedef std::vector<string_suffix_map_t> lookup_t;
/* Tells us the number of K-gram prefixes found for a given suffix
   Keys:
     1. Suffix
     2. NGram Order (index into the suffix_map_vec_t vector)
     3. Prefix
   Values:
     Count
*/
typedef std::vector<suffix_map_t> suffix_map_vec_t;
typedef std::unordered_map<std::wstring, suffix_map_vec_t> reverse_lookup_t;

// Sentinel strings used when converting prefixes/tokens to model strings
// (defined in the corresponding .cpp):
//   WORD_END  - end-of-word marker (token 1)
//   NUMERIC   - shared class for all digit tokens
//   UNMODELED - class for tokens outside the modeled vocabulary
extern const std::wstring WORD_END;
extern const std::wstring NUMERIC;
extern const std::wstring UNMODELED;

// Common base for character n-gram language models used by the beam decoder.
// Owns the forward/reverse count tables and handles prefix-to-string
// conversion; derived classes implement the actual smoothing/scoring
// (ScoreTransitionImpl), which returns a raw (non-log) probability.
class NGramLMBase
    : public LanguageModel
{
public:
    // Returns the log-probability of appending nextToken to prefix p.
    virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const override;

protected:
    NGramLMBase(const std::string &dataFilePath, token_mapping_t tokenMapping);

    // Raw (non-log) probability of `suffix` following `prefix`; implemented
    // by the concrete smoothing scheme (e.g. stupid backoff, Kneser-Ney).
    virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const = 0;

    // Converts the Prefix chain into the wide-string context used as a key
    // into the lookup tables.
    bool ConvertToString(const Prefix *p, std::wstring &prefix) const;

    // Total count of all ngrams sharing `prefix` (0 if the prefix is unseen).
    float_t GetPrefixSum(const std::wstring &prefix) const;

    lookup_t m_lookup;
    reverse_lookup_t m_reverseLookup;

    std::unordered_map<std::wstring, uint32_t> m_prefixSumLookup;
};

// Serializes the count tables to `output_path`. Two signatures exist because
// the Boost serialization path moves the tables into an archive struct.
#if ! defined( USE_BOOST )
void save_ngram_data_file(const lookup_t& lookup, const reverse_lookup_t& reverseLookup, const std::string &output_path);
#else // USE_BOOST
void save_ngram_data_file(lookup_t lookup, reverse_lookup_t reverseLookup, const std::string &output_path);
#endif // USE_BOOST

inline float_t NGramLMBase::GetPrefixSum(const std::wstring &prefix) const
{
    auto iter = m_prefixSumLookup.find(prefix);

    if (iter == m_prefixSumLookup.end()) {
        return 0;
    } else {
        return iter->second;
    }
}
nemotron-ocr/cpp/beam_decode/prefix.cpp ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "prefix.h"
5
+
6
+ using namespace std;
7
+
8
+ vector<token_t> Prefix::ToList() const
9
+ {
10
+ vector<token_t> ret;
11
+
12
+ auto curr = this;
13
+
14
+ while (curr) {
15
+ if (curr->Token != 0) {
16
+ ret.push_back(curr->Token);
17
+ }
18
+ curr = curr->Parent;
19
+ }
20
+
21
+ return { rbegin(ret), rend(ret) };
22
+ }
nemotron-ocr/cpp/beam_decode/prefix.h ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <cstdint>
#include <cstdlib>
#include <list>
#include <memory>
#include <new>
#include <tuple>
#include <unordered_map>
#include <vector>

typedef int32_t token_t;

class Prefix;

// typedef std::shared_ptr<Prefix> PrefixPtr;

// A node in a shared prefix trie: each Prefix stores one token plus a
// non-owning pointer to its parent, so a full decoding hypothesis is the
// chain from a node back to the root. Token 0 is the blank token.
class Prefix
{
public:
    token_t Token;
    Prefix *Parent;

    Prefix(token_t token = 0 /* blank */, Prefix *parent = nullptr)
        : Token(token), Parent(parent)
    {}

    // Flattens the chain into root-first token order, skipping blanks
    // (implemented in prefix.cpp).
    std::vector<token_t> ToList() const;

    // Number of nodes in the chain, including this one.
    size_t size() const;
};


///// Borrowed from Boost libraries
template<typename T>
void hash_combine(size_t & seed, T const& v)
{
    seed ^= std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
/////

namespace std {
    // Hash of the token sequence represented by the chain; blank (0) tokens
    // are skipped so two chains that differ only in blanks hash equally.
    template<>
    struct hash<Prefix*>
    {
        size_t operator()(const Prefix *p) const noexcept
        {
            size_t seed = 0;

            while (p) {
                if (p->Token != 0) {
                    hash_combine(seed, p->Token);
                }
                p = p->Parent;
            }
            return seed;
        }
    };

    template<>
    struct hash<tuple<Prefix*, token_t>>
    {
        size_t operator()(const tuple<Prefix*, token_t> &t) const noexcept
        {
            size_t seed = 0;
            hash_combine(seed, get<0>(t));
            hash_combine(seed, get<1>(t));
            return seed;
        }
    };

    // Structural equality: walks both chains comparing tokens element-wise.
    // NOTE: unlike hash<Prefix*>, blanks are NOT skipped here; chains are
    // equal only if they have the same length and identical tokens.
    template<>
    struct equal_to<Prefix*>
    {
        bool operator()(const Prefix *a, const Prefix *b) const noexcept
        {
            while (a != nullptr && b != nullptr) {
                if (a->Token != b->Token) {
                    return false;
                }
                a = a->Parent;
                b = b->Parent;
            }
            // If one chain is shorter than the other
            return a == b;
        }
    };
}

inline size_t Prefix::size() const
{
    size_t ret = 0;
    auto p = this;
    while (p != nullptr) {
        ret += 1;
        p = p->Parent;
    }
    return ret;
}


// Bump allocator for Prefix nodes. Buffers grow geometrically and are only
// released when the allocator is destroyed; individual Prefixes are never
// freed, which matches the beam decoder's "allocate during decode, drop
// everything at the end" usage pattern.
class PrefixAllocator
{
public:
    PrefixAllocator() = default;
    ~PrefixAllocator();

    // Constructs a Prefix in the current buffer, forwarding ctor arguments.
    template<typename ...Args>
    Prefix *GetPrefix(Args&& ...ctorArgs);

private:
    void AllocateNextBuffer();

    std::list<Prefix*> m_buffers;
    size_t m_allocSize = 0;
    size_t m_currOff = 0;
};

inline PrefixAllocator::~PrefixAllocator()
{
    for (auto p : m_buffers) {
        // Prefix is trivially destructible and the storage came from malloc,
        // so releasing with free() (no destructor calls) is safe.
        // delete[] p;
        free(p);
    }
}

inline void PrefixAllocator::AllocateNextBuffer()
{
    size_t nextSize = m_allocSize == 0 ? 1000 : 2 * m_allocSize;

    // Using malloc here to prevent the ctor of Prefix being called for each item.
    // Instead, the ctor will be called upon first access using GetPrefix
    auto pBuff = reinterpret_cast<Prefix*>(malloc(sizeof(Prefix) * nextSize));
    if (pBuff == nullptr) {
        // malloc signals failure with nullptr; surface it like operator new
        throw std::bad_alloc();
    }

    m_buffers.push_back(pBuff);

    m_allocSize = nextSize;
    m_currOff = 0;
}

template<typename ...Args>
Prefix *PrefixAllocator::GetPrefix(Args&& ...ctorArgs)
{
    if (m_currOff == m_allocSize) {
        AllocateNextBuffer();
    }

    auto buff = m_buffers.back() + m_currOff;

    // Placement-new into the raw buffer; this is the deferred construction
    // mentioned in AllocateNextBuffer.
    auto ret = new (buff) Prefix(std::forward<Args>(ctorArgs)...);

    ++m_currOff;

    return ret;
}
nemotron-ocr/cpp/beam_decode/sbo_lm.cpp ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "sbo_lm.h"
5
+
6
+ #include <assert.h>
7
+
8
+ // Reference paper: https://www.aclweb.org/anthology/D07-1090.pdf
9
+
10
+
11
+ SBO_LanguageModel::SBO_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoff)
12
+ : NGramLMBase(dataFilePath, move(tokenMapping)), m_backoff(backoff)
13
+ {
14
+ }
15
+
16
+ float SBO_LanguageModel::ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const
17
+ {
18
+ auto lIter = m_lookup[prefix.size() + 1].find(prefix);
19
+
20
+ // This prefix doesn't exist. Shrink it!
21
+ if (lIter == m_lookup[prefix.size() + 1].end()) {
22
+ return m_backoff * ScoreTransitionImpl({ begin(prefix) + 1, end(prefix) }, suffix);
23
+ }
24
+
25
+ const suffix_map_t &suffixMap = lIter->second;
26
+
27
+ auto sfIter = suffixMap.find(suffix);
28
+
29
+ if (sfIter == suffixMap.end()) {
30
+ // This is a novel character entirely!
31
+ if (prefix.empty()) {
32
+ return 1e-8;
33
+ } else {
34
+ return m_backoff * ScoreTransitionImpl({ begin(prefix) + 1, end(prefix) }, suffix);
35
+ }
36
+ }
37
+
38
+ float_t ctSuffix = sfIter->second;
39
+ float_t ctNgram = GetPrefixSum(prefix);
40
+
41
+ float_t score = ctSuffix / ctNgram;
42
+
43
+ assert(score >= 0 && score <= 1);
44
+
45
+ return score;
46
+ }
nemotron-ocr/cpp/beam_decode/sbo_lm.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

// NOTE(review): only NGramLMBase is needed here; this presumably relies on
// kn_lm.h including ngram_lm_base.h transitively. Consider including
// "ngram_lm_base.h" directly — confirm no translation unit depends on the
// transitive kn_lm.h include first.
#include "kn_lm.h"


// "Stupid backoff" n-gram language model (Brants et al., 2007): scores a
// suffix given a context, multiplying by a constant backoff factor each time
// the context must be shortened because it was unseen in training.
class SBO_LanguageModel
    : public NGramLMBase
{
public:
    // backoff: constant multiplicative penalty applied per backoff step.
    SBO_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoff);

protected:
    // Returns the raw (non-log) stupid-backoff score for suffix|prefix.
    virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const override;

private:
    float_t m_backoff;
};
nemotron-ocr/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "grid_sample.h"
5
+ #include "gpu_grid_sample_utils.cuh"
6
+
7
+ template<typename T>
8
+ void indirect_grid_sample_forward_bilinear(torch::TensorAccessor<T, 4> input,
9
+ torch::TensorAccessor<T, 4> grid,
10
+ torch::TensorAccessor<int64_t, 1> inputIndices,
11
+ torch::TensorAccessor<T, 4> output)
12
+ {
13
+ const int64_t N = inputIndices.size(0);
14
+ const int64_t C = output.size(1);
15
+
16
+ T fInputHeight = input.size(2);
17
+ T fInputWidth = input.size(3);
18
+ int64_t outputHeight = output.size(2);
19
+ int64_t outputWidth = output.size(3);
20
+
21
+ #pragma omp parallel for num_threads(8)
22
+ for (int64_t i = 0; i < N; ++i) {
23
+ int64_t inputIdx = inputIndices[i];
24
+
25
+ for (int64_t c = 0; c < C; ++c) {
26
+ for (int64_t outY = 0; outY < outputHeight; ++outY) {
27
+ for (int64_t outX = 0; outX < outputWidth; ++outX) {
28
+ T u = grid[i][outY][outX][0];
29
+ T v = grid[i][outY][outX][1];
30
+
31
+ if (u < -1 || u > 1 || v < -1 || v > 1) {
32
+ output[i][c][outY][outX] = 0;
33
+ continue;
34
+ }
35
+
36
+ // Denormalize the coordinates
37
+ u = (u + 1) * ((fInputWidth - 1) / 2);
38
+ v = (v + 1) * ((fInputHeight - 1) / 2);
39
+
40
+ // Calculate coordinates
41
+ const T inX = u;
42
+ const T inXint = std::floor(inX);
43
+ const T inXfrac = inX - inXint;
44
+
45
+ const T inY = v;
46
+ const T inYint = std::floor(inY);
47
+ const T inYfrac = inY - inYint;
48
+
49
+ T ps[] = { 1 - inXfrac, inXfrac };
50
+ T rs[] = { 1 - inYfrac, inYfrac };
51
+ T opVal = 0;
52
+
53
+ #pragma unroll
54
+ for (int64_t row = 0; row < 2; ++row) {
55
+ #pragma unroll
56
+ for (int64_t col = 0; col < 2; ++col) {
57
+ T Tpx = utils::get_pixel_clamped(input, inputIdx, c, inXint + col, inYint + row);
58
+ opVal += rs[row] * ps[col] * Tpx;
59
+ }
60
+ }
61
+
62
+ output[i][c][outY][outX] = opVal;
63
+ }
64
+ }
65
+ }
66
+ }
67
+ }
68
+
69
// Entry point for the CPU path. Allocates the output tensor
// (N_indices x C x gridH x gridW) and dispatches on the input's
// floating-point dtype. Only "bilinear" sampling is implemented; any other
// method raises std::runtime_error.
torch::Tensor cpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid,
                                               torch::Tensor inputIndices, const std::string &method)
{
    auto output = input.new_empty({ inputIndices.size(0), input.size(1), grid.size(1), grid.size(2) });

    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(),
        "cpu_indirect_grid_sample_forward_impl",
        ([&] {
            // scalar_t is injected by the dispatch macro
            typedef scalar_t T;
            if (method == "bilinear") {
                indirect_grid_sample_forward_bilinear(
                    input.accessor<T, 4>(),
                    grid.accessor<T, 4>(),
                    inputIndices.accessor<int64_t, 1>(),
                    output.accessor<T, 4>()
                );
            } else {
                throw std::runtime_error("Unsupported resample method: " + method);
            }
        })
    );

    return output;
}
nemotron-ocr/cpp/better_grid_sample/gpu_grid_sample_utils.cuh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <torch/torch.h>

#include "../cuda_intellisense.cuh"

// When compiled by a host-only compiler, fall back to std::clamp and make
// __device__ a no-op so this header stays usable from .cpp files.
#ifndef __NVCC__
#include <algorithm>
#define __device__
#endif

namespace utils {

#ifdef __NVCC__

// Device-side clamp using CUDA's built-in min/max.
template<typename T>
__device__ __lib_inline__
T clamp(T val, T minVal, T maxVal)
{
    return max(minVal, min(val, maxVal));
}

#else
using std::clamp;
#endif

// Fetches input[n][c][y][x] with clamp-to-edge semantics: x and y are
// clamped into the accessor's spatial bounds before indexing. Returns a
// reference, so it can also be used as an lvalue (e.g. for gradient writes).
template<typename accessor_t>
__device__ __lib_inline__
auto &get_pixel_clamped(accessor_t &inputs,
    int64_t n, int64_t c, int64_t x, int64_t y)
{
    x = clamp<decltype(x)>(x, 0, inputs.size(3) - 1);
    y = clamp<decltype(y)>(y, 0, inputs.size(2) - 1);

    return inputs[n][c][y][x];
}

}
nemotron-ocr/cpp/better_grid_sample/gpu_indirect_grid_sample.cu ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#include "grid_sample.h"

#include "../cuda_intellisense.cuh"
#include "../half_ops.cuh"
#include "gpu_grid_sample_utils.cuh"

using namespace std;

// Clamp-to-edge pixel fetch for a 2D (H x W) accessor, used when the
// caller has already narrowed the tensor to one example/channel
// (cf. utils::get_pixel_clamped, which takes the full 4D accessor).
template<typename accessor_t, typename index_t>
__device__ __lib_inline__
auto &my_get_pixel_clamped(accessor_t &inputs, index_t x, index_t y)
{
    x = utils::clamp(x, 0, inputs.size(1) - 1);
    y = utils::clamp(y, 0, inputs.size(0) - 1);

    return inputs[y][x];
}
21
+
22
// Fast path for a batch containing a single input image: raw-pointer bilinear
// grid sample. Thread mapping: x -> grid cell (flattened H*W), y -> channel,
// blockIdx.z -> grid batch index. Out-of-range normalized coords write 0.
__global__
void single_ex_grid_sample_bilinear_kernel(const float *pInputImage,
                                           uint32_t imgHeight, uint32_t imgWidth, uint32_t numChannels,
                                           const float2 *pGrid,
                                           uint32_t numGridCells,
                                           float *pOutputImage)
{
    const uint32_t z = blockDim.x * blockIdx.x + threadIdx.x;
    const uint32_t c = blockDim.y * blockIdx.y + threadIdx.y;

    if (c >= numChannels || z >= numGridCells) {
        return;
    }

    const uint32_t g = blockIdx.z;

    const float2 uv = pGrid[g * numGridCells + z];

    float &outPx = pOutputImage[(g * numChannels + c) * numGridCells + z];
    if (abs(uv.x) > 1.0f || abs(uv.y) > 1.0f) {
        outPx = 0.0f;
    } else {
        const uint32_t maxX = imgWidth - 1;
        const uint32_t maxY = imgHeight - 1;

        // Denormalize from [-1, 1] into pixel space
        const float u = (uv.x + 1.0f) * maxX * 0.5f;
        const float v = (uv.y + 1.0f) * maxY * 0.5f;

        // calculate coordinates (truncation is safe: u, v >= 0 here)
        const float inX = u;
        const uint32_t inXint = inX;
        const float inXfrac = inX - inXint;

        const float inY = v;
        const uint32_t inYint = inY;
        const float inYfrac = inY - inYint;

        const float *pChanImage = pInputImage + c * imgHeight * imgWidth;

        // By being in this conditional block, we know that u and v are >= 0, which means
        // that their truncated value is also >= 0. Instead of clamping the value to within the buffer,
        // we set the multiplication factor to be 0 if the interpolated value is outside the buffer
        // NOTE(review): the zero weight does not suppress the load itself —
        // at the image edge, pRowImage[inXint + 1] / the row at inYint + 1 is
        // still read past the valid row/channel (and 0 * NaN would be NaN if
        // such memory held a NaN pattern). Confirm whether edge reads should
        // be clamped instead.
        const float ps[] = { 1.0f - inXfrac, inXfrac * (inXint < maxX) };
        const float rs[] = { 1.0f - inYfrac, inYfrac * (inYint < maxY) };
        float opVal = 0.0f;
        #pragma unroll
        for (uint32_t row = 0; row < 2; ++row) {
            const float *pRowImage = pChanImage + (inYint + row) * imgWidth;

            #pragma unroll
            for (uint32_t col = 0; col < 2; ++col) {
                const float px = pRowImage[inXint + col];
                opVal += rs[row] * ps[col] * px;
            }
        }

        outPx = opVal;
    }
}
81
+
82
// General bilinear forward kernel: output[n] samples input[inputIndices[n]]
// at grid[n]. Thread mapping: x -> flattened output pixel, y -> channel,
// z -> batch index. `c` is not range-checked here; the launch configuration
// uses blockDim.y == 1 with gridDim.y == channel count, so it cannot overrun.
template<typename T>
__global__
void indirect_grid_sample_forward_bilinear_kernel(torch::PackedTensorAccessor32<T, 4> inputs,
                                                  torch::PackedTensorAccessor32<T, 4> grid,
                                                  torch::PackedTensorAccessor32<int64_t, 1> inputIndices,
                                                  torch::PackedTensorAccessor32<T, 4> outputs)
{
    static_assert(std::is_same<T, float>::value, "Currently only float32 is supported!");
    //typedef typename fp_promote<T>::type accum_t;
    typedef float accum_t;
    constexpr T NEG_ONE = -1;
    constexpr T ONE = 1;
    constexpr T ZERO = 0;
    constexpr T TWO = 2;
    constexpr T ZERO_PT_5 = 0.5;
    typedef decltype(inputs.stride(0)) index_t;

    const index_t n = blockDim.z * blockIdx.z + threadIdx.z;

    if (n >= inputIndices.size(0)) return;

    const index_t c = blockDim.y * blockIdx.y + threadIdx.y;

    const index_t z = blockDim.x * blockIdx.x + threadIdx.x;

    const accum_t inputHeight = inputs.size(2);
    const accum_t inputWidth = inputs.size(3);
    const index_t outputHeight = outputs.size(2);
    const index_t outputWidth = outputs.size(3);

    // Decompose the flattened pixel index; subtraction avoids a second
    // integer division (vs. the modulo form kept for reference below).
    const index_t outY = z / outputWidth;
    //const index_t outX = z % outputWidth;
    const index_t outX = z - (outY * outputWidth);

    if (outY >= outputHeight) return;

    index_t inputIdx = inputIndices[n];
    // Load u and v together as one float2 (requires the last grid dim == 2
    // and contiguous layout)
    const float2 f2uv = *reinterpret_cast<const float2*>(grid[n][outY][outX].data());
    float u = f2uv.x;
    float v = f2uv.y;

    // Out-of-range normalized coordinates sample as zero
    if (u < NEG_ONE || u > ONE || v < NEG_ONE || v > ONE) {
        outputs[n][c][outY][outX] = ZERO;
        return;
    }

    // Denormalize the coordinates
    u = (u + ONE) * ((inputWidth - ONE) * ZERO_PT_5);
    v = (v + ONE) * ((inputHeight - ONE) * ZERO_PT_5);

    // calculate coordinates (truncation is safe: u, v >= 0 here)
    const accum_t inX = u;
    const index_t inXint = inX;
    const accum_t inXfrac = inX - inXint;

    const accum_t inY = v;
    const index_t inYint = inY;
    const accum_t inYfrac = inY - inYint;

    // Bilinear weights for the 2x2 neighborhood
    accum_t ps[] = { ONE - inXfrac, inXfrac };
    accum_t rs[] = { ONE - inYfrac, inYfrac };
    accum_t opVal = ZERO;

    auto localInputs = inputs[inputIdx][c];

    #pragma unroll
    for (index_t row = 0; row < 2; ++row) {
        #pragma unroll
        for (index_t col = 0; col < 2; ++col) {
            T Tpx = my_get_pixel_clamped(localInputs, inXint + col, inYint + row);
            opVal += rs[row] * ps[col] * Convert<T, accum_t>::LeftToRight(Tpx);
        }
    }

    outputs[n][c][outY][outX] = Convert<T, accum_t>::RightToLeft(opVal);
}
158
+
159
// Backward kernel: scatters gradOutput into gradInput with the same bilinear
// weights as the forward pass, using atomicAdd because multiple grid cells
// (and multiple n with the same inputIdx) can touch one input pixel.
// NOTE: gradGrid is accepted but never written by this kernel, so the grid
// gradient remains the zeros allocated by the host wrapper.
template<typename T>
__global__
void indirect_grid_sample_backward_bilinear_kernel(torch::PackedTensorAccessor64<T, 4> inputs,
                                                   torch::PackedTensorAccessor64<T, 4> grid,
                                                   torch::PackedTensorAccessor64<int64_t, 1> inputIndices,
                                                   torch::PackedTensorAccessor64<T, 4> gradOutput,
                                                   torch::PackedTensorAccessor64<T, 4> gradInput,
                                                   torch::PackedTensorAccessor64<T, 4> gradGrid)
{
    typedef typename fp_promote<T>::type accum_t;
    constexpr T NEG_ONE = -1;
    constexpr T ONE = 1;

    const int64_t n = blockDim.z * blockIdx.z + threadIdx.z;

    if (n >= inputIndices.size(0)) return;

    const int64_t c = blockDim.y * blockIdx.y + threadIdx.y;

    const int64_t z = blockDim.x * blockIdx.x + threadIdx.x;

    const accum_t inputHeight = inputs.size(2);
    const accum_t inputWidth = inputs.size(3);
    const int64_t outputHeight = gradOutput.size(2);
    const int64_t outputWidth = gradOutput.size(3);

    const int64_t outY = z / outputWidth;
    const int64_t outX = z % outputWidth;

    if (outY >= outputHeight) return;

    int64_t inputIdx = inputIndices[n];
    // Load u and v together as one float2 (requires contiguous grid layout)
    const float2 f2uv = *reinterpret_cast<const float2*>(grid[n][outY][outX].data());
    float u = f2uv.x;
    float v = f2uv.y;

    // No output gradient contribution from this position
    if (u < NEG_ONE || u > ONE || v < NEG_ONE || v > ONE) {
        return;
    }

    // Denormalize the coordinates
    u = (u + 1) * ((inputWidth - 1) / 2);
    v = (v + 1) * ((inputHeight - 1) / 2);

    // calculate coordinates
    const accum_t inX = u;
    const accum_t inXint = floor(inX);
    const accum_t inXfrac = inX - inXint;

    const accum_t inY = v;
    const accum_t inYint = floor(inY);
    const accum_t inYfrac = inY - inYint;

    // Same bilinear weights as the forward pass
    accum_t ps[] = { 1 - inXfrac, inXfrac };
    accum_t rs[] = { 1 - inYfrac, inYfrac };

    const accum_t gOut = Convert<T, accum_t>::LeftToRight(gradOutput[n][c][outY][outX]);

    #pragma unroll
    for (size_t row = 0; row < 2; ++row) {
        #pragma unroll
        for (size_t col = 0; col < 2; ++col) {
            // Clamp-to-edge fetch mirrors the forward read, so edge pixels
            // accumulate the clamped contributions
            T &gIn = utils::get_pixel_clamped(gradInput, inputIdx, c, inXint + col, inYint + row);

            T gContrib = Convert<T, accum_t>::RightToLeft(rs[row] * ps[col] * gOut);

            atomicAdd(&gIn, gContrib);
        }
    }
}
230
+
231
// GPU entry point. Chooses between a raw-pointer fast path (single input
// image, contiguous tensors — inputIndices can be ignored since every index
// must be 0) and the general packed-accessor kernel. Both paths currently
// support float32 + bilinear only.
torch::Tensor gpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    auto output = input.new_empty({ inputIndices.size(0), input.size(1), grid.size(1), grid.size(2) });


    if (method != "bilinear"s) {
        throw runtime_error("Only 'bilinear' sampling is currently supported!");
    }

    if (input.size(0) == 1 && input.is_contiguous() && grid.is_contiguous()) {
        uint32_t gridNumCells = grid.size(1) * grid.size(2);
        // x -> grid cell, y -> channel, z -> grid batch index
        dim3 blockDim(32, 3, 1);
        dim3 gridDim(div_up(gridNumCells, blockDim.x),
                     div_up(input.size(1), blockDim.y),
                     div_up(grid.size(0), blockDim.z));
        single_ex_grid_sample_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
            input.data_ptr<float>(),
            input.size(2), input.size(3), input.size(1),
            reinterpret_cast<const float2*>(grid.data_ptr()),
            gridNumCells,
            output.data_ptr<float>()
        );

    } else {
        // z is batch idx
        // y is channel
        // x is w*h
        dim3 blockDim(32, 1, 3);
        dim3 gridDim(div_up(grid.size(1) * grid.size(2), blockDim.x),
                     div_up(input.size(1), blockDim.y),
                     div_up(inputIndices.size(0), blockDim.z));
        indirect_grid_sample_forward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
            input.packed_accessor32<float, 4>(),
            grid.packed_accessor32<float, 4>(),
            inputIndices.packed_accessor32<int64_t, 1>(),
            output.packed_accessor32<float, 4>()
        );
    }

    // Dtype-generic dispatch kept for reference; disabled while only float32
    // is supported by the kernels above.
    //AT_DISPATCH_FLOATING_TYPES_AND_HALF(
    //    input.scalar_type(),
    //    "gpu_indirect_grid_sample_forward",
    //    ([&] {
    //        typedef typename remap_half<scalar_t>::type T;
    //        // typedef scalar_t T;
    //        if (method == "bilinear") {
    //            indirect_grid_sample_forward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
    //                input.packed_accessor64<T, 4>(),
    //                grid.packed_accessor64<T, 4>(),
    //                inputIndices.packed_accessor64<int64_t, 1>(),
    //                output.packed_accessor64<T, 4>()
    //            );
    //        } else {
    //            throw runtime_error("Unsupported resample method: " + method);
    //        }
    //    })
    //);

    return output;
}
291
+
292
// GPU backward entry point. Returns { gradInput, gradGrid }. gradGrid is
// allocated as zeros and the kernel never writes it (grid gradients are not
// implemented), so callers receive a zero grid gradient.
std::vector<torch::Tensor> gpu_indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    auto gradInput = torch::zeros_like(input);
    auto gradGrid = torch::zeros_like(grid);

    // z is batch idx
    // y is channel
    // x is w*h
    dim3 blockDim(32, 1, 1);
    dim3 gridDim(div_up(grid.size(1) * grid.size(2), blockDim.x),
                 div_up(input.size(1), blockDim.y),
                 div_up(inputIndices.size(0), blockDim.z));

    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(),
        "gpu_indirect_grid_sample_backward",
        ([&] {
            // remap_half converts at::Half to a device-native half type
            typedef typename remap_half<scalar_t>::type T;
            // typedef scalar_t T;
            if (method == "bilinear") {
                indirect_grid_sample_backward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
                    input.packed_accessor64<T, 4>(),
                    grid.packed_accessor64<T, 4>(),
                    inputIndices.packed_accessor64<int64_t, 1>(),
                    gradOutput.packed_accessor64<T, 4>(),
                    gradInput.packed_accessor64<T, 4>(),
                    gradGrid.packed_accessor64<T, 4>()
                );
            } else {
                throw runtime_error("Unsupported resample method: " + method);
            }
        })
    );

    return { gradInput, gradGrid };
}
nemotron-ocr/cpp/better_grid_sample/grid_sample.h ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <torch/torch.h>

// Expands a per-input count vector into a per-output input-index vector
// via repeat_interleave (see the worked example below).
inline
torch::Tensor region_counts_to_indices(torch::Tensor regionCounts, int64_t numOutputs)
{
    // If there's only one example, we can trivially return idx 0 for all
    if (regionCounts.size(0) == 1) {
        return torch::zeros({ numOutputs }, regionCounts.options().dtype(torch::kInt64));
    }

    // regionCounts will be some tensor like [ 5, 1, 10, 2 ] which means that the first 5 outputs
    // correspond to the first input, the next output to the second input, 10 to the third, and so on.

    // We want to convert this to instead have an entry for each output which specifies the index of the corresponding input.
    // To do this, we can count the number of times the output index exceeds the cumulative input counts.
    // e.g. the cumulative region count for the above tensor is [ 5, 6, 16, 18 ].
    // The output indices 0-4 are not greater than or equal to any cumulative count, so they get the input index of 0.
    // The output index 5 is equal to a single count, therefore index 1.
    // The outputs 6-15 are all greater than or equal to two cumulative counts, therefore index 2.
    // And so on.

    auto indices = torch::arange(regionCounts.size(0), regionCounts.options().dtype(torch::kInt64));

    auto outputIndices = torch::repeat_interleave(indices, regionCounts, /*dim=*/ 0, /*output_size=*/ numOutputs);

    return outputIndices;
}

// Device-specific implementations (defined in the .cu / .cpp files).
torch::Tensor gpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);
torch::Tensor cpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);
std::vector<torch::Tensor> gpu_indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);

// Validates arguments and routes to the CPU or GPU forward implementation.
// Requires: all tensors on the same device, inputIndices batch matches the
// grid batch, and the grid's last dimension is (u, v) pairs.
inline
torch::Tensor indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    if (input.is_cuda() != grid.is_cuda() || input.is_cuda() != inputIndices.is_cuda()) {
        throw std::runtime_error("Input tensors must all be on the same device!");
    }
    if (inputIndices.size(0) != grid.size(0)) {
        throw std::runtime_error("The batch dimensions must match!");
    }
    if (grid.size(-1) != 2) {
        throw std::runtime_error("The final grid dimension must be 2.");
    }

    if (input.is_cuda()) {
        return gpu_indirect_grid_sample_forward(std::move(input), std::move(grid), std::move(inputIndices), method);
    } else {
        return cpu_indirect_grid_sample_forward(std::move(input), std::move(grid), std::move(inputIndices), method);
    }
}

// Backward routing. Only the GPU path is implemented; the CPU path throws.
inline
std::vector<torch::Tensor> indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    if (gradOutput.is_cuda()) {
        return gpu_indirect_grad_sample_backward(std::move(gradOutput), std::move(input), std::move(grid), std::move(inputIndices), method);
    } else {
        throw std::runtime_error("Not implemented!");
    }
}
nemotron-ocr/cpp/common.cpp ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "common.h"
5
+
6
+ #include <sstream>
7
+
8
+ using namespace std;
9
+
10
+ void print_tensor(const torch::Tensor &t) {
11
+ cout << t << endl;
12
+ }
nemotron-ocr/cpp/common.h ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <ostream>
7
+ #include <vector>
8
+
9
+ #include <torch/torch.h>
10
+
11
+ template<typename T>
12
+ inline
13
+ std::ostream &operator<<(std::ostream &os, const std::vector<T> &v) {
14
+ os << "[";
15
+ if (! v.empty()) {
16
+ os << v[0];
17
+ for (size_t i = 1; i < v.size(); ++i) {
18
+ os << ", " << v[i];
19
+ }
20
+ }
21
+ os << "]";
22
+ return os;
23
+ }
24
+
25
+ template<int Counter, typename ...Args>
26
+ struct _inner_tuple_print
27
+ {
28
+ inline
29
+ static std::ostream &print(std::ostream &os, const std::tuple<Args...> &t) {
30
+ _inner_tuple_print<Counter - 1, Args...>::print(os, t);
31
+
32
+ os << ", " << std::get<Counter>(t);
33
+ return os;
34
+ }
35
+ };
36
+
37
+ template<typename ...Args>
38
+ struct _inner_tuple_print<0, Args...>
39
+ {
40
+ inline
41
+ static std::ostream &print(std::ostream &os, const std::tuple<Args...> &t) {
42
+ os << std::get<0>(t);
43
+ return os;
44
+ }
45
+ };
46
+
47
+
48
+ template<typename... Args>
49
+ inline
50
+ std::ostream &operator<<(std::ostream &os, const std::tuple<Args...> &t) {
51
+ os << "(";
52
+ _inner_tuple_print<sizeof...(Args) - 1, Args...>::print(os, t);
53
+ os << ")";
54
+ return os;
55
+ }
56
+
57
+ void print_tensor(const torch::Tensor &t);
nemotron-ocr/cpp/cuda_intellisense.cuh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

// Compatibility shims so CUDA sources parse cleanly both under IntelliSense
// and under a plain host compiler: the kernel-launch macros expand to nothing
// and the CUDA qualifiers become no-ops.
#if defined(__INTELLISENSE__) || !defined(__NVCC__)
#ifndef KERNEL_ARG2
#define KERNEL_ARG2(grid, block)
#define KERNEL_ARG3(grid, block, sh_mem)
#define KERNEL_ARG4(grid, block, sh_mem, stream)
#define __global__
#define __device__
#define __host__
#endif
#endif

#ifdef __INTELLISENSE__
// Make the CUDA built-ins visible to the IDE only; these declarations are
// never compiled by nvcc or the host compiler.
#define __CUDACC__
#include <cuda_runtime.h>

void __syncthreads(); // workaround __syncthreads warning

dim3 threadIdx;
dim3 blockIdx;
dim3 blockDim;
dim3 gridDim;

#else
// Real compilation: the launch macros expand to the <<< >>> syntax.
#ifndef KERNEL_ARG2
#define KERNEL_ARG2(grid, block) <<< grid, block >>>
#define KERNEL_ARG3(grid, block, sh_mem) <<< grid, block, sh_mem >>>
#define KERNEL_ARG4(grid, block, sh_mem, stream) <<< grid, block, sh_mem, stream >>>
#endif
#endif

#define __any_device__ __host__ __device__

// __lib_inline__: force-inline under nvcc, plain inline elsewhere.
#ifdef __NVCC__
#define __lib_inline__ __forceinline__

#else
#define __lib_inline__ inline
#endif

// Ceiling division, usable from both host and device code (e.g. for
// computing kernel grid dimensions).
template<typename T1, typename T2>
__any_device__
inline auto div_up(T1 n, T2 d)
{
    return (n + d - 1) / d;
}
nemotron-ocr/cpp/geometry.h ADDED
@@ -0,0 +1,1100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <iostream>
9
+ #include <type_traits>
10
+
11
// Torch interop (accessor-based constructors) can be compiled out by
// defining _GEOMETRY_NO_TORCH.
#ifndef _GEOMETRY_NO_TORCH
#include <torch/torch.h>
#endif

#include "cuda_intellisense.cuh"

// Host vs device backends: pick the std or thrust flavor of sort/swap/tuple
// so the same templates compile under both plain C++ and NVCC.
#ifndef __NVCC__
#define SORT_ALGO std::sort
#define SWAP std::swap

template<typename ...Args>
using tuple_t = std::tuple<Args...>;

#else

#include <thrust/sort.h>
#include <thrust/tuple.h>

#define SORT_ALGO thrust::sort
#define SWAP thrust::swap

template<typename ...Args>
using tuple_t = thrust::tuple<Args...>;
#endif
35
+
36
// 2-D point / vector with value semantics. T is the coordinate type
// (typically float or double). Compound operators are declared here and
// defined out-of-line below.
template<typename T>
struct Point_ {
    typedef T inner_type;

    T X, Y;

    // Default construction leaves X/Y uninitialized (cheap for device arrays).
    Point_() = default;

    __any_device__
    Point_(T x, T y) : X(x), Y(y) {}

    // Reads two consecutive scalars: ptr[0] -> X, ptr[1] -> Y.
    __any_device__
    Point_(T *ptr) : X(ptr[0]), Y(ptr[1]) {}

#ifndef _GEOMETRY_NO_TORCH
    // Construct from a 1-D torch accessor; assumes it holds at least two
    // elements (x, y) — TODO confirm at call sites.
    template<typename T2>
    __any_device__
    Point_(const torch::TensorAccessor<T2, 1> &accessor) : X(accessor[0]), Y(accessor[1]) {}

    template<typename T2>
    __any_device__
    Point_(const torch::PackedTensorAccessor64<T2, 1> &accessor) : X(accessor[0]), Y(accessor[1]) {}
#endif

    // Component-wise in-place arithmetic.
    __any_device__
    Point_ &operator+=(const Point_ &other);

    __any_device__
    Point_ &operator-=(const Point_ &other);

    __any_device__
    Point_ &operator*=(const Point_ &other);

    __any_device__
    Point_ &operator/=(const Point_ &other);

    // Scalar divide/multiply.
    template<typename W>
    __any_device__
    Point_ &operator/=(W w);

    template<typename W>
    __any_device__
    Point_ &operator*=(W w);

    // Component-wise negation.
    __any_device__
    Point_ operator-() {
        return { -X, -Y };
    }

    __any_device__
    T Sum() const { return X + Y; }

    // Angle of the vector (atan2(Y, X)); defined out-of-line.
    __any_device__
    T Angle() const;

    // Member-wise swap; SWAP resolves to std::swap or thrust::swap.
    __any_device__
    void swap(Point_ &other) noexcept {
        SWAP(X, other.X);
        SWAP(Y, other.Y);
    }
};
97
+
98
+ template<typename T>
99
+ __lib_inline__ __any_device__
100
+ void swap(Point_<T> &a, Point_<T> &b) {
101
+ a.swap(b);
102
+ }
103
+
104
+
105
+ template<typename T>
106
+ __any_device__
107
+ __lib_inline__ T Point_<T>::Angle() const {
108
+ #ifndef __NVCC__
109
+ using std::atan2;
110
+ #endif
111
+ return atan2(Y, X);
112
+ }
113
+
114
+ template<typename T>
115
+ __any_device__
116
+ __lib_inline__ Point_<T> min(const Point_<T> &a, const Point_<T> &b) {
117
+ #ifndef __NVCC__
118
+ using std::min;
119
+ #endif
120
+ return {
121
+ min(a.X, b.X),
122
+ min(a.Y, b.Y)
123
+ };
124
+ }
125
+
126
+ template<typename T>
127
+ __any_device__
128
+ __lib_inline__ Point_<T> max(const Point_<T> &a, const Point_<T> &b) {
129
+ #ifndef __NVCC__
130
+ using std::max;
131
+ #endif
132
+ return {
133
+ max(a.X, b.X),
134
+ max(a.Y, b.Y)
135
+ };
136
+ }
137
+
138
// Axis-aligned bounding box stored as (X, Y) = min corner and
// (MaxX, MaxY) = max corner.
template<typename T>
struct AABB_ {
    typedef T inner_type;

    T X;
    T Y;
    T MaxX;
    T MaxY;

    AABB_() = default;
    __any_device__
    AABB_(T x, T y, T maxX, T maxY)
        : X(x), Y(y), MaxX(maxX), MaxY(maxY) {}

    // Half-open containment: [X, MaxX) x [Y, MaxY).
    __any_device__
    bool Contains(const Point_<T> &p) const {
        return p.X >= X && p.X < MaxX &&
            p.Y >= Y && p.Y < MaxY;
    }

    // Smallest box covering both this and other.
    __any_device__ __lib_inline__
    AABB_ Union(const AABB_ &other) const {
#ifndef __NVCC__
        using std::min;
        using std::max;
#endif
        T minX = min(X, other.X);
        T maxX = max(MaxX, other.MaxX);
        T minY = min(Y, other.Y);
        T maxY = max(MaxY, other.MaxY);

        return { minX, minY, maxX, maxY };
    }

    // Translate the whole box by -offset.
    __any_device__
    AABB_ &operator-=(const Point_<T> &offset) {
        X -= offset.X;
        MaxX -= offset.X;
        Y -= offset.Y;
        MaxY -= offset.Y;
        return *this;
    }

    __any_device__
    __lib_inline__ T Width() const { return MaxX - X; }
    __any_device__
    __lib_inline__ T Height() const { return MaxY - Y; }
    __any_device__
    __lib_inline__ T Area() const { return Width() * Height(); }

    // Flat access (0 -> X, 1 -> Y, 2 -> MaxX, 3 -> MaxY). Relies on the
    // members being laid out contiguously, hence the standard-layout check.
    __lib_inline__ T &operator[] (int64_t idx)
    {
        static_assert(std::is_standard_layout<AABB_<T>>::value, "This function is only valid for standard layout");
        return (&X)[idx];
    }
    __lib_inline__ T operator[] (int64_t idx) const
    {
        static_assert(std::is_standard_layout<AABB_<T>>::value, "This function is only valid for standard layout");
        return (&X)[idx];
    }

    // Overlap box of this and other; clamped so a disjoint pair yields a
    // degenerate (zero-area) box rather than a negative-extent one.
    __any_device__ __lib_inline__
    AABB_ Intersection(const AABB_ &other) const {
#ifndef __NVCC__
        using std::min;
        using std::max;
#endif
        T minX = max(X, other.X);
        T minY = max(Y, other.Y);
        T maxX = min(MaxX, other.MaxX);
        T maxY = min(MaxY, other.MaxY);
        // Prevent negative area
        minX = min(minX, maxX);
        minY = min(minY, maxY);
        return { minX, minY, maxX, maxY };
    }

    __any_device__ __lib_inline__
    T IntersectionArea(const AABB_ &other) const { return Intersection(other).Area(); }
};
218
+
219
+ template<typename T, typename Derived>
220
+ struct QuadBase_ {
221
+ typedef T inner_type;
222
+
223
+ __any_device__
224
+ AABB_<T> Bounds() const;
225
+
226
+ __any_device__
227
+ bool Contains(const Point_<T> &p) const;
228
+
229
+ __any_device__
230
+ T Area() const;
231
+
232
+ __any_device__
233
+ T Height() const;
234
+
235
+ __any_device__
236
+ T Width() const;
237
+
238
+ template<typename Derived2>
239
+ __any_device__
240
+ T IntersectionArea(const QuadBase_<T, Derived2> &other) const;
241
+
242
+ template<typename Derived2>
243
+ __any_device__
244
+ T IOU(const QuadBase_<T, Derived2> &other) const;
245
+
246
+ template<typename Derived2>
247
+ __any_device__
248
+ T IOU_UpperBound(const QuadBase_<T, Derived2> &other) const;
249
+
250
+ __any_device__
251
+ Point_<T> Center() const;
252
+
253
+ template<typename Derived2>
254
+ __any_device__
255
+ /*
256
+ Returns 3 geometric associations between the two quads:
257
+ 0: The percent shared area between this and other relative to this (e.g. if other contains this, then it returns 1)
258
+ 1: The percent shared area between other and this relative to other (e.g. if this contains other, then it return 1)
259
+ 2: The IOU of the two quads
260
+ */
261
+ tuple_t<T, T, T> RegionSizes(const QuadBase_<T, Derived2> &other) const;
262
+
263
+ template<typename Derived2>
264
+ __any_device__
265
+ tuple_t<T, T, T> RegionSizes_UpperBound(const QuadBase_<T, Derived2> &other) const;
266
+
267
+ __any_device__
268
+ Derived &operator/=(T val) {
269
+ auto rcp = 1 / val;
270
+ return *this *= rcp;
271
+ }
272
+
273
+ __any_device__
274
+ Derived &operator*=(T val) {
275
+ auto dThis = static_cast<Derived*>(this);
276
+ #pragma unroll
277
+ for (size_t i = 0; i < 4; ++i) {
278
+ dThis->Vertices[i] *= val;
279
+ }
280
+ return *dThis;
281
+ }
282
+
283
+ friend auto begin(const QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices; }
284
+ friend auto begin(QuadBase_& q) { return static_cast<const Derived&>(q).Vertices; }
285
+ friend auto end(const QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices + 4; }
286
+ friend auto end(QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices + 4; }
287
+ };
288
+
289
// Non-owning quad view: Vertices points at four Point_<T> values stored
// elsewhere (e.g. a row of a tensor). The caller keeps that storage alive.
template<typename T>
struct Quad_ : QuadBase_<T, Quad_<T>> {
    Point_<T> *Vertices = nullptr;

    Quad_() = default;
    // Reinterprets 8 consecutive scalars as 4 (x, y) pairs.
    __any_device__
    Quad_(T *dataPtr)
        : Vertices(reinterpret_cast<Point_<T>*>(dataPtr)) {}
    __any_device__
    Quad_(Point_<T> *dataPtr)
        : Vertices(dataPtr) {}

    // Unchecked vertex access, 0..3.
    template<typename index_t>
    __any_device__ __lib_inline__
    const Point_<T> &operator[](index_t offset) const { return Vertices[offset]; }
    template<typename index_t>
    __any_device__ __lib_inline__
    Point_<T> &operator[](index_t offset) { return Vertices[offset]; }
};
308
+
309
// Owning quad: the four vertices are copied into the struct itself, so it
// is safe to keep after the source buffer goes away (unlike Quad_).
template<typename T>
struct InPlaceQuad_ : public QuadBase_<T, InPlaceQuad_<T>> {
    Point_<T> Vertices[4];

    InPlaceQuad_() = default;
    // Copies 8 consecutive scalars as 4 (x, y) pairs. The device branch
    // uses a manually unrolled loop; the host branch uses std::copy.
    __any_device__
    InPlaceQuad_(const T *dataPtr)
    {
#if defined(__NVCC__)
        T *pVals = reinterpret_cast<T*>(Vertices);
        #pragma unroll
        for (uint32_t i = 0; i < 8; ++i) {
            pVals[i] = dataPtr[i];
        }
#else
        using std::copy;
        copy(dataPtr, dataPtr + 8, reinterpret_cast<T*>(Vertices));
#endif
    }
    // Copies four points.
    __any_device__
    InPlaceQuad_(const Point_<T> *dataPtr)
    {
#if defined(__NVCC__)
        #pragma unroll
        for (uint32_t i = 0; i < 4; ++i) {
            Vertices[i] = dataPtr[i];
        }
#else
        using std::copy;
        copy(dataPtr, dataPtr + 4, Vertices);
#endif
    }

    // Unchecked vertex access, 0..3.
    template<typename index_t>
    __any_device__ __lib_inline__
    const Point_<T> &operator[](index_t v) const { return Vertices[v]; }

    template<typename index_t>
    __any_device__ __lib_inline__
    Point_<T> &operator[](index_t v) { return Vertices[v]; }
};
350
+
351
// CRTP base for polygons. Derived must provide operator[] vertex access and
// a Count member; all operations are defined out-of-line below.
template<typename T, typename Derived>
struct PolygonBase_ {
    typedef T inner_type;

    // Axis-aligned bounding box over all vertices.
    __any_device__
    AABB_<T> Bounds() const;

    // Point-in-polygon test via ray casting (see polygon_contains).
    __any_device__
    bool Contains(const Point_<T> &p) const;

    // Perimeter length, including the closing edge.
    __any_device__
    T EdgeLength() const;

    // Area-weighted centroid (see polygon_center).
    __any_device__
    Point_<T> Center() const;

    // Unsigned area via the shoelace formula.
    __any_device__
    T Area() const;
};
370
+
371
// Non-owning polygon view: Vertices points at Count points stored elsewhere;
// the caller keeps that storage alive.
template<typename T>
struct Polygon_ : PolygonBase_<T, Polygon_<T>> {
    Point_<T> *Vertices = nullptr;
    size_t Count = 0;

    Polygon_() = default;
    // Reinterprets 2 * vertexCount consecutive scalars as (x, y) pairs.
    __any_device__
    Polygon_(T *dataPtr, size_t vertexCount)
        : Vertices(reinterpret_cast<Point_<T>*>(dataPtr)), Count(vertexCount) {}
    __any_device__
    Polygon_(Point_<T> *dataPtr, size_t vertexCount)
        : Vertices(dataPtr), Count(vertexCount) {}

    // Unchecked vertex access, 0..Count-1.
    __any_device__
    const Point_<T> &operator[](size_t offset) const { return Vertices[offset]; }
    __any_device__
    Point_<T> &operator[](size_t offset) { return Vertices[offset]; }
};
389
+
390
// Line segment between two endpoints A and B.
template<typename T>
struct Segment_ {
    Point_<T> A, B;

    Segment_() = default;
    __any_device__
    Segment_(const Point_<T> &a, const Point_<T> &b) : A(a), B(b) {}

    __any_device__
    T Length() const;
    // Squared length; avoids the sqrt when only comparisons are needed.
    __any_device__
    T LengthSq() const;
    // Segment-segment intersection; on success writes the intersection
    // point to out_ptAlong and returns true (defined out-of-line).
    __any_device__
    bool Intersection(const Segment_<T> &other, Point_<T> &out_ptAlong) const;
};
405
+
406
+ template<typename T>
407
+ __any_device__
408
+ __lib_inline__ Point_<T> operator+(const Point_<T> &a, const Point_<T> &b) {
409
+ return { a.X + b.X, a.Y + b.Y };
410
+ }
411
+
412
+ template<typename T>
413
+ __any_device__
414
+ __lib_inline__ Point_<T> operator-(const Point_<T> &a, const Point_<T> &b) {
415
+ return { a.X - b.X, a.Y - b.Y };
416
+ }
417
+
418
+ template<typename T, typename W>
419
+ __any_device__
420
+ __lib_inline__ Point_<T> operator*(W scale, const Point_<T> &p) {
421
+ return { scale * p.X, scale * p.Y };
422
+ }
423
+
424
+ template<typename T, typename W>
425
+ __any_device__
426
+ __lib_inline__ Point_<T> operator*(const Point_<T> &p, W scale) {
427
+ return { scale * p.X, scale * p.Y };
428
+ }
429
+
430
+ template<typename T, typename W>
431
+ __any_device__
432
+ __lib_inline__ Point_<T> operator/(const Point_<T> &p, W divisor) {
433
+ return { p.X / divisor, p.Y / divisor };
434
+ }
435
+
436
+ template<typename T>
437
+ __any_device__
438
+ __lib_inline__ Point_<T> operator*(const Point_<T> &a, const Point_<T> &b) {
439
+ return { a.X * b.X, a.Y * b.Y };
440
+ }
441
+
442
+ template<typename T, typename W>
443
+ __any_device__
444
+ __lib_inline__ Point_<T> operator-(const Point_<T> &p, W v) {
445
+ return { p.X - v, p.Y - v };
446
+ }
447
+
448
+ template<typename T>
449
+ __any_device__
450
+ __lib_inline__ Point_<T> &Point_<T>::operator+=(const Point_<T> &p) {
451
+ X = X + p.X;
452
+ Y = Y + p.Y;
453
+ return *this;
454
+ }
455
+
456
+ template<typename T>
457
+ __any_device__
458
+ __lib_inline__ Point_<T> &Point_<T>::operator-=(const Point_<T> &p) {
459
+ X = X - p.X;
460
+ Y = Y - p.Y;
461
+ return *this;
462
+ }
463
+
464
+ template<typename T>
465
+ __any_device__
466
+ __lib_inline__ Point_<T> &Point_<T>::operator*=(const Point_<T> &p) {
467
+ X = X * p.X;
468
+ Y = Y * p.Y;
469
+ return *this;
470
+ }
471
+
472
+ template<typename T>
473
+ __any_device__
474
+ __lib_inline__ Point_<T> &Point_<T>::operator/=(const Point_<T> &p) {
475
+ X = X / p.X;
476
+ Y = Y / p.Y;
477
+ return *this;
478
+ }
479
+
480
+ template<typename T>
481
+ template<typename W>
482
+ __any_device__
483
+ __lib_inline__ Point_<T> &Point_<T>::operator/=(W val) {
484
+ // TODO: This can be more efficient for float types by computing the reciprocal
485
+ X /= val;
486
+ Y /= val;
487
+ return *this;
488
+ }
489
+
490
+ template<typename T>
491
+ template<typename W>
492
+ __any_device__
493
+ __lib_inline__ Point_<T> &Point_<T>::operator*=(W val) {
494
+ X *= val;
495
+ Y *= val;
496
+ return *this;
497
+ }
498
+
499
+ template<typename T>
500
+ __any_device__
501
+ __lib_inline__ T dot(const Point_<T> &a, const Point_<T> &b) {
502
+ return a.X * b.X + a.Y * b.Y;
503
+ }
504
+
505
+ template<typename T>
506
+ __any_device__
507
+ __lib_inline__ T dot(const Point_<T> &p) {
508
+ return dot(p, p);
509
+ }
510
+
511
+ template<typename T>
512
+ __any_device__
513
+ __lib_inline__ T length(const Point_<T> &p) {
514
+ #ifndef __NVCC__
515
+ using std::sqrt;
516
+ #endif
517
+ return sqrt(dot(p));
518
+ }
519
+
520
+ template<typename T>
521
+ __any_device__
522
+ __lib_inline__ Point_<T> normalize(const Point_<T> &p) {
523
+ static constexpr T epsilon = std::numeric_limits<T>::epsilon();
524
+ auto len = length(p) + epsilon;
525
+ return { p.X / len, p.Y / len };
526
+ }
527
+
528
+ template<typename T>
529
+ __any_device__
530
+ __lib_inline__ Point_<T> ortho_2d(const Point_<T> &p) {
531
+ return { -p.Y, p.X };
532
+ }
533
+
534
+ template<typename T>
535
+ __host__
536
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Point_<T> &p) {
537
+ return os << "(" << p.X << ", " << p.Y << ")";
538
+ }
539
+
540
+ template<typename T>
541
+ __host__
542
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const AABB_<T> &b) {
543
+ return os << "[(" << b.X << ", " << b.Y << "), (" << b.MaxX << ", " << b.MaxY << ")]";
544
+ }
545
+
546
+ template<typename T>
547
+ __host__
548
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Segment_<T> &s) {
549
+ return os << "[(" << s.A.X << ", " << s.A.Y << "), (" << s.B.X << ", " << s.B.Y << ")]";
550
+ }
551
+
552
+ template<typename T>
553
+ __host__
554
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Quad_<T> &q) {
555
+ os << "[" << q.Vertices[0];
556
+ for (size_t i = 1; i < 4; ++i) {
557
+ os << ", " << q.Vertices[i];
558
+ }
559
+ return os << "]";
560
+ }
561
+
562
+ template<typename T>
563
+ __any_device__
564
+ __lib_inline__ int _signum(T val) {
565
+ return (T(0) < val) - (val < T(0));
566
+ }
567
+
568
+ template<typename T>
569
+ __any_device__
570
+ __lib_inline__ T sign(const Point_<T> &p1, const Point_<T> &p2, const Point_<T> &p3) {
571
+ T ret = (p1.X - p3.X) * (p2.Y - p3.Y) - (p2.X - p3.X) * (p1.Y - p3.Y);
572
+ auto sgn = _signum(ret);
573
+ return sgn;
574
+ }
575
+
576
+ template<typename T>
577
+ __any_device__
578
+ __lib_inline__ T Segment_<T>::Length() const
579
+ {
580
+ #ifndef __NVCC__
581
+ using std::sqrt;
582
+ #endif
583
+ return sqrt(LengthSq());
584
+ }
585
+
586
+ template<typename T>
587
+ __any_device__
588
+ __lib_inline__ T Segment_<T>::LengthSq() const
589
+ {
590
+ return dot(B - A);
591
+ }
592
+
593
+ template<typename T>
594
+ __any_device__
595
+ inline bool Segment_<T>::Intersection(const Segment_<T> &other, Point_<T> &out_ptAlong) const
596
+ {
597
+ auto p1 = A, p2 = B, p3 = other.A, p4 = other.B;
598
+
599
+ auto denom = (p4.Y - p3.Y) * (p2.X - p1.X) - (p4.X - p3.X) * (p2.Y - p1.Y);
600
+
601
+ if (abs(denom) < 1e-8) {
602
+ return false;
603
+ }
604
+
605
+ auto numer = (p4.X - p3.X) * (p1.Y - p3.Y) - (p4.Y - p3.Y) * (p1.X - p3.X);
606
+
607
+ auto t = numer / denom;
608
+
609
+ auto Bnumer = (p2.X - p1.X) * (p1.Y - p3.Y) - (p2.Y - p1.Y) * (p1.X - p3.X);
610
+
611
+ auto Bt = Bnumer / denom;
612
+
613
+ if (t < 0 || t > 1 || Bt < 0 || Bt > 1) {
614
+ return false;
615
+ }
616
+
617
+ out_ptAlong = A + t * (B - A);
618
+
619
+ return true;
620
+ }
621
+
622
+ template<typename quad_t>
623
+ __any_device__
624
+ auto quad_center(const quad_t &quad) -> Point_<typename quad_t::inner_type>
625
+ {
626
+ typedef typename quad_t::inner_type T;
627
+
628
+ Point_<T> center = quad[0];
629
+ for (size_t i = 1; i < 4; ++i) {
630
+ center += quad[i];
631
+ }
632
+
633
+ return center / T{ 4 };
634
+ }
635
+
636
+ template<typename T, typename Derived>
637
+ __any_device__
638
+ Point_<T> QuadBase_<T, Derived>::Center() const {
639
+ return quad_center(static_cast<const Derived&>(*this));
640
+ }
641
+
642
+ template<typename quad_t>
643
+ __any_device__
644
+ auto quad_bounds(const quad_t &quad) -> AABB_<typename quad_t::inner_type>
645
+ {
646
+ #ifndef __NVCC__
647
+ using std::min;
648
+ using std::max;
649
+ #endif
650
+ auto minP = quad[0];
651
+ auto maxP = minP;
652
+ for (size_t i = 1; i < 4; ++i) {
653
+ auto qp = quad[i];
654
+ minP = min(minP, qp);
655
+ maxP = max(maxP, qp);
656
+ }
657
+ return { minP.X, minP.Y, maxP.X, maxP.Y };
658
+ }
659
+
660
+ template<typename T, typename Derived>
661
+ __any_device__
662
+ AABB_<T> QuadBase_<T, Derived>::Bounds() const {
663
+ return quad_bounds(static_cast<const Derived&>(*this));
664
+ }
665
+
666
// Point-in-quad test. Each sign() is -1/0/+1 for which side of an edge the
// point falls on; the point is inside iff all four agree strictly, i.e. the
// sum is +-4. A point exactly on an edge (any sign of 0) reports "outside".
// Works for either vertex winding. Assumes a convex quad — TODO confirm
// callers never pass self-intersecting quads.
template<typename Quad_t, typename point_t>
__any_device__
inline bool quad_contains(const Quad_t &quad, const point_t &pt)
{
#ifndef __NVCC__
    using std::abs;
#endif

    // Checks that the point lies on the interior side of each half plane
    auto d1 = sign(pt, quad[0], quad[1]);
    auto d2 = sign(pt, quad[1], quad[2]);
    auto d3 = sign(pt, quad[2], quad[3]);
    auto d4 = sign(pt, quad[3], quad[0]);

    // Earlier (more permissive) formulation kept for reference: it treated
    // edge points as inside.
    // bool has_neg = (d1 < 0) || (d2 < 0) || (d3 < 0) || (d4 < 0);
    // bool has_pos = (d1 > 0) || (d2 > 0) || (d3 > 0) || (d4 > 0);
    int tot = d1 + d2 + d3 + d4;

    // return !(has_neg && has_pos);
    return abs(tot) == 4;
}
687
+
688
+ template<typename T, typename Derived>
689
+ __any_device__
690
+ __lib_inline__ bool QuadBase_<T, Derived>::Contains(const Point_<T> &pt) const
691
+ {
692
+ return quad_contains(static_cast<const Derived&>(*this), pt);
693
+ }
694
+
695
+ template<typename PtList>
696
+ __any_device__
697
+ inline auto shoelace_area(const PtList &points, size_t numPts, bool isSigned=false) -> decltype(points[0].X)
698
+ {
699
+ #ifndef __NVCC__
700
+ using std::abs;
701
+ #endif
702
+
703
+ decltype(points[0].X) area = 0;
704
+
705
+ size_t j = numPts - 1;
706
+ for (size_t i = 0; i < numPts; ++i) {
707
+ auto Pi = points[i];
708
+ auto Pj = points[j];
709
+
710
+ area += (Pj.X + Pi.X) * (Pj.Y - Pi.Y);
711
+ j = i;
712
+ }
713
+
714
+ area = area / 2;
715
+
716
+ if (! isSigned) {
717
+ area = abs(area);
718
+ }
719
+
720
+ return area;
721
+ }
722
+
723
+ template<typename T, typename Derived>
724
+ __any_device__
725
+ __lib_inline__ T QuadBase_<T, Derived>::Height() const
726
+ {
727
+ auto &d = static_cast<const Derived&>(*this);
728
+ auto h1 = Segment_<T>(d[1], d[2]).Length();
729
+ auto h2 = Segment_<T>(d[3], d[0]).Length();
730
+ return (h1 + h2) / 2;
731
+ }
732
+
733
+ template<typename T, typename Derived>
734
+ __any_device__
735
+ __lib_inline__ T QuadBase_<T, Derived>::Width() const
736
+ {
737
+ auto &d = static_cast<const Derived&>(*this);
738
+ auto w1 = Segment_<T>(d[0], d[1]).Length();
739
+ auto w2 = Segment_<T>(d[3], d[2]).Length();
740
+ return (w1 + w2) / 2;
741
+ }
742
+
743
+ // A quad can be defined as the sum of the area of two triangles
744
+ template<typename T, typename Derived>
745
+ __any_device__
746
+ inline T QuadBase_<T, Derived>::Area() const
747
+ {
748
+ // auto vertices = static_cast<const Derived *>(this)->Vertices;
749
+ return shoelace_area(static_cast<const Derived&>(*this), 4);
750
+ }
751
+
752
// Exact intersection area of two quads via polygon clipping:
// 1) collect every vertex of each quad contained in the other, plus every
//    pairwise edge-edge intersection point;
// 2) sort the collected points by angle around their centroid to recover a
//    simple polygon;
// 3) measure it with the shoelace formula.
// Capacity note: at most 8 contained vertices + 16 edge intersections = 24
// points, so MAX_PTS = 32 cannot overflow.
template<typename Quad_t1, typename Quad_t2>
__any_device__
inline auto intersection_area(const Quad_t1 &quadsA, const Quad_t2 &quadsB) -> typename Quad_t1::inner_type
{
#ifndef __NVCC__
    using std::atan2;
#endif

    typedef typename Quad_t1::inner_type T;

    static const size_t MAX_PTS = 32;

    // Fixed-size device-friendly scratch buffers (no heap allocation).
    Point_<T> points[MAX_PTS], sortedPoints[MAX_PTS];
    T angles[MAX_PTS];
    size_t indices[MAX_PTS];
    size_t numPts = 0;

    auto addPt = [&] (const Point_<T> &p) {
        points[numPts] = p;
        ++numPts;
    };

    // Vertices of one quad lying inside the other belong to the clip polygon.
    for (size_t i = 0; i < 4; ++i) {
        Point_<T> aPt = quadsA[i];
        Point_<T> bPt = quadsB[i];

        if (quadsA.Contains(bPt)) {
            addPt(bPt);
        }
        if (quadsB.Contains(aPt)) {
            addPt(aPt);
        }
    }

    // Every crossing between an edge of A and an edge of B is also a vertex
    // of the clip polygon.
    for (size_t i = 0; i < 4; ++i) {
        Segment_<T> segA{ quadsA[i], quadsA[(i + 1) % 4] };

        for (size_t j = 0; j < 4; ++j) {
            Segment_<T> segB{ quadsB[j], quadsB[(j + 1) % 4] };

            Point_<T> ptAlong;
            if (segA.Intersection(segB, ptAlong)) {
                addPt(ptAlong);
            }
        }
    }

    // No shared vertices and no crossings => disjoint quads.
    if (numPts == 0) {
        return 0;
    }

    // Centroid of the collected points; used as the pivot for angular sorting.
    Point_<T> center{ 0, 0 };
    for (size_t i = 0; i < numPts; ++i) {
        center += points[i];
    }
    center /= numPts;

    for (size_t i = 0; i < numPts; ++i) {
        points[i] -= center;

        angles[i] = atan2(points[i].Y, points[i].X);

        indices[i] = i;
    }

    // Perform an argsort over the angles
    SORT_ALGO(indices, indices + numPts,
        [&] (size_t a, size_t b) {
            return angles[a] < angles[b];
        }
    );

    for (size_t i = 0; i < numPts; ++i) {
        sortedPoints[i] = points[indices[i]];
    }

    // Finally, we can compute the area of this polygon using the shoelace formula
    T area = shoelace_area(sortedPoints, numPts);

    return area;
}
833
+
834
+ template<typename T, typename Derived>
835
+ template<typename Derived2>
836
+ __any_device__
837
+ __lib_inline__ T QuadBase_<T, Derived>::IntersectionArea(const QuadBase_<T, Derived2> &other) const
838
+ {
839
+ return intersection_area(
840
+ static_cast<const Derived&>(*this),
841
+ static_cast<const Derived2&>(other)
842
+ );
843
+ }
844
+
845
+ template<typename T1, typename T2>
846
+ __any_device__
847
+ __lib_inline__ auto geometry_iou(const T1 &a, const T2 &b) -> decltype(a.Area())
848
+ {
849
+ auto aArea = a.Area();
850
+ auto bArea = b.Area();
851
+ auto ixArea = a.IntersectionArea(b);
852
+
853
+ auto unionArea = aArea + bArea - ixArea;
854
+
855
+ return ixArea / unionArea;
856
+ }
857
+
858
+ template<typename T, typename Derived>
859
+ template<typename Derived2>
860
+ __any_device__
861
+ __lib_inline__ T QuadBase_<T, Derived>::IOU(const QuadBase_<T, Derived2> &other) const
862
+ {
863
+ return geometry_iou(
864
+ static_cast<const Derived&>(*this),
865
+ static_cast<const Derived2&>(other)
866
+ );
867
+ }
868
+
869
+ template<typename T, typename Derived>
870
+ template<typename Derived2>
871
+ __any_device__
872
+ __lib_inline__ T QuadBase_<T, Derived>::IOU_UpperBound(const QuadBase_<T, Derived2> &other) const
873
+ {
874
+ return geometry_iou(
875
+ Bounds(),
876
+ other.Bounds()
877
+ );
878
+ }
879
+
880
+ template<typename T1, typename T2>
881
+ __any_device__ __lib_inline__
882
+ auto geometry_region_sizes(const T1 &a, const T2 &b) -> tuple_t<decltype(a.Area()), decltype(a.Area()), decltype(a.IntersectionArea(b))>
883
+ {
884
+ auto aArea = a.Area();
885
+ auto bArea = b.Area();
886
+ auto ixArea = a.IntersectionArea(b);
887
+
888
+ auto unionArea = aArea + bArea - ixArea;
889
+ auto iou = ixArea / unionArea;
890
+
891
+ return { ixArea / aArea, ixArea / bArea, iou };
892
+ }
893
+
894
+
895
+ template<typename T, typename Derived>
896
+ template<typename Derived2>
897
+ __any_device__ __lib_inline__
898
+ tuple_t<T, T, T> QuadBase_<T, Derived>::RegionSizes(const QuadBase_<T, Derived2> &other) const
899
+ {
900
+ return geometry_region_sizes(
901
+ static_cast<const Derived&>(*this),
902
+ static_cast<const Derived2&>(other)
903
+ );
904
+ }
905
+
906
+ template<typename T, typename Derived>
907
+ template<typename Derived2>
908
+ __any_device__ __lib_inline__
909
+ tuple_t<T, T, T> QuadBase_<T, Derived>::RegionSizes_UpperBound(const QuadBase_<T, Derived2> &other) const
910
+ {
911
+ return geometry_region_sizes(
912
+ Bounds(),
913
+ other.Bounds()
914
+ );
915
+ }
916
+
917
+ template<typename polygon_t>
918
+ __any_device__
919
+ auto polygon_bounds(const polygon_t &poly) -> AABB_<typename polygon_t::inner_type>
920
+ {
921
+ #ifndef __NVCC__
922
+ using std::min;
923
+ using std::max;
924
+ #endif
925
+ auto minP = poly[0];
926
+ auto maxP = minP;
927
+ for (size_t i = 1; i < poly.Count; ++i) {
928
+ auto qp = poly[i];
929
+ minP = min(minP, qp);
930
+ maxP = max(maxP, qp);
931
+ }
932
+ return { minP.X, minP.Y, maxP.X, maxP.Y };
933
+ }
934
+
935
+ template<typename T, typename Derived>
936
+ __any_device__
937
+ AABB_<T> PolygonBase_<T, Derived>::Bounds() const {
938
+ return polygon_bounds(static_cast<const Derived&>(*this));
939
+ }
940
+
941
// Point-in-polygon via ray casting: count how many polygon edges a segment
// from pt to a far-away point crosses; odd => inside. NOTE(review): the
// "ray" is a finite segment to (-1e6, -2e6), so the test silently breaks for
// coordinates of comparable magnitude, and a polygon vertex lying exactly on
// the test segment can be double-counted — acceptable here presumably
// because inputs are image-space coordinates; confirm against callers.
template<typename polygon_t, typename point_t>
__any_device__
bool polygon_contains(const polygon_t &poly, const point_t &pt)
{
    typedef typename polygon_t::inner_type T;

    // Some arbitrary segment. Technically this should be a ray, but functionally this will work
    Segment_<T> testSeg{ pt, { -1e6, -2e6 }};
    Point_<T> trash;

    int32_t ixCount = 0;
    for (size_t i = 0; i < poly.Count; ++i) {
        // Edge from vertex i to the next vertex (wrapping at the end).
        Segment_<T> polySeg{ poly[i], poly[(i + 1) % poly.Count] };

        if (testSeg.Intersection(polySeg, trash)) {
            ++ixCount;
        }
    }

    // If there are an odd number of intersections, then the point is inside
    return (ixCount % 2) == 1;
}
963
+
964
+ template<typename T, typename Derived>
965
+ __any_device__
966
+ bool PolygonBase_<T, Derived>::Contains(const Point_<T> &pt) const {
967
+ return polygon_contains(static_cast<const Derived&>(*this), pt);
968
+ }
969
+
970
+ template<typename polygon_t>
971
+ __any_device__
972
+ auto polygon_edge_length(const polygon_t &poly) -> typename polygon_t::inner_type
973
+ {
974
+ typedef typename polygon_t::inner_type T;
975
+
976
+ T ret = 0;
977
+
978
+ for (size_t i = 0; i < poly.Count; ++i) {
979
+ Segment_<T> seg{ poly[i], poly[(i + 1) % poly.Count] };
980
+
981
+ ret += seg.Length();
982
+ }
983
+
984
+ return ret;
985
+ }
986
+
987
+ template<typename T, typename Derived>
988
+ __any_device__
989
+ T PolygonBase_<T, Derived>::EdgeLength() const {
990
+ return polygon_edge_length(static_cast<const Derived&>(*this));
991
+ }
992
+
993
// Area-weighted centroid of a polygon (standard signed-area centroid
// formula). The signs of `common` and `a` cancel, so the result is
// independent of winding. NOTE(review): divides by the signed area — a
// degenerate (zero-area) polygon produces a division by zero; confirm
// callers never pass one.
template<typename polygon_t>
__any_device__
auto polygon_center(const polygon_t &poly) -> Point_<typename polygon_t::inner_type>
{
    typedef typename polygon_t::inner_type T;

    T cx = 0, cy = 0, a = 0;
    size_t j = poly.Count - 1;
    for (size_t i = 0; i < poly.Count; ++i) {
        Point_<T> p0 = poly[i];
        Point_<T> p1 = poly[j];

        // Cross-product term shared by the area and both centroid sums.
        T common = (p0.X * p1.Y - p1.X * p0.Y);
        cx += (p0.X + p1.X) * common;
        cy += (p0.Y + p1.Y) * common;
        a += common;

        j = i;
    }

    // Signed area = half the accumulated cross products.
    a /= 2;

    // Centroid = (1 / (6 * A)) * sum terms.
    Point_<T> center{ cx / (6 * a), cy / (6 * a) };

    return center;
}
1019
+
1020
+ template<typename T, typename Derived>
1021
+ __any_device__
1022
+ Point_<T> PolygonBase_<T, Derived>::Center() const {
1023
+ return polygon_center(static_cast<const Derived&>(*this));
1024
+ }
1025
+
1026
+ template<typename T, typename Derived>
1027
+ __any_device__
1028
+ T PolygonBase_<T, Derived>::Area() const {
1029
+ const Derived &dThis = static_cast<const Derived&>(*this);
1030
+ return shoelace_area(dThis, dThis.Count);
1031
+ }
1032
+
1033
+
1034
+ template<typename T>
1035
+ __any_device__
1036
+ Point_<T> nearest_point_on_segment(const Point_<T> &pt, const Segment_<T> &seg)
1037
+ {
1038
+ #ifndef __NVCC__
1039
+ using std::max;
1040
+ using std::min;
1041
+ #endif
1042
+
1043
+ const T l2 = seg.LengthSq();
1044
+
1045
+ if (l2 == 0.0) {
1046
+ return seg.A;
1047
+ }
1048
+
1049
+ const auto v = seg.A;
1050
+ const auto w = seg.B;
1051
+ // Consider the line extending the segment, parameterized as v + t*(w-v)
1052
+ // Find projection of point p onto the line
1053
+ auto t = dot(pt - v, w - v) / l2;
1054
+
1055
+ // Clamp between t=0 and t=1
1056
+ t = max(static_cast<T>(0), min(static_cast<T>(1), t));
1057
+
1058
+ const auto projection = v + t * (w - v);
1059
+
1060
+ return projection;
1061
+ }
1062
+
1063
+
1064
+ template<typename T>
1065
+ __any_device__
1066
+ Segment_<T> shortest_line_between_segments(const Segment_<T> &a, const Segment_<T> &b)
1067
+ {
1068
+ Segment_<T> segs[] = {
1069
+ { a.A, nearest_point_on_segment(a.A, b) },
1070
+ { a.B, nearest_point_on_segment(a.B, b) },
1071
+ { nearest_point_on_segment(b.A, a), b.A },
1072
+ { nearest_point_on_segment(b.B, a), b.B }
1073
+ };
1074
+
1075
+ T minDist = std::numeric_limits<T>::max();
1076
+ size_t idx;
1077
+
1078
+ #pragma unroll
1079
+ for (size_t i = 0; i < 4; ++i) {
1080
+ T dist = segs[i].LengthSq();
1081
+ if (dist < minDist) {
1082
+ minDist = dist;
1083
+ idx = i;
1084
+ }
1085
+ }
1086
+
1087
+ return segs[idx];
1088
+ }
1089
+
1090
+ // Find the distance between a point and the nearest point along the specified segment
1091
+ template<typename T>
1092
+ __any_device__
1093
+ T distance_to_segment(const Point_<T> &pt, const Segment_<T> &seg)
1094
+ {
1095
+ auto projection = nearest_point_on_segment(pt, seg);
1096
+
1097
+ auto dist = length(pt - projection);
1098
+
1099
+ return dist;
1100
+ }
nemotron-ocr/cpp/geometry_api/calc_poly_min_rrect.cpp ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "../graph_detection/encode_util.h"
7
+
8
+ #include "../geometry.h"
9
+ #include "matrix2x2.h"
10
+
11
+ using namespace std;
12
+
13
+ template<typename T>
14
+ void _calc_poly_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect);
15
+ template<typename T>
16
+ void _calc_quad_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect);
17
+
18
+ torch::Tensor calc_poly_min_rrect(torch::Tensor vertices)
19
+ {
20
+ if (vertices.size(0) < 3) {
21
+ throw runtime_error("Invalid polygon! Expected >= 3 vertices, got " + to_string(vertices.size(0)));
22
+ }
23
+
24
+ auto ret = torch::empty({ 4, 2 }, vertices.options());
25
+
26
+ auto retAcc = ret.accessor<float, 2>();
27
+
28
+ if (vertices.size(0) != 4) {
29
+ // OpenCV requires this to be a contiguous buffer
30
+ vertices = vertices.contiguous();
31
+ _calc_poly_min_rrect(vertices.accessor<float, 2>(), retAcc);
32
+ } else {
33
+ _calc_quad_min_rrect(vertices.accessor<float, 2>(), retAcc);
34
+ }
35
+
36
+ return ret;
37
+ }
38
+
39
+
40
// Given the polygon's vertices and the midpoints of its two "end" edges
// (leftCenter / rightCenter), compute an oriented bounding quad whose primary
// axis runs from leftCenter to rightCenter, and write the 4 corners into
// outRRect (4x2). Throws if the two centers coincide (no orientation).
template<typename T>
void _calc_bounds(const torch::TensorAccessor<T, 2> &vertices, torch::TensorAccessor<T, 2> &outRRect,
                  const Point_<T> &leftCenter, const Point_<T> &rightCenter)
{
    typedef Point_<T> Pointf;

    // Unit vector along the box's primary axis
    Pointf vecAlong = rightCenter - leftCenter;
    auto alongMag = length(vecAlong);

    if (alongMag == 0.0f) {
        // Degenerate: left/right centers coincide, orientation undefined
        throw runtime_error("Invalid polygon!");
    }

    vecAlong /= alongMag;

    // Perpendicular axis (vecAlong rotated 90 degrees)
    Pointf dOrtho{ -vecAlong.Y, vecAlong.X };

    Pointf center = (leftCenter + rightCenter) / 2.0f;

    // Rotation whose rows are the new basis vectors; applied transposed below
    // to map world coordinates into the (along, ortho) frame
    Matrix2x2<T> rotMat{ vecAlong, dOrtho };

    auto get_fn = [&vertices, &center] (int64_t i) {
        return Pointf{ vertices[i] } - center;
    };

    // All we care about it getting the bounds in the normalized space, so this saves
    // us from having to do any memory allocation
    // NOTE(review): min/max start at {0, 0}, so the resulting box always
    // contains the center point even if all vertices lie to one side —
    // presumably intentional; confirm if a tight box is wanted.
    Pointf minPt{ 0, 0 }, maxPt{ 0, 0 };
    auto tx_fn = [&minPt, &maxPt] (int64_t i, const Pointf &pt) {
        minPt = min(minPt, pt);
        maxPt = max(maxPt, pt);
    };

    // Rotate every vertex into the normalized frame, accumulating the bounds
    matmul_fn(vertices.size(0), get_fn, rotMat, tx_fn, transpose_tag{});

    // Axis-aligned corners of the bounds in the rotated frame
    Pointf rotBox[4] = {
        minPt,
        { maxPt.X, minPt.Y },
        maxPt,
        { minPt.X, maxPt.Y }
    };

    auto get_fn2 = [&rotBox] (int64_t i) {
        return rotBox[i];
    };

    // Rotate the corners back to world space and undo the centering
    auto assign_fn = [&center, &outRRect] (int64_t i, const Pointf &pt) {
        outRRect[i][0] = pt.X + center.X;
        outRRect[i][1] = pt.Y + center.Y;
    };

    matmul_fn(4, get_fn2, rotMat, assign_fn, contiguous_tag{});
}
93
+
94
+
95
// General (non-quad) path: find the polygon's two "bottom" (end) edges,
// determine which side chain is on top so the output corner order is
// consistent, then fit the oriented box along the axis joining the two
// bottom-edge midpoints.
template<typename T>
void _calc_poly_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect)
{
    typedef Point_<T> Pointf;
    typedef Polygon_<T> Polygonf;

    // Non-owning view over the (contiguous) vertex buffer
    Polygonf poly{ vertices.data(), vertices.size(0) };

    vector<graph_detection::Edge> bottoms = graph_detection::find_bottom(poly, false);

    if (bottoms.size() != 2) {
        throw runtime_error("Invalid polygon!");
    }

    // The two vertex chains connecting the bottom edges (the long sides)
    vector<graph_detection::Edge> longEdges[2];
    graph_detection::find_long_edges(poly, bottoms.data(), longEdges[0], longEdges[1]);

    ////
    // Determine which edge is above the other
    // cpts[i] = mean of the edge midpoints of chain i
    Pointf cpts[2];
    for (size_t i = 0; i < 2; ++i) {
        auto &pedge = longEdges[i];

        cpts[i] = Pointf{0.0f, 0.0f};
        float ct = 0;
        for (size_t z = 0; z < pedge.size(); ++z) {
            auto edge = pedge[z];
            Pointf p1 = poly[edge.A];
            Pointf p2 = poly[edge.B];
            cpts[i] += (p1 + p2) / 2.0f;
            ct += 1.0f;
        }

        if (ct < 1.0f) {
            throw runtime_error("Edge was empty!");
        }
        cpts[i] /= ct;
    }

    // Normalize the ordering of the bottoms based on the sign of the
    // chain-center offset (vector_sin — presumably the normalized vertical
    // component; confirm against graph_detection's definition)
    float vpp = graph_detection::vector_sin(cpts[0] - cpts[1]);
    if (vpp >= 0) {
        swap(bottoms[0], bottoms[1]);
    }
    ////

    Pointf edge1[2] = { poly[bottoms[0].A], poly[bottoms[0].B] };
    Pointf edge2[2] = { poly[bottoms[1].A], poly[bottoms[1].B] };

    // The box axis runs between the two bottom-edge midpoints
    Pointf c0 = (edge1[0] + edge1[1]) / 2.0f;
    Pointf c1 = (edge2[0] + edge2[1]) / 2.0f;

    _calc_bounds(vertices, outRRect, c0, c1);
}
+ }
148
+
149
+ template<typename T>
150
+ void _calc_quad_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect)
151
+ {
152
+ typedef Point_<T> Pointf;
153
+
154
+ // Instead of finding an arbitrary rotated box, find a reasonable
155
+ // fit for the quadrangle
156
+ Pointf pts[4] = {
157
+ vertices[0], vertices[1], vertices[2], vertices[3]
158
+ };
159
+
160
+ Pointf c0 = (pts[0] + pts[3]) / 2.0f;
161
+ Pointf c1 = (pts[1] + pts[2]) / 2.0f;
162
+
163
+ _calc_bounds(vertices, outRRect, c0, c1);
164
+ }
nemotron-ocr/cpp/geometry_api/geometry_api.cpp ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "geometry_api_common.h"
7
+
8
+ using namespace std;
9
+
10
+ torch::Tensor rrect_to_quads_gpu(torch::Tensor rrects, float cellSize);
11
+
12
+ template<typename T>
13
+ torch::Tensor rrect_to_quads_impl(torch::Tensor rrects, T cellSize)
14
+ {
15
+ // BHW(5)
16
+ auto rrectAccess = rrects.accessor<T, 4>();
17
+
18
+ T cellOff = cellSize / 2;
19
+
20
+ auto quads = torch::empty({ rrects.size(0), rrects.size(1), rrects.size(2), 4, 2 }, rrects.options());
21
+
22
+ auto quadsAccess = quads.accessor<T, 5>();
23
+
24
+ for (long b = 0; b < rrects.size(0); ++b) {
25
+ for (long y = 0; y < rrects.size(1); ++y) {
26
+ for (long x = 0; x < rrects.size(2); ++x) {
27
+ auto rrect = rrectAccess[b][y][x];
28
+
29
+ auto quad = quadsAccess[b][y][x];
30
+
31
+ assign_rrect_to_quad(rrect, quad, cellSize, cellOff,
32
+ static_cast<T>(x),
33
+ static_cast<T>(y));
34
+ }
35
+ }
36
+ }
37
+
38
+ return quads;
39
+ }
40
+
41
// Public entry point: expand BHW(5) rrect parameters into BHW(4)(2) quad
// corners. CUDA tensors are routed to the dedicated kernel; CPU tensors run
// the scalar loop for whichever floating dtype the input carries.
torch::Tensor rrect_to_quads(torch::Tensor rrects, float cellSize)
{
    if (rrects.is_cuda()) {
        return rrect_to_quads_gpu(rrects, cellSize);
    }

    torch::Tensor quads;
    // Instantiate the impl for the tensor's runtime floating-point dtype
    AT_DISPATCH_FLOATING_TYPES(
        rrects.scalar_type(),
        "rrect_to_quads_impl",
        ([&] {
            quads = rrect_to_quads_impl<scalar_t>(rrects, scalar_t(cellSize));
        })
    );

    return quads;
}
+
59
+
60
+ template<typename T>
61
+ torch::Tensor rrect_to_quads_backward_impl(torch::Tensor rrects, torch::Tensor gradOutput)
62
+ {
63
+ // BHW(5)
64
+ auto gradInput = torch::empty_like(rrects);
65
+
66
+ auto rrectAccess = rrects.accessor<T, 4>();
67
+ // BHW42
68
+ auto gradOutputAccess = gradOutput.accessor<T, 5>();
69
+ auto gradInputAccess = gradInput.accessor<T, 4>();
70
+
71
+ for (long b = 0; b < rrects.size(0); ++b) {
72
+ for (long y = 0; y < rrects.size(1); ++y) {
73
+ for (long x = 0; x < rrects.size(2); ++x) {
74
+ assign_grad_rrect_to_quad<T>(rrectAccess[b][y][x], gradOutputAccess[b][y][x], gradInputAccess[b][y][x]);
75
+ }
76
+ }
77
+ }
78
+
79
+ return gradInput;
80
+ }
81
+
82
+ torch::Tensor rrect_to_quads_backward_gpu(torch::Tensor rrects, torch::Tensor gradOutput);
83
+
84
// Public backward entry point for rrect_to_quads: gradOutput is the gradient
// w.r.t. the BHW(4)(2) quads; returns the gradient w.r.t. the BHW(5) rrects.
torch::Tensor rrect_to_quads_backward(torch::Tensor rrects, torch::Tensor gradOutput)
{
    if (rrects.is_cuda()) {
        return rrect_to_quads_backward_gpu(rrects, gradOutput);
    }

    torch::Tensor gradInput;
    // Instantiate the impl for the tensor's runtime floating-point dtype
    AT_DISPATCH_FLOATING_TYPES(
        rrects.scalar_type(),
        "rrect_to_quads_backward_impl",
        ([&] {
            gradInput = rrect_to_quads_backward_impl<scalar_t>(rrects, gradOutput);
        })
    );

    return gradInput;
}
nemotron-ocr/cpp/geometry_api/geometry_api.h ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <torch/torch.h>
7
+
8
+ torch::Tensor rrect_to_quads(torch::Tensor rrects, float cellSize);
9
+ torch::Tensor rrect_to_quads_backward(torch::Tensor rrects, torch::Tensor gradOutput);
10
+
11
+ torch::Tensor calc_poly_min_rrect(torch::Tensor vertices);
12
+
13
+ float get_rel_continuation_cos(torch::Tensor rrectA, torch::Tensor rrectB);
14
+
15
+ torch::Tensor get_poly_bounds_quad(torch::Tensor poly);
nemotron-ocr/cpp/geometry_api/geometry_api_common.h ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <torch/torch.h>
7
+
8
+ #include "../cuda_intellisense.cuh"
9
+ #include "../geometry.h"
10
+
11
+ #if defined(__NVCC__)
12
+ #include <math_constants.h>
13
+ #define GEO_PI CUDART_PI_F
14
+ #else
15
+ #include <math.h>
16
+ #define GEO_PI M_PI
17
+ #endif
18
+
19
+
20
// Write a point into a 2-element accessor/array as [x, y].
template<typename access_t, typename point_t>
__device__
inline
void pt_assign(access_t acc, const point_t &p) {
    acc[0] = p.X;
    acc[1] = p.Y;
}
27
+
28
// Decode one rrect — four distances (top, right, bottom, left) from the
// cell's prior center plus a rotation theta — into the 4 corner points of
// the corresponding quad. (x, y) is the cell's column/row in the feature
// map; cellOff (typically cellSize/2) places the prior at the cell center.
template<typename T, typename rrect_access_t>
__device__ __lib_inline__
InPlaceQuad_<T> cvt_rrect_to_quad(const rrect_access_t &rrect, T cellSize, T cellOff, T x, T y)
{
    typedef Point_<T> Pointf;

    // Prior: center of cell (x, y) in pixel space
    Pointf prior{
        x * cellSize + cellOff,
        y * cellSize + cellOff
    };

    T dTop = rrect[0];
    T dRight = rrect[1];
    T dBottom = rrect[2];
    T dLeft = rrect[3];
    T theta = rrect[4];

    // Rotated basis: vX points along theta, vY is vX rotated by -90 degrees
    T piOver2{GEO_PI / 2.0f};
    Pointf vX{ cos(theta), sin(theta) };
    Pointf vY{ cos(theta - piOver2), sin(theta - piOver2) };

    InPlaceQuad_<T> ret;

    // Corners as prior +/- the side distances along the rotated axes
    ret[0] = prior - vX * dLeft + vY * dTop;
    ret[1] = prior + vX * dRight + vY * dTop;
    ret[2] = prior + vX * dRight - vY * dBottom;
    ret[3] = prior - vX * dLeft - vY * dBottom;

    return ret;
}
58
+
59
// Convert one rrect to a quad and copy the 8 resulting scalars into the
// output accessor. The raw-pointer copy relies on both InPlaceQuad_<T> and
// the accessor's backing storage being 8 contiguous T values (4 points x 2
// coordinates) — TODO confirm the accessor row is contiguous at call sites.
template<typename rrect_access_t, typename quad_access_t, typename T>
__device__ __lib_inline__
void assign_rrect_to_quad(const rrect_access_t &rrect, quad_access_t &quad,
                          T cellSize, T cellOff, T x, T y)
{
    const InPlaceQuad_<T> cvQuad = cvt_rrect_to_quad<T>(rrect, cellSize, cellOff, x, y);

    const T *pInQuad = reinterpret_cast<const T*>(&cvQuad);
    T *pOutQuad = reinterpret_cast<T*>(quad.data());

    #pragma unroll
    for (uint32_t i = 0; i < 8; ++i) {
        pOutQuad[i] = pInQuad[i];
    }
}
74
+
75
// Backward of cvt_rrect_to_quad for one cell: given the gradient of the loss
// w.r.t. the 4 quad corners (gradOutput, 4x2), write the gradient w.r.t. the
// 5 rrect parameters (top, right, bottom, left, theta) into gradInput.
// Each corner is prior +/- vX*side +/- vY*side, so a side's gradient is the
// dot product of the touching corners' gradients with the matching basis
// vector; theta's gradient flows through the basis derivatives dVX/dVY.
template<typename T, typename rrect_access_t, typename quad_access_t>
__device__
inline
void assign_grad_rrect_to_quad(const rrect_access_t &rrect,
                               const quad_access_t &gradOutput,
                               rrect_access_t gradInput)
{
    typedef Point_<T> Pointf;

    T Top = rrect[0];
    T Right = rrect[1];
    T Bottom = rrect[2];
    T Left = rrect[3];
    T theta = rrect[4];

    // Recompute the forward basis vectors
    T piOver2{GEO_PI / 2.0f};
    Pointf vX{ cos(theta), sin(theta) };
    Pointf vY{ cos(theta - piOver2), sin(theta - piOver2) };

    // d(vX)/d(theta) and d(vY)/d(theta): each basis rotated by +90 degrees
    Pointf dVX{ -vX.Y, vX.X };
    Pointf dVY{ -vY.Y, vY.X };

    Pointf gP0 = gradOutput[0],
           gP1 = gradOutput[1],
           gP2 = gradOutput[2],
           gP3 = gradOutput[3];

    // Top
    gradInput[0] = (gP0 * vY + gP1 * vY).Sum();
    // Right
    gradInput[1] = (gP1 * vX + gP2 * vX).Sum();
    // Bottom
    gradInput[2] = -(gP2 * vY + gP3 * vY).Sum();
    // Left
    gradInput[3] = -(gP0 * vX + gP3 * vX).Sum();

    // Theta
    gradInput[4] = (
        gP0 * (-Left * dVX + Top * dVY) +
        gP1 * (Right * dVX + Top * dVY) +
        gP2 * (Right * dVX - Bottom * dVY) +
        gP3 * (-Left * dVX - Bottom * dVY)
    ).Sum();
}
119
+
120
+ #undef GEO_PI
nemotron-ocr/cpp/geometry_api/geometry_api_gpu.cu ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "../geometry.h"
7
+ #include "../cuda_intellisense.cuh"
8
+ #include "geometry_api_common.h"
9
+
10
+ #include <trove/ptr.h>
11
+
12
+ using namespace std;
13
+
14
+
15
// POD wrapper around the 5 rrect parameters (top, right, bottom, left, theta
// per cvt_rrect_to_quad) so an entire rrect can be loaded/stored as a single
// value — e.g. through trove::coalesced_ptr for coalesced memory access.
template<typename T>
struct RRect_ {
    T Data[5];

    template<typename index_t>
    __device__
    const T &operator[](index_t i) const { return Data[i]; }
    template<typename index_t>
    __device__
    T &operator[](index_t i) { return Data[i]; }
};
26
+
27
// CUDA kernel: one thread per rrect. Converts the rrect at flat index jobIdx
// into its quad corners. numRows/numCols are the H/W of the original BHW
// layout so the cell's (row, col) prior position can be recovered from the
// flattened index.
template<typename T>
__global__
void device_rrect_to_quads_gpu(torch::PackedTensorAccessor64<T, 2> rrectAccess,
                               torch::PackedTensorAccessor64<T, 3> quadsAccess,
                               int64_t numRows, int64_t numCols,
                               T cellSize)
{
    typedef Point_<T> Pointf;
    typedef RRect_<T> RRectf;
    typedef InPlaceQuad_<T> Quadf;
    constexpr T TWO = 2;

    const int64_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the ragged tail block
    if (jobIdx >= rrectAccess.size(0)) {
        return;
    }

    // Recover (row, col) within the feature map from the flattened B*H*W index
    int64_t row = jobIdx / numCols;
    const int64_t col = jobIdx - (row * numCols);
    row = row % numRows;

    // Reinterpret the contiguous buffers as packed structs (caller enforces
    // contiguity) so whole rrects/quads move as single values
    auto rawRRect = reinterpret_cast<RRectf*>(rrectAccess.data());
    auto rawQuad = reinterpret_cast<Quadf*>(quadsAccess.data());
#if defined(NDEBUG)
    // Release builds: trove pointers give coalesced structure loads/stores
    trove::coalesced_ptr<RRectf> pRRect(rawRRect);
    trove::coalesced_ptr<Quadf> pQuad(rawQuad);
#else
    auto pRRect = rawRRect;
    auto pQuad = rawQuad;
#endif

    RRectf rrect = pRRect[jobIdx];

    T cellOff = cellSize / TWO;
    Quadf cvQuad = cvt_rrect_to_quad<T>(rrect, cellSize, cellOff, col, row);

    pQuad[jobIdx] = cvQuad;
}
66
+
67
// GPU entry point: expand a contiguous BHW5 rrect tensor into BHW42 quads
// with one thread per cell.
torch::Tensor rrect_to_quads_gpu(torch::Tensor rrects, float cellSize)
{
    // The kernel reinterprets the raw buffer as packed RRect_ structs, which
    // is only valid for contiguous storage
    if (!rrects.is_contiguous()) {
        throw std::runtime_error("Expected the rrects to be contiguous!");
    }

    torch::Tensor quads = torch::empty({ rrects.size(0), rrects.size(1), rrects.size(2), 4, 2 }, rrects.options());

    // Collapse B, H, W into a single job dimension
    auto rrFlat = rrects.flatten(0, 2);
    auto qFlat = quads.flatten(0, 2);

    dim3 blockSize(96);
    dim3 gridSize(div_up(qFlat.size(0), blockSize.x));

    // Skip the launch entirely for empty inputs (grid size would be 0)
    if (quads.numel() > 0) {
        AT_DISPATCH_FLOATING_TYPES(
            quads.scalar_type(),
            "cuda_rrect_to_quads",
            ([&] {

                device_rrect_to_quads_gpu<scalar_t> KERNEL_ARG2(gridSize, blockSize) (
                    rrFlat.packed_accessor64<scalar_t, 2>(),
                    qFlat.packed_accessor64<scalar_t, 3>(),
                    rrects.size(1), rrects.size(2),
                    cellSize
                );

            })
        );
    }

    return quads;
}
100
+
101
// CUDA kernel: one thread per rrect, reducing that cell's quad-corner
// gradients into its 5 parameter gradients via assign_grad_rrect_to_quad.
template<typename scalar_t>
__global__
void device_rrect_to_quads_backward_gpu(torch::PackedTensorAccessor64<scalar_t, 2> rrect,
                                        torch::PackedTensorAccessor64<scalar_t, 3> gradOutput,
                                        torch::PackedTensorAccessor64<scalar_t, 2> gradInput)
{
    const int64_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the ragged tail block
    if (jobIdx >= rrect.size(0)) return;

    assign_grad_rrect_to_quad<scalar_t>(rrect[jobIdx], gradOutput[jobIdx], gradInput[jobIdx]);
}
113
+
114
+
115
// GPU entry point for the backward pass: flatten the BHW dims and launch one
// thread per rrect.
torch::Tensor rrect_to_quads_backward_gpu(torch::Tensor rrects, torch::Tensor gradOutput)
{
    auto gradInput = torch::empty_like(rrects);

    // Collapse BHW into a single job dimension.
    // NOTE(review): this relies on reshape() returning a *view* of gradInput
    // (true for the contiguous tensor empty_like produces); if reshape ever
    // copied, the kernel's results would be silently dropped — confirm.
    auto flatRRects = rrects.reshape({ -1, 5 });
    auto flatGradOutput = gradOutput.reshape({ -1, 4, 2 });
    auto flatGradInput = gradInput.reshape({ -1, 5 });

    dim3 blockSize(32);
    dim3 gridSize(div_up(rrects.size(0) * rrects.size(1) * rrects.size(2), blockSize.x));

    // Nothing to do (and grid would be 0) for empty inputs
    if (rrects.numel() > 0) {
        AT_DISPATCH_FLOATING_TYPES(
            rrects.scalar_type(),
            "cuda_rrect_to_quads_backward",
            ([&] {
                device_rrect_to_quads_backward_gpu KERNEL_ARG2(gridSize, blockSize) (
                    flatRRects.packed_accessor64<scalar_t, 2>(),
                    flatGradOutput.packed_accessor64<scalar_t, 3>(),
                    flatGradInput.packed_accessor64<scalar_t, 2>()
                );
            })
        );
    }

    return gradInput;
}
nemotron-ocr/cpp/geometry_api/get_rel_continuation_cos.cpp ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "../geometry.h"
7
+
8
+ using namespace std;
9
+
10
+
11
+ float get_rel_continuation_cos(torch::Tensor rrectATensor, torch::Tensor rrectBTensor)
12
+ {
13
+ typedef Point_<float> Pointf;
14
+
15
+ if (rrectATensor.size(0) != 4 || rrectBTensor.size(0) != 4) {
16
+ throw runtime_error("Invalid rrect arguments. Both must have 4 vertices! A=" +
17
+ to_string(rrectATensor.size(0)) + ", B=" + to_string(rrectBTensor.size(0)));
18
+ }
19
+
20
+ auto rrectA = rrectATensor.accessor<float, 2>();
21
+ auto rrectB = rrectBTensor.accessor<float, 2>();
22
+
23
+ Pointf aPts[4] = {
24
+ rrectA[0], rrectA[1], rrectA[2], rrectA[3]
25
+ };
26
+
27
+ auto c1 = (aPts[0] + aPts[3]) / 2.0f;
28
+ auto c2 = (aPts[1] + aPts[2]) / 2.0f;
29
+
30
+ auto aDir = c2 - c1;
31
+ auto aLen = length(aDir);
32
+
33
+ if (aLen > 0) {
34
+ aDir /= aLen;
35
+ } else {
36
+ aDir = Pointf{ 1, 0 };
37
+ }
38
+
39
+ auto centerA = (c1 + c2) / 2.0f;
40
+
41
+ Pointf bPts[4] = {
42
+ rrectB[0], rrectB[1], rrectB[2], rrectB[3]
43
+ };
44
+
45
+ auto centerB = (bPts[0] + bPts[1] + bPts[2] + bPts[3]) / 4.0f;
46
+
47
+ auto connDir = centerB - centerA;
48
+ auto connLen = length(connDir);
49
+
50
+ if (connLen == 0.0f) {
51
+ return 1.0f;
52
+ }
53
+
54
+ connDir /= connLen;
55
+
56
+ auto cosT = dot(aDir, connDir);
57
+
58
+ return cosT;
59
+ }
nemotron-ocr/cpp/geometry_api/matrix2x2.h ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include "../geometry.h"
7
+
8
+
9
// Layout tags: select at compile time whether a Matrix2x2's storage is read
// as stored (row-major) or as its transpose.
struct contiguous_tag{};

struct transpose_tag{};

// Maps a compile-time (row R, column C) pair to a flat index into the
// 4-element backing array, according to the layout tag.
template<typename layout_t, uint32_t R, uint32_t C>
struct Matrix2x2_Offset;

// Row-major: element (R, C) lives at index R*2 + C
template<uint32_t R, uint32_t C>
struct Matrix2x2_Offset<contiguous_tag, R, C>
{
    static const uint32_t OFFSET = R * 2 + C;
};

// Transposed view: swap the roles of R and C
template<uint32_t R, uint32_t C>
struct Matrix2x2_Offset<transpose_tag, R, C>
{
    static const uint32_t OFFSET = C * 2 + R;
};
27
+
28
+
29
// Compile-time accessor for element (R, C) of a raw 4-element matrix buffer,
// resolved through the layout tag (so transposed reads cost nothing extra).
template<typename T, typename layout_t, uint32_t R, uint32_t C>
struct Matrix2x2_Indexor
{
    static const uint32_t OFFSET = Matrix2x2_Offset<layout_t, R, C>::OFFSET;

    static T &get(T *data) { return data[OFFSET]; }
    static const T get(const T *data) { return data[OFFSET]; }
};
37
+
38
+
39
// Minimal fixed-size 2x2 matrix, stored row-major in m_data[4].
template<typename T>
struct Matrix2x2
{
    Matrix2x2() = default;
    // Element-wise constructor (row-major order)
    Matrix2x2(T r0c0, T r0c1, T r1c0, T r1c1)
        : m_data{ r0c0, r0c1, r1c0, r1c1 }
    {
    }
    // Rows-from-points: r0 becomes row 0, r1 becomes row 1
    Matrix2x2(const Point_<T> &r0, const Point_<T> &r1)
        : m_data{ r0.X, r0.Y, r1.X, r1.Y }
    {
    }
    // Transposed construction: the points become the matrix *columns*
    Matrix2x2(const Point_<T> &r0, const Point_<T> &r1, transpose_tag)
        : m_data{ r0.X, r1.X, r0.Y, r1.Y }
    {
    }

    // Flat element access (row-major index 0..3)
    inline T &operator[](uint32_t i) { return m_data[i]; }
    inline const T operator[](uint32_t i) const { return m_data[i]; }

    T m_data[4];
};
61
+
62
// Non-owning, layout-tagged view over a Matrix2x2's storage; combined with
// get<R, C>() below it provides zero-cost (possibly transposed) access.
template<typename T, typename layout_t>
struct Matrix2x2_View
{
    Matrix2x2_View(const Matrix2x2<T> &m) : m_data(m.m_data) {}

    const T *m_data;
};
69
+
70
// Read element (R, C) from a matrix view, honoring the view's layout tag.
template<uint32_t R, uint32_t C, typename T, typename layout_t>
const T get(const Matrix2x2_View<T, layout_t> &m)
{
    return Matrix2x2_Indexor<T, layout_t, R, C>::get(m.m_data);
}
75
+
76
+ template<typename T, typename get_pt_t, typename callback_t, typename layout_t = contiguous_tag>
77
+ inline
78
+ void matmul_fn(int64_t N, const get_pt_t &get_fn, const Matrix2x2<T> &mat, const callback_t &callback,
79
+ layout_t lt = layout_t{})
80
+ {
81
+ Matrix2x2_View<T, layout_t> m{ mat };
82
+
83
+ #pragma omp simd
84
+ for (int64_t i = 0; i < N; ++i) {
85
+ Point_<T> pt = get_fn(i);
86
+
87
+ T x = pt.X * get<0, 0>(m) + pt.Y * get<1, 0>(m);
88
+ T y = pt.X * get<0, 1>(m) + pt.Y * get<1, 1>(m);
89
+
90
+ callback(i, Point_<T>{ x, y });
91
+ }
92
+ }
nemotron-ocr/cpp/geometry_api/poly_bounds_quad.cpp ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ using namespace std;
7
+
8
+
9
// Write an (x, y) pair into a length-2 tensor accessor row.
template<typename T>
void pt_assign(torch::TensorAccessor<T, 1> acc, T x, T y)
{
    acc[0] = x;
    acc[1] = y;
}
15
+
16
+
17
// Compute the axis-aligned bounds of an Nx2 polygon and write them into
// outBounds as a 4-corner quad: (min,min), (max,min), (max,max), (min,max).
// Reads poly[0] for the seed values, so the polygon must be non-empty.
template<typename T>
void poly_bounds_quad_impl(torch::TensorAccessor<T, 2> poly, torch::TensorAccessor<T, 2> outBounds)
{
    // Seed from vertex 0; the loop below re-visits it, which is harmless
    T minX = poly[0][0],
      minY = poly[0][1],
      maxX = poly[0][0],
      maxY = poly[0][1];

    const int64_t numVertices = poly.size(0);

    for (int64_t i = 0; i < numVertices; ++i) {
        auto vert = poly[i];

        minX = min(minX, vert[0]);
        maxX = max(maxX, vert[0]);

        minY = min(minY, vert[1]);
        maxY = max(maxY, vert[1]);
    }

    pt_assign(outBounds[0], minX, minY);
    pt_assign(outBounds[1], maxX, minY);
    pt_assign(outBounds[2], maxX, maxY);
    pt_assign(outBounds[3], minX, maxY);
}
42
+
43
+
44
+ torch::Tensor get_poly_bounds_quad(torch::Tensor poly)
45
+ {
46
+ auto ret = torch::empty({ 4, 2 }, poly.options());
47
+
48
+ AT_DISPATCH_FLOATING_TYPES(
49
+ poly.scalar_type(),
50
+ "poly_bounds_quad_impl",
51
+ ([&] {
52
+ poly_bounds_quad_impl(
53
+ poly.accessor<scalar_t, 2>(),
54
+ ret.accessor<scalar_t, 2>()
55
+ );
56
+ })
57
+ );
58
+
59
+ return ret;
60
+ }
nemotron-ocr/cpp/graph_detection/encode_util.cpp ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "encode_util.h"
5
+
6
+ #include <algorithm>
7
+ #include <numeric>
8
+ #include <sstream>
9
+
10
+ #include "../third_party/clipper/clipper.hpp"
11
+
12
+ using namespace std;
13
+
14
+ namespace graph_detection {
15
+
16
// An edge plus an arbitrary payload C — the edge's length during bottom-edge
// candidate search, or the edge midpoint when T is a point type.
template<typename T>
struct Candidate : Edge {
    T C;

    Candidate() = default;
    Candidate(int32_t a, int32_t b, T c) : Edge(a, b), C(c) {}
};
23
+
24
// A pair of candidate edges together with the distance between their
// midpoints; used to select the two farthest-apart edges as the "bottoms".
struct DistStruct {
    Candidate<Pointf> A;
    Candidate<Pointf> B;
    float Dist;

    DistStruct() = default;
    DistStruct(Candidate<Pointf> a, Candidate<Pointf> b, float dist) : A(a), B(b), Dist(dist) {}
};
32
+
33
// Cosine of the angle between two vectors. The 1e-8 epsilon keeps the
// division finite for zero-length inputs; note it is a *double* literal, so
// the division is performed in double precision before narrowing to float.
template<typename T>
float vec_cos(const Point_<T> &a, const Point_<T> &b)
{
    return dot(a, b) / (length(a) * length(b) + 1e-8);
}
38
+
39
// Return the indices that would sort `vec` ascending under `comp` (the
// equivalent of numpy.argsort). `vec` itself is not modified; the sort is
// not stable, matching std::sort.
template<typename T, typename Fn = std::less<T>>
std::vector<size_t> arg_sort(const std::vector<T> &vec, Fn comp = Fn())
{
    // Fill 0..N-1 with std::iota instead of the hand-rolled push_back loop
    std::vector<size_t> ret(vec.size());
    std::iota(begin(ret), end(ret), size_t{0});

    std::sort(begin(ret), end(ret),
        [&vec, &comp] (size_t idxA, size_t idxB) {
            // Compare the referenced elements, not the indices themselves
            return comp(vec[idxA], vec[idxB]);
        }
    );

    return ret;
}
56
+
57
+
58
+ float edge_length(const Polygon_<float> &poly, const vector<Edge> &edges);
59
+
60
// Identify the two "bottom" (end) edges of a text-like polygon — the short
// edges capping the two long sides. Primary heuristic: an edge whose
// neighboring edges run in nearly opposite directions is an end cap. If that
// does not yield exactly two disjoint candidates, fall back to choosing the
// pair of non-touching edges whose midpoints are farthest apart.
// Throws for polygons with fewer than 4 vertices.
vector<Edge> find_bottom(const Polygon_<float> &poly, bool useVertexOrder)
{
    if (poly.Count < 4) {
        throw runtime_error("Invalid polygon. Fewer than 4 vertices!");
    }

    // If we trust the source of the geometries, then this saves us both computation,
    // but can also be more reliable since we won't reorder the vertices
    if (useVertexOrder) {
        if ((poly.Count % 2) == 1) {
            throw runtime_error("Can't use trusted vertex order when the vertex count is odd!");
        }
        // Assumes the vertices form two equal runs (one per long side), so
        // the caps sit at the halfway point and at the wrap-around
        int32_t halfCt = poly.Count / 2;
        return { { halfCt - 1, halfCt },
                 { static_cast<int32_t>(poly.Count) - 1, 0 } };
    }

    if (poly.Count == 4) {
        // Quad shortcut: compare the two opposite-edge length sums; the much
        // shorter pair (4x margin) is taken as the bottoms
        float d1 = length(poly[1] - poly[0]) + length(poly[2] - poly[3]);
        float d2 = length(poly[2] - poly[1]) + length(poly[0] - poly[3]);

        if (4 * d1 < d2) {
            return { { 0, 1 }, { 2, 3 } };
        } else {
            return { { 1, 2 }, { 3, 0 } };
        }
    }

    // Vertex lookup with wrap-around so edge windows can cross index 0
    auto idx_wrap = [&poly] (size_t idx) {
        return poly[idx % poly.Count];
    };

    vector<Candidate<float>> candidates;
    for (size_t i = 1; i < (poly.Count + 1); ++i) {
        auto vPrev = idx_wrap(i) - idx_wrap(i - 1);
        auto vNext = idx_wrap(i + 2) - idx_wrap(i + 1);

        // We're looking for the segment where the preceding and following segment
        // essentially travel in opposite directions
        if (vec_cos(vPrev, vNext) < -0.875f) {
            auto currSeg = idx_wrap(i) - idx_wrap(i + 1);
            candidates.emplace_back(i % poly.Count, (i + 1) % poly.Count, length(currSeg));
        }
    }

    if (candidates.size() != 2 || candidates[0].A == candidates[1].B || candidates[0].B == candidates[1].A) {
        // If candidate number < 2, or two bottom are joined, select 2 farthest edge
        // Midpoint of every edge, indexed by its starting vertex
        vector<Candidate<Pointf>> midList;
        for (size_t i = 0; i < poly.Count; ++i) {
            Pointf midPoint = (idx_wrap(i) + idx_wrap(i + 1)) / 2.0f;
            midList.emplace_back(i, (i + 1) % poly.Count, midPoint);
        }

        vector<DistStruct> distList;

        // Only found one good candidate, so search for the edge that's the furthest from this candidate
        if (candidates.size() == 1) {
            auto idx1a = candidates.back().A;
            auto idx1b = candidates.back().B;
            Candidate<Pointf> cand1{ idx1a, idx1b, (idx_wrap(idx1a) + idx_wrap(idx1b)) / 2.0f };
            for (size_t j = 0; j < poly.Count; ++j) {
                auto &cand2 = midList[j];

                // Edges sharing a vertex cannot both be bottoms
                if (cand1.Touches(cand2)) continue;

                float dist = length(cand1.C - cand2.C);
                distList.emplace_back(cand1, cand2, dist);
            }
        } else {
            // No trustworthy candidates: consider every non-touching pair
            for (size_t i = 0; i < poly.Count; ++i) {
                for (size_t j = i + 1; j < poly.Count; ++j) {
                    auto &cand1 = midList[i];
                    auto &cand2 = midList[j];

                    if (cand1.Touches(cand2)) continue;

                    float dist = length(cand1.C - cand2.C);
                    distList.emplace_back(cand1, cand2, dist);
                }
            }
        }
        // Ascending by distance; the farthest-apart pair ends up at the back
        sort(begin(distList), end(distList), [] (auto a, auto b) { return a.Dist < b.Dist; });

        if (distList.empty()) {
            throw runtime_error("No valid bottom candidates found for this polygon!");
        }

        auto &bEdge = distList.back();
        return vector<Edge>{ bEdge.A, bEdge.B };

    } else {
        return vector<Edge>{ candidates[0], candidates[1] };
    }
}
154
+
155
// Split the polygon boundary into the two vertex chains ("long edges") that
// connect the ends of the two bottom edges — one chain per side. `bottoms`
// must point at the two edges returned by find_bottom.
void find_long_edges(const Polygon_<float> &poly, Edge *bottoms, vector<Edge> &outLongEdge1, vector<Edge> &outLongEdge2)
{
    int32_t b1End = bottoms[0].B;
    int32_t b2End = bottoms[1].B;

    int32_t nPoints = poly.Count;

    // Walk forward (with wrap-around) from just past end1 until reaching
    // end2, emitting each consecutive vertex pair as an edge
    auto accum_into = [nPoints] (int32_t end1, int32_t end2, vector<Edge> &outEdge) {
        int32_t i = (end1 + 1) % nPoints;
        while ((i % nPoints) != end2) {
            int32_t start = i > 0 ? i - 1 : nPoints - 1;
            int32_t end = i % nPoints;
            outEdge.emplace_back(start, end);
            i = (i + 1) % nPoints;
        }
    };

    accum_into(b1End, b2End, outLongEdge1);
    accum_into(b2End, b1End, outLongEdge2);
}
175
+
176
+ float edge_length(const Polygon_<float> &poly, const vector<Edge> &edges)
177
+ {
178
+ float ret = 0.0f;
179
+ for (const Edge &e : edges) {
180
+ ret += length(poly[e.B] - poly[e.A]);
181
+ }
182
+ return ret;
183
+ }
184
+
185
+ vector<float> edge_lengths(const Polygon_<float> &poly, const vector<Edge> &edges)
186
+ {
187
+ if (edges.empty()) {
188
+ throw runtime_error("Found an empty edge!");
189
+ }
190
+
191
+ vector<float> ret;
192
+ ret.reserve(edges.size());
193
+
194
+ for (const Edge &e : edges) {
195
+ ret.push_back(length(poly[e.B] - poly[e.A]));
196
+ }
197
+
198
+ return ret;
199
+ }
200
+
201
+ void split_edge_sequence(const Polygon_<float> &poly, const vector<Edge> &edges,
202
+ const vector<float> &edgeLengths, float nParts,
203
+ vector<Pointf> &outPts);
204
+
205
// Sample both long edges of a polygon at (roughly) every `step` units of arc
// length. Both sides receive the same number of samples (at least 2) so the
// resulting point lists can be paired up index-by-index.
void split_edge_sequence_by_step(const Polygon_<float> &poly, const vector<Edge> &longEdge1, const vector<Edge> &longEdge2,
                                 float step, vector<Pointf> &outInnerPoints1, vector<Pointf> &outInnerPoints2)
{
    auto edgeLengths1 = edge_lengths(poly, longEdge1);
    auto edgeLengths2 = edge_lengths(poly, longEdge2);

    // Average of the two side lengths, so both sides get the same part count
    float totalLength = (accumulate(begin(edgeLengths1), end(edgeLengths1), 0.0f) + accumulate(begin(edgeLengths2), end(edgeLengths2), 0.0f)) / 2;

    // Never fewer than 2 samples (the two chain endpoints)
    float nParts = max<float>(ceil(totalLength / step), 2);

    split_edge_sequence(poly, longEdge1, edgeLengths1, nParts, outInnerPoints1);
    split_edge_sequence(poly, longEdge2, edgeLengths2, nParts, outInnerPoints2);
}
218
+
219
// Resample an edge chain into `nParts` evenly spaced points along its arc
// length, appending them to outPts. edgeLengths must be the per-edge lengths
// of `edges` (as produced by edge_lengths).
void split_edge_sequence(const Polygon_<float> &poly, const vector<Edge> &edges,
                         const vector<float> &edgeLengths, float nParts,
                         vector<Pointf> &outPts)
{
    // Cumulative arc length; assumes vec_cumsum yields edges.size()+1
    // entries starting at 0 so elCumSum[k] is the offset of edge k's start
    // and elCumSum.back() the total length — TODO confirm vec_cumsum's shape.
    vector<float> elCumSum = vec_cumsum(edgeLengths);

    float totalLength = elCumSum.back();
    float lengthPerPart = totalLength / (nParts - 1);

    size_t iNumParts = nParts;
    size_t currNode = 0;
    size_t ctr = 0;
    for (float i = 0.0f; ctr < iNumParts; i += 1.0f, ++ctr) {
        // Target arc-length position, clamped so rounding never overshoots
        float t = min(i * lengthPerPart, totalLength);

        // Advance to the edge containing position t (t is non-decreasing,
        // so currNode only ever moves forward)
        while (t > elCumSum[currNode + 1]) {
            ++currNode;
        }

        Edge currEdge = edges[currNode];
        Pointf e1 = poly[currEdge.A];
        Pointf e2 = poly[currEdge.B];

        float currLen = edgeLengths[currNode];

        Pointf sampledPt;

        if (currLen > 0) {
            // Linear interpolation within the current edge
            float deltaT = t - elCumSum[currNode];
            float ratio = deltaT / currLen;
            sampledPt = e1 + ratio * (e2 - e1);
        } else {
            // Zero-length edge: both endpoints coincide
            sampledPt = e1;
        }

        outPts.push_back(sampledPt);
    }
}
257
+
258
+ string print_poly(const Polyf &poly) {
259
+ ostringstream oss;
260
+ oss << "[";
261
+ for (size_t i = 0; i < poly.Count; ++i) {
262
+ if (i > 0) {
263
+ oss << ", ";
264
+ }
265
+ oss << "(" << poly[i].X << ", " << poly[i].Y << ")";
266
+ }
267
+ oss << "]";
268
+ return oss.str();
269
+ }
270
+
271
+ } // namespace graph_detection
nemotron-ocr/cpp/graph_detection/encode_util.h ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

#include "../geometry.h"
11
+
12
+ namespace graph_detection {
13
+
14
+
15
+
16
// An undirected edge of a polygon, stored as the indices of its two endpoint
// vertices in the polygon's vertex array.
struct Edge {
    // Zero-initialized so a default-constructed Edge has deterministic state
    // (previously `Edge() = default` left both members indeterminate).
    int32_t A = 0;
    int32_t B = 0;

    Edge() = default;
    Edge(int32_t a, int32_t b) : A(a), B(b) {}

    // True when `idx` is one of this edge's endpoint indices.
    bool Touches(int32_t idx) const { return A == idx || B == idx; }
    // True when this edge shares at least one endpoint index with `other`.
    bool Touches(const Edge &other) const;
};
26
+
27
+ inline
28
+ bool edge_touches(const Edge &edge, int32_t vertex) {
29
+ return edge.A == vertex || edge.B == vertex;
30
+ }
31
+
32
+ inline
33
+ bool Edge::Touches(const Edge &other) const {
34
+ return edge_touches(other, A) || edge_touches(other, B);
35
+ }
36
+
37
+ typedef Point_<float> Pointf;
38
+ typedef AABB_<float> AABBf;
39
+ typedef Polygon_<float> Polyf;
40
+ typedef std::vector<Pointf> Polyline;
41
+
42
// Finds the edge sequence forming the "bottom" of `poly` (implemented in
// encode_util.cpp). NOTE(review): semantics of `useVertexOrder` inferred from
// the name (trust the polygon's vertex ordering rather than deriving the
// bottom geometrically) — confirm against the definition.
std::vector<Edge> find_bottom(const Polygon_<float> &poly, Edge *bottoms, bool useVertexOrder);

// Splits the polygon's boundary into two long edge chains relative to the
// `bottoms` edges, writing one chain to each output vector (implemented in
// encode_util.cpp).
void find_long_edges(const Polygon_<float> &poly, Edge *bottoms, std::vector<Edge> &outLongEdge1, std::vector<Edge> &outLongEdge2);

// Resamples both long edge chains into an equal number of points. The part
// count is derived from the chains' averaged total length divided by `step`,
// with a minimum of two samples per chain.
void split_edge_sequence_by_step(const Polygon_<float> &poly, const std::vector<Edge> &longEdge1, const std::vector<Edge> &longEdge2,
                                 float step, std::vector<Pointf> &outInnerPoints1, std::vector<Pointf> &outInnerPoints2);

// Renders the polygon as "[(x, y), (x, y), ...]" for logging/debugging.
std::string print_poly(const Polyf &poly);
50
+
51
+ template<typename T>
52
+ inline
53
+ std::vector<T> vec_cumsum(const std::vector<T> &v)
54
+ {
55
+ std::vector<T> ret;
56
+ ret.reserve(v.size() + 1);
57
+ ret.push_back(0);
58
+ for (T val : v) {
59
+ ret.push_back(ret.back() + val);
60
+ }
61
+ return ret;
62
+ }
63
+
64
// Selection sampling (Knuth's Algorithm S): visits indices 0..n-1 in order
// and invokes fn(i) on exactly min(k, n) of them, chosen uniformly at random
// from the n candidates. Indices are reported in strictly increasing order.
// The engine is advanced once per non-forced candidate, so the post-call
// engine state matches the original implementation.
template<typename RandEng, typename Fn>
inline
void n_choose_k(size_t n, size_t k, RandEng &randEng, Fn fn)
{
    if (k == 0) return;

    // TODO(mranzinger): This algorithm can be replaced with sampling from a geometric
    // distribution, which drastically reduces the runtime complexity
    for (size_t i = 0; i < n; ++i) {
        size_t leftover = n - i;
        if (leftover <= k) {
            // Every remaining index must be taken to reach k selections.
            fn(i);
            --k;
        } else {
            float p = std::uniform_real_distribution<float>(0.0f, 1.0f)(randEng);
            // static_cast instead of the original float{k}: brace-init from a
            // non-constant size_t is a narrowing conversion and ill-formed in
            // list-initialization ([dcl.init.list]).
            float probSample = static_cast<float>(k) / static_cast<float>(leftover);
            if (p < probSample) {
                fn(i);
                --k;
            }
        }
    }
}
87
+
88
template<typename T>
inline T clamp(T val, T minVal, T maxVal) {
    // Equivalent to std::max(std::min(val, maxVal), minVal): cap at maxVal
    // first, then raise to minVal (so minVal wins if the bounds are inverted).
    const T capped = (maxVal < val) ? maxVal : val;
    return (capped < minVal) ? minVal : capped;
}
92
+
93
+ inline
94
+ Pointf avg_point(const std::vector<Pointf> &points)
95
+ {
96
+ return std::accumulate(std::begin(points), std::end(points), Pointf(0,0)) / float(points.size());
97
+ }
98
+
99
+ inline
100
+ float vector_sin(const Pointf &pt)
101
+ {
102
+ // sin = y / len(pt)
103
+ return pt.Y / (length(pt) + 1e-8);
104
+ }
105
+
106
+ inline
107
+ float vector_cos(const Pointf &pt)
108
+ {
109
+ // cos = x / len(pt)
110
+ return pt.X / (length(pt) + 1e-8);
111
+ }
112
+
113
+ inline
114
+ void vector_cos_sin(const Pointf & pt, float &outCos, float &outSin)
115
+ {
116
+ float len = length(pt) + 1e-8;
117
+ outCos = pt.X / len;
118
+ outSin = pt.Y / len;
119
+ }
120
+
121
+ inline
122
+ float point_dist_to_line(const Pointf &l1, const Pointf &l2, const Pointf &pt)
123
+ {
124
+ auto d = l2 - l1;
125
+
126
+ auto lineLen = length(d);
127
+
128
+ if (lineLen > 0) {
129
+ float distance = abs(
130
+ d.Y * pt.X
131
+ - d.X * pt.Y
132
+ + l2.X * l1.Y
133
+ - l2.Y * l1.X
134
+ ) / lineLen;
135
+ return distance;
136
+ } else {
137
+ return length(pt - l1);
138
+ }
139
+ }
140
+
141
// Returns the most frequent value in `inputs`; on a tie, the smallest such
// value wins. Sorts `inputs` in place as a side effect.
// Throws std::runtime_error when `inputs` is empty.
template<typename T>
T find_mode(std::vector<T> &inputs) {
    using std::begin;
    using std::end;
    using std::sort;

    if (inputs.empty()) {
        throw std::runtime_error("Cannot find mode of empty distribution!");
    }

    // Sorting groups equal values into contiguous runs.
    sort(begin(inputs), end(inputs));

    T bestVal = inputs[0];
    size_t bestCount = 0;

    size_t i = 0;
    while (i < inputs.size()) {
        // Measure the run of values equal to inputs[i].
        size_t runEnd = i + 1;
        while (runEnd < inputs.size() && inputs[runEnd] == inputs[i]) {
            ++runEnd;
        }

        // Strict '>' keeps the smallest value on a tie, as before.
        const size_t runLen = runEnd - i;
        if (runLen > bestCount) {
            bestCount = runLen;
            bestVal = inputs[i];
        }

        i = runEnd;
    }

    return bestVal;
}
182
+
183
+ } // namespace graph_detection