emelryan committed on
Commit
b28505d
·
0 Parent(s):

Duplicate from nvidia/nemotron-ocr-v2-multilingual

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +37 -0
  2. Dockerfile +21 -0
  3. LICENSE +243 -0
  4. README.md +400 -0
  5. THIRD_PARTY_NOTICES.md +519 -0
  6. checkpoints/charset.txt +0 -0
  7. checkpoints/detector.pth +3 -0
  8. checkpoints/model_config.json +18 -0
  9. checkpoints/recognizer.pth +3 -0
  10. checkpoints/relational.pth +3 -0
  11. config.json +0 -0
  12. docker-compose.yaml +21 -0
  13. example.py +61 -0
  14. nemotron-ocr/.gitignore +9 -0
  15. nemotron-ocr/cpp/.gitattributes +1 -0
  16. nemotron-ocr/cpp/.gitignore +6 -0
  17. nemotron-ocr/cpp/.gitmodules +3 -0
  18. nemotron-ocr/cpp/README.md +15 -0
  19. nemotron-ocr/cpp/beam_decode/beam_decode.cpp +459 -0
  20. nemotron-ocr/cpp/beam_decode/beam_decode.h +17 -0
  21. nemotron-ocr/cpp/beam_decode/kn_lm.cpp +85 -0
  22. nemotron-ocr/cpp/beam_decode/kn_lm.h +26 -0
  23. nemotron-ocr/cpp/beam_decode/language_model.cpp +146 -0
  24. nemotron-ocr/cpp/beam_decode/language_model.h +65 -0
  25. nemotron-ocr/cpp/beam_decode/log_sum_exp.cpp +6 -0
  26. nemotron-ocr/cpp/beam_decode/log_sum_exp.h +53 -0
  27. nemotron-ocr/cpp/beam_decode/ngram_lm_base.cpp +329 -0
  28. nemotron-ocr/cpp/beam_decode/ngram_lm_base.h +79 -0
  29. nemotron-ocr/cpp/beam_decode/prefix.cpp +22 -0
  30. nemotron-ocr/cpp/beam_decode/prefix.h +157 -0
  31. nemotron-ocr/cpp/beam_decode/sbo_lm.cpp +46 -0
  32. nemotron-ocr/cpp/beam_decode/sbo_lm.h +20 -0
  33. nemotron-ocr/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp +93 -0
  34. nemotron-ocr/cpp/better_grid_sample/gpu_grid_sample_utils.cuh +41 -0
  35. nemotron-ocr/cpp/better_grid_sample/gpu_indirect_grid_sample.cu +327 -0
  36. nemotron-ocr/cpp/better_grid_sample/grid_sample.h +66 -0
  37. nemotron-ocr/cpp/common.cpp +12 -0
  38. nemotron-ocr/cpp/common.h +57 -0
  39. nemotron-ocr/cpp/cuda_intellisense.cuh +50 -0
  40. nemotron-ocr/cpp/geometry.h +1100 -0
  41. nemotron-ocr/cpp/geometry_api/calc_poly_min_rrect.cpp +164 -0
  42. nemotron-ocr/cpp/geometry_api/geometry_api.cpp +100 -0
  43. nemotron-ocr/cpp/geometry_api/geometry_api.h +15 -0
  44. nemotron-ocr/cpp/geometry_api/geometry_api_common.h +120 -0
  45. nemotron-ocr/cpp/geometry_api/geometry_api_gpu.cu +141 -0
  46. nemotron-ocr/cpp/geometry_api/get_rel_continuation_cos.cpp +59 -0
  47. nemotron-ocr/cpp/geometry_api/matrix2x2.h +92 -0
  48. nemotron-ocr/cpp/geometry_api/poly_bounds_quad.cpp +60 -0
  49. nemotron-ocr/cpp/graph_detection/encode_util.cpp +271 -0
  50. nemotron-ocr/cpp/graph_detection/encode_util.h +183 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.ipynb filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvcr.io/nvidia/pytorch:25.09-py3
2
+
3
+ ARG TARGETARCH
4
+
5
+ ARG TORCH_CUDA_ARCH_LIST
6
+ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
7
+
8
+ RUN --mount=type=cache,target=/root/.cache/pip \
9
+ pip install -U pip hatchling "setuptools>=68" --root-user-action ignore
10
+
11
+ COPY nemotron-ocr /workspace/nemotron-ocr
12
+ WORKDIR /workspace/nemotron-ocr
13
+
14
+ # Ensure no prebuilt binaries/artifacts from the host are present
15
+ RUN rm -f src/nemotron_ocr_cpp/*.so || true \
16
+ && rm -rf build/ dist/
17
+
18
+ RUN --mount=type=cache,target=/root/.cache/pip \
19
+ BUILD_CPP_FORCE=1 ARCH=${TARGETARCH} pip install -v . --no-build-isolation --root-user-action ignore
20
+
21
+ WORKDIR /workspace
LICENSE ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ All binary model files are licensed under NVIDIA Open Model License Agreement.
2
+ All source code files are licensed under the Apache 2.0 License.
3
+
4
+ ------------
5
+ NVIDIA Open Model License Agreement
6
+ Last Modified: October 24, 2025
7
+ https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/
8
+
9
+ This NVIDIA Open Model License Agreement (the “Agreement”) is a legal agreement between the Legal Entity You represent, or if no entity is identified, You and NVIDIA Corporation and its Affiliates (“NVIDIA”) and governs Your use of the Models that NVIDIA provides to You under this Agreement. NVIDIA and You are each a “party” and collectively the “parties.”
10
+
11
+ NVIDIA models released under this Agreement are intended to be used permissively and enable the further development of AI technologies. Subject to the terms of this Agreement, NVIDIA confirms that:
12
+
13
+ - Models are commercially usable.
14
+ - You are free to create and distribute Derivative Models.
15
+ - NVIDIA does not claim ownership to any outputs generated using the Models or Derivative Models.
16
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Model, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement.
17
+
18
+ 1. Definitions. The following definitions apply to this Agreement:
19
+
20
+ 1.1 "Derivative Model" means all (a) modifications to the Model, (b) works based on the Model, and (c) any other derivative works of the Model. An output is not a Derivative Model.
21
+
22
+ 1.2 "Legal Entity" means the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares, or (c) beneficial ownership of such entity.
23
+
24
+ 1.3 “Model” means the machine learning model, software, checkpoints, learnt weights, algorithms, parameters, configuration files and documentation shared under this Agreement.
25
+
26
+ 1.4 "NVIDIA Cosmos Model" means a multimodal Model shared under this Agreement
27
+
28
+ 1.5 "Special-Purpose Model" means a Model that is only competent in a narrow set of purpose-specific tasks and should not be used for unintended or general-purpose applications
29
+
30
+ 1.6 “You” or “Your” means an individual or Legal Entity exercising permissions granted by this Agreement.
31
+
32
+ 2. Conditions for Use, License Grant, AI Ethics and IP Ownership.
33
+
34
+ 2.1 Conditions for Use. The Model and any Derivative Model are subject to additional terms as described in Section 2 and Section 3 of this Agreement and govern Your use. If You institute copyright or patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model or a Derivative Model constitutes direct or contributory copyright or patent infringement, then any licenses granted to You under this Agreement for that Model or Derivative Model will terminate as of the date such litigation is filed. If You bypass, disable, reduce the efficacy of, or circumvent any technical limitation, safety guardrail or associated safety guardrail hyperparameter, encryption, security, digital rights management, or authentication mechanism (collectively “Guardrail”) contained in the Model without a substantially similar Guardrail appropriate for your use case, your rights under this Agreement will automatically terminate. NVIDIA may indicate in relevant documentation that a Model is a Special-Purpose Model. NVIDIA may update this Agreement to comply with legal and regulatory requirements at any time and You agree to either comply with any updated license or cease Your copying, use, and distribution of the Model and any Derivative Model.
35
+
36
+ 2.2 License Grant. The rights granted herein are explicitly conditioned on Your full compliance with the terms of this Agreement. Subject to the terms and conditions of this Agreement, NVIDIA hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, revocable (as stated in Section 2.1) license to publicly perform, publicly display, reproduce, use, create derivative works of, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution) and import the Model.
37
+
38
+ 2.3 AI Ethics. Use of the Models under the Agreement must be consistent with NVIDIA’s Trustworthy AI terms found at https://www.nvidia.com/en-us/agreements/trustworthy-ai/terms/.
39
+
40
+ 2.4 NVIDIA owns the Model and any Derivative Models created by NVIDIA. Subject to NVIDIA’s underlying ownership rights in the Model or its Derivative Models, You are and will be the owner of Your Derivative Models. NVIDIA claims no ownership rights in outputs. You are responsible for outputs and their subsequent uses. Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Model and (b) no other license or right is granted to you by implication, estoppel or otherwise.
41
+
42
+ 3. Redistribution. You may reproduce and distribute copies of the Model or Derivative Models thereof in any medium, with or without modifications, provided that You meet the following conditions:
43
+
44
+ 3.1 If you distribute the Model, You must give any other recipients of the Model a copy of this Agreement and include the following attribution notice within a “Notice” text file with such copies: “Licensed by NVIDIA Corporation under the NVIDIA Open Model License”;
45
+
46
+ 3.2 If you distribute or make available a NVIDIA Cosmos Model, or a product or service (including an AI model) that contains or uses a NVIDIA Cosmos Model, use a NVIDIA Cosmos Model to create a Derivative Model, or use a NVIDIA Cosmos Model or its outputs to create, train, fine tune, or otherwise improve an AI model, you will include “Built on NVIDIA Cosmos” on a related website, user interface, blogpost, about page, or product documentation; and
47
+
48
+ 3.3 You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Models as a whole, provided Your use, reproduction, and distribution of the Model otherwise complies with the conditions stated in this Agreement.
49
+
50
+ 4. Separate Components. The Models may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as an Open Source Software License or other third-party license. The components are subject to the applicable other licenses, including any proprietary notices, disclaimers, requirements and extended use rights; except that this Agreement will prevail regarding the use of third-party Open Source Software License, unless a third-party Open Source Software License requires its license terms to prevail. “Open Source Software License” means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (https://opensource.org), Free Software Foundation (https://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (https://www.spdx.org).
51
+
52
+ 5. Trademarks. This Agreement does not grant permission to use the trade names, trademarks, service marks, or product names of NVIDIA, except as required for reasonable and customary use in describing the origin of the Model and reproducing the content of the “Notice” text file.
53
+
54
+ 6. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, NVIDIA provides the Model on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for reviewing Model documentation, including any Special-Purpose Model limitations, and determining the appropriateness of using or redistributing the Model, Derivative Models and outputs. You assume any risks associated with Your exercise of permissions under this Agreement.
55
+
56
+ 7. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, will NVIDIA be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this Agreement or out of the use or inability to use the Model, Derivative Models or outputs (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if NVIDIA has been advised of the possibility of such damages.
57
+
58
+ 8. Indemnity. You will indemnify and hold harmless NVIDIA from and against any claim by any third party arising out of or related to your use or distribution of the Model, Derivative Models or outputs.
59
+
60
+ 9. Feedback. NVIDIA appreciates your feedback, and You agree that NVIDIA may use it without restriction or compensation to You.
61
+
62
+ 10. Governing Law. This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that, either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.
63
+
64
+ 11. Trade and Compliance. You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use.
65
+
66
+ Version Release Date: October 24, 2025
67
+
68
+ -----------------
69
+ Apache License
70
+ Version 2.0, January 2004
71
+ http://www.apache.org/licenses/
72
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
73
+ 1. Definitions.
74
+ "License" shall mean the terms and conditions for use, reproduction,
75
+ and distribution as defined by Sections 1 through 9 of this document.
76
+ "Licensor" shall mean the copyright owner or entity authorized by
77
+ the copyright owner that is granting the License.
78
+ "Legal Entity" shall mean the union of the acting entity and all
79
+ other entities that control, are controlled by, or are under common
80
+ control with that entity. For the purposes of this definition,
81
+ "control" means (i) the power, direct or indirect, to cause the
82
+ direction or management of such entity, whether by contract or
83
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
84
+ outstanding shares, or (iii) beneficial ownership of such entity.
85
+ "You" (or "Your") shall mean an individual or Legal Entity
86
+ exercising permissions granted by this License.
87
+ "Source" form shall mean the preferred form for making modifications,
88
+ including but not limited to software source code, documentation
89
+ source, and configuration files.
90
+ "Object" form shall mean any form resulting from mechanical
91
+ transformation or translation of a Source form, including but
92
+ not limited to compiled object code, generated documentation,
93
+ and conversions to other media types.
94
+ "Work" shall mean the work of authorship, whether in Source or
95
+ Object form, made available under the License, as indicated by a
96
+ copyright notice that is included in or attached to the work
97
+ (an example is provided in the Appendix below).
98
+ "Derivative Works" shall mean any work, whether in Source or Object
99
+ form, that is based on (or derived from) the Work and for which the
100
+ editorial revisions, annotations, elaborations, or other modifications
101
+ represent, as a whole, an original work of authorship. For the purposes
102
+ of this License, Derivative Works shall not include works that remain
103
+ separable from, or merely link (or bind by name) to the interfaces of,
104
+ the Work and Derivative Works thereof.
105
+ "Contribution" shall mean any work of authorship, including
106
+ the original version of the Work and any modifications or additions
107
+ to that Work or Derivative Works thereof, that is intentionally
108
+ submitted to Licensor for inclusion in the Work by the copyright owner
109
+ or by an individual or Legal Entity authorized to submit on behalf of
110
+ the copyright owner. For the purposes of this definition, "submitted"
111
+ means any form of electronic, verbal, or written communication sent
112
+ to the Licensor or its representatives, including but not limited to
113
+ communication on electronic mailing lists, source code control systems,
114
+ and issue tracking systems that are managed by, or on behalf of, the
115
+ Licensor for the purpose of discussing and improving the Work, but
116
+ excluding communication that is conspicuously marked or otherwise
117
+ designated in writing by the copyright owner as "Not a Contribution."
118
+ "Contributor" shall mean Licensor and any individual or Legal Entity
119
+ on behalf of whom a Contribution has been received by Licensor and
120
+ subsequently incorporated within the Work.
121
+ 2. Grant of Copyright License. Subject to the terms and conditions of
122
+ this License, each Contributor hereby grants to You a perpetual,
123
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
124
+ copyright license to reproduce, prepare Derivative Works of,
125
+ publicly display, publicly perform, sublicense, and distribute the
126
+ Work and such Derivative Works in Source or Object form.
127
+ 3. Grant of Patent License. Subject to the terms and conditions of
128
+ this License, each Contributor hereby grants to You a perpetual,
129
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
130
+ (except as stated in this section) patent license to make, have made,
131
+ use, offer to sell, sell, import, and otherwise transfer the Work,
132
+ where such license applies only to those patent claims licensable
133
+ by such Contributor that are necessarily infringed by their
134
+ Contribution(s) alone or by combination of their Contribution(s)
135
+ with the Work to which such Contribution(s) was submitted. If You
136
+ institute patent litigation against any entity (including a
137
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
138
+ or a Contribution incorporated within the Work constitutes direct
139
+ or contributory patent infringement, then any patent licenses
140
+ granted to You under this License for that Work shall terminate
141
+ as of the date such litigation is filed.
142
+ 4. Redistribution. You may reproduce and distribute copies of the
143
+ Work or Derivative Works thereof in any medium, with or without
144
+ modifications, and in Source or Object form, provided that You
145
+ meet the following conditions:
146
+ (a) You must give any other recipients of the Work or
147
+ Derivative Works a copy of this License; and
148
+ (b) You must cause any modified files to carry prominent notices
149
+ stating that You changed the files; and
150
+ (c) You must retain, in the Source form of any Derivative Works
151
+ that You distribute, all copyright, patent, trademark, and
152
+ attribution notices from the Source form of the Work,
153
+ excluding those notices that do not pertain to any part of
154
+ the Derivative Works; and
155
+ (d) If the Work includes a "NOTICE" text file as part of its
156
+ distribution, then any Derivative Works that You distribute must
157
+ include a readable copy of the attribution notices contained
158
+ within such NOTICE file, excluding those notices that do not
159
+ pertain to any part of the Derivative Works, in at least one
160
+ of the following places: within a NOTICE text file distributed
161
+ as part of the Derivative Works; within the Source form or
162
+ documentation, if provided along with the Derivative Works; or,
163
+ within a display generated by the Derivative Works, if and
164
+ wherever such third-party notices normally appear. The contents
165
+ of the NOTICE file are for informational purposes only and
166
+ do not modify the License. You may add Your own attribution
167
+ notices within Derivative Works that You distribute, alongside
168
+ or as an addendum to the NOTICE text from the Work, provided
169
+ that such additional attribution notices cannot be construed
170
+ as modifying the License.
171
+ You may add Your own copyright statement to Your modifications and
172
+ may provide additional or different license terms and conditions
173
+ for use, reproduction, or distribution of Your modifications, or
174
+ for any such Derivative Works as a whole, provided Your use,
175
+ reproduction, and distribution of the Work otherwise complies with
176
+ the conditions stated in this License.
177
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
178
+ any Contribution intentionally submitted for inclusion in the Work
179
+ by You to the Licensor shall be under the terms and conditions of
180
+ this License, without any additional terms or conditions.
181
+ Notwithstanding the above, nothing herein shall supersede or modify
182
+ the terms of any separate license agreement you may have executed
183
+ with Licensor regarding such Contributions.
184
+ 6. Trademarks. This License does not grant permission to use the trade
185
+ names, trademarks, service marks, or product names of the Licensor,
186
+ except as required for reasonable and customary use in describing the
187
+ origin of the Work and reproducing the content of the NOTICE file.
188
+ 7. Disclaimer of Warranty. Unless required by applicable law or
189
+ agreed to in writing, Licensor provides the Work (and each
190
+ Contributor provides its Contributions) on an "AS IS" BASIS,
191
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
192
+ implied, including, without limitation, any warranties or conditions
193
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
194
+ PARTICULAR PURPOSE. You are solely responsible for determining the
195
+ appropriateness of using or redistributing the Work and assume any
196
+ risks associated with Your exercise of permissions under this License.
197
+ 8. Limitation of Liability. In no event and under no legal theory,
198
+ whether in tort (including negligence), contract, or otherwise,
199
+ unless required by applicable law (such as deliberate and grossly
200
+ negligent acts) or agreed to in writing, shall any Contributor be
201
+ liable to You for damages, including any direct, indirect, special,
202
+ incidental, or consequential damages of any character arising as a
203
+ result of this License or out of the use or inability to use the
204
+ Work (including but not limited to damages for loss of goodwill,
205
+ work stoppage, computer failure or malfunction, or any and all
206
+ other commercial damages or losses), even if such Contributor
207
+ has been advised of the possibility of such damages.
208
+ 9. Accepting Warranty or Additional Liability. While redistributing
209
+ the Work or Derivative Works thereof, You may choose to offer,
210
+ and charge a fee for, acceptance of support, warranty, indemnity,
211
+ or other liability obligations and/or rights consistent with this
212
+ License. However, in accepting such obligations, You may act only
213
+ on Your own behalf and on Your sole responsibility, not on behalf
214
+ of any other Contributor, and only if You agree to indemnify,
215
+ defend, and hold each Contributor harmless for any liability
216
+ incurred by, or claims asserted against, such Contributor by reason
217
+ of your accepting any such warranty or additional liability.
218
+ END OF TERMS AND CONDITIONS
219
+ APPENDIX: How to apply the Apache License to your work.
220
+ To apply the Apache License to your work, attach the following
221
+ boilerplate notice, with the fields enclosed by brackets "[]"
222
+ replaced with your own identifying information. (Don't include
223
+ the brackets!) The text should be enclosed in the appropriate
224
+ comment syntax for the file format. We also recommend that a
225
+ file or class name and description of purpose be included on the
226
+ same "printed page" as the copyright notice for easier
227
+ identification within third-party archives.
228
+
229
+ Copyright [yyyy] [name of copyright owner]
230
+
231
+ Licensed under the Apache License, Version 2.0 (the "License");
232
+ you may not use this file except in compliance with the License.
233
+ You may obtain a copy of the License at
234
+
235
+ http://www.apache.org/licenses/LICENSE-2.0
236
+
237
+ Unless required by applicable law or agreed to in writing, software
238
+ distributed under the License is distributed on an "AS IS" BASIS,
239
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
240
+ See the License for the specific language governing permissions and
241
+ limitations under the License.
242
+
243
+
README.md ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: nvidia-open-model-license
4
+ license_link: >-
5
+ https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/
6
+ language:
7
+ - en
8
+ - zh
9
+ - ja
10
+ - ko
11
+ - ru
12
+ pipeline_tag: image-to-text
13
+ arxiv: None
14
+ tags:
15
+ - image
16
+ - ocr
17
+ - object recognition
18
+ - text recognition
19
+ - layout analysis
20
+ - ingestion
21
+ - multilingual
22
+ ---
23
+
24
+ # Nemotron OCR v2 (multilingual)
25
+
26
+ ## **Model Overview**
27
+
28
+ ### **Description**
29
+
30
+ Nemotron OCR v2 is a state-of-the-art multilingual text recognition model designed for robust end-to-end optical character recognition (OCR) on complex real-world images. It integrates three core neural network modules: a detector for text region localization, a recognizer for transcription of detected regions, and a relational model for layout and structure analysis.
31
+
32
+ This model is optimized for a wide variety of OCR tasks, including multi-line, multi-block, and natural scene text, and it supports advanced reading order analysis via its relational model component. Nemotron OCR v2 supports multiple languages and has been developed to be production-ready and commercially usable, with a focus on speed and accuracy on both document and natural scene images.
33
+
34
+ Nemotron OCR v2 is part of the NVIDIA NeMo Retriever collection, which provides state-of-the-art, commercially-ready models and microservices optimized for the lowest latency and highest throughput. It features a production-ready information retrieval pipeline with enterprise support. The models that form the core of this solution have been trained using responsibly selected, auditable data sources. With multiple pre-trained models available as starting points, developers can readily customize them for domain-specific use cases, such as information technology, human resource help assistants, and research and development assistants.
35
+
36
+ This model is ready for commercial use.
37
+
38
+ ### **License/Terms of use**
39
+
40
+ The use of this model is governed by the [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt).
41
+
42
+ ### Release Date: <br>
43
+ Hugging Face (this repo) [nvidia/nemotron-ocr-v2-multilingual](https://huggingface.co/nvidia/nemotron-ocr-v2-multilingual) <br>
44
+ Collection / variant hub: [nvidia/nemotron-ocr-v2](https://huggingface.co/nvidia/nemotron-ocr-v2) <br>
45
+ Build.Nvidia.com 04/15/2026 via [https://build.nvidia.com/nvidia/nemotron-ocr-v2](https://build.nvidia.com/nvidia/nemotron-ocr-v2) <br>
46
+ NGC 04/15/2026 via [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo-microservices/containers/nemoretriever-ocr-v2](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo-microservices/containers/nemoretriever-ocr-v2) <br>
47
+
48
+ ### Deployment Geography
49
+
50
+ Global
51
+
52
+ ### Use Case
53
+
54
+ **Nemotron OCR v2** is designed for high-accuracy and high-speed extraction of textual information from images across multiple languages, making it ideal for powering multimodal retrieval systems, Retrieval-Augmented Generation (RAG) pipelines, and agentic applications that require seamless integration of visual and language understanding. Its robust multilingual performance and efficiency make it an excellent choice for next-generation AI systems that demand both precision and scalability across diverse real-world content.
55
+
56
+ ### **Model Architecture**
57
+
58
+ **Architecture Type:** Hybrid detector-recognizer with document-level relational modeling
59
+
60
+ Nemotron OCR v2 is available in two variants:
61
+
62
+ - **v2_english** — Optimized for English-language OCR with a compact recognizer for lower latency.
63
+ - **v2_multilingual** — Supports English, Chinese (Simplified and Traditional), Japanese, Korean, and Russian with a larger recognizer to accommodate the expanded character set.
64
+
65
+ Both variants share the same three-component architecture:
66
+
67
+ - **Text Detector:** Utilizes a RegNetX-8GF convolutional backbone for high-accuracy localization of text regions within images.
68
+ - **Text Recognizer:** Employs a pre-norm Transformer-based sequence recognizer to transcribe text from detected regions, supporting variable word and line lengths.
69
+ - **Relational Model:** Applies a multi-layer global relational module to predict logical groupings, reading order, and layout relationships across detected text elements.
70
+
71
+ All components are trained jointly in an end-to-end fashion, providing robust, scalable, and production-ready OCR for diverse document and scene images.
72
+
73
+ **Network Architecture**: RegNetX-8GF
74
+
75
+ #### Recognizer Comparison
76
+
77
+ The two variants share an identical detector and relational architecture but differ in recognizer capacity:
78
+
79
+ | Spec | v2_english | v2_multilingual |
80
+ |------|-----------|----------------|
81
+ | Transformer layers | 3 | 6 |
82
+ | Hidden dimension (`d_model`) | 256 | 512 |
83
+ | FFN width (`dim_feedforward`) | 1024 | 2048 |
84
+ | Attention heads | 8 | 8 |
85
+ | Max sequence length | 32 | 128 |
86
+ | Character set size | 855 | 14,244 |
87
+
88
+ #### Parameter Counts
89
+
90
+ **v2_english** (from `v2_english/`):
91
+
92
+ | Component | Parameters |
93
+ |-------------------|-------------|
94
+ | Detector | 45,445,259 |
95
+ | Recognizer | 6,130,657 |
96
+ | Relational model | 2,255,419 |
97
+ | **Total** | **53,831,335** |
98
+
99
+ **v2_multilingual** (this repository: `checkpoints/`):
100
+
101
+ | Component | Parameters |
102
+ |-------------------|-------------|
103
+ | Detector | 45,445,259 |
104
+ | Recognizer | 36,119,598 |
105
+ | Relational model | 2,288,187 |
106
+ | **Total** | **83,853,044** |
107
+
108
+ ### **Input**
109
+
110
+ | Property | Value |
111
+ |------------------|-------------------|
112
+ | Input Type & Format | Image (RGB, PNG/JPEG, float32/uint8), aggregation level (word, sentence, or paragraph) |
113
+ | Input Parameters (Two-Dimensional) | 3 x H x W (single image) or B x 3 x H x W (batch) |
114
+ | Input Range | [0, 1] (float32) or [0, 255] (uint8, auto-converted) |
115
+ | Other Properties | Handles both single images and batches. Automatic multi-scale resizing for best accuracy. |
116
+
117
+ ### **Output**
118
+
119
+ | Property | Value |
120
+ |-----------------|-------------------|
121
+ | Output Type | Structured OCR results: a list of detected text regions (bounding boxes), recognized text, and confidence scores |
122
+ | Output Format | Bounding boxes: tuple of floats, recognized text: string, confidence score: float |
123
+ | Output Parameters | Bounding boxes: One-Dimensional (1D) list of bounding box coordinates, recognized text: One-Dimensional (1D) list of strings, confidence score: One-Dimensional (1D) list of floats |
124
+ | Other Properties | Please see the sample output for an example of the model output |
125
+
126
+ ### Sample output
127
+
128
+ ```
129
+ ocr_boxes = [[[15.552736282348633, 43.141815185546875],
130
+ [150.00149536132812, 43.141815185546875],
131
+ [150.00149536132812, 56.845645904541016],
132
+ [15.552736282348633, 56.845645904541016]],
133
+ [[298.3145751953125, 44.43315124511719],
134
+ [356.93585205078125, 44.43315124511719],
135
+ [356.93585205078125, 57.34814453125],
136
+ [298.3145751953125, 57.34814453125]],
137
+ [[15.44686508178711, 13.67985725402832],
138
+ [233.15859985351562, 13.67985725402832],
139
+ [233.15859985351562, 27.376562118530273],
140
+ [15.44686508178711, 27.376562118530273]],
141
+ [[298.51727294921875, 14.268900871276855],
142
+ [356.9850769042969, 14.268900871276855],
143
+ [356.9850769042969, 27.790447235107422],
144
+ [298.51727294921875, 27.790447235107422]]]
145
+
146
+ ocr_txts = ['The previous notice was dated',
147
+ '22 April 2016',
148
+ 'The previous notice was given to the company on',
149
+ '22 April 2016']
150
+
151
+ ocr_confs = [0.97730815, 0.98834222, 0.96804602, 0.98499225]
152
+ ```
153
+
154
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions.
155
+
156
+
157
+ ### Usage
158
+
159
+ #### Prerequisites
160
+
161
+ - **OS**: Linux amd64 with NVIDIA GPU
162
+ - **CUDA**: CUDA Toolkit 12.8 and compatible NVIDIA driver installed (for PyTorch CUDA). Verify with `nvidia-smi`.
163
+ - **Python**: 3.12 (both subpackages require `python = ~3.12`)
164
+ - **Build tools (when building the C++ extension)**:
165
+ - GCC/G++ with C++17 support
166
+ - CUDA toolkit headers (for building CUDA kernels)
167
+ - OpenMP (used by the C++ extension)
168
+
169
+
170
+ #### Installation
171
+ The model requires torch, and the custom code available in this repository.
172
+
173
+ 1. Clone the repository
174
+
175
+ - Make sure git-lfs is installed (https://git-lfs.com)
176
+ ```
177
+ git lfs install
178
+ ```
179
+
180
+ 2. Installation
181
+
182
+ ##### With pip
183
+
184
+ - Create and activate a Python 3.12 environment (optional)
185
+
186
+ - Run the following command to install the package:
187
+
188
+ ```bash
189
+ cd nemotron-ocr
190
+ pip install hatchling
191
+ pip install -v .
192
+ ```
193
+
194
+ ##### With docker
195
+
196
+ Run the example end-to-end without installing anything on the host (besides Docker, docker compose, and NVIDIA Container Toolkit):
197
+
198
+ - Ensure Docker can see your GPU:
199
+
200
+ ```bash
201
+ docker run --rm --gpus all nvcr.io/nvidia/pytorch:25.09-py3 nvidia-smi
202
+ ```
203
+
204
+ - From the repo root, bring up the service to run the example (sample image `ocr-example-input-1.png` when present):
205
+
206
+ ```bash
207
+ docker compose run --rm nemotron-ocr \
208
+ bash -lc "python example.py ocr-example-input-1.png --merge-level paragraph"
209
+ ```
210
+
211
+ This will:
212
+ - Build an image from the provided `Dockerfile` (based on `nvcr.io/nvidia/pytorch`)
213
+ - Mount the repo at `/workspace`
214
+ - Run `example.py` (downloads **v2 multilingual** from Hugging Face on first run unless you pass `--model-dir`)
215
+
216
+ Output is saved next to your input image as `<name>-annotated.<ext>` on the host.
217
+
218
+
219
+ 3. Run the model using the following code.
220
+
221
+ Use `nemotron_ocr.inference.pipeline.NemotronOCR`. With no arguments, checkpoints are downloaded from Hugging Face: **by default** the **v2 multilingual** bundle ([`nvidia/nemotron-ocr-v2-multilingual`](https://huggingface.co/nvidia/nemotron-ocr-v2-multilingual), `checkpoints/`). Use `lang="en"` for the English-optimized v2 build (`nvidia/nemotron-ocr-v2` / `v2_english/`), or pass `model_dir` to load from disk (any complete checkpoint folder; `lang` is then ignored).
222
+
223
+ ```python
224
+ from nemotron_ocr.inference.pipeline import NemotronOCR
225
+
226
+ # Default: Hugging Face v2 multilingual
227
+ ocr = NemotronOCR()
228
+
229
+ # English-optimized v2 (Hub)
230
+ ocr_en = NemotronOCR(lang="en")
231
+
232
+ # Multilingual v2 explicitly (same default as NemotronOCR())
233
+ ocr_multi = NemotronOCR(lang="multi")
234
+
235
+ # Local directory with detector.pth, recognizer.pth, relational.pth, charset.txt (this repo: ./checkpoints)
236
+ ocr_local = NemotronOCR(model_dir="./checkpoints")
237
+
238
+ # Legacy v1 weights from Hub (optional)
239
+ ocr_v1 = NemotronOCR(lang="v1")
240
+
241
+ predictions = ocr("ocr-example-input-1.png")
242
+
243
+ for pred in predictions:
244
+ print(
245
+ f" - Text: '{pred['text']}', "
246
+ f"Confidence: {pred['confidence']:.2f}, "
247
+ f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
248
+ )
249
+ ```
250
+
251
+ **Constructor rules**
252
+
253
+ - **`model_dir`**: If it contains all four checkpoint files, that directory is used and **`lang` is ignored**.
254
+ - **`lang`** (keyword only): When weights are fetched from the Hub — `None` or `"multi"` / `"multilingual"` → [nvidia/nemotron-ocr-v2-multilingual](https://huggingface.co/nvidia/nemotron-ocr-v2-multilingual) `checkpoints/` (default); `"en"` / `"english"` → `nvidia/nemotron-ocr-v2` / `v2_english/`; `"v1"` / `"legacy"` → original v1 layout on `nvidia/nemotron-ocr-v1`.
255
+ - If `model_dir` is set but incomplete, the client falls back to a Hub download using **`lang`** (defaulting to v2 multilingual when `lang` is `None`).
256
+
257
+ ### Software Integration
258
+
259
+ **Runtime Engine(s):**
260
+ - PyTorch
261
+
262
+ **Supported Hardware Microarchitecture Compatibility:**
263
+ - NVIDIA Ampere
264
+ - NVIDIA Blackwell
265
+ - NVIDIA Hopper
266
+ - NVIDIA Lovelace
267
+
268
+ **Preferred/Supported Operating System(s):**
269
+ - Linux
270
+
271
+ ## Model Version(s)
272
+
273
+ * **This repository:** Nemotron OCR **v2 multilingual** (`checkpoints/`).
274
+ * **Related:** [nvidia/nemotron-ocr-v2](https://huggingface.co/nvidia/nemotron-ocr-v2) hosts the **v2 English** variant (`v2_english/`) and collection metadata.
275
+
276
+ ## **Training and Evaluation Datasets:**
277
+
278
+ ### **Training Dataset**
279
+
280
+ **Data Modality**
281
+ * Image
282
+
283
+ **Image Training Data Size**
284
+ * Approximately 12 million images
285
+
286
+ The model is trained on a large-scale, curated mix of real-world and synthetic OCR datasets spanning multiple languages, scripts, and document types.
287
+
288
+ **Real-world datasets (~680K images):** Natural scene text, multilingual scene text, arbitrary-shaped text, chart and infographic text, table images with bilingual annotations, and handwritten document pages. These cover diverse layouts, languages, and document types.
289
+
290
+ **Synthetic datasets (~11M+ images):** Rendered multilingual document pages in six languages (English, Japanese, Korean, Russian, Chinese Simplified, and Chinese Traditional) and synthetic historical document crops covering archaic characters with degradation effects.
291
+
292
+ **Data Collection Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
293
+ **Labeling Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
294
+ **Properties:** Includes scanned documents, natural scene images, charts, tables, infographics, handwritten documents, and synthetic rendered pages in multiple languages and scripts.
295
+
296
+ ### **Evaluation Datasets**
297
+
298
+ Nemotron OCR v2 is evaluated on [OmniDocBench](https://github.com/opendatalab/OmniDocBench), a comprehensive document OCR benchmark covering English, Chinese, and mixed-language content across diverse document categories.
299
+
300
+ **Data Collection Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
301
+ **Labeling Method by dataset:** Hybrid (Automated, Human, Synthetic)<br>
302
+ **Properties:** Benchmarks include challenging scene images, documents with varied layouts, and multi-language data.
303
+
304
+ ### **Evaluation Results**
305
+
306
+ Tables below are **reference metrics** from NVIDIA’s benchmark runs (OmniDocBench, SynthDoG). Reproducing them requires datasets and scripts that are **not** checked into this Hugging Face repository.
307
+
308
+ #### OmniDocBench
309
+
310
+ Normalized Edit Distance (NED) sample_avg on OmniDocBench (lower = better). Results follow OmniDocBench methodology (empty predictions skipped). All models evaluated in crop mode. Speed measured on a single A100 GPU.
311
+
312
+ | Model | crops/s | pages/s | EN | ZH | Mixed | White | Single | Multi | Normal | Rotate90 | Rotate270 | Horizontal |
313
+ | :--- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
314
+ | PaddleOCR v5 (server) | 20.6 | 1.2 | 0.027 | 0.037 | 0.041 | 0.031 | 0.035 | 0.064 | 0.031 | 0.116 | 0.897 | 0.027 |
315
+ | OpenOCR (server) | 17.4 | 1.5 | 0.024 | 0.033 | 0.049 | 0.027 | 0.034 | 0.061 | 0.028 | 0.042 | 0.761 | 0.034 |
316
+ | **Nemotron OCR v2 (Multilingual)** | **68.1** | **21.8** | **0.048** | **0.072** | **0.142** | **0.061** | **0.049** | **0.117** | **0.062** | **0.109** | **0.332** | **0.372** |
317
+ | *Nemotron OCR v2 (EN)* | *74.6* | *19.9* | *0.038* | *0.830* | *0.437* | *0.348* | *0.282* | *0.572* | *0.353* | *0.232* | *0.827* | *0.893* |
318
+ | EasyOCR | 10.3 | 0.4 | 0.095 | 0.117 | 0.326 | 0.095 | 0.179 | 0.322 | 0.110 | 0.987 | 0.979 | 0.809 |
319
+ | Tesseract-OCR | | | 0.096 | 0.551 | 0.250 | 0.439 | 0.328 | 0.331 | 0.426 | 0.117 | 0.969 | 0.984 |
320
+ | *Nemotron OCR v1* | *61.1* | *21.4* | *0.038* | *0.876* | *0.436* | *0.472* | *0.434* | *0.715* | *0.482* | *0.358* | *0.871* | *0.979* |
321
+
322
+ Column key: **crops/s** and **pages/s** are throughput using the v2 batched pipeline where measured; **EN** = English, **ZH** = Simplified Chinese, **Mixed** = English/Chinese mixed, **White/Single/Multi** = background type, **Normal/Rotate90/Rotate270/Horizontal** = text orientation.
323
+
324
+ #### [SynthDoG](https://github.com/clovaai/donut/tree/master/synthdog) Generated Benchmark Data
325
+
326
+ Normalized Edit Distance (NED) page_avg on [SynthDoG](https://github.com/clovaai/donut/tree/master/synthdog) generated benchmark data (lower = better):
327
+
328
+ | Language | PaddleOCR (base) | PaddleOCR (specialized) | OpenOCR (server) | Nemotron OCR v1 | *Nemotron OCR v2 (EN)* | **Nemotron OCR v2** |
329
+ | :--- | ---: | ---: | ---: | ---: | ---: | ---: |
330
+ | English | 0.117 | 0.096 | 0.105 | 0.078 | *0.079* | **0.069** |
331
+ | Japanese | 0.201 | 0.201 | 0.586 | 0.723 | *0.765* | **0.046** |
332
+ | Korean | 0.943 | 0.133 | 0.837 | 0.923 | *0.924* | **0.047** |
333
+ | Russian | 0.959 | 0.163 | 0.950 | 0.564 | *0.632* | **0.043** |
334
+ | Chinese (Simplified) | 0.054 | 0.054 | 0.061 | 0.784 | *0.819* | **0.035** |
335
+ | Chinese (Traditional) | 0.094 | 0.094 | 0.127 | 0.700 | *0.756* | **0.065** |
336
+
337
+ ### **Detailed Performance Analysis**
338
+
339
+ The model demonstrates robust multilingual performance on complex layouts, noisy backgrounds, and challenging real-world scenes. Reading order and block detection are powered by the relational module, supporting downstream applications such as chart-to-text, table-to-text, and infographic-to-text extraction.
340
+
341
+ **Inference**<br>
342
+ **Acceleration Engine:** PyTorch<br>
343
+ **Supported Hardware:** H100 PCIe/SXM, A100 PCIe/SXM, L40S, L4, A10G, H200 NVL, B200, RTX PRO 6000 Blackwell Server Edition<br>
344
+
345
+ ## Ethical Considerations
346
+
347
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. <br>
348
+ The integration of foundation and fine-tuned models into AI systems requires additional testing using use-case-specific data to ensure safe and effective deployment. Following the V-model methodology, iterative testing and validation at both unit and system levels are essential to mitigate risks, meet technical and functional requirements, and ensure compliance with safety and ethical standards before deployment. <br>
349
+ Please make sure you have proper rights and permissions for all input image and video content; if image or video includes people, personal health information, or intellectual property, the image or video generated will not blur or maintain proportions of image subjects included. <br>
350
+ For more detailed information on ethical considerations for this model, please see the [Explainability](#explainability), [Bias](#bias), [Safety](#safety) & Security, and [Privacy](#privacy) sections below. <br>
351
+ Please report security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
352
+
353
+ ## Bias
354
+
355
+ | Field | Response |
356
+ | ----- | ----- |
357
+ | Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing | None |
358
+ | Measures taken to mitigate against unwanted bias | None |
359
+
360
+
361
+ ## Explainability
362
+
363
+ | Field | Response |
364
+ | ----- | ----- |
365
+ | Intended Task/Domain: | Optical Character Recognition (OCR) with a focus on retrieval application and documents. |
366
+ | Model Type: | Hybrid neural network with convolutional detector, transformer recognizer, and document structure modeling. |
367
+ | Intended Users: | Developers and teams building AI-driven search applications, retrieval-augmented generation (RAG) workflows, multimodal agents, or document intelligence applications. It is ideal for those working with large collections of scanned or photographed documents, including PDFs, forms, and reports. |
368
+ | Output: | Structured OCR results, including detected bounding boxes, recognized text, and confidence scores. |
369
+ | Describe how the model works: | The model first detects text regions in the image, then transcribes recognized text, and finally analyzes document structure and reading order. Outputs structured, machine-readable results suitable for downstream search and analysis. |
370
+ | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable |
371
+ | Technical Limitations & Mitigation: | Performance may vary across languages and scripts. |
372
+ | Verified to have met prescribed NVIDIA quality standards: | Yes |
373
+ | Performance Metrics: | Accuracy (e.g., character error rate), throughput, and latency. |
374
+ | Potential Known Risks: | The model may not always extract or transcribe all text with perfect accuracy, particularly in cases of poor image quality or highly stylized fonts. |
375
+ | Licensing & Terms of Use: | Use of this model is governed by [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). |
376
+
377
+
378
+ ## Privacy
379
+
380
+ | Field | Response |
381
+ | ----- | ----- |
382
+ | Generatable or reverse engineerable personal data? | No |
383
+ | Personal data used to create this model? | None Known |
384
+ | How often is dataset reviewed? | The dataset is initially reviewed when added, and subsequent reviews are conducted as needed or in response to change requests. |
385
+ | Is there provenance for all datasets used in training? | Yes |
386
+ | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
387
+ | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
388
+ | Applicable Privacy Policy | https://www.nvidia.com/en-us/about-nvidia/privacy-policy/ |
389
+ | Was consent obtained for any personal data used? | Not Applicable |
390
+ | Was data from user interactions with the AI model (e.g. user input and prompts) used to train the model? | No |
391
+
392
+
393
+ ## Safety
394
+
395
+ | Field | Response |
396
+ | ----- | ----- |
397
+ | Model Application Field(s): | Text recognition and structured OCR for multimodal retrieval. Inputs can include natural scene images, scanned documents, charts, tables, and infographics. |
398
+ | Use Case Restrictions: | Abide by [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) and the use of the post-processing scripts are licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt). |
399
+ | Model and dataset restrictions: | The principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions enforce dataset access only during training, and all dataset license constraints are adhered to. |
400
+ | Describe the life critical impact (if present): | Not applicable. |
THIRD_PARTY_NOTICES.md ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright "Angus Johnson" - Boost Software License 1.0
2
+ License Text([https://sourceforge.net/p/polyclipping/code/HEAD/tree/tags/6.2.0/License.txt](https://sourceforge.net/p/polyclipping/code/HEAD/tree/tags/6.2.0/License.txt))
3
+
4
+ This notice applies to **clipper**.
5
+
6
+ Copyright (c) 2010-2014 Angus Johnson
7
+
8
+ Boost Software License - Version 1.0 - August 17th, 2003
9
+
10
+ Permission is hereby granted, free of charge, to any person or organization
11
+ obtaining a copy of the software and accompanying documentation covered by
12
+ this license (the "Software") to use, reproduce, display, distribute,
13
+ execute, and transmit the Software, and to prepare derivative works of the
14
+ Software, and to permit third-parties to whom the Software is furnished to
15
+ do so, all subject to the following:
16
+
17
+ The copyright notices in the Software and this entire statement, including
18
+ the above license grant, this restriction and the following disclaimer,
19
+ must be included in all copies of the Software, in whole or in part, and
20
+ all derivative works of the Software, unless such copies or derivative
21
+ works are solely in the form of machine-executable object code generated by
22
+ a source language processor.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
27
+ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
28
+ FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
29
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
30
+ DEALINGS IN THE SOFTWARE.
31
+
32
+ -----
33
+
34
+ Copyright "Ofek Lev" - MIT License
35
+ License Text([https://github.com/pypa/hatch/blob/master/LICENSE.txt](https://github.com/pypa/hatch/blob/master/LICENSE.txt))
36
+
37
+ This notice applies to **hatchling**.
38
+
39
+ Copyright (c) 2017-present Ofek Lev <ofekmeister@gmail.com>
40
+
41
+ Permission is hereby granted, free of charge, to any person obtaining a copy
42
+ of this software and associated documentation files (the "Software"), to deal
43
+ in the Software without restriction, including without limitation the rights
44
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
45
+ copies of the Software, and to permit persons to whom the Software is
46
+ furnished to do so, subject to the following conditions:
47
+
48
+ The above copyright notice and this permission notice shall be included in all
49
+ copies or substantial portions of the Software.
50
+
51
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
53
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
54
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
55
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
56
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
57
+ SOFTWARE.
58
+
59
+ -----
60
+
61
+ Copyright "NumPy Developers" - BSD 3-Clause License
62
+ License Text([https://github.com/numpy/numpy/blob/main/LICENSE.txt](https://github.com/numpy/numpy/blob/main/LICENSE.txt))
63
+
64
+ This notice applies to **numpy**.
65
+
66
+ Copyright (c) 2005-2023, NumPy Developers.
67
+ All rights reserved.
68
+
69
+ Redistribution and use in source and binary forms, with or without
70
+ modification, are permitted provided that the following conditions are
71
+ met:
72
+
73
+ * Redistributions of source code must retain the above copyright
74
+ notice, this list of conditions and the following disclaimer.
75
+
76
+ * Redistributions in binary form must reproduce the above
77
+ copyright notice, this list of conditions and the following
78
+ disclaimer in the documentation and/or other materials provided
79
+ with the distribution.
80
+
81
+ * Neither the name of the NumPy Developers nor the names of any
82
+ contributors may be used to endorse or promote products derived
83
+ from this software without specific prior written permission.
84
+
85
+
86
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+
98
+ -----
99
+
100
+ Copyright "pandas Developers" - BSD 3-Clause License
101
+ License Text([https://github.com/pandas-dev/pandas/blob/main/LICENSE](https://github.com/pandas-dev/pandas/blob/main/LICENSE))
102
+
103
+ This notice applies to **pandas**.
104
+
105
+ Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
106
+ All rights reserved.
107
+
108
+ Copyright (c) 2011-2023, The PyData Development Team
109
+ All rights reserved.
110
+
111
+ Redistribution and use in source and binary forms, with or without
112
+ modification, are permitted provided that the following conditions are
113
+ met:
114
+
115
+
116
+ * Redistributions of source code must retain the above copyright
117
+ notice, this list of conditions and the following disclaimer.
118
+
119
+ * Redistributions in binary form must reproduce the above
120
+ copyright notice, this list of conditions and the following
121
+ disclaimer in the documentation and/or other materials provided
122
+ with the distribution.
123
+
124
+ * Neither the name of the pandas development team nor the names of
125
+ any contributors may be used to endorse or promote products
126
+ derived from this software without specific prior written
127
+ permission.
128
+
129
+
130
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
131
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
132
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
133
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
134
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
135
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
136
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
137
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
138
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
139
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
140
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
141
+
142
+ -----
143
+
144
+ Copyright "Secret Labs AB, Fredrik Lundh, Alex Clark and contributors" - Pillow License
145
+ License Text([https://github.com/python-pillow/Pillow/blob/main/LICENSE](https://github.com/python-pillow/Pillow/blob/main/LICENSE))
146
+
147
+ This notice applies to **PIL (Pillow)**.
148
+
149
+ The Python Imaging Library (PIL) is
150
+ Copyright (c) 1997-2011 by Secret Labs AB
151
+ Copyright (c) 1995-2011 by Fredrik Lundh
152
+ Copyright (c) 2010-2023 by Alex Clark and contributors
153
+
154
+ Like PIL, Pillow is licensed under the open source HPND License:
155
+
156
+ By obtaining, using, and/or copying this software and/or its
157
+ associated documentation, you agree that you have read, understood,
158
+ and will comply with the following terms and conditions:
159
+
160
+ Permission to use, copy, modify, and distribute this software and
161
+ its associated documentation for any purpose and without fee is
162
+ hereby granted, provided that the above copyright notice appears in
163
+ all copies, and that both that copyright notice and this permission
164
+ notice appear in supporting documentation, and that the name of
165
+ Secret Labs AB or the author not be used in advertising or publicity
166
+ pertaining to distribution of the software without specific, written
167
+ prior permission.
168
+
169
+ SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
170
+ TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
171
+ ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
172
+ BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
173
+ DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
174
+ WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
175
+ ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
176
+ OF THIS SOFTWARE.
177
+
178
+ -----
179
+
180
+ Copyright "The scikit-learn developers" - BSD 3-Clause License
181
+ License Text([https://github.com/scikit-learn/scikit-learn/blob/main/COPYING](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING))
182
+
183
+ This notice applies to **scikit-learn**.
184
+
185
+ Copyright (c) 2007-2024 The scikit-learn developers.
186
+ All rights reserved.
187
+
188
+ Redistribution and use in source and binary forms, with or without
189
+ modification, are permitted provided that the following conditions are
190
+ met:
191
+
192
+
193
+ * Redistributions of source code must retain the above copyright
194
+ notice, this list of conditions and the following disclaimer.
195
+
196
+ * Redistributions in binary form must reproduce the above
197
+ copyright notice, this list of conditions and the following
198
+ disclaimer in the documentation and/or other materials provided
199
+ with the distribution.
200
+
201
+ * Neither the name of the scikit-learn developers nor the names of
202
+ any contributors may be used to endorse or promote products
203
+ derived from this software without specific prior written
204
+ permission.
205
+
206
+
207
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
208
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
209
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
210
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
211
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
212
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
213
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
214
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
215
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
216
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
217
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
218
+
219
+ -----
220
+
221
+ Copyright "Jason R. Coombs" - MIT License
222
+ License Text([https://github.com/pypa/setuptools/blob/main/LICENSE](https://github.com/pypa/setuptools/blob/main/LICENSE))
223
+
224
+ This notice applies to **setuptools**.
225
+
226
+ Copyright (c) 2016 Jason R. Coombs <jaraco@jaraco.com>
227
+
228
+ Permission is hereby granted, free of charge, to any person obtaining a copy
229
+ of this software and associated documentation files (the "Software"), to deal
230
+ in the Software without restriction, including without limitation the rights
231
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
232
+ copies of the Software, and to permit persons to whom the Software is
233
+ furnished to do so, subject to the following conditions:
234
+
235
+ The above copyright notice and this permission notice shall be included in all
236
+ copies or substantial portions of the Software.
237
+
238
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
239
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
240
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
241
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
242
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
243
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
244
+ SOFTWARE.
245
+
246
+ -----
247
+
248
+ Copyright "Sean Gillies" - BSD 3-Clause License
249
+ License Text([https://github.com/shapely/shapely/blob/main/LICENSE.txt](https://github.com/shapely/shapely/blob/main/LICENSE.txt))
250
+
251
+ This notice applies to **Shapely**.
252
+
253
+ Copyright (c) 2007, Sean Gillies.
254
+ All rights reserved.
255
+
256
+ Redistribution and use in source and binary forms, with or without
257
+ modification, are permitted provided that the following conditions are met:
258
+
259
+
260
+ * Redistributions of source code must retain the above copyright
261
+ notice, this list of conditions and the following disclaimer.
262
+
263
+ * Redistributions in binary form must reproduce the above copyright
264
+ notice, this list of conditions and the following disclaimer in the
265
+ documentation and/or other materials provided with the distribution.
266
+
267
+ * Neither the name of Sean Gillies nor the names of
268
+ its contributors may be used to endorse or promote products derived from
269
+ this software without specific prior written permission.
270
+
271
+
272
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
273
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
274
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
275
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
276
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
277
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
278
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
279
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
280
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
281
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
282
+ POSSIBILITY OF SUCH DAMAGE.
283
+
284
+ -----
285
+
286
+ Copyright "PyTorch Contributors" - BSD-style License
287
+ License Text([https://github.com/pytorch/pytorch/blob/main/LICENSE](https://github.com/pytorch/pytorch/blob/main/LICENSE))
288
+
289
+ This notice applies to **torch** and **torchvision**.
290
+
291
+ Copyright (c) 2016- Facebook, Inc. (Adam Paszke)
292
+ Copyright (c) 2014- Facebook, Inc. (Soumith Chintala)
293
+ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
294
+ Copyright (c) 2012-2014 DeepMind Technologies (Koray Kavukcuoglu)
295
+ Copyright (c) 2011-2012 NEC Laboratories America (Clement Farabet)
296
+ Copyright (c) 2011-2013 New York University (Antoine Bordes)
297
+ Copyright (c) 2012-2013 University of Montreal (Pascal Vincent)
298
+ Copyright (c) 2014- Google Inc.
299
+ Copyright (c) 2015- Twitter, Inc.
300
+ Copyright (c) 2015- Intel Corporation
301
+ Copyright (c) 2015- AMD Inc.
302
+ Copyright (c) 2016- Baidu, Inc.
303
+ Copyright (c) 2016- Microsoft Corporation
304
+ Copyright (c) 2017- Amazon.com, Inc.
305
+ Copyright (c) 2018- Facebook AI Research
306
+ Copyright (c) 2019- fast.ai, Inc.
307
+ Copyright (c) 2022- PyTorch Contributors
308
+ All rights reserved.
309
+
310
+ Redistribution and use in source and binary forms, with or without
311
+ modification, are permitted provided that the following conditions are met:
312
+
313
+ * Redistributions of source code must retain the above copyright notice, this
314
+ list of conditions and the following disclaimer.
315
+
316
+ * Redistributions in binary form must reproduce the above copyright notice,
317
+ this list of conditions and the following disclaimer in the documentation
318
+ and/or other materials provided with the distribution.
319
+
320
+ * Neither the name of Facebook Inc. nor the names of its contributors may be
321
+ used to endorse or promote products derived from this software without
322
+ specific prior written permission.
323
+
324
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
325
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
326
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
327
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
328
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
329
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
330
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
331
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
332
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
333
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
334
+
335
+ -----
336
+
337
+ Copyright "Baidu USA LLC" - Apache License 2.0
338
+ License Text([https://github.com/bryancatanzaro/trove/blob/master/LICENSE](https://github.com/bryancatanzaro/trove/blob/master/LICENSE))
339
+
340
+ This notice applies to **trove**.
341
+
342
+ Copyright 2015-2016 Baidu USA LLC. All rights reserved.
343
+
344
+ Apache License
345
+ Version 2.0, January 2004
346
+ http://www.apache.org/licenses/
347
+
348
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
349
+
350
+ 1. Definitions.
351
+
352
+ "License" shall mean the terms and conditions for use, reproduction,
353
+ and distribution as defined by Sections 1 through 9 of this document.
354
+
355
+ "Licensor" shall mean the copyright owner or entity authorized by
356
+ the copyright owner that is granting the License.
357
+
358
+ "Legal Entity" shall mean the union of the acting entity and all
359
+ other entities that control, are controlled by, or are under common
360
+ control with that entity. For the purposes of this definition,
361
+ "control" means (i) the power, direct or indirect, to cause the
362
+ direction or management of such entity, whether by contract or
363
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
364
+ outstanding shares, or (iii) beneficial ownership of such entity.
365
+
366
+ "You" (or "Your") shall mean an individual or Legal Entity
367
+ exercising permissions granted by this License.
368
+
369
+ "Source" form shall mean the preferred form for making modifications,
370
+ including but not limited to software source code, documentation
371
+ source, and configuration files.
372
+
373
+ "Object" form shall mean any form resulting from mechanical
374
+ transformation or translation of a Source form, including but
375
+ not limited to compiled object code, generated documentation,
376
+ and conversions to other media types.
377
+
378
+ "Work" shall mean the work of authorship, whether in Source or
379
+ Object form, made available under the License, as indicated by a
380
+ copyright notice that is included in or attached to the work
381
+ (an example is provided in the Appendix below).
382
+
383
+ "Derivative Works" shall mean any work, whether in Source or Object
384
+ form, that is based on (or derived from) the Work and for which the
385
+ editorial revisions, annotations, elaborations, or other modifications
386
+ represent, as a whole, an original work of authorship. For the purposes
387
+ of this License, Derivative Works shall not include works that remain
388
+ separable from, or merely link (or bind by name) to the interfaces of,
389
+ the Work and Derivative Works thereof.
390
+
391
+ "Contribution" shall mean any work of authorship, including
392
+ the original version of the Work and any modifications or additions
393
+ to that Work or Derivative Works thereof, that is intentionally
394
+ submitted to Licensor for inclusion in the Work by the copyright owner
395
+ or by an individual or Legal Entity authorized to submit on behalf of
396
+ the copyright owner. For the purposes of this definition, "submitted"
397
+ means any form of electronic, verbal, or written communication sent
398
+ to the Licensor or its representatives, including but not limited to
399
+ communication on electronic mailing lists, source code control systems,
400
+ and issue tracking systems that are managed by, or on behalf of, the
401
+ Licensor for the purpose of discussing and improving the Work, but
402
+ excluding communication that is conspicuously marked or otherwise
403
+ designated in writing by the copyright owner as "Not a Contribution."
404
+
405
+ "Contributor" shall mean Licensor and any individual or Legal Entity
406
+ on behalf of whom a Contribution has been received by Licensor and
407
+ subsequently incorporated within the Work.
408
+
409
+ 2. Grant of Copyright License. Subject to the terms and conditions of
410
+ this License, each Contributor hereby grants to You a perpetual,
411
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
412
+ copyright license to reproduce, prepare Derivative Works of,
413
+ publicly display, publicly perform, sublicense, and distribute the
414
+ Work and such Derivative Works in Source or Object form.
415
+
416
+ 3. Grant of Patent License. Subject to the terms and conditions of
417
+ this License, each Contributor hereby grants to You a perpetual,
418
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
419
+ (except as stated in this section) patent license to make, have made,
420
+ use, offer to sell, sell, import, and otherwise transfer the Work,
421
+ where such license applies only to those patent claims licensable
422
+ by such Contributor that are necessarily infringed by their
423
+ Contribution(s) alone or by combination of their Contribution(s)
424
+ with the Work to which such Contribution(s) was submitted. If You
425
+ institute patent litigation against any entity (including a
426
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
427
+ or a Contribution incorporated within the Work constitutes direct
428
+ or contributory patent infringement, then any patent licenses
429
+ granted to You under this License for that Work shall terminate
430
+ as of the date such litigation is filed.
431
+
432
+ 4. Redistribution. You may reproduce and distribute copies of the
433
+ Work or Derivative Works thereof in any medium, with or without
434
+ modifications, and in Source or Object form, provided that You
435
+ meet the following conditions:
436
+
437
+ (a) You must give any other recipients of the Work or
438
+ Derivative Works a copy of this License; and
439
+
440
+ (b) You must cause any modified files to carry prominent notices
441
+ stating that You changed the files; and
442
+
443
+ (c) You must retain, in the Source form of any Derivative Works
444
+ that You distribute, all copyright, patent, trademark, and
445
+ attribution notices from the Source form of the Work,
446
+ excluding those notices that do not pertain to any part of
447
+ the Derivative Works; and
448
+
449
+ (d) If the Work includes a "NOTICE" text file as part of its
450
+ distribution, then any Derivative Works that You distribute must
451
+ include a readable copy of the attribution notices contained
452
+ within such NOTICE file, excluding those notices that do not
453
+ pertain to any part of the Derivative Works, in at least one
454
+ of the following places: within a NOTICE text file distributed
455
+ as part of the Derivative Works; within the Source form or
456
+ documentation, if provided along with the Derivative Works; or,
457
+ within a display generated by the Derivative Works, if and
458
+ wherever such third-party notices normally appear. The contents
459
+ of the NOTICE file are for informational purposes only and
460
+ do not modify the License. You may add Your own attribution
461
+ notices within Derivative Works that You distribute, alongside
462
+ or as an addendum to the NOTICE text from the Work, provided
463
+ that such additional attribution notices cannot be construed
464
+ as modifying the License.
465
+
466
+ You may add Your own copyright statement to Your modifications and
467
+ may provide additional or different license terms and conditions
468
+ for use, reproduction, or distribution of Your modifications, or
469
+ for any such Derivative Works as a whole, provided Your use,
470
+ reproduction, and distribution of the Work otherwise complies with
471
+ the conditions stated in this License.
472
+
473
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
474
+ any Contribution intentionally submitted for inclusion in the Work
475
+ by You to the Licensor shall be under the terms and conditions of
476
+ this License, without any additional terms or conditions.
477
+ Notwithstanding the above, nothing herein shall supersede or modify
478
+ the terms of any separate license agreement you may have executed
479
+ with Licensor regarding such Contributions.
480
+
481
+ 6. Trademarks. This License does not grant permission to use the trade
482
+ names, trademarks, service marks, or product names of the Licensor,
483
+ except as required for reasonable and customary use in describing the
484
+ origin of the Work and reproducing the content of the NOTICE file.
485
+
486
+ 7. Disclaimer of Warranty. Unless required by applicable law or
487
+ agreed to in writing, Licensor provides the Work (and each
488
+ Contributor provides its Contributions) on an "AS IS" BASIS,
489
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
490
+ implied, including, without limitation, any warranties or conditions
491
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
492
+ PARTICULAR PURPOSE. You are solely responsible for determining the
493
+ appropriateness of using or redistributing the Work and assume any
494
+ risks associated with Your exercise of permissions under this License.
495
+
496
+ 8. Limitation of Liability. In no event and under no legal theory,
497
+ whether in tort (including negligence), contract, or otherwise,
498
+ unless required by applicable law (such as deliberate and grossly
499
+ negligent acts) or agreed to in writing, shall any Contributor be
500
+ liable to You for damages, including any direct, indirect, special,
501
+ incidental, or consequential damages of any character arising as a
502
+ result of this License or out of the use or inability to use the
503
+ Work (including but not limited to damages for loss of goodwill,
504
+ work stoppage, computer failure or malfunction, or any and all
505
+ other commercial damages or losses), even if such Contributor
506
+ has been advised of the possibility of such damages.
507
+
508
+ 9. Accepting Warranty or Additional Liability. While redistributing
509
+ the Work or Derivative Works thereof, You may choose to offer,
510
+ and charge a fee for, acceptance of support, warranty, indemnity,
511
+ or other liability obligations and/or rights consistent with this
512
+ License. However, in accepting such obligations, You may act only
513
+ on Your own behalf and on Your sole responsibility, not on behalf
514
+ of any other Contributor, and only if You agree to indemnify,
515
+ defend, and hold each Contributor harmless for any liability
516
+ incurred by, or claims asserted against, such Contributor by reason
517
+ of your accepting any such warranty or additional liability.
518
+
519
+ END OF TERMS AND CONDITIONS
checkpoints/charset.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/detector.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d54398ec39156c6a8a17c89588271c84a976195bae4227dc2643c4635c6442e9
3
+ size 181974624
checkpoints/model_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_tokens": 14247,
3
+ "max_width": 128,
4
+ "sequence_length": 128,
5
+ "scope": 2048,
6
+ "coordinate_mode": "RBOX",
7
+ "backbone": "regnet_x_8gf",
8
+ "charset_size": 14244,
9
+ "recognizer_variant": "prenorm",
10
+ "has_pre_norm": false,
11
+ "has_tx_norm": true,
12
+ "norm_first": true,
13
+ "depth": 256,
14
+ "num_layers": 6,
15
+ "nhead": 8,
16
+ "dim_feedforward": 2048,
17
+ "feature_depth": 512
18
+ }
checkpoints/recognizer.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20bf070ab5d9a9e85edbaa140aaa3e2c518ad94fafbf2fa856c8773f1594647c
3
+ size 144516943
checkpoints/relational.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:509701e97de006bb060aa4e7a6937dcfe4222d1717dcc0d447bf396090a1e10b
3
+ size 9175733
config.json ADDED
File without changes
docker-compose.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ nemotron-ocr:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ deploy:
7
+ resources:
8
+ reservations:
9
+ devices:
10
+ - capabilities: [gpu]
11
+ working_dir: /workspace
12
+ volumes:
13
+ - .:/workspace:rw
14
+ - ${XDG_CACHE_HOME:-~/cache}:/root/.cache:rw
15
+ command: bash -lc "python example.py ocr-example-input-1.png --merge-level paragraph"
16
+ ipc: host
17
+ ulimits:
18
+ memlock:
19
+ soft: -1
20
+ hard: -1
21
+ stack: 6710886
example.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ import argparse
6
+
7
+ from nemotron_ocr.inference.pipeline import NemotronOCR
8
+
9
+
10
def main(image_path, merge_level, no_visualize, model_dir, lang):
    """Run the Nemotron OCR pipeline on one image and print every detected region.

    Args:
        image_path: Path to the input image.
        merge_level: Output granularity ("word", "sentence", or "paragraph").
        no_visualize: When True, do not save the annotated image.
        model_dir: Optional local checkpoint directory; takes precedence over ``lang``.
        lang: Hub checkpoint selector, used only when ``model_dir`` is None.
    """
    # A local checkpoint directory wins over the hub-download path.
    if model_dir is None:
        ocr_pipeline = NemotronOCR(lang=lang)
    else:
        ocr_pipeline = NemotronOCR(model_dir=model_dir)

    predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)

    # Report each prediction with its confidence and normalized bounding box.
    print(f"Found {len(predictions)} text regions.")
    for pred in predictions:
        print(
            f" - Text: '{pred['text']}', "
            f"Confidence: {pred['confidence']:.2f}, "
            f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
            f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
        )
26
+
27
+
28
if __name__ == "__main__":
    # Command-line entry point: collect options, then delegate to main().
    cli = argparse.ArgumentParser(description="Run OCR inference and annotate image.")
    cli.add_argument("image_path", type=str, help="Path to the input image.")
    cli.add_argument(
        "--merge-level",
        type=str,
        choices=["word", "sentence", "paragraph"],
        default="paragraph",
        help="Merge level for OCR output (word, sentence, paragraph).",
    )
    cli.add_argument("--no-visualize", action="store_true", help="Do not save the annotated image.")
    cli.add_argument(
        "--model-dir",
        type=str,
        default=None,
        help="Path to a directory with detector.pth, recognizer.pth, relational.pth, charset.txt. "
        "If omitted, weights are downloaded from Hugging Face (default: v2 multilingual).",
    )
    cli.add_argument(
        "--lang",
        type=str,
        choices=["en", "multi", "v1"],
        default=None,
        help="Hub checkpoint when --model-dir is omitted: en=v2 English, multi=v2 multilingual (default), v1=legacy.",
    )
    opts = cli.parse_args()

    main(
        opts.image_path,
        merge_level=opts.merge_level,
        no_visualize=opts.no_visualize,
        model_dir=opts.model_dir,
        lang=opts.lang,
    )
nemotron-ocr/.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Built C++/CUDA extension (produced by: pip install -v .)
2
+ src/nemotron_ocr_cpp/*.so
3
+ src/nemotron_ocr_cpp/*.pyd
4
+ __pycache__/
5
+ *.py[cod]
6
+ .pytest_cache/
7
+ .venv/
8
+ build/
9
+ *.egg-info/
nemotron-ocr/cpp/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ load_png/wuffs-v0.3.c filter=lfs diff=lfs merge=lfs -text
nemotron-ocr/cpp/.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ __pycache__
2
+ .vscode
3
+ build
4
+ *.egg-info
5
+ dist
6
+ .vs
nemotron-ocr/cpp/.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "trove"]
2
+ path = trove
3
+ url = https://github.com/bryancatanzaro/trove.git
nemotron-ocr/cpp/README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Optimized Image Operations for PyTorch
2
+
3
+ ## Installation
4
+
5
+ ```
6
+ python setup.py install
7
+ ```
8
+
9
+ ## Usage
10
+
11
+ ```
12
+ # It's important that you do this first
13
+ import torch
14
+ from pytorch_image_ops import color_transform, spatial_transform
15
+ ```
nemotron-ocr/cpp/beam_decode/beam_decode.cpp ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "beam_decode.h"
5
+
6
+ #include <vector>
7
+ #include <deque>
8
+ #include <limits>
9
+ #include <memory>
10
+ #include <unordered_set>
11
+ #include <set>
12
+ #include <algorithm>
13
+ #include <chrono>
14
+
15
+ #include "../common.h"
16
+ #include "prefix.h"
17
+ #include "log_sum_exp.h"
18
+ #include "sbo_lm.h"
19
+
20
+ using namespace std;
21
+
22
+ // 2-D accessor over a prediction sequence, indexed as [t][token] with
+ // size(0) = timesteps and size(1) = vocabulary size (see get_vision_confidence).
+ template<typename scalar_t>
23
+ using pred_seq_t = torch::TensorAccessor<scalar_t, 2>;
24
+
25
+ struct PrefixScore
26
+ {
27
+ float_t lProbBlank;
28
+ float_t lProbChar;
29
+ // float_t raw_lProbBlank;
30
+ // float_t raw_lProbChar;
31
+ mutable float_t _lProb;
32
+
33
+ PrefixScore(float_t lProbBlank = NEG_INF /* log P(0) */, float_t lProbChar = NEG_INF /* log P(0) */)
34
+ : lProbBlank(lProbBlank), lProbChar(lProbChar), _lProb(NEG_INF)
35
+ // , raw_lProbBlank(lProbBlank), raw_lProbChar(lProbChar)
36
+ {}
37
+
38
+ float_t get_lScore() const {
39
+ if (_lProb == NEG_INF) {
40
+ _lProb = log_sum_exp(lProbBlank, lProbChar);
41
+ }
42
+ return _lProb;
43
+ }
44
+
45
+ // float_t get_raw_lScore() const {
46
+ // return log_sum_exp(raw_lProbBlank, raw_lProbChar);
47
+ // }
48
+ };
49
+
50
+ typedef std::unordered_map<Prefix*, PrefixScore> PrefixMap;
51
+ typedef std::pair<Prefix*, PrefixScore> BeamItem;
52
+ typedef std::vector<BeamItem> Beam;
53
+
54
+ /*
55
+ Allows us to get an estimate of the vision model confidence, irrespective of how the language
56
+ model guided the decoding. NOTE: This scoring could follow an entirely different path than
57
+ the returned decoded sequence.
58
+ */
59
+ template<typename scalar_t>
60
+ scalar_t get_vision_confidence(const pred_seq_t<scalar_t> &logProbs, scalar_t minProb)
61
+ {
62
+ const int64_t T = logProbs.size(0);
63
+ const int64_t S = logProbs.size(1);
64
+
65
+ scalar_t ret = 0; // log(1)
66
+
67
+ for (size_t t = 0; t < T; ++t) {
68
+ float_t maxP = logProbs[t][0];
69
+ int64_t maxC = 0;
70
+ for (int64_t c = 1; c < S; ++c) {
71
+ float_t p = logProbs[t][c];
72
+ if (p > maxP) {
73
+ maxP = p;
74
+ maxC = c;
75
+ }
76
+ }
77
+ ret += maxP;
78
+ // Ignore everything past the sequence terminator
79
+ if (maxC == 1) {
80
+ break;
81
+ }
82
+
83
+ if (ret < minProb) {
84
+ break;
85
+ }
86
+ }
87
+
88
+ return ret;
89
+ }
90
+
91
+
92
+ template<typename scalar_t>
93
+ pair<vector<token_t>, float_t>
94
+ ctc_beam_decode_impl(const pred_seq_t<scalar_t> &probs, const int64_t beamSize,
95
+ const int64_t blank, scalar_t minProb,
96
+ const LanguageModel &langModel, scalar_t lmWeight)
97
+ {
98
+ if (blank != 0) {
99
+ throw runtime_error("Currently, only ordinal 0 supported for the blank prediction");
100
+ }
101
+
102
+ const int64_t T = probs.size(0);
103
+ const int64_t S = probs.size(1);
104
+
105
+ // NOTE: In log space, the following is true:
106
+ // 1. Adding two probabilities: log_sum_exp(l_p_a, l_p_b)
107
+ // 2. Multiplying two probabilities: l_p_a + l_p_b
108
+ // 3. log P(0) = -inf
109
+ // 4. log P(1) = 0
110
+
111
+ // Convert to log-space
112
+ if (minProb > 0) {
113
+ minProb = log(minProb);
114
+ } else {
115
+ minProb = NEG_INF;
116
+ }
117
+
118
+ auto retScore = get_vision_confidence(probs, minProb);
119
+
120
+ if (retScore < minProb) {
121
+ return { {}, NEG_INF };
122
+ }
123
+
124
+ PrefixAllocator prefixAlloc;
125
+
126
+ Beam beam;
127
+ beam.emplace_back(prefixAlloc.GetPrefix(), PrefixScore{0, NEG_INF}); // Add a dummy first node
128
+
129
+ Beam terminated;
130
+
131
+ typedef tuple<Prefix*, token_t> lm_cache_key_t;
132
+ unordered_map<lm_cache_key_t, float_t> lmScoreCache;
133
+
134
+ for (int64_t t = 0; t < T; ++t) {
135
+ PrefixMap nextBeam;
136
+
137
+ // Add all of the completed paths to the next beam.
138
+ // This allows us to accumulate new paths into these,
139
+ // but otherwise not process them
140
+ for (const BeamItem &prevNode : beam) {
141
+ if (prevNode.first->Token == 1) {
142
+ nextBeam.insert(prevNode);
143
+ }
144
+ }
145
+
146
+ // Loop over vocab
147
+ for (int64_t s = 0; s < S; ++s) {
148
+ float_t lpEmit = probs[t][s];
149
+
150
+ if (lpEmit < minProb) {
151
+ continue;
152
+ }
153
+
154
+ for (const BeamItem &prevNode : beam) {
155
+ Prefix *prevPrefix = prevNode.first;
156
+ const PrefixScore &prevScore = prevNode.second;
157
+
158
+ // Ignore already completed paths
159
+ if (prevPrefix->Token == 1) {
160
+ continue;
161
+ }
162
+
163
+ // Ignore impossible paths
164
+ if (prevScore.lProbBlank == NEG_INF && prevScore.lProbChar == NEG_INF) {
165
+ continue;
166
+ }
167
+
168
+ // If we propose a blank the prefix doesn't change.
169
+ // Only the probability of ending in blank gets updated.
170
+ if (s == blank) {
171
+ PrefixScore &score = nextBeam[prevPrefix];
172
+ score.lProbBlank = log_sum_exp(score.lProbBlank , prevScore.lProbBlank + lpEmit, prevScore.lProbChar + lpEmit);
173
+ // score.raw_lProbBlank = log_sum_exp(score.raw_lProbBlank, prevScore.raw_lProbBlank + lpEmit, prevScore.raw_lProbChar + lpEmit);
174
+ continue;
175
+ }
176
+
177
+ // Extend the prefix by the new character s and add it to the beam.
178
+ // Only the probability of not ending in blank gets updated.
179
+ token_t prevToken = prevPrefix->Token;
180
+
181
+ // NOTE: We always create a new prefix regardless of duplication because the PrefixScore
182
+ // is simultaneously tracking prefixes that do and don't end in a blank. And it's those
183
+ // that end in a blank that would cause the prefix to be extended.
184
+ auto extendPrefix = prefixAlloc.GetPrefix(s, prevPrefix);
185
+
186
+ // Evaluate the language model, but use the cache if we've already considered this string before
187
+ auto lmCacheItem = make_tuple(prevPrefix, s);
188
+ auto lmCacheIter = lmScoreCache.find(lmCacheItem);
189
+ float_t lpLang = 0;
190
+ if (lmCacheIter == lmScoreCache.end()) {
191
+ lpLang = langModel.ScoreTransition(prevPrefix, s);
192
+ lpLang *= lmWeight;
193
+ lmCacheIter = lmScoreCache.emplace(lmCacheItem, lpLang).first;
194
+ }
195
+ lpLang = lmCacheIter->second;
196
+
197
+ PrefixScore &extendScore = nextBeam[extendPrefix];
198
+ // Remember, adding two log probabilities is equivalent to multiplying two probabilities
199
+ if (s != prevToken) {
200
+ extendScore.lProbChar = log_sum_exp(extendScore.lProbChar, prevScore.lProbBlank + lpEmit + lpLang, prevScore.lProbChar + lpEmit + lpLang);
201
+ // extendScore.raw_lProbChar = log_sum_exp(extendScore.raw_lProbChar, prevScore.raw_lProbBlank + lpEmit , prevScore.raw_lProbChar + lpEmit );
202
+ } else {
203
+ // We don't include the previous probability of not ending in blank if s is repeated at the end. The CTC
204
+ // algorithm merges characters not separated by a blank.
205
+ extendScore.lProbChar = log_sum_exp(extendScore.lProbChar , prevScore.lProbBlank + lpEmit + lpLang);
206
+ // extendScore.raw_lProbChar = log_sum_exp(extendScore.raw_lProbChar, prevScore.raw_lProbBlank + lpEmit );
207
+ }
208
+
209
+ // If the token is repeated, we also have to deal with the unchanged prefix since repeated characters are collapsed
210
+ if (s == prevToken) {
211
+ PrefixScore &collapseScore = nextBeam[prevPrefix];
212
+ collapseScore.lProbChar = log_sum_exp(collapseScore.lProbChar , prevScore.lProbChar + lpEmit);
213
+ // collapseScore.raw_lProbChar = log_sum_exp(collapseScore.raw_lProbChar, prevScore.raw_lProbChar + lpEmit);
214
+ }
215
+
216
+ }
217
+ }
218
+
219
+ Beam vecNextBeam(begin(nextBeam), end(nextBeam));
220
+
221
+ if (vecNextBeam.size() > beamSize) {
222
+ partial_sort(begin(vecNextBeam), begin(vecNextBeam) + beamSize, end(vecNextBeam),
223
+ [] (const BeamItem &a, const BeamItem &b) {
224
+ return a.second.get_lScore() > b.second.get_lScore();
225
+ }
226
+ );
227
+ vecNextBeam.resize(beamSize);
228
+ }
229
+
230
+ beam = move(vecNextBeam);
231
+ }
232
+
233
+ // Find the best raw score
234
+ const BeamItem *bestItem = nullptr;
235
+ // for (const BeamItem &b : beam) {
236
+ // if (bestItem == nullptr or b.second.get_raw_lScore() > bestItem->second.get_raw_lScore()) {
237
+ // bestItem = &b;
238
+ // }
239
+ // }
240
+ if (! beam.empty()) {
241
+ bestItem = &beam[0];
242
+ }
243
+
244
+ if (bestItem != nullptr) {
245
+ auto retList = bestItem->first->ToList();
246
+
247
+ return { move(retList), retScore };
248
+ } else {
249
+ return { {}, NEG_INF };
250
+ }
251
+ }
252
+
253
+ typedef std::pair<Prefix*, float_t> RegBeamItem;
254
+
255
+ bool operator<(const RegBeamItem &a, const RegBeamItem &b) {
256
+ return a.second > b.second;
257
+ }
258
+
259
+ template<typename scalar_t>
260
+ pair<vector<token_t>, float_t>
261
+ reg_beam_decode_impl(const pred_seq_t<scalar_t> &logProbs, const int64_t beamSize,
262
+ scalar_t minProb,
263
+ const LanguageModel &langModel, scalar_t lmWeight)
264
+ {
265
+ const int64_t T = logProbs.size(0);
266
+ const int64_t S = logProbs.size(1);
267
+
268
+ // NOTE: In log space, the following is true:
269
+ // 1. Adding two probabilities: log_sum_exp(l_p_a, l_p_b)
270
+ // 2. Multiplying two probabilities: l_p_a + l_p_b
271
+ // 3. log P(0) = -inf
272
+ // 4. log P(1) = 0
273
+
274
+ // Convert to log-space
275
+ if (minProb > 0) {
276
+ minProb = log(minProb);
277
+ } else {
278
+ minProb = NEG_INF;
279
+ }
280
+
281
+ auto retScore = get_vision_confidence(logProbs, minProb);
282
+
283
+ if (retScore < minProb) {
284
+ return { {}, NEG_INF };
285
+ }
286
+
287
+ PrefixAllocator prefixAlloc;
288
+
289
+ vector<RegBeamItem> beam, nextBeam;
290
+ beam.emplace_back(prefixAlloc.GetPrefix(), 0); // log(1) = 0
291
+
292
+ for (int64_t t = 0; t < T && !beam.empty(); ++t) {
293
+ nextBeam.clear();
294
+
295
+ auto addToBeam = [&nextBeam, beamSize] (const RegBeamItem &rbi) {
296
+ nextBeam.push_back(rbi);
297
+ };
298
+
299
+ // Expand each path in the beam
300
+ for (const RegBeamItem &prevNode : beam) {
301
+ if (prevNode.first->Token == 1) {
302
+ // Move completed paths along without processing further
303
+ addToBeam(prevNode);
304
+ continue;
305
+ }
306
+
307
+ Prefix *prevPrefix = prevNode.first;
308
+ float_t prevScore = prevNode.second;
309
+
310
+ // Loop over vocab
311
+ for (int64_t s = 0; s < S; ++s) {
312
+ float_t lpEmit = logProbs[t][s];
313
+
314
+ if (lpEmit < minProb) {
315
+ // The probability dropped below threshold, so stop processing this path
316
+ continue;
317
+ }
318
+
319
+ auto extendPrefix = prefixAlloc.GetPrefix(s, prevPrefix);
320
+
321
+ float_t lpLang = langModel.ScoreTransition(prevPrefix, s);
322
+
323
+ float_t lpNext = prevScore + lpLang + lpEmit;
324
+
325
+ addToBeam({extendPrefix, lpNext});
326
+ }
327
+ }
328
+
329
+ if (nextBeam.size() > beamSize) {
330
+ // Find the top-k items, and then truncate the rest
331
+ partial_sort(begin(nextBeam), begin(nextBeam) + beamSize, end(nextBeam));
332
+ nextBeam.resize(beamSize);
333
+ }
334
+
335
+ std::swap(beam, nextBeam);
336
+ }
337
+
338
+ if (!beam.empty()) {
339
+ // The highest probability element will always be in the back
340
+ RegBeamItem rbi{ nullptr, NEG_INF };
341
+ for (auto &rb : beam) {
342
+ if (rbi.first == nullptr || rb.second > rbi.second) {
343
+ rbi = rb;
344
+ }
345
+ }
346
+
347
+ auto retList = rbi.first->ToList();
348
+
349
+ return { move(retList), retScore };
350
+ } else {
351
+ return { {}, NEG_INF };
352
+ }
353
+ }
354
+
355
+
356
+
357
// Decodes a whole batch of (T, S) log-probability sequences, dispatching each
// item either to the CTC-collapsing or the plain beam decoder. Batch items are
// independent, so they are processed in parallel with OpenMP.
//
// probsAccess - (N, T, S) log-probabilities
// retAccess   - output token ids, one row per batch item (pre-filled by caller)
// confAccess  - output per-item confidence, converted out of log-space here
template<typename scalar_t>
void dp_beam_decode_impl(const torch::TensorAccessor<scalar_t, 3> &probsAccess,
                         torch::TensorAccessor<int64_t, 2> retAccess,
                         torch::TensorAccessor<scalar_t, 1> confAccess,
                         int64_t beamSize, int64_t blank,
                         scalar_t minProb,
                         const LanguageModel *langModel,
                         scalar_t lmWeight,
                         bool combineDuplicates)
{
    const int64_t N = probsAccess.size(0);

    // Each batch item writes disjoint output rows, so the loop is safely parallel
    #pragma omp parallel for num_threads(8)
    for (int64_t i = 0; i < N; ++i) {
        vector<token_t> seq;
        float_t lConf;
        if (combineDuplicates) {
            tie(seq, lConf) = ctc_beam_decode_impl(probsAccess[i], beamSize, blank,
                                                   minProb,
                                                   *langModel, lmWeight);
        } else {
            tie(seq, lConf) = reg_beam_decode_impl(probsAccess[i], beamSize,
                                                   minProb,
                                                   *langModel, lmWeight);
        }

        // Truncate if the decoded sequence is longer than the output buffer
        int64_t sz = min<int64_t>(seq.size(), retAccess.size(1));

        for (int64_t k = 0; k < sz; ++k) {
            retAccess[i][k] = seq[k];
        }

        // lConf is a log-probability; store it as a plain probability
        confAccess[i] = exp(lConf);
    }
}
392
+
393
// Entry point for batched beam decoding.
//
// probs - plain (non-log) probabilities, either (N, T, C) or a single (T, C)
//         sequence; a 2-D input is unsqueezed here and the batch dimension is
//         collapsed again before returning.
// Returns { token ids (int64; initialized to 1, the terminator), confidences }.
std::tuple<torch::Tensor, torch::Tensor>
beam_decode(torch::Tensor probs, int64_t beamSize, int64_t blank,
            float minProb,
            const LanguageModel *langModel,
            float lmWeight,
            bool combineDuplicates)
{
    // Fall back to the neutral model so the decoders never see a null pointer
    if (langModel == nullptr) {
        langModel = &NullLanguageModel;
    }

    auto tStart = chrono::high_resolution_clock::now();

    // Accessors below require contiguous memory
    probs = probs.contiguous();

    bool collapse = false;
    if (probs.dim() == 2) {
        // N,T,C
        probs = probs.unsqueeze(0);
        collapse = true;
    }

    // The decoders work entirely in log-space
    probs = probs.log();

    // Output starts as all-1s (the terminator token), so untouched tail
    // positions read as end-of-sequence
    torch::Tensor ret = torch::ones({ probs.size(0), probs.size(1) }, torch::kInt64);
    torch::Tensor conf = torch::zeros({ probs.size(0) }, probs.options());

    auto retAccess = ret.accessor<int64_t, 2>();

    AT_DISPATCH_FLOATING_TYPES(
        probs.scalar_type(),
        "cpu_beam_decode",
        ([&] {
            dp_beam_decode_impl(
                probs.accessor<scalar_t, 3>(),
                retAccess,
                conf.accessor<scalar_t, 1>(),
                beamSize, blank,
                static_cast<scalar_t>(minProb),
                langModel,
                static_cast<scalar_t>(lmWeight),
                combineDuplicates
            );
        })
    );

    // Restore the caller's original (unbatched) shape
    if (collapse) {
        ret = ret.squeeze(0);
        conf = conf[0];
    }

    auto tEnd = chrono::high_resolution_clock::now();

    typedef chrono::duration<double, std::milli> tp_t;
    tp_t totalElapsed = tEnd - tStart;

    // Timing diagnostic written to stdout on every call
    cout << "Beam Decode " << probs.size(0) << " - "
         << "Total: " << totalElapsed.count() << "ms"
         << endl;

    return { ret, conf };
}
455
+
456
+ std::unique_ptr<LanguageModel> create_sbo_lm(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoffWeight)
457
+ {
458
+ return make_unique<SBO_LanguageModel>(dataFilePath, move(tokenMapping), backoffWeight);
459
+ }
nemotron-ocr/cpp/beam_decode/beam_decode.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <torch/torch.h>
7
+
8
+ #include "language_model.h"
9
+
10
+ std::tuple<torch::Tensor, torch::Tensor>
11
+ beam_decode(torch::Tensor probs, int64_t beamSize, int64_t blank,
12
+ float minProb,
13
+ const LanguageModel *langModel,
14
+ float lmWeight,
15
+ bool combineDuplicates);
16
+
17
+ std::unique_ptr<LanguageModel> create_sbo_lm(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoffWeight);
nemotron-ocr/cpp/beam_decode/kn_lm.cpp ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "kn_lm.h"
5
+
6
+ using namespace std;
7
+
8
+
9
// Loads the serialized n-gram tables (via the NGramLMBase constructor) and
// stores the Kneser-Ney discount subtracted from raw counts in ScoreTransition.
KN_LanguageModel::KN_LanguageModel(const string &dataFilePath, token_mapping_t tokenMapping, float_t knDelta)
    : NGramLMBase(dataFilePath, move(tokenMapping)), m_knDelta(knDelta)
{
}
13
+
14
+ float KN_LanguageModel::ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const
15
+ {
16
+ if (prefix.empty()) {
17
+ return ScoreUnigram(suffix);
18
+ } else {
19
+ return ScoreTransition(prefix, suffix);
20
+ }
21
+ }
22
+
23
// Unigram probability: count(token) / total-count, where unigram counts live
// in the order-1 table under the empty-string prefix.
float_t KN_LanguageModel::ScoreUnigram(const std::wstring &uni) const
{
    auto lIter = m_lookup[1].find(L""s);
    if (lIter == m_lookup[1].end()) {
        throw std::runtime_error("Unigrams not supported by this model!");
    }

    auto uniIter = lIter->second.find(uni);
    // Tiny floor so an unseen token never scores exactly zero
    float_t ctUni = 1e-8;
    if (uniIter != lIter->second.end()) {
        ctUni = uniIter->second;
    }

    // Denominator: total count of all unigrams
    float_t ctSuffixes = GetPrefixSum(L""s);

    return ctUni / ctSuffixes;
}
40
+
41
// Interpolated Kneser-Ney estimate of P(suffix | prefix):
//   max(count - delta, 0) / total  +  (delta * distinct / total) * P(suffix | shorter prefix)
// recursing on successively shorter prefixes down to the empty-prefix base case,
// which uses the continuation probability (distinct bigram contexts).
float_t KN_LanguageModel::ScoreTransition(const std::wstring &prefix, const std::wstring &suffix) const
{
    if (prefix.empty()) {
        // Base case: Kneser-Ney continuation probability.
        // The number of distinct bigrams that end with this token
        auto rlIter = m_reverseLookup.find(suffix);

        float_t ctEndingBigrams = 0;
        if (rlIter != m_reverseLookup.end()) {
            ctEndingBigrams = rlIter->second[2].size();
        }

        float_t ctAllBigrams = m_lookup[2].size();

        return ctEndingBigrams / ctAllBigrams;
    }

    // Counts for this prefix at order len(prefix)+1
    auto lIter = m_lookup[prefix.size() + 1].find(prefix);
    float_t ctUqSuffixes = 0;   // distinct suffixes seen after this prefix
    float_t ctSuffixes = 0;     // total count over all suffixes of this prefix
    float_t ctSuffix = 0;       // count of this specific (prefix, suffix) pair
    if (lIter != m_lookup[prefix.size() + 1].end()) {
        ctUqSuffixes = lIter->second.size();

        ctSuffixes = GetPrefixSum(prefix);

        auto sIter = lIter->second.find(suffix);
        if (sIter != lIter->second.end()) {
            ctSuffix = sIter->second;
        }
    }

    float_t factor = 0;  // back-off weight * lower-order probability
    float_t main = 0;    // discounted higher-order term
    if (ctSuffixes != 0) {
        factor = m_knDelta * ctUqSuffixes / ctSuffixes;
        // TODO: Figure out how to make this call without copying the string!
        // Recurse with the first character dropped from the context
        factor *= ScoreTransition({begin(prefix) + 1, end(prefix)}, suffix);

        main = max<float_t>(ctSuffix - m_knDelta, 0) / ctSuffixes;
    }

    float_t total = main + factor;

    return total;
}
nemotron-ocr/cpp/beam_decode/kn_lm.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <unordered_map>
7
+ #include <vector>
8
+
9
+ #include "ngram_lm_base.h"
10
+
11
+
12
// Character-level n-gram language model with Kneser-Ney smoothing.
// The count tables are loaded from a serialized data file by NGramLMBase;
// knDelta is the absolute discount applied to raw counts.
class KN_LanguageModel
    : public NGramLMBase
{
public:
    KN_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t knDelta);

protected:
    // Dispatches to the unigram or full KN estimate based on context length
    virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const override;

private:
    // Unigram probability with a small floor for unseen tokens
    float_t ScoreUnigram(const std::wstring &uni) const;
    // Recursive interpolated Kneser-Ney probability of suffix given prefix
    float_t ScoreTransition(const std::wstring &prefix, const std::wstring &suffix) const;

    // Absolute discount subtracted from each raw count
    float_t m_knDelta;
};
nemotron-ocr/cpp/beam_decode/language_model.cpp ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "language_model.h"
5
+
6
+ #include <locale>
7
+ #include <codecvt>
8
+
9
+ using namespace std;
10
+
11
// Shared neutral language-model instance used when no LM is configured;
// its ScoreTransition always returns log P(1) = 0.
const NullLanguageModel_t NullLanguageModel;

NullLanguageModel_t::NullLanguageModel_t()
    : LanguageModel({})  // no token mapping needed: nothing is ever looked up
{
}
17
+
18
+ TokenMappingWrapper::TokenMappingWrapper(token_mapping_t mapping)
19
+ : token_mapping(move(mapping))
20
+ {
21
+ for (const auto &mp : token_mapping) {
22
+ if (mp.second.size() == 1) {
23
+ wchar_t c = mp.second.front();
24
+ reverse_token_mapping.emplace(c, mp.first);
25
+ }
26
+ }
27
+ }
28
+
29
+ TokenMappingWrapper::Ptr create_token_mapping(token_mapping_t tokenMapping)
30
+ {
31
+ return make_shared<TokenMappingWrapper>(move(tokenMapping));
32
+ }
33
+
34
+
35
// Converts a (B, T) tensor of token ids into wide strings via the supplied
// token mapping, multiplying per-step probabilities (when given) into a single
// confidence per sequence.
// Token conventions: 0 = CTC blank (skipped), 1 = end-of-sequence (stops the
// row), 2 = rendered as '^'; everything else is looked up in the mapping.
template<typename token_t>
vector<tuple<wstring, float>>
decode_sequences_impl(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
                      c10::optional<torch::Tensor> probs)
{
    const token_mapping_t &mapping = tokenMapping->token_mapping;

    auto tokensAccess = tokens.accessor<token_t, 2>();

    // Default to probability 1 per sequence when no distribution was supplied;
    // a (B,) tensor is promoted to (B, 1) so the accessor below is uniform
    torch::Tensor pTens = probs.value_or(torch::ones({ tokens.size(0) }, torch::kFloat32));
    if (pTens.dim() == 1) {
        pTens = pTens.unsqueeze(1);
    }

    auto probsAccess = pTens.accessor<float, 2>();

    const int64_t B = tokens.size(0);
    const int64_t T = tokens.size(1);

    vector<tuple<wstring, float>> ret;

    for (int64_t b = 0; b < B; ++b) {
        wstring buff;

        float logProb = 0.0f; // log 1
        bool done = false;
        for (int64_t t = 0; t < T && ! done; ++t) {
            typename token_mapping_t::key_type tokIdx = tokensAccess[b][t];

            // Accumulate in log-space to avoid underflow on long sequences
            if (t < probsAccess.size(1)) {
                logProb += log(probsAccess[b][t]);
            }

            switch (tokIdx) {
                case 0:
                    // Blank char
                    continue;
                case 1:
                    // End of sequence char
                    done = true;
                    break;
                case 2:
                    buff.push_back('^');
                    break;
                default:
                    auto iter = mapping.find(tokIdx);
                    if (iter == mapping.end()) {
                        throw std::runtime_error("The token mapping doesn't contain an entry for index " + to_string(tokIdx));
                    }
                    buff += iter->second;
                    break;
            }
        }

        ret.emplace_back(move(buff), exp(logProb));
    }

    return ret;
}
94
+
95
// Validates inputs and dispatches decode_sequences_impl on the integral dtype
// of `tokens`. `probs`, when given, may be per-sequence (B,) or per-step (B,T).
vector<tuple<wstring, float>>
decode_sequences(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
                 c10::optional<torch::Tensor> probs)
{
    if (tokens.dim() != 2) {
        throw std::runtime_error("`tokens` must be 2-dimensions of type B,T!");
    }

    if (tokenMapping == nullptr) {
        throw std::runtime_error("Cannot supply a null token mapping!");
    }

    const token_mapping_t &mapping = tokenMapping->token_mapping;

    if (mapping.empty()) {
        throw std::runtime_error("The token mapping hasn't been initialized!");
    }

    if (probs.has_value()) {
        // The accessor in the impl is hard-coded to float32
        if (probs.value().scalar_type() != torch::kFloat32) {
            throw std::runtime_error("If the probability distribution is specified, then it must be of type `torch.float32`");
        }
        if (probs.value().size(0) != tokens.size(0)) {
            throw std::runtime_error("The probability distribution batch size doesn't match the tokens batch size!");
        }
        if (probs.value().dim() == 2 && probs.value().size(1) != tokens.size(1)) {
            throw std::runtime_error("Invalid probability distribution shape!");
        }
    }

    vector<tuple<wstring, float>> ret;

    AT_DISPATCH_INTEGRAL_TYPES(
        tokens.scalar_type(),
        "decode_sequences_impl",
        ([&] {
            ret = decode_sequences_impl<scalar_t>(tokens, tokenMapping, probs);
        })
    );

    return ret;
}
137
+
138
+
139
// Narrows a wide string to a UTF-8 encoded std::string.
// NOTE(review): std::codecvt_utf8 / std::wstring_convert are deprecated as of
// C++17; if the toolchain moves beyond C++17 this will need a replacement.
std::string ws2s(const std::wstring& wstr)
{
    using convert_typeX = std::codecvt_utf8<wchar_t>;
    std::wstring_convert<convert_typeX, wchar_t> converterX;

    return converterX.to_bytes(wstr);
}
146
+
nemotron-ocr/cpp/beam_decode/language_model.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <memory>
7
+
8
+ #include <torch/torch.h>
9
+
10
+ #include "prefix.h"
11
+ #include "log_sum_exp.h"
12
+
13
+ typedef std::unordered_map<int64_t, std::wstring> token_mapping_t;
14
+ typedef std::unordered_map<wchar_t, int64_t> reverse_token_mapping_t;
15
+
16
+
17
// Abstract scorer used by the beam decoders: given the prefix decoded so far
// and a candidate next token, returns a log-probability adjustment.
class LanguageModel
{
public:
    virtual ~LanguageModel() {}

    // Returns the log-score for extending prefix p with nextToken
    // (NEG_INF marks an impossible transition in concrete models).
    virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const = 0;

    const token_mapping_t &TokenMapping() const { return m_tokenMapping; }

protected:
    LanguageModel(token_mapping_t tokenMapping)
        : m_tokenMapping(std::move(tokenMapping))
    {}

    // Maps token ordinals to the (possibly multi-character) strings they render as
    token_mapping_t m_tokenMapping;
};


// Neutral scorer used when no language model is configured.
class NullLanguageModel_t
    : public LanguageModel
{
public:
    NullLanguageModel_t();

    virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const override
    {
        // log P(1)
        // Which means the probability is unchanged
        return 0;
    }
};

// Shared immutable instance of the neutral model (defined in language_model.cpp)
extern const NullLanguageModel_t NullLanguageModel;

// Pairs the forward token mapping with its char->token inverse (built only
// from single-character entries) so both directions travel together.
struct TokenMappingWrapper
{
    typedef std::shared_ptr<TokenMappingWrapper> Ptr;

    TokenMappingWrapper(token_mapping_t mapping);

    token_mapping_t token_mapping;
    reverse_token_mapping_t reverse_token_mapping;
};
60
+
61
+ TokenMappingWrapper::Ptr create_token_mapping(token_mapping_t tokenMapping);
62
+
63
+ std::vector<std::tuple<std::wstring, float>>
64
+ decode_sequences(torch::Tensor tokens, const TokenMappingWrapper *tokenMapping,
65
+ c10::optional<torch::Tensor> probs = torch::nullopt);
nemotron-ocr/cpp/beam_decode/log_sum_exp.cpp ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "log_sum_exp.h"
5
+
6
+ const float_t NEG_INF = -std::numeric_limits<float_t>::infinity();
nemotron-ocr/cpp/beam_decode/log_sum_exp.h ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <cmath>
7
+ #include <limits>
8
+ #include <algorithm>
9
+
10
+ typedef float float_t;
11
+ extern const float_t NEG_INF;
12
+
13
// ---- Variadic helpers for numerically stable log-sum-exp ----

// Maximum of a single value: identity (recursion/fold terminator).
template<typename T>
inline T max_val(T v)
{
    return v;
}

// Maximum over an arbitrary argument list, computed with a C++17 fold.
template<typename T, typename ...Args>
inline T max_val(T v, Args... rest)
{
    T best = v;
    ((best = std::max(best, rest)), ...);
    return best;
}

// exp(v - maxVal) for one value; subtracting the max keeps exp() in range.
template<typename T>
inline T sum_exp(T maxVal, T v)
{
    return std::exp(v - maxVal);
}

// Sum of exp(x - maxVal) over every argument, via a C++17 fold.
template<typename T, typename ...Args>
inline T sum_exp(T maxVal, T v, Args... rest)
{
    return std::exp(v - maxVal) + (std::exp(rest - maxVal) + ...);
}

// log(sum_i exp(x_i)) computed without overflow by factoring out the maximum.
// Returns -inf when every input is -inf (i.e. total probability zero).
template<typename T, typename ...Args>
inline T log_sum_exp(T v, Args ...args)
{
    const T peak = max_val(v, args...);

    if (peak == -std::numeric_limits<T>::infinity()) {
        return peak;
    }

    return peak + std::log(sum_exp(peak, v, args...));
}
nemotron-ocr/cpp/beam_decode/ngram_lm_base.cpp ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "ngram_lm_base.h"
5
+
6
+ #include <iostream>
7
+ #include <fstream>
8
+
9
+ #if defined( USE_BOOST )
10
+
11
+ #include <boost/archive/binary_oarchive.hpp>
12
+ #include <boost/archive/binary_iarchive.hpp>
13
+ #include <boost/serialization/vector.hpp>
14
+ #include <boost/serialization/string.hpp>
15
+ #include <boost/serialization/unordered_map.hpp>
16
+
17
+ #endif // USE_BOOST
18
+
19
+ using namespace std;
20
+
21
+ const std::wstring WORD_END(1, 2);
22
+ const std::wstring NUMERIC(1, 3);
23
+ const std::wstring UNMODELED(1, 4);
24
+
25
// Bundles the forward and reverse n-gram lookup tables so they can be
// serialized together (the serialize member is used by the USE_BOOST build).
struct LMStorage
{
    lookup_t Lookup;
    reverse_lookup_t ReverseLookup;

    template<class Archive>
    void serialize(Archive &ar, const unsigned int version) {
        ar & Lookup;
        ar & ReverseLookup;
    }
};
36
+
37
// Writes one suffix->count map: element count, then for each entry the key
// length, the raw wchar_t key bytes, and the uint32 count.
// NOTE: this raw-byte layout depends on sizeof(wchar_t), sizeof(size_t) and
// endianness, so data files are only portable between identical ABIs.
void save_suffix_map(std::fstream& fs, const suffix_map_t& suffix_map)
{
    // write out number of elements for Lookup
    std::size_t suffix_map_count = suffix_map.size();
    fs.write((char*)(&suffix_map_count), sizeof(suffix_map_count));
    for (suffix_map_t::const_iterator reverse_lookup_it = suffix_map.begin(); reverse_lookup_it != suffix_map.end(); ++reverse_lookup_it)
    {
        // write out the key
        size_t key_len = reverse_lookup_it->first.length();
        fs.write((char*)(&key_len), sizeof(key_len));
        fs.write((char*)(reverse_lookup_it->first.data()), key_len * sizeof(wchar_t));

        // write out value
        fs.write((char*)(&reverse_lookup_it->second), sizeof(reverse_lookup_it->second));
    }
}

// Serializes the forward table: for each n-gram order in sequence, the map
// size followed by every (prefix, suffix-map) pair. Mirrored by load_lookup.
void save_lookup(std::fstream& fs, const lookup_t& lookup)
{
    // write out number of elements for Lookup
    std::size_t lookup_count = lookup.size();
    fs.write((char*)(&lookup_count), sizeof(lookup_count));
    for (lookup_t::const_iterator lookup_it = lookup.begin(); lookup_it != lookup.end(); ++lookup_it)
    {
        // write out element map size
        std::size_t map_elem_count = lookup_it->size();
        fs.write((char*)(&map_elem_count), sizeof(map_elem_count));

        for (string_suffix_map_t::const_iterator str_sfx_it = lookup_it->begin(); str_sfx_it != lookup_it->end(); ++str_sfx_it)
        {
            // write out key
            size_t key_len = str_sfx_it->first.length();
            fs.write((char*)(&key_len), sizeof(key_len));
            fs.write((char*)(str_sfx_it->first.data()), key_len * sizeof(wchar_t));
            save_suffix_map(fs, str_sfx_it->second);
        }
    }
}

// Serializes the reverse table: for each suffix key, the key bytes followed by
// its per-order vector of suffix maps. Mirrored by load_reverse_lookup.
void save_reverse_lookup(std::fstream& fs, const reverse_lookup_t& reverse_lookup)
{
    // write out number of elements for Lookup
    std::size_t reverse_lookup_count = reverse_lookup.size();
    fs.write((char*)(&reverse_lookup_count), sizeof(reverse_lookup_count));
    for (reverse_lookup_t::const_iterator reverse_lookup_it = reverse_lookup.begin(); reverse_lookup_it != reverse_lookup.end(); ++reverse_lookup_it)
    {
        // write out the key
        size_t key_len = reverse_lookup_it->first.length();
        fs.write((char*)(&key_len), sizeof(key_len));
        fs.write((char*)(reverse_lookup_it->first.data()), key_len * sizeof(wchar_t));

        // write out value vector length
        size_t val_vec_len = reverse_lookup_it->second.size();
        fs.write((char*)(&val_vec_len), sizeof(val_vec_len));

        for (suffix_map_vec_t::const_iterator val_vec_it = reverse_lookup_it->second.begin();
             val_vec_it != reverse_lookup_it->second.end();
             ++val_vec_it)
        {
            save_suffix_map(fs, *val_vec_it);
        }
    }
}
100
+
101
// Reads one suffix->count map in the exact layout written by save_suffix_map.
void load_suffix_map(std::fstream& fs, suffix_map_t& suffix_map)
{
    // read in number of elements
    std::size_t suffix_map_count = 0;
    fs.read((char*)(&suffix_map_count), sizeof(suffix_map_count));
    for (size_t suffix_map_index = 0; suffix_map_index < suffix_map_count; ++suffix_map_index )
    {
        // read in key
        std::size_t key_len = 0;
        fs.read((char*)(&key_len), sizeof(key_len));

        std::wstring wkey(key_len, 0);
        fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
        uint32_t value = 0;
        fs.read((char*)(&value), sizeof(value));

        suffix_map.insert(std::make_pair(wkey, value));
    }
}

// Reads the forward table in the exact layout written by save_lookup,
// appending one string_suffix_map_t per n-gram order.
void load_lookup(std::fstream& fs, lookup_t& lookup)
{
    // read in number of elements
    std::size_t lookup_count = 0;
    fs.read((char*)(&lookup_count), sizeof(lookup_count));
    for (size_t lookup_index = 0; lookup_index < lookup_count; ++lookup_index)
    {
        std::size_t map_elem_count = 0;
        fs.read((char*)(&map_elem_count), sizeof(map_elem_count));

        lookup.push_back(string_suffix_map_t());
        string_suffix_map_t& str_sfx_map = lookup.back();

        for (size_t str_sfx_map_index = 0; str_sfx_map_index < map_elem_count; ++str_sfx_map_index)
        {
            std::size_t key_len = 0;
            fs.read((char*)(&key_len), sizeof(key_len));

            std::wstring wkey(key_len, 0);
            fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
            // Insert an empty map first, then fill it in place
            str_sfx_map.insert(std::make_pair<wstring, suffix_map_t>(std::wstring(wkey), suffix_map_t()));
            suffix_map_t& suffix_map = str_sfx_map[wkey];

            load_suffix_map(fs, suffix_map);
        }
    }
}

// Reads the reverse table in the exact layout written by save_reverse_lookup.
void load_reverse_lookup(std::fstream& fs, reverse_lookup_t& reverse_lookup)
{
    // read in number of elements
    std::size_t reverse_lookup_count = 0;
    fs.read((char*)(&reverse_lookup_count), sizeof(reverse_lookup_count));
    for (size_t rev_lookup_index = 0; rev_lookup_index < reverse_lookup_count; ++rev_lookup_index )
    {
        // read in the key
        std::size_t key_len = 0;
        fs.read((char*)(&key_len), sizeof(key_len));

        std::wstring wkey(key_len, 0);
        fs.read((char*)(wkey.data()), key_len * sizeof(wchar_t));
        // Insert an empty vector first, then fill it in place
        reverse_lookup.insert(std::make_pair(wkey, suffix_map_vec_t()));
        suffix_map_vec_t& val_vec = reverse_lookup[wkey];

        std::size_t val_vec_len = 0;
        fs.read((char*)(&val_vec_len), sizeof(val_vec_len));

        for (size_t val_vec_index = 0; val_vec_index < val_vec_len; ++val_vec_index)
        {
            val_vec.push_back(suffix_map_t());
            suffix_map_t& suffix_map = val_vec.back();
            load_suffix_map(fs, suffix_map);
        }
    }
}
176
+
177
+ #if ! defined( USE_BOOST )
178
+
179
// Loads the lookup tables written by save_ngram_data_file (raw binary,
// non-boost build) and precomputes, for every prefix, the total count over all
// of its suffixes — the denominator used when turning counts into probabilities.
NGramLMBase::NGramLMBase(const string &dataFilePath, token_mapping_t tokenMapping)
    : LanguageModel(move(tokenMapping))
{
    std::fstream in(dataFilePath, std::ios::in | std::ios::binary);
    load_lookup(in, m_lookup);
    load_reverse_lookup(in, m_reverseLookup);

    // ConvertToString walks the context with a fixed 10-slot stack
    if (m_lookup.size() >= 10) {
        throw runtime_error("Only N-Grams of 9 or less are supported!");
    }

    // Sum the suffix counts under each prefix, across all n-gram orders
    for (auto &ngLevel : m_lookup) {
        for (auto &kvPrefixLevel : ngLevel) {
            uint32_t ct = 0;
            for (auto &kvSfx : kvPrefixLevel.second) {
                ct += kvSfx.second;
            }
            m_prefixSumLookup.emplace(kvPrefixLevel.first, ct);
        }
    }
}
200
+
201
// Writes both lookup tables to outputPath in the raw binary layout that the
// (non-boost) NGramLMBase constructor reads back.
void save_ngram_data_file(const lookup_t& lookup, const reverse_lookup_t& reverseLookup, const std::string &outputPath)
{
    std::fstream out(outputPath, std::ios::out | std::ios::binary);

    save_lookup(out, lookup);
    save_reverse_lookup(out, reverseLookup);
}
208
+
209
+ #else // USE_BOOST
210
+
211
// Boost-serialization variant of the loader: the whole LMStorage bundle is
// read with a binary_iarchive instead of the hand-rolled format above.
NGramLMBase::NGramLMBase(const string &dataFilePath, token_mapping_t tokenMapping)
    : LanguageModel(move(tokenMapping))
{
    {
        ifstream dfStr(dataFilePath, ios_base::in | ios_base::binary);
        boost::archive::binary_iarchive ia(dfStr);

        LMStorage s;
        ia >> s;


        m_lookup = move(s.Lookup);

        m_reverseLookup = move(s.ReverseLookup);
    }

    // ConvertToString walks the context with a fixed 10-slot stack
    if (m_lookup.size() >= 10) {
        throw runtime_error("Only N-Grams of 9 or less are supported!");
    }

    // Sum the suffix counts under each prefix, across all n-gram orders
    for (auto &ngLevel : m_lookup) {
        for (auto &kvPrefixLevel : ngLevel) {
            uint32_t ct = 0;
            for (auto &kvSfx : kvPrefixLevel.second) {
                ct += kvSfx.second;
            }
            m_prefixSumLookup.emplace(kvPrefixLevel.first, ct);
        }
    }
}

// Boost-serialization variant of the writer.
// NOTE(review): this overload takes the tables by value, unlike the non-boost
// overload which takes const refs — confirm which signature callers expect.
void save_ngram_data_file(lookup_t lookup, reverse_lookup_t reverseLookup, const std::string &outputPath)
{
    ofstream ofs(outputPath, ios_base::out | ios_base::binary);

    LMStorage s;
    s.Lookup = move(lookup);
    s.ReverseLookup = move(reverseLookup);

    boost::archive::binary_oarchive oa(ofs);
    oa << s;
}
253
+
254
+ #endif // USE_BOOST
255
+
256
// Adapts the beam decoder's (Prefix*, token) interface to the string-based
// n-gram scorer: renders the decoded prefix into the model's internal
// alphabet, maps the candidate token to its surrogate character class, then
// returns the log of the smoothed probability (NEG_INF when impossible).
float_t NGramLMBase::ScoreTransition(const Prefix *p, token_t nextToken) const
{
    std::wstring prefix;
    if (! ConvertToString(p, prefix)) {
        return NEG_INF;
    }

    const std::wstring *pSuffix = nullptr;

    if (nextToken != 1) {
        auto iter = m_tokenMapping.find(nextToken);
        if (iter == m_tokenMapping.end()) {
            // Token never seen by the model; score it as the generic unknown class
            pSuffix = &UNMODELED;
        } else {
            pSuffix = &iter->second;

            // All digits collapse into a single NUMERIC class
            if (iswdigit(pSuffix->at(0))) {
                pSuffix = &NUMERIC;
            }
        }

    } else {
        // Token 1 is the sequence terminator, modeled as the end-of-word mark
        pSuffix = &WORD_END;
    }

    float_t ret = ScoreTransitionImpl(prefix, *pSuffix);

    // Convert the probability to log-space; zero probability becomes NEG_INF
    if (ret > 0) {
        return log(ret);
    } else {
        return NEG_INF;
    }
}
289
+
290
// Renders the linked-list prefix into a wstring in oldest-to-newest order,
// keeping at most (n-gram order - 1) context entries. Digits collapse to
// NUMERIC, unmapped tokens to UNMODELED, token 1 to WORD_END, and blanks (0)
// produce nothing. Always returns true in the current implementation.
bool NGramLMBase::ConvertToString(const Prefix *p, std::wstring &prefix) const
{
    // Walk the parent chain newest-first onto a fixed stack; capacity 10
    // matches the order-<10 check enforced in the constructor
    const Prefix *stk[10];
    int32_t sz = -1;
    const Prefix *curr = p;
    decltype(sz) mlSz{(int)m_lookup.size() - 2};
    while (curr && sz < mlSz) {
        stk[++sz] = curr;
        curr = curr->Parent;
    }

    // Either blank or empty prefix
    if (sz < 1) { return true; }

    // NOTE(review): the deepest collected entry is deliberately skipped here —
    // presumably the root sentinel node created by PrefixAllocator::GetPrefix();
    // confirm against the Prefix allocator before relying on this.
    --sz;
    for (; sz >= 0; --sz) {
        token_t tok = stk[sz]->Token;
        // End of word token, which maps to the null character
        if (tok == 1) {
            prefix.push_back(WORD_END[0]);
        } else if (tok == 0) {
            // Do nothing
        } else {
            auto iter = m_tokenMapping.find(tok);
            if (iter == m_tokenMapping.end()) {
                prefix += UNMODELED;
            } else {
                const std::wstring &wChar = iter->second;

                if (iswdigit(wChar[0])) {
                    prefix += NUMERIC;
                } else {
                    prefix += wChar;
                }
            }
        }
    }

    return true;
}
nemotron-ocr/cpp/beam_decode/ngram_lm_base.h ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <unordered_map>
#include <vector>

#include "language_model.h"

// #define USE_BOOST 1

// Maps a suffix string to its observed count.
typedef std::unordered_map<std::wstring, uint32_t> suffix_map_t;

/* Tells us the number of suffixes for a given ngram of order K
   Keys:
     1. NGram Order (index into the lookup_t vector)
     2. Prefix
     3. Suffix
   Value:
     Count
*/
typedef std::unordered_map<std::wstring, suffix_map_t> string_suffix_map_t;
typedef std::vector<string_suffix_map_t> lookup_t;
/* Tells us the number of K-gram prefixes found for a given suffix
   Keys:
     1. Suffix
     2. NGram Order (index into the suffix_map_vec_t vector)
     3. Prefix
   Values:
     Count
*/
typedef std::vector<suffix_map_t> suffix_map_vec_t;
typedef std::unordered_map<std::wstring, suffix_map_vec_t> reverse_lookup_t;

// Sentinel strings used when converting prefixes/tokens to model strings
// (defined in the corresponding .cpp):
//   WORD_END  - end-of-word marker (token 1)
//   NUMERIC   - shared class for all digit tokens
//   UNMODELED - class for tokens outside the modeled vocabulary
extern const std::wstring WORD_END;
extern const std::wstring NUMERIC;
extern const std::wstring UNMODELED;

// Common base for character n-gram language models used by the beam decoder.
// Owns the forward/reverse count tables and handles prefix-to-string
// conversion; derived classes implement the actual smoothing/scoring
// (ScoreTransitionImpl), which returns a raw (non-log) probability.
class NGramLMBase
    : public LanguageModel
{
public:
    // Returns the log-probability of appending nextToken to prefix p.
    virtual float_t ScoreTransition(const Prefix *p, token_t nextToken) const override;

protected:
    NGramLMBase(const std::string &dataFilePath, token_mapping_t tokenMapping);

    // Raw (non-log) probability of `suffix` following `prefix`; implemented
    // by the concrete smoothing scheme (e.g. stupid backoff, Kneser-Ney).
    virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const = 0;

    // Converts the Prefix chain into the wide-string context used as a key
    // into the lookup tables.
    bool ConvertToString(const Prefix *p, std::wstring &prefix) const;

    // Total count of all ngrams sharing `prefix` (0 if the prefix is unseen).
    float_t GetPrefixSum(const std::wstring &prefix) const;

    lookup_t m_lookup;
    reverse_lookup_t m_reverseLookup;

    std::unordered_map<std::wstring, uint32_t> m_prefixSumLookup;
};

// Serializes the count tables to `output_path`. Two signatures exist because
// the Boost serialization path moves the tables into an archive struct.
#if ! defined( USE_BOOST )
void save_ngram_data_file(const lookup_t& lookup, const reverse_lookup_t& reverseLookup, const std::string &output_path);
#else // USE_BOOST
void save_ngram_data_file(lookup_t lookup, reverse_lookup_t reverseLookup, const std::string &output_path);
#endif // USE_BOOST

inline float_t NGramLMBase::GetPrefixSum(const std::wstring &prefix) const
{
    auto iter = m_prefixSumLookup.find(prefix);

    if (iter == m_prefixSumLookup.end()) {
        return 0;
    } else {
        return iter->second;
    }
}
nemotron-ocr/cpp/beam_decode/prefix.cpp ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "prefix.h"
5
+
6
+ using namespace std;
7
+
8
+ vector<token_t> Prefix::ToList() const
9
+ {
10
+ vector<token_t> ret;
11
+
12
+ auto curr = this;
13
+
14
+ while (curr) {
15
+ if (curr->Token != 0) {
16
+ ret.push_back(curr->Token);
17
+ }
18
+ curr = curr->Parent;
19
+ }
20
+
21
+ return { rbegin(ret), rend(ret) };
22
+ }
nemotron-ocr/cpp/beam_decode/prefix.h ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <cstdint>
#include <cstdlib>
#include <list>
#include <memory>
#include <new>
#include <tuple>
#include <unordered_map>
#include <vector>

typedef int32_t token_t;

class Prefix;

// typedef std::shared_ptr<Prefix> PrefixPtr;

// A node in a shared prefix trie: each Prefix stores one token plus a
// non-owning pointer to its parent, so a full decoding hypothesis is the
// chain from a node back to the root. Token 0 is the blank token.
class Prefix
{
public:
    token_t Token;
    Prefix *Parent;

    Prefix(token_t token = 0 /* blank */, Prefix *parent = nullptr)
        : Token(token), Parent(parent)
    {}

    // Flattens the chain into root-first token order, skipping blanks
    // (implemented in prefix.cpp).
    std::vector<token_t> ToList() const;

    // Number of nodes in the chain, including this one.
    size_t size() const;
};


///// Borrowed from Boost libraries
template<typename T>
void hash_combine(size_t & seed, T const& v)
{
    seed ^= std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
/////

namespace std {
    // Hash of the token sequence represented by the chain; blank (0) tokens
    // are skipped so two chains that differ only in blanks hash equally.
    template<>
    struct hash<Prefix*>
    {
        size_t operator()(const Prefix *p) const noexcept
        {
            size_t seed = 0;

            while (p) {
                if (p->Token != 0) {
                    hash_combine(seed, p->Token);
                }
                p = p->Parent;
            }
            return seed;
        }
    };

    template<>
    struct hash<tuple<Prefix*, token_t>>
    {
        size_t operator()(const tuple<Prefix*, token_t> &t) const noexcept
        {
            size_t seed = 0;
            hash_combine(seed, get<0>(t));
            hash_combine(seed, get<1>(t));
            return seed;
        }
    };

    // Structural equality: walks both chains comparing tokens element-wise.
    // NOTE: unlike hash<Prefix*>, blanks are NOT skipped here; chains are
    // equal only if they have the same length and identical tokens.
    template<>
    struct equal_to<Prefix*>
    {
        bool operator()(const Prefix *a, const Prefix *b) const noexcept
        {
            while (a != nullptr && b != nullptr) {
                if (a->Token != b->Token) {
                    return false;
                }
                a = a->Parent;
                b = b->Parent;
            }
            // If one chain is shorter than the other
            return a == b;
        }
    };
}

inline size_t Prefix::size() const
{
    size_t ret = 0;
    auto p = this;
    while (p != nullptr) {
        ret += 1;
        p = p->Parent;
    }
    return ret;
}


// Bump allocator for Prefix nodes. Buffers grow geometrically and are only
// released when the allocator is destroyed; individual Prefixes are never
// freed, which matches the beam decoder's "allocate during decode, drop
// everything at the end" usage pattern.
class PrefixAllocator
{
public:
    PrefixAllocator() = default;
    ~PrefixAllocator();

    // Constructs a Prefix in the current buffer, forwarding ctor arguments.
    template<typename ...Args>
    Prefix *GetPrefix(Args&& ...ctorArgs);

private:
    void AllocateNextBuffer();

    std::list<Prefix*> m_buffers;
    size_t m_allocSize = 0;
    size_t m_currOff = 0;
};

inline PrefixAllocator::~PrefixAllocator()
{
    for (auto p : m_buffers) {
        // Prefix is trivially destructible and the storage came from malloc,
        // so releasing with free() (no destructor calls) is safe.
        // delete[] p;
        free(p);
    }
}

inline void PrefixAllocator::AllocateNextBuffer()
{
    size_t nextSize = m_allocSize == 0 ? 1000 : 2 * m_allocSize;

    // Using malloc here to prevent the ctor of Prefix being called for each item.
    // Instead, the ctor will be called upon first access using GetPrefix
    auto pBuff = reinterpret_cast<Prefix*>(malloc(sizeof(Prefix) * nextSize));
    if (pBuff == nullptr) {
        // malloc signals failure with nullptr; surface it like operator new
        throw std::bad_alloc();
    }

    m_buffers.push_back(pBuff);

    m_allocSize = nextSize;
    m_currOff = 0;
}

template<typename ...Args>
Prefix *PrefixAllocator::GetPrefix(Args&& ...ctorArgs)
{
    if (m_currOff == m_allocSize) {
        AllocateNextBuffer();
    }

    auto buff = m_buffers.back() + m_currOff;

    // Placement-new into the raw buffer; this is the deferred construction
    // mentioned in AllocateNextBuffer.
    auto ret = new (buff) Prefix(std::forward<Args>(ctorArgs)...);

    ++m_currOff;

    return ret;
}
nemotron-ocr/cpp/beam_decode/sbo_lm.cpp ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "sbo_lm.h"
5
+
6
+ #include <assert.h>
7
+
8
+ // Reference paper: https://www.aclweb.org/anthology/D07-1090.pdf
9
+
10
+
11
+ SBO_LanguageModel::SBO_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoff)
12
+ : NGramLMBase(dataFilePath, move(tokenMapping)), m_backoff(backoff)
13
+ {
14
+ }
15
+
16
+ float SBO_LanguageModel::ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const
17
+ {
18
+ auto lIter = m_lookup[prefix.size() + 1].find(prefix);
19
+
20
+ // This prefix doesn't exist. Shrink it!
21
+ if (lIter == m_lookup[prefix.size() + 1].end()) {
22
+ return m_backoff * ScoreTransitionImpl({ begin(prefix) + 1, end(prefix) }, suffix);
23
+ }
24
+
25
+ const suffix_map_t &suffixMap = lIter->second;
26
+
27
+ auto sfIter = suffixMap.find(suffix);
28
+
29
+ if (sfIter == suffixMap.end()) {
30
+ // This is a novel character entirely!
31
+ if (prefix.empty()) {
32
+ return 1e-8;
33
+ } else {
34
+ return m_backoff * ScoreTransitionImpl({ begin(prefix) + 1, end(prefix) }, suffix);
35
+ }
36
+ }
37
+
38
+ float_t ctSuffix = sfIter->second;
39
+ float_t ctNgram = GetPrefixSum(prefix);
40
+
41
+ float_t score = ctSuffix / ctNgram;
42
+
43
+ assert(score >= 0 && score <= 1);
44
+
45
+ return score;
46
+ }
nemotron-ocr/cpp/beam_decode/sbo_lm.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

// NOTE(review): only NGramLMBase is needed here; this presumably relies on
// kn_lm.h including ngram_lm_base.h transitively. Consider including
// "ngram_lm_base.h" directly — confirm no translation unit depends on the
// transitive kn_lm.h include first.
#include "kn_lm.h"


// "Stupid backoff" n-gram language model (Brants et al., 2007): scores a
// suffix given a context, multiplying by a constant backoff factor each time
// the context must be shortened because it was unseen in training.
class SBO_LanguageModel
    : public NGramLMBase
{
public:
    // backoff: constant multiplicative penalty applied per backoff step.
    SBO_LanguageModel(const std::string &dataFilePath, token_mapping_t tokenMapping, float_t backoff);

protected:
    // Returns the raw (non-log) stupid-backoff score for suffix|prefix.
    virtual float_t ScoreTransitionImpl(const std::wstring &prefix, const std::wstring &suffix) const override;

private:
    float_t m_backoff;
};
nemotron-ocr/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "grid_sample.h"
5
+ #include "gpu_grid_sample_utils.cuh"
6
+
7
+ template<typename T>
8
+ void indirect_grid_sample_forward_bilinear(torch::TensorAccessor<T, 4> input,
9
+ torch::TensorAccessor<T, 4> grid,
10
+ torch::TensorAccessor<int64_t, 1> inputIndices,
11
+ torch::TensorAccessor<T, 4> output)
12
+ {
13
+ const int64_t N = inputIndices.size(0);
14
+ const int64_t C = output.size(1);
15
+
16
+ T fInputHeight = input.size(2);
17
+ T fInputWidth = input.size(3);
18
+ int64_t outputHeight = output.size(2);
19
+ int64_t outputWidth = output.size(3);
20
+
21
+ #pragma omp parallel for num_threads(8)
22
+ for (int64_t i = 0; i < N; ++i) {
23
+ int64_t inputIdx = inputIndices[i];
24
+
25
+ for (int64_t c = 0; c < C; ++c) {
26
+ for (int64_t outY = 0; outY < outputHeight; ++outY) {
27
+ for (int64_t outX = 0; outX < outputWidth; ++outX) {
28
+ T u = grid[i][outY][outX][0];
29
+ T v = grid[i][outY][outX][1];
30
+
31
+ if (u < -1 || u > 1 || v < -1 || v > 1) {
32
+ output[i][c][outY][outX] = 0;
33
+ continue;
34
+ }
35
+
36
+ // Denormalize the coordinates
37
+ u = (u + 1) * ((fInputWidth - 1) / 2);
38
+ v = (v + 1) * ((fInputHeight - 1) / 2);
39
+
40
+ // Calculate coordinates
41
+ const T inX = u;
42
+ const T inXint = std::floor(inX);
43
+ const T inXfrac = inX - inXint;
44
+
45
+ const T inY = v;
46
+ const T inYint = std::floor(inY);
47
+ const T inYfrac = inY - inYint;
48
+
49
+ T ps[] = { 1 - inXfrac, inXfrac };
50
+ T rs[] = { 1 - inYfrac, inYfrac };
51
+ T opVal = 0;
52
+
53
+ #pragma unroll
54
+ for (int64_t row = 0; row < 2; ++row) {
55
+ #pragma unroll
56
+ for (int64_t col = 0; col < 2; ++col) {
57
+ T Tpx = utils::get_pixel_clamped(input, inputIdx, c, inXint + col, inYint + row);
58
+ opVal += rs[row] * ps[col] * Tpx;
59
+ }
60
+ }
61
+
62
+ output[i][c][outY][outX] = opVal;
63
+ }
64
+ }
65
+ }
66
+ }
67
+ }
68
+
69
// Entry point for the CPU path. Allocates the output tensor
// (N_indices x C x gridH x gridW) and dispatches on the input's
// floating-point dtype. Only "bilinear" sampling is implemented; any other
// method raises std::runtime_error.
torch::Tensor cpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid,
                                               torch::Tensor inputIndices, const std::string &method)
{
    auto output = input.new_empty({ inputIndices.size(0), input.size(1), grid.size(1), grid.size(2) });

    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(),
        "cpu_indirect_grid_sample_forward_impl",
        ([&] {
            // scalar_t is injected by the dispatch macro
            typedef scalar_t T;
            if (method == "bilinear") {
                indirect_grid_sample_forward_bilinear(
                    input.accessor<T, 4>(),
                    grid.accessor<T, 4>(),
                    inputIndices.accessor<int64_t, 1>(),
                    output.accessor<T, 4>()
                );
            } else {
                throw std::runtime_error("Unsupported resample method: " + method);
            }
        })
    );

    return output;
}
nemotron-ocr/cpp/better_grid_sample/gpu_grid_sample_utils.cuh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <torch/torch.h>

#include "../cuda_intellisense.cuh"

// When compiled by a host-only compiler, fall back to std::clamp and make
// __device__ a no-op so this header stays usable from .cpp files.
#ifndef __NVCC__
#include <algorithm>
#define __device__
#endif

namespace utils {

#ifdef __NVCC__

// Device-side clamp using CUDA's built-in min/max.
template<typename T>
__device__ __lib_inline__
T clamp(T val, T minVal, T maxVal)
{
    return max(minVal, min(val, maxVal));
}

#else
using std::clamp;
#endif

// Fetches input[n][c][y][x] with clamp-to-edge semantics: x and y are
// clamped into the accessor's spatial bounds before indexing. Returns a
// reference, so it can also be used as an lvalue (e.g. for gradient writes).
template<typename accessor_t>
__device__ __lib_inline__
auto &get_pixel_clamped(accessor_t &inputs,
    int64_t n, int64_t c, int64_t x, int64_t y)
{
    x = clamp<decltype(x)>(x, 0, inputs.size(3) - 1);
    y = clamp<decltype(y)>(y, 0, inputs.size(2) - 1);

    return inputs[n][c][y][x];
}

}
nemotron-ocr/cpp/better_grid_sample/gpu_indirect_grid_sample.cu ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#include "grid_sample.h"

#include "../cuda_intellisense.cuh"
#include "../half_ops.cuh"
#include "gpu_grid_sample_utils.cuh"

using namespace std;

// Clamp-to-edge pixel fetch for a 2D (H x W) accessor, used when the
// caller has already narrowed the tensor to one example/channel
// (cf. utils::get_pixel_clamped, which takes the full 4D accessor).
template<typename accessor_t, typename index_t>
__device__ __lib_inline__
auto &my_get_pixel_clamped(accessor_t &inputs, index_t x, index_t y)
{
    x = utils::clamp(x, 0, inputs.size(1) - 1);
    y = utils::clamp(y, 0, inputs.size(0) - 1);

    return inputs[y][x];
}
21
+
22
// Fast path for a batch containing a single input image: raw-pointer bilinear
// grid sample. Thread mapping: x -> grid cell (flattened H*W), y -> channel,
// blockIdx.z -> grid batch index. Out-of-range normalized coords write 0.
__global__
void single_ex_grid_sample_bilinear_kernel(const float *pInputImage,
                                           uint32_t imgHeight, uint32_t imgWidth, uint32_t numChannels,
                                           const float2 *pGrid,
                                           uint32_t numGridCells,
                                           float *pOutputImage)
{
    const uint32_t z = blockDim.x * blockIdx.x + threadIdx.x;
    const uint32_t c = blockDim.y * blockIdx.y + threadIdx.y;

    if (c >= numChannels || z >= numGridCells) {
        return;
    }

    const uint32_t g = blockIdx.z;

    const float2 uv = pGrid[g * numGridCells + z];

    float &outPx = pOutputImage[(g * numChannels + c) * numGridCells + z];
    if (abs(uv.x) > 1.0f || abs(uv.y) > 1.0f) {
        outPx = 0.0f;
    } else {
        const uint32_t maxX = imgWidth - 1;
        const uint32_t maxY = imgHeight - 1;

        // Denormalize from [-1, 1] into pixel space
        const float u = (uv.x + 1.0f) * maxX * 0.5f;
        const float v = (uv.y + 1.0f) * maxY * 0.5f;

        // calculate coordinates (truncation is safe: u, v >= 0 here)
        const float inX = u;
        const uint32_t inXint = inX;
        const float inXfrac = inX - inXint;

        const float inY = v;
        const uint32_t inYint = inY;
        const float inYfrac = inY - inYint;

        const float *pChanImage = pInputImage + c * imgHeight * imgWidth;

        // By being in this conditional block, we know that u and v are >= 0, which means
        // that their truncated value is also >= 0. Instead of clamping the value to within the buffer,
        // we set the multiplication factor to be 0 if the interpolated value is outside the buffer
        // NOTE(review): the zero weight does not suppress the load itself —
        // at the image edge, pRowImage[inXint + 1] / the row at inYint + 1 is
        // still read past the valid row/channel (and 0 * NaN would be NaN if
        // such memory held a NaN pattern). Confirm whether edge reads should
        // be clamped instead.
        const float ps[] = { 1.0f - inXfrac, inXfrac * (inXint < maxX) };
        const float rs[] = { 1.0f - inYfrac, inYfrac * (inYint < maxY) };
        float opVal = 0.0f;
        #pragma unroll
        for (uint32_t row = 0; row < 2; ++row) {
            const float *pRowImage = pChanImage + (inYint + row) * imgWidth;

            #pragma unroll
            for (uint32_t col = 0; col < 2; ++col) {
                const float px = pRowImage[inXint + col];
                opVal += rs[row] * ps[col] * px;
            }
        }

        outPx = opVal;
    }
}
81
+
82
// General bilinear forward kernel: output[n] samples input[inputIndices[n]]
// at grid[n]. Thread mapping: x -> flattened output pixel, y -> channel,
// z -> batch index. `c` is not range-checked here; the launch configuration
// uses blockDim.y == 1 with gridDim.y == channel count, so it cannot overrun.
template<typename T>
__global__
void indirect_grid_sample_forward_bilinear_kernel(torch::PackedTensorAccessor32<T, 4> inputs,
                                                  torch::PackedTensorAccessor32<T, 4> grid,
                                                  torch::PackedTensorAccessor32<int64_t, 1> inputIndices,
                                                  torch::PackedTensorAccessor32<T, 4> outputs)
{
    static_assert(std::is_same<T, float>::value, "Currently only float32 is supported!");
    //typedef typename fp_promote<T>::type accum_t;
    typedef float accum_t;
    constexpr T NEG_ONE = -1;
    constexpr T ONE = 1;
    constexpr T ZERO = 0;
    constexpr T TWO = 2;
    constexpr T ZERO_PT_5 = 0.5;
    typedef decltype(inputs.stride(0)) index_t;

    const index_t n = blockDim.z * blockIdx.z + threadIdx.z;

    if (n >= inputIndices.size(0)) return;

    const index_t c = blockDim.y * blockIdx.y + threadIdx.y;

    const index_t z = blockDim.x * blockIdx.x + threadIdx.x;

    const accum_t inputHeight = inputs.size(2);
    const accum_t inputWidth = inputs.size(3);
    const index_t outputHeight = outputs.size(2);
    const index_t outputWidth = outputs.size(3);

    // Decompose the flattened pixel index; subtraction avoids a second
    // integer division (vs. the modulo form kept for reference below).
    const index_t outY = z / outputWidth;
    //const index_t outX = z % outputWidth;
    const index_t outX = z - (outY * outputWidth);

    if (outY >= outputHeight) return;

    index_t inputIdx = inputIndices[n];
    // Load u and v together as one float2 (requires the last grid dim == 2
    // and contiguous layout)
    const float2 f2uv = *reinterpret_cast<const float2*>(grid[n][outY][outX].data());
    float u = f2uv.x;
    float v = f2uv.y;

    // Out-of-range normalized coordinates sample as zero
    if (u < NEG_ONE || u > ONE || v < NEG_ONE || v > ONE) {
        outputs[n][c][outY][outX] = ZERO;
        return;
    }

    // Denormalize the coordinates
    u = (u + ONE) * ((inputWidth - ONE) * ZERO_PT_5);
    v = (v + ONE) * ((inputHeight - ONE) * ZERO_PT_5);

    // calculate coordinates (truncation is safe: u, v >= 0 here)
    const accum_t inX = u;
    const index_t inXint = inX;
    const accum_t inXfrac = inX - inXint;

    const accum_t inY = v;
    const index_t inYint = inY;
    const accum_t inYfrac = inY - inYint;

    // Bilinear weights for the 2x2 neighborhood
    accum_t ps[] = { ONE - inXfrac, inXfrac };
    accum_t rs[] = { ONE - inYfrac, inYfrac };
    accum_t opVal = ZERO;

    auto localInputs = inputs[inputIdx][c];

    #pragma unroll
    for (index_t row = 0; row < 2; ++row) {
        #pragma unroll
        for (index_t col = 0; col < 2; ++col) {
            T Tpx = my_get_pixel_clamped(localInputs, inXint + col, inYint + row);
            opVal += rs[row] * ps[col] * Convert<T, accum_t>::LeftToRight(Tpx);
        }
    }

    outputs[n][c][outY][outX] = Convert<T, accum_t>::RightToLeft(opVal);
}
158
+
159
// Backward kernel: scatters gradOutput into gradInput with the same bilinear
// weights as the forward pass, using atomicAdd because multiple grid cells
// (and multiple n with the same inputIdx) can touch one input pixel.
// NOTE: gradGrid is accepted but never written by this kernel, so the grid
// gradient remains the zeros allocated by the host wrapper.
template<typename T>
__global__
void indirect_grid_sample_backward_bilinear_kernel(torch::PackedTensorAccessor64<T, 4> inputs,
                                                   torch::PackedTensorAccessor64<T, 4> grid,
                                                   torch::PackedTensorAccessor64<int64_t, 1> inputIndices,
                                                   torch::PackedTensorAccessor64<T, 4> gradOutput,
                                                   torch::PackedTensorAccessor64<T, 4> gradInput,
                                                   torch::PackedTensorAccessor64<T, 4> gradGrid)
{
    typedef typename fp_promote<T>::type accum_t;
    constexpr T NEG_ONE = -1;
    constexpr T ONE = 1;

    const int64_t n = blockDim.z * blockIdx.z + threadIdx.z;

    if (n >= inputIndices.size(0)) return;

    const int64_t c = blockDim.y * blockIdx.y + threadIdx.y;

    const int64_t z = blockDim.x * blockIdx.x + threadIdx.x;

    const accum_t inputHeight = inputs.size(2);
    const accum_t inputWidth = inputs.size(3);
    const int64_t outputHeight = gradOutput.size(2);
    const int64_t outputWidth = gradOutput.size(3);

    const int64_t outY = z / outputWidth;
    const int64_t outX = z % outputWidth;

    if (outY >= outputHeight) return;

    int64_t inputIdx = inputIndices[n];
    // Load u and v together as one float2 (requires contiguous grid layout)
    const float2 f2uv = *reinterpret_cast<const float2*>(grid[n][outY][outX].data());
    float u = f2uv.x;
    float v = f2uv.y;

    // No output gradient contribution from this position
    if (u < NEG_ONE || u > ONE || v < NEG_ONE || v > ONE) {
        return;
    }

    // Denormalize the coordinates
    u = (u + 1) * ((inputWidth - 1) / 2);
    v = (v + 1) * ((inputHeight - 1) / 2);

    // calculate coordinates
    const accum_t inX = u;
    const accum_t inXint = floor(inX);
    const accum_t inXfrac = inX - inXint;

    const accum_t inY = v;
    const accum_t inYint = floor(inY);
    const accum_t inYfrac = inY - inYint;

    // Same bilinear weights as the forward pass
    accum_t ps[] = { 1 - inXfrac, inXfrac };
    accum_t rs[] = { 1 - inYfrac, inYfrac };

    const accum_t gOut = Convert<T, accum_t>::LeftToRight(gradOutput[n][c][outY][outX]);

    #pragma unroll
    for (size_t row = 0; row < 2; ++row) {
        #pragma unroll
        for (size_t col = 0; col < 2; ++col) {
            // Clamp-to-edge fetch mirrors the forward read, so edge pixels
            // accumulate the clamped contributions
            T &gIn = utils::get_pixel_clamped(gradInput, inputIdx, c, inXint + col, inYint + row);

            T gContrib = Convert<T, accum_t>::RightToLeft(rs[row] * ps[col] * gOut);

            atomicAdd(&gIn, gContrib);
        }
    }
}
230
+
231
// GPU entry point. Chooses between a raw-pointer fast path (single input
// image, contiguous tensors — inputIndices can be ignored since every index
// must be 0) and the general packed-accessor kernel. Both paths currently
// support float32 + bilinear only.
torch::Tensor gpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    auto output = input.new_empty({ inputIndices.size(0), input.size(1), grid.size(1), grid.size(2) });


    if (method != "bilinear"s) {
        throw runtime_error("Only 'bilinear' sampling is currently supported!");
    }

    if (input.size(0) == 1 && input.is_contiguous() && grid.is_contiguous()) {
        uint32_t gridNumCells = grid.size(1) * grid.size(2);
        // x -> grid cell, y -> channel, z -> grid batch index
        dim3 blockDim(32, 3, 1);
        dim3 gridDim(div_up(gridNumCells, blockDim.x),
                     div_up(input.size(1), blockDim.y),
                     div_up(grid.size(0), blockDim.z));
        single_ex_grid_sample_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
            input.data_ptr<float>(),
            input.size(2), input.size(3), input.size(1),
            reinterpret_cast<const float2*>(grid.data_ptr()),
            gridNumCells,
            output.data_ptr<float>()
        );

    } else {
        // z is batch idx
        // y is channel
        // x is w*h
        dim3 blockDim(32, 1, 3);
        dim3 gridDim(div_up(grid.size(1) * grid.size(2), blockDim.x),
                     div_up(input.size(1), blockDim.y),
                     div_up(inputIndices.size(0), blockDim.z));
        indirect_grid_sample_forward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
            input.packed_accessor32<float, 4>(),
            grid.packed_accessor32<float, 4>(),
            inputIndices.packed_accessor32<int64_t, 1>(),
            output.packed_accessor32<float, 4>()
        );
    }

    // Dtype-generic dispatch kept for reference; disabled while only float32
    // is supported by the kernels above.
    //AT_DISPATCH_FLOATING_TYPES_AND_HALF(
    //    input.scalar_type(),
    //    "gpu_indirect_grid_sample_forward",
    //    ([&] {
    //        typedef typename remap_half<scalar_t>::type T;
    //        // typedef scalar_t T;
    //        if (method == "bilinear") {
    //            indirect_grid_sample_forward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
    //                input.packed_accessor64<T, 4>(),
    //                grid.packed_accessor64<T, 4>(),
    //                inputIndices.packed_accessor64<int64_t, 1>(),
    //                output.packed_accessor64<T, 4>()
    //            );
    //        } else {
    //            throw runtime_error("Unsupported resample method: " + method);
    //        }
    //    })
    //);

    return output;
}
291
+
292
// GPU backward entry point. Returns { gradInput, gradGrid }. gradGrid is
// allocated as zeros and the kernel never writes it (grid gradients are not
// implemented), so callers receive a zero grid gradient.
std::vector<torch::Tensor> gpu_indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    auto gradInput = torch::zeros_like(input);
    auto gradGrid = torch::zeros_like(grid);

    // z is batch idx
    // y is channel
    // x is w*h
    dim3 blockDim(32, 1, 1);
    dim3 gridDim(div_up(grid.size(1) * grid.size(2), blockDim.x),
                 div_up(input.size(1), blockDim.y),
                 div_up(inputIndices.size(0), blockDim.z));

    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(),
        "gpu_indirect_grid_sample_backward",
        ([&] {
            // remap_half converts at::Half to a device-native half type
            typedef typename remap_half<scalar_t>::type T;
            // typedef scalar_t T;
            if (method == "bilinear") {
                indirect_grid_sample_backward_bilinear_kernel KERNEL_ARG2(gridDim, blockDim) (
                    input.packed_accessor64<T, 4>(),
                    grid.packed_accessor64<T, 4>(),
                    inputIndices.packed_accessor64<int64_t, 1>(),
                    gradOutput.packed_accessor64<T, 4>(),
                    gradInput.packed_accessor64<T, 4>(),
                    gradGrid.packed_accessor64<T, 4>()
                );
            } else {
                throw runtime_error("Unsupported resample method: " + method);
            }
        })
    );

    return { gradInput, gradGrid };
}
nemotron-ocr/cpp/better_grid_sample/grid_sample.h ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <torch/torch.h>

// Expands a per-input count vector into a per-output input-index vector
// via repeat_interleave (see the worked example below).
inline
torch::Tensor region_counts_to_indices(torch::Tensor regionCounts, int64_t numOutputs)
{
    // If there's only one example, we can trivially return idx 0 for all
    if (regionCounts.size(0) == 1) {
        return torch::zeros({ numOutputs }, regionCounts.options().dtype(torch::kInt64));
    }

    // regionCounts will be some tensor like [ 5, 1, 10, 2 ] which means that the first 5 outputs
    // correspond to the first input, the next output to the second input, 10 to the third, and so on.

    // We want to convert this to instead have an entry for each output which specifies the index of the corresponding input.
    // To do this, we can count the number of times the output index exceeds the cumulative input counts.
    // e.g. the cumulative region count for the above tensor is [ 5, 6, 16, 18 ].
    // The output indices 0-4 are not greater than or equal to any cumulative count, so they get the input index of 0.
    // The output index 5 is equal to a single count, therefore index 1.
    // The outputs 6-15 are all greater than or equal to two cumulative counts, therefore index 2.
    // And so on.

    auto indices = torch::arange(regionCounts.size(0), regionCounts.options().dtype(torch::kInt64));

    auto outputIndices = torch::repeat_interleave(indices, regionCounts, /*dim=*/ 0, /*output_size=*/ numOutputs);

    return outputIndices;
}

// Device-specific implementations (defined in the .cu / .cpp files).
torch::Tensor gpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);
torch::Tensor cpu_indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);
std::vector<torch::Tensor> gpu_indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method);

// Validates arguments and routes to the CPU or GPU forward implementation.
// Requires: all tensors on the same device, inputIndices batch matches the
// grid batch, and the grid's last dimension is (u, v) pairs.
inline
torch::Tensor indirect_grid_sample_forward(torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    if (input.is_cuda() != grid.is_cuda() || input.is_cuda() != inputIndices.is_cuda()) {
        throw std::runtime_error("Input tensors must all be on the same device!");
    }
    if (inputIndices.size(0) != grid.size(0)) {
        throw std::runtime_error("The batch dimensions must match!");
    }
    if (grid.size(-1) != 2) {
        throw std::runtime_error("The final grid dimension must be 2.");
    }

    if (input.is_cuda()) {
        return gpu_indirect_grid_sample_forward(std::move(input), std::move(grid), std::move(inputIndices), method);
    } else {
        return cpu_indirect_grid_sample_forward(std::move(input), std::move(grid), std::move(inputIndices), method);
    }
}

// Backward routing. Only the GPU path is implemented; the CPU path throws.
inline
std::vector<torch::Tensor> indirect_grad_sample_backward(torch::Tensor gradOutput, torch::Tensor input, torch::Tensor grid, torch::Tensor inputIndices, const std::string &method)
{
    if (gradOutput.is_cuda()) {
        return gpu_indirect_grad_sample_backward(std::move(gradOutput), std::move(input), std::move(grid), std::move(inputIndices), method);
    } else {
        throw std::runtime_error("Not implemented!");
    }
}
nemotron-ocr/cpp/common.cpp ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "common.h"
5
+
6
+ #include <sstream>
7
+
8
+ using namespace std;
9
+
10
+ void print_tensor(const torch::Tensor &t) {
11
+ cout << t << endl;
12
+ }
nemotron-ocr/cpp/common.h ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <ostream>
7
+ #include <vector>
8
+
9
+ #include <torch/torch.h>
10
+
11
+ template<typename T>
12
+ inline
13
+ std::ostream &operator<<(std::ostream &os, const std::vector<T> &v) {
14
+ os << "[";
15
+ if (! v.empty()) {
16
+ os << v[0];
17
+ for (size_t i = 1; i < v.size(); ++i) {
18
+ os << ", " << v[i];
19
+ }
20
+ }
21
+ os << "]";
22
+ return os;
23
+ }
24
+
25
+ template<int Counter, typename ...Args>
26
+ struct _inner_tuple_print
27
+ {
28
+ inline
29
+ static std::ostream &print(std::ostream &os, const std::tuple<Args...> &t) {
30
+ _inner_tuple_print<Counter - 1, Args...>::print(os, t);
31
+
32
+ os << ", " << std::get<Counter>(t);
33
+ return os;
34
+ }
35
+ };
36
+
37
+ template<typename ...Args>
38
+ struct _inner_tuple_print<0, Args...>
39
+ {
40
+ inline
41
+ static std::ostream &print(std::ostream &os, const std::tuple<Args...> &t) {
42
+ os << std::get<0>(t);
43
+ return os;
44
+ }
45
+ };
46
+
47
+
48
+ template<typename... Args>
49
+ inline
50
+ std::ostream &operator<<(std::ostream &os, const std::tuple<Args...> &t) {
51
+ os << "(";
52
+ _inner_tuple_print<sizeof...(Args) - 1, Args...>::print(os, t);
53
+ os << ")";
54
+ return os;
55
+ }
56
+
57
+ void print_tensor(const torch::Tensor &t);
nemotron-ocr/cpp/cuda_intellisense.cuh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

// Compatibility shims so CUDA sources parse cleanly both under IntelliSense
// and under a plain host compiler: the kernel-launch macros expand to nothing
// and the CUDA qualifiers become no-ops.
#if defined(__INTELLISENSE__) || !defined(__NVCC__)
#ifndef KERNEL_ARG2
#define KERNEL_ARG2(grid, block)
#define KERNEL_ARG3(grid, block, sh_mem)
#define KERNEL_ARG4(grid, block, sh_mem, stream)
#define __global__
#define __device__
#define __host__
#endif
#endif

#ifdef __INTELLISENSE__
// Make the CUDA built-ins visible to the IDE only; these declarations are
// never compiled by nvcc or the host compiler.
#define __CUDACC__
#include <cuda_runtime.h>

void __syncthreads(); // workaround __syncthreads warning

dim3 threadIdx;
dim3 blockIdx;
dim3 blockDim;
dim3 gridDim;

#else
// Real compilation: the launch macros expand to the <<< >>> syntax.
#ifndef KERNEL_ARG2
#define KERNEL_ARG2(grid, block) <<< grid, block >>>
#define KERNEL_ARG3(grid, block, sh_mem) <<< grid, block, sh_mem >>>
#define KERNEL_ARG4(grid, block, sh_mem, stream) <<< grid, block, sh_mem, stream >>>
#endif
#endif

#define __any_device__ __host__ __device__

// __lib_inline__: force-inline under nvcc, plain inline elsewhere.
#ifdef __NVCC__
#define __lib_inline__ __forceinline__

#else
#define __lib_inline__ inline
#endif

// Ceiling division, usable from both host and device code (e.g. for
// computing kernel grid dimensions).
template<typename T1, typename T2>
__any_device__
inline auto div_up(T1 n, T2 d)
{
    return (n + d - 1) / d;
}
nemotron-ocr/cpp/geometry.h ADDED
@@ -0,0 +1,1100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <iostream>
9
+ #include <type_traits>
10
+
11
// Torch interop (accessor-based constructors) can be compiled out by
// defining _GEOMETRY_NO_TORCH.
#ifndef _GEOMETRY_NO_TORCH
#include <torch/torch.h>
#endif

#include "cuda_intellisense.cuh"

// Host vs device backends: pick the std or thrust flavor of sort/swap/tuple
// so the same templates compile under both plain C++ and NVCC.
#ifndef __NVCC__
#define SORT_ALGO std::sort
#define SWAP std::swap

template<typename ...Args>
using tuple_t = std::tuple<Args...>;

#else

#include <thrust/sort.h>
#include <thrust/tuple.h>

#define SORT_ALGO thrust::sort
#define SWAP thrust::swap

template<typename ...Args>
using tuple_t = thrust::tuple<Args...>;
#endif
35
+
36
// 2-D point / vector with value semantics. T is the coordinate type
// (typically float or double). Compound operators are declared here and
// defined out-of-line below.
template<typename T>
struct Point_ {
    typedef T inner_type;

    T X, Y;

    // Default construction leaves X/Y uninitialized (cheap for device arrays).
    Point_() = default;

    __any_device__
    Point_(T x, T y) : X(x), Y(y) {}

    // Reads two consecutive scalars: ptr[0] -> X, ptr[1] -> Y.
    __any_device__
    Point_(T *ptr) : X(ptr[0]), Y(ptr[1]) {}

#ifndef _GEOMETRY_NO_TORCH
    // Construct from a 1-D torch accessor; assumes it holds at least two
    // elements (x, y) — TODO confirm at call sites.
    template<typename T2>
    __any_device__
    Point_(const torch::TensorAccessor<T2, 1> &accessor) : X(accessor[0]), Y(accessor[1]) {}

    template<typename T2>
    __any_device__
    Point_(const torch::PackedTensorAccessor64<T2, 1> &accessor) : X(accessor[0]), Y(accessor[1]) {}
#endif

    // Component-wise in-place arithmetic.
    __any_device__
    Point_ &operator+=(const Point_ &other);

    __any_device__
    Point_ &operator-=(const Point_ &other);

    __any_device__
    Point_ &operator*=(const Point_ &other);

    __any_device__
    Point_ &operator/=(const Point_ &other);

    // Scalar divide/multiply.
    template<typename W>
    __any_device__
    Point_ &operator/=(W w);

    template<typename W>
    __any_device__
    Point_ &operator*=(W w);

    // Component-wise negation.
    __any_device__
    Point_ operator-() {
        return { -X, -Y };
    }

    __any_device__
    T Sum() const { return X + Y; }

    // Angle of the vector (atan2(Y, X)); defined out-of-line.
    __any_device__
    T Angle() const;

    // Member-wise swap; SWAP resolves to std::swap or thrust::swap.
    __any_device__
    void swap(Point_ &other) noexcept {
        SWAP(X, other.X);
        SWAP(Y, other.Y);
    }
};
97
+
98
+ template<typename T>
99
+ __lib_inline__ __any_device__
100
+ void swap(Point_<T> &a, Point_<T> &b) {
101
+ a.swap(b);
102
+ }
103
+
104
+
105
+ template<typename T>
106
+ __any_device__
107
+ __lib_inline__ T Point_<T>::Angle() const {
108
+ #ifndef __NVCC__
109
+ using std::atan2;
110
+ #endif
111
+ return atan2(Y, X);
112
+ }
113
+
114
+ template<typename T>
115
+ __any_device__
116
+ __lib_inline__ Point_<T> min(const Point_<T> &a, const Point_<T> &b) {
117
+ #ifndef __NVCC__
118
+ using std::min;
119
+ #endif
120
+ return {
121
+ min(a.X, b.X),
122
+ min(a.Y, b.Y)
123
+ };
124
+ }
125
+
126
+ template<typename T>
127
+ __any_device__
128
+ __lib_inline__ Point_<T> max(const Point_<T> &a, const Point_<T> &b) {
129
+ #ifndef __NVCC__
130
+ using std::max;
131
+ #endif
132
+ return {
133
+ max(a.X, b.X),
134
+ max(a.Y, b.Y)
135
+ };
136
+ }
137
+
138
// Axis-aligned bounding box stored as (X, Y) = min corner and
// (MaxX, MaxY) = max corner.
template<typename T>
struct AABB_ {
    typedef T inner_type;

    T X;
    T Y;
    T MaxX;
    T MaxY;

    AABB_() = default;
    __any_device__
    AABB_(T x, T y, T maxX, T maxY)
        : X(x), Y(y), MaxX(maxX), MaxY(maxY) {}

    // Half-open containment: [X, MaxX) x [Y, MaxY).
    __any_device__
    bool Contains(const Point_<T> &p) const {
        return p.X >= X && p.X < MaxX &&
            p.Y >= Y && p.Y < MaxY;
    }

    // Smallest box covering both this and other.
    __any_device__ __lib_inline__
    AABB_ Union(const AABB_ &other) const {
#ifndef __NVCC__
        using std::min;
        using std::max;
#endif
        T minX = min(X, other.X);
        T maxX = max(MaxX, other.MaxX);
        T minY = min(Y, other.Y);
        T maxY = max(MaxY, other.MaxY);

        return { minX, minY, maxX, maxY };
    }

    // Translate the whole box by -offset.
    __any_device__
    AABB_ &operator-=(const Point_<T> &offset) {
        X -= offset.X;
        MaxX -= offset.X;
        Y -= offset.Y;
        MaxY -= offset.Y;
        return *this;
    }

    __any_device__
    __lib_inline__ T Width() const { return MaxX - X; }
    __any_device__
    __lib_inline__ T Height() const { return MaxY - Y; }
    __any_device__
    __lib_inline__ T Area() const { return Width() * Height(); }

    // Flat access (0 -> X, 1 -> Y, 2 -> MaxX, 3 -> MaxY). Relies on the
    // members being laid out contiguously, hence the standard-layout check.
    __lib_inline__ T &operator[] (int64_t idx)
    {
        static_assert(std::is_standard_layout<AABB_<T>>::value, "This function is only valid for standard layout");
        return (&X)[idx];
    }
    __lib_inline__ T operator[] (int64_t idx) const
    {
        static_assert(std::is_standard_layout<AABB_<T>>::value, "This function is only valid for standard layout");
        return (&X)[idx];
    }

    // Overlap box of this and other; clamped so a disjoint pair yields a
    // degenerate (zero-area) box rather than a negative-extent one.
    __any_device__ __lib_inline__
    AABB_ Intersection(const AABB_ &other) const {
#ifndef __NVCC__
        using std::min;
        using std::max;
#endif
        T minX = max(X, other.X);
        T minY = max(Y, other.Y);
        T maxX = min(MaxX, other.MaxX);
        T maxY = min(MaxY, other.MaxY);
        // Prevent negative area
        minX = min(minX, maxX);
        minY = min(minY, maxY);
        return { minX, minY, maxX, maxY };
    }

    __any_device__ __lib_inline__
    T IntersectionArea(const AABB_ &other) const { return Intersection(other).Area(); }
};
218
+
219
+ template<typename T, typename Derived>
220
+ struct QuadBase_ {
221
+ typedef T inner_type;
222
+
223
+ __any_device__
224
+ AABB_<T> Bounds() const;
225
+
226
+ __any_device__
227
+ bool Contains(const Point_<T> &p) const;
228
+
229
+ __any_device__
230
+ T Area() const;
231
+
232
+ __any_device__
233
+ T Height() const;
234
+
235
+ __any_device__
236
+ T Width() const;
237
+
238
+ template<typename Derived2>
239
+ __any_device__
240
+ T IntersectionArea(const QuadBase_<T, Derived2> &other) const;
241
+
242
+ template<typename Derived2>
243
+ __any_device__
244
+ T IOU(const QuadBase_<T, Derived2> &other) const;
245
+
246
+ template<typename Derived2>
247
+ __any_device__
248
+ T IOU_UpperBound(const QuadBase_<T, Derived2> &other) const;
249
+
250
+ __any_device__
251
+ Point_<T> Center() const;
252
+
253
+ template<typename Derived2>
254
+ __any_device__
255
+ /*
256
+ Returns 3 geometric associations between the two quads:
257
+ 0: The percent shared area between this and other relative to this (e.g. if other contains this, then it returns 1)
258
+ 1: The percent shared area between other and this relative to other (e.g. if this contains other, then it return 1)
259
+ 2: The IOU of the two quads
260
+ */
261
+ tuple_t<T, T, T> RegionSizes(const QuadBase_<T, Derived2> &other) const;
262
+
263
+ template<typename Derived2>
264
+ __any_device__
265
+ tuple_t<T, T, T> RegionSizes_UpperBound(const QuadBase_<T, Derived2> &other) const;
266
+
267
+ __any_device__
268
+ Derived &operator/=(T val) {
269
+ auto rcp = 1 / val;
270
+ return *this *= rcp;
271
+ }
272
+
273
+ __any_device__
274
+ Derived &operator*=(T val) {
275
+ auto dThis = static_cast<Derived*>(this);
276
+ #pragma unroll
277
+ for (size_t i = 0; i < 4; ++i) {
278
+ dThis->Vertices[i] *= val;
279
+ }
280
+ return *dThis;
281
+ }
282
+
283
+ friend auto begin(const QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices; }
284
+ friend auto begin(QuadBase_& q) { return static_cast<const Derived&>(q).Vertices; }
285
+ friend auto end(const QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices + 4; }
286
+ friend auto end(QuadBase_ &q) { return static_cast<const Derived&>(q).Vertices + 4; }
287
+ };
288
+
289
// Non-owning quad view: Vertices points at four Point_<T> values stored
// elsewhere (e.g. a row of a tensor). The caller keeps that storage alive.
template<typename T>
struct Quad_ : QuadBase_<T, Quad_<T>> {
    Point_<T> *Vertices = nullptr;

    Quad_() = default;
    // Reinterprets 8 consecutive scalars as 4 (x, y) pairs.
    __any_device__
    Quad_(T *dataPtr)
        : Vertices(reinterpret_cast<Point_<T>*>(dataPtr)) {}
    __any_device__
    Quad_(Point_<T> *dataPtr)
        : Vertices(dataPtr) {}

    // Unchecked vertex access, 0..3.
    template<typename index_t>
    __any_device__ __lib_inline__
    const Point_<T> &operator[](index_t offset) const { return Vertices[offset]; }
    template<typename index_t>
    __any_device__ __lib_inline__
    Point_<T> &operator[](index_t offset) { return Vertices[offset]; }
};
308
+
309
// Owning quad: the four vertices are copied into the struct itself, so it
// is safe to keep after the source buffer goes away (unlike Quad_).
template<typename T>
struct InPlaceQuad_ : public QuadBase_<T, InPlaceQuad_<T>> {
    Point_<T> Vertices[4];

    InPlaceQuad_() = default;
    // Copies 8 consecutive scalars as 4 (x, y) pairs. The device branch
    // uses a manually unrolled loop; the host branch uses std::copy.
    __any_device__
    InPlaceQuad_(const T *dataPtr)
    {
#if defined(__NVCC__)
        T *pVals = reinterpret_cast<T*>(Vertices);
        #pragma unroll
        for (uint32_t i = 0; i < 8; ++i) {
            pVals[i] = dataPtr[i];
        }
#else
        using std::copy;
        copy(dataPtr, dataPtr + 8, reinterpret_cast<T*>(Vertices));
#endif
    }
    // Copies four points.
    __any_device__
    InPlaceQuad_(const Point_<T> *dataPtr)
    {
#if defined(__NVCC__)
        #pragma unroll
        for (uint32_t i = 0; i < 4; ++i) {
            Vertices[i] = dataPtr[i];
        }
#else
        using std::copy;
        copy(dataPtr, dataPtr + 4, Vertices);
#endif
    }

    // Unchecked vertex access, 0..3.
    template<typename index_t>
    __any_device__ __lib_inline__
    const Point_<T> &operator[](index_t v) const { return Vertices[v]; }

    template<typename index_t>
    __any_device__ __lib_inline__
    Point_<T> &operator[](index_t v) { return Vertices[v]; }
};
350
+
351
// CRTP base for polygons. Derived must provide operator[] vertex access and
// a Count member; all operations are defined out-of-line below.
template<typename T, typename Derived>
struct PolygonBase_ {
    typedef T inner_type;

    // Axis-aligned bounding box over all vertices.
    __any_device__
    AABB_<T> Bounds() const;

    // Point-in-polygon test via ray casting (see polygon_contains).
    __any_device__
    bool Contains(const Point_<T> &p) const;

    // Perimeter length, including the closing edge.
    __any_device__
    T EdgeLength() const;

    // Area-weighted centroid (see polygon_center).
    __any_device__
    Point_<T> Center() const;

    // Unsigned area via the shoelace formula.
    __any_device__
    T Area() const;
};
370
+
371
// Non-owning polygon view: Vertices points at Count points stored elsewhere;
// the caller keeps that storage alive.
template<typename T>
struct Polygon_ : PolygonBase_<T, Polygon_<T>> {
    Point_<T> *Vertices = nullptr;
    size_t Count = 0;

    Polygon_() = default;
    // Reinterprets 2 * vertexCount consecutive scalars as (x, y) pairs.
    __any_device__
    Polygon_(T *dataPtr, size_t vertexCount)
        : Vertices(reinterpret_cast<Point_<T>*>(dataPtr)), Count(vertexCount) {}
    __any_device__
    Polygon_(Point_<T> *dataPtr, size_t vertexCount)
        : Vertices(dataPtr), Count(vertexCount) {}

    // Unchecked vertex access, 0..Count-1.
    __any_device__
    const Point_<T> &operator[](size_t offset) const { return Vertices[offset]; }
    __any_device__
    Point_<T> &operator[](size_t offset) { return Vertices[offset]; }
};
389
+
390
// Line segment between two endpoints A and B.
template<typename T>
struct Segment_ {
    Point_<T> A, B;

    Segment_() = default;
    __any_device__
    Segment_(const Point_<T> &a, const Point_<T> &b) : A(a), B(b) {}

    __any_device__
    T Length() const;
    // Squared length; avoids the sqrt when only comparisons are needed.
    __any_device__
    T LengthSq() const;
    // Segment-segment intersection; on success writes the intersection
    // point to out_ptAlong and returns true (defined out-of-line).
    __any_device__
    bool Intersection(const Segment_<T> &other, Point_<T> &out_ptAlong) const;
};
405
+
406
+ template<typename T>
407
+ __any_device__
408
+ __lib_inline__ Point_<T> operator+(const Point_<T> &a, const Point_<T> &b) {
409
+ return { a.X + b.X, a.Y + b.Y };
410
+ }
411
+
412
+ template<typename T>
413
+ __any_device__
414
+ __lib_inline__ Point_<T> operator-(const Point_<T> &a, const Point_<T> &b) {
415
+ return { a.X - b.X, a.Y - b.Y };
416
+ }
417
+
418
+ template<typename T, typename W>
419
+ __any_device__
420
+ __lib_inline__ Point_<T> operator*(W scale, const Point_<T> &p) {
421
+ return { scale * p.X, scale * p.Y };
422
+ }
423
+
424
+ template<typename T, typename W>
425
+ __any_device__
426
+ __lib_inline__ Point_<T> operator*(const Point_<T> &p, W scale) {
427
+ return { scale * p.X, scale * p.Y };
428
+ }
429
+
430
+ template<typename T, typename W>
431
+ __any_device__
432
+ __lib_inline__ Point_<T> operator/(const Point_<T> &p, W divisor) {
433
+ return { p.X / divisor, p.Y / divisor };
434
+ }
435
+
436
+ template<typename T>
437
+ __any_device__
438
+ __lib_inline__ Point_<T> operator*(const Point_<T> &a, const Point_<T> &b) {
439
+ return { a.X * b.X, a.Y * b.Y };
440
+ }
441
+
442
+ template<typename T, typename W>
443
+ __any_device__
444
+ __lib_inline__ Point_<T> operator-(const Point_<T> &p, W v) {
445
+ return { p.X - v, p.Y - v };
446
+ }
447
+
448
+ template<typename T>
449
+ __any_device__
450
+ __lib_inline__ Point_<T> &Point_<T>::operator+=(const Point_<T> &p) {
451
+ X = X + p.X;
452
+ Y = Y + p.Y;
453
+ return *this;
454
+ }
455
+
456
+ template<typename T>
457
+ __any_device__
458
+ __lib_inline__ Point_<T> &Point_<T>::operator-=(const Point_<T> &p) {
459
+ X = X - p.X;
460
+ Y = Y - p.Y;
461
+ return *this;
462
+ }
463
+
464
+ template<typename T>
465
+ __any_device__
466
+ __lib_inline__ Point_<T> &Point_<T>::operator*=(const Point_<T> &p) {
467
+ X = X * p.X;
468
+ Y = Y * p.Y;
469
+ return *this;
470
+ }
471
+
472
+ template<typename T>
473
+ __any_device__
474
+ __lib_inline__ Point_<T> &Point_<T>::operator/=(const Point_<T> &p) {
475
+ X = X / p.X;
476
+ Y = Y / p.Y;
477
+ return *this;
478
+ }
479
+
480
+ template<typename T>
481
+ template<typename W>
482
+ __any_device__
483
+ __lib_inline__ Point_<T> &Point_<T>::operator/=(W val) {
484
+ // TODO: This can be more efficient for float types by computing the reciprocal
485
+ X /= val;
486
+ Y /= val;
487
+ return *this;
488
+ }
489
+
490
+ template<typename T>
491
+ template<typename W>
492
+ __any_device__
493
+ __lib_inline__ Point_<T> &Point_<T>::operator*=(W val) {
494
+ X *= val;
495
+ Y *= val;
496
+ return *this;
497
+ }
498
+
499
+ template<typename T>
500
+ __any_device__
501
+ __lib_inline__ T dot(const Point_<T> &a, const Point_<T> &b) {
502
+ return a.X * b.X + a.Y * b.Y;
503
+ }
504
+
505
+ template<typename T>
506
+ __any_device__
507
+ __lib_inline__ T dot(const Point_<T> &p) {
508
+ return dot(p, p);
509
+ }
510
+
511
+ template<typename T>
512
+ __any_device__
513
+ __lib_inline__ T length(const Point_<T> &p) {
514
+ #ifndef __NVCC__
515
+ using std::sqrt;
516
+ #endif
517
+ return sqrt(dot(p));
518
+ }
519
+
520
+ template<typename T>
521
+ __any_device__
522
+ __lib_inline__ Point_<T> normalize(const Point_<T> &p) {
523
+ static constexpr T epsilon = std::numeric_limits<T>::epsilon();
524
+ auto len = length(p) + epsilon;
525
+ return { p.X / len, p.Y / len };
526
+ }
527
+
528
+ template<typename T>
529
+ __any_device__
530
+ __lib_inline__ Point_<T> ortho_2d(const Point_<T> &p) {
531
+ return { -p.Y, p.X };
532
+ }
533
+
534
+ template<typename T>
535
+ __host__
536
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Point_<T> &p) {
537
+ return os << "(" << p.X << ", " << p.Y << ")";
538
+ }
539
+
540
+ template<typename T>
541
+ __host__
542
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const AABB_<T> &b) {
543
+ return os << "[(" << b.X << ", " << b.Y << "), (" << b.MaxX << ", " << b.MaxY << ")]";
544
+ }
545
+
546
+ template<typename T>
547
+ __host__
548
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Segment_<T> &s) {
549
+ return os << "[(" << s.A.X << ", " << s.A.Y << "), (" << s.B.X << ", " << s.B.Y << ")]";
550
+ }
551
+
552
+ template<typename T>
553
+ __host__
554
+ __lib_inline__ std::ostream &operator<<(std::ostream &os, const Quad_<T> &q) {
555
+ os << "[" << q.Vertices[0];
556
+ for (size_t i = 1; i < 4; ++i) {
557
+ os << ", " << q.Vertices[i];
558
+ }
559
+ return os << "]";
560
+ }
561
+
562
+ template<typename T>
563
+ __any_device__
564
+ __lib_inline__ int _signum(T val) {
565
+ return (T(0) < val) - (val < T(0));
566
+ }
567
+
568
+ template<typename T>
569
+ __any_device__
570
+ __lib_inline__ T sign(const Point_<T> &p1, const Point_<T> &p2, const Point_<T> &p3) {
571
+ T ret = (p1.X - p3.X) * (p2.Y - p3.Y) - (p2.X - p3.X) * (p1.Y - p3.Y);
572
+ auto sgn = _signum(ret);
573
+ return sgn;
574
+ }
575
+
576
+ template<typename T>
577
+ __any_device__
578
+ __lib_inline__ T Segment_<T>::Length() const
579
+ {
580
+ #ifndef __NVCC__
581
+ using std::sqrt;
582
+ #endif
583
+ return sqrt(LengthSq());
584
+ }
585
+
586
+ template<typename T>
587
+ __any_device__
588
+ __lib_inline__ T Segment_<T>::LengthSq() const
589
+ {
590
+ return dot(B - A);
591
+ }
592
+
593
+ template<typename T>
594
+ __any_device__
595
+ inline bool Segment_<T>::Intersection(const Segment_<T> &other, Point_<T> &out_ptAlong) const
596
+ {
597
+ auto p1 = A, p2 = B, p3 = other.A, p4 = other.B;
598
+
599
+ auto denom = (p4.Y - p3.Y) * (p2.X - p1.X) - (p4.X - p3.X) * (p2.Y - p1.Y);
600
+
601
+ if (abs(denom) < 1e-8) {
602
+ return false;
603
+ }
604
+
605
+ auto numer = (p4.X - p3.X) * (p1.Y - p3.Y) - (p4.Y - p3.Y) * (p1.X - p3.X);
606
+
607
+ auto t = numer / denom;
608
+
609
+ auto Bnumer = (p2.X - p1.X) * (p1.Y - p3.Y) - (p2.Y - p1.Y) * (p1.X - p3.X);
610
+
611
+ auto Bt = Bnumer / denom;
612
+
613
+ if (t < 0 || t > 1 || Bt < 0 || Bt > 1) {
614
+ return false;
615
+ }
616
+
617
+ out_ptAlong = A + t * (B - A);
618
+
619
+ return true;
620
+ }
621
+
622
+ template<typename quad_t>
623
+ __any_device__
624
+ auto quad_center(const quad_t &quad) -> Point_<typename quad_t::inner_type>
625
+ {
626
+ typedef typename quad_t::inner_type T;
627
+
628
+ Point_<T> center = quad[0];
629
+ for (size_t i = 1; i < 4; ++i) {
630
+ center += quad[i];
631
+ }
632
+
633
+ return center / T{ 4 };
634
+ }
635
+
636
+ template<typename T, typename Derived>
637
+ __any_device__
638
+ Point_<T> QuadBase_<T, Derived>::Center() const {
639
+ return quad_center(static_cast<const Derived&>(*this));
640
+ }
641
+
642
+ template<typename quad_t>
643
+ __any_device__
644
+ auto quad_bounds(const quad_t &quad) -> AABB_<typename quad_t::inner_type>
645
+ {
646
+ #ifndef __NVCC__
647
+ using std::min;
648
+ using std::max;
649
+ #endif
650
+ auto minP = quad[0];
651
+ auto maxP = minP;
652
+ for (size_t i = 1; i < 4; ++i) {
653
+ auto qp = quad[i];
654
+ minP = min(minP, qp);
655
+ maxP = max(maxP, qp);
656
+ }
657
+ return { minP.X, minP.Y, maxP.X, maxP.Y };
658
+ }
659
+
660
+ template<typename T, typename Derived>
661
+ __any_device__
662
+ AABB_<T> QuadBase_<T, Derived>::Bounds() const {
663
+ return quad_bounds(static_cast<const Derived&>(*this));
664
+ }
665
+
666
// Point-in-quad test. Each sign() is -1/0/+1 for which side of an edge the
// point falls on; the point is inside iff all four agree strictly, i.e. the
// sum is +-4. A point exactly on an edge (any sign of 0) reports "outside".
// Works for either vertex winding. Assumes a convex quad — TODO confirm
// callers never pass self-intersecting quads.
template<typename Quad_t, typename point_t>
__any_device__
inline bool quad_contains(const Quad_t &quad, const point_t &pt)
{
#ifndef __NVCC__
    using std::abs;
#endif

    // Checks that the point lies on the interior side of each half plane
    auto d1 = sign(pt, quad[0], quad[1]);
    auto d2 = sign(pt, quad[1], quad[2]);
    auto d3 = sign(pt, quad[2], quad[3]);
    auto d4 = sign(pt, quad[3], quad[0]);

    // Earlier (more permissive) formulation kept for reference: it treated
    // edge points as inside.
    // bool has_neg = (d1 < 0) || (d2 < 0) || (d3 < 0) || (d4 < 0);
    // bool has_pos = (d1 > 0) || (d2 > 0) || (d3 > 0) || (d4 > 0);
    int tot = d1 + d2 + d3 + d4;

    // return !(has_neg && has_pos);
    return abs(tot) == 4;
}
687
+
688
+ template<typename T, typename Derived>
689
+ __any_device__
690
+ __lib_inline__ bool QuadBase_<T, Derived>::Contains(const Point_<T> &pt) const
691
+ {
692
+ return quad_contains(static_cast<const Derived&>(*this), pt);
693
+ }
694
+
695
+ template<typename PtList>
696
+ __any_device__
697
+ inline auto shoelace_area(const PtList &points, size_t numPts, bool isSigned=false) -> decltype(points[0].X)
698
+ {
699
+ #ifndef __NVCC__
700
+ using std::abs;
701
+ #endif
702
+
703
+ decltype(points[0].X) area = 0;
704
+
705
+ size_t j = numPts - 1;
706
+ for (size_t i = 0; i < numPts; ++i) {
707
+ auto Pi = points[i];
708
+ auto Pj = points[j];
709
+
710
+ area += (Pj.X + Pi.X) * (Pj.Y - Pi.Y);
711
+ j = i;
712
+ }
713
+
714
+ area = area / 2;
715
+
716
+ if (! isSigned) {
717
+ area = abs(area);
718
+ }
719
+
720
+ return area;
721
+ }
722
+
723
+ template<typename T, typename Derived>
724
+ __any_device__
725
+ __lib_inline__ T QuadBase_<T, Derived>::Height() const
726
+ {
727
+ auto &d = static_cast<const Derived&>(*this);
728
+ auto h1 = Segment_<T>(d[1], d[2]).Length();
729
+ auto h2 = Segment_<T>(d[3], d[0]).Length();
730
+ return (h1 + h2) / 2;
731
+ }
732
+
733
+ template<typename T, typename Derived>
734
+ __any_device__
735
+ __lib_inline__ T QuadBase_<T, Derived>::Width() const
736
+ {
737
+ auto &d = static_cast<const Derived&>(*this);
738
+ auto w1 = Segment_<T>(d[0], d[1]).Length();
739
+ auto w2 = Segment_<T>(d[3], d[2]).Length();
740
+ return (w1 + w2) / 2;
741
+ }
742
+
743
+ // A quad can be defined as the sum of the area of two triangles
744
+ template<typename T, typename Derived>
745
+ __any_device__
746
+ inline T QuadBase_<T, Derived>::Area() const
747
+ {
748
+ // auto vertices = static_cast<const Derived *>(this)->Vertices;
749
+ return shoelace_area(static_cast<const Derived&>(*this), 4);
750
+ }
751
+
752
// Exact intersection area of two quads via polygon clipping:
// 1) collect every vertex of each quad contained in the other, plus every
//    pairwise edge-edge intersection point;
// 2) sort the collected points by angle around their centroid to recover a
//    simple polygon;
// 3) measure it with the shoelace formula.
// Capacity note: at most 8 contained vertices + 16 edge intersections = 24
// points, so MAX_PTS = 32 cannot overflow.
template<typename Quad_t1, typename Quad_t2>
__any_device__
inline auto intersection_area(const Quad_t1 &quadsA, const Quad_t2 &quadsB) -> typename Quad_t1::inner_type
{
#ifndef __NVCC__
    using std::atan2;
#endif

    typedef typename Quad_t1::inner_type T;

    static const size_t MAX_PTS = 32;

    // Fixed-size device-friendly scratch buffers (no heap allocation).
    Point_<T> points[MAX_PTS], sortedPoints[MAX_PTS];
    T angles[MAX_PTS];
    size_t indices[MAX_PTS];
    size_t numPts = 0;

    auto addPt = [&] (const Point_<T> &p) {
        points[numPts] = p;
        ++numPts;
    };

    // Vertices of one quad lying inside the other belong to the clip polygon.
    for (size_t i = 0; i < 4; ++i) {
        Point_<T> aPt = quadsA[i];
        Point_<T> bPt = quadsB[i];

        if (quadsA.Contains(bPt)) {
            addPt(bPt);
        }
        if (quadsB.Contains(aPt)) {
            addPt(aPt);
        }
    }

    // Every crossing between an edge of A and an edge of B is also a vertex
    // of the clip polygon.
    for (size_t i = 0; i < 4; ++i) {
        Segment_<T> segA{ quadsA[i], quadsA[(i + 1) % 4] };

        for (size_t j = 0; j < 4; ++j) {
            Segment_<T> segB{ quadsB[j], quadsB[(j + 1) % 4] };

            Point_<T> ptAlong;
            if (segA.Intersection(segB, ptAlong)) {
                addPt(ptAlong);
            }
        }
    }

    // No shared vertices and no crossings => disjoint quads.
    if (numPts == 0) {
        return 0;
    }

    // Centroid of the collected points; used as the pivot for angular sorting.
    Point_<T> center{ 0, 0 };
    for (size_t i = 0; i < numPts; ++i) {
        center += points[i];
    }
    center /= numPts;

    for (size_t i = 0; i < numPts; ++i) {
        points[i] -= center;

        angles[i] = atan2(points[i].Y, points[i].X);

        indices[i] = i;
    }

    // Perform an argsort over the angles
    SORT_ALGO(indices, indices + numPts,
        [&] (size_t a, size_t b) {
            return angles[a] < angles[b];
        }
    );

    for (size_t i = 0; i < numPts; ++i) {
        sortedPoints[i] = points[indices[i]];
    }

    // Finally, we can compute the area of this polygon using the shoelace formula
    T area = shoelace_area(sortedPoints, numPts);

    return area;
}
833
+
834
+ template<typename T, typename Derived>
835
+ template<typename Derived2>
836
+ __any_device__
837
+ __lib_inline__ T QuadBase_<T, Derived>::IntersectionArea(const QuadBase_<T, Derived2> &other) const
838
+ {
839
+ return intersection_area(
840
+ static_cast<const Derived&>(*this),
841
+ static_cast<const Derived2&>(other)
842
+ );
843
+ }
844
+
845
+ template<typename T1, typename T2>
846
+ __any_device__
847
+ __lib_inline__ auto geometry_iou(const T1 &a, const T2 &b) -> decltype(a.Area())
848
+ {
849
+ auto aArea = a.Area();
850
+ auto bArea = b.Area();
851
+ auto ixArea = a.IntersectionArea(b);
852
+
853
+ auto unionArea = aArea + bArea - ixArea;
854
+
855
+ return ixArea / unionArea;
856
+ }
857
+
858
+ template<typename T, typename Derived>
859
+ template<typename Derived2>
860
+ __any_device__
861
+ __lib_inline__ T QuadBase_<T, Derived>::IOU(const QuadBase_<T, Derived2> &other) const
862
+ {
863
+ return geometry_iou(
864
+ static_cast<const Derived&>(*this),
865
+ static_cast<const Derived2&>(other)
866
+ );
867
+ }
868
+
869
+ template<typename T, typename Derived>
870
+ template<typename Derived2>
871
+ __any_device__
872
+ __lib_inline__ T QuadBase_<T, Derived>::IOU_UpperBound(const QuadBase_<T, Derived2> &other) const
873
+ {
874
+ return geometry_iou(
875
+ Bounds(),
876
+ other.Bounds()
877
+ );
878
+ }
879
+
880
+ template<typename T1, typename T2>
881
+ __any_device__ __lib_inline__
882
+ auto geometry_region_sizes(const T1 &a, const T2 &b) -> tuple_t<decltype(a.Area()), decltype(a.Area()), decltype(a.IntersectionArea(b))>
883
+ {
884
+ auto aArea = a.Area();
885
+ auto bArea = b.Area();
886
+ auto ixArea = a.IntersectionArea(b);
887
+
888
+ auto unionArea = aArea + bArea - ixArea;
889
+ auto iou = ixArea / unionArea;
890
+
891
+ return { ixArea / aArea, ixArea / bArea, iou };
892
+ }
893
+
894
+
895
+ template<typename T, typename Derived>
896
+ template<typename Derived2>
897
+ __any_device__ __lib_inline__
898
+ tuple_t<T, T, T> QuadBase_<T, Derived>::RegionSizes(const QuadBase_<T, Derived2> &other) const
899
+ {
900
+ return geometry_region_sizes(
901
+ static_cast<const Derived&>(*this),
902
+ static_cast<const Derived2&>(other)
903
+ );
904
+ }
905
+
906
+ template<typename T, typename Derived>
907
+ template<typename Derived2>
908
+ __any_device__ __lib_inline__
909
+ tuple_t<T, T, T> QuadBase_<T, Derived>::RegionSizes_UpperBound(const QuadBase_<T, Derived2> &other) const
910
+ {
911
+ return geometry_region_sizes(
912
+ Bounds(),
913
+ other.Bounds()
914
+ );
915
+ }
916
+
917
+ template<typename polygon_t>
918
+ __any_device__
919
+ auto polygon_bounds(const polygon_t &poly) -> AABB_<typename polygon_t::inner_type>
920
+ {
921
+ #ifndef __NVCC__
922
+ using std::min;
923
+ using std::max;
924
+ #endif
925
+ auto minP = poly[0];
926
+ auto maxP = minP;
927
+ for (size_t i = 1; i < poly.Count; ++i) {
928
+ auto qp = poly[i];
929
+ minP = min(minP, qp);
930
+ maxP = max(maxP, qp);
931
+ }
932
+ return { minP.X, minP.Y, maxP.X, maxP.Y };
933
+ }
934
+
935
+ template<typename T, typename Derived>
936
+ __any_device__
937
+ AABB_<T> PolygonBase_<T, Derived>::Bounds() const {
938
+ return polygon_bounds(static_cast<const Derived&>(*this));
939
+ }
940
+
941
// Point-in-polygon via ray casting: count how many polygon edges a segment
// from pt to a far-away point crosses; odd => inside. NOTE(review): the
// "ray" is a finite segment to (-1e6, -2e6), so the test silently breaks for
// coordinates of comparable magnitude, and a polygon vertex lying exactly on
// the test segment can be double-counted — acceptable here presumably
// because inputs are image-space coordinates; confirm against callers.
template<typename polygon_t, typename point_t>
__any_device__
bool polygon_contains(const polygon_t &poly, const point_t &pt)
{
    typedef typename polygon_t::inner_type T;

    // Some arbitrary segment. Technically this should be a ray, but functionally this will work
    Segment_<T> testSeg{ pt, { -1e6, -2e6 }};
    Point_<T> trash;

    int32_t ixCount = 0;
    for (size_t i = 0; i < poly.Count; ++i) {
        // Edge from vertex i to the next vertex (wrapping at the end).
        Segment_<T> polySeg{ poly[i], poly[(i + 1) % poly.Count] };

        if (testSeg.Intersection(polySeg, trash)) {
            ++ixCount;
        }
    }

    // If there are an odd number of intersections, then the point is inside
    return (ixCount % 2) == 1;
}
963
+
964
+ template<typename T, typename Derived>
965
+ __any_device__
966
+ bool PolygonBase_<T, Derived>::Contains(const Point_<T> &pt) const {
967
+ return polygon_contains(static_cast<const Derived&>(*this), pt);
968
+ }
969
+
970
+ template<typename polygon_t>
971
+ __any_device__
972
+ auto polygon_edge_length(const polygon_t &poly) -> typename polygon_t::inner_type
973
+ {
974
+ typedef typename polygon_t::inner_type T;
975
+
976
+ T ret = 0;
977
+
978
+ for (size_t i = 0; i < poly.Count; ++i) {
979
+ Segment_<T> seg{ poly[i], poly[(i + 1) % poly.Count] };
980
+
981
+ ret += seg.Length();
982
+ }
983
+
984
+ return ret;
985
+ }
986
+
987
+ template<typename T, typename Derived>
988
+ __any_device__
989
+ T PolygonBase_<T, Derived>::EdgeLength() const {
990
+ return polygon_edge_length(static_cast<const Derived&>(*this));
991
+ }
992
+
993
// Area-weighted centroid of a polygon (standard signed-area centroid
// formula). The signs of `common` and `a` cancel, so the result is
// independent of winding. NOTE(review): divides by the signed area — a
// degenerate (zero-area) polygon produces a division by zero; confirm
// callers never pass one.
template<typename polygon_t>
__any_device__
auto polygon_center(const polygon_t &poly) -> Point_<typename polygon_t::inner_type>
{
    typedef typename polygon_t::inner_type T;

    T cx = 0, cy = 0, a = 0;
    size_t j = poly.Count - 1;
    for (size_t i = 0; i < poly.Count; ++i) {
        Point_<T> p0 = poly[i];
        Point_<T> p1 = poly[j];

        // Cross-product term shared by the area and both centroid sums.
        T common = (p0.X * p1.Y - p1.X * p0.Y);
        cx += (p0.X + p1.X) * common;
        cy += (p0.Y + p1.Y) * common;
        a += common;

        j = i;
    }

    // Signed area = half the accumulated cross products.
    a /= 2;

    // Centroid = (1 / (6 * A)) * sum terms.
    Point_<T> center{ cx / (6 * a), cy / (6 * a) };

    return center;
}
1019
+
1020
+ template<typename T, typename Derived>
1021
+ __any_device__
1022
+ Point_<T> PolygonBase_<T, Derived>::Center() const {
1023
+ return polygon_center(static_cast<const Derived&>(*this));
1024
+ }
1025
+
1026
+ template<typename T, typename Derived>
1027
+ __any_device__
1028
+ T PolygonBase_<T, Derived>::Area() const {
1029
+ const Derived &dThis = static_cast<const Derived&>(*this);
1030
+ return shoelace_area(dThis, dThis.Count);
1031
+ }
1032
+
1033
+
1034
+ template<typename T>
1035
+ __any_device__
1036
+ Point_<T> nearest_point_on_segment(const Point_<T> &pt, const Segment_<T> &seg)
1037
+ {
1038
+ #ifndef __NVCC__
1039
+ using std::max;
1040
+ using std::min;
1041
+ #endif
1042
+
1043
+ const T l2 = seg.LengthSq();
1044
+
1045
+ if (l2 == 0.0) {
1046
+ return seg.A;
1047
+ }
1048
+
1049
+ const auto v = seg.A;
1050
+ const auto w = seg.B;
1051
+ // Consider the line extending the segment, parameterized as v + t*(w-v)
1052
+ // Find projection of point p onto the line
1053
+ auto t = dot(pt - v, w - v) / l2;
1054
+
1055
+ // Clamp between t=0 and t=1
1056
+ t = max(static_cast<T>(0), min(static_cast<T>(1), t));
1057
+
1058
+ const auto projection = v + t * (w - v);
1059
+
1060
+ return projection;
1061
+ }
1062
+
1063
+
1064
+ template<typename T>
1065
+ __any_device__
1066
+ Segment_<T> shortest_line_between_segments(const Segment_<T> &a, const Segment_<T> &b)
1067
+ {
1068
+ Segment_<T> segs[] = {
1069
+ { a.A, nearest_point_on_segment(a.A, b) },
1070
+ { a.B, nearest_point_on_segment(a.B, b) },
1071
+ { nearest_point_on_segment(b.A, a), b.A },
1072
+ { nearest_point_on_segment(b.B, a), b.B }
1073
+ };
1074
+
1075
+ T minDist = std::numeric_limits<T>::max();
1076
+ size_t idx;
1077
+
1078
+ #pragma unroll
1079
+ for (size_t i = 0; i < 4; ++i) {
1080
+ T dist = segs[i].LengthSq();
1081
+ if (dist < minDist) {
1082
+ minDist = dist;
1083
+ idx = i;
1084
+ }
1085
+ }
1086
+
1087
+ return segs[idx];
1088
+ }
1089
+
1090
+ // Find the distance between a point and the nearest point along the specified segment
1091
+ template<typename T>
1092
+ __any_device__
1093
+ T distance_to_segment(const Point_<T> &pt, const Segment_<T> &seg)
1094
+ {
1095
+ auto projection = nearest_point_on_segment(pt, seg);
1096
+
1097
+ auto dist = length(pt - projection);
1098
+
1099
+ return dist;
1100
+ }
nemotron-ocr/cpp/geometry_api/calc_poly_min_rrect.cpp ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "../graph_detection/encode_util.h"
7
+
8
+ #include "../geometry.h"
9
+ #include "matrix2x2.h"
10
+
11
+ using namespace std;
12
+
13
+ template<typename T>
14
+ void _calc_poly_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect);
15
+ template<typename T>
16
+ void _calc_quad_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect);
17
+
18
+ torch::Tensor calc_poly_min_rrect(torch::Tensor vertices)
19
+ {
20
+ if (vertices.size(0) < 3) {
21
+ throw runtime_error("Invalid polygon! Expected >= 3 vertices, got " + to_string(vertices.size(0)));
22
+ }
23
+
24
+ auto ret = torch::empty({ 4, 2 }, vertices.options());
25
+
26
+ auto retAcc = ret.accessor<float, 2>();
27
+
28
+ if (vertices.size(0) != 4) {
29
+ // OpenCV requires this to be a contiguous buffer
30
+ vertices = vertices.contiguous();
31
+ _calc_poly_min_rrect(vertices.accessor<float, 2>(), retAcc);
32
+ } else {
33
+ _calc_quad_min_rrect(vertices.accessor<float, 2>(), retAcc);
34
+ }
35
+
36
+ return ret;
37
+ }
38
+
39
+
40
// Given the polygon's vertices and the midpoints of its two "end" edges
// (leftCenter / rightCenter), compute an oriented bounding quad whose primary
// axis runs from leftCenter to rightCenter, and write the 4 corners into
// outRRect (4x2). Throws if the two centers coincide (no orientation).
template<typename T>
void _calc_bounds(const torch::TensorAccessor<T, 2> &vertices, torch::TensorAccessor<T, 2> &outRRect,
                  const Point_<T> &leftCenter, const Point_<T> &rightCenter)
{
    typedef Point_<T> Pointf;

    // Unit vector along the box's primary axis
    Pointf vecAlong = rightCenter - leftCenter;
    auto alongMag = length(vecAlong);

    if (alongMag == 0.0f) {
        // Degenerate: left/right centers coincide, orientation undefined
        throw runtime_error("Invalid polygon!");
    }

    vecAlong /= alongMag;

    // Perpendicular axis (vecAlong rotated 90 degrees)
    Pointf dOrtho{ -vecAlong.Y, vecAlong.X };

    Pointf center = (leftCenter + rightCenter) / 2.0f;

    // Rotation whose rows are the new basis vectors; applied transposed below
    // to map world coordinates into the (along, ortho) frame
    Matrix2x2<T> rotMat{ vecAlong, dOrtho };

    auto get_fn = [&vertices, &center] (int64_t i) {
        return Pointf{ vertices[i] } - center;
    };

    // All we care about it getting the bounds in the normalized space, so this saves
    // us from having to do any memory allocation
    // NOTE(review): min/max start at {0, 0}, so the resulting box always
    // contains the center point even if all vertices lie to one side —
    // presumably intentional; confirm if a tight box is wanted.
    Pointf minPt{ 0, 0 }, maxPt{ 0, 0 };
    auto tx_fn = [&minPt, &maxPt] (int64_t i, const Pointf &pt) {
        minPt = min(minPt, pt);
        maxPt = max(maxPt, pt);
    };

    // Rotate every vertex into the normalized frame, accumulating the bounds
    matmul_fn(vertices.size(0), get_fn, rotMat, tx_fn, transpose_tag{});

    // Axis-aligned corners of the bounds in the rotated frame
    Pointf rotBox[4] = {
        minPt,
        { maxPt.X, minPt.Y },
        maxPt,
        { minPt.X, maxPt.Y }
    };

    auto get_fn2 = [&rotBox] (int64_t i) {
        return rotBox[i];
    };

    // Rotate the corners back to world space and undo the centering
    auto assign_fn = [&center, &outRRect] (int64_t i, const Pointf &pt) {
        outRRect[i][0] = pt.X + center.X;
        outRRect[i][1] = pt.Y + center.Y;
    };

    matmul_fn(4, get_fn2, rotMat, assign_fn, contiguous_tag{});
}
93
+
94
+
95
// General (non-quad) path: find the polygon's two "bottom" (end) edges,
// determine which side chain is on top so the output corner order is
// consistent, then fit the oriented box along the axis joining the two
// bottom-edge midpoints.
template<typename T>
void _calc_poly_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect)
{
    typedef Point_<T> Pointf;
    typedef Polygon_<T> Polygonf;

    // Non-owning view over the (contiguous) vertex buffer
    Polygonf poly{ vertices.data(), vertices.size(0) };

    vector<graph_detection::Edge> bottoms = graph_detection::find_bottom(poly, false);

    if (bottoms.size() != 2) {
        throw runtime_error("Invalid polygon!");
    }

    // The two vertex chains connecting the bottom edges (the long sides)
    vector<graph_detection::Edge> longEdges[2];
    graph_detection::find_long_edges(poly, bottoms.data(), longEdges[0], longEdges[1]);

    ////
    // Determine which edge is above the other
    // cpts[i] = mean of the edge midpoints of chain i
    Pointf cpts[2];
    for (size_t i = 0; i < 2; ++i) {
        auto &pedge = longEdges[i];

        cpts[i] = Pointf{0.0f, 0.0f};
        float ct = 0;
        for (size_t z = 0; z < pedge.size(); ++z) {
            auto edge = pedge[z];
            Pointf p1 = poly[edge.A];
            Pointf p2 = poly[edge.B];
            cpts[i] += (p1 + p2) / 2.0f;
            ct += 1.0f;
        }

        if (ct < 1.0f) {
            throw runtime_error("Edge was empty!");
        }
        cpts[i] /= ct;
    }

    // Normalize the ordering of the bottoms based on the sign of the
    // chain-center offset (vector_sin — presumably the normalized vertical
    // component; confirm against graph_detection's definition)
    float vpp = graph_detection::vector_sin(cpts[0] - cpts[1]);
    if (vpp >= 0) {
        swap(bottoms[0], bottoms[1]);
    }
    ////

    Pointf edge1[2] = { poly[bottoms[0].A], poly[bottoms[0].B] };
    Pointf edge2[2] = { poly[bottoms[1].A], poly[bottoms[1].B] };

    // The box axis runs between the two bottom-edge midpoints
    Pointf c0 = (edge1[0] + edge1[1]) / 2.0f;
    Pointf c1 = (edge2[0] + edge2[1]) / 2.0f;

    _calc_bounds(vertices, outRRect, c0, c1);
}
+ }
148
+
149
+ template<typename T>
150
+ void _calc_quad_min_rrect(const torch::TensorAccessor<T, 2> vertices, torch::TensorAccessor<T, 2> outRRect)
151
+ {
152
+ typedef Point_<T> Pointf;
153
+
154
+ // Instead of finding an arbitrary rotated box, find a reasonable
155
+ // fit for the quadrangle
156
+ Pointf pts[4] = {
157
+ vertices[0], vertices[1], vertices[2], vertices[3]
158
+ };
159
+
160
+ Pointf c0 = (pts[0] + pts[3]) / 2.0f;
161
+ Pointf c1 = (pts[1] + pts[2]) / 2.0f;
162
+
163
+ _calc_bounds(vertices, outRRect, c0, c1);
164
+ }
nemotron-ocr/cpp/geometry_api/geometry_api.cpp ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "geometry_api_common.h"
7
+
8
+ using namespace std;
9
+
10
+ torch::Tensor rrect_to_quads_gpu(torch::Tensor rrects, float cellSize);
11
+
12
+ template<typename T>
13
+ torch::Tensor rrect_to_quads_impl(torch::Tensor rrects, T cellSize)
14
+ {
15
+ // BHW(5)
16
+ auto rrectAccess = rrects.accessor<T, 4>();
17
+
18
+ T cellOff = cellSize / 2;
19
+
20
+ auto quads = torch::empty({ rrects.size(0), rrects.size(1), rrects.size(2), 4, 2 }, rrects.options());
21
+
22
+ auto quadsAccess = quads.accessor<T, 5>();
23
+
24
+ for (long b = 0; b < rrects.size(0); ++b) {
25
+ for (long y = 0; y < rrects.size(1); ++y) {
26
+ for (long x = 0; x < rrects.size(2); ++x) {
27
+ auto rrect = rrectAccess[b][y][x];
28
+
29
+ auto quad = quadsAccess[b][y][x];
30
+
31
+ assign_rrect_to_quad(rrect, quad, cellSize, cellOff,
32
+ static_cast<T>(x),
33
+ static_cast<T>(y));
34
+ }
35
+ }
36
+ }
37
+
38
+ return quads;
39
+ }
40
+
41
// Public entry point: expand BHW(5) rrect parameters into BHW(4)(2) quad
// corners. CUDA tensors are routed to the dedicated kernel; CPU tensors run
// the scalar loop for whichever floating dtype the input carries.
torch::Tensor rrect_to_quads(torch::Tensor rrects, float cellSize)
{
    if (rrects.is_cuda()) {
        return rrect_to_quads_gpu(rrects, cellSize);
    }

    torch::Tensor quads;
    // Instantiate the impl for the tensor's runtime floating-point dtype
    AT_DISPATCH_FLOATING_TYPES(
        rrects.scalar_type(),
        "rrect_to_quads_impl",
        ([&] {
            quads = rrect_to_quads_impl<scalar_t>(rrects, scalar_t(cellSize));
        })
    );

    return quads;
}
+
59
+
60
+ template<typename T>
61
+ torch::Tensor rrect_to_quads_backward_impl(torch::Tensor rrects, torch::Tensor gradOutput)
62
+ {
63
+ // BHW(5)
64
+ auto gradInput = torch::empty_like(rrects);
65
+
66
+ auto rrectAccess = rrects.accessor<T, 4>();
67
+ // BHW42
68
+ auto gradOutputAccess = gradOutput.accessor<T, 5>();
69
+ auto gradInputAccess = gradInput.accessor<T, 4>();
70
+
71
+ for (long b = 0; b < rrects.size(0); ++b) {
72
+ for (long y = 0; y < rrects.size(1); ++y) {
73
+ for (long x = 0; x < rrects.size(2); ++x) {
74
+ assign_grad_rrect_to_quad<T>(rrectAccess[b][y][x], gradOutputAccess[b][y][x], gradInputAccess[b][y][x]);
75
+ }
76
+ }
77
+ }
78
+
79
+ return gradInput;
80
+ }
81
+
82
+ torch::Tensor rrect_to_quads_backward_gpu(torch::Tensor rrects, torch::Tensor gradOutput);
83
+
84
// Public backward entry point for rrect_to_quads: gradOutput is the gradient
// w.r.t. the BHW(4)(2) quads; returns the gradient w.r.t. the BHW(5) rrects.
torch::Tensor rrect_to_quads_backward(torch::Tensor rrects, torch::Tensor gradOutput)
{
    if (rrects.is_cuda()) {
        return rrect_to_quads_backward_gpu(rrects, gradOutput);
    }

    torch::Tensor gradInput;
    // Instantiate the impl for the tensor's runtime floating-point dtype
    AT_DISPATCH_FLOATING_TYPES(
        rrects.scalar_type(),
        "rrect_to_quads_backward_impl",
        ([&] {
            gradInput = rrect_to_quads_backward_impl<scalar_t>(rrects, gradOutput);
        })
    );

    return gradInput;
}
nemotron-ocr/cpp/geometry_api/geometry_api.h ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <torch/torch.h>
7
+
8
+ torch::Tensor rrect_to_quads(torch::Tensor rrects, float cellSize);
9
+ torch::Tensor rrect_to_quads_backward(torch::Tensor rrects, torch::Tensor gradOutput);
10
+
11
+ torch::Tensor calc_poly_min_rrect(torch::Tensor vertices);
12
+
13
+ float get_rel_continuation_cos(torch::Tensor rrectA, torch::Tensor rrectB);
14
+
15
+ torch::Tensor get_poly_bounds_quad(torch::Tensor poly);
nemotron-ocr/cpp/geometry_api/geometry_api_common.h ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include <torch/torch.h>
7
+
8
+ #include "../cuda_intellisense.cuh"
9
+ #include "../geometry.h"
10
+
11
+ #if defined(__NVCC__)
12
+ #include <math_constants.h>
13
+ #define GEO_PI CUDART_PI_F
14
+ #else
15
+ #include <math.h>
16
+ #define GEO_PI M_PI
17
+ #endif
18
+
19
+
20
// Write a point into a 2-element accessor/array as [x, y].
template<typename access_t, typename point_t>
__device__
inline
void pt_assign(access_t acc, const point_t &p) {
    acc[0] = p.X;
    acc[1] = p.Y;
}
27
+
28
// Decode one rrect — four distances (top, right, bottom, left) from the
// cell's prior center plus a rotation theta — into the 4 corner points of
// the corresponding quad. (x, y) is the cell's column/row in the feature
// map; cellOff (typically cellSize/2) places the prior at the cell center.
template<typename T, typename rrect_access_t>
__device__ __lib_inline__
InPlaceQuad_<T> cvt_rrect_to_quad(const rrect_access_t &rrect, T cellSize, T cellOff, T x, T y)
{
    typedef Point_<T> Pointf;

    // Prior: center of cell (x, y) in pixel space
    Pointf prior{
        x * cellSize + cellOff,
        y * cellSize + cellOff
    };

    T dTop = rrect[0];
    T dRight = rrect[1];
    T dBottom = rrect[2];
    T dLeft = rrect[3];
    T theta = rrect[4];

    // Rotated basis: vX points along theta, vY is vX rotated by -90 degrees
    T piOver2{GEO_PI / 2.0f};
    Pointf vX{ cos(theta), sin(theta) };
    Pointf vY{ cos(theta - piOver2), sin(theta - piOver2) };

    InPlaceQuad_<T> ret;

    // Corners as prior +/- the side distances along the rotated axes
    ret[0] = prior - vX * dLeft + vY * dTop;
    ret[1] = prior + vX * dRight + vY * dTop;
    ret[2] = prior + vX * dRight - vY * dBottom;
    ret[3] = prior - vX * dLeft - vY * dBottom;

    return ret;
}
58
+
59
// Convert one rrect to a quad and copy the 8 resulting scalars into the
// output accessor. The raw-pointer copy relies on both InPlaceQuad_<T> and
// the accessor's backing storage being 8 contiguous T values (4 points x 2
// coordinates) — TODO confirm the accessor row is contiguous at call sites.
template<typename rrect_access_t, typename quad_access_t, typename T>
__device__ __lib_inline__
void assign_rrect_to_quad(const rrect_access_t &rrect, quad_access_t &quad,
                          T cellSize, T cellOff, T x, T y)
{
    const InPlaceQuad_<T> cvQuad = cvt_rrect_to_quad<T>(rrect, cellSize, cellOff, x, y);

    const T *pInQuad = reinterpret_cast<const T*>(&cvQuad);
    T *pOutQuad = reinterpret_cast<T*>(quad.data());

    #pragma unroll
    for (uint32_t i = 0; i < 8; ++i) {
        pOutQuad[i] = pInQuad[i];
    }
}
74
+
75
// Backward of cvt_rrect_to_quad for one cell: given the gradient of the loss
// w.r.t. the 4 quad corners (gradOutput, 4x2), write the gradient w.r.t. the
// 5 rrect parameters (top, right, bottom, left, theta) into gradInput.
// Each corner is prior +/- vX*side +/- vY*side, so a side's gradient is the
// dot product of the touching corners' gradients with the matching basis
// vector; theta's gradient flows through the basis derivatives dVX/dVY.
template<typename T, typename rrect_access_t, typename quad_access_t>
__device__
inline
void assign_grad_rrect_to_quad(const rrect_access_t &rrect,
                               const quad_access_t &gradOutput,
                               rrect_access_t gradInput)
{
    typedef Point_<T> Pointf;

    T Top = rrect[0];
    T Right = rrect[1];
    T Bottom = rrect[2];
    T Left = rrect[3];
    T theta = rrect[4];

    // Recompute the forward basis vectors
    T piOver2{GEO_PI / 2.0f};
    Pointf vX{ cos(theta), sin(theta) };
    Pointf vY{ cos(theta - piOver2), sin(theta - piOver2) };

    // d(vX)/d(theta) and d(vY)/d(theta): each basis rotated by +90 degrees
    Pointf dVX{ -vX.Y, vX.X };
    Pointf dVY{ -vY.Y, vY.X };

    Pointf gP0 = gradOutput[0],
           gP1 = gradOutput[1],
           gP2 = gradOutput[2],
           gP3 = gradOutput[3];

    // Top
    gradInput[0] = (gP0 * vY + gP1 * vY).Sum();
    // Right
    gradInput[1] = (gP1 * vX + gP2 * vX).Sum();
    // Bottom
    gradInput[2] = -(gP2 * vY + gP3 * vY).Sum();
    // Left
    gradInput[3] = -(gP0 * vX + gP3 * vX).Sum();

    // Theta
    gradInput[4] = (
        gP0 * (-Left * dVX + Top * dVY) +
        gP1 * (Right * dVX + Top * dVY) +
        gP2 * (Right * dVX - Bottom * dVY) +
        gP3 * (-Left * dVX - Bottom * dVY)
    ).Sum();
}
119
+
120
+ #undef GEO_PI
nemotron-ocr/cpp/geometry_api/geometry_api_gpu.cu ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "../geometry.h"
7
+ #include "../cuda_intellisense.cuh"
8
+ #include "geometry_api_common.h"
9
+
10
+ #include <trove/ptr.h>
11
+
12
+ using namespace std;
13
+
14
+
15
// POD wrapper around the 5 rrect parameters (top, right, bottom, left, theta
// per cvt_rrect_to_quad) so an entire rrect can be loaded/stored as a single
// value — e.g. through trove::coalesced_ptr for coalesced memory access.
template<typename T>
struct RRect_ {
    T Data[5];

    template<typename index_t>
    __device__
    const T &operator[](index_t i) const { return Data[i]; }
    template<typename index_t>
    __device__
    T &operator[](index_t i) { return Data[i]; }
};
26
+
27
// CUDA kernel: one thread per rrect. Converts the rrect at flat index jobIdx
// into its quad corners. numRows/numCols are the H/W of the original BHW
// layout so the cell's (row, col) prior position can be recovered from the
// flattened index.
template<typename T>
__global__
void device_rrect_to_quads_gpu(torch::PackedTensorAccessor64<T, 2> rrectAccess,
                               torch::PackedTensorAccessor64<T, 3> quadsAccess,
                               int64_t numRows, int64_t numCols,
                               T cellSize)
{
    typedef Point_<T> Pointf;
    typedef RRect_<T> RRectf;
    typedef InPlaceQuad_<T> Quadf;
    constexpr T TWO = 2;

    const int64_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the ragged tail block
    if (jobIdx >= rrectAccess.size(0)) {
        return;
    }

    // Recover (row, col) within the feature map from the flattened B*H*W index
    int64_t row = jobIdx / numCols;
    const int64_t col = jobIdx - (row * numCols);
    row = row % numRows;

    // Reinterpret the contiguous buffers as packed structs (caller enforces
    // contiguity) so whole rrects/quads move as single values
    auto rawRRect = reinterpret_cast<RRectf*>(rrectAccess.data());
    auto rawQuad = reinterpret_cast<Quadf*>(quadsAccess.data());
#if defined(NDEBUG)
    // Release builds: trove pointers give coalesced structure loads/stores
    trove::coalesced_ptr<RRectf> pRRect(rawRRect);
    trove::coalesced_ptr<Quadf> pQuad(rawQuad);
#else
    auto pRRect = rawRRect;
    auto pQuad = rawQuad;
#endif

    RRectf rrect = pRRect[jobIdx];

    T cellOff = cellSize / TWO;
    Quadf cvQuad = cvt_rrect_to_quad<T>(rrect, cellSize, cellOff, col, row);

    pQuad[jobIdx] = cvQuad;
}
66
+
67
// GPU entry point: expand a contiguous BHW5 rrect tensor into BHW42 quads
// with one thread per cell.
torch::Tensor rrect_to_quads_gpu(torch::Tensor rrects, float cellSize)
{
    // The kernel reinterprets the raw buffer as packed RRect_ structs, which
    // is only valid for contiguous storage
    if (!rrects.is_contiguous()) {
        throw std::runtime_error("Expected the rrects to be contiguous!");
    }

    torch::Tensor quads = torch::empty({ rrects.size(0), rrects.size(1), rrects.size(2), 4, 2 }, rrects.options());

    // Collapse B, H, W into a single job dimension
    auto rrFlat = rrects.flatten(0, 2);
    auto qFlat = quads.flatten(0, 2);

    dim3 blockSize(96);
    dim3 gridSize(div_up(qFlat.size(0), blockSize.x));

    // Skip the launch entirely for empty inputs (grid size would be 0)
    if (quads.numel() > 0) {
        AT_DISPATCH_FLOATING_TYPES(
            quads.scalar_type(),
            "cuda_rrect_to_quads",
            ([&] {

                device_rrect_to_quads_gpu<scalar_t> KERNEL_ARG2(gridSize, blockSize) (
                    rrFlat.packed_accessor64<scalar_t, 2>(),
                    qFlat.packed_accessor64<scalar_t, 3>(),
                    rrects.size(1), rrects.size(2),
                    cellSize
                );

            })
        );
    }

    return quads;
}
100
+
101
// CUDA kernel: one thread per rrect, reducing that cell's quad-corner
// gradients into its 5 parameter gradients via assign_grad_rrect_to_quad.
template<typename scalar_t>
__global__
void device_rrect_to_quads_backward_gpu(torch::PackedTensorAccessor64<scalar_t, 2> rrect,
                                        torch::PackedTensorAccessor64<scalar_t, 3> gradOutput,
                                        torch::PackedTensorAccessor64<scalar_t, 2> gradInput)
{
    const int64_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the ragged tail block
    if (jobIdx >= rrect.size(0)) return;

    assign_grad_rrect_to_quad<scalar_t>(rrect[jobIdx], gradOutput[jobIdx], gradInput[jobIdx]);
}
113
+
114
+
115
// GPU entry point for the backward pass: flatten the BHW dims and launch one
// thread per rrect.
torch::Tensor rrect_to_quads_backward_gpu(torch::Tensor rrects, torch::Tensor gradOutput)
{
    auto gradInput = torch::empty_like(rrects);

    // Collapse BHW into a single job dimension.
    // NOTE(review): this relies on reshape() returning a *view* of gradInput
    // (true for the contiguous tensor empty_like produces); if reshape ever
    // copied, the kernel's results would be silently dropped — confirm.
    auto flatRRects = rrects.reshape({ -1, 5 });
    auto flatGradOutput = gradOutput.reshape({ -1, 4, 2 });
    auto flatGradInput = gradInput.reshape({ -1, 5 });

    dim3 blockSize(32);
    dim3 gridSize(div_up(rrects.size(0) * rrects.size(1) * rrects.size(2), blockSize.x));

    // Nothing to do (and grid would be 0) for empty inputs
    if (rrects.numel() > 0) {
        AT_DISPATCH_FLOATING_TYPES(
            rrects.scalar_type(),
            "cuda_rrect_to_quads_backward",
            ([&] {
                device_rrect_to_quads_backward_gpu KERNEL_ARG2(gridSize, blockSize) (
                    flatRRects.packed_accessor64<scalar_t, 2>(),
                    flatGradOutput.packed_accessor64<scalar_t, 3>(),
                    flatGradInput.packed_accessor64<scalar_t, 2>()
                );
            })
        );
    }

    return gradInput;
}
nemotron-ocr/cpp/geometry_api/get_rel_continuation_cos.cpp ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ #include "../geometry.h"
7
+
8
+ using namespace std;
9
+
10
+
11
+ float get_rel_continuation_cos(torch::Tensor rrectATensor, torch::Tensor rrectBTensor)
12
+ {
13
+ typedef Point_<float> Pointf;
14
+
15
+ if (rrectATensor.size(0) != 4 || rrectBTensor.size(0) != 4) {
16
+ throw runtime_error("Invalid rrect arguments. Both must have 4 vertices! A=" +
17
+ to_string(rrectATensor.size(0)) + ", B=" + to_string(rrectBTensor.size(0)));
18
+ }
19
+
20
+ auto rrectA = rrectATensor.accessor<float, 2>();
21
+ auto rrectB = rrectBTensor.accessor<float, 2>();
22
+
23
+ Pointf aPts[4] = {
24
+ rrectA[0], rrectA[1], rrectA[2], rrectA[3]
25
+ };
26
+
27
+ auto c1 = (aPts[0] + aPts[3]) / 2.0f;
28
+ auto c2 = (aPts[1] + aPts[2]) / 2.0f;
29
+
30
+ auto aDir = c2 - c1;
31
+ auto aLen = length(aDir);
32
+
33
+ if (aLen > 0) {
34
+ aDir /= aLen;
35
+ } else {
36
+ aDir = Pointf{ 1, 0 };
37
+ }
38
+
39
+ auto centerA = (c1 + c2) / 2.0f;
40
+
41
+ Pointf bPts[4] = {
42
+ rrectB[0], rrectB[1], rrectB[2], rrectB[3]
43
+ };
44
+
45
+ auto centerB = (bPts[0] + bPts[1] + bPts[2] + bPts[3]) / 4.0f;
46
+
47
+ auto connDir = centerB - centerA;
48
+ auto connLen = length(connDir);
49
+
50
+ if (connLen == 0.0f) {
51
+ return 1.0f;
52
+ }
53
+
54
+ connDir /= connLen;
55
+
56
+ auto cosT = dot(aDir, connDir);
57
+
58
+ return cosT;
59
+ }
nemotron-ocr/cpp/geometry_api/matrix2x2.h ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
+ #include "../geometry.h"
7
+
8
+
9
// Layout tags: select at compile time whether a Matrix2x2's storage is read
// as stored (row-major) or as its transpose.
struct contiguous_tag{};

struct transpose_tag{};

// Maps a compile-time (row R, column C) pair to a flat index into the
// 4-element backing array, according to the layout tag.
template<typename layout_t, uint32_t R, uint32_t C>
struct Matrix2x2_Offset;

// Row-major: element (R, C) lives at index R*2 + C
template<uint32_t R, uint32_t C>
struct Matrix2x2_Offset<contiguous_tag, R, C>
{
    static const uint32_t OFFSET = R * 2 + C;
};

// Transposed view: swap the roles of R and C
template<uint32_t R, uint32_t C>
struct Matrix2x2_Offset<transpose_tag, R, C>
{
    static const uint32_t OFFSET = C * 2 + R;
};
27
+
28
+
29
// Compile-time accessor for element (R, C) of a raw 4-element matrix buffer,
// resolved through the layout tag (so transposed reads cost nothing extra).
template<typename T, typename layout_t, uint32_t R, uint32_t C>
struct Matrix2x2_Indexor
{
    static const uint32_t OFFSET = Matrix2x2_Offset<layout_t, R, C>::OFFSET;

    static T &get(T *data) { return data[OFFSET]; }
    static const T get(const T *data) { return data[OFFSET]; }
};
37
+
38
+
39
// Minimal fixed-size 2x2 matrix, stored row-major in m_data[4].
template<typename T>
struct Matrix2x2
{
    Matrix2x2() = default;
    // Element-wise constructor (row-major order)
    Matrix2x2(T r0c0, T r0c1, T r1c0, T r1c1)
        : m_data{ r0c0, r0c1, r1c0, r1c1 }
    {
    }
    // Rows-from-points: r0 becomes row 0, r1 becomes row 1
    Matrix2x2(const Point_<T> &r0, const Point_<T> &r1)
        : m_data{ r0.X, r0.Y, r1.X, r1.Y }
    {
    }
    // Transposed construction: the points become the matrix *columns*
    Matrix2x2(const Point_<T> &r0, const Point_<T> &r1, transpose_tag)
        : m_data{ r0.X, r1.X, r0.Y, r1.Y }
    {
    }

    // Flat element access (row-major index 0..3)
    inline T &operator[](uint32_t i) { return m_data[i]; }
    inline const T operator[](uint32_t i) const { return m_data[i]; }

    T m_data[4];
};
61
+
62
// Non-owning, layout-tagged view over a Matrix2x2's storage; combined with
// get<R, C>() below it provides zero-cost (possibly transposed) access.
template<typename T, typename layout_t>
struct Matrix2x2_View
{
    Matrix2x2_View(const Matrix2x2<T> &m) : m_data(m.m_data) {}

    const T *m_data;
};
69
+
70
// Read element (R, C) from a matrix view, honoring the view's layout tag.
template<uint32_t R, uint32_t C, typename T, typename layout_t>
const T get(const Matrix2x2_View<T, layout_t> &m)
{
    return Matrix2x2_Indexor<T, layout_t, R, C>::get(m.m_data);
}
75
+
76
+ template<typename T, typename get_pt_t, typename callback_t, typename layout_t = contiguous_tag>
77
+ inline
78
+ void matmul_fn(int64_t N, const get_pt_t &get_fn, const Matrix2x2<T> &mat, const callback_t &callback,
79
+ layout_t lt = layout_t{})
80
+ {
81
+ Matrix2x2_View<T, layout_t> m{ mat };
82
+
83
+ #pragma omp simd
84
+ for (int64_t i = 0; i < N; ++i) {
85
+ Point_<T> pt = get_fn(i);
86
+
87
+ T x = pt.X * get<0, 0>(m) + pt.Y * get<1, 0>(m);
88
+ T y = pt.X * get<0, 1>(m) + pt.Y * get<1, 1>(m);
89
+
90
+ callback(i, Point_<T>{ x, y });
91
+ }
92
+ }
nemotron-ocr/cpp/geometry_api/poly_bounds_quad.cpp ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "geometry_api.h"
5
+
6
+ using namespace std;
7
+
8
+
9
// Write an (x, y) pair into a length-2 tensor accessor row.
template<typename T>
void pt_assign(torch::TensorAccessor<T, 1> acc, T x, T y)
{
    acc[0] = x;
    acc[1] = y;
}
15
+
16
+
17
// Compute the axis-aligned bounds of an Nx2 polygon and write them into
// outBounds as a 4-corner quad: (min,min), (max,min), (max,max), (min,max).
// Reads poly[0] for the seed values, so the polygon must be non-empty.
template<typename T>
void poly_bounds_quad_impl(torch::TensorAccessor<T, 2> poly, torch::TensorAccessor<T, 2> outBounds)
{
    // Seed from vertex 0; the loop below re-visits it, which is harmless
    T minX = poly[0][0],
      minY = poly[0][1],
      maxX = poly[0][0],
      maxY = poly[0][1];

    const int64_t numVertices = poly.size(0);

    for (int64_t i = 0; i < numVertices; ++i) {
        auto vert = poly[i];

        minX = min(minX, vert[0]);
        maxX = max(maxX, vert[0]);

        minY = min(minY, vert[1]);
        maxY = max(maxY, vert[1]);
    }

    pt_assign(outBounds[0], minX, minY);
    pt_assign(outBounds[1], maxX, minY);
    pt_assign(outBounds[2], maxX, maxY);
    pt_assign(outBounds[3], minX, maxY);
}
42
+
43
+
44
+ torch::Tensor get_poly_bounds_quad(torch::Tensor poly)
45
+ {
46
+ auto ret = torch::empty({ 4, 2 }, poly.options());
47
+
48
+ AT_DISPATCH_FLOATING_TYPES(
49
+ poly.scalar_type(),
50
+ "poly_bounds_quad_impl",
51
+ ([&] {
52
+ poly_bounds_quad_impl(
53
+ poly.accessor<scalar_t, 2>(),
54
+ ret.accessor<scalar_t, 2>()
55
+ );
56
+ })
57
+ );
58
+
59
+ return ret;
60
+ }
nemotron-ocr/cpp/graph_detection/encode_util.cpp ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include "encode_util.h"
5
+
6
+ #include <algorithm>
7
+ #include <numeric>
8
+ #include <sstream>
9
+
10
+ #include "../third_party/clipper/clipper.hpp"
11
+
12
+ using namespace std;
13
+
14
+ namespace graph_detection {
15
+
16
// An edge plus an arbitrary payload C — the edge's length during bottom-edge
// candidate search, or the edge midpoint when T is a point type.
template<typename T>
struct Candidate : Edge {
    T C;

    Candidate() = default;
    Candidate(int32_t a, int32_t b, T c) : Edge(a, b), C(c) {}
};
23
+
24
// A pair of candidate edges together with the distance between their
// midpoints; used to select the two farthest-apart edges as the "bottoms".
struct DistStruct {
    Candidate<Pointf> A;
    Candidate<Pointf> B;
    float Dist;

    DistStruct() = default;
    DistStruct(Candidate<Pointf> a, Candidate<Pointf> b, float dist) : A(a), B(b), Dist(dist) {}
};
32
+
33
// Cosine of the angle between two vectors. The 1e-8 epsilon keeps the
// division finite for zero-length inputs; note it is a *double* literal, so
// the division is performed in double precision before narrowing to float.
template<typename T>
float vec_cos(const Point_<T> &a, const Point_<T> &b)
{
    return dot(a, b) / (length(a) * length(b) + 1e-8);
}
38
+
39
// Return the indices that would sort `vec` ascending under `comp` (the
// equivalent of numpy.argsort). `vec` itself is not modified; the sort is
// not stable, matching std::sort.
template<typename T, typename Fn = std::less<T>>
std::vector<size_t> arg_sort(const std::vector<T> &vec, Fn comp = Fn())
{
    // Fill 0..N-1 with std::iota instead of the hand-rolled push_back loop
    std::vector<size_t> ret(vec.size());
    std::iota(begin(ret), end(ret), size_t{0});

    std::sort(begin(ret), end(ret),
        [&vec, &comp] (size_t idxA, size_t idxB) {
            // Compare the referenced elements, not the indices themselves
            return comp(vec[idxA], vec[idxB]);
        }
    );

    return ret;
}
56
+
57
+
58
+ float edge_length(const Polygon_<float> &poly, const vector<Edge> &edges);
59
+
60
// Identify the two "bottom" (end) edges of a text-like polygon — the short
// edges capping the two long sides. Primary heuristic: an edge whose
// neighboring edges run in nearly opposite directions is an end cap. If that
// does not yield exactly two disjoint candidates, fall back to choosing the
// pair of non-touching edges whose midpoints are farthest apart.
// Throws for polygons with fewer than 4 vertices.
vector<Edge> find_bottom(const Polygon_<float> &poly, bool useVertexOrder)
{
    if (poly.Count < 4) {
        throw runtime_error("Invalid polygon. Fewer than 4 vertices!");
    }

    // If we trust the source of the geometries, then this saves us both computation,
    // but can also be more reliable since we won't reorder the vertices
    if (useVertexOrder) {
        if ((poly.Count % 2) == 1) {
            throw runtime_error("Can't use trusted vertex order when the vertex count is odd!");
        }
        // Assumes the vertices form two equal runs (one per long side), so
        // the caps sit at the halfway point and at the wrap-around
        int32_t halfCt = poly.Count / 2;
        return { { halfCt - 1, halfCt },
                 { static_cast<int32_t>(poly.Count) - 1, 0 } };
    }

    if (poly.Count == 4) {
        // Quad shortcut: compare the two opposite-edge length sums; the much
        // shorter pair (4x margin) is taken as the bottoms
        float d1 = length(poly[1] - poly[0]) + length(poly[2] - poly[3]);
        float d2 = length(poly[2] - poly[1]) + length(poly[0] - poly[3]);

        if (4 * d1 < d2) {
            return { { 0, 1 }, { 2, 3 } };
        } else {
            return { { 1, 2 }, { 3, 0 } };
        }
    }

    // Vertex lookup with wrap-around so edge windows can cross index 0
    auto idx_wrap = [&poly] (size_t idx) {
        return poly[idx % poly.Count];
    };

    vector<Candidate<float>> candidates;
    for (size_t i = 1; i < (poly.Count + 1); ++i) {
        auto vPrev = idx_wrap(i) - idx_wrap(i - 1);
        auto vNext = idx_wrap(i + 2) - idx_wrap(i + 1);

        // We're looking for the segment where the preceding and following segment
        // essentially travel in opposite directions
        if (vec_cos(vPrev, vNext) < -0.875f) {
            auto currSeg = idx_wrap(i) - idx_wrap(i + 1);
            candidates.emplace_back(i % poly.Count, (i + 1) % poly.Count, length(currSeg));
        }
    }

    if (candidates.size() != 2 || candidates[0].A == candidates[1].B || candidates[0].B == candidates[1].A) {
        // If candidate number < 2, or two bottom are joined, select 2 farthest edge
        // Midpoint of every edge, indexed by its starting vertex
        vector<Candidate<Pointf>> midList;
        for (size_t i = 0; i < poly.Count; ++i) {
            Pointf midPoint = (idx_wrap(i) + idx_wrap(i + 1)) / 2.0f;
            midList.emplace_back(i, (i + 1) % poly.Count, midPoint);
        }

        vector<DistStruct> distList;

        // Only found one good candidate, so search for the edge that's the furthest from this candidate
        if (candidates.size() == 1) {
            auto idx1a = candidates.back().A;
            auto idx1b = candidates.back().B;
            Candidate<Pointf> cand1{ idx1a, idx1b, (idx_wrap(idx1a) + idx_wrap(idx1b)) / 2.0f };
            for (size_t j = 0; j < poly.Count; ++j) {
                auto &cand2 = midList[j];

                // Edges sharing a vertex cannot both be bottoms
                if (cand1.Touches(cand2)) continue;

                float dist = length(cand1.C - cand2.C);
                distList.emplace_back(cand1, cand2, dist);
            }
        } else {
            // No trustworthy candidates: consider every non-touching pair
            for (size_t i = 0; i < poly.Count; ++i) {
                for (size_t j = i + 1; j < poly.Count; ++j) {
                    auto &cand1 = midList[i];
                    auto &cand2 = midList[j];

                    if (cand1.Touches(cand2)) continue;

                    float dist = length(cand1.C - cand2.C);
                    distList.emplace_back(cand1, cand2, dist);
                }
            }
        }
        // Ascending by distance; the farthest-apart pair ends up at the back
        sort(begin(distList), end(distList), [] (auto a, auto b) { return a.Dist < b.Dist; });

        if (distList.empty()) {
            throw runtime_error("No valid bottom candidates found for this polygon!");
        }

        auto &bEdge = distList.back();
        return vector<Edge>{ bEdge.A, bEdge.B };

    } else {
        return vector<Edge>{ candidates[0], candidates[1] };
    }
}
154
+
155
// Split the polygon boundary into the two vertex chains ("long edges") that
// connect the ends of the two bottom edges — one chain per side. `bottoms`
// must point at the two edges returned by find_bottom.
void find_long_edges(const Polygon_<float> &poly, Edge *bottoms, vector<Edge> &outLongEdge1, vector<Edge> &outLongEdge2)
{
    int32_t b1End = bottoms[0].B;
    int32_t b2End = bottoms[1].B;

    int32_t nPoints = poly.Count;

    // Walk forward (with wrap-around) from just past end1 until reaching
    // end2, emitting each consecutive vertex pair as an edge
    auto accum_into = [nPoints] (int32_t end1, int32_t end2, vector<Edge> &outEdge) {
        int32_t i = (end1 + 1) % nPoints;
        while ((i % nPoints) != end2) {
            int32_t start = i > 0 ? i - 1 : nPoints - 1;
            int32_t end = i % nPoints;
            outEdge.emplace_back(start, end);
            i = (i + 1) % nPoints;
        }
    };

    accum_into(b1End, b2End, outLongEdge1);
    accum_into(b2End, b1End, outLongEdge2);
}
175
+
176
+ float edge_length(const Polygon_<float> &poly, const vector<Edge> &edges)
177
+ {
178
+ float ret = 0.0f;
179
+ for (const Edge &e : edges) {
180
+ ret += length(poly[e.B] - poly[e.A]);
181
+ }
182
+ return ret;
183
+ }
184
+
185
+ vector<float> edge_lengths(const Polygon_<float> &poly, const vector<Edge> &edges)
186
+ {
187
+ if (edges.empty()) {
188
+ throw runtime_error("Found an empty edge!");
189
+ }
190
+
191
+ vector<float> ret;
192
+ ret.reserve(edges.size());
193
+
194
+ for (const Edge &e : edges) {
195
+ ret.push_back(length(poly[e.B] - poly[e.A]));
196
+ }
197
+
198
+ return ret;
199
+ }
200
+
201
+ void split_edge_sequence(const Polygon_<float> &poly, const vector<Edge> &edges,
202
+ const vector<float> &edgeLengths, float nParts,
203
+ vector<Pointf> &outPts);
204
+
205
// Sample both long edges of a polygon at (roughly) every `step` units of arc
// length. Both sides receive the same number of samples (at least 2) so the
// resulting point lists can be paired up index-by-index.
void split_edge_sequence_by_step(const Polygon_<float> &poly, const vector<Edge> &longEdge1, const vector<Edge> &longEdge2,
                                 float step, vector<Pointf> &outInnerPoints1, vector<Pointf> &outInnerPoints2)
{
    auto edgeLengths1 = edge_lengths(poly, longEdge1);
    auto edgeLengths2 = edge_lengths(poly, longEdge2);

    // Average of the two side lengths, so both sides get the same part count
    float totalLength = (accumulate(begin(edgeLengths1), end(edgeLengths1), 0.0f) + accumulate(begin(edgeLengths2), end(edgeLengths2), 0.0f)) / 2;

    // Never fewer than 2 samples (the two chain endpoints)
    float nParts = max<float>(ceil(totalLength / step), 2);

    split_edge_sequence(poly, longEdge1, edgeLengths1, nParts, outInnerPoints1);
    split_edge_sequence(poly, longEdge2, edgeLengths2, nParts, outInnerPoints2);
}
218
+
219
// Resample an edge chain into `nParts` evenly spaced points along its arc
// length, appending them to outPts. edgeLengths must be the per-edge lengths
// of `edges` (as produced by edge_lengths).
void split_edge_sequence(const Polygon_<float> &poly, const vector<Edge> &edges,
                         const vector<float> &edgeLengths, float nParts,
                         vector<Pointf> &outPts)
{
    // Cumulative arc length; assumes vec_cumsum yields edges.size()+1
    // entries starting at 0 so elCumSum[k] is the offset of edge k's start
    // and elCumSum.back() the total length — TODO confirm vec_cumsum's shape.
    vector<float> elCumSum = vec_cumsum(edgeLengths);

    float totalLength = elCumSum.back();
    float lengthPerPart = totalLength / (nParts - 1);

    size_t iNumParts = nParts;
    size_t currNode = 0;
    size_t ctr = 0;
    for (float i = 0.0f; ctr < iNumParts; i += 1.0f, ++ctr) {
        // Target arc-length position, clamped so rounding never overshoots
        float t = min(i * lengthPerPart, totalLength);

        // Advance to the edge containing position t (t is non-decreasing,
        // so currNode only ever moves forward)
        while (t > elCumSum[currNode + 1]) {
            ++currNode;
        }

        Edge currEdge = edges[currNode];
        Pointf e1 = poly[currEdge.A];
        Pointf e2 = poly[currEdge.B];

        float currLen = edgeLengths[currNode];

        Pointf sampledPt;

        if (currLen > 0) {
            // Linear interpolation within the current edge
            float deltaT = t - elCumSum[currNode];
            float ratio = deltaT / currLen;
            sampledPt = e1 + ratio * (e2 - e1);
        } else {
            // Zero-length edge: both endpoints coincide
            sampledPt = e1;
        }

        outPts.push_back(sampledPt);
    }
}
257
+
258
+ string print_poly(const Polyf &poly) {
259
+ ostringstream oss;
260
+ oss << "[";
261
+ for (size_t i = 0; i < poly.Count; ++i) {
262
+ if (i > 0) {
263
+ oss << ", ";
264
+ }
265
+ oss << "(" << poly[i].X << ", " << poly[i].Y << ")";
266
+ }
267
+ oss << "]";
268
+ return oss.str();
269
+ }
270
+
271
+ } // namespace graph_detection
nemotron-ocr/cpp/graph_detection/encode_util.h ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #pragma once
5
+
6
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

#include "../geometry.h"
11
+
12
+ namespace graph_detection {
13
+
14
+
15
+
16
// An undirected edge of a polygon, stored as the indices of its two endpoint
// vertices in the polygon's vertex array.
struct Edge {
    // Zero-initialized so a default-constructed Edge has deterministic state
    // (previously `Edge() = default` left both members indeterminate).
    int32_t A = 0;
    int32_t B = 0;

    Edge() = default;
    Edge(int32_t a, int32_t b) : A(a), B(b) {}

    // True when `idx` is one of this edge's endpoint indices.
    bool Touches(int32_t idx) const { return A == idx || B == idx; }
    // True when this edge shares at least one endpoint index with `other`.
    bool Touches(const Edge &other) const;
};
26
+
27
+ inline
28
+ bool edge_touches(const Edge &edge, int32_t vertex) {
29
+ return edge.A == vertex || edge.B == vertex;
30
+ }
31
+
32
+ inline
33
+ bool Edge::Touches(const Edge &other) const {
34
+ return edge_touches(other, A) || edge_touches(other, B);
35
+ }
36
+
37
+ typedef Point_<float> Pointf;
38
+ typedef AABB_<float> AABBf;
39
+ typedef Polygon_<float> Polyf;
40
+ typedef std::vector<Pointf> Polyline;
41
+
42
// Finds the edge sequence forming the "bottom" of `poly` (implemented in
// encode_util.cpp). NOTE(review): semantics of `useVertexOrder` inferred from
// the name (trust the polygon's vertex ordering rather than deriving the
// bottom geometrically) — confirm against the definition.
std::vector<Edge> find_bottom(const Polygon_<float> &poly, Edge *bottoms, bool useVertexOrder);

// Splits the polygon's boundary into two long edge chains relative to the
// `bottoms` edges, writing one chain to each output vector (implemented in
// encode_util.cpp).
void find_long_edges(const Polygon_<float> &poly, Edge *bottoms, std::vector<Edge> &outLongEdge1, std::vector<Edge> &outLongEdge2);

// Resamples both long edge chains into an equal number of points. The part
// count is derived from the chains' averaged total length divided by `step`,
// with a minimum of two samples per chain.
void split_edge_sequence_by_step(const Polygon_<float> &poly, const std::vector<Edge> &longEdge1, const std::vector<Edge> &longEdge2,
                                 float step, std::vector<Pointf> &outInnerPoints1, std::vector<Pointf> &outInnerPoints2);

// Renders the polygon as "[(x, y), (x, y), ...]" for logging/debugging.
std::string print_poly(const Polyf &poly);
50
+
51
+ template<typename T>
52
+ inline
53
+ std::vector<T> vec_cumsum(const std::vector<T> &v)
54
+ {
55
+ std::vector<T> ret;
56
+ ret.reserve(v.size() + 1);
57
+ ret.push_back(0);
58
+ for (T val : v) {
59
+ ret.push_back(ret.back() + val);
60
+ }
61
+ return ret;
62
+ }
63
+
64
// Selection sampling (Knuth's Algorithm S): visits indices 0..n-1 in order
// and invokes fn(i) on exactly min(k, n) of them, chosen uniformly at random
// from the n candidates. Indices are reported in strictly increasing order.
// The engine is advanced once per non-forced candidate, so the post-call
// engine state matches the original implementation.
template<typename RandEng, typename Fn>
inline
void n_choose_k(size_t n, size_t k, RandEng &randEng, Fn fn)
{
    if (k == 0) return;

    // TODO(mranzinger): This algorithm can be replaced with sampling from a geometric
    // distribution, which drastically reduces the runtime complexity
    for (size_t i = 0; i < n; ++i) {
        size_t leftover = n - i;
        if (leftover <= k) {
            // Every remaining index must be taken to reach k selections.
            fn(i);
            --k;
        } else {
            float p = std::uniform_real_distribution<float>(0.0f, 1.0f)(randEng);
            // static_cast instead of the original float{k}: brace-init from a
            // non-constant size_t is a narrowing conversion and ill-formed in
            // list-initialization ([dcl.init.list]).
            float probSample = static_cast<float>(k) / static_cast<float>(leftover);
            if (p < probSample) {
                fn(i);
                --k;
            }
        }
    }
}
87
+
88
template<typename T>
inline T clamp(T val, T minVal, T maxVal) {
    // Equivalent to std::max(std::min(val, maxVal), minVal): cap at maxVal
    // first, then raise to minVal (so minVal wins if the bounds are inverted).
    const T capped = (maxVal < val) ? maxVal : val;
    return (capped < minVal) ? minVal : capped;
}
92
+
93
+ inline
94
+ Pointf avg_point(const std::vector<Pointf> &points)
95
+ {
96
+ return std::accumulate(std::begin(points), std::end(points), Pointf(0,0)) / float(points.size());
97
+ }
98
+
99
+ inline
100
+ float vector_sin(const Pointf &pt)
101
+ {
102
+ // sin = y / len(pt)
103
+ return pt.Y / (length(pt) + 1e-8);
104
+ }
105
+
106
+ inline
107
+ float vector_cos(const Pointf &pt)
108
+ {
109
+ // cos = x / len(pt)
110
+ return pt.X / (length(pt) + 1e-8);
111
+ }
112
+
113
+ inline
114
+ void vector_cos_sin(const Pointf & pt, float &outCos, float &outSin)
115
+ {
116
+ float len = length(pt) + 1e-8;
117
+ outCos = pt.X / len;
118
+ outSin = pt.Y / len;
119
+ }
120
+
121
+ inline
122
+ float point_dist_to_line(const Pointf &l1, const Pointf &l2, const Pointf &pt)
123
+ {
124
+ auto d = l2 - l1;
125
+
126
+ auto lineLen = length(d);
127
+
128
+ if (lineLen > 0) {
129
+ float distance = abs(
130
+ d.Y * pt.X
131
+ - d.X * pt.Y
132
+ + l2.X * l1.Y
133
+ - l2.Y * l1.X
134
+ ) / lineLen;
135
+ return distance;
136
+ } else {
137
+ return length(pt - l1);
138
+ }
139
+ }
140
+
141
// Returns the most frequent value in `inputs`; on a tie, the smallest such
// value wins. Sorts `inputs` in place as a side effect.
// Throws std::runtime_error when `inputs` is empty.
template<typename T>
T find_mode(std::vector<T> &inputs) {
    using std::begin;
    using std::end;
    using std::sort;

    if (inputs.empty()) {
        throw std::runtime_error("Cannot find mode of empty distribution!");
    }

    // Sorting groups equal values into contiguous runs.
    sort(begin(inputs), end(inputs));

    T bestVal = inputs[0];
    size_t bestCount = 0;

    size_t i = 0;
    while (i < inputs.size()) {
        // Measure the run of values equal to inputs[i].
        size_t runEnd = i + 1;
        while (runEnd < inputs.size() && inputs[runEnd] == inputs[i]) {
            ++runEnd;
        }

        // Strict '>' keeps the smallest value on a tie, as before.
        const size_t runLen = runEnd - i;
        if (runLen > bestCount) {
            bestCount = runLen;
            bestVal = inputs[i];
        }

        i = runEnd;
    }

    return bestVal;
}
182
+
183
+ } // namespace graph_detection