mranzinger/fix_misaligned_address

#1
by mranzinger - opened
This view is limited to 50 files because it contains too many changes.  See the raw diff here.
Files changed (50) hide show
  1. Dockerfile +3 -3
  2. README.md +19 -19
  3. THIRD_PARTY_NOTICES.md +15 -519
  4. docker-compose.yaml +1 -1
  5. example.py +2 -2
  6. {nemotron-ocr → nemo-retriever-ocr}/cpp/.gitattributes +0 -0
  7. {nemotron-ocr → nemo-retriever-ocr}/cpp/.gitignore +0 -0
  8. {nemotron-ocr → nemo-retriever-ocr}/cpp/.gitmodules +0 -0
  9. {nemotron-ocr → nemo-retriever-ocr}/cpp/README.md +0 -0
  10. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/beam_decode.cpp +0 -0
  11. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/beam_decode.h +0 -0
  12. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/kn_lm.cpp +0 -0
  13. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/kn_lm.h +0 -0
  14. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/language_model.cpp +0 -0
  15. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/language_model.h +0 -0
  16. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/log_sum_exp.cpp +0 -0
  17. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/log_sum_exp.h +0 -0
  18. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/ngram_lm_base.cpp +0 -0
  19. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/ngram_lm_base.h +0 -0
  20. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/prefix.cpp +0 -0
  21. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/prefix.h +0 -0
  22. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/sbo_lm.cpp +0 -0
  23. {nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/sbo_lm.h +0 -0
  24. {nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp +0 -0
  25. {nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/gpu_grid_sample_utils.cuh +0 -0
  26. {nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/gpu_indirect_grid_sample.cu +0 -0
  27. {nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/grid_sample.h +0 -0
  28. {nemotron-ocr → nemo-retriever-ocr}/cpp/common.cpp +0 -0
  29. {nemotron-ocr → nemo-retriever-ocr}/cpp/common.h +0 -0
  30. {nemotron-ocr → nemo-retriever-ocr}/cpp/cuda_intellisense.cuh +0 -0
  31. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry.h +0 -0
  32. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/calc_poly_min_rrect.cpp +0 -0
  33. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api.cpp +0 -0
  34. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api.h +0 -0
  35. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api_common.h +0 -0
  36. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api_gpu.cu +0 -0
  37. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/get_rel_continuation_cos.cpp +0 -0
  38. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/matrix2x2.h +0 -0
  39. {nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/poly_bounds_quad.cpp +0 -0
  40. {nemotron-ocr → nemo-retriever-ocr}/cpp/graph_detection/encode_util.cpp +0 -0
  41. {nemotron-ocr → nemo-retriever-ocr}/cpp/graph_detection/encode_util.h +0 -0
  42. {nemotron-ocr → nemo-retriever-ocr}/cpp/half_ops.cu +0 -0
  43. {nemotron-ocr → nemo-retriever-ocr}/cpp/half_ops.cuh +0 -0
  44. {nemotron-ocr → nemo-retriever-ocr}/cpp/local_ips/local_ips.h +0 -0
  45. {nemotron-ocr → nemo-retriever-ocr}/cpp/local_ips/quad_all_2_all_dist_v2.cu +0 -0
  46. {nemotron-ocr → nemo-retriever-ocr}/cpp/module.cpp +0 -0
  47. {nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/cpu_non_maximal_suppression.cpp +0 -0
  48. {nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/cuda_non_maximal_suppression.cu +118 -67
  49. {nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/nms_common.h +0 -0
  50. {nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/nms_kd_tree.h +0 -0
Dockerfile CHANGED
@@ -8,11 +8,11 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
8
  RUN --mount=type=cache,target=/root/.cache/pip \
9
  pip install -U pip hatchling "setuptools>=68" --root-user-action ignore
10
 
11
- COPY nemotron-ocr /workspace/nemotron-ocr
12
- WORKDIR /workspace/nemotron-ocr
13
 
14
  # Ensure no prebuilt binaries/artifacts from the host are present
15
- RUN rm -f src/nemotron_ocr_cpp/*.so || true \
16
  && rm -rf build/ dist/
17
 
18
  RUN --mount=type=cache,target=/root/.cache/pip \
 
8
  RUN --mount=type=cache,target=/root/.cache/pip \
9
  pip install -U pip hatchling "setuptools>=68" --root-user-action ignore
10
 
11
+ COPY nemo-retriever-ocr /workspace/nemo-retriever-ocr
12
+ WORKDIR /workspace/nemo-retriever-ocr
13
 
14
  # Ensure no prebuilt binaries/artifacts from the host are present
15
+ RUN rm -f src/nemo_retriever_ocr_cpp/*.so || true \
16
  && rm -rf build/ dist/
17
 
18
  RUN --mount=type=cache,target=/root/.cache/pip \
README.md CHANGED
@@ -16,7 +16,7 @@ tags:
16
  - ingestion
17
  ---
18
 
19
- # Nemotron OCR v1
20
 
21
  ## **Model Overview**
22
 
@@ -27,15 +27,15 @@ tags:
27
 
28
  ### **Description**
29
 
30
- The Nemotron OCR v1 model is a state-of-the-art text recognition model designed for robust end-to-end optical character recognition (OCR) on complex real-world images. It integrates three core neural network modules: a detector for text region localization, a recognizer for transcription of detected regions, and a relational model for layout and structure analysis.
31
 
32
- This model is optimized for a wide variety of OCR tasks, including multi-line, multi-block, and natural scene text, and it supports advanced reading order analysis via its relational model component. Nemotron OCR v1 has been developed to be production-ready and commercially usable, with a focus on speed and accuracy on both document and natural scene images.
33
 
34
- The Nemotron OCR v1 model is part of the NVIDIA NeMo Retriever collection of NIM microservices, which provides state-of-the-art, commercially-ready models and microservices optimized for the lowest latency and highest throughput. It features a production-ready information retrieval pipeline with enterprise support. The models that form the core of this solution have been trained using responsibly selected, auditable data sources. With multiple pre-trained models available as starting points, developers can readily customize them for domain-specific use cases, such as information technology, human resource help assistants, and research & development research assistants.
35
 
36
  This model is ready for commercial use.
37
 
38
- We are excited to announce the open sourcing of this commercial model. For users interested in deploying this model in production environments, it is also available via the model API in NVIDIA Inference Microservices (NIM) at [nemotron-ocr-v1](https://build.nvidia.com/nvidia/nemoretriever-ocr-v1).
39
 
40
  ### **License/Terms of use**
41
 
@@ -57,11 +57,11 @@ Global
57
 
58
  ### Use Case
59
 
60
- The **Nemotron OCR v1** model is designed for high-accuracy and high-speed extraction of textual information from images, making it ideal for powering multimodal retrieval systems, Retrieval-Augmented Generation (RAG) pipelines, and agentic applications that require seamless integration of visual and language understanding. Its robust performance and efficiency make it an excellent choice for next-generation AI systems that demand both precision and scalability across diverse real-world content.
61
 
62
  ### Release Date
63
 
64
- 10/23/2025 via https://huggingface.co/nvidia/nemotron-ocr-v1
65
 
66
  ### References
67
 
@@ -71,7 +71,7 @@ The **Nemotron OCR v1** model is designed for high-accuracy and high-speed extra
71
 
72
  **Architecture Type:** Hybrid detector–recognizer with document-level relational modeling
73
 
74
- The Nemotron OCR v1 model integrates three specialized neural components:
75
 
76
  - **Text Detector:** Utilizes a RegNetY-8GF convolutional backbone for high-accuracy localization of text regions within images.
77
  - **Text Recognizer:** Employs a Transformer-based sequence recognizer to transcribe text from detected regions, supporting variable word and line lengths.
@@ -163,11 +163,11 @@ git lfs install
163
  ```
164
  - Using https
165
  ```
166
- git clone https://huggingface.co/nvidia/nemotron-ocr-v1
167
  ```
168
  - Or using ssh
169
  ```
170
- git clone git@hf.co:nvidia/nemotron-ocr-v1
171
  ```
172
 
173
  2. Installation
@@ -179,7 +179,7 @@ git clone git@hf.co:nvidia/nemotron-ocr-v1
179
  - Run the following command to install the package:
180
 
181
  ```bash
182
- cd nemotron-ocr
183
  pip install hatchling
184
  pip install -v .
185
  ```
@@ -197,7 +197,7 @@ docker run --rm --gpus all nvcr.io/nvidia/pytorch:25.09-py3 nvidia-smi
197
  - From the repo root, bring up the service to run the example against the provided image `ocr-example-input-1.png`:
198
 
199
  ```bash
200
- docker compose run --rm nemotron-ocr \
201
  bash -lc "python example.py ocr-example-input-1.png --merge-level paragraph"
202
  ```
203
 
@@ -212,9 +212,9 @@ Output is saved next to your input image as `<name>-annotated.<ext>` on the host
212
  3. Run the model using the following code:
213
 
214
  ```python
215
- from nemotron_ocr.inference.pipeline import NemotronOCR
216
 
217
- ocr = NemotronOCR()
218
 
219
  predictions = ocr("ocr-example-input-1.png")
220
 
@@ -230,7 +230,7 @@ for pred in predictions:
230
  ### Software Integration
231
 
232
  **Runtime Engine(s):**
233
- - **NeMo Nemotron OCR V1** NIM
234
 
235
 
236
  **Supported Hardware Microarchitecture Compatibility [List in Alphabetic Order]:**
@@ -247,7 +247,7 @@ This AI model can be embedded as an Application Programming Interface (API) call
247
 
248
  ## Model Version(s):
249
 
250
- * `nemotron-ocr-v1`
251
 
252
  ## **Training and Evaluation Datasets:**
253
 
@@ -267,7 +267,7 @@ The model is trained on a large-scale, curated mix of public and proprietary OCR
267
 
268
  ### **Evaluation Datasets**
269
 
270
- The Nemotron OCR v1 model is evaluated on several NVIDIA internal datasets for various tasks, such as pure OCR, table content extraction, and document retrieval.
271
 
272
  **Data Collection Method:** Hybrid (Automated, Human, Synthetic)<br>
273
  **Labeling Method:** Hybrid (Automated, Human, Synthetic)<br>
@@ -275,9 +275,9 @@ The Nemotron OCR v1 model is evaluated on several NVIDIA internal datasets for v
275
 
276
  ### **Evaluation Results**
277
 
278
- We benchmarked Nemotron OCR v1 on internal evaluation datasets against PaddleOCR on various tasks, such as pure OCR (Character Error Rate), table content extraction (TEDS), and document retrieval (Recall@5).
279
 
280
- | Metric | Nemotron OCR v1 | PaddleOCR | Net change |
281
  |-------------------------------------------|--------------------|-----------|-----------------|
282
  | Character Error Rate | 0.1633 | 0.2029 | -19.5% ✔️ |
283
  | Bag-of-character Error Rate | 0.0453 | 0.0512 | -11.5% ✔️ |
 
16
  - ingestion
17
  ---
18
 
19
+ # NeMo Retriever OCR v1
20
 
21
  ## **Model Overview**
22
 
 
27
 
28
  ### **Description**
29
 
30
+ The NeMo Retriever OCR v1 model is a state-of-the-art text recognition model designed for robust end-to-end optical character recognition (OCR) on complex real-world images. It integrates three core neural network modules: a detector for text region localization, a recognizer for transcription of detected regions, and a relational model for layout and structure analysis.
31
 
32
+ This model is optimized for a wide variety of OCR tasks, including multi-line, multi-block, and natural scene text, and it supports advanced reading order analysis via its relational model component. NeMo Retriever OCR v1 has been developed to be production-ready and commercially usable, with a focus on speed and accuracy on both document and natural scene images.
33
 
34
+ The NeMo Retriever OCR v1 model is part of the NVIDIA NeMo Retriever collection of NIM microservices, which provides state-of-the-art, commercially-ready models and microservices optimized for the lowest latency and highest throughput. It features a production-ready information retrieval pipeline with enterprise support. The models that form the core of this solution have been trained using responsibly selected, auditable data sources. With multiple pre-trained models available as starting points, developers can readily customize them for domain-specific use cases, such as information technology, human resource help assistants, and research & development research assistants.
35
 
36
  This model is ready for commercial use.
37
 
38
+ We are excited to announce the open sourcing of this commercial model. For users interested in deploying this model in production environments, it is also available via the model API in NVIDIA Inference Microservices (NIM) at [nemoretriever-ocr-v1](https://build.nvidia.com/nvidia/nemoretriever-ocr-v1).
39
 
40
  ### **License/Terms of use**
41
 
 
57
 
58
  ### Use Case
59
 
60
+ The **NeMo Retriever OCR v1** model is designed for high-accuracy and high-speed extraction of textual information from images, making it ideal for powering multimodal retrieval systems, Retrieval-Augmented Generation (RAG) pipelines, and agentic applications that require seamless integration of visual and language understanding. Its robust performance and efficiency make it an excellent choice for next-generation AI systems that demand both precision and scalability across diverse real-world content.
61
 
62
  ### Release Date
63
 
64
+ 10/23/2025 via https://huggingface.co/nvidia/nemoretriever-ocr-v1
65
 
66
  ### References
67
 
 
71
 
72
  **Architecture Type:** Hybrid detector–recognizer with document-level relational modeling
73
 
74
+ The NeMo Retriever OCR v1 model integrates three specialized neural components:
75
 
76
  - **Text Detector:** Utilizes a RegNetY-8GF convolutional backbone for high-accuracy localization of text regions within images.
77
  - **Text Recognizer:** Employs a Transformer-based sequence recognizer to transcribe text from detected regions, supporting variable word and line lengths.
 
163
  ```
164
  - Using https
165
  ```
166
+ git clone https://huggingface.co/nvidia/nemoretriever-ocr-v1
167
  ```
168
  - Or using ssh
169
  ```
170
+ git clone git@hf.co:nvidia/nemoretriever-ocr-v1
171
  ```
172
 
173
  2. Installation
 
179
  - Run the following command to install the package:
180
 
181
  ```bash
182
+ cd nemo-retriever-ocr
183
  pip install hatchling
184
  pip install -v .
185
  ```
 
197
  - From the repo root, bring up the service to run the example against the provided image `ocr-example-input-1.png`:
198
 
199
  ```bash
200
+ docker compose run --rm nemo-retriever-ocr \
201
  bash -lc "python example.py ocr-example-input-1.png --merge-level paragraph"
202
  ```
203
 
 
212
  3. Run the model using the following code:
213
 
214
  ```python
215
+ from nemo_retriever_ocr.inference.pipeline import NemoRetrieverOCR
216
 
217
+ ocr = NemoRetrieverOCR()
218
 
219
  predictions = ocr("ocr-example-input-1.png")
220
 
 
230
  ### Software Integration
231
 
232
  **Runtime Engine(s):**
233
+ - **NeMo Retriever OCR v1** NIM
234
 
235
 
236
  **Supported Hardware Microarchitecture Compatibility [List in Alphabetic Order]:**
 
247
 
248
  ## Model Version(s):
249
 
250
+ * `nemoretriever-ocr-v1`
251
 
252
  ## **Training and Evaluation Datasets:**
253
 
 
267
 
268
  ### **Evaluation Datasets**
269
 
270
+ The NeMo Retriever OCR v1 model is evaluated on several NVIDIA internal datasets for various tasks, such as pure OCR, table content extraction, and document retrieval.
271
 
272
  **Data Collection Method:** Hybrid (Automated, Human, Synthetic)<br>
273
  **Labeling Method:** Hybrid (Automated, Human, Synthetic)<br>
 
275
 
276
  ### **Evaluation Results**
277
 
278
+ We benchmarked NeMo Retriever OCR v1 on internal evaluation datasets against PaddleOCR on various tasks, such as pure OCR (Character Error Rate), table content extraction (TEDS), and document retrieval (Recall@5).
279
 
280
+ | Metric | NeMo Retriever OCR v1 | PaddleOCR | Net change |
281
  |-------------------------------------------|--------------------|-----------|-----------------|
282
  | Character Error Rate | 0.1633 | 0.2029 | -19.5% ✔️ |
283
  | Bag-of-character Error Rate | 0.0453 | 0.0512 | -11.5% ✔️ |
THIRD_PARTY_NOTICES.md CHANGED
@@ -1,519 +1,15 @@
1
- Copyright "Angus Johnson" - Boost Software License 1.0
2
- License Text([https://sourceforge.net/p/polyclipping/code/HEAD/tree/tags/6.2.0/License.txt](https://sourceforge.net/p/polyclipping/code/HEAD/tree/tags/6.2.0/License.txt))
3
-
4
- This notice applies to **clipper**.
5
-
6
- Copyright (c) 2010-2014 Angus Johnson
7
-
8
- Boost Software License - Version 1.0 - August 17th, 2003
9
-
10
- Permission is hereby granted, free of charge, to any person or organization
11
- obtaining a copy of the software and accompanying documentation covered by
12
- this license (the "Software") to use, reproduce, display, distribute,
13
- execute, and transmit the Software, and to prepare derivative works of the
14
- Software, and to permit third-parties to whom the Software is furnished to
15
- do so, all subject to the following:
16
-
17
- The copyright notices in the Software and this entire statement, including
18
- the above license grant, this restriction and the following disclaimer,
19
- must be included in all copies of the Software, in whole or in part, and
20
- all derivative works of the Software, unless such copies or derivative
21
- works are solely in the form of machine-executable object code generated by
22
- a source language processor.
23
-
24
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
- FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
27
- SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
28
- FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
29
- ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
30
- DEALINGS IN THE SOFTWARE.
31
-
32
- -----
33
-
34
- Copyright "Ofek Lev" - MIT License
35
- License Text([https://github.com/pypa/hatch/blob/master/LICENSE.txt](https://github.com/pypa/hatch/blob/master/LICENSE.txt))
36
-
37
- This notice applies to **hatchling**.
38
-
39
- Copyright (c) 2017-present Ofek Lev <ofekmeister@gmail.com>
40
-
41
- Permission is hereby granted, free of charge, to any person obtaining a copy
42
- of this software and associated documentation files (the "Software"), to deal
43
- in the Software without restriction, including without limitation the rights
44
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
45
- copies of the Software, and to permit persons to whom the Software is
46
- furnished to do so, subject to the following conditions:
47
-
48
- The above copyright notice and this permission notice shall be included in all
49
- copies or substantial portions of the Software.
50
-
51
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
53
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
54
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
55
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
56
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
57
- SOFTWARE.
58
-
59
- -----
60
-
61
- Copyright "NumPy Developers" - BSD 3-Clause License
62
- License Text([https://github.com/numpy/numpy/blob/main/LICENSE.txt](https://github.com/numpy/numpy/blob/main/LICENSE.txt))
63
-
64
- This notice applies to **numpy**.
65
-
66
- Copyright (c) 2005-2023, NumPy Developers.
67
- All rights reserved.
68
-
69
- Redistribution and use in source and binary forms, with or without
70
- modification, are permitted provided that the following conditions are
71
- met:
72
-
73
- * Redistributions of source code must retain the above copyright
74
- notice, this list of conditions and the following disclaimer.
75
-
76
- * Redistributions in binary form must reproduce the above
77
- copyright notice, this list of conditions and the following
78
- disclaimer in the documentation and/or other materials provided
79
- with the distribution.
80
-
81
- * Neither the name of the NumPy Developers nor the names of any
82
- contributors may be used to endorse or promote products derived
83
- from this software without specific prior written permission.
84
-
85
-
86
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
-
98
- -----
99
-
100
- Copyright "pandas Developers" - BSD 3-Clause License
101
- License Text([https://github.com/pandas-dev/pandas/blob/main/LICENSE](https://github.com/pandas-dev/pandas/blob/main/LICENSE))
102
-
103
- This notice applies to **pandas**.
104
-
105
- Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
106
- All rights reserved.
107
-
108
- Copyright (c) 2011-2023, The PyData Development Team
109
- All rights reserved.
110
-
111
- Redistribution and use in source and binary forms, with or without
112
- modification, are permitted provided that the following conditions are
113
- met:
114
-
115
-
116
- * Redistributions of source code must retain the above copyright
117
- notice, this list of conditions and the following disclaimer.
118
-
119
- * Redistributions in binary form must reproduce the above
120
- copyright notice, this list of conditions and the following
121
- disclaimer in the documentation and/or other materials provided
122
- with the distribution.
123
-
124
- * Neither the name of the pandas development team nor the names of
125
- any contributors may be used to endorse or promote products
126
- derived from this software without specific prior written
127
- permission.
128
-
129
-
130
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
131
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
132
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
133
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
134
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
135
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
136
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
137
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
138
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
139
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
140
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
141
-
142
- -----
143
-
144
- Copyright "Secret Labs AB, Fredrik Lundh, Alex Clark and contributors" - Pillow License
145
- License Text([https://github.com/python-pillow/Pillow/blob/main/LICENSE](https://github.com/python-pillow/Pillow/blob/main/LICENSE))
146
-
147
- This notice applies to **PIL (Pillow)**.
148
-
149
- The Python Imaging Library (PIL) is
150
- Copyright (c) 1997-2011 by Secret Labs AB
151
- Copyright (c) 1995-2011 by Fredrik Lundh
152
- Copyright (c) 2010-2023 by Alex Clark and contributors
153
-
154
- Like PIL, Pillow is licensed under the open source HPND License:
155
-
156
- By obtaining, using, and/or copying this software and/or its
157
- associated documentation, you agree that you have read, understood,
158
- and will comply with the following terms and conditions:
159
-
160
- Permission to use, copy, modify, and distribute this software and
161
- its associated documentation for any purpose and without fee is
162
- hereby granted, provided that the above copyright notice appears in
163
- all copies, and that both that copyright notice and this permission
164
- notice appear in supporting documentation, and that the name of
165
- Secret Labs AB or the author not be used in advertising or publicity
166
- pertaining to distribution of the software without specific, written
167
- prior permission.
168
-
169
- SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
170
- TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
171
- ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
172
- BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
173
- DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
174
- WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
175
- ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
176
- OF THIS SOFTWARE.
177
-
178
- -----
179
-
180
- Copyright "The scikit-learn developers" - BSD 3-Clause License
181
- License Text([https://github.com/scikit-learn/scikit-learn/blob/main/COPYING](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING))
182
-
183
- This notice applies to **scikit-learn**.
184
-
185
- Copyright (c) 2007-2024 The scikit-learn developers.
186
- All rights reserved.
187
-
188
- Redistribution and use in source and binary forms, with or without
189
- modification, are permitted provided that the following conditions are
190
- met:
191
-
192
-
193
- * Redistributions of source code must retain the above copyright
194
- notice, this list of conditions and the following disclaimer.
195
-
196
- * Redistributions in binary form must reproduce the above
197
- copyright notice, this list of conditions and the following
198
- disclaimer in the documentation and/or other materials provided
199
- with the distribution.
200
-
201
- * Neither the name of the scikit-learn developers nor the names of
202
- any contributors may be used to endorse or promote products
203
- derived from this software without specific prior written
204
- permission.
205
-
206
-
207
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
208
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
209
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
210
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
211
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
212
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
213
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
214
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
215
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
216
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
217
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
218
-
219
- -----
220
-
221
- Copyright "Jason R. Coombs" - MIT License
222
- License Text([https://github.com/pypa/setuptools/blob/main/LICENSE](https://github.com/pypa/setuptools/blob/main/LICENSE))
223
-
224
- This notice applies to **setuptools**.
225
-
226
- Copyright (c) 2016 Jason R. Coombs <jaraco@jaraco.com>
227
-
228
- Permission is hereby granted, free of charge, to any person obtaining a copy
229
- of this software and associated documentation files (the "Software"), to deal
230
- in the Software without restriction, including without limitation the rights
231
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
232
- copies of the Software, and to permit persons to whom the Software is
233
- furnished to do so, subject to the following conditions:
234
-
235
- The above copyright notice and this permission notice shall be included in all
236
- copies or substantial portions of the Software.
237
-
238
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
239
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
240
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
241
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
242
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
243
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
244
- SOFTWARE.
245
-
246
- -----
247
-
248
- Copyright "Sean Gillies" - BSD 3-Clause License
249
- License Text([https://github.com/shapely/shapely/blob/main/LICENSE.txt](https://github.com/shapely/shapely/blob/main/LICENSE.txt))
250
-
251
- This notice applies to **Shapely**.
252
-
253
- Copyright (c) 2007, Sean Gillies.
254
- All rights reserved.
255
-
256
- Redistribution and use in source and binary forms, with or without
257
- modification, are permitted provided that the following conditions are met:
258
-
259
-
260
- * Redistributions of source code must retain the above copyright
261
- notice, this list of conditions and the following disclaimer.
262
-
263
- * Redistributions in binary form must reproduce the above copyright
264
- notice, this list of conditions and the following disclaimer in the
265
- documentation and/or other materials provided with the distribution.
266
-
267
- * Neither the name of Sean Gillies nor the names of
268
- its contributors may be used to endorse or promote products derived from
269
- this software without specific prior written permission.
270
-
271
-
272
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
273
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
274
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
275
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
276
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
277
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
278
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
279
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
280
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
281
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
282
- POSSIBILITY OF SUCH DAMAGE.
283
-
284
- -----
285
-
286
- Copyright "PyTorch Contributors" - BSD-style License
287
- License Text([https://github.com/pytorch/pytorch/blob/main/LICENSE](https://github.com/pytorch/pytorch/blob/main/LICENSE))
288
-
289
- This notice applies to **torch** and **torchvision**.
290
-
291
- Copyright (c) 2016- Facebook, Inc. (Adam Paszke)
292
- Copyright (c) 2014- Facebook, Inc. (Soumith Chintala)
293
- Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
294
- Copyright (c) 2012-2014 DeepMind Technologies (Koray Kavukcuoglu)
295
- Copyright (c) 2011-2012 NEC Laboratories America (Clement Farabet)
296
- Copyright (c) 2011-2013 New York University (Antoine Bordes)
297
- Copyright (c) 2012-2013 University of Montreal (Pascal Vincent)
298
- Copyright (c) 2014- Google Inc.
299
- Copyright (c) 2015- Twitter, Inc.
300
- Copyright (c) 2015- Intel Corporation
301
- Copyright (c) 2015- AMD Inc.
302
- Copyright (c) 2016- Baidu, Inc.
303
- Copyright (c) 2016- Microsoft Corporation
304
- Copyright (c) 2017- Amazon.com, Inc.
305
- Copyright (c) 2018- Facebook AI Research
306
- Copyright (c) 2019- fast.ai, Inc.
307
- Copyright (c) 2022- PyTorch Contributors
308
- All rights reserved.
309
-
310
- Redistribution and use in source and binary forms, with or without
311
- modification, are permitted provided that the following conditions are met:
312
-
313
- * Redistributions of source code must retain the above copyright notice, this
314
- list of conditions and the following disclaimer.
315
-
316
- * Redistributions in binary form must reproduce the above copyright notice,
317
- this list of conditions and the following disclaimer in the documentation
318
- and/or other materials provided with the distribution.
319
-
320
- * Neither the name of Facebook Inc. nor the names of its contributors may be
321
- used to endorse or promote products derived from this software without
322
- specific prior written permission.
323
-
324
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
325
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
326
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
327
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
328
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
329
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
330
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
331
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
332
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
333
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
334
-
335
- -----
336
-
337
- Copyright "Baidu USA LLC" - Apache License 2.0
338
- License Text([https://github.com/bryancatanzaro/trove/blob/master/LICENSE](https://github.com/bryancatanzaro/trove/blob/master/LICENSE))
339
-
340
- This notice applies to **trove**.
341
-
342
- Copyright 2015-2016 Baidu USA LLC. All rights reserved.
343
-
344
- Apache License
345
- Version 2.0, January 2004
346
- http://www.apache.org/licenses/
347
-
348
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
349
-
350
- 1. Definitions.
351
-
352
- "License" shall mean the terms and conditions for use, reproduction,
353
- and distribution as defined by Sections 1 through 9 of this document.
354
-
355
- "Licensor" shall mean the copyright owner or entity authorized by
356
- the copyright owner that is granting the License.
357
-
358
- "Legal Entity" shall mean the union of the acting entity and all
359
- other entities that control, are controlled by, or are under common
360
- control with that entity. For the purposes of this definition,
361
- "control" means (i) the power, direct or indirect, to cause the
362
- direction or management of such entity, whether by contract or
363
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
364
- outstanding shares, or (iii) beneficial ownership of such entity.
365
-
366
- "You" (or "Your") shall mean an individual or Legal Entity
367
- exercising permissions granted by this License.
368
-
369
- "Source" form shall mean the preferred form for making modifications,
370
- including but not limited to software source code, documentation
371
- source, and configuration files.
372
-
373
- "Object" form shall mean any form resulting from mechanical
374
- transformation or translation of a Source form, including but
375
- not limited to compiled object code, generated documentation,
376
- and conversions to other media types.
377
-
378
- "Work" shall mean the work of authorship, whether in Source or
379
- Object form, made available under the License, as indicated by a
380
- copyright notice that is included in or attached to the work
381
- (an example is provided in the Appendix below).
382
-
383
- "Derivative Works" shall mean any work, whether in Source or Object
384
- form, that is based on (or derived from) the Work and for which the
385
- editorial revisions, annotations, elaborations, or other modifications
386
- represent, as a whole, an original work of authorship. For the purposes
387
- of this License, Derivative Works shall not include works that remain
388
- separable from, or merely link (or bind by name) to the interfaces of,
389
- the Work and Derivative Works thereof.
390
-
391
- "Contribution" shall mean any work of authorship, including
392
- the original version of the Work and any modifications or additions
393
- to that Work or Derivative Works thereof, that is intentionally
394
- submitted to Licensor for inclusion in the Work by the copyright owner
395
- or by an individual or Legal Entity authorized to submit on behalf of
396
- the copyright owner. For the purposes of this definition, "submitted"
397
- means any form of electronic, verbal, or written communication sent
398
- to the Licensor or its representatives, including but not limited to
399
- communication on electronic mailing lists, source code control systems,
400
- and issue tracking systems that are managed by, or on behalf of, the
401
- Licensor for the purpose of discussing and improving the Work, but
402
- excluding communication that is conspicuously marked or otherwise
403
- designated in writing by the copyright owner as "Not a Contribution."
404
-
405
- "Contributor" shall mean Licensor and any individual or Legal Entity
406
- on behalf of whom a Contribution has been received by Licensor and
407
- subsequently incorporated within the Work.
408
-
409
- 2. Grant of Copyright License. Subject to the terms and conditions of
410
- this License, each Contributor hereby grants to You a perpetual,
411
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
412
- copyright license to reproduce, prepare Derivative Works of,
413
- publicly display, publicly perform, sublicense, and distribute the
414
- Work and such Derivative Works in Source or Object form.
415
-
416
- 3. Grant of Patent License. Subject to the terms and conditions of
417
- this License, each Contributor hereby grants to You a perpetual,
418
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
419
- (except as stated in this section) patent license to make, have made,
420
- use, offer to sell, sell, import, and otherwise transfer the Work,
421
- where such license applies only to those patent claims licensable
422
- by such Contributor that are necessarily infringed by their
423
- Contribution(s) alone or by combination of their Contribution(s)
424
- with the Work to which such Contribution(s) was submitted. If You
425
- institute patent litigation against any entity (including a
426
- cross-claim or counterclaim in a lawsuit) alleging that the Work
427
- or a Contribution incorporated within the Work constitutes direct
428
- or contributory patent infringement, then any patent licenses
429
- granted to You under this License for that Work shall terminate
430
- as of the date such litigation is filed.
431
-
432
- 4. Redistribution. You may reproduce and distribute copies of the
433
- Work or Derivative Works thereof in any medium, with or without
434
- modifications, and in Source or Object form, provided that You
435
- meet the following conditions:
436
-
437
- (a) You must give any other recipients of the Work or
438
- Derivative Works a copy of this License; and
439
-
440
- (b) You must cause any modified files to carry prominent notices
441
- stating that You changed the files; and
442
-
443
- (c) You must retain, in the Source form of any Derivative Works
444
- that You distribute, all copyright, patent, trademark, and
445
- attribution notices from the Source form of the Work,
446
- excluding those notices that do not pertain to any part of
447
- the Derivative Works; and
448
-
449
- (d) If the Work includes a "NOTICE" text file as part of its
450
- distribution, then any Derivative Works that You distribute must
451
- include a readable copy of the attribution notices contained
452
- within such NOTICE file, excluding those notices that do not
453
- pertain to any part of the Derivative Works, in at least one
454
- of the following places: within a NOTICE text file distributed
455
- as part of the Derivative Works; within the Source form or
456
- documentation, if provided along with the Derivative Works; or,
457
- within a display generated by the Derivative Works, if and
458
- wherever such third-party notices normally appear. The contents
459
- of the NOTICE file are for informational purposes only and
460
- do not modify the License. You may add Your own attribution
461
- notices within Derivative Works that You distribute, alongside
462
- or as an addendum to the NOTICE text from the Work, provided
463
- that such additional attribution notices cannot be construed
464
- as modifying the License.
465
-
466
- You may add Your own copyright statement to Your modifications and
467
- may provide additional or different license terms and conditions
468
- for use, reproduction, or distribution of Your modifications, or
469
- for any such Derivative Works as a whole, provided Your use,
470
- reproduction, and distribution of the Work otherwise complies with
471
- the conditions stated in this License.
472
-
473
- 5. Submission of Contributions. Unless You explicitly state otherwise,
474
- any Contribution intentionally submitted for inclusion in the Work
475
- by You to the Licensor shall be under the terms and conditions of
476
- this License, without any additional terms or conditions.
477
- Notwithstanding the above, nothing herein shall supersede or modify
478
- the terms of any separate license agreement you may have executed
479
- with Licensor regarding such Contributions.
480
-
481
- 6. Trademarks. This License does not grant permission to use the trade
482
- names, trademarks, service marks, or product names of the Licensor,
483
- except as required for reasonable and customary use in describing the
484
- origin of the Work and reproducing the content of the NOTICE file.
485
-
486
- 7. Disclaimer of Warranty. Unless required by applicable law or
487
- agreed to in writing, Licensor provides the Work (and each
488
- Contributor provides its Contributions) on an "AS IS" BASIS,
489
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
490
- implied, including, without limitation, any warranties or conditions
491
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
492
- PARTICULAR PURPOSE. You are solely responsible for determining the
493
- appropriateness of using or redistributing the Work and assume any
494
- risks associated with Your exercise of permissions under this License.
495
-
496
- 8. Limitation of Liability. In no event and under no legal theory,
497
- whether in tort (including negligence), contract, or otherwise,
498
- unless required by applicable law (such as deliberate and grossly
499
- negligent acts) or agreed to in writing, shall any Contributor be
500
- liable to You for damages, including any direct, indirect, special,
501
- incidental, or consequential damages of any character arising as a
502
- result of this License or out of the use or inability to use the
503
- Work (including but not limited to damages for loss of goodwill,
504
- work stoppage, computer failure or malfunction, or any and all
505
- other commercial damages or losses), even if such Contributor
506
- has been advised of the possibility of such damages.
507
-
508
- 9. Accepting Warranty or Additional Liability. While redistributing
509
- the Work or Derivative Works thereof, You may choose to offer,
510
- and charge a fee for, acceptance of support, warranty, indemnity,
511
- or other liability obligations and/or rights consistent with this
512
- License. However, in accepting such obligations, You may act only
513
- on Your own behalf and on Your sole responsibility, not on behalf
514
- of any other Contributor, and only if You agree to indemnify,
515
- defend, and hold each Contributor harmless for any liability
516
- incurred by, or claims asserted against, such Contributor by reason
517
- of your accepting any such warranty or additional liability.
518
-
519
- END OF TERMS AND CONDITIONS
 
1
+ # Third Party Notices
2
+
3
+ The scripts contained in this repository make use of the following third-party libraries:
4
+
5
+ - [clipper](https://sourceforge.net/p/polyclipping/code/HEAD/tree/tags/6.2.0/License.txt)
6
+ - [hatchling](https://github.com/pypa/hatch/blob/master/LICENSE.txt)
7
+ - [numpy](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
8
+ - [pandas](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
9
+ - [Pillow](https://github.com/python-pillow/pillow/blob/main/LICENSE)
10
+ - [scikit_learn](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING)
11
+ - [setuptools](https://github.com/pypa/setuptools/blob/main/LICENSE)
12
+ - [Shapely](https://github.com/shapely/shapely/blob/main/LICENSE.txt)
13
+ - [torch](https://github.com/pytorch/pytorch/blob/main/LICENSE)
14
+ - [torchvision](https://github.com/pytorch/vision/blob/main/LICENSE)
15
+ - [trove](https://github.com/bryancatanzaro/trove/blob/master/LICENSE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker-compose.yaml CHANGED
@@ -1,5 +1,5 @@
1
  services:
2
- nemotron-ocr:
3
  build:
4
  context: .
5
  dockerfile: Dockerfile
 
1
  services:
2
+ nemo-retriever-ocr:
3
  build:
4
  context: .
5
  dockerfile: Dockerfile
example.py CHANGED
@@ -4,11 +4,11 @@
4
 
5
  import argparse
6
 
7
- from nemotron_ocr.inference.pipeline import NemotronOCR
8
 
9
 
10
  def main(image_path, merge_level, no_visualize, model_dir):
11
- ocr_pipeline = NemotronOCR(model_dir=model_dir)
12
 
13
  predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)
14
 
 
4
 
5
  import argparse
6
 
7
+ from nemo_retriever_ocr.inference.pipeline import NemoRetrieverOCR
8
 
9
 
10
  def main(image_path, merge_level, no_visualize, model_dir):
11
+ ocr_pipeline = NemoRetrieverOCR()
12
 
13
  predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)
14
 
{nemotron-ocr → nemo-retriever-ocr}/cpp/.gitattributes RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/.gitignore RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/.gitmodules RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/README.md RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/beam_decode.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/beam_decode.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/kn_lm.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/kn_lm.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/language_model.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/language_model.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/log_sum_exp.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/log_sum_exp.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/ngram_lm_base.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/ngram_lm_base.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/prefix.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/prefix.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/sbo_lm.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/beam_decode/sbo_lm.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/cpu_indirect_grid_sample.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/gpu_grid_sample_utils.cuh RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/gpu_indirect_grid_sample.cu RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/better_grid_sample/grid_sample.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/common.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/common.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/cuda_intellisense.cuh RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/calc_poly_min_rrect.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api_common.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/geometry_api_gpu.cu RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/get_rel_continuation_cos.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/matrix2x2.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/geometry_api/poly_bounds_quad.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/graph_detection/encode_util.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/graph_detection/encode_util.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/half_ops.cu RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/half_ops.cuh RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/local_ips/local_ips.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/local_ips/quad_all_2_all_dist_v2.cu RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/module.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/cpu_non_maximal_suppression.cpp RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/cuda_non_maximal_suppression.cu RENAMED
@@ -157,8 +157,11 @@ void device_row_collapse(torch::PackedTensorAccessor64<T, 5> allQuads,
157
  torch::PackedTensorAccessor64<T, 3> allConfs,
158
  T confThreshold, T iouThreshold,
159
  torch::PackedTensorAccessor64<int32_t, 1> allOutCounts,
160
- torch::PackedTensorAccessor64<T, 3> allOutEmbedQuads,
161
- torch::PackedTensorAccessor64<int32_t, 2> allOutIds)
 
 
 
162
  {
163
  typedef InPlaceQuad_<T> Quadf;
164
  static_assert(sizeof(Quadf) == sizeof(T) * 8, "Invalid QuadMem size!");
@@ -303,9 +306,11 @@ void device_row_collapse(torch::PackedTensorAccessor64<T, 5> allQuads,
303
  }
304
 
305
  write_embed_quad(outEmbedQuads, outQuad, storeOff + procLabel - 1);
 
306
  if (threadRank == 0) {
307
  allOutIds[b][storeOff + procLabel - 1] = r * 32 + startIdx;
308
  }
 
309
  }
310
 
311
  if (threadRank == 0) {
@@ -316,9 +321,9 @@ void device_row_collapse(torch::PackedTensorAccessor64<T, 5> allQuads,
316
  #undef threadRank
317
  }
318
 
319
- template<typename T>
320
  __global__
321
- void device_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
322
  T iouThreshold,
323
  torch::PackedTensorAccessor64<T, 3> embedQuads,
324
  torch::PackedTensorAccessor64<bool, 2> outIsStart,
@@ -327,11 +332,7 @@ void device_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
327
  {
328
  const uint32_t b = blockIdx.y;
329
 
330
- const int32_t quadCt = ptrQuadCts[b];
331
-
332
- if (quadCt == 0) {
333
- return;
334
- }
335
 
336
  const int32_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
337
  const int32_t row = jobIdx / quadCt;
@@ -342,7 +343,7 @@ void device_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
342
  return;
343
  }
344
 
345
- T* exData = embedQuads[b].data();
346
 
347
  const auto qRow = StridedEmbedQuad_<T>{ exData + row * embedQuads.stride(2), embedQuads.stride(1) }.Bounds(),
348
  qCol = StridedEmbedQuad_<T>{ exData + col * embedQuads.stride(2), embedQuads.stride(1) }.Bounds();
@@ -404,9 +405,9 @@ void device_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
404
  }
405
  }
406
 
407
- template<uint32_t NumWarps, typename T, int32_t I_CELL_SIZE>
408
  __global__
409
- void device_a2a_adjacency_build_grid(const int32_t *ptrQuadCts,
410
  torch::PackedTensorAccessor64<T, 3> embedQuads,
411
  torch::PackedTensorAccessor64<int32_t, 4> outGridCells,
412
  torch::PackedTensorAccessor64<int32_t, 3> outQuadCells)
@@ -422,10 +423,10 @@ void device_a2a_adjacency_build_grid(const int32_t *ptrQuadCts,
422
 
423
  const uint32_t b = blockIdx.z;
424
 
425
- const uint32_t quadCt = ptrQuadCts[b];
426
  const uint32_t quadIdx = blockIdx.y;
427
 
428
- if (quadIdx >= quadCt) {
429
  return;
430
  }
431
 
@@ -484,9 +485,9 @@ void device_a2a_adjacency_build_grid(const int32_t *ptrQuadCts,
484
 
485
  typedef uint8_t visit_mask_t;
486
 
487
- template<uint32_t NumWarps, typename T>
488
  __global__
489
- void device_a2a_adjacency_with_grid(const int32_t *ptrQuadCts,
490
  T iouThreshold,
491
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
492
  torch::PackedTensorAccessor64<int32_t, 4> allCells,
@@ -502,10 +503,10 @@ void device_a2a_adjacency_with_grid(const int32_t *ptrQuadCts,
502
 
503
  const uint32_t b = blockIdx.z;
504
 
505
- const uint32_t quadCt = ptrQuadCts[b];
506
  const uint32_t quadIdx = blockIdx.y;
507
 
508
- if (quadIdx >= quadCt) {
509
  return;
510
  }
511
 
@@ -534,7 +535,7 @@ void device_a2a_adjacency_with_grid(const int32_t *ptrQuadCts,
534
  auto exAdjCounts = reinterpret_cast<uint32_t*>(outAdjCounts[b].data());
535
  auto exAdjValues = outSparseAdj[b][quadIdx].data();
536
 
537
- T *exData = allEmbedQuads[b].data();
538
 
539
  const auto bdsAnchor = Quad_<T>{ s_quadVerts }.Bounds();
540
 
@@ -598,8 +599,9 @@ void device_a2a_adjacency_with_grid(const int32_t *ptrQuadCts,
598
  }
599
  }
600
 
 
601
  __global__
602
- void device_flatten_graph_iterative(const int32_t *ptrQuadCts,
603
  torch::PackedTensorAccessor64<bool, 2> allIsStart,
604
  volatile uint32_t *allAdjCounts,
605
  volatile uint32_t *allAdjValues
@@ -620,12 +622,14 @@ void device_flatten_graph_iterative(const int32_t *ptrQuadCts,
620
  const uint32_t b = blockIdx.z;
621
  const uint32_t anchorRow = blockIdx.y;
622
 
623
- const uint32_t quadCt = ptrQuadCts[b];
624
 
625
  // Only need to check this if there are multiple examples, since in the case of a single example,
626
  // the grid is precisely sized to that quadCt
627
- if (anchorRow >= quadCt) {
628
- return;
 
 
629
  }
630
 
631
  auto isStart = allIsStart[b].data();
@@ -686,13 +690,12 @@ void device_flatten_graph_iterative(const int32_t *ptrQuadCts,
686
  visitStack[1] = anchorRow;
687
  #ifndef NDEBUG
688
  for (uint32_t i = 2; i < VISIT_STACK_SIZE; ++i) {
689
- visitStack[i] = TERM_VALUE;
690
  }
691
  #endif
692
  int32_t visitPtr = 1;
693
 
694
- // NOTE: This loop is actually terminated by the `if (warpNextCol == TERM_VALUE)` check below
695
- for (uint32_t dfsIter = 0; true; ++dfsIter) {
696
  #ifdef NMS_VERIFY_CORRECTNESS
697
  assert(visitPtr >= 0 && visitPtr < VISIT_STACK_SIZE);
698
  #endif
@@ -704,7 +707,7 @@ void device_flatten_graph_iterative(const int32_t *ptrQuadCts,
704
  if (threadNextCol == warpNextCol) {
705
  #ifndef NDEBUG
706
  // This makes it easier to debug where the pointer is
707
- visitStack[visitPtr] = TERM_VALUE;
708
  #endif
709
  --visitPtr;
710
  }
@@ -728,15 +731,12 @@ void device_flatten_graph_iterative(const int32_t *ptrQuadCts,
728
  const uint32_t procAdjCount = adjCounts[procRow];
729
  auto procAdjValues = adjValues + (procRow * maxExCount);
730
 
 
 
 
 
731
  for (uint32_t i = threadRank; i < procAdjCount; i += WARP_SIZE) {
732
- uint32_t adjCol = procAdjValues[i];
733
-
734
- auto group = cg::coalesced_threads();
735
- // Offsetting by the iteration number will help balance out the maximum depth of any stack in the warp.
736
- // The reason behind this is due to how otherwise, warp-0 will always get a new element, warp-1 iff the adj graph
737
- // has more than one element, warp-2 iff the adj graph has more than two elements, and so on. Basically,
738
- // the warps have decreasing pressure. With the rotation mechanism, it helps to balance out stack usage.
739
- adjCol = group.shfl(adjCol, (group.thread_rank() + dfsIter) % group.size());
740
 
741
  // This will set the queued flag for this column, if it's not already set.
742
  // It also returns the old state. In our case, we only want to add this value to the
@@ -748,6 +748,7 @@ void device_flatten_graph_iterative(const int32_t *ptrQuadCts,
748
 
749
  bool alreadyAdded = oldMask & ADDED_MASK;
750
 
 
751
  const uint32_t gThreadRank = group.thread_rank();
752
  uint32_t notAddedBallot = group.ballot(!alreadyAdded);
753
  if (notAddedBallot) {
@@ -824,7 +825,8 @@ void add_to_set(const torch::TensorAccessor<int32_t, 1>& adjCounts,
824
  }
825
  }
826
 
827
- void cpu_flatten_graph(const int32_t *ptrQuadCts,
 
828
  torch::Tensor isStartTensorGPU,
829
  torch::Tensor adjCountsTensorGPU,
830
  torch::Tensor adjValuesTensorGPU)
@@ -838,7 +840,7 @@ void cpu_flatten_graph(const int32_t *ptrQuadCts,
838
  auto allAdjValues = adjValuesTensor.accessor<int32_t, 3>();
839
 
840
  for (int32_t b = 0; b < allAdjCounts.size(0); ++b) {
841
- const int32_t quadCt = ptrQuadCts[b];
842
 
843
  for (int32_t row = 0; row < quadCt; ++row) {
844
  std::unordered_set<int32_t> fullAdjSet;
@@ -893,9 +895,9 @@ void device_a2a_adj_cleanup(const int32_t *counts,
893
  }
894
  }
895
 
896
- template<uint32_t NumWarps, typename T>
897
  __global__
898
- void device_a2a_collapse(torch::PackedTensorAccessor64<int32_t, 1> quadCounts,
899
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
900
  torch::PackedTensorAccessor64<bool, 2> allIsLeadRow,
901
  const int64_t *regionCounts,
@@ -915,14 +917,16 @@ void device_a2a_collapse(torch::PackedTensorAccessor64<int32_t, 1> quadCounts,
915
  const uint32_t b = blockIdx.z;
916
  const uint32_t row = blockIdx.y;
917
 
918
- const int32_t quadCt = quadCounts[b];
919
 
920
- if (row >= quadCt) {
921
- return;
 
 
922
  }
923
 
924
  // Only process the lead rows
925
- const auto isLeadRow = allIsLeadRow[b].data();
926
  if (!isLeadRow[row]) {
927
  return;
928
  }
@@ -941,7 +945,7 @@ void device_a2a_collapse(torch::PackedTensorAccessor64<int32_t, 1> quadCounts,
941
  __syncthreads();
942
  }
943
 
944
- T *exData = allEmbedQuads[b].data();
945
 
946
  const int32_t adjCount = allAdjCounts[b][row];
947
  const int32_t *adjIdxs = allAdjValues[b][row].data();
@@ -982,12 +986,20 @@ void device_a2a_collapse(torch::PackedTensorAccessor64<int32_t, 1> quadCounts,
982
 
983
  // Figure out the output position
984
  uint32_t writePosition = 0;
985
- for (int32_t i = threadRank; i < b; i += BLOCK_WIDTH) {
986
- writePosition += regionCounts[i];
 
 
987
  }
988
 
 
989
  const uint8_t *pCurrIsLeadRow = reinterpret_cast<const uint8_t*>(isLeadRow);
990
- for (int32_t i = threadRank; i < row; i += BLOCK_WIDTH) {
 
 
 
 
 
991
  if (pCurrIsLeadRow[i]) {
992
  ++writePosition;
993
  }
@@ -1063,9 +1075,13 @@ CollapseRowsResult collapse_rows(
1063
  int64_t embedSize = sizeof(EmbedQuad_<scalar_t>) / sizeof(scalar_t);
1064
  auto rowMergeTensor = torch::empty({ quads.size(0), embedSize, quads.size(1) * quads.size(2) }, quads.options());
1065
 
 
1066
  auto idsTensor = torch::full({ quads.size(0), quads.size(1) * quads.size(2) },
1067
  std::numeric_limits<int32_t>::max(),
1068
  counts.options().dtype(torch::kInt32));
 
 
 
1069
 
1070
  dim3 blockSize(32, 3, 1);
1071
  dim3 gridSize(1,
@@ -1077,8 +1093,10 @@ CollapseRowsResult collapse_rows(
1077
  probs.packed_accessor64<scalar_t, 3>(),
1078
  probThreshold, iouThreshold,
1079
  counts.packed_accessor64<int32_t, 1>(),
1080
- rowMergeTensor.packed_accessor64<scalar_t, 3>(),
1081
- idsTensor.packed_accessor64<int32_t, 2>()
 
 
1082
  );
1083
 
1084
  #ifdef NMS_VERIFY_CORRECTNESS
@@ -1101,6 +1119,7 @@ CollapseRowsResult collapse_rows(
1101
 
1102
  counts = counts.slice(/*dim=*/ 0, 0, counts.size(0) - 1);
1103
 
 
1104
  int64_t maxExCount;
1105
  if (counts.size(0) > 1) {
1106
  maxExCount = counts.max().item<int32_t>();
@@ -1112,12 +1131,13 @@ CollapseRowsResult collapse_rows(
1112
 
1113
  rowMergeTensor = rowMergeTensor.slice(2, 0, maxExCount);
1114
  idsTensor = idsTensor.slice(1, 0, maxExCount);
1115
- auto order = torch::argsort(idsTensor, /*dim=*/ 1, s_sortOrder);
1116
 
1117
  auto embOrder = order.unsqueeze(1).expand_as(rowMergeTensor);
1118
 
1119
  rowMergeTensor = torch::gather(rowMergeTensor, /*dim=*/ 2, embOrder);
1120
  idsTensor = torch::gather(idsTensor, /*dim=*/ 1, order);
 
1121
 
1122
  return { counts, rowMergeTensor, totalQuads, idsTensor, imageWidth, imageHeight };
1123
  }
@@ -1157,8 +1177,8 @@ struct AdjacencyResult {
1157
  int64_t MaxExCount;
1158
  };
1159
 
1160
- template<typename T>
1161
- void cpu_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
1162
  const T iouThreshold,
1163
  torch::Tensor embedQuadsTensor,
1164
  torch::Tensor outIsStartTensorGPU,
@@ -1176,7 +1196,7 @@ void cpu_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
1176
  auto adjValues = outSparseAdjTensor.accessor<int32_t, 3>();
1177
 
1178
  for (int32_t b = 0; b < embedQuadsTensor.size(0); ++b) {
1179
- const int32_t quadCt = ptrQuadCts[b];
1180
 
1181
  T *exData = embedQuads[b].data();
1182
 
@@ -1264,6 +1284,13 @@ AdjacencyResult compute_all_to_all_adjacency(
1264
  counts.options().dtype(torch::kInt32));
1265
  #endif
1266
 
 
 
 
 
 
 
 
1267
  #ifdef NMS_VERIFY_CORRECTNESS
1268
  auto cpuAdjValuesTensor = adjValuesTensor.cpu();
1269
  auto cpuAdjCountsTensor = adjCountsTensor.cpu();
@@ -1291,15 +1318,23 @@ AdjacencyResult compute_all_to_all_adjacency(
1291
  //blockSize = dim3{ GRID_NUM_WARPS * 32, 1, 1 };
1292
  //gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1293
 
1294
- //device_a2a_adjacency_build_grid<GRID_NUM_WARPS, scalar_t, CELL_SIZE> KERNEL_ARG2(gridSize, blockSize) (
1295
- // counts.data_ptr<int32_t>(),
 
 
 
 
1296
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1297
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
1298
  // quadCellExtentsTensor.packed_accessor64<int32_t, 3>()
1299
  //);
1300
 
1301
- //device_a2a_adjacency_with_grid<GRID_NUM_WARPS, scalar_t> KERNEL_ARG3(gridSize, blockSize, smemSize) (
1302
- // counts.data_ptr<int32_t>(),
 
 
 
 
1303
  // iouThreshold,
1304
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1305
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
@@ -1316,9 +1351,11 @@ AdjacencyResult compute_all_to_all_adjacency(
1316
  gridSize = dim3{div_up(totalWork, blockSize.x),
1317
  static_cast<uint32_t>(counts.size(0))};
1318
 
 
 
1319
  // This algorithm is O(n^2) with n being the current number of quads
1320
- device_a2a_adjacency_sparse<scalar_t> KERNEL_ARG2(gridSize, blockSize) (
1321
- counts.data_ptr<int32_t>(),
1322
  iouThreshold,
1323
  collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1324
  isStartTensor.packed_accessor64<bool, 2>(),
@@ -1328,9 +1365,7 @@ AdjacencyResult compute_all_to_all_adjacency(
1328
 
1329
 
1330
  #ifdef NMS_VERIFY_CORRECTNESS
1331
- auto cpuCounts = counts.cpu();
1332
-
1333
- cpu_a2a_adjacency_sparse<scalar_t>(cpuCounts.data_ptr<int32_t>(), iouThreshold,
1334
  collapseResult.StridedMergeQuads, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1335
 
1336
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
@@ -1345,12 +1380,16 @@ AdjacencyResult compute_all_to_all_adjacency(
1345
  auto maxDepthTensor = torch::tensor(0, adjCountsTensor.options());
1346
  #endif
1347
 
 
 
 
 
1348
  blockSize = dim3{ 128, 1, 1 };
1349
  gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1350
  smemSize = div_up(maxExCount * sizeof(visit_mask_t), sizeof(uint32_t)) * sizeof(uint32_t);
1351
 
1352
- device_flatten_graph_iterative KERNEL_ARG3(gridSize, blockSize, smemSize) (
1353
- counts.data_ptr<int32_t>(),
1354
  isStartTensor.packed_accessor64<bool, 2>(),
1355
  reinterpret_cast<uint32_t*>(adjCountsTensor.data_ptr<int32_t>()),
1356
  reinterpret_cast<uint32_t*>(adjValuesTensor.data_ptr<int32_t>())
@@ -1360,7 +1399,7 @@ AdjacencyResult compute_all_to_all_adjacency(
1360
  );
1361
 
1362
  #ifdef NMS_VERIFY_CORRECTNESS
1363
- cpu_flatten_graph(cpuCounts.data_ptr<int32_t>(), cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1364
 
1365
  cpuAdjValuesTensor = std::get<0>(torch::sort(cpuAdjValuesTensor, /*dim=*/ 2));
1366
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
@@ -1398,6 +1437,7 @@ AdjacencyResult compute_all_to_all_adjacency(
1398
  cpuIsStartTensor = isStartTensor.cpu();
1399
  cpuAdjCountsTensor = adjCountsTensor.cpu();
1400
  cpuAdjValuesTensor = adjValuesTensor.cpu();
 
1401
  auto cpuCollapseIds = collapseResult.QuadIds.cpu();
1402
 
1403
  static std::vector<std::unordered_set<int32_t>> s_knownGroups;
@@ -1549,11 +1589,22 @@ nms_result_t
1549
  dim3 blockSize(NUM_WARPS * 32, 1, 1);
1550
  dim3 gridSize(1, adjResult.MaxExCount, counts.size(0));
1551
 
 
 
 
 
 
 
 
1552
  torch::Tensor outQuads = torch::empty({ numOutQuads, 4, 2 }, embedQuads.options());
1553
  torch::Tensor outConf = torch::empty({ numOutQuads }, embedQuads.options());
1554
 
1555
- device_a2a_collapse<NUM_WARPS, scalar_t> KERNEL_ARG2(gridSize, blockSize) (
1556
- counts.packed_accessor64<int32_t, 1>(),
 
 
 
 
1557
  embedQuads.packed_accessor64<scalar_t, 3>(),
1558
  isLeadRow.packed_accessor64<bool, 2>(),
1559
  regionCounts.data_ptr<int64_t>(),
 
157
  torch::PackedTensorAccessor64<T, 3> allConfs,
158
  T confThreshold, T iouThreshold,
159
  torch::PackedTensorAccessor64<int32_t, 1> allOutCounts,
160
+ torch::PackedTensorAccessor64<T, 3> allOutEmbedQuads
161
+ #ifdef NMS_VERIFY_CORRECTNESS
162
+ , torch::PackedTensorAccessor64<int32_t, 2> allOutIds
163
+ #endif
164
+ )
165
  {
166
  typedef InPlaceQuad_<T> Quadf;
167
  static_assert(sizeof(Quadf) == sizeof(T) * 8, "Invalid QuadMem size!");
 
306
  }
307
 
308
  write_embed_quad(outEmbedQuads, outQuad, storeOff + procLabel - 1);
309
+ #ifdef NMS_VERIFY_CORRECTNESS
310
  if (threadRank == 0) {
311
  allOutIds[b][storeOff + procLabel - 1] = r * 32 + startIdx;
312
  }
313
+ #endif
314
  }
315
 
316
  if (threadRank == 0) {
 
321
  #undef threadRank
322
  }
323
 
324
+ template<bool IsSingleExample, typename T>
325
  __global__
326
+ void device_a2a_adjacency_sparse(const uint64_t punCounts,
327
  T iouThreshold,
328
  torch::PackedTensorAccessor64<T, 3> embedQuads,
329
  torch::PackedTensorAccessor64<bool, 2> outIsStart,
 
332
  {
333
  const uint32_t b = blockIdx.y;
334
 
335
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
 
 
 
 
336
 
337
  const int32_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
338
  const int32_t row = jobIdx / quadCt;
 
343
  return;
344
  }
345
 
346
+ T* exData = IsSingleExample ? embedQuads.data() : embedQuads[b].data();
347
 
348
  const auto qRow = StridedEmbedQuad_<T>{ exData + row * embedQuads.stride(2), embedQuads.stride(1) }.Bounds(),
349
  qCol = StridedEmbedQuad_<T>{ exData + col * embedQuads.stride(2), embedQuads.stride(1) }.Bounds();
 
405
  }
406
  }
407
 
408
+ template<uint32_t NumWarps, bool IsSingleExample, typename T, int32_t I_CELL_SIZE>
409
  __global__
410
+ void device_a2a_adjacency_build_grid(const uint64_t punCounts,
411
  torch::PackedTensorAccessor64<T, 3> embedQuads,
412
  torch::PackedTensorAccessor64<int32_t, 4> outGridCells,
413
  torch::PackedTensorAccessor64<int32_t, 3> outQuadCells)
 
423
 
424
  const uint32_t b = blockIdx.z;
425
 
426
+ const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
427
  const uint32_t quadIdx = blockIdx.y;
428
 
429
+ if (!IsSingleExample && quadIdx >= quadCt) {
430
  return;
431
  }
432
 
 
485
 
486
  typedef uint8_t visit_mask_t;
487
 
488
+ template<uint32_t NumWarps, bool IsSingleExample, typename T>
489
  __global__
490
+ void device_a2a_adjacency_with_grid(const uint64_t punCounts,
491
  T iouThreshold,
492
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
493
  torch::PackedTensorAccessor64<int32_t, 4> allCells,
 
503
 
504
  const uint32_t b = blockIdx.z;
505
 
506
+ const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
507
  const uint32_t quadIdx = blockIdx.y;
508
 
509
+ if (!IsSingleExample && quadIdx >= quadCt) {
510
  return;
511
  }
512
 
 
535
  auto exAdjCounts = reinterpret_cast<uint32_t*>(outAdjCounts[b].data());
536
  auto exAdjValues = outSparseAdj[b][quadIdx].data();
537
 
538
+ T *exData = IsSingleExample ? allEmbedQuads.data() : allEmbedQuads[b].data();
539
 
540
  const auto bdsAnchor = Quad_<T>{ s_quadVerts }.Bounds();
541
 
 
599
  }
600
  }
601
 
602
+ template<bool IsSingleExample>
603
  __global__
604
+ void device_flatten_graph_iterative(const uint64_t punCounts,
605
  torch::PackedTensorAccessor64<bool, 2> allIsStart,
606
  volatile uint32_t *allAdjCounts,
607
  volatile uint32_t *allAdjValues
 
622
  const uint32_t b = blockIdx.z;
623
  const uint32_t anchorRow = blockIdx.y;
624
 
625
+ const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
626
 
627
  // Only need to check this if there are multiple examples, since in the case of a single example,
628
  // the grid is precisely sized to that quadCt
629
+ if constexpr (!IsSingleExample) {
630
+ if (anchorRow >= quadCt) {
631
+ return;
632
+ }
633
  }
634
 
635
  auto isStart = allIsStart[b].data();
 
690
  visitStack[1] = anchorRow;
691
  #ifndef NDEBUG
692
  for (uint32_t i = 2; i < VISIT_STACK_SIZE; ++i) {
693
+ visitStack[i] = -2;
694
  }
695
  #endif
696
  int32_t visitPtr = 1;
697
 
698
+ while (true) {
 
699
  #ifdef NMS_VERIFY_CORRECTNESS
700
  assert(visitPtr >= 0 && visitPtr < VISIT_STACK_SIZE);
701
  #endif
 
707
  if (threadNextCol == warpNextCol) {
708
  #ifndef NDEBUG
709
  // This makes it easier to debug where the pointer is
710
+ visitStack[visitPtr] = -2;
711
  #endif
712
  --visitPtr;
713
  }
 
731
  const uint32_t procAdjCount = adjCounts[procRow];
732
  auto procAdjValues = adjValues + (procRow * maxExCount);
733
 
734
+ // Offsetting by the iteration number will help balance out the maximum depth of any stack in the warp.
735
+ // The reason behind this is due to how otherwise, warp-0 will always get a new element, warp-1 iff the adj graph
736
+ // has more than one element, warp-2 iff the adj graph has more than two elements, and so on. Basically,
737
+ // the warps have decreasing pressure. With the rotation mechanism, it helps to balance out stack usage.
738
  for (uint32_t i = threadRank; i < procAdjCount; i += WARP_SIZE) {
739
+ const uint32_t adjCol = procAdjValues[i];
 
 
 
 
 
 
 
740
 
741
  // This will set the queued flag for this column, if it's not already set.
742
  // It also returns the old state. In our case, we only want to add this value to the
 
748
 
749
  bool alreadyAdded = oldMask & ADDED_MASK;
750
 
751
+ auto group = cg::coalesced_threads();
752
  const uint32_t gThreadRank = group.thread_rank();
753
  uint32_t notAddedBallot = group.ballot(!alreadyAdded);
754
  if (notAddedBallot) {
 
825
  }
826
  }
827
 
828
+ template<bool IsSingleExample>
829
+ void cpu_flatten_graph(const uint64_t punCounts,
830
  torch::Tensor isStartTensorGPU,
831
  torch::Tensor adjCountsTensorGPU,
832
  torch::Tensor adjValuesTensorGPU)
 
840
  auto allAdjValues = adjValuesTensor.accessor<int32_t, 3>();
841
 
842
  for (int32_t b = 0; b < allAdjCounts.size(0); ++b) {
843
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
844
 
845
  for (int32_t row = 0; row < quadCt; ++row) {
846
  std::unordered_set<int32_t> fullAdjSet;
 
895
  }
896
  }
897
 
898
+ template<uint32_t NumWarps, typename T, bool IsSingleExample>
899
  __global__
900
+ void device_a2a_collapse(const uint64_t punCounts,
901
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
902
  torch::PackedTensorAccessor64<bool, 2> allIsLeadRow,
903
  const int64_t *regionCounts,
 
917
  const uint32_t b = blockIdx.z;
918
  const uint32_t row = blockIdx.y;
919
 
920
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
921
 
922
+ if constexpr (!IsSingleExample) {
923
+ if (row >= quadCt) {
924
+ return;
925
+ }
926
  }
927
 
928
  // Only process the lead rows
929
+ const auto isLeadRow = IsSingleExample ? allIsLeadRow.data() : allIsLeadRow[b].data();
930
  if (!isLeadRow[row]) {
931
  return;
932
  }
 
945
  __syncthreads();
946
  }
947
 
948
+ T *exData = IsSingleExample ? allEmbedQuads.data() : allEmbedQuads[b].data();
949
 
950
  const int32_t adjCount = allAdjCounts[b][row];
951
  const int32_t *adjIdxs = allAdjValues[b][row].data();
 
986
 
987
  // Figure out the output position
988
  uint32_t writePosition = 0;
989
+ if constexpr (!IsSingleExample) {
990
+ for (int32_t i = threadRank; i < b; i += BLOCK_WIDTH) {
991
+ writePosition += regionCounts[i];
992
+ }
993
  }
994
 
995
+ const int32_t numLongs = row >> 3; // Divide by 8
996
  const uint8_t *pCurrIsLeadRow = reinterpret_cast<const uint8_t*>(isLeadRow);
997
+ const uint64_t *lpCurrIsLeadRow = reinterpret_cast<const uint64_t*>(pCurrIsLeadRow);
998
+
999
+ for (int32_t i = threadRank; i < numLongs; i += BLOCK_WIDTH) {
1000
+ writePosition += __popcll(lpCurrIsLeadRow[i]);
1001
+ }
1002
+ for (int32_t i = (numLongs * 8) + threadRank; i < row; i += BLOCK_WIDTH) {
1003
  if (pCurrIsLeadRow[i]) {
1004
  ++writePosition;
1005
  }
 
1075
  int64_t embedSize = sizeof(EmbedQuad_<scalar_t>) / sizeof(scalar_t);
1076
  auto rowMergeTensor = torch::empty({ quads.size(0), embedSize, quads.size(1) * quads.size(2) }, quads.options());
1077
 
1078
+ #ifdef NMS_VERIFY_CORRECTNESS
1079
  auto idsTensor = torch::full({ quads.size(0), quads.size(1) * quads.size(2) },
1080
  std::numeric_limits<int32_t>::max(),
1081
  counts.options().dtype(torch::kInt32));
1082
+ #else
1083
+ torch::Tensor idsTensor;
1084
+ #endif
1085
 
1086
  dim3 blockSize(32, 3, 1);
1087
  dim3 gridSize(1,
 
1093
  probs.packed_accessor64<scalar_t, 3>(),
1094
  probThreshold, iouThreshold,
1095
  counts.packed_accessor64<int32_t, 1>(),
1096
+ rowMergeTensor.packed_accessor64<scalar_t, 3>()
1097
+ #ifdef NMS_VERIFY_CORRECTNESS
1098
+ , idsTensor.packed_accessor64<int32_t, 2>()
1099
+ #endif
1100
  );
1101
 
1102
  #ifdef NMS_VERIFY_CORRECTNESS
 
1119
 
1120
  counts = counts.slice(/*dim=*/ 0, 0, counts.size(0) - 1);
1121
 
1122
+ #ifdef NMS_VERIFY_CORRECTNESS
1123
  int64_t maxExCount;
1124
  if (counts.size(0) > 1) {
1125
  maxExCount = counts.max().item<int32_t>();
 
1131
 
1132
  rowMergeTensor = rowMergeTensor.slice(2, 0, maxExCount);
1133
  idsTensor = idsTensor.slice(1, 0, maxExCount);
1134
+ auto order = torch::argsort(idsTensor, /*dim=*/ 1, s_sortOrder); s_sortOrder = !s_sortOrder;
1135
 
1136
  auto embOrder = order.unsqueeze(1).expand_as(rowMergeTensor);
1137
 
1138
  rowMergeTensor = torch::gather(rowMergeTensor, /*dim=*/ 2, embOrder);
1139
  idsTensor = torch::gather(idsTensor, /*dim=*/ 1, order);
1140
+ #endif
1141
 
1142
  return { counts, rowMergeTensor, totalQuads, idsTensor, imageWidth, imageHeight };
1143
  }
 
1177
  int64_t MaxExCount;
1178
  };
1179
 
1180
+ template<bool IsSingleExample, typename T>
1181
+ void cpu_a2a_adjacency_sparse(const uint64_t punCounts,
1182
  const T iouThreshold,
1183
  torch::Tensor embedQuadsTensor,
1184
  torch::Tensor outIsStartTensorGPU,
 
1196
  auto adjValues = outSparseAdjTensor.accessor<int32_t, 3>();
1197
 
1198
  for (int32_t b = 0; b < embedQuadsTensor.size(0); ++b) {
1199
+ const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
1200
 
1201
  T *exData = embedQuads[b].data();
1202
 
 
1284
  counts.options().dtype(torch::kInt32));
1285
  #endif
1286
 
1287
+ // If the batch is only a single example, instead of hitting global memory for the count, we can
1288
+ // just encode the count into the pointer instead
1289
+ uint64_t ptrCounts = reinterpret_cast<uint64_t>(counts.data_ptr<int32_t>());
1290
+ if (counts.size(0) == 1) {
1291
+ ptrCounts = maxExCount;
1292
+ }
1293
+
1294
  #ifdef NMS_VERIFY_CORRECTNESS
1295
  auto cpuAdjValuesTensor = adjValuesTensor.cpu();
1296
  auto cpuAdjCountsTensor = adjCountsTensor.cpu();
 
1318
  //blockSize = dim3{ GRID_NUM_WARPS * 32, 1, 1 };
1319
  //gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1320
 
1321
+ //auto buildGridFn = counts.size(0) == 1 ?
1322
+ // device_a2a_adjacency_build_grid<GRID_NUM_WARPS, true, scalar_t, CELL_SIZE> :
1323
+ // device_a2a_adjacency_build_grid<GRID_NUM_WARPS, false, scalar_t, CELL_SIZE>;
1324
+
1325
+ //buildGridFn KERNEL_ARG2(gridSize, blockSize) (
1326
+ // ptrCounts,
1327
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1328
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
1329
  // quadCellExtentsTensor.packed_accessor64<int32_t, 3>()
1330
  //);
1331
 
1332
+ //auto adjGridFn = counts.size(0) == 1 ?
1333
+ // device_a2a_adjacency_with_grid<GRID_NUM_WARPS, true, scalar_t> :
1334
+ // device_a2a_adjacency_with_grid<GRID_NUM_WARPS, false, scalar_t>;
1335
+
1336
+ //adjGridFn KERNEL_ARG3(gridSize, blockSize, smemSize) (
1337
+ // ptrCounts,
1338
  // iouThreshold,
1339
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1340
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
 
1351
  gridSize = dim3{div_up(totalWork, blockSize.x),
1352
  static_cast<uint32_t>(counts.size(0))};
1353
 
1354
+ auto adjFn = counts.size(0) == 1 ? device_a2a_adjacency_sparse<true, scalar_t> : device_a2a_adjacency_sparse<false, scalar_t>;
1355
+
1356
  // This algorithm is O(n^2) with n being the current number of quads
1357
+ adjFn KERNEL_ARG2(gridSize, blockSize) (
1358
+ ptrCounts,
1359
  iouThreshold,
1360
  collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1361
  isStartTensor.packed_accessor64<bool, 2>(),
 
1365
 
1366
 
1367
  #ifdef NMS_VERIFY_CORRECTNESS
1368
+ cpu_a2a_adjacency_sparse<true>(ptrCounts, iouThreshold,
 
 
1369
  collapseResult.StridedMergeQuads, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1370
 
1371
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
 
1380
  auto maxDepthTensor = torch::tensor(0, adjCountsTensor.options());
1381
  #endif
1382
 
1383
+ auto traverseFn = counts.size(0) == 1 ?
1384
+ device_flatten_graph_iterative<true> :
1385
+ device_flatten_graph_iterative<false>;
1386
+
1387
  blockSize = dim3{ 128, 1, 1 };
1388
  gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1389
  smemSize = div_up(maxExCount * sizeof(visit_mask_t), sizeof(uint32_t)) * sizeof(uint32_t);
1390
 
1391
+ traverseFn KERNEL_ARG3(gridSize, blockSize, smemSize) (
1392
+ ptrCounts,
1393
  isStartTensor.packed_accessor64<bool, 2>(),
1394
  reinterpret_cast<uint32_t*>(adjCountsTensor.data_ptr<int32_t>()),
1395
  reinterpret_cast<uint32_t*>(adjValuesTensor.data_ptr<int32_t>())
 
1399
  );
1400
 
1401
  #ifdef NMS_VERIFY_CORRECTNESS
1402
+ cpu_flatten_graph<true>(ptrCounts, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1403
 
1404
  cpuAdjValuesTensor = std::get<0>(torch::sort(cpuAdjValuesTensor, /*dim=*/ 2));
1405
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
 
1437
  cpuIsStartTensor = isStartTensor.cpu();
1438
  cpuAdjCountsTensor = adjCountsTensor.cpu();
1439
  cpuAdjValuesTensor = adjValuesTensor.cpu();
1440
+ auto cpuCounts = counts.cpu();
1441
  auto cpuCollapseIds = collapseResult.QuadIds.cpu();
1442
 
1443
  static std::vector<std::unordered_set<int32_t>> s_knownGroups;
 
1589
  dim3 blockSize(NUM_WARPS * 32, 1, 1);
1590
  dim3 gridSize(1, adjResult.MaxExCount, counts.size(0));
1591
 
1592
+ // If the batch is only a single example, instead of hitting global memory for the count, we can
1593
+ // just encode the count into the pointer instead
1594
+ uint64_t ptrCounts = reinterpret_cast<uint64_t>(counts.data_ptr<int32_t>());
1595
+ if (counts.size(0) == 1) {
1596
+ ptrCounts = adjResult.MaxExCount;
1597
+ }
1598
+
1599
  torch::Tensor outQuads = torch::empty({ numOutQuads, 4, 2 }, embedQuads.options());
1600
  torch::Tensor outConf = torch::empty({ numOutQuads }, embedQuads.options());
1601
 
1602
+ auto collapseFn = counts.size(0) == 1 ?
1603
+ device_a2a_collapse<NUM_WARPS, scalar_t, true> :
1604
+ device_a2a_collapse<NUM_WARPS, scalar_t, false>;
1605
+
1606
+ collapseFn KERNEL_ARG2(gridSize, blockSize) (
1607
+ ptrCounts,
1608
  embedQuads.packed_accessor64<scalar_t, 3>(),
1609
  isLeadRow.packed_accessor64<bool, 2>(),
1610
  regionCounts.data_ptr<int64_t>(),
{nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/nms_common.h RENAMED
File without changes
{nemotron-ocr → nemo-retriever-ocr}/cpp/non_maximal_suppression/nms_kd_tree.h RENAMED
File without changes