Charles Blackmon-Luca committed on
Commit
4ba3706
·
unverified ·
1 Parent(s): 90015d3

Fix misaligned address error in quad NMS cuda implementation

Browse files
nemotron-ocr/cpp/non_maximal_suppression/cuda_non_maximal_suppression.cu CHANGED
@@ -157,11 +157,8 @@ void device_row_collapse(torch::PackedTensorAccessor64<T, 5> allQuads,
157
  torch::PackedTensorAccessor64<T, 3> allConfs,
158
  T confThreshold, T iouThreshold,
159
  torch::PackedTensorAccessor64<int32_t, 1> allOutCounts,
160
- torch::PackedTensorAccessor64<T, 3> allOutEmbedQuads
161
- #ifdef NMS_VERIFY_CORRECTNESS
162
- , torch::PackedTensorAccessor64<int32_t, 2> allOutIds
163
- #endif
164
- )
165
  {
166
  typedef InPlaceQuad_<T> Quadf;
167
  static_assert(sizeof(Quadf) == sizeof(T) * 8, "Invalid QuadMem size!");
@@ -306,11 +303,9 @@ void device_row_collapse(torch::PackedTensorAccessor64<T, 5> allQuads,
306
  }
307
 
308
  write_embed_quad(outEmbedQuads, outQuad, storeOff + procLabel - 1);
309
- #ifdef NMS_VERIFY_CORRECTNESS
310
  if (threadRank == 0) {
311
  allOutIds[b][storeOff + procLabel - 1] = r * 32 + startIdx;
312
  }
313
- #endif
314
  }
315
 
316
  if (threadRank == 0) {
@@ -321,9 +316,9 @@ void device_row_collapse(torch::PackedTensorAccessor64<T, 5> allQuads,
321
  #undef threadRank
322
  }
323
 
324
- template<bool IsSingleExample, typename T>
325
  __global__
326
- void device_a2a_adjacency_sparse(const uint64_t punCounts,
327
  T iouThreshold,
328
  torch::PackedTensorAccessor64<T, 3> embedQuads,
329
  torch::PackedTensorAccessor64<bool, 2> outIsStart,
@@ -332,7 +327,11 @@ void device_a2a_adjacency_sparse(const uint64_t punCounts,
332
  {
333
  const uint32_t b = blockIdx.y;
334
 
335
- const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
 
 
 
 
336
 
337
  const int32_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
338
  const int32_t row = jobIdx / quadCt;
@@ -343,7 +342,7 @@ void device_a2a_adjacency_sparse(const uint64_t punCounts,
343
  return;
344
  }
345
 
346
- T* exData = IsSingleExample ? embedQuads.data() : embedQuads[b].data();
347
 
348
  const auto qRow = StridedEmbedQuad_<T>{ exData + row * embedQuads.stride(2), embedQuads.stride(1) }.Bounds(),
349
  qCol = StridedEmbedQuad_<T>{ exData + col * embedQuads.stride(2), embedQuads.stride(1) }.Bounds();
@@ -405,9 +404,9 @@ void device_a2a_adjacency_sparse(const uint64_t punCounts,
405
  }
406
  }
407
 
408
- template<uint32_t NumWarps, bool IsSingleExample, typename T, int32_t I_CELL_SIZE>
409
  __global__
410
- void device_a2a_adjacency_build_grid(const uint64_t punCounts,
411
  torch::PackedTensorAccessor64<T, 3> embedQuads,
412
  torch::PackedTensorAccessor64<int32_t, 4> outGridCells,
413
  torch::PackedTensorAccessor64<int32_t, 3> outQuadCells)
@@ -423,10 +422,10 @@ void device_a2a_adjacency_build_grid(const uint64_t punCounts,
423
 
424
  const uint32_t b = blockIdx.z;
425
 
426
- const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
427
  const uint32_t quadIdx = blockIdx.y;
428
 
429
- if (!IsSingleExample && quadIdx >= quadCt) {
430
  return;
431
  }
432
 
@@ -485,9 +484,9 @@ void device_a2a_adjacency_build_grid(const uint64_t punCounts,
485
 
486
  typedef uint8_t visit_mask_t;
487
 
488
- template<uint32_t NumWarps, bool IsSingleExample, typename T>
489
  __global__
490
- void device_a2a_adjacency_with_grid(const uint64_t punCounts,
491
  T iouThreshold,
492
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
493
  torch::PackedTensorAccessor64<int32_t, 4> allCells,
@@ -503,10 +502,10 @@ void device_a2a_adjacency_with_grid(const uint64_t punCounts,
503
 
504
  const uint32_t b = blockIdx.z;
505
 
506
- const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
507
  const uint32_t quadIdx = blockIdx.y;
508
 
509
- if (!IsSingleExample && quadIdx >= quadCt) {
510
  return;
511
  }
512
 
@@ -535,7 +534,7 @@ void device_a2a_adjacency_with_grid(const uint64_t punCounts,
535
  auto exAdjCounts = reinterpret_cast<uint32_t*>(outAdjCounts[b].data());
536
  auto exAdjValues = outSparseAdj[b][quadIdx].data();
537
 
538
- T *exData = IsSingleExample ? allEmbedQuads.data() : allEmbedQuads[b].data();
539
 
540
  const auto bdsAnchor = Quad_<T>{ s_quadVerts }.Bounds();
541
 
@@ -599,9 +598,8 @@ void device_a2a_adjacency_with_grid(const uint64_t punCounts,
599
  }
600
  }
601
 
602
- template<bool IsSingleExample>
603
  __global__
604
- void device_flatten_graph_iterative(const uint64_t punCounts,
605
  torch::PackedTensorAccessor64<bool, 2> allIsStart,
606
  volatile uint32_t *allAdjCounts,
607
  volatile uint32_t *allAdjValues
@@ -622,14 +620,12 @@ void device_flatten_graph_iterative(const uint64_t punCounts,
622
  const uint32_t b = blockIdx.z;
623
  const uint32_t anchorRow = blockIdx.y;
624
 
625
- const uint32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
626
 
627
  // Only need to check this if there are multiple examples, since in the case of a single example,
628
  // the grid is precisely sized to that quadCt
629
- if constexpr (!IsSingleExample) {
630
- if (anchorRow >= quadCt) {
631
- return;
632
- }
633
  }
634
 
635
  auto isStart = allIsStart[b].data();
@@ -690,12 +686,13 @@ void device_flatten_graph_iterative(const uint64_t punCounts,
690
  visitStack[1] = anchorRow;
691
  #ifndef NDEBUG
692
  for (uint32_t i = 2; i < VISIT_STACK_SIZE; ++i) {
693
- visitStack[i] = -2;
694
  }
695
  #endif
696
  int32_t visitPtr = 1;
697
 
698
- while (true) {
 
699
  #ifdef NMS_VERIFY_CORRECTNESS
700
  assert(visitPtr >= 0 && visitPtr < VISIT_STACK_SIZE);
701
  #endif
@@ -707,7 +704,7 @@ void device_flatten_graph_iterative(const uint64_t punCounts,
707
  if (threadNextCol == warpNextCol) {
708
  #ifndef NDEBUG
709
  // This makes it easier to debug where the pointer is
710
- visitStack[visitPtr] = -2;
711
  #endif
712
  --visitPtr;
713
  }
@@ -731,12 +728,15 @@ void device_flatten_graph_iterative(const uint64_t punCounts,
731
  const uint32_t procAdjCount = adjCounts[procRow];
732
  auto procAdjValues = adjValues + (procRow * maxExCount);
733
 
734
- // Offsetting by the iteration number will help balance out the maximum depth of any stack in the warp.
735
- // The reason behind this is due to how otherwise, warp-0 will always get a new element, warp-1 iff the adj graph
736
- // has more than one element, warp-2 iff the adj graph has more than two elements, and so on. Basically,
737
- // the warps have decreasing pressure. With the rotation mechanism, it helps to balance out stack usage.
738
  for (uint32_t i = threadRank; i < procAdjCount; i += WARP_SIZE) {
739
- const uint32_t adjCol = procAdjValues[i];
 
 
 
 
 
 
 
740
 
741
  // This will set the queued flag for this column, if it's not already set.
742
  // It also returns the old state. In our case, we only want to add this value to the
@@ -748,7 +748,6 @@ void device_flatten_graph_iterative(const uint64_t punCounts,
748
 
749
  bool alreadyAdded = oldMask & ADDED_MASK;
750
 
751
- auto group = cg::coalesced_threads();
752
  const uint32_t gThreadRank = group.thread_rank();
753
  uint32_t notAddedBallot = group.ballot(!alreadyAdded);
754
  if (notAddedBallot) {
@@ -825,8 +824,7 @@ void add_to_set(const torch::TensorAccessor<int32_t, 1>& adjCounts,
825
  }
826
  }
827
 
828
- template<bool IsSingleExample>
829
- void cpu_flatten_graph(const uint64_t punCounts,
830
  torch::Tensor isStartTensorGPU,
831
  torch::Tensor adjCountsTensorGPU,
832
  torch::Tensor adjValuesTensorGPU)
@@ -840,7 +838,7 @@ void cpu_flatten_graph(const uint64_t punCounts,
840
  auto allAdjValues = adjValuesTensor.accessor<int32_t, 3>();
841
 
842
  for (int32_t b = 0; b < allAdjCounts.size(0); ++b) {
843
- const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
844
 
845
  for (int32_t row = 0; row < quadCt; ++row) {
846
  std::unordered_set<int32_t> fullAdjSet;
@@ -895,9 +893,9 @@ void device_a2a_adj_cleanup(const int32_t *counts,
895
  }
896
  }
897
 
898
- template<uint32_t NumWarps, typename T, bool IsSingleExample>
899
  __global__
900
- void device_a2a_collapse(const uint64_t punCounts,
901
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
902
  torch::PackedTensorAccessor64<bool, 2> allIsLeadRow,
903
  const int64_t *regionCounts,
@@ -917,16 +915,14 @@ void device_a2a_collapse(const uint64_t punCounts,
917
  const uint32_t b = blockIdx.z;
918
  const uint32_t row = blockIdx.y;
919
 
920
- const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
921
 
922
- if constexpr (!IsSingleExample) {
923
- if (row >= quadCt) {
924
- return;
925
- }
926
  }
927
 
928
  // Only process the lead rows
929
- const auto isLeadRow = IsSingleExample ? allIsLeadRow.data() : allIsLeadRow[b].data();
930
  if (!isLeadRow[row]) {
931
  return;
932
  }
@@ -945,7 +941,7 @@ void device_a2a_collapse(const uint64_t punCounts,
945
  __syncthreads();
946
  }
947
 
948
- T *exData = IsSingleExample ? allEmbedQuads.data() : allEmbedQuads[b].data();
949
 
950
  const int32_t adjCount = allAdjCounts[b][row];
951
  const int32_t *adjIdxs = allAdjValues[b][row].data();
@@ -986,20 +982,12 @@ void device_a2a_collapse(const uint64_t punCounts,
986
 
987
  // Figure out the output position
988
  uint32_t writePosition = 0;
989
- if constexpr (!IsSingleExample) {
990
- for (int32_t i = threadRank; i < b; i += BLOCK_WIDTH) {
991
- writePosition += regionCounts[i];
992
- }
993
  }
994
 
995
- const int32_t numLongs = row >> 3; // Divide by 8
996
  const uint8_t *pCurrIsLeadRow = reinterpret_cast<const uint8_t*>(isLeadRow);
997
- const uint64_t *lpCurrIsLeadRow = reinterpret_cast<const uint64_t*>(pCurrIsLeadRow);
998
-
999
- for (int32_t i = threadRank; i < numLongs; i += BLOCK_WIDTH) {
1000
- writePosition += __popcll(lpCurrIsLeadRow[i]);
1001
- }
1002
- for (int32_t i = (numLongs * 8) + threadRank; i < row; i += BLOCK_WIDTH) {
1003
  if (pCurrIsLeadRow[i]) {
1004
  ++writePosition;
1005
  }
@@ -1075,13 +1063,9 @@ CollapseRowsResult collapse_rows(
1075
  int64_t embedSize = sizeof(EmbedQuad_<scalar_t>) / sizeof(scalar_t);
1076
  auto rowMergeTensor = torch::empty({ quads.size(0), embedSize, quads.size(1) * quads.size(2) }, quads.options());
1077
 
1078
- #ifdef NMS_VERIFY_CORRECTNESS
1079
  auto idsTensor = torch::full({ quads.size(0), quads.size(1) * quads.size(2) },
1080
  std::numeric_limits<int32_t>::max(),
1081
  counts.options().dtype(torch::kInt32));
1082
- #else
1083
- torch::Tensor idsTensor;
1084
- #endif
1085
 
1086
  dim3 blockSize(32, 3, 1);
1087
  dim3 gridSize(1,
@@ -1093,10 +1077,8 @@ CollapseRowsResult collapse_rows(
1093
  probs.packed_accessor64<scalar_t, 3>(),
1094
  probThreshold, iouThreshold,
1095
  counts.packed_accessor64<int32_t, 1>(),
1096
- rowMergeTensor.packed_accessor64<scalar_t, 3>()
1097
- #ifdef NMS_VERIFY_CORRECTNESS
1098
- , idsTensor.packed_accessor64<int32_t, 2>()
1099
- #endif
1100
  );
1101
 
1102
  #ifdef NMS_VERIFY_CORRECTNESS
@@ -1119,7 +1101,6 @@ CollapseRowsResult collapse_rows(
1119
 
1120
  counts = counts.slice(/*dim=*/ 0, 0, counts.size(0) - 1);
1121
 
1122
- #ifdef NMS_VERIFY_CORRECTNESS
1123
  int64_t maxExCount;
1124
  if (counts.size(0) > 1) {
1125
  maxExCount = counts.max().item<int32_t>();
@@ -1131,13 +1112,12 @@ CollapseRowsResult collapse_rows(
1131
 
1132
  rowMergeTensor = rowMergeTensor.slice(2, 0, maxExCount);
1133
  idsTensor = idsTensor.slice(1, 0, maxExCount);
1134
- auto order = torch::argsort(idsTensor, /*dim=*/ 1, s_sortOrder); s_sortOrder = !s_sortOrder;
1135
 
1136
  auto embOrder = order.unsqueeze(1).expand_as(rowMergeTensor);
1137
 
1138
  rowMergeTensor = torch::gather(rowMergeTensor, /*dim=*/ 2, embOrder);
1139
  idsTensor = torch::gather(idsTensor, /*dim=*/ 1, order);
1140
- #endif
1141
 
1142
  return { counts, rowMergeTensor, totalQuads, idsTensor, imageWidth, imageHeight };
1143
  }
@@ -1177,8 +1157,8 @@ struct AdjacencyResult {
1177
  int64_t MaxExCount;
1178
  };
1179
 
1180
- template<bool IsSingleExample, typename T>
1181
- void cpu_a2a_adjacency_sparse(const uint64_t punCounts,
1182
  const T iouThreshold,
1183
  torch::Tensor embedQuadsTensor,
1184
  torch::Tensor outIsStartTensorGPU,
@@ -1196,7 +1176,7 @@ void cpu_a2a_adjacency_sparse(const uint64_t punCounts,
1196
  auto adjValues = outSparseAdjTensor.accessor<int32_t, 3>();
1197
 
1198
  for (int32_t b = 0; b < embedQuadsTensor.size(0); ++b) {
1199
- const int32_t quadCt = IsSingleExample ? punCounts : reinterpret_cast<const int32_t*>(punCounts)[b];
1200
 
1201
  T *exData = embedQuads[b].data();
1202
 
@@ -1284,13 +1264,6 @@ AdjacencyResult compute_all_to_all_adjacency(
1284
  counts.options().dtype(torch::kInt32));
1285
  #endif
1286
 
1287
- // If the batch is only a single example, instead of hitting global memory for the count, we can
1288
- // just encode the count into the pointer instead
1289
- uint64_t ptrCounts = reinterpret_cast<uint64_t>(counts.data_ptr<int32_t>());
1290
- if (counts.size(0) == 1) {
1291
- ptrCounts = maxExCount;
1292
- }
1293
-
1294
  #ifdef NMS_VERIFY_CORRECTNESS
1295
  auto cpuAdjValuesTensor = adjValuesTensor.cpu();
1296
  auto cpuAdjCountsTensor = adjCountsTensor.cpu();
@@ -1318,23 +1291,15 @@ AdjacencyResult compute_all_to_all_adjacency(
1318
  //blockSize = dim3{ GRID_NUM_WARPS * 32, 1, 1 };
1319
  //gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1320
 
1321
- //auto buildGridFn = counts.size(0) == 1 ?
1322
- // device_a2a_adjacency_build_grid<GRID_NUM_WARPS, true, scalar_t, CELL_SIZE> :
1323
- // device_a2a_adjacency_build_grid<GRID_NUM_WARPS, false, scalar_t, CELL_SIZE>;
1324
-
1325
- //buildGridFn KERNEL_ARG2(gridSize, blockSize) (
1326
- // ptrCounts,
1327
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1328
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
1329
  // quadCellExtentsTensor.packed_accessor64<int32_t, 3>()
1330
  //);
1331
 
1332
- //auto adjGridFn = counts.size(0) == 1 ?
1333
- // device_a2a_adjacency_with_grid<GRID_NUM_WARPS, true, scalar_t> :
1334
- // device_a2a_adjacency_with_grid<GRID_NUM_WARPS, false, scalar_t>;
1335
-
1336
- //adjGridFn KERNEL_ARG3(gridSize, blockSize, smemSize) (
1337
- // ptrCounts,
1338
  // iouThreshold,
1339
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1340
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
@@ -1351,11 +1316,9 @@ AdjacencyResult compute_all_to_all_adjacency(
1351
  gridSize = dim3{div_up(totalWork, blockSize.x),
1352
  static_cast<uint32_t>(counts.size(0))};
1353
 
1354
- auto adjFn = counts.size(0) == 1 ? device_a2a_adjacency_sparse<true, scalar_t> : device_a2a_adjacency_sparse<false, scalar_t>;
1355
-
1356
  // This algorithm is O(n^2) with n being the current number of quads
1357
- adjFn KERNEL_ARG2(gridSize, blockSize) (
1358
- ptrCounts,
1359
  iouThreshold,
1360
  collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1361
  isStartTensor.packed_accessor64<bool, 2>(),
@@ -1365,7 +1328,9 @@ AdjacencyResult compute_all_to_all_adjacency(
1365
 
1366
 
1367
  #ifdef NMS_VERIFY_CORRECTNESS
1368
- cpu_a2a_adjacency_sparse<true>(ptrCounts, iouThreshold,
 
 
1369
  collapseResult.StridedMergeQuads, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1370
 
1371
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
@@ -1380,16 +1345,12 @@ AdjacencyResult compute_all_to_all_adjacency(
1380
  auto maxDepthTensor = torch::tensor(0, adjCountsTensor.options());
1381
  #endif
1382
 
1383
- auto traverseFn = counts.size(0) == 1 ?
1384
- device_flatten_graph_iterative<true> :
1385
- device_flatten_graph_iterative<false>;
1386
-
1387
  blockSize = dim3{ 128, 1, 1 };
1388
  gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1389
  smemSize = div_up(maxExCount * sizeof(visit_mask_t), sizeof(uint32_t)) * sizeof(uint32_t);
1390
 
1391
- traverseFn KERNEL_ARG3(gridSize, blockSize, smemSize) (
1392
- ptrCounts,
1393
  isStartTensor.packed_accessor64<bool, 2>(),
1394
  reinterpret_cast<uint32_t*>(adjCountsTensor.data_ptr<int32_t>()),
1395
  reinterpret_cast<uint32_t*>(adjValuesTensor.data_ptr<int32_t>())
@@ -1399,7 +1360,7 @@ AdjacencyResult compute_all_to_all_adjacency(
1399
  );
1400
 
1401
  #ifdef NMS_VERIFY_CORRECTNESS
1402
- cpu_flatten_graph<true>(ptrCounts, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1403
 
1404
  cpuAdjValuesTensor = std::get<0>(torch::sort(cpuAdjValuesTensor, /*dim=*/ 2));
1405
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
@@ -1437,7 +1398,6 @@ AdjacencyResult compute_all_to_all_adjacency(
1437
  cpuIsStartTensor = isStartTensor.cpu();
1438
  cpuAdjCountsTensor = adjCountsTensor.cpu();
1439
  cpuAdjValuesTensor = adjValuesTensor.cpu();
1440
- auto cpuCounts = counts.cpu();
1441
  auto cpuCollapseIds = collapseResult.QuadIds.cpu();
1442
 
1443
  static std::vector<std::unordered_set<int32_t>> s_knownGroups;
@@ -1589,22 +1549,11 @@ nms_result_t
1589
  dim3 blockSize(NUM_WARPS * 32, 1, 1);
1590
  dim3 gridSize(1, adjResult.MaxExCount, counts.size(0));
1591
 
1592
- // If the batch is only a single example, instead of hitting global memory for the count, we can
1593
- // just encode the count into the pointer instead
1594
- uint64_t ptrCounts = reinterpret_cast<uint64_t>(counts.data_ptr<int32_t>());
1595
- if (counts.size(0) == 1) {
1596
- ptrCounts = adjResult.MaxExCount;
1597
- }
1598
-
1599
  torch::Tensor outQuads = torch::empty({ numOutQuads, 4, 2 }, embedQuads.options());
1600
  torch::Tensor outConf = torch::empty({ numOutQuads }, embedQuads.options());
1601
 
1602
- auto collapseFn = counts.size(0) == 1 ?
1603
- device_a2a_collapse<NUM_WARPS, scalar_t, true> :
1604
- device_a2a_collapse<NUM_WARPS, scalar_t, false>;
1605
-
1606
- collapseFn KERNEL_ARG2(gridSize, blockSize) (
1607
- ptrCounts,
1608
  embedQuads.packed_accessor64<scalar_t, 3>(),
1609
  isLeadRow.packed_accessor64<bool, 2>(),
1610
  regionCounts.data_ptr<int64_t>(),
 
157
  torch::PackedTensorAccessor64<T, 3> allConfs,
158
  T confThreshold, T iouThreshold,
159
  torch::PackedTensorAccessor64<int32_t, 1> allOutCounts,
160
+ torch::PackedTensorAccessor64<T, 3> allOutEmbedQuads,
161
+ torch::PackedTensorAccessor64<int32_t, 2> allOutIds)
 
 
 
162
  {
163
  typedef InPlaceQuad_<T> Quadf;
164
  static_assert(sizeof(Quadf) == sizeof(T) * 8, "Invalid QuadMem size!");
 
303
  }
304
 
305
  write_embed_quad(outEmbedQuads, outQuad, storeOff + procLabel - 1);
 
306
  if (threadRank == 0) {
307
  allOutIds[b][storeOff + procLabel - 1] = r * 32 + startIdx;
308
  }
 
309
  }
310
 
311
  if (threadRank == 0) {
 
316
  #undef threadRank
317
  }
318
 
319
+ template<typename T>
320
  __global__
321
+ void device_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
322
  T iouThreshold,
323
  torch::PackedTensorAccessor64<T, 3> embedQuads,
324
  torch::PackedTensorAccessor64<bool, 2> outIsStart,
 
327
  {
328
  const uint32_t b = blockIdx.y;
329
 
330
+ const int32_t quadCt = ptrQuadCts[b];
331
+
332
+ if (quadCt == 0) {
333
+ return;
334
+ }
335
 
336
  const int32_t jobIdx = blockIdx.x * blockDim.x + threadIdx.x;
337
  const int32_t row = jobIdx / quadCt;
 
342
  return;
343
  }
344
 
345
+ T* exData = embedQuads[b].data();
346
 
347
  const auto qRow = StridedEmbedQuad_<T>{ exData + row * embedQuads.stride(2), embedQuads.stride(1) }.Bounds(),
348
  qCol = StridedEmbedQuad_<T>{ exData + col * embedQuads.stride(2), embedQuads.stride(1) }.Bounds();
 
404
  }
405
  }
406
 
407
+ template<uint32_t NumWarps, typename T, int32_t I_CELL_SIZE>
408
  __global__
409
+ void device_a2a_adjacency_build_grid(const int32_t *ptrQuadCts,
410
  torch::PackedTensorAccessor64<T, 3> embedQuads,
411
  torch::PackedTensorAccessor64<int32_t, 4> outGridCells,
412
  torch::PackedTensorAccessor64<int32_t, 3> outQuadCells)
 
422
 
423
  const uint32_t b = blockIdx.z;
424
 
425
+ const uint32_t quadCt = ptrQuadCts[b];
426
  const uint32_t quadIdx = blockIdx.y;
427
 
428
+ if (quadIdx >= quadCt) {
429
  return;
430
  }
431
 
 
484
 
485
  typedef uint8_t visit_mask_t;
486
 
487
+ template<uint32_t NumWarps, typename T>
488
  __global__
489
+ void device_a2a_adjacency_with_grid(const int32_t *ptrQuadCts,
490
  T iouThreshold,
491
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
492
  torch::PackedTensorAccessor64<int32_t, 4> allCells,
 
502
 
503
  const uint32_t b = blockIdx.z;
504
 
505
+ const uint32_t quadCt = ptrQuadCts[b];
506
  const uint32_t quadIdx = blockIdx.y;
507
 
508
+ if (quadIdx >= quadCt) {
509
  return;
510
  }
511
 
 
534
  auto exAdjCounts = reinterpret_cast<uint32_t*>(outAdjCounts[b].data());
535
  auto exAdjValues = outSparseAdj[b][quadIdx].data();
536
 
537
+ T *exData = allEmbedQuads[b].data();
538
 
539
  const auto bdsAnchor = Quad_<T>{ s_quadVerts }.Bounds();
540
 
 
598
  }
599
  }
600
 
 
601
  __global__
602
+ void device_flatten_graph_iterative(const int32_t *ptrQuadCts,
603
  torch::PackedTensorAccessor64<bool, 2> allIsStart,
604
  volatile uint32_t *allAdjCounts,
605
  volatile uint32_t *allAdjValues
 
620
  const uint32_t b = blockIdx.z;
621
  const uint32_t anchorRow = blockIdx.y;
622
 
623
+ const uint32_t quadCt = ptrQuadCts[b];
624
 
625
  // Only need to check this if there are multiple examples, since in the case of a single example,
626
  // the grid is precisely sized to that quadCt
627
+ if (anchorRow >= quadCt) {
628
+ return;
 
 
629
  }
630
 
631
  auto isStart = allIsStart[b].data();
 
686
  visitStack[1] = anchorRow;
687
  #ifndef NDEBUG
688
  for (uint32_t i = 2; i < VISIT_STACK_SIZE; ++i) {
689
+ visitStack[i] = TERM_VALUE;
690
  }
691
  #endif
692
  int32_t visitPtr = 1;
693
 
694
+ // NOTE: This loop is actually terminated by the `if (warpNextCol == TERM_VALUE)` check below
695
+ for (uint32_t dfsIter = 0; true; ++dfsIter) {
696
  #ifdef NMS_VERIFY_CORRECTNESS
697
  assert(visitPtr >= 0 && visitPtr < VISIT_STACK_SIZE);
698
  #endif
 
704
  if (threadNextCol == warpNextCol) {
705
  #ifndef NDEBUG
706
  // This makes it easier to debug where the pointer is
707
+ visitStack[visitPtr] = TERM_VALUE;
708
  #endif
709
  --visitPtr;
710
  }
 
728
  const uint32_t procAdjCount = adjCounts[procRow];
729
  auto procAdjValues = adjValues + (procRow * maxExCount);
730
 
 
 
 
 
731
  for (uint32_t i = threadRank; i < procAdjCount; i += WARP_SIZE) {
732
+ uint32_t adjCol = procAdjValues[i];
733
+
734
+ auto group = cg::coalesced_threads();
735
+ // Offsetting by the iteration number will help balance out the maximum depth of any stack in the warp.
736
+ // The reason behind this is due to how otherwise, warp-0 will always get a new element, warp-1 iff the adj graph
737
+ // has more than one element, warp-2 iff the adj graph has more than two elements, and so on. Basically,
738
+ // the warps have decreasing pressure. With the rotation mechanism, it helps to balance out stack usage.
739
+ adjCol = group.shfl(adjCol, (group.thread_rank() + dfsIter) % group.size());
740
 
741
  // This will set the queued flag for this column, if it's not already set.
742
  // It also returns the old state. In our case, we only want to add this value to the
 
748
 
749
  bool alreadyAdded = oldMask & ADDED_MASK;
750
 
 
751
  const uint32_t gThreadRank = group.thread_rank();
752
  uint32_t notAddedBallot = group.ballot(!alreadyAdded);
753
  if (notAddedBallot) {
 
824
  }
825
  }
826
 
827
+ void cpu_flatten_graph(const int32_t *ptrQuadCts,
 
828
  torch::Tensor isStartTensorGPU,
829
  torch::Tensor adjCountsTensorGPU,
830
  torch::Tensor adjValuesTensorGPU)
 
838
  auto allAdjValues = adjValuesTensor.accessor<int32_t, 3>();
839
 
840
  for (int32_t b = 0; b < allAdjCounts.size(0); ++b) {
841
+ const int32_t quadCt = ptrQuadCts[b];
842
 
843
  for (int32_t row = 0; row < quadCt; ++row) {
844
  std::unordered_set<int32_t> fullAdjSet;
 
893
  }
894
  }
895
 
896
+ template<uint32_t NumWarps, typename T>
897
  __global__
898
+ void device_a2a_collapse(torch::PackedTensorAccessor64<int32_t, 1> quadCounts,
899
  torch::PackedTensorAccessor64<T, 3> allEmbedQuads,
900
  torch::PackedTensorAccessor64<bool, 2> allIsLeadRow,
901
  const int64_t *regionCounts,
 
915
  const uint32_t b = blockIdx.z;
916
  const uint32_t row = blockIdx.y;
917
 
918
+ const int32_t quadCt = quadCounts[b];
919
 
920
+ if (row >= quadCt) {
921
+ return;
 
 
922
  }
923
 
924
  // Only process the lead rows
925
+ const auto isLeadRow = allIsLeadRow[b].data();
926
  if (!isLeadRow[row]) {
927
  return;
928
  }
 
941
  __syncthreads();
942
  }
943
 
944
+ T *exData = allEmbedQuads[b].data();
945
 
946
  const int32_t adjCount = allAdjCounts[b][row];
947
  const int32_t *adjIdxs = allAdjValues[b][row].data();
 
982
 
983
  // Figure out the output position
984
  uint32_t writePosition = 0;
985
+ for (int32_t i = threadRank; i < b; i += BLOCK_WIDTH) {
986
+ writePosition += regionCounts[i];
 
 
987
  }
988
 
 
989
  const uint8_t *pCurrIsLeadRow = reinterpret_cast<const uint8_t*>(isLeadRow);
990
+ for (int32_t i = threadRank; i < row; i += BLOCK_WIDTH) {
 
 
 
 
 
991
  if (pCurrIsLeadRow[i]) {
992
  ++writePosition;
993
  }
 
1063
  int64_t embedSize = sizeof(EmbedQuad_<scalar_t>) / sizeof(scalar_t);
1064
  auto rowMergeTensor = torch::empty({ quads.size(0), embedSize, quads.size(1) * quads.size(2) }, quads.options());
1065
 
 
1066
  auto idsTensor = torch::full({ quads.size(0), quads.size(1) * quads.size(2) },
1067
  std::numeric_limits<int32_t>::max(),
1068
  counts.options().dtype(torch::kInt32));
 
 
 
1069
 
1070
  dim3 blockSize(32, 3, 1);
1071
  dim3 gridSize(1,
 
1077
  probs.packed_accessor64<scalar_t, 3>(),
1078
  probThreshold, iouThreshold,
1079
  counts.packed_accessor64<int32_t, 1>(),
1080
+ rowMergeTensor.packed_accessor64<scalar_t, 3>(),
1081
+ idsTensor.packed_accessor64<int32_t, 2>()
 
 
1082
  );
1083
 
1084
  #ifdef NMS_VERIFY_CORRECTNESS
 
1101
 
1102
  counts = counts.slice(/*dim=*/ 0, 0, counts.size(0) - 1);
1103
 
 
1104
  int64_t maxExCount;
1105
  if (counts.size(0) > 1) {
1106
  maxExCount = counts.max().item<int32_t>();
 
1112
 
1113
  rowMergeTensor = rowMergeTensor.slice(2, 0, maxExCount);
1114
  idsTensor = idsTensor.slice(1, 0, maxExCount);
1115
+ auto order = torch::argsort(idsTensor, /*dim=*/ 1, s_sortOrder);
1116
 
1117
  auto embOrder = order.unsqueeze(1).expand_as(rowMergeTensor);
1118
 
1119
  rowMergeTensor = torch::gather(rowMergeTensor, /*dim=*/ 2, embOrder);
1120
  idsTensor = torch::gather(idsTensor, /*dim=*/ 1, order);
 
1121
 
1122
  return { counts, rowMergeTensor, totalQuads, idsTensor, imageWidth, imageHeight };
1123
  }
 
1157
  int64_t MaxExCount;
1158
  };
1159
 
1160
+ template<typename T>
1161
+ void cpu_a2a_adjacency_sparse(const int32_t *ptrQuadCts,
1162
  const T iouThreshold,
1163
  torch::Tensor embedQuadsTensor,
1164
  torch::Tensor outIsStartTensorGPU,
 
1176
  auto adjValues = outSparseAdjTensor.accessor<int32_t, 3>();
1177
 
1178
  for (int32_t b = 0; b < embedQuadsTensor.size(0); ++b) {
1179
+ const int32_t quadCt = ptrQuadCts[b];
1180
 
1181
  T *exData = embedQuads[b].data();
1182
 
 
1264
  counts.options().dtype(torch::kInt32));
1265
  #endif
1266
 
 
 
 
 
 
 
 
1267
  #ifdef NMS_VERIFY_CORRECTNESS
1268
  auto cpuAdjValuesTensor = adjValuesTensor.cpu();
1269
  auto cpuAdjCountsTensor = adjCountsTensor.cpu();
 
1291
  //blockSize = dim3{ GRID_NUM_WARPS * 32, 1, 1 };
1292
  //gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1293
 
1294
+ //device_a2a_adjacency_build_grid<GRID_NUM_WARPS, scalar_t, CELL_SIZE> KERNEL_ARG2(gridSize, blockSize) (
1295
+ // counts.data_ptr<int32_t>(),
 
 
 
 
1296
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1297
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
1298
  // quadCellExtentsTensor.packed_accessor64<int32_t, 3>()
1299
  //);
1300
 
1301
+ //device_a2a_adjacency_with_grid<GRID_NUM_WARPS, scalar_t> KERNEL_ARG3(gridSize, blockSize, smemSize) (
1302
+ // counts.data_ptr<int32_t>(),
 
 
 
 
1303
  // iouThreshold,
1304
  // collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1305
  // gridCellsTensor.packed_accessor64<int32_t, 4>(),
 
1316
  gridSize = dim3{div_up(totalWork, blockSize.x),
1317
  static_cast<uint32_t>(counts.size(0))};
1318
 
 
 
1319
  // This algorithm is O(n^2) with n being the current number of quads
1320
+ device_a2a_adjacency_sparse<scalar_t> KERNEL_ARG2(gridSize, blockSize) (
1321
+ counts.data_ptr<int32_t>(),
1322
  iouThreshold,
1323
  collapseResult.StridedMergeQuads.packed_accessor64<scalar_t, 3>(),
1324
  isStartTensor.packed_accessor64<bool, 2>(),
 
1328
 
1329
 
1330
  #ifdef NMS_VERIFY_CORRECTNESS
1331
+ auto cpuCounts = counts.cpu();
1332
+
1333
+ cpu_a2a_adjacency_sparse<scalar_t>(cpuCounts.data_ptr<int32_t>(), iouThreshold,
1334
  collapseResult.StridedMergeQuads, cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1335
 
1336
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
 
1345
  auto maxDepthTensor = torch::tensor(0, adjCountsTensor.options());
1346
  #endif
1347
 
 
 
 
 
1348
  blockSize = dim3{ 128, 1, 1 };
1349
  gridSize = dim3{ 1, static_cast<uint32_t>(maxExCount), static_cast<uint32_t>(counts.size(0)) };
1350
  smemSize = div_up(maxExCount * sizeof(visit_mask_t), sizeof(uint32_t)) * sizeof(uint32_t);
1351
 
1352
+ device_flatten_graph_iterative KERNEL_ARG3(gridSize, blockSize, smemSize) (
1353
+ counts.data_ptr<int32_t>(),
1354
  isStartTensor.packed_accessor64<bool, 2>(),
1355
  reinterpret_cast<uint32_t*>(adjCountsTensor.data_ptr<int32_t>()),
1356
  reinterpret_cast<uint32_t*>(adjValuesTensor.data_ptr<int32_t>())
 
1360
  );
1361
 
1362
  #ifdef NMS_VERIFY_CORRECTNESS
1363
+ cpu_flatten_graph(cpuCounts.data_ptr<int32_t>(), cpuIsStartTensor, cpuAdjCountsTensor, cpuAdjValuesTensor);
1364
 
1365
  cpuAdjValuesTensor = std::get<0>(torch::sort(cpuAdjValuesTensor, /*dim=*/ 2));
1366
  adjValuesTensor = std::get<0>(torch::sort(adjValuesTensor, /*dim=*/ 2));
 
1398
  cpuIsStartTensor = isStartTensor.cpu();
1399
  cpuAdjCountsTensor = adjCountsTensor.cpu();
1400
  cpuAdjValuesTensor = adjValuesTensor.cpu();
 
1401
  auto cpuCollapseIds = collapseResult.QuadIds.cpu();
1402
 
1403
  static std::vector<std::unordered_set<int32_t>> s_knownGroups;
 
1549
  dim3 blockSize(NUM_WARPS * 32, 1, 1);
1550
  dim3 gridSize(1, adjResult.MaxExCount, counts.size(0));
1551
 
 
 
 
 
 
 
 
1552
  torch::Tensor outQuads = torch::empty({ numOutQuads, 4, 2 }, embedQuads.options());
1553
  torch::Tensor outConf = torch::empty({ numOutQuads }, embedQuads.options());
1554
 
1555
+ device_a2a_collapse<NUM_WARPS, scalar_t> KERNEL_ARG2(gridSize, blockSize) (
1556
+ counts.packed_accessor64<int32_t, 1>(),
 
 
 
 
1557
  embedQuads.packed_accessor64<scalar_t, 3>(),
1558
  isLeadRow.packed_accessor64<bool, 2>(),
1559
  regionCounts.data_ptr<int64_t>(),
nemotron-ocr/src/nemotron_ocr/inference/pipeline.py CHANGED
@@ -181,29 +181,17 @@ class NemotronOCR:
181
  e2e_det_conf = torch.sigmoid(det_conf)
182
  e2e_det_coords = rrect_to_quads(det_rboxes.float(), DETECTOR_DOWNSAMPLE)
183
 
184
- # FIXME: quad_non_maximal_suppression fails with batch size > 1
185
- all_quads = []
186
- all_confidence = []
187
- all_region_counts = []
188
-
189
- for idx in range(e2e_det_coords.shape[0]):
190
- quads, confidence, region_counts = quad_non_maximal_suppression(
191
- e2e_det_coords[idx].unsqueeze(0),
192
- e2e_det_conf[idx].unsqueeze(0),
193
- prob_threshold=NMS_PROB_THRESHOLD,
194
- iou_threshold=NMS_IOU_THRESHOLD,
195
- kernel_height=2,
196
- kernel_width=3,
197
- max_regions=NMS_MAX_REGIONS,
198
- verbose=False,
199
- )[:3]
200
- all_quads.append(quads)
201
- all_confidence.append(confidence)
202
- all_region_counts.append(region_counts)
203
-
204
- quads = torch.cat(all_quads, dim=0)
205
- confidence = torch.cat(all_confidence, dim=0)
206
- region_counts = torch.cat(all_region_counts, dim=0)
207
 
208
  if quads.shape[0] == 0:
209
  rec_rectified_quads = torch.empty(0, 128, 8, 32, dtype=torch.float32, device=padded_image.device)
 
181
  e2e_det_conf = torch.sigmoid(det_conf)
182
  e2e_det_coords = rrect_to_quads(det_rboxes.float(), DETECTOR_DOWNSAMPLE)
183
 
184
+ quads, confidence, region_counts = quad_non_maximal_suppression(
185
+ e2e_det_coords[idx].unsqueeze(0),
186
+ e2e_det_conf[idx].unsqueeze(0),
187
+ prob_threshold=NMS_PROB_THRESHOLD,
188
+ iou_threshold=NMS_IOU_THRESHOLD,
189
+ kernel_height=2,
190
+ kernel_width=3,
191
+ max_regions=NMS_MAX_REGIONS,
192
+ verbose=False,
193
+ )[:3]
194
+
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  if quads.shape[0] == 0:
197
  rec_rectified_quads = torch.empty(0, 128, 8, 32, dtype=torch.float32, device=padded_image.device)