/* * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "sampleUtils.h" #include "bfloat16.h" #include "common.h" #include "half.h" #include #include #if CUDA_VERSION >= 11060 #include #endif using namespace nvinfer1; namespace sample { int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch) { int64_t maxNbElems = 1; for (int32_t i = 0; i < dims.nbDims; ++i) { // Get effective length of axis. int64_t d = dims.d[i]; // Any dimension is 0, it is an empty tensor. if (d == 0) { return 0; } if (i == vecDim) { d = samplesCommon::divUp(d, comps); } maxNbElems = std::max(maxNbElems, d * strides.d[i]); } return maxNbElems * batch * (vecDim < 0 ? 1 : comps); } nvinfer1::Dims toDims(std::vector const& vec) { int32_t limit = static_cast(nvinfer1::Dims::MAX_DIMS); if (static_cast(vec.size()) > limit) { sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl; } // Pick first nvinfer1::Dims::MAX_DIMS elements nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); return dims; } void loadFromFile(std::string const& fileName, char* dst, size_t size) { ASSERT(dst); std::ifstream file(fileName, std::ios::in | std::ios::binary); if (file.is_open()) { file.seekg(0, std::ios::end); int64_t fileSize = static_cast(file.tellg()); // Due to change from int32_t to int64_t VC engines created with earlier versions // may expect input of the half of the size if (fileSize != static_cast(size) && fileSize != static_cast(size * 2)) { std::ostringstream msg; msg << "Unexpected file size for input file: " << fileName << ". Note: Input binding size is: " << size << " bytes but the file size is " << fileSize << " bytes. Double check the size and datatype of the provided data."; throw std::invalid_argument(msg.str()); } // Move file pointer back to the beginning after reading file size. file.seekg(0, std::ios::beg); file.read(dst, size); size_t const nbBytesRead = file.gcount(); file.close(); if (nbBytesRead != size) { std::ostringstream msg; msg << "Unexpected file size for input file: " << fileName << ". Note: Expected: " << size << " bytes but only read: " << nbBytesRead << " bytes"; throw std::invalid_argument(msg.str()); } } else { std::ostringstream msg; msg << "Cannot open file " << fileName << "!"; throw std::invalid_argument(msg.str()); } } std::vector splitToStringVec(std::string const& s, char separator, int64_t maxSplit) { std::vector splitted; for (size_t start = 0; start < s.length();) { // If maxSplit is specified and we have reached maxSplit, emplace back the rest of the string and break the // loop. if (maxSplit >= 0 && static_cast(splitted.size()) == maxSplit) { splitted.emplace_back(s.substr(start, s.length() - start)); break; } size_t separatorIndex = s.find(separator, start); if (separatorIndex == std::string::npos) { separatorIndex = s.length(); } splitted.emplace_back(s.substr(start, separatorIndex - start)); // If the separator is the last character, then we should push an empty string at the end. if (separatorIndex == s.length() - 1) { splitted.emplace_back(""); } start = separatorIndex + 1; } return splitted; } bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput /*= true*/) { bool broadcast = formats.size() == 1; bool validFormatsCount = broadcast || (formats.size() == nbBindings); if (!formats.empty() && !validFormatsCount) { if (isInput) { throw std::invalid_argument( "The number of inputIOFormats must match network's inputs or be one for broadcasting."); } throw std::invalid_argument( "The number of outputIOFormats must match network's outputs or be one for broadcasting."); } return broadcast; } void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) { using TensorToLayer = std::unordered_map; using LayerToTensor = std::unordered_map; // 1. Collect layers and tensors information from the network. TensorToLayer matmulI2L; TensorToLayer constO2L; TensorToLayer shuffleI2L; LayerToTensor shuffleL2O; auto collectMappingInfo = [&](int32_t const idx) { ILayer* l = network.getLayer(idx); switch (l->getType()) { case nvinfer1::LayerType::kMATRIX_MULTIPLY: { // assume weights on the second input. matmulI2L.insert({l->getInput(1), l}); break; } case nvinfer1::LayerType::kCONSTANT: { DataType const dtype = static_cast(l)->getWeights().type; if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) { // Sparsify float only. constO2L.insert({l->getOutput(0), l}); } break; } case nvinfer1::LayerType::kSHUFFLE: { shuffleI2L.insert({l->getInput(0), l}); shuffleL2O.insert({l, l->getOutput(0)}); break; } default: break; } }; int32_t const nbLayers = network.getNbLayers(); for (int32_t i = 0; i < nbLayers; ++i) { collectMappingInfo(i); } if (matmulI2L.size() == 0 || constO2L.size() == 0) { // No MatrixMultiply or Constant layer found, no weights to sparsify. return; } // Helper for analysis auto isTranspose = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool { for (int32_t i = 0; i < dims.nbDims; ++i) { if (dims.d[i] != i || dims.d[i] != -1) { return false; } } return true; }; auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor* { while (shuffleI2L.find(t) != shuffleI2L.end()) { nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) || !isIdenticalReshape(s->getReshapeDimensions())) { break; } if (isTranspose(s->getFirstTranspose())) { needTranspose = !needTranspose; } if (isTranspose(s->getSecondTranspose())) { needTranspose = !needTranspose; } t = shuffleL2O.at(s); } return t; }; // 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose std::unordered_map constantLayerToSparse; for (auto& o2l : constO2L) { // If need to transpose the weights of the Constant layer. // Need to transpose by default due to semantic difference. bool needTranspose{true}; ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); if (matmulI2L.find(t) == matmulI2L.end()) { continue; } // check MatMul params... IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); bool const twoInputs = mm->getNbInputs() == 2; bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; if (!(twoInputs && all2D && isSimple)) { continue; } if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) { needTranspose = !needTranspose; } constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); } // 3. Finally, sparsify the weights auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) { Dims dims = layer->getOutput(0)->getDimensions(); ASSERT(dims.nbDims == 2); int32_t const idxN = needTranspose ? 1 : 0; int32_t const n = dims.d[idxN]; int32_t const k = dims.d[1 - idxN]; sparseWeights.emplace_back(); std::vector& spw = sparseWeights.back(); Weights w = layer->getWeights(); DataType const dtype = w.type; ASSERT(dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. if (needTranspose) { if (dtype == nvinfer1::DataType::kFLOAT) { spw.resize(w.count * sizeof(float)); transpose2DWeights(spw.data(), w.values, k, n); } else if (dtype == nvinfer1::DataType::kHALF) { spw.resize(w.count * sizeof(half_float::half)); transpose2DWeights(spw.data(), w.values, k, n); } w.values = spw.data(); std::vector tmpW; sparsify(w, n, 1, tmpW); if (dtype == nvinfer1::DataType::kFLOAT) { transpose2DWeights(spw.data(), tmpW.data(), n, k); } else if (dtype == nvinfer1::DataType::kHALF) { transpose2DWeights(spw.data(), tmpW.data(), n, k); } } else { sparsify(w, n, 1, spw); } w.values = spw.data(); layer->setWeights(w); }; for (auto& l : constantLayerToSparse) { sparsifyConstantWeights(l.first, l.second); } } template void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector& sparseWeights) { auto weights = l.getKernelWeights(); sparsify(weights, k, trs, sparseWeights); weights.values = sparseWeights.data(); l.setKernelWeights(weights); } // Explicit instantiation template void setSparseWeights( IConvolutionLayer& l, int32_t k, int32_t trs, std::vector& sparseWeights); void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) { for (int32_t l = 0; l < network.getNbLayers(); ++l) { auto* layer = network.getLayer(l); auto const t = layer->getType(); if (t == nvinfer1::LayerType::kCONVOLUTION) { auto& conv = *static_cast(layer); auto const& dims = conv.getKernelSizeNd(); ASSERT(dims.nbDims == 2 || dims.nbDims == 3); auto const k = conv.getNbOutputMaps(); auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); sparseWeights.emplace_back(); setSparseWeights(conv, k, trs, sparseWeights.back()); } } sparsifyMatMulKernelWeights(network, sparseWeights); sample::gLogVerbose << "--sparsity=force pruned " << sparseWeights.size() << " weights to be sparsity pattern." << std::endl; sample::gLogVerbose << "--sparsity=force has been deprecated. Please use to rewrite the " "weights to a sparsity pattern and then run with --sparsity=enable" << std::endl; } void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector& sparseWeights) { switch (weights.type) { case DataType::kFLOAT: sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); break; case DataType::kHALF: sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); break; case DataType::kBF16: sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); break; case DataType::kINT8: case DataType::kINT32: case DataType::kUINT8: case DataType::kBOOL: case DataType::kINT4: case DataType::kFP8: case DataType::kINT64: case DataType::kFP4: ASSERT(false && "Unsupported data type"); case DataType::kE8M0: ASSERT(false && "E8M0 is not supported"); } } template void print(std::ostream& os, T v) { os << v; } void print(std::ostream& os, int8_t v) { os << static_cast(v); } void print(std::ostream& os, __half v) { os << static_cast(v); } #if CUDA_VERSION >= 11060 void print(std::ostream& os, __nv_fp8_e4m3 v) { os << static_cast(v); } #endif int32_t dataOffsetFromDims(int64_t v, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv) { int32_t dataOffset = 0; for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) { int32_t dimVal = v % dims.d[dimIndex]; if (dimIndex == vectorDim) { dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; } else { dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv); } v /= dims.d[dimIndex]; ASSERT(v >= 0); } return dataOffset; } template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv) { auto const vol = volume(dims); T const* typedBuffer = static_cast(buffer); for (int64_t v = 0; v < vol; ++v) { int32_t dataOffset = dataOffsetFromDims(v, dims, strides, vectorDim, spv); if (v > 0) { os << separator; } print(os, typedBuffer[dataOffset]); } } void dumpInt4Buffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv) { auto const vol = volume(dims); uint8_t const* typedBuffer = static_cast(buffer); for (int64_t v = 0; v < vol; ++v) { int32_t dataOffset = dataOffsetFromDims(v, dims, strides, vectorDim, spv); if (v > 0) { os << separator; } auto value = typedBuffer[dataOffset / 2]; if (dataOffset % 2 == 0) { // Cast to int8_t before right shift, so right-shift will sign-extend. // Left shift on int8_t can be undefined behaviour, must perform left shift on uint8_t. os << (static_cast(value << 4) >> 4); } else { os << (static_cast(value) >> 4); } } } // Explicit instantiation template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); #if CUDA_VERSION >= 11060 template void dumpBuffer<__nv_fp8_e4m3>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); #endif template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv); template void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights) { auto const c = count / (k * trs); sparseWeights.resize(count * sizeof(T)); auto* sparseValues = reinterpret_cast(sparseWeights.data()); constexpr int32_t window = 4; constexpr int32_t nonzeros = 2; int32_t const crs = c * trs; auto const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; }; for (int64_t ki = 0; ki < k; ++ki) { for (int64_t rsi = 0; rsi < trs; ++rsi) { int32_t w = 0; int32_t nz = 0; for (int64_t ci = 0; ci < c; ++ci) { auto const index = getIndex(ki, ci, rsi); if (nz < nonzeros) { sparseValues[index] = values[index]; ++nz; } else { sparseValues[index] = 0; } if (++w == window) { w = 0; nz = 0; } } } } } // Explicit instantiation template void sparsify( float const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); template void sparsify( half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) { ASSERT(dst != src); T* tdst = reinterpret_cast(dst); T const* tsrc = reinterpret_cast(src); for (int32_t mi = 0; mi < m; ++mi) { for (int32_t ni = 0; ni < n; ++ni) { int32_t const isrc = mi * n + ni; int32_t const idst = ni * m + mi; tdst[idst] = tsrc[isrc]; } } } // Explicit instantiation template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); template , bool>> void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max) { T* typedBuffer = static_cast(buffer); std::default_random_engine engine; std::uniform_int_distribution distribution(min, max); auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; std::generate(typedBuffer, typedBuffer + volume, generator); } template , bool>> void fillBuffer(void* buffer, int64_t volume, float min, float max) { T* typedBuffer = static_cast(buffer); std::default_random_engine engine; std::uniform_real_distribution distribution(min, max); auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; std::generate(typedBuffer, typedBuffer + volume, generator); } // Explicit instantiation template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); template void fillBuffer(void* buffer, int64_t volume, float min, float max); template void fillBuffer<__half>(void* buffer, int64_t volume, float min, float max); template void fillBuffer(void* buffer, int64_t volume, float min, float max); #if CUDA_VERSION >= 11060 template void fillBuffer<__nv_fp8_e4m3>(void* buffer, int64_t volume, float min, float max); #endif template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target) { auto const splitPattern = splitToStringVec(pattern, '*', 1); // If there is no wildcard, return if the two strings match exactly. if (splitPattern.size() == 1) { return pattern == target; } // Otherwise, target must follow prefix+anything+postfix pattern. return target.size() >= (splitPattern[0].size() + splitPattern[1].size()) && target.find(splitPattern[0]) == 0 && target.rfind(splitPattern[1]) == (target.size() - splitPattern[1].size()); } } // namespace sample