1231: g0plus dockerfile

38fb1f6 verified about 2 months ago

22.8 kB

	/*
	* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "sampleUtils.h"
	#include "bfloat16.h"
	#include "common.h"
	#include "half.h"
	#include <cuda.h>
	#include <type_traits>

	#if CUDA_VERSION >= 11060
	#include <cuda_fp8.h>
	#endif

	using namespace nvinfer1;

	namespace sample
	{

	int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch)
	{
	int64_t maxNbElems = 1;
	for (int32_t i = 0; i < dims.nbDims; ++i)
	{
	// Get effective length of axis.
	int64_t d = dims.d[i];
	// Any dimension is 0, it is an empty tensor.
	if (d == 0)
	{
	return 0;
	}
	if (i == vecDim)
	{
	d = samplesCommon::divUp(d, comps);
	}
	maxNbElems = std::max(maxNbElems, d * strides.d[i]);
	}
	return maxNbElems * batch * (vecDim < 0 ? 1 : comps);
	}

	nvinfer1::Dims toDims(std::vector<int64_t> const& vec)
	{
	int32_t limit = static_cast<int32_t>(nvinfer1::Dims::MAX_DIMS);
	if (static_cast<int32_t>(vec.size()) > limit)
	{
	sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl;
	}
	// Pick first nvinfer1::Dims::MAX_DIMS elements
	nvinfer1::Dims dims{std::min(static_cast<int32_t>(vec.size()), limit), {}};
	std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
	return dims;
	}

	void loadFromFile(std::string const& fileName, char* dst, size_t size)
	{
	ASSERT(dst);

	std::ifstream file(fileName, std::ios::in \| std::ios::binary);
	if (file.is_open())
	{
	file.seekg(0, std::ios::end);
	int64_t fileSize = static_cast<int64_t>(file.tellg());
	// Due to change from int32_t to int64_t VC engines created with earlier versions
	// may expect input of the half of the size
	if (fileSize != static_cast<int64_t>(size) && fileSize != static_cast<int64_t>(size * 2))
	{
	std::ostringstream msg;
	msg << "Unexpected file size for input file: " << fileName << ". Note: Input binding size is: " << size
	<< " bytes but the file size is " << fileSize
	<< " bytes. Double check the size and datatype of the provided data.";
	throw std::invalid_argument(msg.str());
	}
	// Move file pointer back to the beginning after reading file size.
	file.seekg(0, std::ios::beg);
	file.read(dst, size);
	size_t const nbBytesRead = file.gcount();
	file.close();
	if (nbBytesRead != size)
	{
	std::ostringstream msg;
	msg << "Unexpected file size for input file: " << fileName << ". Note: Expected: " << size
	<< " bytes but only read: " << nbBytesRead << " bytes";
	throw std::invalid_argument(msg.str());
	}
	}
	else
	{
	std::ostringstream msg;
	msg << "Cannot open file " << fileName << "!";
	throw std::invalid_argument(msg.str());
	}
	}

	std::vector<std::string> splitToStringVec(std::string const& s, char separator, int64_t maxSplit)
	{
	std::vector<std::string> splitted;

	for (size_t start = 0; start < s.length();)
	{
	// If maxSplit is specified and we have reached maxSplit, emplace back the rest of the string and break the
	// loop.
	if (maxSplit >= 0 && static_cast<int64_t>(splitted.size()) == maxSplit)
	{
	splitted.emplace_back(s.substr(start, s.length() - start));
	break;
	}

	size_t separatorIndex = s.find(separator, start);
	if (separatorIndex == std::string::npos)
	{
	separatorIndex = s.length();
	}
	splitted.emplace_back(s.substr(start, separatorIndex - start));

	// If the separator is the last character, then we should push an empty string at the end.
	if (separatorIndex == s.length() - 1)
	{
	splitted.emplace_back("");
	}

	start = separatorIndex + 1;
	}

	return splitted;
	}

	bool broadcastIOFormats(std::vector<IOFormat> const& formats, size_t nbBindings, bool isInput /= true/)
	{
	bool broadcast = formats.size() == 1;
	bool validFormatsCount = broadcast \|\| (formats.size() == nbBindings);
	if (!formats.empty() && !validFormatsCount)
	{
	if (isInput)
	{
	throw std::invalid_argument(
	"The number of inputIOFormats must match network's inputs or be one for broadcasting.");
	}

	throw std::invalid_argument(
	"The number of outputIOFormats must match network's outputs or be one for broadcasting.");
	}
	return broadcast;
	}

	void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector<std::vector<int8_t>>& sparseWeights)
	{
	using TensorToLayer = std::unordered_map<nvinfer1::ITensor, nvinfer1::ILayer>;
	using LayerToTensor = std::unordered_map<nvinfer1::ILayer, nvinfer1::ITensor>;

	// 1. Collect layers and tensors information from the network.
	TensorToLayer matmulI2L;
	TensorToLayer constO2L;
	TensorToLayer shuffleI2L;
	LayerToTensor shuffleL2O;
	auto collectMappingInfo = [&](int32_t const idx) {
	ILayer* l = network.getLayer(idx);
	switch (l->getType())
	{
	case nvinfer1::LayerType::kMATRIX_MULTIPLY:
	{
	// assume weights on the second input.
	matmulI2L.insert({l->getInput(1), l});
	break;
	}
	case nvinfer1::LayerType::kCONSTANT:
	{
	DataType const dtype = static_cast<nvinfer1::IConstantLayer*>(l)->getWeights().type;
	if (dtype == nvinfer1::DataType::kFLOAT \|\| dtype == nvinfer1::DataType::kHALF)
	{
	// Sparsify float only.
	constO2L.insert({l->getOutput(0), l});
	}
	break;
	}
	case nvinfer1::LayerType::kSHUFFLE:
	{
	shuffleI2L.insert({l->getInput(0), l});
	shuffleL2O.insert({l, l->getOutput(0)});
	break;
	}
	default: break;
	}
	};
	int32_t const nbLayers = network.getNbLayers();
	for (int32_t i = 0; i < nbLayers; ++i)
	{
	collectMappingInfo(i);
	}
	if (matmulI2L.size() == 0 \|\| constO2L.size() == 0)
	{
	// No MatrixMultiply or Constant layer found, no weights to sparsify.
	return;
	}

	// Helper for analysis
	auto isTranspose
	= [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); };
	auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; };
	auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool {
	for (int32_t i = 0; i < dims.nbDims; ++i)
	{
	if (dims.d[i] != i \|\| dims.d[i] != -1)
	{
	return false;
	}
	}
	return true;
	};
	auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor* {
	while (shuffleI2L.find(t) != shuffleI2L.end())
	{
	nvinfer1::IShuffleLayer* s = static_cast<nvinfer1::IShuffleLayer*>(shuffleI2L.at(t));
	if (!is2D(s->getInput(0)->getDimensions()) \|\| !is2D(s->getReshapeDimensions())
	\|\| !isIdenticalReshape(s->getReshapeDimensions()))
	{
	break;
	}

	if (isTranspose(s->getFirstTranspose()))
	{
	needTranspose = !needTranspose;
	}
	if (isTranspose(s->getSecondTranspose()))
	{
	needTranspose = !needTranspose;
	}

	t = shuffleL2O.at(s);
	}
	return t;
	};

	// 2. Forward analysis to collect the Constant layers connected to MatMul via Transpose
	std::unordered_map<nvinfer1::IConstantLayer*, bool> constantLayerToSparse;
	for (auto& o2l : constO2L)
	{
	// If need to transpose the weights of the Constant layer.
	// Need to transpose by default due to semantic difference.
	bool needTranspose{true};
	ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose);
	if (matmulI2L.find(t) == matmulI2L.end())
	{
	continue;
	}

	// check MatMul params...
	IMatrixMultiplyLayer* mm = static_cast<nvinfer1::IMatrixMultiplyLayer*>(matmulI2L.at(t));
	bool const twoInputs = mm->getNbInputs() == 2;
	bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions());
	bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE
	&& mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR;
	if (!(twoInputs && all2D && isSimple))
	{
	continue;
	}
	if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE)
	{
	needTranspose = !needTranspose;
	}

	constantLayerToSparse.insert({static_cast<IConstantLayer*>(o2l.second), needTranspose});
	}

	// 3. Finally, sparsify the weights
	auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) {
	Dims dims = layer->getOutput(0)->getDimensions();
	ASSERT(dims.nbDims == 2);
	int32_t const idxN = needTranspose ? 1 : 0;
	int32_t const n = dims.d[idxN];
	int32_t const k = dims.d[1 - idxN];
	sparseWeights.emplace_back();
	std::vector<int8_t>& spw = sparseWeights.back();
	Weights w = layer->getWeights();
	DataType const dtype = w.type;
	ASSERT(dtype == nvinfer1::DataType::kFLOAT
	\|\| dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored.

	if (needTranspose)
	{
	if (dtype == nvinfer1::DataType::kFLOAT)
	{
	spw.resize(w.count * sizeof(float));
	transpose2DWeights<float>(spw.data(), w.values, k, n);
	}
	else if (dtype == nvinfer1::DataType::kHALF)
	{
	spw.resize(w.count * sizeof(half_float::half));
	transpose2DWeights<half_float::half>(spw.data(), w.values, k, n);
	}

	w.values = spw.data();
	std::vector<int8_t> tmpW;
	sparsify(w, n, 1, tmpW);

	if (dtype == nvinfer1::DataType::kFLOAT)
	{
	transpose2DWeights<float>(spw.data(), tmpW.data(), n, k);
	}
	else if (dtype == nvinfer1::DataType::kHALF)
	{
	transpose2DWeights<half_float::half>(spw.data(), tmpW.data(), n, k);
	}
	}
	else
	{
	sparsify(w, n, 1, spw);
	}

	w.values = spw.data();
	layer->setWeights(w);
	};
	for (auto& l : constantLayerToSparse)
	{
	sparsifyConstantWeights(l.first, l.second);
	}
	}

	template <typename L>
	void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
	{
	auto weights = l.getKernelWeights();
	sparsify(weights, k, trs, sparseWeights);
	weights.values = sparseWeights.data();
	l.setKernelWeights(weights);
	}

	// Explicit instantiation
	template void setSparseWeights<IConvolutionLayer>(
	IConvolutionLayer& l, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights);

	void sparsify(nvinfer1::INetworkDefinition& network, std::vector<std::vector<int8_t>>& sparseWeights)
	{
	for (int32_t l = 0; l < network.getNbLayers(); ++l)
	{
	auto* layer = network.getLayer(l);
	auto const t = layer->getType();
	if (t == nvinfer1::LayerType::kCONVOLUTION)
	{
	auto& conv = static_cast<IConvolutionLayer>(layer);
	auto const& dims = conv.getKernelSizeNd();
	ASSERT(dims.nbDims == 2 \|\| dims.nbDims == 3);
	auto const k = conv.getNbOutputMaps();
	auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies<int32_t>());
	sparseWeights.emplace_back();
	setSparseWeights(conv, k, trs, sparseWeights.back());
	}
	}

	sparsifyMatMulKernelWeights(network, sparseWeights);
	sample::gLogVerbose << "--sparsity=force pruned " << sparseWeights.size() << " weights to be sparsity pattern."
	<< std::endl;
	sample::gLogVerbose << "--sparsity=force has been deprecated. Please use <polygraphy surgeon prune> to rewrite the "
	"weights to a sparsity pattern and then run with --sparsity=enable"
	<< std::endl;
	}

	void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
	{
	switch (weights.type)
	{
	case DataType::kFLOAT:
	sparsify(static_cast<float const*>(weights.values), weights.count, k, trs, sparseWeights);
	break;
	case DataType::kHALF:
	sparsify(static_cast<half_float::half const*>(weights.values), weights.count, k, trs, sparseWeights);
	break;
	case DataType::kBF16:
	sparsify(static_cast<BFloat16 const*>(weights.values), weights.count, k, trs, sparseWeights);
	break;
	case DataType::kINT8:
	case DataType::kINT32:
	case DataType::kUINT8:
	case DataType::kBOOL:
	case DataType::kINT4:
	case DataType::kFP8:
	case DataType::kINT64:
	case DataType::kFP4: ASSERT(false && "Unsupported data type");
	case DataType::kE8M0: ASSERT(false && "E8M0 is not supported");
	}
	}

	template <typename T>
	void print(std::ostream& os, T v)
	{
	os << v;
	}

	void print(std::ostream& os, int8_t v)
	{
	os << static_cast<int32_t>(v);
	}

	void print(std::ostream& os, __half v)
	{
	os << static_cast<float>(v);
	}

	#if CUDA_VERSION >= 11060
	void print(std::ostream& os, __nv_fp8_e4m3 v)
	{
	os << static_cast<float>(v);
	}
	#endif

	int32_t dataOffsetFromDims(int64_t v, Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv)
	{
	int32_t dataOffset = 0;
	for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
	{
	int32_t dimVal = v % dims.d[dimIndex];
	if (dimIndex == vectorDim)
	{
	dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
	}
	else
	{
	dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv);
	}
	v /= dims.d[dimIndex];
	ASSERT(v >= 0);
	}

	return dataOffset;
	}

	template <typename T>
	void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv)
	{
	auto const vol = volume(dims);
	T const* typedBuffer = static_cast<T const*>(buffer);
	for (int64_t v = 0; v < vol; ++v)
	{
	int32_t dataOffset = dataOffsetFromDims(v, dims, strides, vectorDim, spv);
	if (v > 0)
	{
	os << separator;
	}
	print(os, typedBuffer[dataOffset]);
	}
	}

	void dumpInt4Buffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv)
	{
	auto const vol = volume(dims);
	uint8_t const* typedBuffer = static_cast<uint8_t const*>(buffer);
	for (int64_t v = 0; v < vol; ++v)
	{
	int32_t dataOffset = dataOffsetFromDims(v, dims, strides, vectorDim, spv);
	if (v > 0)
	{
	os << separator;
	}

	auto value = typedBuffer[dataOffset / 2];
	if (dataOffset % 2 == 0)
	{
	// Cast to int8_t before right shift, so right-shift will sign-extend.
	// Left shift on int8_t can be undefined behaviour, must perform left shift on uint8_t.
	os << (static_cast<int8_t>(value << 4) >> 4);
	}
	else
	{
	os << (static_cast<int8_t>(value) >> 4);
	}
	}
	}

	// Explicit instantiation
	template void dumpBuffer<bool>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);
	template void dumpBuffer<int32_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);
	template void dumpBuffer<int8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);
	template void dumpBuffer<float>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);
	template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);
	template void dumpBuffer<BFloat16>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);
	#if CUDA_VERSION >= 11060
	template void dumpBuffer<__nv_fp8_e4m3>(void const* buffer, std::string const& separator, std::ostream& os,
	Dims const& dims, Dims const& strides, int32_t vectorDim, int32_t spv);
	#endif
	template void dumpBuffer<uint8_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);
	template void dumpBuffer<int64_t>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims,
	Dims const& strides, int32_t vectorDim, int32_t spv);

	template <typename T>
	void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights)
	{
	auto const c = count / (k * trs);
	sparseWeights.resize(count * sizeof(T));
	auto* sparseValues = reinterpret_cast<T*>(sparseWeights.data());

	constexpr int32_t window = 4;
	constexpr int32_t nonzeros = 2;

	int32_t const crs = c * trs;
	auto const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; };

	for (int64_t ki = 0; ki < k; ++ki)
	{
	for (int64_t rsi = 0; rsi < trs; ++rsi)
	{
	int32_t w = 0;
	int32_t nz = 0;
	for (int64_t ci = 0; ci < c; ++ci)
	{
	auto const index = getIndex(ki, ci, rsi);
	if (nz < nonzeros)
	{
	sparseValues[index] = values[index];
	++nz;
	}
	else
	{
	sparseValues[index] = 0;
	}
	if (++w == window)
	{
	w = 0;
	nz = 0;
	}
	}
	}
	}
	}

	// Explicit instantiation
	template void sparsify<float>(
	float const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights);
	template void sparsify<half_float::half>(
	half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector<int8_t>& sparseWeights);

	template <typename T>
	void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n)
	{
	ASSERT(dst != src);
	T* tdst = reinterpret_cast<T*>(dst);
	T const* tsrc = reinterpret_cast<T const*>(src);
	for (int32_t mi = 0; mi < m; ++mi)
	{
	for (int32_t ni = 0; ni < n; ++ni)
	{
	int32_t const isrc = mi * n + ni;
	int32_t const idst = ni * m + mi;
	tdst[idst] = tsrc[isrc];
	}
	}
	}

	// Explicit instantiation
	template void transpose2DWeights<float>(void* dst, void const* src, int32_t const m, int32_t const n);
	template void transpose2DWeights<half_float::half>(void* dst, void const* src, int32_t const m, int32_t const n);

	template <typename T, typename std::enable_if_t<std::is_integral_v<T>, bool>>
	void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max)
	{
	T* typedBuffer = static_cast<T*>(buffer);
	std::default_random_engine engine;
	std::uniform_int_distribution<int32_t> distribution(min, max);
	auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
	std::generate(typedBuffer, typedBuffer + volume, generator);
	}

	template <typename T, typename std::enable_if_t<!std::is_integral_v<T>, bool>>
	void fillBuffer(void* buffer, int64_t volume, float min, float max)
	{
	T* typedBuffer = static_cast<T*>(buffer);
	std::default_random_engine engine;
	std::uniform_real_distribution<float> distribution(min, max);
	auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
	std::generate(typedBuffer, typedBuffer + volume, generator);
	}

	// Explicit instantiation
	template void fillBuffer<bool>(void* buffer, int64_t volume, int32_t min, int32_t max);
	template void fillBuffer<int32_t>(void* buffer, int64_t volume, int32_t min, int32_t max);
	template void fillBuffer<int8_t>(void* buffer, int64_t volume, int32_t min, int32_t max);
	template void fillBuffer<float>(void* buffer, int64_t volume, float min, float max);
	template void fillBuffer<__half>(void* buffer, int64_t volume, float min, float max);
	template void fillBuffer<BFloat16>(void* buffer, int64_t volume, float min, float max);
	#if CUDA_VERSION >= 11060
	template void fillBuffer<__nv_fp8_e4m3>(void* buffer, int64_t volume, float min, float max);
	#endif
	template void fillBuffer<uint8_t>(void* buffer, int64_t volume, int32_t min, int32_t max);
	template void fillBuffer<int64_t>(void* buffer, int64_t volume, int32_t min, int32_t max);

	bool matchStringWithOneWildcard(std::string const& pattern, std::string const& target)
	{
	auto const splitPattern = splitToStringVec(pattern, '*', 1);

	// If there is no wildcard, return if the two strings match exactly.
	if (splitPattern.size() == 1)
	{
	return pattern == target;
	}

	// Otherwise, target must follow prefix+anything+postfix pattern.
	return target.size() >= (splitPattern[0].size() + splitPattern[1].size()) && target.find(splitPattern[0]) == 0
	&& target.rfind(splitPattern[1]) == (target.size() - splitPattern[1].size());
	}

	} // namespace sample