1231: g0plus dockerfile

38fb1f6 verified about 2 months ago

29.5 kB

	/*
	* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "debugTensorWriter.h"
	#include "common.h"
	#include <algorithm>
	#include <cuda_bf16.h>
	#include <cuda_fp16.h>
	#if CUDA_VERSION >= 11060
	#include <cuda_fp8.h>
	#endif
	#if CUDA_VERSION >= 12070
	#include <cuda_fp4.h>
	#endif
	#include <cuda_runtime_api.h>
	#include <numeric>
	namespace sample
	{

	namespace
	{

	class Int4
	{
	public:
	Int4() = default;
	explicit Int4(int8_t val)
	: mValue(val)
	{
	}

	operator int64_t() const
	{
	return static_cast<int64_t>(mValue);
	}

	private:
	int8_t mValue{};
	};

	class Int4x2
	{
	public:
	using StorageType = uint8_t;

	Int4x2() = default;
	explicit Int4x2(StorageType val)
	: mRep(val)
	{
	}

	// Get a single element
	inline Int4 element(int32_t index) const
	{
	ASSERT(index == 0 \|\| index == 1);
	return Int4(index == 0 ? static_cast<int8_t>(mRep << 4) >> 4 : static_cast<int8_t>(mRep) >> 4);
	}

	private:
	StorageType mRep{};
	};

	#if CUDA_VERSION >= 12070
	using Fp4 = __nv_fp4_e2m1;

	class Fp4x2
	{
	public:
	using StorageType = uint8_t;

	Fp4x2() = default;
	explicit Fp4x2(StorageType val)
	: mRep(val)
	{
	}

	// Get a single element
	inline Fp4 element(int32_t index) const
	{
	ASSERT(index == 0 \|\| index == 1);
	int8_t bits = index == 0 ? static_cast<int8_t>(mRep << 4) >> 4 : static_cast<int8_t>(mRep) >> 4;
	Fp4 fp4_el = reinterpret_cast<Fp4>(&bits);
	return fp4_el;
	}

	private:
	StorageType mRep{};
	};
	#endif

	// Iterator that can handle packed format data (int4 and fp4)
	template <typename T>
	class DataIterator
	{
	public:
	#if CUDA_VERSION >= 12070
	using value_type
	= std::conditional_t<std::is_same_v<T, Int4x2>, Int4, std::conditional_t<std::is_same_v<T, Fp4x2>, Fp4, T>>;
	#else
	using value_type = std::conditional_t<std::is_same_v<T, Int4x2>, Int4, T>;
	#endif

	DataIterator(void const* data, int64_t volume, int64_t index = 0)
	: mData(static_cast<uint8_t const*>(data))
	, mVolume(volume)
	, mIndex(index)
	{
	}

	value_type operator*() const
	{
	if constexpr (std::is_same_v<T, Int4x2>)
	{
	// For Int4x2, each byte contains two 4-bit integers
	Int4x2 packed(mData[mIndex / 2]);
	return packed.element(mIndex % 2);
	}
	#if CUDA_VERSION >= 12070
	else if constexpr (std::is_same_v<T, Fp4x2>)
	{
	// For Fp4x2, each byte contains two 4-bit floating point numbers
	Fp4x2 packed(mData[mIndex / 2]);
	return packed.element(mIndex % 2);
	}
	#endif
	else
	{
	return reinterpret_cast<T const*>(mData)[mIndex];
	}
	}

	DataIterator& operator++()
	{
	++mIndex;
	return *this;
	}

	DataIterator operator++(int)
	{
	DataIterator tmp = *this;
	++mIndex;
	return tmp;
	}

	bool operator==(DataIterator const& other) const
	{
	return mIndex == other.mIndex;
	}

	bool operator!=(DataIterator const& other) const
	{
	return mIndex != other.mIndex;
	}

	DataIterator operator+(int64_t n) const
	{
	DataIterator tmp = *this;
	tmp.mIndex += n;
	return tmp;
	}

	private:
	uint8_t const* mData;
	int64_t mVolume;
	int64_t mIndex;
	};

	template <typename T>
	class DataRange
	{
	public:
	using iterator = DataIterator<T>;
	using value_type = typename iterator::value_type;

	DataRange(void const* data, int64_t volume)
	: mData(data)
	, mVolume(volume)
	{
	}

	iterator begin() const
	{
	return iterator(mData, mVolume, 0);
	}
	iterator end() const
	{
	return iterator(mData, mVolume, mVolume);
	}

	private:
	void const* mData;
	int64_t mVolume;
	};

	template <typename T>
	static constexpr bool isFloatingPoint
	= std::is_floating_point_v<T> \|\| std::is_same_v<T, half> \|\| std::is_same_v<T, nv_bfloat16>
	#if CUDA_VERSION >= 11060
	\|\| std::is_same_v<T, __nv_fp8_e4m3>
	#endif
	#if CUDA_VERSION >= 12070
	\|\| std::is_same_v<T, Fp4> \|\| std::is_same_v<T, Fp4x2>
	#endif
	;

	constexpr int32_t kFLOATING_POINT_PRECISION = 6;
	constexpr int32_t kFLOATING_POINT_WIDTH = 13;

	std::string_view getDataTypeString(nvinfer1::DataType type)
	{
	switch (type)
	{
	case nvinfer1::DataType::kBOOL: return "BOOL";
	case nvinfer1::DataType::kINT4: return "INT4";
	case nvinfer1::DataType::kINT8: return "INT8";
	case nvinfer1::DataType::kINT32: return "INT32";
	case nvinfer1::DataType::kINT64: return "INT64";
	case nvinfer1::DataType::kUINT8: return "UINT8";
	case nvinfer1::DataType::kFP4: return "FP4";
	case nvinfer1::DataType::kFP8: return "FP8";
	case nvinfer1::DataType::kE8M0: return "E8M0";
	case nvinfer1::DataType::kHALF: return "HALF";
	case nvinfer1::DataType::kBF16: return "BF16";
	case nvinfer1::DataType::kFLOAT: return "FLOAT";
	}
	return "UNKNOWN";
	}

	template <typename T>
	void printTensorElements(T const* data, int64_t volume, std::ofstream& f)
	{
	f << " \"elements\": \"";
	constexpr int32_t kPRINT_ELEMENTS_COUNT = 10;
	int64_t firstHalf = std::min(static_cast<int64_t>(kPRINT_ELEMENTS_COUNT / 2), volume);
	int64_t secondHalf = (volume > kPRINT_ELEMENTS_COUNT)
	? kPRINT_ELEMENTS_COUNT / 2
	: std::max(static_cast<int64_t>(0), volume - kPRINT_ELEMENTS_COUNT / 2);

	auto printElement = [&f](auto value) {
	if constexpr (isFloatingPoint<T>)
	{
	f << static_cast<float>(value);
	}
	else
	{
	f << static_cast<int64_t>(value);
	}
	};

	DataRange<T> range(data, volume);
	auto it = range.begin();

	// Print first half elements
	std::string delimiter = "";
	for (int64_t i = 0; i < firstHalf; ++i)
	{
	f << delimiter;
	printElement(*it++);
	delimiter = ", ";
	}

	// Add ellipsis if needed
	f << (volume > kPRINT_ELEMENTS_COUNT ? ", ..." : "");

	// Print last elements
	it = range.begin() + (volume - secondHalf);
	for (int64_t i = volume - secondHalf; i < volume; ++i)
	{
	f << ", ";
	printElement(*it++);
	}

	f << "\"" << std::endl;
	}

	template <typename T>
	void processTensorSummary(void const* addr_host, int64_t volume, std::ofstream& f)
	{
	DataRange<T> range(addr_host, volume);

	if constexpr (isFloatingPoint<T>)
	{
	float minVal = std::numeric_limits<float>::max();
	float maxVal = std::numeric_limits<float>::lowest();
	double sum = 0.0;

	for (auto value : range)
	{
	float val = static_cast<float>(value);
	minVal = std::min(minVal, val);
	maxVal = std::max(maxVal, val);
	sum += val;
	}
	float avgVal = sum / volume;

	// nan and inf turn into string in json
	auto valueToStr = [](float val) -> std::string {
	std::stringstream ss;
	if (!std::isfinite(val))
	{
	ss << "\"" << val << "\"";
	}
	else
	{
	ss << val;
	}
	return ss.str();
	};
	f << " \"min\": " << valueToStr(minVal) << "," << std::endl;
	f << " \"max\": " << valueToStr(maxVal) << "," << std::endl;
	f << " \"avg\": " << valueToStr(avgVal) << "," << std::endl;
	}
	else
	{
	// For integer types, use int64_t for min/max calculation
	int64_t minVal = std::numeric_limits<int64_t>::max();
	int64_t maxVal = std::numeric_limits<int64_t>::lowest();
	int64_t sum = 0;

	for (auto value : range)
	{
	int64_t val = static_cast<int64_t>(value);
	minVal = std::min(minVal, val);
	maxVal = std::max(maxVal, val);
	sum += val;
	}
	double avgVal = static_cast<double>(sum) / volume;

	f << " \"min\": " << minVal << "," << std::endl;
	f << " \"max\": " << maxVal << "," << std::endl;
	f << " \"avg\": " << avgVal << "," << std::endl;
	}

	printTensorElements<T>(static_cast<T const*>(addr_host), volume, f);
	}

	std::string getCurrentTimeString()
	{
	auto now = std::chrono::system_clock::now();
	auto nowC = std::chrono::system_clock::to_time_t(now);
	std::stringstream ss;
	ss << std::put_time(std::localtime(&nowC), "%Y-%m-%dT%H:%M:%S%z");
	return ss.str();
	}

	template <typename T>
	void writeTensorStringRecursive(T const* data, nvinfer1::Dims const& shape, int32_t currentDim, int64_t offset,
	int64_t stride, std::ofstream& f, bool isFirstElement = true, int32_t indent = 0, int32_t maxWidth = 0)
	{
	bool isLastDim = currentDim == shape.nbDims - 1;
	if (isLastDim)
	{
	// Last dimension - print elements in a row
	f << std::string(indent, ' ') << "[";
	DataRange<T> range(data + offset, shape.d[currentDim]);
	auto it = range.begin();
	for (int32_t i = 0; i < shape.d[currentDim]; ++i)
	{
	if (i > 0)
	{
	f << " ";
	}
	if constexpr (isFloatingPoint<T>)
	{
	f << std::scientific << std::setprecision(kFLOATING_POINT_PRECISION) << std::setw(kFLOATING_POINT_WIDTH)
	<< std::right << static_cast<float>(*it++);
	}
	else
	{
	f << std::setw(maxWidth) << static_cast<int64_t>(*it++);
	}
	}
	f << "]" << std::endl;
	}
	else
	{
	// For higher dimensions, print each slice
	f << std::string(indent, ' ') << "[" << std::endl;
	for (int32_t i = 0; i < shape.d[currentDim]; ++i)
	{
	writeTensorStringRecursive(data, shape, currentDim + 1, offset + i * stride,
	stride / shape.d[currentDim + 1], f, i == 0, indent + 1, maxWidth);
	}
	f << std::string(indent, ' ') << "]" << std::endl;
	}
	}

	template <typename T>
	int32_t getMaxWidthInDimension(
	T const* data, nvinfer1::Dims const& shape, int32_t currentDim, int64_t offset, int64_t stride)
	{
	int32_t maxWidth = 0;
	if (currentDim == shape.nbDims - 1)
	{
	// Last dimension - check each element
	DataRange<T> range(data + offset, shape.d[currentDim]);
	for (auto value : range)
	{
	std::stringstream ss;
	ss << static_cast<int64_t>(value);
	maxWidth = std::max(maxWidth, static_cast<int32_t>(ss.str().length()));
	}
	}
	else
	{
	// For higher dimensions, check each slice
	for (int64_t i = 0; i < shape.d[currentDim]; ++i)
	{
	maxWidth = std::max(maxWidth,
	getMaxWidthInDimension(
	data, shape, currentDim + 1, offset + i * stride, stride / shape.d[currentDim + 1]));
	}
	}
	return maxWidth;
	}

	template <typename T>
	void writeTensorString(
	T const* data, nvinfer1::Dims const& shape, std::string_view tensorName, std::string const& fileName)
	{
	sample::gLogVerbose << "Writing debug tensor '" << tensorName << "' to file '" << fileName << "'" << std::endl;

	std::ofstream f(fileName, std::ios::out);
	if (!f)
	{
	sample::gLogError << "Cannot open file for write: " << fileName << std::endl;
	return;
	}

	if (shape.nbDims == 0)
	{
	f << "[]";
	return;
	}

	int64_t totalElements = 1;
	for (int32_t i = 0; i < shape.nbDims; ++i)
	{
	totalElements *= shape.d[i];
	}

	if (totalElements == 0)
	{
	f << "[]";
	return;
	}

	// Calculate stride for the first dimension
	int64_t stride = totalElements / shape.d[0];

	// Calculate max width for proper alignment only for non-floating point types
	int32_t maxWidth = 0;
	if constexpr (!isFloatingPoint<T>)
	{
	maxWidth = getMaxWidthInDimension(data, shape, 0, 0, stride);
	}

	writeTensorStringRecursive(data, shape, 0, 0, stride, f, true, 0, maxWidth);
	f << std::endl;
	}

	std::string writeStringFile(void const* addr_host, nvinfer1::DataType type, nvinfer1::Dims const& shape,
	std::string const& tensorName, std::string const& prefix)
	{
	std::string fileName = genFilenameSafeString(prefix + tensorName + ".str");

	switch (type)
	{
	case nvinfer1::DataType::kBOOL:
	writeTensorString(static_cast<bool const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kINT4:
	writeTensorString(reinterpret_cast<Int4x2 const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kINT8:
	writeTensorString(static_cast<int8_t const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kINT32:
	writeTensorString(static_cast<int32_t const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kINT64:
	writeTensorString(static_cast<int64_t const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kUINT8:
	writeTensorString(static_cast<uint8_t const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kFP4:
	#if CUDA_VERSION >= 12070
	writeTensorString(static_cast<Fp4x2 const*>(addr_host), shape, tensorName, fileName);
	break;
	#else
	sample::gLogWarning << "Unsupported data type kFP4 for tensor string dump in this CUDA version." << std::endl;
	return "";
	#endif
	case nvinfer1::DataType::kFP8:
	#if CUDA_VERSION >= 11060
	writeTensorString(static_cast<__nv_fp8_e4m3 const*>(addr_host), shape, tensorName, fileName);
	break;
	#else
	sample::gLogWarning << "Unsupported data type kFP8 for tensor string dump in this CUDA version." << std::endl;
	return "";
	#endif
	case nvinfer1::DataType::kE8M0:
	sample::gLogWarning << "Unsupported data type kE8M0 for tensor string dump." << std::endl;
	return "";
	case nvinfer1::DataType::kHALF:
	writeTensorString(static_cast<half const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kBF16:
	writeTensorString(static_cast<nv_bfloat16 const*>(addr_host), shape, tensorName, fileName);
	break;
	case nvinfer1::DataType::kFLOAT:
	writeTensorString(static_cast<float const*>(addr_host), shape, tensorName, fileName);
	break;
	}
	return fileName;
	}

	std::string escapeJsonString(std::string_view str)
	{
	std::string result;
	result.reserve(str.length());
	for (char c : str)
	{
	switch (c)
	{
	case '\\': result += "\\\\"; break;
	case '\"': result += "\\\""; break;
	case '\b': result += "\\b"; break;
	case '\f': result += "\\f"; break;
	case '\n': result += "\\n"; break;
	case '\r': result += "\\r"; break;
	case '\t': result += "\\t"; break;
	default: result += c;
	}
	}
	return result;
	}

	template <typename U, typename T>
	std::vector<U> convertBufferTo(T const* data, int64_t volume)
	{
	std::vector<U> buffer(volume);
	DataRange<T> range(data, volume);
	int64_t i = 0;
	for (auto value : range)
	{
	buffer[i++] = static_cast<U>(value);
	}
	return buffer;
	}

	} // namespace

	DebugTensorWriter::DebugTensorWriter(std::unordered_map<std::string, std::string> const& debugTensorFileNames,
	std::vector<std::string> const& debugTensorFormats, std::string const& engineName, std::string const& cmdline)
	: mDebugTensorFileNames(debugTensorFileNames)
	, mDebugTensorFormats(debugTensorFormats)
	, mEngineName(engineName)
	, mCmdline(cmdline)
	{
	// Create a summary file if "summary" format is requested
	if (std::find(mDebugTensorFormats.begin(), mDebugTensorFormats.end(), "summary") != mDebugTensorFormats.end())
	{
	mSummaryFileName = "tensor_summary.json";
	mSummaryFile.open(mSummaryFileName, std::ios::out);
	if (mSummaryFile.is_open())
	{
	sample::gLogInfo << "Writing tensor summary to file: " << mSummaryFileName << std::endl;
	writeSummaryHeader();
	}
	else
	{
	sample::gLogError << "Failed to open tensor summary file: " << mSummaryFileName << std::endl;
	}
	}
	}

	DebugTensorWriter::~DebugTensorWriter()
	{
	// Close the summary file
	if (mSummaryFile.is_open())
	{
	writeSummaryFooter();
	mSummaryFile.close();
	}
	}

	void DebugTensorWriter::writeSummaryHeader()
	{
	mSummaryFile << "{" << std::endl;
	mSummaryFile << " \"metadata\": {" << std::endl;
	mSummaryFile << " \"title\": \"Tensor Summary Report\"," << std::endl;
	mSummaryFile << " \"time_generated\": \"" << getCurrentTimeString() << "\"," << std::endl;
	mSummaryFile << " \"engine_name\": \"" << mEngineName << "\"," << std::endl;
	mSummaryFile << " \"command_line\": \"" << escapeJsonString(mCmdline) << "\"" << std::endl;
	mSummaryFile << " }," << std::endl;
	mSummaryFile << " \"tensors\": [" << std::endl;
	}

	void DebugTensorWriter::writeSummaryFooter()
	{
	mSummaryFile << std::endl << " ]" << std::endl;
	mSummaryFile << "}" << std::endl;
	}

	void DebugTensorWriter::writeSummary(std::string_view name, nvinfer1::Dims const& shape, nvinfer1::DataType type,
	int64_t volume, void const* addr_host, std::string_view assignedFileName, std::string_view numpyFileName,
	std::string_view stringFileName, std::string_view rawFileName)
	{
	// Add comma separator if not the first tensor
	if (!mFirstTensor)
	{
	mSummaryFile << "," << std::endl;
	}
	mFirstTensor = false;

	// Write tensor information
	mSummaryFile << " {\n"
	<< " \"name\": \"" << name << "\",\n"
	<< " \"shape\": [";

	for (int32_t i = 0; i < shape.nbDims; ++i)
	{
	if (i > 0)
	{
	mSummaryFile << ", ";
	}
	mSummaryFile << shape.d[i];
	}

	mSummaryFile << "],\n"
	<< " \"type\": \"" << getDataTypeString(type) << "\",\n";

	// Write statistics
	mSummaryFile << " \"statistics\": {\n";

	switch (type)
	{
	case nvinfer1::DataType::kBOOL: processTensorSummary<bool>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kINT4: processTensorSummary<Int4x2>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kINT8: processTensorSummary<int8_t>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kINT32: processTensorSummary<int32_t>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kINT64: processTensorSummary<int64_t>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kUINT8: processTensorSummary<uint8_t>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kFP4:
	#if CUDA_VERSION >= 12070
	processTensorSummary<Fp4x2>(addr_host, volume, mSummaryFile);
	#else
	sample::gLogWarning << "Unsupported data type kFP4 for tensor '" << name
	<< "' summary dump in this CUDA version." << std::endl;
	#endif
	break;
	case nvinfer1::DataType::kFP8:
	#if CUDA_VERSION >= 11060
	processTensorSummary<__nv_fp8_e4m3>(addr_host, volume, mSummaryFile);
	break;
	#else
	sample::gLogWarning << "Unsupported data type kFP8 for tensor '" << name
	<< "' summary dump in this CUDA version." << std::endl;
	#endif
	break;
	case nvinfer1::DataType::kE8M0:
	sample::gLogWarning << "Unsupported data type kE8M0 for tensor '" << name << "' summary dump." << std::endl;
	break;
	case nvinfer1::DataType::kHALF: processTensorSummary<half>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kBF16: processTensorSummary<nv_bfloat16>(addr_host, volume, mSummaryFile); break;
	case nvinfer1::DataType::kFLOAT: processTensorSummary<float>(addr_host, volume, mSummaryFile); break;
	}

	mSummaryFile << " }";

	// Write file information only if at least one file exists
	if (!assignedFileName.empty() \|\| !numpyFileName.empty() \|\| !stringFileName.empty() \|\| !rawFileName.empty())
	{
	mSummaryFile << ",\n \"files\": {\n";
	std::string delimiter = "";

	if (!assignedFileName.empty())
	{
	mSummaryFile << delimiter << " \"assigned\": \"" << escapeJsonString(assignedFileName) << "\"";
	delimiter = ",\n";
	}

	if (!numpyFileName.empty())
	{
	mSummaryFile << delimiter << " \"numpy\": \"" << escapeJsonString(numpyFileName) << "\"";
	delimiter = ",\n";
	}

	if (!stringFileName.empty())
	{
	mSummaryFile << delimiter << " \"string\": \"" << escapeJsonString(stringFileName) << "\"";
	delimiter = ",\n";
	}

	if (!rawFileName.empty())
	{
	mSummaryFile << delimiter << " \"raw\": \"" << escapeJsonString(rawFileName) << "\"";
	}

	mSummaryFile << "\n }";
	}

	mSummaryFile << "\n }";
	}

	bool writeNumpyFile(void const* addr_host, std::string_view dtype, nvinfer1::Dims const& shape, int64_t size,
	std::string_view tensorName, std::string const& fileName)
	{
	sample::gLogVerbose << "Writing debug tensor '" << tensorName << "' to numpy file '" << fileName << "'"
	<< std::endl;

	std::ofstream f(fileName, std::ios::out \| std::ios::binary);
	if (!f)
	{
	sample::gLogError << "Cannot open file for write: " << fileName << std::endl;
	return false;
	}

	// Write numpy magic string and version
	char magic[] = {'\x93', 'N', 'U', 'M', 'P', 'Y'};
	char version[] = {'\x01', '\x00'};
	f.write(magic, sizeof(magic));
	f.write(version, sizeof(version));

	// Construct header
	std::stringstream header;
	header << "{'descr': '" << dtype << "', 'fortran_order': False, 'shape': (";

	for (int32_t i = 0; i < shape.nbDims; i++)
	{
	header << shape.d[i];
	header << ", ";
	}
	header << "), }";

	// Pad header to 16 bytes alignment
	std::string headerStr = header.str();
	int32_t headerLen = 10 + headerStr.length();
	int32_t padding = 16 - ((headerLen + 1) % 16);
	headerStr.append(padding, ' ');
	headerStr += '\n';

	// Write header length and header
	uint16_t headerSize = headerStr.length();
	f.write(reinterpret_cast<char*>(&headerSize), sizeof(uint16_t));
	f.write(headerStr.c_str(), headerSize);

	// Write data
	f.write(static_cast<char const*>(addr_host), size);
	f.close();

	return true;
	}

	std::string writeNumpy(nvinfer1::DataType type, void const* addr_host, int64_t volume, nvinfer1::Dims const& shape,
	std::string const& name, std::string const& prefix)
	{
	std::string fileName = prefix + name;
	std::string_view dtype = "";
	void const* data = addr_host;
	int64_t size = samplesCommon::getNbBytes(type, volume);
	std::vector<float> floatBuffer;
	std::vector<int8_t> int8Buffer;

	auto convertToFloat = [&](std::vector<float> const& buffer) {
	sample::gLogWarning << "Converting " << getDataTypeString(type) << " to float for numpy dump of tensor '"
	<< name << "'." << std::endl;
	dtype = "<f4";
	data = buffer.data();
	size = volume * sizeof(float);
	fileName += "_to_float";
	};

	auto convertToInt8 = [&](std::vector<int8_t> const& buffer) {
	sample::gLogWarning << "Converting " << getDataTypeString(type) << " to int8 for numpy dump of tensor '" << name
	<< "'." << std::endl;
	dtype = "<i1";
	data = buffer.data();
	size = volume * sizeof(int8_t);
	fileName += "_to_int8";
	};

	switch (type)
	{
	case nvinfer1::DataType::kBOOL: dtype = "\|b1"; break;
	case nvinfer1::DataType::kINT4:
	int8Buffer = convertBufferTo<int8_t>(reinterpret_cast<Int4x2 const*>(addr_host), volume);
	convertToInt8(int8Buffer);
	break;
	case nvinfer1::DataType::kINT8: dtype = "<i1"; break;
	case nvinfer1::DataType::kINT32: dtype = "<i4"; break;
	case nvinfer1::DataType::kINT64: dtype = "<i8"; break;
	case nvinfer1::DataType::kUINT8: dtype = "\|u1"; break;
	case nvinfer1::DataType::kFP4:
	#if CUDA_VERSION >= 12070
	floatBuffer = convertBufferTo<float>(static_cast<Fp4x2 const*>(addr_host), volume);
	convertToFloat(floatBuffer);
	#else
	sample::gLogWarning << "Unsupported data type kFP4 for tensor '" << name << "' numpy dump in this CUDA version."
	<< std::endl;
	return "";
	#endif
	break;
	case nvinfer1::DataType::kFP8:
	#if CUDA_VERSION >= 11060
	floatBuffer = convertBufferTo<float>(static_cast<__nv_fp8_e4m3 const*>(addr_host), volume);
	convertToFloat(floatBuffer);
	#else
	sample::gLogWarning << "Unsupported data type kFP8 for tensor '" << name << "' numpy dump in this CUDA version."
	<< std::endl;
	return "";
	#endif
	break;
	case nvinfer1::DataType::kE8M0:
	sample::gLogWarning << "Unsupported data type kE8M0 for tensor '" << name << "' numpy dump." << std::endl;
	return "";
	case nvinfer1::DataType::kHALF: dtype = "<f2"; break;
	case nvinfer1::DataType::kBF16:
	floatBuffer = convertBufferTo<float>(static_cast<nv_bfloat16 const*>(addr_host), volume);
	convertToFloat(floatBuffer);
	break;
	case nvinfer1::DataType::kFLOAT: dtype = "<f4"; break;
	}

	if (!dtype.empty())
	{

	fileName += ".npy";
	fileName = genFilenameSafeString(fileName);
	writeNumpyFile(data, dtype, shape, size, name, fileName);
	return fileName;
	}
	return "";
	}

	bool DebugTensorWriter::processDebugTensor(void const* addr, nvinfer1::TensorLocation location, nvinfer1::DataType type,
	nvinfer1::Dims const& shape, char const* name, cudaStream_t stream)
	{
	CHECK(cudaStreamSynchronize(stream));
	// Store data from callback.
	auto volume = std::accumulate(shape.d, shape.d + shape.nbDims, 1LL, std::multiplies<int64_t>{});
	int64_t size = samplesCommon::getNbBytes(type, volume);
	std::vector<char> hostDataOut;
	void const* addrHost = nullptr;
	if (location == nvinfer1::TensorLocation::kDEVICE)
	{
	hostDataOut.resize(size);
	CHECK(cudaMemcpy(hostDataOut.data(), addr, size, cudaMemcpyDeviceToHost));
	addrHost = hostDataOut.data();
	}
	else
	{
	addrHost = addr;
	}

	std::string assignedFileName;
	std::string numpyFileName;
	std::string rawFileName;
	std::string stringFileName;
	auto it = mDebugTensorFileNames.find(name);
	if (it != mDebugTensorFileNames.end())
	{
	assignedFileName = it->second;
	std::ofstream f(assignedFileName, std::ios::out \| std::ios::binary);
	ASSERT(f && "Cannot open file for write");
	sample::gLogVerbose << "Writing debug tensor '" << name << "' to file '" << assignedFileName << "'"
	<< std::endl;
	f.write(static_cast<char const*>(addrHost), size);
	f.close();
	}

	std::stringstream ss;
	ss << std::setw(4) << std::setfill('0') << mTensorIndex << "_";
	std::string prefix = ss.str();

	if (std::find(mDebugTensorFormats.begin(), mDebugTensorFormats.end(), "raw") != mDebugTensorFormats.end())
	{
	rawFileName = genFilenameSafeString(prefix + name + ".raw");
	sample::gLogVerbose << "Writing debug tensor '" << name << "' to raw file '" << rawFileName << "'" << std::endl;
	std::ofstream f(rawFileName, std::ios::out \| std::ios::binary);
	ASSERT(f && "Cannot open file for write");
	f.write(static_cast<char const*>(addrHost), size);
	f.close();
	}

	if (std::find(mDebugTensorFormats.begin(), mDebugTensorFormats.end(), "numpy") != mDebugTensorFormats.end())
	{
	numpyFileName = writeNumpy(type, addrHost, volume, shape, name, prefix);
	}

	if (std::find(mDebugTensorFormats.begin(), mDebugTensorFormats.end(), "string") != mDebugTensorFormats.end())
	{
	stringFileName = writeStringFile(addrHost, type, shape, name, prefix);
	}

	if (std::find(mDebugTensorFormats.begin(), mDebugTensorFormats.end(), "summary") != mDebugTensorFormats.end()
	&& mSummaryFile.is_open())
	{
	writeSummary(name, shape, type, volume, addrHost, assignedFileName, numpyFileName, stringFileName, rawFileName);
	mSummaryFile.flush();
	}

	mTensorIndex++;
	return true;
	}

	} // namespace sample