1231: g0plus dockerfile

38fb1f6 verified about 2 months ago

39.9 kB

	/*
	* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	//!
	//! sampleCharRNN.cpp
	//! This file contains the implementation of the char_rnn sample.
	//! It uses weights from a trained TensorFlow model and creates the network
	//! using the TensorRT network definition API
	//! It can be run with the following command line:
	//! Command: ./sample_char_rnn [-h or --help] [-d or --datadir=<path to data directory>]
	//!

	// Define TRT entrypoints used in common code
	#define DEFINE_TRT_ENTRYPOINTS 1

	#include <algorithm>
	#include <array>
	#include <cmath>
	#include <cstdint>
	#include <cstdio>
	#include <cstdlib>
	#include <cstring>
	#include <ctime>
	#include <fstream>
	#include <iostream>
	#include <map>
	#include <memory>
	#include <sstream>
	#include <string>
	#include <sys/stat.h>
	#include <unordered_set>
	#include <utility>
	#include <vector>

	#include "NvInfer.h"
	#include "argsParser.h"
	#include "buffers.h"
	#include "common.h"
	#include "cuda_runtime_api.h"
	#include "logger.h"
	#include "sampleEngines.h"
	using namespace nvinfer1;
	using samplesCommon::SampleUniquePtr;

	//! Name under which this sample identifies itself.
	const std::string gSampleName{"TensorRT.sample_char_rnn"};

	//! Layer indices 0..3 with static storage duration; the network builder takes the address of
	//! individual elements when it creates per-layer index constants.
	static const std::array<int, 4> INDICES = {0, 1, 2, 3};

	// The model used by this sample was trained using github repository:
	// https://github.com/crazydonkey200/tensorflow-char-rnn
	//
	// The data set used: tensorflow-char-rnn/data/tiny_shakespeare.txt
	//
	// The command used to train:
	// python train.py --data_file=data/tiny_shakespeare.txt --num_epochs=100 --num_layer=2 --hidden_size=512
	// --embedding_size=512 --dropout=.5
	//
	// Epochs trained: 100
	// Test perplexity: 4.940
	//
	// Layer0 and Layer1 weight matrices are added as RNNW_L0_NAME and RNNW_L1_NAME, respectively.
	// Layer0 and Layer1 biases are added as RNNB_L0_NAME and RNNB_L1_NAME, respectively.
	// The embedding matrix is added as EMBED_NAME.
	// fc_w is added as FCW_NAME.
	// fc_b is added as FCB_NAME.
	//!
	//! \brief Names of the tensors this sample extracts from the weight file.
	//!
	//! \details `names` starts out holding every name declared above it and is used for
	//! membership tests while the weight file is parsed.
	//!
	struct SampleCharRNNWeightNames
	{
	    // LSTM layer 0: kernel and bias.
	    const std::string RNNW_L0_NAME = "rnn_multi_rnn_cell_cell_0_basic_lstm_cell_kernel";
	    const std::string RNNB_L0_NAME = "rnn_multi_rnn_cell_cell_0_basic_lstm_cell_bias";
	    // LSTM layer 1: kernel and bias.
	    const std::string RNNW_L1_NAME = "rnn_multi_rnn_cell_cell_1_basic_lstm_cell_kernel";
	    const std::string RNNB_L1_NAME = "rnn_multi_rnn_cell_cell_1_basic_lstm_cell_bias";
	    // Fully connected (softmax) weights and bias.
	    const std::string FCW_NAME = "softmax_softmax_w";
	    const std::string FCB_NAME = "softmax_softmax_b";
	    // Character embedding table.
	    const std::string EMBED_NAME = "embedding";

	    // Must stay declared after the names above: it is initialized from them.
	    std::unordered_set<std::string> names{
	        RNNW_L0_NAME,
	        RNNB_L0_NAME,
	        RNNW_L1_NAME,
	        RNNB_L1_NAME,
	        FCW_NAME,
	        FCB_NAME,
	        EMBED_NAME,
	    };
	};

	//!
	//! \brief Names given to the network's input and output tensors.
	//!
	struct SampleCharRNNBindingNames
	{
	    const char* INPUT_BLOB_NAME = "data";
	    const char* HIDDEN_IN_BLOB_NAME = "hiddenIn";
	    const char* CELL_IN_BLOB_NAME = "cellIn";
	    const char* HIDDEN_OUT_BLOB_NAME = "hiddenOut";
	    const char* CELL_OUT_BLOB_NAME = "cellOut";
	    const char* OUTPUT_BLOB_NAME = "pred";
	    const char* SEQ_LEN_IN_BLOB_NAME = "seqLen";
	};

	//!
	//! \brief Character <-> index tables used by the tensorflow model.
	//!
	//! \details The two tables are inverses of each other: idToChar[charToID.at(c)] == c for
	//! every character listed.
	//!
	struct SampleCharRNNMaps
	{
	    // A mapping from character to index used by the tensorflow model.
	    const std::map<char, int> charToID = {
	        // whitespace, punctuation and the single digit
	        {'\n', 0}, {'!', 1}, {' ', 2}, {'$', 3}, {'\'', 4}, {'&', 5}, {'-', 6},
	        {',', 7}, {'.', 8}, {'3', 9}, {';', 10}, {':', 11}, {'?', 12},
	        // upper-case letters (adjacent pairs are swapped, as in the model's vocabulary)
	        {'A', 13}, {'C', 14}, {'B', 15}, {'E', 16}, {'D', 17}, {'G', 18}, {'F', 19},
	        {'I', 20}, {'H', 21}, {'K', 22}, {'J', 23}, {'M', 24}, {'L', 25}, {'O', 26},
	        {'N', 27}, {'Q', 28}, {'P', 29}, {'S', 30}, {'R', 31}, {'U', 32}, {'T', 33},
	        {'W', 34}, {'V', 35}, {'Y', 36}, {'X', 37}, {'Z', 38},
	        // lower-case letters
	        {'a', 39}, {'c', 40}, {'b', 41}, {'e', 42}, {'d', 43}, {'g', 44}, {'f', 45},
	        {'i', 46}, {'h', 47}, {'k', 48}, {'j', 49}, {'m', 50}, {'l', 51}, {'o', 52},
	        {'n', 53}, {'q', 54}, {'p', 55}, {'s', 56}, {'r', 57}, {'u', 58}, {'t', 59},
	        {'w', 60}, {'v', 61}, {'y', 62}, {'x', 63}, {'z', 64},
	    };

	    // A mapping from index to character used by the tensorflow model.
	    const std::vector<char> idToChar = {
	        // indices 0..12
	        '\n', '!', ' ', '$', '\'', '&', '-', ',', '.', '3', ';', ':', '?',
	        // indices 13..38
	        'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'J', 'M', 'L',
	        'O', 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z',
	        // indices 39..64
	        'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l',
	        'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z',
	    };
	};

	//!
	//! \brief Parameters of the char_rnn sample, on top of the common sample parameters.
	//!
	//! \details The integer members are zero-initialized so that a default-constructed
	//! instance never carries indeterminate values; callers are still expected to
	//! assign the real sizes before building the network.
	//!
	struct SampleCharRNNParams : samplesCommon::SampleParams
	{
	    int layerCount{0}; //!< Number of stacked LSTM layers (first dimension of the hidden/cell state tensors).
	    int hiddenSize{0}; //!< Width of each LSTM layer's hidden and cell state.
	    int seqSize{0};    //!< Sequence length of the data input (first dimension of the input tensor).
	    int dataSize{0};   //!< Width of one input vector fed to the first LSTM layer.
	    int vocabSize{0};  //!< Number of rows of the fully connected weights, i.e. of distinct characters.
	    int outputSize{0}; //!< NOTE(review): not read by the code visible in this file section - confirm use.
	    std::string weightFileName; //!< Path of the weight file handed to loadWeights().

	    std::string saveEngine; //!< If non-empty, path the built engine is saved to.
	    std::string loadEngine; //!< If non-empty, path an engine is loaded from instead of building one.

	    SampleCharRNNMaps charMaps;
	    SampleCharRNNWeightNames weightNames;
	    SampleCharRNNBindingNames bindingNames;

	    std::vector<std::string> inputSentences;  //!< Seed sentences; one is picked at random for inference.
	    std::vector<std::string> outputSentences; //!< Expected continuation for the seed with the same index.
	};

	//!
	//! \brief The SampleCharRNNBase class implements the char_rnn sample
	//!
	//! \details It uses weights from a trained TensorFlow model and creates
	//! the network using the TensorRT network definition API.
	//! The class is abstract: a derived class supplies the LSTM layers through
	//! addLSTMLayers(), while this class handles weight loading, the layers that
	//! follow the LSTM, engine creation and the inference loop.
	//!
	class SampleCharRNNBase
	{
	public:
	//!
	//! \brief Stores a copy of the sample parameters.
	//!
	SampleCharRNNBase(const SampleCharRNNParams& params)
	: mParams(params)
	{
	}

	// Virtual because the class is used as a polymorphic base.
	virtual ~SampleCharRNNBase() = default;

	//!
	//! \brief Builds the network engine
	//!
	//! \return true if an engine is available afterwards, false otherwise.
	//!
	bool build();

	//!
	//! \brief Runs the TensorRT inference engine for this sample
	//!
	//! \return true if the generated text matches the expected text, false otherwise.
	//!
	bool infer();

	//!
	//! \brief Used to clean up any state created in the sample class
	//!
	bool teardown();

	protected:
	//!
	//! \brief Add inputs to the TensorRT network and configure LSTM layers using network definition API.
	//!
	//! \return The layer whose first output carries the LSTM result consumed by the rest of the network.
	//!
	virtual nvinfer1::ILayer* addLSTMLayers(SampleUniquePtr<nvinfer1::INetworkDefinition>& network) = 0;

	//!
	//! \brief Converts RNN weights from TensorFlow's format to TensorRT's format.
	//!
	nvinfer1::Weights convertRNNWeights(nvinfer1::Weights input, int dataSize);

	//!
	//! \brief Converts RNN Biases from TensorFlow's format to TensorRT's format.
	//!
	nvinfer1::Weights convertRNNBias(nvinfer1::Weights input);

	//! Weights by name: filled by loadWeights() in build(); derived classes add converted weights.
	std::map<std::string, nvinfer1::Weights> mWeightMap;
	//! Owns the host allocations that the Weights in this sample point into.
	std::vector<std::unique_ptr<samplesCommon::HostMemory>> weightsMemory;
	//! Copy of the parameters the sample was constructed with.
	SampleCharRNNParams mParams;

	//!
	//! \brief Adds a shuffle layer that reshapes \p tensor to \p dims and returns its output tensor.
	//!
	nvinfer1::ITensor* addReshape(
	SampleUniquePtr<nvinfer1::INetworkDefinition>& network, nvinfer1::ITensor& tensor, nvinfer1::Dims dims);

	private:
	//!
	//! \brief Load requested weights from a formatted file into a map.
	//!
	std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

	//!
	//! \brief Create full model using the TensorRT network definition API and build the engine.
	//!
	void constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
	SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config);

	//!
	//! \brief Looks up the embedding tensor for a given char and copies it to input buffer
	//!
	void copyEmbeddingToInput(samplesCommon::BufferManager& buffers, char const& c);

	//!
	//! \brief Perform one time step of inference with the TensorRT execution context
	//!
	bool stepOnce(samplesCommon::BufferManager& buffers, SampleUniquePtr<nvinfer1::IExecutionContext>& context,
	cudaStream_t& stream);

	//!
	//! \brief Copies Ct/Ht output from the RNN to the Ct-1/Ht-1 input buffers for next time step
	//!
	void copyRNNOutputsToInputs(samplesCommon::BufferManager& buffers);

	//!
	//! \brief Transposes a sub-buffer of size height * width.
	//!
	bool transposeSubBuffers(void* data, int64_t height, int64_t width) noexcept;

	// NOTE(review): mRuntime is declared before mEngine, so the engine is destroyed first and the
	// runtime last; presumably intentional - confirm before reordering these members.
	std::shared_ptr<nvinfer1::IRuntime> mRuntime{nullptr}; //!< The TensorRT runtime used to run the network
	std::shared_ptr<nvinfer1::ICudaEngine> mEngine{nullptr}; //!< The TensorRT engine used to run the network
	};

	class SampleCharRNNLoop : public SampleCharRNNBase
	{
	public:
	struct LstmIO
	{
	nvinfer1::ITensor* data;
	nvinfer1::ITensor* hidden;
	nvinfer1::ITensor* cell;
	};

	struct LstmParams
	{
	nvinfer1::ITensor* inputWeights;
	nvinfer1::ITensor* recurrentWeights;
	nvinfer1::ITensor* inputBias;
	nvinfer1::ITensor* recurrentBias;
	nvinfer1::ITensor* maxSequenceSize;
	};

	SampleCharRNNLoop(SampleCharRNNParams params)
	: SampleCharRNNBase(params)
	{
	}

	protected:
	//!
	//! \brief Add inputs to the TensorRT network and configure LSTM layers using network definition API.
	//!
	nvinfer1::ILayer* addLSTMLayers(SampleUniquePtr<nvinfer1::INetworkDefinition>& network) final;

	private:
	nvinfer1::ILayer* addLSTMCell(SampleUniquePtr<nvinfer1::INetworkDefinition>& network, const LstmIO& inputTensors,
	nvinfer1::ITensor* sequenceSize, const LstmParams& params, LstmIO& outputTensors);
	};

	//!
	//! \brief Transpose a sub-buffer of size height * width.
	//!
	//! \details The buffer is interpreted as a row-major height x width matrix of floats and is
	//! overwritten with its row-major width x height transpose. The transpose is
	//! assembled in a scratch vector and then copied back, so \p data only needs
	//! to hold height * width floats.
	//!
	//! \param data The data to transpose. Serves as both input and output.
	//! \param height The size of the height dimension to transpose.
	//! \param width The size of the width dimension to transpose.
	//!
	//! \return True on success, false on failure.
	//!
	bool SampleCharRNNBase::transposeSubBuffers(void* data, int64_t height, int64_t width) noexcept
	{
	    try
	    {
	        ASSERT(data != nullptr);
	        ASSERT(height > 0);
	        ASSERT(width > 0);

	        auto* const matrix = static_cast<float*>(data);

	        // One float per matrix element. The previous temporary was sized in bytes while being
	        // declared as a float buffer, and the copy-back cast the pointer to a uint8_t value
	        // instead of a uint8_t pointer.
	        std::vector<float> transposed(static_cast<size_t>(height * width));

	        for (int64_t row = 0; row < height; ++row)
	        {
	            for (int64_t col = 0; col < width; ++col)
	            {
	                transposed[static_cast<size_t>(col * height + row)] = matrix[row * width + col];
	            }
	        }

	        std::copy(transposed.begin(), transposed.end(), matrix);
	    }
	    catch (...)
	    {
	        // Allocation failure (or anything else thrown above) is reported, not propagated.
	        return false;
	    }
	    return true;
	}

	//!
	//! \brief Creates the network, configures the builder and creates
	//! the network engine
	//!
	//! \details This function loads weights from a trained TensorFlow model,
	//! creates the network using the TensorRT network definition API,
	//! and builds a TensorRT engine.
	//!
	//! \return true if the engine was created successfully and false otherwise
	//!
	bool SampleCharRNNBase::build()
	{
	mWeightMap = SampleCharRNNBase::loadWeights(mParams.weightFileName);

	if (mParams.loadEngine.empty())
	{
	auto builder
	= SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
	if (!builder)
	{
	return false;
	}
	auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(0));
	if (!network)
	{
	return false;
	}
	auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
	if (!config)
	{
	return false;
	}

	config->setFlag(BuilderFlag::kGPU_FALLBACK);

	// CUDA stream used for profiling by the builder.
	auto profileStream = samplesCommon::makeCudaStream();
	if (!profileStream)
	{
	return false;
	}
	config->setProfileStream(*profileStream);

	constructNetwork(builder, network, config);
	}
	else
	{
	sample::gLogInfo << "Loading engine from: " << mParams.loadEngine << std::endl;
	mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
	sample::loadEngine(mParams.loadEngine, -1, std::cerr), samplesCommon::InferDeleter());
	}

	if (!mEngine)
	{
	return false;
	}

	if (!mParams.saveEngine.empty())
	{
	sample::gLogInfo << "Saving engine to: " << mParams.saveEngine << std::endl;
	sample::saveEngine(*mEngine, mParams.saveEngine, std::cerr);
	}

	return true;
	}

	//!
	//! \brief Load requested weights from a formatted file into a map.
	//!
	//! \param file Path to weights file. File has to be the formatted dump from
	//! the dumpTFWts.py script. Otherwise, this function will not work as
	//! intended.
	//!
	//! \return A map containing the extracted weights.
	//!
	//! \note Weight V2 files are in a very simple space delimited format.
	//! <number of buffers>
	//! for each buffer: [name] [type] [shape] <data as binary blob>\n
	//! Note: type is the integer value of the DataType enum in NvInfer.h.
	//!
	std::map<std::string, nvinfer1::Weights> SampleCharRNNBase::loadWeights(const std::string file)
	{
	std::map<std::string, nvinfer1::Weights> weightMap;

	std::ifstream input(file, std::ios_base::binary);
	ASSERT(input.is_open() && "Unable to load weight file.");

	int32_t count;
	input >> count;
	ASSERT(count > 0 && "Invalid weight map file.");

	while (count--)
	{
	if (mParams.weightNames.names.empty())
	{
	break;
	}

	nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};

	// parse name and DataType
	std::string name;
	uint32_t type;
	input >> name >> std::dec >> type;
	wt.type = static_cast<nvinfer1::DataType>(type);

	// extract shape
	std::string temp, shape;
	std::getline(std::getline(input, temp, '('), shape, ')');

	// calculate count based on shape
	wt.count = 1;
	std::istringstream shapeStream(shape);
	while (std::getline(shapeStream, temp, ','))
	wt.count *= std::stoul(temp);
	size_t numOfBytes = samplesCommon::getNbBytes(wt.type, wt.count);

	// skip reading of weights if name is not in the set of names requested for extraction
	if (mParams.weightNames.names.find(name) == mParams.weightNames.names.end())
	{
	input.seekg(input.tellg() + static_cast<std::streamoff>(2 + numOfBytes));
	continue;
	}
	else
	{
	mParams.weightNames.names.erase(name);
	}

	// Read weight values
	input.seekg(input.tellg() + static_cast<std::streamoff>(1)); // skip space char
	// We do not really care about the setup of DataType here. Use char here to avoid additional conversion
	auto mem = new samplesCommon::TypedHostMemory<char, nvinfer1::DataType::kINT8>(numOfBytes);
	weightsMemory.emplace_back(mem);
	auto wtVals = mem->raw();
	input.read(wtVals, numOfBytes);
	input.seekg(input.tellg() + static_cast<std::streamoff>(1)); // skip new-line char
	wt.values = wtVals;

	weightMap[name] = wt;
	}

	input.close();
	sample::gLogInfo << "Done reading weights from file..." << std::endl;
	return weightMap;
	}

	//!
	//! \brief Converts RNN weights from TensorFlow's format to TensorRT's format.
	//!
	//! \details The first (dataSize + hiddenSize) * 4 * hiddenSize floats of \p orig are copied
	//! into a new buffer owned by weightsMemory. The leading dataSize x 4*hiddenSize
	//! block (W) and the following hiddenSize x 4*hiddenSize block (R) are then each
	//! transposed in place.
	//!
	//! \param orig Weights that are stored in TensorFlow's format. Must hold at least
	//! (dataSize + hiddenSize) * 4 * hiddenSize float values.
	//! \param dataSize Number of rows of the input-weight block (W).
	//!
	//! \return Converted weights in TensorRT's format.
	//!
	//! \note TensorFlow weight parameters for BasicLSTMCell are formatted as:
	//! Each [WR][icfo] is hiddenSize sequential elements.
	//! CellN Row 0: WiT, WcT, WfT, WoT
	//! CellN Row 1: WiT, WcT, WfT, WoT
	//! ...
	//! CellN RowM-1: WiT, WcT, WfT, WoT
	//! CellN RowM+0: RiT, RcT, RfT, RoT
	//! CellN RowM+1: RiT, RcT, RfT, RoT
	//! ...
	//! CellNRow2M-1: RiT, RcT, RfT, RoT
	//!
	//! TensorRT expects the format to be laid out in memory:
	//! CellN: Wi, Wc, Wf, Wo, Ri, Rc, Rf, Ro
	//!
	nvinfer1::Weights SampleCharRNNBase::convertRNNWeights(nvinfer1::Weights orig, int dataSize)
	{
	    ASSERT(dataSize > 0);
	    ASSERT(mParams.hiddenSize > 0);

	    // All sizes are computed in 64 bits so that large layers cannot overflow int arithmetic.
	    int64_t const gateColumns = int64_t{4} * mParams.hiddenSize;
	    int64_t const dimsW[2]{dataSize, gateColumns};
	    int64_t const dimsR[2]{mParams.hiddenSize, gateColumns};
	    int64_t const elementCount = dimsW[0] * dimsW[1] + dimsR[0] * dimsR[1];

	    // A name missing from the weight map yields an empty Weights; fail here instead of
	    // reading through a null pointer or past the end of the source buffer.
	    ASSERT(orig.values != nullptr);
	    ASSERT(orig.count >= elementCount);

	    std::unique_ptr<samplesCommon::FloatMemory> mem{new samplesCommon::FloatMemory(elementCount)};
	    auto* const ptr = mem->raw();
	    weightsMemory.emplace_back(std::move(mem));

	    float const* const data = static_cast<float const*>(orig.values);
	    std::copy(data, data + elementCount, ptr);

	    // W block first, then the R block that follows it directly.
	    ASSERT(transposeSubBuffers(ptr, dimsW[0], dimsW[1]));
	    ASSERT(transposeSubBuffers(&ptr[dimsW[0] * dimsW[1]], dimsR[0], dimsR[1]));

	    return nvinfer1::Weights{orig.type, ptr, elementCount};
	}

	//!
	//! \brief Converts RNN Biases from TensorFlow's format to TensorRT's format.
	//!
	//! \details Allocates a buffer twice the size of the input (owned by weightsMemory), copies
	//! the 4 * hiddenSize input biases into its first half and zero-fills the second half.
	//!
	//! \param input Biases that are stored in TensorFlow's format.
	//!
	//! \return Converted bias in TensorRT's format.
	//!
	//! \note TensorFlow bias parameters for BasicLSTMCell are formatted as:
	//! CellN: Bi, Bc, Bf, Bo
	//!
	//! TensorRT expects the format to be:
	//! CellN: Wi, Wc, Wf, Wo, Ri, Rc, Rf, Ro
	//!
	//! Since tensorflow already combines U and W,
	//! we double the size and set all of U to zero.
	nvinfer1::Weights SampleCharRNNBase::convertRNNBias(nvinfer1::Weights input)
	{
	    auto* const storage = new samplesCommon::FloatMemory(input.count * 2);
	    weightsMemory.emplace_back(storage);
	    auto* const converted = storage->raw();

	    int64_t count = 4 * mParams.hiddenSize;
	    ASSERT(input.count == count);

	    // First half: the biases as given. std::copy returns the start of the second half.
	    auto const* const source = static_cast<const float*>(input.values);
	    float* const secondHalf = std::copy(source, source + count, converted);

	    // Second half: zeros.
	    std::fill_n(secondHalf, count, 0.0);

	    return nvinfer1::Weights{input.type, converted, input.count * 2};
	}

	//!
	//! \brief Adds one LSTM layer, expressed as a TensorRT loop over the sequence, to the network.
	//!
	//! \details Per iteration the loop computes
	//!   gates = x * W^T + h * R^T + (bW + bR)
	//! slices the gates into four hiddenSize-wide parts i, c, f, o (in that order), and forms
	//!   C = sigmoid(f) * C_prev + sigmoid(i) * tanh(c)
	//!   H = sigmoid(o) * tanh(C)
	//! C and H feed the cell/hidden recurrences. The TensorRT layer factories take their tensor
	//! arguments by reference, so every ITensor pointer is dereferenced at the call site.
	//!
	//! \param network The network being built.
	//! \param inputTensors Data, hidden state and cell state entering the layer.
	//! \param sequenceSize Tensor holding the loop trip count.
	//! \param params Weights, biases and maximum sequence length of the layer.
	//! \param outputTensors Receives the concatenated per-step H plus the last hidden and cell values.
	//!
	//! \return The loop output layer that concatenates H over all iterations.
	//!
	nvinfer1::ILayer* SampleCharRNNLoop::addLSTMCell(SampleUniquePtr<nvinfer1::INetworkDefinition>& network,
	    const LstmIO& inputTensors, nvinfer1::ITensor* sequenceSize, const LstmParams& params, LstmIO& outputTensors)
	{
	    ASSERT(sequenceSize != nullptr);
	    ASSERT(inputTensors.data != nullptr && inputTensors.hidden != nullptr && inputTensors.cell != nullptr);
	    ASSERT(params.inputWeights != nullptr && params.recurrentWeights != nullptr);
	    ASSERT(params.inputBias != nullptr && params.recurrentBias != nullptr);
	    ASSERT(params.maxSequenceSize != nullptr);

	    // Small wrappers so that each layer is created in a fixed order and checked once.
	    auto const elementWise = [&network](nvinfer1::ITensor& lhs, nvinfer1::ITensor& rhs,
	                                 nvinfer1::ElementWiseOperation op) -> nvinfer1::ITensor* {
	        nvinfer1::IElementWiseLayer* layer = network->addElementWise(lhs, rhs, op);
	        ASSERT(layer != nullptr);
	        return layer->getOutput(0);
	    };
	    auto const activation = [&network](nvinfer1::ITensor& in, nvinfer1::ActivationType type) -> nvinfer1::ITensor* {
	        nvinfer1::IActivationLayer* layer = network->addActivation(in, type);
	        ASSERT(layer != nullptr);
	        return layer->getOutput(0);
	    };
	    auto const matVec = [&network](nvinfer1::ITensor& vec, nvinfer1::ITensor& weights) -> nvinfer1::ITensor* {
	        nvinfer1::IMatrixMultiplyLayer* layer = network->addMatrixMultiply(
	            vec, nvinfer1::MatrixOperation::kVECTOR, weights, nvinfer1::MatrixOperation::kTRANSPOSE);
	        ASSERT(layer != nullptr);
	        return layer->getOutput(0);
	    };

	    nvinfer1::ILoop* sequenceLoop = network->addLoop();
	    ASSERT(sequenceLoop != nullptr);
	    sequenceLoop->addTripLimit(*sequenceSize, nvinfer1::TripLimit::kCOUNT);

	    nvinfer1::ITensor* input = sequenceLoop->addIterator(*inputTensors.data)->getOutput(0);
	    nvinfer1::IRecurrenceLayer* hidden = sequenceLoop->addRecurrence(*inputTensors.hidden);
	    nvinfer1::IRecurrenceLayer* cell = sequenceLoop->addRecurrence(*inputTensors.cell);
	    ASSERT(input != nullptr && hidden != nullptr && cell != nullptr);

	    // gates = x * W^T + h * R^T + (bW + bR)
	    nvinfer1::ITensor* mmInput = matVec(*input, *params.inputWeights);
	    nvinfer1::ITensor* mmHidden = matVec(*hidden->getOutput(0), *params.recurrentWeights);
	    nvinfer1::ITensor* mm = elementWise(*mmInput, *mmHidden, nvinfer1::ElementWiseOperation::kSUM);
	    nvinfer1::ITensor* bias
	        = elementWise(*params.inputBias, *params.recurrentBias, nvinfer1::ElementWiseOperation::kSUM);
	    nvinfer1::ITensor* gatesICFO = elementWise(*mm, *bias, nvinfer1::ElementWiseOperation::kSUM);

	    // Cuts the hiddenSize-wide slice number gateIndex out of the fused gate vector.
	    const auto isolateGate = [&](nvinfer1::ITensor& gates, int gateIndex) -> nvinfer1::ITensor* {
	        nvinfer1::ISliceLayer* slice = network->addSlice(gates, nvinfer1::Dims{1, {gateIndex * mParams.hiddenSize}},
	            nvinfer1::Dims{1, {mParams.hiddenSize}}, nvinfer1::Dims{1, {1}});
	        ASSERT(slice != nullptr);
	        return addReshape(network, *slice->getOutput(0), nvinfer1::Dims{1, {mParams.hiddenSize}});
	    };

	    nvinfer1::ITensor* i = activation(*isolateGate(*gatesICFO, 0), nvinfer1::ActivationType::kSIGMOID);
	    nvinfer1::ITensor* c = activation(*isolateGate(*gatesICFO, 1), nvinfer1::ActivationType::kTANH);
	    nvinfer1::ITensor* f = activation(*isolateGate(*gatesICFO, 2), nvinfer1::ActivationType::kSIGMOID);
	    nvinfer1::ITensor* o = activation(*isolateGate(*gatesICFO, 3), nvinfer1::ActivationType::kSIGMOID);

	    // C = f * C_prev + i * c. Named intermediates fix the layer creation order, which nested
	    // call arguments would leave to the compiler.
	    nvinfer1::ITensor* keptCell = elementWise(*f, *cell->getOutput(0), nvinfer1::ElementWiseOperation::kPROD);
	    nvinfer1::ITensor* newCell = elementWise(*i, *c, nvinfer1::ElementWiseOperation::kPROD);
	    nvinfer1::ITensor* C = elementWise(*keptCell, *newCell, nvinfer1::ElementWiseOperation::kSUM);

	    // H = o * tanh(C)
	    nvinfer1::ITensor* squashedCell = activation(*C, nvinfer1::ActivationType::kTANH);
	    nvinfer1::ITensor* H = elementWise(*o, *squashedCell, nvinfer1::ElementWiseOperation::kPROD);

	    // Recurrent backedge input for hidden and cell.
	    cell->setInput(1, *C);
	    hidden->setInput(1, *H);

	    nvinfer1::ILoopOutputLayer* outputLayer = sequenceLoop->addLoopOutput(*H, nvinfer1::LoopOutput::kCONCATENATE);
	    ASSERT(outputLayer != nullptr);
	    outputLayer->setInput(1, *params.maxSequenceSize);
	    nvinfer1::ITensor* hiddenOut
	        = sequenceLoop->addLoopOutput(*hidden->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0);
	    nvinfer1::ITensor* cellOut
	        = sequenceLoop->addLoopOutput(*cell->getOutput(0), nvinfer1::LoopOutput::kLAST_VALUE)->getOutput(0);

	    outputTensors = LstmIO{outputLayer->getOutput(0), hiddenOut, cellOut};
	    return outputLayer;
	}

	//!
	//! \brief Adds a shuffle layer that reshapes a tensor.
	//!
	//! \param network The network the layer is added to.
	//! \param tensor The tensor to reshape.
	//! \param dims The reshape dimensions set on the shuffle layer.
	//!
	//! \return The first output of the new shuffle layer.
	//!
	nvinfer1::ITensor* SampleCharRNNBase::addReshape(
	    SampleUniquePtr<nvinfer1::INetworkDefinition>& network, nvinfer1::ITensor& tensor, nvinfer1::Dims dims)
	{
	    auto* const reshapeLayer = network->addShuffle(tensor);
	    reshapeLayer->setReshapeDimensions(dims);

	    nvinfer1::ITensor* const reshaped = reshapeLayer->getOutput(0);
	    return reshaped;
	}

	//!
	//! \brief Add inputs to the TensorRT network and configure LSTM layers using network definition API.
	//!
	//! \details Declares the data, hidden-state, cell-state and sequence-length inputs, converts the
	//! TensorFlow LSTM weights and biases, and chains mParams.layerCount LSTM layers. The
	//! per-layer final hidden and cell states are concatenated along axis 0 and marked as
	//! network outputs.
	//!
	//! \param network The network being built.
	//!
	//! \return The loop output layer of the last LSTM layer.
	//!
	nvinfer1::ILayer* SampleCharRNNLoop::addLSTMLayers(SampleUniquePtr<nvinfer1::INetworkDefinition>& network)
	{
	    nvinfer1::ILayer* dataOut{nullptr};

	    nvinfer1::ITensor* data = network->addInput(mParams.bindingNames.INPUT_BLOB_NAME, nvinfer1::DataType::kFLOAT,
	        nvinfer1::Dims2(mParams.seqSize, mParams.dataSize));
	    ASSERT(data != nullptr);

	    nvinfer1::ITensor* hiddenLayers = network->addInput(mParams.bindingNames.HIDDEN_IN_BLOB_NAME,
	        nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize));
	    ASSERT(hiddenLayers != nullptr);

	    nvinfer1::ITensor* cellLayers = network->addInput(mParams.bindingNames.CELL_IN_BLOB_NAME,
	        nvinfer1::DataType::kFLOAT, nvinfer1::Dims2(mParams.layerCount, mParams.hiddenSize));
	    ASSERT(cellLayers != nullptr);

	    nvinfer1::ITensor* sequenceSize
	        = network->addInput(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME, nvinfer1::DataType::kINT32, nvinfer1::Dims{});
	    ASSERT(sequenceSize != nullptr);

	    // convert tensorflow weight format to trt weight format
	    std::array<nvinfer1::Weights, 2> rnnw{
	        SampleCharRNNBase::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L0_NAME], mParams.dataSize),
	        SampleCharRNNBase::convertRNNWeights(mWeightMap[mParams.weightNames.RNNW_L1_NAME], mParams.hiddenSize)};
	    std::array<nvinfer1::Weights, 2> rnnb{
	        SampleCharRNNBase::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L0_NAME]),
	        SampleCharRNNBase::convertRNNBias(mWeightMap[mParams.weightNames.RNNB_L1_NAME])};

	    // Store the transformed weights in the weight map so the memory can be properly released later.
	    mWeightMap["rnnwL0"] = rnnw[0];
	    mWeightMap["rnnwL1"] = rnnw[1];
	    mWeightMap["rnnbL0"] = rnnb[0];
	    mWeightMap["rnnbL1"] = rnnb[1];

	    nvinfer1::ITensor* maxSequenceSize
	        = network->addConstant(nvinfer1::Dims{}, Weights{DataType::kINT32, &mParams.seqSize, 1})->getOutput(0);

	    // The loop below indexes INDICES, rnnw and rnnb by layer, so the layer count must fit all
	    // three; with zero layers there would be nothing to return or concatenate.
	    ASSERT(mParams.layerCount > 0);
	    ASSERT(static_cast<size_t>(mParams.layerCount) <= INDICES.size());
	    ASSERT(static_cast<size_t>(mParams.layerCount) <= rnnw.size());
	    ASSERT(static_cast<size_t>(mParams.layerCount) <= rnnb.size());

	    LstmIO lstmNext{data, nullptr, nullptr};
	    std::vector<nvinfer1::ITensor*> hiddenOutputs;
	    std::vector<nvinfer1::ITensor*> cellOutputs;
	    nvinfer1::Dims2 dimWL0(4 * mParams.hiddenSize, mParams.dataSize);
	    nvinfer1::Dims2 dimR(4 * mParams.hiddenSize, mParams.hiddenSize);
	    nvinfer1::Dims dimB{1, {4 * mParams.hiddenSize}};
	    nvinfer1::Dims dim0{1, {0}};

	    // Returns a view of volume(size) elements of `weights`, starting volume(start) elements in.
	    auto extractWeights = [](nvinfer1::Weights weights, Dims start, Dims size) -> nvinfer1::Weights {
	        const char* data = static_cast<const char*>(weights.values);
	        int64_t const shift = samplesCommon::volume(start);
	        // Kept in 64 bits: the byte offset of a large weight block need not fit in an int.
	        int64_t const byteOffset = samplesCommon::getNbBytes(weights.type, shift);
	        int64_t const count = samplesCommon::volume(size);
	        ASSERT(shift + count <= weights.count);
	        return nvinfer1::Weights{weights.type, data + byteOffset, count};
	    };

	    for (int i = 0; i < mParams.layerCount; ++i)
	    {
	        // Only the first layer consumes dataSize-wide input; later layers consume hidden states.
	        nvinfer1::Dims dimW = i == 0 ? dimWL0 : dimR;
	        nvinfer1::ITensor* index
	            = network->addConstant(nvinfer1::Dims{}, Weights{DataType::kINT32, &INDICES[i], 1})->getOutput(0);
	        ASSERT(index != nullptr);
	        // addGather takes both tensors by reference.
	        nvinfer1::ITensor* hidden = network->addGather(*hiddenLayers, *index, 0)->getOutput(0);
	        nvinfer1::ITensor* cell = network->addGather(*cellLayers, *index, 0)->getOutput(0);
	        nvinfer1::ITensor* weightIn = network->addConstant(dimW, extractWeights(rnnw[i], dim0, dimW))->getOutput(0);
	        nvinfer1::ITensor* weightRec = network->addConstant(dimR, extractWeights(rnnw[i], dimW, dimR))->getOutput(0);
	        nvinfer1::ITensor* biasIn = network->addConstant(dimB, extractWeights(rnnb[i], dim0, dimB))->getOutput(0);
	        nvinfer1::ITensor* biasRec = network->addConstant(dimB, extractWeights(rnnb[i], dimB, dimB))->getOutput(0);
	        LstmIO lstmInput{lstmNext.data, hidden, cell};
	        LstmParams params{weightIn, weightRec, biasIn, biasRec, maxSequenceSize};

	        Dims2 dims{1, mParams.hiddenSize};
	        dataOut = addLSTMCell(network, lstmInput, sequenceSize, params, lstmNext);
	        hiddenOutputs.push_back(addReshape(network, *lstmNext.hidden, dims));
	        cellOutputs.push_back(addReshape(network, *lstmNext.cell, dims));
	    }

	    // Concatenates the given tensors along axis 0. The network API expects an array of tensor
	    // pointers, so the vector holds ITensor* and is passed without copying.
	    auto addConcatenation = [&network](std::vector<nvinfer1::ITensor*> const& tensors) -> nvinfer1::ITensor* {
	        nvinfer1::IConcatenationLayer* concat
	            = network->addConcatenation(tensors.data(), static_cast<int32_t>(tensors.size()));
	        ASSERT(concat != nullptr);
	        concat->setAxis(0);
	        return concat->getOutput(0);
	    };

	    nvinfer1::ITensor* hiddenNext = addConcatenation(hiddenOutputs);
	    hiddenNext->setName(mParams.bindingNames.HIDDEN_OUT_BLOB_NAME);
	    network->markOutput(*hiddenNext);

	    nvinfer1::ITensor* cellNext = addConcatenation(cellOutputs);
	    cellNext->setName(mParams.bindingNames.CELL_OUT_BLOB_NAME);
	    network->markOutput(*cellNext);

	    return dataOut;
	}

	//!
	//! \brief Create full model using the TensorRT network definition API and build the engine.
	//!
	//! \details Adds the LSTM layers, a matrix multiply with the (transposed) fully connected
	//! weights, a bias add and a TopK layer whose index output is the network's prediction.
	//! The network is then built into a serialized plan and deserialized into mEngine.
	//! The function returns nothing: on a build or deserialization failure mEngine is simply
	//! left unset, which the caller must check.
	//!
	//! \param builder The builder used to build the serialized network.
	//! \param network The (empty) network definition to populate.
	//! \param config The builder configuration to build with.
	//!
	void SampleCharRNNBase::constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
	    SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config)
	{
	    // add the LSTM layers and set their parameters
	    auto rnn = addLSTMLayers(network);
	    ASSERT(rnn != nullptr);

	    // Transpose FC weights since TensorFlow's weights are transposed when compared to TensorRT.
	    // The transpose happens in place, hence the const_cast on the stored weight pointer.
	    ASSERT(transposeSubBuffers(const_cast<void*>(mWeightMap[mParams.weightNames.FCW_NAME].values),
	        mParams.hiddenSize, mParams.vocabSize));

	    // add Constant layers for fully connected weights
	    auto fcwts = network->addConstant(
	        nvinfer1::Dims2(mParams.vocabSize, mParams.hiddenSize), mWeightMap[mParams.weightNames.FCW_NAME]);
	    ASSERT(fcwts != nullptr);

	    // Add matrix multiplication layer for multiplying rnn output with FC weights.
	    // addMatrixMultiply takes its operands by reference.
	    auto matrixMultLayer = network->addMatrixMultiply(
	        *fcwts->getOutput(0), MatrixOperation::kNONE, *rnn->getOutput(0), MatrixOperation::kTRANSPOSE);
	    ASSERT(matrixMultLayer != nullptr);
	    matrixMultLayer->getOutput(0)->setName("Matrix Multiplicaton output");

	    // Add elementwise layer for adding bias
	    auto fcbias = network->addConstant(nvinfer1::Dims2(mParams.vocabSize, 1), mWeightMap[mParams.weightNames.FCB_NAME]);
	    ASSERT(fcbias != nullptr);
	    auto addBiasLayer = network->addElementWise(
	        *matrixMultLayer->getOutput(0), *fcbias->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
	    ASSERT(addBiasLayer != nullptr);
	    addBiasLayer->getOutput(0)->setName("Add Bias output");

	    // Add TopK layer to determine which character has highest probability.
	    int reduceAxis = 0x1; // reduce across vocab axis
	    auto pred = network->addTopK(*addBiasLayer->getOutput(0), nvinfer1::TopKOperation::kMAX, 1, reduceAxis);
	    ASSERT(pred != nullptr);
	    // Output 1 of the TopK layer is used as the prediction (the winning index).
	    pred->getOutput(1)->setName(mParams.bindingNames.OUTPUT_BLOB_NAME);

	    // Mark the outputs for the network
	    network->markOutput(*pred->getOutput(1));
	    pred->getOutput(1)->setType(nvinfer1::DataType::kINT32);

	    SampleUniquePtr<nvinfer1::ITimingCache> timingCache{};
	    if (!mParams.timingCacheFile.empty())
	    {
	        timingCache
	            = samplesCommon::buildTimingCacheFromFile(sample::gLogger.getTRTLogger(), *config, mParams.timingCacheFile);
	    }

	    sample::gLogInfo << "Done constructing network..." << std::endl;

	    // buildSerializedNetwork takes the network and the config by reference.
	    SampleUniquePtr<IHostMemory> plan{builder->buildSerializedNetwork(*network, *config)};
	    if (!plan)
	    {
	        return;
	    }

	    if (timingCache != nullptr && !mParams.timingCacheFile.empty())
	    {
	        samplesCommon::updateTimingCacheFile(
	            sample::gLogger.getTRTLogger(), mParams.timingCacheFile, timingCache.get(), *builder);
	    }

	    mRuntime = std::shared_ptr<nvinfer1::IRuntime>(createInferRuntime(sample::gLogger.getTRTLogger()));
	    if (!mRuntime)
	    {
	        return;
	    }

	    mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
	        mRuntime->deserializeCudaEngine(plan->data(), plan->size()), samplesCommon::InferDeleter());
	}

	//!
	//! \brief Runs the TensorRT inference engine for this sample
	//!
	//! \details This function is the main execution function of the sample. It
	//! allocates the buffer, sets inputs, executes the engine, and verifies the output.
	//! A randomly chosen input sentence is fed character by character to warm up the
	//! recurrent state; the network then generates as many characters as the expected
	//! sentence holds, each prediction becoming the next input.
	//!
	//! \return true if the warm-up sentence followed by the generated characters equals the
	//! warm-up sentence followed by the expected sentence; false otherwise, or if the
	//! sentences are missing, the execution context cannot be created, or a step fails.
	//!
	bool SampleCharRNNBase::infer()
	{
	    // rand() % 0 is undefined and the expected sentence is looked up with the same index,
	    // so both lists have to cover every selectable index.
	    if (mParams.inputSentences.empty() || mParams.outputSentences.size() < mParams.inputSentences.size())
	    {
	        sample::gLogError << "No input sentences, or fewer expected sentences than input sentences." << std::endl;
	        return false;
	    }
	    if (!mEngine)
	    {
	        return false;
	    }

	    // Create RAII buffer manager object
	    samplesCommon::BufferManager buffers(mEngine, 0);

	    auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());

	    if (!context)
	    {
	        return false;
	    }

	    // Select a random seed string.
	    srand(unsigned(time(nullptr)));
	    size_t const sentenceIndex = static_cast<size_t>(rand()) % mParams.inputSentences.size();
	    std::string const& inputSentence = mParams.inputSentences[sentenceIndex];
	    std::string const& expected = mParams.outputSentences[sentenceIndex];
	    std::string genstr;

	    sample::gLogInfo << "RNN warmup sentence: " << inputSentence << std::endl;
	    sample::gLogInfo << "Expected output: " << expected << std::endl;

	    // create stream for trt execution
	    cudaStream_t stream;
	    CHECK(cudaStreamCreate(&stream));

	    // Set sequence lengths to maximum
	    int* sequenceLengthIn
	        = reinterpret_cast<int32_t*>(buffers.getHostBuffer(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME));
	    auto sequenceLengthTensorSize = buffers.size(mParams.bindingNames.SEQ_LEN_IN_BLOB_NAME);
	    std::fill_n(sequenceLengthIn, sequenceLengthTensorSize / sizeof(mParams.seqSize), mParams.seqSize);

	    // Initialize hiddenIn and cellIn tensors to zero before seeding
	    void* hiddenIn = buffers.getHostBuffer(mParams.bindingNames.HIDDEN_IN_BLOB_NAME);
	    auto hiddenTensorSize = buffers.size(mParams.bindingNames.HIDDEN_IN_BLOB_NAME);

	    void* cellIn = buffers.getHostBuffer(mParams.bindingNames.CELL_IN_BLOB_NAME);
	    auto cellTensorSize = buffers.size(mParams.bindingNames.CELL_IN_BLOB_NAME);

	    std::memset(hiddenIn, 0, hiddenTensorSize);
	    std::memset(cellIn, 0, cellTensorSize);

	    // Reads the predicted character index out of the host output buffer. The buffer holds the
	    // index, so the pointer is dereferenced; converting the pointer itself to an integer would
	    // yield an address, not a prediction.
	    auto const readPrediction = [&]() -> uint32_t {
	        return *static_cast<uint32_t const*>(buffers.getHostBuffer(mParams.bindingNames.OUTPUT_BLOB_NAME));
	    };

	    // Feeds one character through the network and recycles the recurrent state.
	    auto const feed = [&](char const& ch) -> bool {
	        SampleCharRNNBase::copyEmbeddingToInput(buffers, ch);
	        if (!SampleCharRNNBase::stepOnce(buffers, context, stream))
	        {
	            return false;
	        }
	        SampleCharRNNBase::copyRNNOutputsToInputs(buffers);
	        return true;
	    };

	    // All stepping happens inside this lambda so that the stream is destroyed on every
	    // return path below, including failed steps.
	    bool const stepsSucceeded = [&]() -> bool {
	        // Seed the RNN with the input sentence.
	        for (auto const& a : inputSentence)
	        {
	            if (!feed(a))
	            {
	                return false;
	            }
	            genstr.push_back(a);
	        }

	        // Extract first predicted character
	        genstr.push_back(mParams.charMaps.idToChar.at(readPrediction()));

	        // Generate predicted sequence of characters. One character was already produced above;
	        // an empty expected sentence needs no further steps (size() - 1 would wrap around).
	        size_t const remaining = expected.empty() ? 0 : expected.size() - 1;
	        for (size_t x = 0; x < remaining; x++)
	        {
	            if (!feed(genstr.back()))
	            {
	                return false;
	            }
	            genstr.push_back(mParams.charMaps.idToChar.at(readPrediction()));
	        }
	        return true;
	    }();

	    // release the stream
	    CHECK(cudaStreamDestroy(stream));

	    if (!stepsSucceeded)
	    {
	        return false;
	    }

	    sample::gLogInfo << "Received: " << genstr.substr(inputSentence.size()) << std::endl;

	    return genstr == (inputSentence + expected);
	}

	//!
	//! \brief Looks up the embedding tensor for a given char and copies it to input buffer
	//!
	//! \details The embedding weights are treated as a table of floats with mParams.dataSize values
	//! per character; the row for \p c starts at charToID(c) * dataSize. As many bytes as
	//! the input tensor's host buffer holds are copied from that position.
	//!
	//! \param buffers Buffer manager that owns the host input buffer.
	//! \param c The character to look up. Characters missing from the character map make
	//! std::map::at throw std::out_of_range.
	//!
	void SampleCharRNNBase::copyEmbeddingToInput(samplesCommon::BufferManager& buffers, char const& c)
	{
	    auto const& embed = mWeightMap[mParams.weightNames.EMBED_NAME];
	    float* inputBuffer = static_cast<float*>(buffers.getHostBuffer(mParams.bindingNames.INPUT_BLOB_NAME));
	    auto const index = mParams.charMaps.charToID.at(c);
	    auto const bufSize = buffers.size(mParams.bindingNames.INPUT_BLOB_NAME);

	    // Pointer to the first float of the embedding table; the row offset is counted in floats.
	    auto const* const table = static_cast<float const*>(embed.values);
	    ASSERT(table != nullptr);
	    ASSERT(inputBuffer != nullptr);

	    std::memcpy(inputBuffer, table + static_cast<int64_t>(index) * mParams.dataSize, bufSize);
	}

	//!
	//! \brief Perform one time step of inference with the TensorRT execution context
	//!
	//! \details Uploads the host inputs, binds every I/O tensor of the engine to its device buffer,
	//! enqueues the inference, downloads the outputs and waits for the stream to finish.
	//!
	//! \param buffers Buffer manager holding the host and device buffers.
	//! \param context Execution context to run.
	//! \param stream CUDA stream all work is issued on.
	//!
	//! \return true once the stream has been synchronized.
	//!
	bool SampleCharRNNBase::stepOnce(
	    samplesCommon::BufferManager& buffers, SampleUniquePtr<nvinfer1::IExecutionContext>& context, cudaStream_t& stream)
	{
	    // Host -> device, asynchronously.
	    buffers.copyInputToDeviceAsync(stream);

	    // Bind each engine I/O tensor to the device buffer of the same name.
	    int32_t const tensorCount = mEngine->getNbIOTensors();
	    int32_t tensorIdx = 0;
	    while (tensorIdx < tensorCount)
	    {
	        auto const tensorName = mEngine->getIOTensorName(tensorIdx);
	        auto* const deviceAddress = buffers.getDeviceBuffer(tensorName);
	        context->setTensorAddress(tensorName, deviceAddress);
	        ++tensorIdx;
	    }

	    // Inference itself, asynchronously.
	    ASSERT(context->enqueueV3(stream));

	    // Device -> host, asynchronously, then wait for everything queued above.
	    buffers.copyOutputToHostAsync(stream);
	    CHECK(cudaStreamSynchronize(stream));

	    return true;
	}

	//!
	//! \brief Feeds the Ct/Ht outputs of the RNN back into the Ct-1/Ht-1 input buffers for the next time step
	//!
	//! \param buffers Buffer manager holding the host buffers of the hidden/cell state tensors
	//!
	void SampleCharRNNBase::copyRNNOutputsToInputs(samplesCommon::BufferManager& buffers)
	{
	    auto const& names = mParams.bindingNames;

	    // Hidden state: Ht -> Ht-1. The copy length is taken from the input-side tensor.
	    auto const hiddenCopySize = buffers.size(names.HIDDEN_IN_BLOB_NAME);
	    void* const hiddenDst = buffers.getHostBuffer(names.HIDDEN_IN_BLOB_NAME);
	    void* const hiddenSrc = buffers.getHostBuffer(names.HIDDEN_OUT_BLOB_NAME);

	    // Cell state: Ct -> Ct-1, sized the same way.
	    auto const cellCopySize = buffers.size(names.CELL_IN_BLOB_NAME);
	    void* const cellDst = buffers.getHostBuffer(names.CELL_IN_BLOB_NAME);
	    void* const cellSrc = buffers.getHostBuffer(names.CELL_OUT_BLOB_NAME);

	    std::memcpy(hiddenDst, hiddenSrc, hiddenCopySize);
	    std::memcpy(cellDst, cellSrc, cellCopySize);
	}

	//!
	//! \brief Hook for cleaning up any state created in the sample class
	//!
	//! \return Always true; the body performs no cleanup work.
	//!
	bool SampleCharRNNBase::teardown()
	{
	    bool const tornDown{true};
	    return tornDown;
	}

	//!
	//! \brief Builds the params struct for this sample from the parsed command line args
	//!
	//! \param args Parsed command line arguments (data directories, engine paths, timing cache path)
	//!
	//! \return Params filled with the hard-coded sizes, the located weight file and the
	//!         input sentences together with their expected output sentences
	//!
	SampleCharRNNParams initializeSampleParams(const samplesCommon::Args& args)
	{
	    // An input string and the output string expected for it
	    struct SentenceCase
	    {
	        char const* input;
	        char const* expected;
	    };
	    SentenceCase const sentenceCases[] = {
	        {"ROMEO", ":\nThe sense to"},
	        {"JUL", "IET:\nWhat shall I shall be"},
	        {"The K", "ing Richard shall be the strange"},
	        {"That tho", "u shalt be the"},
	        {"KING", " HENRY VI:\nWhat"},
	        {"beauty of", " the son,"},
	        {"birth of K", "ing Richard's son"},
	        {"Hi", "ng of York,\nThat thou hast so the"},
	        {"JACK", "INGHAM:\nWhat shall I"},
	        {"interestingly, it was J", "uliet"},
	    };

	    SampleCharRNNParams params;

	    // Directories given on the command line win; otherwise fall back to the two defaults
	    if (!args.dataDirs.empty())
	    {
	        params.dataDirs = args.dataDirs;
	    }
	    else
	    {
	        params.dataDirs.push_back("data/char-rnn/");
	        params.dataDirs.push_back("data/samples/char-rnn/");
	    }

	    // Hard-coded sizes; dataSize mirrors hiddenSize
	    params.batchSize = 1;
	    params.layerCount = 2;
	    params.hiddenSize = 512;
	    params.seqSize = 1;
	    params.dataSize = params.hiddenSize;
	    params.vocabSize = 65;
	    params.outputSize = 1;

	    // The weight file is searched for in the data directories chosen above
	    params.weightFileName = samplesCommon::locateFile("char-rnn.wts", params.dataDirs);

	    // Engine and timing cache paths are forwarded unchanged
	    params.saveEngine = args.saveEngine;
	    params.loadEngine = args.loadEngine;
	    params.timingCacheFile = args.timingCacheFile;

	    // Split the table into the two parallel lists stored in the params
	    std::vector<std::string> inputs;
	    std::vector<std::string> expectedOutputs;
	    for (auto const& sentenceCase : sentenceCases)
	    {
	        inputs.push_back(sentenceCase.input);
	        expectedOutputs.push_back(sentenceCase.expected);
	    }
	    params.inputSentences = inputs;
	    params.outputSentences = expectedOutputs;

	    return params;
	}

	//!
	//! \brief Prints the help information for running this sample
	//!
	//! Writes the usage line followed by one entry per supported option to std::cout.
	//!
	void printHelpInfo()
	{
	    std::cout << "Usage: ./sample_char_rnn [-h or --help] [-d or --datadir=<path to data directory>]\n";
	    std::cout << "--help Display help information\n";
	    std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used "
	                 "multiple times to add multiple directories. If no data directories are given, the default is to use "
	                 "data/samples/char-rnn/ and data/char-rnn/"
	              << std::endl;
	    // Finish the sentence so the entry says what providing the option does.
	    // NOTE(review): wording assumes build() skips engine construction when loadEngine is set - confirm.
	    std::cout << "--loadEngine Specify path from which to load the engine. When this option is provided, engine "
	                 "building is skipped."
	              << std::endl;
	    std::cout << "--saveEngine Specify path at which to save the engine." << std::endl;
	    std::cout << "--timingCacheFile Specify path to a timing cache file. If it does not already exist, it will be "
	              << "created." << std::endl;
	}

	//!
	//! \brief Runs the char-rnn model in TensorRT with a set of expected input and output strings.
	//!
	int main(int argc, char** argv)
	{
	sample::setReportableSeverity(sample::Logger::Severity::kVERBOSE);
	samplesCommon::Args args;
	bool argsOK = samplesCommon::parseArgs(args, argc, argv);
	if (!argsOK)
	{
	sample::gLogError << "Invalid arguments" << std::endl;
	printHelpInfo();
	return EXIT_FAILURE;
	}
	if (args.help)
	{
	printHelpInfo();
	return EXIT_SUCCESS;
	}

	auto sampleTest = sample::gLogger.defineTest(gSampleName, argc, argv);

	sample::gLogger.reportTestStart(sampleTest);

	SampleCharRNNParams params = initializeSampleParams(args);
	std::unique_ptr<SampleCharRNNBase> sample;

	sample.reset(new SampleCharRNNLoop(params));

	sample::gLogInfo << "Building and running a GPU inference engine for Char RNN model..." << std::endl;

	if (!sample->build())
	{
	return sample::gLogger.reportFail(sampleTest);
	}
	if (!sample->infer())
	{
	return sample::gLogger.reportFail(sampleTest);
	}
	if (!sample->teardown())
	{
	return sample::gLogger.reportFail(sampleTest);
	}

	return sample::gLogger.reportPass(sampleTest);
	}