# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re

import pytest
import torch


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_get_nemo_to_trtllm_conversion_dict_on_nemo_model():
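    """Keys produced for a NeMo-style state dict ('model.'-prefixed) must keep exactly one 'model.' prefix."""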
try:
from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError:
pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
dummy_state = object()
model_state_dict = {
'model.embedding.word_embeddings.weight': dummy_state,
'model.decoder.layers.0.self_attention.linear_proj.weight': dummy_state,
}
nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict)
# Check that every key starts with 'model.' and not 'model..' by using a regex
# This pattern ensures:
# - The key starts with 'model.'
# - Immediately after 'model.', there must be at least one character that is NOT a '.'
# (preventing the 'model..' scenario)
pattern = re.compile(r'^model\.[^.].*')
for key in nemo_model_conversion_dict.keys():
assert pattern.match(key), f"Key '{key}' does not properly start with 'model.'"


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_get_nemo_to_trtllm_conversion_dict_on_mcore_model():
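    """An mcore-style state dict (no 'model.' prefix) should yield the default conversion dict unchanged."""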
try:
from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT
from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError:
pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
dummy_state = object()
model_state_dict = {
'embedding.word_embeddings.weight': dummy_state,
'decoder.layers.0.self_attention.linear_proj.weight': dummy_state,
}
nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict)
    # With mcore-style keys there is nothing to remap, so this is effectively a
    # no-op: the default conversion dict should come back unchanged.
assert nemo_model_conversion_dict == DEFAULT_CONVERSION_DICT


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_initialization():
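    """TensorRTLLM(load_model=False) should record its constructor arguments without loading a model."""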
try:
from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError:
pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
# Test basic initialization
model_dir = "/tmp/test_model_dir"
trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
assert trt_llm.model_dir == model_dir
assert trt_llm.engine_dir == os.path.join(model_dir, "trtllm_engine")
assert trt_llm.model is None
assert trt_llm.tokenizer is None
assert trt_llm.config is None
# Test initialization with lora checkpoints
lora_ckpt_list = ["/path/to/lora1", "/path/to/lora2"]
trt_llm = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
assert trt_llm.lora_ckpt_list == lora_ckpt_list
# Test initialization with python runtime options
trt_llm = TensorRTLLM(
model_dir=model_dir,
use_python_runtime=False,
enable_chunked_context=False,
max_tokens_in_paged_kv_cache=None,
load_model=False,
)
assert trt_llm.use_python_runtime is False
assert trt_llm.enable_chunked_context is False
assert trt_llm.max_tokens_in_paged_kv_cache is None


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_supported_models():
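    """The supported-models list and the HF model mapping should be non-empty and of the expected types."""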
try:
from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError:
pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
model_dir = "/tmp/test_model_dir"
trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
# Test supported models list
supported_models = trt_llm.get_supported_models_list
assert isinstance(supported_models, list)
assert len(supported_models) > 0
assert all(isinstance(model, str) for model in supported_models)
# Test HF model mapping
hf_mapping = trt_llm.get_supported_hf_model_mapping
assert isinstance(hf_mapping, dict)
assert len(hf_mapping) > 0


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_input_dtype():
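    """get_input_dtype should map torch storage dtypes to the matching megatron-core DataType members."""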
    try:
        # Guard both imports so a missing megatron-core also skips rather than errors.
        from megatron.core.export.data_type import DataType

        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
# Test different storage dtypes
test_cases = [
(torch.float32, DataType.float32),
(torch.float16, DataType.float16),
(torch.bfloat16, DataType.bfloat16),
]
for storage_dtype, expected_dtype in test_cases:
input_dtype = trt_llm.get_input_dtype(storage_dtype)
assert input_dtype == expected_dtype, f"Expected {expected_dtype} for {storage_dtype}, got {input_dtype}"


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_hidden_size():
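    """get_hidden_size should be a positive int when known, or None when no model is loaded."""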
try:
from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError:
pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
model_dir = "/tmp/test_model_dir"
trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
    # Without a loaded engine the hidden size may be unknown, so None is acceptable.
    hidden_size = trt_llm.get_hidden_size
    if hidden_size is not None:
        assert isinstance(hidden_size, int)
        assert hidden_size > 0


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_triton_io():
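    """The Triton input/output signatures should expose the expected tensor names in the expected order."""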
try:
from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError:
pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
model_dir = "/tmp/test_model_dir"
trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
# Test Triton input configuration
triton_input = trt_llm.get_triton_input
assert isinstance(triton_input, tuple)
assert triton_input[0].name == "prompts"
assert triton_input[1].name == "max_output_len"
assert triton_input[2].name == "top_k"
assert triton_input[3].name == "top_p"
assert triton_input[4].name == "temperature"
assert triton_input[5].name == "random_seed"
assert triton_input[6].name == "stop_words_list"
assert triton_input[7].name == "bad_words_list"
assert triton_input[8].name == "no_repeat_ngram_size"
# Test Triton output configuration
triton_output = trt_llm.get_triton_output
assert isinstance(triton_output, tuple)
assert triton_output[0].name == "outputs"
assert triton_output[1].name == "generation_logits"
assert triton_output[2].name == "context_logits"


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_pad_logits():
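    """_pad_logits should preserve the batch and sequence dims and only (possibly) pad the vocab dim."""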
try:
from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError:
pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
model_dir = "/tmp/test_model_dir"
trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
# Create a sample logits tensor
batch_size = 2
seq_len = 3
vocab_size = 1000
logits = torch.randn(batch_size, seq_len, vocab_size)
# Test padding logits
padded_logits = trt_llm._pad_logits(logits)
assert isinstance(padded_logits, torch.Tensor)
assert padded_logits.shape[0] == batch_size
assert padded_logits.shape[1] == seq_len
    # The vocab dim is padded up to a multiple of 8; 1000 is already a
    # multiple of 8, so the shape may come back unchanged here.
    assert padded_logits.shape[2] >= vocab_size