|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import re |
|
|
|
|
|
import pytest |
|
|
import torch |
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_get_nemo_to_trtllm_conversion_dict_on_nemo_model():
    """Check that conversion-dict keys start with ``model.`` when the state-dict keys do.

    The state dict passed in uses ``model.``-prefixed layer names; every key of the
    returned conversion dict is expected to match ``^model\\.[^.].*``.
    """
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    # Only the key names are passed meaningfully here; the values are opaque placeholders.
    dummy_state = object()
    model_state_dict = {
        'model.embedding.word_embeddings.weight': dummy_state,
        'model.decoder.layers.0.self_attention.linear_proj.weight': dummy_state,
    }
    nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict)

    # Without this guard the loop below would pass vacuously on an empty mapping.
    assert nemo_model_conversion_dict, "Conversion dict is empty; no keys to validate"

    # 'model.' must be followed by a non-dot character, i.e. no doubled separator.
    pattern = re.compile(r'^model\.[^.].*')
    for key in nemo_model_conversion_dict:
        assert pattern.match(key), f"Key '{key}' does not properly start with 'model.'"
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_get_nemo_to_trtllm_conversion_dict_on_mcore_model():
    """Check that un-prefixed state-dict keys yield a dict equal to ``DEFAULT_CONVERSION_DICT``."""
    try:
        from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT

        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    # Only the key names are passed meaningfully here; the values are opaque placeholders.
    dummy_state = object()
    model_state_dict = {
        'embedding.word_embeddings.weight': dummy_state,
        'decoder.layers.0.self_attention.linear_proj.weight': dummy_state,
    }
    nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict)

    assert nemo_model_conversion_dict == DEFAULT_CONVERSION_DICT
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_initialization():
    """Check the attributes set by the ``TensorRTLLM`` constructor when ``load_model=False``.

    Covers three constructions: the minimal one, one with a LoRA checkpoint list,
    and one with explicit runtime options.
    """
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    model_dir = "/tmp/test_model_dir"

    # Minimal construction: paths are derived from model_dir and nothing is loaded.
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
    assert trt_llm.model_dir == model_dir
    assert trt_llm.engine_dir == os.path.join(model_dir, "trtllm_engine")
    assert trt_llm.model is None
    assert trt_llm.tokenizer is None
    assert trt_llm.config is None

    # The LoRA checkpoint list is expected to be stored as given.
    lora_ckpt_list = ["/path/to/lora1", "/path/to/lora2"]
    trt_llm = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
    assert trt_llm.lora_ckpt_list == lora_ckpt_list

    # Explicit runtime options are expected to be exposed unchanged as attributes.
    trt_llm = TensorRTLLM(
        model_dir=model_dir,
        use_python_runtime=False,
        enable_chunked_context=False,
        max_tokens_in_paged_kv_cache=None,
        load_model=False,
    )
    assert trt_llm.use_python_runtime is False
    assert trt_llm.enable_chunked_context is False
    assert trt_llm.max_tokens_in_paged_kv_cache is None
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_supported_models():
    """Check the supported-model list and the HF model mapping are non-empty and well-typed."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)

    # Accessed as an attribute (no call) and expected to be a non-empty list of strings.
    supported_models = trt_llm.get_supported_models_list
    assert isinstance(supported_models, list)
    assert len(supported_models) > 0
    assert all(isinstance(model, str) for model in supported_models)

    # Accessed as an attribute (no call) and expected to be a non-empty dict.
    hf_mapping = trt_llm.get_supported_hf_model_mapping
    assert isinstance(hf_mapping, dict)
    assert len(hf_mapping) > 0
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_input_dtype():
    """Check that ``get_input_dtype`` maps torch dtypes to the matching ``DataType`` members."""
    try:
        # Both imports are guarded so a missing optional dependency skips the test
        # instead of erroring it, matching the guarded megatron import in the mcore test.
        from megatron.core.export.data_type import DataType

        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)

    # (torch storage dtype, expected export DataType) pairs.
    test_cases = [
        (torch.float32, DataType.float32),
        (torch.float16, DataType.float16),
        (torch.bfloat16, DataType.bfloat16),
    ]

    for storage_dtype, expected_dtype in test_cases:
        input_dtype = trt_llm.get_input_dtype(storage_dtype)
        assert input_dtype == expected_dtype, f"Expected {expected_dtype} for {storage_dtype}, got {input_dtype}"
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_hidden_size():
    """Check that ``get_hidden_size`` is either ``None`` or a positive ``int``."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)

    # Accessed as an attribute (no call). None is an accepted value; only a
    # non-None result is validated, so no separate None branch is needed.
    hidden_size = trt_llm.get_hidden_size
    if hidden_size is not None:
        assert isinstance(hidden_size, int)
        assert hidden_size > 0
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_triton_io():
    """Check the names and order of the leading Triton input and output tensor specs.

    Only the leading entries are pinned; any additional trailing entries are not checked.
    """
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)

    expected_input_names = [
        "prompts",
        "max_output_len",
        "top_k",
        "top_p",
        "temperature",
        "random_seed",
        "stop_words_list",
        "bad_words_list",
        "no_repeat_ngram_size",
    ]
    expected_output_names = [
        "outputs",
        "generation_logits",
        "context_logits",
    ]

    # Accessed as an attribute (no call). Comparing the ordered prefix in one
    # assertion shows every mismatching name at once; a too-short tuple fails
    # the equality rather than passing silently.
    triton_input = trt_llm.get_triton_input
    assert isinstance(triton_input, tuple)
    input_names = [tensor.name for tensor in triton_input[: len(expected_input_names)]]
    assert input_names == expected_input_names

    triton_output = trt_llm.get_triton_output
    assert isinstance(triton_output, tuple)
    output_names = [tensor.name for tensor in triton_output[: len(expected_output_names)]]
    assert output_names == expected_output_names
|
|
|
|
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_pad_logits():
    """Check that ``_pad_logits`` keeps the first two dimensions and never shrinks the last one."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        # pytest.skip() raises, so no explicit return is needed after it.
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")

    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)

    # Random 3-D input of shape (batch_size, seq_len, vocab_size).
    batch_size = 2
    seq_len = 3
    vocab_size = 1000
    logits = torch.randn(batch_size, seq_len, vocab_size)

    padded_logits = trt_llm._pad_logits(logits)
    assert isinstance(padded_logits, torch.Tensor)
    assert padded_logits.shape[0] == batch_size
    assert padded_logits.shape[1] == seq_len
    # The last dimension may grow but must not fall below the input size.
    assert padded_logits.shape[2] >= vocab_size
|
|
|