File size: 4,743 Bytes

b386992

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os

import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from nemo.deploy.nlp.hf_deployable import HuggingFaceLLMDeploy
from nemo.deploy.utils import broadcast_list


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_hf_generate():
    """Exercise ``HuggingFaceLLMDeploy.generate`` on a batch of two prompts.

    The deployable is built from the model path hard-coded below and queried twice:
    once with the default return format, and once with logits, scores and
    dictionary-style results requested.
    """

    model_kwargs = {
        "hf_model_id_path": "/home/TestData/llm/models/llama3.2-1B-hf/",
        "task": "text-generation",
        "trust_remote_code": True,
        "device_map": None,
        "tp_plan": None,
    }
    hf_deployable = HuggingFaceLLMDeploy(**model_kwargs)

    # Kept as a tuple so each call below receives its own fresh list.
    prompts = ("What is the color of a banana? ", "Tell me a joke.")
    sampling_kwargs = {"max_length": 32, "do_sample": True}

    # First pass: plain generation, one result expected per prompt.
    output = hf_deployable.generate(text_inputs=list(prompts), **sampling_kwargs)

    assert len(output) == 2, "Output should have to be a list."
    first_result, second_result = output[0], output[1]
    assert len(first_result) > 0, "First list in the output should have more than 0 elements."
    assert len(second_result) > 0, "Second list in the output should have more than 0 elements."

    # Second pass: request logits and scores, returned as a dict-like result.
    output = hf_deployable.generate(
        text_inputs=list(prompts),
        output_logits=True,
        output_scores=True,
        return_dict_in_generate=True,
        **sampling_kwargs,
    )
    for expected_key in ("logits", "scores", "sentences"):
        assert expected_key in output, f"Output should have {expected_key}."
    assert len(output["sentences"]) == 2, "Output should have 2 sentences."


@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
@pytest.mark.skip(reason="will be enabled later.")
def test_hf_multigpu_generate():
    """Run the generate test body in two spawned worker processes.

    Currently skipped; when enabled, each worker executes ``_run_generate`` with
    its process index as the only argument.
    """

    num_workers = 2
    mp.spawn(_run_generate, nprocs=num_workers)


def _run_generate(rank):
    """Entry point executed in each process spawned by the multi-GPU test.

    Sets the environment variables describing a two-process job on localhost,
    joins the NCCL process group, runs the per-rank test body and finally tears
    the process group down again.

    Args:
        rank: Index of this process (0 or 1), passed in by ``mp.spawn``.
    """

    os.environ['WORLD_SIZE'] = '2'
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    os.environ['RANK'] = str(rank)

    # Every rank follows the same setup / run / teardown sequence, so no
    # per-rank branching is needed here; the rank-specific behaviour lives in
    # _hf_generate_ranks().
    dist.init_process_group("nccl", rank=rank, world_size=2)
    _hf_generate_ranks()
    dist.destroy_process_group()


def _hf_generate_ranks():
    """Per-rank body of the multi-GPU generate test.

    Every rank pins itself to the CUDA device matching its rank and builds the
    deployable. Rank 0 then broadcasts a control tensor, the prompts and the
    sampling settings, runs ``generate`` and broadcasts a second control tensor,
    while all other ranks call ``generate_other_ranks``. After a barrier, rank 0
    validates the generated output.
    """

    rank = dist.get_rank()
    torch.cuda.set_device(rank)

    hf_deployable = HuggingFaceLLMDeploy(
        hf_model_id_path="/home/TestData/llm/models/llama3.2-1B-hf/",
        task="text-generation",
        trust_remote_code=True,
        device_map=None,
        tp_plan=None,
    )

    if rank != 0:
        hf_deployable.generate_other_ranks()
    else:
        prompts = ["What is the color of a banana? ", "Tell me a joke."]
        sampling_temperature = 1.0
        sampling_top_k = 1
        sampling_top_p = 0.0
        max_tokens = 32
        want_logits = False
        want_scores = False

        # NOTE(review): the 0 / 1 control values presumably tell the other ranks
        # to start a generation round / stop waiting — confirm against
        # generate_other_ranks().
        start_signal = torch.tensor([0], dtype=torch.long, device="cuda")
        dist.broadcast(start_signal, src=0)
        broadcast_list(prompts, src=0)
        sampling_settings = [
            sampling_temperature,
            sampling_top_k,
            sampling_top_p,
            max_tokens,
            want_logits,
            want_scores,
        ]
        broadcast_list(data=sampling_settings, src=0)

        output = hf_deployable.generate(
            text_inputs=prompts,
            max_length=max_tokens,
            do_sample=True,
            temperature=sampling_temperature,
            top_k=sampling_top_k,
            top_p=sampling_top_p,
            output_logits=want_logits,
            output_scores=want_scores,
        )
        end_signal = torch.tensor([1], dtype=torch.long, device="cuda")
        dist.broadcast(end_signal, src=0)

    # All ranks meet here before rank 0 inspects its result.
    dist.barrier()

    if rank != 0:
        return

    assert len(output) == 2, "Output should have to be a lists."
    assert len(output[0]) > 0, "First list in the output should have more than 0 elements."
    assert len(output[1]) > 0, "Second list in the output should have more than 0 elements."