# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Generator
import pytest
import requests
from nemo.collections.llm.evaluation.adapters.server import create_server_process
from nemo.collections.llm.evaluation.api import AdapterConfig
@pytest.fixture
def adapter_server(fake_openai_endpoint) -> Generator[AdapterConfig, Any, Any]:
    """Start an adapter server process for one test and terminate it afterwards.

    Requests the ``fake_openai_endpoint`` fixture so that it is set up before
    the adapter server is created.

    Yields:
        The ``AdapterConfig`` returned by ``create_server_process``; the tests
        read its ``local_port`` attribute to build the request URL.
    """
    # NOTE(review): end_reasoning_token is an empty string here. This may be the
    # result of markup (e.g. a closing reasoning tag) being stripped from the
    # file -- confirm against the intended token.
    requested_config = AdapterConfig(
        api_url="http://localhost:3300/v1/chat/completions",
        use_reasoning=True,
        end_reasoning_token="",
    )

    # The factory hands back the process handle together with a config instance;
    # the returned config (not the one built above) is what the tests receive.
    server_process, live_config = create_server_process(requested_config)

    yield live_config

    # Teardown: stop the server process once the test has finished.
    server_process.terminate()
# NOTE(review): none of the inputs below contain a visible reasoning delimiter
# (e.g. "...step by stepHere's the solution."), and the fixture configures an
# empty end_reasoning_token. Delimiter markup may have been stripped from this
# file -- confirm the intended inputs before relying on these cases.
@pytest.mark.parametrize(
    "input_content,expected_content",
    [
        (
            "Let me think about this...\nThis is my reasoning process that should be removed\nHere's my final answer.",
            "Here's my final answer.",
        ),
        (
            "No reasoning tokens in this response.",
            "No reasoning tokens in this response.",
        ),
        (
            "First I'll analyze the problem\nThen I'll solve it step by stepHere's the solution.",
            "Here's the solution.",
        ),
    ],
)
def test_reasoning_responses(
    adapter_server,
    fake_openai_endpoint,
    input_content,
    expected_content,
):
    """Check that the adapter returns ``expected_content`` for ``input_content``.

    Args:
        adapter_server: Fixture yielding the adapter config; ``local_port`` is
            used to build the request URL.
        fake_openai_endpoint: Fixture for the fake upstream endpoint.
        input_content: Assistant message content the fake endpoint should return.
        expected_content: Content expected in the adapter's response.
    """
    url = f"http://localhost:{adapter_server.local_port}"

    # We parametrize the response of the openai fake server.
    response_data = {
        "choices": [
            {
                "message": {
                    "role": "assistant",
                    "content": input_content,
                }
            }
        ]
    }
    data = {
        "prompt": "This is a test prompt",
        "max_tokens": 100,
        "temperature": 0.5,
        "fake_response": response_data,
    }

    # A timeout keeps a stalled adapter server from hanging the whole test run.
    response = requests.post(url, json=data, timeout=30)

    assert response.status_code == 200
    cleaned_data = response.json()
    cleaned_content = cleaned_data["choices"][0]["message"]["content"]
    assert cleaned_content == expected_content
def test_multiple_choices(
    adapter_server,
    fake_openai_endpoint,
):
    """Check that every choice in a multi-choice response is cleaned.

    Args:
        adapter_server: Fixture yielding the adapter config; ``local_port`` is
            used to build the request URL.
        fake_openai_endpoint: Fixture for the fake upstream endpoint.
    """
    # Given: A response with multiple choices containing reasoning tokens
    url = f"http://localhost:{adapter_server.local_port}"
    response_data = {
        "choices": [
            {
                "message": {
                    "role": "assistant",
                    "content": "Reasoning 1Answer 1",
                }
            },
            {
                "message": {
                    "role": "assistant",
                    "content": "Reasoning 2Answer 2",
                }
            },
        ]
    }
    data = {
        "prompt": "This is a test prompt",
        "max_tokens": 100,
        "temperature": 0.5,
        "fake_response": response_data,
    }

    # When: The request is sent through the adapter. A timeout keeps a stalled
    # adapter server from hanging the whole test run.
    response = requests.post(url, json=data, timeout=30)

    # Then: The reasoning tokens should be removed from all choices
    assert response.status_code == 200
    cleaned_data = response.json()
    assert cleaned_data["choices"][0]["message"]["content"] == "Answer 1"
    assert cleaned_data["choices"][1]["message"]["content"] == "Answer 2"
def test_non_assistant_role(
    adapter_server,
    fake_openai_endpoint,
):
    """Check that a message with a non-assistant role is returned unchanged.

    Args:
        adapter_server: Fixture yielding the adapter config; ``local_port`` is
            used to build the request URL.
        fake_openai_endpoint: Fixture for the fake upstream endpoint.
    """
    # Given: A response with a non-assistant role message
    url = f"http://localhost:{adapter_server.local_port}"
    response_data = {
        "choices": [
            {
                "message": {
                    "role": "system",
                    "content": "This should not be processedSystem message",
                }
            }
        ]
    }
    data = {
        "prompt": "This is a test prompt",
        "max_tokens": 100,
        "temperature": 0.5,
        "fake_response": response_data,
    }

    # When: The request is sent through the adapter. A timeout keeps a stalled
    # adapter server from hanging the whole test run.
    response = requests.post(url, json=data, timeout=30)

    # Then: The request succeeds and the content remains unchanged. The status
    # check mirrors the other tests so an error response fails here with a
    # clear message instead of failing while decoding or indexing the body.
    assert response.status_code == 200
    cleaned_data = response.json()
    assert cleaned_data["choices"][0]["message"]["content"] == "This should not be processedSystem message"