"""
Performance tests for inference speed and token throughput
Run with: pytest tests/performance/test_inference_speed.py -v -s
"""
import pytest
import pytest_asyncio
import httpx
import time
import asyncio
from typing import List, Dict
# Test configuration
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
# BASE_URL = "http://localhost:7860" # For local testing
@pytest_asyncio.fixture
async def client():
    """Async HTTP client that is closed cleanly after each test."""
    async with httpx.AsyncClient(timeout=120.0) as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_single_request_latency(client):
"""Test latency for a single chat completion request"""
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"max_tokens": 50,
"temperature": 0.7
}
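    # Wall-clock latency measured around the whole request: network round trip,
    # queueing, prompt processing and token generation are all included.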
start_time = time.time()
response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
end_time = time.time()
assert response.status_code == 200
data = response.json()
latency = end_time - start_time
prompt_tokens = data["usage"]["prompt_tokens"]
completion_tokens = data["usage"]["completion_tokens"]
total_tokens = data["usage"]["total_tokens"]
print(f"\n=== Single Request Performance ===")
print(f"Latency: {latency:.2f}s")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Completion tokens: {completion_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Tokens per second: {completion_tokens / latency:.2f}")
print(f"Response: {data['choices'][0]['message']['content'][:100]}...")
assert latency < 10.0, f"Latency too high: {latency:.2f}s"
assert completion_tokens > 0, "No tokens generated"
@pytest.mark.asyncio
async def test_token_throughput_various_lengths(client):
"""Test token generation speed with various output lengths"""
test_cases = [
{"max_tokens": 50, "prompt": "Explain photosynthesis in one sentence."},
{"max_tokens": 100, "prompt": "Explain photosynthesis in a short paragraph."},
{"max_tokens": 200, "prompt": "Explain photosynthesis in detail."},
{"max_tokens": 500, "prompt": "Write a detailed essay about photosynthesis."},
]
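    # Same topic at increasing max_tokens, so throughput can be compared as the
    # requested output length grows.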
print(f"\n=== Token Throughput Test ===")
for test_case in test_cases:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [{"role": "user", "content": test_case["prompt"]}],
"max_tokens": test_case["max_tokens"],
"temperature": 0.7
}
start_time = time.time()
response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
end_time = time.time()
assert response.status_code == 200
data = response.json()
latency = end_time - start_time
completion_tokens = data["usage"]["completion_tokens"]
tokens_per_sec = completion_tokens / latency if latency > 0 else 0
print(f"\nMax tokens: {test_case['max_tokens']}")
print(f" Generated: {completion_tokens} tokens")
print(f" Time: {latency:.2f}s")
print(f" Throughput: {tokens_per_sec:.2f} tokens/sec")
assert completion_tokens > 0
@pytest.mark.asyncio
async def test_concurrent_requests(client):
"""Test performance with concurrent requests"""
num_requests = 5
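    # Each coroutine sends one request; asyncio.gather below runs them all
    # concurrently over the same client.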
async def make_request(request_id: int):
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": f"Request {request_id}: What is 2+2?"}
],
"max_tokens": 50,
"temperature": 0.7
}
start_time = time.time()
response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
end_time = time.time()
return {
"request_id": request_id,
"status": response.status_code,
"latency": end_time - start_time,
"response": response.json() if response.status_code == 200 else None
}
print(f"\n=== Concurrent Requests Test ({num_requests} requests) ===")
start_time = time.time()
results = await asyncio.gather(*[make_request(i) for i in range(num_requests)])
end_time = time.time()
total_time = end_time - start_time
successful = sum(1 for r in results if r["status"] == 200)
avg_latency = sum(r["latency"] for r in results) / len(results)
print(f"Total time: {total_time:.2f}s")
print(f"Successful requests: {successful}/{num_requests}")
print(f"Average latency: {avg_latency:.2f}s")
print(f"Requests per second: {num_requests / total_time:.2f}")
for result in results:
print(f" Request {result['request_id']}: {result['latency']:.2f}s - {result['status']}")
assert successful == num_requests
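# Optional: a minimal sketch (not used by the tests above) of how the
# per-request latencies from the concurrency test could be summarised with
# percentiles. The helper name `summarize_latencies` and the percentile
# choices are illustrative assumptions, not part of the existing suite.
def summarize_latencies(latencies: List[float]) -> Dict[str, float]:
    """Return mean, median and p95 for a non-empty list of latencies in seconds."""
    import statistics
    ordered = sorted(latencies)
    return {
        "mean": statistics.mean(ordered),
        "p50": statistics.median(ordered),
        # quantiles() needs at least two samples; fall back to the max otherwise
        "p95": statistics.quantiles(ordered, n=20)[18] if len(ordered) >= 2 else ordered[-1],
    }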
@pytest.mark.asyncio
async def test_time_to_first_token(client):
"""Test time to first token (TTFT) using streaming"""
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "Count from 1 to 10."}
],
"max_tokens": 100,
"temperature": 0.7,
"stream": True
}
start_time = time.time()
first_token_time = None
token_count = 0
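    # Responses stream back as Server-Sent Events: one "data: {...}" line per
    # chunk, terminated by "data: [DONE]". Note that token_count really counts
    # chunks; a single chunk can carry more than one token.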
async with client.stream("POST", f"{BASE_URL}/v1/chat/completions", json=payload) as response:
async for line in response.aiter_lines():
if line.startswith("data: ") and line.strip() != "data: [DONE]":
if first_token_time is None:
first_token_time = time.time()
token_count += 1
end_time = time.time()
    assert first_token_time is not None, "No streamed chunks were received"
    ttft = first_token_time - start_time
    total_time = end_time - start_time
    print(f"\n=== Time to First Token ===")
    print(f"TTFT: {ttft:.3f}s")
    print(f"Total time: {total_time:.2f}s")
    print(f"Chunks received: {token_count}")
    assert ttft < 5.0, f"TTFT too high: {ttft:.3f}s"
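# Optional: a minimal sketch (not used by the tests above) of how a streamed
# chunk could be decoded to recover the generated text. The field layout
# assumes the OpenAI-style chat.completion.chunk schema; treat the helper name
# and the schema as assumptions rather than a guaranteed contract.
def extract_delta_text(sse_line: str) -> str:
    """Return the text carried by one 'data: {...}' SSE line (empty if none)."""
    import json
    payload = sse_line[len("data: "):].strip()
    if not payload or payload == "[DONE]":
        return ""
    chunk = json.loads(payload)
    choices = chunk.get("choices") or [{}]
    delta = choices[0].get("delta", {})
    return delta.get("content") or ""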
@pytest.mark.asyncio
async def test_prompt_processing_speed(client):
"""Test speed with different prompt lengths"""
prompts = [
"Hi", # Very short
"What is artificial intelligence?" * 5, # Short
"Explain quantum computing. " * 20, # Medium
"Write a detailed explanation of machine learning. " * 50, # Long
]
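    # Prompt length grows from a couple of words to a few thousand characters,
    # so prompt processing accounts for more and more of the measured latency.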
print(f"\n=== Prompt Processing Speed ===")
for i, prompt in enumerate(prompts):
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 50,
"temperature": 0.7
}
start_time = time.time()
response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
end_time = time.time()
if response.status_code == 200:
data = response.json()
latency = end_time - start_time
prompt_tokens = data["usage"]["prompt_tokens"]
print(f"\nPrompt {i+1} (length ~{len(prompt)} chars):")
print(f" Prompt tokens: {prompt_tokens}")
print(f" Latency: {latency:.2f}s")
print(f" Tokens/sec: {prompt_tokens / latency:.2f}")
@pytest.mark.asyncio
async def test_temperature_variance(client):
"""Test response variance with different temperatures"""
temperatures = [0.0, 0.5, 1.0, 1.5]
prompt = "The future of artificial intelligence is"
print(f"\n=== Temperature Variance Test ===")
for temp in temperatures:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 50,
"temperature": temp
}
response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
assert response.status_code == 200
data = response.json()
content = data['choices'][0]['message']['content']
print(f"\nTemperature: {temp}")
print(f"Response: {content[:100]}...")
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])