"""
Performance tests for inference speed and token throughput
Run with: pytest tests/performance/test_inference_speed.py -v -s
"""
import asyncio
import time

import httpx
import pytest
import pytest_asyncio

# Test configuration
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
# BASE_URL = "http://localhost:7860" # For local testing
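# NOTE (assumption): the default target is a Hugging Face Space, so the first
# request after a cold start can take far longer than the latency thresholds
# asserted below; consider one warm-up request before trusting the numbers.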

@pytest_asyncio.fixture
async def client():
    # An async fixture lets the AsyncClient close its connection pool after
    # each test; the generous timeout accommodates slow generations.
    async with httpx.AsyncClient(timeout=120.0) as http_client:
        yield http_client

@pytest.mark.asyncio
async def test_single_request_latency(client):
    """Test latency for a single chat completion request"""
    payload = {
        "model": "DragonLLM/Qwen-Open-Finance-R-8B",
        "messages": [
            {"role": "user", "content": "What is the capital of France?"}
        ],
        "max_tokens": 50,
        "temperature": 0.7
    }

    start_time = time.time()
    response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
    end_time = time.time()

    assert response.status_code == 200
    data = response.json()

    latency = end_time - start_time
    prompt_tokens = data["usage"]["prompt_tokens"]
    completion_tokens = data["usage"]["completion_tokens"]
    total_tokens = data["usage"]["total_tokens"]

    print("\n=== Single Request Performance ===")
    print(f"Latency: {latency:.2f}s")
    print(f"Prompt tokens: {prompt_tokens}")
    print(f"Completion tokens: {completion_tokens}")
    print(f"Total tokens: {total_tokens}")
    # completion_tokens / latency folds prompt processing into the decode
    # time, so it slightly understates pure generation speed.
    print(f"Tokens per second: {completion_tokens / latency:.2f}")
    print(f"Response: {data['choices'][0]['message']['content'][:100]}...")

    assert latency < 10.0, f"Latency too high: {latency:.2f}s"
    assert completion_tokens > 0, "No tokens generated"

@pytest.mark.asyncio
async def test_token_throughput_various_lengths(client):
    """Test token generation speed with various output lengths"""
    test_cases = [
        {"max_tokens": 50, "prompt": "Explain photosynthesis in one sentence."},
        {"max_tokens": 100, "prompt": "Explain photosynthesis in a short paragraph."},
        {"max_tokens": 200, "prompt": "Explain photosynthesis in detail."},
        {"max_tokens": 500, "prompt": "Write a detailed essay about photosynthesis."},
    ]

    print("\n=== Token Throughput Test ===")
    for test_case in test_cases:
        payload = {
            "model": "DragonLLM/Qwen-Open-Finance-R-8B",
            "messages": [{"role": "user", "content": test_case["prompt"]}],
            "max_tokens": test_case["max_tokens"],
            "temperature": 0.7
        }

        start_time = time.time()
        response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
        end_time = time.time()

        assert response.status_code == 200
        data = response.json()

        latency = end_time - start_time
        completion_tokens = data["usage"]["completion_tokens"]
        tokens_per_sec = completion_tokens / latency if latency > 0 else 0

        print(f"\nMax tokens: {test_case['max_tokens']}")
        print(f"  Generated: {completion_tokens} tokens")
        print(f"  Time: {latency:.2f}s")
        print(f"  Throughput: {tokens_per_sec:.2f} tokens/sec")

        assert completion_tokens > 0

@pytest.mark.asyncio
async def test_concurrent_requests(client):
    """Test performance with concurrent requests"""
    num_requests = 5

    async def make_request(request_id: int):
        payload = {
            "model": "DragonLLM/Qwen-Open-Finance-R-8B",
            "messages": [
                {"role": "user", "content": f"Request {request_id}: What is 2+2?"}
            ],
            "max_tokens": 50,
            "temperature": 0.7
        }
        start_time = time.time()
        response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
        end_time = time.time()
        return {
            "request_id": request_id,
            "status": response.status_code,
            "latency": end_time - start_time,
            "response": response.json() if response.status_code == 200 else None
        }

    print(f"\n=== Concurrent Requests Test ({num_requests} requests) ===")
    start_time = time.time()
    # asyncio.gather issues all requests at once, so total wall time should
    # stay well under num_requests x single-request latency if the server
    # batches concurrent generations.
    results = await asyncio.gather(*[make_request(i) for i in range(num_requests)])
    end_time = time.time()

    total_time = end_time - start_time
    successful = sum(1 for r in results if r["status"] == 200)
    avg_latency = sum(r["latency"] for r in results) / len(results)

    print(f"Total time: {total_time:.2f}s")
    print(f"Successful requests: {successful}/{num_requests}")
    print(f"Average latency: {avg_latency:.2f}s")
    print(f"Requests per second: {num_requests / total_time:.2f}")
    for result in results:
        print(f"  Request {result['request_id']}: {result['latency']:.2f}s - {result['status']}")

    assert successful == num_requests

@pytest.mark.asyncio
async def test_time_to_first_token(client):
    """Test time to first token (TTFT) using streaming"""
    payload = {
        "model": "DragonLLM/Qwen-Open-Finance-R-8B",
        "messages": [
            {"role": "user", "content": "Count from 1 to 10."}
        ],
        "max_tokens": 100,
        "temperature": 0.7,
        "stream": True
    }

    start_time = time.time()
    first_token_time = None
    chunk_count = 0

    async with client.stream("POST", f"{BASE_URL}/v1/chat/completions", json=payload) as response:
        assert response.status_code == 200
        # Server-sent events arrive as "data: {...}" lines; "data: [DONE]"
        # terminates the stream. Each chunk usually carries one token,
        # though servers may batch several per chunk.
        async for line in response.aiter_lines():
            if line.startswith("data: ") and line.strip() != "data: [DONE]":
                if first_token_time is None:
                    first_token_time = time.time()
                chunk_count += 1
    end_time = time.time()

    # Fail loudly rather than passing vacuously when nothing streams back.
    assert first_token_time is not None, "No streamed chunks received"
    ttft = first_token_time - start_time
    total_time = end_time - start_time

    print("\n=== Time to First Token ===")
    print(f"TTFT: {ttft:.3f}s")
    print(f"Total time: {total_time:.2f}s")
    print(f"Chunks received: {chunk_count}")

    assert ttft < 5.0, f"TTFT too high: {ttft:.3f}s"

@pytest.mark.asyncio
async def test_prompt_processing_speed(client):
    """Test speed with different prompt lengths"""
    prompts = [
        "Hi",  # Very short
        "What is artificial intelligence? " * 5,  # Short
        "Explain quantum computing. " * 20,  # Medium
        "Write a detailed explanation of machine learning. " * 50,  # Long
    ]

    print("\n=== Prompt Processing Speed ===")
    for i, prompt in enumerate(prompts):
        payload = {
            "model": "DragonLLM/Qwen-Open-Finance-R-8B",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 50,
            "temperature": 0.7
        }

        start_time = time.time()
        response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
        end_time = time.time()

        if response.status_code == 200:
            data = response.json()
            latency = end_time - start_time
            prompt_tokens = data["usage"]["prompt_tokens"]

            print(f"\nPrompt {i+1} (length ~{len(prompt)} chars):")
            print(f"  Prompt tokens: {prompt_tokens}")
            print(f"  Latency: {latency:.2f}s")
            # Latency covers both prefill and the 50-token generation, so
            # this is a lower bound on pure prompt-processing speed.
            print(f"  Prompt tokens/sec: {prompt_tokens / latency:.2f}")

@pytest.mark.asyncio
async def test_temperature_variance(client):
    """Test response variance with different temperatures"""
    temperatures = [0.0, 0.5, 1.0, 1.5]
    prompt = "The future of artificial intelligence is"

    print("\n=== Temperature Variance Test ===")
    for temp in temperatures:
        payload = {
            "model": "DragonLLM/Qwen-Open-Finance-R-8B",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 50,
            "temperature": temp
        }

        response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
        assert response.status_code == 200

        data = response.json()
        content = data['choices'][0]['message']['content']
        print(f"\nTemperature: {temp}")
        print(f"Response: {content[:100]}...")

if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s"])