#!/usr/bin/env python3
"""
Comprehensive benchmark suite for LLM Pro Finance API
Run with: python tests/performance/benchmark.py
"""
import asyncio
import httpx
import time
import statistics
from typing import List, Dict
import json
# Configuration
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
# BASE_URL = "http://localhost:7860" # For local testing
class Benchmark:
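    """Async benchmark harness for the LLM Pro Finance OpenAI-compatible API."""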
def __init__(self, base_url: str = BASE_URL):
self.base_url = base_url
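        # Shared async client; the generous 120s timeout keeps long generations from timing out mid-run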
self.client = httpx.AsyncClient(timeout=120.0)
self.results = {}
async def health_check(self) -> bool:
"""Check if service is available"""
try:
response = await self.client.get(f"{self.base_url}/health")
return response.status_code == 200
        except Exception:
            return False
async def benchmark_single_request(self, num_runs: int = 10) -> Dict:
"""Benchmark single request latency"""
print(f"\n{'='*60}")
print("BENCHMARK: Single Request Latency")
print(f"{'='*60}")
latencies = []
tokens_per_sec = []
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "What is artificial intelligence?"}
],
"max_tokens": 100,
"temperature": 0.7
}
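        # Time each request end-to-end and derive tokens/sec from the API's reported completion_tokens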
for i in range(num_runs):
start = time.time()
response = await self.client.post(
f"{self.base_url}/v1/chat/completions",
json=payload
)
end = time.time()
if response.status_code == 200:
data = response.json()
latency = end - start
completion_tokens = data["usage"]["completion_tokens"]
tps = completion_tokens / latency if latency > 0 else 0
latencies.append(latency)
tokens_per_sec.append(tps)
print(f"Run {i+1}/{num_runs}: {latency:.2f}s, {tps:.2f} tokens/sec")
        if not latencies:
            raise RuntimeError("All benchmark requests failed; no latency statistics to report")
        results = {
"avg_latency": statistics.mean(latencies),
"min_latency": min(latencies),
"max_latency": max(latencies),
"std_latency": statistics.stdev(latencies) if len(latencies) > 1 else 0,
"avg_tokens_per_sec": statistics.mean(tokens_per_sec),
"max_tokens_per_sec": max(tokens_per_sec),
}
print(f"\nResults:")
print(f" Average latency: {results['avg_latency']:.2f}s (Β±{results['std_latency']:.2f}s)")
print(f" Min/Max latency: {results['min_latency']:.2f}s / {results['max_latency']:.2f}s")
print(f" Average throughput: {results['avg_tokens_per_sec']:.2f} tokens/sec")
print(f" Max throughput: {results['max_tokens_per_sec']:.2f} tokens/sec")
return results
async def benchmark_concurrent_load(self, num_concurrent: int = 10) -> Dict:
"""Benchmark concurrent request handling"""
print(f"\n{'='*60}")
print(f"BENCHMARK: Concurrent Load ({num_concurrent} requests)")
print(f"{'='*60}")
async def make_request(request_id: int):
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": f"Request {request_id}: Explain machine learning."}
],
"max_tokens": 50,
"temperature": 0.7
}
start = time.time()
response = await self.client.post(
f"{self.base_url}/v1/chat/completions",
json=payload
)
end = time.time()
return {
"request_id": request_id,
"latency": end - start,
"status": response.status_code,
"data": response.json() if response.status_code == 200 else None
}
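        # Fire all requests concurrently and measure wall-clock time for the whole batch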
start_time = time.time()
        results = await asyncio.gather(
            *[make_request(i) for i in range(num_concurrent)],
            return_exceptions=True,  # a single failed request should not abort the whole batch
        )
end_time = time.time()
total_time = end_time - start_time
        successful = [r for r in results if isinstance(r, dict) and r["status"] == 200]
latencies = [r["latency"] for r in successful]
benchmark_results = {
"total_time": total_time,
"num_requests": num_concurrent,
"successful": len(successful),
"failed": num_concurrent - len(successful),
"avg_latency": statistics.mean(latencies) if latencies else 0,
"requests_per_sec": num_concurrent / total_time,
}
print(f"\nResults:")
print(f" Total time: {total_time:.2f}s")
print(f" Successful: {len(successful)}/{num_concurrent}")
print(f" Average latency: {benchmark_results['avg_latency']:.2f}s")
print(f" Requests/sec: {benchmark_results['requests_per_sec']:.2f}")
return benchmark_results
async def benchmark_different_lengths(self) -> Dict:
"""Benchmark with different output lengths"""
print(f"\n{'='*60}")
print("BENCHMARK: Different Output Lengths")
print(f"{'='*60}")
test_cases = [
{"name": "Short (50 tokens)", "max_tokens": 50},
{"name": "Medium (100 tokens)", "max_tokens": 100},
{"name": "Long (200 tokens)", "max_tokens": 200},
{"name": "Very Long (500 tokens)", "max_tokens": 500},
]
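        # One request per target length; record latency, generated token count, and throughput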
results_by_length = {}
for test_case in test_cases:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "Write about the history of computing."}
],
"max_tokens": test_case["max_tokens"],
"temperature": 0.7
}
start = time.time()
response = await self.client.post(
f"{self.base_url}/v1/chat/completions",
json=payload
)
end = time.time()
if response.status_code == 200:
data = response.json()
latency = end - start
completion_tokens = data["usage"]["completion_tokens"]
tps = completion_tokens / latency if latency > 0 else 0
results_by_length[test_case["name"]] = {
"latency": latency,
"tokens": completion_tokens,
"tokens_per_sec": tps
}
print(f"\n{test_case['name']}:")
print(f" Generated: {completion_tokens} tokens")
print(f" Time: {latency:.2f}s")
print(f" Throughput: {tps:.2f} tokens/sec")
return results_by_length
async def benchmark_openai_compatibility(self) -> Dict:
"""Test OpenAI API compatibility"""
print(f"\n{'='*60}")
print("BENCHMARK: OpenAI API Compatibility")
print(f"{'='*60}")
tests = {
"list_models": False,
"chat_completions": False,
"system_message": False,
"conversation_history": False,
"streaming": False,
"temperature_param": False,
"max_tokens_param": False,
}
# Test 1: List models
try:
response = await self.client.get(f"{self.base_url}/v1/models")
if response.status_code == 200:
data = response.json()
if "data" in data and len(data["data"]) > 0:
tests["list_models"] = True
print("βœ“ List models endpoint")
        except Exception:
            pass
# Test 2: Chat completions
try:
payload = {"model": "DragonLLM/LLM-Pro-Finance-Small", "messages": [{"role": "user", "content": "Hi"}]}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
data = response.json()
if "choices" in data and "usage" in data:
tests["chat_completions"] = True
print("βœ“ Chat completions endpoint")
        except Exception:
            pass
# Test 3: System message
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "system", "content": "Be helpful."},
{"role": "user", "content": "Hi"}
]
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["system_message"] = True
print("βœ“ System message support")
        except Exception:
            pass
# Test 4: Conversation history
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "My name is Alice"},
{"role": "assistant", "content": "Hello Alice"},
{"role": "user", "content": "What's my name?"}
]
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["conversation_history"] = True
print("βœ“ Conversation history")
        except Exception:
            pass
# Test 5: Temperature parameter
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [{"role": "user", "content": "Hi"}],
"temperature": 0.5
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["temperature_param"] = True
print("βœ“ Temperature parameter")
        except Exception:
            pass
# Test 6: Max tokens parameter
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [{"role": "user", "content": "Hi"}],
"max_tokens": 10
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["max_tokens_param"] = True
print("βœ“ Max tokens parameter")
        except Exception:
            pass
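        # Aggregate the individual pass/fail checks into a single compatibility score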
passed = sum(1 for v in tests.values() if v)
total = len(tests)
print(f"\nCompatibility Score: {passed}/{total} ({100*passed/total:.0f}%)")
return {"tests": tests, "score": f"{passed}/{total}"}
async def run_all_benchmarks(self):
"""Run all benchmarks"""
print(f"\n{'#'*60}")
print("LLM Pro Finance API - Comprehensive Benchmark Suite")
print(f"Service: {self.base_url}")
print(f"{'#'*60}")
# Health check
print("\nChecking service health...")
        if not await self.health_check():
            print("❌ Service is not available!")
            await self.client.aclose()
            return
print("βœ“ Service is healthy")
# Run benchmarks
self.results["single_request"] = await self.benchmark_single_request(num_runs=5)
self.results["concurrent_load"] = await self.benchmark_concurrent_load(num_concurrent=5)
self.results["different_lengths"] = await self.benchmark_different_lengths()
self.results["openai_compatibility"] = await self.benchmark_openai_compatibility()
# Summary
print(f"\n{'#'*60}")
print("SUMMARY")
print(f"{'#'*60}")
print(f"\n⚑ Performance:")
print(f" Average latency: {self.results['single_request']['avg_latency']:.2f}s")
print(f" Token throughput: {self.results['single_request']['avg_tokens_per_sec']:.2f} tokens/sec")
print(f" Concurrent capacity: {self.results['concurrent_load']['requests_per_sec']:.2f} req/sec")
print(f"\nπŸ”Œ OpenAI Compatibility: {self.results['openai_compatibility']['score']}")
# Save results
with open("benchmark_results.json", "w") as f:
json.dump(self.results, f, indent=2)
print(f"\nπŸ“Š Full results saved to benchmark_results.json")
await self.client.aclose()
async def main():
benchmark = Benchmark()
await benchmark.run_all_benchmarks()
if __name__ == "__main__":
asyncio.run(main())