""" |
|
|
Comprehensive benchmark suite for LLM Pro Finance API |
|
|
Run with: python tests/performance/benchmark.py |
|
|
""" |
|
|
import asyncio
import json
import statistics
import time
from typing import Dict

import httpx

BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space" |
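# Default target: the hosted Hugging Face Space for the finance model. Pass
# Benchmark(base_url=...) to point the suite at another deployment instead.
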
class Benchmark:
    def __init__(self, base_url: str = BASE_URL):
        self.base_url = base_url
        self.client = httpx.AsyncClient(timeout=120.0)
        self.results = {}

    async def health_check(self) -> bool:
        """Check if the service is available."""
        try:
            response = await self.client.get(f"{self.base_url}/health")
            return response.status_code == 200
        except httpx.HTTPError:
            return False

    async def benchmark_single_request(self, num_runs: int = 10) -> Dict:
        """Benchmark single-request latency."""
        print(f"\n{'='*60}")
        print("BENCHMARK: Single Request Latency")
        print(f"{'='*60}")

        latencies = []
        tokens_per_sec = []

        payload = {
            "model": "DragonLLM/Qwen-Open-Finance-R-8B",
            "messages": [
                {"role": "user", "content": "What is artificial intelligence?"}
            ],
            "max_tokens": 100,
            "temperature": 0.7,
        }

        for i in range(num_runs):
            start = time.time()
            response = await self.client.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload,
            )
            end = time.time()

            if response.status_code == 200:
                data = response.json()
                latency = end - start
                completion_tokens = data["usage"]["completion_tokens"]
                tps = completion_tokens / latency if latency > 0 else 0

                latencies.append(latency)
                tokens_per_sec.append(tps)

                print(f"Run {i+1}/{num_runs}: {latency:.2f}s, {tps:.2f} tokens/sec")
            else:
                print(f"Run {i+1}/{num_runs}: failed (HTTP {response.status_code})")

        # Guard against a run where every request failed, so the statistics
        # calls below never see an empty list.
        if not latencies:
            print("\nNo successful runs; returning zeroed results.")
            return {key: 0 for key in (
                "avg_latency", "min_latency", "max_latency",
                "std_latency", "avg_tokens_per_sec", "max_tokens_per_sec",
            )}

        results = {
            "avg_latency": statistics.mean(latencies),
            "min_latency": min(latencies),
            "max_latency": max(latencies),
            "std_latency": statistics.stdev(latencies) if len(latencies) > 1 else 0,
            "avg_tokens_per_sec": statistics.mean(tokens_per_sec),
            "max_tokens_per_sec": max(tokens_per_sec),
        }

        print("\nResults:")
        print(f"  Average latency: {results['avg_latency']:.2f}s (±{results['std_latency']:.2f}s)")
        print(f"  Min/Max latency: {results['min_latency']:.2f}s / {results['max_latency']:.2f}s")
        print(f"  Average throughput: {results['avg_tokens_per_sec']:.2f} tokens/sec")
        print(f"  Max throughput: {results['max_tokens_per_sec']:.2f} tokens/sec")

        return results

    async def benchmark_concurrent_load(self, num_concurrent: int = 10) -> Dict:
        """Benchmark concurrent request handling."""
        print(f"\n{'='*60}")
        print(f"BENCHMARK: Concurrent Load ({num_concurrent} requests)")
        print(f"{'='*60}")

        async def make_request(request_id: int):
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [
                    {"role": "user", "content": f"Request {request_id}: Explain machine learning."}
                ],
                "max_tokens": 50,
                "temperature": 0.7,
            }

            start = time.time()
            response = await self.client.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload,
            )
            end = time.time()

            return {
                "request_id": request_id,
                "latency": end - start,
                "status": response.status_code,
                "data": response.json() if response.status_code == 200 else None,
            }

        start_time = time.time()
        # return_exceptions=True keeps one failed connection from aborting the
        # whole batch; failed requests are simply counted as unsuccessful.
        results = await asyncio.gather(
            *[make_request(i) for i in range(num_concurrent)],
            return_exceptions=True,
        )
        end_time = time.time()

        total_time = end_time - start_time
        successful = [
            r for r in results
            if not isinstance(r, Exception) and r["status"] == 200
        ]
        latencies = [r["latency"] for r in successful]

        benchmark_results = {
            "total_time": total_time,
            "num_requests": num_concurrent,
            "successful": len(successful),
            "failed": num_concurrent - len(successful),
            "avg_latency": statistics.mean(latencies) if latencies else 0,
            "requests_per_sec": num_concurrent / total_time,
        }

        print("\nResults:")
        print(f"  Total time: {total_time:.2f}s")
        print(f"  Successful: {len(successful)}/{num_concurrent}")
        print(f"  Average latency: {benchmark_results['avg_latency']:.2f}s")
        print(f"  Requests/sec: {benchmark_results['requests_per_sec']:.2f}")

        return benchmark_results

    async def benchmark_different_lengths(self) -> Dict:
        """Benchmark with different output lengths."""
        print(f"\n{'='*60}")
        print("BENCHMARK: Different Output Lengths")
        print(f"{'='*60}")

        test_cases = [
            {"name": "Short (50 tokens)", "max_tokens": 50},
            {"name": "Medium (100 tokens)", "max_tokens": 100},
            {"name": "Long (200 tokens)", "max_tokens": 200},
            {"name": "Very Long (500 tokens)", "max_tokens": 500},
        ]

        results_by_length = {}

        for test_case in test_cases:
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [
                    {"role": "user", "content": "Write about the history of computing."}
                ],
                "max_tokens": test_case["max_tokens"],
                "temperature": 0.7,
            }

            start = time.time()
            response = await self.client.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload,
            )
            end = time.time()

            if response.status_code == 200:
                data = response.json()
                latency = end - start
                completion_tokens = data["usage"]["completion_tokens"]
                tps = completion_tokens / latency if latency > 0 else 0

                results_by_length[test_case["name"]] = {
                    "latency": latency,
                    "tokens": completion_tokens,
                    "tokens_per_sec": tps,
                }

                print(f"\n{test_case['name']}:")
                print(f"  Generated: {completion_tokens} tokens")
                print(f"  Time: {latency:.2f}s")
                print(f"  Throughput: {tps:.2f} tokens/sec")
            else:
                print(f"\n{test_case['name']}: request failed (HTTP {response.status_code})")

        return results_by_length

    async def benchmark_openai_compatibility(self) -> Dict:
        """Test OpenAI API compatibility."""
        print(f"\n{'='*60}")
        print("BENCHMARK: OpenAI API Compatibility")
        print(f"{'='*60}")

        tests = {
            "list_models": False,
            "chat_completions": False,
            "system_message": False,
            "conversation_history": False,
            "streaming": False,
            "temperature_param": False,
            "max_tokens_param": False,
        }

        # List models endpoint
        try:
            response = await self.client.get(f"{self.base_url}/v1/models")
            if response.status_code == 200:
                data = response.json()
                if "data" in data and len(data["data"]) > 0:
                    tests["list_models"] = True
                    print("✅ List models endpoint")
        except Exception:
            pass

        # Chat completions endpoint
        try:
            payload = {"model": "DragonLLM/LLM-Pro-Finance-Small", "messages": [{"role": "user", "content": "Hi"}]}
            response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
            if response.status_code == 200:
                data = response.json()
                if "choices" in data and "usage" in data:
                    tests["chat_completions"] = True
                    print("✅ Chat completions endpoint")
        except Exception:
            pass

        # System message support
        try:
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [
                    {"role": "system", "content": "Be helpful."},
                    {"role": "user", "content": "Hi"},
                ],
            }
            response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
            if response.status_code == 200:
                tests["system_message"] = True
                print("✅ System message support")
        except Exception:
            pass

        # Multi-turn conversation history
        try:
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [
                    {"role": "user", "content": "My name is Alice"},
                    {"role": "assistant", "content": "Hello Alice"},
                    {"role": "user", "content": "What's my name?"},
                ],
            }
            response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
            if response.status_code == 200:
                tests["conversation_history"] = True
                print("✅ Conversation history")
        except Exception:
            pass

        # Temperature parameter
        try:
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [{"role": "user", "content": "Hi"}],
                "temperature": 0.5,
            }
            response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
            if response.status_code == 200:
                tests["temperature_param"] = True
                print("✅ Temperature parameter")
        except Exception:
            pass

        # max_tokens parameter
        try:
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [{"role": "user", "content": "Hi"}],
                "max_tokens": 10,
            }
            response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
            if response.status_code == 200:
                tests["max_tokens_param"] = True
                print("✅ Max tokens parameter")
        except Exception:
            pass
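
        # Streaming check (a minimal sketch): the tests dict above declares a
        # "streaming" entry that was never exercised. This assumes the endpoint
        # honors the OpenAI-style "stream": true flag and replies with
        # server-sent events, which the non-streaming tests do not confirm.
        try:
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [{"role": "user", "content": "Hi"}],
                "stream": True,
                "max_tokens": 10,
            }
            async with self.client.stream(
                "POST", f"{self.base_url}/v1/chat/completions", json=payload
            ) as response:
                if response.status_code == 200:
                    async for line in response.aiter_lines():
                        # SSE payload lines are prefixed with "data:".
                        if line.startswith("data:"):
                            tests["streaming"] = True
                            print("✅ Streaming (SSE)")
                            break
        except Exception:
            pass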
|
|
|
|
|
        passed = sum(1 for v in tests.values() if v)
        total = len(tests)

        print(f"\nCompatibility Score: {passed}/{total} ({100*passed/total:.0f}%)")

        return {"tests": tests, "score": f"{passed}/{total}"}

    async def run_all_benchmarks(self):
        """Run all benchmarks."""
        print(f"\n{'#'*60}")
        print("LLM Pro Finance API - Comprehensive Benchmark Suite")
        print(f"Service: {self.base_url}")
        print(f"{'#'*60}")

        print("\nChecking service health...")
        if not await self.health_check():
            print("❌ Service is not available!")
            # Close the client even on the early-exit path.
            await self.client.aclose()
            return
        print("✅ Service is healthy")

        self.results["single_request"] = await self.benchmark_single_request(num_runs=5)
        self.results["concurrent_load"] = await self.benchmark_concurrent_load(num_concurrent=5)
        self.results["different_lengths"] = await self.benchmark_different_lengths()
        self.results["openai_compatibility"] = await self.benchmark_openai_compatibility()

        print(f"\n{'#'*60}")
        print("SUMMARY")
        print(f"{'#'*60}")
        print("\n⚡ Performance:")
        print(f"  Average latency: {self.results['single_request']['avg_latency']:.2f}s")
        print(f"  Token throughput: {self.results['single_request']['avg_tokens_per_sec']:.2f} tokens/sec")
        print(f"  Concurrent capacity: {self.results['concurrent_load']['requests_per_sec']:.2f} req/sec")
        print(f"\n🎯 OpenAI Compatibility: {self.results['openai_compatibility']['score']}")

        with open("benchmark_results.json", "w") as f:
            json.dump(self.results, f, indent=2)
        print("\n💾 Full results saved to benchmark_results.json")

        await self.client.aclose()

async def main():
    benchmark = Benchmark()
    await benchmark.run_all_benchmarks()
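
# A quick way to exercise one benchmark in isolation (a sketch that only
# reuses the existing API; nothing here is required by the suite above):
#
#     async def single():
#         bench = Benchmark()
#         if await bench.health_check():
#             await bench.benchmark_single_request(num_runs=3)
#         await bench.client.aclose()
#
#     asyncio.run(single())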
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    asyncio.run(main())