#!/usr/bin/env python3
"""
Comprehensive benchmark suite for LLM Pro Finance API
Run with: python tests/performance/benchmark.py
"""
import asyncio
import httpx
import time
import statistics
from typing import Dict
import json

# Configuration
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
# BASE_URL = "http://localhost:7860"  # For local testing


class Benchmark:
    def __init__(self, base_url: str = BASE_URL):
        self.base_url = base_url
        self.client = httpx.AsyncClient(timeout=120.0)
        self.results = {}

    async def health_check(self) -> bool:
        """Check if service is available"""
        try:
            response = await self.client.get(f"{self.base_url}/health")
            return response.status_code == 200
        except httpx.HTTPError:
            return False

    async def benchmark_single_request(self, num_runs: int = 10) -> Dict:
        """Benchmark single request latency"""
        print(f"\n{'='*60}")
        print("BENCHMARK: Single Request Latency")
        print(f"{'='*60}")

        latencies = []
        tokens_per_sec = []

        payload = {
            "model": "DragonLLM/Qwen-Open-Finance-R-8B",
            "messages": [
                {"role": "user", "content": "What is artificial intelligence?"}
            ],
            "max_tokens": 100,
            "temperature": 0.7
        }

        for i in range(num_runs):
            start = time.time()
            response = await self.client.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload
            )
            end = time.time()

            if response.status_code == 200:
                data = response.json()
                latency = end - start
                completion_tokens = data["usage"]["completion_tokens"]
                tps = completion_tokens / latency if latency > 0 else 0

                latencies.append(latency)
                tokens_per_sec.append(tps)

                print(f"Run {i+1}/{num_runs}: {latency:.2f}s, {tps:.2f} tokens/sec")
            else:
                print(f"Run {i+1}/{num_runs}: failed (HTTP {response.status_code})")

        results = {
            "avg_latency": statistics.mean(latencies),
            "min_latency": min(latencies),
            "max_latency": max(latencies),
            "std_latency": statistics.stdev(latencies) if len(latencies) > 1 else 0,
            "avg_tokens_per_sec": statistics.mean(tokens_per_sec),
            "max_tokens_per_sec": max(tokens_per_sec),
        }

        print("\nResults:")
        print(f"  Average latency: {results['avg_latency']:.2f}s (±{results['std_latency']:.2f}s)")
        print(f"  Min/Max latency: {results['min_latency']:.2f}s / {results['max_latency']:.2f}s")
        print(f"  Average throughput: {results['avg_tokens_per_sec']:.2f} tokens/sec")
        print(f"  Max throughput: {results['max_tokens_per_sec']:.2f} tokens/sec")

        return results

    async def benchmark_concurrent_load(self, num_concurrent: int = 10) -> Dict:
        """Benchmark concurrent request handling"""
        print(f"\n{'='*60}")
        print(f"BENCHMARK: Concurrent Load ({num_concurrent} requests)")
        print(f"{'='*60}")

        async def make_request(request_id: int):
            payload = {
                "model": "DragonLLM/Qwen-Open-Finance-R-8B",
                "messages": [
                    {"role": "user", "content": f"Request {request_id}: Explain machine learning."}
                ],
                "max_tokens": 50,
                "temperature": 0.7
            }
            start = time.time()
            response = await self.client.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload
            )
            end = time.time()
            return {
                "request_id": request_id,
                "latency": end - start,
                "status": response.status_code,
                "data": response.json() if response.status_code == 200 else None
            }

        start_time = time.time()
        results = await asyncio.gather(*[make_request(i) for i in range(num_concurrent)])
        end_time = time.time()

        total_time = end_time - start_time
        successful = [r for r in results if r["status"] == 200]
        latencies = [r["latency"] for r in successful]

        benchmark_results = {
            "total_time": total_time,
            "num_requests": num_concurrent,
            "successful": len(successful),
            "failed": num_concurrent - len(successful),
            "avg_latency": statistics.mean(latencies) if latencies else 0,
            "requests_per_sec": num_concurrent / total_time,
        }

        print("\nResults:")
print(f" Total time: {total_time:.2f}s") print(f" Successful: {len(successful)}/{num_concurrent}") print(f" Average latency: {benchmark_results['avg_latency']:.2f}s") print(f" Requests/sec: {benchmark_results['requests_per_sec']:.2f}") return benchmark_results async def benchmark_different_lengths(self) -> Dict: """Benchmark with different output lengths""" print(f"\n{'='*60}") print("BENCHMARK: Different Output Lengths") print(f"{'='*60}") test_cases = [ {"name": "Short (50 tokens)", "max_tokens": 50}, {"name": "Medium (100 tokens)", "max_tokens": 100}, {"name": "Long (200 tokens)", "max_tokens": 200}, {"name": "Very Long (500 tokens)", "max_tokens": 500}, ] results_by_length = {} for test_case in test_cases: payload = { "model": "DragonLLM/Qwen-Open-Finance-R-8B", "messages": [ {"role": "user", "content": "Write about the history of computing."} ], "max_tokens": test_case["max_tokens"], "temperature": 0.7 } start = time.time() response = await self.client.post( f"{self.base_url}/v1/chat/completions", json=payload ) end = time.time() if response.status_code == 200: data = response.json() latency = end - start completion_tokens = data["usage"]["completion_tokens"] tps = completion_tokens / latency if latency > 0 else 0 results_by_length[test_case["name"]] = { "latency": latency, "tokens": completion_tokens, "tokens_per_sec": tps } print(f"\n{test_case['name']}:") print(f" Generated: {completion_tokens} tokens") print(f" Time: {latency:.2f}s") print(f" Throughput: {tps:.2f} tokens/sec") return results_by_length async def benchmark_openai_compatibility(self) -> Dict: """Test OpenAI API compatibility""" print(f"\n{'='*60}") print("BENCHMARK: OpenAI API Compatibility") print(f"{'='*60}") tests = { "list_models": False, "chat_completions": False, "system_message": False, "conversation_history": False, "streaming": False, "temperature_param": False, "max_tokens_param": False, } # Test 1: List models try: response = await self.client.get(f"{self.base_url}/v1/models") if response.status_code == 200: data = response.json() if "data" in data and len(data["data"]) > 0: tests["list_models"] = True print("✓ List models endpoint") except: pass # Test 2: Chat completions try: payload = {"model": "DragonLLM/LLM-Pro-Finance-Small", "messages": [{"role": "user", "content": "Hi"}]} response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload) if response.status_code == 200: data = response.json() if "choices" in data and "usage" in data: tests["chat_completions"] = True print("✓ Chat completions endpoint") except: pass # Test 3: System message try: payload = { "model": "DragonLLM/Qwen-Open-Finance-R-8B", "messages": [ {"role": "system", "content": "Be helpful."}, {"role": "user", "content": "Hi"} ] } response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload) if response.status_code == 200: tests["system_message"] = True print("✓ System message support") except: pass # Test 4: Conversation history try: payload = { "model": "DragonLLM/Qwen-Open-Finance-R-8B", "messages": [ {"role": "user", "content": "My name is Alice"}, {"role": "assistant", "content": "Hello Alice"}, {"role": "user", "content": "What's my name?"} ] } response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload) if response.status_code == 200: tests["conversation_history"] = True print("✓ Conversation history") except: pass # Test 5: Temperature parameter try: payload = { "model": "DragonLLM/Qwen-Open-Finance-R-8B", "messages": [{"role": "user", "content": 
"Hi"}], "temperature": 0.5 } response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload) if response.status_code == 200: tests["temperature_param"] = True print("✓ Temperature parameter") except: pass # Test 6: Max tokens parameter try: payload = { "model": "DragonLLM/Qwen-Open-Finance-R-8B", "messages": [{"role": "user", "content": "Hi"}], "max_tokens": 10 } response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload) if response.status_code == 200: tests["max_tokens_param"] = True print("✓ Max tokens parameter") except: pass passed = sum(1 for v in tests.values() if v) total = len(tests) print(f"\nCompatibility Score: {passed}/{total} ({100*passed/total:.0f}%)") return {"tests": tests, "score": f"{passed}/{total}"} async def run_all_benchmarks(self): """Run all benchmarks""" print(f"\n{'#'*60}") print("LLM Pro Finance API - Comprehensive Benchmark Suite") print(f"Service: {self.base_url}") print(f"{'#'*60}") # Health check print("\nChecking service health...") if not await self.health_check(): print("❌ Service is not available!") return print("✓ Service is healthy") # Run benchmarks self.results["single_request"] = await self.benchmark_single_request(num_runs=5) self.results["concurrent_load"] = await self.benchmark_concurrent_load(num_concurrent=5) self.results["different_lengths"] = await self.benchmark_different_lengths() self.results["openai_compatibility"] = await self.benchmark_openai_compatibility() # Summary print(f"\n{'#'*60}") print("SUMMARY") print(f"{'#'*60}") print(f"\n⚡ Performance:") print(f" Average latency: {self.results['single_request']['avg_latency']:.2f}s") print(f" Token throughput: {self.results['single_request']['avg_tokens_per_sec']:.2f} tokens/sec") print(f" Concurrent capacity: {self.results['concurrent_load']['requests_per_sec']:.2f} req/sec") print(f"\n🔌 OpenAI Compatibility: {self.results['openai_compatibility']['score']}") # Save results with open("benchmark_results.json", "w") as f: json.dump(self.results, f, indent=2) print(f"\n📊 Full results saved to benchmark_results.json") await self.client.aclose() async def main(): benchmark = Benchmark() await benchmark.run_all_benchmarks() if __name__ == "__main__": asyncio.run(main())