#!/usr/bin/env python3
"""
Comprehensive benchmark suite for LLM Pro Finance API
Run with: python tests/performance/benchmark.py
"""
import asyncio
import httpx
import time
import statistics
from typing import List, Dict
import json
# Configuration
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
# BASE_URL = "http://localhost:7860" # For local testing
class Benchmark:
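    """Async benchmark harness for the LLM Pro Finance OpenAI-compatible API."""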
def __init__(self, base_url: str = BASE_URL):
self.base_url = base_url
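        # Shared async client; the generous 120s timeout keeps long generations from timing out mid-run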
self.client = httpx.AsyncClient(timeout=120.0)
self.results = {}
async def health_check(self) -> bool:
"""Check if service is available"""
try:
response = await self.client.get(f"{self.base_url}/health")
return response.status_code == 200
        except Exception:
            return False
async def benchmark_single_request(self, num_runs: int = 10) -> Dict:
"""Benchmark single request latency"""
print(f"\n{'='*60}")
print("BENCHMARK: Single Request Latency")
print(f"{'='*60}")
latencies = []
tokens_per_sec = []
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "What is artificial intelligence?"}
],
"max_tokens": 100,
"temperature": 0.7
}
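        # Time each request end-to-end and derive tokens/sec from the API's reported completion_tokens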
for i in range(num_runs):
start = time.time()
response = await self.client.post(
f"{self.base_url}/v1/chat/completions",
json=payload
)
end = time.time()
if response.status_code == 200:
data = response.json()
latency = end - start
completion_tokens = data["usage"]["completion_tokens"]
tps = completion_tokens / latency if latency > 0 else 0
latencies.append(latency)
tokens_per_sec.append(tps)
print(f"Run {i+1}/{num_runs}: {latency:.2f}s, {tps:.2f} tokens/sec")
        if not latencies:
            raise RuntimeError("All benchmark requests failed; no latency statistics to report")
        results = {
"avg_latency": statistics.mean(latencies),
"min_latency": min(latencies),
"max_latency": max(latencies),
"std_latency": statistics.stdev(latencies) if len(latencies) > 1 else 0,
"avg_tokens_per_sec": statistics.mean(tokens_per_sec),
"max_tokens_per_sec": max(tokens_per_sec),
}
print(f"\nResults:")
print(f" Average latency: {results['avg_latency']:.2f}s (Β±{results['std_latency']:.2f}s)")
print(f" Min/Max latency: {results['min_latency']:.2f}s / {results['max_latency']:.2f}s")
print(f" Average throughput: {results['avg_tokens_per_sec']:.2f} tokens/sec")
print(f" Max throughput: {results['max_tokens_per_sec']:.2f} tokens/sec")
return results
async def benchmark_concurrent_load(self, num_concurrent: int = 10) -> Dict:
"""Benchmark concurrent request handling"""
print(f"\n{'='*60}")
print(f"BENCHMARK: Concurrent Load ({num_concurrent} requests)")
print(f"{'='*60}")
async def make_request(request_id: int):
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": f"Request {request_id}: Explain machine learning."}
],
"max_tokens": 50,
"temperature": 0.7
}
start = time.time()
response = await self.client.post(
f"{self.base_url}/v1/chat/completions",
json=payload
)
end = time.time()
return {
"request_id": request_id,
"latency": end - start,
"status": response.status_code,
"data": response.json() if response.status_code == 200 else None
}
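        # Fire all requests concurrently and measure wall-clock time for the whole batch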
start_time = time.time()
        results = await asyncio.gather(
            *[make_request(i) for i in range(num_concurrent)],
            return_exceptions=True,  # a single failed request should not abort the whole batch
        )
end_time = time.time()
total_time = end_time - start_time
        successful = [r for r in results if isinstance(r, dict) and r["status"] == 200]
latencies = [r["latency"] for r in successful]
benchmark_results = {
"total_time": total_time,
"num_requests": num_concurrent,
"successful": len(successful),
"failed": num_concurrent - len(successful),
"avg_latency": statistics.mean(latencies) if latencies else 0,
"requests_per_sec": num_concurrent / total_time,
}
print(f"\nResults:")
print(f" Total time: {total_time:.2f}s")
print(f" Successful: {len(successful)}/{num_concurrent}")
print(f" Average latency: {benchmark_results['avg_latency']:.2f}s")
print(f" Requests/sec: {benchmark_results['requests_per_sec']:.2f}")
return benchmark_results
async def benchmark_different_lengths(self) -> Dict:
"""Benchmark with different output lengths"""
print(f"\n{'='*60}")
print("BENCHMARK: Different Output Lengths")
print(f"{'='*60}")
test_cases = [
{"name": "Short (50 tokens)", "max_tokens": 50},
{"name": "Medium (100 tokens)", "max_tokens": 100},
{"name": "Long (200 tokens)", "max_tokens": 200},
{"name": "Very Long (500 tokens)", "max_tokens": 500},
]
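        # One request per target length; record latency, generated token count, and throughput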
results_by_length = {}
for test_case in test_cases:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "Write about the history of computing."}
],
"max_tokens": test_case["max_tokens"],
"temperature": 0.7
}
start = time.time()
response = await self.client.post(
f"{self.base_url}/v1/chat/completions",
json=payload
)
end = time.time()
if response.status_code == 200:
data = response.json()
latency = end - start
completion_tokens = data["usage"]["completion_tokens"]
tps = completion_tokens / latency if latency > 0 else 0
results_by_length[test_case["name"]] = {
"latency": latency,
"tokens": completion_tokens,
"tokens_per_sec": tps
}
print(f"\n{test_case['name']}:")
print(f" Generated: {completion_tokens} tokens")
print(f" Time: {latency:.2f}s")
print(f" Throughput: {tps:.2f} tokens/sec")
return results_by_length
async def benchmark_openai_compatibility(self) -> Dict:
"""Test OpenAI API compatibility"""
print(f"\n{'='*60}")
print("BENCHMARK: OpenAI API Compatibility")
print(f"{'='*60}")
tests = {
"list_models": False,
"chat_completions": False,
"system_message": False,
"conversation_history": False,
"streaming": False,
"temperature_param": False,
"max_tokens_param": False,
}
# Test 1: List models
try:
response = await self.client.get(f"{self.base_url}/v1/models")
if response.status_code == 200:
data = response.json()
if "data" in data and len(data["data"]) > 0:
tests["list_models"] = True
print("βœ“ List models endpoint")
        except Exception:
            pass
# Test 2: Chat completions
try:
payload = {"model": "DragonLLM/LLM-Pro-Finance-Small", "messages": [{"role": "user", "content": "Hi"}]}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
data = response.json()
if "choices" in data and "usage" in data:
tests["chat_completions"] = True
print("βœ“ Chat completions endpoint")
        except Exception:
            pass
# Test 3: System message
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "system", "content": "Be helpful."},
{"role": "user", "content": "Hi"}
]
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["system_message"] = True
print("βœ“ System message support")
        except Exception:
            pass
# Test 4: Conversation history
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [
{"role": "user", "content": "My name is Alice"},
{"role": "assistant", "content": "Hello Alice"},
{"role": "user", "content": "What's my name?"}
]
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["conversation_history"] = True
print("βœ“ Conversation history")
        except Exception:
            pass
# Test 5: Temperature parameter
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [{"role": "user", "content": "Hi"}],
"temperature": 0.5
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["temperature_param"] = True
print("βœ“ Temperature parameter")
        except Exception:
            pass
# Test 6: Max tokens parameter
try:
payload = {
"model": "DragonLLM/Qwen-Open-Finance-R-8B",
"messages": [{"role": "user", "content": "Hi"}],
"max_tokens": 10
}
response = await self.client.post(f"{self.base_url}/v1/chat/completions", json=payload)
if response.status_code == 200:
tests["max_tokens_param"] = True
print("βœ“ Max tokens parameter")
        except Exception:
            pass
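        # Aggregate the individual pass/fail checks into a single compatibility score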
passed = sum(1 for v in tests.values() if v)
total = len(tests)
print(f"\nCompatibility Score: {passed}/{total} ({100*passed/total:.0f}%)")
return {"tests": tests, "score": f"{passed}/{total}"}
async def run_all_benchmarks(self):
"""Run all benchmarks"""
print(f"\n{'#'*60}")
print("LLM Pro Finance API - Comprehensive Benchmark Suite")
print(f"Service: {self.base_url}")
print(f"{'#'*60}")
# Health check
print("\nChecking service health...")
        if not await self.health_check():
            print("❌ Service is not available!")
            await self.client.aclose()
            return
print("βœ“ Service is healthy")
# Run benchmarks
self.results["single_request"] = await self.benchmark_single_request(num_runs=5)
self.results["concurrent_load"] = await self.benchmark_concurrent_load(num_concurrent=5)
self.results["different_lengths"] = await self.benchmark_different_lengths()
self.results["openai_compatibility"] = await self.benchmark_openai_compatibility()
# Summary
print(f"\n{'#'*60}")
print("SUMMARY")
print(f"{'#'*60}")
print(f"\n⚑ Performance:")
print(f" Average latency: {self.results['single_request']['avg_latency']:.2f}s")
print(f" Token throughput: {self.results['single_request']['avg_tokens_per_sec']:.2f} tokens/sec")
print(f" Concurrent capacity: {self.results['concurrent_load']['requests_per_sec']:.2f} req/sec")
print(f"\nπŸ”Œ OpenAI Compatibility: {self.results['openai_compatibility']['score']}")
# Save results
with open("benchmark_results.json", "w") as f:
json.dump(self.results, f, indent=2)
print(f"\nπŸ“Š Full results saved to benchmark_results.json")
await self.client.aclose()
async def main():
benchmark = Benchmark()
await benchmark.run_all_benchmarks()
if __name__ == "__main__":
asyncio.run(main())