#!/usr/bin/env python3
"""
Analyze model performance: inference speed, throughput, and parallelization.
"""
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any

import httpx
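
# Base URL of the Hugging Face Space serving the model; the script assumes it
# exposes an OpenAI-compatible /v1/chat/completions endpoint (used by all requests below).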
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
def analyze_test_results():
"""Analyze the results from previous tests."""
print("="*80)
print("PERFORMANCE ANALYSIS FROM RECENT TESTS")
print("="*80)
    # Metrics recorded from the earlier English and French test runs (hard-coded here).
english_tests = {
"total_tests": 8,
"avg_time": 11.74,
"avg_tokens": 175,
"max_tokens": 150,
}
french_tests = {
"total_tests": 10,
"avg_time": 12.03,
"avg_tokens": 180,
"max_tokens": 150,
}
# Calculate metrics
    print(f"\n📊 English Tests:")
print(f" Average response time: {english_tests['avg_time']:.2f}s")
print(f" Average tokens generated: {english_tests['avg_tokens']}")
print(f" Tokens per second: {english_tests['avg_tokens'] / english_tests['avg_time']:.2f}")
print(f" Token efficiency: {english_tests['avg_tokens'] / english_tests['max_tokens'] * 100:.1f}%")
    print(f"\n📊 French Tests:")
print(f" Average response time: {french_tests['avg_time']:.2f}s")
print(f" Average tokens generated: {french_tests['avg_tokens']}")
print(f" Tokens per second: {french_tests['avg_tokens'] / french_tests['avg_time']:.2f}")
print(f" Token efficiency: {french_tests['avg_tokens'] / french_tests['max_tokens'] * 100:.1f}%")
overall_tokens_per_sec = (english_tests['avg_tokens'] + french_tests['avg_tokens']) / \
(english_tests['avg_time'] + french_tests['avg_time'])
    print(f"\n🚀 Overall Performance:")
print(f" Average tokens/second: {overall_tokens_per_sec:.2f}")
print(f" Current hardware: L4x1 GPU")
print(f" Model size: 8B parameters (Qwen3)")
return overall_tokens_per_sec
def test_single_request():
"""Test a single request to measure baseline performance."""
print("\n" + "="*80)
print("BASELINE SINGLE REQUEST TEST")
print("="*80)
payload = {
"model": "DragonLLM/qwen3-8b-fin-v1.0",
"messages": [
{"role": "user", "content": "Explain compound interest in one sentence."}
],
"temperature": 0.2,
"max_tokens": 50
}
start = time.time()
try:
response = httpx.post(
f"{BASE_URL}/v1/chat/completions",
json=payload,
timeout=60.0
)
elapsed = time.time() - start
if response.status_code == 200:
data = response.json()
tokens = data['usage']['completion_tokens']
            print(f"\n✅ Response received")
            print(f"   ⏱️ Time: {elapsed:.2f}s")
            print(f"   📝 Tokens: {tokens}")
            print(f"   🚀 Speed: {tokens/elapsed:.2f} tokens/s")
return tokens, elapsed
else:
            print(f"❌ Error: {response.status_code}")
return None, None
except Exception as e:
        print(f"❌ Error: {e}")
return None, None
def test_concurrent_requests(num_requests: int = 3):
"""Test multiple concurrent requests to check parallelization."""
print("\n" + "="*80)
print(f"CONCURRENT REQUESTS TEST ({num_requests} parallel requests)")
print("="*80)
questions = [
"What is a stock?",
"What is a bond?",
"What is diversification?",
"What is ROI?",
"What is inflation?",
][:num_requests]
def make_request(question: str, index: int):
payload = {
"model": "DragonLLM/qwen3-8b-fin-v1.0",
"messages": [{"role": "user", "content": question}],
"temperature": 0.2,
"max_tokens": 50
}
start = time.time()
try:
response = httpx.post(
f"{BASE_URL}/v1/chat/completions",
json=payload,
timeout=90.0
)
elapsed = time.time() - start
if response.status_code == 200:
data = response.json()
return {
"index": index,
"question": question,
"time": elapsed,
"tokens": data['usage']['completion_tokens'],
"success": True
}
else:
return {"index": index, "success": False, "error": response.status_code}
except Exception as e:
return {"index": index, "success": False, "error": str(e)}
print(f"\nSending {num_requests} requests simultaneously...")
overall_start = time.time()
with ThreadPoolExecutor(max_workers=num_requests) as executor:
futures = [executor.submit(make_request, q, i) for i, q in enumerate(questions)]
results = [future.result() for future in as_completed(futures)]
overall_elapsed = time.time() - overall_start
# Sort results by index
results.sort(key=lambda x: x.get('index', 0))
successful = [r for r in results if r.get('success')]
    print(f"\n📊 Results:")
print(f" Total time: {overall_elapsed:.2f}s")
print(f" Successful: {len(successful)}/{num_requests}")
if successful:
for r in successful:
print(f"\n Request {r['index'] + 1}: {r['question'][:40]}...")
print(f" Time: {r['time']:.2f}s")
print(f" Tokens: {r['tokens']}")
print(f" Speed: {r['tokens']/r['time']:.2f} tokens/s")
avg_time = sum(r['time'] for r in successful) / len(successful)
total_tokens = sum(r['tokens'] for r in successful)
        print(f"\n   📈 Average per request: {avg_time:.2f}s")
        print(f"   📝 Total tokens: {total_tokens}")
        print(f"   ⚡ Throughput: {total_tokens/overall_elapsed:.2f} tokens/s overall")
# Check if requests were parallelized
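        # Heuristic (with a 0.8 fudge factor): if total wall-clock time is well below
        # the sum of per-request times, the server likely overlapped the requests.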
if overall_elapsed < avg_time * num_requests * 0.8:
            print(f"   ✅ Requests appear to be parallelized")
            parallel_speedup = (avg_time * num_requests) / overall_elapsed
            print(f"   🚀 Speedup: {parallel_speedup:.2f}x")
        else:
            print(f"   ⚠️ Requests appear to be sequential (no parallelization)")
            print(f"   💡 Expected time if parallel: ~{avg_time:.2f}s")
            print(f"   💡 Actual time: {overall_elapsed:.2f}s")
return successful, overall_elapsed
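

# A minimal sketch (not called from main()) of an asyncio-based alternative to the
# ThreadPoolExecutor approach above, using httpx.AsyncClient. It mirrors the payload,
# timeout, and result shape of test_concurrent_requests; the function name and type
# hints are additions for illustration, not part of the original test flow.
async def concurrent_requests_async(questions: List[str]) -> List[Dict[str, Any]]:
    """Send one chat completion per question concurrently and collect timings."""
    async def one(client: httpx.AsyncClient, question: str, index: int) -> Dict[str, Any]:
        payload = {
            "model": "DragonLLM/qwen3-8b-fin-v1.0",
            "messages": [{"role": "user", "content": question}],
            "temperature": 0.2,
            "max_tokens": 50,
        }
        start = time.time()
        try:
            response = await client.post(f"{BASE_URL}/v1/chat/completions", json=payload)
            elapsed = time.time() - start
            if response.status_code == 200:
                data = response.json()
                return {
                    "index": index,
                    "question": question,
                    "time": elapsed,
                    "tokens": data["usage"]["completion_tokens"],
                    "success": True,
                }
            return {"index": index, "success": False, "error": response.status_code}
        except Exception as e:  # same broad error handling as the sync helper above
            return {"index": index, "success": False, "error": str(e)}

    async with httpx.AsyncClient(timeout=90.0) as client:
        return await asyncio.gather(*(one(client, q, i) for i, q in enumerate(questions)))
# Example usage: asyncio.run(concurrent_requests_async(["What is a stock?", "What is a bond?"]))
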
def analyze_hardware_upgrade():
"""Analyze potential benefits of upgrading to L40s."""
print("\n" + "="*80)
    print("HARDWARE UPGRADE ANALYSIS: L4x1 → L40s")
print("="*80)
    print("\n📊 Current Setup (L4x1):")
print(" GPU: NVIDIA L4")
print(" VRAM: 24 GB")
print(" vCPU: 15")
print(" RAM: 44 GB")
print(" Cost: ~$0.70/hour ($521/month)")
    print("\n📊 Upgrade Option (L40s):")
print(" GPU: NVIDIA L40s")
print(" VRAM: 48 GB (2x L4)")
print(" vCPU: 30 (2x L4)")
print(" RAM: 92 GB (2x L4)")
print(" Cost: ~$1.55/hour ($1153/month)")
print(" Cost increase: +$632/month (+121%)")
    print("\n🎯 Expected Benefits:")
    print("   ✅ Better parallelization: More VRAM allows larger batch sizes")
    print("   ✅ Faster inference: ~1.5-2x faster per request")
    print("   ✅ Higher throughput: 2-3x more concurrent requests")
    print("   ✅ Reduced latency: Better for multiple users")
    print("\n💡 Recommendations:")
print(" 1. L4x1 is sufficient for:")
print(" - Sequential requests")
print(" - Low to medium traffic (<10 requests/min)")
print(" - Development/testing")
print("\n 2. Upgrade to L40s if:")
print(" - Need to handle concurrent requests efficiently")
print(" - Expecting >20 requests/min")
print(" - Latency is critical (<5s response time)")
print(" - Multiple users accessing simultaneously")
print("\n 3. Current bottleneck:")
    print("   - Transformers backend serves requests sequentially (no batching) by default")
print(" - Need batching support for true parallelization")
print(" - Consider implementing request batching")
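

# A minimal sketch of the vLLM batching path mentioned in the recommendations above
# and in the findings printed by main(); it is not called anywhere in this script,
# needs `pip install vllm` and enough VRAM for the 8B model, and the sampling values
# are illustrative assumptions rather than tuned settings.
def sketch_vllm_batched_generation(prompts):
    """Generate completions for a whole batch of prompts in a single vLLM call."""
    from vllm import LLM, SamplingParams  # local import: optional dependency

    llm = LLM(model="DragonLLM/qwen3-8b-fin-v1.0")
    sampling_params = SamplingParams(temperature=0.2, max_tokens=50)
    # vLLM schedules the whole list with continuous batching instead of serving
    # each request sequentially, which is the bottleneck noted for the current backend.
    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]
# Example usage: answers = sketch_vllm_batched_generation(["What is a stock?", "What is a bond?"])
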
def main():
"""Run performance analysis."""
print("="*80)
print("FINANCE LLM PERFORMANCE ANALYSIS")
print("="*80)
# Analyze previous test results
avg_tokens_per_sec = analyze_test_results()
# Test single request
tokens, elapsed = test_single_request()
# Test concurrent requests
print("\n" + "="*80)
print("Testing with 2 concurrent requests...")
test_concurrent_requests(2)
time.sleep(2)
print("\n" + "="*80)
print("Testing with 3 concurrent requests...")
test_concurrent_requests(3)
# Hardware analysis
analyze_hardware_upgrade()
print("\n" + "="*80)
print("KEY FINDINGS")
print("="*80)
    print(f"""
📊 Current Performance:
   • Average inference speed: ~{avg_tokens_per_sec:.1f} tokens/second
   • Average response time: ~12 seconds for 175 tokens
   • Model: Qwen3 8B with Transformers backend
   • Hardware: L4x1 GPU (24GB VRAM)
⚠️ Current Limitations:
   • Transformers backend processes requests sequentially
   • No built-in batching/parallelization
   • Each request waits for the previous to complete
   • GPU may be underutilized during single requests
✅ Optimization Options:
   1. SOFTWARE (No cost):
      • Implement request batching in the backend
      • Use vLLM for automatic batching (requires code change)
      • Enable continuous batching for better throughput
   2. HARDWARE (Higher cost):
      • Upgrade to L40s for 2x VRAM and compute
      • Expected: 1.5-2x faster per request
      • Better for concurrent users
      • Cost: +$632/month
   3. HYBRID APPROACH:
      • Stay on L4x1 + implement batching
      • Most cost-effective for moderate traffic
      • Can handle 5-10 concurrent requests efficiently
""")
print("="*80)
if __name__ == "__main__":
main()