open-finance-llm-8b / test_finance_queries.py
jeanbaptdzd's picture
Fix generation: increase tokens for complete answers, add EOS handling
78f67d6
raw
history blame
8.08 kB
#!/usr/bin/env python3
"""
Test the deployed finance LLM with various finance-specific questions.
"""
import httpx
import json
import time
from typing import Dict, Any, List
# Root URL of the deployed service; every request path is appended to it.
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"

# Finance test questions covering different domains.
# Each case carries the question to ask plus substrings that are looked up
# (case-insensitively) in the model's answer.
FINANCE_TESTS: List[Dict[str, Any]] = [
    {
        "category": "Financial Calculations",
        "question": (
            "If I invest $10,000 at an annual interest rate of 5% "
            "compounded annually, how much will I have after 3 years?"
        ),
        "expected_topics": ["compound interest", "10000", "5%", "3 years"],
    },
    {
        "category": "Risk Management",
        "question": (
            "What is Value at Risk (VaR) and how is it used in "
            "portfolio management?"
        ),
        "expected_topics": ["VaR", "risk", "portfolio", "loss"],
    },
    {
        "category": "Financial Instruments",
        "question": (
            "Explain the difference between a call option and a put option."
        ),
        "expected_topics": ["call", "put", "option", "buy", "sell"],
    },
    {
        "category": "Market Analysis",
        "question": "What factors typically influence stock market volatility?",
        "expected_topics": ["volatility", "market", "uncertainty", "factors"],
    },
    {
        "category": "Corporate Finance",
        "question": "What is the difference between EBITDA and net income?",
        "expected_topics": ["EBITDA", "net income", "earnings", "depreciation"],
    },
    {
        "category": "Investment Strategy",
        "question": (
            "What is diversification and why is it important in investing?"
        ),
        "expected_topics": ["diversification", "risk", "portfolio", "assets"],
    },
    {
        "category": "Financial Ratios",
        "question": (
            "How do you calculate and interpret the Price-to-Earnings "
            "(P/E) ratio?"
        ),
        "expected_topics": ["P/E", "price", "earnings", "ratio", "valuation"],
    },
    {
        "category": "Fixed Income",
        "question": "What happens to bond prices when interest rates rise?",
        "expected_topics": ["bond", "interest rate", "price", "inverse"],
    },
]
def test_endpoint_availability() -> bool:
    """Probe the service root and report whether it answered successfully.

    Issues a GET to ``BASE_URL + "/"`` and prints the ``backend``, ``model``
    and ``service`` fields of the JSON body.

    Returns:
        True when the request succeeded with a 2xx status and a JSON object
        body; False on any transport error, error status, or unparsable body.
    """
    print("\n" + "="*80)
    print("TESTING ENDPOINT AVAILABILITY")
    print("="*80)
    try:
        response = httpx.get(f"{BASE_URL}/", timeout=30.0)
        # An error status with a JSON body (e.g. a 5xx error payload) must
        # not be reported as "available"; turn it into an exception here.
        response.raise_for_status()
        data = response.json()
        print(f"✅ Status: {response.status_code}")
        print(f"✅ Backend: {data.get('backend')}")
        print(f"✅ Model: {data.get('model')}")
        print(f"✅ Service: {data.get('service')}")
        return True
    except Exception as e:
        # Best-effort probe: every failure mode (network, HTTP status, JSON
        # decoding, unexpected body shape) simply means "not available".
        print(f"❌ Error: {e}")
        return False
def test_models_endpoint() -> bool:
    """Query ``/v1/models`` and list the model ids the service advertises.

    Returns:
        True when the request succeeded with a 2xx status and the body could
        be read; False on any transport error, error status, or unparsable
        body.
    """
    print("\n" + "="*80)
    print("TESTING MODELS ENDPOINT")
    print("="*80)
    try:
        response = httpx.get(f"{BASE_URL}/v1/models", timeout=30.0)
        # Without this check a 4xx/5xx reply carrying JSON would be printed
        # with a success tick and counted as a working endpoint.
        response.raise_for_status()
        models = response.json().get('data', [])
        print(f"✅ Status: {response.status_code}")
        print(f"✅ Available models: {len(models)}")
        for model in models:
            print(f" - {model.get('id')}")
        return True
    except Exception as e:
        # Best-effort probe: any failure is reported and mapped to False so
        # the caller can decide whether to continue.
        print(f"❌ Error: {e}")
        return False
def run_finance_test(test: Dict[str, Any], max_tokens: int = 200) -> Dict[str, Any]:
    """Send one finance question to ``/v1/chat/completions`` and score the answer.

    Args:
        test: Test case with "category" and "question" keys and an optional
            "expected_topics" list of substrings to look for in the answer.
        max_tokens: Value forwarded as ``max_tokens`` in the request payload.

    Returns:
        On success, a dict with "success" (True), "category", "time"
        (elapsed seconds), "tokens" (reported total tokens, 0 when absent),
        "topics_found" and "topics_expected" (counts).
        On failure, a dict with "success" (False), "category" and "error".
    """
    separator = "─" * 80
    print(f"\n{separator}")
    print(f"Category: {test['category']}")
    print(f"Question: {test['question']}")
    print(separator)

    payload = {
        "model": "DragonLLM/qwen3-8b-fin-v1.0",
        "messages": [
            {"role": "user", "content": test["question"]}
        ],
        "temperature": 0.3,
        "max_tokens": max_tokens
    }
    expected_topics = test.get('expected_topics') or []

    # perf_counter is monotonic, so the measured latency cannot be skewed
    # by system clock adjustments while the request is in flight.
    start_time = time.perf_counter()
    try:
        response = httpx.post(
            f"{BASE_URL}/v1/chat/completions",
            json=payload,
            timeout=60.0
        )
        elapsed = time.perf_counter() - start_time

        if response.status_code != 200:
            print(f"❌ Error: HTTP {response.status_code}")
            print(f" {response.text}")
            return {
                "success": False,
                "category": test['category'],
                "error": f"HTTP {response.status_code}"
            }

        data = response.json()
        # "content" and "usage" may be present but null; normalise them so a
        # valid response is not misreported as a failure further down.
        answer = data['choices'][0]['message']['content'] or ""
        usage = data.get('usage') or {}

        print("\n📊 Response Stats:")
        print(f" ⏱️ Time: {elapsed:.2f}s")
        # NOTE(review): the glyph on the next line was truncated mojibake in
        # the source; 📝 is the most likely intended emoji - confirm.
        print(f" 📝 Tokens: {usage.get('total_tokens', 'N/A')} "
              f"(prompt: {usage.get('prompt_tokens', 'N/A')}, "
              f"completion: {usage.get('completion_tokens', 'N/A')})")
        print(f"\n💬 Answer:\n{answer}")

        # Check if expected topics are mentioned (case-insensitive substring).
        answer_lower = answer.lower()
        topics_found = [topic for topic in expected_topics
                        if topic.lower() in answer_lower]
        if topics_found:
            print(f"\n✅ Relevant topics found: {', '.join(topics_found)}")

        return {
            "success": True,
            "category": test['category'],
            "time": elapsed,
            # A null token count would break numeric aggregation downstream.
            "tokens": usage.get('total_tokens') or 0,
            "topics_found": len(topics_found),
            "topics_expected": len(expected_topics)
        }
    except Exception as e:
        # Boundary handler: one failing question (timeout, bad JSON, missing
        # fields) must not abort the remaining tests.
        elapsed = time.perf_counter() - start_time
        print(f"❌ Error after {elapsed:.2f}s: {e}")
        return {
            "success": False,
            "category": test['category'],
            "error": str(e)
        }
def print_summary(results: List[Dict[str, Any]]) -> None:
    """Print pass/fail counts, aggregate metrics and the list of failures.

    Args:
        results: Result dicts. An entry counts as successful when its
            "success" value is truthy; successful entries are read for
            "time", "tokens", "topics_found" and "topics_expected", failed
            entries for "category" and "error".
    """
    print("\n" + "="*80)
    print("TEST SUMMARY")
    print("="*80)

    successful = [r for r in results if r.get('success')]
    failed = [r for r in results if not r.get('success')]

    print(f"\n✅ Successful: {len(successful)}/{len(results)}")
    print(f"❌ Failed: {len(failed)}/{len(results)}")

    if successful:
        avg_time = sum(r['time'] for r in successful) / len(successful)
        # A missing or null token count is treated as 0 so that one odd
        # result cannot make the whole summary raise.
        avg_tokens = sum(r.get('tokens') or 0 for r in successful) / len(successful)
        total_topics = sum(r['topics_found'] for r in successful)
        expected_topics = sum(r['topics_expected'] for r in successful)
        # Guard the division: cases without expected topics yield 0%.
        coverage_pct = 100 * total_topics / expected_topics if expected_topics > 0 else 0

        print("\n📊 Performance Metrics:")
        print(f" ⏱️ Average response time: {avg_time:.2f}s")
        # NOTE(review): the glyph on the next line was truncated mojibake in
        # the source; 📝 is the most likely intended emoji - confirm.
        print(f" 📝 Average tokens: {avg_tokens:.0f}")
        print(f" 🎯 Topic coverage: {total_topics}/{expected_topics} "
              f"({coverage_pct:.1f}%)")

    if failed:
        print("\n❌ Failed Tests:")
        for r in failed:
            print(f" - {r['category']}: {r.get('error', 'Unknown error')}")
def main():
    """Drive the suite: connectivity checks, every finance question, summary.

    Stops early when the root endpoint check fails; a failing models
    endpoint check only produces a warning.
    """
    banner = "=" * 80
    total = len(FINANCE_TESTS)

    print(banner)
    print("FINANCE LLM TESTING SUITE")
    print(banner)
    print(f"Target: {BASE_URL}")
    print(f"Total tests: {total}")

    # Without a reachable root endpoint there is nothing useful to run.
    endpoint_up = test_endpoint_availability()
    if not endpoint_up:
        print("\n❌ Endpoint not available. Exiting.")
        return

    # The models listing is informational only, so a failure is tolerated.
    models_ok = test_models_endpoint()
    if not models_ok:
        print("\n⚠️ Models endpoint not available, but continuing...")

    print("\n" + banner)
    print("RUNNING FINANCE TESTS")
    print(banner)

    results = []
    for index, test in enumerate(FINANCE_TESTS, start=1):
        # Pause one second between consecutive requests (never before the
        # first one and never after the last one).
        if index > 1:
            time.sleep(1)
        print(f"\n[Test {index}/{total}]")
        results.append(run_finance_test(test))

    print_summary(results)

    print("\n" + banner)
    print("TESTING COMPLETE")
    print(banner)
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()