|
|
|
|
|
""" |
|
|
Test the deployed finance LLM with various finance-specific questions. |
|
|
""" |
|
|
|
|
|
import httpx |
|
|
import json |
|
|
import time |
|
|
from typing import Dict, Any, List |
|
|
|
|
|
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space" |
|
|
|
|
|
|
|
|
FINANCE_TESTS = [ |
|
|
{ |
|
|
"category": "Financial Calculations", |
|
|
"question": "If I invest $10,000 at an annual interest rate of 5% compounded annually, how much will I have after 3 years?", |
|
|
"expected_topics": ["compound interest", "10000", "5%", "3 years"] |
|
|
}, |
|
|
{ |
|
|
"category": "Risk Management", |
|
|
"question": "What is Value at Risk (VaR) and how is it used in portfolio management?", |
|
|
"expected_topics": ["VaR", "risk", "portfolio", "loss"] |
|
|
}, |
|
|
{ |
|
|
"category": "Financial Instruments", |
|
|
"question": "Explain the difference between a call option and a put option.", |
|
|
"expected_topics": ["call", "put", "option", "buy", "sell"] |
|
|
}, |
|
|
{ |
|
|
"category": "Market Analysis", |
|
|
"question": "What factors typically influence stock market volatility?", |
|
|
"expected_topics": ["volatility", "market", "uncertainty", "factors"] |
|
|
}, |
|
|
{ |
|
|
"category": "Corporate Finance", |
|
|
"question": "What is the difference between EBITDA and net income?", |
|
|
"expected_topics": ["EBITDA", "net income", "earnings", "depreciation"] |
|
|
}, |
|
|
{ |
|
|
"category": "Investment Strategy", |
|
|
"question": "What is diversification and why is it important in investing?", |
|
|
"expected_topics": ["diversification", "risk", "portfolio", "assets"] |
|
|
}, |
|
|
{ |
|
|
"category": "Financial Ratios", |
|
|
"question": "How do you calculate and interpret the Price-to-Earnings (P/E) ratio?", |
|
|
"expected_topics": ["P/E", "price", "earnings", "ratio", "valuation"] |
|
|
}, |
|
|
{ |
|
|
"category": "Fixed Income", |
|
|
"question": "What happens to bond prices when interest rates rise?", |
|
|
"expected_topics": ["bond", "interest rate", "price", "inverse"] |
|
|
}, |
|
|
] |
|
|
|
|
|
def test_endpoint_availability(): |
|
|
"""Test if the endpoint is available.""" |
|
|
print("\n" + "="*80) |
|
|
print("TESTING ENDPOINT AVAILABILITY") |
|
|
print("="*80) |
|
|
|
|
|
try: |
|
|
response = httpx.get(f"{BASE_URL}/", timeout=30.0) |
|
|
data = response.json() |
|
|
print(f"β
Status: {response.status_code}") |
|
|
print(f"β
Backend: {data.get('backend')}") |
|
|
print(f"β
Model: {data.get('model')}") |
|
|
print(f"β
Service: {data.get('service')}") |
|
|
return True |
|
|
except Exception as e: |
|
|
print(f"β Error: {e}") |
|
|
return False |
|
|
|
|
|
def test_models_endpoint(): |
|
|
"""Test the /v1/models endpoint.""" |
|
|
print("\n" + "="*80) |
|
|
print("TESTING MODELS ENDPOINT") |
|
|
print("="*80) |
|
|
|
|
|
try: |
|
|
response = httpx.get(f"{BASE_URL}/v1/models", timeout=30.0) |
|
|
data = response.json() |
|
|
print(f"β
Status: {response.status_code}") |
|
|
print(f"β
Available models: {len(data.get('data', []))}") |
|
|
for model in data.get('data', []): |
|
|
print(f" - {model.get('id')}") |
|
|
return True |
|
|
except Exception as e: |
|
|
print(f"β Error: {e}") |
|
|
return False |
|
|
|
|
|
def run_finance_test(test: Dict[str, Any], max_tokens: int = 200) -> Dict[str, Any]: |
|
|
"""Run a single finance test question.""" |
|
|
print(f"\n{'β'*80}") |
|
|
print(f"Category: {test['category']}") |
|
|
print(f"Question: {test['question']}") |
|
|
print(f"{'β'*80}") |
|
|
|
|
|
payload = { |
|
|
"model": "DragonLLM/qwen3-8b-fin-v1.0", |
|
|
"messages": [ |
|
|
{"role": "user", "content": test["question"]} |
|
|
], |
|
|
"temperature": 0.3, |
|
|
"max_tokens": max_tokens |
|
|
} |
|
|
|
|
|
start_time = time.time() |
|
|
|
|
|
try: |
|
|
response = httpx.post( |
|
|
f"{BASE_URL}/v1/chat/completions", |
|
|
json=payload, |
|
|
timeout=60.0 |
|
|
) |
|
|
|
|
|
elapsed = time.time() - start_time |
|
|
|
|
|
if response.status_code == 200: |
|
|
data = response.json() |
|
|
answer = data['choices'][0]['message']['content'] |
|
|
usage = data.get('usage', {}) |
|
|
|
|
|
print(f"\nπ Response Stats:") |
|
|
print(f" β±οΈ Time: {elapsed:.2f}s") |
|
|
print(f" π Tokens: {usage.get('total_tokens', 'N/A')} " |
|
|
f"(prompt: {usage.get('prompt_tokens', 'N/A')}, " |
|
|
f"completion: {usage.get('completion_tokens', 'N/A')})") |
|
|
|
|
|
print(f"\n㪠Answer:\n{answer}") |
|
|
|
|
|
|
|
|
answer_lower = answer.lower() |
|
|
topics_found = [topic for topic in test.get('expected_topics', []) |
|
|
if topic.lower() in answer_lower] |
|
|
|
|
|
if topics_found: |
|
|
print(f"\nβ
Relevant topics found: {', '.join(topics_found)}") |
|
|
|
|
|
return { |
|
|
"success": True, |
|
|
"category": test['category'], |
|
|
"time": elapsed, |
|
|
"tokens": usage.get('total_tokens', 0), |
|
|
"topics_found": len(topics_found), |
|
|
"topics_expected": len(test.get('expected_topics', [])) |
|
|
} |
|
|
else: |
|
|
print(f"β Error: HTTP {response.status_code}") |
|
|
print(f" {response.text}") |
|
|
return { |
|
|
"success": False, |
|
|
"category": test['category'], |
|
|
"error": f"HTTP {response.status_code}" |
|
|
} |
|
|
|
|
|
except Exception as e: |
|
|
elapsed = time.time() - start_time |
|
|
print(f"β Error after {elapsed:.2f}s: {e}") |
|
|
return { |
|
|
"success": False, |
|
|
"category": test['category'], |
|
|
"error": str(e) |
|
|
} |
|
|
|
|
|
def print_summary(results: List[Dict[str, Any]]): |
|
|
"""Print test summary.""" |
|
|
print("\n" + "="*80) |
|
|
print("TEST SUMMARY") |
|
|
print("="*80) |
|
|
|
|
|
successful = [r for r in results if r.get('success')] |
|
|
failed = [r for r in results if not r.get('success')] |
|
|
|
|
|
print(f"\nβ
Successful: {len(successful)}/{len(results)}") |
|
|
print(f"β Failed: {len(failed)}/{len(results)}") |
|
|
|
|
|
if successful: |
|
|
avg_time = sum(r['time'] for r in successful) / len(successful) |
|
|
avg_tokens = sum(r['tokens'] for r in successful) / len(successful) |
|
|
total_topics = sum(r['topics_found'] for r in successful) |
|
|
expected_topics = sum(r['topics_expected'] for r in successful) |
|
|
|
|
|
print(f"\nπ Performance Metrics:") |
|
|
print(f" β±οΈ Average response time: {avg_time:.2f}s") |
|
|
print(f" π Average tokens: {avg_tokens:.0f}") |
|
|
print(f" π― Topic coverage: {total_topics}/{expected_topics} " |
|
|
f"({100*total_topics/expected_topics if expected_topics > 0 else 0:.1f}%)") |
|
|
|
|
|
if failed: |
|
|
print(f"\nβ Failed Tests:") |
|
|
for r in failed: |
|
|
print(f" - {r['category']}: {r.get('error', 'Unknown error')}") |
|
|
|
|
|
def main(): |
|
|
"""Run all finance tests.""" |
|
|
print("="*80) |
|
|
print("FINANCE LLM TESTING SUITE") |
|
|
print("="*80) |
|
|
print(f"Target: {BASE_URL}") |
|
|
print(f"Total tests: {len(FINANCE_TESTS)}") |
|
|
|
|
|
|
|
|
if not test_endpoint_availability(): |
|
|
print("\nβ Endpoint not available. Exiting.") |
|
|
return |
|
|
|
|
|
|
|
|
if not test_models_endpoint(): |
|
|
print("\nβ οΈ Models endpoint not available, but continuing...") |
|
|
|
|
|
|
|
|
print("\n" + "="*80) |
|
|
print("RUNNING FINANCE TESTS") |
|
|
print("="*80) |
|
|
|
|
|
results = [] |
|
|
for i, test in enumerate(FINANCE_TESTS, 1): |
|
|
print(f"\n[Test {i}/{len(FINANCE_TESTS)}]") |
|
|
result = run_finance_test(test) |
|
|
results.append(result) |
|
|
|
|
|
|
|
|
if i < len(FINANCE_TESTS): |
|
|
time.sleep(1) |
|
|
|
|
|
|
|
|
print_summary(results) |
|
|
|
|
|
print("\n" + "="*80) |
|
|
print("TESTING COMPLETE") |
|
|
print("="*80) |
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|
|
|
|