|
|
|
|
|
""" |
|
|
Improved finance tests with better prompts for concise, complete answers. |
|
|
""" |
|
|
|
|
|
import json
import time
from typing import Any, Dict, List

import httpx
|
|
|
|
|
# Root URL of the inference service under test; the chat-completions path is
# appended to it when a request is sent.
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"
|
|
|
|
|
|
|
|
# English test cases. Each entry carries a display category, the prompt sent
# to the model, and the completion-token cap used for that prompt.
FINANCE_TESTS = [
    {
        "category": "Financial Calculations",
        "question": "Calculate: If I invest $10,000 at 5% annual interest compounded annually for 3 years, what will be the final amount? Show your calculation steps briefly.",
        "max_tokens": 150,
    },
    {
        "category": "Risk Management",
        "question": "Define Value at Risk (VaR) and explain its main use in portfolio management. Be concise but complete.",
        "max_tokens": 200,
    },
    {
        "category": "Financial Instruments",
        "question": "Explain the key difference between call and put options in 2-3 sentences.",
        "max_tokens": 100,
    },
    {
        "category": "Market Analysis",
        "question": "List 5 key factors that influence stock market volatility and briefly explain each.",
        "max_tokens": 250,
    },
    {
        "category": "Corporate Finance",
        "question": "Compare EBITDA vs Net Income: What's included in each and why does the difference matter?",
        "max_tokens": 200,
    },
    {
        "category": "Investment Strategy",
        "question": "Explain portfolio diversification and why it's important. Give a concrete example.",
        "max_tokens": 200,
    },
    {
        "category": "Financial Ratios",
        "question": "How do you calculate P/E ratio? What does a high vs low P/E tell you about a stock?",
        "max_tokens": 150,
    },
    {
        "category": "Fixed Income",
        "question": "Explain the inverse relationship between bond prices and interest rates. Why does this occur?",
        "max_tokens": 150,
    },
]
|
|
|
|
|
|
|
|
# French test cases, same entry shape as the English list: display category,
# prompt text, and completion-token cap. The last two entries cover topics
# specific to France (market vocabulary and taxation).
FRENCH_FINANCE_TESTS = [
    {
        "category": "Calculs Financiers",
        "question": "Si j'investis 10 000€ avec un taux d'intérêt annuel de 5% composé annuellement pendant 3 ans, quel sera le montant final? Montrez vos calculs.",
        "max_tokens": 150,
    },
    {
        "category": "Gestion des Risques",
        "question": "Expliquez ce qu'est la VaR (Value at Risk / Valeur en Risque) et son utilisation dans la gestion de portefeuille.",
        "max_tokens": 200,
    },
    {
        "category": "Instruments Financiers",
        "question": "Quelle est la différence entre une option d'achat (call) et une option de vente (put)?",
        "max_tokens": 150,
    },
    {
        "category": "Analyse Boursière",
        "question": "Quels sont les principaux facteurs qui influencent la volatilité des marchés boursiers?",
        "max_tokens": 200,
    },
    {
        "category": "Finance d'Entreprise",
        "question": "Expliquez la différence entre l'EBITDA (Bénéfice avant intérêts, impôts, dépréciation et amortissement) et le résultat net.",
        "max_tokens": 200,
    },
    {
        "category": "Stratégie d'Investissement",
        "question": "Qu'est-ce que la diversification d'un portefeuille et pourquoi est-elle importante?",
        "max_tokens": 200,
    },
    {
        "category": "Ratios Financiers",
        "question": "Comment calculer le ratio cours/bénéfice (PER) et comment l'interpréter?",
        "max_tokens": 150,
    },
    {
        "category": "Obligations",
        "question": "Pourquoi les prix des obligations baissent-ils lorsque les taux d'intérêt augmentent?",
        "max_tokens": 150,
    },
    {
        "category": "Analyse Technique (Termes Français)",
        "question": "Expliquez les termes suivants utilisés en bourse française: CAC 40, PEA, sicav, et OAT.",
        "max_tokens": 200,
    },
    {
        "category": "Fiscalité (France)",
        "question": "Quelle est la différence entre la Flat Tax et le barème progressif pour l'imposition des revenus de capitaux mobiliers en France?",
        "max_tokens": 200,
    },
]
|
|
|
|
|
def run_test(test: Dict[str, Any], language: str = "English") -> Dict[str, Any]:
    """Send one test question to the chat-completions endpoint and report the outcome.

    The question is posted as a single user message to
    ``{BASE_URL}/v1/chat/completions``. The test header, response statistics,
    the raw answer and a few quality indicators are printed to stdout.

    Args:
        test: Test definition with a ``category`` and a ``question``; an
            optional ``max_tokens`` caps the completion (default 200).
        language: ``"French"`` switches the category label to French; any
            other value uses the English label.

    Returns:
        On HTTP 200, a dict with ``success=True`` plus ``category``, ``time``
        (seconds), ``tokens_used``, ``tokens_limit``, ``complete`` and
        ``has_reasoning``. Otherwise a dict with ``success=False`` plus
        ``category`` and ``error`` (the HTTP status code or the exception
        message). Failures while sending the request or reading the response
        are reported through the returned dict rather than raised.
    """
    category = test["category"]
    max_tokens = test.get("max_tokens", 200)
    divider = "─" * 80

    print(f"\n{divider}")
    category_label = "Catégorie" if language == "French" else "Category"
    print(f"{category_label}: {category}")
    print(f"Question: {test['question']}")
    print(f"Max Tokens: {max_tokens}")
    print(divider)

    payload = {
        "model": "DragonLLM/qwen3-8b-fin-v1.0",
        "messages": [
            {"role": "user", "content": test["question"]},
        ],
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }

    # Monotonic clock: the measured duration is unaffected by wall-clock
    # adjustments that happen while the request is in flight.
    start_time = time.perf_counter()

    try:
        response = httpx.post(
            f"{BASE_URL}/v1/chat/completions",
            json=payload,
            timeout=60.0,
        )
        elapsed = time.perf_counter() - start_time

        if response.status_code != 200:
            print(f"❌ Error: HTTP {response.status_code}")
            return {"success": False, "category": category, "error": str(response.status_code)}

        data = response.json()
        choice = data["choices"][0]
        # A JSON null for "content" or "usage" is treated like an absent value
        # so the analysis below does not fail on None.
        answer = choice["message"]["content"] or ""
        usage = data.get("usage") or {}
        completion_tokens = usage.get("completion_tokens")
        finish_reason = choice.get("finish_reason", "unknown")

        print("\n📊 Stats:")
        print(f" ⏱️ Time: {elapsed:.2f}s")
        tokens_shown = "N/A" if completion_tokens is None else completion_tokens
        print(f" 📝 Tokens: {tokens_shown}/{max_tokens}")
        print(f" 🏁 Finish: {finish_reason}")

        print(f"\n💬 Answer:\n{answer}")

        is_complete = finish_reason == "stop"
        has_thinking = "<think>" in answer
        if has_thinking:
            # Only the text after the last closing tag is the answer proper.
            # With no closing tag the reply ended inside the reasoning block,
            # so there is no answer text to measure.
            _, closing_tag, tail = answer.rpartition("</think>")
            answer_content = tail.strip() if closing_tag else ""
        else:
            answer_content = answer

        print("\n📈 Quality:")
        print(f" {'✅' if is_complete else '⚠️'} Complete: {is_complete}")
        print(f" {'✅' if has_thinking else '➖'} Shows reasoning: {has_thinking}")
        print(f" 📏 Answer length: {len(answer_content)} chars")

        return {
            "success": True,
            "category": category,
            "time": elapsed,
            # Always numeric so summary code can sum it.
            "tokens_used": completion_tokens or 0,
            "tokens_limit": max_tokens,
            "complete": is_complete,
            "has_reasoning": has_thinking,
        }

    except Exception as e:
        # Deliberately broad: one failing request (network error, unexpected
        # response shape) must not abort the remaining tests in a run.
        print(f"❌ Error: {e}")
        return {"success": False, "category": category, "error": str(e)}
|
|
|
|
|
def print_summary(results: List[Dict[str, Any]], language: str) -> None:
    """Print pass/fail counts and aggregate metrics for a batch of results.

    Args:
        results: Result dicts; an entry counts as successful when its
            ``success`` value is truthy. Successful entries must carry
            ``time``, ``tokens_used`` and ``tokens_limit``; ``complete`` and
            ``has_reasoning`` are read if present.
        language: ``"French"`` prints the French heading; any other value
            prints the English one. Only the heading is localized.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("RÉSUMÉ DES TESTS" if language == "French" else "TEST SUMMARY")
    print(rule)

    total = len(results)
    successful = [r for r in results if r.get("success")]
    ok_count = len(successful)

    print(f"\n✅ Successful: {ok_count}/{total}")
    print(f"❌ Failed: {total - ok_count}/{total}")

    # Averages and percentages are only defined when something succeeded.
    if not successful:
        return

    total_time = sum(r["time"] for r in successful)
    total_used = sum(r["tokens_used"] for r in successful)
    total_limit = sum(r["tokens_limit"] for r in successful)
    complete_count = sum(1 for r in successful if r.get("complete"))
    reasoning_count = sum(1 for r in successful if r.get("has_reasoning"))

    print("\n📊 Performance Metrics:")
    print(f" ⏱️ Average response time: {total_time / ok_count:.2f}s")
    print(f" 📝 Average tokens used: {total_used / ok_count:.0f}")
    print(f" ✅ Complete answers: {complete_count}/{ok_count} ({100 * complete_count / ok_count:.1f}%)")
    print(f" 🧠 Answers with reasoning: {reasoning_count}/{ok_count} ({100 * reasoning_count / ok_count:.1f}%)")

    # A zero combined limit would make the utilization ratio undefined;
    # report it as N/A instead of dividing by zero.
    if total_limit:
        utilization = f"{100 * total_used / total_limit:.1f}%"
    else:
        utilization = "N/A"
    print(f" 💰 Token efficiency: {total_used}/{total_limit} ({utilization} utilization)")
|
|
|
|
|
def _run_suite(tests: List[Dict[str, Any]], language: str) -> List[Dict[str, Any]]:
    """Run every test in *tests* in order and return their result dicts."""
    results = []
    total = len(tests)
    for index, test in enumerate(tests, 1):
        print(f"\n[Test {index}/{total}]")
        results.append(run_test(test, language))
        # One-second pause between consecutive requests; none after the last.
        if index < total:
            time.sleep(1)
    return results


def main() -> None:
    """Run the English and French suites, then print per-suite and overall summaries."""
    banner = "=" * 80

    print(banner)
    print("IMPROVED FINANCE LLM TESTING")
    print(banner)
    print(f"Target: {BASE_URL}")

    print("\n" + banner)
    print("ENGLISH FINANCE TESTS (Improved Prompts)")
    print(banner)

    english_results = _run_suite(FINANCE_TESTS, "English")
    print_summary(english_results, "English")

    print("\n\n" + banner)
    print("FRENCH FINANCE TESTS (Questions en Français)")
    print(banner)
    print("Testing with French finance terminology...")

    french_results = _run_suite(FRENCH_FINANCE_TESTS, "French")
    print_summary(french_results, "French")

    print("\n\n" + banner)
    print("OVERALL SUMMARY")
    print(banner)

    english_ok = sum(1 for r in english_results if r.get("success"))
    french_ok = sum(1 for r in french_results if r.get("success"))
    total_tests = len(english_results) + len(french_results)
    total_success = english_ok + french_ok
    # With both suites empty there is nothing to divide by; show 0.0%.
    success_rate = 100 * total_success / total_tests if total_tests else 0.0

    print(f"\n📊 Total Tests: {total_tests}")
    print(f"✅ Total Successful: {total_success}/{total_tests} ({success_rate:.1f}%)")
    print(f"🇬🇧 English: {english_ok}/{len(english_results)}")
    print(f"🇫🇷 French: {french_ok}/{len(french_results)}")

    print("\n" + banner)
    print("TESTING COMPLETE")
    print(banner)
|
|
|
|
|
# Run the suites only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|