Spaces:

jeanbaptdzd
/

open-finance-llm-8b

Paused

App Files Files Community

open-finance-llm-8b / test_finance_improved.py

jeanbaptdzd

Fix generation: increase tokens for complete answers, add EOS handling

78f67d6 about 2 months ago

raw

history blame

10.3 kB

	#!/usr/bin/env python3
	"""
	Improved finance tests with better prompts for concise, complete answers.
	"""

	import httpx
	import json
	import time
	from typing import Dict, Any, List

	# Root URL of the service under test; run_test() appends
	# "/v1/chat/completions" to it for every request.
	BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"

	# English finance prompts, phrased to encourage concise but complete answers.
	# Each case is written as (category, question, max_tokens) and expanded into
	# the dict shape that run_test() consumes.
	FINANCE_TESTS = [
	    {"category": category, "question": question, "max_tokens": max_tokens}
	    for category, question, max_tokens in (
	        (
	            "Financial Calculations",
	            "Calculate: If I invest $10,000 at 5% annual interest compounded annually for 3 years, what will be the final amount? Show your calculation steps briefly.",
	            150,
	        ),
	        (
	            "Risk Management",
	            "Define Value at Risk (VaR) and explain its main use in portfolio management. Be concise but complete.",
	            200,
	        ),
	        (
	            "Financial Instruments",
	            "Explain the key difference between call and put options in 2-3 sentences.",
	            100,
	        ),
	        (
	            "Market Analysis",
	            "List 5 key factors that influence stock market volatility and briefly explain each.",
	            250,
	        ),
	        (
	            "Corporate Finance",
	            "Compare EBITDA vs Net Income: What's included in each and why does the difference matter?",
	            200,
	        ),
	        (
	            "Investment Strategy",
	            "Explain portfolio diversification and why it's important. Give a concrete example.",
	            200,
	        ),
	        (
	            "Financial Ratios",
	            "How do you calculate P/E ratio? What does a high vs low P/E tell you about a stock?",
	            150,
	        ),
	        (
	            "Fixed Income",
	            "Explain the inverse relationship between bond prices and interest rates. Why does this occur?",
	            150,
	        ),
	    )
	]

	# French finance prompts using French terminology, including two
	# France-specific cases (market vocabulary and taxation). Each case is written
	# as (category, question, max_tokens) and expanded into the dict shape that
	# run_test() consumes.
	FRENCH_FINANCE_TESTS = [
	    {"category": category, "question": question, "max_tokens": max_tokens}
	    for category, question, max_tokens in (
	        (
	            "Calculs Financiers",
	            "Si j'investis 10 000€ avec un taux d'intérêt annuel de 5% composé annuellement pendant 3 ans, quel sera le montant final? Montrez vos calculs.",
	            150,
	        ),
	        (
	            "Gestion des Risques",
	            "Expliquez ce qu'est la VaR (Value at Risk / Valeur en Risque) et son utilisation dans la gestion de portefeuille.",
	            200,
	        ),
	        (
	            "Instruments Financiers",
	            "Quelle est la différence entre une option d'achat (call) et une option de vente (put)?",
	            150,
	        ),
	        (
	            "Analyse Boursière",
	            "Quels sont les principaux facteurs qui influencent la volatilité des marchés boursiers?",
	            200,
	        ),
	        (
	            "Finance d'Entreprise",
	            "Expliquez la différence entre l'EBITDA (Bénéfice avant intérêts, impôts, dépréciation et amortissement) et le résultat net.",
	            200,
	        ),
	        (
	            "Stratégie d'Investissement",
	            "Qu'est-ce que la diversification d'un portefeuille et pourquoi est-elle importante?",
	            200,
	        ),
	        (
	            "Ratios Financiers",
	            "Comment calculer le ratio cours/bénéfice (PER) et comment l'interpréter?",
	            150,
	        ),
	        (
	            "Obligations",
	            "Pourquoi les prix des obligations baissent-ils lorsque les taux d'intérêt augmentent?",
	            150,
	        ),
	        (
	            "Analyse Technique (Termes Français)",
	            "Expliquez les termes suivants utilisés en bourse française: CAC 40, PEA, sicav, et OAT.",
	            200,
	        ),
	        (
	            "Fiscalité (France)",
	            "Quelle est la différence entre la Flat Tax et le barème progressif pour l'imposition des revenus de capitaux mobiliers en France?",
	            200,
	        ),
	    )
	]

	def run_test(test: Dict[str, Any], language: str = "English") -> Dict[str, Any]:
	    """Send one question to the chat-completions endpoint and report on the reply.

	    Prints the question, timing/token statistics, the raw answer and a short
	    quality report, then returns a summary dict for print_summary().

	    Args:
	        test: Test case with "category" and "question" keys and an optional
	            "max_tokens" key (200 is used when it is absent).
	        language: "French" prints the category label in French; any other
	            value prints the English label.

	    Returns:
	        For an HTTP 200 reply: a dict with "success" (True), "category",
	        "time", "tokens_used", "tokens_limit", "complete" and "has_reasoning".
	        For any other status or any exception raised while requesting or
	        parsing: a dict with "success" (False), "category" and "error".
	    """
	    max_tokens = test.get('max_tokens', 200)
	    rule = '─' * 80

	    print(f"\n{rule}")
	    if language == "French":
	        print(f"Catégorie: {test['category']}")
	    else:
	        print(f"Category: {test['category']}")
	    print(f"Question: {test['question']}")
	    print(f"Max Tokens: {max_tokens}")
	    print(rule)

	    payload = {
	        "model": "DragonLLM/qwen3-8b-fin-v1.0",
	        "messages": [
	            {"role": "user", "content": test["question"]}
	        ],
	        "temperature": 0.2,  # Lower for more focused answers
	        "max_tokens": max_tokens,
	    }

	    # perf_counter() is monotonic, so the measured latency cannot be skewed
	    # by a system clock adjustment during the request.
	    start_time = time.perf_counter()

	    try:
	        response = httpx.post(
	            f"{BASE_URL}/v1/chat/completions",
	            json=payload,
	            timeout=60.0,
	        )
	        elapsed = time.perf_counter() - start_time

	        if response.status_code != 200:
	            print(f"❌ Error: HTTP {response.status_code}")
	            return {"success": False, "category": test['category'], "error": str(response.status_code)}

	        data = response.json()
	        choice = data['choices'][0]
	        # "content" and "usage" may be present but null in the JSON body;
	        # normalise them so a usable reply is not reported as a failure.
	        answer = choice['message']['content'] or ""
	        usage = data.get('usage') or {}
	        tokens_used = usage.get('completion_tokens')
	        finish_reason = choice.get('finish_reason', 'unknown')

	        print("\n📊 Stats:")
	        print(f" ⏱️ Time: {elapsed:.2f}s")
	        print(f" 📝 Tokens: {'N/A' if tokens_used is None else tokens_used}/{max_tokens}")
	        print(f" 🏁 Finish: {finish_reason}")

	        print(f"\n💬 Answer:\n{answer}")

	        # Evaluate answer quality
	        is_complete = finish_reason == "stop"
	        has_thinking = "<think>" in answer
	        if has_thinking:
	            # Keep only the text after the last closing tag. If the reply was
	            # cut off inside the reasoning block there is no closing tag and
	            # therefore no actual answer to measure.
	            _, closing_tag, tail = answer.rpartition("</think>")
	            answer_content = tail.strip() if closing_tag else ""
	        else:
	            answer_content = answer

	        print("\n📈 Quality:")
	        print(f" {'✅' if is_complete else '⚠️'} Complete: {is_complete}")
	        print(f" {'✅' if has_thinking else '➖'} Shows reasoning: {has_thinking}")
	        print(f" 📏 Answer length: {len(answer_content)} chars")

	        return {
	            "success": True,
	            "category": test['category'],
	            "time": elapsed,
	            "tokens_used": 0 if tokens_used is None else tokens_used,
	            "tokens_limit": max_tokens,
	            "complete": is_complete,
	            "has_reasoning": has_thinking,
	        }

	    except Exception as e:
	        # Deliberately broad: one failing request (network error, timeout,
	        # malformed payload) must be recorded without aborting the whole run.
	        print(f"❌ Error: {e}")
	        return {"success": False, "category": test['category'], "error": str(e)}

	def print_summary(results: List[Dict[str, Any]], language: str) -> None:
	    """Print success counts and aggregate metrics for one batch of results.

	    Args:
	        results: Result dicts as returned by run_test(). Entries with a truthy
	            "success" must carry "time" and "tokens_limit"; a missing or None
	            "tokens_used" is counted as 0.
	        language: "French" prints the French title; any other value prints
	            the English one. The metric lines themselves are always English.
	    """
	    banner = "=" * 80
	    print("\n" + banner)
	    print("RÉSUMÉ DES TESTS" if language == "French" else "TEST SUMMARY")
	    print(banner)

	    successful = [r for r in results if r.get('success')]
	    failed_count = len(results) - len(successful)

	    print(f"\n✅ Successful: {len(successful)}/{len(results)}")
	    print(f"❌ Failed: {failed_count}/{len(results)}")

	    # Averages are only defined when at least one test succeeded.
	    if not successful:
	        return

	    count = len(successful)
	    avg_time = sum(r['time'] for r in successful) / count
	    # A server that omits usage data yields None here; treat it as 0 tokens.
	    total_used = sum(r.get('tokens_used') or 0 for r in successful)
	    avg_tokens = total_used / count
	    complete_count = sum(1 for r in successful if r.get('complete'))
	    reasoning_count = sum(1 for r in successful if r.get('has_reasoning'))

	    print("\n📊 Performance Metrics:")
	    print(f" ⏱️ Average response time: {avg_time:.2f}s")
	    print(f" 📝 Average tokens used: {avg_tokens:.0f}")
	    print(f" ✅ Complete answers: {complete_count}/{count} ({100*complete_count/count:.1f}%)")
	    print(f" 🧠 Answers with reasoning: {reasoning_count}/{count} ({100*reasoning_count/count:.1f}%)")

	    # Token efficiency
	    total_limit = sum(r['tokens_limit'] for r in successful)
	    # Guard the ratio: every limit may be 0, which would divide by zero.
	    utilization = 100 * total_used / total_limit if total_limit else 0.0
	    print(f" 💰 Token efficiency: {total_used}/{total_limit} ({utilization:.1f}% utilization)")

	def _run_suite(tests: List[Dict[str, Any]], language: str) -> List[Dict[str, Any]]:
	    """Run every test in *tests* via run_test(), print the suite summary, return the results."""
	    results = []
	    total = len(tests)
	    for index, test in enumerate(tests, 1):
	        print(f"\n[Test {index}/{total}]")
	        results.append(run_test(test, language))
	        # Pause between requests, but not after the last one.
	        if index < total:
	            time.sleep(1)

	    print_summary(results, language)
	    return results


	def main():
	    """Run the English and French suites against BASE_URL and print an overall summary."""
	    banner = "=" * 80

	    print(banner)
	    print("IMPROVED FINANCE LLM TESTING")
	    print(banner)
	    print(f"Target: {BASE_URL}")

	    # Test English questions
	    print("\n" + banner)
	    print("ENGLISH FINANCE TESTS (Improved Prompts)")
	    print(banner)

	    english_results = _run_suite(FINANCE_TESTS, "English")

	    # Test French questions
	    print("\n\n" + banner)
	    print("FRENCH FINANCE TESTS (Questions en Français)")
	    print(banner)
	    print("Testing with French finance terminology...")

	    french_results = _run_suite(FRENCH_FINANCE_TESTS, "French")

	    # Overall summary
	    print("\n\n" + banner)
	    print("OVERALL SUMMARY")
	    print(banner)

	    english_success = sum(1 for r in english_results if r.get('success'))
	    french_success = sum(1 for r in french_results if r.get('success'))
	    total_tests = len(english_results) + len(french_results)
	    total_success = english_success + french_success
	    # Both suites could be empty; avoid dividing by zero in that case.
	    success_rate = 100 * total_success / total_tests if total_tests else 0.0

	    print(f"\n📊 Total Tests: {total_tests}")
	    print(f"✅ Total Successful: {total_success}/{total_tests} ({success_rate:.1f}%)")
	    print(f"🇬🇧 English: {english_success}/{len(english_results)}")
	    print(f"🇫🇷 French: {french_success}/{len(french_results)}")

	    print("\n" + banner)
	    print("TESTING COMPLETE")
	    print(banner)

	if __name__ == "__main__":
	    # Run the suites only when this file is executed as a script, not on import.
	    main()