File size: 9,305 Bytes
78f67d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python3
"""
Final finance tests with proper token limits and French language support.
"""

import json
import re
import time
from typing import Dict, Any, List

import httpx

# Root URL of the deployed inference service; run_test appends the
# "/v1/chat/completions" path to it.
BASE_URL = "https://jeanbaptdzd-open-finance-llm-8b.hf.space"

# English test cases. Keys read by run_test:
#   category   - label printed with the test and echoed in the result dict
#   question   - text sent as the user message
#   max_tokens - completion budget for the request (run_test uses 300 if absent)
# Budgets are raised so a reply has room for both the thinking and the answer.
ENGLISH_TESTS = [
    {
        "category": "Financial Calculations",
        "question": "Calculate: If I invest $10,000 at 5% annual interest compounded annually for 3 years, what will be the final amount? Show your calculation and explain the formula.",
        "max_tokens": 300  # room for the reasoning section plus a complete answer
    },
    {
        "category": "Risk Management",
        "question": "Define Value at Risk (VaR) and explain how it's used in portfolio management. Include examples.",
        "max_tokens": 350
    },
    {
        "category": "Options Trading",
        "question": "Explain call and put options. What are the key differences and when would you use each?",
        "max_tokens": 300
    },
]

# French test cases. Same keys as the English cases, plus:
#   system_prompt - when present, run_test sends it as a system message placed
#                   before the user question
# Both the system prompt and the question explicitly ask for French output,
# including the reasoning.
FRENCH_TESTS = [
    {
        "category": "Calculs Financiers",
        "question": "Si j'investis 10 000€ avec un taux d'intérêt annuel de 5% composé annuellement pendant 3 ans, quel sera le montant final? Montrez vos calculs et expliquez la formule. Répondez entièrement en français, y compris votre raisonnement.",
        "max_tokens": 300,
        "system_prompt": "Tu es un assistant financier qui répond toujours en français. Ton raisonnement et tes réponses doivent être entièrement en français."
    },
    {
        "category": "Gestion des Risques",
        "question": "Expliquez ce qu'est la VaR (Value at Risk / Valeur en Risque) et comment elle est utilisée dans la gestion de portefeuille. Donnez des exemples. Répondez entièrement en français.",
        "max_tokens": 350,
        "system_prompt": "Tu es un assistant financier qui répond toujours en français. Ton raisonnement et tes réponses doivent être entièrement en français."
    },
    {
        "category": "Options",
        "question": "Expliquez les options d'achat (call) et de vente (put). Quelles sont les différences clés et quand utiliser chacune? Répondez entièrement en français avec votre raisonnement en français.",
        "max_tokens": 300,
        "system_prompt": "Tu es un assistant financier qui répond toujours en français. Tout ton raisonnement interne et ta réponse finale doivent être en français."
    },
    {
        "category": "Termes Français",
        "question": "Expliquez les termes suivants de la bourse française: CAC 40, PEA, SICAV, et OAT. Pour chaque terme, donnez une définition claire. Répondez en français.",
        "max_tokens": 400,
        "system_prompt": "Tu es un expert en finance française. Réponds entièrement en français, y compris ton raisonnement."
    },
]

def _thinking_is_french(answer: str) -> bool:
    """Guess whether the reasoning part of *answer* is written in French.

    The reasoning part is everything before the first ``</think>`` tag (the
    whole answer when the tag is missing, e.g. a truncated reply). The text is
    split into words and the number of distinct French indicator words present
    is compared with the number of distinct English ones.

    Matching is done on whole words. Substring matching would count English
    words such as "interest" (contains "est") or "example" (contains "le") as
    French evidence.

    Returns:
        True when more French than English indicator words were found.
    """
    thinking_section = answer.split("</think>")[0].lower()
    # \w is Unicode-aware for str patterns, so accented words stay intact.
    words = set(re.findall(r"\w+", thinking_section))

    french_indicators = ("je", "le", "la", "est", "sont", "dans", "avec", "pour")
    english_indicators = ("the", "is", "are", "with", "for", "that")

    french_count = sum(1 for word in french_indicators if word in words)
    english_count = sum(1 for word in english_indicators if word in words)
    return french_count > english_count


def run_test(test: Dict[str, Any], language: str = "English") -> Dict[str, Any]:
    """Send one test question to the chat-completions endpoint and report on it.

    Prints the question header, the model's answer, timing/token statistics
    and a short quality assessment to stdout.

    Args:
        test: Test case with ``category`` and ``question`` keys. Optional keys:
            ``max_tokens`` (defaults to 300) and ``system_prompt`` (sent as a
            system message ahead of the question).
        language: ``"French"`` switches the category label to French and adds
            the "thinking in French" check; any other value is treated as
            English.

    Returns:
        On HTTP 200 with a parseable body: a dict with ``success`` (True),
        ``category``, ``time`` (seconds), ``tokens_used``, ``complete``
        (finish reason was ``"stop"``) and ``has_reasoning`` (answer contains
        a ``<think>`` tag).
        Otherwise: a dict with ``success`` (False), ``category`` and ``error``.
        Request and parsing errors are reported this way instead of raised.
    """
    max_tokens = test.get('max_tokens', 300)

    print(f"\n{'='*80}")
    print(f"{'Catégorie' if language == 'French' else 'Category'}: {test['category']}")
    print(f"Question: {test['question'][:100]}...")
    print(f"Max Tokens: {max_tokens}")
    print(f"{'='*80}")

    messages = [{"role": "user", "content": test["question"]}]

    # Add system prompt for French tests
    if "system_prompt" in test:
        messages.insert(0, {"role": "system", "content": test["system_prompt"]})

    payload = {
        "model": "DragonLLM/qwen3-8b-fin-v1.0",
        "messages": messages,
        "temperature": 0.3,
        "max_tokens": max_tokens
    }

    # Monotonic clock: the measured duration is immune to wall-clock changes.
    start_time = time.perf_counter()

    try:
        response = httpx.post(
            f"{BASE_URL}/v1/chat/completions",
            json=payload,
            timeout=90.0
        )

        elapsed = time.perf_counter() - start_time

        if response.status_code != 200:
            print(f"❌ Error: HTTP {response.status_code}")
            return {"success": False, "category": test['category'], "error": str(response.status_code)}

        data = response.json()
        choice = data['choices'][0]
        # "content" and "usage" may be present but null; normalise both so a
        # successful call is not misreported as a failure further down.
        answer = choice['message']['content'] or ""
        usage = data.get('usage') or {}
        finish_reason = choice.get('finish_reason', 'unknown')

        print(f"\n💬 Answer:")
        print(answer)

        print(f"\n📊 Stats:")
        print(f"   ⏱️  Time: {elapsed:.2f}s")
        print(f"   📝 Tokens: {usage.get('completion_tokens', 'N/A')}/{max_tokens}")
        print(f"   🏁 Finish: {finish_reason}")

        # Any finish reason other than "stop" is reported as a truncated answer.
        is_complete = finish_reason == "stop"
        has_thinking = "<think>" in answer.lower()

        # For French tests, check whether the reasoning itself is in French.
        if language == "French" and has_thinking:
            thinking_in_french = _thinking_is_french(answer)
            print(f"   🇫🇷 Thinking in French: {'✅' if thinking_in_french else '❌ (in English)'}")

        print(f"\n📈 Quality:")
        print(f"   {'✅' if is_complete else '⚠️  TRUNCATED'} Answer status: {finish_reason}")
        print(f"   {'✅' if has_thinking else '➖'} Shows reasoning: {has_thinking}")

        return {
            "success": True,
            "category": test['category'],
            "time": elapsed,
            "tokens_used": usage.get('completion_tokens') or 0,
            "complete": is_complete,
            "has_reasoning": has_thinking
        }

    except Exception as e:
        # Deliberately broad: this is the per-test boundary, and one failing
        # request or malformed response must not abort the remaining tests.
        print(f"❌ Error: {e}")
        return {"success": False, "category": test['category'], "error": str(e)}

def print_summary(results: List[Dict[str, Any]], language: str) -> None:
    """Print pass/fail counts and average metrics for one batch of results.

    Args:
        results: Result dicts as returned by ``run_test``. Entries with a
            truthy ``success`` count as successful; of those, entries with a
            truthy ``complete`` count as complete answers.
        language: ``"French"`` prints the French header; any other value
            prints the English one.

    The metrics section is printed only when at least one test succeeded.
    Throughput is shown as ``N/A`` when the average time is zero, so a
    zero-duration result cannot raise ``ZeroDivisionError``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("RÉSUMÉ" if language == "French" else "SUMMARY")
    print(rule)

    successful = [r for r in results if r.get('success')]
    failed = [r for r in results if not r.get('success')]
    complete = [r for r in successful if r.get('complete')]
    complete_pct = 100 * len(complete) / len(successful) if successful else 0

    print(f"\n✅ Successful: {len(successful)}/{len(results)}")
    print(f"✅ Complete answers: {len(complete)}/{len(successful)} ({complete_pct:.1f}%)")
    print(f"❌ Failed: {len(failed)}/{len(results)}")

    if not successful:
        return

    # A missing or null metric counts as 0 rather than breaking the sum.
    avg_time = sum(r.get('time') or 0 for r in successful) / len(successful)
    avg_tokens = sum(r.get('tokens_used') or 0 for r in successful) / len(successful)
    speed = f"{avg_tokens/avg_time:.2f} tokens/s" if avg_time > 0 else "N/A"

    print(f"\n📊 Metrics:")
    print(f"   ⏱️  Average time: {avg_time:.2f}s")
    print(f"   📝 Average tokens: {avg_tokens:.0f}")
    print(f"   🚀 Speed: {speed}")

def main():
    """Execute the English suite, then the French suite, then print totals.

    Each suite prints a banner, runs its tests one by one with a one-second
    pause after every test, and ends with its own summary. A combined
    overview of both suites is printed last.
    """
    rule = "=" * 80

    def banner(title, lead=""):
        # A title framed by two rules; `lead` holds the newlines before it.
        print(lead + rule)
        print(title)
        print(rule)

    def run_suite(tests, language):
        # Run every case in order, summarise the batch, hand back the results.
        outcomes = []
        for position, case in enumerate(tests, start=1):
            print(f"\n[Test {position}/{len(tests)}]")
            outcomes.append(run_test(case, language))
            time.sleep(1)
        print_summary(outcomes, language)
        return outcomes

    def passed(outcomes):
        # Results whose request succeeded.
        return [entry for entry in outcomes if entry.get('success')]

    banner("FINAL FINANCE LLM TESTS")
    print("Testing with proper token limits and language support")

    banner("ENGLISH TESTS", lead="\n")
    english_results = run_suite(ENGLISH_TESTS, "English")

    banner("FRENCH TESTS (with language instructions)", lead="\n\n")
    french_results = run_suite(FRENCH_TESTS, "French")

    banner("OVERALL RESULTS", lead="\n\n")

    combined = english_results + french_results
    ok = passed(combined)
    finished = [entry for entry in ok if entry.get('complete')]
    finished_pct = 100 * len(finished) / len(ok) if ok else 0

    print(f"\n📊 Total: {len(ok)}/{len(combined)} successful")
    print(f"✅ Complete: {len(finished)}/{len(ok)} ({finished_pct:.1f}%)")
    print(f"🇬🇧 English: {len(passed(english_results))}/{len(ENGLISH_TESTS)}")
    print(f"🇫🇷 French: {len(passed(french_results))}/{len(FRENCH_TESTS)}")

    print("\n" + rule)

# Run the suites only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()