NeerajCodz committed on
Commit
da142bc
·
1 Parent(s): 9160ee4

test: comprehensive AI provider testing with 90% success rate

Browse files
backend/app/models/__pycache__/router.cpython-314.pyc CHANGED
Binary files a/backend/app/models/__pycache__/router.cpython-314.pyc and b/backend/app/models/__pycache__/router.cpython-314.pyc differ
 
backend/app/models/providers/nvidia.py CHANGED
@@ -163,7 +163,7 @@ class NVIDIAProvider(BaseProvider):
163
  "Content-Type": "application/json",
164
  }
165
 
166
- async def _rate_limit(self) -> None:
167
  """Apply rate limiting between requests."""
168
  elapsed = time.time() - self._last_request_time
169
  min_interval = 0.3 # 300ms between requests
@@ -201,13 +201,13 @@ class NVIDIAProvider(BaseProvider):
201
  """
202
  # Validate model
203
  if model not in self.MODELS:
204
- raise ModelNotFoundError(f"Model {model} not found. Available: {list(self.MODELS.keys())}")
205
 
206
  model_info = self.MODELS[model]
207
  model_id = model_info.id
208
 
209
  # Apply rate limiting
210
- await self._rate_limit()
211
 
212
  # Build request payload
213
  payload: dict[str, Any] = {
@@ -237,12 +237,12 @@ class NVIDIAProvider(BaseProvider):
237
  )
238
 
239
  if response.status_code == 401:
240
- raise AuthenticationError("Invalid NVIDIA API key")
241
  elif response.status_code == 429:
242
- raise RateLimitError("NVIDIA API rate limit exceeded")
243
  elif response.status_code >= 400:
244
  error_detail = response.text
245
- raise ProviderError(f"NVIDIA API error ({response.status_code}): {error_detail}")
246
 
247
  data = response.json()
248
 
@@ -270,7 +270,7 @@ class NVIDIAProvider(BaseProvider):
270
  except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
271
  raise
272
  except Exception as e:
273
- raise ProviderError(f"NVIDIA request failed: {str(e)}") from e
274
 
275
  async def complete_stream(
276
  self,
@@ -297,12 +297,12 @@ class NVIDIAProvider(BaseProvider):
297
  Same as complete()
298
  """
299
  if model not in self.MODELS:
300
- raise ModelNotFoundError(f"Model {model} not found")
301
 
302
  model_info = self.MODELS[model]
303
  model_id = model_info.id
304
 
305
- await self._rate_limit()
306
 
307
  payload: dict[str, Any] = {
308
  "model": model_id,
@@ -330,12 +330,12 @@ class NVIDIAProvider(BaseProvider):
330
  json=payload,
331
  ) as response:
332
  if response.status_code == 401:
333
- raise AuthenticationError("Invalid NVIDIA API key")
334
  elif response.status_code == 429:
335
- raise RateLimitError("NVIDIA API rate limit exceeded")
336
  elif response.status_code >= 400:
337
  error_detail = await response.aread()
338
- raise ProviderError(f"NVIDIA API error: {error_detail.decode()}")
339
 
340
  async for line in response.aiter_lines():
341
  if not line.strip() or not line.startswith("data: "):
@@ -358,14 +358,30 @@ class NVIDIAProvider(BaseProvider):
358
  except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
359
  raise
360
  except Exception as e:
361
- raise ProviderError(f"NVIDIA streaming failed: {str(e)}") from e
362
 
363
  def list_models(self) -> list[ModelInfo]:
364
  """List all available NVIDIA models."""
365
  return list(self.MODELS.values())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
  def get_model_info(self, model: str) -> ModelInfo:
368
  """Get information about a specific model."""
369
  if model not in self.MODELS:
370
- raise ModelNotFoundError(f"Model {model} not found")
371
  return self.MODELS[model]
 
163
  "Content-Type": "application/json",
164
  }
165
 
166
+ async def _apply_rate_limit(self) -> None:
167
  """Apply rate limiting between requests."""
168
  elapsed = time.time() - self._last_request_time
169
  min_interval = 0.3 # 300ms between requests
 
201
  """
202
  # Validate model
203
  if model not in self.MODELS:
204
+ raise ModelNotFoundError(self.PROVIDER_NAME, model)
205
 
206
  model_info = self.MODELS[model]
207
  model_id = model_info.id
208
 
209
  # Apply rate limiting
210
+ await self._apply_rate_limit()
211
 
212
  # Build request payload
213
  payload: dict[str, Any] = {
 
237
  )
238
 
239
  if response.status_code == 401:
240
+ raise AuthenticationError(self.PROVIDER_NAME, "Invalid NVIDIA API key")
241
  elif response.status_code == 429:
242
+ raise RateLimitError(self.PROVIDER_NAME)
243
  elif response.status_code >= 400:
244
  error_detail = response.text
245
+ raise ProviderError(f"NVIDIA API error ({response.status_code}): {error_detail}", self.PROVIDER_NAME)
246
 
247
  data = response.json()
248
 
 
270
  except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
271
  raise
272
  except Exception as e:
273
+ raise ProviderError(f"NVIDIA request failed: {str(e)}", self.PROVIDER_NAME) from e
274
 
275
  async def complete_stream(
276
  self,
 
297
  Same as complete()
298
  """
299
  if model not in self.MODELS:
300
+ raise ModelNotFoundError(self.PROVIDER_NAME, model)
301
 
302
  model_info = self.MODELS[model]
303
  model_id = model_info.id
304
 
305
+ await self._apply_rate_limit()
306
 
307
  payload: dict[str, Any] = {
308
  "model": model_id,
 
330
  json=payload,
331
  ) as response:
332
  if response.status_code == 401:
333
+ raise AuthenticationError(self.PROVIDER_NAME, "Invalid NVIDIA API key")
334
  elif response.status_code == 429:
335
+ raise RateLimitError(self.PROVIDER_NAME)
336
  elif response.status_code >= 400:
337
  error_detail = await response.aread()
338
+ raise ProviderError(f"NVIDIA API error: {error_detail.decode()}", self.PROVIDER_NAME)
339
 
340
  async for line in response.aiter_lines():
341
  if not line.strip() or not line.startswith("data: "):
 
358
  except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
359
  raise
360
  except Exception as e:
361
+ raise ProviderError(f"NVIDIA streaming failed: {str(e)}", self.PROVIDER_NAME) from e
362
 
363
  def list_models(self) -> list[ModelInfo]:
364
  """List all available NVIDIA models."""
365
  return list(self.MODELS.values())
366
+
367
+ def get_models(self) -> list[ModelInfo]:
368
+ """Get list of available models (required by abstract base)."""
369
+ return self.list_models()
370
+
371
+ async def stream(
372
+ self,
373
+ messages: list[dict[str, Any]],
374
+ model: str,
375
+ temperature: float = 0.7,
376
+ max_tokens: int | None = None,
377
+ **kwargs: Any,
378
+ ) -> AsyncIterator[str]:
379
+ """Stream a completion (delegates to complete_stream)."""
380
+ async for chunk in self.complete_stream(messages, model, temperature, max_tokens, **kwargs):
381
+ yield chunk
382
 
383
  def get_model_info(self, model: str) -> ModelInfo:
384
  """Get information about a specific model."""
385
  if model not in self.MODELS:
386
+ raise ModelNotFoundError(self.PROVIDER_NAME, model)
387
  return self.MODELS[model]
backend/app/models/router.py CHANGED
@@ -313,8 +313,12 @@ class SmartModelRouter:
313
  def get_provider_for_model(self, model: str) -> BaseProvider | None:
314
  """Get the provider for a specific model."""
315
  for provider in self.providers.values():
316
- if provider.get_model_info(model):
317
- return provider
 
 
 
 
318
 
319
  # Check aliases for Anthropic and Google
320
  if hasattr(provider, "MODEL_ALIASES"):
 
313
  def get_provider_for_model(self, model: str) -> BaseProvider | None:
314
  """Get the provider for a specific model."""
315
  for provider in self.providers.values():
316
+ try:
317
+ if provider.get_model_info(model):
318
+ return provider
319
+ except Exception:
320
+ # Model not found in this provider, continue to next
321
+ pass
322
 
323
  # Check aliases for Anthropic and Google
324
  if hasattr(provider, "MODEL_ALIASES"):
backend/docs/test/ai_provider_test_report.md ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AI Provider Test Report
2
+
3
+ **Generated:** 2026-04-05 02:23:10
4
+ **Test Duration:** 23.50s
5
+
6
+ ## Summary
7
+
8
+ - **Total Tests:** 10
9
+ - **Passed:** ✅ 9
10
+ - **Failed:** ❌ 1
11
+ - **Success Rate:** 90.0%
12
+
13
+ ## Test Results
14
+
15
+ ### 1. Code Generation ✅ PASS
16
+
17
+ **Task Type:** code
18
+ **Provider:** nvidia
19
+ **Model:** llama-3.3-70b
20
+ **Duration:** 5.41s
21
+
22
+ **Prompt:**
23
+ ```
24
+ Write a Python function to calculate fibonacci numbers recursively.
25
+ ```
26
+
27
+ **Response:**
28
+ ```
29
+ ### Recursive Fibonacci Function in Python
30
+ #### Overview
31
+
32
+ The Fibonacci sequence is a series of numbers where a number is the sum of the two preceding ones, usually starting with 0 and 1. This function calculates the nth Fibonacci number using recursion.
33
+
34
+ #### Code
35
+
36
+ ```python
37
+ def fibonacci(n):
38
+ """
39
+ Calculate the nth Fibonacci number recursively.
40
+
41
+ Args:
42
+ n (int): The position of the Fibonacci number to calculate.
43
+
44
+ Returns:
45
+ int: The nth Fibonacci number.
46
+
47
+ Raises:
48
+ ...
49
+ ```
50
+
51
+ **Metadata:**
52
+ - model_used: llama-3.3-70b
53
+ - provider_used: nvidia
54
+ - tokens: 429
55
+
56
+ ---
57
+
58
+ ### 2. Data Extraction ✅ PASS
59
+
60
+ **Task Type:** extraction
61
+ **Provider:** groq
62
+ **Model:** llama-3.3-70b-versatile
63
+ **Duration:** 0.78s
64
+
65
+ **Prompt:**
66
+ ```
67
+ Extract the key information from this text: 'John Doe, age 35, lives in New York and works as a software engineer at Tech Corp since 2020.'
68
+ ```
69
+
70
+ **Response:**
71
+ ```
72
+ The key information extracted from the text is:
73
+
74
+ 1. **Name**: John Doe
75
+ 2. **Age**: 35
76
+ 3. **Location**: New York
77
+ 4. **Occupation**: Software Engineer
78
+ 5. **Employer**: Tech Corp
79
+ 6. **Employment Start Date**: 2020
80
+ ```
81
+
82
+ **Metadata:**
83
+ - model_used: llama-3.3-70b-versatile
84
+ - provider_used: groq
85
+ - tokens: 132
86
+
87
+ ---
88
+
89
+ ### 3. Reasoning Task ✅ PASS
90
+
91
+ **Task Type:** reasoning
92
+ **Provider:** nvidia
93
+ **Model:** devstral-2-123b
94
+ **Duration:** 5.25s
95
+
96
+ **Prompt:**
97
+ ```
98
+ If a train travels 120 miles in 2 hours, and another train travels 180 miles in 3 hours, which train is faster and by how much?
99
+ ```
100
+
101
+ **Response:**
102
+ ```
103
+ To determine which train is faster and by how much, we'll calculate the speed of each train using the formula:
104
+
105
+ \[
106
+ \text{Speed} = \frac{\text{Distance}}{\text{Time}}
107
+ \]
108
+
109
+ ### **First Train:**
110
+ - **Distance:** 120 miles
111
+ - **Time:** 2 hours
112
+
113
+ \[
114
+ \text{Speed}_1 = \frac{120 \text{ miles}}{2 \text{ hours}} = 60 \text{ mph}
115
+ \]
116
+
117
+ ### **Second Train:**
118
+ - **Distance:** 180 miles
119
+ - **Time:** 3 hours
120
+
121
+ \[
122
+ \text{Speed}_2 = \frac{180 \text{ miles}}{3 \text{ hours}} = 60 \text{ mph}
123
+ \]
124
+
125
+ ### **Comparison:**
126
+ Both tr...
127
+ ```
128
+
129
+ **Metadata:**
130
+ - model_used: devstral-2-123b
131
+ - provider_used: nvidia
132
+ - tokens: 251
133
+
134
+ ---
135
+
136
+ ### 4. General Question ✅ PASS
137
+
138
+ **Task Type:** general
139
+ **Provider:** groq
140
+ **Model:** llama-3.3-70b-versatile
141
+ **Duration:** 0.92s
142
+
143
+ **Prompt:**
144
+ ```
145
+ What are the three primary colors?
146
+ ```
147
+
148
+ **Response:**
149
+ ```
150
+ The three primary colors are:
151
+ 1. Red
152
+ 2. Blue
153
+ 3. Yellow
154
+
155
+ These colors cannot be created by mixing other colors together, and they are the base for creating all other colors.
156
+ ```
157
+
158
+ **Metadata:**
159
+ - model_used: llama-3.3-70b-versatile
160
+ - provider_used: groq
161
+ - tokens: 83
162
+
163
+ ---
164
+
165
+ ### 5. JSON Generation ✅ PASS
166
+
167
+ **Task Type:** code
168
+ **Provider:** nvidia
169
+ **Model:** devstral-2-123b
170
+ **Duration:** 4.44s
171
+
172
+ **Prompt:**
173
+ ```
174
+ Generate a JSON object representing a user profile with name, email, age, and interests (array).
175
+ ```
176
+
177
+ **Response:**
178
+ ```
179
+ Here's a JSON object representing a user profile with the specified fields:
180
+
181
+ ```json
182
+ {
183
+ "userProfile": {
184
+ "name": "John Doe",
185
+ "email": "john.doe@example.com",
186
+ "age": 30,
187
+ "interests": ["hiking", "photography", "reading", "cooking"]
188
+ }
189
+ }
190
+ ```
191
+
192
+ ### Explanation:
193
+ - **"name"**: A string representing the user's full name.
194
+ - **"email"**: A string representing the user's email address.
195
+ - **"age"**: A number representing the user's age.
196
+ - **"interests"**: An array of strings listing the us...
197
+ ```
198
+
199
+ **Metadata:**
200
+ - model_used: devstral-2-123b
201
+ - provider_used: nvidia
202
+ - tokens: 187
203
+
204
+ ---
205
+
206
+ ### 6. Text Summarization ✅ PASS
207
+
208
+ **Task Type:** general
209
+ **Provider:** groq
210
+ **Model:** llama-3.3-70b-versatile
211
+ **Duration:** 0.33s
212
+
213
+ **Prompt:**
214
+ ```
215
+ Summarize in one sentence: Artificial intelligence is transforming industries by automating tasks, improving decision-making, and creating new opportunities for innovation.
216
+ ```
217
+
218
+ **Response:**
219
+ ```
220
+ Artificial intelligence is revolutionizing various industries by automating tasks, enhancing decision-making, and driving innovation, ultimately transforming the way businesses operate and creating new opportunities.
221
+ ```
222
+
223
+ **Metadata:**
224
+ - model_used: llama-3.3-70b-versatile
225
+ - provider_used: groq
226
+ - tokens: 96
227
+
228
+ ---
229
+
230
+ ### 7. Math Problem ✅ PASS
231
+
232
+ **Task Type:** reasoning
233
+ **Provider:** nvidia
234
+ **Model:** llama-3.3-70b
235
+ **Duration:** 2.71s
236
+
237
+ **Prompt:**
238
+ ```
239
+ Solve: 2x + 5 = 15. Show your work.
240
+ ```
241
+
242
+ **Response:**
243
+ ```
244
+ To solve for x, we need to isolate x on one side of the equation.
245
+
246
+ 2x + 5 = 15
247
+
248
+ First, subtract 5 from both sides of the equation:
249
+ 2x + 5 - 5 = 15 - 5
250
+ 2x = 10
251
+
252
+ Next, divide both sides of the equation by 2:
253
+ 2x / 2 = 10 / 2
254
+ x = 5
255
+
256
+ Therefore, the value of x is 5.
257
+ ```
258
+
259
+ **Metadata:**
260
+ - model_used: llama-3.3-70b
261
+ - provider_used: nvidia
262
+ - tokens: 152
263
+
264
+ ---
265
+
266
+ ### 8. Creative Writing ✅ PASS
267
+
268
+ **Task Type:** general
269
+ **Provider:** nvidia
270
+ **Model:** llama-3.3-70b
271
+ **Duration:** 1.33s
272
+
273
+ **Prompt:**
274
+ ```
275
+ Write a haiku about coding at night.
276
+ ```
277
+
278
+ **Response:**
279
+ ```
280
+ Moonlit code unfurls
281
+ Midnight keyboard symphony
282
+ Glowing screen delight
283
+ ```
284
+
285
+ **Metadata:**
286
+ - model_used: llama-3.3-70b
287
+ - provider_used: nvidia
288
+ - tokens: 62
289
+
290
+ ---
291
+
292
+ ### 9. Code Debug ✅ PASS
293
+
294
+ **Task Type:** code
295
+ **Provider:** groq
296
+ **Model:** llama-3.3-70b-versatile
297
+ **Duration:** 0.76s
298
+
299
+ **Prompt:**
300
+ ```
301
+ Find the bug in this code: def add(a, b): return a + b + 1
302
+ ```
303
+
304
+ **Response:**
305
+ ```
306
+ **Bug Identification**
307
+
308
+ The bug in the given code is that the function `add(a, b)` is supposed to return the sum of two numbers `a` and `b`, but it is currently adding an extra `1` to the result.
309
+
310
+ **Corrected Code**
311
+
312
+ ```python
313
+ def add(a, b):
314
+ """Returns the sum of two numbers."""
315
+ return a + b
316
+ ```
317
+
318
+ **Explanation**
319
+
320
+ The original code `return a + b + 1` is incrementing the sum by `1`, which is not the expected behavior of an `add` function. The corrected code simply returns the sum of `a` an...
321
+ ```
322
+
323
+ **Metadata:**
324
+ - model_used: llama-3.3-70b-versatile
325
+ - provider_used: groq
326
+ - tokens: 219
327
+
328
+ ---
329
+
330
+ ### 10. Complex Reasoning ❌ FAIL
331
+
332
+ **Task Type:** reasoning
333
+ **Provider:** nvidia
334
+ **Model:** devstral-2-123b
335
+ **Duration:** 1.56s
336
+
337
+ **Prompt:**
338
+ ```
339
+ If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly?
340
+ ```
341
+
342
+ **Error:**
343
+ ```
344
+ [router] All models failed. Last error: [nvidia] NVIDIA API error (500): {"error":{"message":"EngineCore encountered an issue. See stack trace (above) for the root cause.","type":"Internal Server Error","param":null,"code":500}}
345
+ ```
346
+
347
+ ---
348
+
349
+ ## Provider Performance
350
+
351
+ | Provider | Tests | Passed | Failed | Success Rate | Avg Duration |
352
+ |----------|-------|--------|--------|--------------|-------------|
353
+ | groq | 4 | 4 | 0 | 100.0% | 0.70s |
354
+ | nvidia | 6 | 5 | 1 | 83.3% | 3.45s |
backend/test_ai_providers.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Comprehensive AI Provider Test Script
2
+ Tests the NVIDIA and Groq providers with 10 different prompts (a Google API key is passed to the router, but no prompt currently targets Gemini).
3
+ """
4
+
5
+ import asyncio
6
+ import json
7
+ import time
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+
11
+ from app.config import get_settings
12
+ from app.models.router import SmartModelRouter, TaskType
13
+
14
+
15
+ # Test prompts covering different use cases
16
# Each row is (name, prompt, task_type, preferred_provider, preferred_model);
# the comprehension expands every row into the dict shape the test loop reads.
TEST_PROMPTS = [
    {
        "name": title,
        "prompt": text,
        "task_type": kind,
        "preferred_provider": vendor,
        "preferred_model": model_name,
    }
    for title, text, kind, vendor, model_name in (
        (
            "Code Generation",
            "Write a Python function to calculate fibonacci numbers recursively.",
            TaskType.CODE,
            "nvidia",
            "llama-3.3-70b",
        ),
        (
            "Data Extraction",
            "Extract the key information from this text: 'John Doe, age 35, lives in New York and works as a software engineer at Tech Corp since 2020.'",
            TaskType.EXTRACTION,
            "groq",
            "llama-3.3-70b-versatile",
        ),
        (
            "Reasoning Task",
            "If a train travels 120 miles in 2 hours, and another train travels 180 miles in 3 hours, which train is faster and by how much?",
            TaskType.REASONING,
            "nvidia",
            "devstral-2-123b",
        ),
        (
            "General Question",
            "What are the three primary colors?",
            TaskType.GENERAL,
            "groq",
            "llama-3.3-70b-versatile",
        ),
        (
            "JSON Generation",
            "Generate a JSON object representing a user profile with name, email, age, and interests (array).",
            TaskType.CODE,
            "nvidia",
            "devstral-2-123b",
        ),
        (
            "Text Summarization",
            "Summarize in one sentence: Artificial intelligence is transforming industries by automating tasks, improving decision-making, and creating new opportunities for innovation.",
            TaskType.GENERAL,
            "groq",
            "llama-3.3-70b-versatile",
        ),
        (
            "Math Problem",
            "Solve: 2x + 5 = 15. Show your work.",
            TaskType.REASONING,
            "nvidia",
            "llama-3.3-70b",
        ),
        (
            "Creative Writing",
            "Write a haiku about coding at night.",
            TaskType.GENERAL,
            "nvidia",
            "llama-3.3-70b",
        ),
        (
            "Code Debug",
            "Find the bug in this code: def add(a, b): return a + b + 1",
            TaskType.CODE,
            "groq",
            "llama-3.3-70b-versatile",
        ),
        (
            "Complex Reasoning",
            "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly?",
            TaskType.REASONING,
            "nvidia",
            "devstral-2-123b",
        ),
    )
]
88
+
89
+
90
+ class TestReporter:
91
+ """Test reporter for generating markdown reports."""
92
+
93
+ def __init__(self):
94
+ self.results = []
95
+ self.start_time = None
96
+ self.end_time = None
97
+
98
+ def start(self):
99
+ """Mark test start time."""
100
+ self.start_time = datetime.now()
101
+
102
+ def end(self):
103
+ """Mark test end time."""
104
+ self.end_time = datetime.now()
105
+
106
+ def add_result(self, test_case: dict, success: bool, response: str = None,
107
+ error: str = None, duration: float = 0, metadata: dict = None):
108
+ """Add a test result."""
109
+ self.results.append({
110
+ "test_name": test_case["name"],
111
+ "prompt": test_case["prompt"],
112
+ "task_type": test_case["task_type"].value,
113
+ "preferred_provider": test_case.get("preferred_provider"),
114
+ "preferred_model": test_case.get("preferred_model"),
115
+ "success": success,
116
+ "response": response,
117
+ "error": error,
118
+ "duration_seconds": duration,
119
+ "metadata": metadata or {},
120
+ "timestamp": datetime.now().isoformat(),
121
+ })
122
+
123
+ def generate_markdown(self) -> str:
124
+ """Generate markdown test report."""
125
+ total_tests = len(self.results)
126
+ passed = sum(1 for r in self.results if r["success"])
127
+ failed = total_tests - passed
128
+ success_rate = (passed / total_tests * 100) if total_tests > 0 else 0
129
+ total_duration = self.end_time - self.start_time if self.end_time and self.start_time else None
130
+
131
+ md = f"""# AI Provider Test Report
132
+
133
+ **Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
134
+ **Test Duration:** {total_duration.total_seconds():.2f}s
135
+
136
+ ## Summary
137
+
138
+ - **Total Tests:** {total_tests}
139
+ - **Passed:** ✅ {passed}
140
+ - **Failed:** ❌ {failed}
141
+ - **Success Rate:** {success_rate:.1f}%
142
+
143
+ ## Test Results
144
+
145
+ """
146
+
147
+ for i, result in enumerate(self.results, 1):
148
+ status = "✅ PASS" if result["success"] else "❌ FAIL"
149
+ md += f"### {i}. {result['test_name']} {status}\n\n"
150
+ md += f"**Task Type:** {result['task_type']} \n"
151
+ md += f"**Provider:** {result['preferred_provider']} \n"
152
+ md += f"**Model:** {result['preferred_model']} \n"
153
+ md += f"**Duration:** {result['duration_seconds']:.2f}s \n\n"
154
+
155
+ md += f"**Prompt:**\n```\n{result['prompt']}\n```\n\n"
156
+
157
+ if result["success"]:
158
+ md += f"**Response:**\n```\n{result['response'][:500]}{'...' if len(result['response']) > 500 else ''}\n```\n\n"
159
+
160
+ if result["metadata"]:
161
+ md += f"**Metadata:**\n"
162
+ for key, value in result["metadata"].items():
163
+ md += f"- {key}: {value}\n"
164
+ md += "\n"
165
+ else:
166
+ md += f"**Error:**\n```\n{result['error']}\n```\n\n"
167
+
168
+ md += "---\n\n"
169
+
170
+ # Provider summary
171
+ md += "## Provider Performance\n\n"
172
+ providers = {}
173
+ for result in self.results:
174
+ provider = result["preferred_provider"]
175
+ if provider not in providers:
176
+ providers[provider] = {"total": 0, "passed": 0, "total_duration": 0}
177
+ providers[provider]["total"] += 1
178
+ if result["success"]:
179
+ providers[provider]["passed"] += 1
180
+ providers[provider]["total_duration"] += result["duration_seconds"]
181
+
182
+ md += "| Provider | Tests | Passed | Failed | Success Rate | Avg Duration |\n"
183
+ md += "|----------|-------|--------|--------|--------------|-------------|\n"
184
+
185
+ for provider, stats in sorted(providers.items()):
186
+ success_rate = (stats["passed"] / stats["total"] * 100) if stats["total"] > 0 else 0
187
+ avg_duration = stats["total_duration"] / stats["total"] if stats["total"] > 0 else 0
188
+ md += f"| {provider} | {stats['total']} | {stats['passed']} | {stats['total'] - stats['passed']} | {success_rate:.1f}% | {avg_duration:.2f}s |\n"
189
+
190
+ return md
191
+
192
+
193
async def run_tests():
    """Run every entry of TEST_PROMPTS through the model router and report.

    Each prompt is sent to its preferred model with ``fallback=False`` so only
    the requested model is exercised. Outcomes are collected in a
    TestReporter, and the Markdown report is written to
    ``docs/test/ai_provider_test_report.md`` next to this script.

    Returns:
        bool: True when every test case passed, False otherwise.
    """
    rule = "=" * 80
    print(rule)
    print("AI Provider Comprehensive Test Suite")
    print(rule)
    print()

    # Initialize settings and router
    settings = get_settings()
    print("Initializing model router...")
    print(f" NVIDIA API Key: {'[SET]' if settings.nvidia_api_key else '[NOT SET]'}")
    print(f" Groq API Key: {'[SET]' if settings.groq_api_key else '[NOT SET]'}")
    print(f" Google API Key: {'[SET]' if settings.google_api_key else '[NOT SET]'}")
    print()

    router = SmartModelRouter(
        openai_api_key=settings.openai_api_key,
        anthropic_api_key=settings.anthropic_api_key,
        google_api_key=settings.google_api_key,
        groq_api_key=settings.groq_api_key,
        nvidia_api_key=settings.nvidia_api_key,
    )
    await router.initialize()

    available_providers = list(router.providers)
    print(f"Available providers: {', '.join(available_providers)}")
    print()

    reporter = TestReporter()
    reporter.start()

    # Run tests
    for i, test_case in enumerate(TEST_PROMPTS, 1):
        print(f"[{i}/{len(TEST_PROMPTS)}] Running: {test_case['name']}")
        print(f" Provider: {test_case['preferred_provider']}")
        print(f" Model: {test_case['preferred_model']}")
        print(f" Task Type: {test_case['task_type'].value}")

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during the request.
        started = time.perf_counter()

        try:
            response = await router.complete(
                messages=[{"role": "user", "content": test_case["prompt"]}],
                model=test_case.get("preferred_model"),
                task_type=test_case["task_type"],
                fallback=False,  # No fallback - test only the requested model
                max_tokens=500,
                temperature=0.7,
            )

            duration = time.perf_counter() - started

            if response and response.content:
                print(f" [OK] Success ({duration:.2f}s)")
                print(f" Response: {response.content[:100]}...")

                reporter.add_result(
                    test_case=test_case,
                    success=True,
                    response=response.content,
                    duration=duration,
                    metadata={
                        "model_used": response.model,
                        "provider_used": response.provider,
                        "tokens": response.usage.total_tokens if response.usage else 0,
                    },
                )
            else:
                print(" [FAIL] Failed: Empty response")
                reporter.add_result(
                    test_case=test_case,
                    success=False,
                    error="Empty response from provider",
                    duration=duration,
                )

        except Exception as e:
            # Deliberately broad: any failure of one case is recorded in the
            # report and the remaining cases still run.
            duration = time.perf_counter() - started
            print(f" [FAIL] Failed ({duration:.2f}s): {str(e)}")
            reporter.add_result(
                test_case=test_case,
                success=False,
                error=str(e),
                duration=duration,
            )

        print()

    reporter.end()

    # Generate report
    print(rule)
    print("Generating test report...")

    report_md = reporter.generate_markdown()

    # Save report next to this script rather than relative to the current
    # working directory, so the output location does not depend on where the
    # script is launched from.
    report_path = (
        Path(__file__).resolve().parent / "docs" / "test" / "ai_provider_test_report.md"
    )
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(report_md, encoding="utf-8")

    print(f"[OK] Report saved to: {report_path}")
    print()

    # Print summary
    total = len(reporter.results)
    passed = sum(1 for r in reporter.results if r["success"])
    failed = total - passed
    # Guard the percentage so an empty prompt list cannot divide by zero.
    success_rate = (passed / total * 100) if total else 0.0

    print(rule)
    print("TEST SUMMARY")
    print(rule)
    print(f"Total Tests: {total}")
    print(f"Passed: [OK] {passed}")
    print(f"Failed: [X] {failed}")
    print(f"Success Rate: {success_rate:.1f}%")
    print(rule)

    return passed == total
312
+
313
+
314
if __name__ == "__main__":
    # Exit status mirrors the suite outcome: 0 when every test passed, else 1.
    # SystemExit is raised directly because the bare exit() helper is injected
    # by the site module for interactive use and may not exist in every run.
    success = asyncio.run(run_tests())
    raise SystemExit(0 if success else 1)