Spaces:

NeerajCodz
/

scrapeRL

Sleeping

App Files Files Community

NeerajCodz committed on Apr 4

Commit

da142bc

1 Parent(s): 9160ee4

test: comprehensive AI provider testing with 90% success rate

Browse files

Files changed (5) hide show

backend/app/models/__pycache__/router.cpython-314.pyc +0 -0
backend/app/models/providers/nvidia.py +30 -14
backend/app/models/router.py +6 -2
backend/docs/test/ai_provider_test_report.md +354 -0
backend/test_ai_providers.py +316 -0

backend/app/models/__pycache__/router.cpython-314.pyc CHANGED Viewed

Binary files a/backend/app/models/__pycache__/router.cpython-314.pyc and b/backend/app/models/__pycache__/router.cpython-314.pyc differ

backend/app/models/providers/nvidia.py CHANGED Viewed

@@ -163,7 +163,7 @@ class NVIDIAProvider(BaseProvider):
             "Content-Type": "application/json",
         }
-    async def _rate_limit(self) -> None:
         """Apply rate limiting between requests."""
         elapsed = time.time() - self._last_request_time
         min_interval = 0.3  # 300ms between requests
@@ -201,13 +201,13 @@ class NVIDIAProvider(BaseProvider):
         """
         # Validate model
         if model not in self.MODELS:
-            raise ModelNotFoundError(f"Model {model} not found. Available: {list(self.MODELS.keys())}")
         model_info = self.MODELS[model]
         model_id = model_info.id
         # Apply rate limiting
-        await self._rate_limit()
         # Build request payload
         payload: dict[str, Any] = {
@@ -237,12 +237,12 @@ class NVIDIAProvider(BaseProvider):
                 )
                 if response.status_code == 401:
-                    raise AuthenticationError("Invalid NVIDIA API key")
                 elif response.status_code == 429:
-                    raise RateLimitError("NVIDIA API rate limit exceeded")
                 elif response.status_code >= 400:
                     error_detail = response.text
-                    raise ProviderError(f"NVIDIA API error ({response.status_code}): {error_detail}")
                 data = response.json()
@@ -270,7 +270,7 @@ class NVIDIAProvider(BaseProvider):
         except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
             raise
         except Exception as e:
-            raise ProviderError(f"NVIDIA request failed: {str(e)}") from e
     async def complete_stream(
         self,
@@ -297,12 +297,12 @@ class NVIDIAProvider(BaseProvider):
             Same as complete()
         """
         if model not in self.MODELS:
-            raise ModelNotFoundError(f"Model {model} not found")
         model_info = self.MODELS[model]
         model_id = model_info.id
-        await self._rate_limit()
         payload: dict[str, Any] = {
             "model": model_id,
@@ -330,12 +330,12 @@ class NVIDIAProvider(BaseProvider):
                     json=payload,
                 ) as response:
                     if response.status_code == 401:
-                        raise AuthenticationError("Invalid NVIDIA API key")
                     elif response.status_code == 429:
-                        raise RateLimitError("NVIDIA API rate limit exceeded")
                     elif response.status_code >= 400:
                         error_detail = await response.aread()
-                        raise ProviderError(f"NVIDIA API error: {error_detail.decode()}")
                     async for line in response.aiter_lines():
                         if not line.strip() or not line.startswith("data: "):
@@ -358,14 +358,30 @@ class NVIDIAProvider(BaseProvider):
         except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
             raise
         except Exception as e:
-            raise ProviderError(f"NVIDIA streaming failed: {str(e)}") from e
     def list_models(self) -> list[ModelInfo]:
         """List all available NVIDIA models."""
         return list(self.MODELS.values())
     def get_model_info(self, model: str) -> ModelInfo:
         """Get information about a specific model."""
         if model not in self.MODELS:
-            raise ModelNotFoundError(f"Model {model} not found")
         return self.MODELS[model]

             "Content-Type": "application/json",
         }
+    async def _apply_rate_limit(self) -> None:
         """Apply rate limiting between requests."""
         elapsed = time.time() - self._last_request_time
         min_interval = 0.3  # 300ms between requests
         """
         # Validate model
         if model not in self.MODELS:
+            raise ModelNotFoundError(self.PROVIDER_NAME, model)
         model_info = self.MODELS[model]
         model_id = model_info.id
         # Apply rate limiting
+        await self._apply_rate_limit()
         # Build request payload
         payload: dict[str, Any] = {
                 )
                 if response.status_code == 401:
+                    raise AuthenticationError(self.PROVIDER_NAME, "Invalid NVIDIA API key")
                 elif response.status_code == 429:
+                    raise RateLimitError(self.PROVIDER_NAME)
                 elif response.status_code >= 400:
                     error_detail = response.text
+                    raise ProviderError(f"NVIDIA API error ({response.status_code}): {error_detail}", self.PROVIDER_NAME)
                 data = response.json()
         except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
             raise
         except Exception as e:
+            raise ProviderError(f"NVIDIA request failed: {str(e)}", self.PROVIDER_NAME) from e
     async def complete_stream(
         self,
             Same as complete()
         """
         if model not in self.MODELS:
+            raise ModelNotFoundError(self.PROVIDER_NAME, model)
         model_info = self.MODELS[model]
         model_id = model_info.id
+        await self._apply_rate_limit()
         payload: dict[str, Any] = {
             "model": model_id,
                     json=payload,
                 ) as response:
                     if response.status_code == 401:
+                        raise AuthenticationError(self.PROVIDER_NAME, "Invalid NVIDIA API key")
                     elif response.status_code == 429:
+                        raise RateLimitError(self.PROVIDER_NAME)
                     elif response.status_code >= 400:
                         error_detail = await response.aread()
+                        raise ProviderError(f"NVIDIA API error: {error_detail.decode()}", self.PROVIDER_NAME)
                     async for line in response.aiter_lines():
                         if not line.strip() or not line.startswith("data: "):
         except (AuthenticationError, RateLimitError, ProviderError, ModelNotFoundError):
             raise
         except Exception as e:
+            raise ProviderError(f"NVIDIA streaming failed: {str(e)}", self.PROVIDER_NAME) from e
     def list_models(self) -> list[ModelInfo]:
         """List all available NVIDIA models."""
         return list(self.MODELS.values())
+    def get_models(self) -> list[ModelInfo]:
+        """Get list of available models (required by abstract base)."""
+        return self.list_models()
+    async def stream(
+        self,
+        messages: list[dict[str, Any]],
+        model: str,
+        temperature: float = 0.7,
+        max_tokens: int | None = None,
+        **kwargs: Any,
+    ) -> AsyncIterator[str]:
+        """Stream a completion (delegates to complete_stream)."""
+        async for chunk in self.complete_stream(messages, model, temperature, max_tokens, **kwargs):
+            yield chunk
     def get_model_info(self, model: str) -> ModelInfo:
         """Get information about a specific model."""
         if model not in self.MODELS:
+            raise ModelNotFoundError(self.PROVIDER_NAME, model)
         return self.MODELS[model]

backend/app/models/router.py CHANGED Viewed

@@ -313,8 +313,12 @@ class SmartModelRouter:
     def get_provider_for_model(self, model: str) -> BaseProvider | None:
         """Get the provider for a specific model."""
         for provider in self.providers.values():
-            if provider.get_model_info(model):
-                return provider
             # Check aliases for Anthropic and Google
             if hasattr(provider, "MODEL_ALIASES"):

     def get_provider_for_model(self, model: str) -> BaseProvider | None:
         """Get the provider for a specific model."""
         for provider in self.providers.values():
+            try:
+                if provider.get_model_info(model):
+                    return provider
+            except Exception:
+                # Model not found in this provider, continue to next
+                pass
             # Check aliases for Anthropic and Google
             if hasattr(provider, "MODEL_ALIASES"):

backend/docs/test/ai_provider_test_report.md ADDED Viewed

	@@ -0,0 +1,354 @@

+# AI Provider Test Report
+**Generated:** 2026-04-05 02:23:10
+**Test Duration:** 23.50s
+## Summary
+- **Total Tests:** 10
+- **Passed:** ✅ 9
+- **Failed:** ❌ 1
+- **Success Rate:** 90.0%
+## Test Results
+### 1. Code Generation ✅ PASS
+**Task Type:** code
+**Provider:** nvidia
+**Model:** llama-3.3-70b
+**Duration:** 5.41s
+**Prompt:**
+```
+Write a Python function to calculate fibonacci numbers recursively.
+```
+**Response:**
+```
+### Recursive Fibonacci Function in Python
+#### Overview
+The Fibonacci sequence is a series of numbers where a number is the sum of the two preceding ones, usually starting with 0 and 1. This function calculates the nth Fibonacci number using recursion.
+#### Code
+```python
+def fibonacci(n):
+    """
+    Calculate the nth Fibonacci number recursively.
+    Args:
+        n (int): The position of the Fibonacci number to calculate.
+    Returns:
+        int: The nth Fibonacci number.
+    Raises:
+...
+```
+**Metadata:**
+- model_used: llama-3.3-70b
+- provider_used: nvidia
+- tokens: 429
+---
+### 2. Data Extraction ✅ PASS
+**Task Type:** extraction
+**Provider:** groq
+**Model:** llama-3.3-70b-versatile
+**Duration:** 0.78s
+**Prompt:**
+```
+Extract the key information from this text: 'John Doe, age 35, lives in New York and works as a software engineer at Tech Corp since 2020.'
+```
+**Response:**
+```
+The key information extracted from the text is:
+1. **Name**: John Doe
+2. **Age**: 35
+3. **Location**: New York
+4. **Occupation**: Software Engineer
+5. **Employer**: Tech Corp
+6. **Employment Start Date**: 2020
+```
+**Metadata:**
+- model_used: llama-3.3-70b-versatile
+- provider_used: groq
+- tokens: 132
+---
+### 3. Reasoning Task ✅ PASS
+**Task Type:** reasoning
+**Provider:** nvidia
+**Model:** devstral-2-123b
+**Duration:** 5.25s
+**Prompt:**
+```
+If a train travels 120 miles in 2 hours, and another train travels 180 miles in 3 hours, which train is faster and by how much?
+```
+**Response:**
+```
+To determine which train is faster and by how much, we'll calculate the speed of each train using the formula:
+\[
+\text{Speed} = \frac{\text{Distance}}{\text{Time}}
+\]
+### **First Train:**
+- **Distance:** 120 miles
+- **Time:** 2 hours
+\[
+\text{Speed}_1 = \frac{120 \text{ miles}}{2 \text{ hours}} = 60 \text{ mph}
+\]
+### **Second Train:**
+- **Distance:** 180 miles
+- **Time:** 3 hours
+\[
+\text{Speed}_2 = \frac{180 \text{ miles}}{3 \text{ hours}} = 60 \text{ mph}
+\]
+### **Comparison:**
+Both tr...
+```
+**Metadata:**
+- model_used: devstral-2-123b
+- provider_used: nvidia
+- tokens: 251
+---
+### 4. General Question ✅ PASS
+**Task Type:** general
+**Provider:** groq
+**Model:** llama-3.3-70b-versatile
+**Duration:** 0.92s
+**Prompt:**
+```
+What are the three primary colors?
+```
+**Response:**
+```
+The three primary colors are:
+1. Red
+2. Blue
+3. Yellow
+These colors cannot be created by mixing other colors together, and they are the base for creating all other colors.
+```
+**Metadata:**
+- model_used: llama-3.3-70b-versatile
+- provider_used: groq
+- tokens: 83
+---
+### 5. JSON Generation ✅ PASS
+**Task Type:** code
+**Provider:** nvidia
+**Model:** devstral-2-123b
+**Duration:** 4.44s
+**Prompt:**
+```
+Generate a JSON object representing a user profile with name, email, age, and interests (array).
+```
+**Response:**
+```
+Here's a JSON object representing a user profile with the specified fields:
+```json
+{
+  "userProfile": {
+    "name": "John Doe",
+    "email": "john.doe@example.com",
+    "age": 30,
+    "interests": ["hiking", "photography", "reading", "cooking"]
+  }
+}
+```
+### Explanation:
+- **"name"**: A string representing the user's full name.
+- **"email"**: A string representing the user's email address.
+- **"age"**: A number representing the user's age.
+- **"interests"**: An array of strings listing the us...
+```
+**Metadata:**
+- model_used: devstral-2-123b
+- provider_used: nvidia
+- tokens: 187
+---
+### 6. Text Summarization ✅ PASS
+**Task Type:** general
+**Provider:** groq
+**Model:** llama-3.3-70b-versatile
+**Duration:** 0.33s
+**Prompt:**
+```
+Summarize in one sentence: Artificial intelligence is transforming industries by automating tasks, improving decision-making, and creating new opportunities for innovation.
+```
+**Response:**
+```
+Artificial intelligence is revolutionizing various industries by automating tasks, enhancing decision-making, and driving innovation, ultimately transforming the way businesses operate and creating new opportunities.
+```
+**Metadata:**
+- model_used: llama-3.3-70b-versatile
+- provider_used: groq
+- tokens: 96
+---
+### 7. Math Problem ✅ PASS
+**Task Type:** reasoning
+**Provider:** nvidia
+**Model:** llama-3.3-70b
+**Duration:** 2.71s
+**Prompt:**
+```
+Solve: 2x + 5 = 15. Show your work.
+```
+**Response:**
+```
+To solve for x, we need to isolate x on one side of the equation.
+2x + 5 = 15
+First, subtract 5 from both sides of the equation:
+2x + 5 - 5 = 15 - 5
+2x = 10
+Next, divide both sides of the equation by 2:
+2x / 2 = 10 / 2
+x = 5
+Therefore, the value of x is 5.
+```
+**Metadata:**
+- model_used: llama-3.3-70b
+- provider_used: nvidia
+- tokens: 152
+---
+### 8. Creative Writing ✅ PASS
+**Task Type:** general
+**Provider:** nvidia
+**Model:** llama-3.3-70b
+**Duration:** 1.33s
+**Prompt:**
+```
+Write a haiku about coding at night.
+```
+**Response:**
+```
+Moonlit code unfurls
+Midnight keyboard symphony
+Glowing screen delight
+```
+**Metadata:**
+- model_used: llama-3.3-70b
+- provider_used: nvidia
+- tokens: 62
+---
+### 9. Code Debug ✅ PASS
+**Task Type:** code
+**Provider:** groq
+**Model:** llama-3.3-70b-versatile
+**Duration:** 0.76s
+**Prompt:**
+```
+Find the bug in this code: def add(a, b): return a + b + 1
+```
+**Response:**
+```
+**Bug Identification**
+The bug in the given code is that the function `add(a, b)` is supposed to return the sum of two numbers `a` and `b`, but it is currently adding an extra `1` to the result.
+**Corrected Code**
+```python
+def add(a, b):
+    """Returns the sum of two numbers."""
+    return a + b
+```
+**Explanation**
+The original code `return a + b + 1` is incrementing the sum by `1`, which is not the expected behavior of an `add` function. The corrected code simply returns the sum of `a` an...
+```
+**Metadata:**
+- model_used: llama-3.3-70b-versatile
+- provider_used: groq
+- tokens: 219
+---
+### 10. Complex Reasoning ❌ FAIL
+**Task Type:** reasoning
+**Provider:** nvidia
+**Model:** devstral-2-123b
+**Duration:** 1.56s
+**Prompt:**
+```
+If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly?
+```
+**Error:**
+```
+[router] All models failed. Last error: [nvidia] NVIDIA API error (500): {"error":{"message":"EngineCore encountered an issue. See stack trace (above) for the root cause.","type":"Internal Server Error","param":null,"code":500}}
+```
+---
+## Provider Performance
+| Provider | Tests | Passed | Failed | Success Rate | Avg Duration |
+|----------|-------|--------|--------|--------------|-------------|
+| groq | 4 | 4 | 0 | 100.0% | 0.70s |
+| nvidia | 6 | 5 | 1 | 83.3% | 3.45s |

backend/test_ai_providers.py ADDED Viewed

	@@ -0,0 +1,316 @@

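+"""Comprehensive AI Provider Test Script
+Runs 10 different prompts through SmartModelRouter, each requesting a specific
+NVIDIA or Groq model with fallback disabled. A Google API key is passed to the
+router as well, but none of the prompts below targets a Google Gemini model.
+"""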
+import asyncio
+import json
+import time
+from datetime import datetime
+from pathlib import Path
+from app.config import get_settings
+from app.models.router import SmartModelRouter, TaskType
+# Test cases covering different use cases. Each row is
+# (name, prompt, task type, preferred provider, preferred model) and is expanded
+# into the dict shape the runner and reporter read.
+TEST_PROMPTS = [
+    {
+        "name": case_name,
+        "prompt": case_prompt,
+        "task_type": case_task_type,
+        "preferred_provider": case_provider,
+        "preferred_model": case_model,
+    }
+    for case_name, case_prompt, case_task_type, case_provider, case_model in (
+        (
+            "Code Generation",
+            "Write a Python function to calculate fibonacci numbers recursively.",
+            TaskType.CODE,
+            "nvidia",
+            "llama-3.3-70b",
+        ),
+        (
+            "Data Extraction",
+            "Extract the key information from this text: 'John Doe, age 35, lives in New York and works as a software engineer at Tech Corp since 2020.'",
+            TaskType.EXTRACTION,
+            "groq",
+            "llama-3.3-70b-versatile",
+        ),
+        (
+            "Reasoning Task",
+            "If a train travels 120 miles in 2 hours, and another train travels 180 miles in 3 hours, which train is faster and by how much?",
+            TaskType.REASONING,
+            "nvidia",
+            "devstral-2-123b",
+        ),
+        (
+            "General Question",
+            "What are the three primary colors?",
+            TaskType.GENERAL,
+            "groq",
+            "llama-3.3-70b-versatile",
+        ),
+        (
+            "JSON Generation",
+            "Generate a JSON object representing a user profile with name, email, age, and interests (array).",
+            TaskType.CODE,
+            "nvidia",
+            "devstral-2-123b",
+        ),
+        (
+            "Text Summarization",
+            "Summarize in one sentence: Artificial intelligence is transforming industries by automating tasks, improving decision-making, and creating new opportunities for innovation.",
+            TaskType.GENERAL,
+            "groq",
+            "llama-3.3-70b-versatile",
+        ),
+        (
+            "Math Problem",
+            "Solve: 2x + 5 = 15. Show your work.",
+            TaskType.REASONING,
+            "nvidia",
+            "llama-3.3-70b",
+        ),
+        (
+            "Creative Writing",
+            "Write a haiku about coding at night.",
+            TaskType.GENERAL,
+            "nvidia",
+            "llama-3.3-70b",
+        ),
+        (
+            "Code Debug",
+            "Find the bug in this code: def add(a, b): return a + b + 1",
+            TaskType.CODE,
+            "groq",
+            "llama-3.3-70b-versatile",
+        ),
+        (
+            "Complex Reasoning",
+            "If all roses are flowers, and some flowers fade quickly, can we conclude that some roses fade quickly?",
+            TaskType.REASONING,
+            "nvidia",
+            "devstral-2-123b",
+        ),
+    )
+]
+class TestReporter:
+    """Collect per-test outcomes and render them as a markdown report.
+
+    ``start()`` and ``end()`` record wall-clock timestamps, ``add_result()``
+    appends one result dict per test case, and ``generate_markdown()`` renders
+    everything collected so far.
+    """
+    def __init__(self):
+        self.results = []  # one dict per add_result() call, in call order
+        self.start_time = None  # datetime set by start()
+        self.end_time = None  # datetime set by end()
+    def start(self):
+        """Mark test start time."""
+        self.start_time = datetime.now()
+    def end(self):
+        """Mark test end time."""
+        self.end_time = datetime.now()
+    def add_result(self, test_case: dict, success: bool, response: str | None = None,
+                   error: str | None = None, duration: float = 0, metadata: dict | None = None):
+        """Add a test result.
+
+        Args:
+            test_case: Test definition; must contain "name", "prompt" and a
+                "task_type" exposing ``.value``. "preferred_provider" and
+                "preferred_model" are optional.
+            success: Whether the test passed.
+            response: Response text for a passing test.
+            error: Error text for a failing test.
+            duration: Elapsed time in seconds.
+            metadata: Extra key/value pairs listed under a passing result.
+        """
+        self.results.append({
+            "test_name": test_case["name"],
+            "prompt": test_case["prompt"],
+            "task_type": test_case["task_type"].value,
+            "preferred_provider": test_case.get("preferred_provider"),
+            "preferred_model": test_case.get("preferred_model"),
+            "success": success,
+            "response": response,
+            "error": error,
+            "duration_seconds": duration,
+            "metadata": metadata or {},
+            "timestamp": datetime.now().isoformat(),
+        })
+    def generate_markdown(self) -> str:
+        """Generate markdown test report.
+
+        The report has a summary header, one section per recorded result
+        (responses are truncated to 500 characters), and a per-provider table
+        keyed by each result's preferred provider.
+
+        Returns:
+            The full report as a single markdown string.
+        """
+        total_tests = len(self.results)
+        passed = sum(1 for r in self.results if r["success"])
+        failed = total_tests - passed
+        success_rate = (passed / total_tests * 100) if total_tests > 0 else 0
+        # start()/end() may never have been called; report "n/a" instead of
+        # calling total_seconds() on None.
+        if self.start_time is not None and self.end_time is not None:
+            elapsed = self.end_time - self.start_time
+            duration_text = f"{elapsed.total_seconds():.2f}s"
+        else:
+            duration_text = "n/a"
+        generated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        header = f"""# AI Provider Test Report
+**Generated:** {generated}
+**Test Duration:** {duration_text}
+## Summary
+- **Total Tests:** {total_tests}
+- **Passed:** ✅ {passed}
+- **Failed:** ❌ {failed}
+- **Success Rate:** {success_rate:.1f}%
+## Test Results
+"""
+        # Collect fragments and join once instead of repeated string +=.
+        parts = [header]
+        for i, result in enumerate(self.results, 1):
+            status = "✅ PASS" if result["success"] else "❌ FAIL"
+            parts.append(f"### {i}. {result['test_name']} {status}\n\n")
+            parts.append(f"**Task Type:** {result['task_type']}  \n")
+            parts.append(f"**Provider:** {result['preferred_provider']}  \n")
+            parts.append(f"**Model:** {result['preferred_model']}  \n")
+            parts.append(f"**Duration:** {result['duration_seconds']:.2f}s  \n\n")
+            parts.append(f"**Prompt:**\n```\n{result['prompt']}\n```\n\n")
+            if result["success"]:
+                # A passing result recorded without a response renders as empty text.
+                response = result["response"] or ""
+                ellipsis = "..." if len(response) > 500 else ""
+                parts.append(f"**Response:**\n```\n{response[:500]}{ellipsis}\n```\n\n")
+                if result["metadata"]:
+                    parts.append("**Metadata:**\n")
+                    for key, value in result["metadata"].items():
+                        parts.append(f"- {key}: {value}\n")
+                    parts.append("\n")
+            else:
+                parts.append(f"**Error:**\n```\n{result['error']}\n```\n\n")
+            parts.append("---\n\n")
+        # Provider summary
+        parts.append("## Provider Performance\n\n")
+        providers = {}
+        for result in self.results:
+            stats = providers.setdefault(
+                result["preferred_provider"],
+                {"total": 0, "passed": 0, "total_duration": 0},
+            )
+            stats["total"] += 1
+            if result["success"]:
+                stats["passed"] += 1
+            stats["total_duration"] += result["duration_seconds"]
+        parts.append("| Provider | Tests | Passed | Failed | Success Rate | Avg Duration |\n")
+        parts.append("|----------|-------|--------|--------|--------------|-------------|\n")
+        for provider, stats in sorted(providers.items()):
+            provider_rate = (stats["passed"] / stats["total"] * 100) if stats["total"] > 0 else 0
+            avg_duration = stats["total_duration"] / stats["total"] if stats["total"] > 0 else 0
+            parts.append(
+                f"| {provider} | {stats['total']} | {stats['passed']} | "
+                f"{stats['total'] - stats['passed']} | {provider_rate:.1f}% | {avg_duration:.2f}s |\n"
+            )
+        return "".join(parts)
+async def run_tests():
+    """Run every case in TEST_PROMPTS through the router and write a report.
+
+    Builds a SmartModelRouter from the configured API keys, sends each prompt
+    to its preferred model with fallback disabled, records each outcome in a
+    TestReporter, writes the markdown report to docs/test/ next to this
+    script, and prints a summary.
+
+    Returns:
+        True when every test case succeeded, False otherwise.
+    """
+    banner = "=" * 80
+    print(banner)
+    print("AI Provider Comprehensive Test Suite")
+    print(banner)
+    print()
+    # Initialize settings and router
+    settings = get_settings()
+    print("Initializing model router...")
+    print(f"  NVIDIA API Key: {'[SET]' if settings.nvidia_api_key else '[NOT SET]'}")
+    print(f"  Groq API Key: {'[SET]' if settings.groq_api_key else '[NOT SET]'}")
+    print(f"  Google API Key: {'[SET]' if settings.google_api_key else '[NOT SET]'}")
+    print()
+    router = SmartModelRouter(
+        openai_api_key=settings.openai_api_key,
+        anthropic_api_key=settings.anthropic_api_key,
+        google_api_key=settings.google_api_key,
+        groq_api_key=settings.groq_api_key,
+        nvidia_api_key=settings.nvidia_api_key,
+    )
+    await router.initialize()
+    available_providers = list(router.providers)
+    print(f"Available providers: {', '.join(available_providers)}")
+    print()
+    reporter = TestReporter()
+    reporter.start()
+    # Run tests
+    case_count = len(TEST_PROMPTS)
+    for i, test_case in enumerate(TEST_PROMPTS, 1):
+        print(f"[{i}/{case_count}] Running: {test_case['name']}")
+        print(f"  Provider: {test_case['preferred_provider']}")
+        print(f"  Model: {test_case['preferred_model']}")
+        print(f"  Task Type: {test_case['task_type'].value}")
+        start_time = time.time()
+        try:
+            response = await router.complete(
+                messages=[{"role": "user", "content": test_case["prompt"]}],
+                model=test_case.get("preferred_model"),
+                task_type=test_case["task_type"],
+                fallback=False,  # No fallback - test only the requested model
+                max_tokens=500,
+                temperature=0.7,
+            )
+            duration = time.time() - start_time
+            if response and response.content:
+                print(f"  [OK] Success ({duration:.2f}s)")
+                print(f"  Response: {response.content[:100]}...")
+                reporter.add_result(
+                    test_case=test_case,
+                    success=True,
+                    response=response.content,
+                    duration=duration,
+                    metadata={
+                        "model_used": response.model,
+                        "provider_used": response.provider,
+                        "tokens": response.usage.total_tokens if response.usage else 0,
+                    }
+                )
+            else:
+                print("  [FAIL] Failed: Empty response")
+                reporter.add_result(
+                    test_case=test_case,
+                    success=False,
+                    error="Empty response from provider",
+                    duration=duration,
+                )
+        except Exception as e:
+            # Per-test boundary: any failure is recorded as a failed case so
+            # the remaining cases still run.
+            duration = time.time() - start_time
+            print(f"  [FAIL] Failed ({duration:.2f}s): {str(e)}")
+            reporter.add_result(
+                test_case=test_case,
+                success=False,
+                error=str(e),
+                duration=duration,
+            )
+        print()
+    reporter.end()
+    # Generate report
+    print(banner)
+    print("Generating test report...")
+    report_md = reporter.generate_markdown()
+    # Save report next to this script so the output location does not depend
+    # on the directory the script is launched from.
+    report_path = Path(__file__).resolve().parent / "docs" / "test" / "ai_provider_test_report.md"
+    report_path.parent.mkdir(parents=True, exist_ok=True)
+    report_path.write_text(report_md, encoding="utf-8")
+    print(f"[OK] Report saved to: {report_path}")
+    print()
+    # Print summary
+    total = len(reporter.results)
+    passed = sum(1 for r in reporter.results if r["success"])
+    failed = total - passed
+    # Guard the percentage so an empty TEST_PROMPTS list cannot divide by zero.
+    success_rate = (passed / total * 100) if total else 0.0
+    print(banner)
+    print("TEST SUMMARY")
+    print(banner)
+    print(f"Total Tests: {total}")
+    print(f"Passed: [OK] {passed}")
+    print(f"Failed: [X] {failed}")
+    print(f"Success Rate: {success_rate:.1f}%")
+    print(banner)
+    return passed == total
+if __name__ == "__main__":
+    # Exit status 0 when every test passed, 1 otherwise. SystemExit is raised
+    # directly because the bare exit() helper is installed by the site module
+    # and is not guaranteed to exist (e.g. under `python -S`).
+    success = asyncio.run(run_tests())
+    raise SystemExit(0 if success else 1)