Spaces:

NeerajCodz
/

scrapeRL

Sleeping

File size: 47,936 Bytes

#!/usr/bin/env python3
"""
Comprehensive ScrapeRL System Test Suite

Tests all components at LOW, MID, and HIGH complexity levels:
- Scraper environment and actions
- Reward function calculations
- Plugin system
- Embeddings with Gemini
- Vector search (memory)
- AI providers (NVIDIA, Groq)
- API endpoints

Author: ScrapeRL Test Suite
"""

import asyncio
import json
import sys
import os
import time
from datetime import datetime
from typing import Any
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path

# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))

# Load environment variables
from dotenv import load_dotenv
load_dotenv()


class TestComplexity(str, Enum):
    LOW = "low"
    MID = "mid"
    HIGH = "high"


@dataclass
class TestResult:
    """Individual test result."""
    name: str
    complexity: TestComplexity
    component: str
    passed: bool
    duration: float
    details: dict[str, Any] = field(default_factory=dict)
    error: str | None = None


class TestReporter:
    """Generates comprehensive test reports."""
    
    def __init__(self):
        self.results: list[TestResult] = []
        self.start_time: datetime = datetime.now()
    
    def add_result(self, result: TestResult):
        self.results.append(result)
        status = "✅ PASS" if result.passed else "❌ FAIL"
        print(f"  [{result.complexity.value.upper()}] {result.name}: {status} ({result.duration:.2f}s)")
        if result.error:
            print(f"      Error: {result.error[:100]}")
    
    def generate_report(self) -> str:
        """Generate markdown test report."""
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds()
        
        passed = sum(1 for r in self.results if r.passed)
        failed = sum(1 for r in self.results if not r.passed)
        success_rate = (passed / len(self.results) * 100) if self.results else 0
        
        report = f"""# ScrapeRL Comprehensive Test Report

**Generated:** {end_time.strftime('%Y-%m-%d %H:%M:%S')}  
**Test Duration:** {duration:.2f}s  

## Summary

- **Total Tests:** {len(self.results)}
- **Passed:** ✅ {passed}
- **Failed:** ❌ {failed}
- **Success Rate:** {success_rate:.1f}%

## Tests by Complexity

"""
        # Group by complexity
        for complexity in TestComplexity:
            comp_results = [r for r in self.results if r.complexity == complexity]
            if comp_results:
                comp_passed = sum(1 for r in comp_results if r.passed)
                report += f"### {complexity.value.upper()} Complexity ({comp_passed}/{len(comp_results)} passed)\n\n"
                
                for result in comp_results:
                    status = "✅ PASS" if result.passed else "❌ FAIL"
                    report += f"#### {result.name} {status}\n\n"
                    report += f"**Component:** {result.component}  \n"
                    report += f"**Duration:** {result.duration:.2f}s  \n\n"
                    
                    if result.details:
                        report += "**Details:**\n```json\n"
                        report += json.dumps(result.details, indent=2, default=str)[:1000]
                        report += "\n```\n\n"
                    
                    if result.error:
                        report += f"**Error:**\n```\n{result.error[:500]}\n```\n\n"
                    
                    report += "---\n\n"

        # Component summary
        report += "## Component Summary\n\n"
        report += "| Component | Tests | Passed | Failed | Success Rate |\n"
        report += "|-----------|-------|--------|--------|-------------|\n"
        
        components = set(r.component for r in self.results)
        for comp in sorted(components):
            comp_results = [r for r in self.results if r.component == comp]
            comp_passed = sum(1 for r in comp_results if r.passed)
            comp_failed = len(comp_results) - comp_passed
            comp_rate = (comp_passed / len(comp_results) * 100) if comp_results else 0
            report += f"| {comp} | {len(comp_results)} | {comp_passed} | {comp_failed} | {comp_rate:.1f}% |\n"
        
        return report


class ScrapeRLTestSuite:
    """Comprehensive test suite for ScrapeRL."""
    
    def __init__(self):
        self.reporter = TestReporter()
    
    async def run_all_tests(self):
        """Run all tests."""
        print("\n" + "="*60)
        print("🧪 ScrapeRL Comprehensive Test Suite")
        print("="*60 + "\n")
        
        # Test categories
        test_categories = [
            ("Scraper Environment", self.test_scraper_environment),
            ("Reward Function", self.test_reward_function),
            ("Plugins System", self.test_plugins),
            ("Embeddings (Gemini)", self.test_embeddings),
            ("Vector Search / Memory", self.test_vector_search),
            ("AI Providers", self.test_ai_providers),
            ("API Endpoints", self.test_api_endpoints),
        ]
        
        for category_name, test_func in test_categories:
            print(f"\n📋 Testing: {category_name}")
            print("-" * 40)
            try:
                await test_func()
            except Exception as e:
                print(f"  ❌ Category failed: {e}")
        
        # Generate report
        report = self.reporter.generate_report()
        
        # Save report
        report_path = Path(__file__).parent.parent / "docs" / "test" / "comprehensive-test-report.md"
        report_path.parent.mkdir(parents=True, exist_ok=True)
        report_path.write_text(report, encoding='utf-8')
        
        print("\n" + "="*60)
        print(f"📊 Test Report saved to: {report_path}")
        passed = sum(1 for r in self.reporter.results if r.passed)
        total = len(self.reporter.results)
        print(f"✅ Final Results: {passed}/{total} tests passed ({passed/total*100:.1f}%)")
        print("="*60 + "\n")
        
        return self.reporter.results

    # =========================================================================
    # SCRAPER ENVIRONMENT TESTS
    # =========================================================================
    
    async def test_scraper_environment(self):
        """Test the scraper environment at different complexity levels."""
        
        # LOW: Basic environment creation and reset
        start = time.time()
        try:
            from app.core.env import WebScraperEnv
            from app.config import get_settings
            
            settings = get_settings()
            env = WebScraperEnv(episode_id="test-001", settings=settings)
            
            # Test reset
            obs, info = await env.reset(task_id="task_001")
            
            passed = obs is not None and info.get("episode_id") == "test-001"
            details = {
                "episode_id": info.get("episode_id"),
                "task_id": info.get("task_id"),
                "observation_fields": list(obs.__dict__.keys()) if obs else []
            }
            
            self.reporter.add_result(TestResult(
                name="Environment Reset",
                complexity=TestComplexity.LOW,
                component="Scraper",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Environment Reset",
                complexity=TestComplexity.LOW,
                component="Scraper",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # MID: Navigation and extraction actions
        start = time.time()
        try:
            from app.core.env import WebScraperEnv
            from app.core.action import Action, ActionType
            from app.config import get_settings
            
            settings = get_settings()
            env = WebScraperEnv(episode_id="test-002", settings=settings)
            await env.reset(task_id="task_001")
            
            # Navigate action
            nav_action = Action(
                action_type=ActionType.NAVIGATE,
                parameters={"url": "https://example.com"},
                reasoning="Testing navigation"
            )
            obs, reward, breakdown, terminated, truncated, info = await env.step(nav_action)
            
            # Extract action
            extract_action = Action(
                action_type=ActionType.EXTRACT_FIELD,
                parameters={"field_name": "product_name", "selector": "h1"},
                reasoning="Testing extraction"
            )
            obs2, reward2, breakdown2, terminated2, truncated2, info2 = await env.step(extract_action)
            
            passed = obs is not None and reward is not None and obs2 is not None
            details = {
                "nav_reward": reward,
                "extract_reward": reward2,
                "extracted_fields": len(obs2.extracted_so_far) if obs2 else 0,
                "current_url": obs.current_url if obs else None
            }
            
            self.reporter.add_result(TestResult(
                name="Navigation & Extraction",
                complexity=TestComplexity.MID,
                component="Scraper",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Navigation & Extraction",
                complexity=TestComplexity.MID,
                component="Scraper",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # HIGH: Full episode with multiple actions and completion
        start = time.time()
        try:
            from app.core.env import WebScraperEnv
            from app.core.action import Action, ActionType
            from app.config import get_settings
            
            settings = get_settings()
            env = WebScraperEnv(episode_id="test-003", settings=settings)
            await env.reset(task_id="task_001")
            
            actions = [
                Action(action_type=ActionType.NAVIGATE, parameters={"url": "https://example.com/product/123"}, reasoning="Navigate to product"),
                Action(action_type=ActionType.EXTRACT_FIELD, parameters={"field_name": "product_name"}, reasoning="Extract name"),
                Action(action_type=ActionType.EXTRACT_FIELD, parameters={"field_name": "price"}, reasoning="Extract price"),
                Action(action_type=ActionType.EXTRACT_FIELD, parameters={"field_name": "description"}, reasoning="Extract description"),
                Action(action_type=ActionType.DONE, parameters={"success": True}, reasoning="Task complete"),
            ]
            
            total_reward = 0
            final_obs = None
            for action in actions:
                obs, reward, breakdown, terminated, truncated, info = await env.step(action)
                total_reward += reward
                final_obs = obs
                if terminated or truncated:
                    break
            
            state = env.get_state()
            passed = state.get("is_terminal", False) and len(final_obs.extracted_so_far) >= 3
            details = {
                "total_reward": total_reward,
                "steps_taken": state.get("step_number", 0),
                "extracted_fields": len(final_obs.extracted_so_far) if final_obs else 0,
                "is_terminal": state.get("is_terminal", False),
                "status": state.get("status", "unknown")
            }
            
            self.reporter.add_result(TestResult(
                name="Full Episode Completion",
                complexity=TestComplexity.HIGH,
                component="Scraper",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Full Episode Completion",
                complexity=TestComplexity.HIGH,
                component="Scraper",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))

    # =========================================================================
    # REWARD FUNCTION TESTS
    # =========================================================================
    
    async def test_reward_function(self):
        """Test reward calculation at different complexity levels."""
        
        # LOW: Basic reward computation
        start = time.time()
        try:
            from app.core.reward import RewardEngine, RewardBreakdown
            from app.core.action import Action, ActionType
            from app.core.observation import Observation, TaskContext, ExtractedField
            from app.config import get_settings
            
            settings = get_settings()
            engine = RewardEngine(settings)
            
            # Create test observation
            prev_obs = Observation(
                episode_id="test",
                task_id="task_001",
                step_number=0,
                extraction_progress=0.0
            )
            new_obs = Observation(
                episode_id="test",
                task_id="task_001",
                step_number=1,
                extraction_progress=0.33,
                extracted_so_far=[
                    ExtractedField(field_name="product_name", value="Test Product", confidence=0.9)
                ]
            )
            action = Action(action_type=ActionType.EXTRACT_FIELD, parameters={"field_name": "product_name"})
            
            reward, breakdown = engine.compute_reward(action, prev_obs, new_obs, max_steps=50)
            
            passed = isinstance(reward, float) and isinstance(breakdown, RewardBreakdown)
            details = {
                "reward": reward,
                "accuracy": breakdown.accuracy,
                "efficiency": breakdown.efficiency,
                "completeness": breakdown.completeness,
                "total": breakdown.total
            }
            
            self.reporter.add_result(TestResult(
                name="Basic Reward Computation",
                complexity=TestComplexity.LOW,
                component="Reward",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Basic Reward Computation",
                complexity=TestComplexity.LOW,
                component="Reward",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # MID: Reward with ground truth accuracy
        start = time.time()
        try:
            from app.core.reward import RewardEngine
            from app.core.action import Action, ActionType
            from app.core.observation import Observation, ExtractedField
            from app.config import get_settings
            
            settings = get_settings()
            engine = RewardEngine(settings)
            engine.reset()
            
            # Test with ground truth
            ground_truth = {"product_name": "Test Product", "price": 99.99}
            
            prev_obs = Observation(episode_id="test", task_id="task_001", step_number=0, extraction_progress=0.0)
            new_obs = Observation(
                episode_id="test",
                task_id="task_001",
                step_number=1,
                extraction_progress=0.5,
                extracted_so_far=[
                    ExtractedField(field_name="product_name", value="Test Product", confidence=0.95),
                    ExtractedField(field_name="price", value=99.99, confidence=0.9),
                ]
            )
            action = Action(action_type=ActionType.EXTRACT_FIELD, parameters={"field_name": "price"})
            
            reward, breakdown = engine.compute_reward(action, prev_obs, new_obs, ground_truth=ground_truth, max_steps=50)
            
            passed = breakdown.accuracy == 1.0  # Perfect match
            details = {
                "reward": reward,
                "accuracy": breakdown.accuracy,
                "ground_truth_match": breakdown.accuracy == 1.0,
                "progress_bonus": breakdown.progress_bonus
            }
            
            self.reporter.add_result(TestResult(
                name="Reward with Ground Truth",
                complexity=TestComplexity.MID,
                component="Reward",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Reward with Ground Truth",
                complexity=TestComplexity.MID,
                component="Reward",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # HIGH: Terminal reward and penalties
        start = time.time()
        try:
            from app.core.reward import RewardEngine
            from app.core.observation import Observation, ExtractedField
            from app.config import get_settings
            
            settings = get_settings()
            engine = RewardEngine(settings)
            
            # Test terminal reward
            final_obs = Observation(
                episode_id="test",
                task_id="task_001",
                step_number=10,
                extraction_progress=1.0,
                extracted_so_far=[
                    ExtractedField(field_name="product_name", value="Test Product", confidence=0.95),
                    ExtractedField(field_name="price", value=99.99, confidence=0.9),
                    ExtractedField(field_name="description", value="Great product", confidence=0.85),
                ]
            )
            
            ground_truth = {"product_name": "Test Product", "price": 99.99, "description": "Great product"}
            
            terminal_reward, terminal_breakdown = engine.compute_terminal_reward(
                final_obs, success=True, ground_truth=ground_truth
            )
            
            passed = terminal_reward > 0 and terminal_breakdown.completeness == 1.0
            details = {
                "terminal_reward": terminal_reward,
                "completeness": terminal_breakdown.completeness,
                "accuracy": terminal_breakdown.accuracy,
                "efficiency": terminal_breakdown.efficiency,
                "progress_bonus": terminal_breakdown.progress_bonus
            }
            
            self.reporter.add_result(TestResult(
                name="Terminal Reward Calculation",
                complexity=TestComplexity.HIGH,
                component="Reward",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Terminal Reward Calculation",
                complexity=TestComplexity.HIGH,
                component="Reward",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))

    # =========================================================================
    # PLUGINS TESTS
    # =========================================================================
    
    async def test_plugins(self):
        """Test plugin system at different complexity levels."""
        
        # LOW: List plugins
        start = time.time()
        try:
            from app.api.routes.plugins import PLUGIN_REGISTRY, _installed_plugins
            
            total_plugins = sum(len(plugins) for plugins in PLUGIN_REGISTRY.values())
            categories = list(PLUGIN_REGISTRY.keys())
            
            passed = total_plugins > 0 and len(categories) > 0
            details = {
                "total_plugins": total_plugins,
                "categories": categories,
                "installed_count": len(_installed_plugins)
            }
            
            self.reporter.add_result(TestResult(
                name="List Plugins",
                complexity=TestComplexity.LOW,
                component="Plugins",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="List Plugins",
                complexity=TestComplexity.LOW,
                component="Plugins",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # MID: Install/uninstall plugin
        start = time.time()
        try:
            from app.api.routes.plugins import _installed_plugins, PLUGIN_REGISTRY
            
            # Find a plugin that's not installed
            test_plugin_id = None
            for plugins in PLUGIN_REGISTRY.values():
                for plugin in plugins:
                    if plugin["id"] not in _installed_plugins and "captcha" not in plugin["id"]:
                        test_plugin_id = plugin["id"]
                        break
                if test_plugin_id:
                    break
            
            if test_plugin_id:
                # Install
                _installed_plugins.add(test_plugin_id)
                is_installed = test_plugin_id in _installed_plugins
                
                # Uninstall
                _installed_plugins.discard(test_plugin_id)
                is_uninstalled = test_plugin_id not in _installed_plugins
                
                passed = is_installed and is_uninstalled
                details = {
                    "test_plugin": test_plugin_id,
                    "install_success": is_installed,
                    "uninstall_success": is_uninstalled
                }
            else:
                passed = True
                details = {"message": "No test plugin available (all installed)"}
            
            self.reporter.add_result(TestResult(
                name="Install/Uninstall Plugin",
                complexity=TestComplexity.MID,
                component="Plugins",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Install/Uninstall Plugin",
                complexity=TestComplexity.MID,
                component="Plugins",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # HIGH: Plugin categories and core plugins check
        start = time.time()
        try:
            from app.api.routes.plugins import PLUGIN_REGISTRY, _installed_plugins
            
            # Check that all categories have plugins
            categories_with_plugins = {cat: len(plugins) for cat, plugins in PLUGIN_REGISTRY.items()}
            
            # Check core plugins are installed
            core_plugins = {"mcp-browser", "mcp-search", "mcp-html", "skill-planner", "skill-navigator", "skill-extractor", "skill-verifier", "proc-json"}
            core_installed = core_plugins.intersection(_installed_plugins)
            
            # Check AI providers
            ai_providers = {"google-api", "groq-api", "nvidia-api"}
            ai_installed = ai_providers.intersection(_installed_plugins)
            
            passed = len(core_installed) >= 6 and len(ai_installed) >= 2
            details = {
                "categories": categories_with_plugins,
                "core_plugins_installed": list(core_installed),
                "ai_providers_installed": list(ai_installed),
                "total_installed": len(_installed_plugins)
            }
            
            self.reporter.add_result(TestResult(
                name="Plugin Categories & Core Plugins",
                complexity=TestComplexity.HIGH,
                component="Plugins",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Plugin Categories & Core Plugins",
                complexity=TestComplexity.HIGH,
                component="Plugins",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))

    # =========================================================================
    # EMBEDDINGS TESTS (Gemini)
    # =========================================================================
    
    async def test_embeddings(self):
        """Test embeddings service with Gemini."""
        
        # LOW: Create embeddings service
        start = time.time()
        try:
            from app.core.embeddings import EmbeddingsService, create_embeddings_service
            
            api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            model = os.getenv("GEMINI_MODEL_EMBEDDING", "models/gemini-embedding-2-preview")
            
            service = create_embeddings_service(
                provider="google",
                model=model,
                api_key=api_key
            )
            
            passed = service is not None and service.provider == "google"
            details = {
                "provider": service.provider,
                "model": service.model,
                "has_api_key": api_key is not None
            }
            
            self.reporter.add_result(TestResult(
                name="Create Embeddings Service",
                complexity=TestComplexity.LOW,
                component="Embeddings",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Create Embeddings Service",
                complexity=TestComplexity.LOW,
                component="Embeddings",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # MID: Generate single embedding
        start = time.time()
        try:
            from app.core.embeddings import create_embeddings_service
            import numpy as np
            
            api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            model = os.getenv("GEMINI_MODEL_EMBEDDING", "models/gemini-embedding-2-preview")
            
            service = create_embeddings_service(
                provider="google",
                model=model,
                api_key=api_key
            )
            
            # Generate embedding
            text = "This is a test document about web scraping and data extraction."
            embedding = await service.embed_text(text)
            
            passed = isinstance(embedding, np.ndarray) and len(embedding) > 0
            details = {
                "embedding_dim": len(embedding),
                "embedding_type": str(embedding.dtype),
                "text_length": len(text),
                "sample_values": embedding[:5].tolist() if len(embedding) > 5 else embedding.tolist()
            }
            
            self.reporter.add_result(TestResult(
                name="Generate Single Embedding",
                complexity=TestComplexity.MID,
                component="Embeddings",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Generate Single Embedding",
                complexity=TestComplexity.MID,
                component="Embeddings",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # HIGH: Batch embeddings and similarity search
        start = time.time()
        try:
            from app.core.embeddings import create_embeddings_service
            import numpy as np
            
            api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            model = os.getenv("GEMINI_MODEL_EMBEDDING", "models/gemini-embedding-2-preview")
            
            service = create_embeddings_service(
                provider="google",
                model=model,
                api_key=api_key
            )
            
            # Generate batch embeddings
            texts = [
                "Web scraping extracts data from websites",
                "Machine learning uses neural networks",
                "Data extraction from HTML pages",
            ]
            
            embeddings = await service.embed_batch(texts)
            query_embedding = await service.embed_query("scraping data from web")
            
            # Find most similar
            similar = service.find_most_similar(query_embedding, list(embeddings), top_k=2)
            
            passed = len(embeddings) == 3 and len(similar) == 2
            details = {
                "batch_size": len(texts),
                "embeddings_shape": embeddings.shape if hasattr(embeddings, 'shape') else len(embeddings),
                "top_match_index": similar[0][0] if similar else None,
                "top_match_score": similar[0][1] if similar else None,
                "similarity_ranking": [(idx, round(score, 4)) for idx, score in similar]
            }
            
            self.reporter.add_result(TestResult(
                name="Batch Embeddings & Similarity Search",
                complexity=TestComplexity.HIGH,
                component="Embeddings",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Batch Embeddings & Similarity Search",
                complexity=TestComplexity.HIGH,
                component="Embeddings",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))

    # =========================================================================
    # VECTOR SEARCH / MEMORY TESTS
    # =========================================================================
    
    async def test_vector_search(self):
        """Test vector search and memory system."""
        
        # LOW: Initialize memory manager
        start = time.time()
        try:
            from app.memory.manager import MemoryManager, MemoryType
            from app.config import get_settings
            
            settings = get_settings()
            manager = MemoryManager(settings)
            await manager.initialize()
            
            passed = manager.is_initialized
            stats = await manager.get_stats()
            details = {
                "initialized": manager.is_initialized,
                "short_term_stats": stats.short_term,
                "working_stats": stats.working,
                "long_term_stats": stats.long_term
            }
            
            self.reporter.add_result(TestResult(
                name="Initialize Memory Manager",
                complexity=TestComplexity.LOW,
                component="Memory",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Initialize Memory Manager",
                complexity=TestComplexity.LOW,
                component="Memory",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # MID: Store and retrieve from different memory types
        start = time.time()
        try:
            from app.memory.manager import MemoryManager, MemoryType
            from app.config import get_settings
            
            settings = get_settings()
            manager = MemoryManager(settings)
            await manager.initialize()
            
            # Test short-term memory
            await manager.store("test_key", "test_value", MemoryType.SHORT_TERM)
            short_term_result = await manager.retrieve("test_key", MemoryType.SHORT_TERM)
            
            # Test working memory
            await manager.store("thought_1", "This is a test thought", MemoryType.WORKING, priority=0.5)
            working_result = await manager.retrieve("thought_1", MemoryType.WORKING)
            
            # Test shared memory
            await manager.store("shared_key", {"data": "shared_value"}, MemoryType.SHARED)
            shared_result = await manager.retrieve("shared_key", MemoryType.SHARED)
            
            passed = (
                short_term_result == "test_value" and
                working_result == "This is a test thought" and
                shared_result == {"data": "shared_value"}
            )
            details = {
                "short_term": short_term_result,
                "working": working_result,
                "shared": shared_result
            }
            
            # Cleanup
            await manager.clear()
            
            self.reporter.add_result(TestResult(
                name="Store & Retrieve Memory",
                complexity=TestComplexity.MID,
                component="Memory",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Store & Retrieve Memory",
                complexity=TestComplexity.MID,
                component="Memory",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # HIGH: Long-term memory with vector search
        start = time.time()
        try:
            from app.memory.manager import MemoryManager, MemoryType
            from app.config import get_settings
            
            settings = get_settings()
            manager = MemoryManager(settings)
            await manager.initialize()
            
            # Store documents
            doc1 = await manager.remember("Web scraping extracts data from websites using automated tools")
            doc2 = await manager.remember("Machine learning models can predict outcomes based on data")
            doc3 = await manager.remember("Data extraction from HTML pages requires parsing the DOM")
            
            # Search
            results = await manager.recall("scraping data from web", top_k=2)
            
            passed = len(results) >= 1 or manager.long_term._using_fallback
            details = {
                "documents_stored": 3,
                "search_results": len(results),
                "using_fallback": manager.long_term._using_fallback,
                "top_result_score": results[0].score if results else None
            }
            
            # Cleanup
            await manager.clear(MemoryType.LONG_TERM)
            
            self.reporter.add_result(TestResult(
                name="Long-term Memory & Vector Search",
                complexity=TestComplexity.HIGH,
                component="Memory",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Long-term Memory & Vector Search",
                complexity=TestComplexity.HIGH,
                component="Memory",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))

    # =========================================================================
    # AI PROVIDERS TESTS
    # =========================================================================
    
    async def test_ai_providers(self):
        """Test AI providers (NVIDIA, Groq)."""
        
        # LOW: Test NVIDIA provider initialization
        start = time.time()
        try:
            from app.models.router import SmartModelRouter
            
            nvidia_key = os.getenv("NVIDIA_API_KEY")
            groq_key = os.getenv("GROQ_API_KEY")
            google_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            
            router = SmartModelRouter(
                nvidia_api_key=nvidia_key,
                groq_api_key=groq_key,
                google_api_key=google_key
            )
            await router.initialize()
            
            providers = list(router.providers.keys())
            
            has_nvidia = "nvidia" in providers
            has_groq = "groq" in providers
            
            passed = has_nvidia or has_groq
            details = {
                "available_providers": providers,
                "has_nvidia": has_nvidia,
                "has_groq": has_groq,
                "nvidia_key_present": nvidia_key is not None,
                "groq_key_present": groq_key is not None
            }
            
            self.reporter.add_result(TestResult(
                name="AI Provider Initialization",
                complexity=TestComplexity.LOW,
                component="AI Providers",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="AI Provider Initialization",
                complexity=TestComplexity.LOW,
                component="AI Providers",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # MID: Test NVIDIA completion
        start = time.time()
        try:
            from app.models.router import SmartModelRouter
            from app.models.providers.base import TaskType
            
            nvidia_key = os.getenv("NVIDIA_API_KEY")
            groq_key = os.getenv("GROQ_API_KEY")
            google_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            
            router = SmartModelRouter(
                nvidia_api_key=nvidia_key,
                groq_api_key=groq_key,
                google_api_key=google_key
            )
            await router.initialize()
            
            messages = [{"role": "user", "content": "What is 2+2? Reply with just the number."}]
            
            response = await router.complete(
                messages=messages,
                task_type=TaskType.GENERAL,
                model="llama-3.3-70b",
                max_tokens=50,
                fallback=False
            )
            
            passed = response is not None and response.content is not None
            details = {
                "model_used": response.model if response else None,
                "provider_used": response.provider if response else None,
                "content_preview": response.content[:100] if response and response.content else None,
                "total_tokens": response.usage.total_tokens if response and response.usage else None
            }
            
            self.reporter.add_result(TestResult(
                name="NVIDIA Completion",
                complexity=TestComplexity.MID,
                component="AI Providers",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="NVIDIA Completion",
                complexity=TestComplexity.MID,
                component="AI Providers",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # HIGH: Test Groq completion and fallback
        start = time.time()
        try:
            from app.models.router import SmartModelRouter
            from app.models.providers.base import TaskType
            
            nvidia_key = os.getenv("NVIDIA_API_KEY")
            groq_key = os.getenv("GROQ_API_KEY")
            google_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
            
            router = SmartModelRouter(
                nvidia_api_key=nvidia_key,
                groq_api_key=groq_key,
                google_api_key=google_key
            )
            await router.initialize()
            
            messages = [{"role": "user", "content": "Write a Python function to calculate factorial. Be concise."}]
            
            # Test Groq
            response = await router.complete(
                messages=messages,
                task_type=TaskType.CODE,
                model="llama-3.3-70b-versatile",
                max_tokens=200,
                fallback=False
            )
            
            passed = response is not None and response.content is not None and "def" in response.content.lower()
            details = {
                "model_used": response.model if response else None,
                "provider_used": response.provider if response else None,
                "content_preview": response.content[:200] if response and response.content else None,
                "has_code": "def" in response.content.lower() if response and response.content else False
            }
            
            self.reporter.add_result(TestResult(
                name="Groq Code Generation",
                complexity=TestComplexity.HIGH,
                component="AI Providers",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Groq Code Generation",
                complexity=TestComplexity.HIGH,
                component="AI Providers",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))

    # =========================================================================
    # API ENDPOINTS TESTS
    # =========================================================================
    
    async def test_api_endpoints(self):
        """Test API endpoints."""
        
        # LOW: Test tasks endpoint
        start = time.time()
        try:
            from app.api.routes.tasks import TASK_REPOSITORY, list_tasks
            
            # Direct function call (simulating endpoint)
            response = await list_tasks()
            
            passed = response.total > 0 and len(response.tasks) > 0
            details = {
                "total_tasks": response.total,
                "tasks_returned": len(response.tasks),
                "task_ids": [t.id for t in response.tasks]
            }
            
            self.reporter.add_result(TestResult(
                name="List Tasks Endpoint",
                complexity=TestComplexity.LOW,
                component="API",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="List Tasks Endpoint",
                complexity=TestComplexity.LOW,
                component="API",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # MID: Test plugins endpoint
        start = time.time()
        try:
            from app.api.routes.plugins import list_plugins, list_installed_plugins
            
            all_plugins = await list_plugins()
            installed = await list_installed_plugins()
            
            passed = "plugins" in all_plugins and installed["count"] > 0
            details = {
                "total_plugins": all_plugins["stats"]["total"],
                "installed": installed["count"],
                "categories": all_plugins["categories"]
            }
            
            self.reporter.add_result(TestResult(
                name="Plugins Endpoint",
                complexity=TestComplexity.MID,
                component="API",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Plugins Endpoint",
                complexity=TestComplexity.MID,
                component="API",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))
        
        # HIGH: Test episode lifecycle
        start = time.time()
        try:
            from app.api.deps import create_environment, get_environment, remove_environment, list_environments
            from app.config import get_settings
            
            settings = get_settings()
            
            # Create environment
            episode_id = "api-test-001"
            env = create_environment(episode_id, settings)
            
            # Reset
            obs, info = await env.reset(task_id="task_001")
            
            # List
            envs = list_environments()
            
            # Get state
            state = env.get_state()
            
            # Remove
            removed = remove_environment(episode_id)
            
            passed = (
                episode_id in envs and
                state["task_id"] == "task_001" and
                removed
            )
            details = {
                "episode_id": episode_id,
                "task_id": state.get("task_id"),
                "environments_listed": len(envs),
                "removed": removed
            }
            
            self.reporter.add_result(TestResult(
                name="Episode Lifecycle",
                complexity=TestComplexity.HIGH,
                component="API",
                passed=passed,
                duration=time.time() - start,
                details=details
            ))
        except Exception as e:
            self.reporter.add_result(TestResult(
                name="Episode Lifecycle",
                complexity=TestComplexity.HIGH,
                component="API",
                passed=False,
                duration=time.time() - start,
                error=str(e)
            ))


async def main():
    """Run the test suite."""
    suite = ScrapeRLTestSuite()
    results = await suite.run_all_tests()
    
    # Return exit code based on test results
    passed = sum(1 for r in results if r.passed)
    total = len(results)
    
    return 0 if passed == total else 1


if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)