File size: 11,286 Bytes

3f6526a

#!/usr/bin/env python3
"""
Step-by-step integration testing for EV2 Service.

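This script runs small-scale checks towards verifying that:
1. The notification mechanism works
2. Evolution results are identical with/without the service
3. The service doesn't interfere with evolution

Note: the evolution steps currently only construct the runner (no
generations are executed), so points 2 and 3 are covered only as far as
runner initialization; a full comparison is a manual follow-up.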
"""

import sys
import json
import time
import shutil
import requests
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))


def test_service_health(service_url="http://localhost:8765"):
    """Check that the EV2 service answers its status endpoint.

    Sends ``GET {service_url}/api/v1/status`` with a 2-second timeout and
    prints the reported status together with the notification / agent-run
    counters found under the ``statistics`` key.

    Args:
        service_url: Base URL of the service (no trailing slash). Defaults
            to the local address used throughout this script.

    Returns:
        True if the endpoint answered HTTP 200 and the body could be read,
        False on a non-200 answer, a refused connection, or any other error.
    """
    print("\n" + "=" * 60)
    print("TEST 1: Service Health Check")
    print("=" * 60)

    try:
        response = requests.get(f"{service_url}/api/v1/status", timeout=2)
        if response.status_code != 200:
            print(f"❌ Service returned HTTP {response.status_code}")
            return False

        # Read everything from the body before announcing success, so a
        # malformed body cannot produce a "running" line followed by an error.
        status = response.json()
        stats = status.get('statistics', {})
        # .get(): a body without a 'status' key is reported as 'unknown'
        # rather than surfacing as a confusing KeyError message.
        reported_status = status.get('status', 'unknown')

        print("✅ Service is running")
        print(f"   Status: {reported_status}")
        print(f"   Notifications: {stats.get('total_notifications', 0)}")
        print(f"   Agent runs: {stats.get('total_agent_runs', 0)}")
        return True
    except requests.exceptions.ConnectionError:
        print("❌ Service is not running")
        print("   Please start it with:")
        print("   uv run eval_agent/ev2_service_standalone.py --config eval_agent/ev2_service_config.yaml")
        return False
    except Exception as e:
        # Top-level boundary of this check: report and fail instead of crashing
        print(f"❌ Error: {e}")
        return False


def test_notification_mechanism(service_url="http://localhost:8765"):
    """Send one synthetic "generation complete" notification to the service.

    Posts a fixed payload (generation 1, score 0.5, a placeholder results
    directory) to ``{service_url}/api/v1/notify/generation_complete`` with a
    2-second timeout and prints the JSON answer.

    Args:
        service_url: Base URL of the service (no trailing slash). Defaults
            to the local address used throughout this script.

    Returns:
        True if the service answered HTTP 200 with a JSON body, False on any
        other status code or on any exception raised while sending.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Notification Mechanism")
    print("=" * 60)

    # Synthetic notification; the results directory is only a placeholder path
    payload = {
        "generation": 1,
        "results_dir": "/tmp/test_notification",
        "primary_score": 0.5,
    }

    try:
        response = requests.post(
            f"{service_url}/api/v1/notify/generation_complete",
            json=payload,
            timeout=2,
        )

        if response.status_code != 200:
            print(f"❌ Notification failed: HTTP {response.status_code}")
            return False

        result = response.json()
        print("✅ Notification sent successfully")
        print(f"   Response: {json.dumps(result, indent=2)}")
        return True
    except Exception as e:
        # Top-level boundary of this check: report and fail instead of crashing
        print(f"❌ Error sending notification: {e}")
        return False


def check_service_state(service_url="http://localhost:8765"):
    """Fetch and print the service's current counters.

    Sends ``GET {service_url}/api/v1/status`` with a 2-second timeout and
    prints the values found under the ``statistics`` key of the answer.

    Args:
        service_url: Base URL of the service (no trailing slash). Defaults
            to the local address used throughout this script.

    Returns:
        The decoded status body on HTTP 200, or None on a non-200 answer or
        on any exception.
    """
    print("\n" + "=" * 60)
    print("TEST 3: Service State After Notifications")
    print("=" * 60)

    try:
        response = requests.get(f"{service_url}/api/v1/status", timeout=2)
        if response.status_code != 200:
            # Include the status code so the failure can be diagnosed,
            # matching what the health check reports.
            print(f"❌ Failed to get status: HTTP {response.status_code}")
            return None

        status = response.json()
        stats = status.get('statistics', {})
        print("✅ Service state retrieved")
        print(f"   Total notifications: {stats.get('total_notifications', 0)}")
        print(f"   Agent runs: {stats.get('total_agent_runs', 0)}")
        print(f"   Generations tracked: {stats.get('generations_tracked', 0)}")
        print(f"   Last trigger gen: {stats.get('last_agent_trigger_gen', -1)}")
        return status
    except Exception as e:
        # Top-level boundary of this check: report and fail instead of crashing
        print(f"❌ Error: {e}")
        return None


def run_mini_evolution_without_service():
    """Construct an EvolutionRunner with no eval service attached (baseline).

    Despite the name, no generations are executed. The function recreates a
    scratch directory, writes a constant-score evaluator and a trivial seed
    program into it, builds the runner with ``eval_service_url=None`` and
    reports whether construction succeeded.

    Returns:
        True if the runner was constructed and its attributes could be
        printed, False if anything inside the construction step raised.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 4: Mini Evolution WITHOUT Service (Baseline)")
    print(banner)

    # Deferred imports: avoid loading the whole framework upfront
    from shinka.core import EvolutionRunner, EvolutionConfig
    from shinka.launch import LocalJobConfig
    from shinka.database import DatabaseConfig

    workdir = Path("/tmp/test_shinka_baseline")

    # Always start from an empty directory
    if workdir.exists():
        shutil.rmtree(workdir)
    workdir.mkdir(parents=True, exist_ok=True)

    print(f"📁 Results dir: {workdir}")

    # Evaluator stub that reports the same score on every invocation
    evaluator_source = """
import json
import sys

# Dummy evaluator: always returns same score
result = {
    "correct": {"correct": True},
    "metrics": {
        "combined_score": 0.5,
        "public": {"dummy_metric": 0.5},
        "private": {}
    }
}

print(json.dumps(result))
sys.exit(0)
"""
    evaluator_path = workdir / "eval_dummy.py"
    evaluator_path.write_text(evaluator_source)

    # Trivial seed program handed to the runner as the initial candidate
    seed_source = """
def dummy_function():
    return "Hello, World!"
"""
    seed_path = workdir / "initial.py"
    seed_path.write_text(seed_source)

    try:
        evolution_settings = EvolutionConfig(
            num_generations=3,  # deliberately tiny
            max_parallel_jobs=1,
            results_dir=str(workdir),
            init_program_path=str(seed_path),
            eval_service_url=None,  # baseline: no service
        )
        job_settings = LocalJobConfig(eval_program_path=str(evaluator_path))
        database_settings = DatabaseConfig(db_path="evolution.db")

        print("🚀 Starting evolution (3 generations)...")
        began_at = time.time()

        runner = EvolutionRunner(
            evo_config=evolution_settings,
            job_config=job_settings,
            db_config=database_settings,
            verbose=False,  # keep output quiet
        )

        # runner.run() is intentionally not called: no LLM is configured
        # here, so only successful initialization is checked.
        print("✅ Evolution runner initialized successfully")
        print(f"   - eval_service_url: {runner.eval_service_url}")
        print(f"   - results_dir: {runner.results_dir}")

        duration = time.time() - began_at
        print(f"⏱️  Initialization took {duration:.2f}s")

        return True

    except Exception as exc:
        import traceback

        print(f"❌ Evolution failed: {exc}")
        traceback.print_exc()
        return False


def run_mini_evolution_with_service(service_url="http://localhost:8765"):
    """Construct an EvolutionRunner pointed at the eval service and check its URL.

    No generations are executed. The function recreates a scratch directory,
    writes a constant-score evaluator and a trivial seed program into it,
    builds the runner with ``eval_service_url=service_url`` and verifies that
    the runner exposes that same URL.

    Args:
        service_url: Base URL handed to the runner's configuration. Defaults
            to the local address used throughout this script.

    Returns:
        True if the runner was constructed and reports the expected URL;
        False if construction raised or the reported URL differs.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Mini Evolution WITH Service (Should be Identical)")
    print("=" * 60)

    # Deferred imports: avoid loading the whole framework upfront
    from shinka.core import EvolutionRunner, EvolutionConfig
    from shinka.launch import LocalJobConfig
    from shinka.database import DatabaseConfig

    results_dir = Path("/tmp/test_shinka_with_service")

    # Clean up previous run
    if results_dir.exists():
        shutil.rmtree(results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    print(f"📁 Results dir: {results_dir}")

    # Evaluator stub that reports the same score on every invocation
    eval_script = results_dir / "eval_dummy.py"
    eval_script.write_text("""
import json
import sys

result = {
    "correct": {"correct": True},
    "metrics": {
        "combined_score": 0.5,
        "public": {"dummy_metric": 0.5},
        "private": {}
    }
}

print(json.dumps(result))
sys.exit(0)
""")

    # Trivial seed program handed to the runner as the initial candidate
    init_program = results_dir / "initial.py"
    init_program.write_text("""
def dummy_function():
    return "Hello, World!"
""")

    try:
        evo_config = EvolutionConfig(
            num_generations=3,
            max_parallel_jobs=1,
            results_dir=str(results_dir),
            init_program_path=str(init_program),
            eval_service_url=service_url,  # WITH SERVICE
        )

        job_config = LocalJobConfig(
            eval_program_path=str(eval_script),
        )

        db_config = DatabaseConfig(
            db_path="evolution.db"
        )

        print("🚀 Starting evolution (3 generations)...")
        start_time = time.time()

        runner = EvolutionRunner(
            evo_config=evo_config,
            job_config=job_config,
            db_config=db_config,
            verbose=False
        )

        print("✅ Evolution runner initialized successfully")
        print(f"   - eval_service_url: {runner.eval_service_url}")
        print(f"   - results_dir: {runner.results_dir}")

        elapsed = time.time() - start_time
        print(f"⏱️  Initialization took {elapsed:.2f}s")

        # Evolution itself is not run, so only the configured URL is checked.
        # Explicit comparison instead of `assert`: assertions are stripped
        # under `python -O`, which would let a mismatch pass silently.
        if runner.eval_service_url != service_url:
            print(
                "❌ Service URL mismatch: "
                f"expected {service_url!r}, got {runner.eval_service_url!r}"
            )
            return False
        print("✅ Service URL correctly configured")

        return True

    except Exception as e:
        import traceback

        print(f"❌ Evolution failed: {e}")
        traceback.print_exc()
        return False


def main():
    """Run every check in order and print a pass/fail summary.

    The health check runs first; if the service is not reachable the
    remaining checks are skipped, since the notification and state checks
    need it.

    Returns:
        0 if every check passed, 1 otherwise (including the case where the
        service is not running). Intended to be used as the process exit
        status.
    """
    print("\n" + "=" * 70)
    print("🧪 EV2 SERVICE INTEGRATION - STEP BY STEP TESTING")
    print("=" * 70)

    results = []

    # Test 1: Service health
    service_running = test_service_health()
    results.append(("Service Health", service_running))

    if not service_running:
        print("\n" + "=" * 70)
        print("⚠️  Service not running. Please start it first:")
        print("   Terminal 1:")
        # Generic hint: a developer-specific absolute path is wrong elsewhere
        print("   cd <path to your ShinkaEvolve checkout>")
        print("   uv run eval_agent/ev2_service_standalone.py --config eval_agent/ev2_service_config.yaml")
        print("=" * 70)
        return 1

    # Test 2: Notification mechanism
    notif_works = test_notification_mechanism()
    results.append(("Notification Mechanism", notif_works))

    # Test 3: Service state
    state = check_service_state()
    results.append(("Service State Check", state is not None))

    # Test 4: Evolution without service (baseline)
    baseline_works = run_mini_evolution_without_service()
    results.append(("Evolution WITHOUT Service", baseline_works))

    # Test 5: Evolution with service
    with_service_works = run_mini_evolution_with_service()
    results.append(("Evolution WITH Service", with_service_works))

    # Final service state: printed for information only, not scored.
    # The service is known to be running here (early return above).
    check_service_state()

    # Summary
    print("\n" + "=" * 70)
    print("📊 TEST SUMMARY")
    print("=" * 70)

    for test_name, passed in results:
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"  {status}  {test_name}")

    all_passed = all(passed for _, passed in results)

    print("=" * 70)
    if all_passed:
        print("🎉 All tests passed! Integration is working correctly.")
        print("\nNext steps:")
        print("  1. Run actual evolution experiment (without service first)")
        print("  2. Run same experiment with service enabled")
        print("  3. Compare results to verify they're identical")
    else:
        print("❌ Some tests failed. Please review errors above.")
    print("=" * 70)

    return 0 if all_passed else 1


if __name__ == "__main__":
    # Propagate the outcome so shells and CI can detect failed checks
    sys.exit(main())