shinka-backup / eval_agent /design_draft /test_integration_step_by_step.py
JustinTX's picture
Add files using upload-large-folder tool
3f6526a verified
#!/usr/bin/env python3
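"""
Step-by-step integration testing for EV2 Service.

This script checks the integration in small steps:
1. The service is running and its status endpoint responds
2. The notification mechanism works
3. An EvolutionRunner can be initialized both without and with the service URL

Note: the evolution loop itself is not executed here (no LLM is configured),
so this script does not yet verify that evolution results are identical
with/without the service. That comparison is listed as a next step in the
final summary.
"""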
import sys
import json
import time
import shutil
import requests
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
def test_service_health(service_url="http://localhost:8765", timeout=2):
    """Check that the EV2 service answers its status endpoint.

    Sends ``GET {service_url}/api/v1/status`` and prints the reported status
    together with the notification and agent-run counters.

    Args:
        service_url: Base URL of the service. Defaults to the local address
            used throughout this script.
        timeout: Request timeout in seconds.

    Returns:
        True if the endpoint answered HTTP 200 with a JSON body containing a
        ``status`` field; False otherwise, including when the service cannot
        be reached.
    """
    print("\n" + "=" * 60)
    print("TEST 1: Service Health Check")
    print("=" * 60)

    try:
        response = requests.get(f"{service_url}/api/v1/status", timeout=timeout)
        if response.status_code != 200:
            print(f"❌ Service returned HTTP {response.status_code}")
            return False

        status = response.json()
        # Report a malformed payload explicitly instead of letting a KeyError
        # surface as the cryptic message "Error: 'status'".
        if "status" not in status:
            print("❌ Service response is missing the 'status' field")
            return False

        print("✅ Service is running")
        print(f" Status: {status['status']}")
        stats = status.get('statistics', {})
        print(f" Notifications: {stats.get('total_notifications', 0)}")
        print(f" Agent runs: {stats.get('total_agent_runs', 0)}")
        return True
    except requests.exceptions.ConnectionError:
        print("❌ Service is not running")
        print(" Please start it with:")
        print(" uv run eval_agent/ev2_service_standalone.py --config eval_agent/ev2_service_config.yaml")
        return False
    except Exception as e:
        # Diagnostic boundary of this step: any other failure (timeout,
        # invalid JSON, ...) is reported as a failed check, not a crash.
        print(f"❌ Error: {e}")
        return False
def test_notification_mechanism(service_url="http://localhost:8765", timeout=2):
    """Send one synthetic "generation complete" notification to the service.

    Posts a fixed payload (generation 1, score 0.5, a dummy results
    directory) to ``{service_url}/api/v1/notify/generation_complete`` and
    prints the JSON response.

    Args:
        service_url: Base URL of the service. Defaults to the local address
            used throughout this script.
        timeout: Request timeout in seconds.

    Returns:
        True if the service answered HTTP 200 with a JSON body, False on any
        other status code or on any error while sending.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Notification Mechanism")
    print("=" * 60)

    # Synthetic notification; the results directory is only a label here and
    # is not created by this script.
    payload = {
        "generation": 1,
        "results_dir": "/tmp/test_notification",
        "primary_score": 0.5,
    }

    try:
        response = requests.post(
            f"{service_url}/api/v1/notify/generation_complete",
            json=payload,
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"❌ Notification failed: HTTP {response.status_code}")
            return False

        result = response.json()
        print("✅ Notification sent successfully")
        print(f" Response: {json.dumps(result, indent=2)}")
        return True
    except Exception as e:
        # Diagnostic boundary of this step: report instead of crashing.
        print(f"❌ Error sending notification: {e}")
        return False
def check_service_state(service_url="http://localhost:8765", timeout=2):
    """Fetch the service status and print its statistics counters.

    Args:
        service_url: Base URL of the service. Defaults to the local address
            used throughout this script.
        timeout: Request timeout in seconds.

    Returns:
        The parsed JSON status document on HTTP 200, or None if the request
        failed or returned another status code.
    """
    print("\n" + "=" * 60)
    print("TEST 3: Service State After Notifications")
    print("=" * 60)

    try:
        response = requests.get(f"{service_url}/api/v1/status", timeout=timeout)
        if response.status_code != 200:
            # Include the HTTP code, as the other checks in this file do.
            print(f"❌ Failed to get status: HTTP {response.status_code}")
            return None

        status = response.json()
        stats = status.get('statistics', {})
        print("✅ Service state retrieved")
        print(f" Total notifications: {stats.get('total_notifications', 0)}")
        print(f" Agent runs: {stats.get('total_agent_runs', 0)}")
        print(f" Generations tracked: {stats.get('generations_tracked', 0)}")
        print(f" Last trigger gen: {stats.get('last_agent_trigger_gen', -1)}")
        return status
    except Exception as e:
        # Diagnostic boundary of this step: report instead of crashing.
        print(f"❌ Error: {e}")
        return None
# Number of generations configured for the mini runs (kept tiny for testing).
_NUM_GENERATIONS = 3

# Throwaway evaluator: it always reports the same passing result, so both
# mini runs are configured against identical scores.
_DUMMY_EVALUATOR_SOURCE = """
import json
import sys
# Dummy evaluator: always returns same score
result = {
    "correct": {"correct": True},
    "metrics": {
        "combined_score": 0.5,
        "public": {"dummy_metric": 0.5},
        "private": {}
    }
}
print(json.dumps(result))
sys.exit(0)
"""

# Minimal initial program handed to the evolution runner.
_DUMMY_PROGRAM_SOURCE = """
def dummy_function():
    return "Hello, World!"
"""


def _prepare_results_dir(results_dir):
    """Recreate ``results_dir`` empty and write the dummy evaluator and program.

    Returns:
        Tuple ``(eval_script, init_program)`` of the two written paths.
    """
    # Clean up any previous run so each invocation starts from scratch.
    if results_dir.exists():
        shutil.rmtree(results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    eval_script = results_dir / "eval_dummy.py"
    eval_script.write_text(_DUMMY_EVALUATOR_SOURCE)
    init_program = results_dir / "initial.py"
    init_program.write_text(_DUMMY_PROGRAM_SOURCE)
    return eval_script, init_program


def _init_mini_evolution(results_dir, eval_service_url):
    """Build an EvolutionRunner for a tiny run and report whether that worked.

    Only initialization is exercised: ``runner.run()`` is not called because
    no LLM is configured for this test.

    Args:
        results_dir: Directory (a ``Path``) that is wiped and repopulated
            with the dummy evaluator and initial program.
        eval_service_url: URL passed to ``EvolutionConfig``; ``None`` means
            the run is configured without the eval service. When a URL is
            given, the runner must expose exactly that URL afterwards.

    Returns:
        True if the runner was constructed (and, when a URL was given, it
        reports that URL); False on any failure, which is printed together
        with a traceback.
    """
    try:
        # Imported lazily to avoid loading everything upfront, and inside the
        # try so that a missing or broken shinka install is reported as a
        # failed step instead of aborting the whole script.
        from shinka.core import EvolutionRunner, EvolutionConfig
        from shinka.launch import LocalJobConfig
        from shinka.database import DatabaseConfig

        eval_script, init_program = _prepare_results_dir(results_dir)
        print(f"📁 Results dir: {results_dir}")

        evo_config = EvolutionConfig(
            num_generations=_NUM_GENERATIONS,
            max_parallel_jobs=1,
            results_dir=str(results_dir),
            init_program_path=str(init_program),
            eval_service_url=eval_service_url,
        )
        job_config = LocalJobConfig(
            eval_program_path=str(eval_script),
        )
        db_config = DatabaseConfig(
            db_path="evolution.db"
        )

        print(f"🚀 Starting evolution ({_NUM_GENERATIONS} generations)...")
        start_time = time.time()
        runner = EvolutionRunner(
            evo_config=evo_config,
            job_config=job_config,
            db_config=db_config,
            verbose=False,  # Reduce noise
        )
        print("✅ Evolution runner initialized successfully")
        print(f" - eval_service_url: {runner.eval_service_url}")
        print(f" - results_dir: {runner.results_dir}")
        elapsed = time.time() - start_time
        print(f"⏱️ Initialization took {elapsed:.2f}s")

        if eval_service_url is not None:
            # Explicit check rather than `assert`, which is stripped when
            # Python runs with -O.
            if runner.eval_service_url != eval_service_url:
                raise RuntimeError(
                    f"runner.eval_service_url is {runner.eval_service_url!r}, "
                    f"expected {eval_service_url!r}"
                )
            print("✅ Service URL correctly configured")
        return True
    except Exception as e:
        print(f"❌ Evolution failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def run_mini_evolution_without_service():
    """Initialize a mini evolution without the eval service (baseline).

    Uses ``/tmp/test_shinka_baseline`` as the results directory, which is
    deleted and recreated on every call.

    Returns:
        True if the runner initialized successfully, False otherwise.
    """
    print("\n" + "=" * 60)
    print("TEST 4: Mini Evolution WITHOUT Service (Baseline)")
    print("=" * 60)
    return _init_mini_evolution(
        Path("/tmp/test_shinka_baseline"),
        eval_service_url=None,  # NO SERVICE
    )


def run_mini_evolution_with_service(service_url="http://localhost:8765"):
    """Initialize a mini evolution WITH the eval service URL configured.

    Uses ``/tmp/test_shinka_with_service`` as the results directory, which is
    deleted and recreated on every call. Whether the service actually
    receives notifications is not checked, because the evolution is not run;
    only the runner's configured URL is verified.

    Args:
        service_url: Eval service URL to configure on the runner.

    Returns:
        True if the runner initialized and reports ``service_url``, False
        otherwise.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Mini Evolution WITH Service (Should be Identical)")
    print("=" * 60)
    return _init_mini_evolution(
        Path("/tmp/test_shinka_with_service"),
        eval_service_url=service_url,  # WITH SERVICE
    )
def main():
    """Run every integration step in order and print a pass/fail summary.

    The service health check runs first; if the service is not reachable the
    remaining steps are skipped and start-up instructions are printed.

    Returns:
        0 if every step passed, 1 if any step failed or the service was not
        running. The value is used as the process exit code.
    """
    print("\n" + "=" * 70)
    print("🧪 EV2 SERVICE INTEGRATION - STEP BY STEP TESTING")
    print("=" * 70)

    results = []

    # Test 1: Service health
    service_running = test_service_health()
    results.append(("Service Health", service_running))
    if not service_running:
        print("\n" + "=" * 70)
        print("⚠️ Service not running. Please start it first:")
        print(" Terminal 1:")
        print(" cd /home/tengxiao/pj/ShinkaEvolve")
        print(" uv run eval_agent/ev2_service_standalone.py --config eval_agent/ev2_service_config.yaml")
        print("=" * 70)
        return 1

    # Test 2: Notification mechanism
    results.append(("Notification Mechanism", test_notification_mechanism()))

    # Test 3: Service state
    results.append(("Service State Check", check_service_state() is not None))

    # Test 4: Evolution without service (baseline)
    results.append(("Evolution WITHOUT Service", run_mini_evolution_without_service()))

    # Test 5: Evolution with service
    results.append(("Evolution WITH Service", run_mini_evolution_with_service()))

    # Final service state: printed for information only. The service is known
    # to be running here, since the not-running case returned above.
    check_service_state()

    # Summary
    print("\n" + "=" * 70)
    print("📊 TEST SUMMARY")
    print("=" * 70)
    for test_name, passed in results:
        label = "✅ PASS" if passed else "❌ FAIL"
        print(f" {label} {test_name}")

    all_passed = all(passed for _, passed in results)
    print("=" * 70)
    if all_passed:
        print("🎉 All tests passed! Integration is working correctly.")
        print("\nNext steps:")
        print(" 1. Run actual evolution experiment (without service first)")
        print(" 2. Run same experiment with service enabled")
        print(" 3. Compare results to verify they're identical")
    else:
        print("❌ Some tests failed. Please review errors above.")
    print("=" * 70)
    return 0 if all_passed else 1


if __name__ == "__main__":
    # Propagate the result so shells and CI can detect failed steps.
    sys.exit(main())