shinka-backup/eval_agent/design_draft/test_integration_step_by_step.py

Add files using upload-large-folder tool

3f6526a verified about 1 month ago

11.3 kB

	#!/usr/bin/env python3
	"""
	Step-by-step integration testing for EV2 Service.

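	This script runs small-scale checks toward verifying that:
	1. Notification mechanism works
	2. Evolution results are identical with/without service
	3. Service doesn't interfere with evolution

	Note: the evolution loop itself is not run here (no LLM is configured);
	tests 4 and 5 only verify that an EvolutionRunner can be initialized
	with and without an eval service URL.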
	"""

	import sys
	import json
	import time
	import shutil
	import requests
	from pathlib import Path

	# Add parent directory to path
	sys.path.insert(0, str(Path(__file__).parent.parent))


	def test_service_health():
	    """Probe the service status endpoint and report whether it answered.

	    Returns:
	        bool: True when the endpoint replies with HTTP 200 and the status
	        payload could be printed; False for any other status code, a
	        connection failure, or any other error.
	    """
	    rule = "=" * 60
	    print("\n" + rule)
	    print("TEST 1: Service Health Check")
	    print(rule)

	    base_url = "http://localhost:8765"
	    endpoint = f"{base_url}/api/v1/status"

	    try:
	        reply = requests.get(endpoint, timeout=2)

	        # Guard clause: anything other than 200 counts as unhealthy.
	        if reply.status_code != 200:
	            print(f"❌ Service returned HTTP {reply.status_code}")
	            return False

	        payload = reply.json()
	        print("✅ Service is running")
	        print(f" Status: {payload['status']}")
	        counters = payload.get('statistics', {})
	        print(f" Notifications: {counters.get('total_notifications', 0)}")
	        print(f" Agent runs: {counters.get('total_agent_runs', 0)}")
	        return True
	    except requests.exceptions.ConnectionError:
	        # Nothing is listening on the port: tell the user how to start it.
	        for hint in (
	            "❌ Service is not running",
	            " Please start it with:",
	            " uv run eval_agent/ev2_service_standalone.py --config eval_agent/ev2_service_config.yaml",
	        ):
	            print(hint)
	        return False
	    except Exception as exc:
	        print(f"❌ Error: {exc}")
	        return False


	def test_notification_mechanism():
	    """Post one sample generation-complete notification to the service.

	    Returns:
	        bool: True when the service answers HTTP 200 and its JSON reply
	        could be printed; False for any other status code or any error.
	    """
	    rule = "=" * 60
	    print("\n" + rule)
	    print("TEST 2: Notification Mechanism")
	    print(rule)

	    base_url = "http://localhost:8765"
	    endpoint = f"{base_url}/api/v1/notify/generation_complete"

	    # Sample notification body sent to the endpoint.
	    body = dict(
	        generation=1,
	        results_dir="/tmp/test_notification",
	        primary_score=0.5,
	    )

	    try:
	        reply = requests.post(endpoint, json=body, timeout=2)

	        # Guard clause: bail out early on any non-200 answer.
	        if reply.status_code != 200:
	            print(f"❌ Notification failed: HTTP {reply.status_code}")
	            return False

	        decoded = reply.json()
	        print("✅ Notification sent successfully")
	        pretty = json.dumps(decoded, indent=2)
	        print(f" Response: {pretty}")
	        return True
	    except Exception as exc:
	        print(f"❌ Error sending notification: {exc}")
	        return False


	def check_service_state():
	    """Fetch the service status and print its notification statistics.

	    Returns:
	        The decoded JSON status payload when the endpoint answers HTTP 200,
	        otherwise None (non-200 answer or any error).
	    """
	    rule = "=" * 60
	    print("\n" + rule)
	    print("TEST 3: Service State After Notifications")
	    print(rule)

	    base_url = "http://localhost:8765"
	    endpoint = f"{base_url}/api/v1/status"

	    try:
	        reply = requests.get(endpoint, timeout=2)

	        # Guard clause: no payload to report unless the answer is 200.
	        if reply.status_code != 200:
	            print("❌ Failed to get status")
	            return None

	        payload = reply.json()
	        counters = payload.get('statistics', {})
	        print("✅ Service state retrieved")
	        print(f" Total notifications: {counters.get('total_notifications', 0)}")
	        print(f" Agent runs: {counters.get('total_agent_runs', 0)}")
	        print(f" Generations tracked: {counters.get('generations_tracked', 0)}")
	        print(f" Last trigger gen: {counters.get('last_agent_trigger_gen', -1)}")
	        return payload
	    except Exception as exc:
	        print(f"❌ Error: {exc}")
	        return None


	def run_mini_evolution_without_service():
	    """Initialize a mini evolution run WITHOUT the eval service (baseline).

	    Writes a dummy evaluator and a dummy initial program into a fresh
	    ``/tmp/test_shinka_baseline`` directory, then constructs an
	    ``EvolutionRunner`` with ``eval_service_url=None``. The evolution loop
	    itself is not executed; only runner construction is verified.

	    Returns:
	        bool: True if the runner was constructed; False if ``shinka`` could
	        not be imported or if construction raised.
	    """
	    print("\n" + "=" * 60)
	    print("TEST 4: Mini Evolution WITHOUT Service (Baseline)")
	    print("=" * 60)

	    # Import here to avoid loading everything upfront. A missing install is
	    # reported as a failed test (this function's contract is a bool result)
	    # instead of propagating ImportError to the caller.
	    try:
	        from shinka.core import EvolutionRunner, EvolutionConfig
	        from shinka.launch import LocalJobConfig
	        from shinka.database import DatabaseConfig
	    except ImportError as e:
	        print(f"❌ Could not import shinka: {e}")
	        return False

	    # Create minimal config
	    results_dir = Path("/tmp/test_shinka_baseline")

	    # Clean up previous run
	    if results_dir.exists():
	        shutil.rmtree(results_dir)
	    results_dir.mkdir(parents=True, exist_ok=True)

	    print(f"📁 Results dir: {results_dir}")

	    # This is a minimal test - we'll use a dummy evaluator that just returns
	    # a constant score. The script is assembled from explicit lines so its
	    # indentation never depends on how this source file is indented.
	    eval_script = results_dir / "eval_dummy.py"
	    eval_script.write_text(
	        "\n".join(
	            (
	                "import json",
	                "import sys",
	                "",
	                "# Dummy evaluator: always returns same score",
	                "result = {",
	                '    "correct": {"correct": True},',
	                '    "metrics": {',
	                '        "combined_score": 0.5,',
	                '        "public": {"dummy_metric": 0.5},',
	                '        "private": {}',
	                "    }",
	                "}",
	                "",
	                "print(json.dumps(result))",
	                "sys.exit(0)",
	                "",
	            )
	        )
	    )

	    # Create dummy initial program (function body explicitly indented so the
	    # written file is valid Python).
	    init_program = results_dir / "initial.py"
	    init_program.write_text(
	        "\n".join(
	            (
	                "def dummy_function():",
	                '    return "Hello, World!"',
	                "",
	            )
	        )
	    )

	    try:
	        evo_config = EvolutionConfig(
	            num_generations=3,  # Very small for testing
	            max_parallel_jobs=1,
	            results_dir=str(results_dir),
	            init_program_path=str(init_program),
	            eval_service_url=None,  # NO SERVICE
	        )

	        job_config = LocalJobConfig(
	            eval_program_path=str(eval_script),
	        )

	        db_config = DatabaseConfig(
	            db_path="evolution.db"
	        )

	        print("🚀 Starting evolution (3 generations)...")
	        start_time = time.time()

	        runner = EvolutionRunner(
	            evo_config=evo_config,
	            job_config=job_config,
	            db_config=db_config,
	            verbose=False  # Reduce noise
	        )

	        # Note: runner.run() will fail because we don't have LLM configured
	        # For this test, we just verify initialization works
	        print("✅ Evolution runner initialized successfully")
	        print(f" - eval_service_url: {runner.eval_service_url}")
	        print(f" - results_dir: {runner.results_dir}")

	        elapsed = time.time() - start_time
	        print(f"⏱️ Initialization took {elapsed:.2f}s")

	        return True

	    except Exception as e:
	        # Top-level test boundary: report the failure and keep going.
	        print(f"❌ Evolution failed: {e}")
	        import traceback
	        traceback.print_exc()
	        return False


	def run_mini_evolution_with_service():
	    """Initialize a mini evolution run WITH the eval service URL configured.

	    Writes a dummy evaluator and a dummy initial program into a fresh
	    ``/tmp/test_shinka_with_service`` directory, constructs an
	    ``EvolutionRunner`` with ``eval_service_url`` set, and checks that the
	    runner exposes that same URL. The evolution loop itself is not executed.

	    Returns:
	        bool: True if the runner was constructed and reports the configured
	        service URL; False if ``shinka`` could not be imported, construction
	        raised, or the URL does not match.
	    """
	    print("\n" + "=" * 60)
	    print("TEST 5: Mini Evolution WITH Service (Should be Identical)")
	    print("=" * 60)

	    # A missing install is reported as a failed test (this function's
	    # contract is a bool result) instead of propagating ImportError.
	    try:
	        from shinka.core import EvolutionRunner, EvolutionConfig
	        from shinka.launch import LocalJobConfig
	        from shinka.database import DatabaseConfig
	    except ImportError as e:
	        print(f"❌ Could not import shinka: {e}")
	        return False

	    service_url = "http://localhost:8765"
	    results_dir = Path("/tmp/test_shinka_with_service")

	    # Clean up previous run
	    if results_dir.exists():
	        shutil.rmtree(results_dir)
	    results_dir.mkdir(parents=True, exist_ok=True)

	    print(f"📁 Results dir: {results_dir}")

	    # Create same dummy evaluator. The script is assembled from explicit
	    # lines so its indentation never depends on how this file is indented.
	    eval_script = results_dir / "eval_dummy.py"
	    eval_script.write_text(
	        "\n".join(
	            (
	                "import json",
	                "import sys",
	                "",
	                "result = {",
	                '    "correct": {"correct": True},',
	                '    "metrics": {',
	                '        "combined_score": 0.5,',
	                '        "public": {"dummy_metric": 0.5},',
	                '        "private": {}',
	                "    }",
	                "}",
	                "",
	                "print(json.dumps(result))",
	                "sys.exit(0)",
	                "",
	            )
	        )
	    )

	    # Create dummy initial program (function body explicitly indented so the
	    # written file is valid Python).
	    init_program = results_dir / "initial.py"
	    init_program.write_text(
	        "\n".join(
	            (
	                "def dummy_function():",
	                '    return "Hello, World!"',
	                "",
	            )
	        )
	    )

	    try:
	        evo_config = EvolutionConfig(
	            num_generations=3,
	            max_parallel_jobs=1,
	            results_dir=str(results_dir),
	            init_program_path=str(init_program),
	            eval_service_url=service_url,  # WITH SERVICE
	        )

	        job_config = LocalJobConfig(
	            eval_program_path=str(eval_script),
	        )

	        db_config = DatabaseConfig(
	            db_path="evolution.db"
	        )

	        print("🚀 Starting evolution (3 generations)...")
	        start_time = time.time()

	        runner = EvolutionRunner(
	            evo_config=evo_config,
	            job_config=job_config,
	            db_config=db_config,
	            verbose=False
	        )

	        print("✅ Evolution runner initialized successfully")
	        print(f" - eval_service_url: {runner.eval_service_url}")
	        print(f" - results_dir: {runner.results_dir}")

	        elapsed = time.time() - start_time
	        print(f"⏱️ Initialization took {elapsed:.2f}s")

	        # Check that service received notifications (if evolution actually ran)
	        # For now, just verify the runner has the URL set. An explicit check
	        # is used instead of `assert` so it still runs under `python -O` and
	        # produces a meaningful message on mismatch.
	        if runner.eval_service_url != service_url:
	            print(
	                f"❌ Service URL mismatch: expected {service_url}, "
	                f"got {runner.eval_service_url}"
	            )
	            return False
	        print("✅ Service URL correctly configured")

	        return True

	    except Exception as e:
	        # Top-level test boundary: report the failure and keep going.
	        print(f"❌ Evolution failed: {e}")
	        import traceback
	        traceback.print_exc()
	        return False


	def main():
	    """Run all tests in order and print a pass/fail summary.

	    The health check runs first; if the service is not reachable the
	    remaining tests are skipped and start-up instructions are printed.

	    Returns:
	        bool: True when every recorded test passed, False otherwise
	        (including the case where the service is not running).
	    """
	    print("\n" + "=" * 70)
	    print("🧪 EV2 SERVICE INTEGRATION - STEP BY STEP TESTING")
	    print("=" * 70)

	    results = []

	    # Test 1: Service health
	    service_running = test_service_health()
	    results.append(("Service Health", service_running))

	    if not service_running:
	        print("\n" + "=" * 70)
	        print("⚠️ Service not running. Please start it first:")
	        print(" Terminal 1:")
	        print(" cd /home/tengxiao/pj/ShinkaEvolve")
	        print(" uv run eval_agent/ev2_service_standalone.py --config eval_agent/ev2_service_config.yaml")
	        print("=" * 70)
	        return False

	    # Test 2: Notification mechanism
	    notif_works = test_notification_mechanism()
	    results.append(("Notification Mechanism", notif_works))

	    # Test 3: Service state
	    state = check_service_state()
	    results.append(("Service State Check", state is not None))

	    # Test 4: Evolution without service (baseline)
	    baseline_works = run_mini_evolution_without_service()
	    results.append(("Evolution WITHOUT Service", baseline_works))

	    # Test 5: Evolution with service
	    with_service_works = run_mini_evolution_with_service()
	    results.append(("Evolution WITH Service", with_service_works))

	    # Final service state: printed for inspection only, not part of the
	    # pass/fail results. The service is known to be running at this point
	    # (the not-running case returned above), so no extra guard is needed.
	    check_service_state()

	    # Summary
	    print("\n" + "=" * 70)
	    print("📊 TEST SUMMARY")
	    print("=" * 70)

	    for test_name, passed in results:
	        status = "✅ PASS" if passed else "❌ FAIL"
	        print(f" {status} {test_name}")

	    all_passed = all(passed for _, passed in results)

	    print("=" * 70)
	    if all_passed:
	        print("🎉 All tests passed! Integration is working correctly.")
	        print("\nNext steps:")
	        print(" 1. Run actual evolution experiment (without service first)")
	        print(" 2. Run same experiment with service enabled")
	        print(" 3. Compare results to verify they're identical")
	    else:
	        print("❌ Some tests failed. Please review errors above.")
	    print("=" * 70)

	    return all_passed


	if __name__ == "__main__":
	    # Propagate the outcome as the process exit status so shells and CI can
	    # detect failures (0 = all passed, 1 = failure or service unavailable).
	    sys.exit(0 if main() else 1)