#!/usr/bin/env python3
"""
Universal experiment runner for ShinkaEvolve with Eval Service integration.
Usage: python run_experiment.py --experiment-name NAME [options]
Based on run_full_experiment.py pattern.
"""
import argparse
import sys
import time
from pathlib import Path
from datetime import datetime
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from shinka.core import EvolutionRunner, EvolutionConfig
from shinka.launch import LocalJobConfig
from shinka.database import DatabaseConfig
def parse_args():
    parser = argparse.ArgumentParser(
        description="Run ShinkaEvolve experiment with Eval Service",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Experiment configuration
    parser.add_argument("--experiment-name", type=str, required=True,
                        help="Experiment name (e.g., 'quick_test', 'full_50gen')")
    parser.add_argument("--num-generations", type=int, default=50,
                        help="Number of generations to evolve")
    parser.add_argument("--max-parallel-jobs", type=int, default=4,
                        help="Maximum parallel evaluation jobs")

    # Task configuration
    parser.add_argument("--task", type=str, default="circle_packing",
                        help="Task domain")
    parser.add_argument("--initial-code", type=str, default=None,
                        help="Path to initial code")
    parser.add_argument("--evaluator", type=str, default=None,
                        help="Path to evaluator")

    # Eval Service configuration
parser.add_argument("--use-eval-service", action="store_true", default=True,
help="Use eval service for evaluation")
parser.add_argument("--eval-service-url", type=str, default="http://localhost:8765",
help="Eval service URL")
parser.add_argument("--evaluator-module", type=str, default=None,
help="Evaluator module")
parser.add_argument("--evaluator-function", type=str, default="main",
help="Evaluator function name")
parser.add_argument("--eval-trigger-mode", type=str, default=None,
choices=["always", "periodic", "plateau", "mixed"],
help="Override eval service trigger mode for this experiment")
parser.add_argument("--eval-trigger-interval", type=int, default=None,
help="Override eval service trigger interval for this experiment")
# Database configuration
parser.add_argument("--num-islands", type=int, default=2,
help="Number of islands for island model")
parser.add_argument("--archive-size", type=int, default=40,
help="Archive size for elites")
# Meta configuration
parser.add_argument("--meta-interval", type=int, default=10,
help="Meta-summarizer interval (generations)")
parser.add_argument("--meta-max-recommendations", type=int, default=5,
help="Maximum meta recommendations")
parser.add_argument("--persistent-agents", action="store_true", default=False,
help="Enable optional long-horizon shared-memory agents for Shinka and eval service")
parser.add_argument("--persistent-context-refresh-interval", type=int, default=10,
help="How often persistent contexts are compacted and rebuilt")
parser.add_argument("--persistent-context-max-recent-attempts", type=int, default=12,
help="Max recent search attempts injected into persistent context")
parser.add_argument("--persistent-context-max-recent-insights", type=int, default=8,
help="Max recent eval insights injected into persistent context")
parser.add_argument("--persistent-invalid-burst-threshold", type=int, default=3,
help="Invalid-output threshold that forces a persistent-context refresh")
parser.add_argument("--persistent-invalid-burst-window", type=int, default=5,
help="Lookback window for invalid-output burst detection")
parser.add_argument("--use-text-feedback", dest="use_text_feedback", action="store_true", default=True,
help="Include evaluator text_feedback (including auxiliary metric descriptions) in mutation prompts")
parser.add_argument("--no-text-feedback", dest="use_text_feedback", action="store_false",
help="Disable text_feedback injection into mutation prompts")
# LLM configuration
parser.add_argument("--llm-models", type=str, nargs="+",
default=["native-gemini-2.5-flash", "native-gemini-2.5-pro"],
help="LLM models to use")
parser.add_argument("--llm-selection", type=str, default="ucb1",
choices=["ucb1", "thompson", "epsilon_greedy", "random"],
help="LLM dynamic selection strategy")
parser.add_argument("--llm-temperatures", type=float, nargs="+",
default=[0.5, 0.7, 1.0],
help="LLM temperatures")
parser.add_argument("--llm-max-tokens", type=int, default=65536,
help="LLM max tokens")
parser.add_argument("--trajectory-log", action="store_true", default=False,
help="Enable per-LLM-call trajectory logging for Shinka mutation loop")
parser.add_argument("--trajectory-log-dir", type=str, default="llm_trajectories",
help="Directory (relative to gen dir or absolute) for trajectory JSON files")
# Patch configuration
parser.add_argument("--patch-types", type=str, nargs="+",
default=["diff", "full", "cross"],
help="Patch types")
parser.add_argument("--patch-probs", type=float, nargs="+",
default=[0.6, 0.3, 0.1],
help="Patch type probabilities")
# WandB configuration
parser.add_argument("--use-wandb", action="store_true", default=False,
help="Enable WandB logging")
parser.add_argument("--wandb-project", type=str, default="ev2",
help="WandB project name")
parser.add_argument("--wandb-entity", type=str, default="tengxiao",
help="WandB entity")
parser.add_argument("--wandb-run-name", type=str, default=None,
help="WandB run name")
parser.add_argument("--wandb-tags", type=str, nargs="*", default=None,
help="WandB tags")
# Output configuration
parser.add_argument("--results-dir", type=str, default=None,
help="Results directory")
parser.add_argument("--verbose", action="store_true", default=True,
help="Verbose output")
return parser.parse_args()
def setup_defaults(args):
    """Fill in default values based on the task."""
    # Task-specific defaults
    if args.initial_code is None:
        args.initial_code = f"examples/{args.task}/initial.py"
    if args.evaluator is None:
        args.evaluator = f"examples/{args.task}/evaluate_ori.py"
    if args.evaluator_module is None:
        args.evaluator_module = f"examples.{args.task}.evaluate_ori"

    # Results directory
    if args.results_dir is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        args.results_dir = f"examples/{args.task}/results/results_{args.experiment_name}_{timestamp}"

    # WandB run name
    if args.use_wandb and args.wandb_run_name is None:
        args.wandb_run_name = f"{args.experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    return args
def check_eval_service(url):
    """Check whether the eval service is running."""
    import requests

    try:
        response = requests.get(f"{url}/api/v1/status", timeout=2.0)
        if response.status_code == 200:
            return True, response.json()
        return False, f"Unexpected status code: {response.status_code}"
    except Exception as e:
        return False, str(e)
def get_task_description(task):
    """Get task-specific description."""
    if task == "circle_packing":
        return """You are an expert mathematician specializing in circle packing problems and computational geometry.
Your task is to maximize the sum of radii when packing 26 circles in a unit square [0,1] x [0,1].
The best known result is 2.635 (sum of radii).

Key strategies to consider:
1. Efficient spatial distribution - avoid clustering
2. Utilize corners and edges effectively
3. Balance between many small circles vs fewer large circles
4. Consider geometric patterns: grid, hexagonal, concentric rings
5. Optimize placement to minimize wasted space

You will receive:
- Current code implementation
- Performance metrics (sum of radii)
- Circle center coordinates as text

Make improvements based on the numerical data and geometric reasoning.
Ensure all circles are disjoint and lie inside the unit square."""
    else:
        return "Solve the given task optimally."
def main():
    args = parse_args()
    args = setup_defaults(args)

    results_dir = Path(args.results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    # Print header
    print("=" * 80)
    print("πŸš€ ShinkaEvolve Experiment Runner")
    print("=" * 80)
    print(f"πŸ“… Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"πŸ”¬ Experiment: {args.experiment_name}")
    print(f"🎯 Task: {args.task}")
    print(f"πŸ“Š Generations: {args.num_generations}")
    print(f"⚑ Parallel Jobs: {args.max_parallel_jobs}")
    print(f"πŸ“ Results: {results_dir}")
    print("=" * 80)
    print()

    # Check the eval service before launching anything
    if args.use_eval_service:
        print(f"πŸ” Checking eval service at {args.eval_service_url}...")
        is_running, info = check_eval_service(args.eval_service_url)
        if is_running:
            print("βœ… Eval service is running")
            if isinstance(info, dict):
                print(f"   Status: {info.get('status', 'unknown')}")
        else:
            print(f"❌ Eval service not running: {info}")
            print()
            print("Please start the eval service first:")
            print("  bash scripts/dev/start_eval_server.sh")
            print()
            sys.exit(1)
        print()
    # Setup configurations
    job_config = LocalJobConfig(
        eval_program_path=args.evaluator,
    )
    db_config = DatabaseConfig(
        num_islands=args.num_islands,
        archive_size=args.archive_size,
        elite_selection_ratio=0.3,
        num_archive_inspirations=4,
        num_top_k_inspirations=2,
        migration_interval=10,
        migration_rate=0.1,
        island_elitism=True,
        parent_selection_strategy="weighted",
        parent_selection_lambda=10.0,
    )
    evo_config = EvolutionConfig(
        task_sys_msg=get_task_description(args.task),
        patch_types=args.patch_types,
        patch_type_probs=args.patch_probs,
        num_generations=args.num_generations,
        max_parallel_jobs=args.max_parallel_jobs,
        max_patch_resamples=3,
        max_patch_attempts=3,
        job_type="local",
        language="python",
        # LLM configuration
        llm_models=args.llm_models,
        llm_kwargs=dict(
            temperatures=args.llm_temperatures,
            max_tokens=args.llm_max_tokens,
        ),
        llm_dynamic_selection=args.llm_selection,
        llm_dynamic_selection_kwargs=dict(exploration_coef=1.0),
        # Meta configuration
        meta_rec_interval=args.meta_interval,
        meta_llm_models=["native-gemini-2.5-flash"],
        meta_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384),
        meta_max_recommendations=args.meta_max_recommendations,
        # Embedding for novelty
        embedding_model="text-embedding-3-small",
        code_embed_sim_threshold=0.995,
        novelty_llm_models=["native-gemini-2.5-flash"],
        novelty_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384),
        # Paths
        init_program_path=args.initial_code,
        results_dir=str(results_dir),
        use_text_feedback=args.use_text_feedback,
        # Eval Service
        eval_service_url=args.eval_service_url if args.use_eval_service else None,
        use_eval_service=args.use_eval_service,
        evaluator_module=args.evaluator_module if args.use_eval_service else None,
        evaluator_function=args.evaluator_function,
        eval_service_trigger_mode=args.eval_trigger_mode if args.use_eval_service else None,
        eval_service_trigger_interval=args.eval_trigger_interval if args.use_eval_service else None,
        # WandB
        enable_wandb=args.use_wandb,
        wandb_project=args.wandb_project if args.use_wandb else None,
        wandb_entity=args.wandb_entity if args.use_wandb else None,
        wandb_run_name=args.wandb_run_name if args.use_wandb else None,
        wandb_tags=args.wandb_tags if args.use_wandb else None,
        trajectory_log=args.trajectory_log,
        trajectory_log_dir=args.trajectory_log_dir,
        persistent_agents_enabled=args.persistent_agents,
        persistent_context_refresh_interval=args.persistent_context_refresh_interval,
        persistent_context_max_recent_attempts=args.persistent_context_max_recent_attempts,
        persistent_context_max_recent_insights=args.persistent_context_max_recent_insights,
        persistent_invalid_burst_threshold=args.persistent_invalid_burst_threshold,
        persistent_invalid_burst_window=args.persistent_invalid_burst_window,
    )
    # Print configuration summary
    print("πŸ“‹ Configuration Summary:")
    print(f"  β€’ Generations: {evo_config.num_generations}")
    print(f"  β€’ Parallel Jobs: {evo_config.max_parallel_jobs}")
    print(f"  β€’ Islands: {db_config.num_islands}")
    print(f"  β€’ Archive Size: {db_config.archive_size}")
    print(f"  β€’ Models: {', '.join(evo_config.llm_models)}")
    print(f"  β€’ LLM Selection: {evo_config.llm_dynamic_selection}")
    print(f"  β€’ Meta Interval: {evo_config.meta_rec_interval}")
    print(f"  β€’ Evaluator: {args.evaluator}")
    if args.use_eval_service:
        print(f"  β€’ Eval Service: {evo_config.eval_service_url} βœ…")
        if args.eval_trigger_mode is not None:
            print(f"  β€’ Eval Trigger Mode: {args.eval_trigger_mode}")
        if args.eval_trigger_interval is not None:
            print(f"  β€’ Eval Trigger Interval: {args.eval_trigger_interval}")
    if args.use_wandb:
        print(f"  β€’ WandB: {args.wandb_project}/{args.wandb_run_name}")
    if args.trajectory_log:
        print(f"  β€’ Trajectory Log: enabled ({args.trajectory_log_dir})")
    print(f"  β€’ Text Feedback in Prompt: {'enabled' if args.use_text_feedback else 'disabled'}")
    print(f"  β€’ Persistent Agents: {'enabled' if args.persistent_agents else 'disabled'}")
    print()
    # Confirmation
    try:
        input("Press Enter to start (Ctrl+C to cancel)...")
    except KeyboardInterrupt:
        print("\n❌ Cancelled")
        sys.exit(0)

    print()
    print("=" * 80)
    print("πŸƒ Starting Evolution")
    print("=" * 80)
    print()
    start_time = time.time()

    # Run evolution
    try:
        runner = EvolutionRunner(
            evo_config=evo_config,
            job_config=job_config,
            db_config=db_config,
        )
        runner.run()

        elapsed = time.time() - start_time
        print()
        print("=" * 80)
        print("βœ… Experiment Completed!")
        print("=" * 80)
        print(f"⏱️ Total time: {elapsed / 3600:.2f} hours")
        print(f"πŸ“ Results: {results_dir}")
        print()
        print("πŸ“Š Check:")
        print(f"  β€’ Best program: {results_dir}/best/")
        print(f"  β€’ Database: {results_dir}/evolution_db.sqlite")
        if args.use_eval_service:
            print(f"  β€’ Eval agent memory: {results_dir}/eval_agent_memory/")
            print(f"  β€’ Metrics docs: {results_dir}/eval_agent_memory/EVAL_AGENTS.md")
        if args.use_wandb:
            wandb_url = f"https://wandb.ai/{args.wandb_entity or 'YOUR_ENTITY'}/{args.wandb_project}"
            print(f"  β€’ WandB: {wandb_url}")
        print()
    except KeyboardInterrupt:
        print()
        print("=" * 80)
        print("⚠️ Experiment Interrupted")
        print("=" * 80)
        print(f"πŸ“ Partial results: {results_dir}")
        sys.exit(130)
    except Exception as e:
        print()
        print("=" * 80)
        print("❌ Experiment Failed")
        print("=" * 80)
        print(f"Error: {e}")
        print()
        import traceback
        traceback.print_exc()
        sys.exit(1)
if __name__ == "__main__":
main()