File size: 16,913 Bytes
3f6526a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
#!/usr/bin/env python3
"""
Universal experiment runner for ShinkaEvolve with Eval Service integration.

Usage: python run_experiment.py --experiment-name NAME [options]

Based on run_full_experiment.py pattern.
"""

import argparse
import sys
import time
from pathlib import Path
from datetime import datetime

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from shinka.core import EvolutionRunner, EvolutionConfig
from shinka.launch import LocalJobConfig
from shinka.database import DatabaseConfig


def parse_args(argv=None):
    """Build the command-line parser and parse the experiment options.

    Boolean options that default to ``True`` come in ``--x`` / ``--no-x``
    pairs sharing one destination, so each of them can actually be
    switched off from the command line.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace holding one attribute per option defined below.
    """
    parser = argparse.ArgumentParser(
        description="Run ShinkaEvolve experiment with Eval Service",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Experiment configuration
    parser.add_argument("--experiment-name", type=str, required=True,
                        help="Experiment name (e.g., 'quick_test', 'full_50gen')")
    parser.add_argument("--num-generations", type=int, default=50,
                        help="Number of generations to evolve")
    parser.add_argument("--max-parallel-jobs", type=int, default=4,
                        help="Maximum parallel evaluation jobs")

    # Task configuration
    parser.add_argument("--task", type=str, default="circle_packing",
                        help="Task domain")
    parser.add_argument("--initial-code", type=str, default=None,
                        help="Path to initial code")
    parser.add_argument("--evaluator", type=str, default=None,
                        help="Path to evaluator")

    # Eval Service configuration
    parser.add_argument("--use-eval-service", dest="use_eval_service",
                        action="store_true", default=True,
                        help="Use eval service for evaluation")
    # Without this counterpart the flag above could never be turned off,
    # because a store_true option that defaults to True is always True.
    parser.add_argument("--no-eval-service", dest="use_eval_service",
                        action="store_false",
                        help="Do not use the eval service for evaluation")
    parser.add_argument("--eval-service-url", type=str, default="http://localhost:8765",
                        help="Eval service URL")
    parser.add_argument("--evaluator-module", type=str, default=None,
                        help="Evaluator module")
    parser.add_argument("--evaluator-function", type=str, default="main",
                        help="Evaluator function name")
    parser.add_argument("--eval-trigger-mode", type=str, default=None,
                        choices=["always", "periodic", "plateau", "mixed"],
                        help="Override eval service trigger mode for this experiment")
    parser.add_argument("--eval-trigger-interval", type=int, default=None,
                        help="Override eval service trigger interval for this experiment")

    # Database configuration
    parser.add_argument("--num-islands", type=int, default=2,
                        help="Number of islands for island model")
    parser.add_argument("--archive-size", type=int, default=40,
                        help="Archive size for elites")

    # Meta configuration
    parser.add_argument("--meta-interval", type=int, default=10,
                        help="Meta-summarizer interval (generations)")
    parser.add_argument("--meta-max-recommendations", type=int, default=5,
                        help="Maximum meta recommendations")
    parser.add_argument("--persistent-agents", action="store_true", default=False,
                        help="Enable optional long-horizon shared-memory agents for Shinka and eval service")
    parser.add_argument("--persistent-context-refresh-interval", type=int, default=10,
                        help="How often persistent contexts are compacted and rebuilt")
    parser.add_argument("--persistent-context-max-recent-attempts", type=int, default=12,
                        help="Max recent search attempts injected into persistent context")
    parser.add_argument("--persistent-context-max-recent-insights", type=int, default=8,
                        help="Max recent eval insights injected into persistent context")
    parser.add_argument("--persistent-invalid-burst-threshold", type=int, default=3,
                        help="Invalid-output threshold that forces a persistent-context refresh")
    parser.add_argument("--persistent-invalid-burst-window", type=int, default=5,
                        help="Lookback window for invalid-output burst detection")
    parser.add_argument("--use-text-feedback", dest="use_text_feedback",
                        action="store_true", default=True,
                        help="Include evaluator text_feedback (including auxiliary metric descriptions) in mutation prompts")
    parser.add_argument("--no-text-feedback", dest="use_text_feedback",
                        action="store_false",
                        help="Disable text_feedback injection into mutation prompts")

    # LLM configuration
    parser.add_argument("--llm-models", type=str, nargs="+",
                        default=["native-gemini-2.5-flash", "native-gemini-2.5-pro"],
                        help="LLM models to use")
    parser.add_argument("--llm-selection", type=str, default="ucb1",
                        choices=["ucb1", "thompson", "epsilon_greedy", "random"],
                        help="LLM dynamic selection strategy")
    parser.add_argument("--llm-temperatures", type=float, nargs="+",
                        default=[0.5, 0.7, 1.0],
                        help="LLM temperatures")
    parser.add_argument("--llm-max-tokens", type=int, default=65536,
                        help="LLM max tokens")
    parser.add_argument("--trajectory-log", action="store_true", default=False,
                        help="Enable per-LLM-call trajectory logging for Shinka mutation loop")
    parser.add_argument("--trajectory-log-dir", type=str, default="llm_trajectories",
                        help="Directory (relative to gen dir or absolute) for trajectory JSON files")

    # Patch configuration
    parser.add_argument("--patch-types", type=str, nargs="+",
                        default=["diff", "full", "cross"],
                        help="Patch types")
    parser.add_argument("--patch-probs", type=float, nargs="+",
                        default=[0.6, 0.3, 0.1],
                        help="Patch type probabilities")

    # WandB configuration
    parser.add_argument("--use-wandb", action="store_true", default=False,
                        help="Enable WandB logging")
    parser.add_argument("--wandb-project", type=str, default="ev2",
                        help="WandB project name")
    parser.add_argument("--wandb-entity", type=str, default="tengxiao",
                        help="WandB entity")
    parser.add_argument("--wandb-run-name", type=str, default=None,
                        help="WandB run name")
    parser.add_argument("--wandb-tags", type=str, nargs="*", default=None,
                        help="WandB tags")

    # Output configuration
    parser.add_argument("--results-dir", type=str, default=None,
                        help="Results directory")
    parser.add_argument("--verbose", dest="verbose", action="store_true", default=True,
                        help="Verbose output")
    # Same reasoning as --no-eval-service: gives the default-True flag an off switch.
    parser.add_argument("--no-verbose", dest="verbose", action="store_false",
                        help="Disable verbose output")

    return parser.parse_args(argv)


def setup_defaults(args):
    """Fill in task-derived defaults for options left unset on the command line.

    Paths and module names that are still ``None`` are derived from
    ``args.task``; the results directory and (when WandB is enabled) the
    WandB run name are derived from ``args.experiment_name`` plus a
    timestamp. Values that were given explicitly are left untouched.

    Args:
        args: Parsed options namespace. It is modified in place.

    Returns:
        The same ``args`` object, for call chaining.
    """
    task = args.task

    # Task-specific defaults
    if args.initial_code is None:
        args.initial_code = f"examples/{task}/initial.py"
    if args.evaluator is None:
        args.evaluator = f"examples/{task}/evaluate_ori.py"
    if args.evaluator_module is None:
        args.evaluator_module = f"examples.{task}.evaluate_ori"

    # Take the timestamp once so the results directory and the WandB run
    # name always carry the same suffix; two separate now() calls could
    # straddle a second boundary and make the two names disagree.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Results directory
    if args.results_dir is None:
        args.results_dir = (
            f"examples/{task}/results/results_{args.experiment_name}_{timestamp}"
        )

    # WandB run name
    if args.use_wandb and args.wandb_run_name is None:
        args.wandb_run_name = f"{args.experiment_name}_{timestamp}"

    return args


def check_eval_service(url):
    """Probe the eval service status endpoint.

    Issues a GET request to ``<url>/api/v1/status`` with a 2 second timeout.

    Args:
        url: Base URL of the eval service. A trailing slash is tolerated.

    Returns:
        A ``(is_running, info)`` tuple. On an HTTP 200 response it is
        ``(True, <decoded JSON body>)``. Otherwise it is ``(False, <str>)``
        where the string is either the exception text (connection failure,
        timeout, undecodable body, ...) or ``"HTTP <status code>"`` for a
        non-200 response.
    """
    # Imported here rather than at module level, so the module itself can be
    # imported without the requests package being installed.
    import requests

    # Strip a trailing slash so the request path never contains "//api/...".
    endpoint = url.rstrip("/") + "/api/v1/status"
    try:
        response = requests.get(endpoint, timeout=2.0)
        if response.status_code == 200:
            return True, response.json()
    except Exception as e:
        # Deliberately broad: this is a best-effort health probe and any
        # failure is reported to the caller as text instead of being raised.
        return False, str(e)
    # The service answered, just not with 200: surface the status code
    # instead of an opaque "Unknown error".
    return False, f"HTTP {response.status_code}"


def get_task_description(task):
    """Return the task-specific prompt text for ``task``.

    Only ``"circle_packing"`` has a dedicated description; every other
    task name yields a generic one-line instruction.
    """
    if task != "circle_packing":
        # No dedicated description for this task: generic fallback.
        return "Solve the given task optimally."

    return """You are an expert mathematician specializing in circle packing problems and computational geometry.

Your task is to maximize the sum of radii when packing 26 circles in a unit square [0,1] x [0,1].
The best known result is 2.635 (sum of radii).

Key strategies to consider:
1. Efficient spatial distribution - avoid clustering
2. Utilize corners and edges effectively
3. Balance between many small circles vs fewer large circles
4. Consider geometric patterns: grid, hexagonal, concentric rings
5. Optimize placement to minimize wasted space

You will receive:
- Current code implementation
- Performance metrics (sum of radii)
- Circle center coordinates as text

Make improvements based on the numerical data and geometric reasoning.
Ensure all circles are disjoint and lie inside the unit square."""


def _print_banner(title):
    """Print ``title`` framed by two 80-column rules."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)


def main():
    """Run one ShinkaEvolve experiment from the command line.

    Steps, in order:
      1. Parse the CLI options and fill in task-derived defaults.
      2. Create the results directory and print the experiment header.
      3. If the eval service is enabled, probe it and exit with status 1
         when it does not respond.
      4. Build the job, database and evolution configurations.
      5. Print a configuration summary and wait for Enter. Ctrl+C at the
         prompt exits with status 0; a closed or non-interactive stdin
         skips the prompt instead of crashing.
      6. Construct the EvolutionRunner and run it.

    Exit status: 130 when the run is interrupted with Ctrl+C, 1 when the
    run raises any other exception.
    """
    args = setup_defaults(parse_args())

    results_dir = Path(args.results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    # Print header
    _print_banner("🚀 ShinkaEvolve Experiment Runner")
    print(f"📅 Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"🔬 Experiment: {args.experiment_name}")
    print(f"🎯 Task: {args.task}")
    print(f"📊 Generations: {args.num_generations}")
    print(f"⚡ Parallel Jobs: {args.max_parallel_jobs}")
    print(f"📁 Results: {results_dir}")
    print("=" * 80)
    print()

    # Check eval service
    if args.use_eval_service:
        print(f"🔍 Checking eval service at {args.eval_service_url}...")
        is_running, info = check_eval_service(args.eval_service_url)
        if not is_running:
            print(f"❌ Eval service not running: {info}")
            print()
            print("Please start eval service first:")
            print("  bash scripts/dev/start_eval_server.sh")
            print()
            sys.exit(1)
        print("✅ Eval service is running")
        if isinstance(info, dict):
            print(f"   Status: {info.get('status', 'unknown')}")
        print()

    # Setup configurations
    job_config = LocalJobConfig(
        eval_program_path=args.evaluator,
    )

    db_config = DatabaseConfig(
        num_islands=args.num_islands,
        archive_size=args.archive_size,
        elite_selection_ratio=0.3,
        num_archive_inspirations=4,
        num_top_k_inspirations=2,
        migration_interval=10,
        migration_rate=0.1,
        island_elitism=True,
        parent_selection_strategy="weighted",
        parent_selection_lambda=10.0,
    )

    # Eval-service and WandB settings are passed as None when the
    # corresponding feature is switched off.
    evo_config = EvolutionConfig(
        task_sys_msg=get_task_description(args.task),
        patch_types=args.patch_types,
        patch_type_probs=args.patch_probs,
        num_generations=args.num_generations,
        max_parallel_jobs=args.max_parallel_jobs,
        max_patch_resamples=3,
        max_patch_attempts=3,
        job_type="local",
        language="python",

        # LLM configuration
        llm_models=args.llm_models,
        llm_kwargs=dict(
            temperatures=args.llm_temperatures,
            max_tokens=args.llm_max_tokens,
        ),
        llm_dynamic_selection=args.llm_selection,
        llm_dynamic_selection_kwargs=dict(exploration_coef=1.0),

        # Meta configuration
        meta_rec_interval=args.meta_interval,
        meta_llm_models=["native-gemini-2.5-flash"],
        meta_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384),
        meta_max_recommendations=args.meta_max_recommendations,

        # Embedding for novelty
        embedding_model="text-embedding-3-small",
        code_embed_sim_threshold=0.995,
        novelty_llm_models=["native-gemini-2.5-flash"],
        novelty_llm_kwargs=dict(temperatures=[0.7], max_tokens=16384),

        # Paths
        init_program_path=args.initial_code,
        results_dir=str(results_dir),
        use_text_feedback=args.use_text_feedback,

        # Eval Service
        eval_service_url=args.eval_service_url if args.use_eval_service else None,
        use_eval_service=args.use_eval_service,
        evaluator_module=args.evaluator_module if args.use_eval_service else None,
        evaluator_function=args.evaluator_function,
        eval_service_trigger_mode=args.eval_trigger_mode if args.use_eval_service else None,
        eval_service_trigger_interval=args.eval_trigger_interval if args.use_eval_service else None,

        # WandB
        enable_wandb=args.use_wandb,
        wandb_project=args.wandb_project if args.use_wandb else None,
        wandb_entity=args.wandb_entity if args.use_wandb else None,
        wandb_run_name=args.wandb_run_name if args.use_wandb else None,
        wandb_tags=args.wandb_tags if args.use_wandb else None,

        # Trajectory logging and persistent agents
        trajectory_log=args.trajectory_log,
        trajectory_log_dir=args.trajectory_log_dir,
        persistent_agents_enabled=args.persistent_agents,
        persistent_context_refresh_interval=args.persistent_context_refresh_interval,
        persistent_context_max_recent_attempts=args.persistent_context_max_recent_attempts,
        persistent_context_max_recent_insights=args.persistent_context_max_recent_insights,
        persistent_invalid_burst_threshold=args.persistent_invalid_burst_threshold,
        persistent_invalid_burst_window=args.persistent_invalid_burst_window,
    )

    # Print configuration summary
    print("📋 Configuration Summary:")
    print(f"  • Generations: {evo_config.num_generations}")
    print(f"  • Parallel Jobs: {evo_config.max_parallel_jobs}")
    print(f"  • Islands: {db_config.num_islands}")
    print(f"  • Archive Size: {db_config.archive_size}")
    print(f"  • Models: {', '.join(evo_config.llm_models)}")
    print(f"  • LLM Selection: {evo_config.llm_dynamic_selection}")
    print(f"  • Meta Interval: {evo_config.meta_rec_interval}")
    print(f"  • Evaluator: {args.evaluator}")
    if args.use_eval_service:
        print(f"  • Eval Service: {evo_config.eval_service_url} ✅")
        if args.eval_trigger_mode is not None:
            print(f"  • Eval Trigger Mode: {args.eval_trigger_mode}")
        if args.eval_trigger_interval is not None:
            print(f"  • Eval Trigger Interval: {args.eval_trigger_interval}")
    if args.use_wandb:
        print(f"  • WandB: {args.wandb_project}/{args.wandb_run_name}")
    if args.trajectory_log:
        print(f"  • Trajectory Log: enabled ({args.trajectory_log_dir})")
    print(f"  • Text Feedback in Prompt: {'enabled' if args.use_text_feedback else 'disabled'}")
    print(f"  • Persistent Agents: {'enabled' if args.persistent_agents else 'disabled'}")
    print()

    # Confirmation
    try:
        input("Press Enter to start (Ctrl+C to cancel)...")
    except KeyboardInterrupt:
        print("\n❌ Cancelled")
        sys.exit(0)
    except EOFError:
        # stdin is closed or not interactive (nohup, cron, CI): nobody can
        # press Enter, so continue instead of dying with a traceback.
        print("\n(no interactive stdin; starting without confirmation)")

    print()
    _print_banner("🏃 Starting Evolution")
    print()

    start_time = time.time()

    # Run evolution. Only the runner itself is guarded, so a problem while
    # printing the summary below is never misreported as a failed experiment.
    try:
        runner = EvolutionRunner(
            evo_config=evo_config,
            job_config=job_config,
            db_config=db_config,
        )
        runner.run()
    except KeyboardInterrupt:
        print()
        _print_banner("⚠️  Experiment Interrupted")
        print(f"📁 Partial results: {results_dir}")
        sys.exit(130)
    except Exception as e:
        # Top-level boundary: report the error with its traceback and exit non-zero.
        print()
        _print_banner("❌ Experiment Failed")
        print(f"Error: {e}")
        print()
        import traceback
        traceback.print_exc()
        sys.exit(1)

    elapsed = time.time() - start_time

    print()
    _print_banner("✅ Experiment Completed!")
    print(f"⏱️  Total time: {elapsed/3600:.2f} hours")
    print(f"📁 Results: {results_dir}")
    print()

    print("📊 Check:")
    print(f"  • Best program: {results_dir}/best/")
    print(f"  • Database: {results_dir}/evolution_db.sqlite")
    if args.use_eval_service:
        print(f"  • Eval agent memory: {results_dir}/eval_agent_memory/")
        print(f"  • Metrics docs: {results_dir}/eval_agent_memory/EVAL_AGENTS.md")
    if args.use_wandb:
        wandb_url = f"https://wandb.ai/{args.wandb_entity or 'YOUR_ENTITY'}/{args.wandb_project}"
        print(f"  • WandB: {wandb_url}")
    print()


# Script entry point: start the experiment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()