Add files using upload-large-folder tool

2facf1f verified about 1 month ago

6.47 kB

	#!/usr/bin/env python3
	"""
	Fork an existing multi-problem experiment at a given generation.

	Copies the first N generations (gen_0 through gen_{N-1}) from a baseline run
	into a new directory, so the runner can resume from gen N with a different
	configuration (e.g., with eval agent enabled).

	Usage:
	python scripts/fork_experiment.py \
	results/frontier_cs_algorithmic/vanilla_g50_20260327_055051 \
	results/frontier_cs_algorithmic/agent_fork_g5 \
	--fork-at 5

	This will:
	1. For each problem (p0, p1, ...) in the source:
	- Copy gen_0/ through gen_{fork_at-1}/ directories
	- Copy experiment_config.yaml
	- Create a new evolution_db.sqlite with only programs from gen 0..fork_at-1
	- Set last_iteration = fork_at-1 so the runner resumes from fork_at
	"""

	import argparse
	import shutil
	import sqlite3
	import sys
	from pathlib import Path


	def _fork_database(src_db: Path, dst_db: Path, fork_at: int, label: str) -> tuple:
	    """Write a copy of *src_db* to *dst_db* truncated to generations < fork_at.

	    Returns ``(n_programs, n_archive)`` as counted in the new database.
	    *label* is only used to prefix warning messages.
	    """
	    src_conn = sqlite3.connect(str(src_db))
	    try:
	        src_conn.row_factory = sqlite3.Row
	        dst_conn = sqlite3.connect(str(dst_db))
	        try:
	            # Recreate user tables. Internal tables (e.g. sqlite_sequence, which
	            # appears as soon as any table uses AUTOINCREMENT) are reserved and
	            # cannot be created by hand, so they are filtered out.
	            table_rows = src_conn.execute(
	                "SELECT sql FROM sqlite_master "
	                "WHERE type='table' AND sql IS NOT NULL "
	                "AND name NOT LIKE 'sqlite_%'"
	            ).fetchall()
	            for row in table_rows:
	                dst_conn.execute(row["sql"])

	            # Recreate explicit indexes (best effort: a failure here does not
	            # affect the copied data, so report it and carry on).
	            index_rows = src_conn.execute(
	                "SELECT name, sql FROM sqlite_master "
	                "WHERE type='index' AND sql IS NOT NULL"
	            ).fetchall()
	            for row in index_rows:
	                try:
	                    dst_conn.execute(row["sql"])
	                except sqlite3.OperationalError as exc:
	                    print(f" WARN {label}: index {row['name']} not recreated ({exc})")

	            # Copy programs from gen 0..fork_at-1
	            programs = src_conn.execute(
	                "SELECT * FROM programs WHERE generation < ?", (fork_at,)
	            ).fetchall()
	            if programs:
	                cols = programs[0].keys()
	                col_names = ", ".join(
	                    '"' + col.replace('"', '""') + '"' for col in cols
	                )
	                placeholders = ", ".join("?" for _ in cols)
	                dst_conn.executemany(
	                    f"INSERT INTO programs ({col_names}) VALUES ({placeholders})",
	                    [tuple(prog) for prog in programs],
	                )

	            # Copy archive entries that reference copied programs. The id is
	            # looked up by column name, not position, so the column order of
	            # the programs table does not matter.
	            program_ids = {prog["id"] for prog in programs}
	            archived = [
	                (row["program_id"],)
	                for row in src_conn.execute("SELECT program_id FROM archive")
	                if row["program_id"] in program_ids
	            ]
	            dst_conn.executemany(
	                "INSERT INTO archive (program_id) VALUES (?)", archived
	            )

	            # Set metadata
	            dst_conn.execute(
	                "INSERT OR REPLACE INTO metadata_store (key, value) VALUES (?, ?)",
	                ("last_iteration", str(fork_at - 1)),
	            )

	            # Find best program in the forked range
	            best_row = dst_conn.execute(
	                "SELECT id FROM programs WHERE combined_score = (SELECT MAX(combined_score) FROM programs) LIMIT 1"
	            ).fetchone()
	            if best_row:
	                dst_conn.execute(
	                    "INSERT OR REPLACE INTO metadata_store (key, value) VALUES (?, ?)",
	                    ("best_program_id", best_row[0]),
	                )

	            dst_conn.commit()

	            n_programs = dst_conn.execute("SELECT COUNT(*) FROM programs").fetchone()[0]
	            n_archive = dst_conn.execute("SELECT COUNT(*) FROM archive").fetchone()[0]
	            return n_programs, n_archive
	        finally:
	            dst_conn.close()
	    finally:
	        # Close even when a statement above raised, so no handle is leaked.
	        src_conn.close()


	def fork_problem(src_dir: Path, dst_dir: Path, fork_at: int) -> bool:
	    """Fork a single problem directory at the given generation.

	    Copies ``experiment_config.yaml`` (when present) and those of the
	    ``gen_0`` .. ``gen_{fork_at-1}`` directories that exist from *src_dir*
	    into *dst_dir*. It then creates ``evolution_db.sqlite`` in *dst_dir*
	    holding only the programs with ``generation < fork_at`` and the archive
	    entries that reference them. ``last_iteration`` is stored as
	    ``fork_at - 1``, and ``best_program_id`` as a copied program with the
	    highest ``combined_score`` (if any).

	    Args:
	        src_dir: Problem directory of the baseline run.
	        dst_dir: Problem directory to create for the fork.
	        fork_at: First generation that is NOT copied.

	    Returns:
	        ``True`` when the fork was written; ``False`` when *src_dir* has no
	        ``evolution_db.sqlite`` (nothing is created in that case).

	    Raises:
	        sqlite3.Error: if the source database lacks the ``programs``,
	            ``archive`` or ``metadata_store`` tables, or the destination
	            database cannot be written (e.g. it already has the tables).
	    """
	    problem_name = src_dir.name

	    # Check source DB exists
	    src_db = src_dir / "evolution_db.sqlite"
	    if not src_db.exists():
	        print(f" SKIP {problem_name}: no evolution_db.sqlite")
	        return False

	    dst_dir.mkdir(parents=True, exist_ok=True)

	    # 1. Copy experiment_config.yaml
	    src_config = src_dir / "experiment_config.yaml"
	    if src_config.exists():
	        shutil.copy2(src_config, dst_dir / "experiment_config.yaml")

	    # 2. Copy gen_0 through gen_{fork_at-1}
	    for gen in range(fork_at):
	        src_gen = src_dir / f"gen_{gen}"
	        if src_gen.exists():
	            shutil.copytree(src_gen, dst_dir / f"gen_{gen}", dirs_exist_ok=True)

	    # 3. Create forked DB with only programs from gen 0..fork_at-1
	    n_programs, n_archive = _fork_database(
	        src_db, dst_dir / "evolution_db.sqlite", fork_at, problem_name
	    )

	    print(f" OK {problem_name}: {n_programs} programs, {n_archive} archived, last_iteration={fork_at-1}")
	    return True


	def main():
	    """Command-line entry point: fork every selected problem of an experiment.

	    Parses ``source``, ``destination``, ``--fork-at`` and ``--problems`` from
	    ``sys.argv``, discovers the ``p<number>`` sub-directories of the source,
	    and calls ``fork_problem`` for each selected one.

	    Exits with status 2 on an invalid ``--fork-at``. Exits with status 1 when
	    the source is not a directory, the destination already exists, or there
	    is no problem directory to fork. In those cases the destination is not
	    created, so a corrected re-run is not blocked by a leftover empty
	    directory.
	    """
	    parser = argparse.ArgumentParser(
	        description="Fork an experiment at a given generation for controlled comparison"
	    )
	    parser.add_argument("source", help="Source experiment directory")
	    parser.add_argument("destination", help="Destination directory for forked experiment")
	    parser.add_argument("--fork-at", type=int, required=True,
	                        help="Fork at this generation (copies gen 0..fork_at-1)")
	    parser.add_argument("--problems", type=str, default=None,
	                        help="Comma-separated problem IDs to fork (e.g., 'p0,p1,p5'). Default: all")

	    args = parser.parse_args()

	    # A negative generation would store a meaningless last_iteration.
	    if args.fork_at < 0:
	        parser.error("--fork-at must be non-negative")

	    src_root = Path(args.source)
	    dst_root = Path(args.destination)

	    # is_dir() also rejects a regular file, which iterdir() below cannot handle.
	    if not src_root.is_dir():
	        print(f"Error: source directory not found: {src_root}", file=sys.stderr)
	        sys.exit(1)

	    if dst_root.exists():
	        print(f"Error: destination already exists: {dst_root}", file=sys.stderr)
	        print("Remove it first or choose a different path.", file=sys.stderr)
	        sys.exit(1)

	    # Discover problems: directories named p<number>, in numeric order.
	    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
	    all_problems = sorted(
	        (
	            d for d in src_root.iterdir()
	            if d.is_dir() and d.name.startswith("p") and d.name[1:].isdecimal()
	        ),
	        key=lambda d: int(d.name[1:]),
	    )

	    if args.problems:
	        # Tolerate "p0, p1" and trailing commas.
	        selected = {name.strip() for name in args.problems.split(",") if name.strip()}
	        unknown = selected - {d.name for d in all_problems}
	        if unknown:
	            print(
	                f"Warning: requested problems not found in source: {', '.join(sorted(unknown))}",
	                file=sys.stderr,
	            )
	        all_problems = [d for d in all_problems if d.name in selected]

	    if not all_problems:
	        print(f"Error: no problem directories to fork in {src_root}", file=sys.stderr)
	        sys.exit(1)

	    print(f"Forking experiment at generation {args.fork_at}")
	    print(f" Source: {src_root}")
	    print(f" Destination: {dst_root}")
	    print(f" Problems: {len(all_problems)}")
	    print()

	    dst_root.mkdir(parents=True, exist_ok=True)

	    success = sum(
	        1
	        for problem_dir in all_problems
	        if fork_problem(problem_dir, dst_root / problem_dir.name, args.fork_at)
	    )

	    print()
	    print(f"Done: {success}/{len(all_problems)} problems forked successfully")
	    print(f"Output: {dst_root}")
	    print()
	    print(f"To run with eval agent, point your experiment runner at {dst_root}/pN/")
	    print("The runner will auto-resume from gen", args.fork_at)


	# Run the command-line interface only when this file is executed as a script;
	# importing the module does not trigger main().
	if __name__ == "__main__":
	main()