Spaces:

sanjay7676
/

Team404_FORGE

Sleeping

App Files Files Community

Team404_FORGE / cli_demo.py

sanjay7676

Finalize eval-friendly defaults: offline baseline, deterministic API reset, docs cleanup

0c741d9 about 1 month ago

raw

history blame contribute delete

13.8 kB

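	# app.py
	# Main runner script for FORGE-v4.
	# By default runs one demo episode (improving_coder unless --coder says
	# otherwise) against the tiered BreakerAgent, then prints a structured
	# results report. --benchmark and --compare run the trainer modes instead.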

	import sys

	from env import FORGEEnv
	from memory import CoachMemory
	from agents import get_coder_code, coder_version_label
	from logger import log_episode, update_summary, print_log_paths, write_episode_report
	from config import DEFAULT_CANDIDATES_PER_STEP, STEPS_PER_EPISODE, ensure_runtime_dirs
	from policies.factory import build_policy
	from trainer import run_benchmark_mode, run_compare_mode


	# ──────────────────────────────────────────────
	# Demo configuration
	# ──────────────────────────────────────────────
	# Coder strategy used when --coder is not given. main() accepts
	# "weak_coder_v1", "weak_coder_v2" or "improving_coder".
	DEFAULT_CODER_VERSION = "improving_coder"
	# Policy name handed to build_policy() when --policy is not given.
	DEFAULT_POLICY = "heuristic"


	def _mean(values: list) -> float:
	    """Return the mean of *values* rounded to 4 decimals (0.0 when empty)."""
	    if not values:
	        return 0.0
	    return round(sum(values) / len(values), 4)


	def _print_step_report(
	    step_num: int,
	    total_steps: int,
	    coder_reward: dict,
	    breaker_reward: dict,
	    info: dict,
	    state: dict,
	) -> None:
	    """Print one step's coder/breaker scores, candidate ranking and latest adversarial input."""
	    print(f" ── Step {step_num}/{total_steps} [breaker: {info['breaker_tier_name']}]")
	    print(
	        f" Coder → pass_rate: {coder_reward['pass_rate']:.2f} "
	        f"| passes: {coder_reward['pass_count']} "
	        f"| fails: {coder_reward['fail_count']} "
	        f"| errors: {coder_reward['error_count']} "
	        f"| reward: {coder_reward['total_reward']:+.2f}"
	    )
	    print(
	        f" Breaker → break_rate: {breaker_reward['break_rate']:.2f} "
	        f"| breaks: {breaker_reward['breaks']} "
	        f"| no-break: {breaker_reward['passes']} "
	        f"| reward: {breaker_reward['total_reward']:+.2f}"
	    )
	    rankings = info.get("candidate_rankings", [])
	    if rankings:
	        # The first ranking entry is reported as the best candidate.
	        best = rankings[0]
	        print(
	            f" Candidate ranking → count: {len(rankings)} | "
	            f"selected_idx: {info.get('selected_candidate_index', -1)} | "
	            f"best pass_rate: {best['pass_rate']:.2f} | "
	            f"best runtime_ms: {best['avg_runtime_ms']:.2f}"
	        )
	    if state.get("recent_breaker_case") is not None:
	        print(f" Recent adversarial input: {state['recent_breaker_case']}")
	    print()


	def run_demo_episode(
	    coder_version: str = DEFAULT_CODER_VERSION,
	    policy_name: str = DEFAULT_POLICY,
	    candidates_per_step: int = DEFAULT_CANDIDATES_PER_STEP,
	    generate_metrics: bool = False,
	) -> None:
	    """
	    Execute one demo episode and print a rich results report.

	    The run creates a fresh ``CoachMemory`` and clears it, steps a ``FORGEEnv``
	    until the step budget is used up or the environment reports ``done``, then
	    records the episode through ``log_episode``, ``update_summary`` and
	    ``write_episode_report`` before printing the summary.

	    Args:
	        coder_version: Which coder strategy to use:
	            "weak_coder_v1" | "weak_coder_v2" | "improving_coder".
	        policy_name: Name passed to ``build_policy`` to select the policy
	            that generates candidate solutions.
	        candidates_per_step: Number of candidates requested from the policy
	            on every step.
	        generate_metrics: When True, call ``metrics.generate_charts()`` after
	            the summary and list the chart paths it returns.
	    """
	    # Read the step budget at call time. ``from config import
	    # STEPS_PER_EPISODE`` froze the value at import, so an override applied
	    # later to ``config.STEPS_PER_EPISODE`` (the ``--steps`` flag) would
	    # otherwise never reach this loop.
	    import config

	    total_steps = config.STEPS_PER_EPISODE
	    thin_rule = "─" * 60
	    thick_rule = "═" * 60

	    _banner()

	    ensure_runtime_dirs()
	    memory = CoachMemory()
	    memory.clear()  # Start fresh for the demo run
	    env = FORGEEnv(memory=memory)
	    policy = build_policy(policy_name, strategy=coder_version)
	    state = env.reset()

	    episode = state["episode"]
	    print("\n" + thin_rule)
	    print(f" Task ID : {state['task_id']}")
	    print(f" Episode : {episode}")
	    print(f" Coder : {coder_version_label(coder_version, episode)}")
	    print(f" Breaker : {env.breaker.tier_name} (starts here, tiers up during run)")
	    print(thin_rule)
	    print("\n Problem:\n")
	    print(f" {state['problem_description']}")
	    print()

	    # ── Accumulators ──────────────────────────────────────────────────────
	    ep_coder_rewards: list[float] = []
	    ep_breaker_rewards: list[float] = []
	    ep_pass_rates: list[float] = []
	    ep_fail_counts: list[int] = []
	    ep_error_counts: list[int] = []
	    ep_timeout_counts: list[int] = []
	    ep_break_rates: list[float] = []

	    for step_num in range(1, total_steps + 1):
	        # Build coder action; blank candidates are dropped and the scripted
	        # coder's code is used only when no candidate remains.
	        candidates = policy.generate_candidates(state, num_candidates=candidates_per_step)
	        candidate_solutions = [c.code for c in candidates if c.code.strip()]
	        fallback_code = get_coder_code(coder_version, episode=episode)
	        action = {
	            "coder_code": candidate_solutions[0] if candidate_solutions else fallback_code,
	            "candidate_solutions": candidate_solutions,
	            "coder_version": coder_version,
	        }

	        result = env.step(action)
	        state = result["state"]

	        cr = result["coder_reward"]
	        br = result["breaker_reward"]
	        info = result["info"]

	        # Accumulate
	        ep_coder_rewards.append(cr["total_reward"])
	        ep_breaker_rewards.append(br["total_reward"])
	        ep_pass_rates.append(cr["pass_rate"])
	        ep_fail_counts.append(cr["fail_count"])
	        ep_error_counts.append(cr["error_count"])
	        ep_timeout_counts.append(cr.get("timeout_count", 0))
	        ep_break_rates.append(br["break_rate"])

	        _print_step_report(step_num, total_steps, cr, br, info, state)

	        if result["done"]:
	            break

	    # ── Episode log ───────────────────────────────────────────────────────
	    log_episode(
	        episode=episode,
	        coder_version=coder_version,
	        breaker_tier=env.breaker.current_tier,
	        avg_coder_reward=_mean(ep_coder_rewards),
	        avg_breaker_reward=_mean(ep_breaker_rewards),
	        avg_pass_rate=_mean(ep_pass_rates),
	        total_fail_count=sum(ep_fail_counts),
	        total_error_count=sum(ep_error_counts),
	        total_timeout_count=sum(ep_timeout_counts),
	        avg_break_rate=_mean(ep_break_rates),
	        steps=env.step_count,
	    )

	    update_summary(
	        total_episodes=1,
	        coder_version=coder_version,
	        final_breaker_tier=env.breaker.current_tier,
	        all_coder_rewards=ep_coder_rewards,
	        all_breaker_rewards=ep_breaker_rewards,
	        all_pass_rates=ep_pass_rates,
	        all_break_rates=ep_break_rates,
	        coach_memory_summary=memory.summary(),
	    )

	    write_episode_report(
	        episode=episode,
	        payload={
	            "episode": episode,
	            "coder_version": coder_version,
	            "policy": policy.name,
	            "avg_coder_reward": _mean(ep_coder_rewards),
	            "avg_breaker_reward": _mean(ep_breaker_rewards),
	            "avg_pass_rate": _mean(ep_pass_rates),
	            "avg_break_rate": _mean(ep_break_rates),
	            "total_fail_count": sum(ep_fail_counts),
	            "total_error_count": sum(ep_error_counts),
	            "total_timeout_count": sum(ep_timeout_counts),
	            "steps": env.step_count,
	        },
	    )

	    # ── Final report ──────────────────────────────────────────────────────
	    print(thick_rule)
	    print(" EPISODE SUMMARY")
	    print(thick_rule)
	    print(f" Coder version : {coder_version_label(coder_version, episode)}")
	    print(f" Final breaker tier : {env.breaker.tier_name}")
	    print(f" Avg pass rate : {_mean(ep_pass_rates):.2f}")
	    print(f" Avg coder reward : {_mean(ep_coder_rewards):+.4f}")
	    print(f" Avg breaker reward : {_mean(ep_breaker_rewards):+.4f}")
	    print(f" Total fail count : {sum(ep_fail_counts)}")
	    print(f" Total error count : {sum(ep_error_counts)}")
	    print(f" Avg break rate : {_mean(ep_break_rates):.2f}")
	    print()
	    print(" Coach memory summary:")
	    summary = memory.summary()
	    print(f" Lessons stored : {summary.get('total_lessons', 0)}")
	    notes = summary.get("recent_coach_notes", [])
	    if notes:
	        print(" Recent coach notes:")
	        for note in notes:
	            print(f" • {note}")
	    print()
	    print(" Log files updated:")
	    print_log_paths()
	    if generate_metrics:
	        # Imported lazily so the metrics module is only loaded when charts
	        # are actually requested.
	        from metrics import generate_charts

	        chart_paths = generate_charts()
	        if chart_paths:
	            print(" Charts generated:")
	            for key, path in chart_paths.items():
	                print(f" - {key}: {path}")
	    print(thick_rule)


	# ──────────────────────────────────────────────
	# Helpers
	# ──────────────────────────────────────────────

	def _banner() -> None:
	    """Print a blank line followed by the boxed FORGE-v4 title banner."""
	    top = "╔══════════════════════════════════════════════════════════╗"
	    bottom = "╚══════════════════════════════════════════════════════════╝"
	    title = "FORGE-v4 | Adversarial Code Generation Environment"
	    # Pad the title to the border's inner width so the right-hand edge of the
	    # box lines up with the corners instead of falling short.
	    inner_width = len(top) - 2
	    print()
	    print(top)
	    print(f"║{title.center(inner_width)}║")
	    print(bottom)


	def _print_help() -> None:
	    """Print the command-line usage text, one line per entry, to stdout."""
	    help_lines = (
	        "Usage: python app.py [OPTIONS]",
	        "",
	        "Options:",
	        " --coder VERSION Coder strategy to use:",
	        " weak_coder_v1 (bubble sort — slow/weak)",
	        " weak_coder_v2 (selection sort + abs() bug)",
	        " improving_coder (adapts each episode) [default]",
	        " --steps N Override STEPS_PER_EPISODE for this run",
	        # Plain "|" separators: "\|" is an invalid escape sequence and would
	        # print a literal backslash.
	        " --policy NAME Defender policy: heuristic | api | local | offline | model",
	        " --candidates N Candidate solutions to evaluate per step",
	        " --charts Generate trend charts in outputs/",
	        " --benchmark N Run benchmark mode for N episodes (minimum 20)",
	        " --compare Run baseline heuristic vs model policy comparison",
	        " --help / -h Show this message",
	    )
	    for line in help_lines:
	        print(line)


	# ──────────────────────────────────────────────
	# Entry point
	# ──────────────────────────────────────────────

	def main() -> None:
	    """Parse command-line flags from ``sys.argv`` and run the selected mode.

	    ``--help``/``-h`` prints the usage text and exits with status 0.
	    ``--compare`` and ``--benchmark N`` run the corresponding trainer mode,
	    print its report and exit with status 0; ``--compare`` wins when both are
	    given. Otherwise a single demo episode is run via ``run_demo_episode``.
	    A malformed or missing option value prints an error and exits with
	    status 1.
	    """
	    # ``from config import STEPS_PER_EPISODE`` copied the value into this
	    # module at import time, so a ``--steps`` override has to rebind this
	    # module's name as well as ``config``'s for the demo loop to see it.
	    global STEPS_PER_EPISODE

	    args = sys.argv[1:]

	    if "--help" in args or "-h" in args:
	        _print_help()
	        sys.exit(0)

	    coder_version = DEFAULT_CODER_VERSION
	    policy_name = DEFAULT_POLICY
	    candidates_per_step = DEFAULT_CANDIDATES_PER_STEP

	    if "--coder" in args:
	        idx = args.index("--coder")
	        try:
	            coder_version = args[idx + 1]
	        except IndexError:
	            print("Error: --coder requires a version argument.")
	            sys.exit(1)
	        valid = ("weak_coder_v1", "weak_coder_v2", "improving_coder")
	        if coder_version not in valid:
	            print(f"Error: unknown coder version '{coder_version}'. Choose from: {valid}")
	            sys.exit(1)

	    if "--steps" in args:
	        idx = args.index("--steps")
	        try:
	            steps_override = int(args[idx + 1])
	        except (IndexError, ValueError):
	            print("Error: --steps requires an integer argument.")
	            sys.exit(1)
	        import config

	        config.STEPS_PER_EPISODE = steps_override
	        STEPS_PER_EPISODE = steps_override

	    if "--policy" in args:
	        idx = args.index("--policy")
	        # "mock" is accepted here although the help and error text do not list it.
	        valid_policies = ("heuristic", "api", "local", "offline", "mock", "model")
	        try:
	            policy_name = args[idx + 1].strip().lower()
	        except IndexError:
	            policy_name = None
	        if policy_name not in valid_policies:
	            print("Error: --policy must be one of: heuristic, api, local, offline, model.")
	            sys.exit(1)

	    if "--candidates" in args:
	        idx = args.index("--candidates")
	        try:
	            # Values below 1 are clamped to 1 rather than rejected.
	            candidates_per_step = max(1, int(args[idx + 1]))
	        except (IndexError, ValueError):
	            print("Error: --candidates requires an integer >= 1.")
	            sys.exit(1)

	    if "--compare" in args:
	        report = run_compare_mode(
	            model_policy_name="model",
	            episodes=20,
	            candidates_per_step=candidates_per_step,
	            verbose=False,
	        )
	        improvement = report["improvement"]
	        print("Comparison complete")
	        print(f" Pass-rate delta : {improvement['pass_rate_delta']:+.4f}")
	        print(f" Defender reward delta: {improvement['defender_reward_delta']:+.4f}")
	        print(f" Adversary reward delta: {improvement['adversary_reward_delta']:+.4f}")
	        print(f" Tier Progression Delta: {improvement['max_tier_delta']:+d}")
	        print(" Judge assets exported to outputs/")
	        sys.exit(0)

	    if "--benchmark" in args:
	        idx = args.index("--benchmark")
	        try:
	            benchmark_episodes = int(args[idx + 1])
	        except (IndexError, ValueError):
	            print("Error: --benchmark requires an integer argument.")
	            sys.exit(1)

	        report = run_benchmark_mode(
	            policy_name=policy_name,
	            episodes=benchmark_episodes,
	            candidates_per_step=candidates_per_step,
	            verbose=False,
	        )
	        print("Benchmark complete")
	        print(f" Episodes: {report['episodes']}")
	        for row in report.get("rows", []):
	            print(
	                f" Ep {row['episode']:>3} | pass={row['pass_rate']:.2f} "
	                f"| defender={row['defender_reward']:+.2f} "
	                f"| adversary={row['adversary_reward']:+.2f} "
	                f"| rank={row['chosen_candidate_rank']} "
	                f"| tier={row['tier_progression']}"
	            )
	        print(" Judge assets exported to outputs/")
	        sys.exit(0)

	    run_demo_episode(
	        coder_version=coder_version,
	        policy_name=policy_name,
	        candidates_per_step=candidates_per_step,
	        generate_metrics=("--charts" in args),
	    )


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()