Spaces:
Sleeping
Sleeping
ksanjuma1234 committed on
Commit ·
656b5db
1
Parent(s): fc01d79
Enhance code generation environment with diverse coder strategies and tiered adversarial testing
Browse files
Implement multiple coder strategies (bubble sort, selection sort with a bug, and an improving coder) and a tiered breaker agent with progressively harder test cases. Add comprehensive logging for rewards and episode metrics. Improve environment state clarity and coach memory with human-readable lessons.
Replit-Commit-Author: Agent
Replit-Commit-Session-Id: a7518b1f-70c7-4487-82d2-42195935723e
Replit-Commit-Checkpoint-Type: full_checkpoint
Replit-Commit-Event-Id: 6f92db1c-7ebb-4a38-b6ed-3dc81054bda2
Replit-Helium-Checkpoint-Created: true
- FORGE-v4/agents.py +263 -0
- FORGE-v4/app.py +156 -35
- FORGE-v4/config.py +42 -14
- FORGE-v4/env.py +144 -65
- FORGE-v4/logger.py +191 -0
- FORGE-v4/logs/episodes.csv +3 -0
- FORGE-v4/logs/rewards.json +86 -0
- FORGE-v4/logs/summary.json +23 -0
- FORGE-v4/memory.py +101 -20
- FORGE-v4/trainer.py +115 -69
- attached_assets/Pasted-Upgrade-the-existing-FORGE-v4-project-from-starter-leve_1777106296176.txt +163 -0
- replit.md +40 -0
FORGE-v4/agents.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# agents.py
|
| 2 |
+
# Coder strategies and tiered Breaker agent for FORGE-v4.
|
| 3 |
+
#
|
| 4 |
+
# Coder strategies:
|
| 5 |
+
# weak_coder_v1 — bubble sort (O(n²), slow on large arrays)
|
| 6 |
+
# weak_coder_v2 — selection sort with a subtle bug on negatives
|
| 7 |
+
# improving_coder — picks strategy based on episode count
|
| 8 |
+
#
|
| 9 |
+
# Breaker agent:
|
| 10 |
+
# BreakerAgent — tiered adversarial test case generator
|
| 11 |
+
|
| 12 |
+
import random
|
| 13 |
+
from typing import Any
|
| 14 |
+
from config import (
|
| 15 |
+
ARRAY_VALUE_RANGE,
|
| 16 |
+
MAX_ARRAY_SIZE,
|
| 17 |
+
BREAKER_TIER_UNLOCK_RATE,
|
| 18 |
+
BREAKER_TIER3_MIN_EPISODE,
|
| 19 |
+
BREAKER_TIER4_MIN_EPISODE,
|
| 20 |
+
IMPROVING_CODER_TIER1_UNTIL,
|
| 21 |
+
IMPROVING_CODER_TIER2_UNTIL,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ══════════════════════════════════════════════
|
| 26 |
+
# CODER STRATEGIES
|
| 27 |
+
# ══════════════════════════════════════════════
|
| 28 |
+
|
| 29 |
+
# Each strategy returns a Python source string that defines solution(arr).
|
| 30 |
+
|
| 31 |
+
WEAK_CODER_V1_CODE = '''
|
| 32 |
+
def solution(arr):
|
| 33 |
+
"""Bubble sort — O(n²), fails slowly on large arrays."""
|
| 34 |
+
a = list(arr)
|
| 35 |
+
n = len(a)
|
| 36 |
+
for i in range(n):
|
| 37 |
+
for j in range(n - i - 1):
|
| 38 |
+
if a[j] > a[j + 1]:
|
| 39 |
+
a[j], a[j + 1] = a[j + 1], a[j]
|
| 40 |
+
return a
|
| 41 |
+
'''
|
| 42 |
+
|
| 43 |
+
WEAK_CODER_V2_CODE = '''
|
| 44 |
+
def solution(arr):
|
| 45 |
+
"""
|
| 46 |
+
Selection sort — correct for positive-only arrays.
|
| 47 |
+
Bug: uses abs() comparison, so negatives can end up out of order.
|
| 48 |
+
"""
|
| 49 |
+
a = list(arr)
|
| 50 |
+
n = len(a)
|
| 51 |
+
for i in range(n):
|
| 52 |
+
min_idx = i
|
| 53 |
+
for j in range(i + 1, n):
|
| 54 |
+
# BUG: comparing absolute values breaks negative ordering
|
| 55 |
+
if abs(a[j]) < abs(a[min_idx]):
|
| 56 |
+
min_idx = j
|
| 57 |
+
a[i], a[min_idx] = a[min_idx], a[i]
|
| 58 |
+
return a
|
| 59 |
+
'''
|
| 60 |
+
|
| 61 |
+
IMPROVING_CODER_TEMPLATE = '''
|
| 62 |
+
def solution(arr):
|
| 63 |
+
"""
|
| 64 |
+
Improving coder — strategy selected by episode {episode}.
|
| 65 |
+
Episode <= {tier1_until}: bubble sort (weakest)
|
| 66 |
+
Episode <= {tier2_until}: selection sort (medium)
|
| 67 |
+
Episode > {tier2_until}: built-in sorted (strongest)
|
| 68 |
+
"""
|
| 69 |
+
episode = {episode}
|
| 70 |
+
a = list(arr)
|
| 71 |
+
|
| 72 |
+
if episode <= {tier1_until}:
|
| 73 |
+
# Bubble sort
|
| 74 |
+
n = len(a)
|
| 75 |
+
for i in range(n):
|
| 76 |
+
for j in range(n - i - 1):
|
| 77 |
+
if a[j] > a[j + 1]:
|
| 78 |
+
a[j], a[j + 1] = a[j + 1], a[j]
|
| 79 |
+
return a
|
| 80 |
+
elif episode <= {tier2_until}:
|
| 81 |
+
# Selection sort with abs() bug
|
| 82 |
+
n = len(a)
|
| 83 |
+
for i in range(n):
|
| 84 |
+
min_idx = i
|
| 85 |
+
for j in range(i + 1, n):
|
| 86 |
+
if abs(a[j]) < abs(a[min_idx]):
|
| 87 |
+
min_idx = j
|
| 88 |
+
a[i], a[min_idx] = a[min_idx], a[i]
|
| 89 |
+
return a
|
| 90 |
+
else:
|
| 91 |
+
# Strong solution
|
| 92 |
+
return sorted(a)
|
| 93 |
+
'''
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def get_coder_code(version: str, episode: int = 1) -> str:
|
| 97 |
+
"""
|
| 98 |
+
Return the Python source code for the given coder version.
|
| 99 |
+
|
| 100 |
+
Args:
|
| 101 |
+
version: "weak_coder_v1" | "weak_coder_v2" | "improving_coder"
|
| 102 |
+
episode: current episode number (used by improving_coder)
|
| 103 |
+
"""
|
| 104 |
+
if version == "weak_coder_v1":
|
| 105 |
+
return WEAK_CODER_V1_CODE
|
| 106 |
+
|
| 107 |
+
if version == "weak_coder_v2":
|
| 108 |
+
return WEAK_CODER_V2_CODE
|
| 109 |
+
|
| 110 |
+
if version == "improving_coder":
|
| 111 |
+
return IMPROVING_CODER_TEMPLATE.format(
|
| 112 |
+
episode=episode,
|
| 113 |
+
tier1_until=IMPROVING_CODER_TIER1_UNTIL,
|
| 114 |
+
tier2_until=IMPROVING_CODER_TIER2_UNTIL,
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
raise ValueError(f"Unknown coder version: {version!r}")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def coder_version_label(version: str, episode: int) -> str:
|
| 121 |
+
"""Human-readable label for what strategy the coder is using this episode."""
|
| 122 |
+
if version == "weak_coder_v1":
|
| 123 |
+
return "weak_coder_v1 (bubble sort)"
|
| 124 |
+
if version == "weak_coder_v2":
|
| 125 |
+
return "weak_coder_v2 (selection sort / abs-bug)"
|
| 126 |
+
if version == "improving_coder":
|
| 127 |
+
if episode <= IMPROVING_CODER_TIER1_UNTIL:
|
| 128 |
+
return f"improving_coder → bubble sort (ep {episode} ≤ {IMPROVING_CODER_TIER1_UNTIL})"
|
| 129 |
+
if episode <= IMPROVING_CODER_TIER2_UNTIL:
|
| 130 |
+
return f"improving_coder → selection sort (ep {episode} ≤ {IMPROVING_CODER_TIER2_UNTIL})"
|
| 131 |
+
return f"improving_coder → sorted() (ep {episode} > {IMPROVING_CODER_TIER2_UNTIL})"
|
| 132 |
+
return version
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# ══════════════════════════════════════════════
|
| 136 |
+
# TIERED BREAKER AGENT
|
| 137 |
+
# ══════════════════════════════════════════════
|
| 138 |
+
|
| 139 |
+
# Test case banks per tier
|
| 140 |
+
_TIER1_CASES: list[list[int]] = [
|
| 141 |
+
[],
|
| 142 |
+
[1],
|
| 143 |
+
[2, 1],
|
| 144 |
+
[3, 2, 1],
|
| 145 |
+
[1, 2, 3],
|
| 146 |
+
]
|
| 147 |
+
|
| 148 |
+
_TIER2_CASES: list[list[int]] = [
|
| 149 |
+
[1, 1, 1, 1], # all duplicates
|
| 150 |
+
[2, 2, 1, 1, 3, 3], # duplicate pairs
|
| 151 |
+
[-5, -1, -3, -7, -2], # all negatives
|
| 152 |
+
[-3, 0, 3, -1, 1], # mixed sign
|
| 153 |
+
[1, 2, 3, 4, 5], # already sorted
|
| 154 |
+
[5, 4, 3, 2, 1], # reverse sorted
|
| 155 |
+
[0, 0, 0], # all zeros
|
| 156 |
+
]
|
| 157 |
+
|
| 158 |
+
_TIER3_CASES: list[list[int]] = [
|
| 159 |
+
list(range(MAX_ARRAY_SIZE, 0, -1)), # full reverse
|
| 160 |
+
[random.choice([1, 2]) for _ in range(MAX_ARRAY_SIZE)], # heavy duplicates
|
| 161 |
+
[random.randint(-100, 100) for _ in range(MAX_ARRAY_SIZE)], # large random
|
| 162 |
+
[0] * MAX_ARRAY_SIZE, # all zeros, large
|
| 163 |
+
list(range(MAX_ARRAY_SIZE)), # sorted ascending, large
|
| 164 |
+
]
|
| 165 |
+
|
| 166 |
+
_TIER4_CASES: list[list[int]] = [
|
| 167 |
+
[-100, 100], # boundary values only
|
| 168 |
+
[100, 100, 100, -100, -100, -100], # boundary duplicates
|
| 169 |
+
[-100] * 10 + [100] * 10, # boundary mixed
|
| 170 |
+
list(range(-10, 11)), # full range small
|
| 171 |
+
[random.randint(-100, 100) for _ in range(MAX_ARRAY_SIZE)], # stress random
|
| 172 |
+
]
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
class BreakerAgent:
|
| 176 |
+
"""
|
| 177 |
+
Adversarial test-case generator with four tiers of difficulty.
|
| 178 |
+
|
| 179 |
+
Tier unlocking rules:
|
| 180 |
+
Tier 2 → unlocks when the rolling-average break_rate >= BREAKER_TIER_UNLOCK_RATE
|
| 181 |
+
Tier 3 → unlocks when break_rate >= BREAKER_TIER_UNLOCK_RATE
|
| 182 |
+
AND episode >= BREAKER_TIER3_MIN_EPISODE
|
| 183 |
+
Tier 4 → unlocks when at tier 3 AND episode >= BREAKER_TIER4_MIN_EPISODE
|
| 184 |
+
|
| 185 |
+
The agent samples cases from all unlocked tiers, weighted toward the
|
| 186 |
+
current (highest) tier for maximum adversarial pressure.
|
| 187 |
+
"""
|
| 188 |
+
|
| 189 |
+
def __init__(self) -> None:
|
| 190 |
+
self.current_tier: int = 1
|
| 191 |
+
self._recent_break_rates: list[float] = []
|
| 192 |
+
|
| 193 |
+
def update_tier(self, break_rate: float, episode: int) -> None:
|
| 194 |
+
"""
|
| 195 |
+
Update the current tier based on recent performance and episode count.
|
| 196 |
+
|
| 197 |
+
Args:
|
| 198 |
+
break_rate: Breaker's break_rate from the last step.
|
| 199 |
+
episode: Current episode number.
|
| 200 |
+
"""
|
| 201 |
+
self._recent_break_rates.append(break_rate)
|
| 202 |
+
# Use rolling window of last 3 steps to smooth noise
|
| 203 |
+
recent = self._recent_break_rates[-3:]
|
| 204 |
+
avg_break = sum(recent) / len(recent)
|
| 205 |
+
|
| 206 |
+
if self.current_tier == 1 and avg_break >= BREAKER_TIER_UNLOCK_RATE:
|
| 207 |
+
self.current_tier = 2
|
| 208 |
+
|
| 209 |
+
if self.current_tier == 2 and (
|
| 210 |
+
avg_break >= BREAKER_TIER_UNLOCK_RATE
|
| 211 |
+
and episode >= BREAKER_TIER3_MIN_EPISODE
|
| 212 |
+
):
|
| 213 |
+
self.current_tier = 3
|
| 214 |
+
|
| 215 |
+
if self.current_tier == 3 and episode >= BREAKER_TIER4_MIN_EPISODE:
|
| 216 |
+
self.current_tier = 4
|
| 217 |
+
|
| 218 |
+
def get_tests(self, n_per_tier: int = 2) -> list[dict[str, Any]]:
|
| 219 |
+
"""
|
| 220 |
+
Return adversarial test cases sampled from all unlocked tiers,
|
| 221 |
+
with extra weight on the current highest tier.
|
| 222 |
+
|
| 223 |
+
Args:
|
| 224 |
+
n_per_tier: Number of cases to sample from each unlocked tier.
|
| 225 |
+
|
| 226 |
+
Returns:
|
| 227 |
+
List of {"input": [...], "expected_output": [...]} dicts.
|
| 228 |
+
"""
|
| 229 |
+
pools: list[tuple[int, list[list[int]]]] = [
|
| 230 |
+
(1, _TIER1_CASES),
|
| 231 |
+
(2, _TIER2_CASES),
|
| 232 |
+
(3, _TIER3_CASES),
|
| 233 |
+
(4, _TIER4_CASES),
|
| 234 |
+
]
|
| 235 |
+
|
| 236 |
+
selected: list[list[int]] = []
|
| 237 |
+
for tier_num, pool in pools:
|
| 238 |
+
if tier_num > self.current_tier:
|
| 239 |
+
break
|
| 240 |
+
# Sample more from the highest tier
|
| 241 |
+
k = n_per_tier * 2 if tier_num == self.current_tier else n_per_tier
|
| 242 |
+
k = min(k, len(pool))
|
| 243 |
+
selected.extend(random.sample(pool, k))
|
| 244 |
+
|
| 245 |
+
# Remove duplicates (by converting to tuple for hashability)
|
| 246 |
+
seen: set[tuple[int, ...]] = set()
|
| 247 |
+
unique: list[list[int]] = []
|
| 248 |
+
for arr in selected:
|
| 249 |
+
key = tuple(arr)
|
| 250 |
+
if key not in seen:
|
| 251 |
+
seen.add(key)
|
| 252 |
+
unique.append(arr)
|
| 253 |
+
|
| 254 |
+
return [
|
| 255 |
+
{"input": arr, "expected_output": sorted(arr)}
|
| 256 |
+
for arr in unique
|
| 257 |
+
]
|
| 258 |
+
|
| 259 |
+
@property
|
| 260 |
+
def tier_name(self) -> str:
|
| 261 |
+
"""Human-readable tier label."""
|
| 262 |
+
from config import BREAKER_TIER_NAMES
|
| 263 |
+
return BREAKER_TIER_NAMES.get(self.current_tier, f"Tier-{self.current_tier}")
|
FORGE-v4/app.py
CHANGED
|
@@ -1,84 +1,205 @@
|
|
| 1 |
# app.py
|
| 2 |
# Main runner script for FORGE-v4.
|
| 3 |
-
# Runs
|
|
|
|
| 4 |
|
| 5 |
import sys
|
| 6 |
import json
|
|
|
|
| 7 |
from env import FORGEEnv
|
| 8 |
from memory import CoachMemory
|
| 9 |
-
from
|
|
|
|
| 10 |
from config import STEPS_PER_EPISODE
|
| 11 |
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
-
Execute
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
-
|
| 18 |
-
print(" FORGE-v4 | Adversarial Code Generation Environment")
|
| 19 |
-
print("=" * 60)
|
| 20 |
|
| 21 |
-
# Initialise coach memory and environment
|
| 22 |
memory = CoachMemory()
|
|
|
|
| 23 |
env = FORGEEnv(memory=memory)
|
| 24 |
-
|
| 25 |
-
# Reset to start the episode
|
| 26 |
state = env.reset()
|
| 27 |
|
| 28 |
-
|
| 29 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
print()
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
action = {
|
| 40 |
-
"coder_code": coder_code,
|
| 41 |
-
"breaker_tests": breaker_tests,
|
| 42 |
-
}
|
| 43 |
|
| 44 |
result = env.step(action)
|
|
|
|
| 45 |
|
| 46 |
cr = result["coder_reward"]
|
| 47 |
br = result["breaker_reward"]
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
print(
|
| 50 |
-
f"
|
| 51 |
f"| passes: {cr['pass_count']} "
|
| 52 |
f"| fails: {cr['fail_count']} "
|
| 53 |
f"| errors: {cr['error_count']} "
|
| 54 |
f"| reward: {cr['total_reward']:+.2f}"
|
| 55 |
)
|
| 56 |
print(
|
| 57 |
-
f"
|
| 58 |
f"| breaks: {br['breaks']} "
|
| 59 |
-
f"|
|
| 60 |
f"| reward: {br['total_reward']:+.2f}"
|
| 61 |
)
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
if result["done"]:
|
| 64 |
break
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
def main() -> None:
|
| 73 |
-
"""Entry point — parse minimal CLI args and run."""
|
| 74 |
args = sys.argv[1:]
|
| 75 |
|
| 76 |
if "--help" in args or "-h" in args:
|
| 77 |
-
|
| 78 |
-
print(" --steps N Override STEPS_PER_EPISODE for this run (default: from config.py)")
|
| 79 |
sys.exit(0)
|
| 80 |
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
if "--steps" in args:
|
| 83 |
idx = args.index("--steps")
|
| 84 |
try:
|
|
@@ -88,7 +209,7 @@ def main() -> None:
|
|
| 88 |
print("Error: --steps requires an integer argument.")
|
| 89 |
sys.exit(1)
|
| 90 |
|
| 91 |
-
run_demo_episode()
|
| 92 |
|
| 93 |
|
| 94 |
if __name__ == "__main__":
|
|
|
|
| 1 |
# app.py
|
| 2 |
# Main runner script for FORGE-v4.
|
| 3 |
+
# Runs one demo episode with the improving_coder and tiered BreakerAgent,
|
| 4 |
+
# then prints a structured results report.
|
| 5 |
|
| 6 |
import sys
|
| 7 |
import json
|
| 8 |
+
|
| 9 |
from env import FORGEEnv
|
| 10 |
from memory import CoachMemory
|
| 11 |
+
from agents import get_coder_code, coder_version_label, BreakerAgent
|
| 12 |
+
from logger import log_episode, update_summary, print_log_paths
|
| 13 |
from config import STEPS_PER_EPISODE
|
| 14 |
|
| 15 |
|
| 16 |
+
# ──────────────────────────────────────────────
|
| 17 |
+
# Demo configuration
|
| 18 |
+
# ──────────────────────────────────────────────
|
| 19 |
+
DEFAULT_CODER_VERSION = "improving_coder"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def run_demo_episode(coder_version: str = DEFAULT_CODER_VERSION) -> None:
|
| 23 |
"""
|
| 24 |
+
Execute one demo episode and print a rich results report.
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
coder_version: Which coder strategy to use.
|
| 28 |
+
"weak_coder_v1" | "weak_coder_v2" | "improving_coder"
|
| 29 |
"""
|
| 30 |
+
_banner()
|
|
|
|
|
|
|
| 31 |
|
|
|
|
| 32 |
memory = CoachMemory()
|
| 33 |
+
memory.clear() # Start fresh for the demo run
|
| 34 |
env = FORGEEnv(memory=memory)
|
|
|
|
|
|
|
| 35 |
state = env.reset()
|
| 36 |
|
| 37 |
+
episode = state["episode"]
|
| 38 |
+
print(f"\n{'─'*60}")
|
| 39 |
+
print(f" Task ID : {state['task_id']}")
|
| 40 |
+
print(f" Episode : {episode}")
|
| 41 |
+
print(f" Coder : {coder_version_label(coder_version, episode)}")
|
| 42 |
+
print(f" Breaker : {env.breaker.tier_name} (starts here, tiers up during run)")
|
| 43 |
+
print(f"{'─'*60}")
|
| 44 |
+
print(f"\n Problem:\n")
|
| 45 |
+
print(f" {state['problem_description']}")
|
| 46 |
print()
|
| 47 |
|
| 48 |
+
# ── Accumulators ──────────────────────────────────────────────────────
|
| 49 |
+
ep_coder_rewards: list[float] = []
|
| 50 |
+
ep_breaker_rewards: list[float] = []
|
| 51 |
+
ep_pass_rates: list[float] = []
|
| 52 |
+
ep_fail_counts: list[int] = []
|
| 53 |
+
ep_error_counts: list[int] = []
|
| 54 |
+
ep_timeout_counts: list[int] = []
|
| 55 |
+
ep_break_rates: list[float] = []
|
| 56 |
|
| 57 |
+
for step_num in range(1, STEPS_PER_EPISODE + 1):
|
| 58 |
+
# Build coder action
|
| 59 |
+
code = get_coder_code(coder_version, episode=episode)
|
| 60 |
+
action = {"coder_code": code, "coder_version": coder_version}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
result = env.step(action)
|
| 63 |
+
state = result["state"]
|
| 64 |
|
| 65 |
cr = result["coder_reward"]
|
| 66 |
br = result["breaker_reward"]
|
| 67 |
+
info = result["info"]
|
| 68 |
+
|
| 69 |
+
# Accumulate
|
| 70 |
+
ep_coder_rewards.append(cr["total_reward"])
|
| 71 |
+
ep_breaker_rewards.append(br["total_reward"])
|
| 72 |
+
ep_pass_rates.append(cr["pass_rate"])
|
| 73 |
+
ep_fail_counts.append(cr["fail_count"])
|
| 74 |
+
ep_error_counts.append(cr["error_count"])
|
| 75 |
+
ep_timeout_counts.append(cr["error_count"])
|
| 76 |
+
ep_break_rates.append(br["break_rate"])
|
| 77 |
+
|
| 78 |
+
# Per-step print
|
| 79 |
+
print(f" ── Step {step_num}/{STEPS_PER_EPISODE} [breaker: {info['breaker_tier_name']}]")
|
| 80 |
print(
|
| 81 |
+
f" Coder → pass_rate: {cr['pass_rate']:.2f} "
|
| 82 |
f"| passes: {cr['pass_count']} "
|
| 83 |
f"| fails: {cr['fail_count']} "
|
| 84 |
f"| errors: {cr['error_count']} "
|
| 85 |
f"| reward: {cr['total_reward']:+.2f}"
|
| 86 |
)
|
| 87 |
print(
|
| 88 |
+
f" Breaker → break_rate: {br['break_rate']:.2f} "
|
| 89 |
f"| breaks: {br['breaks']} "
|
| 90 |
+
f"| no-break: {br['passes']} "
|
| 91 |
f"| reward: {br['total_reward']:+.2f}"
|
| 92 |
)
|
| 93 |
+
if state.get("recent_breaker_case") is not None:
|
| 94 |
+
print(f" Recent adversarial input: {state['recent_breaker_case']}")
|
| 95 |
+
print()
|
| 96 |
|
| 97 |
if result["done"]:
|
| 98 |
break
|
| 99 |
|
| 100 |
+
# ── Episode log ───────────────────────────────────────────────────────
|
| 101 |
+
def avg(lst: list) -> float:
|
| 102 |
+
return round(sum(lst) / len(lst), 4) if lst else 0.0
|
| 103 |
+
|
| 104 |
+
log_episode(
|
| 105 |
+
episode=episode,
|
| 106 |
+
coder_version=coder_version,
|
| 107 |
+
breaker_tier=env.breaker.current_tier,
|
| 108 |
+
avg_coder_reward=avg(ep_coder_rewards),
|
| 109 |
+
avg_breaker_reward=avg(ep_breaker_rewards),
|
| 110 |
+
avg_pass_rate=avg(ep_pass_rates),
|
| 111 |
+
total_fail_count=sum(ep_fail_counts),
|
| 112 |
+
total_error_count=sum(ep_error_counts),
|
| 113 |
+
total_timeout_count=sum(ep_timeout_counts),
|
| 114 |
+
avg_break_rate=avg(ep_break_rates),
|
| 115 |
+
steps=env.step_count,
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
update_summary(
|
| 119 |
+
total_episodes=1,
|
| 120 |
+
coder_version=coder_version,
|
| 121 |
+
final_breaker_tier=env.breaker.current_tier,
|
| 122 |
+
all_coder_rewards=ep_coder_rewards,
|
| 123 |
+
all_breaker_rewards=ep_breaker_rewards,
|
| 124 |
+
all_pass_rates=ep_pass_rates,
|
| 125 |
+
all_break_rates=ep_break_rates,
|
| 126 |
+
coach_memory_summary=memory.summary(),
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
# ── Final report ──────────────────────────────────────────────────────
|
| 130 |
+
print(f"{'═'*60}")
|
| 131 |
+
print(" EPISODE SUMMARY")
|
| 132 |
+
print(f"{'═'*60}")
|
| 133 |
+
print(f" Coder version : {coder_version_label(coder_version, episode)}")
|
| 134 |
+
print(f" Final breaker tier : {env.breaker.tier_name}")
|
| 135 |
+
print(f" Avg pass rate : {avg(ep_pass_rates):.2f}")
|
| 136 |
+
print(f" Avg coder reward : {avg(ep_coder_rewards):+.4f}")
|
| 137 |
+
print(f" Avg breaker reward : {avg(ep_breaker_rewards):+.4f}")
|
| 138 |
+
print(f" Total fail count : {sum(ep_fail_counts)}")
|
| 139 |
+
print(f" Total error count : {sum(ep_error_counts)}")
|
| 140 |
+
print(f" Avg break rate : {avg(ep_break_rates):.2f}")
|
| 141 |
+
print()
|
| 142 |
+
print(" Coach memory summary:")
|
| 143 |
+
summary = memory.summary()
|
| 144 |
+
print(f" Lessons stored : {summary.get('total_lessons', 0)}")
|
| 145 |
+
notes = summary.get("recent_coach_notes", [])
|
| 146 |
+
if notes:
|
| 147 |
+
print(" Recent coach notes:")
|
| 148 |
+
for note in notes:
|
| 149 |
+
print(f" • {note}")
|
| 150 |
+
print()
|
| 151 |
+
print(" Log files updated:")
|
| 152 |
+
print_log_paths()
|
| 153 |
+
print(f"{'═'*60}")
|
| 154 |
+
|
| 155 |
|
| 156 |
+
# ──────────────────────────────────────────────
|
| 157 |
+
# Helpers
|
| 158 |
+
# ──────────────────────────────────────────────
|
| 159 |
+
|
| 160 |
+
def _banner() -> None:
|
| 161 |
+
print()
|
| 162 |
+
print("╔══════════════════════════════════════════════════════════╗")
|
| 163 |
+
print("║ FORGE-v4 | Adversarial Code Generation Environment ║")
|
| 164 |
+
print("╚══════════════════════════════════════════════════════════╝")
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def _print_help() -> None:
|
| 168 |
+
print("Usage: python app.py [OPTIONS]")
|
| 169 |
+
print()
|
| 170 |
+
print("Options:")
|
| 171 |
+
print(" --coder VERSION Coder strategy to use:")
|
| 172 |
+
print(" weak_coder_v1 (bubble sort — slow/weak)")
|
| 173 |
+
print(" weak_coder_v2 (selection sort + abs() bug)")
|
| 174 |
+
print(" improving_coder (adapts each episode) [default]")
|
| 175 |
+
print(" --steps N Override STEPS_PER_EPISODE for this run")
|
| 176 |
+
print(" --help / -h Show this message")
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# ──────────────────────────────────────────────
|
| 180 |
+
# Entry point
|
| 181 |
+
# ──────────────────────────────────────────────
|
| 182 |
|
| 183 |
def main() -> None:
|
|
|
|
| 184 |
args = sys.argv[1:]
|
| 185 |
|
| 186 |
if "--help" in args or "-h" in args:
|
| 187 |
+
_print_help()
|
|
|
|
| 188 |
sys.exit(0)
|
| 189 |
|
| 190 |
+
coder_version = DEFAULT_CODER_VERSION
|
| 191 |
+
if "--coder" in args:
|
| 192 |
+
idx = args.index("--coder")
|
| 193 |
+
try:
|
| 194 |
+
coder_version = args[idx + 1]
|
| 195 |
+
valid = ("weak_coder_v1", "weak_coder_v2", "improving_coder")
|
| 196 |
+
if coder_version not in valid:
|
| 197 |
+
print(f"Error: unknown coder version '{coder_version}'. Choose from: {valid}")
|
| 198 |
+
sys.exit(1)
|
| 199 |
+
except IndexError:
|
| 200 |
+
print("Error: --coder requires a version argument.")
|
| 201 |
+
sys.exit(1)
|
| 202 |
+
|
| 203 |
if "--steps" in args:
|
| 204 |
idx = args.index("--steps")
|
| 205 |
try:
|
|
|
|
| 209 |
print("Error: --steps requires an integer argument.")
|
| 210 |
sys.exit(1)
|
| 211 |
|
| 212 |
+
run_demo_episode(coder_version=coder_version)
|
| 213 |
|
| 214 |
|
| 215 |
if __name__ == "__main__":
|
FORGE-v4/config.py
CHANGED
|
@@ -18,20 +18,43 @@ NUM_HIDDEN_TESTS = 5 # Number of hidden test cases per task
|
|
| 18 |
# ──────────────────────────────────────────────
|
| 19 |
# Reward settings
|
| 20 |
# ──────────────────────────────────────────────
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
CODER_ERROR_PENALTY = -1.0 # Penalty when code raises an error
|
| 25 |
|
| 26 |
-
# Breaker reward weights
|
| 27 |
BREAKER_BREAK_REWARD = 1.0 # Reward when breaker's test breaks coder
|
| 28 |
-
BREAKER_FAIL_PENALTY = -0.3 # Penalty when
|
| 29 |
|
| 30 |
# ──────────────────────────────────────────────
|
| 31 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
# ──────────────────────────────────────────────
|
| 33 |
TIER_THRESHOLDS = {
|
| 34 |
-
"novice": (0.0, 0.4),
|
| 35 |
"intermediate": (0.4, 0.7),
|
| 36 |
"advanced": (0.7, 0.9),
|
| 37 |
"expert": (0.9, 1.01),
|
|
@@ -40,13 +63,18 @@ TIER_THRESHOLDS = {
|
|
| 40 |
# ──────────────────────────────────────────────
|
| 41 |
# Memory / logging
|
| 42 |
# ──────────────────────────────────────────────
|
| 43 |
-
MEMORY_FILE
|
| 44 |
-
LOG_DIR
|
| 45 |
-
MODELS_DIR
|
| 46 |
-
OUTPUTS_DIR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# ──────────────────────────────────────────────
|
| 49 |
# Training placeholders
|
| 50 |
# ──────────────────────────────────────────────
|
| 51 |
-
MAX_EPISODES
|
| 52 |
-
STEPS_PER_EPISODE =
|
|
|
|
| 18 |
# ──────────────────────────────────────────────
|
| 19 |
# Reward settings
|
| 20 |
# ──────────────────────────────────────────────
|
| 21 |
+
CODER_PASS_REWARD = 1.0 # Reward per passing hidden test
|
| 22 |
+
CODER_FAIL_PENALTY = -0.5 # Penalty per failing hidden test
|
| 23 |
+
CODER_ERROR_PENALTY = -1.0 # Penalty when code raises an error/timeout
|
|
|
|
| 24 |
|
|
|
|
| 25 |
BREAKER_BREAK_REWARD = 1.0 # Reward when breaker's test breaks coder
|
| 26 |
+
BREAKER_FAIL_PENALTY = -0.3 # Penalty when coder survives a breaker test
|
| 27 |
|
| 28 |
# ──────────────────────────────────────────────
|
| 29 |
+
# Coder agent versions
|
| 30 |
+
# ──────────────────────────────────────────────
|
| 31 |
+
CODER_VERSIONS = ["weak_coder_v1", "weak_coder_v2", "improving_coder"]
|
| 32 |
+
|
| 33 |
+
# improving_coder tier-up thresholds (episode numbers)
|
| 34 |
+
IMPROVING_CODER_TIER1_UNTIL = 3 # Episodes 1–3 → uses weak strategy
|
| 35 |
+
IMPROVING_CODER_TIER2_UNTIL = 6 # Episodes 4–6 → uses mid strategy
|
| 36 |
+
|
| 37 |
+
# ──────────────────────────────────────────────
|
| 38 |
+
# Breaker tier system
|
| 39 |
+
# ──────────────────────────────────────────────
|
| 40 |
+
BREAKER_TIER_NAMES = {
|
| 41 |
+
1: "Tier-1 (basic)",
|
| 42 |
+
2: "Tier-2 (edge cases)",
|
| 43 |
+
3: "Tier-3 (stress)",
|
| 44 |
+
4: "Tier-4 (boundary/extreme)",
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
# Minimum break_rate to unlock next tier
|
| 48 |
+
BREAKER_TIER_UNLOCK_RATE = 0.6 # 60% break rate needed to promote
|
| 49 |
+
# Earliest episodes at which tier 3 and tier 4 can unlock (tier 3 also requires the break rate above)
|
| 50 |
+
BREAKER_TIER3_MIN_EPISODE = 4
|
| 51 |
+
BREAKER_TIER4_MIN_EPISODE = 7
|
| 52 |
+
|
| 53 |
+
# ──────────────────────────────────────────────
|
| 54 |
+
# Tier thresholds (coder skill levels — for display/labelling)
|
| 55 |
# ──────────────────────────────────────────────
|
| 56 |
TIER_THRESHOLDS = {
|
| 57 |
+
"novice": (0.0, 0.4),
|
| 58 |
"intermediate": (0.4, 0.7),
|
| 59 |
"advanced": (0.7, 0.9),
|
| 60 |
"expert": (0.9, 1.01),
|
|
|
|
| 63 |
# ──────────────────────────────────────────────
|
| 64 |
# Memory / logging
|
| 65 |
# ──────────────────────────────────────────────
|
| 66 |
+
MEMORY_FILE = "data/coach_memory.json"
|
| 67 |
+
LOG_DIR = "logs/"
|
| 68 |
+
MODELS_DIR = "models/"
|
| 69 |
+
OUTPUTS_DIR = "outputs/"
|
| 70 |
+
|
| 71 |
+
# Log file paths (within LOG_DIR)
|
| 72 |
+
LOG_REWARDS_FILE = "logs/rewards.json"
|
| 73 |
+
LOG_EPISODES_FILE = "logs/episodes.csv"
|
| 74 |
+
LOG_SUMMARY_FILE = "logs/summary.json"
|
| 75 |
|
| 76 |
# ──────────────────────────────────────────────
|
| 77 |
# Training placeholders
|
| 78 |
# ──────────────────────────────────────────────
|
| 79 |
+
MAX_EPISODES = 100
|
| 80 |
+
STEPS_PER_EPISODE = 3 # Kept short for fast demo runs
|
FORGE-v4/env.py
CHANGED
|
@@ -1,12 +1,16 @@
|
|
| 1 |
# env.py
|
| 2 |
# Main OpenEnv-style reinforcement learning environment for FORGE-v4.
|
| 3 |
-
# Manages
|
| 4 |
|
|
|
|
| 5 |
from typing import Any
|
| 6 |
-
|
|
|
|
| 7 |
from sandbox import run_code_against_tests
|
| 8 |
from rewards import coder_reward, breaker_reward
|
| 9 |
from memory import CoachMemory
|
|
|
|
|
|
|
| 10 |
from config import STEPS_PER_EPISODE
|
| 11 |
|
| 12 |
|
|
@@ -15,29 +19,68 @@ class FORGEEnv:
|
|
| 15 |
Two-agent adversarial environment for code generation tasks.
|
| 16 |
|
| 17 |
Agents:
|
| 18 |
-
- Coder:
|
| 19 |
-
- Breaker:
|
| 20 |
|
| 21 |
Episode flow:
|
| 22 |
-
1. reset()
|
| 23 |
-
2. step(action)
|
| 24 |
-
3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
{
|
| 28 |
-
"
|
| 29 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
"""
|
| 32 |
|
| 33 |
def __init__(self, memory: CoachMemory | None = None):
|
| 34 |
-
self.memory
|
| 35 |
-
self.
|
|
|
|
| 36 |
self.step_count: int = 0
|
| 37 |
self.current_task: dict[str, Any] = {}
|
| 38 |
-
self.done: bool
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# ──────────────────────────────────────────────
|
| 43 |
# Core env methods
|
|
@@ -45,38 +88,42 @@ class FORGEEnv:
|
|
| 45 |
|
| 46 |
def reset(self) -> dict[str, Any]:
|
| 47 |
"""
|
| 48 |
-
Start a new episode.
|
| 49 |
|
| 50 |
Returns:
|
| 51 |
-
Initial state dict
|
| 52 |
"""
|
| 53 |
self.episode += 1
|
| 54 |
self.step_count = 0
|
| 55 |
-
self.done
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
| 57 |
self._last_coder_pass_rate = 0.0
|
| 58 |
|
| 59 |
self.current_task = generate_task()
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
return state
|
| 63 |
|
| 64 |
def step(self, action: dict[str, Any]) -> dict[str, Any]:
|
| 65 |
"""
|
| 66 |
Advance the environment by one step.
|
| 67 |
|
| 68 |
Args:
|
| 69 |
-
action:
|
| 70 |
-
"coder_code"
|
| 71 |
-
"
|
|
|
|
| 72 |
|
| 73 |
Returns:
|
| 74 |
{
|
| 75 |
-
"state":
|
| 76 |
-
"coder_reward": coder reward
|
| 77 |
-
"breaker_reward": breaker reward
|
| 78 |
-
"done": bool
|
| 79 |
-
"info":
|
| 80 |
}
|
| 81 |
"""
|
| 82 |
if self.done:
|
|
@@ -84,33 +131,66 @@ class FORGEEnv:
|
|
| 84 |
|
| 85 |
self.step_count += 1
|
| 86 |
coder_code = action.get("coder_code", "")
|
| 87 |
-
|
|
|
|
| 88 |
|
| 89 |
-
# ──
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
# ──
|
|
|
|
| 93 |
breaker_info = self._evaluate_breaker(coder_code, breaker_tests, coder_info)
|
| 94 |
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
| 96 |
self.memory.add_lesson(
|
| 97 |
episode=self.episode,
|
| 98 |
agent="env",
|
| 99 |
observation=(
|
| 100 |
f"Step {self.step_count}: "
|
| 101 |
-
f"coder
|
| 102 |
-
f"
|
|
|
|
|
|
|
| 103 |
),
|
| 104 |
coder_reward=coder_info["total_reward"],
|
| 105 |
breaker_reward=breaker_info["total_reward"],
|
| 106 |
extra={
|
| 107 |
-
"step":
|
| 108 |
-
"
|
| 109 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
},
|
| 111 |
)
|
| 112 |
|
| 113 |
-
# ──
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
if self.step_count >= STEPS_PER_EPISODE:
|
| 115 |
self.done = True
|
| 116 |
|
|
@@ -120,26 +200,32 @@ class FORGEEnv:
|
|
| 120 |
"breaker_reward": breaker_info,
|
| 121 |
"done": self.done,
|
| 122 |
"info": {
|
| 123 |
-
"episode":
|
| 124 |
-
"step":
|
|
|
|
|
|
|
|
|
|
| 125 |
},
|
| 126 |
}
|
| 127 |
|
| 128 |
def get_state(self) -> dict[str, Any]:
|
| 129 |
-
"""
|
| 130 |
-
Return the current observable state of the environment.
|
| 131 |
-
"""
|
| 132 |
return {
|
| 133 |
-
"
|
| 134 |
-
"
|
| 135 |
-
"
|
| 136 |
-
"
|
| 137 |
-
"
|
| 138 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
}
|
| 140 |
|
| 141 |
# ──────────────────────────────────────────────
|
| 142 |
-
# Private helpers
|
| 143 |
# ──────────────────────────────────────────────
|
| 144 |
|
| 145 |
def _evaluate_coder(self, code: str) -> dict[str, Any]:
|
|
@@ -147,17 +233,11 @@ class FORGEEnv:
|
|
| 147 |
hidden_tests = self.current_task.get("hidden_tests", [])
|
| 148 |
|
| 149 |
if not code or not hidden_tests:
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
info = coder_reward(results)
|
| 156 |
-
|
| 157 |
-
# Cache for Breaker quality multiplier
|
| 158 |
-
self._last_coder_code = code
|
| 159 |
-
self._last_coder_pass_rate = info["pass_rate"]
|
| 160 |
-
return info
|
| 161 |
|
| 162 |
def _evaluate_breaker(
|
| 163 |
self,
|
|
@@ -165,9 +245,8 @@ class FORGEEnv:
|
|
| 165 |
breaker_tests: list[dict[str, Any]],
|
| 166 |
coder_info: dict[str, Any],
|
| 167 |
) -> dict[str, Any]:
|
| 168 |
-
"""Run the coder's code against
|
| 169 |
if not coder_code or not breaker_tests:
|
| 170 |
-
# No submission from one of the agents
|
| 171 |
dummy = [{"status": "pass"} for _ in breaker_tests or [{}]]
|
| 172 |
return breaker_reward(dummy, coder_base_pass_rate=coder_info["pass_rate"])
|
| 173 |
|
|
|
|
| 1 |
# env.py
|
| 2 |
# Main OpenEnv-style reinforcement learning environment for FORGE-v4.
|
| 3 |
+
# Manages Coder Agent, Breaker Agent, Sandbox, Rewards, Memory, and Logging.
|
| 4 |
|
| 5 |
+
import uuid
|
| 6 |
from typing import Any
|
| 7 |
+
|
| 8 |
+
from tasks import generate_task
|
| 9 |
from sandbox import run_code_against_tests
|
| 10 |
from rewards import coder_reward, breaker_reward
|
| 11 |
from memory import CoachMemory
|
| 12 |
+
from agents import BreakerAgent, coder_version_label
|
| 13 |
+
from logger import log_step
|
| 14 |
from config import STEPS_PER_EPISODE
|
| 15 |
|
| 16 |
|
|
|
|
| 19 |
Two-agent adversarial environment for code generation tasks.
|
| 20 |
|
| 21 |
Agents:
|
| 22 |
+
- Coder: submits Python code defining solution(arr).
|
| 23 |
+
- Breaker: submits adversarial test cases via a BreakerAgent.
|
| 24 |
|
| 25 |
Episode flow:
|
| 26 |
+
1. reset() → returns initial state
|
| 27 |
+
2. step(action) × N → coder vs breaker, rewards, memory, logs
|
| 28 |
+
3. done=True → call reset() for next episode
|
| 29 |
+
|
| 30 |
+
Action format passed to step():
|
| 31 |
+
{
|
| 32 |
+
"coder_code": str, # Python source defining solution(arr)
|
| 33 |
+
"coder_version": str, # label, e.g. "weak_coder_v1"
|
| 34 |
+
}
|
| 35 |
+
The BreakerAgent is managed internally by the environment.
|
| 36 |
|
| 37 |
+
State returned by get_state() / reset() / step():
|
| 38 |
{
|
| 39 |
+
"task_id": str,
|
| 40 |
+
"problem_description": str,
|
| 41 |
+
"episode": int,
|
| 42 |
+
"episode_step": int,
|
| 43 |
+
"done": bool,
|
| 44 |
+
"coder_version": str,
|
| 45 |
+
"current_tier": int,
|
| 46 |
+
"recent_breaker_case": list[int],
|
| 47 |
+
"pass_rate_history": list[float],
|
| 48 |
+
"coach_memory_summary": dict,
|
| 49 |
+
"public_example": dict,
|
| 50 |
}
|
| 51 |
+
|
| 52 |
+
step() returns:
|
| 53 |
+
{
|
| 54 |
+
"state": dict,
|
| 55 |
+
"coder_reward": dict, # from rewards.coder_reward()
|
| 56 |
+
"breaker_reward": dict, # from rewards.breaker_reward()
|
| 57 |
+
"done": bool,
|
| 58 |
+
"info": dict, # diagnostics
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
Explicit step() flow:
|
| 62 |
+
1. Run coder code against hidden tests in sandbox
|
| 63 |
+
2. Run breaker tests against coder code in sandbox
|
| 64 |
+
3. Assign coder_reward and breaker_reward
|
| 65 |
+
4. Update coach memory with structured lesson
|
| 66 |
+
5. Log step metrics to logs/rewards.json
|
| 67 |
+
6. Advance breaker tier based on break_rate
|
| 68 |
+
7. Return next_state, rewards, done, info
|
| 69 |
"""
|
| 70 |
|
| 71 |
def __init__(self, memory: CoachMemory | None = None):
    """
    Build an environment that has not started an episode yet.

    Args:
        memory: Coach memory that step() records lessons into. When the
            argument is None (or any falsy object) a new CoachMemory is
            created instead.
    """
    # Collaborators: lesson store and the internally managed breaker agent.
    if memory:
        self.memory = memory
    else:
        self.memory = CoachMemory()
    self.breaker = BreakerAgent()

    # Episode bookkeeping. `done` starts True; reset() flips it to False.
    self.episode: int = 0
    self.step_count: int = 0
    self.current_task: dict[str, Any] = {}
    self.done: bool = True

    # Per-episode tracking, cleared again by reset().
    self._coder_version: str = "unknown"
    self._pass_rate_history: list[float] = []
    self._recent_breaker_case: list[int] = []
    self._last_coder_pass_rate: float = 0.0
|
| 84 |
|
| 85 |
# ──────────────────────────────────────────────
|
| 86 |
# Core env methods
|
|
|
|
| 88 |
|
| 89 |
def reset(self) -> dict[str, Any]:
    """
    Begin a new episode.

    Bumps the episode counter, clears the step counter and all per-episode
    tracking, generates a new task and tags it with a short random id.
    The BreakerAgent is not touched here.

    Returns:
        The initial state dict, as produced by get_state().
    """
    # Counters and the done flag.
    self.episode += 1
    self.step_count = 0
    self.done = False

    # Per-episode tracking goes back to its initial values.
    self._coder_version = "unknown"
    self._pass_rate_history = []
    self._recent_breaker_case = []
    self._last_coder_pass_rate = 0.0

    # Fresh task, identified by the first 8 characters of a random UUID.
    self.current_task = generate_task()
    short_id = str(uuid.uuid4())[:8]
    self.current_task["task_id"] = short_id

    return self.get_state()
|
|
|
|
| 109 |
|
| 110 |
def step(self, action: dict[str, Any]) -> dict[str, Any]:
|
| 111 |
"""
|
| 112 |
Advance the environment by one step.
|
| 113 |
|
| 114 |
Args:
|
| 115 |
+
action: {
|
| 116 |
+
"coder_code": str — Python source defining solution(arr)
|
| 117 |
+
"coder_version": str — human label for the coder strategy used
|
| 118 |
+
}
|
| 119 |
|
| 120 |
Returns:
|
| 121 |
{
|
| 122 |
+
"state": dict — next observable state,
|
| 123 |
+
"coder_reward": dict — coder reward breakdown,
|
| 124 |
+
"breaker_reward": dict — breaker reward breakdown,
|
| 125 |
+
"done": bool,
|
| 126 |
+
"info": dict — diagnostics,
|
| 127 |
}
|
| 128 |
"""
|
| 129 |
if self.done:
|
|
|
|
| 131 |
|
| 132 |
self.step_count += 1
|
| 133 |
coder_code = action.get("coder_code", "")
|
| 134 |
+
coder_version = action.get("coder_version", "unknown")
|
| 135 |
+
self._coder_version = coder_version
|
| 136 |
|
| 137 |
+
# ── 1. Get breaker tests for this step ───────────────────────────
|
| 138 |
+
breaker_tests = self.breaker.get_tests(n_per_tier=2)
|
| 139 |
+
if breaker_tests:
|
| 140 |
+
self._recent_breaker_case = breaker_tests[-1]["input"]
|
| 141 |
|
| 142 |
+
# ── 2 & 3. Run sandbox + compute rewards ──────────────────────────
|
| 143 |
+
coder_info = self._evaluate_coder(coder_code)
|
| 144 |
breaker_info = self._evaluate_breaker(coder_code, breaker_tests, coder_info)
|
| 145 |
|
| 146 |
+
self._pass_rate_history.append(coder_info["pass_rate"])
|
| 147 |
+
self._last_coder_pass_rate = coder_info["pass_rate"]
|
| 148 |
+
|
| 149 |
+
# ── 4. Update coach memory with rich lesson ───────────────────────
|
| 150 |
self.memory.add_lesson(
|
| 151 |
episode=self.episode,
|
| 152 |
agent="env",
|
| 153 |
observation=(
|
| 154 |
f"Step {self.step_count}: "
|
| 155 |
+
f"coder={coder_version}, "
|
| 156 |
+
f"pass_rate={coder_info['pass_rate']:.2f}, "
|
| 157 |
+
f"breaker_tier={self.breaker.current_tier}, "
|
| 158 |
+
f"break_rate={breaker_info['break_rate']:.2f}"
|
| 159 |
),
|
| 160 |
coder_reward=coder_info["total_reward"],
|
| 161 |
breaker_reward=breaker_info["total_reward"],
|
| 162 |
extra={
|
| 163 |
+
"step": self.step_count,
|
| 164 |
+
"coder_version": coder_version,
|
| 165 |
+
"breaker_tier": self.breaker.current_tier,
|
| 166 |
+
"coder_pass_rate": coder_info["pass_rate"],
|
| 167 |
+
"fail_count": coder_info["fail_count"],
|
| 168 |
+
"error_count": coder_info["error_count"],
|
| 169 |
+
"timeout_count": coder_info["error_count"], # errors include timeouts
|
| 170 |
+
"breaker_break_rate": breaker_info["break_rate"],
|
| 171 |
+
"recent_breaker_case": self._recent_breaker_case,
|
| 172 |
},
|
| 173 |
)
|
| 174 |
|
| 175 |
+
# ── 5. Log step metrics ───────────────────────────────────────────
|
| 176 |
+
log_step(
|
| 177 |
+
episode=self.episode,
|
| 178 |
+
step=self.step_count,
|
| 179 |
+
coder_version=coder_version,
|
| 180 |
+
breaker_tier=self.breaker.current_tier,
|
| 181 |
+
coder_reward=coder_info["total_reward"],
|
| 182 |
+
breaker_reward=breaker_info["total_reward"],
|
| 183 |
+
pass_rate=coder_info["pass_rate"],
|
| 184 |
+
fail_count=coder_info["fail_count"],
|
| 185 |
+
error_count=coder_info["error_count"],
|
| 186 |
+
timeout_count=coder_info["error_count"],
|
| 187 |
+
break_rate=breaker_info["break_rate"],
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
# ── 6. Advance breaker tier ────────────────────────────────────────
|
| 191 |
+
self.breaker.update_tier(breaker_info["break_rate"], self.episode)
|
| 192 |
+
|
| 193 |
+
# ── 7. Check done + return ────────────────────────────────────────
|
| 194 |
if self.step_count >= STEPS_PER_EPISODE:
|
| 195 |
self.done = True
|
| 196 |
|
|
|
|
| 200 |
"breaker_reward": breaker_info,
|
| 201 |
"done": self.done,
|
| 202 |
"info": {
|
| 203 |
+
"episode": self.episode,
|
| 204 |
+
"step": self.step_count,
|
| 205 |
+
"coder_version": coder_version,
|
| 206 |
+
"breaker_tier": self.breaker.current_tier,
|
| 207 |
+
"breaker_tier_name": self.breaker.tier_name,
|
| 208 |
},
|
| 209 |
}
|
| 210 |
|
| 211 |
def get_state(self) -> dict[str, Any]:
    """
    Return the current observable state of the environment.

    Both list-valued fields (``recent_breaker_case`` and
    ``pass_rate_history``) are returned as shallow copies, so a caller that
    mutates the returned state cannot alter the environment's own tracking
    lists or the breaker test input they were taken from.

    Returns:
        A dict with the keys task_id, problem_description, episode,
        episode_step, done, coder_version, current_tier,
        recent_breaker_case, pass_rate_history, coach_memory_summary and
        public_example. Task-derived keys fall back to "" / {} when the
        current task does not define them (e.g. before the first reset()).
    """
    task = self.current_task
    return {
        "task_id": task.get("task_id", ""),
        "problem_description": task.get("prompt", ""),
        "episode": self.episode,
        "episode_step": self.step_count,
        "done": self.done,
        "coder_version": self._coder_version,
        "current_tier": self.breaker.current_tier,
        # Copied for the same reason as pass_rate_history below.
        "recent_breaker_case": list(self._recent_breaker_case),
        "pass_rate_history": list(self._pass_rate_history),
        "coach_memory_summary": self.memory.summary(),
        "public_example": task.get("public_example", {}),
    }
|
| 226 |
|
| 227 |
# ──────────────────────────────────────────────
|
| 228 |
+
# Private evaluation helpers
|
| 229 |
# ──────────────────────────────────────────────
|
| 230 |
|
| 231 |
def _evaluate_coder(self, code: str) -> dict[str, Any]:
|
|
|
|
| 233 |
hidden_tests = self.current_task.get("hidden_tests", [])
|
| 234 |
|
| 235 |
if not code or not hidden_tests:
|
| 236 |
+
dummy = [{"status": "error"} for _ in hidden_tests or [{}]]
|
| 237 |
+
return coder_reward(dummy)
|
| 238 |
+
|
| 239 |
+
results = run_code_against_tests(code, hidden_tests)
|
| 240 |
+
return coder_reward(results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
def _evaluate_breaker(
|
| 243 |
self,
|
|
|
|
| 245 |
breaker_tests: list[dict[str, Any]],
|
| 246 |
coder_info: dict[str, Any],
|
| 247 |
) -> dict[str, Any]:
|
| 248 |
+
"""Run the coder's code against breaker adversarial tests."""
|
| 249 |
if not coder_code or not breaker_tests:
|
|
|
|
| 250 |
dummy = [{"status": "pass"} for _ in breaker_tests or [{}]]
|
| 251 |
return breaker_reward(dummy, coder_base_pass_rate=coder_info["pass_rate"])
|
| 252 |
|
FORGE-v4/logger.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# logger.py
|
| 2 |
+
# Metrics logging for FORGE-v4.
|
| 3 |
+
# Writes structured logs to logs/rewards.json, logs/episodes.csv, logs/summary.json.
|
| 4 |
+
|
| 5 |
+
import csv
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
from config import LOG_REWARDS_FILE, LOG_EPISODES_FILE, LOG_SUMMARY_FILE, LOG_DIR
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# ──────────────────────────────────────────────
|
| 15 |
+
# Internal helpers
|
| 16 |
+
# ──────────────────────────────────────────────
|
| 17 |
+
|
| 18 |
+
def _ensure_log_dir() -> None:
    """Create the LOG_DIR directory (and any parents) if it is not there yet."""
    os.makedirs(LOG_DIR, exist_ok=True)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _load_json(path: str, default: Any) -> Any:
|
| 23 |
+
if os.path.exists(path):
|
| 24 |
+
try:
|
| 25 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 26 |
+
return json.load(f)
|
| 27 |
+
except (json.JSONDecodeError, IOError):
|
| 28 |
+
pass
|
| 29 |
+
return default
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _write_json(path: str, data: Any) -> None:
|
| 33 |
+
with open(path, "w", encoding="utf-8") as f:
|
| 34 |
+
json.dump(data, f, indent=2)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ──────────────────────────────────────────────
|
| 38 |
+
# Step-level logging
|
| 39 |
+
# ──────────────────────────────────────────────
|
| 40 |
+
|
| 41 |
+
def log_step(
    episode: int,
    step: int,
    coder_version: str,
    breaker_tier: int,
    coder_reward: float,
    breaker_reward: float,
    pass_rate: float,
    fail_count: int,
    error_count: int,
    timeout_count: int,
    break_rate: float,
) -> None:
    """
    Append one step's metrics to logs/rewards.json.

    The file holds a single JSON array, so each call loads the whole array,
    appends one record and writes the array back.

    Args:
        episode:        Episode index.
        step:           Step index within the episode.
        coder_version:  Name of the coder strategy used.
        breaker_tier:   Current breaker tier number.
        coder_reward:   Total coder reward this step.
        breaker_reward: Total breaker reward this step.
        pass_rate:      Fraction of hidden tests passed.
        fail_count:     Number of failing tests.
        error_count:    Number of error/timeout tests.
        timeout_count:  Number of sandbox timeouts specifically.
        break_rate:     Fraction of breaker tests that broke the coder.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    _ensure_log_dir()
    records = _load_json(LOG_REWARDS_FILE, [])
    if not isinstance(records, list):
        # Valid JSON of the wrong shape (e.g. hand-edited into an object):
        # start a fresh array, the same way unreadable files are handled.
        records = []

    # Same naive-UTC ISO format as before, without the deprecated utcnow().
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    records.append(
        {
            "timestamp": timestamp,
            "episode": episode,
            "step": step,
            "coder_version": coder_version,
            "breaker_tier": breaker_tier,
            "coder_reward": coder_reward,
            "breaker_reward": breaker_reward,
            "pass_rate": pass_rate,
            "fail_count": fail_count,
            "error_count": error_count,
            "timeout_count": timeout_count,
            "break_rate": break_rate,
        }
    )
    _write_json(LOG_REWARDS_FILE, records)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# ──────────────────────────────────────────────
|
| 92 |
+
# Episode-level logging
|
| 93 |
+
# ──────────────────────────────────────────────
|
| 94 |
+
|
| 95 |
+
# CSV column order
|
| 96 |
+
_CSV_FIELDS = [
|
| 97 |
+
"timestamp", "episode", "coder_version", "breaker_tier",
|
| 98 |
+
"avg_coder_reward", "avg_breaker_reward",
|
| 99 |
+
"avg_pass_rate", "total_fail_count", "total_error_count",
|
| 100 |
+
"total_timeout_count", "avg_break_rate", "steps",
|
| 101 |
+
]
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def log_episode(
    episode: int,
    coder_version: str,
    breaker_tier: int,
    avg_coder_reward: float,
    avg_breaker_reward: float,
    avg_pass_rate: float,
    total_fail_count: int,
    total_error_count: int,
    total_timeout_count: int,
    avg_break_rate: float,
    steps: int,
) -> None:
    """
    Append one episode summary row to logs/episodes.csv.

    The header row is written whenever the file is missing or empty, so a
    file that was created or truncated by something else still ends up with
    a header. Averages are rounded to 4 decimal places; counts are written
    as given. Columns follow _CSV_FIELDS.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    _ensure_log_dir()
    needs_header = (
        not os.path.exists(LOG_EPISODES_FILE)
        or os.path.getsize(LOG_EPISODES_FILE) == 0
    )

    row = {
        # Same naive-UTC ISO format as before, without the deprecated utcnow().
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "episode": episode,
        "coder_version": coder_version,
        "breaker_tier": breaker_tier,
        "avg_coder_reward": round(avg_coder_reward, 4),
        "avg_breaker_reward": round(avg_breaker_reward, 4),
        "avg_pass_rate": round(avg_pass_rate, 4),
        "total_fail_count": total_fail_count,
        "total_error_count": total_error_count,
        "total_timeout_count": total_timeout_count,
        "avg_break_rate": round(avg_break_rate, 4),
        "steps": steps,
    }

    # newline="" lets the csv module control line endings itself.
    with open(LOG_EPISODES_FILE, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ──────────────────────────────────────────────
|
| 146 |
+
# Summary logging
|
| 147 |
+
# ──────────────────────────────────────────────
|
| 148 |
+
|
| 149 |
+
def update_summary(
    total_episodes: int,
    coder_version: str,
    final_breaker_tier: int,
    all_coder_rewards: list[float],
    all_breaker_rewards: list[float],
    all_pass_rates: list[float],
    all_break_rates: list[float],
    coach_memory_summary: dict[str, Any],
) -> None:
    """
    Overwrite logs/summary.json with the latest aggregate statistics.

    Args:
        total_episodes:       Number of episodes covered by the summary.
        coder_version:        Name of the coder strategy used.
        final_breaker_tier:   Breaker tier at the end of the run.
        all_coder_rewards:    Coder rewards to average; also the source of
                              the best/worst values.
        all_breaker_rewards:  Breaker rewards to average.
        all_pass_rates:       Pass rates to average.
        all_break_rates:      Break rates to average.
        coach_memory_summary: Stored verbatim under "coach_memory_summary".

    Every aggregate is rounded to 4 decimal places and is 0.0 when its
    input list is empty.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    _ensure_log_dir()

    def avg(lst: list[float]) -> float:
        """Mean of ``lst`` rounded to 4 places; 0.0 for an empty list."""
        return round(sum(lst) / len(lst), 4) if lst else 0.0

    if all_coder_rewards:
        best_coder = round(max(all_coder_rewards), 4)
        worst_coder = round(min(all_coder_rewards), 4)
    else:
        best_coder = worst_coder = 0.0

    summary = {
        # Same naive-UTC ISO format as before, without the deprecated utcnow().
        "generated_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "total_episodes": total_episodes,
        "coder_version": coder_version,
        "final_breaker_tier": final_breaker_tier,
        "avg_coder_reward": avg(all_coder_rewards),
        "avg_breaker_reward": avg(all_breaker_rewards),
        "avg_pass_rate": avg(all_pass_rates),
        "avg_break_rate": avg(all_break_rates),
        "best_coder_reward": best_coder,
        "worst_coder_reward": worst_coder,
        "coach_memory_summary": coach_memory_summary,
    }
    _write_json(LOG_SUMMARY_FILE, summary)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ──────────────────────────────────────────────
|
| 184 |
+
# Convenience: print a compact log path report
|
| 185 |
+
# ──────────────────────────────────────────────
|
| 186 |
+
|
| 187 |
+
def print_log_paths() -> None:
    """Print each log file path, marked ✓ if the file exists and ✗ if not."""
    for log_path in (LOG_REWARDS_FILE, LOG_EPISODES_FILE, LOG_SUMMARY_FILE):
        if os.path.exists(log_path):
            marker = "✓"
        else:
            marker = "✗"
        print(f" {marker} {log_path}")
|
FORGE-v4/logs/episodes.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
timestamp,episode,coder_version,breaker_tier,avg_coder_reward,avg_breaker_reward,avg_pass_rate,total_fail_count,total_error_count,total_timeout_count,avg_break_rate,steps
|
| 2 |
+
2026-04-25T08:42:22.041578,1,improving_coder,1,5.0,-1.2,1.0,0,0,0,0.0,3
|
| 3 |
+
2026-04-25T08:42:31.074377,1,weak_coder_v2,1,-1.0,-1.2,0.2,12,0,0,0.0,3
|
FORGE-v4/logs/rewards.json
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"timestamp": "2026-04-25T08:42:19.501582",
|
| 4 |
+
"episode": 1,
|
| 5 |
+
"step": 1,
|
| 6 |
+
"coder_version": "improving_coder",
|
| 7 |
+
"breaker_tier": 1,
|
| 8 |
+
"coder_reward": 5.0,
|
| 9 |
+
"breaker_reward": -1.2,
|
| 10 |
+
"pass_rate": 1.0,
|
| 11 |
+
"fail_count": 0,
|
| 12 |
+
"error_count": 0,
|
| 13 |
+
"timeout_count": 0,
|
| 14 |
+
"break_rate": 0.0
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"timestamp": "2026-04-25T08:42:20.777574",
|
| 18 |
+
"episode": 1,
|
| 19 |
+
"step": 2,
|
| 20 |
+
"coder_version": "improving_coder",
|
| 21 |
+
"breaker_tier": 1,
|
| 22 |
+
"coder_reward": 5.0,
|
| 23 |
+
"breaker_reward": -1.2,
|
| 24 |
+
"pass_rate": 1.0,
|
| 25 |
+
"fail_count": 0,
|
| 26 |
+
"error_count": 0,
|
| 27 |
+
"timeout_count": 0,
|
| 28 |
+
"break_rate": 0.0
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"timestamp": "2026-04-25T08:42:22.039872",
|
| 32 |
+
"episode": 1,
|
| 33 |
+
"step": 3,
|
| 34 |
+
"coder_version": "improving_coder",
|
| 35 |
+
"breaker_tier": 1,
|
| 36 |
+
"coder_reward": 5.0,
|
| 37 |
+
"breaker_reward": -1.2,
|
| 38 |
+
"pass_rate": 1.0,
|
| 39 |
+
"fail_count": 0,
|
| 40 |
+
"error_count": 0,
|
| 41 |
+
"timeout_count": 0,
|
| 42 |
+
"break_rate": 0.0
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"timestamp": "2026-04-25T08:42:28.577096",
|
| 46 |
+
"episode": 1,
|
| 47 |
+
"step": 1,
|
| 48 |
+
"coder_version": "weak_coder_v2",
|
| 49 |
+
"breaker_tier": 1,
|
| 50 |
+
"coder_reward": -1.0,
|
| 51 |
+
"breaker_reward": -1.2,
|
| 52 |
+
"pass_rate": 0.2,
|
| 53 |
+
"fail_count": 4,
|
| 54 |
+
"error_count": 0,
|
| 55 |
+
"timeout_count": 0,
|
| 56 |
+
"break_rate": 0.0
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"timestamp": "2026-04-25T08:42:29.829535",
|
| 60 |
+
"episode": 1,
|
| 61 |
+
"step": 2,
|
| 62 |
+
"coder_version": "weak_coder_v2",
|
| 63 |
+
"breaker_tier": 1,
|
| 64 |
+
"coder_reward": -1.0,
|
| 65 |
+
"breaker_reward": -1.2,
|
| 66 |
+
"pass_rate": 0.2,
|
| 67 |
+
"fail_count": 4,
|
| 68 |
+
"error_count": 0,
|
| 69 |
+
"timeout_count": 0,
|
| 70 |
+
"break_rate": 0.0
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"timestamp": "2026-04-25T08:42:31.072423",
|
| 74 |
+
"episode": 1,
|
| 75 |
+
"step": 3,
|
| 76 |
+
"coder_version": "weak_coder_v2",
|
| 77 |
+
"breaker_tier": 1,
|
| 78 |
+
"coder_reward": -1.0,
|
| 79 |
+
"breaker_reward": -1.2,
|
| 80 |
+
"pass_rate": 0.2,
|
| 81 |
+
"fail_count": 4,
|
| 82 |
+
"error_count": 0,
|
| 83 |
+
"timeout_count": 0,
|
| 84 |
+
"break_rate": 0.0
|
| 85 |
+
}
|
| 86 |
+
]
|
FORGE-v4/logs/summary.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"generated_at": "2026-04-25T08:42:31.075228",
|
| 3 |
+
"total_episodes": 1,
|
| 4 |
+
"coder_version": "weak_coder_v2",
|
| 5 |
+
"final_breaker_tier": 1,
|
| 6 |
+
"avg_coder_reward": -1.0,
|
| 7 |
+
"avg_breaker_reward": -1.2,
|
| 8 |
+
"avg_pass_rate": 0.2,
|
| 9 |
+
"avg_break_rate": 0.0,
|
| 10 |
+
"best_coder_reward": -1.0,
|
| 11 |
+
"worst_coder_reward": -1.0,
|
| 12 |
+
"coach_memory_summary": {
|
| 13 |
+
"total_lessons": 3,
|
| 14 |
+
"episodes_seen": 1,
|
| 15 |
+
"avg_coder_reward": -1.0,
|
| 16 |
+
"avg_breaker_reward": -1.2,
|
| 17 |
+
"recent_coach_notes": [
|
| 18 |
+
"Episode 1: Coder (weak_coder_v2) failed 4 test(s) at breaker 1 \u2192 review edge case handling",
|
| 19 |
+
"Episode 1: Coder (weak_coder_v2) failed 4 test(s) at breaker 1 \u2192 review edge case handling",
|
| 20 |
+
"Episode 1: Coder (weak_coder_v2) failed 4 test(s) at breaker 1 \u2192 review edge case handling"
|
| 21 |
+
]
|
| 22 |
+
}
|
| 23 |
+
}
|
FORGE-v4/memory.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# memory.py
|
| 2 |
# Coach Memory system for FORGE-v4.
|
| 3 |
-
# Stores lessons learned across episodes in a JSON file.
|
| 4 |
|
| 5 |
import json
|
| 6 |
import os
|
|
@@ -14,6 +14,8 @@ class CoachMemory:
|
|
| 14 |
Persistent memory that accumulates lessons learned across training episodes.
|
| 15 |
|
| 16 |
Lessons are stored as a list of dicts in a JSON file and loaded on startup.
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
|
| 19 |
def __init__(self, filepath: str = MEMORY_FILE):
|
|
@@ -41,16 +43,19 @@ class CoachMemory:
|
|
| 41 |
Args:
|
| 42 |
episode: Episode index.
|
| 43 |
agent: "coder" | "breaker" | "env".
|
| 44 |
-
observation:
|
| 45 |
coder_reward: Total coder reward for this step.
|
| 46 |
breaker_reward: Total breaker reward for this step.
|
| 47 |
-
extra: Optional
|
| 48 |
"""
|
|
|
|
|
|
|
| 49 |
lesson = {
|
| 50 |
"timestamp": datetime.utcnow().isoformat(),
|
| 51 |
"episode": episode,
|
| 52 |
"agent": agent,
|
| 53 |
"observation": observation,
|
|
|
|
| 54 |
"coder_reward": coder_reward,
|
| 55 |
"breaker_reward": breaker_reward,
|
| 56 |
}
|
|
@@ -60,16 +65,13 @@ class CoachMemory:
|
|
| 60 |
self.lessons.append(lesson)
|
| 61 |
self.save()
|
| 62 |
|
| 63 |
-
def get_lessons(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
"""
|
| 65 |
Retrieve stored lessons, optionally filtered by agent and/or limited to the last N.
|
| 66 |
-
|
| 67 |
-
Args:
|
| 68 |
-
agent: Filter to a specific agent ("coder", "breaker", "env"), or None for all.
|
| 69 |
-
last_n: Return only the last N lessons if provided.
|
| 70 |
-
|
| 71 |
-
Returns:
|
| 72 |
-
List of lesson dicts.
|
| 73 |
"""
|
| 74 |
result = self.lessons
|
| 75 |
if agent is not None:
|
|
@@ -78,28 +80,29 @@ class CoachMemory:
|
|
| 78 |
result = result[-last_n:]
|
| 79 |
return result
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
def summary(self) -> dict[str, Any]:
|
| 82 |
-
"""
|
| 83 |
-
Return a high-level summary of stored lessons.
|
| 84 |
-
"""
|
| 85 |
if not self.lessons:
|
| 86 |
return {"total_lessons": 0, "episodes_seen": 0}
|
| 87 |
|
| 88 |
episodes = {l["episode"] for l in self.lessons}
|
| 89 |
-
coder_rewards
|
| 90 |
breaker_rewards = [l["breaker_reward"] for l in self.lessons]
|
| 91 |
|
| 92 |
return {
|
| 93 |
"total_lessons": len(self.lessons),
|
| 94 |
"episodes_seen": len(episodes),
|
| 95 |
-
"avg_coder_reward": round(sum(coder_rewards)
|
| 96 |
"avg_breaker_reward": round(sum(breaker_rewards) / len(breaker_rewards), 4),
|
|
|
|
| 97 |
}
|
| 98 |
|
| 99 |
def clear(self) -> None:
|
| 100 |
-
"""
|
| 101 |
-
Wipe all stored lessons (use with caution).
|
| 102 |
-
"""
|
| 103 |
self.lessons = []
|
| 104 |
self.save()
|
| 105 |
|
|
@@ -119,11 +122,89 @@ class CoachMemory:
|
|
| 119 |
with open(self.filepath, "r", encoding="utf-8") as f:
|
| 120 |
self.lessons = json.load(f)
|
| 121 |
except (json.JSONDecodeError, IOError):
|
| 122 |
-
# Start fresh if file is corrupted
|
| 123 |
self.lessons = []
|
| 124 |
else:
|
| 125 |
self.lessons = []
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
# ──────────────────────────────────────────────
|
| 128 |
# Internal helpers
|
| 129 |
# ──────────────────────────────────────────────
|
|
|
|
| 1 |
# memory.py
|
| 2 |
# Coach Memory system for FORGE-v4.
|
| 3 |
+
# Stores structured lessons learned across episodes in a JSON file.
|
| 4 |
|
| 5 |
import json
|
| 6 |
import os
|
|
|
|
| 14 |
Persistent memory that accumulates lessons learned across training episodes.
|
| 15 |
|
| 16 |
Lessons are stored as a list of dicts in a JSON file and loaded on startup.
|
| 17 |
+
Each lesson includes a human-readable "coach_note" derived from the metrics
|
| 18 |
+
so the history is understandable without post-processing.
|
| 19 |
"""
|
| 20 |
|
| 21 |
def __init__(self, filepath: str = MEMORY_FILE):
|
|
|
|
| 43 |
Args:
|
| 44 |
episode: Episode index.
|
| 45 |
agent: "coder" | "breaker" | "env".
|
| 46 |
+
observation: Raw observation string from the environment.
|
| 47 |
coder_reward: Total coder reward for this step.
|
| 48 |
breaker_reward: Total breaker reward for this step.
|
| 49 |
+
extra: Optional metadata (pass_rate, fail_count, etc.).
|
| 50 |
"""
|
| 51 |
+
coach_note = self._derive_coach_note(episode, extra or {})
|
| 52 |
+
|
| 53 |
lesson = {
|
| 54 |
"timestamp": datetime.utcnow().isoformat(),
|
| 55 |
"episode": episode,
|
| 56 |
"agent": agent,
|
| 57 |
"observation": observation,
|
| 58 |
+
"coach_note": coach_note,
|
| 59 |
"coder_reward": coder_reward,
|
| 60 |
"breaker_reward": breaker_reward,
|
| 61 |
}
|
|
|
|
| 65 |
self.lessons.append(lesson)
|
| 66 |
self.save()
|
| 67 |
|
| 68 |
+
def get_lessons(
|
| 69 |
+
self,
|
| 70 |
+
agent: str | None = None,
|
| 71 |
+
last_n: int | None = None,
|
| 72 |
+
) -> list[dict[str, Any]]:
|
| 73 |
"""
|
| 74 |
Retrieve stored lessons, optionally filtered by agent and/or limited to the last N.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
"""
|
| 76 |
result = self.lessons
|
| 77 |
if agent is not None:
|
|
|
|
| 80 |
result = result[-last_n:]
|
| 81 |
return result
|
| 82 |
|
| 83 |
+
def get_coach_notes(self, last_n: int = 5) -> list[str]:
    """
    Return the coach notes found in the most recent lessons.

    Only the last ``last_n`` lessons are inspected; lessons without a
    non-empty "coach_note" are skipped, so fewer than ``last_n`` notes may
    be returned.

    Args:
        last_n: How many of the newest lessons to look at. Zero or a
            negative value yields an empty list.

    Returns:
        Coach note strings, oldest first.
    """
    if last_n <= 0:
        # lessons[-0:] is the whole list, not an empty one.
        return []
    return [
        lesson["coach_note"]
        for lesson in self.lessons[-last_n:]
        if lesson.get("coach_note")
    ]
|
| 86 |
+
|
| 87 |
def summary(self) -> dict[str, Any]:
    """
    Summarize the stored lessons.

    Returns:
        With no lessons: only "total_lessons" and "episodes_seen", both 0.
        Otherwise also the mean coder and breaker rewards (rounded to 4
        decimal places) and the coach notes from the last 3 lessons.
    """
    lessons = self.lessons
    if not lessons:
        return {"total_lessons": 0, "episodes_seen": 0}

    episode_ids = {entry["episode"] for entry in lessons}
    count = len(lessons)
    mean_coder = sum(entry["coder_reward"] for entry in lessons) / count
    mean_breaker = sum(entry["breaker_reward"] for entry in lessons) / count

    return {
        "total_lessons": count,
        "episodes_seen": len(episode_ids),
        "avg_coder_reward": round(mean_coder, 4),
        "avg_breaker_reward": round(mean_breaker, 4),
        "recent_coach_notes": self.get_coach_notes(last_n=3),
    }
|
| 103 |
|
| 104 |
def clear(self) -> None:
    """Drop every stored lesson, then save the now-empty list (use with caution)."""
    self.lessons = list()
    self.save()
|
| 108 |
|
|
|
|
| 122 |
with open(self.filepath, "r", encoding="utf-8") as f:
|
| 123 |
self.lessons = json.load(f)
|
| 124 |
except (json.JSONDecodeError, IOError):
|
|
|
|
| 125 |
self.lessons = []
|
| 126 |
else:
|
| 127 |
self.lessons = []
|
| 128 |
|
| 129 |
+
# ──────────────────────────────────────────────
|
| 130 |
+
# Coach note derivation
|
| 131 |
+
# ──────────────────────────────────────────────
|
| 132 |
+
|
| 133 |
+
def _derive_coach_note(self, episode: int, extra: dict[str, Any]) -> str:
|
| 134 |
+
"""
|
| 135 |
+
Generate a human-readable coaching note from step metadata.
|
| 136 |
+
|
| 137 |
+
Examples:
|
| 138 |
+
"Episode 4: Coder failed on duplicates → handle duplicate values safely"
|
| 139 |
+
"Episode 8: Coder timed out on large arrays → avoid O(n²) for large inputs"
|
| 140 |
+
"Episode 2: Strong performance (pass_rate=1.00) → keep current strategy"
|
| 141 |
+
"""
|
| 142 |
+
pass_rate = extra.get("coder_pass_rate", None)
|
| 143 |
+
fail_count = extra.get("fail_count", 0)
|
| 144 |
+
error_count = extra.get("error_count", 0)
|
| 145 |
+
timeout_count = extra.get("timeout_count", 0)
|
| 146 |
+
breaker_tier = extra.get("breaker_tier", 1)
|
| 147 |
+
coder_version = extra.get("coder_version", "unknown")
|
| 148 |
+
recent_case = extra.get("recent_breaker_case", [])
|
| 149 |
+
|
| 150 |
+
prefix = f"Episode {episode}"
|
| 151 |
+
|
| 152 |
+
# Timeout note
|
| 153 |
+
if timeout_count > 0:
|
| 154 |
+
return (
|
| 155 |
+
f"{prefix}: Coder timed out on {timeout_count} test(s)"
|
| 156 |
+
f" [tier={breaker_tier}] → avoid O(n²) or infinite loops for large inputs"
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
# Error note
|
| 160 |
+
if error_count > 0 and pass_rate is not None and pass_rate < 0.5:
|
| 161 |
+
return (
|
| 162 |
+
f"{prefix}: Coder raised errors on {error_count} test(s)"
|
| 163 |
+
f" → add input validation and handle edge cases"
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
# Negative/duplicate failure detection from recent breaker case
|
| 167 |
+
if fail_count > 0 and recent_case:
|
| 168 |
+
has_neg = any(x < 0 for x in recent_case)
|
| 169 |
+
has_dups = len(recent_case) != len(set(recent_case))
|
| 170 |
+
is_large = len(recent_case) >= 10
|
| 171 |
+
|
| 172 |
+
if has_neg and has_dups:
|
| 173 |
+
return (
|
| 174 |
+
f"{prefix}: Coder ({coder_version}) failed on negatives+duplicates"
|
| 175 |
+
f" → ensure sort key uses true value, not abs()"
|
| 176 |
+
)
|
| 177 |
+
if has_neg:
|
| 178 |
+
return (
|
| 179 |
+
f"{prefix}: Coder ({coder_version}) failed on negative values"
|
| 180 |
+
f" → handle negative integers in comparison logic"
|
| 181 |
+
)
|
| 182 |
+
if has_dups:
|
| 183 |
+
return (
|
| 184 |
+
f"{prefix}: Coder ({coder_version}) failed on duplicate values"
|
| 185 |
+
f" → ensure stable sort handles equal elements correctly"
|
| 186 |
+
)
|
| 187 |
+
if is_large:
|
| 188 |
+
return (
|
| 189 |
+
f"{prefix}: Coder ({coder_version}) failed on large array (n={len(recent_case)})"
|
| 190 |
+
f" → consider O(n log n) algorithm"
|
| 191 |
+
)
|
| 192 |
+
return (
|
| 193 |
+
f"{prefix}: Coder ({coder_version}) failed {fail_count} test(s)"
|
| 194 |
+
f" at breaker {breaker_tier} → review edge case handling"
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
# Good performance
|
| 198 |
+
if pass_rate is not None and pass_rate >= 0.8:
|
| 199 |
+
return (
|
| 200 |
+
f"{prefix}: Strong performance (pass_rate={pass_rate:.2f})"
|
| 201 |
+
f" [{coder_version}] → breaker should escalate tier"
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Generic fallback
|
| 205 |
+
pr = f"{pass_rate:.2f}" if pass_rate is not None else "N/A"
|
| 206 |
+
return f"{prefix}: pass_rate={pr}, fail={fail_count}, errors={error_count}"
|
| 207 |
+
|
| 208 |
# ──────────────────────────────────────────────
|
| 209 |
# Internal helpers
|
| 210 |
# ──────────────────────────────────────────────
|
FORGE-v4/trainer.py
CHANGED
|
@@ -1,47 +1,44 @@
|
|
| 1 |
# trainer.py
|
| 2 |
-
#
|
| 3 |
-
#
|
|
|
|
| 4 |
|
| 5 |
from typing import Any, Callable
|
| 6 |
from env import FORGEEnv
|
| 7 |
from memory import CoachMemory
|
|
|
|
|
|
|
| 8 |
from config import MAX_EPISODES, STEPS_PER_EPISODE
|
| 9 |
|
| 10 |
|
| 11 |
# ──────────────────────────────────────────────
|
| 12 |
-
#
|
| 13 |
# ──────────────────────────────────────────────
|
| 14 |
|
| 15 |
-
def
|
| 16 |
"""
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
|
|
|
| 23 |
"""
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
def default_breaker_policy(state: dict[str, Any]) -> list[dict[str, Any]]:
|
| 29 |
-
"""
|
| 30 |
-
Placeholder Breaker policy.
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# TODO: Replace with adversarial LLM inference call
|
| 38 |
-
return [
|
| 39 |
-
{"input": [], "expected_output": []},
|
| 40 |
-
{"input": [1], "expected_output": [1]},
|
| 41 |
-
{"input": [3, 1, 2], "expected_output": [1, 2, 3]},
|
| 42 |
-
{"input": [-5, -1, -3], "expected_output": [-5, -3, -1]},
|
| 43 |
-
{"input": [0, 0, 0, 0], "expected_output": [0, 0, 0, 0]},
|
| 44 |
-
]
|
| 45 |
|
| 46 |
|
| 47 |
# ──────────────────────────────────────────────
|
|
@@ -49,81 +46,133 @@ def default_breaker_policy(state: dict[str, Any]) -> list[dict[str, Any]]:
|
|
| 49 |
# ──────────────────────────────────────────────
|
| 50 |
|
| 51 |
def train(
|
| 52 |
-
coder_policy: Callable[[dict[str, Any]], str] = default_coder_policy,
|
| 53 |
-
breaker_policy: Callable[[dict[str, Any]], list[dict[str, Any]]] = default_breaker_policy,
|
| 54 |
num_episodes: int = MAX_EPISODES,
|
| 55 |
verbose: bool = True,
|
| 56 |
) -> dict[str, Any]:
|
| 57 |
"""
|
| 58 |
Run the FORGE-v4 training loop.
|
| 59 |
|
|
|
|
|
|
|
|
|
|
| 60 |
Args:
|
| 61 |
-
coder_policy:
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
verbose: Print per-episode summaries when True.
|
| 65 |
|
| 66 |
Returns:
|
| 67 |
-
Training summary dict
|
| 68 |
"""
|
| 69 |
memory = CoachMemory()
|
| 70 |
-
env
|
| 71 |
|
| 72 |
episode_history: list[dict[str, Any]] = []
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
for ep in range(1, num_episodes + 1):
|
| 75 |
state = env.reset()
|
| 76 |
-
episode_coder_rewards = []
|
| 77 |
-
episode_breaker_rewards = []
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
"breaker_tests": breaker_tests,
|
| 87 |
-
}
|
| 88 |
|
| 89 |
-
|
|
|
|
| 90 |
result = env.step(action)
|
| 91 |
state = result["state"]
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
if result["done"]:
|
| 97 |
break
|
| 98 |
|
| 99 |
# ── Episode summary ────────────────────────────────────────────────
|
| 100 |
-
|
| 101 |
-
|
| 102 |
|
| 103 |
ep_summary = {
|
| 104 |
"episode": ep,
|
| 105 |
-
"
|
| 106 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
"steps": env.step_count,
|
| 108 |
}
|
| 109 |
episode_history.append(ep_summary)
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
if verbose:
|
|
|
|
| 112 |
print(
|
| 113 |
-
f"[
|
| 114 |
-
f"
|
| 115 |
-
f"
|
|
|
|
|
|
|
|
|
|
| 116 |
)
|
| 117 |
|
| 118 |
-
# ── TRL / Unsloth hook
|
| 119 |
_on_episode_end(ep, ep_summary, memory)
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
}
|
| 126 |
-
return training_summary
|
| 127 |
|
| 128 |
|
| 129 |
# ──────────────────────────────────────────────
|
|
@@ -136,23 +185,20 @@ def _on_episode_end(
|
|
| 136 |
memory: CoachMemory,
|
| 137 |
) -> None:
|
| 138 |
"""
|
| 139 |
-
Called at
|
| 140 |
|
| 141 |
TODO: Plug in TRL PPOTrainer / Unsloth model updates here.
|
| 142 |
E.g.:
|
| 143 |
trainer.step(queries, responses, rewards)
|
| 144 |
model.save_pretrained(f"models/checkpoint-ep{episode}")
|
| 145 |
"""
|
| 146 |
-
pass
|
| 147 |
|
| 148 |
|
| 149 |
-
def _on_step_end(
|
| 150 |
-
step: int,
|
| 151 |
-
result: dict[str, Any],
|
| 152 |
-
) -> None:
|
| 153 |
"""
|
| 154 |
Called after every environment step.
|
| 155 |
|
| 156 |
-
TODO: Plug in per-step reward logging (
|
| 157 |
"""
|
| 158 |
-
pass
|
|
|
|
| 1 |
# trainer.py
|
| 2 |
+
# Training loop for FORGE-v4.
|
| 3 |
+
# Uses the real coder strategies and tiered BreakerAgent from agents.py.
|
| 4 |
+
# Hook placeholders are ready for TRL / Unsloth / Hugging Face integration.
|
| 5 |
|
| 6 |
from typing import Any, Callable
|
| 7 |
from env import FORGEEnv
|
| 8 |
from memory import CoachMemory
|
| 9 |
+
from agents import get_coder_code, coder_version_label
|
| 10 |
+
from logger import log_episode, update_summary
|
| 11 |
from config import MAX_EPISODES, STEPS_PER_EPISODE
|
| 12 |
|
| 13 |
|
| 14 |
# ──────────────────────────────────────────────
|
| 15 |
+
# Built-in coder policies
|
| 16 |
# ──────────────────────────────────────────────
|
| 17 |
|
| 18 |
+
def make_coder_policy(version: str) -> Callable[[dict[str, Any]], dict[str, str]]:
    """
    Build a coder policy bound to one named coder strategy.

    The returned callable takes a state dict, reads its ``episode`` entry
    (defaulting to 1), asks ``get_coder_code`` for the code of ``version``
    at that episode, and returns the action dict:
        {"coder_code": str, "coder_version": str}

    Args:
        version: "weak_coder_v1" | "weak_coder_v2" | "improving_coder"
    """
    def policy(state: dict[str, Any]) -> dict[str, str]:
        return {
            "coder_code": get_coder_code(version, episode=state.get("episode", 1)),
            "coder_version": version,
        }

    return policy
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
# Convenience pre-built policies
|
| 36 |
+
weak_coder_v1_policy = make_coder_policy("weak_coder_v1")
|
| 37 |
+
weak_coder_v2_policy = make_coder_policy("weak_coder_v2")
|
| 38 |
+
improving_coder_policy = make_coder_policy("improving_coder")
|
| 39 |
|
| 40 |
+
# Default used by app.py
|
| 41 |
+
default_coder_policy = improving_coder_policy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
# ──────────────────────────────────────────────
|
|
|
|
| 46 |
# ──────────────────────────────────────────────
|
| 47 |
|
| 48 |
def train(
    coder_policy: Callable[[dict[str, Any]], dict[str, str]] = default_coder_policy,
    num_episodes: int = MAX_EPISODES,
    verbose: bool = True,
) -> dict[str, Any]:
    """
    Run the FORGE-v4 training loop.

    The BreakerAgent is managed by the environment — it automatically tiers up
    based on performance. Only the coder policy needs to be supplied here.

    Each episode runs up to ``STEPS_PER_EPISODE`` steps (stopping early when
    the environment reports ``done``), is logged via ``log_episode``, and is
    passed to ``_on_episode_end``. After the last episode ``update_summary``
    is called with the aggregated per-step metrics.

    Args:
        coder_policy: Callable(state) → {"coder_code": str, "coder_version": str}
        num_episodes: Number of episodes to run.
        verbose: Print per-episode summaries when True.

    Returns:
        Training summary dict with keys ``total_episodes``,
        ``episode_history`` and ``memory_summary``.
    """
    def avg(values: list) -> float:
        # Mean rounded to 4 decimals; 0.0 for an empty list.
        return round(sum(values) / len(values), 4) if values else 0.0

    memory = CoachMemory()
    env = FORGEEnv(memory=memory)

    episode_history: list[dict[str, Any]] = []

    # Aggregate accumulators for final summary
    all_coder_rewards: list[float] = []
    all_breaker_rewards: list[float] = []
    all_pass_rates: list[float] = []
    all_break_rates: list[float] = []

    for ep in range(1, num_episodes + 1):
        state = env.reset()

        # Pre-bound so the episode summary below still works if the step
        # loop body never runs (STEPS_PER_EPISODE == 0).
        action: dict[str, str] = {}

        ep_coder_rewards: list[float] = []
        ep_breaker_rewards: list[float] = []
        ep_pass_rates: list[float] = []
        ep_fail_counts: list[int] = []
        ep_error_counts: list[int] = []
        ep_timeout_counts: list[int] = []
        ep_break_rates: list[float] = []

        for _ in range(STEPS_PER_EPISODE):
            action = coder_policy(state)
            result = env.step(action)
            state = result["state"]

            cr = result["coder_reward"]
            br = result["breaker_reward"]

            ep_coder_rewards.append(cr["total_reward"])
            ep_breaker_rewards.append(br["total_reward"])
            ep_pass_rates.append(cr["pass_rate"])
            ep_fail_counts.append(cr["fail_count"])
            ep_error_counts.append(cr["error_count"])
            # Previously this appended cr["error_count"], so the logged timeout
            # total merely duplicated the error total.
            # NOTE(review): assumes the coder reward dict exposes
            # "timeout_count"; falls back to 0 when it does not — confirm
            # against rewards.py.
            ep_timeout_counts.append(cr.get("timeout_count", 0))
            ep_break_rates.append(br["break_rate"])

            if result["done"]:
                break

        # ── Episode summary ────────────────────────────────────────────────
        ep_summary = {
            "episode": ep,
            "coder_version": action.get("coder_version", "unknown"),
            "breaker_tier": env.breaker.current_tier,
            "avg_coder_reward": avg(ep_coder_rewards),
            "avg_breaker_reward": avg(ep_breaker_rewards),
            "avg_pass_rate": avg(ep_pass_rates),
            "avg_break_rate": avg(ep_break_rates),
            "steps": env.step_count,
        }
        episode_history.append(ep_summary)

        # ── Log episode to CSV ─────────────────────────────────────────────
        log_episode(
            episode=ep,
            coder_version=ep_summary["coder_version"],
            breaker_tier=ep_summary["breaker_tier"],
            avg_coder_reward=ep_summary["avg_coder_reward"],
            avg_breaker_reward=ep_summary["avg_breaker_reward"],
            avg_pass_rate=ep_summary["avg_pass_rate"],
            total_fail_count=sum(ep_fail_counts),
            total_error_count=sum(ep_error_counts),
            total_timeout_count=sum(ep_timeout_counts),
            avg_break_rate=ep_summary["avg_break_rate"],
            steps=ep_summary["steps"],
        )

        # ── Accumulate for final summary ───────────────────────────────────
        all_coder_rewards.extend(ep_coder_rewards)
        all_breaker_rewards.extend(ep_breaker_rewards)
        all_pass_rates.extend(ep_pass_rates)
        all_break_rates.extend(ep_break_rates)

        if verbose:
            label = coder_version_label(ep_summary["coder_version"], ep)
            print(
                f"  [Ep {ep:>3}] Coder: {label:<50} "
                f"pass={ep_summary['avg_pass_rate']:.2f}  "
                f"reward={ep_summary['avg_coder_reward']:+.2f}  |  "
                f"Breaker: {env.breaker.tier_name:<22} "
                f"break={ep_summary['avg_break_rate']:.2f}  "
                f"reward={ep_summary['avg_breaker_reward']:+.2f}"
            )

        # ── TRL / Unsloth hook ─────────────────────────────────────────────
        _on_episode_end(ep, ep_summary, memory)

    # ── Final summary JSON ────────────────────────────────────────────────
    update_summary(
        total_episodes=num_episodes,
        coder_version=episode_history[-1]["coder_version"] if episode_history else "unknown",
        final_breaker_tier=env.breaker.current_tier,
        all_coder_rewards=all_coder_rewards,
        all_breaker_rewards=all_breaker_rewards,
        all_pass_rates=all_pass_rates,
        all_break_rates=all_break_rates,
        coach_memory_summary=memory.summary(),
    )

    return {
        "total_episodes": num_episodes,
        "episode_history": episode_history,
        "memory_summary": memory.summary(),
    }
|
|
|
|
| 176 |
|
| 177 |
|
| 178 |
# ──────────────────────────────────────────────
|
|
|
|
| 185 |
memory: CoachMemory,
|
| 186 |
) -> None:
|
| 187 |
"""
|
| 188 |
+
Called at end of every episode.
|
| 189 |
|
| 190 |
TODO: Plug in TRL PPOTrainer / Unsloth model updates here.
|
| 191 |
E.g.:
|
| 192 |
trainer.step(queries, responses, rewards)
|
| 193 |
model.save_pretrained(f"models/checkpoint-ep{episode}")
|
| 194 |
"""
|
| 195 |
+
pass
|
| 196 |
|
| 197 |
|
| 198 |
+
def _on_step_end(step: int, result: dict[str, Any]) -> None:
|
|
|
|
|
|
|
|
|
|
| 199 |
"""
|
| 200 |
Called after every environment step.
|
| 201 |
|
| 202 |
+
TODO: Plug in per-step reward logging (W&B, TensorBoard) here.
|
| 203 |
"""
|
| 204 |
+
pass
|
attached_assets/Pasted-Upgrade-the-existing-FORGE-v4-project-from-starter-leve_1777106296176.txt
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Upgrade the existing FORGE-v4 project from a starter-level skeleton into a stronger hackathon-ready backend prototype.
|
| 2 |
+
|
| 3 |
+
Do NOT rebuild from scratch. Modify the current files intelligently.
|
| 4 |
+
|
| 5 |
+
Current Issues To Fix:
|
| 6 |
+
|
| 7 |
+
1. Placeholder Coder currently uses Python sorted() and is too perfect.
|
| 8 |
+
2. Breaker attacks are static and too weak.
|
| 9 |
+
3. Reward logs / metrics are not realistic enough.
|
| 10 |
+
4. OpenEnv environment state/action flow needs stronger clarity.
|
| 11 |
+
|
| 12 |
+
Your task is to upgrade the existing project with the following improvements:
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## A. Replace Perfect Placeholder Coder
|
| 17 |
+
|
| 18 |
+
Create multiple baseline coder strategies inside the project:
|
| 19 |
+
|
| 20 |
+
1. weak_coder_v1
|
| 21 |
+
|
| 22 |
+
* bubble sort style
|
| 23 |
+
* slow for large arrays
|
| 24 |
+
|
| 25 |
+
2. weak_coder_v2
|
| 26 |
+
|
| 27 |
+
* handles normal arrays
|
| 28 |
+
* fails on duplicates or negatives sometimes
|
| 29 |
+
|
| 30 |
+
3. improving_coder
|
| 31 |
+
|
| 32 |
+
* chooses stronger strategy based on episode count
|
| 33 |
+
|
| 34 |
+
Use these instead of always using sorted().
|
| 35 |
+
|
| 36 |
+
This is important so learning progress can be shown later.
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## B. Upgrade Breaker into Tiered Adversarial System
|
| 41 |
+
|
| 42 |
+
Implement breaker difficulty tiers.
|
| 43 |
+
|
| 44 |
+
Tier 1:
|
| 45 |
+
[]
|
| 46 |
+
[1]
|
| 47 |
+
[2,1]
|
| 48 |
+
|
| 49 |
+
Tier 2:
|
| 50 |
+
duplicates
|
| 51 |
+
negative values
|
| 52 |
+
already sorted
|
| 53 |
+
reverse sorted
|
| 54 |
+
|
| 55 |
+
Tier 3:
|
| 56 |
+
large arrays
|
| 57 |
+
many duplicates
|
| 58 |
+
hard distributions
|
| 59 |
+
|
| 60 |
+
Tier 4:
|
| 61 |
+
boundary integer values
|
| 62 |
+
stress tests
|
| 63 |
+
|
| 64 |
+
Unlock next tier based on breaker success rate or episode progress.
|
| 65 |
+
|
| 66 |
+
Breaker should dynamically choose test cases based on current tier.
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## C. Add Real Metrics + Reward Logging
|
| 71 |
+
|
| 72 |
+
Write log outputs to a logs folder, such as:
|
| 73 |
+
|
| 74 |
+
logs/rewards.json
|
| 75 |
+
logs/episodes.csv
|
| 76 |
+
logs/summary.json
|
| 77 |
+
|
| 78 |
+
Track:
|
| 79 |
+
|
| 80 |
+
* episode number
|
| 81 |
+
* coder reward
|
| 82 |
+
* breaker reward
|
| 83 |
+
* pass rate
|
| 84 |
+
* current tier
|
| 85 |
+
* number of failed tests
|
| 86 |
+
* timeout count
|
| 87 |
+
|
| 88 |
+
Also create helper functions to export metrics cleanly.
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## D. Improve OpenEnv Style Clarity
|
| 93 |
+
|
| 94 |
+
In env.py make state/action flow cleaner.
|
| 95 |
+
|
| 96 |
+
Environment state should include:
|
| 97 |
+
|
| 98 |
+
{
|
| 99 |
+
task_id,
|
| 100 |
+
problem_description,
|
| 101 |
+
episode_step,
|
| 102 |
+
coder_version,
|
| 103 |
+
current_tier,
|
| 104 |
+
recent_breaker_case,
|
| 105 |
+
pass_rate_history,
|
| 106 |
+
coach_memory_summary
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
step(action) should clearly:
|
| 110 |
+
|
| 111 |
+
1. run coder
|
| 112 |
+
2. run breaker
|
| 113 |
+
3. sandbox evaluate
|
| 114 |
+
4. assign rewards
|
| 115 |
+
5. update memory
|
| 116 |
+
6. log metrics
|
| 117 |
+
7. return next_state
|
| 118 |
+
|
| 119 |
+
---
|
| 120 |
+
|
| 121 |
+
## E. Improve Coach Memory
|
| 122 |
+
|
| 123 |
+
Store lessons like:
|
| 124 |
+
|
| 125 |
+
Episode 4:
|
| 126 |
+
Coder failed on duplicates
|
| 127 |
+
Lesson: handle duplicate values safely
|
| 128 |
+
|
| 129 |
+
Episode 8:
|
| 130 |
+
Coder timed out on large arrays
|
| 131 |
+
Lesson: avoid O(n²) for large arrays
|
| 132 |
+
|
| 133 |
+
---
|
| 134 |
+
|
| 135 |
+
## F. Keep Existing Structure
|
| 136 |
+
|
| 137 |
+
Do not remove current modular structure.
|
| 138 |
+
|
| 139 |
+
Files should still use:
|
| 140 |
+
|
| 141 |
+
app.py
|
| 142 |
+
env.py
|
| 143 |
+
tasks.py
|
| 144 |
+
rewards.py
|
| 145 |
+
sandbox.py
|
| 146 |
+
memory.py
|
| 147 |
+
trainer.py
|
| 148 |
+
config.py
|
| 149 |
+
|
| 150 |
+
---
|
| 151 |
+
|
| 152 |
+
## G. Final Result Needed
|
| 153 |
+
|
| 154 |
+
After modifications, python app.py should run successfully and show:
|
| 155 |
+
|
| 156 |
+
* coder version used
|
| 157 |
+
* breaker tier used
|
| 158 |
+
* test result summary
|
| 159 |
+
* rewards
|
| 160 |
+
* logs updated
|
| 161 |
+
* coach lessons updated
|
| 162 |
+
|
| 163 |
+
Keep code clean, modular, production-ready, and easy for later VS Code + Copilot + Google Colab upgrades.
|
replit.md
CHANGED
|
@@ -25,3 +25,43 @@ pnpm workspace monorepo using TypeScript. Each package manages its own dependenc
|
|
| 25 |
- `pnpm --filter @workspace/api-server run dev` — run API server locally
|
| 26 |
|
| 27 |
See the `pnpm-workspace` skill for workspace structure, TypeScript setup, and package details.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
- `pnpm --filter @workspace/api-server run dev` — run API server locally
|
| 26 |
|
| 27 |
See the `pnpm-workspace` skill for workspace structure, TypeScript setup, and package details.
|
| 28 |
+
|
| 29 |
+
## FORGE-v4 (Python — Adversarial RL Environment)
|
| 30 |
+
|
| 31 |
+
Located at `FORGE-v4/`. A standalone Python project; run independently of the pnpm workspace.
|
| 32 |
+
|
| 33 |
+
### Quick start
|
| 34 |
+
```bash
|
| 35 |
+
cd FORGE-v4
|
| 36 |
+
python3 app.py # improving_coder vs tiered Breaker
|
| 37 |
+
python3 app.py --coder weak_coder_v1 # bubble sort strategy
|
| 38 |
+
python3 app.py --coder weak_coder_v2 # selection sort w/ abs() bug
|
| 39 |
+
python3 app.py --steps 5 # override step count
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
### Key files
|
| 43 |
+
| File | Purpose |
|
| 44 |
+
|------|---------|
|
| 45 |
+
| `app.py` | CLI entry point |
|
| 46 |
+
| `env.py` | `FORGEEnv` — reset/step/get_state |
|
| 47 |
+
| `agents.py` | Coder strategies + `BreakerAgent` (tiered) |
|
| 48 |
+
| `tasks.py` | Task and hidden test generation |
|
| 49 |
+
| `sandbox.py` | Subprocess code execution with timeout |
|
| 50 |
+
| `rewards.py` | `coder_reward()` / `breaker_reward()` |
|
| 51 |
+
| `memory.py` | `CoachMemory` — JSON-backed lessons |
|
| 52 |
+
| `logger.py` | Writes `logs/rewards.json`, `logs/episodes.csv`, `logs/summary.json` |
|
| 53 |
+
| `trainer.py` | Training loop + TRL/Unsloth hook placeholders |
|
| 54 |
+
| `config.py` | All constants |
|
| 55 |
+
|
| 56 |
+
### Coder strategies
|
| 57 |
+
- `weak_coder_v1` — bubble sort (O(n²), slow on large arrays)
|
| 58 |
+
- `weak_coder_v2` — selection sort with abs() bug (fails on negatives)
|
| 59 |
+
- `improving_coder` — bubble sort → selection sort → `sorted()` by episode
|
| 60 |
+
|
| 61 |
+
### Breaker tiers
|
| 62 |
+
- Tier 1: empty / single element / tiny arrays
|
| 63 |
+
- Tier 2: duplicates, negatives, sorted/reverse-sorted
|
| 64 |
+
- Tier 3: large arrays, heavy duplicates, stress cases
|
| 65 |
+
- Tier 4: boundary integers (±100), extreme stress
|
| 66 |
+
|
| 67 |
+
Tier unlocks at 60% break rate; Tier 3 needs episode ≥ 4, Tier 4 needs episode ≥ 7.
|