ksanjuma1234 committed on
Commit
656b5db
·
1 Parent(s): fc01d79

Enhance code generation environment with diverse coder strategies and tiered adversarial testing

Browse files

Implement multiple coder strategies (bubble sort, selection sort with bug, and an improving coder) and a tiered breaker agent with progressively harder test cases. Add comprehensive logging for rewards and episode metrics. Improve environment state clarity and coach memory with human-readable lessons.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: a7518b1f-70c7-4487-82d2-42195935723e
Replit-Commit-Checkpoint-Type: full_checkpoint
Replit-Commit-Event-Id: 6f92db1c-7ebb-4a38-b6ed-3dc81054bda2
Replit-Helium-Checkpoint-Created: true

FORGE-v4/agents.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agents.py
2
+ # Coder strategies and tiered Breaker agent for FORGE-v4.
3
+ #
4
+ # Coder strategies:
5
+ # weak_coder_v1 — bubble sort (O(n²), slow on large arrays)
6
+ # weak_coder_v2 — selection sort with a subtle bug on negatives
7
+ # improving_coder — picks strategy based on episode count
8
+ #
9
+ # Breaker agent:
10
+ # BreakerAgent — tiered adversarial test case generator
11
+
12
+ import random
13
+ from typing import Any
14
+ from config import (
15
+ ARRAY_VALUE_RANGE,
16
+ MAX_ARRAY_SIZE,
17
+ BREAKER_TIER_UNLOCK_RATE,
18
+ BREAKER_TIER3_MIN_EPISODE,
19
+ BREAKER_TIER4_MIN_EPISODE,
20
+ IMPROVING_CODER_TIER1_UNTIL,
21
+ IMPROVING_CODER_TIER2_UNTIL,
22
+ )
23
+
24
+
25
+ # ══════════════════════════════════════════════
26
+ # CODER STRATEGIES
27
+ # ══════════════════════════════════════════════
28
+
29
+ # Each strategy returns a Python source string that defines solution(arr).
30
+
31
+ WEAK_CODER_V1_CODE = '''
32
+ def solution(arr):
33
+ """Bubble sort — O(n²), fails slowly on large arrays."""
34
+ a = list(arr)
35
+ n = len(a)
36
+ for i in range(n):
37
+ for j in range(n - i - 1):
38
+ if a[j] > a[j + 1]:
39
+ a[j], a[j + 1] = a[j + 1], a[j]
40
+ return a
41
+ '''
42
+
43
+ WEAK_CODER_V2_CODE = '''
44
+ def solution(arr):
45
+ """
46
+ Selection sort — correct for positive-only arrays.
47
+ Bug: uses abs() comparison, so negatives can end up out of order.
48
+ """
49
+ a = list(arr)
50
+ n = len(a)
51
+ for i in range(n):
52
+ min_idx = i
53
+ for j in range(i + 1, n):
54
+ # BUG: comparing absolute values breaks negative ordering
55
+ if abs(a[j]) < abs(a[min_idx]):
56
+ min_idx = j
57
+ a[i], a[min_idx] = a[min_idx], a[i]
58
+ return a
59
+ '''
60
+
61
+ IMPROVING_CODER_TEMPLATE = '''
62
+ def solution(arr):
63
+ """
64
+ Improving coder — strategy selected by episode {episode}.
65
+ Episode <= {tier1_until}: bubble sort (weakest)
66
+ Episode <= {tier2_until}: selection sort (medium)
67
+ Episode > {tier2_until}: built-in sorted (strongest)
68
+ """
69
+ episode = {episode}
70
+ a = list(arr)
71
+
72
+ if episode <= {tier1_until}:
73
+ # Bubble sort
74
+ n = len(a)
75
+ for i in range(n):
76
+ for j in range(n - i - 1):
77
+ if a[j] > a[j + 1]:
78
+ a[j], a[j + 1] = a[j + 1], a[j]
79
+ return a
80
+ elif episode <= {tier2_until}:
81
+ # Selection sort with abs() bug
82
+ n = len(a)
83
+ for i in range(n):
84
+ min_idx = i
85
+ for j in range(i + 1, n):
86
+ if abs(a[j]) < abs(a[min_idx]):
87
+ min_idx = j
88
+ a[i], a[min_idx] = a[min_idx], a[i]
89
+ return a
90
+ else:
91
+ # Strong solution
92
+ return sorted(a)
93
+ '''
94
+
95
+
96
+ def get_coder_code(version: str, episode: int = 1) -> str:
97
+ """
98
+ Return the Python source code for the given coder version.
99
+
100
+ Args:
101
+ version: "weak_coder_v1" | "weak_coder_v2" | "improving_coder"
102
+ episode: current episode number (used by improving_coder)
103
+ """
104
+ if version == "weak_coder_v1":
105
+ return WEAK_CODER_V1_CODE
106
+
107
+ if version == "weak_coder_v2":
108
+ return WEAK_CODER_V2_CODE
109
+
110
+ if version == "improving_coder":
111
+ return IMPROVING_CODER_TEMPLATE.format(
112
+ episode=episode,
113
+ tier1_until=IMPROVING_CODER_TIER1_UNTIL,
114
+ tier2_until=IMPROVING_CODER_TIER2_UNTIL,
115
+ )
116
+
117
+ raise ValueError(f"Unknown coder version: {version!r}")
118
+
119
+
120
+ def coder_version_label(version: str, episode: int) -> str:
121
+ """Human-readable label for what strategy the coder is using this episode."""
122
+ if version == "weak_coder_v1":
123
+ return "weak_coder_v1 (bubble sort)"
124
+ if version == "weak_coder_v2":
125
+ return "weak_coder_v2 (selection sort / abs-bug)"
126
+ if version == "improving_coder":
127
+ if episode <= IMPROVING_CODER_TIER1_UNTIL:
128
+ return f"improving_coder → bubble sort (ep {episode} ≤ {IMPROVING_CODER_TIER1_UNTIL})"
129
+ if episode <= IMPROVING_CODER_TIER2_UNTIL:
130
+ return f"improving_coder → selection sort (ep {episode} ≤ {IMPROVING_CODER_TIER2_UNTIL})"
131
+ return f"improving_coder → sorted() (ep {episode} > {IMPROVING_CODER_TIER2_UNTIL})"
132
+ return version
133
+
134
+
135
+ # ══════════════════════════════════════════════
136
+ # TIERED BREAKER AGENT
137
+ # ══════════════════════════════════════════════
138
+
139
+ # Test case banks per tier
140
+ _TIER1_CASES: list[list[int]] = [
141
+ [],
142
+ [1],
143
+ [2, 1],
144
+ [3, 2, 1],
145
+ [1, 2, 3],
146
+ ]
147
+
148
+ _TIER2_CASES: list[list[int]] = [
149
+ [1, 1, 1, 1], # all duplicates
150
+ [2, 2, 1, 1, 3, 3], # duplicate pairs
151
+ [-5, -1, -3, -7, -2], # all negatives
152
+ [-3, 0, 3, -1, 1], # mixed sign
153
+ [1, 2, 3, 4, 5], # already sorted
154
+ [5, 4, 3, 2, 1], # reverse sorted
155
+ [0, 0, 0], # all zeros
156
+ ]
157
+
158
+ _TIER3_CASES: list[list[int]] = [
159
+ list(range(MAX_ARRAY_SIZE, 0, -1)), # full reverse
160
+ [random.choice([1, 2]) for _ in range(MAX_ARRAY_SIZE)], # heavy duplicates
161
+ [random.randint(-100, 100) for _ in range(MAX_ARRAY_SIZE)], # large random
162
+ [0] * MAX_ARRAY_SIZE, # all zeros, large
163
+ list(range(MAX_ARRAY_SIZE)), # sorted ascending, large
164
+ ]
165
+
166
+ _TIER4_CASES: list[list[int]] = [
167
+ [-100, 100], # boundary values only
168
+ [100, 100, 100, -100, -100, -100], # boundary duplicates
169
+ [-100] * 10 + [100] * 10, # boundary mixed
170
+ list(range(-10, 11)), # full range small
171
+ [random.randint(-100, 100) for _ in range(MAX_ARRAY_SIZE)], # stress random
172
+ ]
173
+
174
+
175
+ class BreakerAgent:
176
+ """
177
+ Adversarial test-case generator with four tiers of difficulty.
178
+
179
+ Tier unlocking rules:
180
+ Tier 2 → unlocks when avg break_rate (last 3 steps) >= BREAKER_TIER_UNLOCK_RATE
181
+ Tier 3 → unlocks when break_rate >= BREAKER_TIER_UNLOCK_RATE
182
+ AND episode >= BREAKER_TIER3_MIN_EPISODE
183
+ Tier 4 → unlocks when at tier 3 AND episode >= BREAKER_TIER4_MIN_EPISODE
184
+
185
+ The agent samples cases from all unlocked tiers, weighted toward the
186
+ current (highest) tier for maximum adversarial pressure.
187
+ """
188
+
189
+ def __init__(self) -> None:
190
+ self.current_tier: int = 1
191
+ self._recent_break_rates: list[float] = []
192
+
193
+ def update_tier(self, break_rate: float, episode: int) -> None:
194
+ """
195
+ Update the current tier based on recent performance and episode count.
196
+
197
+ Args:
198
+ break_rate: Breaker's break_rate from the last step.
199
+ episode: Current episode number.
200
+ """
201
+ self._recent_break_rates.append(break_rate)
202
+ # Use rolling window of last 3 steps to smooth noise
203
+ recent = self._recent_break_rates[-3:]
204
+ avg_break = sum(recent) / len(recent)
205
+
206
+ if self.current_tier == 1 and avg_break >= BREAKER_TIER_UNLOCK_RATE:
207
+ self.current_tier = 2
208
+
209
+ if self.current_tier == 2 and (
210
+ avg_break >= BREAKER_TIER_UNLOCK_RATE
211
+ and episode >= BREAKER_TIER3_MIN_EPISODE
212
+ ):
213
+ self.current_tier = 3
214
+
215
+ if self.current_tier == 3 and episode >= BREAKER_TIER4_MIN_EPISODE:
216
+ self.current_tier = 4
217
+
218
+ def get_tests(self, n_per_tier: int = 2) -> list[dict[str, Any]]:
219
+ """
220
+ Return adversarial test cases sampled from all unlocked tiers,
221
+ with extra weight on the current highest tier.
222
+
223
+ Args:
224
+ n_per_tier: Number of cases to sample from each unlocked tier.
225
+
226
+ Returns:
227
+ List of {"input": [...], "expected_output": [...]} dicts.
228
+ """
229
+ pools: list[tuple[int, list[list[int]]]] = [
230
+ (1, _TIER1_CASES),
231
+ (2, _TIER2_CASES),
232
+ (3, _TIER3_CASES),
233
+ (4, _TIER4_CASES),
234
+ ]
235
+
236
+ selected: list[list[int]] = []
237
+ for tier_num, pool in pools:
238
+ if tier_num > self.current_tier:
239
+ break
240
+ # Sample more from the highest tier
241
+ k = n_per_tier * 2 if tier_num == self.current_tier else n_per_tier
242
+ k = min(k, len(pool))
243
+ selected.extend(random.sample(pool, k))
244
+
245
+ # Remove duplicates (by converting to tuple for hashability)
246
+ seen: set[tuple[int, ...]] = set()
247
+ unique: list[list[int]] = []
248
+ for arr in selected:
249
+ key = tuple(arr)
250
+ if key not in seen:
251
+ seen.add(key)
252
+ unique.append(arr)
253
+
254
+ return [
255
+ {"input": arr, "expected_output": sorted(arr)}
256
+ for arr in unique
257
+ ]
258
+
259
+ @property
260
+ def tier_name(self) -> str:
261
+ """Human-readable tier label."""
262
+ from config import BREAKER_TIER_NAMES
263
+ return BREAKER_TIER_NAMES.get(self.current_tier, f"Tier-{self.current_tier}")
FORGE-v4/app.py CHANGED
@@ -1,84 +1,205 @@
1
  # app.py
2
  # Main runner script for FORGE-v4.
3
- # Runs a minimal CLI demo of one sample episode.
 
4
 
5
  import sys
6
  import json
 
7
  from env import FORGEEnv
8
  from memory import CoachMemory
9
- from trainer import default_coder_policy, default_breaker_policy
 
10
  from config import STEPS_PER_EPISODE
11
 
12
 
13
- def run_demo_episode() -> None:
 
 
 
 
 
 
14
  """
15
- Execute a single demo episode and print the results to stdout.
 
 
 
 
16
  """
17
- print("=" * 60)
18
- print(" FORGE-v4 | Adversarial Code Generation Environment")
19
- print("=" * 60)
20
 
21
- # Initialise coach memory and environment
22
  memory = CoachMemory()
 
23
  env = FORGEEnv(memory=memory)
24
-
25
- # Reset to start the episode
26
  state = env.reset()
27
 
28
- print(f"\n[Episode {state['episode']}] Task prompt:\n")
29
- print(state["task_prompt"])
 
 
 
 
 
 
 
30
  print()
31
 
32
- for step in range(1, STEPS_PER_EPISODE + 1):
33
- print(f"── Step {step}/{STEPS_PER_EPISODE} " + "─" * 40)
 
 
 
 
 
 
34
 
35
- # Agents produce their actions (placeholder policies for the demo)
36
- coder_code = default_coder_policy(state)
37
- breaker_tests = default_breaker_policy(state)
38
-
39
- action = {
40
- "coder_code": coder_code,
41
- "breaker_tests": breaker_tests,
42
- }
43
 
44
  result = env.step(action)
 
45
 
46
  cr = result["coder_reward"]
47
  br = result["breaker_reward"]
48
-
 
 
 
 
 
 
 
 
 
 
 
 
49
  print(
50
- f" Coder → pass_rate: {cr['pass_rate']:.2f} "
51
  f"| passes: {cr['pass_count']} "
52
  f"| fails: {cr['fail_count']} "
53
  f"| errors: {cr['error_count']} "
54
  f"| reward: {cr['total_reward']:+.2f}"
55
  )
56
  print(
57
- f" Breaker → break_rate: {br['break_rate']:.2f} "
58
  f"| breaks: {br['breaks']} "
59
- f"| passes: {br['passes']} "
60
  f"| reward: {br['total_reward']:+.2f}"
61
  )
 
 
 
62
 
63
  if result["done"]:
64
  break
65
 
66
- print("\n" + "=" * 60)
67
- print(" Episode complete. Coach memory summary:")
68
- print(json.dumps(memory.summary(), indent=2))
69
- print("=" * 60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def main() -> None:
73
- """Entry point — parse minimal CLI args and run."""
74
  args = sys.argv[1:]
75
 
76
  if "--help" in args or "-h" in args:
77
- print("Usage: python app.py [--steps N]")
78
- print(" --steps N Override STEPS_PER_EPISODE for this run (default: from config.py)")
79
  sys.exit(0)
80
 
81
- # Optional: override step count via CLI
 
 
 
 
 
 
 
 
 
 
 
 
82
  if "--steps" in args:
83
  idx = args.index("--steps")
84
  try:
@@ -88,7 +209,7 @@ def main() -> None:
88
  print("Error: --steps requires an integer argument.")
89
  sys.exit(1)
90
 
91
- run_demo_episode()
92
 
93
 
94
  if __name__ == "__main__":
 
1
  # app.py
2
  # Main runner script for FORGE-v4.
3
+ # Runs one demo episode with the improving_coder and tiered BreakerAgent,
4
+ # then prints a structured results report.
5
 
6
  import sys
7
  import json
8
+
9
  from env import FORGEEnv
10
  from memory import CoachMemory
11
+ from agents import get_coder_code, coder_version_label, BreakerAgent
12
+ from logger import log_episode, update_summary, print_log_paths
13
  from config import STEPS_PER_EPISODE
14
 
15
 
16
+ # ──────────────────────────────────────────────
17
+ # Demo configuration
18
+ # ──────────────────────────────────────────────
19
+ DEFAULT_CODER_VERSION = "improving_coder"
20
+
21
+
22
+ def run_demo_episode(coder_version: str = DEFAULT_CODER_VERSION) -> None:
23
  """
24
+ Execute one demo episode and print a rich results report.
25
+
26
+ Args:
27
+ coder_version: Which coder strategy to use.
28
+ "weak_coder_v1" | "weak_coder_v2" | "improving_coder"
29
  """
30
+ _banner()
 
 
31
 
 
32
  memory = CoachMemory()
33
+ memory.clear() # Start fresh for the demo run
34
  env = FORGEEnv(memory=memory)
 
 
35
  state = env.reset()
36
 
37
+ episode = state["episode"]
38
+ print(f"\n{'─'*60}")
39
+ print(f" Task ID : {state['task_id']}")
40
+ print(f" Episode : {episode}")
41
+ print(f" Coder : {coder_version_label(coder_version, episode)}")
42
+ print(f" Breaker : {env.breaker.tier_name} (starts here, tiers up during run)")
43
+ print(f"{'─'*60}")
44
+ print(f"\n Problem:\n")
45
+ print(f" {state['problem_description']}")
46
  print()
47
 
48
+ # ── Accumulators ──────────────────────────────────────────────────────
49
+ ep_coder_rewards: list[float] = []
50
+ ep_breaker_rewards: list[float] = []
51
+ ep_pass_rates: list[float] = []
52
+ ep_fail_counts: list[int] = []
53
+ ep_error_counts: list[int] = []
54
+ ep_timeout_counts: list[int] = []
55
+ ep_break_rates: list[float] = []
56
 
57
+ for step_num in range(1, STEPS_PER_EPISODE + 1):
58
+ # Build coder action
59
+ code = get_coder_code(coder_version, episode=episode)
60
+ action = {"coder_code": code, "coder_version": coder_version}
 
 
 
 
61
 
62
  result = env.step(action)
63
+ state = result["state"]
64
 
65
  cr = result["coder_reward"]
66
  br = result["breaker_reward"]
67
+ info = result["info"]
68
+
69
+ # Accumulate
70
+ ep_coder_rewards.append(cr["total_reward"])
71
+ ep_breaker_rewards.append(br["total_reward"])
72
+ ep_pass_rates.append(cr["pass_rate"])
73
+ ep_fail_counts.append(cr["fail_count"])
74
+ ep_error_counts.append(cr["error_count"])
75
+ ep_timeout_counts.append(cr["error_count"])
76
+ ep_break_rates.append(br["break_rate"])
77
+
78
+ # Per-step print
79
+ print(f" ── Step {step_num}/{STEPS_PER_EPISODE} [breaker: {info['breaker_tier_name']}]")
80
  print(
81
+ f" Coder → pass_rate: {cr['pass_rate']:.2f} "
82
  f"| passes: {cr['pass_count']} "
83
  f"| fails: {cr['fail_count']} "
84
  f"| errors: {cr['error_count']} "
85
  f"| reward: {cr['total_reward']:+.2f}"
86
  )
87
  print(
88
+ f" Breaker → break_rate: {br['break_rate']:.2f} "
89
  f"| breaks: {br['breaks']} "
90
+ f"| no-break: {br['passes']} "
91
  f"| reward: {br['total_reward']:+.2f}"
92
  )
93
+ if state.get("recent_breaker_case") is not None:
94
+ print(f" Recent adversarial input: {state['recent_breaker_case']}")
95
+ print()
96
 
97
  if result["done"]:
98
  break
99
 
100
+ # ── Episode log ───────────────────────────────────────────────────────
101
+ def avg(lst: list) -> float:
102
+ return round(sum(lst) / len(lst), 4) if lst else 0.0
103
+
104
+ log_episode(
105
+ episode=episode,
106
+ coder_version=coder_version,
107
+ breaker_tier=env.breaker.current_tier,
108
+ avg_coder_reward=avg(ep_coder_rewards),
109
+ avg_breaker_reward=avg(ep_breaker_rewards),
110
+ avg_pass_rate=avg(ep_pass_rates),
111
+ total_fail_count=sum(ep_fail_counts),
112
+ total_error_count=sum(ep_error_counts),
113
+ total_timeout_count=sum(ep_timeout_counts),
114
+ avg_break_rate=avg(ep_break_rates),
115
+ steps=env.step_count,
116
+ )
117
+
118
+ update_summary(
119
+ total_episodes=1,
120
+ coder_version=coder_version,
121
+ final_breaker_tier=env.breaker.current_tier,
122
+ all_coder_rewards=ep_coder_rewards,
123
+ all_breaker_rewards=ep_breaker_rewards,
124
+ all_pass_rates=ep_pass_rates,
125
+ all_break_rates=ep_break_rates,
126
+ coach_memory_summary=memory.summary(),
127
+ )
128
+
129
+ # ── Final report ──────────────────────────────────────────────────────
130
+ print(f"{'═'*60}")
131
+ print(" EPISODE SUMMARY")
132
+ print(f"{'═'*60}")
133
+ print(f" Coder version : {coder_version_label(coder_version, episode)}")
134
+ print(f" Final breaker tier : {env.breaker.tier_name}")
135
+ print(f" Avg pass rate : {avg(ep_pass_rates):.2f}")
136
+ print(f" Avg coder reward : {avg(ep_coder_rewards):+.4f}")
137
+ print(f" Avg breaker reward : {avg(ep_breaker_rewards):+.4f}")
138
+ print(f" Total fail count : {sum(ep_fail_counts)}")
139
+ print(f" Total error count : {sum(ep_error_counts)}")
140
+ print(f" Avg break rate : {avg(ep_break_rates):.2f}")
141
+ print()
142
+ print(" Coach memory summary:")
143
+ summary = memory.summary()
144
+ print(f" Lessons stored : {summary.get('total_lessons', 0)}")
145
+ notes = summary.get("recent_coach_notes", [])
146
+ if notes:
147
+ print(" Recent coach notes:")
148
+ for note in notes:
149
+ print(f" • {note}")
150
+ print()
151
+ print(" Log files updated:")
152
+ print_log_paths()
153
+ print(f"{'═'*60}")
154
+
155
 
156
+ # ──────────────────────────────────────────────
157
+ # Helpers
158
+ # ──────────────────────────────────────────────
159
+
160
+ def _banner() -> None:
161
+ print()
162
+ print("╔══════════════════════════════════════════════════════════╗")
163
+ print("║ FORGE-v4 | Adversarial Code Generation Environment ║")
164
+ print("╚══════════════════════════════════════════════════════════╝")
165
+
166
+
167
+ def _print_help() -> None:
168
+ print("Usage: python app.py [OPTIONS]")
169
+ print()
170
+ print("Options:")
171
+ print(" --coder VERSION Coder strategy to use:")
172
+ print(" weak_coder_v1 (bubble sort — slow/weak)")
173
+ print(" weak_coder_v2 (selection sort + abs() bug)")
174
+ print(" improving_coder (adapts each episode) [default]")
175
+ print(" --steps N Override STEPS_PER_EPISODE for this run")
176
+ print(" --help / -h Show this message")
177
+
178
+
179
+ # ──────────────────────────────────────────────
180
+ # Entry point
181
+ # ──────────────────────────────────────────────
182
 
183
  def main() -> None:
 
184
  args = sys.argv[1:]
185
 
186
  if "--help" in args or "-h" in args:
187
+ _print_help()
 
188
  sys.exit(0)
189
 
190
+ coder_version = DEFAULT_CODER_VERSION
191
+ if "--coder" in args:
192
+ idx = args.index("--coder")
193
+ try:
194
+ coder_version = args[idx + 1]
195
+ valid = ("weak_coder_v1", "weak_coder_v2", "improving_coder")
196
+ if coder_version not in valid:
197
+ print(f"Error: unknown coder version '{coder_version}'. Choose from: {valid}")
198
+ sys.exit(1)
199
+ except IndexError:
200
+ print("Error: --coder requires a version argument.")
201
+ sys.exit(1)
202
+
203
  if "--steps" in args:
204
  idx = args.index("--steps")
205
  try:
 
209
  print("Error: --steps requires an integer argument.")
210
  sys.exit(1)
211
 
212
+ run_demo_episode(coder_version=coder_version)
213
 
214
 
215
  if __name__ == "__main__":
FORGE-v4/config.py CHANGED
@@ -18,20 +18,43 @@ NUM_HIDDEN_TESTS = 5 # Number of hidden test cases per task
18
  # ──────────────────────────────────────────────
19
  # Reward settings
20
  # ──────────────────────────────────────────────
21
- # Coder reward weights
22
- CODER_PASS_REWARD = 1.0 # Reward per passing hidden test
23
- CODER_FAIL_PENALTY = -0.5 # Penalty per failing hidden test
24
- CODER_ERROR_PENALTY = -1.0 # Penalty when code raises an error
25
 
26
- # Breaker reward weights
27
  BREAKER_BREAK_REWARD = 1.0 # Reward when breaker's test breaks coder
28
- BREAKER_FAIL_PENALTY = -0.3 # Penalty when breaker's test does NOT break coder
29
 
30
  # ──────────────────────────────────────────────
31
- # Tier thresholds (coder skill levels)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # ──────────────────────────────────────────────
33
  TIER_THRESHOLDS = {
34
- "novice": (0.0, 0.4), # pass-rate range [low, high)
35
  "intermediate": (0.4, 0.7),
36
  "advanced": (0.7, 0.9),
37
  "expert": (0.9, 1.01),
@@ -40,13 +63,18 @@ TIER_THRESHOLDS = {
40
  # ──────────────────────────────────────────────
41
  # Memory / logging
42
  # ──────────────────────────────────────────────
43
- MEMORY_FILE = "data/coach_memory.json" # Persistent memory path
44
- LOG_DIR = "logs/" # Directory for episode logs
45
- MODELS_DIR = "models/" # Saved model checkpoints
46
- OUTPUTS_DIR = "outputs/" # Generated code outputs
 
 
 
 
 
47
 
48
  # ──────────────────────────────────────────────
49
  # Training placeholders
50
  # ──────────────────────────────────────────────
51
- MAX_EPISODES = 100 # Default training episode count
52
- STEPS_PER_EPISODE = 10 # Steps per episode
 
18
  # ──────────────────────────────────────────────
19
  # Reward settings
20
  # ──────────────────────────────────────────────
21
+ CODER_PASS_REWARD = 1.0 # Reward per passing hidden test
22
+ CODER_FAIL_PENALTY = -0.5 # Penalty per failing hidden test
23
+ CODER_ERROR_PENALTY = -1.0 # Penalty when code raises an error/timeout
 
24
 
 
25
  BREAKER_BREAK_REWARD = 1.0 # Reward when breaker's test breaks coder
26
+ BREAKER_FAIL_PENALTY = -0.3 # Penalty when coder survives a breaker test
27
 
28
  # ──────────────────────────────────────────────
29
+ # Coder agent versions
30
+ # ──────────────────────────────────────────────
31
+ CODER_VERSIONS = ["weak_coder_v1", "weak_coder_v2", "improving_coder"]
32
+
33
+ # improving_coder tier-up thresholds (episode numbers)
34
+ IMPROVING_CODER_TIER1_UNTIL = 3 # Episodes 1–3 → uses weak strategy
35
+ IMPROVING_CODER_TIER2_UNTIL = 6 # Episodes 4–6 → uses mid strategy
36
+
37
+ # ──────────────────────────────────────────────
38
+ # Breaker tier system
39
+ # ──────────────────────────────────────────────
40
+ BREAKER_TIER_NAMES = {
41
+ 1: "Tier-1 (basic)",
42
+ 2: "Tier-2 (edge cases)",
43
+ 3: "Tier-3 (stress)",
44
+ 4: "Tier-4 (boundary/extreme)",
45
+ }
46
+
47
+ # Minimum break_rate to unlock next tier
48
+ BREAKER_TIER_UNLOCK_RATE = 0.6 # 60% break rate needed to promote
49
+ # Minimum episode before tier 3 can unlock (the break-rate threshold must also be met)
50
+ BREAKER_TIER3_MIN_EPISODE = 4
51
+ BREAKER_TIER4_MIN_EPISODE = 7
52
+
53
+ # ──────────────────────────────────────────────
54
+ # Tier thresholds (coder skill levels — for display/labelling)
55
  # ──────────────────────────────────────────────
56
  TIER_THRESHOLDS = {
57
+ "novice": (0.0, 0.4),
58
  "intermediate": (0.4, 0.7),
59
  "advanced": (0.7, 0.9),
60
  "expert": (0.9, 1.01),
 
63
  # ──────────────────────────────────────────────
64
  # Memory / logging
65
  # ──────────────────────────────────────────────
66
+ MEMORY_FILE = "data/coach_memory.json"
67
+ LOG_DIR = "logs/"
68
+ MODELS_DIR = "models/"
69
+ OUTPUTS_DIR = "outputs/"
70
+
71
+ # Log file paths (within LOG_DIR)
72
+ LOG_REWARDS_FILE = "logs/rewards.json"
73
+ LOG_EPISODES_FILE = "logs/episodes.csv"
74
+ LOG_SUMMARY_FILE = "logs/summary.json"
75
 
76
  # ──────────────────────────────────────────────
77
  # Training placeholders
78
  # ──────────────────────────────────────────────
79
+ MAX_EPISODES = 100
80
+ STEPS_PER_EPISODE = 3 # Kept short for fast demo runs
FORGE-v4/env.py CHANGED
@@ -1,12 +1,16 @@
1
  # env.py
2
  # Main OpenEnv-style reinforcement learning environment for FORGE-v4.
3
- # Manages the interaction between the Coder Agent, Breaker Agent, and Sandbox.
4
 
 
5
  from typing import Any
6
- from tasks import generate_task, generate_breaker_task
 
7
  from sandbox import run_code_against_tests
8
  from rewards import coder_reward, breaker_reward
9
  from memory import CoachMemory
 
 
10
  from config import STEPS_PER_EPISODE
11
 
12
 
@@ -15,29 +19,68 @@ class FORGEEnv:
15
  Two-agent adversarial environment for code generation tasks.
16
 
17
  Agents:
18
- - Coder: writes Python code to solve array-sorting tasks.
19
- - Breaker: generates adversarial test cases to break the Coder's solution.
20
 
21
  Episode flow:
22
- 1. reset() → returns the initial task state
23
- 2. step(action) × STEPS_PER_EPISODE steps
24
- 3. Rewards assigned to both agents at each step
 
 
 
 
 
 
 
25
 
26
- Action format:
27
  {
28
- "coder_code": str | None, # Python source defining solution(arr)
29
- "breaker_tests": list | None, # List of {"input": [...]} dicts
 
 
 
 
 
 
 
 
 
30
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  """
32
 
33
  def __init__(self, memory: CoachMemory | None = None):
34
- self.memory = memory or CoachMemory()
35
- self.episode: int = 0
 
36
  self.step_count: int = 0
37
  self.current_task: dict[str, Any] = {}
38
- self.done: bool = True
39
- self._last_coder_code: str = ""
40
- self._last_coder_pass_rate: float = 0.0
 
 
 
 
41
 
42
  # ──────────────────────────────────────────────
43
  # Core env methods
@@ -45,38 +88,42 @@ class FORGEEnv:
45
 
46
  def reset(self) -> dict[str, Any]:
47
  """
48
- Start a new episode.
49
 
50
  Returns:
51
- Initial state dict containing the task prompt and public example.
52
  """
53
  self.episode += 1
54
  self.step_count = 0
55
- self.done = False
56
- self._last_coder_code = ""
 
 
 
57
  self._last_coder_pass_rate = 0.0
58
 
59
  self.current_task = generate_task()
 
60
 
61
- state = self.get_state()
62
- return state
63
 
64
  def step(self, action: dict[str, Any]) -> dict[str, Any]:
65
  """
66
  Advance the environment by one step.
67
 
68
  Args:
69
- action: dict with optional keys:
70
- "coder_code" Python source defining solution(arr)
71
- "breaker_tests" list of {"input": [...]} dicts
 
72
 
73
  Returns:
74
  {
75
- "state": current env state,
76
- "coder_reward": coder reward info dict,
77
- "breaker_reward": breaker reward info dict,
78
- "done": bool (True when episode ends),
79
- "info": extra diagnostics,
80
  }
81
  """
82
  if self.done:
@@ -84,33 +131,66 @@ class FORGEEnv:
84
 
85
  self.step_count += 1
86
  coder_code = action.get("coder_code", "")
87
- breaker_tests = action.get("breaker_tests", [])
 
88
 
89
- # ── Evaluate Coder ────────────────────────────────────────────────
90
- coder_info = self._evaluate_coder(coder_code)
 
 
91
 
92
- # ── Evaluate Breaker ──────────────────────────────────────────────
 
93
  breaker_info = self._evaluate_breaker(coder_code, breaker_tests, coder_info)
94
 
95
- # ── Log to Coach Memory ───────────────────────────────────────────
 
 
 
96
  self.memory.add_lesson(
97
  episode=self.episode,
98
  agent="env",
99
  observation=(
100
  f"Step {self.step_count}: "
101
- f"coder pass_rate={coder_info['pass_rate']:.2f}, "
102
- f"breaker break_rate={breaker_info['break_rate']:.2f}"
 
 
103
  ),
104
  coder_reward=coder_info["total_reward"],
105
  breaker_reward=breaker_info["total_reward"],
106
  extra={
107
- "step": self.step_count,
108
- "coder_pass_rate": coder_info["pass_rate"],
109
- "breaker_break_rate": breaker_info["break_rate"],
 
 
 
 
 
 
110
  },
111
  )
112
 
113
- # ── Check done ────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  if self.step_count >= STEPS_PER_EPISODE:
115
  self.done = True
116
 
@@ -120,26 +200,32 @@ class FORGEEnv:
120
  "breaker_reward": breaker_info,
121
  "done": self.done,
122
  "info": {
123
- "episode": self.episode,
124
- "step": self.step_count,
 
 
 
125
  },
126
  }
127
 
128
  def get_state(self) -> dict[str, Any]:
129
- """
130
- Return the current observable state of the environment.
131
- """
132
  return {
133
- "episode": self.episode,
134
- "step": self.step_count,
135
- "done": self.done,
136
- "task_prompt": self.current_task.get("prompt", ""),
137
- "public_example": self.current_task.get("public_example", {}),
138
- "last_pass_rate": self._last_coder_pass_rate,
 
 
 
 
 
139
  }
140
 
141
  # ──────────────────────────────────────────────
142
- # Private helpers
143
  # ──────────────────────────────────────────────
144
 
145
  def _evaluate_coder(self, code: str) -> dict[str, Any]:
@@ -147,17 +233,11 @@ class FORGEEnv:
147
  hidden_tests = self.current_task.get("hidden_tests", [])
148
 
149
  if not code or not hidden_tests:
150
- # No code submitted max penalty
151
- dummy_results = [{"status": "error"} for _ in hidden_tests or [{}]]
152
- info = coder_reward(dummy_results)
153
- else:
154
- results = run_code_against_tests(code, hidden_tests)
155
- info = coder_reward(results)
156
-
157
- # Cache for Breaker quality multiplier
158
- self._last_coder_code = code
159
- self._last_coder_pass_rate = info["pass_rate"]
160
- return info
161
 
162
  def _evaluate_breaker(
163
  self,
@@ -165,9 +245,8 @@ class FORGEEnv:
165
  breaker_tests: list[dict[str, Any]],
166
  coder_info: dict[str, Any],
167
  ) -> dict[str, Any]:
168
- """Run the coder's code against the breaker's adversarial tests."""
169
  if not coder_code or not breaker_tests:
170
- # No submission from one of the agents
171
  dummy = [{"status": "pass"} for _ in breaker_tests or [{}]]
172
  return breaker_reward(dummy, coder_base_pass_rate=coder_info["pass_rate"])
173
 
 
1
  # env.py
2
  # Main OpenEnv-style reinforcement learning environment for FORGE-v4.
3
+ # Manages Coder Agent, Breaker Agent, Sandbox, Rewards, Memory, and Logging.
4
 
5
+ import uuid
6
  from typing import Any
7
+
8
+ from tasks import generate_task
9
  from sandbox import run_code_against_tests
10
  from rewards import coder_reward, breaker_reward
11
  from memory import CoachMemory
12
+ from agents import BreakerAgent, coder_version_label
13
+ from logger import log_step
14
  from config import STEPS_PER_EPISODE
15
 
16
 
 
19
  Two-agent adversarial environment for code generation tasks.
20
 
21
  Agents:
22
+ - Coder: submits Python code defining solution(arr).
23
+ - Breaker: submits adversarial test cases via a BreakerAgent.
24
 
25
  Episode flow:
26
+ 1. reset() → returns initial state
27
+ 2. step(action) × N → coder vs breaker, rewards, memory, logs
28
+ 3. done=True → call reset() for next episode
29
+
30
+ Action format passed to step():
31
+ {
32
+ "coder_code": str, # Python source defining solution(arr)
33
+ "coder_version": str, # label, e.g. "weak_coder_v1"
34
+ }
35
+ The BreakerAgent is managed internally by the environment.
36
 
37
+ State returned by get_state() / reset() / step():
38
  {
39
+ "task_id": str,
40
+ "problem_description": str,
41
+ "episode": int,
42
+ "episode_step": int,
43
+ "done": bool,
44
+ "coder_version": str,
45
+ "current_tier": int,
46
+ "recent_breaker_case": list[int],
47
+ "pass_rate_history": list[float],
48
+ "coach_memory_summary": dict,
49
+ "public_example": dict,
50
  }
51
+
52
+ step() returns:
53
+ {
54
+ "state": dict,
55
+ "coder_reward": dict, # from rewards.coder_reward()
56
+ "breaker_reward": dict, # from rewards.breaker_reward()
57
+ "done": bool,
58
+ "info": dict, # diagnostics
59
+ }
60
+
61
+ Explicit step() flow:
62
+ 1. Run coder code against hidden tests in sandbox
63
+ 2. Run breaker tests against coder code in sandbox
64
+ 3. Assign coder_reward and breaker_reward
65
+ 4. Update coach memory with structured lesson
66
+ 5. Log step metrics to logs/rewards.json
67
+ 6. Advance breaker tier based on break_rate
68
+ 7. Return next_state, rewards, done, info
69
  """
70
 
71
  def __init__(self, memory: CoachMemory | None = None):
72
+ self.memory = memory or CoachMemory()
73
+ self.breaker = BreakerAgent()
74
+ self.episode: int = 0
75
  self.step_count: int = 0
76
  self.current_task: dict[str, Any] = {}
77
+ self.done: bool = True
78
+
79
+ # Tracked across the episode
80
+ self._coder_version: str = "unknown"
81
+ self._pass_rate_history: list[float] = []
82
+ self._recent_breaker_case: list[int] = []
83
+ self._last_coder_pass_rate: float = 0.0
84
 
85
  # ──────────────────────────────────────────────
86
  # Core env methods
 
88
 
89
  def reset(self) -> dict[str, Any]:
90
  """
91
+ Start a new episode. Generates a fresh task and resets counters.
92
 
93
  Returns:
94
+ Initial state dict.
95
  """
96
  self.episode += 1
97
  self.step_count = 0
98
+ self.done = False
99
+
100
+ self._coder_version = "unknown"
101
+ self._pass_rate_history = []
102
+ self._recent_breaker_case = []
103
  self._last_coder_pass_rate = 0.0
104
 
105
  self.current_task = generate_task()
106
+ self.current_task["task_id"] = str(uuid.uuid4())[:8]
107
 
108
+ return self.get_state()
 
109
 
110
  def step(self, action: dict[str, Any]) -> dict[str, Any]:
111
  """
112
  Advance the environment by one step.
113
 
114
  Args:
115
+ action: {
116
+ "coder_code": str — Python source defining solution(arr)
117
+ "coder_version": str — human label for the coder strategy used
118
+ }
119
 
120
  Returns:
121
  {
122
+ "state": dict next observable state,
123
+ "coder_reward": dict — coder reward breakdown,
124
+ "breaker_reward": dict — breaker reward breakdown,
125
+ "done": bool,
126
+ "info": dict diagnostics,
127
  }
128
  """
129
  if self.done:
 
131
 
132
  self.step_count += 1
133
  coder_code = action.get("coder_code", "")
134
+ coder_version = action.get("coder_version", "unknown")
135
+ self._coder_version = coder_version
136
 
137
+ # ── 1. Get breaker tests for this step ───────────────────────────
138
+ breaker_tests = self.breaker.get_tests(n_per_tier=2)
139
+ if breaker_tests:
140
+ self._recent_breaker_case = breaker_tests[-1]["input"]
141
 
142
+ # ── 2 & 3. Run sandbox + compute rewards ──────────────────────────
143
+ coder_info = self._evaluate_coder(coder_code)
144
  breaker_info = self._evaluate_breaker(coder_code, breaker_tests, coder_info)
145
 
146
+ self._pass_rate_history.append(coder_info["pass_rate"])
147
+ self._last_coder_pass_rate = coder_info["pass_rate"]
148
+
149
+ # ── 4. Update coach memory with rich lesson ───────────────────────
150
  self.memory.add_lesson(
151
  episode=self.episode,
152
  agent="env",
153
  observation=(
154
  f"Step {self.step_count}: "
155
+ f"coder={coder_version}, "
156
+ f"pass_rate={coder_info['pass_rate']:.2f}, "
157
+ f"breaker_tier={self.breaker.current_tier}, "
158
+ f"break_rate={breaker_info['break_rate']:.2f}"
159
  ),
160
  coder_reward=coder_info["total_reward"],
161
  breaker_reward=breaker_info["total_reward"],
162
  extra={
163
+ "step": self.step_count,
164
+ "coder_version": coder_version,
165
+ "breaker_tier": self.breaker.current_tier,
166
+ "coder_pass_rate": coder_info["pass_rate"],
167
+ "fail_count": coder_info["fail_count"],
168
+ "error_count": coder_info["error_count"],
169
+ "timeout_count": coder_info["error_count"], # errors include timeouts
170
+ "breaker_break_rate": breaker_info["break_rate"],
171
+ "recent_breaker_case": self._recent_breaker_case,
172
  },
173
  )
174
 
175
+ # ── 5. Log step metrics ───────────────────────────────────────────
176
+ log_step(
177
+ episode=self.episode,
178
+ step=self.step_count,
179
+ coder_version=coder_version,
180
+ breaker_tier=self.breaker.current_tier,
181
+ coder_reward=coder_info["total_reward"],
182
+ breaker_reward=breaker_info["total_reward"],
183
+ pass_rate=coder_info["pass_rate"],
184
+ fail_count=coder_info["fail_count"],
185
+ error_count=coder_info["error_count"],
186
+ timeout_count=coder_info["error_count"],
187
+ break_rate=breaker_info["break_rate"],
188
+ )
189
+
190
+ # ── 6. Advance breaker tier ────────────────────────────────────────
191
+ self.breaker.update_tier(breaker_info["break_rate"], self.episode)
192
+
193
+ # ── 7. Check done + return ────────────────────────────────────────
194
  if self.step_count >= STEPS_PER_EPISODE:
195
  self.done = True
196
 
 
200
  "breaker_reward": breaker_info,
201
  "done": self.done,
202
  "info": {
203
+ "episode": self.episode,
204
+ "step": self.step_count,
205
+ "coder_version": coder_version,
206
+ "breaker_tier": self.breaker.current_tier,
207
+ "breaker_tier_name": self.breaker.tier_name,
208
  },
209
  }
210
 
211
  def get_state(self) -> dict[str, Any]:
212
+ """Return the current observable state of the environment."""
 
 
213
  return {
214
+ "task_id": self.current_task.get("task_id", ""),
215
+ "problem_description": self.current_task.get("prompt", ""),
216
+ "episode": self.episode,
217
+ "episode_step": self.step_count,
218
+ "done": self.done,
219
+ "coder_version": self._coder_version,
220
+ "current_tier": self.breaker.current_tier,
221
+ "recent_breaker_case": self._recent_breaker_case,
222
+ "pass_rate_history": list(self._pass_rate_history),
223
+ "coach_memory_summary": self.memory.summary(),
224
+ "public_example": self.current_task.get("public_example", {}),
225
  }
226
 
227
  # ──────────────────────────────────────────────
228
+ # Private evaluation helpers
229
  # ──────────────────────────────────────────────
230
 
231
  def _evaluate_coder(self, code: str) -> dict[str, Any]:
 
233
  hidden_tests = self.current_task.get("hidden_tests", [])
234
 
235
  if not code or not hidden_tests:
236
+ dummy = [{"status": "error"} for _ in hidden_tests or [{}]]
237
+ return coder_reward(dummy)
238
+
239
+ results = run_code_against_tests(code, hidden_tests)
240
+ return coder_reward(results)
 
 
 
 
 
 
241
 
242
  def _evaluate_breaker(
243
  self,
 
245
  breaker_tests: list[dict[str, Any]],
246
  coder_info: dict[str, Any],
247
  ) -> dict[str, Any]:
248
+ """Run the coder's code against breaker adversarial tests."""
249
  if not coder_code or not breaker_tests:
 
250
  dummy = [{"status": "pass"} for _ in breaker_tests or [{}]]
251
  return breaker_reward(dummy, coder_base_pass_rate=coder_info["pass_rate"])
252
 
FORGE-v4/logger.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # logger.py
2
+ # Metrics logging for FORGE-v4.
3
+ # Writes structured logs to logs/rewards.json, logs/episodes.csv, logs/summary.json.
4
+
5
import csv
import json
import os
import warnings
from datetime import datetime, timezone
from typing import Any

from config import LOG_REWARDS_FILE, LOG_EPISODES_FILE, LOG_SUMMARY_FILE, LOG_DIR
12
+
13
+
14
+ # ──────────────────────────────────────────────
15
+ # Internal helpers
16
+ # ──────────────────────────────────────────────
17
+
18
def _ensure_log_dir() -> None:
    """Create the log directory (LOG_DIR) unless it already exists."""
    os.makedirs(name=LOG_DIR, exist_ok=True)
20
+
21
+
22
+ def _load_json(path: str, default: Any) -> Any:
23
+ if os.path.exists(path):
24
+ try:
25
+ with open(path, "r", encoding="utf-8") as f:
26
+ return json.load(f)
27
+ except (json.JSONDecodeError, IOError):
28
+ pass
29
+ return default
30
+
31
+
32
+ def _write_json(path: str, data: Any) -> None:
33
+ with open(path, "w", encoding="utf-8") as f:
34
+ json.dump(data, f, indent=2)
35
+
36
+
37
+ # ──────────────────────────────────────────────
38
+ # Step-level logging
39
+ # ──────────────────────────────────────────────
40
+
41
def log_step(
    episode: int,
    step: int,
    coder_version: str,
    breaker_tier: int,
    coder_reward: float,
    breaker_reward: float,
    pass_rate: float,
    fail_count: int,
    error_count: int,
    timeout_count: int,
    break_rate: float,
) -> None:
    """
    Append one step's metrics to logs/rewards.json.

    The whole file is loaded, extended by one record and written back on
    every call.

    Args:
        episode: Episode index.
        step: Step index within the episode.
        coder_version: Name of the coder strategy used.
        breaker_tier: Current breaker tier number.
        coder_reward: Total coder reward this step.
        breaker_reward: Total breaker reward this step.
        pass_rate: Fraction of hidden tests passed.
        fail_count: Number of failing tests.
        error_count: Number of error/timeout tests.
        timeout_count: Number of sandbox timeouts specifically.
        break_rate: Fraction of breaker tests that broke the coder.
    """
    _ensure_log_dir()
    records: list[dict[str, Any]] = _load_json(LOG_REWARDS_FILE, [])

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the stored string keeps the existing offset-free ISO format.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    records.append(
        {
            "timestamp": timestamp,
            "episode": episode,
            "step": step,
            "coder_version": coder_version,
            "breaker_tier": breaker_tier,
            "coder_reward": coder_reward,
            "breaker_reward": breaker_reward,
            "pass_rate": pass_rate,
            "fail_count": fail_count,
            "error_count": error_count,
            "timeout_count": timeout_count,
            "break_rate": break_rate,
        }
    )
    _write_json(LOG_REWARDS_FILE, records)
89
+
90
+
91
+ # ──────────────────────────────────────────────
92
+ # Episode-level logging
93
+ # ──────────────────────────────────────────────
94
+
95
+ # CSV column order
96
+ _CSV_FIELDS = [
97
+ "timestamp", "episode", "coder_version", "breaker_tier",
98
+ "avg_coder_reward", "avg_breaker_reward",
99
+ "avg_pass_rate", "total_fail_count", "total_error_count",
100
+ "total_timeout_count", "avg_break_rate", "steps",
101
+ ]
102
+
103
+
104
def log_episode(
    episode: int,
    coder_version: str,
    breaker_tier: int,
    avg_coder_reward: float,
    avg_breaker_reward: float,
    avg_pass_rate: float,
    total_fail_count: int,
    total_error_count: int,
    total_timeout_count: int,
    avg_break_rate: float,
    steps: int,
) -> None:
    """
    Append one episode summary row to logs/episodes.csv.

    The header row is written first whenever the file is new or empty.
    Averages are rounded to 4 decimal places; counts are stored as given.

    Args:
        episode: Episode index.
        coder_version: Name of the coder strategy used.
        breaker_tier: Breaker tier number recorded for the episode.
        avg_coder_reward: Mean coder reward over the episode's steps.
        avg_breaker_reward: Mean breaker reward over the episode's steps.
        avg_pass_rate: Mean pass rate over the episode's steps.
        total_fail_count: Failing tests summed over the episode.
        total_error_count: Erroring tests summed over the episode.
        total_timeout_count: Timed-out tests summed over the episode.
        avg_break_rate: Mean break rate over the episode's steps.
        steps: Number of steps taken in the episode.
    """
    _ensure_log_dir()

    # A file that exists but is empty (e.g. touched or truncated) still needs
    # its header; checking existence alone would leave it header-less forever.
    needs_header = (
        not os.path.exists(LOG_EPISODES_FILE)
        or os.path.getsize(LOG_EPISODES_FILE) == 0
    )

    # datetime.utcnow() is deprecated; keep the same offset-free ISO format.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    row = {
        "timestamp": timestamp,
        "episode": episode,
        "coder_version": coder_version,
        "breaker_tier": breaker_tier,
        "avg_coder_reward": round(avg_coder_reward, 4),
        "avg_breaker_reward": round(avg_breaker_reward, 4),
        "avg_pass_rate": round(avg_pass_rate, 4),
        "total_fail_count": total_fail_count,
        "total_error_count": total_error_count,
        "total_timeout_count": total_timeout_count,
        "avg_break_rate": round(avg_break_rate, 4),
        "steps": steps,
    }

    with open(LOG_EPISODES_FILE, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
143
+
144
+
145
+ # ──────────────────────────────────────────────
146
+ # Summary logging
147
+ # ──────────────────────────────────────────────
148
+
149
def update_summary(
    total_episodes: int,
    coder_version: str,
    final_breaker_tier: int,
    all_coder_rewards: list[float],
    all_breaker_rewards: list[float],
    all_pass_rates: list[float],
    all_break_rates: list[float],
    coach_memory_summary: dict[str, Any],
) -> None:
    """
    Overwrite logs/summary.json with the latest aggregate statistics.

    Args:
        total_episodes: Number of episodes covered by the summary.
        coder_version: Name of the coder strategy used.
        final_breaker_tier: Breaker tier number to record as final.
        all_coder_rewards: Coder rewards to aggregate (average, best, worst).
        all_breaker_rewards: Breaker rewards to average.
        all_pass_rates: Pass rates to average.
        all_break_rates: Break rates to average.
        coach_memory_summary: Summary dict stored verbatim in the output.
    """
    _ensure_log_dir()

    def avg(lst: list[float]) -> float:
        # An empty history averages to 0.0 instead of dividing by zero.
        return round(sum(lst) / len(lst), 4) if lst else 0.0

    # datetime.utcnow() is deprecated; keep the same offset-free ISO format.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    summary = {
        "generated_at": generated_at,
        "total_episodes": total_episodes,
        "coder_version": coder_version,
        "final_breaker_tier": final_breaker_tier,
        "avg_coder_reward": avg(all_coder_rewards),
        "avg_breaker_reward": avg(all_breaker_rewards),
        "avg_pass_rate": avg(all_pass_rates),
        "avg_break_rate": avg(all_break_rates),
        "best_coder_reward": round(max(all_coder_rewards), 4) if all_coder_rewards else 0.0,
        "worst_coder_reward": round(min(all_coder_rewards), 4) if all_coder_rewards else 0.0,
        "coach_memory_summary": coach_memory_summary,
    }
    _write_json(LOG_SUMMARY_FILE, summary)
181
+
182
+
183
+ # ──────────────────────────────────────────────
184
+ # Convenience: print a compact log path report
185
+ # ──────────────────────────────────────────────
186
+
187
def print_log_paths() -> None:
    """Print each log file path, marked ✓ when the file exists and ✗ otherwise."""
    for log_file in (LOG_REWARDS_FILE, LOG_EPISODES_FILE, LOG_SUMMARY_FILE):
        if os.path.exists(log_file):
            exists = "✓"
        else:
            exists = "✗"
        print(f" {exists} {log_file}")
FORGE-v4/logs/episodes.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ timestamp,episode,coder_version,breaker_tier,avg_coder_reward,avg_breaker_reward,avg_pass_rate,total_fail_count,total_error_count,total_timeout_count,avg_break_rate,steps
2
+ 2026-04-25T08:42:22.041578,1,improving_coder,1,5.0,-1.2,1.0,0,0,0,0.0,3
3
+ 2026-04-25T08:42:31.074377,1,weak_coder_v2,1,-1.0,-1.2,0.2,12,0,0,0.0,3
FORGE-v4/logs/rewards.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "timestamp": "2026-04-25T08:42:19.501582",
4
+ "episode": 1,
5
+ "step": 1,
6
+ "coder_version": "improving_coder",
7
+ "breaker_tier": 1,
8
+ "coder_reward": 5.0,
9
+ "breaker_reward": -1.2,
10
+ "pass_rate": 1.0,
11
+ "fail_count": 0,
12
+ "error_count": 0,
13
+ "timeout_count": 0,
14
+ "break_rate": 0.0
15
+ },
16
+ {
17
+ "timestamp": "2026-04-25T08:42:20.777574",
18
+ "episode": 1,
19
+ "step": 2,
20
+ "coder_version": "improving_coder",
21
+ "breaker_tier": 1,
22
+ "coder_reward": 5.0,
23
+ "breaker_reward": -1.2,
24
+ "pass_rate": 1.0,
25
+ "fail_count": 0,
26
+ "error_count": 0,
27
+ "timeout_count": 0,
28
+ "break_rate": 0.0
29
+ },
30
+ {
31
+ "timestamp": "2026-04-25T08:42:22.039872",
32
+ "episode": 1,
33
+ "step": 3,
34
+ "coder_version": "improving_coder",
35
+ "breaker_tier": 1,
36
+ "coder_reward": 5.0,
37
+ "breaker_reward": -1.2,
38
+ "pass_rate": 1.0,
39
+ "fail_count": 0,
40
+ "error_count": 0,
41
+ "timeout_count": 0,
42
+ "break_rate": 0.0
43
+ },
44
+ {
45
+ "timestamp": "2026-04-25T08:42:28.577096",
46
+ "episode": 1,
47
+ "step": 1,
48
+ "coder_version": "weak_coder_v2",
49
+ "breaker_tier": 1,
50
+ "coder_reward": -1.0,
51
+ "breaker_reward": -1.2,
52
+ "pass_rate": 0.2,
53
+ "fail_count": 4,
54
+ "error_count": 0,
55
+ "timeout_count": 0,
56
+ "break_rate": 0.0
57
+ },
58
+ {
59
+ "timestamp": "2026-04-25T08:42:29.829535",
60
+ "episode": 1,
61
+ "step": 2,
62
+ "coder_version": "weak_coder_v2",
63
+ "breaker_tier": 1,
64
+ "coder_reward": -1.0,
65
+ "breaker_reward": -1.2,
66
+ "pass_rate": 0.2,
67
+ "fail_count": 4,
68
+ "error_count": 0,
69
+ "timeout_count": 0,
70
+ "break_rate": 0.0
71
+ },
72
+ {
73
+ "timestamp": "2026-04-25T08:42:31.072423",
74
+ "episode": 1,
75
+ "step": 3,
76
+ "coder_version": "weak_coder_v2",
77
+ "breaker_tier": 1,
78
+ "coder_reward": -1.0,
79
+ "breaker_reward": -1.2,
80
+ "pass_rate": 0.2,
81
+ "fail_count": 4,
82
+ "error_count": 0,
83
+ "timeout_count": 0,
84
+ "break_rate": 0.0
85
+ }
86
+ ]
FORGE-v4/logs/summary.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "generated_at": "2026-04-25T08:42:31.075228",
3
+ "total_episodes": 1,
4
+ "coder_version": "weak_coder_v2",
5
+ "final_breaker_tier": 1,
6
+ "avg_coder_reward": -1.0,
7
+ "avg_breaker_reward": -1.2,
8
+ "avg_pass_rate": 0.2,
9
+ "avg_break_rate": 0.0,
10
+ "best_coder_reward": -1.0,
11
+ "worst_coder_reward": -1.0,
12
+ "coach_memory_summary": {
13
+ "total_lessons": 3,
14
+ "episodes_seen": 1,
15
+ "avg_coder_reward": -1.0,
16
+ "avg_breaker_reward": -1.2,
17
+ "recent_coach_notes": [
18
+ "Episode 1: Coder (weak_coder_v2) failed 4 test(s) at breaker 1 \u2192 review edge case handling",
19
+ "Episode 1: Coder (weak_coder_v2) failed 4 test(s) at breaker 1 \u2192 review edge case handling",
20
+ "Episode 1: Coder (weak_coder_v2) failed 4 test(s) at breaker 1 \u2192 review edge case handling"
21
+ ]
22
+ }
23
+ }
FORGE-v4/memory.py CHANGED
@@ -1,6 +1,6 @@
1
  # memory.py
2
  # Coach Memory system for FORGE-v4.
3
- # Stores lessons learned across episodes in a JSON file.
4
 
5
  import json
6
  import os
@@ -14,6 +14,8 @@ class CoachMemory:
14
  Persistent memory that accumulates lessons learned across training episodes.
15
 
16
  Lessons are stored as a list of dicts in a JSON file and loaded on startup.
 
 
17
  """
18
 
19
  def __init__(self, filepath: str = MEMORY_FILE):
@@ -41,16 +43,19 @@ class CoachMemory:
41
  Args:
42
  episode: Episode index.
43
  agent: "coder" | "breaker" | "env".
44
- observation: Human-readable description of what happened.
45
  coder_reward: Total coder reward for this step.
46
  breaker_reward: Total breaker reward for this step.
47
- extra: Optional additional metadata.
48
  """
 
 
49
  lesson = {
50
  "timestamp": datetime.utcnow().isoformat(),
51
  "episode": episode,
52
  "agent": agent,
53
  "observation": observation,
 
54
  "coder_reward": coder_reward,
55
  "breaker_reward": breaker_reward,
56
  }
@@ -60,16 +65,13 @@ class CoachMemory:
60
  self.lessons.append(lesson)
61
  self.save()
62
 
63
- def get_lessons(self, agent: str | None = None, last_n: int | None = None) -> list[dict[str, Any]]:
 
 
 
 
64
  """
65
  Retrieve stored lessons, optionally filtered by agent and/or limited to the last N.
66
-
67
- Args:
68
- agent: Filter to a specific agent ("coder", "breaker", "env"), or None for all.
69
- last_n: Return only the last N lessons if provided.
70
-
71
- Returns:
72
- List of lesson dicts.
73
  """
74
  result = self.lessons
75
  if agent is not None:
@@ -78,28 +80,29 @@ class CoachMemory:
78
  result = result[-last_n:]
79
  return result
80
 
 
 
 
 
81
  def summary(self) -> dict[str, Any]:
82
- """
83
- Return a high-level summary of stored lessons.
84
- """
85
  if not self.lessons:
86
  return {"total_lessons": 0, "episodes_seen": 0}
87
 
88
  episodes = {l["episode"] for l in self.lessons}
89
- coder_rewards = [l["coder_reward"] for l in self.lessons]
90
  breaker_rewards = [l["breaker_reward"] for l in self.lessons]
91
 
92
  return {
93
  "total_lessons": len(self.lessons),
94
  "episodes_seen": len(episodes),
95
- "avg_coder_reward": round(sum(coder_rewards) / len(coder_rewards), 4),
96
  "avg_breaker_reward": round(sum(breaker_rewards) / len(breaker_rewards), 4),
 
97
  }
98
 
99
  def clear(self) -> None:
100
- """
101
- Wipe all stored lessons (use with caution).
102
- """
103
  self.lessons = []
104
  self.save()
105
 
@@ -119,11 +122,89 @@ class CoachMemory:
119
  with open(self.filepath, "r", encoding="utf-8") as f:
120
  self.lessons = json.load(f)
121
  except (json.JSONDecodeError, IOError):
122
- # Start fresh if file is corrupted
123
  self.lessons = []
124
  else:
125
  self.lessons = []
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  # ──────────────────────────────────────────────
128
  # Internal helpers
129
  # ──────────────────────────────────────────────
 
1
  # memory.py
2
  # Coach Memory system for FORGE-v4.
3
+ # Stores structured lessons learned across episodes in a JSON file.
4
 
5
  import json
6
  import os
 
14
  Persistent memory that accumulates lessons learned across training episodes.
15
 
16
  Lessons are stored as a list of dicts in a JSON file and loaded on startup.
17
+ Each lesson includes a human-readable "coach_note" derived from the metrics
18
+ so the history is understandable without post-processing.
19
  """
20
 
21
  def __init__(self, filepath: str = MEMORY_FILE):
 
43
  Args:
44
  episode: Episode index.
45
  agent: "coder" | "breaker" | "env".
46
+ observation: Raw observation string from the environment.
47
  coder_reward: Total coder reward for this step.
48
  breaker_reward: Total breaker reward for this step.
49
+ extra: Optional metadata (pass_rate, fail_count, etc.).
50
  """
51
+ coach_note = self._derive_coach_note(episode, extra or {})
52
+
53
  lesson = {
54
  "timestamp": datetime.utcnow().isoformat(),
55
  "episode": episode,
56
  "agent": agent,
57
  "observation": observation,
58
+ "coach_note": coach_note,
59
  "coder_reward": coder_reward,
60
  "breaker_reward": breaker_reward,
61
  }
 
65
  self.lessons.append(lesson)
66
  self.save()
67
 
68
+ def get_lessons(
69
+ self,
70
+ agent: str | None = None,
71
+ last_n: int | None = None,
72
+ ) -> list[dict[str, Any]]:
73
  """
74
  Retrieve stored lessons, optionally filtered by agent and/or limited to the last N.
 
 
 
 
 
 
 
75
  """
76
  result = self.lessons
77
  if agent is not None:
 
80
  result = result[-last_n:]
81
  return result
82
 
83
+ def get_coach_notes(self, last_n: int = 5) -> list[str]:
84
+ """Return the most recent human-readable coach notes."""
85
+ return [l["coach_note"] for l in self.lessons[-last_n:] if l.get("coach_note")]
86
+
87
  def summary(self) -> dict[str, Any]:
88
+ """Return a high-level summary of stored lessons."""
 
 
89
  if not self.lessons:
90
  return {"total_lessons": 0, "episodes_seen": 0}
91
 
92
  episodes = {l["episode"] for l in self.lessons}
93
+ coder_rewards = [l["coder_reward"] for l in self.lessons]
94
  breaker_rewards = [l["breaker_reward"] for l in self.lessons]
95
 
96
  return {
97
  "total_lessons": len(self.lessons),
98
  "episodes_seen": len(episodes),
99
+ "avg_coder_reward": round(sum(coder_rewards) / len(coder_rewards), 4),
100
  "avg_breaker_reward": round(sum(breaker_rewards) / len(breaker_rewards), 4),
101
+ "recent_coach_notes": self.get_coach_notes(last_n=3),
102
  }
103
 
104
  def clear(self) -> None:
105
+ """Wipe all stored lessons (use with caution)."""
 
 
106
  self.lessons = []
107
  self.save()
108
 
 
122
  with open(self.filepath, "r", encoding="utf-8") as f:
123
  self.lessons = json.load(f)
124
  except (json.JSONDecodeError, IOError):
 
125
  self.lessons = []
126
  else:
127
  self.lessons = []
128
 
129
+ # ──────────────────────────────────────────────
130
+ # Coach note derivation
131
+ # ──────────────────────────────────────────────
132
+
133
+ def _derive_coach_note(self, episode: int, extra: dict[str, Any]) -> str:
134
+ """
135
+ Generate a human-readable coaching note from step metadata.
136
+
137
+ Examples:
138
+ "Episode 4: Coder failed on duplicates → handle duplicate values safely"
139
+ "Episode 8: Coder timed out on large arrays → avoid O(n²) for large inputs"
140
+ "Episode 2: Strong performance (pass_rate=1.00) → keep current strategy"
141
+ """
142
+ pass_rate = extra.get("coder_pass_rate", None)
143
+ fail_count = extra.get("fail_count", 0)
144
+ error_count = extra.get("error_count", 0)
145
+ timeout_count = extra.get("timeout_count", 0)
146
+ breaker_tier = extra.get("breaker_tier", 1)
147
+ coder_version = extra.get("coder_version", "unknown")
148
+ recent_case = extra.get("recent_breaker_case", [])
149
+
150
+ prefix = f"Episode {episode}"
151
+
152
+ # Timeout note
153
+ if timeout_count > 0:
154
+ return (
155
+ f"{prefix}: Coder timed out on {timeout_count} test(s)"
156
+ f" [tier={breaker_tier}] → avoid O(n²) or infinite loops for large inputs"
157
+ )
158
+
159
+ # Error note
160
+ if error_count > 0 and pass_rate is not None and pass_rate < 0.5:
161
+ return (
162
+ f"{prefix}: Coder raised errors on {error_count} test(s)"
163
+ f" → add input validation and handle edge cases"
164
+ )
165
+
166
+ # Negative/duplicate failure detection from recent breaker case
167
+ if fail_count > 0 and recent_case:
168
+ has_neg = any(x < 0 for x in recent_case)
169
+ has_dups = len(recent_case) != len(set(recent_case))
170
+ is_large = len(recent_case) >= 10
171
+
172
+ if has_neg and has_dups:
173
+ return (
174
+ f"{prefix}: Coder ({coder_version}) failed on negatives+duplicates"
175
+ f" → ensure sort key uses true value, not abs()"
176
+ )
177
+ if has_neg:
178
+ return (
179
+ f"{prefix}: Coder ({coder_version}) failed on negative values"
180
+ f" → handle negative integers in comparison logic"
181
+ )
182
+ if has_dups:
183
+ return (
184
+ f"{prefix}: Coder ({coder_version}) failed on duplicate values"
185
+ f" → ensure stable sort handles equal elements correctly"
186
+ )
187
+ if is_large:
188
+ return (
189
+ f"{prefix}: Coder ({coder_version}) failed on large array (n={len(recent_case)})"
190
+ f" → consider O(n log n) algorithm"
191
+ )
192
+ return (
193
+ f"{prefix}: Coder ({coder_version}) failed {fail_count} test(s)"
194
+ f" at breaker {breaker_tier} → review edge case handling"
195
+ )
196
+
197
+ # Good performance
198
+ if pass_rate is not None and pass_rate >= 0.8:
199
+ return (
200
+ f"{prefix}: Strong performance (pass_rate={pass_rate:.2f})"
201
+ f" [{coder_version}] → breaker should escalate tier"
202
+ )
203
+
204
+ # Generic fallback
205
+ pr = f"{pass_rate:.2f}" if pass_rate is not None else "N/A"
206
+ return f"{prefix}: pass_rate={pr}, fail={fail_count}, errors={error_count}"
207
+
208
  # ──────────────────────────────────────────────
209
  # Internal helpers
210
  # ──────────────────────────────────────────────
FORGE-v4/trainer.py CHANGED
@@ -1,47 +1,44 @@
1
  # trainer.py
2
- # Placeholder training loop hooks for FORGE-v4.
3
- # Ready for future TRL / Unsloth / Hugging Face integration.
 
4
 
5
  from typing import Any, Callable
6
  from env import FORGEEnv
7
  from memory import CoachMemory
 
 
8
  from config import MAX_EPISODES, STEPS_PER_EPISODE
9
 
10
 
11
  # ──────────────────────────────────────────────
12
- # Placeholder agent policy functions
13
  # ──────────────────────────────────────────────
14
 
15
- def default_coder_policy(state: dict[str, Any]) -> str:
16
  """
17
- Placeholder Coder policy.
18
 
19
- In production this will call a fine-tuned LLM (e.g. via TRL/Unsloth) to
20
- generate Python code from the task prompt.
21
 
22
- Currently returns a trivial reference solution so the environment runs.
 
23
  """
24
- # TODO: Replace with LLM inference call
25
- return "def solution(arr):\n return sorted(arr)\n"
26
-
 
 
27
 
28
- def default_breaker_policy(state: dict[str, Any]) -> list[dict[str, Any]]:
29
- """
30
- Placeholder Breaker policy.
31
 
32
- In production this will call a fine-tuned adversarial LLM to generate
33
- adversarial test cases from the task prompt.
 
 
34
 
35
- Currently returns a fixed set of edge-case test inputs.
36
- """
37
- # TODO: Replace with adversarial LLM inference call
38
- return [
39
- {"input": [], "expected_output": []},
40
- {"input": [1], "expected_output": [1]},
41
- {"input": [3, 1, 2], "expected_output": [1, 2, 3]},
42
- {"input": [-5, -1, -3], "expected_output": [-5, -3, -1]},
43
- {"input": [0, 0, 0, 0], "expected_output": [0, 0, 0, 0]},
44
- ]
45
 
46
 
47
  # ──────────────────────────────────────────────
@@ -49,81 +46,133 @@ def default_breaker_policy(state: dict[str, Any]) -> list[dict[str, Any]]:
49
  # ──────────────────────────────────────────────
50
 
51
  def train(
52
- coder_policy: Callable[[dict[str, Any]], str] = default_coder_policy,
53
- breaker_policy: Callable[[dict[str, Any]], list[dict[str, Any]]] = default_breaker_policy,
54
  num_episodes: int = MAX_EPISODES,
55
  verbose: bool = True,
56
  ) -> dict[str, Any]:
57
  """
58
  Run the FORGE-v4 training loop.
59
 
 
 
 
60
  Args:
61
- coder_policy: Callable(state) → Python source string.
62
- breaker_policy: Callable(state) → list of test-case dicts.
63
- num_episodes: Number of training episodes to run.
64
- verbose: Print per-episode summaries when True.
65
 
66
  Returns:
67
- Training summary dict with per-episode reward histories.
68
  """
69
  memory = CoachMemory()
70
- env = FORGEEnv(memory=memory)
71
 
72
  episode_history: list[dict[str, Any]] = []
73
 
 
 
 
 
 
 
74
  for ep in range(1, num_episodes + 1):
75
  state = env.reset()
76
- episode_coder_rewards = []
77
- episode_breaker_rewards = []
78
 
79
- for _ in range(STEPS_PER_EPISODE):
80
- # ── Agent decisions ────────────────────────────────────────────
81
- coder_code = coder_policy(state)
82
- breaker_tests = breaker_policy(state)
83
-
84
- action = {
85
- "coder_code": coder_code,
86
- "breaker_tests": breaker_tests,
87
- }
88
 
89
- # ── Environment step ───────────────────────────────────────────
 
90
  result = env.step(action)
91
  state = result["state"]
92
 
93
- episode_coder_rewards.append(result["coder_reward"]["total_reward"])
94
- episode_breaker_rewards.append(result["breaker_reward"]["total_reward"])
 
 
 
 
 
 
 
 
95
 
96
  if result["done"]:
97
  break
98
 
99
  # ── Episode summary ────────────────────────────────────────────────
100
- avg_cr = round(sum(episode_coder_rewards) / len(episode_coder_rewards), 4)
101
- avg_br = round(sum(episode_breaker_rewards) / len(episode_breaker_rewards), 4)
102
 
103
  ep_summary = {
104
  "episode": ep,
105
- "avg_coder_reward": avg_cr,
106
- "avg_breaker_reward": avg_br,
 
 
 
 
107
  "steps": env.step_count,
108
  }
109
  episode_history.append(ep_summary)
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  if verbose:
 
112
  print(
113
- f"[Episode {ep:>4}/{num_episodes}] "
114
- f"Coder avg reward: {avg_cr:+.4f} | "
115
- f"Breaker avg reward: {avg_br:+.4f}"
 
 
 
116
  )
117
 
118
- # ── TRL / Unsloth hook placeholders ───────────────────────────────
119
  _on_episode_end(ep, ep_summary, memory)
120
 
121
- training_summary = {
122
- "total_episodes": num_episodes,
123
- "episode_history": episode_history,
124
- "memory_summary": memory.summary(),
 
 
 
 
 
 
 
 
 
 
 
 
125
  }
126
- return training_summary
127
 
128
 
129
  # ──────────────────────────────────────────────
@@ -136,23 +185,20 @@ def _on_episode_end(
136
  memory: CoachMemory,
137
  ) -> None:
138
  """
139
- Called at the end of every episode.
140
 
141
  TODO: Plug in TRL PPOTrainer / Unsloth model updates here.
142
  E.g.:
143
  trainer.step(queries, responses, rewards)
144
  model.save_pretrained(f"models/checkpoint-ep{episode}")
145
  """
146
- pass # placeholder
147
 
148
 
149
- def _on_step_end(
150
- step: int,
151
- result: dict[str, Any],
152
- ) -> None:
153
  """
154
  Called after every environment step.
155
 
156
- TODO: Plug in per-step reward logging (e.g. W&B, TensorBoard) here.
157
  """
158
- pass # placeholder
 
1
  # trainer.py
2
+ # Training loop for FORGE-v4.
3
+ # Uses the real coder strategies and tiered BreakerAgent from agents.py.
4
+ # Hook placeholders are ready for TRL / Unsloth / Hugging Face integration.
5
 
6
  from typing import Any, Callable
7
  from env import FORGEEnv
8
  from memory import CoachMemory
9
+ from agents import get_coder_code, coder_version_label
10
+ from logger import log_episode, update_summary
11
  from config import MAX_EPISODES, STEPS_PER_EPISODE
12
 
13
 
14
  # ──────────────────────────────────────────────
15
+ # Built-in coder policies
16
  # ──────────────────────────────────────────────
17
 
18
def make_coder_policy(version: str) -> Callable[[dict[str, Any]], dict[str, str]]:
    """
    Build a coder policy bound to a single named coder strategy.

    The policy produced here maps an environment state dict to an action
    dict of the form {"coder_code": str, "coder_version": str}. The source
    code is obtained from get_coder_code() using the episode number found in
    the state (falling back to 1 when the state has no "episode" key).

    Args:
        version: "weak_coder_v1" | "weak_coder_v2" | "improving_coder"

    Returns:
        A callable taking the state dict and returning the action dict.
    """
    def policy(state: dict[str, Any]) -> dict[str, str]:
        current_episode = state.get("episode", 1)
        return {
            "coder_code": get_coder_code(version, episode=current_episode),
            "coder_version": version,
        }

    return policy
33
 
 
 
 
34
 
35
+ # Convenience pre-built policies
36
+ weak_coder_v1_policy = make_coder_policy("weak_coder_v1")
37
+ weak_coder_v2_policy = make_coder_policy("weak_coder_v2")
38
+ improving_coder_policy = make_coder_policy("improving_coder")
39
 
40
+ # Default used by app.py
41
+ default_coder_policy = improving_coder_policy
 
 
 
 
 
 
 
 
42
 
43
 
44
  # ──────────────────────────────────────────────
 
46
  # ──────────────────────────────────────────────
47
 
48
  def train(
49
+ coder_policy: Callable[[dict[str, Any]], dict[str, str]] = default_coder_policy,
 
50
  num_episodes: int = MAX_EPISODES,
51
  verbose: bool = True,
52
  ) -> dict[str, Any]:
53
  """
54
  Run the FORGE-v4 training loop.
55
 
56
+ The BreakerAgent is managed by the environment — it automatically tiers up
57
+ based on performance. Only the coder policy needs to be supplied here.
58
+
59
  Args:
60
+ coder_policy: Callable(state) → {"coder_code": str, "coder_version": str}
61
+ num_episodes: Number of episodes to run.
62
+ verbose: Print per-episode summaries when True.
 
63
 
64
  Returns:
65
+ Training summary dict.
66
  """
67
  memory = CoachMemory()
68
+ env = FORGEEnv(memory=memory)
69
 
70
  episode_history: list[dict[str, Any]] = []
71
 
72
+ # Aggregate accumulators for final summary
73
+ all_coder_rewards: list[float] = []
74
+ all_breaker_rewards: list[float] = []
75
+ all_pass_rates: list[float] = []
76
+ all_break_rates: list[float] = []
77
+
78
  for ep in range(1, num_episodes + 1):
79
  state = env.reset()
 
 
80
 
81
+ ep_coder_rewards: list[float] = []
82
+ ep_breaker_rewards: list[float] = []
83
+ ep_pass_rates: list[float] = []
84
+ ep_fail_counts: list[int] = []
85
+ ep_error_counts: list[int] = []
86
+ ep_timeout_counts: list[int] = []
87
+ ep_break_rates: list[float] = []
 
 
88
 
89
+ for _ in range(STEPS_PER_EPISODE):
90
+ action = coder_policy(state)
91
  result = env.step(action)
92
  state = result["state"]
93
 
94
+ cr = result["coder_reward"]
95
+ br = result["breaker_reward"]
96
+
97
+ ep_coder_rewards.append(cr["total_reward"])
98
+ ep_breaker_rewards.append(br["total_reward"])
99
+ ep_pass_rates.append(cr["pass_rate"])
100
+ ep_fail_counts.append(cr["fail_count"])
101
+ ep_error_counts.append(cr["error_count"])
102
+ ep_timeout_counts.append(cr["error_count"])
103
+ ep_break_rates.append(br["break_rate"])
104
 
105
  if result["done"]:
106
  break
107
 
108
  # ── Episode summary ────────────────────────────────────────────────
109
+ def avg(lst: list) -> float:
110
+ return round(sum(lst) / len(lst), 4) if lst else 0.0
111
 
112
  ep_summary = {
113
  "episode": ep,
114
+ "coder_version": action.get("coder_version", "unknown"),
115
+ "breaker_tier": env.breaker.current_tier,
116
+ "avg_coder_reward": avg(ep_coder_rewards),
117
+ "avg_breaker_reward": avg(ep_breaker_rewards),
118
+ "avg_pass_rate": avg(ep_pass_rates),
119
+ "avg_break_rate": avg(ep_break_rates),
120
  "steps": env.step_count,
121
  }
122
  episode_history.append(ep_summary)
123
 
124
+ # ── Log episode to CSV ─────────────────────────────────────────────
125
+ log_episode(
126
+ episode=ep,
127
+ coder_version=ep_summary["coder_version"],
128
+ breaker_tier=ep_summary["breaker_tier"],
129
+ avg_coder_reward=ep_summary["avg_coder_reward"],
130
+ avg_breaker_reward=ep_summary["avg_breaker_reward"],
131
+ avg_pass_rate=ep_summary["avg_pass_rate"],
132
+ total_fail_count=sum(ep_fail_counts),
133
+ total_error_count=sum(ep_error_counts),
134
+ total_timeout_count=sum(ep_timeout_counts),
135
+ avg_break_rate=ep_summary["avg_break_rate"],
136
+ steps=ep_summary["steps"],
137
+ )
138
+
139
+ # ── Accumulate for final summary ───────────────────────────────────
140
+ all_coder_rewards.extend(ep_coder_rewards)
141
+ all_breaker_rewards.extend(ep_breaker_rewards)
142
+ all_pass_rates.extend(ep_pass_rates)
143
+ all_break_rates.extend(ep_break_rates)
144
+
145
  if verbose:
146
+ label = coder_version_label(ep_summary["coder_version"], ep)
147
  print(
148
+ f" [Ep {ep:>3}] Coder: {label:<50} "
149
+ f"pass={ep_summary['avg_pass_rate']:.2f} "
150
+ f"reward={ep_summary['avg_coder_reward']:+.2f} | "
151
+ f"Breaker: {env.breaker.tier_name:<22} "
152
+ f"break={ep_summary['avg_break_rate']:.2f} "
153
+ f"reward={ep_summary['avg_breaker_reward']:+.2f}"
154
  )
155
 
156
+ # ── TRL / Unsloth hook ─────────────────────────────────────────────
157
  _on_episode_end(ep, ep_summary, memory)
158
 
159
+ # ── Final summary JSON ────────────────────────────────────────────────
160
+ update_summary(
161
+ total_episodes=num_episodes,
162
+ coder_version=episode_history[-1]["coder_version"] if episode_history else "unknown",
163
+ final_breaker_tier=env.breaker.current_tier,
164
+ all_coder_rewards=all_coder_rewards,
165
+ all_breaker_rewards=all_breaker_rewards,
166
+ all_pass_rates=all_pass_rates,
167
+ all_break_rates=all_break_rates,
168
+ coach_memory_summary=memory.summary(),
169
+ )
170
+
171
+ return {
172
+ "total_episodes": num_episodes,
173
+ "episode_history": episode_history,
174
+ "memory_summary": memory.summary(),
175
  }
 
176
 
177
 
178
  # ──────────────────────────────────────────────
 
185
  memory: CoachMemory,
186
  ) -> None:
187
  """
188
+ Called at end of every episode.
189
 
190
  TODO: Plug in TRL PPOTrainer / Unsloth model updates here.
191
  E.g.:
192
  trainer.step(queries, responses, rewards)
193
  model.save_pretrained(f"models/checkpoint-ep{episode}")
194
  """
195
+ pass
196
 
197
 
198
+ def _on_step_end(step: int, result: dict[str, Any]) -> None:
 
 
 
199
  """
200
  Called after every environment step.
201
 
202
+ TODO: Plug in per-step reward logging (W&B, TensorBoard) here.
203
  """
204
+ pass
attached_assets/Pasted-Upgrade-the-existing-FORGE-v4-project-from-starter-leve_1777106296176.txt ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Upgrade the existing FORGE-v4 project from a starter-level skeleton into a stronger, hackathon-ready backend prototype.
2
+
3
+ Do NOT rebuild from scratch. Modify the current files intelligently.
4
+
5
+ Current Issues To Fix:
6
+
7
+ 1. Placeholder Coder currently uses Python sorted() and is too perfect.
8
+ 2. Breaker attacks are static and too weak.
9
+ 3. Reward logs / metrics are not realistic enough.
10
+ 4. OpenEnv environment state/action flow needs stronger clarity.
11
+
12
+ Your task is to upgrade the existing project with the following improvements:
13
+
14
+ ---
15
+
16
+ ## A. Replace Perfect Placeholder Coder
17
+
18
+ Create multiple baseline coder strategies inside the project:
19
+
20
+ 1. weak_coder_v1
21
+
22
+ * bubble sort style
23
+ * slow for large arrays
24
+
25
+ 2. weak_coder_v2
26
+
27
+ * handles normal arrays
28
+ * sometimes fails on duplicates or negatives
29
+
30
+ 3. improving_coder
31
+
32
+ * chooses stronger strategy based on episode count
33
+
34
+ Use these instead of always using sorted().
35
+
36
+ This is important so learning progress can be shown later.
37
+
38
+ ---
39
+
40
+ ## B. Upgrade Breaker into Tiered Adversarial System
41
+
42
+ Implement breaker difficulty tiers.
43
+
44
+ Tier 1:
45
+ []
46
+ [1]
47
+ [2,1]
48
+
49
+ Tier 2:
50
+ duplicates
51
+ negative values
52
+ already sorted
53
+ reverse sorted
54
+
55
+ Tier 3:
56
+ large arrays
57
+ many duplicates
58
+ hard distributions
59
+
60
+ Tier 4:
61
+ boundary integer values
62
+ stress tests
63
+
64
+ Unlock next tier based on breaker success rate or episode progress.
65
+
66
+ Breaker should dynamically choose test cases based on current tier.
67
+
68
+ ---
69
+
70
+ ## C. Add Real Metrics + Reward Logging
71
+
72
+ Write log outputs to a logs folder, such as:
73
+
74
+ logs/rewards.json
75
+ logs/episodes.csv
76
+ logs/summary.json
77
+
78
+ Track:
79
+
80
+ * episode number
81
+ * coder reward
82
+ * breaker reward
83
+ * pass rate
84
+ * current tier
85
+ * number of failed tests
86
+ * timeout count
87
+
88
+ Also create helper functions to export metrics cleanly.
89
+
90
+ ---
91
+
92
+ ## D. Improve OpenEnv Style Clarity
93
+
94
+ In env.py make state/action flow cleaner.
95
+
96
+ Environment state should include:
97
+
98
+ {
99
+ task_id,
100
+ problem_description,
101
+ episode_step,
102
+ coder_version,
103
+ current_tier,
104
+ recent_breaker_case,
105
+ pass_rate_history,
106
+ coach_memory_summary
107
+ }
108
+
109
+ step(action) should clearly:
110
+
111
+ 1. run coder
112
+ 2. run breaker
113
+ 3. sandbox evaluate
114
+ 4. assign rewards
115
+ 5. update memory
116
+ 6. log metrics
117
+ 7. return next_state
118
+
119
+ ---
120
+
121
+ ## E. Improve Coach Memory
122
+
123
+ Store lessons like:
124
+
125
+ Episode 4:
126
+ Coder failed on duplicates
127
+ Lesson: handle duplicate values safely
128
+
129
+ Episode 8:
130
+ Coder timed out on large arrays
131
+ Lesson: avoid O(n²) for large arrays
132
+
133
+ ---
134
+
135
+ ## F. Keep Existing Structure
136
+
137
+ Do not remove current modular structure.
138
+
139
+ Files should still use:
140
+
141
+ app.py
142
+ env.py
143
+ tasks.py
144
+ rewards.py
145
+ sandbox.py
146
+ memory.py
147
+ trainer.py
148
+ config.py
149
+
150
+ ---
151
+
152
+ ## G. Final Result Needed
153
+
154
+ After modifications, python app.py should run successfully and show:
155
+
156
+ * coder version used
157
+ * breaker tier used
158
+ * test result summary
159
+ * rewards
160
+ * logs updated
161
+ * coach lessons updated
162
+
163
+ Keep code clean, modular, production-ready, and easy for later VS Code + Copilot + Google Colab upgrades.
replit.md CHANGED
@@ -25,3 +25,43 @@ pnpm workspace monorepo using TypeScript. Each package manages its own dependenc
25
  - `pnpm --filter @workspace/api-server run dev` — run API server locally
26
 
27
  See the `pnpm-workspace` skill for workspace structure, TypeScript setup, and package details.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  - `pnpm --filter @workspace/api-server run dev` — run API server locally
26
 
27
  See the `pnpm-workspace` skill for workspace structure, TypeScript setup, and package details.
28
+
29
+ ## FORGE-v4 (Python — Adversarial RL Environment)
30
+
31
+ Located at `FORGE-v4/`. A standalone Python project; run independently of the pnpm workspace.
32
+
33
+ ### Quick start
34
+ ```bash
35
+ cd FORGE-v4
36
+ python3 app.py # improving_coder vs tiered Breaker
37
+ python3 app.py --coder weak_coder_v1 # bubble sort strategy
38
+ python3 app.py --coder weak_coder_v2 # selection sort w/ abs() bug
39
+ python3 app.py --steps 5 # override step count
40
+ ```
41
+
42
+ ### Key files
43
+ | File | Purpose |
44
+ |------|---------|
45
+ | `app.py` | CLI entry point |
46
+ | `env.py` | `FORGEEnv` — reset/step/get_state |
47
+ | `agents.py` | Coder strategies + `BreakerAgent` (tiered) |
48
+ | `tasks.py` | Task and hidden test generation |
49
+ | `sandbox.py` | Subprocess code execution with timeout |
50
+ | `rewards.py` | `coder_reward()` / `breaker_reward()` |
51
+ | `memory.py` | `CoachMemory` — JSON-backed lessons |
52
+ | `logger.py` | Writes `logs/rewards.json`, `logs/episodes.csv`, `logs/summary.json` |
53
+ | `trainer.py` | Training loop + TRL/Unsloth hook placeholders |
54
+ | `config.py` | All constants |
55
+
56
+ ### Coder strategies
57
+ - `weak_coder_v1` — bubble sort (O(n²), slow on large arrays)
58
+ - `weak_coder_v2` — selection sort with abs() bug (fails on negatives)
59
+ - `improving_coder` — bubble sort → selection sort → `sorted()` by episode
60
+
61
+ ### Breaker tiers
62
+ - Tier 1: empty / single element / tiny arrays
63
+ - Tier 2: duplicates, negatives, sorted/reverse-sorted
64
+ - Tier 3: large arrays, heavy duplicates, stress cases
65
+ - Tier 4: boundary integers (±100), extreme stress
66
+
67
+ Tier unlocks at 60% break rate; Tier 3 needs episode ≥ 4, Tier 4 needs episode ≥ 7.