codemaverick2 commited on
Commit
116a4b1
·
1 Parent(s): 78f3eb2

Add GRPO batch endpoint, task replay in curriculum, update to v2.1.0

Browse files
Files changed (2) hide show
  1. openenv.yaml +11 -3
  2. server/app.py +89 -2
openenv.yaml CHANGED
@@ -1,12 +1,13 @@
1
  spec_version: 1
2
  name: code_review_env
3
- version: "2.0.0"
4
  description: >
5
  A code review and security audit RL environment for training AI agents.
6
  The agent identifies bugs, security vulnerabilities, and performance issues
7
  across 7 tasks of increasing difficulty (easy → medium → medium-hard → hard).
8
  Features: PBRS reward shaping, graduated near-miss rewards, flood protection,
9
- CAMRL curriculum selector, VL return normalization, and cross-language tasks.
 
10
  type: space
11
  runtime: fastapi
12
  app: server.app:app
@@ -53,6 +54,13 @@ tasks:
53
  reward_design:
54
  terminal: "0.70 * F1 + 0.30 * severity_accuracy"
55
  shaping: "PBRS (Ng et al. 1999): phi(s) = (tp/total_gt) * 0.5"
56
- near_miss: "exponential decay: 0.10 * exp(-0.6 * (line_diff - 2))"
 
 
57
  flood_protection: "escalating FP penalty after 3rd false positive"
58
  normalization: "VL Norm (2025): normalized_return = cumulative / steps_used"
 
 
 
 
 
 
1
  spec_version: 1
2
  name: code_review_env
3
+ version: "2.1.0"
4
  description: >
5
  A code review and security audit RL environment for training AI agents.
6
  The agent identifies bugs, security vulnerabilities, and performance issues
7
  across 7 tasks of increasing difficulty (easy → medium → medium-hard → hard).
8
  Features: PBRS reward shaping, graduated near-miss rewards, flood protection,
9
+ CAMRL curriculum with task replay, VL return normalization, GRPO batch endpoint,
10
+ diversity/exploration bonuses, and cross-language tasks (Python + JavaScript).
11
  type: space
12
  runtime: fastapi
13
  app: server.app:app
 
54
  reward_design:
55
  terminal: "0.70 * F1 + 0.30 * severity_accuracy"
56
  shaping: "PBRS (Ng et al. 1999): phi(s) = (tp/total_gt) * 0.5"
57
+ near_miss: "exponential decay: 0.10 * exp(-0.6 * (line_diff - 2)), requires compatible type"
58
+ diversity_bonus: "+0.02 for first TP in a new issue category"
59
+ exploration_bonus: "+0.01 for first TP in a new file (multi-file tasks)"
60
  flood_protection: "escalating FP penalty after 3rd false positive"
61
  normalization: "VL Norm (2025): normalized_return = cumulative / steps_used"
62
+
63
+ training:
64
+ grpo_endpoint: "/grpo_batch — group-relative advantages A_i = (r_i - mean) / std"
65
+ curriculum: "CAMRL with 20% task replay to prevent forgetting"
66
+ rollout: "/trl_rollout — TRL GRPOTrainer compatible batch rollout"
server/app.py CHANGED
@@ -303,6 +303,7 @@ class CurriculumRequest(BaseModel):
303
  agent_performance: Optional[Dict[str, Any]] = None
304
  easy_threshold: float = 0.30
305
  hard_threshold: float = 0.70
 
306
 
307
 
308
  @app.post("/curriculum")
@@ -341,7 +342,14 @@ async def curriculum_task_selector(request: CurriculumRequest):
341
  else:
342
  avg_success = 0.0
343
 
344
- if avg_success < easy_thresh:
 
 
 
 
 
 
 
345
  phase = "easy"
346
  # Focus on task with lowest ground truth issue count (most approachable)
347
  recommended = min(ALL_TASKS.keys(), key=lambda t: len(ALL_TASKS[t]["ground_truth_issues"]))
@@ -352,7 +360,6 @@ async def curriculum_task_selector(request: CurriculumRequest):
352
  else:
353
  phase = "medium"
354
  # Mix: pick a task proportional to difficulty (harder = more likely)
355
- import random
356
  weights = list(task_difficulty.values())
357
  total_w = sum(weights) or 1.0
358
  probs = [w / total_w for w in weights]
@@ -473,6 +480,86 @@ async def trl_rollout(request: TRLRolloutRequest):
473
  }
474
 
475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
  def main():
477
  import uvicorn
478
  port = int(os.environ.get("PORT", 7860))
 
303
  agent_performance: Optional[Dict[str, Any]] = None
304
  easy_threshold: float = 0.30
305
  hard_threshold: float = 0.70
306
+ replay_fraction: float = 0.20 # fraction of time to replay earlier tasks (prevents forgetting)
307
 
308
 
309
  @app.post("/curriculum")
 
342
  else:
343
  avg_success = 0.0
344
 
345
+ # Task replay (prevents catastrophic forgetting, arxiv 2506.06632):
346
+ # With replay_fraction probability, pick an easy/mastered task instead
347
+ replay_frac = request.replay_fraction
348
+ if perf and random.random() < replay_frac:
349
+ # Replay: pick easiest task (lowest GT count) to maintain baseline skills
350
+ phase = "replay"
351
+ recommended = min(ALL_TASKS.keys(), key=lambda t: len(ALL_TASKS[t]["ground_truth_issues"]))
352
+ elif avg_success < easy_thresh:
353
  phase = "easy"
354
  # Focus on task with lowest ground truth issue count (most approachable)
355
  recommended = min(ALL_TASKS.keys(), key=lambda t: len(ALL_TASKS[t]["ground_truth_issues"]))
 
360
  else:
361
  phase = "medium"
362
  # Mix: pick a task proportional to difficulty (harder = more likely)
 
363
  weights = list(task_difficulty.values())
364
  total_w = sum(weights) or 1.0
365
  probs = [w / total_w for w in weights]
 
480
  }
481
 
482
 
483
class GRPOBatchRequest(BaseModel):
    """Request body for POST /grpo_batch.

    Carries G action sequences that are all rolled out on the same task
    (same ``task_id`` and ``seed``) so their final returns can be compared
    group-relatively, GRPO style.
    """

    # Forwarded to the environment reset; None means no explicit task pinned here.
    task_id: Optional[str] = None
    # Forwarded to the environment reset — presumably fixes the task state so
    # all G rollouts see identical conditions (verify against env.reset).
    seed: Optional[int] = None
    # G action sequences for group-relative comparison.
    group: List[List[Dict[str, Any]]]
487
+
488
+
489
+ @app.post("/grpo_batch")
490
+ async def grpo_batch(request: GRPOBatchRequest):
491
+ """
492
+ GRPO group-relative rollout batch (DeepSeek-R1 / DeepSeekMath style).
493
+
494
+ Runs G action sequences on the SAME task, computes group-relative advantages:
495
+ A_i = (r_i - mean(r_1..r_G)) / std(r_1..r_G)
496
+
497
+ This replaces the PPO critic entirely β€” no value network needed.
498
+ Recommended group size G=64 (DeepSeekMath), G=8-16 for faster iteration.
499
+
500
+ Body:
501
+ task_id: str (optional)
502
+ seed: int (optional, ensures same task state for all rollouts)
503
+ group: [[actions_1], [actions_2], ..., [actions_G]]
504
+
505
+ Returns:
506
+ rollouts: [{episode_return, final_score, advantage, ...}]
507
+ group_stats: {mean, std, G}
508
+ """
509
+ G = len(request.group)
510
+ if G < 2:
511
+ raise HTTPException(400, "GRPO requires at least 2 rollouts in the group")
512
+
513
+ returns = []
514
+ rollout_results = []
515
+
516
+ for action_seq in request.group:
517
+ rollout_env = CodeReviewEnvironment()
518
+ rollout_env.reset(task_id=request.task_id, seed=request.seed)
519
+
520
+ episode_return = 0.0
521
+ final_score = 0.0
522
+ n_steps = 0
523
+
524
+ for action_dict in action_seq:
525
+ action = ReviewAction.from_dict(action_dict)
526
+ obs_step = rollout_env.step(action)
527
+ step_data = _serialize(obs_step)
528
+ reward = step_data.get("reward") or 0.0
529
+ episode_return += reward
530
+ n_steps += 1
531
+
532
+ if step_data.get("done", False):
533
+ final_score = step_data.get("reward", step_data.get("current_score", 0.0)) or 0.0
534
+ break
535
+
536
+ returns.append(final_score)
537
+ rollout_results.append({
538
+ "episode_return": round(episode_return, 4),
539
+ "final_score": round(final_score, 4),
540
+ "num_steps": n_steps,
541
+ })
542
+
543
+ # Compute group-relative advantages: A_i = (r_i - mean) / std
544
+ mean_r = sum(returns) / G
545
+ variance = sum((r - mean_r) ** 2 for r in returns) / G
546
+ std_r = max(variance ** 0.5, 1e-8)
547
+
548
+ for i, result in enumerate(rollout_results):
549
+ result["advantage"] = round((returns[i] - mean_r) / std_r, 4)
550
+
551
+ return {
552
+ "task_id": request.task_id,
553
+ "rollouts": rollout_results,
554
+ "group_stats": {
555
+ "mean": round(mean_r, 4),
556
+ "std": round(std_r, 4),
557
+ "G": G,
558
+ },
559
+ "method": "GRPO",
560
+ }
561
+
562
+
563
  def main():
564
  import uvicorn
565
  port = int(os.environ.get("PORT", 7860))