Spaces:

Dishaaa25
/

meta-rl-dsa-solver

Running

App Files Files Community

kaustubhg73 committed on Apr 25

Commit

96b50a5

1 Parent(s): 864223c

Add HF support

Browse files

Files changed (14) hide show

README.md +38 -1
client.py +1 -1
env/adapt_env.py +14 -3
env/executor.py +1 -1
inference.py +159 -0
models.py +16 -1
pyproject.toml +1 -1
scripts/test_env.py +9 -0
scripts/test_verifier.py +10 -1
server/app.py +150 -16
test.py +1 -1
training/train_grpo.py +141 -156
verifier/sandbox.py +3 -2
verifier/verifier.py +8 -2

README.md CHANGED Viewed

@@ -14,7 +14,7 @@ tags:
 ADAPT, the Adversarial DSA Tutor, is an OpenEnv-compliant RLVR environment for training code-generation agents on small DSA tasks. The agent receives a problem prompt, examples, and visible tests, then submits Python code. The environment runs the code against visible and hidden tests and returns reward, pass-rate metrics, execution status, and feedback.
-This repo now focuses on the environment layer only. Verifier work and training scripts are owned separately.
 ## Why This Environment
@@ -120,11 +120,15 @@ uvicorn server.app:app --host 0.0.0.0 --port 7860
 Useful endpoints:
 - `GET /health`
 - `GET /schema`
 - `POST /reset`
 - `POST /step`
 - `GET /state`
 Example step request:
@@ -132,12 +136,45 @@ Example step request:
 curl -X POST http://localhost:7860/step -H "Content-Type: application/json" -d "{\"action\":{\"code\":\"n=int(input())\nprint(n*2)\"}}"
 ```
 Validate with OpenEnv once dependencies are installed:
 ```powershell
 openenv validate .
 ```
 ## Hugging Face Spaces
 This repo is Docker Space ready:

 ADAPT, the Adversarial DSA Tutor, is an OpenEnv-compliant RLVR environment for training code-generation agents on small DSA tasks. The agent receives a problem prompt, examples, and visible tests, then submits Python code. The environment runs the code against visible and hidden tests and returns reward, pass-rate metrics, execution status, and feedback.
+This repo includes the environment, verifier helpers, a baseline inference runner, and a GRPO training entrypoint so the full submission flow can be exercised from one codebase.
 ## Why This Environment
 Useful endpoints:
+- `GET /`
 - `GET /health`
+- `GET /metadata`
+- `GET /tasks`
 - `GET /schema`
 - `POST /reset`
 - `POST /step`
 - `GET /state`
+- `POST /mcp`
 Example step request:
 curl -X POST http://localhost:7860/step -H "Content-Type: application/json" -d "{\"action\":{\"code\":\"n=int(input())\nprint(n*2)\"}}"
 ```
+You can also send the raw action body:
+```powershell
+curl -X POST http://localhost:7860/step -H "Content-Type: application/json" -d "{\"code\":\"n=int(input())\nprint(n*2)\"}"
+```
 Validate with OpenEnv once dependencies are installed:
 ```powershell
 openenv validate .
 ```
+Run the verifier smoke test:
+```powershell
+python scripts\test_verifier.py
+```
+Run the environment smoke test:
+```powershell
+python scripts\test_env.py
+```
+Run the baseline model loop:
+```powershell
+$env:HF_TOKEN="..."
+$env:API_BASE_URL="https://router.huggingface.co/v1"
+$env:MODEL_NAME="openai/gpt-oss-120b"
+python inference.py
+```
+Run GRPO training:
+```powershell
+python training\train_grpo.py --output-dir outputs_v2 --bf16
+```
 ## Hugging Face Spaces
 This repo is Docker Space ready:

client.py CHANGED Viewed

@@ -21,7 +21,7 @@ class AdaptEnvClient:
         return response.json()
     def step(self, code: str) -> dict[str, Any]:
-        response = self._client.post("/step", json={"action": AdaptAction(code=code).model_dump()})
         response.raise_for_status()
         return response.json()

         return response.json()
     def step(self, code: str) -> dict[str, Any]:
+        response = self._client.post("/step", json=AdaptAction(code=code).model_dump())
         response.raise_for_status()
         return response.json()

env/adapt_env.py CHANGED Viewed

@@ -1,15 +1,26 @@
 from __future__ import annotations
 import ast
-from typing import Any
 from uuid import uuid4
-from openenv.core.env_server.interfaces import Environment
 from env.executor import run_code
 from env.test_cases import get_test_cases, load_problem, split_test_cases
 from models import AdaptAction, AdaptObservation, AdaptState
 FORBIDDEN_IMPORTS = {"os", "pathlib", "shutil", "socket", "subprocess"}
 DIFFICULTY_LABELS = {1: "easy", 2: "medium", 3: "hard"}

 from __future__ import annotations
 import ast
+from typing import Any, Generic, TypeVar
 from uuid import uuid4
 from env.executor import run_code
 from env.test_cases import get_test_cases, load_problem, split_test_cases
 from models import AdaptAction, AdaptObservation, AdaptState
+try:
+    from openenv.core.env_server.interfaces import Environment
+except ImportError:
+    ActionT = TypeVar("ActionT")
+    ObservationT = TypeVar("ObservationT")
+    StateT = TypeVar("StateT")
+    class Environment(Generic[ActionT, ObservationT, StateT]):
+        SUPPORTS_CONCURRENT_SESSIONS = False
+        def __init__(self) -> None:
+            pass
 FORBIDDEN_IMPORTS = {"os", "pathlib", "shutil", "socket", "subprocess"}
 DIFFICULTY_LABELS = {1: "easy", 2: "medium", 3: "hard"}

env/executor.py CHANGED Viewed

@@ -23,7 +23,7 @@ def run_code(code: str, input_data: str) -> dict:
         try:
             result = subprocess.run(
-                ["python3", str(file_path)],
                 input=input_data,
                 text=True,
                 capture_output=True,

         try:
             result = subprocess.run(
+                [sys.executable, str(file_path)],
                 input=input_data,
                 text=True,
                 capture_output=True,

inference.py ADDED Viewed

	@@ -0,0 +1,159 @@

+"""
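+STDOUT FORMAT (must match exactly; note that the log_* helpers below emit a single space after each tag, so the extra padding after [STEP] and [END] shown here is not reproduced in the output):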
+[START] task=<task_name> env=adapt_dsa_tutor model=<model_name>
+[STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+[END]   success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
+"""
+from __future__ import annotations
+import json
+import os
+from typing import Any
+from env.adapt_env import AdaptEnvironment
+from env.test_cases import load_problem_bank
+from models import AdaptAction
+BENCHMARK = "adapt_dsa_tutor"
+TASKS = [problem["problem_id"] for problem in load_problem_bank()]
+SYSTEM_PROMPT = """You are solving a programming problem in Python.
+You will receive:
+- a problem statement
+- input format
+- constraints
+- worked examples
+- visible tests
+- feedback from previous attempts
+Reply with ONLY runnable Python code. The code must read from stdin and print to stdout.
+Do not include markdown fences or explanations."""
+def require_env(name: str, value: str | None) -> str:
+    if value:
+        return value
+    raise RuntimeError(f"Missing required environment variable: {name}")
+def safe_log_value(value: str | None) -> str:
+    if not value:
+        return "null"
+    return str(value).replace("\n", "_").replace("\r", "_").replace("\t", "_").replace(" ", "_")
+def extract_code(response_text: str) -> str:
+    text = response_text.strip()
+    if text.startswith("```"):
+        parts = text.split("\n", 1)
+        text = parts[1] if len(parts) > 1 else text[3:]
+        if text.endswith("```"):
+            text = text[:-3]
+        text = text.strip()
+    if text.startswith("python"):
+        text = text[6:].strip()
+    return text
+def build_user_prompt(observation: dict[str, Any]) -> str:
+    payload = {
+        "problem_id": observation["problem_id"],
+        "difficulty": observation["difficulty"],
+        "problem": observation["problem"],
+        "input_format": observation["input_format"],
+        "constraints": observation["constraints"],
+        "examples": observation["examples"],
+        "visible_tests": observation["visible_tests"],
+        "feedback": observation["feedback"],
+    }
+    return json.dumps(payload, indent=2)
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(step: int, action_str: str, reward: float, done: bool, error: str | None) -> None:
+    print(
+        f"[STEP] step={step} action={safe_log_value(action_str)} reward={reward:.2f} "
+        f"done={str(done).lower()} error={safe_log_value(error)}",
+        flush=True,
+    )
+def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
+    rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rewards_str}",
+        flush=True,
+    )
+def run_task(task_name: str) -> float:
+    try:
+        from openai import OpenAI
+    except ImportError as exc:
+        raise RuntimeError(
+            "The `openai` package is required for inference runs. Install it before running inference.py."
+        ) from exc
+    api_key = require_env("HF_TOKEN", os.getenv("HF_TOKEN"))
+    base_url = require_env("API_BASE_URL", os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"))
+    model_name = require_env("MODEL_NAME", os.getenv("MODEL_NAME", "openai/gpt-oss-120b"))
+    client = OpenAI(base_url=base_url, api_key=api_key)
+    env = AdaptEnvironment()
+    observation = env.reset(problem_id=task_name)
+    log_start(task_name, BENCHMARK, model_name)
+    rewards = []
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    max_steps = 3
+    for step_index in range(1, max_steps + 1):
+        messages.append({"role": "user", "content": build_user_prompt(observation.model_dump())})
+        try:
+            response = client.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                temperature=0.0,
+                max_tokens=512,
+            )
+            response_text = response.choices[0].message.content or ""
+            messages.append({"role": "assistant", "content": response_text})
+            code = extract_code(response_text)
+            observation = env.step(AdaptAction(code=code))
+            rewards.append(float(observation.reward))
+            log_step(step_index, "submit_code", float(observation.reward), bool(observation.done), None)
+            if observation.pass_rate == 1.0 or observation.done:
+                break
+        except Exception as exc:
+            rewards.append(0.0)
+            log_step(step_index, "parse_error", 0.0, False, str(exc))
+            messages.append(
+                {
+                    "role": "user",
+                    "content": f"Your last response failed. Error: {exc}. Reply with only Python code.",
+                }
+            )
+    success = observation.pass_rate == 1.0
+    score = float(observation.reward)
+    log_end(success, len(rewards), score, rewards)
+    return score
+def main() -> dict[str, float]:
+    scores = {}
+    for task in TASKS:
+        scores[task] = run_task(task)
+    return scores
+if __name__ == "__main__":
+    main()

models.py CHANGED Viewed

@@ -2,7 +2,22 @@ from __future__ import annotations
 from typing import Any
-from openenv.core.env_server.types import Action, Observation, State
 from pydantic import Field

 from typing import Any
+from pydantic import BaseModel, Field
+try:
+    from openenv.core.env_server.types import Action, Observation, State
+except ImportError:
+    class Action(BaseModel):
+        model_config = {"extra": "forbid"}
+    class Observation(BaseModel):
+        reward: float = Field(default=0.0, ge=0.0, le=1.0)
+        done: bool = False
+    class State(BaseModel):
+        episode_id: str = ""
+        step_count: int = 0
 from pydantic import Field

pyproject.toml CHANGED Viewed

@@ -26,5 +26,5 @@ server = "server.app:main"
 [tool.setuptools]
 include-package-data = true
-packages = ["env", "server"]
 py-modules = ["app", "client", "models"]

 [tool.setuptools]
 include-package-data = true
+packages = ["env", "server", "verifier"]
 py-modules = ["app", "client", "models"]

scripts/test_env.py CHANGED Viewed

@@ -1,3 +1,12 @@
 from env.adapt_env import AdaptEnvironment
 from models import AdaptAction

+from __future__ import annotations
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
 from env.adapt_env import AdaptEnvironment
 from models import AdaptAction

scripts/test_verifier.py CHANGED Viewed

@@ -1,3 +1,12 @@
 from verifier.verifier import verify
@@ -38,4 +47,4 @@ for name, code in [
     print("Pass rate:", info["pass_rate"])
     print("Passed:", info["passed"], "/", info["total"])
     print("Timeouts:", info["timeout_count"])
-    print("Errors:", info["error_count"])

+from __future__ import annotations
+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
 from verifier.verifier import verify
     print("Pass rate:", info["pass_rate"])
     print("Passed:", info["passed"], "/", info["total"])
     print("Timeouts:", info["timeout_count"])
+    print("Errors:", info["error_count"])

server/app.py CHANGED Viewed

@@ -1,28 +1,162 @@
 from __future__ import annotations
-try:
-    from openenv.core.env_server.http_server import create_app
-except Exception as exc:  # pragma: no cover
-    raise ImportError(
-        "openenv-core>=0.2.3 is required. Install with: pip install -e ."
-    ) from exc
-from env.adapt_env import AdaptEnvironment
-from models import AdaptAction, AdaptObservation
-app = create_app(
-    AdaptEnvironment,
-    AdaptAction,
-    AdaptObservation,
-    env_name="adapt_dsa_tutor",
-    max_concurrent_envs=4,
 )
-def main(host: str = "0.0.0.0", port: int = 7860) -> None:
-    import uvicorn
     uvicorn.run(app, host=host, port=port)

 from __future__ import annotations
+import argparse
+from typing import Any
+import uvicorn
+from fastapi import Body, FastAPI, HTTPException, Request
+from fastapi.responses import RedirectResponse, Response
+from pydantic import BaseModel
+from env.adapt_env import AdaptEnvironment
+from env.test_cases import load_problem_bank
+from models import AdaptAction, AdaptObservation, AdaptState
+ENV_NAME = "adapt_dsa_tutor"
+ENV_DESCRIPTION = (
+    "RL environment for DSA code generation with hidden tests, tiered problems, "
+    "and verifier-aware reward shaping."
 )
+TASKS = [
+    {
+        "name": problem["problem_id"],
+        "difficulty": problem["difficulty"],
+        "description": problem["problem"],
+    }
+    for problem in load_problem_bank()
+]
+app = FastAPI(title="ADAPT DSA Tutor OpenEnv", version="0.2.0")
+ENV = AdaptEnvironment()
+class ResetRequest(BaseModel):
+    seed: int | None = None
+    episode_id: str | None = None
+    problem_id: str | None = None
+    difficulty: str | None = None
+def _metadata() -> dict[str, Any]:
+    return {
+        "name": ENV_NAME,
+        "description": ENV_DESCRIPTION,
+        "version": "0.2.0",
+        "tasks": TASKS,
+        "mode": "simulation",
+    }
+@app.get("/")
+def root() -> dict[str, Any]:
+    payload = _metadata()
+    payload["status"] = "ok"
+    return payload
+@app.get("/web", include_in_schema=False)
+def web_root() -> RedirectResponse:
+    return RedirectResponse(url="/", status_code=307)
+@app.get("/web/", include_in_schema=False)
+def web_root_slash() -> RedirectResponse:
+    return RedirectResponse(url="/", status_code=307)
+@app.get("/favicon.ico", include_in_schema=False)
+def favicon() -> Response:
+    return Response(status_code=204)
+@app.get("/health")
+def health() -> dict[str, str]:
+    return {"status": "healthy"}
+@app.get("/metadata")
+def metadata() -> dict[str, Any]:
+    return _metadata()
+@app.get("/tasks")
+def list_tasks() -> dict[str, Any]:
+    return {"tasks": TASKS}
+@app.get("/schema")
+def schema() -> dict[str, Any]:
+    return {
+        "action": AdaptAction.model_json_schema(),
+        "observation": AdaptObservation.model_json_schema(),
+        "state": AdaptState.model_json_schema(),
+    }
+@app.post("/mcp")
+def mcp(payload: dict[str, Any] = Body(default_factory=dict)) -> dict[str, Any]:
+    return {
+        "jsonrpc": "2.0",
+        "id": payload.get("id"),
+        "error": {
+            "code": -32601,
+            "message": "MCP methods are not implemented for this environment.",
+        },
+    }
+@app.post("/reset")
+def reset(request: ResetRequest | None = None) -> dict[str, Any]:
+    effective_request = request or ResetRequest()
+    observation = ENV.reset(
+        seed=effective_request.seed,
+        episode_id=effective_request.episode_id,
+        problem_id=effective_request.problem_id,
+        difficulty=effective_request.difficulty,
+    )
+    return observation.model_dump()
+@app.post("/step")
+async def step(request: Request) -> dict[str, Any]:
+    payload = await request.json()
+    if not isinstance(payload, dict):
+        raise HTTPException(status_code=422, detail="Request body must be a JSON object.")
+    raw_action = payload.get("action", payload)
+    try:
+        effective_action = AdaptAction.model_validate(raw_action)
+    except Exception as exc:
+        raise HTTPException(status_code=422, detail=f"Invalid action payload: {exc}") from exc
+    observation = ENV.step(effective_action)
+    return {
+        "observation": observation.model_dump(),
+        "reward": float(observation.reward),
+        "done": bool(observation.done),
+        "info": {
+            "feedback": observation.feedback,
+            "pass_rate": observation.pass_rate,
+            "execution_status": observation.execution_status,
+        },
+    }
+@app.get("/state")
+def state() -> dict[str, Any]:
+    if not ENV.problem:
+        ENV.reset()
+    return ENV.state.model_dump()
+def main(host: str | None = None, port: int | None = None) -> None:
+    if host is None or port is None:
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--host", default="0.0.0.0")
+        parser.add_argument("--port", type=int, default=7860)
+        args = parser.parse_args()
+        host = args.host if host is None else host
+        port = args.port if port is None else port
     uvicorn.run(app, host=host, port=port)

test.py CHANGED Viewed

@@ -12,7 +12,7 @@ def assert_hidden_tests_are_not_exposed(payload: dict) -> None:
 def main() -> None:
     env = AdaptEnvironment()
-    observation = env.reset()
     assert isinstance(observation, AdaptObservation)
     assert observation.visible_tests
     assert observation.problem_id == "easy_double"

 def main() -> None:
     env = AdaptEnvironment()
+    observation = env.reset(problem_id="easy_double")
     assert isinstance(observation, AdaptObservation)
     assert observation.visible_tests
     assert observation.problem_id == "easy_double"

training/train_grpo.py CHANGED Viewed

@@ -1,167 +1,152 @@
-import torch
-<<<<<<< HEAD
-from unsloth import FastLanguageModel, PatchFastRL
-from trl import GRPOTrainer, GRPOConfig
-from meta_rl_dsa_solver_env import DsaEnv
-# 1. Patch Unsloth for RL speedups
-PatchFastRL("GRPO", FastLanguageModel)
-# 2. Load Model & Tokenizer
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "unsloth/Llama-3.2-3B-Instruct", # Use appropriate 2026 base
-    max_seq_length = 2048,
-    load_in_4bit = True,
-    fast_inference = True,
-=======
-import numpy as np
-from unsloth import FastLanguageModel, PatchFastRL
-from trl import GRPOTrainer, GRPOConfig
-from meta_rl_dsa_solver_env.env.adapt_env import AdaptEnvironment
-from meta_rl_dsa_solver_env.models import AdaptAction
-# 1. Initialize Model & Speedups
-PatchFastRL("GRPO", FastLanguageModel)
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "unsloth/Llama-3.2-3B-Instruct",
-    max_seq_length = 2048,
-    load_in_4bit = True,
->>>>>>> environment-v2
-)
-model = FastLanguageModel.get_peft_model(
-    model,
-    r = 16,
-    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
-    lora_alpha = 16,
-<<<<<<< HEAD
-    lora_dropout = 0,
-)
-# 3. Define the Reward Function (Interface for Person 2)
-def reward_function(prompts, completions, **kwargs) -> list[float]:
-    """
-    In GRPO, the reward function is called on the batch of completions.
-    For V0, we manually trigger the Env's step logic.
-    """
-    env = DsaEnv()
-    rewards = []
-    for completion in completions:
-        # Extract code from completion (assuming markdown tags)
-        code = completion.split("```python")[-1].split("```")[0].strip() if "```" in completion else completion
-        _, reward, _, _, _ = env.step(code)
-        rewards.append(reward)
-    return rewards
-# 4. Training Configuration
-training_args = GRPOConfig(
-    output_dir = "./outputs",
-    learning_rate = 5e-6,
-    per_device_train_batch_size = 4,
-    gradient_accumulation_steps = 4,
-    max_prompt_length = 512,
-    max_completion_length = 512,
-    num_generations = 8, # Group size for GRPO
-    logging_steps = 1,
-    max_steps = 100, # Quick run for MVP
-)
-# 5. Initialize Trainer
-trainer = GRPOTrainer(
-    model = model,
-    reward_funcs = [reward_function],
-    args = training_args,
-    train_dataset = [
-        {"prompt": "Write a function `sum_list(arr: list) -> int` that returns the sum of a list."}
-    ] * 100, # Dummy dataset for V0 validation
-)
-if __name__ == "__main__":
-    print("Starting V0 Training...")
-    trainer.train()
-    model.save_pretrained_merged("final_v0_model", tokenizer, save_method = "merged_16bit")
-=======
-)
-# 2. V2 Heuristic State Machine
 class CurriculumManager:
-    def __init__(self):
-        self.difficulties = ["easy", "medium", "hard"]
-        self.current_idx = 0
-        self.success_history = []
-        self.window_size = 10  # Moving average window
-    def get_current_difficulty(self):
         return self.difficulties[self.current_idx]
-    def update(self, success_rate):
-        self.success_history.append(success_rate)
         if len(self.success_history) > self.window_size:
             self.success_history.pop(0)
-        # V2 Logic: If moving average > 70%, increase difficulty
-        avg_success = np.mean(self.success_history)
-        if avg_success > 0.70 and self.current_idx < len(self.difficulties) - 1:
             self.current_idx += 1
-            print(f"--- HEURISTIC LEVEL UP: Moving to {self.difficulties[self.current_idx]} ---")
-            self.success_history = [] # Reset for the new tier
-curriculum = CurriculumManager()
-# 3. V2 Reward Function with Curriculum Feedback
-def v2_reward_func(prompts, completions, **kwargs) -> list[float]:
-    env = AdaptEnvironment()
-    rewards = []
-    successes = []
-    current_diff = curriculum.get_current_difficulty()
-    for completion in completions:
-        # Load problem based on current heuristic difficulty
-        env.reset(difficulty=current_diff)
-        code = completion.split("```python")[-1].split("```")[0].strip() if "```" in completion else completion
-        action = AdaptAction(code=code)
-        obs = env.step(action)
-        rewards.append(float(obs.reward))
-        successes.append(1.0 if obs.pass_rate == 1.0 else 0.0)
-    # Update the curriculum manager based on this batch
-    batch_success_rate = np.mean(successes)
-    curriculum.update(batch_success_rate)
-    return rewards
-# 4. Dataset: Transition from single prompt to generic instruction
-# This forces the LLM to look at the 'problem statement' in the observation
-dataset = [
-    {"prompt": "Read the problem statement and constraints carefully. Write a Python solution that reads from stdin and prints to stdout."}
-] * 200 # Larger dataset for multi-tier learning
-# 5. Config
-training_args = GRPOConfig(
-    output_dir = "./outputs_v2",
-    learning_rate = 5e-6,
-    per_device_train_batch_size = 1,
-    gradient_accumulation_steps = 8, # Higher for stability during transitions
-    num_generations = 8,
-    max_steps = 250,
-    bf16 = True,
-    logging_steps = 1,
-)
-trainer = GRPOTrainer(
-    model = model,
-    reward_funcs = [v2_reward_func],
-    args = training_args,
-    train_dataset = dataset,
-)
 if __name__ == "__main__":
-    print(f"Starting V2 Training. Initial Difficulty: {curriculum.get_current_difficulty()}")
-    trainer.train()
->>>>>>> environment-v2

+from __future__ import annotations
+import argparse
+from dataclasses import dataclass, field
+from typing import Any
+from env.adapt_env import AdaptEnvironment
+from models import AdaptAction
+def extract_code(completion: str) -> str:
+    text = completion.strip()
+    if "```python" in text:
+        return text.split("```python", 1)[1].split("```", 1)[0].strip()
+    if "```" in text:
+        return text.split("```", 1)[1].split("```", 1)[0].strip()
+    return text
+@dataclass
 class CurriculumManager:
+    difficulties: list[str] = field(default_factory=lambda: ["easy", "medium", "hard"])
+    current_idx: int = 0
+    success_history: list[float] = field(default_factory=list)
+    window_size: int = 10
+    def current_difficulty(self) -> str:
         return self.difficulties[self.current_idx]
+    def update(self, batch_success_rate: float) -> None:
+        self.success_history.append(float(batch_success_rate))
         if len(self.success_history) > self.window_size:
             self.success_history.pop(0)
+        moving_average = sum(self.success_history) / len(self.success_history)
+        if moving_average > 0.70 and self.current_idx < len(self.difficulties) - 1:
             self.current_idx += 1
+            self.success_history.clear()
+            print(
+                f"[curriculum] promoted to {self.current_difficulty()} "
+                f"(moving_success={moving_average:.2f})"
+            )
+def build_reward_func(curriculum: CurriculumManager):
+    def reward_func(prompts, completions, **kwargs) -> list[float]:
+        del prompts, kwargs
+        env = AdaptEnvironment()
+        rewards: list[float] = []
+        successes: list[float] = []
+        difficulty = curriculum.current_difficulty()
+        for completion in completions:
+            env.reset(difficulty=difficulty)
+            observation = env.step(AdaptAction(code=extract_code(completion)))
+            rewards.append(float(observation.reward))
+            successes.append(1.0 if observation.pass_rate == 1.0 else 0.0)
+        if successes:
+            curriculum.update(sum(successes) / len(successes))
+        return rewards
+    return reward_func
+def build_dataset(size: int) -> list[dict[str, str]]:
+    prompt = (
+        "Read the problem statement carefully. "
+        "Write a Python solution that reads from stdin and prints to stdout."
+    )
+    return [{"prompt": prompt}] * size
+def run_training(args: argparse.Namespace) -> None:
+    try:
+        from trl import GRPOConfig, GRPOTrainer
+        from unsloth import FastLanguageModel, PatchFastRL
+    except ImportError as exc:
+        raise RuntimeError(
+            "Training dependencies are missing. Install `trl` and `unsloth` before running GRPO training."
+        ) from exc
+    PatchFastRL("GRPO", FastLanguageModel)
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=args.model_name,
+        max_seq_length=args.max_seq_length,
+        load_in_4bit=not args.disable_4bit,
+    )
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=args.lora_rank,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        lora_alpha=args.lora_alpha,
+        lora_dropout=0.0,
+    )
+    curriculum = CurriculumManager()
+    training_args = GRPOConfig(
+        output_dir=args.output_dir,
+        learning_rate=args.learning_rate,
+        per_device_train_batch_size=args.batch_size,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        num_generations=args.num_generations,
+        max_prompt_length=args.max_prompt_length,
+        max_completion_length=args.max_completion_length,
+        max_steps=args.max_steps,
+        logging_steps=1,
+        bf16=args.bf16,
+    )
+    trainer = GRPOTrainer(
+        model=model,
+        reward_funcs=[build_reward_func(curriculum)],
+        args=training_args,
+        train_dataset=build_dataset(args.dataset_size),
+    )
+    trainer.train()
+    model.save_pretrained(args.output_dir)
+    tokenizer.save_pretrained(args.output_dir)
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="GRPO training entrypoint for the ADAPT DSA environment.")
+    parser.add_argument("--model-name", default="unsloth/Llama-3.2-3B-Instruct")
+    parser.add_argument("--output-dir", default="outputs_v2")
+    parser.add_argument("--dataset-size", type=int, default=200)
+    parser.add_argument("--max-steps", type=int, default=250)
+    parser.add_argument("--batch-size", type=int, default=1)
+    parser.add_argument("--gradient-accumulation-steps", type=int, default=8)
+    parser.add_argument("--num-generations", type=int, default=8)
+    parser.add_argument("--max-seq-length", type=int, default=2048)
+    parser.add_argument("--max-prompt-length", type=int, default=512)
+    parser.add_argument("--max-completion-length", type=int, default=512)
+    parser.add_argument("--learning-rate", type=float, default=5e-6)
+    parser.add_argument("--lora-rank", type=int, default=16)
+    parser.add_argument("--lora-alpha", type=int, default=16)
+    parser.add_argument("--disable-4bit", action="store_true")
+    parser.add_argument("--bf16", action="store_true")
+    return parser
+def main(argv: list[str] | None = None) -> None:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    run_training(args)
 if __name__ == "__main__":
+    main()

verifier/sandbox.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import subprocess
 import tempfile
@@ -11,7 +12,7 @@ def run_code(code: str, stdin: str, timeout: int = 2):
             path = f.name
         result = subprocess.run(
-            ["python3", path],
             input=stdin,
             text=True,
             capture_output=True,
@@ -31,4 +32,4 @@ def run_code(code: str, stdin: str, timeout: int = 2):
     finally:
         if path and os.path.exists(path):
-            os.remove(path)

 import os
 import subprocess
+import sys
 import tempfile
             path = f.name
         result = subprocess.run(
+            [sys.executable, path],
             input=stdin,
             text=True,
             capture_output=True,
     finally:
         if path and os.path.exists(path):
+            os.remove(path)

verifier/verifier.py CHANGED Viewed

@@ -5,7 +5,13 @@ from verifier.metrics import compute_pass_rate
 def verify(code: str, test_cases):
     results = []
-    for stdin, expected in test_cases:
         ok, output = run_code(code, stdin)
         passed = ok and output.strip() == expected.strip()
@@ -23,4 +29,4 @@ def verify(code: str, test_cases):
     return reward, {
         **metrics,
         "results": results,
-    }

 def verify(code: str, test_cases):
     results = []
+    for test_case in test_cases:
+        if isinstance(test_case, dict):
+            stdin = str(test_case.get("input", ""))
+            expected = str(test_case.get("output", ""))
+        else:
+            stdin, expected = test_case
         ok, output = run_code(code, stdin)
         passed = ok and output.strip() == expected.strip()
     return reward, {
         **metrics,
         "results": results,
+    }