Spaces:

ncncomplete
/

code-review-env

Sleeping

App Files Files Community

ncncomplete committed on Apr 8

Commit

d145b94

verified ·

1 Parent(s): 092031c

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

README.md +35 -8
client.py +7 -0
inference.py +153 -121
models.py +17 -9
server/app.py +26 -2
server/python_codeact_env.py +53 -55
server/task_bank.py +157 -0

README.md CHANGED Viewed

@@ -13,7 +13,9 @@ tags:
 # Coding Environment
-A Python code execution environment that runs arbitrary Python code and returns results. Perfect for testing code execution infrastructure and demonstrating environment usage patterns.
 ## Quick Start
@@ -77,20 +79,45 @@ docker build -t coding-env:latest -f envs/coding_env/server/Dockerfile .
 ## Environment Details
 ### Action
-**CodeAction**: Contains a single field
-- `code` (str) - The Python code to execute
 ### Observation
-**CodeObservation**: Contains the execution results
-- `stdout` (str) - Standard output from code execution
-- `stderr` (str) - Standard error from code execution
-- `exit_code` (int) - Exit code (0 for success, non-zero for errors)
 ### State
 **CodeState**: Tracks execution state
 - `episode_id` (str) - Unique identifier for the episode
 - `step_count` (int) - Number of steps taken
-- `last_exit_code` (int) - Exit code from the last execution
 ## Advanced Usage

 # Coding Environment
+A code-review benchmark environment with three graded tasks (easy/medium/hard).
+Each episode provides a buggy snippet and asks the agent to return a structured
+review (`bug_type`, `line_number`, `review`, `confidence`).
 ## Quick Start
 ## Environment Details
 ### Action
+**CodeAction** fields:
+- `review` (str) - Human-readable review summary
+- `bug_type` (str) - One of `syntax | logic | security | none`
+- `line_number` (int) - Suspected faulty line
+- `confidence` (float) - Confidence score in `[0.0, 1.0]`
 ### Observation
+**CodeObservation** fields:
+- `task_id` (str) - Current task id
+- `difficulty` (str) - Task difficulty (`easy|medium|hard`)
+- `task_description` (str) - Review instructions
+- `code_snippet` (str) - Code to analyze
+- `previous_feedback` (str) - Grader feedback from latest step
+- `reward` (float) - Normalized score for the step, in `[0.0, 1.0]`
+- `done` (bool) - Episode termination flag
 ### State
 **CodeState**: Tracks execution state
 - `episode_id` (str) - Unique identifier for the episode
 - `step_count` (int) - Number of steps taken
+- `task_id` (str) - Active task id
+- `difficulty` (str) - Active task difficulty
+- `last_score` (float) - Last normalized score
+## Built-in Tasks and Graders
+The server exposes:
+- `GET /tasks` to list all benchmark tasks.
+- `GET /grader?task_id=<id>&episode_id=<id>` to read the final normalized score recorded for that task/episode pair (`0.0` if nothing has been recorded).
+Shipped tasks:
+- `task_easy_1` (logic)
+- `task_medium_1` (security)
+- `task_hard_1` (logic/performance-concurrency)
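+Rewards are in `[0.0, 1.0]` and award partial credit for:
+- bug type correctness (+0.50)
+- line number accuracy (+0.30 for an exact match, +0.15 when off by one line)
+- review evidence keywords (+0.10 per expected keyword found, capped at +0.20)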
 ## Advanced Usage

client.py CHANGED Viewed

@@ -27,6 +27,10 @@ class CodingEnv(EnvClient[CodeAction, CodeObservation, CodeState]):
     def _step_payload(self, action: CodeAction) -> dict:
         # Shape expected by the server's /step endpoint under "action"
         return {
             "code": action.code,
         }
@@ -53,4 +57,7 @@ class CodingEnv(EnvClient[CodeAction, CodeObservation, CodeState]):
             episode_id=payload.get("episode_id"),
             step_count=payload.get("step_count", 0),
             last_exit_code=payload.get("last_exit_code", 0),
         )

     def _step_payload(self, action: CodeAction) -> dict:
+        """Serialize a review action into the JSON body sent for a step."""
         # Shape expected by the server's /step endpoint under "action"
         return {
+            # Structured review fields introduced by the code-review tasks.
+            "review": action.review,
+            "bug_type": action.bug_type,
+            "line_number": action.line_number,
+            "confidence": action.confidence,
+            # Still forwarded alongside the review fields.
             "code": action.code,
         }
             episode_id=payload.get("episode_id"),
             step_count=payload.get("step_count", 0),
             last_exit_code=payload.get("last_exit_code", 0),
+            task_id=payload.get("task_id", ""),
+            difficulty=payload.get("difficulty", ""),
+            last_score=float(payload.get("last_score", 0.0)),
         )

inference.py CHANGED Viewed

@@ -1,171 +1,203 @@
 #!/usr/bin/env python3
-"""Code Review Environment Baseline Evaluation.
-This script is hardened for validator compatibility:
-- Always prints [START]/[STEP]/[END] to stdout with flush=True
-- Avoids failing before first [START] due to optional deps/credentials
-- Never redirects stdout
 """
 from __future__ import annotations
 import json
 import os
-from typing import Any, Dict, Optional
-try:
-    import requests
-except Exception:
-    requests = None  # type: ignore[assignment]
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-# Required checklist variables:
-# - API_BASE_URL and MODEL_NAME have defaults
-# - HF_TOKEN has no default
-# - LOCAL_IMAGE_NAME is optional
-API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
-MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
 HF_TOKEN = os.getenv("HF_TOKEN")
 LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
-# List of task IDs to evaluate
-TASKS = os.getenv("TASKS", "task_1,task_2,task_3").split(",")
-# ---------------------------------------------------------------------------
-# Main Task Runner
-# ---------------------------------------------------------------------------
-def _build_action(task_description: str, code_snippet: str) -> Dict[str, Any]:
-    """Build an action via LLM when available; otherwise return safe fallback."""
-    fallback_action: Dict[str, Any] = {
-        "review": "Unable to run model; submitting safe fallback review.",
-        "bug_type": "none",
-        "line_number": -1,
-        "confidence": 0.0,
-    }
-    if not HF_TOKEN:
-        return fallback_action
-    try:
-        from openai import OpenAI  # Lazy import to avoid failing at module import time
-        client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
     except Exception:
-        return fallback_action
-    prompt = f"""You are a code reviewer. {task_description}
-Code to review:
 ```python
 {code_snippet}
 ```
-Respond ONLY with valid JSON, no markdown:
-{{
-  "review": "your detailed analysis",
-  "bug_type": "syntax or logic or security or none",
-  "line_number": <integer>,
-  "confidence": <float 0.0-1.0>
-}}"""
     try:
         response = client.chat.completions.create(
             model=MODEL_NAME,
-            messages=[{"role": "user", "content": prompt}],
             temperature=0.0,
         )
         raw = (response.choices[0].message.content or "").strip()
         raw = raw.replace("```json", "").replace("```", "").strip()
         parsed = json.loads(raw)
-        if isinstance(parsed, dict):
-            return parsed
-        return fallback_action
-    except Exception:
-        return fallback_action
-def _safe_post_json(url: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-    """Return JSON body or None on any network/JSON failure."""
-    if requests is None:
-        return None
-    try:
-        response = requests.post(url, json=payload, timeout=30)
-        return response.json()
-    except Exception:
-        return None
-def _safe_get_json(url: str) -> Optional[Dict[str, Any]]:
-    """Return JSON body or None on any network/JSON failure."""
-    if requests is None:
-        return None
-    try:
-        response = requests.get(url, timeout=30)
-        return response.json()
     except Exception:
-        return None
-def run_task(task_id: str) -> float:
-    """Run a single code review task and return the score."""
-    print(f"[START] task={task_id}", flush=True)
     score = 0.0
-    steps = 1
-    reset_data = _safe_post_json(f"{API_BASE_URL}/reset", {"task_id": task_id}) or {}
-    obs = reset_data.get("observation", {}) if isinstance(reset_data, dict) else {}
-    code_snippet = obs.get("code_snippet", "")
-    task_description = obs.get("task_description", "Review the provided code.")
-    action = _build_action(str(task_description), str(code_snippet))
-    # If stepping fails, we still emit structured output with reward=0.0
-    _safe_post_json(f"{API_BASE_URL}/step", {"action": action})
-    grader_data = _safe_get_json(
-        f"{API_BASE_URL}/grader?task_id={task_id}&episode_id=baseline"
-    ) or {}
-    if isinstance(grader_data, dict):
-        try:
-            score = float(grader_data.get("score", 0.0))
-        except Exception:
-            score = 0.0
-    print(f"[STEP] step=1 reward={score}", flush=True)
-    print(f"[END] task={task_id} score={score} steps={steps}", flush=True)
     return score
-# ---------------------------------------------------------------------------
-# Entrypoint
-# ---------------------------------------------------------------------------
-def main():
-    scores = {}
-    normalized_tasks = [t.strip() for t in TASKS if t.strip()]
-    if not normalized_tasks:
-        normalized_tasks = ["task_1"]
-    for task_id in normalized_tasks:
-        scores[task_id] = run_task(task_id)
-    average = round(sum(scores.values()) / len(scores), 4)
-    scores["average"] = average
-    print(f"\nBaseline Results: {json.dumps(scores, indent=2)}", flush=True)
-    with open("baseline_scores.json", "w") as f:
-        json.dump(scores, f, indent=2)
     return scores

 #!/usr/bin/env python3
+"""Hackathon baseline inference for coding_env.
+MANDATORY environment variables handled here:
+- API_BASE_URL (defaulted)
+- MODEL_NAME (defaulted)
+- HF_TOKEN (no default)
+- LOCAL_IMAGE_NAME (optional, for local Docker workflows)
 """
 from __future__ import annotations
 import json
 import os
+from typing import Any, Dict, List
+import requests
+from openai import OpenAI
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
 HF_TOKEN = os.getenv("HF_TOKEN")
 LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
+ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
+BENCHMARK = os.getenv("BENCHMARK", "coding_env")
+MAX_STEPS = int(os.getenv("MAX_STEPS", "1"))
+SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.60"))
+def _bool_text(value: bool) -> str:
+    return "true" if value else "false"
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(
+    step: int, action: str, reward: float, done: bool, error: str | None
+) -> None:
+    error_value = error if error else "null"
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} "
+        f"done={_bool_text(done)} error={error_value}",
+        flush=True,
+    )
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(
+        f"[END] success={_bool_text(success)} steps={steps} "
+        f"score={score:.2f} rewards={rewards_str}",
+        flush=True,
+    )
+def _safe_json(method: str, url: str, **kwargs: Any) -> Dict[str, Any]:
+    """Issue an HTTP request and return its JSON object, or ``{}`` on any failure.
+    Extra keyword arguments are forwarded to ``requests.request``; the request
+    uses a 30 second timeout. A non-2xx status, a network error, an invalid JSON
+    body, or a JSON value that is not an object all yield an empty dict.
+    """
+    try:
+        response = requests.request(method, url, timeout=30, **kwargs)
+        # Turn HTTP error statuses into exceptions so they take the fallback path.
+        response.raise_for_status()
+        data = response.json()
+        if isinstance(data, dict):
+            return data
     except Exception:
+        # Deliberate best-effort: callers treat an empty dict as "no data".
+        pass
+    return {}
+def _task_list() -> List[str]:
+    data = _safe_json("GET", f"{ENV_BASE_URL}/tasks")
+    tasks = data.get("tasks", [])
+    if isinstance(tasks, list):
+        values: List[str] = []
+        for item in tasks:
+            if isinstance(item, dict) and item.get("task_id"):
+                values.append(str(item["task_id"]))
+        if values:
+            return values
+    return ["task_easy_1", "task_medium_1", "task_hard_1"]
+def _build_action(client: OpenAI | None, task_description: str, code_snippet: str) -> Dict[str, Any]:
+    """Ask the model for a structured review, or return a fixed fallback action.
+    The fallback is returned when no client is configured, when the reply is not
+    a JSON object, or when the request, parsing, or type coercion raises.
+    The result always has the keys ``review``, ``bug_type``, ``line_number``
+    and ``confidence``.
+    """
+    fallback = {
+        "review": "Potential logic issue found; needs targeted fix.",
+        "bug_type": "logic",
+        "line_number": 1,
+        "confidence": 0.20,
+    }
+    if client is None:
+        return fallback
+    prompt = f"""You are a strict code reviewer.
+Task: {task_description}
+Code:
 ```python
 {code_snippet}
 ```
+Return ONLY valid JSON with keys:
+review (string), bug_type (one of syntax|logic|security|none),
+line_number (integer), confidence (0.0-1.0 float)
+"""
     try:
         response = client.chat.completions.create(
             model=MODEL_NAME,
             temperature=0.0,
+            messages=[{"role": "user", "content": prompt}],
         )
         raw = (response.choices[0].message.content or "").strip()
+        # Drop Markdown code fences the model may wrap around its JSON.
         raw = raw.replace("```json", "").replace("```", "").strip()
         parsed = json.loads(raw)
+        if not isinstance(parsed, dict):
+            return fallback
+        # Coerce every field; a value that cannot be coerced raises and falls back.
+        return {
+            "review": str(parsed.get("review", fallback["review"])),
+            "bug_type": str(parsed.get("bug_type", fallback["bug_type"])),
+            "line_number": int(parsed.get("line_number", fallback["line_number"])),
+            "confidence": float(parsed.get("confidence", fallback["confidence"])),
+        }
     except Exception:
+        return fallback
+def run_task(task_id: str, client: OpenAI | None) -> float:
+    """Run one benchmark task against the environment server and return its score.
+    Emits one ``[START]`` line, one ``[STEP]`` line per step taken (at least one),
+    and one ``[END]`` line. On any exception the score is reported as 0.0.
+    """
+    episode_id = f"baseline-{task_id}"
+    rewards: List[float] = []
     score = 0.0
+    success = False
+    last_error: str | None = None
+    steps_taken = 0
+    log_start(task_id, BENCHMARK, MODEL_NAME)
+    try:
+        reset_data = _safe_json(
+            "POST",
+            f"{ENV_BASE_URL}/reset",
+            json={"task_id": task_id, "episode_id": episode_id},
+        )
+        obs = reset_data.get("observation", {}) if isinstance(reset_data, dict) else {}
+        task_description = str(obs.get("task_description", "Review code quality and bugs."))
+        code_snippet = str(obs.get("code_snippet", ""))
+        for step_num in range(1, MAX_STEPS + 1):
+            action = _build_action(client, task_description, code_snippet)
+            # Compact, space-free rendering of the action for the [STEP] line.
+            action_str = (
+                f"bug_type={action['bug_type']};"
+                f"line={action['line_number']};"
+                f"confidence={float(action['confidence']):.2f}"
+            )
+            step_data = _safe_json("POST", f"{ENV_BASE_URL}/step", json={"action": action})
+            # A missing or null reward counts as 0.0.
+            reward = float(step_data.get("reward", 0.0) or 0.0)
+            done = bool(step_data.get("done", False))
+            obs_after = step_data.get("observation", {}) if isinstance(step_data, dict) else {}
+            raw_error = obs_after.get("last_action_error")
+            last_error = str(raw_error) if raw_error else None
+            rewards.append(reward)
+            steps_taken = step_num
+            log_step(step_num, action_str, reward, done, last_error)
+            if done:
+                break
+        # NOTE(review): task_id and episode_id are interpolated into the query
+        # string without URL-encoding; confirm ids never contain reserved characters.
+        grader_data = _safe_json(
+            "GET", f"{ENV_BASE_URL}/grader?task_id={task_id}&episode_id={episode_id}"
+        )
+        # Without a grader score, fall back to the last step reward.
+        score = float(grader_data.get("score", rewards[-1] if rewards else 0.0))
+        success = score >= SUCCESS_SCORE_THRESHOLD
+    except Exception as exc:
+        last_error = str(exc)
+        if steps_taken == 0:
+            # Still emit one [STEP] record so the log always has START/STEP/END.
+            log_step(1, "bug_type=none;line=-1;confidence=0.00", 0.0, True, last_error)
+            rewards.append(0.0)
+            steps_taken = 1
+        score = 0.0
+        success = False
+    finally:
+        log_end(success, max(1, steps_taken), score, rewards or [0.0])
     return score
+def main() -> Dict[str, float]:
+    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) if HF_TOKEN else None
+    tasks = _task_list()
+    scores: Dict[str, float] = {}
+    for task_id in tasks:
+        scores[task_id] = run_task(task_id, client)
+    avg = sum(scores.values()) / len(scores) if scores else 0.0
+    scores["average"] = round(avg, 4)
+    print(json.dumps({"summary": scores}, separators=(",", ":")), flush=True)
     return scores

models.py CHANGED Viewed

@@ -10,25 +10,33 @@ from openenv.core.env_server.interfaces import Action, Observation, State
 class CodeAction(Action):
-    """
-    Represents a single code execution request.
-    """
-    code: str
-    # Optional: future fields like 'lint': bool, 'timeout_s': float, etc.
 class CodeObservation(Observation):
-    """
-    Result of executing code in the environment.
-    """
     stdout: str = ""
     stderr: str = ""
     exit_code: int = 0
 class CodeState(State):
-    """State for CodeAct environment with persistent execution context."""
     last_exit_code: int = 0

 class CodeAction(Action):
+    """Represents a single code-review submission."""
+    # Human-readable review summary.
+    review: str = ""
+    # Reported bug category; the README lists syntax | logic | security | none.
+    bug_type: str = "none"
+    # Suspected faulty line; -1 is the default when no line is given.
+    line_number: int = -1
+    # Confidence score; the README documents the range [0.0, 1.0].
+    confidence: float = 0.0
+    # Optional fallback for compatibility with earlier code-exec flows.
+    code: str = ""
 class CodeObservation(Observation):
+    """Observation returned by the code-review environment."""
+    # Fields kept from the earlier code-execution observation.
     stdout: str = ""
     stderr: str = ""
     exit_code: int = 0
+    # Id of the task this observation belongs to.
+    task_id: str = ""
+    # Task difficulty; the README lists easy | medium | hard.
+    difficulty: str = ""
+    # Review instructions shown to the agent.
+    task_description: str = ""
+    # Code to analyze.
+    code_snippet: str = ""
+    # Grader feedback from the latest step; empty right after a reset.
+    previous_feedback: str = ""
 class CodeState(State):
+    """State for code-review episodes."""
+    # Field kept from the earlier code-execution state.
     last_exit_code: int = 0
+    # Id of the active task.
+    task_id: str = ""
+    # Difficulty of the active task.
+    difficulty: str = ""
+    # Last normalized score.
+    last_score: float = 0.0

server/app.py CHANGED Viewed

@@ -21,8 +21,16 @@ Usage:
     python -m envs.coding_env.server.app
 """
-from coding_env.models import CodeAction, CodeObservation
-from coding_env.server.python_codeact_env import PythonCodeActEnv
 from openenv.core.env_server import create_app
 # Create the app with web interface and README integration
@@ -30,6 +38,22 @@ from openenv.core.env_server import create_app
 app = create_app(PythonCodeActEnv, CodeAction, CodeObservation, env_name="coding_env")
 if __name__ == "__main__":
     import uvicorn

     python -m envs.coding_env.server.app
 """
+from fastapi import Query
+try:
+    from coding_env.models import CodeAction, CodeObservation
+    from coding_env.server.python_codeact_env import PythonCodeActEnv
+    from coding_env.server.task_bank import get_episode_score, list_tasks
+except ImportError:
+    from ..models import CodeAction, CodeObservation
+    from .python_codeact_env import PythonCodeActEnv
+    from .task_bank import get_episode_score, list_tasks
 from openenv.core.env_server import create_app
 # Create the app with web interface and README integration
 app = create_app(PythonCodeActEnv, CodeAction, CodeObservation, env_name="coding_env")
+@app.get("/tasks", tags=["Environment Info"])
+def tasks():
+    """Return available benchmark tasks and their difficulty."""
+    return {"tasks": list_tasks()}
+@app.get("/grader", tags=["Environment Info"])
+def grader(
+    task_id: str = Query(..., description="Task identifier"),
+    episode_id: str = Query(..., description="Episode identifier"),
+):
+    """Return normalized score in [0.0, 1.0] for task/episode."""
+    score = get_episode_score(task_id, episode_id)
+    return {"task_id": task_id, "episode_id": episode_id, "score": float(score)}
 if __name__ == "__main__":
     import uvicorn

server/python_codeact_env.py CHANGED Viewed

@@ -4,75 +4,68 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-"""
-Python Code Action Environment.
-This module provides a server-side environment implementation for executing
-Python code actions using PyExecutor.
-"""
 import uuid
 from openenv.core.env_server.interfaces import Action, Environment, Observation
 from ..models import CodeAction, CodeObservation, CodeState
-from .python_executor import PyExecutor
-from .transforms import create_safe_coding_transform
 class PythonCodeActEnv(Environment):
     """
-    Python Code Action Environment for executing code and tracking state.
-    This environment executes Python code submitted as CodeAction during step,
-    maintains the last exit code in its state, and returns results wrapped
-    in CodeObservation.
-    Args:
-        transform: Optional transform to apply to observations
-        additional_imports: List of additional module imports to authorize
-                          (e.g., ["numpy", "pandas", "matplotlib"])
-    Example:
-        >>> env = PythonCodeActEnv()
-        >>> obs = env.reset()
-        >>> action = CodeAction(code="print('Hello, World!')")
-        >>> obs = env.step(action)
-        >>> print(obs.stdout)  # "Hello, World!\n"
-        >>> print(obs.exit_code)  # 0
-        >>> print(env.state.last_exit_code)  # 0
     """
     def __init__(
         self,
     ):
-        self.transform = create_safe_coding_transform()
-        self._executor = PyExecutor()
         self._state = CodeState()
-    def reset(self) -> Observation:
         """
-        Reset environment and start fresh execution session.
-        Returns:
-            Initial observation with empty stdout/stderr and exit_code=0
         """
-        # Initialize fresh state
-        self._state = CodeState(episode_id=str(uuid.uuid4()), step_count=0)
-        # Add last_exit_code to state
         self._state.last_exit_code = 0
-        # Reset executor to clear any previously defined variables/functions
-        self._executor = PyExecutor()
-        # Reset transform to clear any accumulated state
-        self.transform = create_safe_coding_transform()
-        # Return initial observation
         observation = CodeObservation(
-            stdout="",
             stderr="",
             exit_code=0,
         )
         return self._apply_transform(observation)
@@ -93,20 +86,25 @@ class PythonCodeActEnv(Environment):
         if not isinstance(action, CodeAction):
             raise ValueError(f"Expected CodeAction, got {type(action)}")
-        # Execute the code using PyExecutor
-        result = self._executor.run(action.code)
-        # Update state
         self._state.step_count += 1
-        self._state.last_exit_code = result.exit_code
-        # Create observation from execution result
-        # Include code in metadata for transform reward calculation
         observation = CodeObservation(
-            stdout=result.stdout,
-            stderr=result.stderr,
-            exit_code=result.exit_code,
-            metadata={"last_code": action.code},
         )
         return self._apply_transform(observation)

 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+"""Code review environment with task-based grading and normalized rewards."""
 import uuid
+from typing import Any
 from openenv.core.env_server.interfaces import Action, Environment, Observation
 from ..models import CodeAction, CodeObservation, CodeState
+from .task_bank import get_task, grade_action, list_tasks, record_episode_score
 class PythonCodeActEnv(Environment):
     """
+    Task-driven code-review environment.
+    Episodes are single-step:
+    1. `reset(task_id=...)` returns a code snippet + task description.
+    2. Agent submits CodeAction(review, bug_type, line_number, confidence).
+    3. `step()` returns graded reward in [0.0, 1.0] and done=True.
     """
     def __init__(
         self,
     ):
+        """Create the environment with no transform and an empty episode state."""
+        super().__init__(transform=None)
         self._state = CodeState()
+        # Task used by reset() when the caller does not pass a task_id.
+        self._current_task_id = "task_easy_1"
+    def reset(
+        self,
+        seed: int | None = None,
+        episode_id: str | None = None,
+        **kwargs: Any,
+    ) -> Observation:
         """
+        Reset environment and pick a task (easy/medium/hard).
         """
+        requested_task_id = kwargs.get("task_id", self._current_task_id)
+        task = get_task(str(requested_task_id))
+        self._current_task_id = task.task_id
+        self._state = CodeState(
+            episode_id=episode_id or str(uuid.uuid4()),
+            step_count=0,
+            task_id=task.task_id,
+            difficulty=task.difficulty,
+            last_score=0.0,
+        )
         self._state.last_exit_code = 0
         observation = CodeObservation(
+            stdout="Task initialized.",
             stderr="",
             exit_code=0,
+            task_id=task.task_id,
+            difficulty=task.difficulty,
+            task_description=task.task_description,
+            code_snippet=task.code_snippet,
+            previous_feedback="",
+            done=False,
+            reward=0.0,
+            metadata={"available_tasks": list_tasks()},
         )
         return self._apply_transform(observation)
         if not isinstance(action, CodeAction):
             raise ValueError(f"Expected CodeAction, got {type(action)}")
+        task = get_task(self._state.task_id or self._current_task_id)
+        reward, feedback = grade_action(action, task)
         self._state.step_count += 1
+        self._state.last_exit_code = 0
+        self._state.last_score = reward
+        record_episode_score(task.task_id, self._state.episode_id or "default", reward)
         observation = CodeObservation(
+            stdout=feedback,
+            stderr="",
+            exit_code=0,
+            task_id=task.task_id,
+            difficulty=task.difficulty,
+            task_description=task.task_description,
+            code_snippet=task.code_snippet,
+            previous_feedback=feedback,
+            reward=reward,
+            done=True,
         )
         return self._apply_transform(observation)

server/task_bank.py ADDED Viewed

	@@ -0,0 +1,157 @@

+"""Task definitions and grading utilities for coding_env."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+try:
+    from coding_env.models import CodeAction
+except ImportError:
+    from ..models import CodeAction
+@dataclass(frozen=True)
+class CodeReviewTask:
+    """Immutable definition of one review task together with its grading key."""
+    # Unique identifier; also the key of this task in TASKS.
+    task_id: str
+    # Difficulty label (the shipped tasks use easy / medium / hard).
+    difficulty: str
+    # Instructions shown to the agent.
+    task_description: str
+    # Source code the agent must review.
+    code_snippet: str
+    # Bug category that earns the bug_type credit.
+    expected_bug_type: str
+    # Line number that earns full line credit (off by one earns partial credit).
+    expected_line_number: int
+    # Substrings looked for, case-insensitively, in the review text.
+    expected_keywords: Tuple[str, ...]
+# Registry of the shipped benchmark tasks, keyed by task id.
+TASKS: Dict[str, CodeReviewTask] = {
+    # Easy: line 5 of the snippet calls len() on the numeric accumulator `total`.
+    "task_easy_1": CodeReviewTask(
+        task_id="task_easy_1",
+        difficulty="easy",
+        task_description=(
+            "Find the primary bug in this function and report bug_type, line_number, "
+            "and a concise explanation."
+        ),
+        code_snippet=(
+            "def average(nums):\n"
+            "    total = 0\n"
+            "    for n in nums:\n"
+            "        total += n\n"
+            "    return total / len(total)\n"
+        ),
+        expected_bug_type="logic",
+        expected_line_number=5,
+        expected_keywords=("len", "total", "typeerror"),
+    ),
+    # Medium: line 4 of the snippet builds the SQL query with an f-string from user input.
+    "task_medium_1": CodeReviewTask(
+        task_id="task_medium_1",
+        difficulty="medium",
+        task_description=(
+            "Review for a security issue. Identify the vulnerability type and precise line."
+        ),
+        code_snippet=(
+            "import sqlite3\n"
+            "\n"
+            "def login(conn, username, password):\n"
+            "    query = f\"SELECT * FROM users WHERE name='{username}' AND pw='{password}'\"\n"
+            "    return conn.execute(query).fetchone() is not None\n"
+        ),
+        expected_bug_type="security",
+        expected_line_number=4,
+        expected_keywords=("sql", "injection", "parameterized"),
+    ),
+    # Hard: line 9 of the snippet calls db.fetch_user() while the lock is held.
+    "task_hard_1": CodeReviewTask(
+        task_id="task_hard_1",
+        difficulty="hard",
+        task_description=(
+            "Find the concurrency/performance bug and explain why it impacts production latency."
+        ),
+        code_snippet=(
+            "from threading import Lock\n"
+            "lock = Lock()\n"
+            "cache = {}\n"
+            "\n"
+            "def get_user(user_id, db):\n"
+            "    with lock:\n"
+            "        if user_id in cache:\n"
+            "            return cache[user_id]\n"
+            "        data = db.fetch_user(user_id)\n"
+            "        cache[user_id] = data\n"
+            "        return data\n"
+        ),
+        expected_bug_type="logic",
+        expected_line_number=9,
+        expected_keywords=("lock", "critical section", "latency"),
+    ),
+}
+# In-memory score store keyed by (task_id, episode_id); entries are never evicted.
+EPISODE_SCORES: Dict[tuple[str, str], float] = {}
+def list_tasks() -> List[Dict[str, str]]:
+    """Return public task metadata."""
+    return [
+        {"task_id": t.task_id, "difficulty": t.difficulty}
+        for t in sorted(TASKS.values(), key=lambda item: item.task_id)
+    ]
+def get_task(task_id: str) -> CodeReviewTask:
+    """Resolve task by id."""
+    if task_id not in TASKS:
+        raise ValueError(
+            f"Unknown task_id '{task_id}'. Available tasks: {', '.join(sorted(TASKS))}"
+        )
+    return TASKS[task_id]
+def _normalize(value: str) -> str:
+    return value.strip().lower().replace("-", "_")
+def grade_action(action: CodeAction, task: CodeReviewTask) -> tuple[float, str]:
+    """Score a code-review action in [0.0, 1.0] with partial credit."""
+    score = 0.0
+    parts: List[str] = []
+    if _normalize(action.bug_type) == _normalize(task.expected_bug_type):
+        score += 0.5
+        parts.append("bug_type matched (+0.50)")
+    else:
+        parts.append(
+            f"bug_type mismatch (expected {task.expected_bug_type}, got {action.bug_type})"
+        )
+    if action.line_number == task.expected_line_number:
+        score += 0.3
+        parts.append("line_number matched (+0.30)")
+    elif abs(action.line_number - task.expected_line_number) <= 1:
+        score += 0.15
+        parts.append("line_number near miss (+0.15)")
+    else:
+        parts.append(
+            f"line_number mismatch (expected {task.expected_line_number}, got {action.line_number})"
+        )
+    review_text = (action.review or "").lower()
+    keyword_hits = sum(
+        1 for keyword in task.expected_keywords if keyword.lower() in review_text
+    )
+    if keyword_hits > 0:
+        keyword_bonus = min(0.2, keyword_hits * 0.1)
+        score += keyword_bonus
+        parts.append(f"review evidence matched (+{keyword_bonus:.2f})")
+    else:
+        parts.append("review lacks key evidence (+0.00)")
+    score = max(0.0, min(1.0, round(score, 4)))
+    return score, "; ".join(parts)
+def record_episode_score(task_id: str, episode_id: str, score: float) -> None:
+    """Persist normalized score for grader endpoint."""
+    EPISODE_SCORES[(task_id, episode_id)] = max(0.0, min(1.0, float(score)))
+def get_episode_score(task_id: str, episode_id: str) -> float:
+    """Read score for task/episode pair."""
+    return EPISODE_SCORES.get((task_id, episode_id), 0.0)