Revanth-ml committed on
Commit
e2eb9d7
·
verified ·
1 Parent(s): c1fd719

Upload folder using huggingface_hub

Browse files
Files changed (16) hide show
  1. Dockerfile +82 -0
  2. README.md +115 -5
  3. __init__.py +7 -0
  4. client.py +38 -0
  5. inference.py +278 -0
  6. models.py +86 -0
  7. openenv.yaml +7 -0
  8. pyproject.toml +45 -0
  9. server/__init__.py +7 -0
  10. server/app.py +156 -0
  11. server/environment.py +288 -0
  12. server/inference.py +342 -0
  13. server/requirements.txt +6 -0
  14. server/tasks.py +428 -0
  15. server/tools.py +308 -0
  16. uv.lock +0 -0
Dockerfile ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=agentops_gym
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
64
+
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
+
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
70
+
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
+
74
+ ENV ENABLE_WEB_INTERFACE=true
75
+
76
+ # Health check
77
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
78
+ CMD curl -f http://localhost:8000/health || exit 1
79
+
80
+ # Run the FastAPI server
81
+ # The module path is constructed to work with the /app/env structure
82
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,120 @@
1
  ---
2
- title: Agentops Gym
3
- emoji: πŸš€
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: docker
7
  pinned: false
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Agentops Gym Environment Server
3
+ emoji: 🏏
4
+ colorFrom: gray
5
+ colorTo: pink
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
  ---
13
 
14
+ # Agentops Gym Environment
15
+
16
+ Stateful, partially observable, efficiency-penalizing RL environment for training agents on software engineering tool-use tasks.
17
+
18
+ ## Quick Start
19
+
20
+ The simplest way to use the Agentops Gym environment is through the `AgentopsGymEnv` class:
21
+
22
+ ```python
23
+ from agentops_gym import AgentopsGymAction, AgentopsGymEnv
24
+ from agentops_gym.models import ToolCall
25
+
26
+ try:
27
+ # Create environment from Docker image
28
+ agentops_gymenv = AgentopsGymEnv.from_docker_image("agentops_gym-env:latest")
29
+
30
+ # Reset to start a task
31
+ result = agentops_gymenv.reset(task_id="task_1")
32
+ print(f"Task: {result.observation.task_description}")
33
+
34
+ # Use tools to complete the task
35
+ # Example: Search for a pattern
36
+ action = AgentopsGymAction(
37
+ tool_call=ToolCall(tool="Grep", parameters={"pattern": "json"})
38
+ )
39
+ result = agentops_gymenv.step(action)
40
+ print(f"Grep Result: {result.observation.last_tool_result}")
41
+
42
+ finally:
43
+ # Always clean up
44
+ agentops_gymenv.close()
45
+ ```
46
+
47
+ ## Building the Docker Image
48
+
49
+ Before using the environment, you need to build the Docker image:
50
+
51
+ ```bash
52
+ # From project root
53
+ docker build -t agentops_gym-env:latest -f agentops_gym/server/Dockerfile .
54
+ ```
55
+
56
+ ## Environment Details
57
+
58
+ ### Action
59
+ **AgentopsGymAction**:
60
+ - `tool_call` (ToolCall) - The tool to execute (Grep, FileRead, FileWrite, Bash, TodoWrite, Submit)
61
+ - `reasoning` (str, optional) - Agent's explanation for the action
62
+
63
+ ### Observation
64
+ **AgentopsGymObservation**:
65
+ - `task_description` (str) - The task objective
66
+ - `visible_files` (list[str]) - Files discovered so far
67
+ - `last_tool_result` (str) - Output of the last tool call
68
+ - `action_history` (list[str]) - Previous actions in this episode
69
+ - `step_count` (int) - Current step number
70
+ - `max_steps` (int) - Maximum allowed steps
71
+ - `done` (bool) - Whether the episode is complete
72
+ - `feedback` (str, optional) - Warnings or penalties from the environment
73
+
74
+ ### Available Tools
75
+ - **Grep**: Search for patterns in the virtual filesystem.
76
+ - **FileRead**: Read file contents.
77
+ - **FileWrite**: Modify file contents.
78
+ - **Bash**: Run simulated commands (lint, test).
79
+ - **TodoWrite**: Save a plan for the task.
80
+ - **Submit**: Submit the final answer.
81
+
82
+ ## Advanced Usage
83
+
84
+ ### Using the Context Manager
85
+
86
+ ```python
87
+ from agentops_gym import AgentopsGymAction, AgentopsGymEnv
88
+ from agentops_gym.models import ToolCall
89
+
90
+ with AgentopsGymEnv(base_url="http://localhost:8000") as env:
91
+ result = env.reset(task_id="task_1")
92
+ # Execute steps...
93
+ action = AgentopsGymAction(tool_call=ToolCall(tool="FileRead", parameters={"filename": "README.md"}))
94
+ result = env.step(action)
95
+ ```
96
+
97
+ ## Running Locally
98
+
99
+ Run the server locally for development:
100
+
101
+ ```bash
102
+ cd agentops_gym
103
+ uvicorn server.app:app --reload
104
+ ```
105
+
106
+ ## Project Structure
107
+
108
+ ```
109
+ agentops_gym/
110
+ ├── __init__.py # Module exports
111
+ ├── README.md # This file
112
+ ├── openenv.yaml # OpenEnv manifest
113
+ ├── pyproject.toml # Project metadata and dependencies
114
+ ├── models.py # Action and Observation models
115
+ └── server/
116
+     ├── __init__.py # Server module exports
117
+     ├── agentops_gym_environment.py # Core environment logic
118
+     ├── app.py # FastAPI application
119
+     └── Dockerfile # Container image definition
120
+ ```
__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """AgentOps Gym — Tool-use efficiency environment for LLM agents."""
client.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AgentOps Gym — Environment client.
3
+
4
+ Wraps WebSocket communication with the environment server.
5
+ Provides typed step/reset/state methods for the agent.
6
+ """
7
+
8
+ from typing import Dict, Any
9
+ from openenv.core.env_client import EnvClient
10
+ from openenv.core.client_types import StepResult
11
+
12
+ from agentops_gym.models import ToolCall, AgentObservation, AgentState
13
+
14
+
15
class AgentOpsEnv(EnvClient[ToolCall, AgentObservation, AgentState]):
    """Typed client for the AgentOps Gym environment.

    Implements the ``EnvClient`` conversion methods: ``_step_payload``
    serialises an outgoing ``ToolCall``; ``_parse_result`` and
    ``_parse_state`` turn server payloads into ``StepResult`` /
    ``AgentState`` objects.
    """

    def _step_payload(self, action: ToolCall) -> Dict[str, Any]:
        """Return the dict form of ``action`` to send to the server."""
        return action.model_dump()

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[AgentObservation]:
        """Build a ``StepResult`` with a typed observation from ``payload``.

        ``done`` and ``reward`` are read from the top level of ``payload``
        (defaulting to ``False`` and ``None``) and copied onto the
        observation as well as onto the returned ``StepResult``.
        """
        done = payload.get("done", False)
        reward = payload.get("reward")

        # Merge into a single dict instead of passing ``**obs_data`` next to
        # explicit ``done=`` / ``reward=`` keywords: when the observation dict
        # already carries those keys the keyword form raises
        # "got multiple values for keyword argument". Top-level values win.
        # ``or {}`` also tolerates an explicit null observation.
        obs_fields = dict(payload.get("observation") or {})
        obs_fields["done"] = done
        obs_fields["reward"] = reward

        return StepResult(
            observation=AgentObservation(**obs_fields),
            reward=reward,
            done=done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> AgentState:
        """Construct an ``AgentState`` from the server's state payload."""
        return AgentState(**payload)
inference.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AgentOps Gym — Baseline inference script.
4
+
5
+ Runs an LLM agent against every task listed in ALL_TASKS (tool-use efficiency)
6
+ and reports per-task scores in the mandatory OpenEnv stdout format.
7
+
8
+ Environment variables (MANDATORY):
9
+ API_BASE_URL The API endpoint for the LLM (default: HF router)
10
+ MODEL_NAME The model identifier (default: Qwen/Qwen2.5-72B-Instruct)
11
+ HF_TOKEN Your Hugging Face / API key (must be set)
12
+ IMAGE_NAME Docker image name for the environment (must be set)
13
+
14
+ Usage:
15
+ IMAGE_NAME=agentops-gym HF_TOKEN=xxx python inference.py
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import asyncio
21
+ import json
22
+ import os
23
+ import sys
24
+ from typing import Any, Dict, List, Optional
25
+
26
+ from openai import OpenAI
27
+
28
+ from agentops_gym.client import AgentOpsEnv
29
+ from agentops_gym.models import ToolCall
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Configuration
33
+ # ---------------------------------------------------------------------------
34
+
35
+ IMAGE_NAME = os.getenv("IMAGE_NAME")
36
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
37
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
38
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
39
+
40
+ BENCHMARK = "agentops-gym"
41
+ MAX_STEPS = 10
42
+ TEMPERATURE = 0.0
43
+ MAX_TOKENS = 600
44
+
45
+ ALL_TASKS = ["task_1", "task_2", "task_3", "task_4"]
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # System prompt
49
+ # ---------------------------------------------------------------------------
50
+
51
# System prompt sent with every completion request. The separators are real
# em dashes; the previous text contained the mis-decoded sequence for that
# character, which would have been sent to the model verbatim.
SYSTEM_PROMPT = """\
You are an expert software engineer agent. You solve coding tasks by calling tools.

Available tools:
FileRead — Read a file. Parameters: {"filename": "path/to/file.py"}
FileWrite — Write/overwrite a file. Parameters: {"filename": "...", "content": "..."}
Grep — Search files for a pattern. Parameters: {"pattern": "regex_or_string"}
Bash — Run simulated shell command. Parameters: {"command": "lint main.py"}
WebSearch — Search documentation. Parameters: {"query": "python lru_cache"}
TodoWrite — Write a plan. Parameters: {"plan": "1. Do X\\n2. Do Y"}

RULES:
1. Respond ONLY with a single JSON object — no markdown, no explanation.
2. Format: {"tool": "ToolName", "parameters": {...}, "reasoning": "why"}
3. Be efficient — minimize total tool calls.
4. For hard tasks: use TodoWrite FIRST to plan, then act.
5. Never call the exact same tool+parameters twice.

Example response:
{"tool": "Grep", "parameters": {"pattern": "def fetch"}, "reasoning": "Find the function location"}
"""
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Logging helpers (mandatory OpenEnv stdout format)
75
+ # ---------------------------------------------------------------------------
76
+
77
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line that opens an episode's log output."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
79
+
80
+
81
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` line describing a single tool call.

    Newlines in ``action`` become spaces and the result is cut to 200
    characters so the record stays on one line. A falsy ``error`` is printed
    as ``null``; ``done`` is printed lower-cased.
    """
    flat_action = " ".join(action.split("\n"))[:200]
    error_field = error or "null"
    done_field = str(done).lower()
    line = (
        f"[STEP] step={step} action={flat_action} reward={reward:.2f} "
        f"done={done_field} error={error_field}"
    )
    print(line, flush=True)
89
+
90
+
91
def log_end(success: bool, steps: int, rewards: List[float]) -> None:
    """Print the ``[END]`` line that closes an episode's log output.

    Rewards are rendered with two decimals and joined by commas; an empty
    list yields an empty ``rewards=`` field.
    """
    formatted = [f"{value:.2f}" for value in rewards]
    success_field = str(success).lower()
    line = f"[END] success={success_field} steps={steps} rewards={','.join(formatted)}"
    print(line, flush=True)
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # Prompt builder
100
+ # ---------------------------------------------------------------------------
101
+
102
def build_prompt(obs_data: Dict[str, Any]) -> str:
    """Render an observation dict as the user prompt for the next tool call.

    Args:
        obs_data: Observation as a plain dict. Keys read here:
            ``task_description``, ``visible_files``, ``last_tool_result``,
            ``action_history``, ``message``, ``step_count`` and ``metadata``
            (from which ``max_steps`` and ``steps_remaining`` are taken).

    Returns:
        The prompt text. Optional sections (last tool result, history,
        environment message) appear only when their value is truthy.
    """
    parts = [f"TASK: {obs_data.get('task_description', '')}"]
    parts.append(f"\nVisible files: {obs_data.get('visible_files', [])}")

    if obs_data.get("last_tool_result"):
        parts.append(f"\nLast tool result:\n{obs_data['last_tool_result']}")

    history = obs_data.get("action_history", [])
    if history:
        # Only the three most recent calls are shown to keep the prompt short.
        parts.append(f"\nHistory ({len(history)} calls): {history[-3:]}")

    if obs_data.get("message"):
        parts.append(f"\nEnvironment message: {obs_data['message']}")

    # ``or {}`` covers a ``metadata`` key that is present but None; the plain
    # ``.get("metadata", {})`` form would then fail on ``None.get``.
    meta = obs_data.get("metadata") or {}
    parts.append(
        f"\nStep {obs_data.get('step_count', 0)}/{meta.get('max_steps', 10)}, "
        f"steps remaining: {meta.get('steps_remaining', '?')}"
    )
    parts.append("\nRespond with a single JSON tool call:")
    return "\n".join(parts)
117
+
118
+
119
def extract_tool_call(text: str) -> Optional[Dict]:
    """Extract a JSON tool call from a model response.

    The response may be bare JSON, JSON inside a Markdown code fence, or
    JSON embedded in surrounding prose.

    Args:
        text: Raw model output.

    Returns:
        The first decoded JSON object (dict) that has a ``"tool"`` key, or
        ``None`` when no such object can be found.
    """
    text = text.strip()

    # Prefer the first fenced block that holds an object. The optional
    # language tag is removed as a prefix; ``lstrip("json")`` would strip a
    # character set instead.
    if "```" in text:
        for chunk in text.split("```"):
            chunk = chunk.strip()
            if chunk[:4].lower() == "json":
                chunk = chunk[4:].lstrip()
            if chunk.startswith("{"):
                text = chunk
                break

    # Try to decode an object starting at every "{" in turn. raw_decode
    # handles nested objects (a tool call always nests "parameters") and
    # ignores trailing prose, neither of which a flat "{...}" regex can do.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        # Only a dict can be a tool call; the isinstance check also keeps
        # scalar or string JSON from reaching the membership test.
        if isinstance(candidate, dict) and "tool" in candidate:
            return candidate
        start = text.find("{", start + 1)
    return None
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # Episode runner
151
+ # ---------------------------------------------------------------------------
152
+
153
async def run_episode(
    env: AgentOpsEnv,
    client: OpenAI,
    task_id: str,
) -> Dict[str, Any]:
    """Run one episode of ``task_id`` and return a summary of it.

    Each step builds a prompt from the latest observation, asks the model
    for a tool call, and sends it to the environment. The loop stops after
    ``MAX_STEPS`` steps or as soon as the environment reports ``done``.

    Args:
        env: Connected environment client.
        client: OpenAI-compatible client used for chat completions.
        task_id: Identifier of the task to reset the environment to.

    Returns:
        Dict with keys ``task_id``, ``score``, ``steps``, ``success`` and
        ``rewards``. ``score`` is the ``grader_score`` found in the last
        observation's metadata (0.0 when absent); ``success`` is
        ``score >= 0.5``. If the episode raises, the error is printed and
        the values collected so far are returned.
    """
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    try:
        result = await env.reset(seed=None, task_id=task_id)
        obs = result.observation
        obs_data = obs.model_dump() if hasattr(obs, "model_dump") else obs.dict()

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            prompt = build_prompt(obs_data)
            # The OpenAI client call is synchronous; run it in a worker
            # thread so the event loop is not blocked while waiting.
            completion = await asyncio.to_thread(
                client.chat.completions.create,
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
            )

            raw = (completion.choices[0].message.content or "").strip()
            tool_call = extract_tool_call(raw)

            if tool_call is None:
                # Fallback: emit a safe no-op
                tool_call = {"tool": "Grep", "parameters": {"pattern": "def "}, "reasoning": "fallback"}

            tool = tool_call.get("tool", "Grep")
            parameters = tool_call.get("parameters", {})
            if not isinstance(parameters, dict):
                # A model may emit null or a string here; fall back to no
                # parameters instead of aborting the episode on validation.
                parameters = {}
            reasoning = tool_call.get("reasoning", "")
            action_str = f"{tool}({json.dumps(parameters)})"

            result = await env.step(ToolCall(tool=tool, parameters=parameters, reasoning=reasoning))
            obs = result.observation
            obs_data = obs.model_dump() if hasattr(obs, "model_dump") else obs.dict()

            reward = result.reward or 0.0
            done = result.done
            error = None  # tools return errors inside last_tool_result, not as exceptions

            rewards.append(reward)
            steps_taken = step

            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        # ``or {}`` tolerates a metadata value of None.
        meta = obs_data.get("metadata") or {}
        score = meta.get("grader_score") or 0.0
        success = score >= 0.5

    except Exception as exc:
        # Top-level boundary for one episode: report and fall through so the
        # remaining tasks still run and the [END] line is still emitted.
        print(f"[DEBUG] Episode error for {task_id}: {exc}", flush=True)

    finally:
        log_end(success=success, steps=steps_taken, rewards=rewards)

    return {
        "task_id": task_id,
        "score": score,
        "steps": steps_taken,
        "success": success,
        "rewards": rewards,
    }
230
+
231
+ # ---------------------------------------------------------------------------
232
+ # Entrypoint
233
+ # ---------------------------------------------------------------------------
234
+
235
async def async_main() -> None:
    """Check configuration, run every task in ``ALL_TASKS``, print a summary.

    Raises:
        SystemExit: when neither ``HF_TOKEN`` nor ``API_KEY`` is set, or when
            ``IMAGE_NAME`` is not set.
    """
    if not API_KEY:
        raise SystemExit(
            "HF_TOKEN (or API_KEY) must be set.\n"
            " export HF_TOKEN=your_token_here"
        )
    if not IMAGE_NAME:
        raise SystemExit(
            "IMAGE_NAME must be set.\n"
            " export IMAGE_NAME=agentops-gym"
        )

    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    async with AgentOpsEnv.from_docker_image(IMAGE_NAME) as env:
        # Episodes run sequentially on the same environment connection.
        results = [await run_episode(env, llm, task_id) for task_id in ALL_TASKS]

    # Summary
    rule = "=" * 60
    print(f"\n{rule}", flush=True)
    print("SUMMARY", flush=True)
    print(f"{rule}", flush=True)

    total = sum(entry["score"] for entry in results)
    resolved = len([entry for entry in results if entry["success"]])
    avg = total / len(results) if results else 0.0

    for entry in results:
        status = "SOLVED" if entry["success"] else "FAILED"
        print(f" {entry['task_id']:>8}: score={entry['score']:.3f} steps={entry['steps']} {status}", flush=True)

    print(f"\n Total: {total:.3f} / {len(results)}", flush=True)
    print(f" Average: {avg:.3f}", flush=True)
    print(f" Solved: {resolved} / {len(results)}", flush=True)
271
+
272
+
273
def main() -> None:
    """Synchronous entry point: run ``async_main`` to completion."""
    coroutine = async_main()
    asyncio.run(coroutine)
275
+
276
+
277
+ if __name__ == "__main__":
278
+ main()
models.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AgentOps Gym — Pydantic models for Action, Observation, and State.
3
+
4
+ The agent operates on a simulated Python codebase by calling tools.
5
+ The environment is partially observable, stateful, and efficiency-aware.
6
+ Rewards shrink with wasteful or redundant tool calls.
7
+ """
8
+
9
+ from typing import Optional, List, Dict, Any
10
+ from pydantic import Field
11
+ from openenv.core.env_server.types import Action, Observation, State
12
+
13
+
14
class ToolCall(Action):
    """Action model: one tool invocation requested by the agent.

    The action space is open: ``tool`` is a free-form string and
    ``parameters`` an arbitrary mapping, so nothing is discretised at the
    model level.
    """

    # Required: name of the tool to run.
    tool: str = Field(..., description="Tool name (FileRead, FileWrite, Grep, Bash, WebSearch, TodoWrite)")
    # Tool-specific arguments; defaults to an empty dict.
    parameters: Dict[str, Any] = Field(
        description="Tool parameters, e.g. {'filename': 'main.py'} or {'pattern': 'def fetch'}",
        default_factory=dict,
    )
    # Optional free-text rationale supplied by the agent.
    reasoning: Optional[str] = Field(
        None,
        description="Optional: why the agent is calling this tool (for interpretability)",
    )
33
+
34
+
35
class AgentObservation(Observation):
    """Observation returned to the agent after each action.

    The ``Observation`` base class is documented here as contributing:
      - done: bool
      - reward: Optional[float]
      - metadata: Dict[str, Any]
    """

    # Partial observability: only files discovered so far are listed.
    visible_files: List[str] = Field(
        description="Files the agent currently knows exist in the project",
        default_factory=list,
    )
    # Result text of the most recent tool call, if any.
    last_tool_result: Optional[str] = Field(
        None,
        description="Output string from the last tool call",
    )
    # Tool calls made so far in this episode, in order.
    action_history: List[str] = Field(
        description="e.g. ['Grep(pattern=timeout)', 'FileRead(config.json)']",
        default_factory=list,
    )
    step_count: int = Field(0, description="How many steps taken so far")
    task_description: str = Field("", description="The task the agent must solve")
    # Environment feedback about the quality of the last action.
    message: Optional[str] = Field(
        None,
        description="Environment feedback e.g. 'redundant call detected'",
    )
65
+
66
+
67
class AgentState(State):
    """Per-episode metadata for training harnesses and curriculum schedulers.

    The ``State`` base class is documented here as contributing:
      - episode_id: Optional[str]
      - step_count: int
    """

    task_id: str = Field("", description="Current task identifier")
    task_description: str = Field("", description="Human-readable task description")
    difficulty: str = Field("", description="easy / medium / hard")
    max_steps: int = Field(10, description="Max steps allowed this episode")
    visible_files: List[str] = Field(default_factory=list)
    discovered_files: List[str] = Field(default_factory=list)
    action_history: List[str] = Field(default_factory=list)
    current_reward: float = Field(0.0, description="Cumulative reward so far")
    completed: bool = Field(False)
    # Stays None until the episode has been graded.
    grader_score: Optional[float] = Field(
        None,
        description="Final grader score (0.0-1.0), set at end of episode",
    )
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: agentops_gym
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
pyproject.toml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-agentops_gym"
13
+ version = "0.1.0"
14
+ description = "Agentops Gym environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=8.0.0",
34
+ "pytest-cov>=4.0.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ # Server entry point - enables running via: uv run --project . server
39
+ # or: python -m agentops_gym.server.app
40
+ server = "agentops_gym.server.app:main"
41
+
42
+ [tool.setuptools]
43
+ include-package-data = true
44
+ packages = ["agentops_gym", "agentops_gym.server"]
45
+ package-dir = { "agentops_gym" = ".", "agentops_gym.server" = "server" }
server/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """AgentOps Gym — Server package."""
server/app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AgentOps Gym — FastAPI application.
3
+
4
+ Exposes the OpenEnv-compatible HTTP + WebSocket API via openenv-core's
5
+ create_app(), plus custom endpoints: /tasks, /grader, /health.
6
+
7
+ A persistent singleton environment handles HTTP /reset and /step (for
8
+ the baseline script and interactive testing). WebSocket connections each
9
+ get their own AgentOpsEnvironment instance (via create_app factory pattern).
10
+ """
11
+
12
+ import threading
13
+ import logging
14
+ from typing import Optional
15
+
16
+ from fastapi.responses import JSONResponse
17
+
18
+ from openenv.core.env_server.http_server import create_app
19
+
20
+ from agentops_gym.models import ToolCall, AgentObservation
21
+ from agentops_gym.server.environment import AgentOpsEnvironment, get_last_grader_result
22
+ from agentops_gym.server.tasks import TASK_REGISTRY
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ app = create_app(
27
+ AgentOpsEnvironment,
28
+ ToolCall,
29
+ AgentObservation,
30
+ env_name="agentops-gym",
31
+ )
32
+
33
+ _env = AgentOpsEnvironment()
34
+ _env_lock = threading.Lock()
35
+
36
+
37
+ def _serialize(obs: AgentObservation) -> dict:
38
+ return obs.model_dump() if hasattr(obs, "model_dump") else obs.dict()
39
+
40
+
41
+ app.router.routes = [
42
+ r for r in app.router.routes
43
+ if not (hasattr(r, "path") and r.path in ("/reset", "/step"))
44
+ ]
45
+
46
+
47
@app.post("/reset")
async def stateful_reset(request: dict = None):
    """Reset the shared environment for a new episode.

    Body: ``{"task_id": "task_1"}``. A missing body or missing ``task_id``
    selects ``"task_1"``.

    Returns:
        ``{"observation": <dict>, "reward": 0.0, "done": False}``.
    """
    import asyncio

    request = request or {}
    task_id = request.get("task_id", "task_1")

    def _do():
        # The module-level environment is shared by all HTTP callers, so
        # the reset is performed while holding the lock.
        with _env_lock:
            obs = _env.reset(task_id=task_id)
            return _serialize(obs)

    # get_running_loop() is the supported way to obtain the loop from inside
    # a coroutine; the reset itself runs in the default executor.
    loop = asyncio.get_running_loop()
    obs_dict = await loop.run_in_executor(None, _do)
    return {"observation": obs_dict, "reward": 0.0, "done": False}
62
+
63
+
64
@app.post("/step")
async def stateful_step(request: dict = None):
    """Execute one tool call on the shared environment.

    Accepts two body shapes:
      1. {"action": {"tool": "...", "parameters": {...}}}  <- inference script
      2. {"tool": "...", "parameters": {...}}              <- direct curl

    Returns:
        ``{"observation": <dict>, "reward": ..., "done": ...}`` on success,
        or a 400 ``JSONResponse`` when no tool name is supplied.
    """
    import asyncio

    request = request or {}
    action_data = request["action"] if "action" in request else request

    # A non-object "action" (string, list, null) carries no tool name; treat
    # it as empty so the caller gets the 400 below instead of a 500.
    if not isinstance(action_data, dict):
        action_data = {}

    tool = action_data.get("tool", "")
    # ``or {}`` also covers an explicit ``"parameters": null``.
    parameters = action_data.get("parameters") or {}
    reasoning = action_data.get("reasoning", "")

    if not tool:
        return JSONResponse(
            status_code=400,
            content={"error": "'tool' field is required. Body must be {'action': {'tool': '...', 'parameters': {...}}}"},
        )

    def _do():
        # The module-level environment is shared by all HTTP callers, so
        # the step is performed while holding the lock.
        with _env_lock:
            obs = _env.step(ToolCall(tool=tool, parameters=parameters, reasoning=reasoning))
            return _serialize(obs)

    # get_running_loop() is the supported way to obtain the loop from inside
    # a coroutine; the step itself runs in the default executor.
    loop = asyncio.get_running_loop()
    obs_dict = await loop.run_in_executor(None, _do)
    return {
        "observation": obs_dict,
        "reward": obs_dict.get("reward", 0.0),
        "done": obs_dict.get("done", False),
    }
102
+
103
+
104
+
105
@app.get("/tasks")
async def list_tasks():
    """List every task in ``TASK_REGISTRY`` together with the action schema.

    Returns:
        ``{"tasks": [...], "action_schema": {...}}`` where each task entry
        has ``id``, ``name``, ``difficulty``, ``description``, ``max_steps``
        and ``optimal_steps``.
    """
    tasks = [
        {
            "id": tid,
            "name": t["name"],
            "difficulty": t["difficulty"],
            "description": t["description"],
            "max_steps": t["max_steps"],
            "optimal_steps": t["optimal_steps"],
        }
        for tid, t in TASK_REGISTRY.items()
    ]
    return {
        "tasks": tasks,
        # The separators are real em dashes; the previous strings contained
        # the mis-decoded byte sequence for that character.
        "action_schema": {
            "tool": "string — one of FileRead|FileWrite|Grep|Bash|WebSearch|TodoWrite",
            "parameters": "dict — tool-specific params",
            "reasoning": "string (optional) — agent's reasoning",
        },
    }
126
+
127
+
128
@app.get("/grader")
async def grader_score():
    """Return the stored grader result, or a 404 if none exists yet."""
    last_result = get_last_grader_result()
    if last_result is not None:
        return last_result
    return JSONResponse(
        content={"error": "No episode graded yet. Complete an episode first."},
        status_code=404,
    )
138
+
139
+
140
@app.get("/health")
async def health():
    """Liveness endpoint: always reports status ``ok`` for this environment."""
    payload = {"status": "ok", "env": "agentops-gym"}
    return payload
143
+
144
+
145
def main():
    """Start uvicorn for ``app``.

    The bind address comes from the ``HOST`` and ``PORT`` environment
    variables, defaulting to ``0.0.0.0`` and ``8000``.
    """
    import os
    import uvicorn

    bind_host = os.getenv("HOST", "0.0.0.0")
    bind_port = int(os.getenv("PORT", 8000))
    uvicorn.run(app, host=bind_host, port=bind_port)
151
+
152
+
153
+ if __name__ == "__main__":
154
+ main()
155
+
156
+
server/environment.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AgentOps Gym β€” Core Environment class.
3
+
4
+ Implements the OpenEnv Environment interface: reset(), step(), state.
5
+ Orchestrates tool execution, reward shaping, and episode grading.
6
+
7
+ Each episode is fully deterministic given a task_id:
8
+ - Snapshot is restored from PROJECT_SNAPSHOTS on reset
9
+ - All tool calls operate on the in-memory snapshot
10
+ - No real filesystem, no real subprocess
11
+ """
12
+
13
+ import copy
14
+ import logging
15
+ import uuid
16
+ from typing import Optional, Any
17
+
18
+ from openenv.core.env_server.interfaces import Environment
19
+
20
+ from agentops_gym.models import ToolCall, AgentObservation, AgentState
21
+ from agentops_gym.server.tools import run_tool, PROJECT_SNAPSHOTS, AVAILABLE_TOOLS
22
+ from agentops_gym.server.tasks import (
23
+ TASK_REGISTRY,
24
+ get_task,
25
+ list_task_ids,
26
+ compute_step_reward,
27
+ grade_episode,
28
+ )
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ _last_grader_result: Optional[dict] = None
33
+
34
+
35
class AgentOpsEnvironment(Environment[ToolCall, AgentObservation, AgentState]):
    """Tool-use efficiency training environment.

    Each episode:
      1. reset() selects a task, initialises the in-memory snapshot, returns initial obs
      2. step() executes a tool call, computes reward, checks completion
      3. state property returns current episode metadata
    """

    def __init__(self):
        super().__init__()
        self._episode_id: str = ""
        self._task_id: str = ""
        self._task: dict = {}
        self._snapshot: dict = {}
        self._visible_files: list = []
        self._discovered_files: list = []
        self._action_history: list = []
        self._step_count: int = 0
        self._max_steps: int = 10
        # No episode is active until reset() is called, so step() is refused.
        self._done: bool = True
        self._cumulative_reward: float = 0.0
        self._grader_score: Optional[float] = None

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> AgentObservation:
        """Start a new episode and return its initial observation.

        Args:
            seed: Accepted for interface compatibility; not used here.
            episode_id: Identifier for the episode; a UUID4 is generated if omitted.
            **kwargs: May include ``task_id`` to select a specific task.
                Missing or unknown ids fall back to ``task_1``.
        """
        requested_id = kwargs.get("task_id", "task_1")
        if requested_id in TASK_REGISTRY:
            task_id = requested_id
        else:
            # Keep the lenient fallback, but make the substitution visible.
            logger.warning("Unknown task_id %r; falling back to task_1", requested_id)
            task_id = "task_1"

        self._episode_id = episode_id or str(uuid.uuid4())
        self._task_id = task_id
        self._task = get_task(task_id)
        self._max_steps = self._task["max_steps"]

        # Deep copy so tool calls never mutate the shared snapshot template.
        self._snapshot = copy.deepcopy(PROJECT_SNAPSHOTS.get(task_id, {}))

        self._visible_files = list(self._task["initial_visible_files"])
        self._discovered_files = list(self._visible_files)

        self._action_history = []
        self._step_count = 0
        self._done = False
        self._cumulative_reward = 0.0
        self._grader_score = None

        logger.info("Episode %s started: task=%s", self._episode_id, task_id)

        return AgentObservation(
            visible_files=list(self._visible_files),
            last_tool_result=None,
            action_history=[],
            step_count=0,
            task_description=self._task["description"],
            message=f"Episode started. Available tools: {', '.join(AVAILABLE_TOOLS.keys())}",
            done=False,
            reward=0.0,
            metadata={
                "task_id": task_id,
                "difficulty": self._task["difficulty"],
                "max_steps": self._max_steps,
                "available_tools": list(AVAILABLE_TOOLS.keys()),
            },
        )

    def step(
        self,
        action: ToolCall,
        **kwargs: Any,
    ) -> AgentObservation:
        """Execute one tool call and return the updated observation.

        When the step ends the episode, the episode is graded, the result is
        stored in the module-level ``_last_grader_result`` and half of the
        grader score is added to the returned step reward as a completion bonus.
        """
        if self._done:
            return self._terminal_obs("Episode already done. Call reset() first.")

        self._step_count += 1
        tool = action.tool
        params = action.parameters

        tool_result, self._snapshot, self._discovered_files = run_tool(
            tool=tool,
            parameters=params,
            snapshot=self._snapshot,
            discovered_files=self._discovered_files,
        )

        # compute_step_reward expects the history as it was BEFORE this action.
        history_before = list(self._action_history)
        self._action_history.append(f"{tool}({params})")

        for discovered in self._discovered_files:
            if discovered not in self._visible_files:
                self._visible_files.append(discovered)

        step_reward, reward_breakdown = compute_step_reward(
            task_id=self._task_id,
            tool=tool,
            parameters=params,
            tool_result=tool_result,
            action_history=history_before,
            discovered_files=self._discovered_files,
            snapshot=self._snapshot,
        )
        self._cumulative_reward += step_reward
        self._cumulative_reward = max(0.0, min(1.0, self._cumulative_reward))

        done, message = self._check_termination()

        # Redundant call note (non-terminating).
        if len(self._action_history) >= 2 and self._action_history[-1] == self._action_history[-2]:
            note = "Redundant call detected."
            message = f"{message} {note}" if message else note

        self._done = done

        # Compute final grader score at episode end.
        grader_score = None
        if done:
            grader_score, breakdown = grade_episode(
                task_id=self._task_id,
                snapshot=self._snapshot,
                action_history=self._action_history,
                steps_used=self._step_count,
            )
            self._grader_score = grader_score
            # Stored globally so get_last_grader_result() can expose it.
            global _last_grader_result
            _last_grader_result = {
                "task_id": self._task_id,
                "episode_id": self._episode_id,
                "score": grader_score,
                "breakdown": breakdown,
                "steps_used": self._step_count,
            }
            # Completion bonus proportional to grader score.
            step_reward += grader_score * 0.5
            logger.info(
                "Episode %s done: task=%s score=%.3f steps=%d",
                self._episode_id, self._task_id, grader_score, self._step_count,
            )

        return AgentObservation(
            visible_files=list(self._visible_files),
            last_tool_result=tool_result,
            action_history=list(self._action_history),
            step_count=self._step_count,
            task_description=self._task["description"],
            message=message,
            done=done,
            reward=round(step_reward, 4),
            metadata={
                "task_id": self._task_id,
                "difficulty": self._task["difficulty"],
                "cumulative_reward": round(self._cumulative_reward, 4),
                "grader_score": grader_score,
                "reward_breakdown": reward_breakdown,
                "steps_remaining": self._max_steps - self._step_count,
            },
        )

    def _check_termination(self):
        """Return ``(done, message)`` for the step that was just recorded.

        Later checks override the message of earlier ones, so a goal reached
        on the final allowed step is reported as a completion.
        """
        done = False
        message = None

        if self._step_count >= self._max_steps:
            done = True
            message = f"Max steps ({self._max_steps}) reached."

        # Hard cap for task_3. Only reachable if the task's max_steps is
        # configured above 8; otherwise the check above fires first.
        if self._task_id == "task_3" and self._step_count > 8:
            done = True
            message = "Hard step cap (8) exceeded. Score capped at 0.3."

        goal_message = self._goal_reached_message()
        if goal_message is not None:
            done = True
            message = goal_message

        return done, message

    def _goal_reached_message(self) -> Optional[str]:
        """Return the completion message if the current task's goal is met, else None."""
        if self._task_id == "task_1":
            # Linter ran, or the agent read main.py and grepped for json.
            entries = [h.upper() for h in self._action_history]
            linted = any("BASH" in h and "LINT" in h for h in entries)
            read_main = any("FILEREAD" in h and "MAIN.PY" in h for h in entries)
            found_json = any("GREP" in h and "JSON" in h for h in entries)
            if linted or (read_main and found_json):
                return "Bug identified β€” grading episode."

        elif self._task_id == "task_2":
            # config.json was written with timeout=10.
            import json

            try:
                cfg = json.loads(self._snapshot.get("config.json", "{}"))
            except (ValueError, TypeError):
                # Malformed or non-text config simply means "not done yet".
                cfg = None
            if isinstance(cfg, dict) and cfg.get("timeout") == 10:
                return "Config patched successfully β€” grading episode."

        elif self._task_id == "task_3":
            # main.py now contains a cache mechanism.
            main_src = self._snapshot.get("main.py", "")
            if "lru_cache" in main_src or "_cache" in main_src:
                return "Caching implemented β€” grading episode."

        elif self._task_id == "task_4":
            # .env holds the key and main.py reads it via os.getenv.
            main_src = self._snapshot.get("main.py", "")
            env_src = self._snapshot.get(".env", "")
            if (
                "API_KEY=SECRET_TOKEN_XYZ" in env_src.replace(" ", "")
                and "os.getenv" in main_src
                and "SECRET_TOKEN_XYZ" not in main_src
            ):
                return "Secret migrated successfully β€” grading episode."

        return None

    @property
    def state(self) -> AgentState:
        """Snapshot of the current episode's metadata."""
        return AgentState(
            episode_id=self._episode_id,
            step_count=self._step_count,
            task_id=self._task_id,
            task_description=self._task.get("description", ""),
            difficulty=self._task.get("difficulty", ""),
            max_steps=self._max_steps,
            visible_files=list(self._visible_files),
            discovered_files=list(self._discovered_files),
            action_history=list(self._action_history),
            current_reward=round(self._cumulative_reward, 4),
            completed=self._done,
            grader_score=self._grader_score,
        )

    def close(self) -> None:
        """Nothing to release: all episode state lives in memory."""
        pass

    def _terminal_obs(self, msg: str) -> AgentObservation:
        """Build a zero-reward, done=True observation carrying *msg*."""
        return AgentObservation(
            visible_files=list(self._visible_files),
            last_tool_result=msg,
            action_history=list(self._action_history),
            step_count=self._step_count,
            task_description=self._task.get("description", ""),
            message=msg,
            done=True,
            reward=0.0,
            metadata={"task_id": self._task_id, "grader_score": self._grader_score},
        )
285
+
286
+
287
def get_last_grader_result() -> Optional[dict]:
    """Return the module-level record of the most recently graded episode.

    The value is ``None`` until an episode has finished and been graded.
    """
    latest = _last_grader_result
    return latest
server/inference.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AgentOps Gym β€” Baseline inference script.
4
+
5
+ Runs an LLM agent against all 4 tasks and reports per-task scores
6
+ in the mandatory OpenEnv stdout format.
7
+
8
+ Environment variables (MANDATORY):
9
+ API_BASE_URL LLM API endpoint (default: https://router.huggingface.co/v1)
10
+ MODEL_NAME Model identifier (default: Qwen/Qwen2.5-72B-Instruct)
11
+ HF_TOKEN HuggingFace / API key (must be set)
12
+ IMAGE_NAME Docker image name (must be set)
13
+
14
+ Usage:
15
+ IMAGE_NAME=agentops-gym HF_TOKEN=xxx python inference.py
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import os
22
+ import re
23
+ import sys
24
+ import time
25
+ from typing import Any, Dict, List, Optional
26
+
27
+ import requests
28
+ from openai import OpenAI
29
+
30
+
31
+ # Load .env file if present (works without it too)
32
+ try:
33
+ from dotenv import load_dotenv
34
+ load_dotenv()
35
+ except ImportError:
36
+ pass
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Configuration
40
+ # ---------------------------------------------------------------------------
41
+
42
+ IMAGE_NAME = os.getenv("IMAGE_NAME")
43
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
44
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
45
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
46
+ BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
47
+
48
+ BENCHMARK = "agentops-gym"
49
+ MAX_STEPS = 10
50
+ TEMPERATURE = 0.3
51
+ MAX_TOKENS = 600
52
+
53
+ ALL_TASKS = ["task_1", "task_2", "task_3", "task_4"]
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # System prompt
57
+ # ---------------------------------------------------------------------------
58
+
59
+ SYSTEM_PROMPT = """\
60
+ You are an expert software engineer agent. You solve coding tasks by calling tools.
61
+
62
+ Available tools:
63
+ FileRead β€” Read a file. Parameters: {"filename": "path/to/file.py"}
64
+ FileWrite β€” Write/overwrite. Parameters: {"filename": "...", "content": "..."}
65
+ Grep β€” Search all files. Parameters: {"pattern": "regex_or_string"}
66
+ Bash β€” Simulated shell. Parameters: {"command": "lint main.py"}
67
+ WebSearch β€” Search docs. Parameters: {"query": "python lru_cache"}
68
+ TodoWrite β€” Record a plan. Parameters: {"plan": "1. Do X\\n2. Do Y"}
69
+
70
+ RULES:
71
+ 1. Respond ONLY with a single JSON object β€” no markdown, no extra text.
72
+ 2. Format exactly: {"tool": "ToolName", "parameters": {...}, "reasoning": "why"}
73
+ 3. Be efficient β€” minimize total tool calls.
74
+ 4. For hard tasks: call TodoWrite FIRST to plan, then act.
75
+ 5. Never repeat the exact same tool + parameters twice in a row.
76
+
77
+ Example:
78
+ {"tool": "Grep", "parameters": {"pattern": "def fetch"}, "reasoning": "Find the function"}
79
+ """
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Mandatory stdout log helpers
83
+ # ---------------------------------------------------------------------------
84
+
85
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line that opens an episode's stdout log."""
    line = "[START] task={} env={} model={}".format(task, env, model)
    print(line, flush=True)
87
+
88
+
89
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` line.

    Newlines in *action* are flattened to spaces and the text is cut to 200
    characters; a missing/empty *error* is rendered as ``null``.
    """
    flattened_action = " ".join(str(action).split("\n"))[:200]
    error_text = error or "null"
    done_text = str(done).lower()
    fields = [
        "[STEP]",
        f"step={step}",
        f"action={flattened_action}",
        f"reward={reward:.2f}",
        f"done={done_text}",
        f"error={error_text}",
    ]
    print(" ".join(fields), flush=True)
97
+
98
+
99
def log_end(success: bool, steps: int, rewards: List[float]) -> None:
    """Print the ``[END]`` line with the comma-joined, two-decimal step rewards."""
    formatted_rewards = [format(value, ".2f") for value in rewards]
    line = "[END] success={} steps={} rewards={}".format(
        str(success).lower(), steps, ",".join(formatted_rewards)
    )
    print(line, flush=True)
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # HTTP helpers
108
+ # ---------------------------------------------------------------------------
109
+
110
def http_reset(task_id: str) -> Dict:
    """POST ``/reset`` for *task_id* and return the decoded JSON response.

    Raises ``requests.HTTPError`` on a non-2xx status.
    """
    endpoint = f"{BASE_URL}/reset"
    payload = {"task_id": task_id}
    response = requests.post(endpoint, json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
119
+
120
+
121
def http_step(tool: str, parameters: Dict, reasoning: str = "") -> Dict:
    """POST ``/step`` with the tool call wrapped under ``action``; return the JSON reply.

    Raises ``requests.HTTPError`` on a non-2xx status.
    """
    action = {"tool": tool, "parameters": parameters, "reasoning": reasoning}
    response = requests.post(
        f"{BASE_URL}/step",
        json={"action": action},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
137
+
138
+
139
def http_grader() -> Dict:
    """GET ``/grader``; return its JSON on HTTP 200, otherwise an empty dict."""
    response = requests.get(f"{BASE_URL}/grader", timeout=10)
    if response.status_code != 200:
        return {}
    return response.json()
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Prompt builder
147
+ # ---------------------------------------------------------------------------
148
+
149
def build_prompt(obs: Dict) -> str:
    """Render an observation dict as the user prompt sent to the model.

    Optional sections (last tool result, history, env message) are emitted
    only when the observation carries a truthy value for them.
    """
    task_text = obs.get("task_description", "")
    files = obs.get("visible_files", [])
    sections = [f"TASK: {task_text}", f"\nVisible files: {files}"]

    tool_output = obs.get("last_tool_result")
    if tool_output:
        # Long tool outputs are cut to 1500 characters.
        sections.append("\nLast tool result:\n" + str(tool_output)[:1500])

    past_actions = obs.get("action_history", [])
    if past_actions:
        sections.append(f"\nHistory (last 3): {past_actions[-3:]}")

    if obs.get("message"):
        env_message = obs["message"]
        sections.append(f"\nEnv message: {env_message}")

    remaining = obs.get("metadata", {}).get("steps_remaining", "?")
    current_step = obs.get("step_count", 0)
    sections += [
        f"\nStep {current_step}, steps remaining: {remaining}",
        "\nRespond with a single JSON tool call:",
    ]
    return "\n".join(sections)
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # JSON extraction
169
+ # ---------------------------------------------------------------------------
170
+
171
def extract_tool_call(text: str) -> Optional[Dict]:
    """Extract the first JSON object containing a ``"tool"`` key from model output.

    The text may be bare JSON, JSON inside markdown fences, or JSON embedded
    in surrounding prose. Every ``{`` is tried as the start of a JSON value,
    so objects with nested dicts (such as ``"parameters"``) are handled.

    Returns:
        The decoded tool-call dict, or ``None`` if no such object is found.
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value at `start` and ignores trailing text.
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and "tool" in candidate:
            return candidate
        start = text.find("{", start + 1)
    return None
198
+
199
+ # ---------------------------------------------------------------------------
200
+ # Episode runner
201
+ # ---------------------------------------------------------------------------
202
+
203
def run_episode(client: OpenAI, task_id: str) -> Dict:
    """Run one episode of *task_id* against the environment server.

    Resets the environment, then repeatedly asks the model for a tool call
    and executes it until the environment reports done, ``MAX_STEPS`` is
    reached, or an LLM/HTTP error occurs. The grader is consulted only when
    the episode actually finished, so a failed run cannot pick up the score
    of an earlier episode.

    Returns:
        Dict with ``task_id``, ``score``, ``steps``, ``success`` and ``rewards``.
    """
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    episode_done = False

    try:
        # Reset
        reset_resp = http_reset(task_id)
        obs = reset_resp.get("observation", {})

        for step in range(1, MAX_STEPS + 1):
            if reset_resp.get("done") or obs.get("done"):
                break

            # Ask the model
            prompt = build_prompt(obs)
            try:
                completion = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt},
                    ],
                    max_tokens=MAX_TOKENS,
                    temperature=TEMPERATURE,
                )
                raw = (completion.choices[0].message.content or "").strip()
            except Exception as e:
                log_step(step=step, action="(llm_error)", reward=0.0, done=True, error=str(e))
                break

            tool_call = extract_tool_call(raw)
            if tool_call is None:
                # Fallback: safe no-op grep
                tool_call = {
                    "tool": "Grep",
                    "parameters": {"pattern": "def "},
                    "reasoning": "fallback β€” could not parse model output",
                }

            tool = tool_call.get("tool", "Grep")
            params = tool_call.get("parameters", {})
            reasoning = tool_call.get("reasoning", "")
            action_str = f"{tool}({json.dumps(params)})"

            # Execute
            try:
                step_resp = http_step(tool, params, reasoning)
            except requests.RequestException as e:
                # Covers HTTP status errors as well as connection failures/timeouts.
                log_step(step=step, action=action_str, reward=0.0, done=True, error=str(e))
                break

            obs = step_resp.get("observation", {})
            reward = float(step_resp.get("reward", 0.0) or 0.0)
            done = bool(step_resp.get("done", False))
            rewards.append(reward)
            steps_taken = step

            log_step(step=step, action=action_str, reward=reward, done=done, error=None)

            if done:
                episode_done = True
                break

        # The grader endpoint reports the last completed episode; only trust
        # it when this episode finished and the result is for this task.
        if episode_done:
            grader = http_grader()
            if grader.get("task_id", task_id) == task_id:
                score = float(grader.get("score", 0.0) or 0.0)
        success = score >= 0.5

    except Exception as exc:
        print(f"[DEBUG] Episode error for {task_id}: {exc}", flush=True)

    finally:
        log_end(success=success, steps=steps_taken, rewards=rewards)

    return {
        "task_id": task_id,
        "score": score,
        "steps": steps_taken,
        "success": success,
        "rewards": rewards,
    }
290
+
291
+
292
def main() -> None:
    """Run the baseline agent on every task in ``ALL_TASKS`` and print a summary.

    Exits with status 1 if no API key is configured or if the environment
    server's ``/health`` endpoint does not answer 200 within 10 attempts.
    """
    if not API_KEY:
        # API_KEY is read from HF_TOKEN, falling back to OPENAI_API_KEY.
        print("ERROR: HF_TOKEN (or OPENAI_API_KEY) must be set.", file=sys.stderr)
        print(" export HF_TOKEN=hf_xxx", file=sys.stderr)
        sys.exit(1)

    for attempt in range(10):
        try:
            r = requests.get(f"{BASE_URL}/health", timeout=5)
            if r.status_code == 200:
                break
        except requests.RequestException:
            # Server not reachable yet; retry after a short pause.
            pass
        print(f"[DEBUG] Waiting for server... attempt {attempt+1}/10", flush=True)
        time.sleep(2)
    else:
        print("ERROR: Server did not become ready.", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    print("=" * 60, flush=True)
    print("AgentOps Gym β€” Baseline Inference", flush=True)
    print(f"Model: {MODEL_NAME} | Server: {BASE_URL}", flush=True)
    print("=" * 60, flush=True)

    results = []
    for task_id in ALL_TASKS:
        print("─" * 40, flush=True)
        results.append(run_episode(client, task_id))

    print("=" * 60, flush=True)
    print("BASELINE SUMMARY", flush=True)
    print("=" * 60, flush=True)

    total = sum(r["score"] for r in results)
    solved = sum(1 for r in results if r["success"])
    avg = total / len(results) if results else 0.0

    for r in results:
        status = "βœ… PASS" if r["success"] else "❌ FAIL"
        print(f" {r['task_id']:>8} score={r['score']:.3f} steps={r['steps']:2d} {status}", flush=True)

    print(f"\n Average score: {avg:.3f}", flush=True)
    print(f" Solved: {solved} / {len(results)}", flush=True)
    print("=" * 60, flush=True)
339
+
340
+
341
+ if __name__ == "__main__":
342
+ main()
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv[core]>=0.2.0
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+
5
+
6
+
server/tasks.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AgentOps Gym β€” Task definitions and deterministic graders.
3
+
4
+ 4 tasks with a clear difficulty gradient:
5
+ task_1 (easy) — Bug Localization
6
+ task_2 (medium) — Config Patching
7
+ task_3 (hard) — Caching Implementation
+ task_4 (medium) — Secret Migration
8
+
9
+ Each grader returns a float in [0.0, 1.0] and a breakdown dict.
10
+ Graders check the in-memory snapshot state and keywords in the recorded action history.
11
+ """
12
+
13
+ import json
14
+ import re
15
+ from typing import Dict, Any, List, Tuple, Optional
16
+
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Task registry
20
+ # ---------------------------------------------------------------------------
21
+
22
+ TASK_REGISTRY: Dict[str, Dict[str, Any]] = {
23
+ "task_1": {
24
+ "name": "Bug Localization",
25
+ "difficulty": "easy",
26
+ "max_steps": 8,
27
+ "optimal_steps": 3,
28
+ "description": (
29
+ "The fetch_user function in this project is broken. "
30
+ "Users report it always returns None instead of user data. "
31
+ "Find the bug and report which file and line number contains it."
32
+ ),
33
+ "initial_visible_files": ["README.md"],
34
+ },
35
+ "task_2": {
36
+ "name": "Config Patching",
37
+ "difficulty": "medium",
38
+ "max_steps": 10,
39
+ "optimal_steps": 4,
40
+ "description": (
41
+ "Production is timing out. Someone reported the API timeout is misconfigured. "
42
+ "Find the config file and change the timeout value from 30 to 10."
43
+ ),
44
+ "initial_visible_files": ["main.py", "README.md"],
45
+ },
46
+ "task_3": {
47
+ "name": "Caching Implementation",
48
+ "difficulty": "hard",
49
+ "max_steps": 8,
50
+ "optimal_steps": 6,
51
+ "description": (
52
+ "API latency is high. Logs show fetch_user() is being called repeatedly "
53
+ "with the same user_id. Implement simple in-memory caching for fetch_user. "
54
+ "You have 8 tool calls max. Plan before acting."
55
+ ),
56
+ "initial_visible_files": ["README.md"],
57
+ },
58
+ "task_4": {
59
+ "name": "Secret Migration",
60
+ "difficulty": "medium",
61
+ "max_steps": 10,
62
+ "optimal_steps": 4,
63
+ "description": (
64
+ "Security audit found a hardcoded API key in main.py. "
65
+ "Move the key 'SECRET_TOKEN_XYZ' to a new .env file as API_KEY=SECRET_TOKEN_XYZ "
66
+ "and update main.py to load it using os.getenv('API_KEY')."
67
+ ),
68
+ "initial_visible_files": ["main.py", "README.md"],
69
+ },
70
+ }
71
+
72
+
73
def get_task(task_id: str) -> Dict[str, Any]:
    """Return the registry entry for *task_id*.

    Raises:
        KeyError: if *task_id* is not registered; the message lists valid ids.
    """
    try:
        return TASK_REGISTRY[task_id]
    except KeyError:
        raise KeyError(
            f"Unknown task_id: {task_id!r}. Available: {list(TASK_REGISTRY.keys())}"
        ) from None
77
+
78
+
79
def list_task_ids() -> List[str]:
    """Return the registered task ids in registry order."""
    return [*TASK_REGISTRY]
81
+
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Step-level reward (called on every step)
85
+ # ---------------------------------------------------------------------------
86
+
87
def compute_step_reward(
    task_id: str,
    tool: str,
    parameters: Dict[str, Any],
    tool_result: str,
    action_history: List[str],
    discovered_files: List[str],
    snapshot: Dict[str, str],
) -> Tuple[float, Dict[str, float]]:
    """Score a single step.

    ``action_history`` must be the history BEFORE the current action was
    appended. Each adjustment that applies is recorded in the breakdown
    under its own key, and the reward is the rounded sum of those entries.

    Returns:
        ``(reward, breakdown)`` with the reward rounded to 3 decimals.
    """
    adjustments: Dict[str, float] = {}
    this_call = f"{tool}({parameters})"

    # Exact repeat of the immediately preceding call.
    if action_history and action_history[-1] == this_call:
        adjustments["repeat_penalty"] = -0.15

    # FileRead/FileWrite naming a file that has not been discovered.
    if tool in ("FileRead", "FileWrite"):
        target = parameters.get("filename", "")
        if target and target not in discovered_files:
            adjustments["hallucination_penalty"] = -0.10

    # Planning bonus: TodoWrite as the very first action of the episode.
    if tool == "TodoWrite" and not action_history:
        adjustments["planning_bonus"] = 0.05

    # Tool reported an error.
    if tool_result.startswith("ERROR:"):
        adjustments["error_penalty"] = -0.05

    # Task-specific signal for productive actions.
    signal = _task_step_signal(task_id, tool, parameters, tool_result, action_history)
    if signal != 0.0:
        adjustments["task_signal"] = signal

    total = sum(adjustments.values(), 0.0)
    return round(total, 3), adjustments
137
+
138
+
139
+ def _task_step_signal(
140
+ task_id: str, tool: str, params: Dict, result: str, history: List[str]
141
+ ) -> float:
142
+ """Small positive reward for productive actions toward the task goal."""
143
+ if task_id == "task_1":
144
+ # Reward discovering relevant files/patterns
145
+ if tool == "Grep" and "json" in str(params).lower():
146
+ return 0.05
147
+ if tool == "FileRead" and params.get("filename") == "main.py":
148
+ return 0.10
149
+ if tool == "Bash" and "lint" in str(params).lower():
150
+ return 0.05
151
+ elif task_id == "task_2":
152
+ if tool == "Grep" and "timeout" in str(params).lower():
153
+ return 0.05
154
+ if tool == "FileRead" and params.get("filename") == "config.json":
155
+ return 0.10
156
+ if tool == "FileWrite" and params.get("filename") == "config.json":
157
+ return 0.05
158
+ elif task_id == "task_3":
159
+ if tool == "TodoWrite":
160
+ return 0.05
161
+ if tool == "WebSearch" and "cache" in str(params).lower():
162
+ return 0.05
163
+ if tool == "FileRead" and params.get("filename") == "main.py":
164
+ return 0.05
165
+ if tool == "FileWrite" and params.get("filename") == "main.py":
166
+ return 0.05
167
+ elif task_id == "task_4":
168
+ if tool == "FileWrite" and params.get("filename") == ".env":
169
+ return 0.10
170
+ if tool == "FileRead" and params.get("filename") == "main.py":
171
+ return 0.05
172
+ if tool == "Grep" and "SECRET_TOKEN" in str(params).upper():
173
+ return 0.05
174
+ return 0.0
175
+
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Episode-level graders (called at done=True)
179
+ # ---------------------------------------------------------------------------
180
+
181
def grade_episode(
    task_id: str,
    snapshot: Dict[str, str],
    action_history: List[str],
    steps_used: int,
) -> Tuple[float, Dict[str, float]]:
    """Compute the final episode score by dispatching to the task's grader.

    Returns ``(score, breakdown)``. An unknown task or a grader that raises
    yields a score of 0.0 with an ``"error"`` entry in the breakdown.
    """
    dispatch = (
        ("task_1", _grade_task1),
        ("task_2", _grade_task2),
        ("task_3", _grade_task3),
        ("task_4", _grade_task4),
    )
    for known_id, grader in dispatch:
        if known_id != task_id:
            continue
        try:
            return grader(snapshot, action_history, steps_used)
        except Exception as exc:
            return 0.0, {"error": str(exc)}
    return 0.0, {"error": f"No grader for {task_id}"}
201
+
202
+
203
+ def _efficiency_score(steps_used: int, optimal_steps: int) -> float:
204
+ """Efficiency component: 1.0 at optimal, -0.08 per extra step, min 0."""
205
+ return max(0.0, 1.0 - (steps_used - optimal_steps) * 0.08)
206
+
207
+
208
+ def _history_contains(history: List[str], *keywords: str) -> bool:
209
+ """True if any history entry contains ALL keywords (case-insensitive)."""
210
+ for entry in history:
211
+ upper = entry.upper()
212
+ if all(kw.upper() in upper for kw in keywords):
213
+ return True
214
+ return False
215
+
216
+
217
+ def _history_contains_any(history: List[str], *keywords: str) -> bool:
218
+ for entry in history:
219
+ upper = entry.upper()
220
+ if any(kw.upper() in upper for kw in keywords):
221
+ return True
222
+ return False
223
+
224
+
225
+ # ── Task 1: Bug Localization ──────────────────────────────────────────────
226
+
227
def _grade_task1(
    snapshot: Dict[str, str],
    history: List[str],
    steps_used: int,
) -> Tuple[float, Dict[str, float]]:
    """Grade the Bug Localization task from the recorded action history.

    Correctness components:
      +0.30 — some action references main.py
      +0.40 — main.py was read via FileRead and some action mentions "json"
      +0.30 — the linter was run (a single Bash action mentioning "lint")
    Final score = correctness * 0.7 + efficiency * 0.3, capped at 1.0.

    ``snapshot`` is accepted for a uniform grader signature and is not used.
    """
    breakdown: Dict[str, float] = {}
    score = 0.0

    # Found correct file
    if _history_contains_any(history, "MAIN.PY"):
        breakdown["found_correct_file"] = 0.30
        score += 0.30

    # Found correct line: agent read main.py and looked at the json handling
    main_read = _history_contains(history, "FILEREAD", "MAIN.PY")
    mentioned_json = _history_contains_any(history, "JSON")
    if main_read and mentioned_json:
        breakdown["found_correct_line"] = 0.40
        score += 0.40

    # Ran the linter: both keywords must occur in the SAME action, so an
    # unrelated Bash call does not earn this credit.
    if _history_contains(history, "BASH", "LINT"):
        breakdown["ran_linter"] = 0.30
        score += 0.30

    eff = _efficiency_score(steps_used, TASK_REGISTRY["task_1"]["optimal_steps"])
    final = score * 0.7 + eff * 0.3
    breakdown["efficiency"] = round(eff, 3)
    return round(min(1.0, final), 4), breakdown
264
+
265
+
266
+ # ── Task 2: Config Patching ──────────────────────────────────────────────
267
+
268
def _grade_task2(
    snapshot: Dict[str, str],
    history: List[str],
    steps_used: int,
) -> Tuple[float, Dict[str, float]]:
    """Grade task 2 (config patching).

    Correctness points:
        +0.20  some history entry mentions "config.json"
        +0.20  the first FileRead of the config precedes the first FileWrite
        -0.20  the config was written without ever being read
        +0.40  ``config.json`` in the snapshot is an object with timeout == 10
        +0.20  ``config.json`` in the snapshot parses as JSON
        -0.10  ``config.json`` is missing or does not parse as JSON

    The final score is ``correctness * 0.7 + efficiency * 0.3``, clamped to
    [0.0, 1.0] and rounded to four decimals.

    Returns:
        ``(score, breakdown)`` where ``breakdown`` maps each applied
        criterion to its (possibly negative) points plus ``"efficiency"``.
    """
    breakdown: Dict[str, float] = {}
    score = 0.0

    # Found config.json
    if _history_contains_any(history, "CONFIG.JSON"):
        breakdown["found_config"] = 0.20
        score += 0.20

    # Read before write (good safety practice). Upper-case once, not per scan.
    upper_history = [entry.upper() for entry in history]
    read_idx = next(
        (i for i, h in enumerate(upper_history) if "FILEREAD" in h and "CONFIG" in h),
        None,
    )
    write_idx = next(
        (i for i, h in enumerate(upper_history) if "FILEWRITE" in h and "CONFIG" in h),
        None,
    )
    if read_idx is not None and write_idx is not None and read_idx < write_idx:
        breakdown["read_before_write"] = 0.20
        score += 0.20
    elif write_idx is not None and read_idx is None:
        # Destructive write without reading
        breakdown["destructive_write_penalty"] = -0.20
        score -= 0.20

    # Parse first, then inspect: only a genuine parse failure is "invalid JSON".
    try:
        cfg = json.loads(snapshot.get("config.json", ""))
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-str content.
        breakdown["invalid_json_penalty"] = -0.10
        score -= 0.10
    else:
        # Valid JSON that is not an object (e.g. a list) has no timeout key
        # but must still be credited as valid JSON rather than penalized.
        if isinstance(cfg, dict) and cfg.get("timeout") == 10:
            breakdown["correct_timeout_value"] = 0.40
            score += 0.40
        breakdown["valid_json"] = 0.20
        score += 0.20

    eff = _efficiency_score(steps_used, TASK_REGISTRY["task_2"]["optimal_steps"])
    final = score * 0.7 + eff * 0.3
    breakdown["efficiency"] = round(eff, 3)
    return round(min(1.0, max(0.0, final)), 4), breakdown
316
+
317
+
318
+ # ── Task 3: Caching Implementation ───────────────────────────────────────
319
+
320
def _grade_task3(
    snapshot: Dict[str, str],
    history: List[str],
    steps_used: int,
) -> Tuple[float, Dict[str, float]]:
    """Grade task 3 (add caching to ``fetch_user`` in main.py).

    Correctness points:
        +0.30  main.py contains "lru_cache" or a dict-style cache assignment
        +0.30  ``lru_cache`` is applied to ``fetch_user`` (decorator stack
               directly above its ``def``, or a same-line wrap)
        +0.20  instead of the above, when only a dict cache is present
        +0.20  history mentions Bash/lint and no entry contains
               "issue(s) found" or "error"
        +0.10  history mentions TodoWrite
        +0.10  history mentions WebSearch

    If more than 8 steps were used, the correctness total is capped at 0.30
    before blending. The final score is ``correctness * 0.7 + efficiency *
    0.3``, clamped to [0.0, 1.0] and rounded to four decimals.

    Returns:
        ``(score, breakdown)``; ``breakdown`` also carries ``"efficiency"``
        and, when the cap fires, ``"hard_cap_applied": True``.
    """
    breakdown: Dict[str, float] = {}
    score = 0.0

    main_content = snapshot.get("main.py", "")

    # Cache mechanism present
    has_lru = "lru_cache" in main_content
    has_dict_cache = bool(
        re.search(r'_cache\s*=\s*\{', main_content)
        or re.search(r'cache\s*=\s*\{\}', main_content)
    )
    if has_lru or has_dict_cache:
        breakdown["cache_mechanism_present"] = 0.30
        score += 0.30

        # Correct function modified
        if "fetch_user" in main_content:
            # The lru_cache decorator must sit in the decorator stack directly
            # above `def fetch_user`. A DOTALL `.*` here would also accept a
            # decorator on any unrelated function defined earlier in the file.
            decorated = re.search(
                r'^[ \t]*@[^\n]*lru_cache[^\n]*\n'
                r'(?:[ \t]*@[^\n]*\n)*'
                r'[ \t]*def[ \t]+fetch_user\b',
                main_content,
                re.MULTILINE,
            )
            # Same-line form, e.g. fetch_user = lru_cache(...)(fetch_user)
            wrapped = re.search(r'lru_cache.*fetch_user', main_content)
            if decorated or wrapped:
                breakdown["correct_function_modified"] = 0.30
                score += 0.30
            elif has_dict_cache:
                breakdown["correct_function_modified"] = 0.20
                score += 0.20

    # Lint passed: the linter was run and no issue/error text was recorded
    bash_lint = _history_contains_any(history, "BASH", "LINT")
    if bash_lint and not _history_contains_any(history, "ISSUE(S) FOUND", "ERROR"):
        breakdown["lint_passes"] = 0.20
        score += 0.20

    # Used TodoWrite
    if _history_contains_any(history, "TODOWRITE"):
        breakdown["planning_bonus"] = 0.10
        score += 0.10

    # Used WebSearch
    if _history_contains_any(history, "WEBSEARCH"):
        breakdown["websearch_bonus"] = 0.10
        score += 0.10

    # Hard cap for exceeding 8 steps (applies to the correctness total only)
    if steps_used > 8:
        score = min(score, 0.30)
        # Kept as a boolean flag for backward compatibility with consumers.
        breakdown["hard_cap_applied"] = True

    eff = _efficiency_score(steps_used, TASK_REGISTRY["task_3"]["optimal_steps"])
    final = score * 0.7 + eff * 0.3
    breakdown["efficiency"] = round(eff, 3)
    return round(min(1.0, max(0.0, final)), 4), breakdown
381
+
382
+
383
+ # ── Task 4: Secret Migration ──────────────────────────────────────────────
384
+
385
def _grade_task4(
    snapshot: Dict[str, str],
    history: List[str],
    steps_used: int,
) -> Tuple[float, Dict[str, float]]:
    """Grade task 4 (move the hard-coded API key into ``.env``).

    Correctness points:
        +0.30  ``.env`` (spaces ignored) contains API_KEY=SECRET_TOKEN_XYZ
        +0.40  main.py contains "import os" and an ``os.getenv`` call for
               API_KEY (single- or double-quoted)
        +0.20  main.py does not contain SECRET_TOKEN_XYZ
        +0.10  history mentions TodoWrite

    The final score is ``correctness * 0.7 + efficiency * 0.3``, clamped to
    [0.0, 1.0] and rounded to four decimals.

    Returns:
        ``(score, breakdown)`` where ``breakdown`` maps each earned
        criterion to its points plus an ``"efficiency"`` entry.
    """
    env_text = snapshot.get(".env", "")
    main_text = snapshot.get("main.py", "")

    getenv_calls = ("os.getenv('API_KEY')", 'os.getenv("API_KEY")')
    reads_env = "import os" in main_text and any(
        call in main_text for call in getenv_calls
    )

    # (criterion name, points, earned?) in the order they are reported.
    checks = (
        (
            "env_file_correct",
            0.30,
            "API_KEY=SECRET_TOKEN_XYZ" in env_text.replace(" ", ""),
        ),
        ("main_uses_getenv", 0.40, reads_env),
        ("secret_removed_from_main", 0.20, "SECRET_TOKEN_XYZ" not in main_text),
        ("planning_bonus", 0.10, _history_contains_any(history, "TODOWRITE")),
    )

    breakdown: Dict[str, float] = {}
    correctness = 0.0
    for name, points, earned in checks:
        if earned:
            breakdown[name] = points
            correctness += points

    efficiency = _efficiency_score(
        steps_used, TASK_REGISTRY["task_4"]["optimal_steps"]
    )
    blended = correctness * 0.7 + efficiency * 0.3
    breakdown["efficiency"] = round(efficiency, 3)
    return round(min(1.0, max(0.0, blended)), 4), breakdown
server/tools.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AgentOps Gym — Simulated tool implementations.
3
+
4
+ All tools operate on an in-memory filesystem snapshot. No real subprocess,
5
+ no real filesystem, fully deterministic and reproducible. The fake linter/
6
+ test runner uses static analysis of the snapshot strings.
7
+ """
8
+
9
+ import re
10
+ import json
11
+ from typing import Dict, Optional, Tuple
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # In-memory project snapshots (one per task)
15
+ # ---------------------------------------------------------------------------
16
+
17
# Initial in-memory project for each task, as {task_id: {filename: content}}.
# The embedded Python sources must stay syntactically valid (task_1's main.py
# carries the intentional `response.json` bug on its line 6, which the
# simulated test runner reports).
PROJECT_SNAPSHOTS: Dict[str, Dict[str, str]] = {
    "task_1": {
        "main.py": """\
import requests

def fetch_user(user_id):
    url = f"https://api.example.com/users/{user_id}"
    response = requests.get(url)
    return response.json # BUG: missing () — should be response.json()

def main():
    user = fetch_user(123)
    print(user['name'])

if __name__ == "__main__":
    main()
""",
        "utils.py": "def helper(): pass\n",
        "config.json": '{"api_url": "https://api.example.com", "timeout": 30}\n',
        "README.md": "# Example Project\n",
    },
    "task_2": {
        "main.py": """\
import requests
import json

def fetch_data(endpoint):
    url = f"https://api.example.com/{endpoint}"
    response = requests.get(url, timeout=30)
    return response.json()

def main():
    data = fetch_data("data")
    print(data)
""",
        "utils.py": "def helper(): pass\n",
        "config.json": '{"api_url": "https://api.example.com", "timeout": 30}\n',
        "README.md": "# Example Project\n",
    },
    "task_3": {
        "main.py": """\
import requests

def fetch_user(user_id):
    url = f"https://api.example.com/users/{user_id}"
    response = requests.get(url)
    return response.json()

def main():
    for uid in range(100):
        user = fetch_user(uid)
        print(user['name'])

if __name__ == "__main__":
    main()
""",
        "utils.py": "def helper(): pass\n",
        "config.json": '{"api_url": "https://api.example.com", "timeout": 30}\n',
        "README.md": "# Example Project\n",
        "tests/test_main.py": """\
from main import fetch_user

def test_fetch_user():
    result = fetch_user(1)
    assert result is not None
""",
    },
    "task_4": {
        "main.py": """\
import requests

API_KEY = "SECRET_TOKEN_XYZ"

def fetch_data():
    headers = {"Authorization": f"Bearer {API_KEY}"}
    response = requests.get("https://api.example.com/data", headers=headers)
    return response.json()

if __name__ == "__main__":
    print(fetch_data())
""",
        "README.md": "# Project Alpha\nSecure the API key.\n",
    },
}
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Simulated web search index
104
+ # ---------------------------------------------------------------------------
105
+
106
# Canned documentation snippets for the simulated WebSearch tool, keyed by the
# lower-case term that must appear in the query. Insertion order matters: the
# first key found in a query wins.
WEB_SEARCH_DOCS: Dict[str, str] = {
    "lru_cache": """\
functools.lru_cache — Python docs
@functools.lru_cache(maxsize=128)
def my_function(arg): ...
Caches results of function calls. Use maxsize=None for unlimited cache.
""",
    "response.json": """\
requests.Response.json() — requests docs
response.json() returns the JSON-encoded content of the response.
Note: json is a method, must be called with parentheses: response.json()
""",
    "timeout": """\
requests timeout — requests docs
Set timeout in seconds: requests.get(url, timeout=10)
Recommended: keep timeout low (5-15s) for production APIs.
""",
    "python caching": """\
Python caching patterns:
1. functools.lru_cache — in-memory memoization decorator
2. dict-based cache — manual dict for full control
3. joblib.Memory — disk-backed cache
For simple in-memory caching, lru_cache is idiomatic Python.
""",
    "getenv": """\
os.getenv(key, default=None) — Python docs
Return the value of the environment variable key if it exists, or default if it doesn't.
Example:
import os
api_key = os.getenv('API_KEY')
""",
    ".env": """\
.env files — Best Practices
Store secrets and configuration in a .env file:
API_KEY=your_secret_here
Never commit .env files to version control.
""",
}
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Tool implementations
147
+ # ---------------------------------------------------------------------------
148
+
149
# Tool name -> one-line description shown to the agent. The key order is the
# order in which tools are listed in the "unknown tool" error message.
AVAILABLE_TOOLS: Dict[str, str] = dict(
    FileRead="Read contents of a specific file",
    FileWrite="Write/edit a specific file with new content",
    Grep="Search for a pattern across all files",
    Bash="Run a shell command (simulated: lint, test runner)",
    WebSearch="Search for documentation (simulated)",
    TodoWrite="Write a plan/todo list before acting",
)
157
+
158
+
159
def run_tool(
    tool: str,
    parameters: Dict,
    snapshot: Dict[str, str],
    discovered_files: list,
) -> Tuple[str, Dict[str, str], list]:
    """Execute a simulated tool against the in-memory project.

    The caller's ``snapshot`` and ``discovered_files`` are never mutated;
    shallow copies are made up front and the (possibly updated) copies are
    returned.

    Args:
        tool: Tool name; one of the keys of ``AVAILABLE_TOOLS``.
        parameters: Tool-specific arguments (``None`` is treated as empty).
        snapshot: Current ``{filename: content}`` mapping.
        discovered_files: Filenames the agent has discovered so far.

    Returns:
        ``(result_string, updated_snapshot, updated_discovered)``. Unknown
        tools yield an ``ERROR: ...`` result with unchanged state.
    """
    snapshot = dict(snapshot)
    discovered = list(discovered_files)
    parameters = parameters or {}

    if tool == "FileRead":
        return _file_read(parameters, snapshot, discovered)
    if tool == "FileWrite":
        return _file_write(parameters, snapshot, discovered)
    if tool == "Grep":
        return _grep(parameters, snapshot, discovered)
    if tool == "Bash":
        # _bash knows nothing about discovery and reports an empty list;
        # forwarding that would wipe every file discovered so far. Keep our
        # own list, and tolerate a bare-string result defensively.
        outcome = _bash(parameters, snapshot)
        if isinstance(outcome, tuple):
            result, snapshot = outcome[0], outcome[1]
        else:
            result = outcome
        return result, snapshot, discovered
    if tool == "WebSearch":
        return _web_search(parameters), snapshot, discovered
    if tool == "TodoWrite":
        return _todo_write(parameters), snapshot, discovered
    return (
        f"ERROR: Unknown tool '{tool}'. Available: {list(AVAILABLE_TOOLS.keys())}",
        snapshot,
        discovered,
    )
186
+
187
+
188
+ def _file_read(params, snapshot, discovered):
189
+ fname = params.get("filename", "")
190
+ if not fname:
191
+ return "ERROR: 'filename' parameter required for FileRead.", snapshot, discovered
192
+ if fname not in snapshot:
193
+ return f"ERROR: File '{fname}' not found in project.", snapshot, discovered
194
+ # Reveal file in discovered list
195
+ if fname not in discovered:
196
+ discovered.append(fname)
197
+ content = snapshot[fname]
198
+ lines = content.splitlines()
199
+ numbered = "\n".join(f"{i+1:3}: {line}" for i, line in enumerate(lines))
200
+ return f"=== {fname} ===\n{numbered}", snapshot, discovered
201
+
202
+
203
+ def _file_write(params, snapshot, discovered):
204
+ fname = params.get("filename", "")
205
+ content = params.get("content", "")
206
+ if not fname:
207
+ return "ERROR: 'filename' parameter required for FileWrite.", snapshot, discovered
208
+ snapshot[fname] = content
209
+ if fname not in discovered:
210
+ discovered.append(fname)
211
+ return f"Write successful: {fname} ({len(content)} bytes written)", snapshot, discovered
212
+
213
+
214
+ def _grep(params, snapshot, discovered):
215
+ pattern = params.get("pattern", "")
216
+ if not pattern:
217
+ return "ERROR: 'pattern' parameter required for Grep.", snapshot, discovered
218
+ results = []
219
+ for fname, content in snapshot.items():
220
+ for i, line in enumerate(content.splitlines(), 1):
221
+ if re.search(pattern, line, re.IGNORECASE):
222
+ results.append(f"{fname}:{i} β†’ {line.strip()}")
223
+ # Discovering a file via grep reveals it
224
+ if fname not in discovered:
225
+ discovered.append(fname)
226
+ if not results:
227
+ return f"No matches for pattern '{pattern}'.", snapshot, discovered
228
+ return "\n".join(results), snapshot, discovered
229
+
230
+
231
+ def _bash(params, snapshot):
232
+ cmd = params.get("command", "")
233
+ if not cmd:
234
+ return "ERROR: 'command' parameter required for Bash.", snapshot, []
235
+
236
+ cmd_lower = cmd.lower()
237
+
238
+ # Simulated linter
239
+ if "lint" in cmd_lower or "flake8" in cmd_lower or "pylint" in cmd_lower:
240
+ fname = None
241
+ for f in snapshot:
242
+ if f.endswith(".py") and f in cmd:
243
+ fname = f
244
+ break
245
+ if fname and fname in snapshot:
246
+ return _lint_file(fname, snapshot[fname]), snapshot, []
247
+ # Lint all py files
248
+ out = []
249
+ for f, content in snapshot.items():
250
+ if f.endswith(".py"):
251
+ out.append(_lint_file(f, content))
252
+ return "\n".join(out) if out else "No Python files found.", snapshot, []
253
+
254
+ # Simulated test runner
255
+ if "pytest" in cmd_lower or "test" in cmd_lower:
256
+ test_files = [f for f in snapshot if "test" in f]
257
+ if not test_files:
258
+ return "No test files found.", snapshot, []
259
+ # Check if main.py has obvious bugs
260
+ main_content = snapshot.get("main.py", "")
261
+ if "response.json\n" in main_content or "response.json " in main_content:
262
+ return '{"status": "error", "file": "main.py", "line": 6, "message": "AttributeError: method object is not subscriptable β€” did you forget response.json()?"}'
263
+ return '{"status": "pass", "passed": 1, "failed": 0}', snapshot, []
264
+
265
+ # Simulated validate (for config check)
266
+ if "validate" in cmd_lower or "json" in cmd_lower:
267
+ for fname, content in snapshot.items():
268
+ if fname.endswith(".json") and fname in cmd:
269
+ try:
270
+ json.loads(content)
271
+ return f"βœ“ {fname} is valid JSON", snapshot, []
272
+ except json.JSONDecodeError as e:
273
+ return f"βœ— {fname} invalid JSON: {e}", snapshot, []
274
+ return "Validation complete.", snapshot, []
275
+
276
+ return f"$ {cmd}\n(simulated) Command executed. No output.", snapshot, []
277
+
278
+
279
+ def _lint_file(fname: str, content: str) -> str:
280
+ errors = []
281
+ for i, line in enumerate(content.splitlines(), 1):
282
+ # Check for common bug: response.json without ()
283
+ if re.search(r'response\.json\b(?!\()', line):
284
+ errors.append(f' {fname}:{i}: E001 response.json called without parentheses β€” should be response.json()')
285
+ # Check for bare except
286
+ if re.match(r'\s*except\s*:', line):
287
+ errors.append(f' {fname}:{i}: W001 Bare except clause detected')
288
+ # Check for hardcoded secrets (task_4)
289
+ if "SECRET_TOKEN_XYZ" in line and fname == "main.py":
290
+ errors.append(f' {fname}:{i}: E002 Hardcoded secret detected β€” use environment variables')
291
+ if errors:
292
+ return f'{fname}: {len(errors)} issue(s) found\n' + '\n'.join(errors)
293
+ return f'{fname}: OK'
294
+
295
+
296
def _web_search(params) -> str:
    """Look up canned documentation for a query (simulated WebSearch).

    The lower-cased ``params["query"]`` is scanned for each key of
    ``WEB_SEARCH_DOCS`` in insertion order; the first key found as a
    substring decides which document is returned.

    Returns:
        The matching document text, or a "No results found" notice that
        echoes the original query.
    """
    raw_query = params.get("query", "")
    needle = raw_query.lower()
    hit = next(
        (doc for key, doc in WEB_SEARCH_DOCS.items() if key in needle),
        None,
    )
    if hit is not None:
        return hit
    return f"No results found for '{raw_query}'. Try more specific terms."
302
+
303
+
304
+ def _todo_write(params) -> str:
305
+ plan = params.get("plan", params.get("content", ""))
306
+ if not plan:
307
+ return "ERROR: 'plan' parameter required for TodoWrite."
308
+ return f"βœ“ Plan recorded:\n{plan}"
uv.lock ADDED
The diff for this file is too large to render. See raw diff