Spaces:
Sleeping
Sleeping
Upload 29 files
Browse files- Dockerfile +16 -0
- README.md +72 -7
- app.py +97 -0
- demo.py +32 -0
- environment/__init__.py +2 -0
- environment/action_space.py +37 -0
- environment/api_triage_env.py +156 -0
- environment/incident_generator.py +76 -0
- environment/reward.py +46 -0
- inference.py +166 -0
- openenv.yaml +60 -0
- pyproject.toml +25 -0
- requirements.txt +8 -0
- server/app.py +96 -0
- tasks/__init__.py +1 -0
- tasks/auth_error/__init__.py +1 -0
- tasks/auth_error/grader.py +9 -0
- tasks/grading_helper.py +45 -0
- tasks/missing_fields/__init__.py +1 -0
- tasks/missing_fields/grader.py +9 -0
- tasks/rate_limit/__init__.py +1 -0
- tasks/rate_limit/grader.py +9 -0
- tasks/timeout/__init__.py +1 -0
- tasks/timeout/grader.py +9 -0
- tasks/wrong_endpoint/__init__.py +1 -0
- tests/__init__.py +1 -0
- tests/test_env.py +51 -0
- tests/test_graders.py +125 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

WORKDIR /app

# Install dependencies before copying the code so this layer can be reused
# when only application files change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code and configuration.
COPY environment/ ./environment/
COPY tests/ ./tests/
COPY app.py .
COPY inference.py .
COPY openenv.yaml .

# Same port that app.py passes to uvicorn.
EXPOSE 7860

CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,12 +1,77 @@
|
|
| 1 |
---
|
| 2 |
-
title: API
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
license: other
|
| 9 |
-
short_description: API Triage Agent - AI agent debugs API failures
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: API Triage Agent
|
| 3 |
+
emoji: 🤖
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
+
# API Triage Agent
|
| 11 |
+
|
| 12 |
+
## Environment Description
|
| 13 |
+
An AI agent that diagnoses and resolves API integration failures by inspecting logs, identifying error types, and taking corrective actions. Simulates real-world API debugging scenarios.
|
| 14 |
+
|
| 15 |
+
## Motivation
|
| 16 |
+
API failures are common in production. This environment teaches an agent to handle authentication errors, missing fields, rate limits, wrong endpoints, and server errors – just like a real support engineer.
|
| 17 |
+
|
| 18 |
+
## Action Space (8 actions)
|
| 19 |
+
| Action | Description |
|
| 20 |
+
|--------|-------------|
|
| 21 |
+
| `inspect_logs` | Examine error logs for clues |
|
| 22 |
+
| `inspect_request` | Check the failed API request |
|
| 23 |
+
| `refresh_token` | Fix authentication errors (401) |
|
| 24 |
+
| `add_field` | Add missing required fields (400) |
|
| 25 |
+
| `wait_retry` | Handle rate limits (429) and timeouts (408) |
|
| 26 |
+
| `change_endpoint` | Fix wrong API endpoint (404) |
|
| 27 |
+
| `escalate` | Report server errors (500) to human |
|
| 28 |
+
| `resolve` | End episode after successful fix |
|
| 29 |
+
|
| 30 |
+
## Observation Space
|
| 31 |
+
| Field | Type | Description |
|
| 32 |
+
|-------|------|-------------|
|
| 33 |
+
| `step` | int | Current step number |
|
| 34 |
+
| `max_steps` | int | Maximum steps allowed (10) |
|
| 35 |
+
| `incident_summary` | str | Short problem description |
|
| 36 |
+
| `logs` | list | Error messages from API |
|
| 37 |
+
| `response_code` | int | HTTP status code |
|
| 38 |
+
| `fix_applied` | bool | Whether fix has been applied |
|
| 39 |
+
| `is_resolved` | bool | Whether episode ended |
|
| 40 |
+
|
| 41 |
+
## Tasks (Easy → Medium → Hard)
|
| 42 |
+
|
| 43 |
+
### Easy Task: Authentication Error
|
| 44 |
+
- **Incident:** `auth_error`
|
| 45 |
+
- **Correct fix:** `refresh_token` → `resolve`
|
| 46 |
+
- **Score achieved:** 1.0
|
| 47 |
+
|
| 48 |
+
### Medium Task: Missing Field Error
|
| 49 |
+
- **Incident:** `missing_fields`
|
| 50 |
+
- **Correct fix:** `add_field` → `resolve`
|
| 51 |
+
- **Score achieved:** 1.0
|
| 52 |
+
|
| 53 |
+
### Hard Task: Server Error
|
| 54 |
+
- **Incident:** `server_error`
|
| 55 |
+
- **Correct fix:** `escalate` → `resolve`
|
| 56 |
+
- **Score achieved:** 1.0
|
| 57 |
+
|
| 58 |
+
## Reward System (6 factors)
|
| 59 |
+
| Factor | Reward |
|
| 60 |
+
|--------|--------|
|
| 61 |
+
| Correct fix action | +5 |
|
| 62 |
+
| Wrong action | -2 |
|
| 63 |
+
| Diagnostic action | +0.5 |
|
| 64 |
+
| Resolve with fix (success) | +15 |
|
| 65 |
+
| Resolve without fix | -10 |
|
| 66 |
+
| Max steps reached | -5 |
|
| 67 |
+
|
| 68 |
+
## Setup Instructions
|
| 69 |
+
|
| 70 |
+
### 1. Create virtual environment
|
| 71 |
+
```bash
|
| 72 |
+
python -m venv venv
|
| 73 |
+
source venv/bin/activate # Linux/Mac
|
| 74 |
+
venv\Scripts\activate # Windows
```

### 2. Install dependencies (required)
```bash
pip install -r requirements.txt
```

### 3. Run the demo, tests, and validation
```bash
python demo.py
pytest tests/test_env.py -v
python tests/test_graders.py
openenv validate
```
|
app.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# the main environment file
|
| 2 |
+
|
| 3 |
+
from fastapi import FastAPI
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
from environment.api_triage_env import APITriageEnv
|
| 6 |
+
|
| 7 |
+
# creating an app and environment
|
| 8 |
+
app = FastAPI()
|
| 9 |
+
env = APITriageEnv()
|
| 10 |
+
|
| 11 |
+
# defining a request model for /step endpoint
|
| 12 |
+
# for fastapi so that it can understand that we expecting a JSON with an action field that is a text dtype
|
| 13 |
+
class ActionRequest(BaseModel):
    """Request body for POST /step: a JSON object with one string field."""

    # Name of the action the agent wants to take (plain text).
    action: str
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@app.post("/reset")
def reset():
    """Start a new API debugging episode.

    Returns:
        A dict holding the seven observation fields of the fresh episode
        as returned by ``env.reset()``.
    """
    print("INFO : reset endpoint is called , new debugging session started ")
    state = env.reset()
    return {
        "step": state.step,
        "max_steps": state.max_steps,
        "incident_summary": state.incident_summary,
        "logs": state.logs,
        "response_code": state.response_code,
        "fix_applied": state.fix_applied,
        "is_resolved": state.is_resolved,
    }
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@app.get("/state")
def state():
    """Return the current observation of the episode.

    Returns:
        A dict holding the seven observation fields from ``env.state()``.

    Raises:
        HTTPException: 409 when no episode has been started yet.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    print("INFO : current state of the Episode as follows ")

    # Until the first reset the environment holds no incident, and building
    # an observation from it would fail with an unhandled error (HTTP 500).
    if env.incident is None:
        raise HTTPException(
            status_code=409,
            detail="No active episode - call POST /reset first",
        )

    current = env.state()
    return {
        "step": current.step,
        "max_steps": current.max_steps,
        "incident_summary": current.incident_summary,
        "logs": current.logs,
        "response_code": current.response_code,
        "fix_applied": current.fix_applied,
        "is_resolved": current.is_resolved,
    }
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@app.post("/step")
def step(request: ActionRequest):
    """Apply the agent's action to the environment and report what happened.

    ``request.action`` is what the agent wants to do (text); the returned
    observation is what the agent sees afterwards (object with 7 fields).

    Args:
        request: Parsed JSON body carrying the action name.

    Returns:
        A dict with ``observation`` (the seven observation fields),
        ``reward``, ``done`` and ``info`` as produced by ``env.step``.

    Raises:
        HTTPException: 409 when no episode has been started yet.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    action = request.action
    print(f"INFO : Action received: {action}")

    # Stepping before the first reset would hit an environment that has no
    # incident and fail with an unhandled error (HTTP 500).
    if env.incident is None:
        raise HTTPException(
            status_code=409,
            detail="No active episode - call POST /reset first",
        )

    # Delegate the actual processing to APITriageEnv.step().
    observation, reward, done, info = env.step(action)

    return {
        "observation": {
            "step": observation.step,
            "max_steps": observation.max_steps,
            "incident_summary": observation.incident_summary,
            "logs": observation.logs,
            "response_code": observation.response_code,
            "fix_applied": observation.fix_applied,
            "is_resolved": observation.is_resolved,
        },
        "reward": reward,
        "done": done,
        "info": info,
    }
|
| 91 |
+
|
| 92 |
+
def main():
    """Serve the FastAPI application with uvicorn on 0.0.0.0:7860."""
    # Imported here so merely importing this module does not require uvicorn.
    import uvicorn

    uvicorn.run("app:app", host="0.0.0.0", port=7860)


if __name__ == "__main__":
    main()
|
demo.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from environment.api_triage_env import APITriageEnv
|
| 2 |
+
|
| 3 |
+
print("API Triage Agent Demo")

# Create an environment instance with a 10-step budget per episode.
env = APITriageEnv(max_steps=10)

# Start a new episode.
state = env.reset()

# The demo reads the known fix straight from the incident instead of
# inferring it from the logs.
correct_action = env.incident["fix_action"]

# Show the initial state of the environment.
print(f"\nIncident: {state.incident_summary}")
print(f"Response Code: {state.response_code}")
print(f"logs: {state.logs}")

# Scripted happy path: diagnose, apply the fix, then close the episode.
actions = ["inspect_logs", correct_action, "resolve"]

for action in actions:
    print(f"\nTaking action: {action}")
    state, reward, done, info = env.step(action)
    print(f"Reward: {reward}")
    print(f"fix applied: {state.fix_applied}")

    if done:
        print(f"\n[EPISODE END] Resolution: {info.get('resolution')}")
        print(f"Total Reward: {info.get('total_reward')}")
        break
|
| 32 |
+
|
environment/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file tells Python that 'environment' is a package
|
| 2 |
+
# It allows us to import from this folder
|
environment/action_space.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
what we will do here is that we gonna define
|
| 3 |
+
every action that our ai agent
|
| 4 |
+
can take so that we can debug the API failures,
|
| 5 |
+
or
|
| 6 |
+
this file contains ->
|
| 7 |
+
* a list of action names - storing all valid actions,
|
| 8 |
+
* a validation function - check if action is allowed,
|
| 9 |
+
* a get all actions function - return the list for grading .
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
# each action = one thing the agent can do to debug the API
|
| 13 |
+
|
| 14 |
+
VALID_ACTIONS = [
    # Diagnostic actions -> help the agent understand the problem before fixing
    "inspect_logs",
    "inspect_request",

    # Fix actions -> these actually solve the problem
    "refresh_token",    # auth_error
    "add_field",        # missing_fields
    "wait_retry",       # two incidents: rate_limit and timeout
    "change_endpoint",  # wrong_endpoint
    "escalate",         # server_error

    # Terminal action -> ends the episode
    "resolve",
]


def is_valid_action(action):
    """Return True when *action* is one of the entries in VALID_ACTIONS."""
    return action in VALID_ACTIONS


def get_all_actions():
    """Return a copy of VALID_ACTIONS.

    A copy is handed out so that a caller modifying the result leaves the
    module-level list untouched.
    """
    return VALID_ACTIONS.copy()
|
| 37 |
+
|
environment/api_triage_env.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
The three main methods implementation ->
|
| 3 |
+
1. reset() - start of each episode
|
| 4 |
+
2. step(action) - agent takes an action
|
| 5 |
+
3. state() - called anytime
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
1. For incident selection - curriculum learning approach (easy -> medium -> hard)
|
| 10 |
+
2. For Reward factors - 5 factors (correct, wrong, resolve with/without fix, max steps)
|
| 11 |
+
3. For episode end conditions - resolved with fix , resolved without fix , max steps reached
|
| 12 |
+
4. For action space - 8 actions(including diagnostic , fix , terminal)
|
| 13 |
+
5. For max steps - 10 steps per episode
|
| 14 |
+
6. For reward - range is -20 to +20
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
"""
|
| 18 |
+
extra info ->
|
| 19 |
+
1. stage 1 episodes -> 1-10
|
| 20 |
+
2. stage 2 episodes -> 11-25
|
| 21 |
+
3. stage 3 episodes -> 26+
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
"""
|
| 25 |
+
Our 3 models->
|
| 26 |
+
1. observation- what agent sees at each step
|
| 27 |
+
2. action - what agent can do at each step
|
| 28 |
+
3. EnvState - internal tracking of the environment
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import random
|
| 32 |
+
from typing import Dict, Any, Tuple, Optional, List
|
| 33 |
+
from pydantic import BaseModel
|
| 34 |
+
|
| 35 |
+
from environment.incident_generator import get_random_incident, get_incident_by_type
|
| 36 |
+
from environment.action_space import is_valid_action
|
| 37 |
+
from environment.reward import calculate_reward
|
| 38 |
+
|
| 39 |
+
class Observation(BaseModel):
    """What the agent sees at each step."""

    step: int               # steps taken so far in the current episode
    max_steps: int          # step budget of the episode
    incident_summary: str   # the incident's "summary" text
    logs: List[str]         # the incident's "logs" entries
    response_code: int      # the incident's "code" value
    fix_applied: bool       # True once the incident's fix action was taken
    is_resolved: bool       # mirrors the environment's "done" flag
|
| 47 |
+
|
| 48 |
+
class Action(BaseModel):
    """What the agent can do at each step: a single named action."""

    action_name: str  # name of the action to perform
|
| 50 |
+
|
| 51 |
+
class EnvState(BaseModel):
    """Internal tracking of the environment.

    NOTE(review): nothing in the visible code instantiates this model;
    APITriageEnv keeps the same values as plain attributes.
    """

    current_incident: Dict[str, Any]  # incident dict being debugged
    step_counter: int                 # steps taken in the episode
    fix_applied: bool                 # whether the fix action was taken
    total_reward: float               # reward accumulated in the episode
    is_resolved: bool                 # whether the episode has ended
|
| 57 |
+
|
| 58 |
+
class APITriageEnv:
    """Simulated API-failure triage environment.

    An episode is started with reset(), advanced with step(action) and can
    be inspected with state().  Incident selection follows a curriculum
    keyed on how many times reset() has been called on this instance:
    episodes 1-10 draw from auth_error / missing_fields, episodes 11-25
    from rate_limit / timeout / wrong_endpoint, and later episodes always
    use server_error.
    """

    def __init__(self, max_steps=10):
        """Create an idle environment; call reset() before stepping.

        Args:
            max_steps: Maximum number of steps allowed per episode.
        """
        self.max_steps = max_steps
        self.step_counter = 0
        self.done = False
        self.incident = None       # assigned by reset()
        self.fix_applied = False
        self.total_reward = 0.0
        self.total_episodes = 0    # number of reset() calls; drives the curriculum

    def _require_incident(self):
        """Return the active incident or fail with a clear message."""
        if self.incident is None:
            raise RuntimeError("no active episode: call reset() before state() or step()")
        return self.incident

    def reset(self):
        """Start a new episode and return its first observation."""
        self.step_counter = 0
        self.done = False
        self.fix_applied = False
        self.total_reward = 0.0
        self.total_episodes += 1

        # Curriculum learning: easy -> medium -> hard.
        if self.total_episodes <= 10:
            # stage 1 -> easy incidents
            incident_type = random.choice(["auth_error", "missing_fields"])
        elif self.total_episodes <= 25:
            # stage 2 -> medium incidents
            incident_type = random.choice(["rate_limit", "timeout", "wrong_endpoint"])
        else:
            # stage 3 -> hard incidents
            incident_type = "server_error"
        self.incident = get_incident_by_type(incident_type)

        return self.state()

    def state(self):
        """Return what the agent sees at the current step.

        Raises:
            RuntimeError: If no incident is active (reset() not called yet).
        """
        incident = self._require_incident()
        return Observation(
            step=self.step_counter,
            max_steps=self.max_steps,
            incident_summary=incident["summary"],
            logs=incident["logs"],
            response_code=incident["code"],
            fix_applied=self.fix_applied,
            is_resolved=self.done,
        )

    def step(self, action):
        """Apply *action* and return ``(observation, reward, done, info)``.

        Every call on a running episode consumes one step, including calls
        with an invalid action, and every returned reward is added to the
        episode's total reward.

        Args:
            action: Name of the action chosen by the agent.

        Returns:
            A 4-tuple of the new Observation, the step reward, the done
            flag and an info dict.  ``info`` carries ``"error"`` for an
            invalid action or an already finished episode, and
            ``"resolution"`` once the episode ends.

        Raises:
            RuntimeError: If no incident is active (reset() not called yet).
        """
        incident = self._require_incident()

        # 1. Episode already finished: nothing changes, no reward.
        if self.done:
            return self.state(), 0.0, True, {"error": "episode is already finished "}

        # 2. Every attempt consumes a step.
        self.step_counter += 1

        # 3. Score the action; invalid actions get the wrong-action penalty.
        valid = is_valid_action(action)
        if valid:
            # fix_applied is passed as it was *before* this action so the
            # first correct fix is rewarded as such.
            reward = calculate_reward(
                action, incident, self.fix_applied, self.step_counter, self.max_steps
            )
            if action == incident["fix_action"]:
                self.fix_applied = True
        else:
            reward = -2.0

        # 4. Keep the running total in sync with every reward handed out.
        self.total_reward += reward

        # 5. Info returned for all cases.
        info = {
            "step": self.step_counter,
            "incident_type": incident["type"],
            "fix_applied": self.fix_applied,
            "total_reward": self.total_reward,
        }
        if not valid:
            info["error"] = "the action is not valid"

        # 6. Episode end: an explicit resolve decides the outcome; running
        #    out of steps only applies when the agent did not resolve, so a
        #    resolve on the final step is not relabelled as a step-limit
        #    failure.
        if valid and action == "resolve":
            self.done = True
            info["resolution"] = "success" if self.fix_applied else "failure - resolved without fix"
        elif self.step_counter >= self.max_steps:
            self.done = True
            info["resolution"] = "failure - max steps reached"

        return self.state(), reward, self.done, info
|
| 155 |
+
|
| 156 |
+
|
environment/incident_generator.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" what it does :
|
| 2 |
+
it stores different API failure scenarios.
|
| 3 |
+
When the agent starts a new episode,
|
| 4 |
+
this file picks a random failure for the agent to debug.
|
| 5 |
+
"""
|
| 6 |
+
import random
|
| 7 |
+
|
| 8 |
+
# Catalogue of incidents; different episodes debug different entries.
# Each incident has: "type" (identifier), "summary" (short description),
# "logs" (list of log lines), "fix_action" (the action that fixes it) and
# "code" (HTTP status code).
INCIDENTS = [
    {
        "type": "auth_error",
        "summary": "401 Unauthorized - API key expired",
        "logs": ["ERROR : there is invalid API key", "HINT : your key expired already "],
        "fix_action": "refresh_token",
        "code": 401,
    },
    {
        "type": "missing_fields",
        "summary": "400 bad request - email field is missing please check",
        "logs": ["ERROR :Required field 'email' cannot be found", "HINT : please add email to the request body "],
        "fix_action": "add_field",
        "code": 400,
    },
    {
        "type": "rate_limit",
        "summary": "429 too many requests - slow down ",
        "logs": ["ERROR : the rate limit is exceeded ", "HINT : please wait for 1 minute before retrying "],
        "fix_action": "wait_retry",
        "code": 429,
    },
    {
        "type": "wrong_endpoint",
        "summary": "404 Not Found - API endpoint doesn't exist",
        "logs": ["ERROR: POST /api/v2/data returned 404", "Hint: Use /api/v3/data instead"],
        "fix_action": "change_endpoint",
        "code": 404,
    },
    {
        "type": "server_error",
        "summary": "500 Internal Server Error",
        "logs": ["ERROR: Database connection failed", "Hint: Cannot fix automatically - escalate"],
        "fix_action": "escalate",
        "code": 500,
    },
    {
        "type": "timeout",
        "summary": "request took too long",
        "logs": ["ERROR : request timed out after 45 seconds", "Hint: server may be overloaded , please try again with backoff"],
        "fix_action": "wait_retry",
        "code": 408,
    },
]


def get_random_incident():
    """Return a copy of an incident picked with ``random.choice``.

    A deep copy is returned so callers cannot alter the shared catalogue.
    """
    import copy

    return copy.deepcopy(random.choice(INCIDENTS))


def get_incident_by_type(incident_type):
    """Return a copy of the incident whose ``"type"`` equals *incident_type*.

    Useful for grading, where a specific incident must be forced.

    Args:
        incident_type: Identifier such as ``"auth_error"``.

    Returns:
        A deep copy of the matching incident dict (so mutating it, including
        its ``"logs"`` list, does not affect later lookups), or ``None``
        when no incident has that type.
    """
    import copy

    for incident in INCIDENTS:
        if incident["type"] == incident_type:
            return copy.deepcopy(incident)
    return None
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
|
environment/reward.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
here we gonna define the reward function for our agent,
|
| 3 |
+
so that it can learn or adapt the environment and
|
| 4 |
+
able to get/achieve the rewards for the actions it takes in the environment.
|
| 5 |
+
OR
|
| 6 |
+
Per step reward
|
| 7 |
+
"""
|
| 8 |
+
# The rewarding system we writing here will be within the scale of -20 to +20.
|
| 9 |
+
"""
|
| 10 |
+
The factors we are using (5 factors):
|
| 11 |
+
1. Correct action = positive reward (2 to 10)
|
| 12 |
+
2. Wrong action = negative reward (-1 to -3)
|
| 13 |
+
3. Resolve with FIX (Episode success) = large positive reward (+10 to +15)
|
| 14 |
+
4. Resolve WITHOUT FIX (Prevents lying) = negative reward (-5 to -10)
|
| 15 |
+
5. Max steps reached (Episode failure) = negative reward (-5)
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
def calculate_reward(action, incident, fix_applied, step, max_steps):
    """Return the per-step reward for *action*.

    Rules are checked in priority order and the first match wins:

    1. ``resolve`` without a fix applied        -> -10.0
    2. ``resolve`` after the fix was applied     -> +15.0
    3. step budget used up (step >= max_steps)   -> -5.0
    4. the incident's fix action, not yet applied -> +5.0
    5. a diagnostic action                        -> +0.5
    6. anything else (wrong action)               -> -2.0

    Args:
        action: Name of the action the agent took.
        incident: Incident dict; only ``incident["fix_action"]`` is read.
        fix_applied: Whether the fix had been applied before this action.
        step: Number of the current step (1-based as used by the caller).
        max_steps: Step budget of the episode.

    Returns:
        The reward as a float.
    """
    if action == "resolve":
        # Claiming success without a fix is penalised hardest; a genuine
        # success is rewarded even on the final allowed step, so it is
        # evaluated before the step-budget penalty.
        return 15.0 if fix_applied else -10.0

    # Agent ran out of steps without resolving - penalty.
    if step >= max_steps:
        return -5.0

    # First application of the correct fix.
    if action == incident["fix_action"] and not fix_applied:
        return 5.0

    # Diagnostic actions are helpful but do not fix anything.
    if action in ("inspect_logs", "inspect_request"):
        return 0.5

    # Wrong (or repeated) action.
    return -2.0
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
inference.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
import textwrap
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
|
| 8 |
+
from environment.api_triage_env import APITriageEnv
|
| 9 |
+
from environment.action_space import get_all_actions
|
| 10 |
+
from environment.incident_generator import get_incident_by_type
|
| 11 |
+
|
| 12 |
+
# ============================================
|
| 13 |
+
# Environment Variables
|
| 14 |
+
# ============================================
|
| 15 |
+
# Endpoint, model and credentials are read from the process environment.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN")
API_KEY = HF_TOKEN

TASK_NAME = os.getenv("TASK_NAME", "api_triage")
BENCHMARK = os.getenv("BENCHMARK", "api_triage_agent")
MAX_STEPS = 10
TEMPERATURE = 0.7
MAX_TOKENS = 50
SUCCESS_SCORE_THRESHOLD = 0.5

# ============================================
# System Prompt
# ============================================
AVAILABLE_ACTIONS = get_all_actions()

SYSTEM_PROMPT = textwrap.dedent(
    f"""
    You are an API debugging agent. Your job is to diagnose and fix API failures.
    Available actions: {AVAILABLE_ACTIONS}
    Rules:
    - First use "inspect_logs" to understand the problem
    - Then take the correct fix action based on the error
    - Finally use "resolve" to end the episode
    Reply with ONLY the action name. No explanations. No quotes.
    """
).strip()
|
| 43 |
+
|
| 44 |
+
# ============================================
|
| 45 |
+
# Logging Functions (Required Format)
|
| 46 |
+
# ============================================
|
| 47 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line naming the task, environment and model."""
    print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 49 |
+
|
| 50 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one ``[STEP]`` line.

    The reward is printed with two decimals, ``done`` as lowercase
    ``true``/``false``, and a missing or empty ``error`` as ``null``.
    """
    error_val = error if error else "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
|
| 57 |
+
|
| 58 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` line for a finished task.

    The score is printed with three decimals and the rewards as a
    comma-separated list with two decimals each.
    """
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
|
| 61 |
+
|
| 62 |
+
# ============================================
|
| 63 |
+
# Prompt Builder
|
| 64 |
+
# ============================================
|
| 65 |
+
def build_user_prompt(step: int, observation, last_reward: float, history: List[str]) -> str:
    """Build the user message describing the current situation to the model.

    Args:
        step: Number of the step about to be taken.
        observation: Object exposing ``incident_summary``, ``response_code``,
            ``logs`` and ``fix_applied`` attributes.
        last_reward: Reward received for the previous action.
        history: Earlier action descriptions; only the last four are shown.

    Returns:
        The prompt text, one field per line, without surrounding whitespace.
    """
    history_block = "\n".join(history[-4:]) if history else "None"

    # The lines are joined explicitly instead of dedenting an indented
    # f-string template: interpolated multi-line history shares no common
    # indentation with such a template, which turns textwrap.dedent into a
    # no-op and leaves the template's leading spaces in the prompt.
    prompt_lines = [
        f"Step: {step}",
        f"Incident: {observation.incident_summary}",
        f"Response Code: {observation.response_code}",
        f"Logs: {observation.logs}",
        f"Fix Applied: {observation.fix_applied}",
        f"Last Reward: {last_reward:.2f}",
        "Previous Actions:",
        history_block,
        "",
        f"Choose an action from: {AVAILABLE_ACTIONS}",
        "Reply with ONLY the action name.",
    ]
    return "\n".join(prompt_lines).strip()
|
| 82 |
+
|
| 83 |
+
# ============================================
|
| 84 |
+
# LLM Caller
|
| 85 |
+
# ============================================
|
| 86 |
+
def get_model_action(client: OpenAI, step: int, observation, last_reward: float, history: List[str]) -> str:
    """Ask the model for the next action and return a valid action name.

    The reply is lower-cased and stripped of surrounding whitespace, quotes,
    backticks and punctuation.  If that is still not a known action, the
    reply is scanned for action names and accepted only when exactly one
    distinct action is mentioned.  Any failure (request error, empty or
    ambiguous reply) falls back to ``"inspect_logs"``.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        step: Number of the step about to be taken.
        observation: Current observation passed on to the prompt builder.
        last_reward: Reward received for the previous action.
        history: Earlier action descriptions.

    Returns:
        An entry of ``AVAILABLE_ACTIONS``.
    """
    import re

    user_prompt = build_user_prompt(step, observation, last_reward, history)
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        reply = (completion.choices[0].message.content or "").strip().lower()

        # Models often wrap the answer in quotes/backticks or add a period.
        action = reply.strip("`'\"*., \t\r\n")
        if action not in AVAILABLE_ACTIONS:
            # Accept a longer reply only when it names exactly one action,
            # so an ambiguous answer is never guessed at.
            mentioned = {
                token
                for token in re.findall(r"[a-z_]+", reply)
                if token in AVAILABLE_ACTIONS
            }
            if len(mentioned) == 1:
                action = mentioned.pop()

        if action not in AVAILABLE_ACTIONS:
            print(f"[DEBUG] Invalid action '{reply}', defaulting to inspect_logs", flush=True)
            return "inspect_logs"
        return action
    except Exception as exc:
        # Deliberate best-effort: a failed model call must not abort the
        # episode, so log it and fall back to a harmless diagnostic action.
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return "inspect_logs"
|
| 108 |
+
|
| 109 |
+
# ============================================
|
| 110 |
+
# Main Async Function
|
| 111 |
+
# ============================================
|
| 112 |
+
async def main() -> None:
    """Run one forced-incident episode per task and log every result.

    Returns early with an error message when ``API_KEY`` is not set. For each
    task id the environment is reset, the matching incident is forced, and the
    model is queried for up to ``MAX_STEPS`` actions. A task that raises is
    logged as a failure and the loop continues with the next task.
    """
    if not API_KEY:
        print("[ERROR] HF_TOKEN environment variable not set", flush=True)
        return

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    env = APITriageEnv(max_steps=MAX_STEPS)

    # All 6 task IDs matching openenv.yaml — each evaluated explicitly
    task_ids = ["auth_error", "missing_fields", "rate_limit", "timeout", "wrong_endpoint", "server_error"]

    # NOTE(review): log_start runs once while log_end runs once per task;
    # confirm the log consumer expects this pairing.
    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)

    for tid in task_ids:
        history: List[str] = []
        rewards: List[float] = []
        steps_taken = 0
        success = False

        try:
            # Reset env and FORCE the specific incident type (no randomness)
            env.reset()
            env.incident = get_incident_by_type(tid)
            observation = env.state()  # refresh observation with forced incident

            last_reward = 0.0

            for step in range(1, MAX_STEPS + 1):
                action = get_model_action(client, step, observation, last_reward, history)
                observation, reward, done, info = env.step(action)

                rewards.append(reward)
                steps_taken = step
                last_reward = reward

                log_step(step=step, action=action, reward=reward, done=done, error=None)
                history.append(f"Step {step}: {action} -> reward {reward:.2f}")

                if done:
                    success = info.get("resolution") == "success"
                    break

            # Score strictly between 0 and 1
            task_score = 0.95 if success else 0.05
            log_end(success=success, steps=steps_taken, score=task_score, rewards=rewards)

        except Exception as e:
            print(f"[DEBUG] Error in task {tid}: {e}", flush=True)
            # Report the progress made before the failure rather than
            # pretending no step was taken; keep the [0.0] placeholder only
            # when no reward was collected at all.
            log_end(success=False, steps=steps_taken, score=0.05, rewards=rewards or [0.0])
|
| 161 |
+
|
| 162 |
+
# ============================================
|
| 163 |
+
# Run
|
| 164 |
+
# ============================================
|
| 165 |
+
if __name__ == "__main__":
|
| 166 |
+
asyncio.run(main())
|
openenv.yaml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: api-triage-agent
|
| 2 |
+
version: 1.0.0
|
| 3 |
+
description: AI agent debugs API failures using curriculum learning
|
| 4 |
+
|
| 5 |
+
environment:
|
| 6 |
+
class: environment.api_triage_env.APITriageEnv
|
| 7 |
+
max_steps: 10
|
| 8 |
+
|
| 9 |
+
observation_space:
|
| 10 |
+
step: int
|
| 11 |
+
max_steps: int
|
| 12 |
+
incident_summary: str
|
| 13 |
+
logs: list
|
| 14 |
+
response_code: int
|
| 15 |
+
fix_applied: bool
|
| 16 |
+
is_resolved: bool
|
| 17 |
+
|
| 18 |
+
action_space:
|
| 19 |
+
- inspect_logs
|
| 20 |
+
- inspect_request
|
| 21 |
+
- refresh_token
|
| 22 |
+
- add_field
|
| 23 |
+
- wait_retry
|
| 24 |
+
- change_endpoint
|
| 25 |
+
- escalate
|
| 26 |
+
- resolve
|
| 27 |
+
|
| 28 |
+
reward_range: [-20, 20]
|
| 29 |
+
|
| 30 |
+
tasks:
|
| 31 |
+
- id: "auth_error"
|
| 32 |
+
name: "Authentication Error"
|
| 33 |
+
description: "Diagnose and fix a 401 Unauthorized error caused by an expired API key"
|
| 34 |
+
difficulty: "easy"
|
| 35 |
+
grader: "tests.test_graders:grade_auth_error"
|
| 36 |
+
- id: "missing_fields"
|
| 37 |
+
name: "Missing Field Error"
|
| 38 |
+
description: "Diagnose and fix a 400 Bad Request error caused by a missing required field"
|
| 39 |
+
difficulty: "easy"
|
| 40 |
+
grader: "tests.test_graders:grade_missing_fields"
|
| 41 |
+
- id: "rate_limit"
|
| 42 |
+
name: "Rate Limit Error"
|
| 43 |
+
description: "Diagnose and fix a 429 Too Many Requests error by applying retry logic"
|
| 44 |
+
difficulty: "medium"
|
| 45 |
+
grader: "tests.test_graders:grade_rate_limit"
|
| 46 |
+
- id: "timeout"
|
| 47 |
+
name: "Timeout Error"
|
| 48 |
+
description: "Diagnose and fix a 408 Request Timeout error by applying retry with backoff"
|
| 49 |
+
difficulty: "medium"
|
| 50 |
+
grader: "tests.test_graders:grade_timeout"
|
| 51 |
+
- id: "wrong_endpoint"
|
| 52 |
+
name: "Wrong Endpoint Error"
|
| 53 |
+
description: "Diagnose and fix a 404 Not Found error by changing to the correct endpoint"
|
| 54 |
+
difficulty: "medium"
|
| 55 |
+
grader: "tests.test_graders:grade_wrong_endpoint"
|
| 56 |
+
- id: "server_error"
|
| 57 |
+
name: "Server Error"
|
| 58 |
+
description: "Diagnose and handle a 500 Internal Server Error by escalating appropriately"
|
| 59 |
+
difficulty: "hard"
|
| 60 |
+
grader: "tests.test_graders:grade_server_error"
|
pyproject.toml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61.0", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "api-triage-agent"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "OpenEnv environment for API debugging with AI agents"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"pydantic>=2.0.0",
|
| 13 |
+
"openai>=1.0.0",
|
| 14 |
+
"numpy>=1.24.0",
|
| 15 |
+
"pytest>=7.0.0",
|
| 16 |
+
"fastapi>=0.100.0",
|
| 17 |
+
"uvicorn>=0.23.0",
|
| 18 |
+
"openenv-core>=0.2.0",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
[project.scripts]
|
| 22 |
+
server = "app:main"
|
| 23 |
+
|
| 24 |
+
[tool.openenv]
|
| 25 |
+
environment-class = "environment.api_triage_env.APITriageEnv"
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# for installing all the packages for the code to run
|
| 2 |
+
openenv-core>=0.2.0
|
| 3 |
+
openai>=1.0.0
|
| 4 |
+
pydantic>=2.0.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
pytest>=7.0.0
|
| 7 |
+
fastapi>=0.100.0
|
| 8 |
+
uvicorn>=0.23.0
|
server/app.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FastAPI server that exposes the API triage environment over HTTP
|
| 2 |
+
|
| 3 |
+
from fastapi import FastAPI
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
from environment.api_triage_env import APITriageEnv
|
| 6 |
+
|
| 7 |
+
# creating an app and environment
|
| 8 |
+
app = FastAPI()
|
| 9 |
+
env = APITriageEnv()
|
| 10 |
+
|
| 11 |
+
# defining a request model for /step endpoint
|
| 12 |
+
# It tells FastAPI to expect a JSON body with an `action` field of type string.
|
| 13 |
+
class ActionRequest(BaseModel):
    # Request body for POST /step: the name of the action the agent wants to take.
    action: str
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@app.post("/reset")
def reset():
    """Begin a new API debugging episode and return its initial observation."""
    print("INFO : reset endpoint is called , new debugging session started ")
    observation = env.reset()
    # Serialise the observation field by field, in the documented order.
    field_names = (
        "step",
        "max_steps",
        "incident_summary",
        "logs",
        "response_code",
        "fix_applied",
        "is_resolved",
    )
    return {name: getattr(observation, name) for name in field_names}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@app.get("/state")
def state():
    """Return the current observation of the running episode."""
    print("INFO : current state of the Episode as follows ")
    snapshot = env.state()
    # Serialise the observation field by field, in the documented order.
    field_names = (
        "step",
        "max_steps",
        "incident_summary",
        "logs",
        "response_code",
        "fix_applied",
        "is_resolved",
    )
    return {name: getattr(snapshot, name) for name in field_names}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@app.post("/step")
def step(request: ActionRequest):
    """Apply the agent's action to the environment and report the outcome.

    The response carries the new observation (seven fields), the reward,
    the done flag and the info dict returned by the environment.
    """
    chosen = request.action
    print(f"INFO : Action received: {chosen}")

    # Delegate the actual processing to APITriageEnv.step().
    result_obs, reward, done, info = env.step(chosen)

    observation_payload = {
        "step": result_obs.step,
        "max_steps": result_obs.max_steps,
        "incident_summary": result_obs.incident_summary,
        "logs": result_obs.logs,
        "response_code": result_obs.response_code,
        "fix_applied": result_obs.fix_applied,
        "is_resolved": result_obs.is_resolved,
    }
    return {
        "observation": observation_payload,
        "reward": reward,
        "done": done,
        "info": info,
    }
|
| 91 |
+
def main():
    """Serve the ``app:app`` ASGI application with uvicorn on 0.0.0.0:7860."""
    # Imported lazily so importing this module does not require uvicorn.
    from uvicorn import run

    run("app:app", host="0.0.0.0", port=7860)
|
| 94 |
+
|
| 95 |
+
if __name__ == "__main__":
|
| 96 |
+
main()
|
tasks/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Tasks package for OpenEnv grading
|
tasks/auth_error/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# auth_error task
|
tasks/auth_error/grader.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Grader for auth_error task: 401 Unauthorized - expired API key."""
|
| 2 |
+
|
| 3 |
+
from tasks.grading_helper import run_agent_on_incident
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def grade() -> float:
    """Score the auth_error task, clamped to the range [0.001, 0.999]."""
    raw = run_agent_on_incident("auth_error")
    capped = raw if raw < 0.999 else 0.999
    return capped if capped > 0.001 else 0.001
|
tasks/grading_helper.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared grading helper used by all per-task grader modules."""
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
# Ensure project root is on sys.path so environment package is importable
|
| 7 |
+
_project_root = str(Path(__file__).parent.parent)
|
| 8 |
+
if _project_root not in sys.path:
|
| 9 |
+
sys.path.insert(0, _project_root)
|
| 10 |
+
|
| 11 |
+
from environment.api_triage_env import APITriageEnv
|
| 12 |
+
from environment.incident_generator import get_incident_by_type
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def run_agent_on_incident(incident_type: str, max_steps: int = 10) -> float:
    """Replay the scripted inspect -> fix -> resolve sequence on one incident.

    Args:
        incident_type: Incident identifier passed to ``get_incident_by_type``.
        max_steps: Step budget handed to the environment.

    Returns:
        0.95 when the episode ends with resolution "success", 0.05 when it
        ends otherwise or the incident type is unknown, and 0.1 when the
        scripted actions finish without ending the episode.
    """
    env = APITriageEnv(max_steps=max_steps)

    # Pin the incident directly instead of going through the curriculum.
    env.incident = get_incident_by_type(incident_type)
    if env.incident is None:
        return 0.05

    # Put the episode bookkeeping back into its initial state by hand.
    env.fix_applied = False
    env.done = False
    env.step_counter = 0
    env.total_reward = 0.0

    scripted = ("inspect_logs", env.incident["fix_action"], "resolve")
    for move in scripted:
        _, _, finished, details = env.step(move)
        if not finished:
            continue
        return 0.95 if details.get("resolution") == "success" else 0.05

    return 0.1
|
tasks/missing_fields/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# missing_fields task
|
tasks/missing_fields/grader.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Grader for missing_fields task: 400 Bad Request - missing email field."""
|
| 2 |
+
|
| 3 |
+
from tasks.grading_helper import run_agent_on_incident
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def grade() -> float:
    """Score the missing_fields task, clamped to the range [0.001, 0.999]."""
    raw = run_agent_on_incident("missing_fields")
    capped = raw if raw < 0.999 else 0.999
    return capped if capped > 0.001 else 0.001
|
tasks/rate_limit/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# rate_limit task
|
tasks/rate_limit/grader.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Grader for rate_limit task: 429 Too Many Requests."""
|
| 2 |
+
|
| 3 |
+
from tasks.grading_helper import run_agent_on_incident
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def grade() -> float:
    """Score the rate_limit task, clamped to the range [0.001, 0.999]."""
    raw = run_agent_on_incident("rate_limit")
    capped = raw if raw < 0.999 else 0.999
    return capped if capped > 0.001 else 0.001
|
tasks/timeout/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# timeout task
|
tasks/timeout/grader.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Grader for timeout task: 408 Request Timeout."""
|
| 2 |
+
|
| 3 |
+
from tasks.grading_helper import run_agent_on_incident
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def grade() -> float:
    """Score the timeout task, clamped to the range [0.001, 0.999]."""
    raw = run_agent_on_incident("timeout")
    capped = raw if raw < 0.999 else 0.999
    return capped if capped > 0.001 else 0.001
|
tasks/wrong_endpoint/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# wrong_endpoint task
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# To run these tests, execute the following command in the terminal: py -m pytest tests/test_env.py -v
|
| 2 |
+
# py -m pytest tests/test_env.py -v in the terminal
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from environment.api_triage_env import APITriageEnv
|
| 6 |
+
from environment.incident_generator import get_incident_by_type
|
| 7 |
+
|
| 8 |
+
def test_reset():
    """reset() yields step 0, the configured max_steps and a populated incident."""
    env = APITriageEnv(max_steps=10)
    initial = env.reset()

    # Counters start fresh and reflect the constructor argument.
    assert initial.step == 0
    assert initial.max_steps == 10
    # The observation must describe an incident with a known status code.
    assert initial.incident_summary is not None
    assert initial.logs is not None
    assert initial.response_code in [400, 404, 401, 429, 500, 408]
|
| 18 |
+
|
| 19 |
+
def test_action_is_valid():
    """Each recognised fix action yields a float reward and a boolean done flag."""
    env = APITriageEnv()
    env.reset()
    for candidate in ("add_field", "wait_retry", "refresh_token", "change_endpoint", "escalate"):
        _, reward, done, _ = env.step(candidate)
        # Only the types are checked; the incident chosen by reset() is not pinned here.
        assert isinstance(reward, float)
        assert isinstance(done, bool)
|
| 29 |
+
|
| 30 |
+
def test_action_is_invalid():
    """An unknown action is penalised with -2.0 and does not end the episode."""
    env = APITriageEnv()
    env.reset()
    _, reward, done, _ = env.step("fake_action_123")
    assert reward == -2.0
    assert done is False
|
| 37 |
+
|
| 38 |
+
def test_episode_successful():
    """For a forced auth_error, the episode ends on resolve and not on the fix itself."""
    env = APITriageEnv()
    # Pin the incident and initialise the episode flags by hand.
    env.incident = get_incident_by_type("auth_error")
    env.fix_applied = False
    env.done = False
    env.total_reward = 0.0

    env.step("inspect_logs")
    _, _, done, _ = env.step("refresh_token")
    assert done is False

    _, _, done, _ = env.step("resolve")
    assert done is True
|
| 50 |
+
|
| 51 |
+
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Grader functions for OpenEnv task validation
|
| 2 |
+
# Each function is referenced in openenv.yaml as tests.test_graders:grade_<task_id>
|
| 3 |
+
# Grader functions must return a float score between 0.0 and 1.0
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 8 |
+
|
| 9 |
+
from environment.api_triage_env import APITriageEnv
|
| 10 |
+
from environment.incident_generator import get_incident_by_type
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _run_agent_on_incident(incident_type, max_steps=10):
    """Replay the scripted inspect -> fix -> resolve sequence on one incident.

    Args:
        incident_type: Incident identifier passed to ``get_incident_by_type``.
        max_steps: Step budget handed to the environment.

    Returns:
        0.95 when the episode ends with resolution "success", 0.05 when it
        ends otherwise or the incident type is unknown, and 0.1 when the
        scripted actions finish without ending the episode.
    """
    env = APITriageEnv(max_steps=max_steps)

    # Force the specific incident (bypass curriculum randomness)
    env.incident = get_incident_by_type(incident_type)
    if env.incident is None:
        # Same guard as tasks/grading_helper.py: an unknown incident type is
        # scored as a failure instead of raising TypeError on the lookup below.
        return 0.05

    env.fix_applied = False
    env.done = False
    env.step_counter = 0
    env.total_reward = 0.0

    # Get the correct fix action for this incident
    correct_action = env.incident["fix_action"]

    # Optimal action sequence: inspect -> fix -> resolve
    actions = ["inspect_logs", correct_action, "resolve"]

    for action in actions:
        _, _, done, info = env.step(action)
        if done:
            return 0.95 if info.get("resolution") == "success" else 0.05

    return 0.1
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ============================================
|
| 39 |
+
# Per-task grader functions (referenced in openenv.yaml)
|
| 40 |
+
# ============================================
|
| 41 |
+
|
| 42 |
+
def grade_auth_error():
    """Return the auth_error score (401 Unauthorized, expired API key), checked to lie in [0, 1]."""
    result = _run_agent_on_incident("auth_error")
    assert 0.0 <= result <= 1.0, f"Score {result} out of range"
    return result
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def grade_missing_fields():
    """Return the missing_fields score (400 Bad Request, missing email field), checked to lie in [0, 1]."""
    result = _run_agent_on_incident("missing_fields")
    assert 0.0 <= result <= 1.0, f"Score {result} out of range"
    return result
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def grade_rate_limit():
    """Return the rate_limit score (429 Too Many Requests), checked to lie in [0, 1]."""
    result = _run_agent_on_incident("rate_limit")
    assert 0.0 <= result <= 1.0, f"Score {result} out of range"
    return result
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def grade_timeout():
    """Return the timeout score (408 Request Timeout), checked to lie in [0, 1]."""
    result = _run_agent_on_incident("timeout")
    assert 0.0 <= result <= 1.0, f"Score {result} out of range"
    return result
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def grade_wrong_endpoint():
    """Return the wrong_endpoint score (404 Not Found), checked to lie in [0, 1]."""
    result = _run_agent_on_incident("wrong_endpoint")
    assert 0.0 <= result <= 1.0, f"Score {result} out of range"
    return result
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def grade_server_error():
    """Return the server_error score (500 Internal Server Error), checked to lie in [0, 1]."""
    result = _run_agent_on_incident("server_error")
    assert 0.0 <= result <= 1.0, f"Score {result} out of range"
    return result
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ============================================
|
| 85 |
+
# Pytest-compatible test wrappers
|
| 86 |
+
# ============================================
|
| 87 |
+
|
| 88 |
+
def test_grade_auth_error():
    """The scripted run for auth_error must score above 0.5."""
    result = grade_auth_error()
    assert result > 0.5, f"auth_error grader returned low score: {result}"
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def test_grade_missing_fields():
    """The scripted run for missing_fields must score above 0.5."""
    result = grade_missing_fields()
    assert result > 0.5, f"missing_fields grader returned low score: {result}"
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def test_grade_rate_limit():
    """The scripted run for rate_limit must score above 0.5."""
    result = grade_rate_limit()
    assert result > 0.5, f"rate_limit grader returned low score: {result}"
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def test_grade_timeout():
    """The scripted run for timeout must score above 0.5."""
    result = grade_timeout()
    assert result > 0.5, f"timeout grader returned low score: {result}"
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def test_grade_wrong_endpoint():
    """The scripted run for wrong_endpoint must score above 0.5."""
    result = grade_wrong_endpoint()
    assert result > 0.5, f"wrong_endpoint grader returned low score: {result}"
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def test_grade_server_error():
    """The scripted run for server_error must score above 0.5."""
    result = grade_server_error()
    assert result > 0.5, f"server_error grader returned low score: {result}"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
print(f"auth_error score: {grade_auth_error()}")
|
| 120 |
+
print(f"missing_fields score: {grade_missing_fields()}")
|
| 121 |
+
print(f"rate_limit score: {grade_rate_limit()}")
|
| 122 |
+
print(f"timeout score: {grade_timeout()}")
|
| 123 |
+
print(f"wrong_endpoint score: {grade_wrong_endpoint()}")
|
| 124 |
+
print(f"server_error score: {grade_server_error()}")
|
| 125 |
+
print("All graders passed!")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|