Spaces:

Kavya988
/

API_DEBUG_SOLVER

Sleeping

App Files Files Community

Kavya988 commited on Apr 12

Commit

d416acc

verified ·

1 Parent(s): 92f22c6

Upload 29 files

Browse files

Files changed (29) hide show

Dockerfile +16 -0
README.md +72 -7
app.py +97 -0
demo.py +32 -0
environment/__init__.py +2 -0
environment/action_space.py +37 -0
environment/api_triage_env.py +156 -0
environment/incident_generator.py +76 -0
environment/reward.py +46 -0
inference.py +166 -0
openenv.yaml +60 -0
pyproject.toml +25 -0
requirements.txt +8 -0
server/app.py +96 -0
tasks/__init__.py +1 -0
tasks/auth_error/__init__.py +1 -0
tasks/auth_error/grader.py +9 -0
tasks/grading_helper.py +45 -0
tasks/missing_fields/__init__.py +1 -0
tasks/missing_fields/grader.py +9 -0
tasks/rate_limit/__init__.py +1 -0
tasks/rate_limit/grader.py +9 -0
tasks/timeout/__init__.py +1 -0
tasks/timeout/grader.py +9 -0
tasks/wrong_endpoint/__init__.py +1 -0
tests/__init__.py +1 -0
tests/test_env.py +51 -0
tests/test_graders.py +125 -0
uv.lock +0 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+FROM python:3.10-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY environment/ ./environment/
+COPY tests/ ./tests/
+COPY app.py .
+COPY inference.py .
+COPY openenv.yaml .
+EXPOSE 7860
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,12 +1,77 @@
 ---
-title: API DEBUG SOLVER
-emoji: 🏃
-colorFrom: purple
-colorTo: green
 sdk: docker
 pinned: false
-license: other
-short_description: API Triage Agent - AI agent debugs API failures
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: API Triage Agent
+emoji: 🤖
+colorFrom: blue
+colorTo: purple
 sdk: docker
 pinned: false
 ---
+# API Triage Agent
+## Environment Description :
+An AI agent that diagnoses and resolves API integration failures by inspecting logs, identifying error types, and taking corrective actions. Simulates real-world API debugging scenarios.
+## Motivation
+API failures are common in production. This environment teaches an agent to handle authentication errors, missing fields, rate limits, wrong endpoints, and server errors – just like a real support engineer.
+## Action Space (8 actions)
+| Action | Description |
+|--------|-------------|
+| `inspect_logs` | Examine error logs for clues |
+| `inspect_request` | Check the failed API request |
+| `refresh_token` | Fix authentication errors (401) |
+| `add_field` | Add missing required fields (400) |
+| `wait_retry` | Handle rate limits (429) and timeouts (408) |
+| `change_endpoint` | Fix wrong API endpoint (404) |
+| `escalate` | Report server errors (500) to human |
+| `resolve` | End episode after successful fix |
+## Observation Space
+| Field | Type | Description |
+|-------|------|-------------|
+| `step` | int | Current step number |
+| `max_steps` | int | Maximum steps allowed (10) |
+| `incident_summary` | str | Short problem description |
+| `logs` | list | Error messages from API |
+| `response_code` | int | HTTP status code |
+| `fix_applied` | bool | Whether fix has been applied |
+| `is_resolved` | bool | Whether episode ended |
+## Tasks (Easy → Medium → Hard)
+### Easy Task: Authentication Error
+- **Incident:** `auth_error`
+- **Correct fix:** `refresh_token` → `resolve`
+- **Score achieved:** 1.0
+### Medium Task: Missing Field Error
+- **Incident:** `missing_fields`
+- **Correct fix:** `add_field` → `resolve`
+- **Score achieved:** 1.0
+### Hard Task: Server Error
+- **Incident:** `server_error`
+- **Correct fix:** `escalate` → `resolve`
+- **Score achieved:** 1.0
+## Reward System (5 factors)
+| Factor | Reward |
+|--------|--------|
+| Correct fix action | +5 |
+| Wrong action | -2 |
+| Diagnostic action | +0.5 |
+| Resolve with fix (success) | +15 |
+| Resolve without fix | -10 |
+| Max steps reached | -5 |
+## Setup Instructions
+### 1. Create virtual environment
+```bash
+python -m venv venv
+source venv/bin/activate  # Linux/Mac
+venv\Scripts\activate     # Windows
+# 2. must do (install dependencies) - pip install -r requirements.txt
+# also run these : python demo.py , pytest tests/test_env.py -v , python tests/test_graders.py , openenv validate

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+# the main environment file
+from fastapi import FastAPI
+from pydantic import BaseModel
+from environment.api_triage_env import APITriageEnv
+# creating an app and environment
+app = FastAPI()
+env = APITriageEnv()
+# defining a request model for /step endpoint
+# for fastapi so that it can understand that we expecting a JSON with an action field that is a text dtype
+class ActionRequest(BaseModel):
+  action: str
+@app.post("/reset")
+def reset():
+  """
+  Starting a new API debugging episode
+  """
+  print("INFO : reset endpoint is called , new debugging session started ")
+  state = env.reset()
+  return {
+    "step" : state.step,
+    "max_steps": state.max_steps,
+    "incident_summary": state.incident_summary,
+    "logs": state.logs,
+    "response_code":state.response_code,
+    "fix_applied": state.fix_applied,
+    "is_resolved" : state.is_resolved
+  }
+@app.get("/state")
+def state():
+  """
+  HELPs to return the current observation of the episode.
+  """
+  print("INFO : current state of the Episode as follows ")
+  current = env.state()
+  return {
+    "step" : current.step,
+    "max_steps": current.max_steps,
+    "incident_summary": current.incident_summary,
+    "logs": current.logs,
+    "response_code": current.response_code,
+    "fix_applied": current.fix_applied,
+    "is_resolved" : current.is_resolved
+  }
+@app.post("/step")
+def step(request: ActionRequest):
+  """
+  the agent sends an action and our environment will preocess it
+  and update the state , returns what happened.
+  """
+  """
+  action = what the agent wants to do (text)
+  observation = what the agent sees after doing it (object with 7 fields)
+  """
+  action = request.action
+  print(f"INFO : Action received: {action}")
+  # calling env.step() from api_triage_env.py file to process the action
+  observation , reward , done , info = env.step(action)
+  # here returning the result
+  return {
+    "observation": {
+    "step" : observation.step,
+    "max_steps": observation.max_steps,
+    "incident_summary": observation.incident_summary,
+    "logs": observation.logs,
+    "response_code": observation.response_code,
+    "fix_applied": observation.fix_applied,
+    "is_resolved" : observation.is_resolved
+    },
+    "reward": reward,
+    "done": done,
+    "info": info,
+  }
+def main():
+    import uvicorn
+    uvicorn.run("app:app", host="0.0.0.0", port=7860)
+if __name__ == "__main__":
+    main()

demo.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from environment.api_triage_env import APITriageEnv
+print("API Triage Agent Demo")
+# creating an instance of the environment
+env = APITriageEnv(max_steps=10)
+# starting a new episode
+state = env.reset()
+# Get correct fix action from the incident
+correct_action = env.incident["fix_action"]
+# printing the initial state of the environment
+print(f"\nIncident: {state.incident_summary}")
+print(f"Response Code: {state.response_code}")
+print(f"logs: {state.logs}")
+# defined a sequence of actions to take
+actions = ["inspect_logs", correct_action, "resolve"]
+for action in actions:
+  print(f"\nTaking action: {action}")
+  state, reward, done, info = env.step(action)  # ← Use 'action' not 'actions'
+  print(f"Reward: {reward}")
+  print(f"fix applied: {state.fix_applied}")
+  if done:
+    print(f"\n[EPISODE END] Resolution: {info.get('resolution')}")
+    print(f"Total Reward: {info.get('total_reward')}")
+    break

environment/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # This file tells Python that 'environment' is a package
2	+ # It allows us to import from this folder

environment/action_space.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""
+what we will do here is that we gonna define
+every action that our ai agent
+can take so that we can the  debug the API failures,
+or
+this file contains ->
+* a list of action names - storing all valid actions,
+* a validation function - check if action is allowed,
+* a get all actions function - return the list for grading .
+"""
+# each action = one thing the agent can do to debug the API
+VALID_ACTIONS = [
+    # Diagnostic actions -> it helps our agent to understand the problem before fixing
+    "inspect_logs",
+    "inspect_request",
+    # Fix actions -> these actions actually solves the problem(s)
+    "refresh_token",     # for authentication_error
+    "add_field",         # the missing_fields
+    "wait_retry",        # two incidents here for (rate_limit and timeout)
+    "change_endpoint",   # wrong_endpoint
+    "escalate",          # server_error
+    # Terminal action -> for ending the episode
+    "resolve"
+]
+def is_valid_action(action):
+  # returns true if the action is allowed or present
+  return action in VALID_ACTIONS
+def get_all_actions():
+  # returns copy of all actions , if why? then here we did it because after the caller modifies , our original list stays safe
+  return VALID_ACTIONS.copy()

environment/api_triage_env.py ADDED Viewed

	@@ -0,0 +1,156 @@

+"""
+The three main methods implementation ->
+1. reset() - start of each episode
+2. step(action) - agent takes an action
+3. state() - called anytime
+"""
+"""
+1. For incident selection - curriculum learning approach (easy -> medium -> hard)
+2. For Reward factors - 5 factors (correct, wrong, resolve with/without fix, max steps)
+3. For episode end conditions - resolved with fix , resolved without fix , max steps reached
+4. For action space -  8 actions(including diagnostic , fix , terminal)
+5. For max steps - 10 steps per episode
+6. For reward - range is -20 to +20
+"""
+"""
+extra info ->
+1. stage 1 episodes -> 1-10
+2. stage 2 epiosdes -> 11-25
+3. stage 3 epiosdes -> 26+
+"""
+"""
+Our 3 models->
+1. observation-  what agent sees at each step
+2. action - what agent can do at each step
+3. EnvState - internal tracking of the environment
+"""
+import random
+from typing import Dict, Any, Tuple, Optional, List
+from pydantic import BaseModel
+from environment.incident_generator import get_random_incident, get_incident_by_type
+from environment.action_space import is_valid_action
+from environment.reward import calculate_reward
+class Observation(BaseModel):
+  step: int
+  max_steps: int
+  incident_summary : str
+  logs: List[str]
+  response_code: int
+  fix_applied: bool
+  is_resolved: bool
+class Action(BaseModel):
+  action_name: str
+class EnvState(BaseModel):
+  current_incident: Dict[str, Any]
+  step_counter: int
+  fix_applied: bool
+  total_reward: float
+  is_resolved: bool
+class APITriageEnv:
+  def __init__(self, max_steps = 10):
+    self.max_steps = max_steps
+    self.step_counter = 0
+    self.done = False
+    self.incident = None
+    self.fix_applied = False
+    self.total_reward = 0.0
+    self.total_episodes = 0
+  def reset(self):
+    self.step_counter = 0
+    self.done = False
+    self.fix_applied = False
+    self.total_reward = 0.0
+    self.total_episodes += 1
+    # implying the curriculum learning approach here
+    if self.total_episodes <= 10:
+      # stage 1 -> easy incidents (auth_error, missing_fields)
+      incident_type  = random.choice(["auth_error", "missing_fields"])
+      self.incident = get_incident_by_type(incident_type)
+    elif self.total_episodes <= 25:
+      # stage 2 -> medium incidents
+      incident_type = random.choice(["rate_limit", "timeout", "wrong_endpoint"])
+      self.incident = get_incident_by_type(incident_type)
+    elif self.total_episodes > 25:
+      # stage 3 -> hard incidents
+      incident_type = "server_error"
+      self.incident = get_incident_by_type(incident_type)
+    return self.state()
+  def state(self):
+    """Returns what the agent sees at current step"""
+    return Observation(
+        step=self.step_counter,
+        max_steps=self.max_steps,
+        incident_summary=self.incident["summary"],
+        logs=self.incident["logs"],
+        response_code=self.incident["code"],
+        fix_applied=self.fix_applied,
+        is_resolved=self.done
+    )
+  def step(self, action):
+    """Agent takes an action and environment responds with new state and reward"""
+    # 1. if episode is done or finished already
+    if self.done:
+      state = self.state()
+      reward = 0.0
+      info  = {"error": "episode is already finished "}
+      done = True
+      return state, reward, done, info
+    # 2. increment step counter and check is action is valid
+    self.step_counter += 1
+    # 3. validate the action
+    if not is_valid_action(action):
+      state = self.state()
+      reward = -2.0
+      info = {"error" : "the action is not valid"}
+      done = False
+      return state, reward , done , info
+    # 4. Reward calculation
+    reward = calculate_reward(action , self.incident, self.fix_applied, self.step_counter , self.max_steps)
+    # 5. updating fix applied status if the action is the correct fix action
+    if action == self.incident["fix_action"]:
+      self.fix_applied = True
+    # 6. update toatal reward
+    self.total_reward += reward
+    # 7. prepare info (for all cases )
+    info = {
+      "step": self.step_counter,
+      "incident_type": self.incident["type"],
+      "fix_applied": self.fix_applied,
+      "total_reward": self.total_reward
+    }
+    # 8. check if the epiosde is resolved
+    if action == "resolve":
+      self.done = True
+      info["resolution"] = "success" if self.fix_applied else "failure - resolved without fix"
+    # 9. check if epsiode is not resolved that means max steps are reached
+    if self.step_counter >= self.max_steps:
+      self.done = True
+      info["resolution"] = "failure - max steps reached"
+    # 10. final return (one return at the end)
+    return self.state(), reward, self.done, info

environment/incident_generator.py ADDED Viewed

	@@ -0,0 +1,76 @@

+""" what it does :
+ is that Stores different API failure scenarios.
+ When the agent starts a new episode,
+ this file picks a random failure for the agent to debug.
+"""
+import random
+INCIDENTS =[ # here we created mutiple incidents and in different episodes different incidents will be checked
+  {
+  "type": "auth_error",
+  "summary": "401 Unnathorized - API key expired",
+  "logs": ["ERROR : there is inavlid API key", "HINT : your key expired already "],
+  "fix_action": "refresh_token",
+  "code":401
+},
+{
+  "type": "missing_fields",
+  "summary": "400 bad request - email field is missing please check",
+  "logs": ["ERROR :Required field 'email' cannot be found" , "HINT : please add email to the request body "],
+  "fix_action": "add_field",
+  "code":400
+},
+{
+  "type": "rate_limit",
+  "summary": "429 too many requests - slow down ",
+  "logs": ["ERROR : the rate limit is exceeded   " , "HINT : please wait for 1 minute before retrying "],
+  "fix_action": "wait_retry",
+  "code":429
+},
+{
+  "type": "wrong_endpoint",
+  "summary": "404 Not Found - API endpoint doesn't exist",
+  "logs": ["ERROR: POST /api/v2/data returned 404", "Hint: Use /api/v3/data instead"],
+  "fix_action": "change_endpoint",
+  "code": 404
+},
+{
+  "type": "server_error",
+  "summary": "500 Internal Server Error",
+  "logs": ["ERROR: Database connection failed", "Hint: Cannot fix automatically - escalate"],
+  "fix_action": "escalate",
+  "code": 500
+},
+{
+  "type": "timeout",
+  "summary": "request took too long",
+  "logs": ["ERROR : request timed out after 45 seconds", "Hint: server may be overloaded , please try again with backoff"],
+  "fix_action": "wait_retry",
+  "code": 408
+},
+]
+def get_random_incident():
+    # this function is  to  return a random choice
+    return random.choice(INCIDENTS)
+def get_incident_by_type(incident_type):
+    # Returns a specific incident - useful for grading system (score/marks/points)
+    for incident in INCIDENTS: # here we are having a loop
+        if incident["type"] == incident_type: # we chechking here if type matches or not
+            return incident
+    return None

environment/reward.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""
+here we gonna define the reward function for our agent,
+so that it can learn or adapt the environment and
+able to get/achieve the rewards for the actions it takes in the environment.
+OR
+Per step reward
+"""
+# The rewarding system we writing here will be within the scale of -20 to +20.
+"""
+The factors we are using (5 factors):
+1. Correct action = positive reward (2 to 10)
+2. Wrong action = negative reward (-1 to -3)
+3. Resolve with FIX (Episode success) = large positive reward (+10 to +15)
+4. Resolve WITHOUT FIX (Prevents lying) = negative reward (-5 to -10)
+5. Max steps reached (Episode failure) = negative reward (-5)
+"""
+def calculate_reward(action, incident, fix_applied, step, max_steps):
+  # agents says resolved but didn't fix - penalty
+  if action == "resolve" and not fix_applied:
+    return -10.0
+  # agent ran out of steps - penalty
+  if step >= max_steps:
+    return -5.0
+  # agent fixed and resolved the incident (succes)
+  if action == "resolve" and fix_applied:
+    return 15.0
+  # for correct fix action
+  if action == incident["fix_action"] and not fix_applied:
+    return 5.0
+  # Diagnostic actions - helpful but doesn't fix
+  if action in ["inspect_logs", "inspect_request"]:
+    return 0.5
+  # for wrong action
+  return -2.0

inference.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import asyncio
+import os
+import textwrap
+from typing import List, Optional
+from openai import OpenAI
+from environment.api_triage_env import APITriageEnv
+from environment.action_space import get_all_actions
+from environment.incident_generator import get_incident_by_type
+# ============================================
+# Environment Variables
+# ============================================
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+HF_TOKEN = os.getenv("HF_TOKEN")
+API_KEY = HF_TOKEN
+TASK_NAME = os.getenv("TASK_NAME", "api_triage")
+BENCHMARK = os.getenv("BENCHMARK", "api_triage_agent")
+MAX_STEPS = 10
+TEMPERATURE = 0.7
+MAX_TOKENS = 50
+SUCCESS_SCORE_THRESHOLD = 0.5
+# ============================================
+# System Prompt
+# ============================================
+AVAILABLE_ACTIONS = get_all_actions()
+SYSTEM_PROMPT = textwrap.dedent(
+    f"""
+    You are an API debugging agent. Your job is to diagnose and fix API failures.
+    Available actions: {AVAILABLE_ACTIONS}
+    Rules:
+    - First use "inspect_logs" to understand the problem
+    - Then take the correct fix action based on the error
+    - Finally use "resolve" to end the episode
+    Reply with ONLY the action name. No explanations. No quotes.
+    """
+).strip()
+# ============================================
+# Logging Functions (Required Format)
+# ============================================
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    error_val = error if error else "null"
+    done_val = str(done).lower()
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+        flush=True,
+    )
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
+# ============================================
+# Prompt Builder
+# ============================================
+def build_user_prompt(step: int, observation, last_reward: float, history: List[str]) -> str:
+    history_block = "\n".join(history[-4:]) if history else "None"
+    return textwrap.dedent(
+        f"""
+        Step: {step}
+        Incident: {observation.incident_summary}
+        Response Code: {observation.response_code}
+        Logs: {observation.logs}
+        Fix Applied: {observation.fix_applied}
+        Last Reward: {last_reward:.2f}
+        Previous Actions:
+        {history_block}
+        Choose an action from: {AVAILABLE_ACTIONS}
+        Reply with ONLY the action name.
+        """
+    ).strip()
+# ============================================
+# LLM Caller
+# ============================================
+def get_model_action(client: OpenAI, step: int, observation, last_reward: float, history: List[str]) -> str:
+    user_prompt = build_user_prompt(step, observation, last_reward, history)
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=TEMPERATURE,
+            max_tokens=MAX_TOKENS,
+            stream=False,
+        )
+        action = (completion.choices[0].message.content or "").strip().lower()
+        if action not in AVAILABLE_ACTIONS:
+            print(f"[DEBUG] Invalid action '{action}', defaulting to inspect_logs", flush=True)
+            return "inspect_logs"
+        return action
+    except Exception as exc:
+        print(f"[DEBUG] Model request failed: {exc}", flush=True)
+        return "inspect_logs"
+# ============================================
+# Main Async Function
+# ============================================
+async def main() -> None:
+    if not API_KEY:
+        print("[ERROR] HF_TOKEN environment variable not set", flush=True)
+        return
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    env = APITriageEnv(max_steps=MAX_STEPS)
+    # All 6 task IDs matching openenv.yaml — each evaluated explicitly
+    task_ids = ["auth_error", "missing_fields", "rate_limit", "timeout", "wrong_endpoint", "server_error"]
+    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+    for tid in task_ids:
+        history: List[str] = []
+        rewards: List[float] = []
+        steps_taken = 0
+        success = False
+        try:
+            # Reset env and FORCE the specific incident type (no randomness)
+            observation = env.reset()
+            env.incident = get_incident_by_type(tid)
+            observation = env.state()  # refresh observation with forced incident
+            last_reward = 0.0
+            for step in range(1, MAX_STEPS + 1):
+                action = get_model_action(client, step, observation, last_reward, history)
+                observation, reward, done, info = env.step(action)
+                rewards.append(reward)
+                steps_taken = step
+                last_reward = reward
+                log_step(step=step, action=action, reward=reward, done=done, error=None)
+                history.append(f"Step {step}: {action} -> reward {reward:.2f}")
+                if done:
+                    success = info.get("resolution") == "success"
+                    break
+            # Score strictly between 0 and 1
+            task_score = 0.95 if success else 0.05
+            log_end(success=success, steps=steps_taken, score=task_score, rewards=rewards)
+        except Exception as e:
+            print(f"[DEBUG] Error in task {tid}: {e}", flush=True)
+            log_end(success=False, steps=0, score=0.05, rewards=[0.0])
+# ============================================
+# Run
+# ============================================
+if __name__ == "__main__":
+    asyncio.run(main())

openenv.yaml ADDED Viewed

	@@ -0,0 +1,60 @@

+name: api-triage-agent
+version: 1.0.0
+description: AI agent debugs API failures using curriculum learning
+environment:
+  class: environment.api_triage_env.APITriageEnv
+  max_steps: 10
+observation_space:
+  step: int
+  max_steps: int
+  incident_summary: str
+  logs: list
+  response_code: int
+  fix_applied: bool
+  is_resolved: bool
+action_space:
+  - inspect_logs
+  - inspect_request
+  - refresh_token
+  - add_field
+  - wait_retry
+  - change_endpoint
+  - escalate
+  - resolve
+reward_range: [-20, 20]
+tasks:
+  - id: "auth_error"
+    name: "Authentication Error"
+    description: "Diagnose and fix a 401 Unauthorized error caused by an expired API key"
+    difficulty: "easy"
+    grader: "tests.test_graders:grade_auth_error"
+  - id: "missing_fields"
+    name: "Missing Field Error"
+    description: "Diagnose and fix a 400 Bad Request error caused by a missing required field"
+    difficulty: "easy"
+    grader: "tests.test_graders:grade_missing_fields"
+  - id: "rate_limit"
+    name: "Rate Limit Error"
+    description: "Diagnose and fix a 429 Too Many Requests error by applying retry logic"
+    difficulty: "medium"
+    grader: "tests.test_graders:grade_rate_limit"
+  - id: "timeout"
+    name: "Timeout Error"
+    description: "Diagnose and fix a 408 Request Timeout error by applying retry with backoff"
+    difficulty: "medium"
+    grader: "tests.test_graders:grade_timeout"
+  - id: "wrong_endpoint"
+    name: "Wrong Endpoint Error"
+    description: "Diagnose and fix a 404 Not Found error by changing to the correct endpoint"
+    difficulty: "medium"
+    grader: "tests.test_graders:grade_wrong_endpoint"
+  - id: "server_error"
+    name: "Server Error"
+    description: "Diagnose and handle a 500 Internal Server Error by escalating appropriately"
+    difficulty: "hard"
+    grader: "tests.test_graders:grade_server_error"

pyproject.toml ADDED Viewed

	@@ -0,0 +1,25 @@

+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "api-triage-agent"
+version = "1.0.0"
+description = "OpenEnv environment for API debugging with AI agents"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "pydantic>=2.0.0",
+    "openai>=1.0.0",
+    "numpy>=1.24.0",
+    "pytest>=7.0.0",
+    "fastapi>=0.100.0",
+    "uvicorn>=0.23.0",
+    "openenv-core>=0.2.0",
+]
+[project.scripts]
+server = "app:main"
+[tool.openenv]
+environment-class = "environment.api_triage_env.APITriageEnv"

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+# for installing all the packages for the code to run
+openenv-core>=0.2.0
+openai>=1.0.0
+pydantic>=2.0.0
+numpy>=1.24.0
+pytest>=7.0.0
+fastapi>=0.100.0
+uvicorn>=0.23.0

server/app.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# the main environment file
+from fastapi import FastAPI
+from pydantic import BaseModel
+from environment.api_triage_env import APITriageEnv
+# creating an app and environment
+app = FastAPI()
+env = APITriageEnv()
+# defining a request model for /step endpoint
+# for fastapi so that it can understand that we expecting a JSON with an action field that is a text dtype
+class ActionRequest(BaseModel):
+  action: str
+@app.post("/reset")
+def reset():
+  """
+  Starting a new API debugging episode
+  """
+  print("INFO : reset endpoint is called , new debugging session started ")
+  state = env.reset()
+  return {
+    "step" : state.step,
+    "max_steps": state.max_steps,
+    "incident_summary": state.incident_summary,
+    "logs": state.logs,
+    "response_code":state.response_code,
+    "fix_applied": state.fix_applied,
+    "is_resolved" : state.is_resolved
+  }
+@app.get("/state")
+def state():
+  """
+  HELPs to return the current observation of the episode.
+  """
+  print("INFO : current state of the Episode as follows ")
+  current = env.state()
+  return {
+    "step" : current.step,
+    "max_steps": current.max_steps,
+    "incident_summary": current.incident_summary,
+    "logs": current.logs,
+    "response_code": current.response_code,
+    "fix_applied": current.fix_applied,
+    "is_resolved" : current.is_resolved
+  }
+@app.post("/step")
+def step(request: ActionRequest):
+  """
+  the agent sends an action and our environment will preocess it
+  and update the state , returns what happened.
+  """
+  """
+  action = what the agent wants to do (text)
+  observation = what the agent sees after doing it (object with 7 fields)
+  """
+  action = request.action
+  print(f"INFO : Action received: {action}")
+  # calling env.step() from api_triage_env.py file to process the action
+  observation , reward , done , info = env.step(action)
+  # here returning the result
+  return {
+    "observation": {
+    "step" : observation.step,
+    "max_steps": observation.max_steps,
+    "incident_summary": observation.incident_summary,
+    "logs": observation.logs,
+    "response_code": observation.response_code,
+    "fix_applied": observation.fix_applied,
+    "is_resolved" : observation.is_resolved
+    },
+    "reward": reward,
+    "done": done,
+    "info": info,
+  }
+def main():
+    import uvicorn
+    uvicorn.run("app:app", host="0.0.0.0", port=7860)
+if __name__ == "__main__":
+    main()

tasks/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Tasks package for OpenEnv grading

tasks/auth_error/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # auth_error task

tasks/auth_error/grader.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""Grader for auth_error task: 401 Unauthorized - expired API key."""
+from tasks.grading_helper import run_agent_on_incident
+def grade() -> float:
+    """Grade the auth_error task. Returns score between 0 and 1."""
+    score = run_agent_on_incident("auth_error")
+    return max(0.001, min(0.999, score))

tasks/grading_helper.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""Shared grading helper used by all per-task grader modules."""
+import sys
+from pathlib import Path
+# Ensure project root is on sys.path so environment package is importable
+_project_root = str(Path(__file__).parent.parent)
+if _project_root not in sys.path:
+    sys.path.insert(0, _project_root)
+from environment.api_triage_env import APITriageEnv
+from environment.incident_generator import get_incident_by_type
+def run_agent_on_incident(incident_type: str, max_steps: int = 10) -> float:
+    """Simulate an optimal agent solving a specific incident type.
+    Returns a float score strictly between 0 and 1.
+    """
+    env = APITriageEnv(max_steps=max_steps)
+    # Force the specific incident (bypass curriculum randomness)
+    env.incident = get_incident_by_type(incident_type)
+    if env.incident is None:
+        return 0.05
+    env.fix_applied = False
+    env.done = False
+    env.step_counter = 0
+    env.total_reward = 0.0
+    correct_action = env.incident["fix_action"]
+    # Optimal sequence: inspect → fix → resolve
+    actions = ["inspect_logs", correct_action, "resolve"]
+    for action in actions:
+        state, reward, done, info = env.step(action)
+        if done:
+            if info.get("resolution") == "success":
+                return 0.95
+            else:
+                return 0.05
+    return 0.1

tasks/missing_fields/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # missing_fields task

tasks/missing_fields/grader.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""Grader for missing_fields task: 400 Bad Request - missing email field."""
+from tasks.grading_helper import run_agent_on_incident
+def grade() -> float:
+    """Grade the missing_fields task. Returns score between 0 and 1."""
+    score = run_agent_on_incident("missing_fields")
+    return max(0.001, min(0.999, score))

tasks/rate_limit/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # rate_limit task

tasks/rate_limit/grader.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""Grader for rate_limit task: 429 Too Many Requests."""
+from tasks.grading_helper import run_agent_on_incident
+def grade() -> float:
+    """Grade the rate_limit task. Returns score between 0 and 1."""
+    score = run_agent_on_incident("rate_limit")
+    return max(0.001, min(0.999, score))

tasks/timeout/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # timeout task

tasks/timeout/grader.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""Grader for timeout task: 408 Request Timeout."""
+from tasks.grading_helper import run_agent_on_incident
+def grade() -> float:
+    """Grade the timeout task. Returns score between 0 and 1."""
+    score = run_agent_on_incident("timeout")
+    return max(0.001, min(0.999, score))

tasks/wrong_endpoint/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # wrong_endpoint task

tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

tests/test_env.py ADDED Viewed

	@@ -0,0 +1,51 @@

+# to run this file for checking please run this command :  py -m pytest tests/test_env.py -v
+# py -m pytest tests/test_env.py -v  in the terminal
+import pytest
+from environment.api_triage_env import APITriageEnv
+from environment.incident_generator import get_incident_by_type
+def test_reset():
+  """ reset should return the initial state of  the environment  and with step 0 """
+  env = APITriageEnv(max_steps=10)
+  state = env.reset()
+  # btw here we used assert to check if the state is a string or not
+  assert state.step == 0
+  assert state.max_steps == 10
+  assert state.incident_summary is not None
+  assert state.logs is not None
+  assert state.response_code in [400, 404, 401, 429, 500, 408]
+def test_action_is_valid():
+  """ valid action should return the float reward"""
+  env = APITriageEnv()
+  env.reset()
+  test_action_is_valid = ["add_field", "wait_retry", "refresh_token", "change_endpoint", "escalate"]
+  for action in test_action_is_valid:
+    state , reward , done , info = env.step(action)
+    # btw here we are checking if the reward is a float and if done is also a boolean ,if its not tthen test will catch it
+    assert isinstance(reward, float)
+    assert isinstance(done, bool)
+def test_action_is_invalid():
+  """ invalid action should return a negative reward and done = true"""
+  env = APITriageEnv()
+  env.reset()
+  state, reward, done, info = env.step("fake_action_123")
+  assert reward == -2.0
+  assert done is False
+def test_episode_successful():
+  env = APITriageEnv()
+  env.incident = get_incident_by_type("auth_error")
+  env.fix_applied = False
+  env.done = False
+  env.total_reward = 0.0
+  state, reward, done, info = env.step("inspect_logs")
+  state, reward, done, info = env.step("refresh_token")
+  assert done is False
+  state, reward, done, info = env.step("resolve")
+  assert done is True

tests/test_graders.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# Grader functions for OpenEnv task validation
+# Each function is referenced in openenv.yaml as tests.test_graders:grade_<task_id>
+# Grader functions must return a float score between 0.0 and 1.0
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from environment.api_triage_env import APITriageEnv
+from environment.incident_generator import get_incident_by_type
+def _run_agent_on_incident(incident_type, max_steps=10):
+    """Simulate an agent solving a specific incident type. Returns score 0.0-1.0."""
+    env = APITriageEnv(max_steps=max_steps)
+    # Force the specific incident (bypass curriculum randomness)
+    env.incident = get_incident_by_type(incident_type)
+    env.fix_applied = False
+    env.done = False
+    env.step_counter = 0
+    env.total_reward = 0.0
+    # Get the correct fix action for this incident
+    correct_action = env.incident["fix_action"]
+    # Optimal action sequence: inspect -> fix -> resolve
+    actions = ["inspect_logs", correct_action, "resolve"]
+    for action in actions:
+        state, reward, done, info = env.step(action)
+        if done:
+            return 0.95 if info.get("resolution") == "success" else 0.05
+    return 0.1
+# ============================================
+# Per-task grader functions (referenced in openenv.yaml)
+# ============================================
+def grade_auth_error():
+    """Grader for auth_error task: 401 Unauthorized - expired API key"""
+    score = _run_agent_on_incident("auth_error")
+    assert 0.0 <= score <= 1.0, f"Score {score} out of range"
+    return score
+def grade_missing_fields():
+    """Grader for missing_fields task: 400 Bad Request - missing email field"""
+    score = _run_agent_on_incident("missing_fields")
+    assert 0.0 <= score <= 1.0, f"Score {score} out of range"
+    return score
+def grade_rate_limit():
+    """Grader for rate_limit task: 429 Too Many Requests"""
+    score = _run_agent_on_incident("rate_limit")
+    assert 0.0 <= score <= 1.0, f"Score {score} out of range"
+    return score
+def grade_timeout():
+    """Grader for timeout task: 408 Request Timeout"""
+    score = _run_agent_on_incident("timeout")
+    assert 0.0 <= score <= 1.0, f"Score {score} out of range"
+    return score
+def grade_wrong_endpoint():
+    """Grader for wrong_endpoint task: 404 Not Found"""
+    score = _run_agent_on_incident("wrong_endpoint")
+    assert 0.0 <= score <= 1.0, f"Score {score} out of range"
+    return score
+def grade_server_error():
+    """Grader for server_error task: 500 Internal Server Error"""
+    score = _run_agent_on_incident("server_error")
+    assert 0.0 <= score <= 1.0, f"Score {score} out of range"
+    return score
+# ============================================
+# Pytest-compatible test wrappers
+# ============================================
+def test_grade_auth_error():
+    score = grade_auth_error()
+    assert score > 0.5, f"auth_error grader returned low score: {score}"
+def test_grade_missing_fields():
+    score = grade_missing_fields()
+    assert score > 0.5, f"missing_fields grader returned low score: {score}"
+def test_grade_rate_limit():
+    score = grade_rate_limit()
+    assert score > 0.5, f"rate_limit grader returned low score: {score}"
+def test_grade_timeout():
+    score = grade_timeout()
+    assert score > 0.5, f"timeout grader returned low score: {score}"
+def test_grade_wrong_endpoint():
+    score = grade_wrong_endpoint()
+    assert score > 0.5, f"wrong_endpoint grader returned low score: {score}"
+def test_grade_server_error():
+    score = grade_server_error()
+    assert score > 0.5, f"server_error grader returned low score: {score}"
+if __name__ == "__main__":
+    print(f"auth_error score:      {grade_auth_error()}")
+    print(f"missing_fields score:  {grade_missing_fields()}")
+    print(f"rate_limit score:      {grade_rate_limit()}")
+    print(f"timeout score:         {grade_timeout()}")
+    print(f"wrong_endpoint score:  {grade_wrong_endpoint()}")
+    print(f"server_error score:    {grade_server_error()}")
+    print("All graders passed!")

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff