Spaces:
Running
Running
| """ | |
| OpenEnv Environment for API Integration Testing. | |
| The agent interacts with a deliberately buggy REST API, discovering endpoints, | |
| crafting requests, and finding bugs. Rewards are multi-signal: coverage, | |
| validity, bug discovery, and exploration. | |
| """ | |
| import logging | |
| import random | |
| import time | |
| import json | |
| from typing import Any, Optional | |
| from fastapi.testclient import TestClient | |
| from openenv.core.env_server.interfaces import Environment | |
| try: | |
| from ..models import APITestAction, APITestObservation, APITestState | |
| except ImportError: | |
| from models import APITestAction, APITestObservation, APITestState | |
| from .buggy_api.database import Database | |
| from .buggy_api.main import create_buggy_api | |
| from .bug_detector import BugDetector | |
| from .reward import RewardComputer | |
| from .graders import TaskGrader, generate_bug_report | |
| from .graders import TaskGrader | |
| logger = logging.getLogger(__name__) | |
| # Task definitions | |
| TASKS = { | |
| "basic_validation": { | |
| "id": "basic_validation", | |
| "description": ( | |
| "Test all CRUD endpoints with valid inputs and verify correct status codes. " | |
| "Find basic bugs like wrong status codes and missing field handling. " | |
| "Available endpoints: GET /tasks, POST /tasks, GET /tasks/{id}, PUT /tasks/{id}, " | |
| "DELETE /tasks/{id}, GET /users, POST /users, POST /auth/login. " | |
| "Try different methods on each endpoint and verify responses match the expected behavior." | |
| ), | |
| "difficulty": "easy", | |
| "max_steps": 25, | |
| "total_bugs": 3, | |
| }, | |
| "edge_cases": { | |
| "id": "edge_cases", | |
| "description": ( | |
| "Test boundary conditions, invalid inputs, and error responses. " | |
| "Send missing fields, wrong types, negative page numbers, huge limits. " | |
| "Test with non-existent resource IDs (e.g., /tasks/999999). " | |
| "Chain operations: create a resource, then read/update/delete it. " | |
| "Find bugs in input validation, pagination, and error handling." | |
| ), | |
| "difficulty": "medium", | |
| "max_steps": 35, | |
| "total_bugs": 9, | |
| }, | |
| "security_workflows": { | |
| "id": "security_workflows", | |
| "description": ( | |
| "Discover authorization flaws, injection vulnerabilities, and workflow bugs. " | |
| "Login as different users (alice/password, bob/password, charlie/password) and " | |
| "try accessing each other's resources. Test SQL injection patterns in input fields. " | |
| "Execute multi-step workflows: create -> modify -> verify -> delete -> re-fetch. " | |
| "Check if auth tokens properly scope access. Test with very long inputs." | |
| ), | |
| "difficulty": "hard", | |
| "max_steps": 45, | |
| "total_bugs": 13, | |
| }, | |
| } | |
| # OpenAPI-like spec for the agent | |
| API_SPEC = [ | |
| { | |
| "method": "GET", | |
| "path": "/tasks", | |
| "summary": "List all tasks. Supports filtering by status, priority; pagination with page & limit; sorting with sort.", | |
| "parameters": [ | |
| {"name": "status", "in": "query", "type": "string", "enum": ["pending", "in_progress", "done"]}, | |
| {"name": "priority", "in": "query", "type": "string", "enum": ["low", "medium", "high"]}, | |
| {"name": "sort", "in": "query", "type": "string", "enum": ["created_at", "updated_at", "title"]}, | |
| {"name": "page", "in": "query", "type": "integer"}, | |
| {"name": "limit", "in": "query", "type": "integer"}, | |
| ], | |
| }, | |
| { | |
| "method": "POST", | |
| "path": "/tasks", | |
| "summary": "Create a new task. Requires 'title' field. Optional: description, status, priority, assignee_email.", | |
| "request_body": { | |
| "required": ["title"], | |
| "properties": { | |
| "title": {"type": "string"}, | |
| "description": {"type": "string"}, | |
| "status": {"type": "string", "enum": ["pending", "in_progress", "done"]}, | |
| "priority": {"type": "string", "enum": ["low", "medium", "high"]}, | |
| "assignee_email": {"type": "string", "format": "email"}, | |
| }, | |
| }, | |
| }, | |
| { | |
| "method": "GET", | |
| "path": "/tasks/{id}", | |
| "summary": "Get a specific task by ID.", | |
| "parameters": [{"name": "id", "in": "path", "type": "integer", "required": True}], | |
| }, | |
| { | |
| "method": "PUT", | |
| "path": "/tasks/{id}", | |
| "summary": "Update a task. All fields optional.", | |
| "parameters": [{"name": "id", "in": "path", "type": "integer", "required": True}], | |
| "request_body": { | |
| "properties": { | |
| "title": {"type": "string"}, | |
| "description": {"type": "string"}, | |
| "status": {"type": "string"}, | |
| "priority": {"type": "string"}, | |
| "assignee_email": {"type": "string", "format": "email"}, | |
| }, | |
| }, | |
| }, | |
| { | |
| "method": "DELETE", | |
| "path": "/tasks/{id}", | |
| "summary": "Delete a task by ID.", | |
| "parameters": [{"name": "id", "in": "path", "type": "integer", "required": True}], | |
| }, | |
| { | |
| "method": "GET", | |
| "path": "/users", | |
| "summary": "List all users.", | |
| }, | |
| { | |
| "method": "POST", | |
| "path": "/users", | |
| "summary": "Create a new user. Requires username, email, password.", | |
| "request_body": { | |
| "required": ["username", "email", "password"], | |
| "properties": { | |
| "username": {"type": "string"}, | |
| "email": {"type": "string", "format": "email"}, | |
| "password": {"type": "string"}, | |
| "role": {"type": "string", "enum": ["user", "admin"]}, | |
| }, | |
| }, | |
| }, | |
| { | |
| "method": "GET", | |
| "path": "/users/{id}", | |
| "summary": "Get a specific user by ID.", | |
| "parameters": [{"name": "id", "in": "path", "type": "integer", "required": True}], | |
| }, | |
| { | |
| "method": "POST", | |
| "path": "/auth/login", | |
| "summary": "Login and receive an auth token. Pre-seeded users: alice, bob, charlie (password: any string).", | |
| "request_body": { | |
| "required": ["username", "password"], | |
| "properties": { | |
| "username": {"type": "string"}, | |
| "password": {"type": "string"}, | |
| }, | |
| }, | |
| }, | |
| ] | |
| class APITestEnvironment(Environment): | |
| """OpenEnv environment for API integration testing. | |
| The agent tests a deliberately buggy REST API by sending HTTP requests | |
| and analyzing responses. It earns rewards for coverage, finding bugs, | |
| and exploring edge cases. | |
| """ | |
| SUPPORTS_CONCURRENT_SESSIONS = False | |
| def __init__(self, **kwargs): | |
| super().__init__(**kwargs) | |
| self._db: Optional[Database] = None | |
| self._api: Optional[TestClient] = None | |
| self._bug_detector: Optional[BugDetector] = None | |
| self._reward_computer: Optional[RewardComputer] = None | |
| self._task: Optional[dict] = None | |
| self._found_bugs: set[str] = set() | |
| self._steps_taken: int = 0 | |
| self._cumulative_reward: float = 0.0 | |
| self._action_history: list[dict] = [] | |
| self._auth_tokens: dict[str, str] = {} | |
| self._episode_id: str = "" | |
| def reset(self, seed=None, episode_id=None, **kwargs) -> APITestObservation: | |
| """Reset the environment for a new episode. | |
| Args: | |
| seed: Random seed for domain randomization. When provided, the | |
| database is populated with different users, tasks, and data | |
| so each training episode is unique. None = fixed default data. | |
| episode_id: Optional episode identifier for tracking. | |
| kwargs: | |
| task_id: str - one of "basic_validation", "edge_cases", "security_workflows" | |
| """ | |
| task_id = kwargs.get("task_id", "basic_validation") | |
| if task_id not in TASKS: | |
| task_id = "basic_validation" | |
| self._task = TASKS[task_id] | |
| self._seed = seed | |
| self._episode_id = episode_id or f"ep_{int(time.time())}" | |
| # Reset database with seed for domain randomization | |
| # seed=None → fixed data (manual testing / Gradio) | |
| # seed=int → randomized data (GRPO training) | |
| self._db = Database(seed=seed) | |
| buggy_app = create_buggy_api(self._db) | |
| self._api = TestClient(buggy_app, raise_server_exceptions=False) | |
| # Build dynamic task description that includes actual usernames | |
| user_names = self._db.user_names | |
| user_list = ", ".join(user_names) | |
| dynamic_description = ( | |
| f"{self._task['description']} " | |
| f"Users in the system: {user_list} (use any password to login)." | |
| ) | |
| # Reset tracking | |
| self._bug_detector = BugDetector(task_id) | |
| self._reward_computer = RewardComputer() | |
| self._found_bugs = set() | |
| self._steps_taken = 0 | |
| self._cumulative_reward = 0.0 | |
| self._action_history = [] | |
| self._auth_tokens = {} | |
| logger.info(f"Reset environment: task={task_id}, seed={seed}, episode={self._episode_id}") | |
| return APITestObservation( | |
| available_endpoints=API_SPEC, | |
| status_code=0, | |
| response_body=None, | |
| response_headers={}, | |
| response_time_ms=0, | |
| feedback=( | |
| f"Environment reset. Task: {dynamic_description} " | |
| f"You have {self._task['max_steps']} steps. Start testing the API!" | |
| ), | |
| bugs_found_so_far=0, | |
| coverage_summary=self._reward_computer.coverage.summary(), | |
| known_resource_ids=self._reward_computer.created_ids, | |
| auth_tokens=self._auth_tokens, | |
| task_id=task_id, | |
| task_description=dynamic_description, | |
| steps_taken=0, | |
| max_steps=self._task["max_steps"], | |
| done=False, | |
| reward=0.0, | |
| ) | |
| def step(self, action: APITestAction, timeout_s=None, **kwargs) -> APITestObservation: | |
| """Execute an API test action and return observation + reward.""" | |
| self._steps_taken += 1 | |
| # Forward request to buggy API | |
| method = action.method.value if hasattr(action.method, "value") else str(action.method) | |
| endpoint = action.endpoint | |
| headers = dict(action.headers) if action.headers else {} | |
| query_params = dict(action.query_params) if action.query_params else {} | |
| body = action.body | |
| # Make the request | |
| start_time = time.time() | |
| try: | |
| response = self._api.request( | |
| method=method.upper(), | |
| url=endpoint, | |
| headers=headers, | |
| params=query_params if query_params else None, | |
| json=body, | |
| ) | |
| elapsed_ms = (time.time() - start_time) * 1000 | |
| response_status = response.status_code | |
| try: | |
| response_body = response.json() | |
| except Exception: | |
| response_body = response.text | |
| response_headers = dict(response.headers) | |
| except Exception as e: | |
| elapsed_ms = (time.time() - start_time) * 1000 | |
| response_status = 0 | |
| response_body = {"error": str(e)} | |
| response_headers = {} | |
| # Track auth tokens from login responses | |
| if ( | |
| endpoint == "/auth/login" | |
| and response_status == 200 | |
| and isinstance(response_body, dict) | |
| and "token" in response_body | |
| ): | |
| username = body.get("username", "unknown") if body else "unknown" | |
| self._auth_tokens[username] = response_body["token"] | |
| # Check for bug detection | |
| detection = self._bug_detector.check( | |
| method=method, | |
| endpoint=endpoint, | |
| headers=headers, | |
| query_params=query_params, | |
| body=body, | |
| expected_status=action.expected_status, | |
| response_status=response_status, | |
| response_body=response_body, | |
| action_history=self._action_history, | |
| found_bugs=self._found_bugs, | |
| ) | |
| bug_severity = None | |
| bug_id = None | |
| if detection: | |
| bug_severity = detection.bug.severity | |
| bug_id = detection.bug.id | |
| self._found_bugs.add(bug_id) | |
| # Compute reward | |
| reward_breakdown = self._reward_computer.compute( | |
| method=method, | |
| endpoint=endpoint, | |
| headers=headers, | |
| query_params=query_params, | |
| body=body, | |
| expected_status=action.expected_status, | |
| response_status=response_status, | |
| response_body=response_body, | |
| bug_found=bug_severity, | |
| bug_id=bug_id, | |
| ) | |
| self._cumulative_reward += reward_breakdown.total | |
| # Record action in history | |
| self._action_history.append({ | |
| "method": method, | |
| "endpoint": endpoint, | |
| "headers": headers, | |
| "query_params": query_params, | |
| "body": body, | |
| "response_status": response_status, | |
| "response_body": response_body, | |
| }) | |
| # Generate feedback | |
| feedback_parts = [f"{method} {endpoint} -> {response_status}"] | |
| if detection: | |
| feedback_parts.append(f"BUG FOUND ({detection.bug.severity})! {detection.evidence}") | |
| if reward_breakdown.coverage > 0: | |
| feedback_parts.append(f"Coverage +{reward_breakdown.coverage:.2f}") | |
| if reward_breakdown.penalty < 0: | |
| feedback_parts.append("Repeated request penalty") | |
| done = self._steps_taken >= self._task["max_steps"] | |
| # Compute final grade if done | |
| if done: | |
| grade = TaskGrader.grade( | |
| task_id=self._task["id"], | |
| bugs_found=self._found_bugs, | |
| coverage_pct=self._reward_computer.coverage.summary()["coverage_pct"], | |
| endpoints_tested=len(self._reward_computer.coverage.endpoints_hit), | |
| total_endpoints=self._reward_computer.coverage.total_endpoints, | |
| method_endpoint_pairs=len(self._reward_computer.coverage.method_endpoint_pairs), | |
| status_codes_seen=self._reward_computer.coverage.status_codes_seen, | |
| action_history=self._action_history, | |
| created_resources=self._reward_computer.created_ids, | |
| ) | |
| # Generate bug bounty report | |
| report = generate_bug_report(list(self._found_bugs), self._action_history) | |
| feedback_parts.append( | |
| f"\n=== EPISODE COMPLETE ===\n" | |
| f"Final Score: {grade.score:.4f}\n" | |
| f"Bugs Found: {len(self._found_bugs)}/{self._task['total_bugs']}\n" | |
| f"Grade Breakdown: {json.dumps(grade.breakdown, indent=2)}\n" | |
| f"Feedback: {grade.feedback}\n\n" | |
| f"{report}" | |
| ) | |
| # Add grade as bonus on top of step reward (not replacement) | |
| final_reward = reward_breakdown.total + grade.score | |
| else: | |
| final_reward = reward_breakdown.total | |
| return APITestObservation( | |
| available_endpoints=API_SPEC, | |
| status_code=response_status, | |
| response_body=response_body, | |
| response_headers={k: v for k, v in list(response_headers.items())[:20]}, | |
| response_time_ms=round(elapsed_ms, 2), | |
| feedback=" | ".join(feedback_parts), | |
| bugs_found_so_far=len(self._found_bugs), | |
| coverage_summary=self._reward_computer.coverage.summary(), | |
| known_resource_ids=self._reward_computer.created_ids, | |
| auth_tokens=self._auth_tokens, | |
| task_id=self._task["id"], | |
| task_description=self._task["description"], | |
| steps_taken=self._steps_taken, | |
| max_steps=self._task["max_steps"], | |
| done=done, | |
| reward=final_reward, | |
| metadata={"reward_breakdown": reward_breakdown.as_dict()}, | |
| ) | |
| def state(self) -> APITestState: | |
| """Return current episode state.""" | |
| if not self._task: | |
| return APITestState() | |
| coverage = self._reward_computer.coverage.summary() if self._reward_computer else {} | |
| return APITestState( | |
| episode_id=self._episode_id, | |
| step_count=self._steps_taken, | |
| task_id=self._task["id"], | |
| task_description=self._task["description"], | |
| difficulty=self._task["difficulty"], | |
| steps_taken=self._steps_taken, | |
| max_steps=self._task["max_steps"], | |
| bugs_found=len(self._found_bugs), | |
| total_bugs=self._task["total_bugs"], | |
| bugs_found_ids=list(self._found_bugs), | |
| coverage_pct=coverage.get("coverage_pct", 0.0), | |
| endpoints_tested=coverage.get("endpoints_tested", 0), | |
| total_endpoints=coverage.get("total_endpoints", 0), | |
| current_score=0.0, | |
| cumulative_reward=round(self._cumulative_reward, 4), | |
| ) | |