#!/usr/bin/env python3
"""
Gradio UI for the API Testing Environment.
"""
import json
import os
import time
import argparse
from dataclasses import dataclass, field
from typing import Optional
import gradio as gr
from models import APITestAction, APITestObservation, HTTPMethod
from server.environment import APITestEnvironment, TASKS, API_SPEC
# =====================================================================
# Editorial blog-style documentation rendered below the playground.
# build_ui() passes this string to gr.HTML(BLOG_HTML), so the text must be bound
# to the module-level name BLOG_HTML for the UI to build.
# NOTE(review): only the text content is present in this literal. The inline
# markup/styling that the original comment referred to ("Uses an inline ...")
# is not in this file and should be restored from the original source.
BLOG_HTML = """
Where agents learn to break APIs.
An OpenEnv reinforcement learning environment for API security testing. A live REST API with thirteen planted vulnerabilities, a verifiable reward function mapped to the OWASP API Security Top 10, and an episode that ends with a structured bug report.
01The premise
What this is.
A Gradio playground for an OpenEnv RL environment that trains AI agents to test REST APIs the way a security engineer would. Behind the UI is a Task Management API with 13 deliberately planted bugs covering 6 categories from the OWASP API Security Top 10.
The agent connects, sends HTTP requests, earns rewards for finding bugs and covering endpoints, and generates a bug bounty report when the episode ends.
Real API. Real bugs. Real OWASP categories — verifiable end to end.
02The gap
Why bother.
Every team ships APIs and every API has bugs. The usual tools Postman Schemathesis OWASP ZAP either need humans writing tests by hand or fall back to brute-force fuzzing.
This environment is the benchmark.
The agent doesn't get a written test plan. It reads the API spec, plans a campaign, runs it, and reports what broke. The reward function is verifiable — no LLM judge, no soft heuristics — and every signal maps to a real OWASP category, so episodes can be scored deterministically.
03How reward works
Five signals,
one episode.
The reward function is verifiable — no LLM judge, no soft heuristics. Each step accumulates from five components and the task grader caps the episode with a terminal score in [0, 1].
Bug discovery
+0.10 / +0.15 / +0.25
Finding a planted bug, scaled by severity. Easy bugs (status codes, missing fields) are worth 0.10. Medium (validation, auth) gets 0.15. Hard (BOLA, injection, broken auth chains) gets 0.25.
Hitting endpoints, methods, and status codes the agent hasn't tried yet.
Well-formed requests, plus chaining IDs from previous responses.
Trying genuinely novel action patterns the agent hasn't tried before.
Repeating the same exact request twice — anti-spam, anti-loop.
04How to use this
Five steps
to a verdict.
Pick a task
Three difficulty tiers in the dropdown on the left, from a CRUD smoke-test to a full BOLA + injection chain.
basic_validation
edge_cases
security_workflows
Reset the environment
Every reset spins up a fresh database with new users, new tasks, and randomized ownership, so the agent can't memorize answers between episodes.
Run a baseline
The Run Baseline Agent tab is open by default. Pick a strategy and watch it test the API step by step.
random
sequential
smart
Or test manually
Switch to Manual Testing. Quick Actions give one-click bug hunts, or craft your own request from scratch — method, endpoint, headers, body, expected status.
Watch the panel
Discovered Bugs and the Activity Log update live as the agent works. When the episode ends, expand the Bug Report (OWASP) drawer for the full structured findings, severities, and fix recommendations.
05Under the hood
Three layers.
Self-contained, reproducible, and runs on a free-tier HuggingFace Space.
L1 · ENVIRONMENT
FastAPI + SQLite
A buggy Task Management API wrapped in OpenEnv's step() / reset() / state() contract. Runs in-process or as a Docker image, with seed-randomized data on every reset so episodes can't be memorized.
L2 · INFERENCE
OpenAI-compatible client
inference.py talks to any HuggingFace-hosted model through the OpenAI SDK and structured JSON output. Plug in any model that follows the protocol — no environment-specific glue.
L3 · DEPLOY
Docker + HF Spaces
Containerized on top of the official openenv-base image and deployed as a public HuggingFace Space, so judges can hit it with a single HTTP call.
06The artifacts
Everything reproducible.
Source code, deployed environment, framework. Open and inspectable.
"""
@dataclass
class SessionState:
    """Everything the UI tracks for one playground session.

    The callbacks in this module (reset_env, send_request, run_baseline_agent)
    receive an instance, mutate it in place, and return it as their first output.
    """

    # Environment driven by this session; each SessionState builds its own.
    env: APITestEnvironment = field(default_factory=APITestEnvironment)
    # Set to True by reset_env; the request/agent callbacks refuse to run before that.
    initialized: bool = False
    # Key into TASKS for the task selected at the last reset.
    task_id: str = ""
    # One dict per executed step: step, method, endpoint, status, reward, bugs.
    step_log: list[dict] = field(default_factory=list)
    # Running sum of the per-step rewards since the last reset.
    total_reward: float = 0.0
    # Most recent observation returned by env.reset() / env.step().
    last_obs: Optional[APITestObservation] = None
def new_session():
    """Return a brand-new SessionState built entirely from its field defaults."""
    session = SessionState()
    return session
# =====================================================================
# Core logic
# =====================================================================
def _generate_report(bug_ids, action_history):
    """Build the OWASP bug bounty report for the bugs discovered so far.

    Thin wrapper that forwards both arguments unchanged to
    ``server.graders.generate_bug_report`` and returns its result.
    """
    # Imported at call time rather than at module import, as in the original.
    from server.graders import generate_bug_report

    report = generate_bug_report(bug_ids, action_history)
    return report
def reset_env(task_id, state):
    """Start a fresh episode for ``task_id`` and return the initial UI values.

    A falsy ``state`` is replaced with a new session. The session's environment
    is reset, its bookkeeping fields are cleared, and a 13-tuple is returned in
    the order of ``reset_outputs`` in build_ui(): session, status, feedback,
    response, reward, bugs, coverage, log, steps, bug list, bug report,
    auth tokens, resources.
    """
    session = state or new_session()

    first_obs = session.env.reset(task_id=task_id)
    session.initialized = True
    session.task_id = task_id
    session.step_log = []
    session.total_reward = 0.0
    session.last_obs = first_obs

    task = TASKS[task_id]
    status_md = f"Environment reset. Task: **{task_id}** ({task['difficulty']})\n\nMax steps: {task['max_steps']} | Bugs to find: {task['total_bugs']}"

    outputs = [
        session,
        status_md,
        first_obs.feedback,
        "",  # response panel starts empty
        format_reward_display(0, 0, {}),
        f"0 / {task['total_bugs']}",
        format_coverage(first_obs.coverage_summary),
        "",  # activity log starts empty
        f"0 / {task['max_steps']}",
        "No bugs found yet.",
        "No bugs found yet. Send requests to discover vulnerabilities.",
        "No tokens acquired yet.",
        "No resources created yet.",
    ]
    return tuple(outputs)
def _parse_json_field(raw, default):
    """Decode one JSON text field from the request form.

    Returns ``default`` when ``raw`` is None or blank, otherwise the decoded
    JSON value. Raises ``json.JSONDecodeError`` when the text is not valid JSON.
    """
    text = (raw or "").strip()
    return json.loads(text) if text else default


def send_request(method, endpoint, headers_str, params_str, body_str, expected_status, state):
    """Execute one manually crafted request against the session's environment.

    Args:
        method: HTTP method name accepted by ``HTTPMethod``.
        endpoint: Request path, e.g. ``/tasks``.
        headers_str: JSON object text for headers (blank means ``{}``).
        params_str: JSON object text for query params (blank means ``{}``).
        body_str: JSON text for the body (blank means no body).
        expected_status: Expected status code as text (blank means none).
        state: The session's SessionState, or a falsy value if none exists yet.

    Returns:
        A 12-tuple in the order of ``step_outputs`` in build_ui(): session,
        feedback, response, reward, bugs, coverage, log, steps, bug list,
        bug report, auth tokens, resources. On any input error the feedback
        slot carries the message and the remaining ten slots are empty strings.
    """

    def fail(message):
        # Same shape as a successful result so Gradio can map it onto the outputs.
        return (state, message) + ("",) * 10

    if not state or not state.initialized:
        return fail("Environment not initialized. Click 'Reset' first.")

    try:
        headers = _parse_json_field(headers_str, {})
    except json.JSONDecodeError:
        return fail("Invalid JSON in headers.")
    if not isinstance(headers, dict):
        # Valid JSON such as `[]` or `"x"` is still not a usable header mapping.
        return fail("Headers must be a JSON object.")

    try:
        query_params = _parse_json_field(params_str, {})
    except json.JSONDecodeError:
        return fail("Invalid JSON in query params.")
    if not isinstance(query_params, dict):
        return fail("Query params must be a JSON object.")

    try:
        body = _parse_json_field(body_str, None)
    except json.JSONDecodeError:
        return fail("Invalid JSON in body.")

    # Previously a non-numeric value here raised an unhandled ValueError.
    status_text = "" if expected_status is None else str(expected_status).strip()
    try:
        exp = int(status_text) if status_text else None
    except ValueError:
        return fail("Expected status must be an integer HTTP status code (e.g. 200).")

    action = APITestAction(
        method=HTTPMethod(method), endpoint=endpoint,
        headers=headers, query_params=query_params,
        body=body, expected_status=exp,
    )
    obs = state.env.step(action)
    reward = obs.reward or 0.0
    state.total_reward += reward
    state.last_obs = obs

    resp_body = obs.response_body
    if isinstance(resp_body, (dict, list)):
        resp_str = json.dumps(resp_body, indent=2)
    else:
        resp_str = str(resp_body)

    state.step_log.append({
        "step": obs.steps_taken, "method": method, "endpoint": endpoint,
        "status": obs.status_code, "reward": round(reward, 4), "bugs": obs.bugs_found_so_far,
    })

    breakdown = obs.metadata.get("reward_breakdown", {})
    reward_detail = format_reward_display(reward, state.total_reward, breakdown)
    t = TASKS[state.task_id]
    es = state.env.state

    status = ""
    if obs.done:
        # The reward of the final step is what gets reported as the final score.
        status = (
            f"\n\n**EPISODE COMPLETE**\n\n"
            f"Final Score: {reward:.4f}\n"
            f"Bugs: {obs.bugs_found_so_far}/{t['total_bugs']}\n"
            f"Steps: {obs.steps_taken}/{obs.max_steps}"
        )

    return (
        state,
        obs.feedback + status,
        f"**{obs.status_code}** — {obs.response_time_ms:.1f}ms\n\n```json\n{resp_str}\n```",
        reward_detail,
        f"{obs.bugs_found_so_far} / {t['total_bugs']}",
        format_coverage(obs.coverage_summary),
        format_log(state.step_log),
        f"{obs.steps_taken} / {obs.max_steps}" + (" (DONE)" if obs.done else ""),
        format_bug_list(es.bugs_found_ids),
        _generate_report(es.bugs_found_ids, state.step_log),
        format_auth_tokens(obs.auth_tokens),
        format_resources(obs.known_resource_ids),
    )
def apply_quick_action(action_name, _state):
    """Translate a Quick Actions dropdown label into request form values.

    Returns a 6-tuple (method, endpoint, headers JSON, query params JSON,
    body JSON, expected status) for a known label. For an empty or unknown
    label, returns six ``gr.update()`` objects so the form is left untouched.
    """
    no_params = "{}"
    presets = {
        "GET /tasks": ("GET", "/tasks", no_params, no_params, "", "200"),
        "GET /users": ("GET", "/users", no_params, no_params, "", "200"),
        "GET /tasks/1": ("GET", "/tasks/1", no_params, no_params, "", "200"),
        "GET /tasks/999999 (bug hunt)": ("GET", "/tasks/999999", no_params, no_params, "", "404"),
        "POST create task": ("POST", "/tasks", no_params, no_params, '{"title": "Test Task", "description": "Created via UI"}', "201"),
        "POST missing title (bug hunt)": ("POST", "/tasks", no_params, no_params, '{"description": "no title"}', "400"),
        "Login as alice": ("POST", "/auth/login", no_params, no_params, '{"username": "alice", "password": "pass"}', "200"),
        "Login as bob": ("POST", "/auth/login", no_params, no_params, '{"username": "bob", "password": "pass"}', "200"),
        "Login empty pwd (bug hunt)": ("POST", "/auth/login", no_params, no_params, '{"username": "alice", "password": ""}', "401"),
        "Negative page (bug hunt)": ("GET", "/tasks", no_params, '{"page": -1, "limit": 10}', "", "400"),
        "Huge limit (bug hunt)": ("GET", "/tasks", no_params, '{"limit": 999999}', "", "200"),
        "Invalid email PUT (bug hunt)": ("PUT", "/tasks/1", no_params, no_params, '{"assignee_email": "not-an-email"}', "422"),
        "DELETE non-existent (bug hunt)": ("DELETE", "/tasks/99999", no_params, no_params, "", "404"),
        "Create user invalid email (bug)": ("POST", "/users", no_params, no_params, '{"username": "baduser", "email": "nope", "password": "x"}', "422"),
        "SQL injection test": ("POST", "/tasks", no_params, no_params, '{"title": "test\'; DROP TABLE tasks;--"}', "201"),
        "Long title crash (bug hunt)": ("POST", "/tasks", no_params, no_params, '{"title": "' + "A" * 6000 + '"}', "400"),
    }

    preset = presets.get(action_name) if action_name else None
    if preset is None:
        # Nothing selected (or an unrecognised label): leave all six inputs as they are.
        return [gr.update()] * 6
    return preset
def run_baseline_agent(agent_type, state):
    """Run a baseline agent for one full episode, yielding UI updates per step.

    This is a generator: after every environment step it yields a 12-tuple in
    the order of ``step_outputs`` in build_ui() and then sleeps 0.3 seconds.
    If the session is missing or not initialized, it yields a single
    "not initialized" tuple and stops.

    ``agent_type`` must be one of "random", "sequential" or "smart".
    The environment is reset for the session's current task before the run.
    """
    if not (state and state.initialized):
        yield (state, "Environment not initialized.") + ("",) * 10
        return

    # Imported at call time rather than at module import, as in the original.
    from training.agents import RandomAgent, SequentialAgent, SmartAgent

    agent_classes = {"random": RandomAgent, "sequential": SequentialAgent, "smart": SmartAgent}
    agent = agent_classes[agent_type]()
    task = TASKS[state.task_id]

    obs = state.env.reset(task_id=state.task_id)
    state.step_log = []
    state.total_reward = 0.0
    state.last_obs = obs

    # Observation attributes handed to the agent, as a plain dict in this key order.
    observed_fields = (
        "status_code", "response_body", "feedback", "bugs_found_so_far",
        "coverage_summary", "known_resource_ids", "auth_tokens",
        "steps_taken", "max_steps",
    )

    while not obs.done:
        action = agent.act({name: getattr(obs, name) for name in observed_fields})
        obs = state.env.step(action)

        step_reward = obs.reward or 0.0
        state.total_reward += step_reward
        state.last_obs = obs

        # The method may be an enum member or already a plain string.
        if hasattr(action.method, "value"):
            method_name = action.method.value
        else:
            method_name = str(action.method)

        state.step_log.append({
            "step": obs.steps_taken, "method": method_name, "endpoint": action.endpoint,
            "status": obs.status_code, "reward": round(step_reward, 4), "bugs": obs.bugs_found_so_far,
        })

        payload = obs.response_body
        body_text = json.dumps(payload, indent=2) if isinstance(payload, (dict, list)) else str(payload)

        reward_panel = format_reward_display(
            step_reward, state.total_reward, obs.metadata.get("reward_breakdown", {})
        )
        env_state = state.env.state

        headline = f"[{agent_type}] {method_name} {action.endpoint} -> {obs.status_code}"
        if obs.done:
            headline += f"\n\n**EPISODE COMPLETE** — Final Score: {step_reward:.4f} | Bugs: {obs.bugs_found_so_far}/{task['total_bugs']}"

        steps_text = f"{obs.steps_taken} / {obs.max_steps}"
        if obs.done:
            steps_text += " (DONE)"

        yield (
            state,
            headline,
            # Only the first 500 characters of the response are shown during agent runs.
            f"**{obs.status_code}**\n```json\n{body_text[:500]}\n```",
            reward_panel,
            f"{obs.bugs_found_so_far} / {task['total_bugs']}",
            format_coverage(obs.coverage_summary),
            format_log(state.step_log),
            steps_text,
            format_bug_list(env_state.bugs_found_ids),
            _generate_report(env_state.bugs_found_ids, state.step_log),
            format_auth_tokens(obs.auth_tokens),
            format_resources(obs.known_resource_ids),
        )
        # Pause between steps so the run is watchable in the UI.
        time.sleep(0.3)
# =====================================================================
# Formatters
# =====================================================================
# Build the reward panel string shown in the scoreboard.
#   step_reward -- reward of the latest step (formatted with a sign, 4 decimals)
#   cumulative  -- running total for the episode (formatted with 4 decimals)
#   breakdown   -- dict read with the keys "coverage", "validity", "bug_discovery",
#                  "exploration" and "penalty"; a missing key counts as 0
# Returns a single string: the header fragments followed by one fragment per component.
# NOTE(review): many string literals below are empty (f'') or have their closing
# quote on the following line, which is not valid Python. It looks like HTML markup
# was stripped from them; restore the literals from the original source.
def format_reward_display(step_reward, cumulative, breakdown):
"""Render reward metrics as styled HTML with explanations."""
# (label, value, tooltip text) for each of the five reward components.
components = [
("Coverage", breakdown.get("coverage", 0),
"Reward for testing new endpoints and methods"),
("Validity", breakdown.get("validity", 0),
"Reward for sending well-formed requests that return expected status codes"),
("Bug", breakdown.get("bug_discovery", 0),
"Bonus for discovering a new bug in the API"),
("Explore", breakdown.get("exploration", 0),
"Reward for trying new parameter combinations and edge cases"),
("Penalty", breakdown.get("penalty", 0),
"Deduction for repeated or invalid requests"),
]
bars = []
for label, value, tip in components:
# Green for a positive value, red for a negative one, inherited colour for zero.
# NOTE(review): val_color and tip are not referenced in the visible fragments --
# presumably they were interpolated into the stripped markup.
val_color = "#16a34a" if value > 0 else "#dc2626" if value < 0 else "inherit"
bars.append(
f''
f''
f'{label}'
f''
f'{value:+.3f}
'
)
# Same green / red / inherit rule for the two headline numbers.
cum_color = "#16a34a" if cumulative > 0 else "#dc2626" if cumulative < 0 else "inherit"
step_color = "#16a34a" if step_reward > 0 else "#dc2626" if step_reward < 0 else "inherit"
return (
f''
f'
'
f'
STEP REWARD
'
f'
'
f'{step_reward:+.4f}
'
f'
'
f'
CUMULATIVE
'
f'
'
f'{cumulative:.4f}
'
f''
f'
'
f'REWARD BREAKDOWN '
f'ⓘ
'
+ "".join(bars)
+ "
"
)
# Build the coverage panel string from a coverage summary dict.
# Returns "No data" for a falsy summary. Otherwise reads the keys "coverage_pct",
# "endpoints_tested", "total_endpoints", "method_endpoint_pairs" (each defaulting
# to 0) and "status_codes_seen" (defaulting to an empty list).
# NOTE(review): several string literals below are empty or split across lines,
# which is not valid Python. It looks like HTML markup was stripped from them;
# restore the literals from the original source.
def format_coverage(summary):
if not summary:
return "No data"
pct = summary.get("coverage_pct", 0)
tested = summary.get("endpoints_tested", 0)
total = summary.get("total_endpoints", 0)
pairs = summary.get("method_endpoint_pairs", 0)
codes = summary.get("status_codes_seen", [])
# Red below 30 percent, amber below 70 percent, green otherwise.
# NOTE(review): pct and color are not referenced in the visible fragments --
# presumably they were interpolated into the stripped progress-bar markup.
color = "#dc2626" if pct < 30 else "#d97706" if pct < 70 else "#16a34a"
bar_html = (
f''
)
code_pills = ""
for c in codes:
# 2xx green, 3xx amber, everything else red.
cc = "#16a34a" if 200 <= c < 300 else "#d97706" if 300 <= c < 400 else "#dc2626"
code_pills += (
f'{c}'
)
return (
f"{bar_html}"
f''
f'
'
f'
ENDPOINTS
'
f'
{tested}/{total}
'
f'
'
f'
METHOD+PATH
'
f'
{pairs}
'
f''
f'STATUS CODES SEEN '
f'{code_pills}
'
)
# Build the Activity Log string from the session's step log.
# `log` is a list of dicts read with the keys "step", "method", "endpoint",
# "status" and "reward". An empty log returns only an explanatory blurb.
# Only the last 20 entries are rendered; when more exist, a line stating how
# many earlier steps were left out is placed between the header and the rows.
# NOTE(review): several string literals below are empty or split across lines,
# which is not valid Python. It looks like HTML markup was stripped from them;
# restore the literals from the original source.
def format_log(log):
if not log:
return (
''
"Each row shows an API request the agent made, the HTTP status it got back, "
"and the reward earned. Green = positive reward, red = penalty."
"
"
)
# Colour per HTTP method; unknown methods fall back to grey in the loop below.
method_colors = {
"GET": "#2563eb", "POST": "#16a34a", "PUT": "#d97706",
"DELETE": "#dc2626", "PATCH": "#9333ea",
}
rows = []
for entry in log[-20:]:
m = entry["method"]
mcol = method_colors.get(m, "#6b7280")
r = entry["reward"]
rcol = "#16a34a" if r > 0 else "#dc2626" if r < 0 else "inherit"
# A step is tagged as a bug find purely by its reward exceeding 0.2.
bug_tag = (
'BUG FOUND'
) if r > 0.2 else ""
status = entry["status"]
# 2xx green, 3xx amber, everything else red.
scol = "#16a34a" if 200 <= status < 300 else "#d97706" if 300 <= status < 400 else "#dc2626"
rows.append(
f''
f'{entry["step"]}'
f'{m}'
f'{entry["endpoint"]}'
f'{status}'
f'{r:+.3f}{bug_tag}
'
)
omitted = ""
if len(log) > 20:
omitted = (
f''
f'... {len(log) - 20} earlier steps not shown
'
)
header = (
''
"API requests made by the agent. Each row: step number, HTTP method, "
"endpoint, status code, and reward earned.
"
''
'#'
'Method'
'Endpoint'
'Status'
'Reward
'
)
return header + omitted + "\n".join(rows)
# Build the Discovered Bugs string from a collection of bug IDs.
# Returns "No bugs found yet." for a falsy input. Otherwise looks each ID up in
# BugDetector("security_workflows").bugs, in sorted ID order, and joins one
# fragment per bug with newlines. IDs missing from that mapping are skipped.
# Each bug object is read for .severity, .owasp and .description.
# NOTE(review): presumably the "security_workflows" detector holds every bug for
# all tasks, which is why it is used regardless of the active task -- confirm.
# NOTE(review): several string literals below are empty or split across lines,
# which is not valid Python. It looks like HTML markup was stripped from them;
# restore the literals from the original source.
def format_bug_list(bug_ids):
if not bug_ids:
return "No bugs found yet."
# Imported at call time rather than at module import.
from server.bug_detector import BugDetector
detector = BugDetector("security_workflows")
severity_colors = {
"easy": "#16a34a",
"medium": "#d97706",
"hard": "#dc2626",
}
cards = []
for bid in sorted(bug_ids):
bug = detector.bugs.get(bid)
if bug:
# Unknown severities fall back to grey.
fg = severity_colors.get(bug.severity, "#6b7280")
# The badge shows only the text before the first space of bug.owasp.
owasp_badge = f' | {bug.owasp.split(" ")[0]}' if bug.owasp else ""
cards.append(
f''
f'
'
f'{bid}'
f'{bug.severity.upper()}{owasp_badge}
'
f'
'
f'{bug.description}
'
f'
'
f'{bug.owasp}
'
)
return "\n".join(cards)
# Build the auth-token section of the Session Context panel.
# `tokens` is iterated with .items() as (user, token) pairs; a falsy value
# returns a hint about logging in via POST /auth/login.
# Only the first 20 characters of each token are shown, followed by "...".
# NOTE(review): several string literals below are empty or split across lines,
# which is not valid Python. It looks like HTML markup was stripped from them;
# restore the literals from the original source.
def format_auth_tokens(tokens):
if not tokens:
return (
''
"No tokens yet. Login via POST /auth/login to get auth tokens "
"for testing protected endpoints.
"
)
cards = []
for user, token in tokens.items():
cards.append(
f''
f'{user}'
f'{token[:20]}...
'
)
return (
''
"AUTHENTICATED USERS
"
+ "".join(cards)
)
# Build the created-resources section of the Session Context panel.
# `ids` is iterated with .items() as (resource type, ids) pairs; a falsy value
# returns a hint about creating resources with POST endpoints.
# A list of IDs is rendered comma-separated; any other value is passed to str().
# NOTE(review): several string literals below are empty or split across lines,
# which is not valid Python. It looks like HTML markup was stripped from them;
# restore the literals from the original source.
def format_resources(ids):
if not ids:
return (
''
"No resources created. Use POST endpoints to create tasks or users "
"and track their IDs here.
"
)
sections = []
type_colors = {"tasks": "#d97706", "users": "#2563eb"}
for rtype, id_list in ids.items():
# Resource types other than "tasks" and "users" fall back to grey.
# NOTE(review): color is not referenced in the visible fragments --
# presumably it was interpolated into the stripped markup.
color = type_colors.get(rtype, "#6b7280")
ids_str = ", ".join(str(i) for i in id_list) if isinstance(id_list, list) else str(id_list)
sections.append(
f''
f'{rtype}'
f'IDs: {ids_str}
'
)
return (
''
"CREATED RESOURCES
"
+ "".join(sections)
)
def format_endpoints():
    """Render API_SPEC as Markdown, one paragraph per endpoint.

    Each entry becomes a bold method, the path in backticks, and the entry's
    ``summary`` (an empty string when the key is absent).
    """
    return "\n\n".join(
        f"**{spec['method']}** `{spec['path']}` — {spec.get('summary', '')}"
        for spec in API_SPEC
    )
# =====================================================================
# UI
# =====================================================================
# Shared theme object: handed to gr.Blocks() in build_ui() and, when the script
# is run directly, to launch() as well.
_GRADIO_THEME = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="green",
    neutral_hue="slate",
    # Each font list starts with a Google font and ends with generic fallbacks.
    font=[
        gr.themes.GoogleFont("Inter"),
        "system-ui",
        "sans-serif",
    ],
    font_mono=[
        gr.themes.GoogleFont("IBM Plex Mono"),
        "ui-monospace",
        "monospace",
    ],
)
# Custom CSS injected into the Gradio app to highlight important interactive
# elements (primary buttons, active tabs, hover states) so the playground
# doesn't feel washed out. Works in both light and dark mode: every rule group
# has a `.dark`-prefixed counterpart that overrides it in dark mode.
# The string is passed as `css=` to gr.Blocks() in build_ui() and, when the
# script is run directly, to launch() as well.
# Everything between the triple quotes is sent to the browser verbatim, so the
# /* ... */ comments inside are CSS comments, not Python comments.
_GRADIO_CSS = """
/* ─── Mintlify-inspired green palette (flat, no gradients) ─── */
:root {
--accent: #18E299; /* Brand Green */
--accent-hover: #0fa76e; /* Brand Green Deep */
--accent-soft: #d4fae8; /* Brand Green Light */
--accent-border: rgba(15, 167, 110, 0.28);
--ink: #0d0d0d;
--ink-muted: #666666;
--line: #e5e5e5;
--surface: #fafafa;
--success: #16a34a;
--danger: #dc2626;
--info: #2563eb;
}
.dark {
--accent: #18E299;
--accent-hover: #34efaa;
--accent-soft: rgba(24, 226, 153, 0.14);
--accent-border: rgba(24, 226, 153, 0.35);
--ink: #f5f5f5;
--ink-muted: #a0a0a0;
--line: rgba(255, 255, 255, 0.10);
--surface: #141414;
}
/* ─── Primary buttons ─────────────────────────────────────────────
Light mode: near-black surface, white text, green hover.
Dark mode: bright green surface, near-black text.
Both flat (no gradients, no glow). */
button.primary,
.gr-button.primary,
button[class*="primary"],
.gr-button-primary {
background: #0d0d0d !important;
background-image: none !important;
color: #ffffff !important;
border: 1px solid #0d0d0d !important;
box-shadow: none !important;
font-weight: 600 !important;
letter-spacing: 0.01em !important;
transition: background-color 0.15s ease, border-color 0.15s ease, color 0.15s ease !important;
}
button.primary:hover,
.gr-button.primary:hover,
button[class*="primary"]:hover,
.gr-button-primary:hover {
background: #0fa76e !important;
background-image: none !important;
color: #ffffff !important;
border-color: #0fa76e !important;
box-shadow: none !important;
transform: none !important;
filter: none !important;
}
button.primary:active,
.gr-button.primary:active,
.gr-button-primary:active {
background: #0a8a5a !important;
border-color: #0a8a5a !important;
transform: none !important;
filter: none !important;
}
/* Dark-mode override: bright green CTA pops against the dark surface */
.dark button.primary,
.dark .gr-button.primary,
.dark button[class*="primary"],
.dark .gr-button-primary {
background: #18E299 !important;
color: #07301f !important;
border: 1px solid #18E299 !important;
}
.dark button.primary:hover,
.dark .gr-button.primary:hover,
.dark button[class*="primary"]:hover,
.dark .gr-button-primary:hover {
background: #34efaa !important;
border-color: #34efaa !important;
color: #07301f !important;
}
/* ─── Secondary buttons ──────────────────────────────────────────
Light mode: white with dark border, fills near-black on hover.
Dark mode: ghost button with green border. */
button.secondary,
.gr-button.secondary,
.gr-button-secondary {
background: #ffffff !important;
background-image: none !important;
border: 1px solid #0d0d0d !important;
color: #0d0d0d !important;
font-weight: 500 !important;
box-shadow: none !important;
transition: background-color 0.15s ease, color 0.15s ease, border-color 0.15s ease !important;
}
button.secondary:hover,
.gr-button.secondary:hover,
.gr-button-secondary:hover {
background: #0d0d0d !important;
color: #ffffff !important;
border-color: #0d0d0d !important;
}
.dark button.secondary,
.dark .gr-button.secondary,
.dark .gr-button-secondary {
background: transparent !important;
border: 1px solid var(--accent-border) !important;
color: var(--accent) !important;
}
.dark button.secondary:hover,
.dark .gr-button.secondary:hover,
.dark .gr-button-secondary:hover {
background: var(--accent) !important;
color: #07301f !important;
border-color: var(--accent) !important;
}
/* ─── Tabs (selected tab uses brand green) ─── */
button[role="tab"][aria-selected="true"],
.tab-nav button.selected,
.tab-nav button[aria-selected="true"] {
color: var(--accent-hover) !important;
border-bottom: 2px solid var(--accent) !important;
font-weight: 600 !important;
}
.dark button[role="tab"][aria-selected="true"],
.dark .tab-nav button.selected,
.dark .tab-nav button[aria-selected="true"] {
color: var(--accent) !important;
}
button[role="tab"]:hover,
.tab-nav button:hover {
color: var(--accent-hover) !important;
}
.dark button[role="tab"]:hover,
.dark .tab-nav button:hover {
color: var(--accent) !important;
}
/* ─── Inputs (focus ring uses brand green, no glow) ─── */
.gr-dropdown,
.gr-input,
.gr-textbox,
input[type="text"],
input[type="number"],
textarea,
select {
transition: border-color 0.15s ease, box-shadow 0.15s ease !important;
}
.gr-dropdown:focus-within,
.gr-input:focus-within,
.gr-textbox:focus-within,
input:focus,
textarea:focus,
select:focus {
border-color: var(--accent) !important;
box-shadow: 0 0 0 3px var(--accent-soft) !important;
outline: none !important;
}
/* ─── Section headings ─── */
h1, h2, h3 {
letter-spacing: -0.01em !important;
}
h1, h2 {
color: #0d0d0d !important;
}
.dark h1, .dark h2 {
color: #f5f5f5 !important;
}
h3 {
color: #0d0d0d !important;
font-weight: 600 !important;
border-bottom: 1px solid var(--line) !important;
padding-bottom: 6px !important;
margin-bottom: 12px !important;
position: relative !important;
}
/* small green accent bar before each section heading for brand identity */
h3::before {
content: "" !important;
display: inline-block !important;
width: 4px !important;
height: 14px !important;
background: #18E299 !important;
border-radius: 2px !important;
margin-right: 8px !important;
vertical-align: -2px !important;
}
.dark h3 {
color: #f5f5f5 !important;
}
/* ─── Markdown links ─── */
.prose a, .markdown a, a {
color: var(--accent-hover) !important;
text-decoration: none !important;
border-bottom: 1px solid var(--accent-border) !important;
}
.dark .prose a, .dark .markdown a, .dark a {
color: var(--accent) !important;
}
.prose a:hover, .markdown a:hover, a:hover {
border-bottom-color: var(--accent) !important;
}
/* ─── Accordion headers ─── */
.gr-accordion > button,
button[class*="accordion"] {
color: var(--accent-hover) !important;
font-weight: 600 !important;
}
.dark .gr-accordion > button,
.dark button[class*="accordion"] {
color: var(--accent) !important;
}
/* ─── Card borders (Mintlify principle: borders, not shadows) ─── */
.gr-block.gr-box {
border-color: var(--line) !important;
box-shadow: none !important;
}
/* ─── Match the Gradio dark surface to the blog section ──────────
The blog section below uses #0a0a0a as its background. Override
Gradio's default slate so the page reads as one continuous canvas. */
.dark {
--body-background-fill: #0a0a0a !important;
--background-fill-primary: #0a0a0a !important;
--background-fill-secondary: #131313 !important;
--block-background-fill: #131313 !important;
--panel-background-fill: #131313 !important;
--input-background-fill: #131313 !important;
--border-color-primary: rgba(255, 255, 255, 0.08) !important;
}
.dark,
.dark body,
.dark gradio-app,
.dark .gradio-container,
.dark .main,
.dark .wrap,
.dark .app,
.dark .contain {
background: #0a0a0a !important;
background-color: #0a0a0a !important;
}
/* Cards / blocks get a slightly lighter surface so they remain
visually separated from the page background. */
.dark .gr-block,
.dark .gr-box,
.dark .gr-form,
.dark .gr-panel,
.dark .block,
.dark .form {
background: #131313 !important;
background-color: #131313 !important;
border-color: rgba(255, 255, 255, 0.08) !important;
}
"""
# Assemble the whole Gradio Blocks app and return it (the caller launches it).
# Creates the layout components, then wires four callbacks to them:
#   Reset button and page load -> reset_env          (13 outputs, reset_outputs)
#   Send Request button        -> send_request       (12 outputs, step_outputs)
#   Load Quick Action button   -> apply_quick_action (fills the 6 request inputs)
#   Run Agent button           -> run_baseline_agent (12 outputs, step_outputs)
# NOTE(review): indentation is absent in this text, so the nesting of the `with`
# blocks cannot be read off it -- e.g. whether the Response / Feedback sections sit
# inside the Manual Testing tab or below the tabs. Restore from the original source.
# NOTE(review): several gr.Markdown calls start with an empty '' literal and the
# demo-video gr.HTML body is empty; their markup appears to have been stripped.
def build_ui():
# Mintlify-inspired green Soft theme — adapts to ?__theme=light / ?__theme=dark
# URL params on HuggingFace Spaces. The blog section below also reads the
# .dark body class so the entire page adapts together.
with gr.Blocks(title="API Testing Environment", theme=_GRADIO_THEME, css=_GRADIO_CSS) as demo:
# Session object threaded through every callback as both an input and an output.
session = gr.State(value=new_session())
gr.Markdown(
"# API Testing Environment\n"
"An OpenEnv RL environment that trains AI agents to become automated **API security testers**. "
"A simulated API server with **13 hidden vulnerabilities** mapped to the **OWASP API Security Top 10** is provided. "
"Send HTTP requests, earn rewards for finding bugs and covering endpoints, and generate a **bug bounty report** at episode end. "
"Use **Manual Testing** to craft requests yourself, or run a **Baseline Agent** to watch an automated strategy."
)
with gr.Row():
# ── Left Panel ──
with gr.Column(scale=1):
gr.Markdown("### Environment Control")
task_dropdown = gr.Dropdown(choices=list(TASKS.keys()), value="basic_validation", label="Select Task")
reset_btn = gr.Button("Reset Environment", variant="primary", size="lg")
gr.Markdown(
''
"Switch task or click Reset to start a fresh episode. "
"Resets all scores, bugs, and step count."
)
status_box = gr.Markdown("Initializing...")
gr.Markdown("---")
gr.Markdown("### Scoreboard")
gr.Markdown(
''
"Tracks your testing progress. Steps are API calls you've made; "
"bugs are issues discovered in the API; reward measures how well "
"the agent is testing."
)
with gr.Row():
# Placeholder texts; the reset_env call registered on page load overwrites them.
step_display = gr.Markdown("0 / 25", label="Steps")
bug_display = gr.Markdown("0 / 3", label="Bugs")
reward_display = gr.Markdown(format_reward_display(0, 0, {}), label="Reward")
coverage_display = gr.Markdown("No data", label="Coverage")
gr.Markdown("---")
gr.Markdown("### Session Context")
gr.Markdown(
''
"Tokens and resources gathered during this episode. "
"Use tokens to test auth-protected endpoints and resource IDs for "
"GET/PUT/DELETE requests."
)
auth_display = gr.Markdown(format_auth_tokens({}))
resource_display = gr.Markdown(format_resources({}))
gr.Markdown("---")
with gr.Accordion("API Specification", open=False):
gr.Markdown(format_endpoints())
# ── Center Panel ──
with gr.Column(scale=2):
with gr.Tabs():
with gr.Tab("Run Baseline Agent"):
gr.Markdown("### Automated Agents\nWatch a baseline agent test the API step by step. Pick a strategy and click Run Agent.")
agent_dropdown = gr.Dropdown(choices=["random", "sequential", "smart"], value="smart", label="Agent Type")
run_agent_btn = gr.Button("Run Agent", variant="primary", size="lg")
with gr.Tab("Manual Testing"):
gr.Markdown("### Craft Your Request")
with gr.Row():
method_input = gr.Dropdown(
choices=["GET", "POST", "PUT", "DELETE", "PATCH"],
value="GET", label="Method", scale=1,
)
endpoint_input = gr.Textbox(value="/tasks", label="Endpoint", placeholder="/tasks, /users/1, /auth/login", scale=3)
expected_input = gr.Textbox(value="200", label="Expected Status", placeholder="200", scale=1)
with gr.Row():
headers_input = gr.Textbox(value="{}", label="Headers (JSON)", placeholder='{"Authorization": "Bearer ..."}', lines=1)
params_input = gr.Textbox(value="{}", label="Query Params (JSON)", placeholder='{"page": 1, "limit": 10}', lines=1)
body_input = gr.Textbox(value="", label="Request Body (JSON)", placeholder='{"title": "My Task", "description": "..."}', lines=3)
send_btn = gr.Button("Send Request", variant="primary", size="lg")
gr.Markdown("### Quick Actions")
# These labels must match the keys of the preset table in apply_quick_action.
quick_actions = gr.Dropdown(
choices=[
"GET /tasks", "GET /users", "GET /tasks/1",
"GET /tasks/999999 (bug hunt)", "POST create task",
"POST missing title (bug hunt)", "Login as alice", "Login as bob",
"Login empty pwd (bug hunt)", "Negative page (bug hunt)",
"Huge limit (bug hunt)", "Invalid email PUT (bug hunt)",
"DELETE non-existent (bug hunt)", "Create user invalid email (bug)",
"SQL injection test", "Long title crash (bug hunt)",
],
label="Quick Actions", value=None,
)
quick_btn = gr.Button("Load Quick Action", variant="secondary")
gr.Markdown("---")
gr.Markdown("### Response")
response_display = gr.Markdown("")
gr.Markdown("### Feedback")
feedback_display = gr.Markdown("")
# ── Right Panel ──
# Stacked (no tabs) so Discovered Bugs and Activity Log are both
# visible at once — users shouldn't have to click to see the log.
with gr.Column(scale=1):
gr.Markdown("### Discovered Bugs")
bug_list_display = gr.Markdown("No bugs found yet.")
gr.Markdown("### Activity Log")
log_display = gr.Markdown("No steps yet.")
with gr.Accordion("Bug Report (OWASP)", open=False):
gr.Markdown("*Auto-generated OWASP security report. Populates as bugs are found.*")
bug_report_display = gr.Markdown("No bugs found yet. Send requests to discover vulnerabilities.")
# ── Demo video (embedded between the app and the blog) ──
gr.HTML(
"""
"""
)
# ── Editorial blog-style documentation below the app ──
gr.HTML(BLOG_HTML)
# ── Wiring ──
# Order matters: it must match, position by position, the 13-tuple returned by reset_env.
reset_outputs = [
session, status_box, feedback_display, response_display,
reward_display, bug_display, coverage_display, log_display,
step_display, bug_list_display, bug_report_display, auth_display, resource_display,
]
# Order matters: it must match the 12-tuples produced by send_request and run_baseline_agent.
step_outputs = [
session, feedback_display, response_display, reward_display,
bug_display, coverage_display, log_display, step_display,
bug_list_display, bug_report_display, auth_display, resource_display,
]
reset_btn.click(fn=reset_env, inputs=[task_dropdown, session], outputs=reset_outputs)
send_btn.click(
fn=send_request,
inputs=[method_input, endpoint_input, headers_input, params_input, body_input, expected_input, session],
outputs=step_outputs,
)
quick_btn.click(
fn=apply_quick_action, inputs=[quick_actions, session],
outputs=[method_input, endpoint_input, headers_input, params_input, body_input, expected_input],
)
# run_baseline_agent is a generator, so each yielded tuple updates the outputs in turn.
run_agent_btn.click(fn=run_baseline_agent, inputs=[agent_dropdown, session], outputs=step_outputs)
# Auto-reset on page load so users can start testing immediately
demo.load(fn=reset_env, inputs=[task_dropdown, session], outputs=reset_outputs)
return demo
if __name__ == "__main__":
    # Script entry point: parse CLI options, build the UI once, and launch it.
    import inspect

    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=int(os.getenv("GRADIO_SERVER_PORT", "7860")))
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--share", action="store_true")
    args = parser.parse_args()

    launch_kwargs = dict(server_name=args.host, server_port=args.port, share=args.share)

    demo = build_ui()

    # Theme and CSS are already passed to Blocks() inside build_ui(). Also pass
    # them to launch() only when the installed Gradio's launch() declares those
    # parameters. Checking the signature replaces the former
    # `try: ... except TypeError:` fallback, which also caught unrelated
    # TypeErrors raised while building or serving the app and then built the
    # whole UI a second time.
    launch_params = inspect.signature(demo.launch).parameters
    for option, value in (("theme", _GRADIO_THEME), ("css", _GRADIO_CSS)):
        if option in launch_params:
            launch_kwargs[option] = value

    demo.launch(**launch_kwargs)