Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Gradio UI for the API Testing Environment. | |
| """ | |
| import json | |
| import os | |
| import time | |
| import argparse | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| import gradio as gr | |
| from models import APITestAction, APITestObservation, HTTPMethod | |
| from server.environment import APITestEnvironment, TASKS, API_SPEC | |
| # ===================================================================== | |
| # Editorial blog-style documentation rendered below the playground. | |
| # Uses an inline <style> block so it works without external CSS files, | |
| # and adapts to both light and dark Gradio themes via CSS variables. | |
| # Aesthetic: editorial layout — Fraunces display + Inter body + IBM Plex | |
| # Mono, warm stone surfaces over white/ink, mint + amber + coral accents. | |
| # ===================================================================== | |
| BLOG_HTML = r""" | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Fraunces:opsz,wght,SOFT@9..144,200..400,30..100&family=IBM+Plex+Mono:wght@400;500&display=swap'); | |
| /* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| ElevenLabs-inspired editorial section. | |
| - Light Fraunces (used as Waldenburg-substitute) display | |
| - Warm stone surfaces with sub-0.1 shadow stacks | |
| - Pill buttons, generous whitespace | |
| - Adapts to both light and dark Gradio themes | |
| βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| /* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| Theme variables. | |
| Strategy: ALWAYS default to LIGHT. Only flip to dark when an | |
| explicit `.dark` class is present on the body (Gradio sets this | |
| reliably based on ?__theme=dark URL param or system preference). | |
| We do NOT use `prefers-color-scheme` here because Gradio already | |
| reads it once and forwards the result via the body class — using | |
| the media query as well causes double-flipping in light mode when | |
| the OS is dark. | |
| βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */ | |
| .eleven { | |
| /* Default = LIGHT theme (always — overridden by .dark below) */ | |
| --bg: #ffffff; | |
| --bg-soft: #f5f5f5; | |
| --stone: rgba(245, 242, 239, 0.85); | |
| --stone-solid: #f5f2ef; | |
| --fg: #000000; | |
| --fg-2: #4e4e4e; | |
| --fg-3: #777169; | |
| --hairline: rgba(0, 0, 0, 0.05); | |
| --border: #e5e5e5; | |
| --warm-shadow: rgba(78, 50, 23, 0.04); | |
| --inset-edge: rgba(0, 0, 0, 0.075); | |
| --outline-ring: rgba(0, 0, 0, 0.06); | |
| --soft-elev: rgba(0, 0, 0, 0.04); | |
| --accent-mint: #2db97a; | |
| --accent-amber: #d97757; | |
| --accent-coral: #c4513a; | |
| --accent-blue: #4a6fa5; | |
| --display: 'Fraunces', 'Iowan Old Style', Georgia, serif; | |
| --body: 'Inter', system-ui, sans-serif; | |
| --mono: 'IBM Plex Mono', 'SF Mono', Menlo, monospace; | |
| display: block; | |
| width: 100%; | |
| background: var(--bg); | |
| color: var(--fg); | |
| font-family: var(--body); | |
| margin-top: 48px; | |
| border-top: 1px solid var(--hairline); | |
| } | |
| /* DARK theme — triggered by Gradio's .dark class on body / container. | |
| Gradio handles `?__theme=dark` and `prefers-color-scheme: dark` for | |
| us by setting this class, so a single rule covers all cases. */ | |
| .dark .eleven, | |
| body.dark .eleven, | |
| .gradio-container.dark .eleven, | |
| html.dark .eleven { | |
| --bg: #0a0a0a; | |
| --bg-soft: #131313; | |
| --stone: rgba(28, 24, 20, 0.9); | |
| --stone-solid: #1c1814; | |
| --fg: #f5f2ef; | |
| --fg-2: #b8b3ad; | |
| --fg-3: #8a847d; | |
| --hairline: rgba(245, 242, 239, 0.06); | |
| --border: rgba(245, 242, 239, 0.10); | |
| --warm-shadow: rgba(0, 0, 0, 0.4); | |
| --inset-edge: rgba(245, 242, 239, 0.08); | |
| --outline-ring: rgba(245, 242, 239, 0.08); | |
| --soft-elev: rgba(0, 0, 0, 0.35); | |
| } | |
| /* ββ Section container ββ */ | |
| .eleven-section { | |
| padding: 96px 64px; | |
| max-width: 1280px; | |
| margin: 0 auto; | |
| } | |
| .eleven-section.alt { | |
| background: var(--bg-soft); | |
| max-width: none; | |
| padding: 96px 64px; | |
| } | |
| .eleven-section.alt > .eleven-section-inner { | |
| max-width: 1280px; | |
| margin: 0 auto; | |
| } | |
| @media (max-width: 900px) { | |
| .eleven-section, | |
| .eleven-section.alt { padding: 64px 24px; } | |
| } | |
| /* ββ Eyebrow label ββ */ | |
| .eleven-eyebrow { | |
| font-family: var(--mono); | |
| font-size: 12px; | |
| font-weight: 500; | |
| text-transform: uppercase; | |
| letter-spacing: 0.18em; | |
| color: var(--fg-3); | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 10px; | |
| margin-bottom: 28px; | |
| } | |
| .eleven-eyebrow::before { | |
| content: ""; | |
| width: 6px; | |
| height: 6px; | |
| border-radius: 50%; | |
| background: var(--accent-mint); | |
| box-shadow: 0 0 12px var(--accent-mint); | |
| } | |
| /* ββ Hero masthead ββ */ | |
| .eleven-hero { | |
| text-align: center; | |
| padding: 120px 24px 64px 24px; | |
| max-width: 960px; | |
| margin: 0 auto; | |
| } | |
| .eleven-hero h1 { | |
| font-family: var(--display); | |
| font-size: clamp(48px, 7vw, 96px); | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 50, "opsz" 144; | |
| line-height: 1.04; | |
| letter-spacing: -0.025em; | |
| color: var(--fg); | |
| margin: 0 0 24px 0; | |
| } | |
| .eleven-hero h1 em { | |
| font-style: italic; | |
| font-weight: 300; | |
| color: var(--fg); | |
| font-variation-settings: "SOFT" 100, "opsz" 144; | |
| } | |
| .eleven-hero .eleven-deck { | |
| font-family: var(--body); | |
| font-size: 20px; | |
| font-weight: 400; | |
| line-height: 1.55; | |
| letter-spacing: 0.1px; | |
| color: var(--fg-2); | |
| margin: 0 auto 40px auto; | |
| max-width: 640px; | |
| } | |
| /* ββ Pill buttons (hero CTA row) ββ */ | |
| .eleven-pills { | |
| display: inline-flex; | |
| gap: 12px; | |
| flex-wrap: wrap; | |
| justify-content: center; | |
| } | |
| .eleven-pill { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 8px; | |
| padding: 12px 22px; | |
| border-radius: 9999px; | |
| text-decoration: none; | |
| font-family: var(--body); | |
| font-size: 15px; | |
| font-weight: 500; | |
| letter-spacing: 0.1px; | |
| transition: transform 0.2s ease, box-shadow 0.2s ease; | |
| } | |
| .eleven-pill.dark { | |
| background: var(--fg); | |
| color: var(--bg); | |
| } | |
| .eleven-pill.dark:hover { | |
| transform: translateY(-1px); | |
| } | |
| .eleven-pill.warm { | |
| background: var(--stone); | |
| color: var(--fg); | |
| box-shadow: | |
| var(--inset-edge) 0 0 0 0.5px inset, | |
| var(--warm-shadow) 0 6px 16px; | |
| } | |
| .eleven-pill.warm:hover { | |
| transform: translateY(-1px); | |
| box-shadow: | |
| var(--inset-edge) 0 0 0 0.5px inset, | |
| var(--warm-shadow) 0 8px 20px; | |
| } | |
| .eleven-pill .arrow { | |
| font-family: var(--body); | |
| font-weight: 400; | |
| font-size: 16px; | |
| } | |
| /* ββ Two-column section: label + content ββ */ | |
| .eleven-row { | |
| display: grid; | |
| grid-template-columns: 240px 1fr; | |
| gap: 80px; | |
| align-items: start; | |
| } | |
| @media (max-width: 900px) { | |
| .eleven-row { grid-template-columns: 1fr; gap: 24px; } | |
| } | |
| .eleven-label { | |
| font-family: var(--mono); | |
| font-size: 11px; | |
| font-weight: 600; | |
| letter-spacing: 0.2em; | |
| text-transform: uppercase; | |
| color: var(--fg-3); | |
| position: sticky; | |
| top: 32px; | |
| } | |
| .eleven-label .num { | |
| display: block; | |
| font-family: var(--display); | |
| font-size: 56px; | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 100, "opsz" 144; | |
| letter-spacing: -0.04em; | |
| color: var(--fg); | |
| line-height: 1; | |
| margin-bottom: 12px; | |
| } | |
| .eleven-content h2 { | |
| font-family: var(--display); | |
| font-size: clamp(36px, 4.5vw, 56px); | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 50, "opsz" 144; | |
| line-height: 1.08; | |
| letter-spacing: -0.018em; | |
| color: var(--fg); | |
| margin: 0 0 28px 0; | |
| } | |
| .eleven-content h2 em { | |
| font-style: italic; | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 100, "opsz" 144; | |
| color: var(--fg); | |
| } | |
| .eleven-content p { | |
| font-family: var(--body); | |
| font-size: 18px; | |
| font-weight: 400; | |
| line-height: 1.62; | |
| letter-spacing: 0.18px; | |
| color: var(--fg-2); | |
| margin: 0 0 18px 0; | |
| max-width: 64ch; | |
| } | |
| .eleven-content p strong { | |
| color: var(--fg); | |
| font-weight: 600; | |
| } | |
| /* Inline tag chip */ | |
| .eleven-chip { | |
| display: inline-flex; | |
| align-items: center; | |
| font-family: var(--mono); | |
| font-size: 12px; | |
| font-weight: 500; | |
| background: var(--bg); | |
| color: var(--fg); | |
| padding: 2px 10px; | |
| border-radius: 9999px; | |
| margin: 0 2px; | |
| box-shadow: | |
| var(--inset-edge) 0 0 0 0.5px inset, | |
| var(--soft-elev) 0 1px 2px; | |
| vertical-align: 2px; | |
| } | |
| /* ββ Pull quote (subtle, ElevenLabs-style β not loud) ββ */ | |
| .eleven-quote { | |
| font-family: var(--display); | |
| font-size: clamp(28px, 3.5vw, 42px); | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 80, "opsz" 144; | |
| line-height: 1.2; | |
| letter-spacing: -0.012em; | |
| color: var(--fg); | |
| margin: 32px 0 16px 0; | |
| max-width: 28ch; | |
| } | |
| .eleven-quote em { | |
| font-style: italic; | |
| font-weight: 300; | |
| color: var(--fg-3); | |
| font-variation-settings: "SOFT" 100, "opsz" 144; | |
| } | |
| /* ββ Reward cards (5 elegant cards instead of bar chart) ββ */ | |
| .eleven-rewards { | |
| display: grid; | |
| grid-template-columns: repeat(2, 1fr); | |
| gap: 16px; | |
| margin: 32px 0 24px 0; | |
| } | |
| @media (max-width: 700px) { | |
| .eleven-rewards { grid-template-columns: 1fr; } | |
| } | |
| .eleven-reward { | |
| background: var(--bg); | |
| border-radius: 16px; | |
| padding: 24px 26px; | |
| box-shadow: | |
| var(--inset-edge) 0 0 0 0.5px inset, | |
| var(--outline-ring) 0 0 0 1px, | |
| var(--soft-elev) 0 4px 12px; | |
| display: flex; | |
| flex-direction: column; | |
| gap: 8px; | |
| } | |
| .eleven-reward.featured { | |
| grid-column: 1 / -1; | |
| background: var(--stone); | |
| box-shadow: | |
| var(--inset-edge) 0 0 0 0.5px inset, | |
| var(--warm-shadow) 0 6px 16px; | |
| } | |
| .eleven-reward-head { | |
| display: flex; | |
| align-items: baseline; | |
| justify-content: space-between; | |
| gap: 12px; | |
| margin-bottom: 4px; | |
| } | |
| .eleven-reward-name { | |
| font-family: var(--display); | |
| font-size: 22px; | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 80, "opsz" 144; | |
| letter-spacing: -0.005em; | |
| color: var(--fg); | |
| } | |
| .eleven-reward-val { | |
| font-family: var(--mono); | |
| font-size: 14px; | |
| font-weight: 600; | |
| color: var(--accent-mint); | |
| white-space: nowrap; | |
| } | |
| .eleven-reward-val.neg { | |
| color: var(--accent-coral); | |
| } | |
| .eleven-reward p { | |
| font-family: var(--body); | |
| font-size: 15px; | |
| line-height: 1.55; | |
| letter-spacing: 0.14px; | |
| color: var(--fg-2); | |
| margin: 0; | |
| max-width: none; | |
| } | |
| .eleven-r-foot { | |
| font-family: var(--display); | |
| font-size: 20px; | |
| font-style: italic; | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 100, "opsz" 144; | |
| line-height: 1.5; | |
| color: var(--fg-2); | |
| margin: 32px 0 0 0; | |
| max-width: 64ch; | |
| padding-left: 18px; | |
| border-left: 1px solid var(--border); | |
| } | |
| /* ββ Steps as an editorial table ββ */ | |
| .eleven-steps { | |
| margin: 40px 0 0 0; | |
| border-top: 1px solid var(--border); | |
| counter-reset: eleven-step; | |
| } | |
| .eleven-step { | |
| display: grid; | |
| grid-template-columns: 72px minmax(180px, 1.1fr) minmax(0, 2.4fr); | |
| column-gap: 32px; | |
| row-gap: 8px; | |
| align-items: start; | |
| padding: 32px 8px; | |
| border-bottom: 1px solid var(--border); | |
| counter-increment: eleven-step; | |
| transition: background 0.18s ease; | |
| } | |
| .eleven-step:hover { | |
| background: var(--bg-soft); | |
| } | |
| @media (max-width: 900px) { | |
| .eleven-step { | |
| grid-template-columns: 56px 1fr; | |
| column-gap: 16px; | |
| padding: 24px 4px; | |
| } | |
| .eleven-step-body { grid-column: 2 / -1; } | |
| } | |
| .eleven-step-num { | |
| font-family: var(--mono); | |
| font-size: 12px; | |
| font-weight: 600; | |
| letter-spacing: 0.18em; | |
| color: var(--fg-3); | |
| padding-top: 10px; | |
| position: relative; | |
| } | |
| .eleven-step-num::before { | |
| content: counter(eleven-step, decimal-leading-zero); | |
| } | |
| .eleven-step-num::after { | |
| content: ""; | |
| position: absolute; | |
| left: 0; | |
| top: 0; | |
| width: 18px; | |
| height: 2px; | |
| background: var(--accent-mint); | |
| } | |
| .eleven-step-title { | |
| font-family: var(--display); | |
| font-size: 24px; | |
| font-weight: 400; | |
| font-variation-settings: "SOFT" 80, "opsz" 144; | |
| letter-spacing: -0.012em; | |
| line-height: 1.2; | |
| color: var(--fg); | |
| } | |
| .eleven-step-body { | |
| font-family: var(--body); | |
| font-size: 15px; | |
| line-height: 1.65; | |
| color: var(--fg-2); | |
| } | |
| .eleven-step-body p { | |
| margin: 0; | |
| } | |
| .eleven-step-chips { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 6px; | |
| margin-top: 14px; | |
| } | |
| .eleven-step-chip { | |
| font-family: var(--mono); | |
| font-size: 11px; | |
| font-weight: 500; | |
| letter-spacing: 0.02em; | |
| padding: 5px 11px; | |
| border-radius: 999px; | |
| background: var(--bg-soft); | |
| color: var(--fg); | |
| border: 1px solid var(--border); | |
| white-space: nowrap; | |
| } | |
| .dark .eleven-step-chip { | |
| background: rgba(245, 242, 239, 0.04); | |
| } | |
| .eleven-step-chip.accent { | |
| background: rgba(45, 185, 122, 0.10); | |
| color: var(--accent-mint); | |
| border-color: rgba(45, 185, 122, 0.35); | |
| } | |
| /* ── Stack tiles (3 cards) ── */ | |
| .eleven-stack { | |
| display: grid; | |
| grid-template-columns: repeat(2, 1fr); | |
| gap: 16px; | |
| margin: 32px 0 0 0; | |
| } | |
| @media (max-width: 700px) { | |
| .eleven-stack { grid-template-columns: 1fr; } | |
| } | |
| .eleven-tile { | |
| background: var(--bg); | |
| border-radius: 20px; | |
| padding: 32px 32px; | |
| display: flex; | |
| flex-direction: column; | |
| gap: 14px; | |
| box-shadow: | |
| var(--inset-edge) 0 0 0 0.5px inset, | |
| var(--outline-ring) 0 0 0 1px, | |
| var(--soft-elev) 0 4px 12px; | |
| } | |
| .eleven-tile-tag { | |
| font-family: var(--mono); | |
| font-size: 11px; | |
| font-weight: 500; | |
| text-transform: uppercase; | |
| letter-spacing: 0.18em; | |
| color: var(--fg-3); | |
| } | |
| .eleven-tile h3 { | |
| font-family: var(--display); | |
| font-size: 28px; | |
| font-weight: 300; | |
| font-variation-settings: "SOFT" 80, "opsz" 144; | |
| letter-spacing: -0.012em; | |
| line-height: 1.1; | |
| color: var(--fg); | |
| margin: 0; | |
| } | |
| .eleven-tile p { | |
| font-family: var(--body); | |
| font-size: 15px; | |
| line-height: 1.55; | |
| letter-spacing: 0.14px; | |
| color: var(--fg-2); | |
| margin: 0; | |
| } | |
| .eleven-tile p code { | |
| font-family: var(--mono); | |
| font-size: 12px; | |
| background: var(--bg-soft); | |
| padding: 1px 6px; | |
| border-radius: 4px; | |
| color: var(--fg); | |
| font-weight: 500; | |
| box-shadow: var(--inset-edge) 0 0 0 0.5px inset; | |
| } | |
| /* ββ Link list (minimal underlined text) ββ */ | |
| .eleven-links { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 14px; | |
| margin: 32px 0 0 0; | |
| } | |
| .eleven-link { | |
| font-family: var(--mono); | |
| font-size: 14px; | |
| line-height: 1.55; | |
| color: var(--fg); | |
| text-decoration: underline; | |
| text-decoration-color: var(--border); | |
| text-underline-offset: 4px; | |
| text-decoration-thickness: 1px; | |
| word-break: break-all; | |
| transition: text-decoration-color 0.18s ease, color 0.18s ease; | |
| } | |
| .eleven-link:hover { | |
| text-decoration-color: var(--accent-mint); | |
| color: var(--accent-mint); | |
| } | |
| </style> | |
| <div class="eleven"> | |
| <!-- βββ HERO βββ --> | |
| <div class="eleven-hero"> | |
| <h1>Where agents learn to <em>break APIs.</em></h1> | |
| <p class="eleven-deck">An OpenEnv reinforcement learning environment for API security testing. A live REST API with thirteen planted vulnerabilities, a verifiable reward function mapped to the OWASP API Security Top 10, and an episode that ends with a structured bug report.</p> | |
| </div> | |
| <!-- βββ 01 WHAT IS THIS βββ --> | |
| <div class="eleven-section"> | |
| <div class="eleven-row"> | |
| <div class="eleven-label"><span class="num">01</span>The premise</div> | |
| <div class="eleven-content"> | |
| <h2>What <em>this is.</em></h2> | |
| <p>A Gradio playground for an OpenEnv RL environment that trains AI agents to test REST APIs the way a security engineer would. Behind the UI is a Task Management API with <strong>13 deliberately planted bugs</strong> covering 6 categories from the <strong>OWASP API Security Top 10</strong>.</p> | |
| <p>The agent connects, sends HTTP requests, earns rewards for finding bugs and covering endpoints, and generates a bug bounty report when the episode ends.</p> | |
| <div class="eleven-quote">Real API. Real bugs. Real OWASP categories — <em>verifiable end to end.</em></div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- βββ 02 WHY BOTHER βββ --> | |
| <div class="eleven-section alt"> | |
| <div class="eleven-section-inner"> | |
| <div class="eleven-row"> | |
| <div class="eleven-label"><span class="num">02</span>The gap</div> | |
| <div class="eleven-content"> | |
| <h2>Why <em>bother.</em></h2> | |
| <p>Every team ships APIs and every API has bugs. The usual tools <span class="eleven-chip">Postman</span> <span class="eleven-chip">Schemathesis</span> <span class="eleven-chip">OWASP ZAP</span> either need humans writing tests by hand or fall back to brute-force fuzzing.</p> | |
| <div class="eleven-quote">This environment <em>is the benchmark.</em></div> | |
| <p>The agent doesn't get a written test plan. It reads the API spec, plans a campaign, runs it, and reports what broke. The reward function is verifiable — no LLM judge, no soft heuristics — and every signal maps to a real OWASP category, so episodes can be scored deterministically.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- βββ 03 REWARD βββ --> | |
| <div class="eleven-section"> | |
| <div class="eleven-row"> | |
| <div class="eleven-label"><span class="num">03</span>How reward works</div> | |
| <div class="eleven-content"> | |
| <h2>Five signals,<br><em>one episode.</em></h2> | |
| <p>The reward function is verifiable — no LLM judge, no soft heuristics. Each step accumulates from five components and the task grader caps the episode with a terminal score in <span class="eleven-chip">[0, 1]</span>.</p> | |
| <div class="eleven-rewards"> | |
| <div class="eleven-reward featured"> | |
| <div class="eleven-reward-head"> | |
| <div class="eleven-reward-name">Bug discovery</div> | |
| <div class="eleven-reward-val">+0.10 / +0.15 / +0.25</div> | |
| </div> | |
| <p>Finding a planted bug, scaled by severity. Easy bugs (status codes, missing fields) are worth 0.10. Medium (validation, auth) gets 0.15. Hard (BOLA, injection, broken auth chains) gets 0.25.</p> | |
| </div> | |
| <div class="eleven-reward"> | |
| <div class="eleven-reward-head"> | |
| <div class="eleven-reward-name">Coverage</div> | |
| <div class="eleven-reward-val">+0.20</div> | |
| </div> | |
| <p>Hitting endpoints, methods, and status codes the agent hasn't tried yet.</p> | |
| </div> | |
| <div class="eleven-reward"> | |
| <div class="eleven-reward-head"> | |
| <div class="eleven-reward-name">Validity</div> | |
| <div class="eleven-reward-val">+0.18</div> | |
| </div> | |
| <p>Well-formed requests, plus chaining IDs from previous responses.</p> | |
| </div> | |
| <div class="eleven-reward"> | |
| <div class="eleven-reward-head"> | |
| <div class="eleven-reward-name">Exploration</div> | |
| <div class="eleven-reward-val">+0.05</div> | |
| </div> | |
| <p>Trying genuinely novel action patterns the agent hasn't tried before.</p> | |
| </div> | |
| <div class="eleven-reward"> | |
| <div class="eleven-reward-head"> | |
| <div class="eleven-reward-name">Penalty</div> | |
| <div class="eleven-reward-val neg">−0.08</div> | |
| </div> | |
| <p>Repeating the same exact request twice — anti-spam, anti-loop.</p> | |
| </div> | |
| </div> | |
| <p class="eleven-r-foot">When the episode ends, the task grader adds a terminal score based on its own criteria — CRUD coverage, dependency chaining, security probing, that kind of thing.</p> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- βββ 04 HOW TO USE βββ --> | |
| <div class="eleven-section alt"> | |
| <div class="eleven-section-inner"> | |
| <div class="eleven-row"> | |
| <div class="eleven-label"><span class="num">04</span>How to use this</div> | |
| <div class="eleven-content"> | |
| <h2>Five steps<br><em>to a verdict.</em></h2> | |
| <div class="eleven-steps"> | |
| <div class="eleven-step"> | |
| <div class="eleven-step-num"></div> | |
| <div class="eleven-step-title">Pick a task</div> | |
| <div class="eleven-step-body"> | |
| <p>Three difficulty tiers in the dropdown on the left, from a CRUD smoke-test to a full BOLA + injection chain.</p> | |
| <div class="eleven-step-chips"> | |
| <span class="eleven-step-chip accent">basic_validation</span> | |
| <span class="eleven-step-chip accent">edge_cases</span> | |
| <span class="eleven-step-chip accent">security_workflows</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="eleven-step"> | |
| <div class="eleven-step-num"></div> | |
| <div class="eleven-step-title">Reset the environment</div> | |
| <div class="eleven-step-body"> | |
| <p>Every reset spins up a fresh database with new users, new tasks, and randomized ownership, so the agent can't memorize answers between episodes.</p> | |
| </div> | |
| </div> | |
| <div class="eleven-step"> | |
| <div class="eleven-step-num"></div> | |
| <div class="eleven-step-title">Run a baseline</div> | |
| <div class="eleven-step-body"> | |
| <p>The Run Baseline Agent tab is open by default. Pick a strategy and watch it test the API step by step.</p> | |
| <div class="eleven-step-chips"> | |
| <span class="eleven-step-chip">random</span> | |
| <span class="eleven-step-chip">sequential</span> | |
| <span class="eleven-step-chip">smart</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="eleven-step"> | |
| <div class="eleven-step-num"></div> | |
| <div class="eleven-step-title">Or test manually</div> | |
| <div class="eleven-step-body"> | |
| <p>Switch to Manual Testing. Quick Actions give one-click bug hunts, or craft your own request from scratch — method, endpoint, headers, body, expected status.</p> | |
| </div> | |
| </div> | |
| <div class="eleven-step"> | |
| <div class="eleven-step-num"></div> | |
| <div class="eleven-step-title">Watch the panel</div> | |
| <div class="eleven-step-body"> | |
| <p>Discovered Bugs and the Activity Log update live as the agent works. When the episode ends, expand the Bug Report (OWASP) drawer for the full structured findings, severities, and fix recommendations.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- βββ 05 STACK βββ --> | |
| <div class="eleven-section"> | |
| <div class="eleven-row"> | |
| <div class="eleven-label"><span class="num">05</span>Under the hood</div> | |
| <div class="eleven-content"> | |
| <h2>Three <em>layers.</em></h2> | |
| <p>Self-contained, reproducible, and runs on a free-tier HuggingFace Space.</p> | |
| <div class="eleven-stack"> | |
| <div class="eleven-tile"> | |
| <span class="eleven-tile-tag">L1 · ENVIRONMENT</span> | |
| <h3>FastAPI + SQLite</h3> | |
| <p>A buggy Task Management API wrapped in OpenEnv's <code>step()</code> / <code>reset()</code> / <code>state()</code> contract. Runs in-process or as a Docker image, with seed-randomized data on every reset so episodes can't be memorized.</p> | |
| </div> | |
| <div class="eleven-tile"> | |
| <span class="eleven-tile-tag">L2 · INFERENCE</span> | |
| <h3>OpenAI-compatible client</h3> | |
| <p><code>inference.py</code> talks to any HuggingFace-hosted model through the OpenAI SDK and structured JSON output. Plug in any model that follows the protocol — no environment-specific glue.</p> | |
| </div> | |
| <div class="eleven-tile"> | |
| <span class="eleven-tile-tag">L3 · DEPLOY</span> | |
| <h3>Docker + HF Spaces</h3> | |
| <p>Containerized on top of the official <code>openenv-base</code> image and deployed as a public HuggingFace Space, so judges can hit it with a single HTTP call.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- βββ 06 LINKS βββ --> | |
| <div class="eleven-section alt"> | |
| <div class="eleven-section-inner"> | |
| <div class="eleven-row"> | |
| <div class="eleven-label"><span class="num">06</span>The artifacts</div> | |
| <div class="eleven-content"> | |
| <h2>Everything <em>reproducible.</em></h2> | |
| <p>Source code, deployed environment, framework. Open and inspectable.</p> | |
| <div class="eleven-links"> | |
| <a class="eleven-link" href="https://github.com/Mayankpratapsingh022/API-Testing-RL" target="_blank" rel="noopener">https://github.com/Mayankpratapsingh022/API-Testing-RL</a> | |
| <a class="eleven-link" href="https://meta-pytorch.org/OpenEnv/" target="_blank" rel="noopener">https://meta-pytorch.org/OpenEnv/</a> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| """ | |
@dataclass
class SessionState:
    """Mutable state for one playground session.

    Holds the environment instance together with the bookkeeping that the
    UI handlers read and update (current task, step log, running reward,
    last observation).

    The ``@dataclass`` decorator is required: the fields below use
    ``dataclasses.field(default_factory=...)``, which only takes effect when
    the class is processed by ``dataclass``. Without it, ``env`` and
    ``step_log`` would be raw ``Field`` objects shared at class level and
    ``state.env.reset(...)`` would fail.
    """

    # Environment instance owned by this session; a new one is built per instance.
    env: APITestEnvironment = field(default_factory=APITestEnvironment)
    # False until the environment has been reset at least once.
    initialized: bool = False
    # Identifier of the task the environment was last reset with.
    task_id: str = ""
    # One dict per executed step (step, method, endpoint, status, reward, bugs).
    step_log: list[dict] = field(default_factory=list)
    # Sum of the per-step rewards since the last reset.
    total_reward: float = 0.0
    # Most recent observation returned by the environment, if any.
    last_obs: Optional[APITestObservation] = None
def new_session():
    """Create and return a brand-new ``SessionState`` with default values."""
    session = SessionState()
    return session
| # ===================================================================== | |
| # Core logic | |
| # ===================================================================== | |
def _generate_report(bug_ids, action_history):
    """Build the OWASP bug bounty report for the bugs discovered so far.

    Thin wrapper that forwards both arguments unchanged to
    ``server.graders.generate_bug_report`` and returns its result.
    """
    # Imported at call time rather than at module import, as in the original.
    # NOTE(review): presumably to avoid an import cycle or start-up cost — confirm.
    from server.graders import generate_bug_report as build_report

    report = build_report(bug_ids, action_history)
    return report
def reset_env(task_id, state):
    """Reset the session's environment for ``task_id`` and return fresh UI values.

    A new session is created when ``state`` is falsy. The environment is
    reset, the session bookkeeping is cleared, and a 13-tuple is returned
    whose first element is the session state followed by the initial text
    for each output panel.
    NOTE(review): the tuple order must match the outputs wired to the reset
    handler elsewhere in the file — confirm before reordering.
    """
    if not state:
        state = new_session()

    first_obs = state.env.reset(task_id=task_id)

    # Clear all per-episode bookkeeping.
    state.initialized = True
    state.task_id = task_id
    state.step_log = []
    state.total_reward = 0.0
    state.last_obs = first_obs

    task = TASKS[task_id]
    banner = (
        f"Environment reset. Task: **{task_id}** ({task['difficulty']})\n\n"
        f"Max steps: {task['max_steps']} | Bugs to find: {task['total_bugs']}"
    )
    feedback = first_obs.feedback
    reward_panel = format_reward_display(0, 0, {})
    bugs_counter = f"0 / {task['total_bugs']}"
    coverage_panel = format_coverage(first_obs.coverage_summary)
    steps_counter = f"0 / {task['max_steps']}"

    return (
        state,
        banner,
        feedback,
        "",
        reward_panel,
        bugs_counter,
        coverage_panel,
        "",
        steps_counter,
        "No bugs found yet.",
        "No bugs found yet. Send requests to discover vulnerabilities.",
        "No tokens acquired yet.",
        "No resources created yet.",
    )
def send_request(method, endpoint, headers_str, params_str, body_str, expected_status, state):
    """Execute one manually specified request against the session's environment.

    Parses the JSON text fields, builds an ``APITestAction``, steps the
    environment once, appends the step to ``state.step_log`` and returns the
    refreshed values for the UI panels.

    Args:
        method: HTTP method name; converted with ``HTTPMethod(method)``.
        endpoint: Request path, passed through unchanged.
        headers_str: JSON text for the headers; blank means ``{}``.
        params_str: JSON text for the query parameters; blank means ``{}``.
        body_str: JSON text for the body; blank means no body (``None``).
        expected_status: Expected status code as text; blank or ``None``
            means no expectation.
        state: The session's ``SessionState``; may be falsy before the
            first reset.

    Returns:
        A 12-tuple: the state, the feedback text, the formatted response,
        the reward breakdown, the bug counter, the coverage summary, the
        activity log, the step counter, the bug list, the bug report, the
        auth tokens and the known resources. When the input is unusable
        (no initialized session, malformed JSON, non-integer expected
        status) the tuple is the state, an error message, and ten empty
        strings; the environment is not stepped in that case.
    """

    def _fail(message):
        # Every early exit has the same shape: state, message, ten blank panels.
        return (state, message) + ("",) * 10

    if not state or not state.initialized:
        return _fail("Environment not initialized. Click 'Reset' first.")

    try:
        headers = json.loads(headers_str) if headers_str.strip() else {}
    except json.JSONDecodeError:
        return _fail("Invalid JSON in headers.")
    try:
        query_params = json.loads(params_str) if params_str.strip() else {}
    except json.JSONDecodeError:
        return _fail("Invalid JSON in query params.")
    try:
        body = json.loads(body_str) if body_str.strip() else None
    except json.JSONDecodeError:
        return _fail("Invalid JSON in body.")

    # A non-numeric expected status is a user input error, not a crash:
    # report it the same way as malformed JSON in the other fields.
    expected_text = str(expected_status).strip() if expected_status is not None else ""
    try:
        exp = int(expected_text) if expected_text else None
    except ValueError:
        return _fail("Invalid expected status (must be an integer).")

    action = APITestAction(
        method=HTTPMethod(method), endpoint=endpoint,
        headers=headers, query_params=query_params,
        body=body, expected_status=exp,
    )
    obs = state.env.step(action)

    # The observation may carry no reward; treat that as zero.
    reward = obs.reward or 0.0
    state.total_reward += reward
    state.last_obs = obs

    resp_body = obs.response_body
    if isinstance(resp_body, (dict, list)):
        resp_str = json.dumps(resp_body, indent=2)
    else:
        resp_str = str(resp_body)

    state.step_log.append({
        "step": obs.steps_taken, "method": method, "endpoint": endpoint,
        "status": obs.status_code, "reward": round(reward, 4), "bugs": obs.bugs_found_so_far,
    })

    breakdown = obs.metadata.get("reward_breakdown", {})
    reward_detail = format_reward_display(reward, state.total_reward, breakdown)
    t = TASKS[state.task_id]
    es = state.env.state

    # Append an end-of-episode summary to the feedback once the episode is done.
    status = ""
    if obs.done:
        status = (
            f"\n\n**EPISODE COMPLETE**\n\n"
            f"Final Score: {reward:.4f}\n"
            f"Bugs: {obs.bugs_found_so_far}/{t['total_bugs']}\n"
            f"Steps: {obs.steps_taken}/{obs.max_steps}"
        )

    return (
        state,
        obs.feedback + status,
        f"**{obs.status_code}** — {obs.response_time_ms:.1f}ms\n\n```json\n{resp_str}\n```",
        reward_detail,
        f"{obs.bugs_found_so_far} / {t['total_bugs']}",
        format_coverage(obs.coverage_summary),
        format_log(state.step_log),
        f"{obs.steps_taken} / {obs.max_steps}" + (" (DONE)" if obs.done else ""),
        format_bug_list(es.bugs_found_ids),
        _generate_report(es.bugs_found_ids, state.step_log),
        format_auth_tokens(obs.auth_tokens),
        format_resources(obs.known_resource_ids),
    )
def apply_quick_action(action_name, _state):
    """Translate a Quick Action label into values for the six request fields.

    Returns a 6-tuple ``(method, endpoint, headers, query params, body,
    expected status)`` for a known label. For an empty or unknown label it
    returns six ``gr.update()`` placeholders instead. ``_state`` is accepted
    but not used.
    """
    empty_json = "{}"
    oversized_title_body = '{"title": "' + "A" * 6000 + '"}'

    # label -> (method, endpoint, query params JSON, body JSON, expected status)
    # Headers are "{}" for every preset, so they are added on return.
    presets = {
        "GET /tasks": ("GET", "/tasks", empty_json, "", "200"),
        "GET /users": ("GET", "/users", empty_json, "", "200"),
        "GET /tasks/1": ("GET", "/tasks/1", empty_json, "", "200"),
        "GET /tasks/999999 (bug hunt)": ("GET", "/tasks/999999", empty_json, "", "404"),
        "POST create task": (
            "POST", "/tasks", empty_json,
            '{"title": "Test Task", "description": "Created via UI"}', "201",
        ),
        "POST missing title (bug hunt)": (
            "POST", "/tasks", empty_json, '{"description": "no title"}', "400",
        ),
        "Login as alice": (
            "POST", "/auth/login", empty_json,
            '{"username": "alice", "password": "pass"}', "200",
        ),
        "Login as bob": (
            "POST", "/auth/login", empty_json,
            '{"username": "bob", "password": "pass"}', "200",
        ),
        "Login empty pwd (bug hunt)": (
            "POST", "/auth/login", empty_json,
            '{"username": "alice", "password": ""}', "401",
        ),
        "Negative page (bug hunt)": ("GET", "/tasks", '{"page": -1, "limit": 10}', "", "400"),
        "Huge limit (bug hunt)": ("GET", "/tasks", '{"limit": 999999}', "", "200"),
        "Invalid email PUT (bug hunt)": (
            "PUT", "/tasks/1", empty_json, '{"assignee_email": "not-an-email"}', "422",
        ),
        "DELETE non-existent (bug hunt)": ("DELETE", "/tasks/99999", empty_json, "", "404"),
        "Create user invalid email (bug)": (
            "POST", "/users", empty_json,
            '{"username": "baduser", "email": "nope", "password": "x"}', "422",
        ),
        "SQL injection test": (
            "POST", "/tasks", empty_json, '{"title": "test\'; DROP TABLE tasks;--"}', "201",
        ),
        "Long title crash (bug hunt)": ("POST", "/tasks", empty_json, oversized_title_body, "400"),
    }

    # Nothing selected or an unrecognized label: leave all six fields as they are.
    if not action_name or action_name not in presets:
        return [gr.update()] * 6

    verb, path, params_json, body_json, expected = presets[action_name]
    return (verb, path, empty_json, params_json, body_json, expected)
def run_baseline_agent(agent_type, state):
    """Run one baseline-agent episode, yielding a UI update after each step.

    Generator used as a streaming Gradio handler. It resets the session's
    environment, lets the chosen agent act until the observation reports
    ``done``, and records every step in ``state.step_log``.

    Args:
        agent_type: One of ``"random"``, ``"sequential"`` or ``"smart"``.
        state: Session object holding ``env``, ``task_id``, ``step_log``,
            ``total_reward`` and ``last_obs``; may be ``None``.

    Yields:
        12-tuples in the order of ``step_outputs``: state, feedback,
        response, reward panel, bug counter, coverage, activity log, step
        counter, bug list, bug report, auth tokens, resources. When the
        session is not initialised or the agent type is unknown, a single
        tuple carrying only a message is yielded.
    """
    blanks = ("",) * 10
    if not state or not state.initialized:
        yield (state, "Environment not initialized.") + blanks
        return

    from training.agents import RandomAgent, SequentialAgent, SmartAgent

    agents = {"random": RandomAgent, "sequential": SequentialAgent, "smart": SmartAgent}
    agent_cls = agents.get(agent_type)
    if agent_cls is None:
        # Report the problem in the feedback slot instead of a bare KeyError.
        yield (state, f"Unknown agent type: {agent_type!r}") + blanks
        return
    agent = agent_cls()

    task = TASKS[state.task_id]
    obs = state.env.reset(task_id=state.task_id)
    state.step_log = []
    state.total_reward = 0.0
    state.last_obs = obs

    while not obs.done:
        # Agents consume a plain dict rather than the observation object.
        obs_dict = {
            "status_code": obs.status_code,
            "response_body": obs.response_body,
            "feedback": obs.feedback,
            "bugs_found_so_far": obs.bugs_found_so_far,
            "coverage_summary": obs.coverage_summary,
            "known_resource_ids": obs.known_resource_ids,
            "auth_tokens": obs.auth_tokens,
            "steps_taken": obs.steps_taken,
            "max_steps": obs.max_steps,
        }
        action = agent.act(obs_dict)
        obs = state.env.step(action)

        reward = obs.reward or 0.0
        state.total_reward += reward
        state.last_obs = obs

        # The method may be an enum member or already a plain string.
        ms = action.method.value if hasattr(action.method, "value") else str(action.method)
        state.step_log.append({
            "step": obs.steps_taken,
            "method": ms,
            "endpoint": action.endpoint,
            "status": obs.status_code,
            "reward": round(reward, 4),
            "bugs": obs.bugs_found_so_far,
        })

        resp_body = obs.response_body
        if isinstance(resp_body, (dict, list)):
            resp_str = json.dumps(resp_body, indent=2)
        else:
            resp_str = str(resp_body)

        # A missing or None breakdown renders as all zeros.
        breakdown = obs.metadata.get("reward_breakdown") or {}
        reward_detail = format_reward_display(reward, state.total_reward, breakdown)

        es = state.env.state
        done_text = ""
        if obs.done:
            done_text = (
                f"\n\n**EPISODE COMPLETE** — Final Score: {reward:.4f}"
                f" | Bugs: {obs.bugs_found_so_far}/{task['total_bugs']}"
            )

        yield (
            state,
            f"[{agent_type}] {ms} {action.endpoint} -> {obs.status_code}{done_text}",
            # Only the first 500 characters of the response are shown.
            f"**{obs.status_code}**\n```json\n{resp_str[:500]}\n```",
            reward_detail,
            f"{obs.bugs_found_so_far} / {task['total_bugs']}",
            format_coverage(obs.coverage_summary),
            format_log(state.step_log),
            f"{obs.steps_taken} / {obs.max_steps}" + (" (DONE)" if obs.done else ""),
            format_bug_list(es.bugs_found_ids),
            _generate_report(es.bugs_found_ids, state.step_log),
            format_auth_tokens(obs.auth_tokens),
            format_resources(obs.known_resource_ids),
        )

        # Pace the stream so steps are watchable; no delay after the last one.
        if not obs.done:
            time.sleep(0.3)
| # ===================================================================== | |
| # Formatters | |
| # ===================================================================== | |
def format_reward_display(step_reward, cumulative, breakdown):
    """Build the HTML reward panel: two headline tiles plus a breakdown list.

    Args:
        step_reward: Reward of the latest step (rendered with an explicit sign).
        cumulative: Running total for the episode (rendered without a forced sign).
        breakdown: Mapping that may hold ``coverage``, ``validity``,
            ``bug_discovery``, ``exploration`` and ``penalty``; absent keys
            count as 0.

    Returns:
        An HTML string. Positive numbers are green, negative red, zero
        inherits the surrounding text colour.
    """

    def tone(amount):
        if amount > 0:
            return "#16a34a"
        if amount < 0:
            return "#dc2626"
        return "inherit"

    def tile(caption, colour, figure):
        return (
            '<div style="flex:1;text-align:center;padding:6px;background:rgba(128,128,128,0.1);'
            'border-radius:8px;">'
            f'<div style="font-size:0.72em;opacity:0.55;">{caption}</div>'
            f'<div style="font-size:1.3em;font-weight:700;color:{colour};">'
            f'{figure}</div></div>'
        )

    # (label, breakdown key, tooltip) for each line of the breakdown list.
    rows = (
        ("Coverage", "coverage",
         "Reward for testing new endpoints and methods"),
        ("Validity", "validity",
         "Reward for sending well-formed requests that return expected status codes"),
        ("Bug", "bug_discovery",
         "Bonus for discovering a new bug in the API"),
        ("Explore", "exploration",
         "Reward for trying new parameter combinations and edge cases"),
        ("Penalty", "penalty",
         "Deduction for repeated or invalid requests"),
    )
    amounts = [(label, breakdown.get(key, 0), hint) for label, key, hint in rows]

    breakdown_html = "".join(
        '<div style="display:flex;justify-content:space-between;align-items:center;'
        f'padding:2px 0;font-size:0.82em;" title="{hint}">'
        '<span style="opacity:0.6;cursor:help;border-bottom:1px dotted currentColor;">'
        f'{label}</span>'
        f'<span style="color:{tone(amount)};font-family:monospace;font-weight:600;">'
        f'{amount:+.3f}</span></div>'
        for label, amount, hint in amounts
    )

    headline = (
        '<div style="display:flex;gap:16px;margin-bottom:8px;">'
        + tile("STEP REWARD", tone(step_reward), f"{step_reward:+.4f}")
        + tile("CUMULATIVE", tone(cumulative), f"{cumulative:.4f}")
        + "</div>"
    )
    breakdown_open = (
        '<div style="border:1px solid rgba(128,128,128,0.2);border-radius:8px;padding:6px 10px;">'
        '<div style="font-size:0.72em;opacity:0.5;margin-bottom:4px;">'
        'REWARD BREAKDOWN '
        '<span title="How the reward for the last step was calculated"'
        ' style="cursor:help;">ⓘ</span></div>'
    )
    return headline + breakdown_open + breakdown_html + "</div>"
def format_coverage(summary):
    """Render the coverage summary as an HTML progress bar with stats.

    Args:
        summary: Mapping that may hold ``coverage_pct``, ``endpoints_tested``,
            ``total_endpoints``, ``method_endpoint_pairs`` and
            ``status_codes_seen``; a falsy value yields ``"No data"``.

    Returns:
        ``"No data"`` or an HTML string made of a progress bar, two stat
        tiles and one coloured pill per status code seen.
    """
    if not summary:
        return "No data"

    pct = summary.get("coverage_pct", 0)
    # Red below 30 %, amber below 70 %, green otherwise.
    if pct < 30:
        bar_colour = "#dc2626"
    elif pct < 70:
        bar_colour = "#d97706"
    else:
        bar_colour = "#16a34a"

    def status_colour(code):
        if 200 <= code < 300:
            return "#16a34a"
        if 300 <= code < 400:
            return "#d97706"
        return "#dc2626"

    def tile(hint, caption, figure):
        return (
            '<div style="flex:1;text-align:center;padding:4px;background:rgba(128,128,128,0.1);border-radius:6px;"'
            f' title="{hint}">'
            f'<div style="font-size:0.72em;opacity:0.5;">{caption}</div>'
            f'<div style="font-weight:700;">{figure}</div></div>'
        )

    tested = summary.get("endpoints_tested", 0)
    total = summary.get("total_endpoints", 0)
    pairs = summary.get("method_endpoint_pairs", 0)

    progress = (
        '<div style="display:flex;align-items:center;gap:8px;margin:4px 0;">'
        '<div style="flex:1;background:rgba(128,128,128,0.15);border-radius:6px;height:14px;overflow:hidden;">'
        f'<div style="width:{pct:.1f}%;height:100%;background:{bar_colour};border-radius:6px;'
        'transition:width 0.3s ease;"></div></div>'
        f'<span style="font-weight:700;min-width:48px;text-align:right;">{pct:.1f}%</span></div>'
    )
    stats = (
        '<div style="display:flex;gap:10px;margin:6px 0;font-size:0.82em;">'
        + tile("How many unique API endpoints have been called", "ENDPOINTS", f"{tested}/{total}")
        + tile("Unique combinations of HTTP method + endpoint path tested", "METHOD+PATH", f"{pairs}")
        + "</div>"
    )
    pills = "".join(
        f'<span style="background:{status_colour(code)}18;color:{status_colour(code)};'
        'padding:1px 7px;border-radius:10px;'
        f'font-size:0.78em;font-weight:600;margin-right:4px;">{code}</span>'
        for code in summary.get("status_codes_seen", [])
    )
    codes = (
        '<div style="margin-top:4px;" title="HTTP status codes received from the API so far">'
        '<span style="font-size:0.72em;opacity:0.5;">STATUS CODES SEEN </span>'
        + pills
        + "</div>"
    )
    return progress + stats + codes
def format_log(log):
    """Render the activity log as HTML, showing at most the last 20 steps.

    Args:
        log: List of step dicts with keys ``step``, ``method``, ``endpoint``,
            ``status`` and ``reward``. An empty or missing log yields a
            short explanatory blurb instead of a table.

    Returns:
        An HTML string: header, an optional "earlier steps not shown"
        notice, then one row per step. A step whose reward exceeds 0.2 is
        tagged "BUG FOUND". Method and endpoint are HTML-escaped because
        the endpoint can be typed freely in the manual-testing form.
    """
    from html import escape  # function-scope import, as elsewhere in this file

    if not log:
        return (
            '<div style="opacity:0.55;font-size:0.85em;">'
            "Each row shows an API request the agent made, the HTTP status it got back, "
            "and the reward earned. Green = positive reward, red = penalty."
            "</div>"
        )

    method_colors = {
        "GET": "#2563eb", "POST": "#16a34a", "PUT": "#d97706",
        "DELETE": "#dc2626", "PATCH": "#9333ea",
    }
    rows = []
    for entry in log[-20:]:
        m = entry["method"]
        mcol = method_colors.get(m, "#6b7280")
        r = entry["reward"]
        rcol = "#16a34a" if r > 0 else "#dc2626" if r < 0 else "inherit"
        bug_tag = (
            '<span style="background:#92400e;color:#fef08a;padding:0 5px;border-radius:4px;'
            'font-size:0.7em;margin-left:4px;">BUG FOUND</span>'
        ) if r > 0.2 else ""
        status = entry["status"]
        scol = "#16a34a" if 200 <= status < 300 else "#d97706" if 300 <= status < 400 else "#dc2626"
        # Escape free-form text so markup in a path cannot break the row.
        safe_method = escape(str(m))
        safe_endpoint = escape(str(entry["endpoint"]))
        rows.append(
            f'<div style="display:flex;align-items:center;gap:6px;padding:3px 0;'
            f'border-bottom:1px solid rgba(128,128,128,0.1);font-size:0.82em;">'
            f'<span style="opacity:0.45;min-width:20px;text-align:right;">{entry["step"]}</span>'
            f'<span style="background:{mcol}18;color:{mcol};padding:1px 6px;border-radius:4px;'
            f'font-weight:600;font-size:0.8em;min-width:52px;text-align:center;">{safe_method}</span>'
            f'<span style="flex:1;overflow:hidden;text-overflow:ellipsis;'
            f'white-space:nowrap;">{safe_endpoint}</span>'
            f'<span style="color:{scol};font-weight:600;min-width:28px;text-align:right;">{status}</span>'
            f'<span style="color:{rcol};min-width:52px;text-align:right;font-family:monospace;'
            f'font-size:0.85em;">{r:+.3f}</span>{bug_tag}</div>'
        )

    omitted = ""
    if len(log) > 20:
        omitted = (
            f'<div style="opacity:0.45;font-size:0.78em;padding:4px 0;text-align:center;">'
            f'... {len(log) - 20} earlier steps not shown</div>'
        )

    header = (
        '<div style="opacity:0.55;font-size:0.78em;margin-bottom:6px;">'
        "API requests made by the agent. Each row: step number, HTTP method, "
        "endpoint, status code, and reward earned.</div>"
        '<div style="display:flex;gap:6px;padding:2px 0 6px;border-bottom:1px solid rgba(128,128,128,0.2);'
        'font-size:0.75em;opacity:0.5;">'
        '<span style="min-width:20px;text-align:right;">#</span>'
        '<span style="min-width:52px;text-align:center;">Method</span>'
        '<span style="flex:1;">Endpoint</span>'
        '<span style="min-width:28px;text-align:right;">Status</span>'
        '<span style="min-width:52px;text-align:right;">Reward</span></div>'
    )
    return header + omitted + "\n".join(rows)
def format_bug_list(bug_ids):
    """Render discovered bugs as severity-coloured HTML cards.

    Args:
        bug_ids: Iterable of bug identifiers found so far; falsy means none.

    Returns:
        ``"No bugs found yet."`` when there are no ids, otherwise one HTML
        card per id known to the detector (sorted by id, newline-joined).
        Ids the detector does not know are skipped. The OWASP badge and
        footer appear only when the bug has an OWASP reference.
    """
    if not bug_ids:
        return "No bugs found yet."

    from server.bug_detector import BugDetector

    # NOTE(review): presumably "security_workflows" exposes the full bug
    # catalogue regardless of the active task; confirm against BugDetector.
    detector = BugDetector("security_workflows")
    severity_colors = {
        "easy": "#16a34a",
        "medium": "#d97706",
        "hard": "#dc2626",
    }
    cards = []
    for bid in sorted(bug_ids):
        bug = detector.bugs.get(bid)
        if not bug:
            continue
        fg = severity_colors.get(bug.severity, "#6b7280")
        # Badge shows only the first word of the OWASP reference.
        owasp_badge = f' | {bug.owasp.split(" ")[0]}' if bug.owasp else ""
        # Omit the footer when there is no reference, rather than printing
        # "None" or an empty line.
        owasp_footer = (
            f'<div style="margin-top:2px;font-size:0.78em;opacity:0.5;font-style:italic;">'
            f'{bug.owasp}</div>'
        ) if bug.owasp else ""
        cards.append(
            f'<div style="border:1px solid {fg}40;border-radius:8px;padding:8px 10px;'
            f'margin-bottom:6px;background:{fg}0d;">'
            f'<div style="display:flex;justify-content:space-between;align-items:center;">'
            f'<span style="font-weight:700;font-size:0.85em;">{bid}</span>'
            f'<span style="background:{fg};color:#fff;padding:1px 8px;border-radius:10px;'
            f'font-size:0.75em;font-weight:600;">{bug.severity.upper()}{owasp_badge}</span></div>'
            f'<div style="margin-top:4px;font-size:0.85em;opacity:0.7;">'
            f'{bug.description}</div>'
            f'{owasp_footer}</div>'
        )
    return "\n".join(cards)
def format_auth_tokens(tokens):
    """Render the auth tokens collected this episode as HTML rows.

    Args:
        tokens: Mapping of username to token; falsy means no logins yet.

    Returns:
        A hint on how to log in when there are no tokens, otherwise a
        heading followed by one row per user showing the first 20
        characters of the token. An ellipsis is added only when the token
        was actually shortened. Usernames and tokens are HTML-escaped.
    """
    from html import escape  # function-scope import, as elsewhere in this file

    if not tokens:
        return (
            '<div style="opacity:0.5;font-size:0.85em;">'
            "No tokens yet. Login via <code>POST /auth/login</code> to get auth tokens "
            "for testing protected endpoints.</div>"
        )

    cards = []
    for user, token in tokens.items():
        token_text = str(token)  # tolerate non-string token values
        preview = token_text[:20] + ("..." if len(token_text) > 20 else "")
        cards.append(
            f'<div style="display:flex;align-items:center;gap:8px;padding:4px 0;'
            f'border-bottom:1px solid rgba(128,128,128,0.1);font-size:0.85em;">'
            f'<span style="background:#2563eb18;color:#2563eb;padding:1px 8px;border-radius:10px;'
            f'font-weight:600;font-size:0.8em;">{escape(str(user))}</span>'
            f'<code style="opacity:0.55;font-size:0.82em;">{escape(preview)}</code></div>'
        )
    return (
        '<div style="font-size:0.72em;opacity:0.5;margin-bottom:4px;"'
        ' title="Auth tokens obtained by logging in. Use these in the Authorization header.">'
        "AUTHENTICATED USERS</div>"
        + "".join(cards)
    )
def format_resources(ids):
    """Render the known resource IDs, grouped by resource type, as HTML.

    Args:
        ids: Mapping of resource type (e.g. ``"tasks"``, ``"users"``) to a
            list of IDs or a single value; falsy means nothing created yet.

    Returns:
        A hint on how to create resources when empty, otherwise a heading
        followed by one row per resource type in mapping order.
    """
    if not ids:
        return (
            '<div style="opacity:0.5;font-size:0.85em;">'
            "No resources created. Use POST endpoints to create tasks or users "
            "and track their IDs here.</div>"
        )

    palette = {"tasks": "#d97706", "users": "#2563eb"}

    def row(kind, members):
        tint = palette.get(kind, "#6b7280")  # grey for any other type
        # Lists are comma-joined; anything else is shown via str().
        if isinstance(members, list):
            listing = ", ".join(str(member) for member in members)
        else:
            listing = str(members)
        return (
            '<div style="padding:4px 0;border-bottom:1px solid rgba(128,128,128,0.1);font-size:0.85em;">'
            f'<span style="background:{tint}18;color:{tint};padding:1px 8px;border-radius:10px;'
            f'font-weight:600;font-size:0.8em;text-transform:uppercase;">{kind}</span>'
            f'<span style="margin-left:8px;opacity:0.7;">IDs: {listing}</span></div>'
        )

    heading = (
        '<div style="font-size:0.72em;opacity:0.5;margin-bottom:4px;"'
        ' title="Resources created during this episode. Use these IDs in GET/PUT/DELETE requests.">'
        "CREATED RESOURCES</div>"
    )
    return heading + "".join(row(kind, members) for kind, members in ids.items())
def format_endpoints():
    """Render ``API_SPEC`` as Markdown, one paragraph per endpoint.

    Returns:
        Entries of the form ``**METHOD** `path` — summary`` separated by
        blank lines; an endpoint without a ``summary`` key gets an empty
        summary.
    """
    # The separator is an em dash (it had been garbled into a stray character).
    return "\n\n".join(
        f"**{ep['method']}** `{ep['path']}` — {ep.get('summary', '')}"
        for ep in API_SPEC
    )
| # ===================================================================== | |
| # UI | |
| # ===================================================================== | |
# Shared Gradio theme: Soft base with an emerald/green palette on slate
# neutrals. Each font list names a Google font followed by generic fallbacks.
_GRADIO_THEME = gr.themes.Soft(
    font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace"],
    primary_hue="emerald",
    secondary_hue="green",
    neutral_hue="slate",
)
| # Custom CSS injected into the Gradio app to highlight important interactive | |
| # elements (primary buttons, active tabs, hover states) so the playground | |
| # doesn't feel washed out. Works in both light and dark mode. | |
| _GRADIO_CSS = """ | |
| /* βββ Mintlify-inspired green palette (flat, no gradients) βββ */ | |
| :root { | |
| --accent: #18E299; /* Brand Green */ | |
| --accent-hover: #0fa76e; /* Brand Green Deep */ | |
| --accent-soft: #d4fae8; /* Brand Green Light */ | |
| --accent-border: rgba(15, 167, 110, 0.28); | |
| --ink: #0d0d0d; | |
| --ink-muted: #666666; | |
| --line: #e5e5e5; | |
| --surface: #fafafa; | |
| --success: #16a34a; | |
| --danger: #dc2626; | |
| --info: #2563eb; | |
| } | |
| .dark { | |
| --accent: #18E299; | |
| --accent-hover: #34efaa; | |
| --accent-soft: rgba(24, 226, 153, 0.14); | |
| --accent-border: rgba(24, 226, 153, 0.35); | |
| --ink: #f5f5f5; | |
| --ink-muted: #a0a0a0; | |
| --line: rgba(255, 255, 255, 0.10); | |
| --surface: #141414; | |
| } | |
| /* βββ Primary buttons βββββββββββββββββββββββββββββββββββββββββββββ | |
| Light mode: near-black surface, white text, green hover. | |
| Dark mode: bright green surface, near-black text. | |
| Both flat (no gradients, no glow). */ | |
| button.primary, | |
| .gr-button.primary, | |
| button[class*="primary"], | |
| .gr-button-primary { | |
| background: #0d0d0d !important; | |
| background-image: none !important; | |
| color: #ffffff !important; | |
| border: 1px solid #0d0d0d !important; | |
| box-shadow: none !important; | |
| font-weight: 600 !important; | |
| letter-spacing: 0.01em !important; | |
| transition: background-color 0.15s ease, border-color 0.15s ease, color 0.15s ease !important; | |
| } | |
| button.primary:hover, | |
| .gr-button.primary:hover, | |
| button[class*="primary"]:hover, | |
| .gr-button-primary:hover { | |
| background: #0fa76e !important; | |
| background-image: none !important; | |
| color: #ffffff !important; | |
| border-color: #0fa76e !important; | |
| box-shadow: none !important; | |
| transform: none !important; | |
| filter: none !important; | |
| } | |
| button.primary:active, | |
| .gr-button.primary:active, | |
| .gr-button-primary:active { | |
| background: #0a8a5a !important; | |
| border-color: #0a8a5a !important; | |
| transform: none !important; | |
| filter: none !important; | |
| } | |
| /* Dark-mode override: bright green CTA pops against the dark surface */ | |
| .dark button.primary, | |
| .dark .gr-button.primary, | |
| .dark button[class*="primary"], | |
| .dark .gr-button-primary { | |
| background: #18E299 !important; | |
| color: #07301f !important; | |
| border: 1px solid #18E299 !important; | |
| } | |
| .dark button.primary:hover, | |
| .dark .gr-button.primary:hover, | |
| .dark button[class*="primary"]:hover, | |
| .dark .gr-button-primary:hover { | |
| background: #34efaa !important; | |
| border-color: #34efaa !important; | |
| color: #07301f !important; | |
| } | |
| /* βββ Secondary buttons ββββββββββββββββββββββββββββββββββββββββββ | |
| Light mode: white with dark border, fills near-black on hover. | |
| Dark mode: ghost button with green border. */ | |
| button.secondary, | |
| .gr-button.secondary, | |
| .gr-button-secondary { | |
| background: #ffffff !important; | |
| background-image: none !important; | |
| border: 1px solid #0d0d0d !important; | |
| color: #0d0d0d !important; | |
| font-weight: 500 !important; | |
| box-shadow: none !important; | |
| transition: background-color 0.15s ease, color 0.15s ease, border-color 0.15s ease !important; | |
| } | |
| button.secondary:hover, | |
| .gr-button.secondary:hover, | |
| .gr-button-secondary:hover { | |
| background: #0d0d0d !important; | |
| color: #ffffff !important; | |
| border-color: #0d0d0d !important; | |
| } | |
| .dark button.secondary, | |
| .dark .gr-button.secondary, | |
| .dark .gr-button-secondary { | |
| background: transparent !important; | |
| border: 1px solid var(--accent-border) !important; | |
| color: var(--accent) !important; | |
| } | |
| .dark button.secondary:hover, | |
| .dark .gr-button.secondary:hover, | |
| .dark .gr-button-secondary:hover { | |
| background: var(--accent) !important; | |
| color: #07301f !important; | |
| border-color: var(--accent) !important; | |
| } | |
| /* βββ Tabs (selected tab uses brand green) βββ */ | |
| button[role="tab"][aria-selected="true"], | |
| .tab-nav button.selected, | |
| .tab-nav button[aria-selected="true"] { | |
| color: var(--accent-hover) !important; | |
| border-bottom: 2px solid var(--accent) !important; | |
| font-weight: 600 !important; | |
| } | |
| .dark button[role="tab"][aria-selected="true"], | |
| .dark .tab-nav button.selected, | |
| .dark .tab-nav button[aria-selected="true"] { | |
| color: var(--accent) !important; | |
| } | |
| button[role="tab"]:hover, | |
| .tab-nav button:hover { | |
| color: var(--accent-hover) !important; | |
| } | |
| .dark button[role="tab"]:hover, | |
| .dark .tab-nav button:hover { | |
| color: var(--accent) !important; | |
| } | |
| /* βββ Inputs (focus ring uses brand green, no glow) βββ */ | |
| .gr-dropdown, | |
| .gr-input, | |
| .gr-textbox, | |
| input[type="text"], | |
| input[type="number"], | |
| textarea, | |
| select { | |
| transition: border-color 0.15s ease, box-shadow 0.15s ease !important; | |
| } | |
| .gr-dropdown:focus-within, | |
| .gr-input:focus-within, | |
| .gr-textbox:focus-within, | |
| input:focus, | |
| textarea:focus, | |
| select:focus { | |
| border-color: var(--accent) !important; | |
| box-shadow: 0 0 0 3px var(--accent-soft) !important; | |
| outline: none !important; | |
| } | |
| /* βββ Section headings βββ */ | |
| h1, h2, h3 { | |
| letter-spacing: -0.01em !important; | |
| } | |
| h1, h2 { | |
| color: #0d0d0d !important; | |
| } | |
| .dark h1, .dark h2 { | |
| color: #f5f5f5 !important; | |
| } | |
| h3 { | |
| color: #0d0d0d !important; | |
| font-weight: 600 !important; | |
| border-bottom: 1px solid var(--line) !important; | |
| padding-bottom: 6px !important; | |
| margin-bottom: 12px !important; | |
| position: relative !important; | |
| } | |
| /* small green accent bar before each section heading for brand identity */ | |
| h3::before { | |
| content: "" !important; | |
| display: inline-block !important; | |
| width: 4px !important; | |
| height: 14px !important; | |
| background: #18E299 !important; | |
| border-radius: 2px !important; | |
| margin-right: 8px !important; | |
| vertical-align: -2px !important; | |
| } | |
| .dark h3 { | |
| color: #f5f5f5 !important; | |
| } | |
| /* βββ Markdown links βββ */ | |
| .prose a, .markdown a, a { | |
| color: var(--accent-hover) !important; | |
| text-decoration: none !important; | |
| border-bottom: 1px solid var(--accent-border) !important; | |
| } | |
| .dark .prose a, .dark .markdown a, .dark a { | |
| color: var(--accent) !important; | |
| } | |
| .prose a:hover, .markdown a:hover, a:hover { | |
| border-bottom-color: var(--accent) !important; | |
| } | |
| /* βββ Accordion headers βββ */ | |
| .gr-accordion > button, | |
| button[class*="accordion"] { | |
| color: var(--accent-hover) !important; | |
| font-weight: 600 !important; | |
| } | |
| .dark .gr-accordion > button, | |
| .dark button[class*="accordion"] { | |
| color: var(--accent) !important; | |
| } | |
| /* βββ Card borders (Mintlify principle: borders, not shadows) βββ */ | |
| .gr-block.gr-box { | |
| border-color: var(--line) !important; | |
| box-shadow: none !important; | |
| } | |
| /* βββ Match the Gradio dark surface to the blog section ββββββββββ | |
| The blog section below uses #0a0a0a as its background. Override | |
| Gradio's default slate so the page reads as one continuous canvas. */ | |
| .dark { | |
| --body-background-fill: #0a0a0a !important; | |
| --background-fill-primary: #0a0a0a !important; | |
| --background-fill-secondary: #131313 !important; | |
| --block-background-fill: #131313 !important; | |
| --panel-background-fill: #131313 !important; | |
| --input-background-fill: #131313 !important; | |
| --border-color-primary: rgba(255, 255, 255, 0.08) !important; | |
| } | |
| .dark, | |
| .dark body, | |
| .dark gradio-app, | |
| .dark .gradio-container, | |
| .dark .main, | |
| .dark .wrap, | |
| .dark .app, | |
| .dark .contain { | |
| background: #0a0a0a !important; | |
| background-color: #0a0a0a !important; | |
| } | |
| /* Cards / blocks get a slightly lighter surface so they remain | |
| visually separated from the page background. */ | |
| .dark .gr-block, | |
| .dark .gr-box, | |
| .dark .gr-form, | |
| .dark .gr-panel, | |
| .dark .block, | |
| .dark .form { | |
| background: #131313 !important; | |
| background-color: #131313 !important; | |
| border-color: rgba(255, 255, 255, 0.08) !important; | |
| } | |
| """ | |
def build_ui():
    """Assemble the Gradio Blocks app and wire its event handlers.

    Layout: a three-column row (environment control, scoreboard and session
    context on the left; agent/manual-testing tabs with shared response and
    feedback panes in the centre; discovered bugs, activity log and the
    OWASP report on the right), then an embedded demo video and the
    ``BLOG_HTML`` documentation section.

    Returns:
        The constructed ``gr.Blocks`` instance; the caller launches it.
    """
    # Mintlify-inspired green Soft theme — adapts to ?__theme=light / ?__theme=dark
    # URL params on HuggingFace Spaces. The blog section below also reads the
    # .dark body class so the entire page adapts together.
    with gr.Blocks(title="API Testing Environment", theme=_GRADIO_THEME, css=_GRADIO_CSS) as demo:
        # Session object handed to every handler as the `session` input.
        session = gr.State(value=new_session())
        gr.Markdown(
            "# API Testing Environment\n"
            "An OpenEnv RL environment that trains AI agents to become automated **API security testers**. "
            "A simulated API server with **13 hidden vulnerabilities** mapped to the **OWASP API Security Top 10** is provided. "
            "Send HTTP requests, earn rewards for finding bugs and covering endpoints, and generate a **bug bounty report** at episode end. "
            "Use **Manual Testing** to craft requests yourself, or run a **Baseline Agent** to watch an automated strategy."
        )
        with gr.Row():
            # -- Left Panel --
            with gr.Column(scale=1):
                gr.Markdown("### Environment Control")
                task_dropdown = gr.Dropdown(choices=list(TASKS.keys()), value="basic_validation", label="Select Task")
                reset_btn = gr.Button("Reset Environment", variant="primary", size="lg")
                gr.Markdown(
                    '<span style="font-size:0.8em;opacity:0.55;">'
                    "Switch task or click Reset to start a fresh episode. "
                    "Resets all scores, bugs, and step count.</span>"
                )
                status_box = gr.Markdown("Initializing...")
                gr.Markdown("---")
                gr.Markdown("### Scoreboard")
                gr.Markdown(
                    '<span style="font-size:0.78em;opacity:0.55;">'
                    "Tracks your testing progress. Steps are API calls you've made; "
                    "bugs are issues discovered in the API; reward measures how well "
                    "the agent is testing.</span>"
                )
                # Initial counters are placeholders; the load-time reset wired
                # at the bottom of this function writes to both components.
                with gr.Row():
                    step_display = gr.Markdown("0 / 25", label="Steps")
                    bug_display = gr.Markdown("0 / 3", label="Bugs")
                reward_display = gr.Markdown(format_reward_display(0, 0, {}), label="Reward")
                coverage_display = gr.Markdown("No data", label="Coverage")
                gr.Markdown("---")
                gr.Markdown("### Session Context")
                gr.Markdown(
                    '<span style="font-size:0.78em;opacity:0.55;">'
                    "Tokens and resources gathered during this episode. "
                    "Use tokens to test auth-protected endpoints and resource IDs for "
                    "GET/PUT/DELETE requests.</span>"
                )
                auth_display = gr.Markdown(format_auth_tokens({}))
                resource_display = gr.Markdown(format_resources({}))
                gr.Markdown("---")
                with gr.Accordion("API Specification", open=False):
                    gr.Markdown(format_endpoints())
            # -- Center Panel --
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.Tab("Run Baseline Agent"):
                        gr.Markdown("### Automated Agents\nWatch a baseline agent test the API step by step. Pick a strategy and click Run Agent.")
                        agent_dropdown = gr.Dropdown(choices=["random", "sequential", "smart"], value="smart", label="Agent Type")
                        run_agent_btn = gr.Button("Run Agent", variant="primary", size="lg")
                    with gr.Tab("Manual Testing"):
                        gr.Markdown("### Craft Your Request")
                        with gr.Row():
                            method_input = gr.Dropdown(
                                choices=["GET", "POST", "PUT", "DELETE", "PATCH"],
                                value="GET", label="Method", scale=1,
                            )
                            endpoint_input = gr.Textbox(value="/tasks", label="Endpoint", placeholder="/tasks, /users/1, /auth/login", scale=3)
                            expected_input = gr.Textbox(value="200", label="Expected Status", placeholder="200", scale=1)
                        with gr.Row():
                            headers_input = gr.Textbox(value="{}", label="Headers (JSON)", placeholder='{"Authorization": "Bearer ..."}', lines=1)
                            params_input = gr.Textbox(value="{}", label="Query Params (JSON)", placeholder='{"page": 1, "limit": 10}', lines=1)
                        body_input = gr.Textbox(value="", label="Request Body (JSON)", placeholder='{"title": "My Task", "description": "..."}', lines=3)
                        send_btn = gr.Button("Send Request", variant="primary", size="lg")
                        gr.Markdown("### Quick Actions")
                        # Labels must match the preset keys in apply_quick_action.
                        quick_actions = gr.Dropdown(
                            choices=[
                                "GET /tasks", "GET /users", "GET /tasks/1",
                                "GET /tasks/999999 (bug hunt)", "POST create task",
                                "POST missing title (bug hunt)", "Login as alice", "Login as bob",
                                "Login empty pwd (bug hunt)", "Negative page (bug hunt)",
                                "Huge limit (bug hunt)", "Invalid email PUT (bug hunt)",
                                "DELETE non-existent (bug hunt)", "Create user invalid email (bug)",
                                "SQL injection test", "Long title crash (bug hunt)",
                            ],
                            label="Quick Actions", value=None,
                        )
                        quick_btn = gr.Button("Load Quick Action", variant="secondary")
                # Response and feedback panes are written by both the manual
                # "Send Request" handler and the baseline-agent handler.
                gr.Markdown("---")
                gr.Markdown("### Response")
                response_display = gr.Markdown("")
                gr.Markdown("### Feedback")
                feedback_display = gr.Markdown("")
            # -- Right Panel --
            # Stacked (no tabs) so Discovered Bugs and Activity Log are both
            # visible at once — users shouldn't have to click to see the log.
            with gr.Column(scale=1):
                gr.Markdown("### Discovered Bugs")
                bug_list_display = gr.Markdown("No bugs found yet.")
                gr.Markdown("### Activity Log")
                log_display = gr.Markdown("No steps yet.")
                with gr.Accordion("Bug Report (OWASP)", open=False):
                    gr.Markdown("*Auto-generated OWASP security report. Populates as bugs are found.*")
                    bug_report_display = gr.Markdown("No bugs found yet. Send requests to discover vulnerabilities.")
        # -- Demo video (embedded between the app and the blog) --
        gr.HTML(
            """
<div style="max-width: 900px; margin: 32px auto; padding: 0 16px;">
<div style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden; border-radius: 12px; box-shadow: 0 8px 32px rgba(0,0,0,0.4);">
<iframe
src="https://www.youtube.com/embed/9psbwJug6G4"
title="YouTube video player"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin"
allowfullscreen
style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;">
</iframe>
</div>
</div>
"""
        )
        # -- Editorial blog-style documentation below the app --
        gr.HTML(BLOG_HTML)
        # -- Wiring --
        # reset_outputs is step_outputs plus status_box in second position;
        # handlers must return values in exactly these orders.
        reset_outputs = [
            session, status_box, feedback_display, response_display,
            reward_display, bug_display, coverage_display, log_display,
            step_display, bug_list_display, bug_report_display, auth_display, resource_display,
        ]
        step_outputs = [
            session, feedback_display, response_display, reward_display,
            bug_display, coverage_display, log_display, step_display,
            bug_list_display, bug_report_display, auth_display, resource_display,
        ]
        reset_btn.click(fn=reset_env, inputs=[task_dropdown, session], outputs=reset_outputs)
        send_btn.click(
            fn=send_request,
            inputs=[method_input, endpoint_input, headers_input, params_input, body_input, expected_input, session],
            outputs=step_outputs,
        )
        # Quick actions only pre-fill the request form; nothing is sent.
        quick_btn.click(
            fn=apply_quick_action, inputs=[quick_actions, session],
            outputs=[method_input, endpoint_input, headers_input, params_input, body_input, expected_input],
        )
        run_agent_btn.click(fn=run_baseline_agent, inputs=[agent_dropdown, session], outputs=step_outputs)
        # Auto-reset on page load so users can start testing immediately
        demo.load(fn=reset_env, inputs=[task_dropdown, session], outputs=reset_outputs)
    return demo
if __name__ == "__main__":
    import inspect

    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=int(os.getenv("GRADIO_SERVER_PORT", "7860")))
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--share", action="store_true")
    args = parser.parse_args()

    launch_kwargs = dict(server_name=args.host, server_port=args.port, share=args.share)

    # Build the UI once. Blocks() already receives theme + css (Gradio 5.x);
    # Gradio 6.0+ also takes them in launch(), so pass them there only when
    # launch() declares those parameters. Inspecting the signature replaces a
    # blanket `except TypeError` around the blocking launch() call, which
    # would also swallow unrelated TypeErrors and rebuild the whole UI.
    demo = build_ui()
    launch_params = inspect.signature(demo.launch).parameters
    if "theme" in launch_params and "css" in launch_params:
        launch_kwargs.update(theme=_GRADIO_THEME, css=_GRADIO_CSS)
    demo.launch(**launch_kwargs)