aws_rl_env / server /templates /index.html
Sizzing's picture
Upload folder using huggingface_hub
c745a99 verified
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AWS RL Environment</title>
<style>
/* ===== Design tokens (palette matches portfolio.udaykp.dev) ===== */
:root {
  /* Surfaces */
  --bg-color: #ffffff;
  --surface-color: #ffffff;
  --surface-hover: #f8f9fa;

  /* Text */
  --text-main: #202124;
  --text-muted: #5f6368;

  /* Accents */
  --accent-color: #202124;
  --accent-hover: #000000;
  --blue-accent: #1a73e8;
  --blue-hover: #1557b0;

  /* Lines and decoration */
  --border-color: #9aa0a6;
  --grid-dot: #a8adb3;

  /* Layout: also consumed by the sticky left column offset */
  --nav-height: 72px;
}
/* ===== Reset & base typography ===== */
/* Universal reset: no default spacing, border-box sizing, one font stack for every element. */
* {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
  font-family: 'Google Sans', 'Roboto', system-ui, -apple-system, sans-serif;
}

/* Root size: every rem in this sheet resolves against 18px. */
html { font-size: 18px; }

body {
  color: var(--text-main);
  background-color: var(--bg-color);
  line-height: 1.6;
  -webkit-font-smoothing: antialiased;
}

h1, h2, h3, h4 {
  color: var(--text-main);
  font-weight: 500;
  line-height: 1.2;
}

p {
  margin-bottom: 1rem;
  color: var(--text-muted);
  font-size: 1.1rem;
}

/* Links take the surrounding text color and carry no underline. */
a {
  color: inherit;
  text-decoration: none;
}
/* ===== Navigation ===== */
/* Resting state: full-width frosted bar fixed to the top edge. */
nav {
  position: fixed;
  top: 0;
  left: 50%;
  z-index: 1000;
  transform: translateX(-50%); /* paired with left: 50% to center the bar */
  display: flex;
  align-items: center;
  justify-content: center;
  width: 100%;
  height: var(--nav-height);
  background: rgba(255, 255, 255, 0.55);
  -webkit-backdrop-filter: blur(16px) saturate(180%);
  backdrop-filter: blur(16px) saturate(180%);
  border-bottom: 1px solid rgba(0, 0, 0, 0.12);
  box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
  transition: all 0.4s ease;
}

/* Compact state: floating pill that shrinks to its content.
   The .scrolled class is presumably toggled by script; confirm against the page JS. */
nav.scrolled {
  top: 16px;
  width: max-content;
  max-width: calc(100% - 32px);
  height: 56px;
  padding: 0 1.5rem;
  background: rgba(255, 255, 255, 0.5);
  -webkit-backdrop-filter: blur(16px) saturate(180%);
  backdrop-filter: blur(16px) saturate(180%);
  border: 1px solid rgba(0, 0, 0, 0.1);
  border-radius: 28px;
  box-shadow: 0 4px 16px rgba(0, 0, 0, 0.1), 0 1px 4px rgba(0, 0, 0, 0.06);
}

.nav-links {
  display: flex;
  gap: 0.5rem;
  list-style: none;
}

.nav-links a {
  padding: 0.5rem 1.2rem;
  border-radius: 24px;
  color: #3c4043;
  font-family: 'Google Sans', 'Roboto', sans-serif;
  font-size: 1.05rem;
  font-weight: 400;
  transition: all 0.2s ease;
}

.nav-links a:hover {
  background: #f1f3f4;
  color: #202124;
}

/* Highlight for the link marked .active. */
.nav-links a.active {
  background: #e8f0fe;
  color: var(--blue-accent);
  font-weight: 500;
}
/* ===== Hero ===== */
/* Full-viewport intro with its content centered on both axes. */
.hero {
  position: relative;
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  height: 100vh;
  padding: 2rem;
  overflow: hidden;
  background-color: var(--bg-color);
  text-align: center;
}

/* Layer 0: dotted grid. --bg-x / --bg-y shift the pattern and default to 0px when unset. */
.hero-bg {
  position: absolute;
  inset: 0;
  z-index: 0;
  background-image: radial-gradient(circle, var(--grid-dot) 1.5px, transparent 1.5px);
  background-position: calc(50% + var(--bg-x, 0px)) calc(50% + var(--bg-y, 0px));
  background-size: 36px 36px;
  transition: background-position 0.15s cubic-bezier(0.25, 1, 0.5, 1);
}

/* Layer 1: blue glow centered on --mouse-x / --mouse-y (falls back to the middle). */
.hero-bg::before {
  content: '';
  position: absolute;
  inset: 0;
  z-index: 1;
  background: radial-gradient(700px circle at var(--mouse-x, 50%) var(--mouse-y, 50%), rgba(26, 115, 232, 0.25), transparent 55%);
  pointer-events: none;
}

/* Layer 2: fade to white toward the bottom edge. */
.hero::after {
  content: '';
  position: absolute;
  inset: 0;
  z-index: 2;
  background: linear-gradient(to bottom, rgba(255, 255, 255, 0) 0%, rgba(255, 255, 255, 1) 100%);
  pointer-events: none;
}

/* Layer 3: the actual text and buttons, above every decorative layer. */
.hero-content {
  position: relative;
  z-index: 3;
}

.hero h1 {
  margin-bottom: 1rem;
  font-size: 4rem;
  letter-spacing: -1.5px;
}

.hero h2 {
  margin-bottom: 2.5rem;
  color: var(--text-muted);
  font-size: 1.5rem;
  font-weight: 400;
}
/* Typewriter */
/* Each character starts hidden and is revealed by adding .visible. */
.type-animate .char {
  opacity: 0;
  transition: opacity 0.05s;
}

.type-animate .char.visible {
  opacity: 1;
}

/* Zero-width caret: takes no layout space, its glyph overflows visibly. */
.typing-cursor {
  display: inline-block;
  width: 0;
  overflow: visible;
  color: var(--blue-accent);
  font-weight: 300;
  animation: blink 1s step-start infinite;
  pointer-events: none;
}

/* Only the midpoint is keyed, so the caret flips between visible and hidden. */
@keyframes blink {
  50% {
    opacity: 0;
  }
}

/* Respect the OS-level reduced-motion preference: the caret stays solid
   instead of blinking forever, and characters appear without a fade. */
@media (prefers-reduced-motion: reduce) {
  .typing-cursor {
    animation: none;
  }

  .type-animate .char {
    transition: none;
  }
}
/* Hero call-to-action row */
.hero-cta-container {
  display: flex;
  justify-content: center;
  gap: 1rem;
  margin-bottom: 1.5rem;
}

/* Entrance effect: starts transparent and 20px low, settles once .visible is added. */
.hero-fade-up {
  opacity: 0;
  transform: translateY(20px);
  transition: opacity 0.8s ease, transform 0.8s ease;
}

.hero-fade-up.visible {
  opacity: 1;
  transform: translateY(0);
}
/* ===== Buttons ===== */
/* Geometry and typography shared by both pill buttons. */
.btn-primary,
.btn-secondary {
  display: inline-flex;
  align-items: center;
  gap: 0.5rem;
  padding: 0.75rem 2rem;
  border-radius: 50px;
  font-family: 'Google Sans', 'Roboto', sans-serif;
  font-size: 1.05rem;
  font-weight: 500;
  cursor: pointer;
  transition: all 0.2s ease;
}

/* Primary: solid blue. */
.btn-primary {
  background: var(--blue-accent);
  border: 1px solid var(--blue-accent);
  color: white;
}

.btn-primary:hover {
  background: var(--blue-hover);
  box-shadow: 0 4px 12px rgba(26, 115, 232, 0.3);
  transform: translateY(-1px);
}

/* Declared after :hover (equal specificity) so a disabled button ignores hover styling. */
.btn-primary:disabled {
  background: var(--border-color);
  border-color: var(--border-color);
  box-shadow: none;
  color: var(--text-muted);
  cursor: not-allowed;
  transform: none;
}

/* Secondary: light gray with an outline. */
.btn-secondary {
  background: #f8f9fa;
  border: 1px solid var(--border-color);
  color: var(--text-main);
}

.btn-secondary:hover {
  background: #f1f3f4;
  box-shadow: 0 1px 2px 0 rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
}

/* Modifier: stretch a button across its container and center the label. */
.btn-full {
  justify-content: center;
  width: 100%;
}
/* ===== Container & Section Wrapper ===== */
/* Centered content column, capped at 1000px. */
.container {
  position: relative;
  max-width: 1000px;
  margin: 0 auto;
  padding: 0 2rem;
}

/* Two-column section: narrow label column on the left, content on the right. */
.section-wrapper {
  display: flex;
  gap: 4rem;
  padding: 6rem 0;
  border-bottom: 1px solid var(--border-color);
}

.section-wrapper:last-child { border-bottom: none; }

/* Left column sticks 40px below the navigation bar while its section scrolls. */
.left-col {
  position: sticky;
  top: calc(var(--nav-height) + 40px);
  flex: 0 0 120px;
  display: flex;
  flex-direction: column;
  align-items: center;
  height: max-content;
  text-align: center;
}

.icon-container {
  display: flex;
  align-items: center;
  justify-content: center;
  width: 56px;
  height: 56px;
  margin-bottom: 1rem;
  background: var(--surface-color);
  border: 1px solid var(--border-color);
  border-radius: 16px;
  color: var(--accent-color);
  transition: all 0.2s ease;
}

/* Icons are outline-only and follow the container's text color. */
.icon-container svg {
  width: 24px;
  height: 24px;
  fill: none;
  stroke: currentColor;
  stroke-width: 1.5;
}

/* Hovering anywhere in the section tints that section's icon tile. */
.section-wrapper:hover .icon-container {
  background: rgba(26, 115, 232, 0.04);
  border-color: var(--blue-accent);
  transform: scale(1.02);
}

.section-title {
  color: var(--text-main);
  font-size: 1rem;
  font-weight: 600;
  letter-spacing: 0.5px;
  text-transform: uppercase;
}

.right-col { flex: 1; }
/* ===== Cards ===== */
/* Large content card. */
.card {
  position: relative;
  overflow: hidden;
  margin-bottom: 2rem;
  padding: 2.5rem;
  background: var(--surface-color);
  border: 1px solid var(--border-color);
  border-radius: 24px;
  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
  transition: box-shadow 0.2s ease, border-color 0.2s ease;
}

.card:last-child { margin-bottom: 0; }

.card h3 {
  margin-bottom: 0.5rem;
  font-size: 1.4rem;
  font-weight: 600;
}

.card p,
.card li {
  font-weight: 450;
}

/* Emphasized card: slightly heavier border and shadow. */
.cta-card {
  border: 1.5px solid var(--border-color);
  box-shadow: 0 2px 12px rgba(0, 0, 0, 0.06);
}

/* Compact card that fills the height of its grid cell. */
.minimal-card {
  position: relative;
  overflow: hidden;
  height: 100%;
  padding: 1.5rem;
  background: var(--surface-color);
  border: 1px solid var(--border-color);
  border-radius: 16px;
  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
  transition: all 0.2s ease;
}

/* --- Behavior shared by .card and .minimal-card --- */

/* Hidden glow layer positioned at --mouse-x / --mouse-y (both fall back to 0). */
.card::before,
.minimal-card::before {
  content: '';
  position: absolute;
  inset: 0;
  z-index: 0;
  border-radius: inherit;
  background: radial-gradient(600px circle at var(--mouse-x, 0) var(--mouse-y, 0), rgba(26, 115, 232, 0.08), transparent 40%);
  opacity: 0;
  pointer-events: none;
  transition: opacity 0.3s ease;
}

/* Lift direct children above the glow layer. */
.card > *,
.minimal-card > * {
  position: relative;
  z-index: 1;
}

/* On hover the border gives way to a soft elevation shadow. */
.card:hover,
.minimal-card:hover {
  border-color: transparent;
  box-shadow: 0 1px 2px 0 rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
}

.card:hover::before,
.minimal-card:hover::before {
  opacity: 1;
}
/* ===== Grid ===== */
/* Two equal-width columns. */
.grid-2 {
  display: grid;
  grid-template-columns: repeat(2, 1fr);
  gap: 1.5rem;
}
/* ===== Tags ===== */
/* Wrapping row of pill-shaped tags. */
.skills-container {
  display: flex;
  flex-wrap: wrap;
  gap: 0.5rem;
}

/* font-weight was previously declared twice (450, then 400). Only the last
   declaration ever applied, so the dead one is removed and 400 is kept. */
.skill-tag {
  padding: 0.375rem 1rem;
  background: #f1f3f4;
  border: 1px solid transparent;
  border-radius: 16px;
  color: var(--text-main);
  font-size: 1rem;
  font-weight: 400;
  transition: all 0.2s ease;
}

.skill-tag:hover {
  background: #e8eaed;
}

/* Blue-tinted variant. Its specificity beats :hover, so it keeps its background on hover. */
.skill-tag.accent {
  background: #e8f0fe;
  color: var(--blue-accent);
}
/* ===== Tier list ===== */
/* One row per tier: badge, task count, description. */
.tier-item {
  display: flex;
  align-items: center;
  gap: 1.5rem;
  margin-bottom: 0.75rem;
  padding: 1rem 1.25rem;
  border: 1px solid var(--border-color);
  border-radius: 16px;
  transition: all 0.2s ease;
}

.tier-item:hover {
  border-color: transparent;
  box-shadow: 0 1px 2px 0 rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
}

.tier-item:last-child { margin-bottom: 0; }

/* Fixed minimum width keeps the badges aligned across rows. */
.tier-badge {
  display: inline-block;
  min-width: 110px;
  padding: 0.25rem 1rem;
  border-radius: 16px;
  font-size: 0.85rem;
  font-weight: 500;
  text-align: center;
}

/* Per-tier color pairs (tinted background, darker text). */
.tier-badge.warmup       { background: #e6f4ea; color: #137333; }
.tier-badge.beginner     { background: #e8f0fe; color: #174ea6; }
.tier-badge.intermediate { background: #fef7e0; color: #b05a00; }
.tier-badge.advanced     { background: #fce8e6; color: #c5221f; }
.tier-badge.expert       { background: #f3e8fd; color: #7627bb; }

.tier-tasks {
  min-width: 60px;
  color: var(--text-muted);
  font-size: 0.9rem;
}

.tier-desc {
  color: var(--text-main);
  font-size: 0.95rem;
}
/* ===== Feature grid ===== */
/* As many columns as fit, each at least 220px wide. */
.feature-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1.5rem;
}

/* Rounded blue tile holding a centered icon glyph. */
.feature-icon {
  display: flex;
  align-items: center;
  justify-content: center;
  width: 48px;
  height: 48px;
  margin-bottom: 1rem;
  background: #e8f0fe;
  border-radius: 16px;
  color: var(--blue-accent);
  font-size: 20px;
}
/* ===== Code block ===== */
/* Header row above a snippet; its children sit at opposite ends. */
.code-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  margin-bottom: 0.75rem;
}

.copy-btn {
  display: inline-flex;
  align-items: center;
  gap: 0.4rem;
  padding: 0.4rem 0.75rem;
  background: #f1f3f4;
  border: 1px solid var(--border-color);
  border-radius: 8px;
  color: var(--text-muted);
  font-family: 'Google Sans', 'Roboto', sans-serif;
  font-size: 0.8rem;
  cursor: pointer;
  transition: all 0.2s ease;
}

.copy-btn:hover {
  background: #e8eaed;
  color: var(--text-main);
}

/* Green confirmation state, applied through the .copied class. */
.copy-btn.copied {
  background: #e6f4ea;
  border-color: #34a853;
  color: #137333;
}

/* Syntax highlighting */
/* The universal reset assigns the sans stack to every element, so spans must
   explicitly inherit the block's monospace font and size. */
.code-block span {
  font-family: inherit;
  font-size: inherit;
}

.hl-keyword { color: #1a73e8; font-weight: 500; }
.hl-string  { color: #137333; }
.hl-comment { color: #9aa0a6; font-style: italic; }
.hl-builtin { color: #7627bb; }
.hl-punct   { color: #5f6368; }

/* The snippet surface: whitespace is preserved and long lines scroll sideways. */
.code-block {
  padding: 1.5rem;
  overflow-x: auto;
  background: #f8f9fa;
  border: 1px solid var(--border-color);
  border-radius: 16px;
  color: var(--text-main);
  font-family: 'Google Sans Mono', 'SF Mono', 'Fira Code', 'Consolas', monospace;
  font-size: 0.85rem;
  line-height: 1.7;
  white-space: pre;
}
/* ===== Playground ===== */
/* Fixed 280px side column next to a flexible main column. */
.pg-row-2col {
  display: grid;
  grid-template-columns: 280px 1fr;
  gap: 1rem;
  align-items: start;
}

/* Small uppercase caption shown above a card's content. */
.card-label {
  display: block;
  margin-bottom: 0.75rem;
  color: var(--blue-accent);
  font-size: 0.8rem;
  font-weight: 700;
  letter-spacing: 1px;
  text-transform: uppercase;
}

/* Monospace command field. The native outline is removed; focus is shown by the rule below. */
.cmd-input {
  width: 100%;
  padding: 0.75rem 1rem;
  background: #f8f9fa;
  border: 1px solid var(--border-color);
  border-radius: 12px;
  color: var(--text-main);
  font-family: 'Google Sans Mono', 'SF Mono', 'Fira Code', 'Consolas', monospace;
  font-size: 0.9rem;
  outline: none;
  transition: border-color 0.2s ease, box-shadow 0.2s ease;
}

/* Focus ring: blue border plus halo. The transparent outline is invisible in
   normal rendering but is painted in a system color under forced-colors
   (high contrast) mode, where box-shadow is dropped and border colors are
   overridden. Without it the field would have no focus indicator there. */
.cmd-input:focus {
  border-color: var(--blue-accent);
  box-shadow: 0 0 0 3px rgba(26, 115, 232, 0.12);
  outline: 2px solid transparent;
}

.cmd-input::placeholder {
  color: #9aa0a6;
}

.cmd-input:disabled {
  background: #f1f3f4;
  color: #9aa0a6;
  cursor: not-allowed;
}

/* Disabled look for secondary buttons. */
.btn-secondary:disabled {
  background: #f1f3f4;
  border-color: var(--border-color);
  box-shadow: none;
  color: #9aa0a6;
  cursor: not-allowed;
}
/* State box */
/* Vertical stack of label/value rows. */
.state-info {
  display: flex;
  flex-direction: column;
  gap: 0.75rem;
}

/* One row: label on the left, value pushed to the right. */
.state-row {
  display: flex;
  align-items: center;
  justify-content: space-between;
  gap: 0.75rem;
}

.state-label {
  color: var(--text-main);
  font-size: 0.9rem;
  font-weight: 500;
}
/* Solution button */
/* Amber pill that sets the solution control apart from the blue and gray buttons. */
.btn-solution {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  gap: 0.5rem;
  padding: 0.6rem 1.5rem;
  background: #fef7e0;
  border: 1px solid #f9ab00;
  border-radius: 50px;
  color: #b05a00;
  font-family: 'Google Sans', 'Roboto', sans-serif;
  font-size: 0.95rem;
  font-weight: 500;
  cursor: pointer;
  transition: all 0.2s ease;
}

.btn-solution:hover {
  background: #f9ab00;
  color: #fff;
}

/* Kept after :hover (equal specificity) so the disabled look wins while hovered. */
.btn-solution:disabled {
  background: #f1f3f4;
  border-color: var(--border-color);
  color: #9aa0a6;
  cursor: not-allowed;
}
/* Solution panel */
/* Amber callout. border-left must follow border so the thick accent edge is kept. */
.solution-panel {
  padding: 1.25rem;
  background: #fffbeb;
  border-radius: 16px;
  border: 1px solid #f9ab00;
  border-left: 4px solid #f9ab00;
}

.solution-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  margin-bottom: 0.75rem;
}

/* Vertical list of steps. */
.solution-commands {
  display: flex;
  flex-direction: column;
  gap: 0.5rem;
}

/* One step: numbered bubble followed by the command text. */
.solution-cmd {
  display: flex;
  align-items: flex-start;
  gap: 0.75rem;
  padding: 0.75rem 1rem;
  background: #fff;
  border: 1px solid #f0e6c8;
  border-radius: 10px;
}

/* Circular step number; flex-shrink: 0 keeps it round beside long commands. */
.solution-step {
  display: flex;
  align-items: center;
  justify-content: center;
  flex-shrink: 0;
  min-width: 24px;
  height: 24px;
  margin-top: 0.1rem;
  background: #f9ab00;
  border-radius: 50%;
  color: #fff;
  font-size: 0.75rem;
  font-weight: 600;
}

/* Commands may break at any character so they never overflow the panel. */
.solution-cmd code {
  color: var(--text-main);
  font-family: 'Google Sans Mono', 'SF Mono', 'Fira Code', 'Consolas', monospace;
  font-size: 0.85rem;
  line-height: 1.5;
  word-break: break-all;
}

/* Note variant: dashed border and italic sans-serif text instead of monospace. */
.solution-cmd.is-note {
  background: #fff8e1;
  border-style: dashed;
}

.solution-cmd.is-note code {
  color: #b05a00;
  font-family: 'Google Sans', 'Roboto', sans-serif;
  font-size: 0.9rem;
  font-style: italic;
}

.solution-cmd.is-note .solution-step {
  background: #e0a800;
}

/* Caps the list height and scrolls vertically past it. */
.solution-commands-scroll {
  max-height: 150px;
  overflow-y: auto;
}

.state-value {
  color: var(--text-main);
  font-size: 0.95rem;
  font-weight: 500;
}

/* Progress meter: gray track ... */
.progress-bar-container {
  flex: 1;
  max-width: 120px;
  height: 8px;
  overflow: hidden;
  background: #f1f3f4;
  border-radius: 4px;
}

/* ... and blue fill whose width changes animate over 0.4s. */
.progress-bar-fill {
  height: 100%;
  background: var(--blue-accent);
  border-radius: 4px;
  transition: width 0.4s ease;
}
/* Infrastructure tiles */
/* Auto-filling grid of square tiles, each at least 90px wide. */
.infra-tiles {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(90px, 1fr));
  gap: 0.75rem;
}

.infra-tile {
  position: relative; /* anchor for the count badge */
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  gap: 0.3rem;
  aspect-ratio: 1;
  padding: 0.5rem;
  border: 1px solid var(--border-color);
  border-radius: 14px;
  cursor: pointer;
  transition: all 0.2s ease;
}

.infra-tile:hover {
  border-color: var(--blue-accent);
  box-shadow: 0 2px 8px rgba(26, 115, 232, 0.12);
  transform: translateY(-2px);
}

/* Tiles flagged .has-resources get a permanent blue tint. */
.infra-tile.has-resources {
  background: rgba(26, 115, 232, 0.04);
  border-color: var(--blue-accent);
}

.infra-tile-icon {
  display: flex;
  align-items: center;
  justify-content: center;
  width: 32px;
  height: 32px;
  color: var(--text-muted);
}

.infra-tile-icon svg {
  width: 24px;
  height: 24px;
  fill: none;
  stroke: currentColor;
  stroke-width: 1.5;
}

/* Single-line name, truncated with an ellipsis when it does not fit. */
.infra-tile-name {
  max-width: 100%;
  overflow: hidden;
  color: var(--text-muted);
  font-size: 0.6rem;
  font-weight: 600;
  letter-spacing: 0.2px;
  line-height: 1.2;
  text-align: center;
  text-overflow: ellipsis;
  text-transform: uppercase;
  white-space: nowrap;
}

/* Icon and name both turn blue on tiles that have resources. */
.infra-tile.has-resources .infra-tile-icon,
.infra-tile.has-resources .infra-tile-name {
  color: var(--blue-accent);
}

/* Count bubble overlapping the tile's top-right corner. */
.infra-tile-badge {
  position: absolute;
  top: -6px;
  right: -6px;
  display: flex;
  align-items: center;
  justify-content: center;
  min-width: 20px;
  height: 20px;
  padding: 0 5px;
  background: var(--blue-accent);
  border-radius: 10px;
  color: #fff;
  font-size: 0.7rem;
  font-weight: 600;
}
/* Log scroll */
/* Caps the log area at 250px and scrolls vertically beyond that. */
.log-scroll {
  max-height: 250px;
  overflow-y: auto;
}

/* Body rows show a pointer cursor and a hover tint. */
.log-table tbody tr {
  cursor: pointer;
  transition: background 0.15s ease;
}

.log-table tbody tr:hover { background: #f8f9fa; }
/* Infra modal (the same rules serve the log modal) */
/* These two dialogs are narrower than the 900px default on .modal-container. */
#infra-modal .modal-container,
#log-modal .modal-container {
  max-width: 700px;
}

/* Full-viewport blurred backdrop; hidden until .open is added. */
#infra-modal,
#log-modal {
  position: fixed;
  inset: 0;
  z-index: 2000;
  display: none;
  padding: 4rem 1rem;
  overflow-y: auto;
  background: rgba(0, 0, 0, 0.4);
  -webkit-backdrop-filter: blur(8px);
  backdrop-filter: blur(8px);
  opacity: 0;
  transition: opacity 0.3s ease;
}

#infra-modal.open,
#log-modal.open {
  display: block;
  opacity: 1;
}

/* Collapsible group of resources. */
.infra-res-group {
  margin-bottom: 0.75rem;
  overflow: hidden;
  border: 1px solid var(--border-color);
  border-radius: 12px;
}

/* Clickable header row; text selection is disabled on it. */
.infra-res-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 0.75rem 1rem;
  cursor: pointer;
  user-select: none;
  transition: background 0.15s ease;
}

.infra-res-header:hover { background: #f8f9fa; }

.infra-res-title {
  color: var(--text-main);
  font-size: 0.95rem;
  font-weight: 500;
  text-transform: capitalize;
}

.infra-res-count {
  padding: 0.15rem 0.6rem;
  background: #f1f3f4;
  border-radius: 8px;
  color: var(--text-muted);
  font-size: 0.85rem;
}

/* Group body stays collapsed until it receives .open. */
.infra-res-body {
  display: none;
  padding: 0 1rem 0.75rem;
  border-top: 1px solid var(--border-color);
}

.infra-res-body.open { display: block; }

.infra-res-item {
  padding: 0.35rem 0;
  border-bottom: 1px solid #f1f3f4;
  color: var(--text-main);
  font-family: 'Google Sans Mono', monospace;
  font-size: 0.85rem;
}

.infra-res-item:last-child { border-bottom: none; }

/* Chaos indicator: red when active, muted otherwise. */
.chaos-active {
  color: #ea4335;
  font-weight: 500;
}

.chaos-inactive { color: var(--text-muted); }

/* Small text that may break anywhere, so a long identifier cannot overflow. */
.state-episode-id {
  font-size: 0.7rem;
  word-break: break-all;
}
/* Task box */
/* Panel for the current task. border-left follows border so the 4px accent edge wins. */
.task-box {
  display: flex;
  flex-direction: column;
  justify-content: center;
  min-height: 80px;
  padding: 2rem;
  border-radius: 24px;
  border: 1px solid var(--border-color);
  border-left: 4px solid var(--border-color);
  transition: border-color 0.2s ease;
}

/* Placeholder appearance for the .empty state. */
.task-box.empty {
  color: var(--text-muted);
  text-align: center;
}

/* Shape and typography only; no colors are assigned to the badge here. */
.task-box .task-badge {
  display: inline-block;
  margin-right: 0.5rem;
  padding: 0.15rem 0.9rem;
  border-radius: 12px;
  font-size: 0.75rem;
  font-weight: 600;
  letter-spacing: 0.3px;
  text-transform: uppercase;
}

.task-meta {
  color: var(--text-muted);
  font-size: 0.85rem;
}

.task-desc {
  margin-top: 0.75rem;
  color: var(--text-main);
  font-size: 1rem;
  line-height: 1.5;
}
/* Status bar */
/* Neutral message strip. border-left follows border so the 3px accent edge wins. */
.status-bar {
  min-height: 40px;
  padding: 0.75rem 1.25rem;
  background: #f8f9fa;
  border-radius: 16px;
  border: 1px solid var(--border-color);
  border-left: 3px solid var(--border-color);
  color: var(--text-muted);
  font-size: 0.9rem;
}

/* Variants recolor the accent edge, the background, and the text. */
.status-bar.success {
  background: #e6f4ea;
  border-left-color: #34a853;
  color: #137333;
}

.status-bar.error {
  background: #fce8e6;
  border-left-color: #ea4335;
  color: #c5221f;
}

.status-bar.info {
  background: #e8f0fe;
  border-left-color: var(--blue-accent);
  color: #174ea6;
}
/* Output box */
/* Monospace output pane: keeps line breaks, wraps long lines, and scrolls
   vertically once it reaches 280px. */
.output-box {
  min-height: 100px;
  max-height: 280px;
  padding: 1.25rem;
  overflow-y: auto;
  background: #f8f9fa;
  border: 1px solid var(--border-color);
  border-radius: 16px;
  color: var(--text-main);
  font-family: 'Google Sans Mono', 'SF Mono', 'Fira Code', 'Consolas', monospace;
  font-size: 0.85rem;
  line-height: 1.6;
  white-space: pre-wrap;
  word-break: break-word;
}
/* Log table */
.log-table {
  width: 100%;
  border-collapse: collapse;
  font-size: 0.9rem;
}

/* Header cells: small, uppercase, muted. */
.log-table th {
  padding: 0.75rem 1rem;
  border-bottom: 1px solid var(--border-color);
  color: var(--text-muted);
  font-size: 0.8rem;
  font-weight: 500;
  letter-spacing: 1px;
  text-align: left;
  text-transform: uppercase;
}

.log-table td {
  padding: 0.6rem 1rem;
  border-bottom: 1px solid #f1f3f4;
  color: var(--text-main);
}

/* Cells marked .cmd render in monospace. */
.log-table .cmd {
  font-family: 'Google Sans Mono', 'SF Mono', 'Fira Code', 'Consolas', monospace;
  font-size: 0.8rem;
}

/* Green / red markers for cells classed .yes and .no. */
.log-table .yes { color: #34a853; font-weight: 500; }
.log-table .no  { color: #ea4335; font-weight: 500; }

/* Centered placeholder text. */
.log-empty {
  padding: 2rem;
  color: var(--text-muted);
  font-size: 0.9rem;
  text-align: center;
}
/* Spinner */
/* 14px ring with a blue top arc. border-top-color must follow border, which
   would otherwise reset it. */
.spinner {
  display: inline-block;
  width: 14px;
  height: 14px;
  margin-right: 6px;
  vertical-align: middle;
  border-radius: 50%;
  border: 2px solid var(--border-color);
  border-top-color: var(--blue-accent);
  animation: spin 0.6s linear infinite;
}

/* One full clockwise turn per cycle. */
@keyframes spin {
  to { transform: rotate(360deg); }
}
/* Animations */
/* Entrance effect: elements start transparent and 30px low, then ease into
   place once .visible is added. */
.animate-up {
  opacity: 0;
  transform: translateY(30px);
  transition: opacity 0.8s cubic-bezier(0.16, 1, 0.3, 1), transform 0.8s cubic-bezier(0.16, 1, 0.3, 1);
}

.animate-up.visible {
  opacity: 1;
  transform: translateY(0);
}

/* Respect the OS-level reduced-motion preference: no slide and no fade.
   The element still stays hidden until .visible is added, then appears at once. */
@media (prefers-reduced-motion: reduce) {
  .animate-up {
    transform: none;
    transition: none;
  }
}
/* ===== Timeline ===== */
/* The dashed left border is the rail; items are inset to its right. */
.timeline {
  margin-left: 0.5rem;
  padding-left: 2.5rem;
  border-left: 2px dashed var(--border-color);
}

.timeline-item {
  position: relative; /* anchor for the marker dot */
  margin-bottom: 3rem;
}

.timeline-item:last-child { margin-bottom: 0; }

/* Marker dot, pulled left onto the rail. --dot-bg and --dot-color are optional
   overrides that fall back to the surface and border colors. */
.timeline-item::before {
  content: '';
  position: absolute;
  top: 0.35rem;
  left: -2.85rem;
  width: 12px;
  height: 12px;
  background: var(--dot-bg, var(--surface-color));
  border: 2.5px solid var(--dot-color, var(--border-color));
  border-radius: 50%;
  transition: all 0.2s ease;
}

/* Active item: filled dot with a soft halo. */
.timeline-item.active::before {
  background: var(--dot-color, var(--blue-accent));
  border-color: var(--dot-color, var(--blue-accent));
  box-shadow: 0 0 0 4px var(--dot-bg, rgba(26, 115, 232, 0.1));
}

/* Title and date share a baseline and wrap when space runs out. */
.timeline-header {
  display: flex;
  flex-wrap: wrap;
  align-items: baseline;
  justify-content: space-between;
  gap: 0.5rem;
  margin-bottom: 0.25rem;
}

.role-title {
  color: var(--text-main);
  font-size: 1.3rem;
  font-weight: 600;
}

.date-badge {
  color: var(--text-muted);
  font-size: 0.95rem;
  font-weight: 450;
}

.timeline-subtitle {
  margin-bottom: 0.75rem;
  color: var(--text-muted);
  font-size: 1rem;
  font-weight: 450;
}

/* Native list markers are removed; bullets are drawn by the ::before rule below. */
.timeline-points {
  margin: 0;
  padding: 0;
  list-style: none;
}

.timeline-points li {
  position: relative;
  padding: 0.35rem 0 0.35rem 1.25rem;
  color: var(--text-muted);
  font-size: 0.95rem;
  font-weight: 450;
  line-height: 1.5;
}

/* '\2022' is the bullet character, colored by --dot-color when it is set. */
.timeline-points li::before {
  content: '\2022';
  position: absolute;
  left: 0.15rem;
  color: var(--dot-color, var(--blue-accent));
  font-size: 1.1rem;
  font-weight: bold;
  line-height: 1.4;
}

.timeline-points li strong {
  color: var(--text-main);
  font-weight: 600;
}
/* Footer */
/* Centered and capped at 1200px. */
footer {
  max-width: 1200px;
  margin: 0 auto;
  padding: 4rem 2rem 2rem;
  border-top: 1px solid var(--border-color);
}

/* Four columns: a double-width brand column plus three link groups. */
.footer-content {
  display: grid;
  grid-template-columns: 2fr 1fr 1fr 1fr;
  gap: 2.5rem;
  margin-bottom: 3rem;
}

.footer-brand h3 {
  margin-bottom: 0.75rem;
  color: var(--text-main);
  font-size: 1.2rem;
  font-weight: 600;
}

.footer-brand p {
  max-width: 300px;
  color: var(--text-muted);
  font-size: 0.9rem;
  font-weight: 400;
  line-height: 1.6;
}

.footer-links-group h4 {
  margin-bottom: 1rem;
  color: var(--text-main);
  font-size: 0.85rem;
  font-weight: 600;
  letter-spacing: 0.5px;
  text-transform: uppercase;
}

.footer-links-group ul {
  margin: 0;
  padding: 0;
  list-style: none;
}

.footer-links-group li { margin-bottom: 0.6rem; }

.footer-links-group a {
  color: var(--text-muted);
  font-size: 0.9rem;
  font-weight: 450;
  transition: color 0.2s ease;
}

.footer-links-group a:hover { color: var(--text-main); }

/* Closing strip below the columns. */
.footer-bottom {
  padding-top: 1.5rem;
  border-top: 1px solid var(--border-color);
  text-align: center;
}

.footer-bottom p {
  margin-bottom: 0;
  color: var(--text-muted);
  font-size: 0.85rem;
  font-weight: 400;
}
/* ===== Responsive (viewports up to 768px wide) ===== */
@media (max-width: 768px) {
  /* Footer: two columns, brand block spanning the full row. */
  .footer-content {
    grid-template-columns: 1fr 1fr;
    gap: 2rem;
  }

  .footer-brand { grid-column: 1 / -1; }

  /* Hero: smaller title, buttons stacked in a narrow centered column. */
  .hero h1 { font-size: 3rem; }

  .hero-cta-container {
    flex-direction: column;
    width: 100%;
    max-width: 280px;
    margin-right: auto;
    margin-left: auto;
  }

  /* Sections: stack the label column above the content. */
  .section-wrapper {
    flex-direction: column;
    gap: 2rem;
    padding: 4rem 0;
  }

  /* The label column stops being sticky and becomes a horizontal header row. */
  .left-col {
    position: relative;
    top: 0;
    flex: none;
    flex-direction: row;
    align-items: center;
    justify-content: flex-start;
    gap: 1rem;
    padding-bottom: 1rem;
    border-bottom: 1px solid var(--border-color);
    text-align: left;
  }

  .icon-container {
    width: 48px;
    height: 48px;
    margin-bottom: 0;
    border-radius: 14px;
  }

  .section-title { font-size: 1.2rem; }

  /* Multi-column grids collapse to one column. */
  .grid-2 { grid-template-columns: 1fr; }

  .pg-row-2col { grid-template-columns: 1fr; }

  /* Navigation: the link list is hidden; the bar shows the text held in its
     data-active-section attribute instead. */
  .nav-links { display: none; }

  nav.scrolled {
    max-width: max-content;
    padding: 0 1.5rem;
  }

  nav::after {
    content: attr(data-active-section);
    color: var(--text-main);
    font-family: 'Google Sans', 'Roboto', sans-serif;
    font-size: 1.1rem;
    font-weight: 500;
  }

  /* !important is needed: the base .modal-grid rule comes later in this sheet
     with equal specificity and would otherwise override this one. */
  .modal-grid { grid-template-columns: 1fr !important; }
}
/* ===== Feature Chips ===== */
/* Vertical list of clickable rows. */
.feature-chips {
  display: flex;
  flex-direction: column;
  gap: 0.75rem;
}

.feature-chip {
  position: relative;
  display: flex;
  align-items: center;
  gap: 1rem;
  padding: 1rem 1.25rem;
  overflow: hidden;
  border: 1px solid var(--border-color);
  border-radius: 16px;
  cursor: pointer;
  transition: all 0.2s ease;
}

/* Hidden glow layer positioned at --mouse-x / --mouse-y (both fall back to 0). */
.feature-chip::before {
  content: '';
  position: absolute;
  inset: 0;
  border-radius: inherit;
  background: radial-gradient(400px circle at var(--mouse-x, 0) var(--mouse-y, 0), rgba(26, 115, 232, 0.06), transparent 40%);
  opacity: 0;
  pointer-events: none;
  transition: opacity 0.3s ease;
}

/* Hover: blue outline, soft shadow, and a 4px nudge to the right. */
.feature-chip:hover {
  border-color: var(--blue-accent);
  box-shadow: 0 2px 8px rgba(26, 115, 232, 0.12);
  transform: translateX(4px);
}

.feature-chip:hover::before { opacity: 1; }

/* Square icon tile; min-width pins its size inside the flex row. */
.feature-chip-icon {
  display: flex;
  align-items: center;
  justify-content: center;
  width: 40px;
  min-width: 40px;
  height: 40px;
  background: #e8f0fe;
  border-radius: 12px;
  color: var(--blue-accent);
  font-size: 18px;
  transition: all 0.2s ease;
}

/* The icon tile inverts to white-on-blue while the chip is hovered. */
.feature-chip:hover .feature-chip-icon {
  background: var(--blue-accent);
  color: white;
}

/* Text column takes the remaining width; min-width: 0 lets it shrink.
   NOTE(review): this matches every descendant div, so it would also apply to
   .feature-chip-icon if that element is a div. Confirm against the markup. */
.feature-chip div {
  flex: 1;
  min-width: 0;
}

.feature-chip strong {
  display: block;
  margin-bottom: 0.15rem;
  color: var(--text-main);
  font-size: 1rem;
  font-weight: 500;
}

.feature-chip span {
  color: var(--text-muted);
  font-size: 0.9rem;
}

.feature-chip code {
  padding: 0.1rem 0.4rem;
  background: #f1f3f4;
  border-radius: 4px;
  font-family: 'Google Sans Mono', 'SF Mono', monospace;
  font-size: 0.85rem;
}

/* Trailing arrow: muted at rest, blue and shifted 2px on hover. */
.feature-chip-arrow {
  flex-shrink: 0;
  color: var(--border-color);
  transition: all 0.2s ease;
}

.feature-chip:hover .feature-chip-arrow {
  color: var(--blue-accent);
  transform: translateX(2px);
}
/* ===== Feature Modal ===== */
/* Full-viewport blurred backdrop; hidden until .open is added. */
#feature-modal {
  position: fixed;
  inset: 0;
  z-index: 2000;
  display: none;
  padding: 4rem 1rem;
  overflow-y: auto;
  background: rgba(0, 0, 0, 0.4);
  -webkit-backdrop-filter: blur(8px);
  backdrop-filter: blur(8px);
  opacity: 0;
  transition: opacity 0.3s ease;
}

#feature-modal.open {
  display: block;
  opacity: 1;
}

/* Dialog panel, centered and capped at 900px by default. */
.modal-container {
  position: relative; /* anchor for the close button */
  max-width: 900px;
  margin: 0 auto;
  padding: 3rem;
  background: #fff;
  border: 1px solid var(--border-color);
  border-radius: 32px;
  box-shadow: 0 10px 40px rgba(0, 0, 0, 0.05);
}

/* Round close button in the panel's top-right corner. */
.close-modal {
  position: absolute;
  top: 2rem;
  right: 2rem;
  display: flex;
  align-items: center;
  justify-content: center;
  width: 44px;
  height: 44px;
  background: #f1f3f4;
  border: none;
  border-radius: 50%;
  color: var(--text-muted);
  font-size: 1.5rem;
  cursor: pointer;
  transition: all 0.2s ease;
}

.close-modal:hover {
  background: #e8eaed;
  transform: scale(1.1);
}

/* Right padding keeps the title clear of the close button. */
.modal-container h2 {
  margin-bottom: 1.5rem;
  padding-right: 3rem;
  font-size: 1.8rem;
}

/* Two columns at a 3:2 ratio. */
.modal-grid {
  display: grid;
  grid-template-columns: 1.5fr 1fr;
  gap: 3rem;
  margin-top: 1rem;
}

.modal-section { margin-bottom: 1rem; }

/* Small uppercase caption above a modal section. */
.modal-label {
  display: block;
  margin-bottom: 0.5rem;
  color: var(--blue-accent);
  font-size: 0.8rem;
  font-weight: 700;
  letter-spacing: 1px;
  text-transform: uppercase;
}

.modal-section p {
  margin-bottom: 1.5rem;
  font-size: 1rem;
  line-height: 1.7;
}

/* Framed area whose SVG scales to the available width. */
.diag-container {
  margin-top: 0.5rem;
  padding: 1.5rem;
  background: #f8f9fa;
  border: 1px solid var(--border-color);
  border-radius: 20px;
}

.diag-container svg {
  width: 100%;
  height: auto;
}

/* Blue-tinted card: large value above a small label. */
.perf-card {
  margin-bottom: 0.75rem;
  padding: 1rem;
  background: #e8f0fe;
  border: 1px solid rgba(26, 115, 232, 0.1);
  border-radius: 16px;
}

.perf-val {
  display: block;
  color: var(--blue-accent);
  font-size: 1.5rem;
  font-weight: 500;
}

.perf-label {
  color: var(--text-muted);
  font-size: 0.85rem;
}
/* ===== Training Figures Gallery ===== */
/* As many columns as fit, each at least 280px wide. */
.figure-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
  gap: 1rem;
  margin-top: 0.5rem;
}

/* Modifier: a single full-width column. */
.figure-grid.full { grid-template-columns: 1fr; }

/* Card with the image on top and caption below; the zoom-in cursor signals
   that it can be enlarged. */
.figure-card {
  position: relative;
  display: flex;
  flex-direction: column;
  overflow: hidden;
  background: var(--surface-color);
  border: 1px solid var(--border-color);
  border-radius: 16px;
  cursor: zoom-in;
  transition: all 0.25s ease;
}

.figure-card:hover {
  border-color: var(--blue-accent);
  box-shadow: 0 4px 16px rgba(0, 0, 0, 0.08);
  transform: translateY(-2px);
}

/* Gray frame that centers the image; min-height reserves space for it. */
.figure-card .figure-img-wrap {
  display: flex;
  align-items: center;
  justify-content: center;
  min-height: 160px;
  padding: 0.75rem;
  background: #f8f9fa;
}

.figure-card img {
  display: block;
  width: 100%;
  height: auto;
  border-radius: 8px;
}

.figure-caption {
  padding: 0.85rem 1rem 1rem;
  border-top: 1px solid var(--border-color);
}

.figure-caption strong {
  display: block;
  margin-bottom: 0.2rem;
  color: var(--text-main);
  font-size: 0.95rem;
  font-weight: 600;
}

.figure-caption span {
  color: var(--text-muted);
  font-size: 0.85rem;
  line-height: 1.45;
}
/* ===== Stats strip ===== */
/* Row of tiles, each at least 140px wide. */
.stats-strip {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
  gap: 0.75rem;
  margin: 1.25rem 0 0.5rem;
}

.stat-tile {
  padding: 1rem;
  background: #f8f9fa;
  border: 1px solid var(--border-color);
  border-radius: 14px;
  text-align: center;
  transition: all 0.2s ease;
}

.stat-tile:hover {
  border-color: var(--blue-accent);
  box-shadow: 0 2px 8px rgba(26, 115, 232, 0.1);
}

/* Large value, blue by default. */
.stat-tile .stat-val {
  display: block;
  margin-bottom: 0.2rem;
  color: var(--blue-accent);
  font-size: 1.6rem;
  font-weight: 600;
  line-height: 1.1;
}

.stat-tile .stat-label {
  color: var(--text-muted);
  font-size: 0.78rem;
  font-weight: 500;
  letter-spacing: 0.5px;
  text-transform: uppercase;
}

/* Variants recolor only the value. */
.stat-tile.success .stat-val { color: #137333; }
.stat-tile.warning .stat-val { color: #b05a00; }
/* ===== Lightbox for figures ===== */
/* Dark full-viewport overlay above the modals (z-index 3000 vs 2000).
   Hidden until .open is added, then centers its image with flexbox. */
#figure-lightbox {
  position: fixed;
  inset: 0;
  z-index: 3000;
  display: none;
  align-items: center;
  justify-content: center;
  padding: 2rem;
  background: rgba(0, 0, 0, 0.85);
  cursor: zoom-out;
  opacity: 0;
  transition: opacity 0.2s ease;
}

#figure-lightbox.open {
  display: flex;
  opacity: 1;
}

/* The image is capped so it always fits inside the viewport. */
#figure-lightbox img {
  max-width: 92vw;
  max-height: 88vh;
  border-radius: 12px;
  box-shadow: 0 12px 48px rgba(0, 0, 0, 0.4);
}
/* ===== Comparison table ===== */
.results-table {
  margin-top: 0.5rem;
  width: 100%;
  font-size: 0.92rem;
  border-collapse: collapse;
}
/* Shared cell box: left-aligned with a bottom rule between rows. */
.results-table th,
.results-table td {
  border-bottom: 1px solid var(--border-color);
  padding: 0.7rem 1rem;
  text-align: left;
}
/* Header cells: small uppercase labels on a light grey band. */
.results-table th {
  background: #f8f9fa;
  color: var(--text-muted);
  font-size: 0.78rem;
  font-weight: 600;
  letter-spacing: 0.5px;
  text-transform: uppercase;
}
/* Numeric columns are right-aligned; numeric body cells use a monospace face. */
.results-table th.num,
.results-table td.num {
  text-align: right;
}
.results-table td.num {
  font-family: 'Google Sans Mono', 'SF Mono', monospace;
}
/* Delta cells: bold green / bold red / muted for up, down and flat changes. */
.delta-up,
.delta-down {
  font-weight: 600;
}
.delta-up {
  color: #137333;
}
.delta-down {
  color: #c5221f;
}
.delta-flat {
  color: var(--text-muted);
}
</style>
</head>
<body>
<!-- Navigation -->
<!-- Primary in-page navigation; each link targets a section id on this page.
     aria-label names the landmark for assistive technology. -->
<nav id="navbar" aria-label="Primary">
<ul class="nav-links">
<li><a href="#about" class="nav-link active">About</a></li>
<li><a href="#tiers" class="nav-link">Tasks</a></li>
<li><a href="#features" class="nav-link">Features</a></li>
<li><a href="#results" class="nav-link">Results</a></li>
<li><a href="#api" class="nav-link">API</a></li>
<li><a href="#playground" class="nav-link">Playground</a></li>
<li><a href="#links" class="nav-link">Links</a></li>
</ul>
</nav>
<!-- Hero -->
<section class="hero">
<div class="hero-bg"></div>
<div class="hero-content">
<h1 id="hero-title" class="type-animate">AWS Cloud Operations &middot; RL Environment &amp; Training Pipeline</h1>
<h2 id="hero-subtitle" class="type-animate">Cloud agents fail in production not because they don&rsquo;t know the
commands &mdash; but because state drifts, services hiccup, and reward signals get gamed. We built an
environment that simulates all three: 120+ AWS tasks under chaos and drift, an 8-layer anti-reward-hacking
stack, and an adversarial curriculum that targets the agent&rsquo;s own weak spots. After SFT &rarr; GRPO on a
single GPU with 8 parallel rollouts, format compliance hit 100%, exact-match jumped 39% &rarr; 89%, and
intermediate-tier success climbed 81% &rarr; 87%.</h2>
<!-- Call-to-action links; the arrow icons are decorative, so they are hidden from assistive technology. -->
<div class="hero-cta-container hero-fade-up">
<a href="#playground" class="btn-primary">
Try the Playground
<svg width="18" height="18" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2" aria-hidden="true">
<path stroke-linecap="round" stroke-linejoin="round" d="M13 7l5 5m0 0l-5 5m5-5H6" />
</svg>
</a>
<a href="#api" class="btn-secondary">
API Docs
<svg width="18" height="18" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2" aria-hidden="true">
<path stroke-linecap="round" stroke-linejoin="round"
d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14" />
</svg>
</a>
</div>
</div>
</section>
<!-- Content -->
<div class="container">
<!-- About -->
<div class="section-wrapper" id="about">
<div class="left-col">
<div class="icon-container">
<!-- Decorative section icon; the adjacent text already names the section. -->
<svg viewBox="0 0 24 24" aria-hidden="true">
<path d="M12 2L2 7l10 5 10-5-10-5zM2 17l10 5 10-5M2 12l10 5 10-5" />
</svg>
</div>
<span class="section-title">About</span>
</div>
<div class="right-col">
<div class="card animate-up">
<h3>Learn AWS by doing.</h3>
<p>An OpenEnv-compatible RL environment where agents execute real AWS CLI commands against a vendored
MiniStack simulator that responds with production-equivalent JSON. 120+ tasks across 5 tiers (warmup &rarr;
expert) with adaptive selection, mastery tracking, spaced repetition, chaos injection and drift-detection
scenarios &mdash; every feature designed to keep the reward signal honest and prevent the agent from gaming
it. Trained end-to-end with a 1,500-row synthetic SFT dataset and TRL GRPO with 8-way parallel rollouts on a
single GPU.</p>
<!-- Headline numbers; "success" tiles render their value in green. -->
<div class="stats-strip">
<div class="stat-tile"><span class="stat-val">120+</span><span class="stat-label">Tasks</span></div>
<div class="stat-tile"><span class="stat-val">5</span><span class="stat-label">Difficulty Tiers</span></div>
<div class="stat-tile"><span class="stat-val">34</span><span class="stat-label">AWS Services</span></div>
<div class="stat-tile"><span class="stat-val">8</span><span class="stat-label">Parallel Rollouts</span></div>
<div class="stat-tile success"><span class="stat-val">+50pp</span><span class="stat-label">Exact-match Δ
(SFT)</span></div>
<div class="stat-tile success"><span class="stat-val">100%</span><span class="stat-label">Format Compliance
(post-SFT)</span></div>
</div>
<!-- 20 named services plus "+ 14 more" adds up to the 34 services quoted above. -->
<div class="skills-container" style="margin-top: 1.5rem;">
<span class="skill-tag">S3</span>
<span class="skill-tag">EC2</span>
<span class="skill-tag">DynamoDB</span>
<span class="skill-tag">Lambda</span>
<span class="skill-tag">SQS</span>
<span class="skill-tag">SNS</span>
<span class="skill-tag">IAM</span>
<span class="skill-tag">RDS</span>
<span class="skill-tag">API Gateway</span>
<span class="skill-tag">CloudFormation</span>
<span class="skill-tag">CloudWatch</span>
<span class="skill-tag">Kinesis</span>
<span class="skill-tag">SES</span>
<span class="skill-tag">Step Functions</span>
<span class="skill-tag">Secrets Manager</span>
<span class="skill-tag">ELBv2</span>
<span class="skill-tag">Route53</span>
<span class="skill-tag">Glue</span>
<span class="skill-tag">Athena</span>
<span class="skill-tag">EFS</span>
<span class="skill-tag accent">+ 14 more</span>
</div>
</div>
</div>
</div>
<!-- Task Tiers (Timeline) -->
<div class="section-wrapper" id="tiers">
<div class="left-col">
<div class="icon-container">
<!-- Decorative section icon; the adjacent text already names the section. -->
<svg viewBox="0 0 24 24" aria-hidden="true">
<path d="M13 2L3 14h9l-1 8 10-12h-9l1-8z" />
</svg>
</div>
<span class="section-title">Tasks</span>
</div>
<div class="right-col">
<div class="card animate-up">
<!-- One timeline item per difficulty tier; the --dot-color / --dot-bg custom properties set each tier's colors. -->
<div class="timeline">
<div class="timeline-item active" style="--dot-color: #137333; --dot-bg: #e6f4ea;">
<div class="timeline-header">
<div class="role-title">Warmup</div>
<div class="date-badge">25 tasks</div>
</div>
<p class="timeline-subtitle">List resources &mdash; single read-only commands</p>
<ul class="timeline-points">
<li>Run one AWS CLI command to list or describe a resource type</li>
<li>S3 buckets, EC2 instances, DynamoDB tables, Lambda functions, RDS, EBS volumes</li>
<li>Graded by <strong>command_match</strong> &mdash; checks operation + service pair</li>
<li>No setup required, no state mutations</li>
</ul>
</div>
<div class="timeline-item active" style="--dot-color: #174ea6; --dot-bg: #e8f0fe;">
<div class="timeline-header">
<div class="role-title">Beginner</div>
<div class="date-badge">25 tasks</div>
</div>
<p class="timeline-subtitle">Create single resources with verification</p>
<ul class="timeline-points">
<li>Create an S3 bucket, DynamoDB table, SQS queue, or Lambda function</li>
<li>Graded by <strong>resource_creation</strong> &mdash; verifies the exact resource exists in AWS
Infrastructure Simulator
</li>
<li>Introduces resource name validation &mdash; "my-bucket-2" won't satisfy a check for "my-bucket"</li>
<li>First tier where idempotency bonus (+0.02) can be earned</li>
</ul>
</div>
<div class="timeline-item active" style="--dot-color: #b05a00; --dot-bg: #fef7e0;">
<div class="timeline-header">
<div class="role-title">Intermediate</div>
<div class="date-badge">25 tasks</div>
</div>
<p class="timeline-subtitle">Multi-step workflows &mdash; create, configure, connect</p>
<ul class="timeline-points">
<li>Ordered sequences: create a bucket then enable versioning, create a table then add an item</li>
<li>Graded by <strong>multi_step</strong> &mdash; validates each step was completed in order</li>
<li>Chaos injection begins at <strong>10% probability</strong> &mdash; resources may be silently mutated
mid-episode</li>
<li>Rollback penalty (-0.1) starts to matter with multi-step create/delete patterns</li>
</ul>
</div>
<div class="timeline-item active" style="--dot-color: #c5221f; --dot-bg: #fce8e6;">
<div class="timeline-header">
<div class="role-title">Advanced</div>
<div class="date-badge">25 tasks</div>
</div>
<p class="timeline-subtitle">Cross-service architectures spanning multiple AWS services</p>
<ul class="timeline-points">
<li>Wire Lambda to SQS, configure API Gateway with integrations, build event-driven pipelines</li>
<li>Graded by <strong>multi_step + services</strong> &mdash; all required services must be configured
</li>
<li>Chaos injection escalates to <strong>20% probability</strong> &mdash; DynamoDB throughput, Lambda
configs may change</li>
<li>Hints cost more: 3 hints = only 61% of max reward (0.85&sup3; decay)</li>
</ul>
</div>
<div class="timeline-item active" style="--dot-color: #7627bb; --dot-bg: #f3e8fd;">
<div class="timeline-header">
<div class="role-title">Expert</div>
<div class="date-badge">24 tasks + 9 drift</div>
</div>
<p class="timeline-subtitle">SRE incidents &amp; drift detection &mdash; diagnose and fix</p>
<ul class="timeline-points">
<li>Fix overly permissive S3 policies, replace broad IAM inline policies, repair broken infra</li>
<li>Graded by <strong>state_checks</strong> &mdash; actual CLI commands run against MiniStack at grading
time</li>
<li>Chaos injection at <strong>30% probability</strong> &mdash; maximum perturbation frequency</li>
<li><strong>9 drift detection tasks</strong> &mdash; correct infra is provisioned, then 2&ndash;3 random
mutations applied from a pool</li>
<li>Agent must audit environment, discover which resources drifted, and fix only those</li>
<li>Drift is randomized per episode &mdash; prevents memorization of fix sequences</li>
</ul>
</div>
</div>
</div>
</div>
</div>
<!-- Features -->
<div class="section-wrapper" id="features">
<div class="left-col">
<div class="icon-container">
<!-- Decorative section icon; the adjacent text already names the section. -->
<svg viewBox="0 0 24 24" aria-hidden="true">
<path
d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2m-6 9l2 2 4-4" />
</svg>
</div>
<span class="section-title">Features</span>
</div>
<div class="right-col">
<!-- Curriculum & Training -->
<div class="card animate-up">
<h3>Curriculum &amp; Training</h3>
<p>Adaptive learning system that tracks mastery and selects optimal tasks.</p>
<!-- Each chip carries a data-feature-id; its glyph and chevron are decorative and hidden from assistive technology. -->
<div class="feature-chips">
<div class="feature-chip" data-feature-id="progressive-difficulty">
<span class="feature-chip-icon" aria-hidden="true">&#x2191;</span>
<div>
<strong>Progressive Difficulty</strong>
<span>5 tiers from warmup to expert SRE</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="mastery-tracking">
<span class="feature-chip-icon" aria-hidden="true">&#x2713;</span>
<div>
<strong>Mastery Tracking</strong>
<span>Per-task graduation with sustained performance</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="spaced-repetition">
<span class="feature-chip-icon" aria-hidden="true">&#x21BB;</span>
<div>
<strong>Spaced Repetition</strong>
<span>Graduated tasks resurface at increasing intervals</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="priority-selection">
<span class="feature-chip-icon" aria-hidden="true">&#x25CE;</span>
<div>
<strong>Priority Selection</strong>
<span>Novelty, weakness, and recency scoring</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="tier-progression">
<span class="feature-chip-icon" aria-hidden="true">&#x2B06;</span>
<div>
<strong>Tier Progression</strong>
<span>Standard promotion and fast-track system</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
</div>
</div>
<!-- Reward Shaping -->
<div class="card animate-up">
<h3>Reward Shaping</h3>
<p>Dense reward signals that encourage operational discipline and real progress.</p>
<!-- Each chip carries a data-feature-id; its glyph and chevron are decorative and hidden from assistive technology. -->
<div class="feature-chips">
<div class="feature-chip" data-feature-id="rollback-penalty">
<span class="feature-chip-icon" aria-hidden="true">&#x2696;</span>
<div>
<strong>Rollback Penalty &amp; Idempotency Bonus</strong>
<span>Operational discipline rewards</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="shaped-rewards">
<span class="feature-chip-icon" aria-hidden="true">&#x1F4C8;</span>
<div>
<strong>Shaped Reward System</strong>
<span>Progress bonus, failure penalty, clamped rewards</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="multi-strategy-grading">
<span class="feature-chip-icon" aria-hidden="true">&#x2605;</span>
<div>
<strong>Multi-Strategy Grading</strong>
<span>5 grading strategies across tiers</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
</div>
</div>
<!-- Resilience & Adaptability -->
<div class="card animate-up">
<h3>Resilience &amp; Adaptability</h3>
<p>Features that test agent robustness under unpredictable conditions.</p>
<!-- Each chip carries a data-feature-id; its glyph and chevron are decorative and hidden from assistive technology. -->
<div class="feature-chips">
<div class="feature-chip" data-feature-id="progressive-hints">
<span class="feature-chip-icon" aria-hidden="true">&#x1F4A1;</span>
<div>
<strong>Progressive Hint System</strong>
<span>3-level hints with reward decay</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="chaos-injection">
<span class="feature-chip-icon" aria-hidden="true">&#x26A1;</span>
<div>
<strong>Chaos Injection Engine</strong>
<span>Silent mid-episode perturbations</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="drift-detection">
<span class="feature-chip-icon" aria-hidden="true">&#x1F50D;</span>
<div>
<strong>Drift Detection Tasks</strong>
<span>Randomized config drift per episode</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
</div>
</div>
<!-- Security Posture Audit -->
<div class="card animate-up">
<h3>Security Posture Audit</h3>
<p>Tests reasoning about configuration state &mdash; working but insecure infrastructure the agent must
analyze and harden.</p>
<!-- Each chip carries a data-feature-id; its glyph and chevron are decorative and hidden from assistive technology. -->
<div class="feature-chips">
<div class="feature-chip" data-feature-id="s3-lockdown">
<span class="feature-chip-icon" aria-hidden="true">&#x1F512;</span>
<div>
<strong>Public S3 Bucket Lockdown</strong>
<span>Detect &amp; fix overly permissive bucket policies</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="iam-least-privilege">
<span class="feature-chip-icon" aria-hidden="true">&#x1F6E1;</span>
<div>
<strong>IAM Least Privilege</strong>
<span>Replace wildcard policies with scoped permissions</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="secrets-rotation">
<span class="feature-chip-icon" aria-hidden="true">&#x1F510;</span>
<div>
<strong>Secrets in Lambda Environment</strong>
<span>Move plaintext credentials to Secrets Manager</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
</div>
</div>
<!-- Anti-Reward-Hacking -->
<div class="card animate-up">
<h3>Anti-Reward-Hacking</h3>
<p>8 defense layers that prevent the agent from gaming the reward system.</p>
<!-- Eight chips, one per defense layer; glyphs and chevrons are decorative and hidden from assistive technology. -->
<div class="feature-chips">
<div class="feature-chip" data-feature-id="ground-truth">
<span class="feature-chip-icon" aria-hidden="true">&#x1F50E;</span>
<div>
<strong>Ground-Truth Verification</strong>
<span>MiniStack queries for 20+ services</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="command-allowlisting">
<span class="feature-chip-icon" aria-hidden="true">&#x1F6E1;</span>
<div>
<strong>Command Allowlisting</strong>
<span>Only <code>aws</code> CLI commands allowed</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="deduplication">
<span class="feature-chip-icon" aria-hidden="true">&#x1F6AB;</span>
<div>
<strong>Deduplication</strong>
<span>No reward for repeated commands</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="grader-invisibility">
<span class="feature-chip-icon" aria-hidden="true">&#x1F441;</span>
<div>
<strong>Grader Invisibility</strong>
<span>Verification commands hidden from agent</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="no-verification-reward">
<span class="feature-chip-icon" aria-hidden="true">&#x1F50D;</span>
<div>
<strong>No Verification Reward</strong>
<span>Read-only commands earn zero progress</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="monotonic-progress">
<span class="feature-chip-icon" aria-hidden="true">&#x2197;</span>
<div>
<strong>Monotonic Progress</strong>
<span>Progress can only increase, never re-earn</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="resource-validation">
<span class="feature-chip-icon" aria-hidden="true">&#x1F3AF;</span>
<div>
<strong>Resource Name Validation</strong>
<span>Exact name match required</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
<div class="feature-chip" data-feature-id="state-checks">
<span class="feature-chip-icon" aria-hidden="true">&#x2611;</span>
<div>
<strong>State Checks</strong>
<span>Verify final state, not command history</span>
</div>
<svg class="feature-chip-arrow" width="16" height="16" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2" aria-hidden="true">
<path d="M9 18l6-6-6-6" />
</svg>
</div>
</div>
</div>
</div>
</div>
<!-- Results & Training -->
<div class="section-wrapper" id="results">
<div class="left-col">
<div class="icon-container">
<!-- Decorative section icon; the adjacent text already names the section. -->
<svg viewBox="0 0 24 24" aria-hidden="true">
<path d="M3 3v18h18M7 14l4-4 4 4 6-6" />
</svg>
</div>
<span class="section-title">Results</span>
</div>
<div class="right-col">
<!-- Pipeline summary -->
<div class="card animate-up">
<!-- Summary card for the two-stage training pipeline: a descriptive paragraph followed by a six-tile stats strip. -->
<h3>SFT &rarr; GRPO Training Pipeline</h3>
<p>Two-stage training on <code style="background:#f1f3f4;padding:0.1rem 0.4rem;border-radius:4px;font-size:0.9rem;">unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit</code> — the
base picked from an 11-model benchmark on 27 held-out prompts. Stage 1: LoRA SFT on 1,500 synthetic
trajectories spanning 5 shapes. Stage 2: TRL GRPO with multi-turn rollouts, group-relative advantages, KL
to SFT reference, and Optuna search over an 8-dim hyperparameter space.</p>
<div class="stats-strip">
<div class="stat-tile"><span class="stat-val">1,500</span><span class="stat-label">SFT Train Rows</span>
</div>
<div class="stat-tile"><span class="stat-val">G=8</span><span class="stat-label">Rollouts / Step</span>
</div>
<!-- NOTE(review): this tile reports 200 final GRPO steps, while the GRPO cards further down this
     section describe a 35-step final run. Confirm which figure is correct and align them. -->
<div class="stat-tile"><span class="stat-val">200</span><span class="stat-label">Final GRPO Steps</span>
</div>
<div class="stat-tile"><span class="stat-val">11</span><span class="stat-label">Models Benchmarked</span>
</div>
<!-- The two "success" tiles repeat the Format and Exact-match deltas from the Base vs SFT table. -->
<div class="stat-tile success"><span class="stat-val">+66.7pp</span><span class="stat-label">Format
Δ (SFT)</span></div>
<div class="stat-tile success"><span class="stat-val">+50.0pp</span><span class="stat-label">Exact-match
Δ (SFT)</span></div>
</div>
</div>
<!-- Base model selection -->
<div class="card animate-up">
<h3>Base-Model Selection</h3>
<p>11 chat models &times; 27 held-out prompts. Qwen2.5-Coder-3B-Instruct wins on every metric that matters:
41% exact match, 63% operation match, 3.1 s/call (3&times; faster than the 4B runner-up).</p>
<div class="figure-grid full">
<!-- data-fig repeats the image URL on the card itself. The figure sits well below the fold,
     so it is lazy-loaded and decoded off the main thread. -->
<div class="figure-card" data-fig="/static/figures/model_eval_chart.png">
<div class="figure-img-wrap"><img src="/static/figures/model_eval_chart.png"
alt="Top 4 candidate models on the held-out benchmark" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Top 4 Candidate Models</strong>
<span>Exact match, operation match, latency &mdash; head-to-head on 27 held-out prompts.</span>
</div>
</div>
</div>
</div>
<!-- Base vs SFT -->
<div class="card animate-up">
<h3>Base vs SFT &mdash; Eval Delta</h3>
<p>After running the SFT pipeline end-to-end, format compliance is now perfect and exact-match jumped from
39% to 89%.</p>
<!-- Column headers carry scope="col" so assistive technology can associate each value with its column. -->
<table class="results-table">
<thead>
<tr>
<th scope="col">Metric</th>
<th scope="col" class="num">Base</th>
<th scope="col" class="num">Post-SFT</th>
<th scope="col" class="num">Δ</th>
</tr>
</thead>
<tbody>
<tr>
<td>Format</td>
<td class="num">33.3%</td>
<td class="num">100.0%</td>
<td class="num delta-up">+66.7 pp</td>
</tr>
<tr>
<td>Exact match</td>
<td class="num">38.9%</td>
<td class="num">88.9%</td>
<td class="num delta-up">+50.0 pp</td>
</tr>
<tr>
<td>Service match</td>
<td class="num">77.8%</td>
<td class="num">88.9%</td>
<td class="num delta-up">+11.1 pp</td>
</tr>
<tr>
<td>Operation match</td>
<td class="num">61.1%</td>
<td class="num">88.9%</td>
<td class="num delta-up">+27.8 pp</td>
</tr>
<tr>
<!-- Lower latency is an improvement, hence delta-up on a negative number. -->
<td>Latency</td>
<td class="num">2.03 s</td>
<td class="num">1.40 s</td>
<td class="num delta-up">&minus;0.63 s</td>
</tr>
</tbody>
</table>
<!-- Figures sit below the fold: lazy-load them and decode off the main thread. -->
<div class="figure-grid" style="margin-top: 1.25rem;">
<div class="figure-card" data-fig="/static/figures/base_vs_sft_success.png">
<div class="figure-img-wrap"><img src="/static/figures/base_vs_sft_success.png"
alt="Base vs SFT eval-metrics comparison" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Base vs SFT &mdash; Eval Metrics</strong>
<span>Per-metric comparison on the held-out prompt set.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/compare_dataset.png">
<div class="figure-img-wrap"><img src="/static/figures/compare_dataset.png"
alt="Dataset comparison: base vs SFT (per-row scores)" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Dataset Comparison</strong>
<span>Per-row scores: base vs SFT on the SFT validation set.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/compare_rl_env.png">
<div class="figure-img-wrap"><img src="/static/figures/compare_rl_env.png"
alt="RL env comparison: base vs SFT" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Live RL Env Comparison</strong>
<span>Per-episode rewards on the live MiniStack-backed environment.</span>
</div>
</div>
</div>
</div>
<!-- SFT training -->
<div class="card animate-up">
<h3>SFT Training Curves &amp; Optuna</h3>
<p>Best SFT trial (out of 6): <code style="background:#f1f3f4;padding:0.1rem 0.4rem;border-radius:4px;font-size:0.9rem;">lora_r=16, lora_alpha=16, dropout=0.0058, lr=4.03e-4,
warmup=0.1</code>.</p>
<!-- Figures sit below the fold: lazy-load them and decode off the main thread. -->
<div class="figure-grid">
<div class="figure-card" data-fig="/static/figures/sft_loss_curve.png">
<div class="figure-img-wrap"><img src="/static/figures/sft_loss_curve.png"
alt="SFT loss curve over training" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>SFT Loss Curve</strong>
<span>Train + validation loss across the SFT run.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/optuna_param_importance.png">
<div class="figure-img-wrap"><img src="/static/figures/optuna_param_importance.png"
alt="Optuna parameter importances" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Optuna Parameter Importances</strong>
<span>Which hyperparameters mattered most for the SFT objective.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/optuna_history.png">
<div class="figure-img-wrap"><img src="/static/figures/optuna_history.png"
alt="Optuna optimization history" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Optuna History</strong>
<span>Best objective value over the 6-trial TPE search.</span>
</div>
</div>
</div>
</div>
<!-- GRPO results -->
<div class="card animate-up">
<h3>GRPO &mdash; Live Multi-step Env Eval</h3>
<p>After 35 GRPO steps on top of the SFT adapter (best Optuna config: <code style="background:#f1f3f4;padding:0.1rem 0.4rem;border-radius:4px;font-size:0.9rem;">lr=1.6e-5, β=0.0021,
T=0.99</code>), re-evaluated end-to-end on 100+ episodes.</p>
<!-- Column headers carry scope="col" so assistive technology can associate each value with its column. -->
<table class="results-table">
<thead>
<tr>
<th scope="col">Metric</th>
<th scope="col" class="num">Base + SFT</th>
<th scope="col" class="num">+ GRPO</th>
<th scope="col" class="num">Δ</th>
</tr>
</thead>
<tbody>
<tr>
<!-- NOTE(review): the displayed values give 86.2 − 86.8 = −0.6; the −0.5 pp shown presumably
     comes from unrounded figures. Confirm against the eval output. -->
<td>Overall success</td>
<td class="num">86.8%</td>
<td class="num">86.2%</td>
<td class="num delta-flat">&minus;0.5 pp</td>
</tr>
<tr>
<td>Beginner</td>
<td class="num">96.2%</td>
<td class="num">100.0%</td>
<td class="num delta-up">+3.8 pp</td>
</tr>
<tr>
<td>Intermediate</td>
<td class="num">81.0%</td>
<td class="num">87.0%</td>
<td class="num delta-up">+6.0 pp</td>
</tr>
<tr>
<td>Expert</td>
<td class="num">22.2%</td>
<td class="num">22.2%</td>
<td class="num delta-flat">flat</td>
</tr>
<tr>
<td>Drift repair</td>
<td class="num">22.2%</td>
<td class="num">22.2%</td>
<td class="num delta-flat">flat</td>
</tr>
<tr>
<!-- A lower failure rate is an improvement, hence delta-up on a negative number. -->
<td>Destructive-action fail</td>
<td class="num">15.1%</td>
<td class="num">14.7%</td>
<td class="num delta-up">&minus;0.4 pp</td>
</tr>
</tbody>
</table>
<p style="margin-top: 1rem; font-size: 0.95rem;"><strong>Honest reading:</strong> the 35-step GRPO run
preserves the SFT gains and modestly improves the middle tiers, but does not crack the expert-tier
bottleneck. Longer runs and more curriculum exposure to expert tasks are next.</p>
<!-- Figures sit below the fold: lazy-load them and decode off the main thread. -->
<div class="figure-grid" style="margin-top: 1.25rem;">
<div class="figure-card" data-fig="/static/figures/sft_vs_grpo_metrics_grid.png">
<div class="figure-img-wrap"><img src="/static/figures/sft_vs_grpo_metrics_grid.png"
alt="SFT vs GRPO metrics grid" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>SFT vs GRPO &mdash; Metrics Grid</strong>
<span>Side-by-side eval across all multi-step metrics.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/sft_vs_grpo_by_tier.png">
<div class="figure-img-wrap"><img src="/static/figures/sft_vs_grpo_by_tier.png"
alt="SFT vs GRPO by tier" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>SFT vs GRPO &mdash; By Tier</strong>
<span>Where GRPO actually moves the needle (and where it doesn&rsquo;t).</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/qualitative_rollouts.png">
<div class="figure-img-wrap"><img src="/static/figures/qualitative_rollouts.png"
alt="Qualitative rollouts on representative tasks" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Qualitative Rollouts</strong>
<span>One sample episode per tier, post-GRPO.</span>
</div>
</div>
</div>
</div>
<!-- GRPO training curves -->
<div class="card animate-up">
<h3>GRPO Training Curves</h3>
<p>Per-step training signals from the final 35-step GRPO run, plus the 4-trial Optuna search that picked the
final config.</p>
<!-- Six figures, all below the fold: lazy-load them and decode off the main thread. -->
<div class="figure-grid">
<div class="figure-card" data-fig="/static/figures/grpo_reward_curve.png">
<div class="figure-img-wrap"><img src="/static/figures/grpo_reward_curve.png"
alt="GRPO env reward over training" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>GRPO Env Reward</strong>
<span>Mean reward across G=8 rollouts at each training step.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/grpo_per_tier_curve.png">
<div class="figure-img-wrap"><img src="/static/figures/grpo_per_tier_curve.png"
alt="GRPO per-tier reward curve" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Per-Tier Reward Curve</strong>
<span>How each curriculum tier responds to GRPO updates.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/grpo_final_per_step.png">
<div class="figure-img-wrap"><img src="/static/figures/grpo_final_per_step.png"
alt="GRPO final per-step training signals" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>Final Per-Step Signals</strong>
<span>Reward, KL, loss, and policy ratio across the final run.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/grpo_optuna_trials_comparison.png">
<div class="figure-img-wrap"><img src="/static/figures/grpo_optuna_trials_comparison.png"
alt="GRPO Optuna trial comparison" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>GRPO Optuna Trials</strong>
<span>Reward trajectories across 4 Optuna trials.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/grpo_optuna_importances.png">
<div class="figure-img-wrap"><img src="/static/figures/grpo_optuna_importances.png"
alt="GRPO Optuna parameter importances" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>GRPO Param Importances</strong>
<span>Which knobs moved GRPO objective the most.</span>
</div>
</div>
<div class="figure-card" data-fig="/static/figures/grpo_optuna_history.png">
<div class="figure-img-wrap"><img src="/static/figures/grpo_optuna_history.png"
alt="GRPO Optuna optimization history" loading="lazy" decoding="async"></div>
<div class="figure-caption">
<strong>GRPO Optuna History</strong>
<span>Best objective value over the 4-trial search.</span>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- API Access -->
<div class="section-wrapper" id="api">
<div class="left-col">
<div class="icon-container">
<!-- Decorative section icon; the adjacent text already names the section. -->
<svg viewBox="0 0 24 24" aria-hidden="true">
<path d="M8 9l3 3-3 3m5 0h3M5 20h14a2 2 0 002-2V6a2 2 0 00-2-2H5a2 2 0 00-2 2v12a2 2 0 002 2z" />
</svg>
</div>
<span class="section-title">API</span>
</div>
<div class="right-col">
<div class="animate-up">
<div class="card">
<div class="code-header">
<h3 style="font-size:1.1rem;margin-bottom:0;">WebSocket</h3>
<!-- Explicit type="button" so the control never acts as a form submit; the icon is decorative. -->
<button class="copy-btn" type="button" onclick="copyCode(this)">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
<rect x="9" y="9" width="13" height="13" rx="2" />
<path d="M5 15H4a2 2 0 01-2-2V4a2 2 0 012-2h9a2 2 0 012 2v1" />
</svg>
Copy
</button>
</div>
<!-- The snippet below is Python: its indentation is significant and must be kept as written. -->
<div class="code-block" data-lang="python">
import websockets, json
async def main():
    async with websockets.connect("wss://sizzing-aws-rl-env.hf.space/ws"
    ) as ws:
        # Reset environment
        await ws.send(json.dumps({
            "type": "reset"
        }))
        obs = json.loads(await ws.recv())
        # Execute a command
        await ws.send(json.dumps({
            "type": "step",
            "data": {"command": "aws s3 ls"}
        }))
        obs = json.loads(await ws.recv())
if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
</div>
</div>
<div class="card">
<div class="code-header">
<h3 style="font-size:1.1rem;margin-bottom:0;">Python Client</h3>
<!-- Explicit type="button" so the control never acts as a form submit; the icon is decorative. -->
<button class="copy-btn" type="button" onclick="copyCode(this)">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
<rect x="9" y="9" width="13" height="13" rx="2" />
<path d="M5 15H4a2 2 0 01-2-2V4a2 2 0 012-2h9a2 2 0 012 2v1" />
</svg>
Copy
</button>
</div>
<!-- The snippet below is Python: its indentation is significant and must be kept as written. -->
<div class="code-block" data-lang="python">
import asyncio
from aws_rl_env import AwsRlEnv, AwsRlAction
async def main():
    async with AwsRlEnv.from_env(
        "sizzing/aws-rl-env"
    ) as env:
        result = await env.step(
            AwsRlAction(command="aws s3 ls")
        )
asyncio.run(main())
</div>
</div>
</div>
</div>
</div>
<!-- Playground -->
<div class="section-wrapper" id="playground">
  <div class="left-col">
    <div class="icon-container">
      <svg viewBox="0 0 24 24" aria-hidden="true">
        <path
          d="M14.752 11.168l-3.197-2.132A1 1 0 0010 9.87v4.263a1 1 0 001.555.832l3.197-2.132a1 1 0 000-1.664z" />
        <path d="M21 12a9 9 0 11-18 0 9 9 0 0118 0z" />
      </svg>
    </div>
    <span class="section-title">Play</span>
  </div>
  <div class="right-col">
    <div class="animate-up">
      <!-- Row 1: Controls + Task -->
      <div class="pg-row-2col" style="margin-bottom:1rem;">
        <div class="minimal-card" style="display:flex;flex-direction:column;gap:0.75rem;">
          <span class="card-label">Controls</span>
          <button class="btn-primary btn-full" id="resetBtn" type="button" onclick="resetEnv()">New Episode</button>
        </div>
        <div class="task-box empty" id="taskBox">
          <p style="font-size:1rem;font-weight:500;margin-bottom:.25rem;">Click New Episode to start</p>
          <p style="font-size:.9rem;color:var(--text-muted);margin-bottom:0;">The curriculum assigns a task matching
            your skill level</p>
        </div>
      </div>
      <!-- Row 2: Command + Status/Solution -->
      <div class="pg-row-2col" style="margin-bottom:1rem;">
        <div class="minimal-card" style="display:flex;flex-direction:column;gap:0.75rem;">
          <!-- The visible "Command" caption doubles as the input's accessible name. -->
          <span class="card-label" id="cmdInputLabel">Command</span>
          <input class="cmd-input" id="cmdInput" type="text" placeholder="aws s3 ls"
            aria-labelledby="cmdInputLabel" autocomplete="off" autocapitalize="off" spellcheck="false"
            onkeydown="if(event.key==='Enter')runCmd()" disabled>
          <button class="btn-secondary btn-full" id="runBtn" type="button" onclick="runCmd()" disabled>Run Command</button>
          <button class="btn-solution btn-full" id="solutionBtn" type="button" onclick="toggleSolution()" disabled>
            <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true">
              <path
                d="M9.663 17h4.673M12 3v1m6.364 1.636l-.707.707M21 12h-1M4 12H3m3.343-5.657l-.707-.707m2.828 9.9a5 5 0 117.072 0l-.548.547A3.374 3.374 0 0014 18.469V19a2 2 0 11-4 0v-.531c0-.895-.356-1.754-.988-2.386l-.548-.547z" />
            </svg>
            Show Solution
          </button>
        </div>
        <div style="display:flex;flex-direction:column;gap:0.75rem;">
          <!-- Live region: the script rewrites this text after every reset/step. -->
          <div class="status-bar" id="statusBar" role="status" aria-live="polite">Ready.</div>
          <div class="solution-panel" id="solutionPanel" style="display:none;">
            <span class="card-label" style="margin-bottom:0.75rem;">Solution Steps</span>
            <div class="solution-commands-scroll">
              <div class="solution-commands" id="solutionCommands"></div>
            </div>
          </div>
        </div>
      </div>
      <!-- Row 3: State + Output -->
      <div class="pg-row-2col" style="margin-bottom:1rem;">
        <div class="minimal-card">
          <span class="card-label">State</span>
          <div class="state-info">
            <div class="state-row"><span class="state-label">Tier</span><span class="state-value"
                id="stateTier">&mdash;</span></div>
            <div class="state-row"><span class="state-label">Episode</span><span
                class="state-value state-episode-id" id="stateEpisode">&mdash;</span></div>
            <div class="state-row"><span class="state-label">Progress</span>
              <div class="progress-bar-container">
                <div class="progress-bar-fill" id="stateProgress" style="width:0%"></div>
              </div>
            </div>
            <div class="state-row"><span class="state-label">Reward</span><span class="state-value"
                id="stateReward">0.00</span></div>
            <div class="state-row"><span class="state-label">Steps</span><span class="state-value"
                id="stateSteps">0</span></div>
            <div class="state-row"><span class="state-label">Hints</span><span class="state-value"
                id="stateHints">0</span></div>
            <div class="state-row"><span class="state-label">Chaos</span><span class="state-value"
                id="stateChaos">&mdash;</span></div>
          </div>
        </div>
        <div class="card" style="margin-bottom:0;">
          <span class="card-label">Output</span>
          <div class="output-box" id="outputBox">No output yet.</div>
        </div>
      </div>
      <!-- Row 4: Command Log (full width) -->
      <div class="card" style="margin-bottom:1rem;">
        <span class="card-label">Command Log</span>
        <div class="log-scroll">
          <table class="log-table">
            <thead>
              <tr>
                <th scope="col">#</th>
                <th scope="col">Command</th>
                <th scope="col">OK</th>
                <th scope="col">Reward</th>
              </tr>
            </thead>
            <tbody id="logBody">
              <tr>
                <td colspan="4" class="log-empty">No commands executed yet</td>
              </tr>
            </tbody>
          </table>
        </div>
      </div>
      <!-- Row 5: AWS Environment (full width) -->
      <div class="card">
        <span class="card-label">AWS Environment</span>
        <div id="infraGrid">
          <p style="color:var(--text-muted);font-size:0.9rem;margin:0;">Start an episode to see live infrastructure
            state.</p>
        </div>
      </div>
    </div>
  </div>
</div>
<!-- Links -->
<div class="section-wrapper" id="links">
  <div class="left-col">
    <div class="icon-container">
      <svg viewBox="0 0 24 24">
        <path
          d="M13.828 10.172a4 4 0 00-5.656 0l-4 4a4 4 0 105.656 5.656l1.102-1.101m-.758-4.899a4 4 0 005.656 0l4-4a4 4 0 00-5.656-5.656l-1.1 1.1" />
      </svg>
    </div>
    <span class="section-title">Links</span>
  </div>
  <div class="right-col">
    <!-- Call-to-action card: project repository plus the Hugging Face artefacts -->
    <div class="card cta-card animate-up" style="text-align:center; padding: 3rem 2rem;">
      <h3 style="font-size:1.8rem; margin-bottom:0.75rem;">Build the Future of AI</h3>
      <p style="max-width:520px; margin: 0 auto 2rem; font-size:1.05rem;">
        Star it, fork it, break it, fix it &mdash;
        every episode makes AI agents better at cloud operations.
      </p>
      <div style="display:flex; gap:1rem; justify-content:center; flex-wrap:wrap;">
        <a class="btn-primary" href="https://github.com/udaykiranpadhy/aws-rl-env" target="_blank" rel="noopener"
          style="display:inline-flex; align-items:center; gap:0.5rem;">
          <svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor">
            <path
              d="M12 0C5.37 0 0 5.37 0 12c0 5.31 3.435 9.795 8.205 11.385.6.105.825-.255.825-.57 0-.285-.015-1.23-.015-2.235-3.015.555-3.795-.735-4.035-1.41-.135-.345-.72-1.41-1.23-1.695-.42-.225-1.02-.78-.015-.795.945-.015 1.62.87 1.845 1.23 1.08 1.815 2.805 1.305 3.495.99.105-.78.42-1.305.765-1.605-2.67-.3-5.46-1.335-5.46-5.925 0-1.305.465-2.385 1.23-3.225-.12-.3-.54-1.53.12-3.18 0 0 1.005-.315 3.3 1.23.96-.27 1.98-.405 3-.405s2.04.135 3 .405c2.295-1.56 3.3-1.23 3.3-1.23.66 1.65.24 2.88.12 3.18.765.84 1.23 1.905 1.23 3.225 0 4.605-2.805 5.625-5.475 5.925.435.375.81 1.095.81 2.22 0 1.605-.015 2.895-.015 3.3 0 .315.225.69.825.57A12.02 12.02 0 0024 12c0-6.63-5.37-12-12-12z" />
          </svg>
          GitHub Repo
        </a>
        <a class="btn-secondary" href="https://huggingface.co/spaces/Sizzing/aws_rl_env" target="_blank" rel="noopener"
          style="display:inline-flex; align-items:center; gap:0.5rem;">
          &#129303; HF Space
        </a>
        <a class="btn-secondary" href="https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter" target="_blank" rel="noopener"
          style="display:inline-flex; align-items:center; gap:0.5rem;">
          &#129303; SFT Adapter
        </a>
        <a class="btn-secondary" href="https://huggingface.co/datasets/Sizzing/aws-rl-sft" target="_blank" rel="noopener"
          style="display:inline-flex; align-items:center; gap:0.5rem;">
          &#129303; Dataset
        </a>
      </div>
    </div>
  </div>
</div>
</div>
<!-- Figure Lightbox -->
<!-- Overlay container holding a single, initially empty image slot -->
<div id="figure-lightbox">
  <img id="figure-lightbox-img" alt="" src="">
</div>
<!-- Feature Detail Modal -->
<div id="feature-modal">
  <div class="modal-container">
    <!-- "×" alone is announced as a symbol; aria-label gives the button a real name. -->
    <button class="close-modal" id="close-modal-btn" type="button" aria-label="Close">&times;</button>
    <h2 id="modal-title"></h2>
    <div class="modal-grid">
      <div class="modal-section">
        <span class="modal-label">What it does</span>
        <p id="modal-what"></p>
        <span class="modal-label">Why it matters</span>
        <p id="modal-why"></p>
        <span class="modal-label">How it works</span>
        <p id="modal-how"></p>
      </div>
      <div class="modal-section">
        <span class="modal-label">Key metrics</span>
        <div id="modal-metrics"></div>
        <span class="modal-label">Architecture</span>
        <div class="diag-container" id="modal-diagram"></div>
      </div>
    </div>
  </div>
</div>
<!-- Command Log Detail Modal -->
<div id="log-modal">
  <div class="modal-container">
    <!-- "×" alone is announced as a symbol; aria-label gives the button a real name. -->
    <button class="close-modal" type="button" aria-label="Close" onclick="closeLogModal()">&times;</button>
    <h2 id="log-modal-title">Command Detail</h2>
    <div class="log-modal-body">
      <span class="modal-label">Command</span>
      <div class="code-block" id="log-modal-cmd" style="margin-bottom:1.5rem;"></div>
      <div class="pg-row-2col" style="gap:1.5rem;">
        <div>
          <span class="modal-label">Status</span>
          <p id="log-modal-status"></p>
          <span class="modal-label">Reward</span>
          <p id="log-modal-reward"></p>
        </div>
        <div>
          <span class="modal-label">Output</span>
          <div class="output-box" id="log-modal-output" style="max-height:300px;"></div>
        </div>
      </div>
    </div>
  </div>
</div>
<!-- Infra Service Detail Modal -->
<div id="infra-modal">
  <div class="modal-container">
    <!-- "×" alone is announced as a symbol; aria-label gives the button a real name. -->
    <button class="close-modal" type="button" aria-label="Close" onclick="closeInfraModal()">&times;</button>
    <h2 id="infra-modal-title"></h2>
    <div id="infra-modal-body"></div>
  </div>
</div>
<footer>
  <div class="footer-content">
    <div class="footer-brand">
      <h3>AWS RL Environment</h3>
      <p>Train AI agents on real AWS cloud operations with curriculum-based reinforcement learning.</p>
    </div>
    <div class="footer-links-group">
      <h4>Navigation</h4>
      <ul>
        <li><a href="#about">About</a></li>
        <li><a href="#tiers">Tasks</a></li>
        <li><a href="#features">Features</a></li>
        <li><a href="#results">Results</a></li>
        <li><a href="#api">API Docs</a></li>
        <li><a href="#playground">Playground</a></li>
      </ul>
    </div>
    <div class="footer-links-group">
      <h4>Resources</h4>
      <ul>
        <li><a href="https://github.com/udaykiranpadhy/aws-rl-env" target="_blank" rel="noopener">GitHub</a></li>
        <li><a href="https://huggingface.co/spaces/Sizzing/aws_rl_env" target="_blank" rel="noopener">HF Space</a></li>
        <li><a href="https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter" target="_blank"
            rel="noopener">SFT Adapter</a></li>
        <li><a href="https://huggingface.co/datasets/Sizzing/aws-rl-sft" target="_blank" rel="noopener">SFT Dataset</a>
        </li>
        <!-- Same rel as every other new-tab link so the opened page gets no window.opener handle. -->
        <li><a href="/docs" target="_blank" rel="noopener">API Reference</a></li>
        <li><a href="/redoc" target="_blank" rel="noopener">ReDoc</a></li>
      </ul>
    </div>
    <div class="footer-links-group">
      <h4>Connect</h4>
      <ul>
        <li><a href="https://portfolio.udaykp.dev" target="_blank" rel="noopener">Portfolio</a></li>
        <li><a href="https://linkedin.com/in/udaykiranpadhy" target="_blank" rel="noopener">LinkedIn</a></li>
        <li><a href="mailto:kiranu941@gmail.com">Email</a></li>
      </ul>
    </div>
  </div>
  <div class="footer-bottom">
    <p>&copy; 2026 Uday Kiran Padhy. Built from scratch with Python, FastAPI &amp; JS.</p>
  </div>
</footer>
<script>
/* ===== Navbar scroll — pill shape on scroll ===== */
// `nav` is also read by updateActiveNav() below, so it stays a top-level binding.
const nav = document.getElementById('navbar');
window.addEventListener('scroll', function () {
  // Past 40px of vertical scroll the bar gets the `scrolled` class; above that it loses it.
  const pastThreshold = window.scrollY > 40;
  nav.classList.toggle('scrolled', pastThreshold);
}, { passive: true });
/* ===== Active nav link on scroll ===== */
const sectionWrappers = document.querySelectorAll('.section-wrapper[id]');
const navLinks = document.querySelectorAll('.nav-links a');
/**
 * Marks the nav link that points at the section currently being read.
 *
 * The "reading line" sits one third of the viewport height below the top of
 * the viewport. The last section (in document order) whose top edge is at or
 * above that line is the current one; its id is also mirrored onto the nav's
 * `data-active-section` attribute.
 */
function updateActiveNav() {
  const readingLine = window.scrollY + window.innerHeight / 3;
  let current = '';
  for (const section of sectionWrappers) {
    // Convert the viewport-relative top into a document-relative offset.
    const sectionTop = section.getBoundingClientRect().top + window.scrollY;
    if (sectionTop <= readingLine) {
      current = section.id;
    }
  }
  // Update nav links
  const activeHref = '#' + current;
  for (const link of navLinks) {
    link.classList.toggle('active', link.getAttribute('href') === activeHref);
  }
  // Mobile: show active section name
  nav.setAttribute('data-active-section', current || '');
}
window.addEventListener('scroll', updateActiveNav, { passive: true });
updateActiveNav();
/* ===== Smooth scroll with offset for fixed nav ===== */
document.querySelectorAll('a[href^="#"]').forEach(link => {
  link.addEventListener('click', e => {
    // Look the target up by id rather than passing the href to querySelector():
    // a bare "#" (or an id that is not a valid CSS identifier) makes
    // querySelector throw a SyntaxError inside the click handler.
    const fragment = (link.getAttribute('href') || '').slice(1);
    const target = fragment ? document.getElementById(fragment) : null;
    if (!target) return; // nothing to scroll to — leave the browser's default behaviour alone
    e.preventDefault();
    // Document-relative position of the target's top edge.
    const absoluteTop = target.getBoundingClientRect().top + window.scrollY;
    // Stop 100px short of the target so the fixed nav does not cover it.
    const offset = 100;
    window.scrollTo({
      top: absoluteTop - offset,
      behavior: 'smooth'
    });
  });
});
/* ===== Hero parallax grid & spotlight ===== */
const heroBg = document.querySelector('.hero-bg');
document.addEventListener('mousemove', function (event) {
  const pointerX = event.clientX;
  const pointerY = event.clientY;
  // Hero parallax: the background offset is 2% of the pointer position,
  // and the raw pointer position is exposed as --mouse-x / --mouse-y.
  if (heroBg) {
    const heroStyle = heroBg.style;
    heroStyle.setProperty('--bg-x', (pointerX * 0.02) + 'px');
    heroStyle.setProperty('--bg-y', (pointerY * 0.02) + 'px');
    heroStyle.setProperty('--mouse-x', pointerX + 'px');
    heroStyle.setProperty('--mouse-y', pointerY + 'px');
  }
  // Card spotlight tracking: each card receives the pointer position
  // relative to its own top-left corner.
  for (const card of document.querySelectorAll('.card, .minimal-card')) {
    const bounds = card.getBoundingClientRect();
    card.style.setProperty('--mouse-x', (pointerX - bounds.left) + 'px');
    card.style.setProperty('--mouse-y', (pointerY - bounds.top) + 'px');
  }
}, { passive: true });
/* ===== Typewriter — character-by-character reveal ===== */
/**
 * Replaces the text of `el` with one `<span class="char">` per character and
 * reveals them one at a time by adding the `visible` class.
 *
 * @param {Element} el     Element whose text content is animated.
 * @param {number}  delay  Milliseconds before the first character is revealed.
 * @param {number} [speed] Milliseconds between characters; falls back to 30 when falsy.
 * @returns {Promise<void>} Resolves when the last character has been revealed
 *                          (immediately if the element has no text).
 */
function typewrite(el, delay, speed) {
  const stepMs = speed || 30;
  const letters = Array.from(el.textContent);
  el.innerHTML = '';
  const charSpans = letters.map(function (letter) {
    const charSpan = document.createElement('span');
    charSpan.classList.add('char');
    charSpan.textContent = letter;
    el.appendChild(charSpan);
    return charSpan;
  });
  // A real cursor element that is re-inserted after each newly revealed character.
  const caret = document.createElement('span');
  caret.classList.add('typing-cursor');
  caret.textContent = '|';
  return new Promise(function (resolve) {
    const lastIndex = charSpans.length - 1;
    charSpans.forEach(function (charSpan, index) {
      setTimeout(function () {
        charSpan.classList.add('visible');
        charSpan.after(caret);
        if (index === lastIndex) {
          resolve();
        }
      }, delay + index * stepMs);
    });
    if (charSpans.length === 0) resolve();
  });
}
// Hero intro sequence: type the title, then the subtitle, then fade in the CTA.
(async function () {
  const titleEl = document.getElementById('hero-title');
  const subtitleEl = document.getElementById('hero-subtitle');
  // Keep the subtitle out of sight until the title has finished typing.
  if (subtitleEl) {
    subtitleEl.style.visibility = 'hidden';
  }
  if (titleEl) {
    await typewrite(titleEl, 300);
    const titleCaret = titleEl.querySelector('.typing-cursor');
    if (titleCaret) titleCaret.remove();
  }
  if (subtitleEl) {
    subtitleEl.style.visibility = 'visible';
    await typewrite(subtitleEl, 200, 12);
    const subtitleCaret = subtitleEl.querySelector('.typing-cursor');
    if (subtitleCaret) subtitleCaret.remove();
  }
  // Reveal the hero call-to-action 200ms after both animations have completed.
  setTimeout(function () {
    for (const fadeEl of document.querySelectorAll('.hero-fade-up')) {
      fadeEl.classList.add('visible');
    }
  }, 200);
})();
/* ===== Intersection Observer — fade-up on scroll ===== */
// Each `.animate-up` element gets the `visible` class the first time it
// intersects the viewport and is then dropped from observation.
const observer = new IntersectionObserver(function (entries) {
  for (const entry of entries) {
    if (!entry.isIntersecting) continue;
    entry.target.classList.add('visible');
    observer.unobserve(entry.target);
  }
}, { threshold: 0.1, rootMargin: '-50px' });
for (const animated of document.querySelectorAll('.animate-up')) {
  observer.observe(animated);
}
/* ===== Playground Logic ===== */
// Foreground colour of the task badge / task-box border, keyed by difficulty tier.
const COLORS = {
  warmup: '#34a853',
  beginner: '#1a73e8',
  intermediate: '#f9ab00',
  advanced: '#ea4335',
  expert: '#7627bb'
};
// Badge background colour, keyed by the same tiers as COLORS.
const COLOR_BG = {
  warmup: '#e6f4ea',
  beginner: '#e8f0fe',
  intermediate: '#fef7e0',
  advanced: '#fce8e6',
  expert: '#f3e8fd'
};
// Number of commands run in the current episode (reset by resetEnv, bumped by runCmd).
let stepCount = 0;
// Services that have official AWS SVG files in /static/img/aws/
const SVC_IMG_FILES = [
  's3', 'sqs', 'sns', 'lambda', 'dynamodb', 'iam', 'ec2', 'rds', 'rds-data',
  'cloudformation', 'cloudwatch', 'route53', 'apigateway', 'apigateway_v1',
  'elasticache', 'elbv2', 'events', 'ssm', 'cognito-idp', 'cognito-identity',
  'glue', 'firehose', 'athena', 'emr', 'efs', 'ebs', 'kinesis', 'logs',
  'monitoring', 'ses', 'ses_v2', 'acm', 'wafv2', 'states', 'secretsmanager',
  'ecs', 'ecr', 'eks', 'elasticmapreduce', 'elasticloadbalancing',
  'elasticfilesystem', 'appconfig', 'appsync', 'autoscaling', 'cloudfront',
  'codebuild', 'kms', 'scheduler', 'servicediscovery', 'sts', 'tagging',
  'transfer', 's3files'
];
// Inner SVG markup used by _svcIconHtml for services without an image file.
const DEFAULT_ICON = '<circle cx="12" cy="12" r="10"/><path d="M12 8v4M12 16h.01"/>';
/**
 * Returns the icon markup for a service tile.
 *
 * @param {string} svc Service key.
 * @returns {string} An `<img>` pointing at /static/img/aws/<svc>.svg when the
 *                   key is listed in SVC_IMG_FILES, otherwise the generic SVG icon.
 */
function _svcIconHtml(svc) {
  const hasImageFile = SVC_IMG_FILES.includes(svc);
  if (!hasImageFile) {
    return `<svg viewBox="0 0 24 24">${DEFAULT_ICON}</svg>`;
  }
  return `<img src="/static/img/aws/${svc}.svg" alt="${svc}" style="width:36px;height:36px;border-radius:6px;">`;
}
// Cache infra data for modal drill-down
let _lastInfraServices = {};
/**
 * Fetches /web/state and refreshes the playground's state panel:
 * step and hint counters, the chaos indicator, and the grid of service tiles.
 *
 * This is a best-effort refresh: failures are logged to the console and
 * otherwise ignored so a flaky state endpoint never blocks the playground.
 *
 * @returns {Promise<void>}
 */
async function refreshState() {
  try {
    const res = await fetch('/web/state');
    // An error body must not be rendered as if it were a state snapshot.
    if (!res.ok) throw new Error('GET /web/state returned HTTP ' + res.status);
    const state = await res.json();
    // Update sidebar stats
    const tracker = state.tracker;
    document.getElementById('stateSteps').textContent = tracker ? tracker.step_count : '0';
    document.getElementById('stateHints').textContent = tracker ? tracker.hints_used : '0';
    const chaosEl = document.getElementById('stateChaos');
    if (state.chaos_occurred) {
      chaosEl.textContent = 'Active';
      chaosEl.className = 'state-value chaos-active';
    } else {
      chaosEl.textContent = 'None';
      chaosEl.className = 'state-value chaos-inactive';
    }
    // Render infra tiles
    const grid = document.getElementById('infraGrid');
    const services = state.infra_state && state.infra_state.services ? state.infra_state.services : {};
    _lastInfraServices = services;
    const svcKeys = Object.keys(services);
    if (svcKeys.length === 0) {
      grid.innerHTML = '<p style="color:var(--text-muted);font-size:0.9rem;margin:0;">No data.</p>';
      return;
    }
    const tiles = document.createDocumentFragment();
    for (const svc of svcKeys) {
      // `|| {}` keeps one null/undefined service entry from aborting the whole grid.
      let totalCount = 0;
      for (const resData of Object.values(services[svc] || {})) {
        totalCount += _countResources(resData);
      }
      const hasRes = totalCount > 0;
      const tile = document.createElement('div');
      tile.className = 'infra-tile' + (hasRes ? ' has-resources' : '');
      tile.innerHTML =
        (hasRes ? '<span class="infra-tile-badge">' + totalCount + '</span>' : '') +
        '<div class="infra-tile-icon">' + _svcIconHtml(svc) + '</div>' +
        '<span class="infra-tile-name">' + escHtml(svc) + '</span>';
      // Bind the click in script instead of interpolating the service name into
      // an inline onclick string, where a quote in the name would break out of
      // the JavaScript literal.
      tile.addEventListener('click', function () { openInfraModal(svc); });
      tiles.appendChild(tile);
    }
    grid.className = 'infra-tiles';
    grid.innerHTML = '';
    grid.appendChild(tiles);
  } catch (e) {
    // Best effort: keep the previous UI, but leave a trace for debugging.
    console.warn('refreshState failed:', e);
  }
}
// Infra modal
/**
 * Builds the item markup for one resource group in the infra modal.
 *
 * Accepted shapes, checked in this order:
 *   - a non-object value      → one item with its string form
 *   - an array                → one item per element
 *   - `{count, names|ids}`    → one item per name/id, or an "Empty (count)" placeholder
 *   - any other object        → one item per key (object values get a bold
 *                               name and an optional detail; other values are
 *                               shown as `key: <JSON>`)
 *
 * @param {*} obj Resource data for one resource type.
 * @returns {string} HTML string ('' for an object with no keys).
 */
function _renderResItems(obj) {
  const wrapItem = function (innerHtml) {
    return '<div class="infra-res-item">' + innerHtml + '</div>';
  };
  const textItem = function (value) {
    return wrapItem(escHtml(String(value)));
  };

  if (!obj || typeof obj !== 'object') return textItem(obj);

  if (Array.isArray(obj)) {
    return obj.map(function (entry) { return textItem(entry); }).join('');
  }

  // Has {count, names/ids} pattern
  if (typeof obj.count === 'number') {
    const listed = (obj.names || obj.ids || [])
      .map(function (entry) { return textItem(entry); })
      .join('');
    if (listed) return listed;
    return '<div class="infra-res-item" style="color:var(--text-muted);">Empty (' + obj.count + ')</div>';
  }

  // Nested keyed object — render each key as a sub-item
  return Object.keys(obj).map(function (key) {
    const val = obj[key];
    const isRecord = val && typeof val === 'object' && !Array.isArray(val);
    if (!isRecord) {
      return textItem(key + ': ' + JSON.stringify(val));
    }
    // Show key with a summary
    const label = val.name || val.Name || val.id || val.Id || key;
    const detail = val.description || val.engine || val.runtime || val.protocol || '';
    const detailHtml = detail
      ? ' <span style="color:var(--text-muted);">\u2014 ' + escHtml(String(detail)) + '</span>'
      : '';
    return wrapItem('<strong>' + escHtml(String(label)) + '</strong>' + detailHtml);
  }).join('');
}
/**
 * Counts the resources described by one resource-type entry.
 *
 * @param {*} resData A `{count: number, ...}` summary, an array, or an object keyed by id.
 * @returns {number} `count` when it is a number, else the array length, else
 *                   the number of own keys; 0 for anything that is not an object.
 */
function _countResources(resData) {
  const isObjectLike = Boolean(resData) && typeof resData === 'object';
  if (!isObjectLike) {
    return 0;
  }
  if (typeof resData.count === 'number') {
    return resData.count;
  }
  return Array.isArray(resData) ? resData.length : Object.keys(resData).length;
}
/**
 * Opens the infra detail modal for one service, using the service data cached
 * by the last refreshState() call. Does nothing when the service is unknown.
 *
 * Each resource type becomes a collapsible group whose header shows the
 * resource count; clicking the header toggles the `open` class on its body.
 *
 * @param {string} svc Service key as it appears in the cached state.
 */
function openInfraModal(svc) {
  const data = _lastInfraServices[svc];
  if (!data) return;
  document.getElementById('infra-modal-title').textContent = svc.toUpperCase();
  const body = document.getElementById('infra-modal-body');
  let html = '';
  for (const [resType, resData] of Object.entries(data)) {
    if (!resData || typeof resData !== 'object') continue;
    const count = _countResources(resData);
    // The id ends up inside both an id="" attribute and a quoted string in the
    // inline onclick below, so strip everything but [A-Za-z0-9_-] from the
    // whole id (service name included), not just from the resource type.
    const groupId = ('infra-g-' + svc + '-' + resType).replace(/[^a-z0-9_-]/gi, '');
    html += '<div class="infra-res-group">' +
      '<div class="infra-res-header" onclick="var el=document.getElementById(\'' + groupId + '\');if(el)el.classList.toggle(\'open\')">' +
      '<span class="infra-res-title">' + escHtml(resType.replace(/_/g, ' ')) + '</span>' +
      '<span class="infra-res-count">' + count + '</span>' +
      '</div>';
    const itemsHtml = _renderResItems(resData);
    // Groups without items get a header only; the toggle then finds no body and is a no-op.
    if (itemsHtml) {
      html += '<div class="infra-res-body" id="' + groupId + '">' + itemsHtml + '</div>';
    }
    html += '</div>';
  }
  body.innerHTML = html || '<p style="color:var(--text-muted);">No resources in this service.</p>';
  document.getElementById('infra-modal').classList.add('open');
  // Lock page scrolling while the modal is open (undone by closeInfraModal).
  document.body.style.overflow = 'hidden';
}
/** Hides the infra detail modal and restores page scrolling. */
function closeInfraModal() {
  const infraModal = document.getElementById('infra-modal');
  infraModal.classList.remove('open');
  document.body.style.overflow = '';
}
// Command log modal
// One record per executed command, in execution order; filled by runCmd().
let _logEntries = [];
/**
 * Opens the command detail modal for one log entry.
 * Does nothing when no entry exists at `index`.
 *
 * @param {number} index Position of the entry in `_logEntries`.
 */
function openLogModal(index) {
  const entry = _logEntries[index];
  if (!entry) return;
  document.getElementById('log-modal-title').textContent = 'Step #' + entry.step;
  document.getElementById('log-modal-cmd').textContent = entry.command;
  document.getElementById('log-modal-status').innerHTML = entry.success
    ? '<span style="color:#34a853;font-weight:500;">Success</span>'
    : '<span style="color:#ea4335;font-weight:500;">Failed</span>';
  // Coerce like the log table does (Number(reward).toFixed): calling
  // .toFixed() directly on a non-number reward would throw and leave the
  // modal half-populated and unopened.
  const reward = Number(entry.reward);
  document.getElementById('log-modal-reward').textContent = (reward >= 0 ? '+' : '') + reward.toFixed(2);
  document.getElementById('log-modal-output').textContent = entry.output || 'No output';
  document.getElementById('log-modal').classList.add('open');
  // Lock page scrolling while the modal is open (undone by closeLogModal).
  document.body.style.overflow = 'hidden';
}
/** Hides the command detail modal and restores page scrolling. */
function closeLogModal() {
  const logModal = document.getElementById('log-modal');
  logModal.classList.remove('open');
  document.body.style.overflow = '';
}
// Escape closes both playground modals.
document.addEventListener('keydown', function (event) {
  if (event.key !== 'Escape') return;
  closeInfraModal();
  closeLogModal();
});
// A click that lands on the modal root itself (not on anything inside it) closes both as well.
for (const modalId of ['infra-modal', 'log-modal']) {
  const modalRoot = document.getElementById(modalId);
  if (!modalRoot) continue;
  modalRoot.addEventListener('click', function (event) {
    if (event.target.id !== modalId) return;
    closeInfraModal();
    closeLogModal();
  });
}
/**
 * Shows a message in the playground status bar.
 *
 * @param {string} msg    Message markup; it is assigned to innerHTML, so callers
 *                        must escape any untrusted text themselves.
 * @param {string} [type] Extra class appended after `status-bar`
 *                        (callers in this file pass 'info', 'error' or 'success').
 */
function setStatus(msg, type) {
  const statusEl = document.getElementById('statusBar');
  const variant = type ? type : '';
  statusEl.className = `status-bar ${variant}`;
  statusEl.innerHTML = msg;
}
/**
 * Switches a button between its loading and idle appearance.
 *
 * Entering the loading state disables the button, remembers its current text
 * in `data-orig` and prefixes a spinner. Leaving it restores the remembered
 * text but does NOT re-enable the button — callers decide that themselves.
 *
 * @param {HTMLButtonElement} btn
 * @param {boolean} loading
 */
function setLoading(btn, loading) {
  if (!loading) {
    btn.innerHTML = btn.dataset.orig || btn.textContent;
    return;
  }
  btn.disabled = true;
  btn.dataset.orig = btn.textContent;
  btn.innerHTML = '<span class="spinner"></span>' + (btn.dataset.orig || '');
}
/**
 * Escapes a value for safe insertion into HTML text or a quoted attribute.
 *
 * Non-string input is converted with String() instead of throwing (server
 * payloads are not always strings); null and undefined become ''.
 * Quotes are escaped too, so the result cannot terminate an attribute value.
 *
 * @param {*} s Value to escape.
 * @returns {string} The escaped text.
 */
function escHtml(s) {
  if (s === null || s === undefined) return '';
  return String(s)
    .replace(/&/g, '&amp;') // must run first so the entities added below are not re-escaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
/**
 * Starts a new episode.
 *
 * POSTs to /web/reset, renders the assigned task, clears the output, command
 * log and solution panel, re-enables the command controls, resets the state
 * box and finally triggers refreshState(). Any failure is reported in the
 * status bar; the reset button is always re-enabled afterwards.
 *
 * @returns {Promise<void>}
 */
async function resetEnv() {
  const btn = document.getElementById('resetBtn');
  setLoading(btn, true);
  setStatus('Resetting environment...', 'info');
  try {
    const res = await fetch('/web/reset', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: '{}'
    });
    const data = await res.json();
    // Same handling as runCmd: surface the server's error instead of trying to
    // read an observation out of an error body.
    if (!res.ok) {
      const detail = typeof data.detail === 'string' ? data.detail : JSON.stringify(data.detail || data);
      setStatus('Reset failed: ' + escHtml(detail), 'error');
      return;
    }
    const obs = data.observation;
    if (!obs) throw new Error('response did not include an observation');
    stepCount = 0;
    const task = obs.task;
    // Normalise once so a missing difficulty cannot make the escaping below throw.
    const difficulty = task && task.difficulty != null ? String(task.difficulty) : 'unknown';
    const box = document.getElementById('taskBox');
    if (task) {
      const color = COLORS[difficulty] || '#5f6368';
      const bg = COLOR_BG[difficulty] || '#f1f3f4';
      const description = task.description != null ? String(task.description) : '';
      box.className = 'task-box';
      box.style.borderLeftColor = color;
      // Every server-provided value is escaped before it reaches innerHTML.
      box.innerHTML =
        '<div>' +
        '<span class="task-badge" style="background:' + bg + ';color:' + color + ';">' + escHtml(difficulty) + '</span>' +
        '<span class="task-meta">Task #' + escHtml(String(task.task_id)) + '</span>' +
        '</div>' +
        '<p class="task-desc">' + escHtml(description) + '</p>';
    }
    document.getElementById('outputBox').textContent = obs.command_output || '';
    document.getElementById('logBody').innerHTML =
      '<tr><td colspan="4" class="log-empty">No commands executed yet</td></tr>';
    _logEntries = [];
    // Enable command controls
    const cmdInput = document.getElementById('cmdInput');
    const runBtn = document.getElementById('runBtn');
    const solutionBtn = document.getElementById('solutionBtn');
    cmdInput.disabled = false;
    runBtn.disabled = false;
    // Clear the "episode ended" marker that runCmd uses to keep the button disabled.
    delete runBtn.dataset.ended;
    solutionBtn.disabled = false;
    solutionBtn.innerHTML = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M9.663 17h4.673M12 3v1m6.364 1.636l-.707.707M21 12h-1M4 12H3m3.343-5.657l-.707-.707m2.828 9.9a5 5 0 117.072 0l-.548.547A3.374 3.374 0 0014 18.469V19a2 2 0 11-4 0v-.531c0-.895-.356-1.754-.988-2.386l-.548-.547z"/></svg> Show Solution';
    document.getElementById('solutionPanel').style.display = 'none';
    document.getElementById('solutionCommands').innerHTML = '';
    cmdInput.value = '';
    cmdInput.focus();
    // Update state box
    document.getElementById('stateTier').textContent = task ? difficulty : '\u2014';
    document.getElementById('stateEpisode').textContent = obs.episode_id || '1';
    document.getElementById('stateProgress').style.width = '0%';
    document.getElementById('stateReward').textContent = '0.00';
    setStatus('New episode started. Difficulty: <strong>' + escHtml(difficulty) + '</strong>', 'info');
    refreshState();
  } catch (e) {
    setStatus('Reset failed: ' + escHtml(String(e && e.message)), 'error');
  } finally {
    setLoading(btn, false);
    btn.disabled = false;
  }
}
/**
 * Sends the command typed into #cmdInput to /web/step and renders the result:
 * output box, a new command-log row, progress bar, cumulative reward and the
 * status bar. When the task is achieved or the episode is done, the command
 * controls are disabled until resetEnv() starts a new episode.
 *
 * @returns {Promise<void>}
 */
async function runCmd() {
  const input = document.getElementById('cmdInput');
  const cmd = input.value.trim();
  if (!cmd) return;
  const btn = document.getElementById('runBtn');
  // The Enter-key handler on the input calls this function directly, so the
  // disabled button alone does not stop a second submit while a request is
  // still in flight (or after the episode has ended).
  if (btn.disabled) return;
  setLoading(btn, true);
  try {
    const res = await fetch('/web/step', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ action: { command: cmd } })
    });
    const data = await res.json();
    if (!res.ok) {
      // `detail` is not guaranteed to be a string; stringify structured details.
      const detail = typeof data.detail === 'string' ? data.detail : JSON.stringify(data.detail || data);
      setStatus('Error: ' + escHtml(detail), 'error');
      return;
    }
    const obs = data.observation;
    stepCount++;
    const output = obs.command_success
      ? (obs.command_output || '')
      : (obs.error || obs.command_output || '');
    document.getElementById('outputBox').textContent = output;
    const tbody = document.getElementById('logBody');
    // The first command of an episode replaces the "No commands executed yet" placeholder row.
    if (stepCount === 1) { tbody.innerHTML = ''; _logEntries = []; }
    // Normalise to a finite number once: a string reward would otherwise be
    // concatenated onto the cumulative total instead of added.
    const rawReward = obs.reward != null ? obs.reward : data.reward;
    const reward = Number(rawReward) || 0;
    const rewardText = (reward >= 0 ? '+' : '') + reward.toFixed(2);
    const logIdx = _logEntries.length;
    _logEntries.push({ step: stepCount, command: cmd, success: obs.command_success, reward: reward, output: output });
    const tr = document.createElement('tr');
    tr.onclick = function () { openLogModal(logIdx); };
    // The table shows at most 60 characters; the full command is available in the modal.
    const displayCmd = cmd.length > 60 ? cmd.slice(0, 57) + '...' : cmd;
    tr.innerHTML =
      '<td>' + stepCount + '</td>' +
      '<td class="cmd">' + escHtml(displayCmd) + '</td>' +
      '<td class="' + (obs.command_success ? 'yes' : 'no') + '">' + (obs.command_success ? 'Yes' : 'No') + '</td>' +
      '<td>' + rewardText + '</td>';
    tbody.appendChild(tr);
    // Update state box
    const progress = obs.partial_progress != null ? obs.partial_progress : 0;
    document.getElementById('stateProgress').style.width = (progress * 100) + '%';
    const rewardEl = document.getElementById('stateReward');
    const cumReward = (parseFloat(rewardEl.textContent) || 0) + reward;
    rewardEl.textContent = cumReward.toFixed(2);
    // Locks the command controls; `data-ended` tells the finally block below
    // (and only resetEnv clears it) to leave the run button disabled.
    const endEpisode = function () {
      input.disabled = true;
      btn.disabled = true;
      btn.dataset.ended = '1';
      document.getElementById('solutionBtn').disabled = true;
    };
    if (obs.task_achieved) {
      setStatus('Task completed! Step ' + obs.step_count + ', reward: ' + rewardText + '. Click <strong>New Episode</strong> for the next task.', 'success');
      endEpisode();
    } else if (data.done) {
      setStatus('Episode ended. Click <strong>New Episode</strong> to try again.', 'error');
      endEpisode();
    } else {
      setStatus('Step <strong>' + obs.step_count + '</strong> &mdash; ' + (obs.command_success ? 'Command succeeded.' : 'Command failed.'), obs.command_success ? 'info' : 'error');
    }
    refreshState();
    input.value = '';
    input.focus();
  } catch (e) {
    setStatus('Request failed: ' + escHtml(String(e && e.message)), 'error');
  } finally {
    setLoading(btn, false);
    // Re-enable if episode is still active (not disabled by completion/done handlers above)
    if (!btn.dataset.ended) {
      btn.disabled = false;
    }
  }
}
</script>
<script>
// Python syntax highlighting
/**
 * Converts Python source text into HTML with `hl-*` span classes.
 *
 * Single pass over the input, recognising in this order: `#` comments (to end
 * of line), single- or double-quoted strings (backslash skips the next
 * character), identifier-like words (classified as keyword, builtin or plain),
 * and the punctuation characters `()[]{}:`. Everything else is emitted
 * HTML-escaped, one character at a time.
 *
 * @param {string} code Raw Python source.
 * @returns {string} HTML markup.
 */
function highlightPython(code) {
  const escapeHtml = function (text) {
    return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
  };
  const span = function (kind, text) {
    return '<span class="hl-' + kind + '">' + escapeHtml(text) + '</span>';
  };
  const KEYWORDS = ['import', 'from', 'as', 'with', 'async', 'await', 'if', 'else', 'elif', 'for', 'while', 'return', 'def', 'class', 'try', 'except', 'finally', 'raise', 'yield', 'pass', 'break', 'continue', 'and', 'or', 'not', 'in', 'is', 'True', 'False', 'None'];
  const BUILTINS = ['print', 'len', 'range', 'type', 'str', 'int', 'float', 'list', 'dict', 'set', 'tuple', 'json', 'self'];
  const WORD_START = /[a-zA-Z_]/;
  const WORD_CHAR = /[a-zA-Z0-9_]/;
  const PUNCTUATION = '()[]{}:';

  let html = '';
  let pos = 0;
  while (pos < code.length) {
    const ch = code[pos];

    // Comments
    if (ch === '#') {
      const newline = code.indexOf('\n', pos);
      const stop = newline === -1 ? code.length : newline;
      html += span('comment', code.slice(pos, stop));
      pos = stop;
      continue;
    }

    // Strings (double or single quoted)
    if (ch === '"' || ch === "'") {
      let stop = pos + 1;
      while (stop < code.length && code[stop] !== ch) {
        // A backslash consumes the character after it as well.
        stop += code[stop] === '\\' ? 2 : 1;
      }
      // Include the closing quote, clamped for unterminated strings.
      stop = Math.min(stop + 1, code.length);
      html += span('string', code.slice(pos, stop));
      pos = stop;
      continue;
    }

    // Words (keywords, builtins, etc)
    if (WORD_START.test(ch)) {
      let stop = pos;
      while (stop < code.length && WORD_CHAR.test(code[stop])) stop++;
      const word = code.slice(pos, stop);
      if (KEYWORDS.includes(word)) html += span('keyword', word);
      else if (BUILTINS.includes(word)) html += span('builtin', word);
      else html += escapeHtml(word);
      pos = stop;
      continue;
    }

    // Parens/brackets
    html += PUNCTUATION.includes(ch) ? span('punct', ch) : escapeHtml(ch);
    pos++;
  }
  return html;
}
// Highlight every Python code block once, keeping the untouched source in
// data-raw (copyCode reads it back from there).
for (const codeBlock of document.querySelectorAll('.code-block[data-lang="python"]')) {
  const source = codeBlock.textContent;
  codeBlock.dataset.raw = source;
  codeBlock.innerHTML = highlightPython(source);
}
// Copy code to clipboard
/**
 * Copies the code of the card that contains `btn` to the clipboard and shows
 * a "Copied!" confirmation on the button for two seconds.
 *
 * Uses the async Clipboard API when available and falls back to a hidden
 * textarea with document.execCommand('copy') otherwise (or when the Clipboard
 * API rejects). The confirmation is shown only when a copy actually succeeded.
 *
 * @param {HTMLElement} btn The copy button that was clicked.
 */
function copyCode(btn) {
  var card = btn.closest('.card');
  var codeBlock = card ? card.querySelector('.code-block') : null;
  if (!codeBlock) return; // button is not inside a card with a code block
  // Prefer the raw source stored before highlighting.
  var text = codeBlock.dataset.raw || codeBlock.textContent;
  function onSuccess() {
    btn.classList.add('copied');
    btn.innerHTML = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M20 6L9 17l-5-5"/></svg> Copied!';
    setTimeout(function () {
      btn.classList.remove('copied');
      btn.innerHTML = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="9" y="9" width="13" height="13" rx="2"/><path d="M5 15H4a2 2 0 01-2-2V4a2 2 0 012-2h9a2 2 0 012 2v1"/></svg> Copy';
    }, 2000);
  }
  function fallbackCopy() {
    var textarea = document.createElement('textarea');
    textarea.value = text;
    textarea.setAttribute('readonly', '');
    // Keep the helper element off-screen.
    textarea.style.position = 'absolute';
    textarea.style.left = '-9999px';
    document.body.appendChild(textarea);
    textarea.select();
    var copied = false;
    try {
      // execCommand reports failure through its return value, not by throwing.
      copied = document.execCommand('copy');
    } catch (e) {
      copied = false;
    } finally {
      document.body.removeChild(textarea);
    }
    if (copied) onSuccess();
  }
  if (navigator.clipboard && navigator.clipboard.writeText) {
    navigator.clipboard.writeText(text).then(onSuccess).catch(fallbackCopy);
  } else {
    fallbackCopy();
  }
}
// Solution — show next step
/**
 * Fetches the next solution step from /web/solution, appends it to the
 * solution panel and pre-fills the command input with it. When the server
 * returns no further command, a closing note is appended below the steps
 * already shown.
 *
 * @returns {Promise<void>}
 */
async function toggleSolution() {
  var panel = document.getElementById('solutionPanel');
  var btn = document.getElementById('solutionBtn');
  var container = document.getElementById('solutionCommands');
  var cmdInput = document.getElementById('cmdInput');
  try {
    var res = await fetch('/web/solution');
    var data = await res.json();
    // An HTTP error body has neither `error` nor `command`; without this check
    // it would be reported as "All solution steps shown".
    if (!res.ok) {
      var detail = typeof data.detail === 'string' ? data.detail : JSON.stringify(data.detail || data);
      setStatus('Failed to load solution: ' + escHtml(detail), 'error');
      return;
    }
    if (data.error) {
      // setStatus writes to innerHTML, so server text must be escaped.
      setStatus(escHtml(String(data.error)), 'error');
      return;
    }
    if (!data.command) {
      // Append the note (once) instead of replacing the list: the note tells
      // the user to run "the commands above", so those must stay visible.
      if (!container.querySelector('.is-note')) {
        var noteEl = document.createElement('div');
        noteEl.className = 'solution-cmd is-note';
        noteEl.innerHTML = '<span class="solution-step">&#x2713;</span><code>All solution steps shown. Run the commands above to complete the task.</code>';
        container.appendChild(noteEl);
      }
      panel.style.display = 'block';
      return;
    }
    var stepLabel = escHtml(String(data.step));
    // Append to solution panel
    var stepEl = document.createElement('div');
    stepEl.className = 'solution-cmd';
    stepEl.innerHTML = '<span class="solution-step">' + stepLabel + '</span><code>' + escHtml(String(data.command)) + '</code>';
    container.appendChild(stepEl);
    panel.style.display = 'block';
    // Auto-fill the command input
    cmdInput.value = data.command;
    cmdInput.focus();
    // Update button text with step info
    btn.innerHTML = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M9.663 17h4.673M12 3v1m6.364 1.636l-.707.707M21 12h-1M4 12H3m3.343-5.657l-.707-.707m2.828 9.9a5 5 0 117.072 0l-.548.547A3.374 3.374 0 0014 18.469V19a2 2 0 11-4 0v-.531c0-.895-.356-1.754-.988-2.386l-.548-.547z"/></svg> Next Step (' + stepLabel + '/' + escHtml(String(data.total_steps)) + ')';
  } catch (e) {
    setStatus('Failed to load solution: ' + escHtml(String(e && e.message)), 'error');
  }
}
// Feature details database
const featureDetails = {
'progressive-difficulty': {
title: 'Progressive Difficulty',
what: 'The environment organizes more than 120 tasks across 5 tiers: Warmup, Beginner, Intermediate, Advanced, and Expert. Tasks progress from simple listing operations to complex SRE incident response and drift detection scenarios.',
why: 'Prevents the agent from being overwhelmed by complex tasks early on. Scaffolded difficulty ensures the agent builds foundational skills before tackling multi-service architectures.',
how: 'The CurriculumManager maintains per-agent tier state. Promotion requires meeting a minimum episode count and success rate threshold. A fast-track mechanism allows agents scoring 90%+ on 3 consecutive episodes to skip the minimum wait.',
metrics: [
{ v: '5', l: 'Difficulty Tiers' },
{ v: '120+', l: 'Total Tasks' },
{ v: '90%', l: 'Fast-track Threshold' }
],
diagram: `<svg viewBox="0 0 320 180" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="10" y="140" width="56" height="30" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="38" y="159" text-anchor="middle" font-size="9" fill="#137333" font-weight="500">Warmup</text>
<rect x="76" y="110" width="56" height="60" rx="6" fill="#e8f0fe" stroke="#174ea6" stroke-width="1.5"/>
<text x="104" y="145" text-anchor="middle" font-size="9" fill="#174ea6" font-weight="500">Beginner</text>
<rect x="142" y="80" width="56" height="90" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="170" y="130" text-anchor="middle" font-size="9" fill="#b05a00" font-weight="500">Intermed.</text>
<rect x="208" y="50" width="56" height="120" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="236" y="115" text-anchor="middle" font-size="9" fill="#c5221f" font-weight="500">Advanced</text>
<rect x="274" y="20" width="42" height="150" rx="6" fill="#f3e8fd" stroke="#7627bb" stroke-width="1.5"/>
<text x="295" y="100" text-anchor="middle" font-size="9" fill="#7627bb" font-weight="500">Expert</text>
<path d="M66 155 L76 140" stroke="#dadce0" stroke-width="1.5" stroke-dasharray="4 2"/>
<path d="M132 140 L142 125" stroke="#dadce0" stroke-width="1.5" stroke-dasharray="4 2"/>
<path d="M198 125 L208 110" stroke="#dadce0" stroke-width="1.5" stroke-dasharray="4 2"/>
<path d="M264 110 L274 95" stroke="#dadce0" stroke-width="1.5" stroke-dasharray="4 2"/>
<text x="160" y="12" text-anchor="middle" font-size="10" fill="#5f6368">Task Complexity Staircase</text>
</svg>`
},
'mastery-tracking': {
title: 'Mastery Tracking',
what: 'Each task independently tracks the agent\'s performance using a weighted success rate over a sliding window. Tasks "graduate" when performance exceeds the mastery threshold consistently.',
why: 'Ensures the agent truly masters a skill before moving on. Prevents lucky single completions from being treated as mastery. Un-graduation catches skill decay.',
how: 'A mastery_window of 10 episodes and mastery_threshold of 0.7 (70% success). Minimum 3 attempts required before graduation. Recent results are weighted more heavily using exponential decay (factor 0.85). Graduated tasks can un-graduate if performance drops.',
metrics: [
{ v: '70%', l: 'Mastery Threshold' },
{ v: '10', l: 'Window Size (episodes)' },
{ v: '0.85', l: 'Decay Factor' }
],
diagram: `<svg viewBox="0 0 320 160" fill="none" xmlns="http://www.w3.org/2000/svg">
<line x1="30" y1="130" x2="300" y2="130" stroke="#dadce0" stroke-width="1"/>
<line x1="30" y1="130" x2="30" y2="20" stroke="#dadce0" stroke-width="1"/>
<line x1="30" y1="52" x2="300" y2="52" stroke="#1a73e8" stroke-width="1" stroke-dasharray="6 3"/>
<text x="305" y="55" font-size="8" fill="#1a73e8">70% threshold</text>
<polyline points="50,120 80,100 110,90 140,70 170,60 200,45 230,40 260,38" stroke="#34a853" stroke-width="2" fill="none" stroke-linecap="round" stroke-linejoin="round"/>
<circle cx="200" cy="45" r="4" fill="#34a853"/>
<text x="200" y="38" text-anchor="middle" font-size="8" fill="#34a853">Graduated!</text>
<text x="165" y="150" text-anchor="middle" font-size="9" fill="#5f6368">Episodes</text>
<text x="15" y="75" font-size="9" fill="#5f6368" transform="rotate(-90 15 75)">Success %</text>
</svg>`
},
'spaced-repetition': {
title: 'Spaced Repetition',
what: 'Graduated tasks don\'t disappear — they resurface at exponentially increasing intervals (3, 6, 12, 24, 48 episodes) for re-testing, earning a +30 priority bonus when due.',
why: 'Prevents catastrophic forgetting. The agent must retain skills even as it learns new ones. Exponential spacing is the most efficient retention schedule, borrowed from cognitive science.',
how: 'Each task tracks a spaced_rep_interval starting at 3 episodes. When the task is re-tested and passes, the interval doubles (up to 48). If it fails, the interval resets. The _is_spaced_rep_due() method checks elapsed episodes against the interval.',
metrics: [
{ v: '+30', l: 'Spaced Rep Bonus' },
{ v: '3\u219248', l: 'Interval Range (episodes)' },
{ v: '2x', l: 'Interval Growth' }
],
diagram: `<svg viewBox="0 0 320 140" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Spaced Repetition Schedule</text>
<rect x="20" y="35" width="30" height="20" rx="4" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="35" y="49" text-anchor="middle" font-size="8" fill="#1a73e8">3</text>
<rect x="70" y="35" width="40" height="20" rx="4" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="90" y="49" text-anchor="middle" font-size="8" fill="#1a73e8">6</text>
<rect x="130" y="35" width="50" height="20" rx="4" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="155" y="49" text-anchor="middle" font-size="8" fill="#1a73e8">12</text>
<rect x="200" y="35" width="50" height="20" rx="4" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="225" y="49" text-anchor="middle" font-size="8" fill="#1a73e8">24</text>
<rect x="270" y="35" width="40" height="20" rx="4" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="290" y="49" text-anchor="middle" font-size="8" fill="#1a73e8">48</text>
<path d="M50 45 L70 45" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr)"/>
<path d="M110 45 L130 45" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr)"/>
<path d="M180 45 L200 45" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr)"/>
<path d="M250 45 L270 45" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr)"/>
<defs><marker id="arr" markerWidth="6" markerHeight="4" refX="6" refY="2" orient="auto"><path d="M0 0 L6 2 L0 4" fill="#dadce0"/></marker></defs>
<text x="160" y="80" text-anchor="middle" font-size="9" fill="#5f6368">Episodes between reviews (doubles on success)</text>
<rect x="60" y="95" width="200" height="35" rx="8" fill="#f8f9fa" stroke="#dadce0" stroke-width="1"/>
<text x="160" y="110" text-anchor="middle" font-size="9" fill="#202124">Pass \u2192 interval doubles</text>
<text x="160" y="122" text-anchor="middle" font-size="9" fill="#ea4335">Fail \u2192 interval resets to 3</text>
</svg>`
},
'priority-selection': {
title: 'Priority Selection',
what: 'Tasks are ranked by a composite score combining novelty, weakness, spaced repetition due dates, and recency. The highest-scoring task is selected for each episode.',
why: 'Optimizes the training curriculum by ensuring the agent explores new tasks, practices weak areas, revisits graduated skills, and maintains variety \u2014 all balanced automatically.',
how: 'score = novelty_bonus (+100 if never attempted) + weakness_weight (+50 \u00d7 (1 - success_rate)) + spaced_rep_bonus (+30 if due) - recency_penalty (-20 if attempted in last 2 episodes). The weighted success rate uses exponential decay (0.85) to emphasize recent performance.',
metrics: [
{ v: '+100', l: 'Novelty Bonus' },
{ v: '+50', l: 'Max Weakness Weight' },
{ v: '-20', l: 'Recency Penalty' }
],
diagram: `<svg viewBox="0 0 320 170" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Priority Score Formula</text>
<rect x="20" y="30" width="130" height="28" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="85" y="48" text-anchor="middle" font-size="9" fill="#137333">novelty_bonus: +100</text>
<rect x="170" y="30" width="130" height="28" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1"/>
<text x="235" y="48" text-anchor="middle" font-size="9" fill="#b05a00">weakness: +50\u00d7(1-rate)</text>
<rect x="20" y="70" width="130" height="28" rx="6" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="85" y="88" text-anchor="middle" font-size="9" fill="#1a73e8">spaced_rep: +30</text>
<rect x="170" y="70" width="130" height="28" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1"/>
<text x="235" y="88" text-anchor="middle" font-size="9" fill="#c5221f">recency: -20</text>
<text x="160" y="120" text-anchor="middle" font-size="11" fill="#202124" font-weight="500">= priority score</text>
<rect x="60" y="130" width="200" height="30" rx="8" fill="#f8f9fa" stroke="#dadce0" stroke-width="1"/>
<text x="160" y="149" text-anchor="middle" font-size="9" fill="#5f6368">Highest score \u2192 next task selected</text>
</svg>`
},
'tier-progression': {
title: 'Tier Progression',
what: 'Agents advance through tiers via standard promotion (minimum episodes + success rate) or fast-track (3 consecutive high-scoring episodes). Tiers gate access to increasingly complex task pools.',
why: 'Provides structure to the learning process. Standard promotion ensures sufficient exposure; fast-track rewards agents that demonstrate immediate competence.',
how: 'Standard: complete min_episodes at current tier with success_rate >= advance_rate. Fast-track: 3 consecutive episodes at >= 90% success bypasses the minimum episode requirement. Un-promotion is not supported \u2014 agents cannot drop tiers.',
metrics: [
{ v: '3', l: 'Fast-track Streak' },
{ v: '90%', l: 'Fast-track Rate' },
{ v: '5', l: 'Total Tiers' }
],
diagram: `<svg viewBox="0 0 320 140" fill="none" xmlns="http://www.w3.org/2000/svg">
<rect x="10" y="90" width="55" height="24" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="37" y="106" text-anchor="middle" font-size="8" fill="#137333">Warmup</text>
<rect x="80" y="90" width="55" height="24" rx="6" fill="#e8f0fe" stroke="#174ea6" stroke-width="1.5"/>
<text x="107" y="106" text-anchor="middle" font-size="8" fill="#174ea6">Beginner</text>
<rect x="150" y="90" width="55" height="24" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="177" y="106" text-anchor="middle" font-size="8" fill="#b05a00">Intermed.</text>
<rect x="220" y="90" width="55" height="24" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="247" y="106" text-anchor="middle" font-size="8" fill="#c5221f">Advanced</text>
<rect x="290" y="90" width="25" height="24" rx="6" fill="#f3e8fd" stroke="#7627bb" stroke-width="1.5"/>
<text x="302" y="106" text-anchor="middle" font-size="7" fill="#7627bb">Exp</text>
<path d="M65 102 L80 102" stroke="#34a853" stroke-width="2" marker-end="url(#arr2)"/>
<path d="M135 102 L150 102" stroke="#34a853" stroke-width="2" marker-end="url(#arr2)"/>
<path d="M205 102 L220 102" stroke="#34a853" stroke-width="2" marker-end="url(#arr2)"/>
<path d="M275 102 L290 102" stroke="#34a853" stroke-width="2" marker-end="url(#arr2)"/>
<defs><marker id="arr2" markerWidth="6" markerHeight="4" refX="6" refY="2" orient="auto"><path d="M0 0 L6 2 L0 4" fill="#34a853"/></marker></defs>
<path d="M37 90 Q37 50 107 50 Q177 50 177 90" stroke="#1a73e8" stroke-width="1.5" stroke-dasharray="4 2" fill="none"/>
<text x="107" y="42" text-anchor="middle" font-size="8" fill="#1a73e8">Fast-track (3x 90%+)</text>
<text x="107" y="130" text-anchor="middle" font-size="8" fill="#5f6368">Standard: min episodes + success rate</text>
</svg>`
},
'rollback-penalty': {
title: 'Rollback Penalty & Idempotency Bonus',
what: 'Detects create\u2192delete pairs on the same resource (rollbacks) and penalizes them (-0.1 each). Rewards graceful "already exists" handling (+0.02) where the agent retries idempotently.',
why: 'First RL environment rewarding operational discipline. In production, create-then-delete cycles are wasteful. Handling "already exists" gracefully is a sign of robust automation.',
how: 'EpisodeTracker.detect_rollbacks() scans command history for paired create/delete operations on the same resource. Idempotency detection looks for commands that fail with "already exists" patterns (BucketAlreadyExists, ResourceInUseException, etc.) followed by successful continuation.',
metrics: [
{ v: '-0.1', l: 'Rollback Penalty' },
{ v: '+0.02', l: 'Idempotency Bonus' },
{ v: 'Per pair', l: 'Detection Granularity' }
],
diagram: `<svg viewBox="0 0 320 160" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Operational Discipline Detection</text>
<rect x="20" y="35" width="130" height="50" rx="8" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="85" y="53" text-anchor="middle" font-size="9" fill="#c5221f" font-weight="500">Rollback</text>
<text x="85" y="66" text-anchor="middle" font-size="8" fill="#5f6368">create-bucket \u2192 delete-bucket</text>
<text x="85" y="78" text-anchor="middle" font-size="9" fill="#c5221f" font-weight="500">-0.1 penalty</text>
<rect x="170" y="35" width="130" height="50" rx="8" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="235" y="53" text-anchor="middle" font-size="9" fill="#137333" font-weight="500">Idempotent</text>
<text x="235" y="66" text-anchor="middle" font-size="8" fill="#5f6368">create fails \u2192 continues ok</text>
<text x="235" y="78" text-anchor="middle" font-size="9" fill="#137333" font-weight="500">+0.02 bonus</text>
<rect x="40" y="105" width="240" height="40" rx="8" fill="#f8f9fa" stroke="#dadce0" stroke-width="1"/>
<text x="160" y="122" text-anchor="middle" font-size="9" fill="#202124">episode_tracker.detect_rollbacks()</text>
<text x="160" y="136" text-anchor="middle" font-size="8" fill="#5f6368">Scans full command history at grading time</text>
</svg>`
},
'shaped-rewards': {
title: 'Shaped Reward System',
what: 'Rewards are carefully shaped: 1.0 for full completion, 0.0-0.8 for partial progress, +0.1 progress bonus for advancing, \u00d70.5 for failures, capped at 0.99 without completion. Chaos bonus (\u00d71.05) and hint decay (\u00d70.85^n) layer on top.',
why: 'Dense reward signal prevents sparse-reward stagnation. The agent gets meaningful feedback on every step, not just at episode end. Capping at 0.99 ensures only real completion earns full credit.',
how: 'TaskGrader dispatches to 5 strategies by tier: command_match (warmup), resource_creation (beginner), multi_step (intermediate), multi_step+services (advanced), and state_checks (expert). Each returns partial_progress which is converted to reward with bonuses/penalties applied.',
metrics: [
{ v: '1.0', l: 'Max Reward' },
{ v: '0.99', l: 'Progress Cap' },
{ v: '\u00d71.05', l: 'Chaos Bonus' }
],
diagram: `<svg viewBox="0 0 320 170" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Reward Pipeline</text>
<rect x="10" y="30" width="70" height="30" rx="6" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="45" y="49" text-anchor="middle" font-size="8" fill="#1a73e8">Base grade</text>
<rect x="90" y="30" width="70" height="30" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="125" y="49" text-anchor="middle" font-size="8" fill="#137333">+0.1 progress</text>
<rect x="170" y="30" width="70" height="30" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1"/>
<text x="205" y="49" text-anchor="middle" font-size="8" fill="#c5221f">\u00d70.5 fail</text>
<rect x="250" y="30" width="60" height="30" rx="6" fill="#f3e8fd" stroke="#7627bb" stroke-width="1"/>
<text x="280" y="49" text-anchor="middle" font-size="8" fill="#7627bb">Clamp</text>
<path d="M80 45 L90 45" stroke="#dadce0" stroke-width="1.5"/>
<path d="M160 45 L170 45" stroke="#dadce0" stroke-width="1.5"/>
<path d="M240 45 L250 45" stroke="#dadce0" stroke-width="1.5"/>
<rect x="40" y="80" width="100" height="25" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1"/>
<text x="90" y="96" text-anchor="middle" font-size="8" fill="#b05a00">\u00d70.85^hints decay</text>
<rect x="180" y="80" width="100" height="25" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="230" y="96" text-anchor="middle" font-size="8" fill="#137333">\u00d71.05 chaos bonus</text>
<rect x="80" y="120" width="160" height="30" rx="8" fill="#1a73e8"/>
<text x="160" y="139" text-anchor="middle" font-size="10" fill="white" font-weight="500">Final Reward</text>
</svg>`
},
'multi-strategy-grading': {
title: 'Multi-Strategy Grading',
what: 'Five distinct grading strategies, one per tier: command_match checks operation+service pairs, resource_creation verifies resources exist, multi_step validates ordered sequences, advanced adds service coverage, and expert runs state_checks against MiniStack.',
why: 'Each tier tests fundamentally different skills. A single grading strategy would either be too lenient for beginners or miss the nuance needed for expert SRE tasks.',
how: 'TaskGrader.grade() dispatches based on the task\'s grading_strategy field. Each strategy returns a GradeResult with partial_progress (0.0-1.0), completed flag, and details. The grading is deterministic and fully automated.',
metrics: [
{ v: '5', l: 'Grading Strategies' },
{ v: '100%', l: 'Automated' },
{ v: 'Per-tier', l: 'Strategy Selection' }
],
diagram: `<svg viewBox="0 0 320 170" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Grading Strategy Dispatch</text>
<rect x="110" y="25" width="100" height="28" rx="6" fill="#1a73e8"/>
<text x="160" y="43" text-anchor="middle" font-size="9" fill="white" font-weight="500">TaskGrader</text>
<line x1="130" y1="53" x2="40" y2="75" stroke="#dadce0" stroke-width="1"/>
<line x1="145" y1="53" x2="100" y2="75" stroke="#dadce0" stroke-width="1"/>
<line x1="160" y1="53" x2="160" y2="75" stroke="#dadce0" stroke-width="1"/>
<line x1="175" y1="53" x2="220" y2="75" stroke="#dadce0" stroke-width="1"/>
<line x1="190" y1="53" x2="280" y2="75" stroke="#dadce0" stroke-width="1"/>
<rect x="10" y="75" width="60" height="40" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="40" y="92" text-anchor="middle" font-size="7" fill="#137333">command</text>
<text x="40" y="103" text-anchor="middle" font-size="7" fill="#137333">_match</text>
<rect x="80" y="75" width="60" height="40" rx="6" fill="#e8f0fe" stroke="#174ea6" stroke-width="1"/>
<text x="110" y="92" text-anchor="middle" font-size="7" fill="#174ea6">resource</text>
<text x="110" y="103" text-anchor="middle" font-size="7" fill="#174ea6">_creation</text>
<rect x="150" y="75" width="60" height="40" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1"/>
<text x="180" y="92" text-anchor="middle" font-size="7" fill="#b05a00">multi</text>
<text x="180" y="103" text-anchor="middle" font-size="7" fill="#b05a00">_step</text>
<rect x="220" y="75" width="60" height="40" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1"/>
<text x="250" y="92" text-anchor="middle" font-size="7" fill="#c5221f">multi_step</text>
<text x="250" y="103" text-anchor="middle" font-size="7" fill="#c5221f">+services</text>
<rect x="290" y="75" width="25" height="40" rx="6" fill="#f3e8fd" stroke="#7627bb" stroke-width="1"/>
<text x="302" y="98" text-anchor="middle" font-size="7" fill="#7627bb">state</text>
<text x="40" y="128" text-anchor="middle" font-size="7" fill="#5f6368">Warmup</text>
<text x="110" y="128" text-anchor="middle" font-size="7" fill="#5f6368">Beginner</text>
<text x="180" y="128" text-anchor="middle" font-size="7" fill="#5f6368">Intermed.</text>
<text x="250" y="128" text-anchor="middle" font-size="7" fill="#5f6368">Advanced</text>
<text x="302" y="128" text-anchor="middle" font-size="7" fill="#5f6368">Expert</text>
</svg>`
},
'progressive-hints': {
title: 'Progressive Hint System',
what: 'A 3-level hint system where each level reveals progressively more detail: Level 1 names the AWS services, Level 2 describes the operations, Level 3 gives near-complete command structure. Each hint reduces the final reward by \u00d70.85.',
why: 'Creates an information-reward tradeoff. The agent learns to wean off hints over time \u2014 initially relying on them for unfamiliar tasks, then solving independently for maximum reward.',
how: 'Agent issues special command "aws help --task-hint" as its action. 3 hint levels, each more specific:- Hint 1: Which AWS services to use (e.g., "Youll need IAM and Lambda"), Hint 2: Which operations (e.g., "Start with create-role, then put-role-policy"), Hint 3: Near-complete command structure (e.g., "Use: aws iam create-role --role-name ..."). Reward decay: "final_reward *= 0.85 ^ hints_used". Curriculum naturally penalizes hint-dependent agents: lower rewards → slower graduation. Hints auto-generated from `SuccessCriteria` fields (services, steps, operations)',
metrics: [
{ v: '3', l: 'Hint Levels' },
{ v: '\u00d70.85', l: 'Decay Per Hint' },
{ v: '~61%', l: 'Reward with 3 Hints' }
],
diagram: `<svg viewBox="0 0 320 160" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Hint Levels and Reward Decay</text>
<rect x="20" y="35" width="80" height="45" rx="8" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="60" y="52" text-anchor="middle" font-size="9" fill="#137333" font-weight="500">Level 1</text>
<text x="60" y="66" text-anchor="middle" font-size="7" fill="#5f6368">Which services</text>
<text x="60" y="76" text-anchor="middle" font-size="8" fill="#137333">\u00d70.85</text>
<rect x="120" y="35" width="80" height="45" rx="8" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="160" y="52" text-anchor="middle" font-size="9" fill="#b05a00" font-weight="500">Level 2</text>
<text x="160" y="66" text-anchor="middle" font-size="7" fill="#5f6368">Which operations</text>
<text x="160" y="76" text-anchor="middle" font-size="8" fill="#b05a00">\u00d70.72</text>
<rect x="220" y="35" width="80" height="45" rx="8" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="260" y="52" text-anchor="middle" font-size="9" fill="#c5221f" font-weight="500">Level 3</text>
<text x="260" y="66" text-anchor="middle" font-size="7" fill="#5f6368">Full structure</text>
<text x="260" y="76" text-anchor="middle" font-size="8" fill="#c5221f">\u00d70.61</text>
<path d="M100 57 L120 57" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr3)"/>
<path d="M200 57 L220 57" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr3)"/>
<defs><marker id="arr3" markerWidth="6" markerHeight="4" refX="6" refY="2" orient="auto"><path d="M0 0 L6 2 L0 4" fill="#dadce0"/></marker></defs>
<rect x="40" y="100" width="240" height="40" rx="8" fill="#f8f9fa" stroke="#dadce0" stroke-width="1"/>
<text x="160" y="117" text-anchor="middle" font-size="9" fill="#202124">Agent calls: aws help --task-hint</text>
<text x="160" y="132" text-anchor="middle" font-size="8" fill="#5f6368">Reward = base_reward \u00d7 0.85^hints_used</text>
</svg>`
},
'chaos-injection': {
title: 'Chaos Injection Engine',
what: 'Silently mutates AWS resource state mid-episode to test agent resilience. Perturbations are scoped to services the current task uses. If the agent completes despite chaos, it earns a \u00d71.05 bonus.',
why: 'Tests whether the agent can handle unexpected state changes \u2014 a critical SRE skill. Prevents brittle memorization of exact command sequences. Probability scales with tier difficulty.',
how: 'ChaosEngine selects perturbation templates specific to the services in use (S3 policy changes, DynamoDB throughput modifications, Lambda config alterations, etc.). Resource names are extracted from successful commands via regex. Chaos probability: 10% (Intermediate), 20% (Advanced), 30% (Expert).',
metrics: [
{ v: '\u00d71.05', l: 'Chaos Survival Bonus' },
{ v: '10-30%', l: 'Probability by Tier' },
{ v: '5', l: 'Service Templates' }
],
diagram: `<svg viewBox="0 0 320 170" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Chaos Injection Flow</text>
<rect x="110" y="30" width="100" height="28" rx="6" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="160" y="48" text-anchor="middle" font-size="9" fill="#1a73e8">Agent Step</text>
<path d="M160 58 L160 75" stroke="#dadce0" stroke-width="1.5"/>
<rect x="90" y="75" width="140" height="28" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="160" y="93" text-anchor="middle" font-size="9" fill="#b05a00">Chaos Check (10-30%)</text>
<path d="M160 103 L160 115" stroke="#dadce0" stroke-width="1.5"/>
<path d="M90 89 L30 89 L30 120" stroke="#34a853" stroke-width="1" stroke-dasharray="4 2"/>
<rect x="5" y="120" width="60" height="24" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="35" y="136" text-anchor="middle" font-size="8" fill="#137333">No chaos</text>
<rect x="80" y="115" width="160" height="40" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="160" y="132" text-anchor="middle" font-size="8" fill="#c5221f" font-weight="500">Mutate resource state</text>
<text x="160" y="147" text-anchor="middle" font-size="7" fill="#5f6368">S3 policy | DynamoDB throughput | Lambda config</text>
</svg>`
},
'drift-detection': {
title: 'Drift Detection Tasks',
what: '6 expert-tier tasks (24-29) where infrastructure is provisioned correctly, then 2-3 random mutations are applied from a pool. The agent must audit, discover drifted resources, and fix only those \u2014 without knowing which drifted.',
why: 'Randomized per episode, preventing memorization. Tests real SRE audit skills: the agent must reason about desired vs. actual state, not just follow a script.',
how: 'DriftEngine randomly selects 2-3 mutations from a task\'s possible_drifts pool and applies them after setup. Each task defines a desired_state_spec (natural language) and state_checks (ground truth CLI commands). Examples: S3 versioning/encryption drift, DynamoDB throughput changes, SNS subscription modifications.',
metrics: [
{ v: '6', l: 'Drift Tasks' },
{ v: '2-3', l: 'Mutations Per Episode' },
{ v: 'Random', l: 'Selection Per Run' }
],
diagram: `<svg viewBox="0 0 320 170" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Drift Detection Flow</text>
<rect x="20" y="35" width="80" height="30" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="60" y="54" text-anchor="middle" font-size="9" fill="#137333">Setup OK</text>
<path d="M100 50 L120 50" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr4)"/>
<rect x="120" y="35" width="80" height="30" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="160" y="49" text-anchor="middle" font-size="8" fill="#c5221f">Apply 2-3</text>
<text x="160" y="59" text-anchor="middle" font-size="8" fill="#c5221f">random drifts</text>
<path d="M200 50 L220 50" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr4)"/>
<rect x="220" y="35" width="80" height="30" rx="6" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="260" y="54" text-anchor="middle" font-size="9" fill="#1a73e8">Agent audits</text>
<defs><marker id="arr4" markerWidth="6" markerHeight="4" refX="6" refY="2" orient="auto"><path d="M0 0 L6 2 L0 4" fill="#dadce0"/></marker></defs>
<rect x="40" y="85" width="240" height="65" rx="8" fill="#f8f9fa" stroke="#dadce0" stroke-width="1"/>
<text x="160" y="100" text-anchor="middle" font-size="9" fill="#202124" font-weight="500">Possible Drift Pool</text>
<text x="160" y="115" text-anchor="middle" font-size="8" fill="#5f6368">S3 versioning off | Encryption disabled | Lifecycle removed</text>
<text x="160" y="128" text-anchor="middle" font-size="8" fill="#5f6368">DynamoDB throughput changed | SNS subscriptions altered</text>
<text x="160" y="141" text-anchor="middle" font-size="8" fill="#ea4335">Random 2-3 selected each episode</text>
</svg>`
},
'ground-truth': {
title: 'Ground-Truth Verification via MiniStack',
what: 'The grader never trusts agent command output. It independently queries MiniStack (the simulated AWS backend) to verify resource state for 20+ services. Even if the agent crafts fake-looking stdout, the grader checks actual state.',
why: 'Prevents reward hacking through output fabrication. The agent cannot game the system by producing convincing but fake CLI output \u2014 ground truth is always checked server-side.',
how: 'ResourceVerifier has per-service verification methods that query MiniStack directly. For expert tasks, StateCheck assertions run actual AWS CLI commands against MiniStack at grading time, checking either output_contains (substring) or json_path extraction with expected values.',
metrics: [
{ v: '20+', l: 'Verified Services' },
{ v: '100%', l: 'Server-side' },
{ v: '0', l: 'Agent Visibility' }
],
diagram: `<svg viewBox="0 0 320 160" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Ground-Truth Verification</text>
<rect x="20" y="35" width="90" height="35" rx="8" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="65" y="50" text-anchor="middle" font-size="9" fill="#1a73e8">Agent</text>
<text x="65" y="62" text-anchor="middle" font-size="7" fill="#5f6368">runs commands</text>
<rect x="210" y="35" width="90" height="35" rx="8" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="255" y="50" text-anchor="middle" font-size="9" fill="#137333">MiniStack</text>
<text x="255" y="62" text-anchor="middle" font-size="7" fill="#5f6368">actual state</text>
<path d="M110 52 L210 52" stroke="#dadce0" stroke-width="1.5"/>
<rect x="130" y="85" width="120" height="35" rx="8" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="190" y="100" text-anchor="middle" font-size="9" fill="#b05a00">ResourceVerifier</text>
<text x="190" y="112" text-anchor="middle" font-size="7" fill="#5f6368">queries MiniStack directly</text>
<path d="M255 70 L190 85" stroke="#137333" stroke-width="1" stroke-dasharray="4 2"/>
<path d="M65 70 L150 85" stroke="#1a73e8" stroke-width="1" stroke-dasharray="4 2"/>
<text x="160" y="140" text-anchor="middle" font-size="8" fill="#c5221f">Agent output is NEVER trusted \u2014 grader verifies independently</text>
</svg>`
},
'command-allowlisting': {
title: 'Command Allowlisting',
what: 'Only commands starting with "aws" are executed. Any attempt to run shell commands, pipe to other tools, use redirects, or escape the sandbox is rejected with success=False.',
why: 'Prevents the agent from escaping the AWS CLI sandbox. Without this, the agent could potentially execute arbitrary shell commands, access the filesystem, or interfere with the environment.',
how: 'The environment\'s step() method validates the command before execution. Commands not starting with "aws" are immediately rejected. This is tested extensively in TestStepRejection.',
metrics: [
{ v: 'aws *', l: 'Allowed Pattern' },
{ v: '0', l: 'Shell Access' },
{ v: 'Instant', l: 'Rejection Speed' }
],
diagram: `<svg viewBox="0 0 320 130" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Command Validation Gate</text>
<rect x="20" y="35" width="100" height="30" rx="6" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="70" y="54" text-anchor="middle" font-size="9" fill="#1a73e8">Agent command</text>
<path d="M120 50 L150 50" stroke="#dadce0" stroke-width="1.5"/>
<polygon points="160,35 180,50 160,65" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="165" y="53" font-size="8" fill="#b05a00">?</text>
<path d="M180 42 L220 30" stroke="#34a853" stroke-width="1.5"/>
<path d="M180 58 L220 75" stroke="#c5221f" stroke-width="1.5"/>
<rect x="220" y="18" width="85" height="25" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="262" y="35" text-anchor="middle" font-size="8" fill="#137333">aws * \u2192 Execute</text>
<rect x="220" y="65" width="85" height="25" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1"/>
<text x="262" y="82" text-anchor="middle" font-size="8" fill="#c5221f">else \u2192 Reject</text>
<text x="160" y="110" text-anchor="middle" font-size="8" fill="#5f6368">No pipes | No redirects | No shell escape</text>
</svg>`
},
'deduplication': {
title: 'Deduplication',
what: 'EpisodeTracker.has_executed_operation() tracks which (operation, resource) pairs have been credited. Running the same successful command twice does NOT increase partial_progress. Progress can only increase, never re-earn.',
why: 'Prevents the agent from gaming the reward system by repeating the same command to accumulate credit. Each unique operation earns credit exactly once.',
how: 'The credit_operation() method records each (operation, resource) pair. Before granting credit, is_operation_already_credited() checks if this exact pair was already rewarded. The check is deterministic and happens at grading time.',
metrics: [
{ v: '1x', l: 'Credit Per Operation' },
{ v: 'Exact', l: 'Match Type' },
{ v: '(op, res)', l: 'Tracking Granularity' }
],
diagram: `<svg viewBox="0 0 320 130" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Operation Deduplication</text>
<rect x="20" y="35" width="100" height="28" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="70" y="53" text-anchor="middle" font-size="8" fill="#137333">1st: create-bucket OK</text>
<rect x="20" y="73" width="100" height="28" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="70" y="91" text-anchor="middle" font-size="8" fill="#c5221f">2nd: create-bucket X</text>
<path d="M120 49 L160 49" stroke="#34a853" stroke-width="1.5" marker-end="url(#arr5)"/>
<path d="M120 87 L160 87" stroke="#c5221f" stroke-width="1.5"/>
<text x="175" y="91" font-size="8" fill="#c5221f">blocked</text>
<rect x="160" y="35" width="140" height="28" rx="6" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="230" y="53" text-anchor="middle" font-size="8" fill="#1a73e8">+credit \u2192 progress increases</text>
<defs><marker id="arr5" markerWidth="6" markerHeight="4" refX="6" refY="2" orient="auto"><path d="M0 0 L6 2 L0 4" fill="#34a853"/></marker></defs>
<text x="160" y="120" text-anchor="middle" font-size="8" fill="#5f6368">Same (operation, resource) pair \u2192 no additional reward</text>
</svg>`
},
'grader-invisibility': {
title: 'Grader Commands Invisible to Agent',
what: 'The verification commands run by ResourceVerifier are NOT returned in the observation\u2019s command_output. They happen server-side during grading. The agent cannot observe or mimic them.',
why: 'If the agent could see which verification commands the grader runs, it could learn to craft fake outputs that match expected patterns. Keeping grader logic invisible forces the agent to actually perform the task.',
how: 'ResourceVerifier executes AWS CLI commands against MiniStack in a separate execution context. Results are consumed internally by the grading pipeline. The observation returned to the agent only contains output from the agent\u2019s own commands.',
metrics: [
{ v: '0', l: 'Grader Cmds Exposed' },
{ v: 'Server', l: 'Execution Context' },
{ v: '20+', l: 'Hidden Verifications' }
],
diagram: `<svg viewBox="0 0 320 140" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Grader Invisibility</text>
<rect x="20" y="35" width="80" height="40" rx="8" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="60" y="52" text-anchor="middle" font-size="8" fill="#1a73e8">Agent</text>
<text x="60" y="64" text-anchor="middle" font-size="7" fill="#5f6368">sees own output</text>
<rect x="130" y="30" width="3" height="90" rx="1" fill="#c5221f"/>
<text x="131" y="130" text-anchor="middle" font-size="7" fill="#c5221f">visibility wall</text>
<rect x="165" y="35" width="130" height="40" rx="8" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="230" y="52" text-anchor="middle" font-size="8" fill="#b05a00">ResourceVerifier</text>
<text x="230" y="64" text-anchor="middle" font-size="7" fill="#5f6368">hidden AWS CLI checks</text>
<rect x="165" y="85" width="130" height="30" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="230" y="104" text-anchor="middle" font-size="8" fill="#137333">MiniStack ground truth</text>
</svg>`
},
'no-verification-reward': {
title: 'No Reward for Verification-Only Commands',
what: 'If the agent runs a command that matches a state_check command exactly (e.g., aws s3api get-bucket-versioning --bucket app-config-store), it gets no progress credit. Progress is only earned through steps operations (mutating commands), not read-only queries.',
why: 'Prevents the agent from gaming progress by running the same verification commands the grader uses. The agent can run read commands to understand state, but only mutation commands earn progress.',
how: 'During grading, the TaskGrader checks if the agent\u2019s command matches any state_check command. Matching commands are flagged as verification-only and excluded from credit. Only commands matching steps operations (create, put, update, delete) earn partial_progress.',
metrics: [
{ v: '0', l: 'Credit for Reads' },
{ v: 'Mutate', l: 'Rewarded Actions' },
{ v: 'Exact', l: 'Match Detection' }
],
diagram: `<svg viewBox="0 0 320 140" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Verification vs Mutation</text>
<rect x="20" y="35" width="130" height="35" rx="8" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="85" y="50" text-anchor="middle" font-size="8" fill="#c5221f">get-bucket-versioning</text>
<text x="85" y="62" text-anchor="middle" font-size="7" fill="#5f6368">read-only \u2192 0 credit</text>
<rect x="170" y="35" width="130" height="35" rx="8" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="235" y="50" text-anchor="middle" font-size="8" fill="#137333">put-bucket-versioning</text>
<text x="235" y="62" text-anchor="middle" font-size="7" fill="#5f6368">mutation \u2192 +credit</text>
<rect x="60" y="90" width="200" height="30" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1"/>
<text x="160" y="109" text-anchor="middle" font-size="8" fill="#b05a00">state_check matches \u2192 no progress earned</text>
</svg>`
},
'monotonic-progress': {
title: 'Monotonic Progress',
what: 'partial_progress can only increase within an episode. It is clamped to [0.0, 0.99] \u2014 reaching 1.0 requires actual task completion. The agent cannot lose progress, but also cannot re-earn it.',
why: 'Prevents cycling strategies where the agent creates and destroys resources repeatedly. Combined with deduplication, this ensures steady forward progress.',
how: 'In TaskGrader, previous_progress tracks the highest progress seen. New progress is always max(previous, current). Reward is clamped at 0.99 for partial completion, reserving 1.0 exclusively for verified full completion.',
metrics: [
{ v: '0.99', l: 'Max Without Completion' },
{ v: '1.0', l: 'Requires Full Completion' },
{ v: 'max()', l: 'Progress Function' }
],
diagram: `<svg viewBox="0 0 320 130" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Monotonic Progress</text>
<line x1="30" y1="100" x2="290" y2="100" stroke="#dadce0" stroke-width="1"/>
<line x1="30" y1="100" x2="30" y2="30" stroke="#dadce0" stroke-width="1"/>
<line x1="30" y1="35" x2="290" y2="35" stroke="#c5221f" stroke-width="1" stroke-dasharray="4 2"/>
<text x="296" y="38" font-size="7" fill="#c5221f">0.99 cap</text>
<polyline points="50,95 90,80 130,80 170,60 210,45 250,40 270,38" stroke="#34a853" stroke-width="2" fill="none" stroke-linecap="round" stroke-linejoin="round"/>
<polyline points="90,80 130,85" stroke="#c5221f" stroke-width="1" stroke-dasharray="3 2"/>
<text x="130" y="93" font-size="7" fill="#c5221f">can't drop</text>
<text x="160" y="115" text-anchor="middle" font-size="8" fill="#5f6368">Steps \u2192</text>
<text x="20" y="65" font-size="7" fill="#5f6368" transform="rotate(-90 20 65)">Progress</text>
</svg>`
},
'resource-validation': {
title: 'Resource Name Validation',
what: 'For resource_exists checks, the verifier matches the exact resource name, not just any resource of that type. Creating "my-test-bucket-2" doesn\'t satisfy a check for "my-test-bucket".',
why: 'Prevents the agent from creating arbitrarily named resources to game the verification system. Forces precise execution of the task requirements.',
how: 'ResourceVerifier\'s per-service methods (verify_s3_bucket, verify_dynamodb_table, etc.) compare against the exact expected resource name from the task definition. Each of the 20+ supported services has its own verification logic.',
metrics: [
{ v: 'Exact', l: 'Name Matching' },
{ v: '20+', l: 'Verified Services' },
{ v: '0', l: 'Partial Matches' }
],
diagram: `<svg viewBox="0 0 320 120" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Exact Name Validation</text>
<rect x="20" y="35" width="120" height="30" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="80" y="54" text-anchor="middle" font-size="8" fill="#137333">my-test-bucket OK</text>
<rect x="20" y="75" width="120" height="30" rx="6" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="80" y="94" text-anchor="middle" font-size="8" fill="#c5221f">my-test-bucket-2 X</text>
<rect x="180" y="50" width="120" height="30" rx="6" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1"/>
<text x="240" y="69" text-anchor="middle" font-size="8" fill="#1a73e8">Expected: my-test-bucket</text>
<path d="M140 50 L180 65" stroke="#34a853" stroke-width="1.5"/>
<path d="M140 90 L180 75" stroke="#c5221f" stroke-width="1" stroke-dasharray="3 2"/>
</svg>`
},
'state-checks': {
title: 'State Checks Verify Final State',
what: 'For expert SRE tasks, state_checks run actual AWS CLI commands against MiniStack at grading time. The grader verifies the final infrastructure state \u2014 not the commands the agent ran.',
why: 'The agent cannot fake the state. MiniStack is the ground truth. This decouples "what the agent did" from "what was actually achieved", making reward hacking extremely difficult.',
how: 'Each expert task defines state_checks with command + assertion pairs. Assertions support output_contains (substring match on CLI output) and json_path + expected (JSON extraction). The grader runs these checks against the live MiniStack state independently of the agent.',
metrics: [
{ v: 'CLI', l: 'Verification Method' },
{ v: '2', l: 'Assertion Types' },
{ v: 'Live', l: 'State Source' }
],
diagram: `<svg viewBox="0 0 320 150" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">State Check Verification</text>
<rect x="20" y="35" width="120" height="40" rx="8" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="80" y="52" text-anchor="middle" font-size="8" fill="#1a73e8">Agent completes task</text>
<text x="80" y="64" text-anchor="middle" font-size="7" fill="#5f6368">(commands executed)</text>
<path d="M140 55 L170 55" stroke="#dadce0" stroke-width="1.5" marker-end="url(#arr6)"/>
<rect x="170" y="35" width="130" height="40" rx="8" fill="#fef7e0" stroke="#b05a00" stroke-width="1.5"/>
<text x="235" y="52" text-anchor="middle" font-size="8" fill="#b05a00">Grader runs state_checks</text>
<text x="235" y="64" text-anchor="middle" font-size="7" fill="#5f6368">against MiniStack directly</text>
<defs><marker id="arr6" markerWidth="6" markerHeight="4" refX="6" refY="2" orient="auto"><path d="M0 0 L6 2 L0 4" fill="#dadce0"/></marker></defs>
<rect x="40" y="95" width="110" height="35" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="95" y="110" text-anchor="middle" font-size="8" fill="#137333">output_contains</text>
<text x="95" y="122" text-anchor="middle" font-size="7" fill="#5f6368">substring match</text>
<rect x="170" y="95" width="110" height="35" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="225" y="110" text-anchor="middle" font-size="8" fill="#137333">json_path + expected</text>
<text x="225" y="122" text-anchor="middle" font-size="7" fill="#5f6368">JSON extraction</text>
</svg>`
},
's3-lockdown': {
title: 'Public S3 Bucket Lockdown',
what: 'A pre-provisioned S3 bucket "public-assets" has an overly permissive bucket policy granting access to any principal (Principal: *). The agent must read the policy, identify the vulnerability, and replace it with a restrictive policy allowing only a specific IAM role.',
why: 'Tests security reasoning \u2014 the infrastructure is functional but insecure. Unlike SRE tasks where things are broken, here the agent must understand what "correct" security posture looks like and make the right judgment call.',
how: 'Setup creates the bucket with a wide-open policy. State checks verify the new policy denies Principal: * and only allows the app-role principal to perform s3:GetObject.',
metrics: [
{ v: 'S3', l: 'Target Service' },
{ v: 'Policy', l: 'Attack Surface' },
{ v: 'Expert', l: 'Difficulty Tier' }
],
diagram: `<svg viewBox="0 0 320 140" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">S3 Policy Lockdown Flow</text>
<rect x="20" y="35" width="80" height="40" rx="8" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="60" y="52" text-anchor="middle" font-size="8" fill="#c5221f">Principal: *</text>
<text x="60" y="64" text-anchor="middle" font-size="7" fill="#5f6368">open policy</text>
<path d="M100 55 L130 55" stroke="#dadce0" stroke-width="1.5"/>
<rect x="130" y="35" width="60" height="40" rx="8" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="160" y="52" text-anchor="middle" font-size="8" fill="#1a73e8">Agent</text>
<text x="160" y="64" text-anchor="middle" font-size="7" fill="#5f6368">analyzes</text>
<path d="M190 55 L220 55" stroke="#dadce0" stroke-width="1.5"/>
<rect x="220" y="35" width="80" height="40" rx="8" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="260" y="52" text-anchor="middle" font-size="8" fill="#137333">app-role only</text>
<text x="260" y="64" text-anchor="middle" font-size="7" fill="#5f6368">scoped policy</text>
<rect x="80" y="95" width="160" height="30" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1"/>
<text x="160" y="114" text-anchor="middle" font-size="8" fill="#b05a00">State check: no Principal:* in policy</text>
</svg>`
},
'iam-least-privilege': {
title: 'IAM Least Privilege',
what: 'An IAM role "app-role" has an inline policy with Action: * and Resource: * \u2014 full admin access. The agent must replace it with a least-privilege policy allowing only dynamodb:GetItem and dynamodb:PutItem on the users table.',
why: 'IAM misconfiguration is the #1 cloud security risk. This task tests whether the agent understands permission scoping and can reason about what access an application actually needs vs. what it currently has.',
how: 'Setup creates the role with a wildcard policy. The agent must craft a replacement policy document with specific actions and resource ARN. State checks verify the policy document matches the expected least-privilege permissions.',
metrics: [
{ v: 'IAM', l: 'Target Service' },
{ v: '2', l: 'Allowed Actions' },
{ v: 'Expert', l: 'Difficulty Tier' }
],
diagram: `<svg viewBox="0 0 320 140" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">IAM Policy Scoping</text>
<rect x="20" y="35" width="120" height="40" rx="8" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="80" y="52" text-anchor="middle" font-size="8" fill="#c5221f">Action: *, Resource: *</text>
<text x="80" y="64" text-anchor="middle" font-size="7" fill="#5f6368">overprivileged</text>
<path d="M140 55 L170 55" stroke="#dadce0" stroke-width="1.5"/>
<text x="155" y="48" text-anchor="middle" font-size="7" fill="#1a73e8">\u2192</text>
<rect x="170" y="30" width="130" height="50" rx="8" fill="#e6f4ea" stroke="#137333" stroke-width="1.5"/>
<text x="235" y="48" text-anchor="middle" font-size="8" fill="#137333">dynamodb:GetItem</text>
<text x="235" y="60" text-anchor="middle" font-size="8" fill="#137333">dynamodb:PutItem</text>
<text x="235" y="74" text-anchor="middle" font-size="7" fill="#5f6368">on users table only</text>
<rect x="60" y="100" width="200" height="25" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1"/>
<text x="160" y="116" text-anchor="middle" font-size="8" fill="#b05a00">State check: policy matches expected JSON</text>
</svg>`
},
'secrets-rotation': {
title: 'Secrets in Lambda Environment',
what: 'A Lambda function "data-processor" has a database password stored as a plaintext environment variable (DB_PASSWORD=hunter2). The agent must create a secret in Secrets Manager, update the Lambda to reference the secret ARN, and remove the plaintext variable.',
why: 'Plaintext secrets in environment variables is a critical security anti-pattern. This task combines multiple services (Lambda + Secrets Manager) and tests the agent\'s ability to perform a safe credential rotation without breaking the function.',
how: 'Setup creates the Lambda with the plaintext env var. The agent must: (1) create a secret in Secrets Manager, (2) add SECRET_ARN env var to Lambda, (3) remove DB_PASSWORD. State checks verify all three conditions.',
metrics: [
{ v: '2', l: 'Services Involved' },
{ v: '3', l: 'Required Steps' },
{ v: 'Expert', l: 'Difficulty Tier' }
],
diagram: `<svg viewBox="0 0 320 150" fill="none" xmlns="http://www.w3.org/2000/svg">
<text x="160" y="15" text-anchor="middle" font-size="10" fill="#202124" font-weight="500">Secret Rotation Flow</text>
<rect x="20" y="35" width="90" height="35" rx="8" fill="#fce8e6" stroke="#c5221f" stroke-width="1.5"/>
<text x="65" y="52" text-anchor="middle" font-size="8" fill="#c5221f">DB_PASSWORD</text>
<text x="65" y="63" text-anchor="middle" font-size="7" fill="#5f6368">plaintext env</text>
<path d="M110 52 L135 52" stroke="#dadce0" stroke-width="1.5"/>
<rect x="135" y="30" width="60" height="45" rx="8" fill="#e8f0fe" stroke="#1a73e8" stroke-width="1.5"/>
<text x="165" y="50" text-anchor="middle" font-size="8" fill="#1a73e8">Agent</text>
<text x="165" y="65" text-anchor="middle" font-size="7" fill="#5f6368">rotates</text>
<path d="M195 42 L225 35" stroke="#dadce0" stroke-width="1.5"/>
<path d="M195 62 L225 70" stroke="#dadce0" stroke-width="1.5"/>
<rect x="225" y="20" width="80" height="30" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="265" y="39" text-anchor="middle" font-size="7" fill="#137333">Secrets Manager</text>
<rect x="225" y="60" width="80" height="30" rx="6" fill="#e6f4ea" stroke="#137333" stroke-width="1"/>
<text x="265" y="79" text-anchor="middle" font-size="7" fill="#137333">SECRET_ARN env</text>
<rect x="60" y="110" width="200" height="25" rx="6" fill="#fef7e0" stroke="#b05a00" stroke-width="1"/>
<text x="160" y="126" text-anchor="middle" font-size="8" fill="#b05a00">State: no DB_PASSWORD, secret exists, ARN set</text>
</svg>`
}
};
// Modal logic
/**
 * Populate the feature modal from the featureDetails entry stored under `id`
 * and display it. Ids with no matching entry are ignored.
 *
 * @param {string} id - Key into featureDetails.
 */
function openFeatureModal(id) {
    const detail = featureDetails[id];
    if (!detail) return;

    const byId = (elementId) => document.getElementById(elementId);

    // Descriptive sections are plain text, so they are assigned via textContent.
    byId('modal-title').textContent = detail.title;
    byId('modal-what').textContent = detail.what;
    byId('modal-why').textContent = detail.why;
    byId('modal-how').textContent = detail.how;

    // The diagram is a markup string and therefore has to be parsed as HTML.
    byId('modal-diagram').innerHTML = detail.diagram;

    // Drop the cards left over from the previously shown feature, then add one
    // card per metric (v = displayed value, l = label).
    const metricsBox = byId('modal-metrics');
    metricsBox.innerHTML = '';
    for (const { v, l } of detail.metrics) {
        const tile = document.createElement('div');
        tile.className = 'perf-card';
        tile.innerHTML = `<span class="perf-val">${v}</span><span class="perf-label">${l}</span>`;
        metricsBox.appendChild(tile);
    }

    // Show the modal and stop the page behind it from scrolling.
    byId('feature-modal').classList.add('open');
    document.body.style.overflow = 'hidden';
}
// Feature modal dismissal: close button, a click on the modal element itself,
// or the Escape key. Scoped in an IIFE so the helper does not leak into the
// global namespace.
(function () {
    const modal = document.getElementById('feature-modal');
    // Return quietly instead of throwing, so a page without the modal does not
    // abort the remaining setup code in this script.
    if (!modal) return;

    // Single close path: hide the modal and release the body scroll lock that
    // openFeatureModal applies when it opens the modal.
    function closeFeatureModal() {
        modal.classList.remove('open');
        document.body.style.overflow = '';
    }

    const closeBtn = document.getElementById('close-modal-btn');
    if (closeBtn) closeBtn.addEventListener('click', closeFeatureModal);

    // Dismiss only when the click target is the #feature-modal element itself;
    // clicks on its descendants bubble up here with a different target.
    modal.addEventListener('click', (e) => {
        if (e.target === modal) closeFeatureModal();
    });

    // React to Escape only while the modal is open, so the key press does not
    // reset body overflow when this modal is not the one holding the lock.
    document.addEventListener('keydown', (e) => {
        if (e.key === 'Escape' && modal.classList.contains('open')) closeFeatureModal();
    });
})();
// Feature chips: a click opens the modal for the id held in the chip's
// data-feature-id attribute.
for (const chipEl of document.querySelectorAll('.feature-chip[data-feature-id]')) {
    chipEl.addEventListener('click', function () {
        const featureId = chipEl.getAttribute('data-feature-id');
        openFeatureModal(featureId);
    });
}
// Figure lightbox: clicking a figure card that carries a data-fig attribute
// shows that image enlarged in the #figure-lightbox overlay.
(() => {
    const overlay = document.getElementById('figure-lightbox');
    const zoomedImg = document.getElementById('figure-lightbox-img');
    // Pages without the lightbox markup simply skip this feature.
    if (overlay === null || zoomedImg === null) return;

    // Hide the overlay and give page scrolling back.
    const closeLightbox = () => {
        overlay.classList.remove('open');
        document.body.style.overflow = '';
    };

    // Load the card's image into the overlay and show it.
    const showFigure = (card) => {
        const figureSrc = card.getAttribute('data-fig');
        if (!figureSrc) return;
        zoomedImg.src = figureSrc;

        // Reuse the card's bold caption text as the alt text when there is one.
        const captionStrong = card.querySelector('.figure-caption strong');
        if (captionStrong) {
            zoomedImg.alt = captionStrong.textContent;
        } else {
            zoomedImg.alt = '';
        }

        overlay.classList.add('open');
        document.body.style.overflow = 'hidden';
    };

    for (const card of document.querySelectorAll('.figure-card[data-fig]')) {
        card.addEventListener('click', () => {
            showFigure(card);
        });
    }

    // A click anywhere on the overlay dismisses it.
    overlay.addEventListener('click', closeLightbox);

    // Escape dismisses the overlay, but only while it is actually showing.
    document.addEventListener('keydown', (evt) => {
        if (evt.key !== 'Escape') return;
        if (overlay.classList.contains('open')) closeLightbox();
    });
})();
</script>
</body>
</html>