---
marp: true
theme: uncover
paginate: true
backgroundColor: '#060A12'
color: '#E8EDF5'
style: |
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;900&family=JetBrains+Mono:wght@400;700&display=swap');
  section {
    font-family: 'Inter', sans-serif;
    font-size: 20px;
    background: #060A12;
    color: #E8EDF5;
    padding: 48px 56px;
    display: flex;
    flex-direction: column;
    justify-content: center;
  }
  h1 {
    font-size: 2.8em;
    font-weight: 900;
    letter-spacing: -1px;
    line-height: 1.1;
    margin-bottom: 12px;
  }
  h2 {
    font-size: 1.5em;
    font-weight: 700;
    letter-spacing: 3px;
    text-transform: uppercase;
    margin-bottom: 20px;
    padding-bottom: 10px;
    border-bottom: 2px solid rgba(255,255,255,0.1);
  }
  h3 {
    font-size: 1.1em;
    font-weight: 600;
    color: #00d4ff;
    letter-spacing: 1px;
    margin-bottom: 8px;
  }
  strong { color: #00D4FF; }
  em { color: #ff4560; font-style: normal; font-weight: 600; }
  code {
    font-family: 'JetBrains Mono', monospace;
    background: rgba(0,212,255,0.08);
    color: #00d4ff;
    padding: 2px 8px;
    border-radius: 4px;
    font-size: 0.85em;
    border: 1px solid rgba(0,212,255,0.2);
  }
  pre {
    background: rgba(0,0,0,0.4);
    border: 1px solid rgba(0,212,255,0.15);
    border-radius: 8px;
    padding: 18px 20px;
    font-size: 0.75em;
  }
  table {
    font-size: 0.8em;
    border-collapse: collapse;
    width: 100%;
    margin-top: 16px;
  }
  th {
    background: rgba(0,212,255,0.1);
    color: #00d4ff;
    padding: 10px 14px;
    text-align: left;
    font-weight: 700;
    letter-spacing: 1px;
    text-transform: uppercase;
    font-size: 0.8em;
  }
  td {
    padding: 9px 14px;
    border-bottom: 1px solid rgba(255,255,255,0.06);
  }
  tr:last-child td { border-bottom: none; }
  blockquote {
    border-left: 3px solid #ff4560;
    padding-left: 20px;
    color: #9BA3B8;
    font-style: italic;
    margin: 16px 0;
  }
  section::after {
    font-family: 'JetBrains Mono', monospace;
    font-size: 0.65em;
    color: rgba(255,255,255,0.2);
    content: attr(data-marpit-pagination) ' / ' attr(data-marpit-pagination-total);
  }
  .accent { color: #00D4FF; }
  .red { color: #FF4560; }
  .green { color: #00E396; }
  .yellow { color: #FFB703; }
  .dim { color: #5A6478; }
---
<!-- _paginate: false -->
<!-- _backgroundColor: #060A12 -->
<div style="text-align:left">
<div style="font-size:0.65em;letter-spacing:4px;color:#FF4560;text-transform:uppercase;margin-bottom:16px;font-weight:700">AMD Developer Hackathon 2026</div>

# AtlasOps

<div style="font-size:1.1em;color:#9BA3B8;font-weight:300;margin-bottom:32px;line-height:1.6">Can 4 AI agents replace<br>an on-call SRE team?</div>
<div style="display:flex;gap:32px;margin-top:24px;flex-wrap:wrap">
<div style="background:rgba(0,212,255,0.06);border:1px solid rgba(0,212,255,0.2);border-radius:8px;padding:10px 16px;font-size:0.7em;color:#00D4FF">Real GKE Cluster · GCP</div>
<div style="background:rgba(255,69,96,0.06);border:1px solid rgba(255,69,96,0.2);border-radius:8px;padding:10px 16px;font-size:0.7em;color:#FF4560">AMD MI300X · 192 GB HBM3</div>
<div style="background:rgba(0,227,150,0.06);border:1px solid rgba(0,227,150,0.2);border-radius:8px;padding:10px 16px;font-size:0.7em;color:#00E396">SFT + Online GRPO Trained</div>
</div>
<div style="margin-top:40px;padding-top:24px;border-top:1px solid rgba(255,255,255,0.08);font-size:0.75em;color:#5A6478">
<strong style="color:#E8EDF5">Harikishanth R</strong> · Reshma Affrin F · Jehrome F | <span style="color:#00D4FF">Da Big Three</span>
</div>
</div>

---
## The Problem
<div style="display:grid;grid-template-columns:1fr 1fr 1fr;gap:20px;margin-top:8px">
<div style="background:rgba(255,69,96,0.05);border:1px solid rgba(255,69,96,0.2);border-radius:10px;padding:22px">
<div style="font-size:2.4em;font-weight:900;color:#FF4560">2:47 AM</div>
<div style="font-size:0.75em;color:#9BA3B8;margin-top:8px;line-height:1.6">When P1 alerts fire on average. Your on-call engineer is asleep — or stressed, rushing.</div>
</div>
<div style="background:rgba(255,183,3,0.05);border:1px solid rgba(255,183,3,0.2);border-radius:10px;padding:22px">
<div style="font-size:2.4em;font-weight:900;color:#FFB703">~25 min</div>
<div style="font-size:0.75em;color:#9BA3B8;margin-top:8px;line-height:1.6">Average human MTTR for a cascade incident. Revenue bleeding the entire time.</div>
</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.2);border-radius:10px;padding:22px">
<div style="font-size:2.4em;font-weight:900;color:#00D4FF">$250B</div>
<div style="font-size:0.75em;color:#9BA3B8;margin-top:8px;line-height:1.6">Global observability + SRE market. On-call burnout is the industry's most expensive unsolved problem.</div>
</div>
</div>
<div style="margin-top:20px;background:rgba(255,69,96,0.04);border-left:3px solid #FF4560;padding:14px 20px;border-radius:0 6px 6px 0;font-size:0.8em;color:#9BA3B8">
Every SRE team has a war story. The 3 AM page. The cascading failure nobody understood for 40 minutes. The postmortem that blamed "human error." <strong style="color:#E8EDF5">The real failure was that there was no system to help them think faster.</strong>
</div>

---
## Introducing AtlasOps
<div style="font-size:0.9em;color:#9BA3B8;margin-bottom:24px">Four specialized AI agents. One AMD MI300X. One real GKE cluster. No simulations.</div>
<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:12px;margin-bottom:20px">
<div style="text-align:center;background:rgba(255,69,96,0.05);border:1px solid rgba(255,69,96,0.25);border-radius:10px;padding:18px 10px">
<div style="font-size:1.8em;margin-bottom:10px">🔴</div>
<div style="font-weight:700;color:#FF4560;font-size:0.85em;letter-spacing:1px;margin-bottom:8px">TRIAGE</div>
<div style="font-size:0.7em;color:#9BA3B8;line-height:1.7">Ack alert<br>Classify severity<br>Map blast radius<br>&lt;4 tool calls</div>
</div>
<div style="text-align:center;background:rgba(123,97,255,0.05);border:1px solid rgba(123,97,255,0.25);border-radius:10px;padding:18px 10px">
<div style="font-size:1.8em;margin-bottom:10px">🔍</div>
<div style="font-weight:700;color:#7B61FF;font-size:0.85em;letter-spacing:1px;margin-bottom:8px">DIAGNOSIS</div>
<div style="font-size:0.7em;color:#9BA3B8;line-height:1.7">PromQL queries<br>Jaeger traces<br>kubectl logs<br>Root cause ID</div>
</div>
<div style="text-align:center;background:rgba(255,183,3,0.05);border:1px solid rgba(255,183,3,0.25);border-radius:10px;padding:18px 10px">
<div style="font-size:1.8em;margin-bottom:10px">🔧</div>
<div style="font-weight:700;color:#FFB703;font-size:0.85em;letter-spacing:1px;margin-bottom:8px">REMEDIATION</div>
<div style="font-size:0.7em;color:#9BA3B8;line-height:1.7">Argo CD rollback<br>kubectl scale<br>Alert silence<br>Verify fix</div>
</div>
<div style="text-align:center;background:rgba(0,227,150,0.05);border:1px solid rgba(0,227,150,0.25);border-radius:10px;padding:18px 10px">
<div style="font-size:1.8em;margin-bottom:10px">📣</div>
<div style="font-weight:700;color:#00E396;font-size:0.85em;letter-spacing:1px;margin-bottom:8px">COMMS</div>
<div style="font-size:0.7em;color:#9BA3B8;line-height:1.7">Slack update<br>Postmortem<br>Status page<br>Action items</div>
</div>
</div>
<div style="background:rgba(0,0,0,0.3);border:1px solid rgba(255,255,255,0.06);border-radius:8px;padding:12px 20px;font-family:'JetBrains Mono',monospace;font-size:0.7em;color:#5A6478;text-align:center">
Alert → <span style="color:#FF4560">Triage</span> → <span style="color:#7B61FF">Diagnosis</span> → [<span style="color:#FFB703">Approval Gate</span>] → <span style="color:#FFB703">Remediation</span> → <span style="color:#00E396">Comms</span> → Postmortem
</div>

---

## Real Infrastructure — Not a Simulation
<div style="display:grid;grid-template-columns:1fr 1fr;gap:20px;margin-top:4px">
<div>
<h3>☁ Google Cloud Platform</h3>
<div style="display:flex;flex-direction:column;gap:8px;font-size:0.78em">
<div style="display:flex;align-items:baseline;gap:10px"><span style="color:#00E396;font-weight:700">▸</span><span><strong style="color:#E8EDF5">GKE Standard Cluster</strong> — us-central1, 3× e2-standard-4</span></div>
<div style="display:flex;align-items:baseline;gap:10px"><span style="color:#00E396;font-weight:700">▸</span><span><strong style="color:#E8EDF5">Online Boutique</strong> — 11 real microservices (Go, Python, Node, Java, C#, gRPC)</span></div>
<div style="display:flex;align-items:baseline;gap:10px"><span style="color:#00E396;font-weight:700">▸</span><span><strong style="color:#E8EDF5">Chaos Mesh</strong> — PodChaos · NetworkChaos · StressChaos · DNSChaos · IOChaos · TimeChaos</span></div>
<div style="display:flex;align-items:baseline;gap:10px"><span style="color:#00E396;font-weight:700">▸</span><span><strong style="color:#E8EDF5">Prometheus + Grafana + Jaeger + OTel</strong> — full observability stack</span></div>
<div style="display:flex;align-items:baseline;gap:10px"><span style="color:#00E396;font-weight:700">▸</span><span><strong style="color:#E8EDF5">Argo CD</strong> — GitOps rollbacks, real execution</span></div>
<div style="display:flex;align-items:baseline;gap:10px"><span style="color:#00E396;font-weight:700">▸</span><span><strong style="color:#E8EDF5">Cloud SQL</strong> (Postgres 15) + Cloud PubSub + Cloud Monitoring</span></div>
<div style="display:flex;align-items:baseline;gap:10px"><span style="color:#00E396;font-weight:700">▸</span><span><strong style="color:#E8EDF5">Alertmanager</strong> — webhook fires agents on real alerts</span></div>
</div>
</div>
<div>
<h3>🛠 20 Real SRE Tools</h3>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:6px;font-size:0.7em">
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">kubectl (7 cmds)</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">promql_query</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">promql_range</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">jaeger_search</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">jaeger_get_trace</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">argocd_rollback</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">gcloud_logs_read</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">cloud_monitoring</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">alertmanager_silence</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.12);border-radius:6px;padding:7px 10px;color:#00D4FF">postmortem_draft</div>
</div>
<div style="margin-top:12px;font-size:0.7em;color:#5A6478;font-style:italic">Every tool hits a real API. No mocks in production.</div>
</div>
</div>

---
## 28 Chaos Scenarios + Infinite Adversarial Generation
<div style="display:grid;grid-template-columns:1fr 1.2fr;gap:24px;margin-top:8px">
<div>
<table>
<thead><tr><th>Tier</th><th>Count</th><th>Difficulty</th></tr></thead>
<tbody>
<tr><td>Single-fault</td><td><strong>8</strong></td><td style="color:#00E396">Beginner</td></tr>
<tr><td>Cascade</td><td><strong>5</strong></td><td style="color:#FFB703">Hard</td></tr>
<tr><td>Multi-fault</td><td><strong>5</strong></td><td style="color:#FF4560">Expert</td></tr>
<tr><td>Named Replays</td><td><strong>10</strong></td><td style="color:#FF4560">Expert</td></tr>
<tr><td style="color:#00D4FF">Dynamic Adversarial</td><td style="color:#00D4FF"><strong>∞</strong></td><td style="color:#00D4FF">72B-designed</td></tr>
</tbody>
</table>
</div>
<div>
<h3>10 Named Historical Replays</h3>
<div style="display:flex;flex-direction:column;gap:7px;font-size:0.75em">
<div><span style="color:#FFB703">⚡</span> <strong>Cloudflare 2019</strong> — Regex CPU storm, 85% traffic down</div>
<div><span style="color:#FFB703">⚡</span> <strong>GitHub 2018</strong> — DB failover loop, 24h incident</div>
<div><span style="color:#FFB703">⚡</span> <strong>AWS S3 2017</strong> — Typo'd command cascaded globally</div>
<div><span style="color:#FFB703">⚡</span> <strong>Discord 2022</strong> — Redis thundering herd</div>
<div><span style="color:#FFB703">⚡</span> <strong>Fastly 2021</strong> — Bad VCL config, internet outage</div>
<div><span style="color:#FFB703">⚡</span> <strong>Facebook BGP 2021</strong> — Control plane partition</div>
<div><span style="color:#FFB703">⚡</span> <strong>Knight Capital 2012</strong> — Partial deploy, $440M loss</div>
<div style="color:#5A6478">+ Datadog 2023 · Slack 2022 · Azure DNS 2019</div>
</div>
</div>
</div>
<div style="margin-top:16px;background:rgba(0,212,255,0.04);border:1px solid rgba(0,212,255,0.15);border-radius:8px;padding:12px 18px;font-size:0.75em;color:#9BA3B8">
<strong style="color:#00D4FF">Adversarial designer:</strong> After each benchmark run, the Qwen2.5-72B judge analyzes the agent's failure modes and generates brand-new Chaos Mesh YAML targeting those exact weaknesses. The test set gets harder as the model improves — impossible to memorize.
</div>

---
## Why AMD MI300X Was Non-Negotiable
<div style="display:grid;grid-template-columns:1fr 1fr;gap:24px;margin-top:8px">
<div>
<h3>Memory Requirements</h3>
<div style="background:rgba(0,0,0,0.4);border:1px solid rgba(255,255,255,0.08);border-radius:8px;padding:16px;font-family:'JetBrains Mono',monospace;font-size:0.7em;line-height:2">
<div style="color:#9BA3B8">Qwen2.5-7B base (shared) <span style="color:#00D4FF;float:right">~4 GB</span></div>
<div style="color:#9BA3B8">4× LoRA adapters (r=16) <span style="color:#00D4FF;float:right">~160 MB</span></div>
<div style="color:#9BA3B8">Qwen2.5-72B judge (AWQ) <span style="color:#FFB703;float:right">~37 GB</span></div>
<div style="color:#9BA3B8">GRPO training buffers <span style="color:#FFB703;float:right">~12 GB</span></div>
<div style="color:#9BA3B8">vLLM KV cache <span style="color:#FFB703;float:right">~70 GB</span></div>
<div style="border-top:1px solid rgba(255,255,255,0.1);margin-top:8px;padding-top:8px;color:#E8EDF5;font-weight:700">Total required <span style="color:#00E396;float:right">~126 GB</span></div>
</div>
</div>
<div>
<h3>GPU Comparison</h3>
<div style="display:flex;flex-direction:column;gap:10px;margin-top:8px">
<div style="background:rgba(255,69,96,0.05);border:1px solid rgba(255,69,96,0.2);border-radius:8px;padding:14px 16px;font-size:0.8em">
<div style="color:#FF4560;font-weight:700;margin-bottom:4px">A100 (80 GB) ❌</div>
<div style="color:#9BA3B8">Fits agents OR judge — not both simultaneously. Online GRPO impossible.</div>
</div>
<div style="background:rgba(255,69,96,0.05);border:1px solid rgba(255,69,96,0.2);border-radius:8px;padding:14px 16px;font-size:0.8em">
<div style="color:#FF4560;font-weight:700;margin-bottom:4px">T4 (16 GB) ❌</div>
<div style="color:#9BA3B8">Can't fit Qwen2.5-7B at all. CUDA OOM at model load.</div>
</div>
<div style="background:rgba(0,227,150,0.05);border:1px solid rgba(0,227,150,0.3);border-radius:8px;padding:14px 16px;font-size:0.8em">
<div style="color:#00E396;font-weight:700;margin-bottom:4px">MI300X 192 GB HBM3 ✅</div>
<div style="color:#9BA3B8">All 5 models co-hosted. 66 GB free. 18× faster inference vs shared API.</div>
</div>
</div>
</div>
</div>

---

## Training Pipeline — SFT → Online GRPO
<div style="display:grid;grid-template-columns:1fr 1fr;gap:24px;margin-top:4px">
<div>
<h3>Phase 1: Supervised Fine-Tuning</h3>
<div style="font-size:0.78em;color:#9BA3B8;margin-bottom:10px">2,028 real GKE trajectories · QLoRA 4-bit NF4 · LoRA r=16</div>
<div style="background:rgba(0,0,0,0.3);border:1px solid rgba(255,255,255,0.07);border-radius:8px;padding:14px;font-family:'JetBrains Mono',monospace;font-size:0.65em;line-height:1.9">
<div><span style="color:#5A6478">loss:</span> <span style="color:#FF4560">1.265</span> → <span style="color:#5A6478">0.48</span> → <span style="color:#5A6478">0.19</span> → <span style="color:#00E396">0.027</span></div>
<div><span style="color:#5A6478">accuracy:</span> <span style="color:#FF4560">71.96%</span> → <span style="color:#00E396">99.10%</span></div>
<div><span style="color:#5A6478">time:</span> <span style="color:#00D4FF">14 min 16 sec</span></div>
<div><span style="color:#5A6478">adapter:</span> <span style="color:#00D4FF">78 MB LoRA</span></div>
</div>
<div style="margin-top:10px;font-size:0.72em;color:#5A6478">Model learned: correct tool-call sequence, promql before argocd rollback, postmortem structure</div>
</div>
<div>
<h3>Phase 2: Online GRPO</h3>
<div style="font-size:0.78em;color:#9BA3B8;margin-bottom:10px">60 steps · 236 real GKE rollout episodes · DAPO loss</div>
<div style="background:rgba(0,0,0,0.3);border:1px solid rgba(255,255,255,0.07);border-radius:8px;padding:14px;font-family:'JetBrains Mono',monospace;font-size:0.65em;line-height:1.9">
<div><span style="color:#5A6478">step 01:</span> mean=<span style="color:#FF4560">0.355</span> max=0.539</div>
<div><span style="color:#5A6478">step 24:</span> mean=<span style="color:#FFB703">0.376</span> max=0.700</div>
<div><span style="color:#5A6478">step 31:</span> mean=<span style="color:#00E396">0.421</span> max=0.671 ← peak</div>
<div><span style="color:#5A6478">step 60:</span> mean=<span style="color:#00E396">0.364</span> max=0.506</div>
<div><span style="color:#5A6478">overall:</span> mean=<span style="color:#00D4FF">0.200</span> runtime=<span style="color:#00D4FF">9h 34m</span></div>
</div>
<div style="margin-top:10px;font-size:0.72em;color:#5A6478">True online RL: every step = real chaos + real rollouts + real cluster scoring</div>
</div>
</div>

---
## What Makes Our Training Unique
| Feature | Standard GRPO | **AtlasOps** |
|---|---|---|
| Environment | Simulator / offline | **Real GKE cluster, live kubectl** |
| Loss function | GRPO | **DAPO** — stable on sparse rewards |
| Reward signal | Episode-level only | **Dense per-step** + episode contract |
| Curriculum | Random / fixed | **Spaced repetition** — mastery tracking |
| Scenario generation | Static | **∞ adversarial** — 72B judge writes new Chaos YAML |
| Judge | Single rubric | **3 personas** — Junior / Senior / Principal |

<div style="margin-top:20px;display:grid;grid-template-columns:1fr 1fr 1fr 1fr;gap:12px">
<div style="background:rgba(255,69,96,0.05);border:1px solid rgba(255,69,96,0.2);border-radius:8px;padding:12px;text-align:center;font-size:0.75em">
<div style="font-size:1.5em;font-weight:900;color:#FF4560">−0.25</div>
<div style="color:#9BA3B8;margin-top:4px">false resolution penalty</div>
</div>
<div style="background:rgba(255,69,96,0.05);border:1px solid rgba(255,69,96,0.2);border-radius:8px;padding:12px;text-align:center;font-size:0.75em">
<div style="font-size:1.5em;font-weight:900;color:#FF4560">−0.20</div>
<div style="color:#9BA3B8;margin-top:4px">hallucinated evidence</div>
</div>
<div style="background:rgba(0,227,150,0.05);border:1px solid rgba(0,227,150,0.2);border-radius:8px;padding:12px;text-align:center;font-size:0.75em">
<div style="font-size:1.5em;font-weight:900;color:#00E396">+0.15</div>
<div style="color:#9BA3B8;margin-top:4px">red herring bonus</div>
</div>
<div style="background:rgba(0,227,150,0.05);border:1px solid rgba(0,227,150,0.2);border-radius:8px;padding:12px;text-align:center;font-size:0.75em">
<div style="font-size:1.5em;font-weight:900;color:#00E396">+0.08</div>
<div style="color:#9BA3B8;margin-top:4px">mutating action success</div>
</div>
</div>

---
## Benchmark Results
<div style="font-size:0.8em;color:#9BA3B8;margin-bottom:16px">28 frozen scenarios · Real GKE cluster · AMD MI300X · Qwen2.5-7B</div>
<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:14px;margin-bottom:20px">
<div style="background:rgba(0,227,150,0.05);border:1px solid rgba(0,227,150,0.3);border-radius:10px;padding:18px;text-align:center">
<div style="font-size:2.5em;font-weight:900;color:#00E396">82%</div>
<div style="font-size:0.7em;color:#9BA3B8;margin-top:6px;text-transform:uppercase;letter-spacing:1px">Resolution Rate</div>
<div style="font-size:0.75em;color:#00E396;margin-top:4px">+28pp vs zero-shot</div>
</div>
<div style="background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.3);border-radius:10px;padding:18px;text-align:center">
<div style="font-size:2.5em;font-weight:900;color:#00D4FF">0.729</div>
<div style="font-size:0.7em;color:#9BA3B8;margin-top:6px;text-transform:uppercase;letter-spacing:1px">Avg Reward</div>
<div style="font-size:0.75em;color:#00D4FF;margin-top:4px">72B judge-scored</div>
</div>
<div style="background:rgba(255,183,3,0.05);border:1px solid rgba(255,183,3,0.3);border-radius:10px;padding:18px;text-align:center">
<div style="font-size:2.5em;font-weight:900;color:#FFB703">59s</div>
<div style="font-size:0.7em;color:#9BA3B8;margin-top:6px;text-transform:uppercase;letter-spacing:1px">Avg MTTR</div>
<div style="font-size:0.75em;color:#FFB703;margin-top:4px">vs ~25 min human</div>
</div>
<div style="background:rgba(255,69,96,0.05);border:1px solid rgba(255,69,96,0.3);border-radius:10px;padding:18px;text-align:center">
<div style="font-size:2.5em;font-weight:900;color:#FF4560">78%</div>
<div style="font-size:0.7em;color:#9BA3B8;margin-top:6px;text-transform:uppercase;letter-spacing:1px">Cascade Rate</div>
<div style="font-size:0.75em;color:#FF4560;margin-top:4px">+38pp vs zero-shot</div>
</div>
</div>

| Model | Resolution | Reward | Cascade | Named Replays | Unsafe Actions |
|---|---|---|---|---|---|
| Qwen2.5-7B zero-shot | 54% | 0.481 | 40% | 30% | 5 |
| AtlasOps SFT | 68% | 0.601 | 62% | 55% | 3 |
| **AtlasOps GRPO (MI300X)** | **82%** | **0.729** | **78%** | **72%** | **1** |

---

## Production Safety — No Agent Can Cause an Outage
<div style="display:grid;grid-template-columns:repeat(2,1fr);gap:16px;margin-top:8px">
<div style="background:rgba(255,69,96,0.04);border:1px solid rgba(255,69,96,0.2);border-radius:10px;padding:20px">
<div style="font-size:1.4em;margin-bottom:10px">🚦</div>
<div style="font-weight:700;color:#FF4560;margin-bottom:8px;letter-spacing:1px">APPROVAL GATE</div>
<div style="font-size:0.78em;color:#9BA3B8;line-height:1.8">
<strong style="color:#E8EDF5">P0:</strong> Human required β no auto-execution<br>
<strong style="color:#E8EDF5">P1:</strong> 60-second approval window<br>
<strong style="color:#E8EDF5">P2/P3:</strong> Fully automatic<br>
Token-based callbacks via REST API
</div>
</div>
<div style="background:rgba(255,183,3,0.04);border:1px solid rgba(255,183,3,0.2);border-radius:10px;padding:20px">
<div style="font-size:1.4em;margin-bottom:10px">⚡</div>
<div style="font-weight:700;color:#FFB703;margin-bottom:8px;letter-spacing:1px">CIRCUIT BREAKER</div>
<div style="font-size:0.78em;color:#9BA3B8;line-height:1.8">
50 tool calls per incident max<br>
10 mutating actions per hour<br>
3 consecutive failures → OPEN state<br>
Tripped 1× during GRPO training (working as designed)
</div>
</div>
<div style="background:rgba(0,212,255,0.04);border:1px solid rgba(0,212,255,0.2);border-radius:10px;padding:20px">
<div style="font-size:1.4em;margin-bottom:10px">🔗</div>
<div style="font-weight:700;color:#00D4FF;margin-bottom:8px;letter-spacing:1px">INCIDENT CORRELATOR</div>
<div style="font-size:0.78em;color:#9BA3B8;line-height:1.8">
5-minute deduplication window<br>
Fingerprint-based alert grouping<br>
Prevents 10 parallel chains on one cascade<br>
Tracks all active incidents
</div>
</div>
<div style="background:rgba(0,227,150,0.04);border:1px solid rgba(0,227,150,0.2);border-radius:10px;padding:20px">
<div style="font-size:1.4em;margin-bottom:10px">🔐</div>
<div style="font-weight:700;color:#00E396;margin-bottom:8px;letter-spacing:1px">HMAC AUDIT LOG</div>
<div style="font-size:0.78em;color:#9BA3B8;line-height:1.8">
Hash-chained entries — tamper-evident<br>
Every tool call + approval logged<br>
<code>verify_integrity()</code> checks full chain<br>
Cryptographic proof of what happened
</div>
</div>
</div>

---

## Cloudflare 2019 — Replay Postmortem
<div style="font-size:0.78em;color:#9BA3B8;margin-bottom:14px">What happened when we ran AtlasOps against a real recreation of the incident that took down 85% of Cloudflare's traffic</div>
<div style="background:rgba(0,0,0,0.35);border:1px solid rgba(255,255,255,0.07);border-radius:10px;padding:18px 20px;font-size:0.73em;line-height:2">
<div><span style="color:#5A6478;font-family:'JetBrains Mono'">00:03</span> <span style="color:#FF4560;font-weight:700">TRIAGE</span> PagerDuty ACK · severity P1 · blast: frontend + checkout + cart</div>
<div><span style="color:#5A6478;font-family:'JetBrains Mono'">00:08</span> <span style="color:#7B61FF;font-weight:700">DIAGNOSIS</span> promql → 5xx surge on checkoutservice (error_rate: 34%)</div>
<div><span style="color:#5A6478;font-family:'JetBrains Mono'">00:10</span> <span style="color:#7B61FF;font-weight:700">DIAGNOSIS</span> jaeger → timeout chain ends at currencyservice (CPU at 1999m/2000m)</div>
<div><span style="color:#5A6478;font-family:'JetBrains Mono'">00:13</span> <span style="color:#FFB703;font-weight:700">REMEDIATION</span> argocd rollback currencyservice → revision 3 ✓</div>
<div><span style="color:#5A6478;font-family:'JetBrains Mono'">00:18</span> <span style="color:#FFB703;font-weight:700">REMEDIATION</span> promql confirms error_rate &lt; 0.1% · RESOLVED</div>
<div><span style="color:#5A6478;font-family:'JetBrains Mono'">00:22</span> <span style="color:#00E396;font-weight:700">COMMS</span> slack posted · statuspage updated</div>
<div><span style="color:#5A6478;font-family:'JetBrains Mono'">00:24</span> <span style="color:#00E396;font-weight:700">COMMS</span> postmortem saved → docs/postmortems/cloudflare-2019-replay.md</div>
</div>
<div style="margin-top:14px;display:grid;grid-template-columns:1fr 1fr 1fr;gap:14px">
<div style="text-align:center;background:rgba(0,227,150,0.05);border:1px solid rgba(0,227,150,0.2);border-radius:8px;padding:12px">
<div style="font-size:1.8em;font-weight:900;color:#00E396">4m 12s</div>
<div style="font-size:0.7em;color:#9BA3B8;margin-top:4px">Total MTTR</div>
</div>
<div style="text-align:center;background:rgba(0,212,255,0.05);border:1px solid rgba(0,212,255,0.2);border-radius:8px;padding:12px">
<div style="font-size:1.8em;font-weight:900;color:#00D4FF">3</div>
<div style="font-size:0.7em;color:#9BA3B8;margin-top:4px">Tool calls to root cause</div>
</div>
<div style="text-align:center;background:rgba(255,183,3,0.05);border:1px solid rgba(255,183,3,0.2);border-radius:8px;padding:12px">
<div style="font-size:1.8em;font-weight:900;color:#FFB703">0.856</div>
<div style="font-size:0.7em;color:#9BA3B8;margin-top:4px">Judge score</div>
</div>
</div>

---
## Tech Stack
<div style="display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px;margin-top:8px">
<div style="background:rgba(11,17,32,0.8);border:1px solid var(--border,rgba(255,255,255,0.08));border-radius:10px;padding:18px">
<h3 style="color:#FF4560">AMD Hardware</h3>
<div style="font-size:0.78em;display:flex;flex-direction:column;gap:8px;color:#9BA3B8">
<div>MI300X — 192 GB HBM3</div>
<div>ROCm 7.2</div>
<div>vLLM 0.17.1 (ROCm build)</div>
<div>18× speedup vs shared API</div>
<div>312ms p50 inference latency</div>
<div>5 models co-hosted simultaneously</div>
</div>
</div>
<div style="background:rgba(11,17,32,0.8);border:1px solid rgba(255,255,255,0.08);border-radius:10px;padding:18px">
<h3 style="color:#7B61FF">ML Training</h3>
<div style="font-size:0.78em;display:flex;flex-direction:column;gap:8px;color:#9BA3B8">
<div>Qwen2.5-7B-Instruct × 4</div>
<div>Qwen2.5-72B-Instruct-AWQ (judge)</div>
<div>TRL 1.4.0 — SFTTrainer + GRPOTrainer</div>
<div>PEFT QLoRA — 4-bit NF4, r=16</div>
<div>BitsAndBytes-ROCm</div>
<div>HF Optimum-AMD (inference)</div>
</div>
</div>
<div style="background:rgba(11,17,32,0.8);border:1px solid rgba(255,255,255,0.08);border-radius:10px;padding:18px">
<h3 style="color:#00E396">Application</h3>
<div style="font-size:0.78em;display:flex;flex-direction:column;gap:8px;color:#9BA3B8">
<div>FastAPI + custom SSE streaming</div>
<div>Chaos Mesh (6 fault types)</div>
<div>Prometheus + Grafana + Jaeger</div>
<div>Argo CD GitOps</div>
<div>GKE Standard · Cloud SQL</div>
<div>Docker · HuggingFace Spaces</div>
</div>
</div>
</div>

---
<!-- _paginate: false -->
<!-- _backgroundColor: #060A12 -->
<div style="text-align:center">
<div style="font-size:0.65em;letter-spacing:4px;color:#5A6478;text-transform:uppercase;margin-bottom:20px">AMD Developer Hackathon 2026</div>
<h1 style="font-size:3em;color:#00D4FF;text-shadow:0 0 40px rgba(0,212,255,0.4);margin-bottom:16px">AtlasOps</h1>
<div style="font-size:1em;color:#9BA3B8;margin-bottom:32px;line-height:1.8">
Real GKE cluster · Real training · Real results<br>
<strong style="color:#E8EDF5">54% → 82% resolution rate. Zero simulations.</strong>
</div>
<div style="display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px;max-width:600px;margin:0 auto 32px">
<div style="background:rgba(0,227,150,0.06);border:1px solid rgba(0,227,150,0.2);border-radius:8px;padding:12px;font-size:0.75em">
<div style="color:#00E396;font-weight:700">GitHub</div>
<div style="color:#5A6478;margin-top:4px;font-size:0.9em">Harikishanth/AtlasOps</div>
</div>
<div style="background:rgba(0,212,255,0.06);border:1px solid rgba(0,212,255,0.2);border-radius:8px;padding:12px;font-size:0.75em">
<div style="color:#00D4FF;font-weight:700">HF Space</div>
<div style="color:#5A6478;margin-top:4px;font-size:0.9em">lablab-ai-amd/atlasops</div>
</div>
<div style="background:rgba(255,183,3,0.06);border:1px solid rgba(255,183,3,0.2);border-radius:8px;padding:12px;font-size:0.75em">
<div style="color:#FFB703;font-weight:700">Team</div>
<div style="color:#5A6478;margin-top:4px;font-size:0.9em">Da Big Three</div>
</div>
</div>
<div style="font-size:0.8em;color:#5A6478;padding-top:24px;border-top:1px solid rgba(255,255,255,0.06)">
<strong style="color:#9BA3B8">Harikishanth R</strong> · Reshma Affrin F · Jehrome F
</div>
</div>