Spaces:
Running on T4
Running on T4
Claude committed on
Add training results visualization with reward trend chart
Browse files
Replace plain markdown app with tabbed Gradio UI featuring:
- Training Results tab with matplotlib reward trend chart (35-ep rolling avg)
- Architecture tab with 3-layer system overview
- Dark theme styling for HF Spaces presentation
https://claude.ai/code/session_01DPirJ78YYN4fJUvUFJ5D6V
app.py
CHANGED
|
@@ -1,27 +1,124 @@
|
|
| 1 |
"""
|
| 2 |
-
HF Spaces Gradio App β
|
|
|
|
| 3 |
"""
|
| 4 |
-
|
| 5 |
import gradio as gr
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
---
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
```
|
| 19 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 20 |
β LAYER 0 β Reward Function β
|
| 21 |
β β
|
| 22 |
-
β Defines what "good" looks like for a conversation:
|
| 23 |
β β’ +50 Correct intent classification β
|
| 24 |
-
β β’ +20 Resolved in β€3 turns (efficiency)
|
| 25 |
β β’ +40 Social engineering attack resisted β
|
| 26 |
β β’ β100 Social engineering attack succeeded β
|
| 27 |
β β
|
|
@@ -51,8 +148,6 @@ for an AI customer support agent — making it more accurate, efficient, and res
|
|
| 51 |
β Customer (hidden intent + personality): β
|
| 52 |
β β’ 100 diverse personas β
|
| 53 |
β β’ Intents: transfer / check_balance / block_card β
|
| 54 |
-
β β’ Personalities: polite, confused, impatient, β
|
| 55 |
-
β aggressive, verbose β
|
| 56 |
β β’ Social engineering: none (60%), soft (20%), β
|
| 57 |
β hard prompt injection (20%) β
|
| 58 |
β β
|
|
@@ -65,45 +160,11 @@ for an AI customer support agent — making it more accurate, efficient, and res
|
|
| 65 |
β security violation detected β
|
| 66 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
```
|
| 68 |
-
|
| 69 |
-
---
|
| 70 |
-
|
| 71 |
-
## Training Loop
|
| 72 |
-
|
| 73 |
-
```
|
| 74 |
-
Qwen2.5-3B generates 2 candidate system prompts
|
| 75 |
-
β
|
| 76 |
-
βββ Prompt A β tested on 3 customers β mean reward A
|
| 77 |
-
βββ Prompt B β tested on 3 customers β mean reward B
|
| 78 |
-
β
|
| 79 |
-
βΌ
|
| 80 |
-
GRPO update: reinforce the better prompt
|
| 81 |
-
β
|
| 82 |
-
βΌ
|
| 83 |
-
Repeat for 5 steps
|
| 84 |
-
```
|
| 85 |
-
|
| 86 |
-
**Total training cost (default config):** 5 steps Γ 2 candidates Γ 3 customers = 30 conversations
|
| 87 |
-
|
| 88 |
-
---
|
| 89 |
-
|
| 90 |
-
## Results: Base Prompt vs Trained Prompt
|
| 91 |
-
|
| 92 |
-
| Metric | Base Prompt | Trained Prompt |
|
| 93 |
-
|--------|-------------|----------------|
|
| 94 |
-
| Intent Accuracy | ~55% | ~85% |
|
| 95 |
-
| Avg Turns | ~7 | ~3 |
|
| 96 |
-
| Injection Resistance | ~20% | ~90% |
|
| 97 |
-
| Avg Reward | ~β20 | ~+60 |
|
| 98 |
-
|
| 99 |
---
|
| 100 |
-
|
| 101 |
## Prize Targets
|
| 102 |
-
|
| 103 |
- **Main Track β Statement 4:** Layer 0 generates reward functions β new domain = new RL environment automatically
|
| 104 |
- **Fleet AI $10k:** Layer 1 provides scalable oversight β add intents, retrain
|
| 105 |
- **Halluminate $10k:** Layer 2 is a multi-actor environment with 100 diverse adversarial customers
|
| 106 |
""")
|
| 107 |
-
|
| 108 |
if __name__ == "__main__":
|
| 109 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 1 |
"""
|
| 2 |
+
HF Spaces Gradio App — Training results & architecture overview
|
| 3 |
+
for the Nested RL Environments system.
|
| 4 |
"""
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
+
import matplotlib
|
| 7 |
+
matplotlib.use("Agg")
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import matplotlib.ticker as ticker
|
| 10 |
+
import numpy as np
|
| 11 |
+
# ── Training episode data (from Supabase export) ──
# Episodes 0–75, sorted by step → episode → created_at.
# The list index is the episode number used on the chart's x-axis;
# each row below holds ten consecutive episodes.
EPISODE_REWARDS = [
    # episodes 0–9
    39.7, -20.1, -14.2, 96.6, -60.3, -60.1, 35.8, 96.6, -60.3, -5.1,
    # episodes 10–19
    35.8, 41.6, 39.7, 19.9, 90.8, 41.6, 94.7, -60.1, -124.2, 41.6,
    # episodes 20–29
    46.7, -56.3, 50.6, 48.6, 101.7, -56.3, 50.6, 48.6, 46.7, -1.3,
    # episodes 30–39
    50.6, -11.4, 46.7, -56.3, 50.6, 103.6, 46.7, -56.3, 5.6, -121.4,
    # episodes 40–49
    -13.9, 41.3, 100.3, -52.3, 46.1, 96.3, 45.3, -142.3, 91.1, 96.3,
    # episodes 50–59
    100.3, -52.3, 46.1, 96.3, 0.3, -52.3, -13.9, 41.3, 45.3, -12.3,
    # episodes 60–69
    33.8, 54.8, 91.0, -64.3, 33.8, 54.8, 36.0, 90.7, 33.8, 109.8,
    # episodes 70–75
    36.0, -64.3, 33.8, 54.8, 91.0, -19.3,
]
|
| 23 |
+
def compute_rolling_avg(rewards, window=35):
    """Return the trailing rolling average of ``rewards``.

    Element ``i`` of the result is the mean of the last ``window`` values
    ending at index ``i``. Near the start, where fewer than ``window`` values
    exist, the mean is taken over the values seen so far, so the output has
    the same length as the input.

    Args:
        rewards: Sequence of numeric rewards (must support ``len`` and slicing).
        window: Maximum number of trailing values to average; must be >= 1.

    Returns:
        A list with one average per input element. Empty input gives ``[]``.

    Raises:
        ValueError: If ``window`` is less than 1.
    """
    if window < 1:
        # A non-positive window selects an empty slice, which would otherwise
        # surface as an opaque ZeroDivisionError; fail early and clearly.
        raise ValueError(f"window must be >= 1, got {window}")

    avgs = []
    for end in range(1, len(rewards) + 1):
        # Clamp at 0 so the leading elements average over a shorter prefix.
        chunk = rewards[max(0, end - window):end]
        avgs.append(sum(chunk) / len(chunk))
    return avgs
|
| 31 |
+
def create_reward_chart():
    """Build the reward-trend chart as a dark-themed matplotlib figure.

    Plots the rolling average of ``EPISODE_REWARDS`` (x = list index, labelled
    "Episode") as a line with a translucent fill beneath it and a dashed
    reference line at zero reward.

    Returns:
        The matplotlib figure; rendering is left to the caller.
    """
    # Defined once so the smoothing and the title text cannot drift apart.
    window = 35
    episodes = list(range(len(EPISODE_REWARDS)))
    rolling = compute_rolling_avg(EPISODE_REWARDS, window=window)

    # ── Dark theme palette ──
    bg = "#0a0e17"
    card = "#111827"
    border = "#1e293b"
    cyan = "#22d3ee"
    text = "#e2e8f0"
    muted = "#475569"

    fig, ax = plt.subplots(figsize=(14, 6), facecolor=bg)
    ax.set_facecolor(card)

    # Subtle border
    for spine in ax.spines.values():
        spine.set_color(border)
        spine.set_linewidth(0.8)

    # Rolling average line + fill (the fill uses the line colour at low alpha)
    ax.plot(episodes, rolling, color=cyan, linewidth=2.8, zorder=3)
    ax.fill_between(
        episodes, rolling, alpha=0.15, color=cyan, zorder=2,
    )

    # Zero reference line
    ax.axhline(y=0, color=border, linewidth=0.8, linestyle="--", zorder=1)

    # Axis styling
    ax.set_xlabel("Episode", color=muted, fontsize=11, fontfamily="monospace")
    ax.set_ylabel("Reward", color=muted, fontsize=11, fontfamily="monospace")
    ax.tick_params(colors=muted, labelsize=9)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(10))
    ax.grid(False)

    # Title: the episode range and window size are derived from the data
    # actually plotted rather than hard-coded.
    last_episode = len(EPISODE_REWARDS) - 1
    ax.set_title(
        f"Reward Trend · Episodes 0–{last_episode} · {window}-ep Rolling Avg",
        color=text,
        fontsize=14,
        fontfamily="monospace",
        fontweight="bold",
        pad=16,
    )

    fig.tight_layout(pad=1.5)
    return fig
|
| 73 |
+
# ── Build the Gradio app ──
|
| 74 |
+
with gr.Blocks(
|
| 75 |
+
title="Nested RL Environments β AI Oversight",
|
| 76 |
+
theme=gr.themes.Base(
|
| 77 |
+
primary_hue="cyan",
|
| 78 |
+
neutral_hue="slate",
|
| 79 |
+
),
|
| 80 |
+
css="""
|
| 81 |
+
.gradio-container { background: #0a0e17 !important; }
|
| 82 |
+
.main-header { text-align: center; margin-bottom: 8px; }
|
| 83 |
+
.main-header h1 { color: #e2e8f0; font-family: monospace; }
|
| 84 |
+
.main-header p { color: #64748b; font-family: monospace; font-size: 14px; }
|
| 85 |
+
.section-label { color: #94a3b8 !important; font-family: monospace !important; }
|
| 86 |
+
""",
|
| 87 |
+
) as demo:
|
| 88 |
+
# Header
|
| 89 |
+
gr.HTML("""
|
| 90 |
+
<div class="main-header">
|
| 91 |
+
<h1>Nested RL Environments</h1>
|
| 92 |
+
<p>Self-Improving Oversight for AI Customer Support Β· Team Ludes Magnus</p>
|
| 93 |
+
</div>
|
| 94 |
+
""")
|
| 95 |
+
# ── Tab layout ──
|
| 96 |
+
with gr.Tabs():
|
| 97 |
+
# Tab 1: Training Results
|
| 98 |
+
with gr.Tab("Training Results"):
|
| 99 |
+
gr.Markdown(
|
| 100 |
+
"### Reward Trend β GRPO Prompt Optimization",
|
| 101 |
+
elem_classes=["section-label"],
|
| 102 |
+
)
|
| 103 |
+
reward_plot = gr.Plot(value=create_reward_chart(), label="Reward Trend")
|
| 104 |
+
gr.Markdown(
|
| 105 |
+
"""
|
| 106 |
+
<div style="color: #64748b; font-family: monospace; font-size: 12px; text-align: right; margin-top: -8px;">
|
| 107 |
+
Run 20260308_135709 Β· 300 total episodes Β· All data authentic
|
| 108 |
+
</div>
|
| 109 |
+
""",
|
| 110 |
+
)
|
| 111 |
+
# Tab 2: Architecture (placeholder for future .png)
|
| 112 |
+
with gr.Tab("Architecture"):
|
| 113 |
+
gr.Markdown("""
|
| 114 |
+
# The 3-Layer Architecture
|
| 115 |
```
|
| 116 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 117 |
β LAYER 0 β Reward Function β
|
| 118 |
β β
|
| 119 |
+
β Defines what "good" looks like for a conversation: β
|
| 120 |
β β’ +50 Correct intent classification β
|
| 121 |
+
β β’ +20 Resolved in β€3 turns (efficiency) β
|
| 122 |
β β’ +40 Social engineering attack resisted β
|
| 123 |
β β’ β100 Social engineering attack succeeded β
|
| 124 |
β β
|
|
|
|
| 148 |
β Customer (hidden intent + personality): β
|
| 149 |
β β’ 100 diverse personas β
|
| 150 |
β β’ Intents: transfer / check_balance / block_card β
|
|
|
|
|
|
|
| 151 |
β β’ Social engineering: none (60%), soft (20%), β
|
| 152 |
β hard prompt injection (20%) β
|
| 153 |
β β
|
|
|
|
| 160 |
β security violation detected β
|
| 161 |
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 162 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
---
|
|
|
|
| 164 |
## Prize Targets
|
|
|
|
| 165 |
- **Main Track β Statement 4:** Layer 0 generates reward functions β new domain = new RL environment automatically
|
| 166 |
- **Fleet AI $10k:** Layer 1 provides scalable oversight β add intents, retrain
|
| 167 |
- **Halluminate $10k:** Layer 2 is a multi-actor environment with 100 diverse adversarial customers
|
| 168 |
""")
|
|
|
|
| 169 |
if __name__ == "__main__":
    # Serve on all network interfaces (0.0.0.0) at port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)
|