Spaces:
Running on T4
Running on T4
| """ | |
| HF Spaces Gradio App β Training results & architecture overview | |
| for the Nested RL Environments system. | |
| """ | |
| import gradio as gr | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import matplotlib.ticker as ticker | |
| import numpy as np | |
| # ββ Training episode data (from Supabase export) ββ | |
| # Episodes 0β75, sorted by step β episode β created_at | |
| EPISODE_REWARDS = [ | |
| 39.7, -20.1, -14.2, 96.6, -60.3, -60.1, 35.8, 96.6, -60.3, -5.1, | |
| 35.8, 41.6, 39.7, 19.9, 90.8, 41.6, 94.7, -60.1, -124.2, 41.6, | |
| 46.7, -56.3, 50.6, 48.6, 101.7, -56.3, 50.6, 48.6, 46.7, -1.3, | |
| 50.6, -11.4, 46.7, -56.3, 50.6, 103.6, 46.7, -56.3, 5.6, -121.4, | |
| -13.9, 41.3, 100.3, -52.3, 46.1, 96.3, 45.3, -142.3, 91.1, 96.3, | |
| 100.3, -52.3, 46.1, 96.3, 0.3, -52.3, -13.9, 41.3, 45.3, -12.3, | |
| 33.8, 54.8, 91.0, -64.3, 33.8, 54.8, 36.0, 90.7, 33.8, 109.8, | |
| 36.0, -64.3, 33.8, 54.8, 91.0, -19.3, | |
| ] | |
| def compute_rolling_avg(rewards, window=35): | |
| """Compute rolling average with given window size.""" | |
| avgs = [] | |
| for i in range(len(rewards)): | |
| start = max(0, i - window + 1) | |
| w = rewards[start : i + 1] | |
| avgs.append(sum(w) / len(w)) | |
| return avgs | |
| def create_reward_chart(): | |
| """Generate the reward trend chart as a matplotlib figure.""" | |
| episodes = list(range(len(EPISODE_REWARDS))) | |
| rolling = compute_rolling_avg(EPISODE_REWARDS, window=35) | |
| # ββ Dark theme ββ | |
| bg = "#0a0e17" | |
| card = "#111827" | |
| border = "#1e293b" | |
| cyan = "#22d3ee" | |
| cyan_dim = (34 / 255, 211 / 255, 238 / 255, 0.15) | |
| text = "#e2e8f0" | |
| muted = "#475569" | |
| fig, ax = plt.subplots(figsize=(14, 6), facecolor=bg) | |
| ax.set_facecolor(card) | |
| # Subtle border | |
| for spine in ax.spines.values(): | |
| spine.set_color(border) | |
| spine.set_linewidth(0.8) | |
| # Rolling average line + fill | |
| ax.plot(episodes, rolling, color=cyan, linewidth=2.8, zorder=3) | |
| ax.fill_between( | |
| episodes, rolling, alpha=0.15, color=cyan, zorder=2, | |
| ) | |
| # Zero reference line | |
| ax.axhline(y=0, color=border, linewidth=0.8, linestyle="--", zorder=1) | |
| # Axis styling | |
| ax.set_xlabel("Episode", color=muted, fontsize=11, fontfamily="monospace") | |
| ax.set_ylabel("Reward", color=muted, fontsize=11, fontfamily="monospace") | |
| ax.tick_params(colors=muted, labelsize=9) | |
| ax.xaxis.set_major_locator(ticker.MultipleLocator(10)) | |
| ax.grid(False) | |
| # Title | |
| ax.set_title( | |
| "Reward Trend Β· Episodes 0β75 Β· 35-ep Rolling Avg", | |
| color=text, | |
| fontsize=14, | |
| fontfamily="monospace", | |
| fontweight="bold", | |
| pad=16, | |
| ) | |
| fig.tight_layout(pad=1.5) | |
| return fig | |
| # ββ Build the Gradio app ββ | |
| _theme = gr.themes.Base( | |
| primary_hue="cyan", | |
| neutral_hue="slate", | |
| ) | |
| _css = """ | |
| .gradio-container { background: #0a0e17 !important; } | |
| .main-header { text-align: center; margin-bottom: 8px; } | |
| .main-header h1 { color: #e2e8f0; font-family: monospace; } | |
| .main-header p { color: #64748b; font-family: monospace; font-size: 14px; } | |
| .section-label { color: #94a3b8 !important; font-family: monospace !important; } | |
| """ | |
| with gr.Blocks( | |
| title="Nested RL Environments β AI Oversight", | |
| ) as demo: | |
| # Header | |
| gr.HTML(""" | |
| <div class="main-header"> | |
| <h1>Nested RL Environments</h1> | |
| <p>Self-Improving Oversight for AI Customer Support Β· Team Ludes Magnus</p> | |
| </div> | |
| """) | |
| # ββ Tab layout ββ | |
| with gr.Tabs(): | |
| # Tab 1: Architecture (default) | |
| with gr.Tab("Architecture"): | |
| gr.Image( | |
| value="assets/architecture.png", | |
| label="3-Layer Architecture", | |
| show_label=False, | |
| ) | |
| gr.Markdown(""" | |
| --- | |
| ## Prize Targets | |
| - **Main Track β Statement 4:** Layer 0 generates reward functions β new domain = new RL environment automatically | |
| - **Fleet AI $10k:** Layer 1 provides scalable oversight β add intents, retrain | |
| - **Halluminate $10k:** Layer 2 is a multi-actor environment with 100 diverse adversarial customers | |
| """) | |
| # Tab 2: Training Results | |
| with gr.Tab("Training Results"): | |
| gr.Markdown( | |
| "### Reward Trend β GRPO Prompt Optimization", | |
| elem_classes=["section-label"], | |
| ) | |
| reward_plot = gr.Plot(value=create_reward_chart(), label="Reward Trend") | |
| gr.Markdown( | |
| """ | |
| <div style="color: #64748b; font-family: monospace; font-size: 12px; text-align: right; margin-top: -8px;"> | |
| Run 20260308_135709 Β· 300 total episodes Β· All data authentic | |
| </div> | |
| """, | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch(server_name="0.0.0.0", server_port=7860, theme=_theme, css=_css) | |