# Openenv / app.py
# saravanatanjiro's picture
# Fix Gradio runtime error by moving theme to gr.Blocks
# a593df9
"""
Cloud Arena — RL Training on HF Spaces
Two SEPARATE models:
1. Mathematical Model (MaskablePPO + MLP) — tab "Math RL"
2. LLM Model (LLaMA 3.1 8B + REINFORCE + LoRA) — tab "LLM RL"
"""
import os
import gradio as gr
import numpy as np
# Ensure output directories exist before any training/visualization writes to them.
os.makedirs("./models", exist_ok=True)
os.makedirs("./outputs", exist_ok=True)
# ── Mathematical Model Training ──────────────────────────────────────────────
def run_math_training(timesteps):
    """Train the mathematical (MaskablePPO) model and render its dashboard.

    Args:
        timesteps: Requested total training timesteps (any int-convertible
            value from the Gradio Number input); clamped to a minimum of
            5000 to avoid sampling errors.

    Returns:
        Tuple of (summary text, dashboard image path), or
        (error message, None) if anything raises.
    """
    from cloud_arena.training import train_model
    try:
        # Convert once and reuse (the original converted int(timesteps) twice).
        requested = int(timesteps)
        if requested < 5000:
            print(f"⚠️ Timesteps too low ({requested}), using minimum 5000")
        ts = max(requested, 5000)  # minimum 5000 to avoid sampling errors
        model, callback, _ = train_model(total_timesteps=ts)
        from cloud_arena.visualization import generate_dashboard
        img_path = generate_dashboard(callback, "outputs/dashboard.png")
        summary = (
            f"✅ Math Model Training Complete\n"
            f"Episodes: {len(callback.episode_rewards)}\n"
            f"Final Phase: {callback.current_level}\n"
            f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
            f"Avg Savings: {np.mean(callback.episode_savings):.1f}%"
        )
        return summary, img_path
    except Exception as e:
        return f"❌ Error: {e}", None
def run_math_evaluation():
    """Evaluate the trained math model and return a formatted metrics string."""
    from cloud_arena.evaluation import evaluate_model
    try:
        results = evaluate_model()
        # Average each tracked metric across evaluation episodes.
        metric_keys = ("win", "cost_score", "security_score", "savings_pct")
        means = {key: np.mean(results[key]) for key in metric_keys}
        win_rate = means["win"] * 100
        return (
            f"Win Rate: {win_rate:.1f}%\nCost Score: {means['cost_score']:.3f}\n"
            f"Security: {means['security_score']:.3f}\nSavings: {means['savings_pct']:.1f}%"
        )
    except Exception as e:
        return f"❌ Error: {e}"
# ── LLM Model Training ───────────────────────────────────────────────────────
def run_llm_training(
    model_name,
    num_iterations,
    steps_per_episode,
    group_size,
    clip_epsilon,
    kl_coef,
    entropy_coef,
    max_gen_tokens,
    temperature,
):
    """Run GRPO + LoRA training for the given LLM(s).

    All numeric arguments arrive from Gradio Number inputs and are coerced
    to int/float before being forwarded to the training routine.

    Returns:
        Tuple of (summary-with-log text, results graph path), or
        (error message with traceback, None) on failure.
    """
    from cloud_arena.llm_training import train_llm
    from cloud_arena.llm_environment import OpenEnvAdapter
    try:
        # Explicitly initialize the OpenEnv adapter contract used by training runtime.
        adapter = OpenEnvAdapter()
        _ = adapter.get_state()
        rewards, full_log, graph_path, log_text = train_llm(
            model_name=model_name,
            num_iterations=int(num_iterations),
            steps_per_episode=int(steps_per_episode),
            group_size=int(group_size),
            clip_epsilon=float(clip_epsilon),
            kl_coef=float(kl_coef),
            entropy_coef=float(entropy_coef),
            max_gen_tokens=int(max_gen_tokens),
            temperature=float(temperature),
        )
        # Reward shift from the first (pre-training) to last (post-training) eval.
        reward_shift = rewards[-1] - rewards[0]
        summary = (
            f"✅ LLM GRPO Training Complete\n"
            f"Model: {model_name}\n"
            f"Algorithm: Custom GRPO\n"
            f"Pre-training reward: {rewards[0]:+.3f}\n"
            f"Post-training reward: {rewards[-1]:+.3f}\n"
            f"Δ Change: {reward_shift:+.3f}\n\n"
            f"─── Full Log ───\n{log_text}"
        )
        return summary, graph_path
    except Exception as e:
        import traceback
        return f"❌ Error: {e}\n{traceback.format_exc()}", None
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Build the two-tab Gradio UI: one tab per RL system.
with gr.Blocks(title="Cloud Arena RL", theme=gr.themes.Base()) as demo:
    gr.Markdown("# ☁️ Cloud Arena — RL Training Space")
    gr.Markdown("Two separate RL systems: **Mathematical Model** (MaskablePPO) and **LLM Model** (LLaMA + LoRA)")
    # Tab 1: MaskablePPO math model — train + evaluate.
    with gr.Tab("🧮 Math RL"):
        gr.Markdown("### Mathematical Model — MaskablePPO (MLP Neural Network)")
        ts_input = gr.Number(value=500000, label="Total Timesteps")
        train_btn = gr.Button("🚀 Start Math Training", variant="primary")
        math_output = gr.Textbox(label="Status", lines=6)
        math_img = gr.Image(label="Dashboard")
        # run_math_training returns (summary text, dashboard image path).
        train_btn.click(run_math_training, inputs=ts_input, outputs=[math_output, math_img])
        gr.Markdown("---")
        eval_btn = gr.Button("📊 Evaluate Math Model")
        eval_output = gr.Textbox(label="Eval Results", lines=6)
        eval_btn.click(run_math_evaluation, outputs=eval_output)
    # Tab 2: LLM GRPO training — hyperparameters map 1:1 onto
    # run_llm_training's parameters (order matters in the click() inputs list).
    with gr.Tab("🧠 LLM RL"):
        gr.Markdown("### Multi-Model RL Benchmark — Custom GRPO + LoRA")
        gr.Markdown("> Comma-separate model names to benchmark multiple models sequentially")
        llm_model = gr.Textbox(
            value="unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit, unsloth/gemma-2b-it-bnb-4bit, unsloth/llama-3-8b-Instruct-bnb-4bit",
            label="Model(s) — comma-separated for multi-model benchmark"
        )
        llm_iters = gr.Number(value=200, label="Training Iterations per Model")
        llm_steps = gr.Number(value=15, label="Steps per Episode")
        grpo_group = gr.Number(value=8, label="GRPO Group Size (K)")
        grpo_clip = gr.Number(value=0.2, label="GRPO Clip Epsilon")
        grpo_kl = gr.Number(value=0.01, label="KL Coefficient")
        grpo_entropy = gr.Number(value=0.001, label="Entropy Coefficient")
        grpo_tokens = gr.Number(value=60, label="Max Generation Tokens")
        grpo_temp = gr.Number(value=0.7, label="Sampling Temperature")
        llm_btn = gr.Button("🚀 Start LLM Training", variant="primary")
        llm_output = gr.Textbox(label="Training Log", lines=15)
        llm_img = gr.Image(label="Results")
        # Inputs are positional: they must stay in run_llm_training's parameter order.
        llm_btn.click(
            run_llm_training,
            inputs=[
                llm_model,
                llm_iters,
                llm_steps,
                grpo_group,
                grpo_clip,
                grpo_kl,
                grpo_entropy,
                grpo_tokens,
                grpo_temp,
            ],
            outputs=[llm_output, llm_img],
        )
if __name__ == "__main__":
    # Bind on all interfaces at port 7860 — the port HF Spaces routes to.
    demo.launch(server_name="0.0.0.0", server_port=7860)