"""
Cloud Arena — RL Training on HF Spaces
Two SEPARATE models:
1. Mathematical Model (MaskablePPO + MLP) — tab "Math RL"
2. LLM Model (configurable LLM(s), e.g. LLaMA 3 8B, + custom GRPO + LoRA) — tab "LLM RL"
"""
import os
import gradio as gr
import numpy as np
# Create the working directories up front. "outputs" receives the dashboard
# image written by run_math_training; "models" is presumably used by the
# cloud_arena training code (not visible here -- confirm).
for _dir in ("./models", "./outputs"):
    os.makedirs(_dir, exist_ok=True)
# ── Mathematical Model Training ──────────────────────────────────────────────
def run_math_training(timesteps):
    """Train the mathematical (MaskablePPO) model and render its dashboard.

    Args:
        timesteps: Requested number of training timesteps; anything ``int()``
            accepts. Requests below 5000 are raised to 5000.

    Returns:
        A ``(summary, image_path)`` tuple. On success ``summary`` is a
        multi-line status report built from the training callback and
        ``image_path`` is the value returned by ``generate_dashboard``.
        On any failure -- including a failed import of the ``cloud_arena``
        dependencies -- ``summary`` is an ``"❌ Error: ..."`` message and
        ``image_path`` is ``None``.
    """
    min_timesteps = 5000  # minimum 5000 to avoid sampling errors
    try:
        # Imported inside the ``try`` so a missing or broken dependency is
        # reported through the status box like every other failure instead
        # of escaping the handler as an unhandled exception.
        from cloud_arena.training import train_model

        requested = int(timesteps)
        if requested < min_timesteps:
            print(f"⚠️ Timesteps too low ({requested}), using minimum 5000")
        ts = max(requested, min_timesteps)

        # Only the callback is needed here; the other two results are unused.
        _, callback, _ = train_model(total_timesteps=ts)

        from cloud_arena.visualization import generate_dashboard
        img_path = generate_dashboard(callback, "outputs/dashboard.png")

        summary = (
            f"✅ Math Model Training Complete\n"
            f"Episodes: {len(callback.episode_rewards)}\n"
            f"Final Phase: {callback.current_level}\n"
            f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
            f"Avg Savings: {np.mean(callback.episode_savings):.1f}%"
        )
        return summary, img_path
    except Exception as e:
        return f"❌ Error: {e}", None
def run_math_evaluation():
    """Evaluate the mathematical model and format the averaged metrics.

    Calls ``evaluate_model()`` and averages its ``"win"``, ``"cost_score"``,
    ``"security_score"`` and ``"savings_pct"`` entries.

    Returns:
        A four-line text report (win rate as a percentage, cost score,
        security score, savings), or an ``"❌ Error: ..."`` message if the
        evaluation -- or importing it -- fails.
    """
    try:
        # Imported inside the ``try`` so an import failure is returned as an
        # error string like any other failure rather than raised.
        from cloud_arena.evaluation import evaluate_model

        results = evaluate_model()
        win_rate = np.mean(results["win"]) * 100
        cost = np.mean(results["cost_score"])
        security = np.mean(results["security_score"])
        savings = np.mean(results["savings_pct"])
        return (
            f"Win Rate: {win_rate:.1f}%\nCost Score: {cost:.3f}\n"
            f"Security: {security:.3f}\nSavings: {savings:.1f}%"
        )
    except Exception as e:
        return f"❌ Error: {e}"
# ── LLM Model Training ───────────────────────────────────────────────────────
def run_llm_training(
    model_name,
    num_iterations,
    steps_per_episode,
    group_size,
    clip_epsilon,
    kl_coef,
    entropy_coef,
    max_gen_tokens,
    temperature,
):
    """Run LLM training via ``train_llm`` and summarise the reward change.

    Numeric arguments arrive from the UI as generic numbers and are coerced
    here: ``num_iterations``, ``steps_per_episode``, ``group_size`` and
    ``max_gen_tokens`` with ``int()``; ``clip_epsilon``, ``kl_coef``,
    ``entropy_coef`` and ``temperature`` with ``float()``. ``model_name`` is
    forwarded unchanged.

    Returns:
        A ``(summary, graph_path)`` tuple. On success ``summary`` reports the
        first and last entries of the returned reward list, their difference,
        and the training log text; ``graph_path`` is the path returned by
        ``train_llm``. On any failure -- including a failed import of the
        ``cloud_arena`` dependencies -- ``summary`` is an ``"❌ Error: ..."``
        message followed by the traceback, and ``graph_path`` is ``None``.
    """
    try:
        # Imported inside the ``try`` so a missing or broken dependency is
        # shown in the training-log box with its traceback instead of
        # escaping the handler.
        from cloud_arena.llm_training import train_llm
        from cloud_arena.llm_environment import OpenEnvAdapter

        # Explicitly initialize the OpenEnv adapter contract used by training runtime.
        # The fetched state is discarded; only the calls themselves matter here.
        _openenv_runtime = OpenEnvAdapter()
        _ = _openenv_runtime.get_state()

        all_rewards, full_log, graph_path, log_text = train_llm(
            model_name=model_name,
            num_iterations=int(num_iterations),
            steps_per_episode=int(steps_per_episode),
            group_size=int(group_size),
            clip_epsilon=float(clip_epsilon),
            kl_coef=float(kl_coef),
            entropy_coef=float(entropy_coef),
            max_gen_tokens=int(max_gen_tokens),
            temperature=float(temperature),
        )

        # An empty reward list raises IndexError here and is reported below.
        delta = all_rewards[-1] - all_rewards[0]
        summary = (
            f"✅ LLM GRPO Training Complete\n"
            f"Model: {model_name}\n"
            f"Algorithm: Custom GRPO\n"
            f"Pre-training reward: {all_rewards[0]:+.3f}\n"
            f"Post-training reward: {all_rewards[-1]:+.3f}\n"
            f"Δ Change: {delta:+.3f}\n\n"
            f"─── Full Log ───\n{log_text}"
        )
        return summary, graph_path
    except Exception as e:
        import traceback
        return f"❌ Error: {e}\n{traceback.format_exc()}", None
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Layout: one Blocks app with a tab per RL system. Components are created in
# the same order as before, since creation order determines on-page order.
with gr.Blocks(theme=gr.themes.Base(), title="Cloud Arena RL") as demo:
    gr.Markdown("# ☁️ Cloud Arena — RL Training Space")
    gr.Markdown("Two separate RL systems: **Mathematical Model** (MaskablePPO) and **LLM Model** (LLaMA + LoRA)")

    # Tab 1: train the mathematical model, then optionally evaluate it.
    with gr.Tab("🧮 Math RL"):
        gr.Markdown("### Mathematical Model — MaskablePPO (MLP Neural Network)")
        ts_input = gr.Number(label="Total Timesteps", value=500000)
        train_btn = gr.Button("🚀 Start Math Training", variant="primary")
        math_output = gr.Textbox(lines=6, label="Status")
        math_img = gr.Image(label="Dashboard")
        train_btn.click(
            fn=run_math_training,
            inputs=ts_input,
            outputs=[math_output, math_img],
        )

        gr.Markdown("---")
        eval_btn = gr.Button("📊 Evaluate Math Model")
        eval_output = gr.Textbox(lines=6, label="Eval Results")
        eval_btn.click(fn=run_math_evaluation, outputs=eval_output)

    # Tab 2: LLM training; every hyperparameter field feeds run_llm_training.
    with gr.Tab("🧠 LLM RL"):
        gr.Markdown("### Multi-Model RL Benchmark — Custom GRPO + LoRA")
        gr.Markdown("> Comma-separate model names to benchmark multiple models sequentially")
        llm_model = gr.Textbox(
            label="Model(s) — comma-separated for multi-model benchmark",
            value="unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit, unsloth/gemma-2b-it-bnb-4bit, unsloth/llama-3-8b-Instruct-bnb-4bit",
        )
        llm_iters = gr.Number(label="Training Iterations per Model", value=200)
        llm_steps = gr.Number(label="Steps per Episode", value=15)
        grpo_group = gr.Number(label="GRPO Group Size (K)", value=8)
        grpo_clip = gr.Number(label="GRPO Clip Epsilon", value=0.2)
        grpo_kl = gr.Number(label="KL Coefficient", value=0.01)
        grpo_entropy = gr.Number(label="Entropy Coefficient", value=0.001)
        grpo_tokens = gr.Number(label="Max Generation Tokens", value=60)
        grpo_temp = gr.Number(label="Sampling Temperature", value=0.7)
        llm_btn = gr.Button("🚀 Start LLM Training", variant="primary")
        llm_output = gr.Textbox(lines=15, label="Training Log")
        llm_img = gr.Image(label="Results")

        # The input order must match run_llm_training's positional parameters.
        llm_btn.click(
            fn=run_llm_training,
            inputs=[
                llm_model,
                llm_iters,
                llm_steps,
                grpo_group,
                grpo_clip,
                grpo_kl,
                grpo_entropy,
                grpo_tokens,
                grpo_temp,
            ],
            outputs=[llm_output, llm_img],
        )
# Script entry point: serve the app on every network interface, port 7860.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")