""" Cloud Arena — RL Training on HF Spaces Two SEPARATE models: 1. Mathematical Model (MaskablePPO + MLP) — tab "Math RL" 2. LLM Model (LLaMA 3.1 8B + REINFORCE + LoRA) — tab "LLM RL" """ import os import gradio as gr import numpy as np os.makedirs("./models", exist_ok=True) os.makedirs("./outputs", exist_ok=True) # ── Mathematical Model Training ────────────────────────────────────────────── def run_math_training(timesteps): from cloud_arena.training import train_model try: ts = max(int(timesteps), 5000) # minimum 5000 to avoid sampling errors if int(timesteps) < 5000: print(f"⚠️ Timesteps too low ({int(timesteps)}), using minimum 5000") model, callback, _ = train_model(total_timesteps=ts) from cloud_arena.visualization import generate_dashboard img_path = generate_dashboard(callback, "outputs/dashboard.png") summary = ( f"✅ Math Model Training Complete\n" f"Episodes: {len(callback.episode_rewards)}\n" f"Final Phase: {callback.current_level}\n" f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n" f"Avg Savings: {np.mean(callback.episode_savings):.1f}%" ) return summary, img_path except Exception as e: return f"❌ Error: {e}", None def run_math_evaluation(): from cloud_arena.evaluation import evaluate_model try: results = evaluate_model() wr = np.mean(results["win"]) * 100 cost = np.mean(results["cost_score"]) sec = np.mean(results["security_score"]) sav = np.mean(results["savings_pct"]) return ( f"Win Rate: {wr:.1f}%\nCost Score: {cost:.3f}\n" f"Security: {sec:.3f}\nSavings: {sav:.1f}%" ) except Exception as e: return f"❌ Error: {e}" # ── LLM Model Training ─────────────────────────────────────────────────────── def run_llm_training( model_name, num_iterations, steps_per_episode, group_size, clip_epsilon, kl_coef, entropy_coef, max_gen_tokens, temperature, ): from cloud_arena.llm_training import train_llm from cloud_arena.llm_environment import OpenEnvAdapter try: # Explicitly initialize the OpenEnv adapter contract used by training runtime. _openenv_runtime = OpenEnvAdapter() _ = _openenv_runtime.get_state() iters = int(num_iterations) all_rewards, full_log, graph_path, log_text = train_llm( model_name=model_name, num_iterations=iters, steps_per_episode=int(steps_per_episode), group_size=int(group_size), clip_epsilon=float(clip_epsilon), kl_coef=float(kl_coef), entropy_coef=float(entropy_coef), max_gen_tokens=int(max_gen_tokens), temperature=float(temperature), ) delta = all_rewards[-1] - all_rewards[0] summary = ( f"✅ LLM GRPO Training Complete\n" f"Model: {model_name}\n" f"Algorithm: Custom GRPO\n" f"Pre-training reward: {all_rewards[0]:+.3f}\n" f"Post-training reward: {all_rewards[-1]:+.3f}\n" f"Δ Change: {delta:+.3f}\n\n" f"─── Full Log ───\n{log_text}" ) return summary, graph_path except Exception as e: import traceback return f"❌ Error: {e}\n{traceback.format_exc()}", None # ── Gradio UI ───────────────────────────────────────────────────────────────── with gr.Blocks(title="Cloud Arena RL", theme=gr.themes.Base()) as demo: gr.Markdown("# ☁️ Cloud Arena — RL Training Space") gr.Markdown("Two separate RL systems: **Mathematical Model** (MaskablePPO) and **LLM Model** (LLaMA + LoRA)") with gr.Tab("🧮 Math RL"): gr.Markdown("### Mathematical Model — MaskablePPO (MLP Neural Network)") ts_input = gr.Number(value=500000, label="Total Timesteps") train_btn = gr.Button("🚀 Start Math Training", variant="primary") math_output = gr.Textbox(label="Status", lines=6) math_img = gr.Image(label="Dashboard") train_btn.click(run_math_training, inputs=ts_input, outputs=[math_output, math_img]) gr.Markdown("---") eval_btn = gr.Button("📊 Evaluate Math Model") eval_output = gr.Textbox(label="Eval Results", lines=6) eval_btn.click(run_math_evaluation, outputs=eval_output) with gr.Tab("🧠 LLM RL"): gr.Markdown("### Multi-Model RL Benchmark — Custom GRPO + LoRA") gr.Markdown("> Comma-separate model names to benchmark multiple models sequentially") llm_model = gr.Textbox( value="unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit, unsloth/gemma-2b-it-bnb-4bit, unsloth/llama-3-8b-Instruct-bnb-4bit", label="Model(s) — comma-separated for multi-model benchmark" ) llm_iters = gr.Number(value=200, label="Training Iterations per Model") llm_steps = gr.Number(value=15, label="Steps per Episode") grpo_group = gr.Number(value=8, label="GRPO Group Size (K)") grpo_clip = gr.Number(value=0.2, label="GRPO Clip Epsilon") grpo_kl = gr.Number(value=0.01, label="KL Coefficient") grpo_entropy = gr.Number(value=0.001, label="Entropy Coefficient") grpo_tokens = gr.Number(value=60, label="Max Generation Tokens") grpo_temp = gr.Number(value=0.7, label="Sampling Temperature") llm_btn = gr.Button("🚀 Start LLM Training", variant="primary") llm_output = gr.Textbox(label="Training Log", lines=15) llm_img = gr.Image(label="Results") llm_btn.click( run_llm_training, inputs=[ llm_model, llm_iters, llm_steps, grpo_group, grpo_clip, grpo_kl, grpo_entropy, grpo_tokens, grpo_temp, ], outputs=[llm_output, llm_img], ) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)