Spaces:
Paused
Paused
| """ | |
| Cloud Arena — RL Training on HF Spaces | |
| Two SEPARATE models: | |
| 1. Mathematical Model (MaskablePPO + MLP) — tab "Math RL" | |
| 2. LLM Model (LLaMA 3.1 8B + REINFORCE + LoRA) — tab "LLM RL" | |
| """ | |
| import os | |
| import gradio as gr | |
| import numpy as np | |
| os.makedirs("./models", exist_ok=True) | |
| os.makedirs("./outputs", exist_ok=True) | |
| # ── Mathematical Model Training ────────────────────────────────────────────── | |
def run_math_training(timesteps):
    """Train the mathematical MaskablePPO model and render its dashboard.

    Args:
        timesteps: Requested number of training timesteps. Values below
            5000 are clamped up to 5000 (with a console warning) to avoid
            sampling errors.

    Returns:
        Tuple of (summary text, dashboard image path). On any failure the
        summary is an error message and the path is None, so the Gradio UI
        displays the error instead of crashing the callback.
    """
    try:
        # Imports live inside the try so a missing/broken cloud_arena
        # install is reported in the UI rather than raised unhandled.
        from cloud_arena.training import train_model
        from cloud_arena.visualization import generate_dashboard

        requested = int(timesteps)
        if requested < 5000:
            print(f"⚠️ Timesteps too low ({requested}), using minimum 5000")
        ts = max(requested, 5000)  # minimum 5000 to avoid sampling errors

        model, callback, _ = train_model(total_timesteps=ts)
        img_path = generate_dashboard(callback, "outputs/dashboard.png")
        summary = (
            f"✅ Math Model Training Complete\n"
            f"Episodes: {len(callback.episode_rewards)}\n"
            f"Final Phase: {callback.current_level}\n"
            f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
            f"Avg Savings: {np.mean(callback.episode_savings):.1f}%"
        )
        return summary, img_path
    except Exception as e:
        # Gradio has no other error channel here; surface it in the textbox.
        return f"❌ Error: {e}", None
def run_math_evaluation():
    """Evaluate the trained math model and format the headline metrics.

    Returns:
        Multi-line string with win rate, cost score, security score and
        savings percentage, or an "❌ Error: ..." message on failure.
    """
    try:
        # Import inside the try so import failures show up in the UI,
        # consistent with the other training/eval callbacks.
        from cloud_arena.evaluation import evaluate_model

        results = evaluate_model()
        wr = np.mean(results["win"]) * 100
        cost = np.mean(results["cost_score"])
        sec = np.mean(results["security_score"])
        sav = np.mean(results["savings_pct"])
        return (
            f"Win Rate: {wr:.1f}%\nCost Score: {cost:.3f}\n"
            f"Security: {sec:.3f}\nSavings: {sav:.1f}%"
        )
    except Exception as e:
        return f"❌ Error: {e}"
| # ── LLM Model Training ─────────────────────────────────────────────────────── | |
def run_llm_training(
    model_name,
    num_iterations,
    steps_per_episode,
    group_size,
    clip_epsilon,
    kl_coef,
    entropy_coef,
    max_gen_tokens,
    temperature,
):
    """Run GRPO + LoRA training for one or more LLMs and summarize results.

    Args mirror the Gradio inputs: model name(s) (comma-separated string),
    iteration/step counts, GRPO hyperparameters (group size K, clip
    epsilon, KL and entropy coefficients), generation token cap, and
    sampling temperature. Numeric inputs arrive as Gradio Number values
    and are coerced to int/float before use.

    Returns:
        Tuple of (summary text including the full training log, path to
        the results graph). On any failure: (error message with full
        traceback, None), so the UI shows the error instead of crashing.
    """
    try:
        # Imports inside the try: a broken cloud_arena install is
        # reported in the UI rather than raised in the Gradio callback.
        from cloud_arena.llm_training import train_llm
        from cloud_arena.llm_environment import OpenEnvAdapter

        # Explicitly initialize the OpenEnv adapter contract used by training runtime.
        _openenv_runtime = OpenEnvAdapter()
        _ = _openenv_runtime.get_state()

        all_rewards, full_log, graph_path, log_text = train_llm(
            model_name=model_name,
            num_iterations=int(num_iterations),
            steps_per_episode=int(steps_per_episode),
            group_size=int(group_size),
            clip_epsilon=float(clip_epsilon),
            kl_coef=float(kl_coef),
            entropy_coef=float(entropy_coef),
            max_gen_tokens=int(max_gen_tokens),
            temperature=float(temperature),
        )
        delta = all_rewards[-1] - all_rewards[0]
        summary = (
            f"✅ LLM GRPO Training Complete\n"
            f"Model: {model_name}\n"
            f"Algorithm: Custom GRPO\n"
            f"Pre-training reward: {all_rewards[0]:+.3f}\n"
            f"Post-training reward: {all_rewards[-1]:+.3f}\n"
            f"Δ Change: {delta:+.3f}\n\n"
            f"─── Full Log ───\n{log_text}"
        )
        return summary, graph_path
    except Exception as e:
        import traceback
        return f"❌ Error: {e}\n{traceback.format_exc()}", None
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Two independent tabs, one per RL system; each button wires a callback above.
with gr.Blocks(title="Cloud Arena RL", theme=gr.themes.Base()) as demo:
    gr.Markdown("# ☁️ Cloud Arena — RL Training Space")
    gr.Markdown("Two separate RL systems: **Mathematical Model** (MaskablePPO) and **LLM Model** (LLaMA + LoRA)")

    with gr.Tab("🧮 Math RL"):
        gr.Markdown("### Mathematical Model — MaskablePPO (MLP Neural Network)")
        math_timesteps = gr.Number(value=500000, label="Total Timesteps")
        math_train_btn = gr.Button("🚀 Start Math Training", variant="primary")
        math_status = gr.Textbox(label="Status", lines=6)
        math_dashboard = gr.Image(label="Dashboard")
        math_train_btn.click(
            run_math_training,
            inputs=math_timesteps,
            outputs=[math_status, math_dashboard],
        )
        gr.Markdown("---")
        math_eval_btn = gr.Button("📊 Evaluate Math Model")
        math_eval_box = gr.Textbox(label="Eval Results", lines=6)
        math_eval_btn.click(run_math_evaluation, outputs=math_eval_box)

    with gr.Tab("🧠 LLM RL"):
        gr.Markdown("### Multi-Model RL Benchmark — Custom GRPO + LoRA")
        gr.Markdown("> Comma-separate model names to benchmark multiple models sequentially")
        model_box = gr.Textbox(
            value="unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit, unsloth/gemma-2b-it-bnb-4bit, unsloth/llama-3-8b-Instruct-bnb-4bit",
            label="Model(s) — comma-separated for multi-model benchmark",
        )
        iters_box = gr.Number(value=200, label="Training Iterations per Model")
        steps_box = gr.Number(value=15, label="Steps per Episode")
        group_box = gr.Number(value=8, label="GRPO Group Size (K)")
        clip_box = gr.Number(value=0.2, label="GRPO Clip Epsilon")
        kl_box = gr.Number(value=0.01, label="KL Coefficient")
        entropy_box = gr.Number(value=0.001, label="Entropy Coefficient")
        tokens_box = gr.Number(value=60, label="Max Generation Tokens")
        temp_box = gr.Number(value=0.7, label="Sampling Temperature")
        llm_train_btn = gr.Button("🚀 Start LLM Training", variant="primary")
        llm_log = gr.Textbox(label="Training Log", lines=15)
        llm_graph = gr.Image(label="Results")
        llm_train_btn.click(
            run_llm_training,
            inputs=[
                model_box,
                iters_box,
                steps_box,
                group_box,
                clip_box,
                kl_box,
                entropy_box,
                tokens_box,
                temp_box,
            ],
            outputs=[llm_log, llm_graph],
        )

if __name__ == "__main__":
    # Bind to all interfaces on the standard HF Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)