File size: 6,497 Bytes
12263fa
ee3dfa7
 
 
 
12263fa
 
 
 
 
 
 
 
 
 
ee3dfa7
12263fa
ee3dfa7
12263fa
 
8d95050
 
 
 
12263fa
 
 
ee3dfa7
12263fa
 
 
 
 
 
 
 
 
 
ee3dfa7
12263fa
 
 
 
 
 
 
 
ee3dfa7
 
12263fa
 
 
 
 
ee3dfa7
 
dfc5996
 
 
 
 
 
 
 
 
 
 
ee3dfa7
d81b76a
12263fa
d81b76a
 
 
f20bc34
ee3dfa7
 
f20bc34
ee3dfa7
dfc5996
 
 
 
 
 
ee3dfa7
 
 
dfc5996
ee3dfa7
dfc5996
ee3dfa7
 
 
 
 
 
12263fa
ee3dfa7
 
 
12263fa
ee3dfa7
12263fa
a593df9
ee3dfa7
 
12263fa
ee3dfa7
 
12263fa
ee3dfa7
 
 
 
 
 
 
 
 
 
 
dfc5996
af6bbef
 
 
 
 
 
93d0171
5a0c6af
dfc5996
 
 
5a0c6af
dfc5996
ee3dfa7
 
 
dfc5996
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12263fa
 
a593df9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Cloud Arena — RL Training on HF Spaces
Two SEPARATE models:
  1. Mathematical Model (MaskablePPO + MLP) — tab "Math RL"
  2. LLM Model (custom GRPO + LoRA; model name(s) chosen in the UI) — tab "LLM RL"
"""

import os
import gradio as gr
import numpy as np

# Create the working directories at import time; the dashboard image is
# later written under outputs/ (models/ is presumably used by the trainers).
for _artifact_dir in ("./models", "./outputs"):
    os.makedirs(_artifact_dir, exist_ok=True)


# ── Mathematical Model Training ──────────────────────────────────────────────

def run_math_training(timesteps):
    """Train the mathematical (MaskablePPO) model and summarise the run.

    Args:
        timesteps: Requested number of training timesteps; any value that
            ``int()`` accepts. Requests below 5000 are raised to 5000.

    Returns:
        A ``(summary, image_path)`` tuple for the Status textbox and the
        Dashboard image. On any failure (unparsable ``timesteps``, a failed
        ``cloud_arena`` import, or an error during training/plotting) the
        summary is an ``❌ Error: ...`` message and the image path is ``None``.
    """
    min_timesteps = 5000  # minimum 5000 to avoid sampling errors
    try:
        # Parse once, before any heavy import, so bad input fails fast.
        requested = int(timesteps)
        ts = max(requested, min_timesteps)
        if requested < min_timesteps:
            print(f"⚠️ Timesteps too low ({requested}), using minimum 5000")

        # Imported inside the try so a missing or broken dependency is shown
        # in the Status box instead of escaping the handler.
        from cloud_arena.training import train_model

        _, callback, _ = train_model(total_timesteps=ts)

        from cloud_arena.visualization import generate_dashboard

        img_path = generate_dashboard(callback, "outputs/dashboard.png")

        # np.mean([]) yields nan plus a RuntimeWarning; report "n/a" instead
        # when no episode savings were recorded.
        savings = callback.episode_savings
        if len(savings) > 0:
            avg_savings = f"{np.mean(savings):.1f}%"
        else:
            avg_savings = "n/a"

        summary = (
            f"✅ Math Model Training Complete\n"
            f"Episodes: {len(callback.episode_rewards)}\n"
            f"Final Phase: {callback.current_level}\n"
            f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
            f"Avg Savings: {avg_savings}"
        )
        return summary, img_path
    except Exception as e:
        return f"❌ Error: {e}", None


def run_math_evaluation():
    """Evaluate the mathematical model and format the mean metrics.

    Calls ``evaluate_model()`` and averages the ``"win"``, ``"cost_score"``,
    ``"security_score"`` and ``"savings_pct"`` entries of its result.

    Returns:
        A four-line text report (win rate as a percentage, cost score,
        security score, savings), or an ``❌ Error: ...`` message if the
        import or the evaluation fails.
    """
    try:
        # Imported inside the try so a missing or broken dependency is shown
        # in the Eval Results box instead of escaping the handler.
        from cloud_arena.evaluation import evaluate_model

        results = evaluate_model()
        wr = np.mean(results["win"]) * 100  # fraction of wins -> percent
        cost = np.mean(results["cost_score"])
        sec = np.mean(results["security_score"])
        sav = np.mean(results["savings_pct"])
        return (
            f"Win Rate: {wr:.1f}%\nCost Score: {cost:.3f}\n"
            f"Security: {sec:.3f}\nSavings: {sav:.1f}%"
        )
    except Exception as e:
        return f"❌ Error: {e}"


# ── LLM Model Training ───────────────────────────────────────────────────────

def run_llm_training(
    model_name,
    num_iterations,
    steps_per_episode,
    group_size,
    clip_epsilon,
    kl_coef,
    entropy_coef,
    max_gen_tokens,
    temperature,
):
    """Run LLM training via ``train_llm`` and summarise the reward change.

    Args:
        model_name: Passed to ``train_llm`` unchanged. The UI describes it as
            a comma-separated list; any splitting presumably happens inside
            ``train_llm`` (not visible here).
        num_iterations, steps_per_episode, group_size, max_gen_tokens:
            Converted with ``int()`` before being forwarded.
        clip_epsilon, kl_coef, entropy_coef, temperature:
            Converted with ``float()`` before being forwarded.

    Returns:
        A ``(summary, graph_path)`` tuple. On any failure (unparsable
        hyperparameter, failed ``cloud_arena`` import, training error, or an
        empty reward list) the summary is an ``❌ Error: ...`` message
        followed by the traceback, and the graph path is ``None``.
    """
    try:
        # Convert every hyperparameter up front so bad input fails before any
        # heavy import or environment setup.
        train_kwargs = {
            "model_name": model_name,
            "num_iterations": int(num_iterations),
            "steps_per_episode": int(steps_per_episode),
            "group_size": int(group_size),
            "clip_epsilon": float(clip_epsilon),
            "kl_coef": float(kl_coef),
            "entropy_coef": float(entropy_coef),
            "max_gen_tokens": int(max_gen_tokens),
            "temperature": float(temperature),
        }

        # Imported inside the try so a missing or broken dependency is shown
        # in the Training Log box instead of escaping the handler.
        from cloud_arena.llm_training import train_llm
        from cloud_arena.llm_environment import OpenEnvAdapter

        # Explicitly initialize the OpenEnv adapter contract used by training runtime.
        # The reference is kept for the duration of training, as before.
        openenv_runtime = OpenEnvAdapter()
        openenv_runtime.get_state()

        all_rewards, _full_log, graph_path, log_text = train_llm(**train_kwargs)
        if len(all_rewards) == 0:
            # Clearer than the IndexError the indexing below would raise.
            raise ValueError("train_llm returned no rewards")

        first_reward = all_rewards[0]
        last_reward = all_rewards[-1]
        delta = last_reward - first_reward
        summary = (
            f"✅ LLM GRPO Training Complete\n"
            f"Model: {model_name}\n"
            f"Algorithm: Custom GRPO\n"
            f"Pre-training reward: {first_reward:+.3f}\n"
            f"Post-training reward: {last_reward:+.3f}\n"
            f"Δ Change: {delta:+.3f}\n\n"
            f"─── Full Log ───\n{log_text}"
        )
        return summary, graph_path
    except Exception as e:
        import traceback
        return f"❌ Error: {e}\n{traceback.format_exc()}", None


# ── Gradio UI ─────────────────────────────────────────────────────────────────

# Declarative UI: components are created in the order they appear below, and
# `demo` is the app object launched by the __main__ guard.
with gr.Blocks(title="Cloud Arena RL", theme=gr.themes.Base()) as demo:
    gr.Markdown("# ☁️ Cloud Arena — RL Training Space")
    gr.Markdown("Two separate RL systems: **Mathematical Model** (MaskablePPO) and **LLM Model** (LLaMA + LoRA)")

    # Tab 1: train and evaluate the mathematical model.
    with gr.Tab("🧮 Math RL"):
        gr.Markdown("### Mathematical Model — MaskablePPO (MLP Neural Network)")
        ts_input = gr.Number(value=500000, label="Total Timesteps")
        train_btn = gr.Button("🚀 Start Math Training", variant="primary")
        math_output = gr.Textbox(label="Status", lines=6)
        math_img = gr.Image(label="Dashboard")
        # run_math_training returns (summary, image_path), matching the two outputs.
        train_btn.click(run_math_training, inputs=ts_input, outputs=[math_output, math_img])

        gr.Markdown("---")
        eval_btn = gr.Button("📊 Evaluate Math Model")
        eval_output = gr.Textbox(label="Eval Results", lines=6)
        # run_math_evaluation takes no inputs and returns a single report string.
        eval_btn.click(run_math_evaluation, outputs=eval_output)

    # Tab 2: LLM training; every field below is forwarded to run_llm_training.
    with gr.Tab("🧠 LLM RL"):
        gr.Markdown("### Multi-Model RL Benchmark — Custom GRPO + LoRA")
        gr.Markdown("> Comma-separate model names to benchmark multiple models sequentially")
        llm_model = gr.Textbox(
            value="unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit, unsloth/gemma-2b-it-bnb-4bit, unsloth/llama-3-8b-Instruct-bnb-4bit",
            label="Model(s) — comma-separated for multi-model benchmark"
        )
        # Numeric hyperparameters; run_llm_training casts each one with
        # int() or float() before passing it to train_llm.
        llm_iters = gr.Number(value=200, label="Training Iterations per Model")
        llm_steps = gr.Number(value=15, label="Steps per Episode")
        grpo_group = gr.Number(value=8, label="GRPO Group Size (K)")
        grpo_clip = gr.Number(value=0.2, label="GRPO Clip Epsilon")
        grpo_kl = gr.Number(value=0.01, label="KL Coefficient")
        grpo_entropy = gr.Number(value=0.001, label="Entropy Coefficient")
        grpo_tokens = gr.Number(value=60, label="Max Generation Tokens")
        grpo_temp = gr.Number(value=0.7, label="Sampling Temperature")
        llm_btn = gr.Button("🚀 Start LLM Training", variant="primary")
        llm_output = gr.Textbox(label="Training Log", lines=15)
        llm_img = gr.Image(label="Results")
        # The inputs list order must match run_llm_training's positional
        # parameter order (model_name, num_iterations, ..., temperature).
        llm_btn.click(
            run_llm_training,
            inputs=[
                llm_model,
                llm_iters,
                llm_steps,
                grpo_group,
                grpo_clip,
                grpo_kl,
                grpo_entropy,
                grpo_tokens,
                grpo_temp,
            ],
            outputs=[llm_output, llm_img],
        )

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on all network interfaces (0.0.0.0) at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)