"""
Cloud Arena — RL Training on HF Spaces
Two SEPARATE models:
  1. Mathematical Model (MaskablePPO + MLP) — tab "Math RL"
  2. LLM Model (one or more configurable LLMs + custom GRPO + LoRA) — tab "LLM RL"
"""

import os
import gradio as gr
import numpy as np

# Create the working directories up front; a no-op when they already exist.
for _required_dir in ("./models", "./outputs"):
    os.makedirs(_required_dir, exist_ok=True)


# ── Mathematical Model Training ──────────────────────────────────────────────

def run_math_training(timesteps):
    """Train the mathematical (MaskablePPO) model and summarise the run.

    Args:
        timesteps: Requested number of training timesteps; anything ``int()``
            accepts. Values below 5000 are raised to 5000.

    Returns:
        A ``(summary, image_path)`` tuple: a status text and the value
        returned by ``generate_dashboard``. On any failure — including a
        failed import of the ``cloud_arena`` modules — the function does not
        raise and returns ``("❌ Error: <message>", None)`` instead.
    """
    min_timesteps = 5000  # minimum 5000 to avoid sampling errors
    try:
        # Imported lazily and *inside* the try so that a missing or broken
        # dependency is reported in the status box like every other failure.
        # Both imports happen before training starts, so a broken
        # visualization module is detected before the (long) training run
        # rather than after it.
        from cloud_arena.training import train_model
        from cloud_arena.visualization import generate_dashboard

        requested = int(timesteps)
        if requested < min_timesteps:
            print(f"⚠️ Timesteps too low ({requested}), using minimum 5000")
        ts = max(requested, min_timesteps)

        # Only the callback (which carries the collected statistics) is used.
        _, callback, _ = train_model(total_timesteps=ts)
        img_path = generate_dashboard(callback, "outputs/dashboard.png")
        summary = (
            f"✅ Math Model Training Complete\n"
            f"Episodes: {len(callback.episode_rewards)}\n"
            f"Final Phase: {callback.current_level}\n"
            f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
            f"Avg Savings: {np.mean(callback.episode_savings):.1f}%"
        )
        return summary, img_path
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"❌ Error: {e}", None


def run_math_evaluation():
    """Evaluate the mathematical model and format the averaged metrics.

    Calls ``evaluate_model()`` and averages the ``"win"``, ``"cost_score"``,
    ``"security_score"`` and ``"savings_pct"`` entries of its result.

    Returns:
        A multi-line string with win rate, cost score, security score and
        savings. On any failure — including a failed import of
        ``cloud_arena.evaluation`` — returns ``"❌ Error: <message>"``
        instead of raising.
    """
    try:
        # Imported inside the try so an import failure is reported through
        # the same error string as a failure during evaluation.
        from cloud_arena.evaluation import evaluate_model

        results = evaluate_model()
        wr = np.mean(results["win"]) * 100  # fraction of wins -> percent
        cost = np.mean(results["cost_score"])
        sec = np.mean(results["security_score"])
        sav = np.mean(results["savings_pct"])
        return (
            f"Win Rate: {wr:.1f}%\nCost Score: {cost:.3f}\n"
            f"Security: {sec:.3f}\nSavings: {sav:.1f}%"
        )
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"❌ Error: {e}"


# ── LLM Model Training ───────────────────────────────────────────────────────

def run_llm_training(
    model_name,
    num_iterations,
    steps_per_episode,
    group_size,
    clip_epsilon,
    kl_coef,
    entropy_coef,
    max_gen_tokens,
    temperature,
):
    """Run LLM training via ``train_llm`` and build the summary for the UI.

    The numeric arguments arrive from ``gr.Number`` widgets and are coerced
    here to the ``int``/``float`` types passed on to ``train_llm``.

    Args:
        model_name: Model identifier string, forwarded unchanged.
        num_iterations: Training iterations (coerced to ``int``).
        steps_per_episode: Steps per episode (coerced to ``int``).
        group_size: GRPO group size (coerced to ``int``).
        clip_epsilon: GRPO clip epsilon (coerced to ``float``).
        kl_coef: KL coefficient (coerced to ``float``).
        entropy_coef: Entropy coefficient (coerced to ``float``).
        max_gen_tokens: Maximum generated tokens (coerced to ``int``).
        temperature: Sampling temperature (coerced to ``float``).

    Returns:
        A ``(summary, graph_path)`` tuple. On any failure — including a
        failed import of the ``cloud_arena`` modules — the function does not
        raise and returns ``("❌ Error: <message>\\n<traceback>", None)``.
    """
    try:
        # Imported inside the try so a missing or broken dependency is shown
        # in the training log box instead of escaping as an unhandled error.
        from cloud_arena.llm_training import train_llm
        from cloud_arena.llm_environment import OpenEnvAdapter

        # Explicitly initialize the OpenEnv adapter contract used by training runtime.
        _openenv_runtime = OpenEnvAdapter()
        _ = _openenv_runtime.get_state()

        all_rewards, _full_log, graph_path, log_text = train_llm(
            model_name=model_name,
            num_iterations=int(num_iterations),
            steps_per_episode=int(steps_per_episode),
            group_size=int(group_size),
            clip_epsilon=float(clip_epsilon),
            kl_coef=float(kl_coef),
            entropy_coef=float(entropy_coef),
            max_gen_tokens=int(max_gen_tokens),
            temperature=float(temperature),
        )

        # The summary compares the first and last reward; fail with a clear
        # message rather than an opaque IndexError when there are none.
        if len(all_rewards) == 0:
            raise ValueError("train_llm returned no rewards; nothing to summarise")

        first_reward = all_rewards[0]
        last_reward = all_rewards[-1]
        delta = last_reward - first_reward
        summary = (
            f"✅ LLM GRPO Training Complete\n"
            f"Model: {model_name}\n"
            f"Algorithm: Custom GRPO\n"
            f"Pre-training reward: {first_reward:+.3f}\n"
            f"Post-training reward: {last_reward:+.3f}\n"
            f"Δ Change: {delta:+.3f}\n\n"
            f"─── Full Log ───\n{log_text}"
        )
        return summary, graph_path
    except Exception as e:
        # UI boundary: include the traceback so the failure can be diagnosed
        # from the training log box.
        import traceback
        return f"❌ Error: {e}\n{traceback.format_exc()}", None


# ── Gradio UI ─────────────────────────────────────────────────────────────────

# Build the two-tab UI. `demo` is the Blocks app launched at the bottom of the
# file. Components are declared in the order they should appear on the page.
with gr.Blocks(title="Cloud Arena RL", theme=gr.themes.Base()) as demo:
    gr.Markdown("# ☁️ Cloud Arena — RL Training Space")
    gr.Markdown("Two separate RL systems: **Mathematical Model** (MaskablePPO) and **LLM Model** (LLaMA + LoRA)")

    # Tab 1: mathematical model — training and evaluation.
    with gr.Tab("🧮 Math RL"):
        gr.Markdown("### Mathematical Model — MaskablePPO (MLP Neural Network)")
        # run_math_training raises any value below 5000 up to 5000.
        ts_input = gr.Number(value=500000, label="Total Timesteps")
        train_btn = gr.Button("🚀 Start Math Training", variant="primary")
        math_output = gr.Textbox(label="Status", lines=6)
        math_img = gr.Image(label="Dashboard")
        # run_math_training returns (status text, dashboard image path or None).
        train_btn.click(run_math_training, inputs=ts_input, outputs=[math_output, math_img])

        gr.Markdown("---")
        eval_btn = gr.Button("📊 Evaluate Math Model")
        eval_output = gr.Textbox(label="Eval Results", lines=6)
        # run_math_evaluation takes no inputs and returns a single string.
        eval_btn.click(run_math_evaluation, outputs=eval_output)

    # Tab 2: LLM model — hyperparameters and a single training button.
    with gr.Tab("🧠 LLM RL"):
        gr.Markdown("### Multi-Model RL Benchmark — Custom GRPO + LoRA")
        gr.Markdown("> Comma-separate model names to benchmark multiple models sequentially")
        # The textbox content is forwarded as one string (model_name); any
        # comma-splitting is not done in this file — presumably handled by
        # train_llm, TODO confirm.
        llm_model = gr.Textbox(
            value="unsloth/Qwen2.5-Math-7B-Instruct-bnb-4bit, unsloth/gemma-2b-it-bnb-4bit, unsloth/llama-3-8b-Instruct-bnb-4bit",
            label="Model(s) — comma-separated for multi-model benchmark"
        )
        # Numeric hyperparameters; run_llm_training coerces each one with
        # int() or float() before passing it to train_llm.
        llm_iters = gr.Number(value=200, label="Training Iterations per Model")
        llm_steps = gr.Number(value=15, label="Steps per Episode")
        grpo_group = gr.Number(value=8, label="GRPO Group Size (K)")
        grpo_clip = gr.Number(value=0.2, label="GRPO Clip Epsilon")
        grpo_kl = gr.Number(value=0.01, label="KL Coefficient")
        grpo_entropy = gr.Number(value=0.001, label="Entropy Coefficient")
        grpo_tokens = gr.Number(value=60, label="Max Generation Tokens")
        grpo_temp = gr.Number(value=0.7, label="Sampling Temperature")
        llm_btn = gr.Button("🚀 Start LLM Training", variant="primary")
        llm_output = gr.Textbox(label="Training Log", lines=15)
        llm_img = gr.Image(label="Results")
        # The inputs list must stay in the same order as the positional
        # parameters of run_llm_training; it returns (log text, graph path or None).
        llm_btn.click(
            run_llm_training,
            inputs=[
                llm_model,
                llm_iters,
                llm_steps,
                grpo_group,
                grpo_clip,
                grpo_kl,
                grpo_entropy,
                grpo_tokens,
                grpo_temp,
            ],
            outputs=[llm_output, llm_img],
        )

if __name__ == "__main__":
    # Start the app only when run as a script: all interfaces, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)