Spaces:

saravanatanjiro
/

Openenv

Paused

App Files Files Community

kavin57447 commited on 29 days ago

Commit

ee3dfa7

1 Parent(s): 1c86d42

Add LLM RL training with Gemma 7B + LoRA

Browse files

Files changed (4) hide show

app.py +59 -46
cloud_arena/llm_environment.py +438 -0
cloud_arena/llm_training.py +271 -0
requirements.txt +8 -2

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 """
-Cloud Arena — Mathematical Model RL Training on HF Spaces
-This is the MATHEMATICAL model (MaskablePPO + MLP), NOT the LLM model.
-The LLM model (cell5_ppo.py) is a SEPARATE system.
 """
 import os
@@ -11,25 +12,17 @@ import numpy as np
 os.makedirs("./models", exist_ok=True)
 os.makedirs("./outputs", exist_ok=True)
-# Global state
-training_state = {"model": None, "callback": None, "status": "idle"}
-def run_training(timesteps):
     from cloud_arena.training import train_model
-    training_state["status"] = "training"
     try:
-        ts = int(timesteps)
-        model, callback, _ = train_model(total_timesteps=ts)
-        training_state["model"] = model
-        training_state["callback"] = callback
-        training_state["status"] = "done"
         from cloud_arena.visualization import generate_dashboard
         img_path = generate_dashboard(callback, "outputs/dashboard.png")
         summary = (
-            f"✅ Training Complete\n"
             f"Episodes: {len(callback.episode_rewards)}\n"
             f"Final Phase: {callback.current_level}\n"
             f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
@@ -37,11 +30,10 @@ def run_training(timesteps):
         )
         return summary, img_path
     except Exception as e:
-        training_state["status"] = "error"
         return f"❌ Error: {e}", None
-def run_evaluation():
     from cloud_arena.evaluation import evaluate_model
     try:
         results = evaluate_model()
@@ -50,47 +42,68 @@ def run_evaluation():
         sec = np.mean(results["security_score"])
         sav = np.mean(results["savings_pct"])
         return (
-            f"Win Rate: {wr:.1f}%\n"
-            f"Cost Score: {cost:.3f}\n"
-            f"Security: {sec:.3f}\n"
-            f"Savings: {sav:.1f}%"
         )
     except Exception as e:
         return f"❌ Error: {e}"
-def run_bosses():
-    from cloud_arena.evaluation import run_boss_fights, BOSS_NAMES
     try:
-        scores = run_boss_fights()
-        lines = [f"{BOSS_NAMES[k]}: {v:.1f}%" for k, v in scores.items()]
-        overall = np.mean(list(scores.values()))
-        lines.append(f"\nOverall: {overall:.1f}%")
-        return "\n".join(lines)
     except Exception as e:
-        return f"❌ Error: {e}"
 with gr.Blocks(title="Cloud Arena RL") as demo:
-    gr.Markdown("# ☁️ Cloud Arena — Mathematical Model RL")
-    gr.Markdown("MaskablePPO training on a multi-objective cloud ops environment.")
-    with gr.Tab("Train"):
         ts_input = gr.Number(value=500000, label="Total Timesteps")
-        train_btn = gr.Button("🚀 Start Training", variant="primary")
-        train_output = gr.Textbox(label="Status", lines=6)
-        train_img = gr.Image(label="Dashboard")
-        train_btn.click(run_training, inputs=ts_input, outputs=[train_output, train_img])
-    with gr.Tab("Evaluate"):
-        eval_btn = gr.Button("📊 Run Evaluation")
-        eval_output = gr.Textbox(label="Results", lines=8)
-        eval_btn.click(run_evaluation, outputs=eval_output)
-    with gr.Tab("Boss Fights"):
-        boss_btn = gr.Button("⚔️ Run Boss Fights")
-        boss_output = gr.Textbox(label="Boss Scores", lines=8)
-        boss_btn.click(run_bosses, outputs=boss_output)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, theme=gr.themes.Base())

 """
+Cloud Arena — RL Training on HF Spaces
+Two SEPARATE models:
+  1. Mathematical Model (MaskablePPO + MLP) — tab "Math RL"
+  2. LLM Model (Gemma 7B + REINFORCE + LoRA) — tab "LLM RL"
 """
 import os
 os.makedirs("./models", exist_ok=True)
 os.makedirs("./outputs", exist_ok=True)
+# ── Mathematical Model Training ──────────────────────────────────────────────
+def run_math_training(timesteps):
     from cloud_arena.training import train_model
     try:
+        model, callback, _ = train_model(total_timesteps=int(timesteps))
         from cloud_arena.visualization import generate_dashboard
         img_path = generate_dashboard(callback, "outputs/dashboard.png")
         summary = (
+            f"✅ Math Model Training Complete\n"
             f"Episodes: {len(callback.episode_rewards)}\n"
             f"Final Phase: {callback.current_level}\n"
             f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
         )
         return summary, img_path
     except Exception as e:
         return f"❌ Error: {e}", None
+def run_math_evaluation():
     from cloud_arena.evaluation import evaluate_model
     try:
         results = evaluate_model()
         sec = np.mean(results["security_score"])
         sav = np.mean(results["savings_pct"])
         return (
+            f"Win Rate: {wr:.1f}%\nCost Score: {cost:.3f}\n"
+            f"Security: {sec:.3f}\nSavings: {sav:.1f}%"
         )
     except Exception as e:
         return f"❌ Error: {e}"
+# ── LLM Model Training ───────────────────────────────────────────────────────
def run_llm_training(model_name, num_iterations, steps_per_episode):
    """Run the LLM RL training loop and format the outcome for the Gradio UI.

    Args:
        model_name: Hugging Face model id forwarded to ``train_llm``.
        num_iterations: Number of training iterations; coerced with ``int``.
        steps_per_episode: Step budget per episode; coerced with ``int``.

    Returns:
        A ``(text, image_path)`` pair. On success ``text`` is the summary plus
        the full training log and ``image_path`` is the results graph. On any
        failure ``text`` holds the error and traceback and ``image_path`` is
        ``None``.
    """
    try:
        # Imported inside the ``try``: the module imports torch and matplotlib
        # at load time, so an import failure should surface in the UI textbox
        # instead of escaping as an unhandled exception.
        from cloud_arena.llm_training import train_llm

        all_rewards, _full_log, graph_path, log_text = train_llm(
            model_name=model_name,
            num_iterations=int(num_iterations),
            steps_per_episode=int(steps_per_episode),
        )
        # First entry is the pre-training eval, last is the post-training eval.
        delta = all_rewards[-1] - all_rewards[0]
        summary = (
            f"✅ LLM Training Complete\n"
            f"Model: {model_name}\n"
            f"Pre-training reward: {all_rewards[0]:+.3f}\n"
            f"Post-training reward: {all_rewards[-1]:+.3f}\n"
            f"Δ Change: {delta:+.3f}\n\n"
            f"─── Full Log ───\n{log_text}"
        )
        return summary, graph_path
    except Exception as e:
        # UI boundary: report everything to the user rather than crash the app.
        import traceback
        return f"❌ Error: {e}\n{traceback.format_exc()}", None
+# ── Gradio UI ─────────────────────────────────────────────────────────────────
 with gr.Blocks(title="Cloud Arena RL") as demo:
+    gr.Markdown("# ☁️ Cloud Arena — RL Training Space")
+    gr.Markdown("Two separate RL systems: **Mathematical Model** (MaskablePPO) and **LLM Model** (LLaMA + LoRA)")
+    with gr.Tab("🧮 Math RL"):
+        gr.Markdown("### Mathematical Model — MaskablePPO (MLP Neural Network)")
         ts_input = gr.Number(value=500000, label="Total Timesteps")
+        train_btn = gr.Button("🚀 Start Math Training", variant="primary")
+        math_output = gr.Textbox(label="Status", lines=6)
+        math_img = gr.Image(label="Dashboard")
+        train_btn.click(run_math_training, inputs=ts_input, outputs=[math_output, math_img])
+        gr.Markdown("---")
+        eval_btn = gr.Button("📊 Evaluate Math Model")
+        eval_output = gr.Textbox(label="Eval Results", lines=6)
+        eval_btn.click(run_math_evaluation, outputs=eval_output)
+    with gr.Tab("🧠 LLM RL"):
+        gr.Markdown("### LLM Model — Gemma 7B + REINFORCE + LoRA")
+        gr.Markdown("> ⚠️ Requires `HF_TOKEN` secret set in Space settings + accepted model license")
+        llm_model = gr.Textbox(value="google/gemma-7b-it", label="Model Name")
+        llm_iters = gr.Number(value=10, label="Training Iterations")
+        llm_steps = gr.Number(value=5, label="Steps per Episode")
+        llm_btn = gr.Button("🚀 Start LLM Training", variant="primary")
+        llm_output = gr.Textbox(label="Training Log", lines=15)
+        llm_img = gr.Image(label="Results")
+        llm_btn.click(run_llm_training, inputs=[llm_model, llm_iters, llm_steps],
+                      outputs=[llm_output, llm_img])
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, theme=gr.themes.Base())

cloud_arena/llm_environment.py ADDED Viewed

	@@ -0,0 +1,438 @@

+# ============================================================
+# CELL 3 — Cloud FinOps Environment (Final Fixed Version)
+#
+# ALL loopholes closed:
+#   1. CHECK_DEPENDENCIES after cap → hesitation penalty (not 0.0)
+#      This kills the "+0.200 every episode" passive policy
+#   2. W_HESITATION = 0.10 — strong enough to force action
+#   3. Win bonus +2.0 — rewards completing the goal, not just steps
+#   4. RESIZE guaranteed to reduce cost (uniform 0.40-0.65)
+#   5. MIN_DELETABLE_COST_RATIO = 0.35 — win is always reachable
+#   6. Stronger semantic veto — also catches high-dependency temp nodes
+# ============================================================
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces
+from enum import IntEnum
+import random
+random.seed(42)
+np.random.seed(42)
+# ─── Action Space ─────────────────────────────────────────────────────────────
class Action(IntEnum):
    """Discrete agent actions; each member's value is the integer action id."""

    NOOP = 0
    CHECK_DEPENDENCIES = 1
    RESIZE = 2
    STOP = 3
    DELETE = 4


# Number of distinct actions (one per ``Action`` member).
NUM_ACTIONS = len(Action)
+# ─── Constants ────────────────────────────────────────────────────────────────
N_RESOURCES = 6   # resources generated per episode
OBS_PER_RES = 5   # features per resource (see CloudResource.to_obs_array)
# Per-resource features plus the two trailing scalars appended by the
# environment's observation builder: 6 * 5 + 2 = 32.
OBS_DIM = N_RESOURCES * OBS_PER_RES + 2

# Name pool for production resources.
PROD_NAMES = [
    "storage-prod-db",
    "core-auth-router",
    "primary-k8s-master",
    "billing-db-01",
    "payment-gateway-prod",
    "prod-cache-redis",
    "prod-elb-frontend",
    "rds-prod-main",
    "main-api-prod",
    "prod-cosmos-db",
    "primary-gke-cluster",
    "prod-spanner-db",
]

# Name pool for temporary / dev / sandbox resources; dependency lists are
# also sampled from this pool.
TEMP_NAMES = [
    "worker-node-temp",
    "test-frontend-ui",
    "sandbox-db-04",
    "batch-processor-temp",
    "dev-cache-redis",
    "temp-worker-88",
    "staging-api-v2",
    "dev-log-collector",
    "temp-ecs-task",
    "dev-gke-node",
    "test-bigquery-scratch",
    "sandbox-spanner-dev",
]
+# ─── Cloud Resource ───────────────────────────────────────────────────────────
class CloudResource:
    """A single cloud resource: its name, hourly cost, CPU load and dependencies."""

    def __init__(self, name, cost_per_hr, cpu_pct, dependencies, is_prod):
        self.name = name
        self.cost_per_hr = cost_per_hr
        self.cpu_pct = cpu_pct
        self.dependencies = dependencies
        self.is_prod = is_prod
        # Every resource starts out running.
        self.active = True

    def to_obs_array(self):
        """Return the five-element float32 feature vector for this resource.

        Order: scaled CPU, scaled cost, scaled dependency count, production
        flag, active flag. The divisors are fixed scaling constants; values
        are not clipped.
        """
        features = [
            self.cpu_pct / 100.0,
            self.cost_per_hr / 5.0,
            len(self.dependencies) / 14.0,
            float(self.is_prod),
            float(self.active),
        ]
        return np.array(features, dtype=np.float32)
+# ─── Resource Generator ───────────────────────────────────────────────────────
def generate_resources(n=N_RESOURCES):
    """
    Create a randomised pool of cloud resources for one episode.

    Guarantees (for n >= 2):
      1. The first resource is a production "trap" with low CPU
         (looks deletable but is not).
      2. At least one temporary resource exists.
      3. Temporary resources account for >= 35% of total cost, so the
         savings target is not ruled out by the cost split alone.

    With n <= 1 only the production trap is returned, so guarantees 2 and 3
    cannot hold.

    NOTE(review): names are drawn with replacement, so two resources can share
    a name, and a temp resource's dependency list can contain its own name.

    Args:
        n: Number of resources to generate.

    Returns:
        list of CloudResource, production trap first.
    """
    min_ratio = 0.35

    # Guaranteed prod trap — low CPU makes it look safe to delete.
    resources = [CloudResource(
        name         = random.choice(PROD_NAMES),
        cost_per_hr  = round(random.uniform(0.5, 3.0), 2),
        cpu_pct      = random.randint(2, 12),
        dependencies = random.sample(TEMP_NAMES, k=random.randint(2, 4)),
        is_prod      = True,
    )]

    # Fill remaining slots with a random mix (30% chance of production).
    fill_slots = n - 1
    for slot in range(fill_slots):
        is_prod = random.random() < 0.30
        # If every earlier draw came up production, force the last slot to be
        # temporary: with no temp resource there is nothing to scale below and
        # the 35% guarantee would silently fail.
        if is_prod and slot == fill_slots - 1 and all(r.is_prod for r in resources):
            is_prod = False
        name_pool = PROD_NAMES if is_prod else TEMP_NAMES
        dep_count = random.randint(1, 5) if is_prod else random.randint(0, 3)
        resources.append(CloudResource(
            name         = random.choice(name_pool),
            cost_per_hr  = round(random.uniform(0.8, 4.0), 2),
            cpu_pct      = random.randint(1, 95),
            dependencies = random.sample(TEMP_NAMES, k=min(dep_count, len(TEMP_NAMES))),
            is_prod      = is_prod,
        ))

    # Raise temp costs in 30% steps until they reach the minimum share.
    # The bound is generous (1.3**64) so larger pools also converge; results
    # are identical to a 10-round cap whenever 10 rounds were enough.
    temps = [r for r in resources if not r.is_prod]
    for _ in range(64):
        total = sum(r.cost_per_hr for r in resources)
        temp_total = sum(r.cost_per_hr for r in temps)
        if not temps or total <= 0 or (temp_total / total) >= min_ratio:
            break
        for r in temps:
            r.cost_per_hr = round(r.cost_per_hr * 1.3, 2)
    return resources
+# ─── Core Environment (OpenEnv dict API) ─────────────────────────────────────
class AWSCostEnv:
    """
    Cloud FinOps optimisation environment with a dict-based API.

    ``reset()`` and ``step()`` return plain dicts; wrap with SB3Adapter for
    the Gymnasium tuple API.

    REWARD FORMULA
    --------------
    Savings   : clip(delta_cost_pct * W_SAVINGS, -5, +5)
    Win bonus : +W_WIN_BONUS when total savings >= target (episode ends)
    NOOP      : -W_HESITATION per step
    Tool      : +W_TOOL per new node checked while under W_TOOL_EPISODE_CAP,
                otherwise -W_HESITATION
    No effect : -W_HESITATION when RESIZE/STOP/DELETE (or an unknown action
                id) leaves the cost unchanged, e.g. the target is already
                inactive. Without this such actions scored 0.0, which beats
                NOOP and re-opens the passive-policy loophole.
    Veto      : PENALTY_VETO (guardrail blocked a STOP/DELETE)
    Crash     : PENALTY_CRASH, episode ends immediately

    NOTE(review): the action id alone selects the target resource (see
    ``_resource_from_action``), so only a few resource slots are reachable.
    """

    # ── Reward weights (do not change without updating Cell 4 too) ──────────
    W_SAVINGS          = 20.0
    W_HESITATION       = 0.10    # per-step penalty for doing nothing useful
    W_TOOL             = 0.20
    W_TOOL_EPISODE_CAP = 0.60    # max tool reward per episode (3 uses)
    W_WIN_BONUS        = 2.0     # bonus for reaching the savings target
    PENALTY_CRASH      = -10.0
    PENALTY_VETO       = -0.50
    MAX_STEPS          = 100

    def __init__(self, n_resources=N_RESOURCES, target_savings=0.20):
        self.n_resources    = n_resources
        self.target_savings = target_savings
        self.resources      = []
        self.baseline_cost  = 0.0
        self.current_cost   = 0.0
        self.current_step   = 0
        self.nodes_investigated_this_episode = set()
        self.total_tool_reward_this_episode  = 0.0

    # ── Private helpers ──────────────────────────────────────────────────────
    def _resource_from_action(self, action_idx):
        """Map an action id to its target resource.

        NOTE(review): with action ids 0-4 this only ever reaches slots
        (id - 2) % n_resources; confirm that is intended.
        """
        idx = (action_idx - 2) % self.n_resources
        return self.resources[idx % len(self.resources)]

    def _has_dependency_violation(self, resource):
        """True if another active, differently named resource depends on this one."""
        return any(
            other.active
            and other.name != resource.name
            and resource.name in other.dependencies
            for other in self.resources
        )

    def _calc_cost(self):
        """Sum of hourly cost over active resources."""
        return sum(r.cost_per_hr for r in self.resources if r.active)

    def _savings_fraction(self):
        """Fraction of baseline cost saved so far; 0.0 when baseline is not positive."""
        if self.baseline_cost <= 0:
            return 0.0
        return (self.baseline_cost - self.current_cost) / self.baseline_cost

    def _savings_pct(self):
        """Savings as a percentage rounded to 2 decimals; 0.0 when baseline is not positive."""
        if self.baseline_cost <= 0:
            return 0.0
        return round((1 - self.current_cost / self.baseline_cost) * 100, 2)

    def _get_obs(self):
        """Flat float32 vector: per-resource features, then savings fraction and steps left."""
        obs = []
        for r in self.resources:
            obs.extend(r.to_obs_array())
        budget_used = (
            1.0 - (self.current_cost / self.baseline_cost)
            if self.baseline_cost > 0 else 0.0
        )
        steps_left = 1.0 - (self.current_step / self.MAX_STEPS)
        obs.extend([budget_used, steps_left])
        return np.array(obs, dtype=np.float32)

    def _get_internal_state(self):
        """Human-readable state dict (step, costs, savings, per-resource details)."""
        return {
            "step":          self.current_step,
            "baseline_cost": self.baseline_cost,
            "current_cost":  self.current_cost,
            "savings_pct":   self._savings_pct(),
            "resources": [{
                "name":         r.name,
                "active":       r.active,
                "is_prod":      r.is_prod,
                "cost_per_hr":  r.cost_per_hr,
                "cpu_pct":      r.cpu_pct,
                "dependencies": r.dependencies,
            } for r in self.resources]
        }

    def _semantic_veto(self, name: str, dep_count: int) -> bool:
        """
        Guardrail: return True if a STOP/DELETE on this resource must be blocked.

        Two triggers:
          1. The name contains a production keyword.
          2. The resource has 5 or more dependencies, whatever its name.
        """
        name_lower    = name.lower()
        prod_keywords = (
            "prod", "primary", "main", "core",
            "billing", "payment", "rds", "master",
        )
        if any(kw in name_lower for kw in prod_keywords):
            return True
        return dep_count >= 5

    def _result(self, reward, done, msg, win=False, savings_pct=None):
        """Build the step-result dict shared by every branch of ``step``."""
        if savings_pct is None:
            savings_pct = self._savings_pct()
        return {
            "observation": self._get_obs(),
            "state":       self._get_internal_state(),
            "reward":      float(reward),
            "done":        bool(done),
            "info":        {"msg": msg, "win": win, "savings_pct": savings_pct},
        }

    # ── Lifecycle ─────────────────────────────────────────────────────────────
    def reset(self):
        """Start a new episode with fresh resources. Returns {"observation", "info"}."""
        self.current_step   = 0
        self.nodes_investigated_this_episode = set()
        self.total_tool_reward_this_episode  = 0.0
        self.resources      = generate_resources(self.n_resources)
        self.baseline_cost  = self._calc_cost()
        self.current_cost   = self.baseline_cost
        return {
            "observation": self._get_obs(),
            "info": {
                "msg":           "Episode reset",
                "baseline_cost": self.baseline_cost,
            }
        }

    def step(self, action):
        """
        Execute one environment step.

        Args:
            action : int, one of the Action enum values (0-4)
        Returns:
            dict with keys: observation, state, reward, done, info
            (info carries msg, win and savings_pct)
        """
        self.current_step += 1
        truncated = self.current_step >= self.MAX_STEPS

        # ── 1. NOOP — hesitation penalty ──────────────────────────────────
        if action == Action.NOOP:
            return self._result(-self.W_HESITATION, truncated, "Hesitation penalty")

        target = self._resource_from_action(action)

        # ── 2. CHECK_DEPENDENCIES ─────────────────────────────────────────
        # Rewarded only for a not-yet-checked node while under the episode
        # cap; otherwise penalised like NOOP so checking cannot be farmed.
        if action == Action.CHECK_DEPENDENCIES:
            under_cap = self.total_tool_reward_this_episode < self.W_TOOL_EPISODE_CAP
            new_node  = target.name not in self.nodes_investigated_this_episode
            if new_node and under_cap:
                self.nodes_investigated_this_episode.add(target.name)
                self.total_tool_reward_this_episode += self.W_TOOL
                tool_reward = self.W_TOOL
                msg = f"Checked {target.name}"
            else:
                tool_reward = -self.W_HESITATION
                msg = "Tool cap reached — penalised"
            return self._result(tool_reward, truncated, msg)

        # ── 3. SEMANTIC + STRUCTURAL GUARDRAIL ────────────────────────────
        danger = action in (Action.STOP, Action.DELETE)
        if danger and self._semantic_veto(target.name, len(target.dependencies)):
            return self._result(
                self.PENALTY_VETO, truncated, f"SEMANTIC VETO on {target.name}"
            )

        # ── 4. EXECUTE ACTION ─────────────────────────────────────────────
        prev_cost = self.current_cost
        if action == Action.RESIZE:
            if target.active:
                old_cost = target.cost_per_hr
                target.cost_per_hr = round(
                    target.cost_per_hr * random.uniform(0.40, 0.65), 2
                )
                # Rounding can cancel the reduction for tiny costs; retry
                # with a plain halving.
                if target.cost_per_hr >= old_cost:
                    target.cost_per_hr = round(old_cost * 0.50, 2)
        elif danger:
            # ── 5. STRUCTURAL DEPENDENCY CHECK ────────────────────────────
            if self._has_dependency_violation(target):
                return self._result(
                    self.PENALTY_CRASH, True,
                    f"CATASTROPHIC FAILURE: {target.name}",
                )
            target.active = False

        # ── 6. FINANCIAL REWARD ───────────────────────────────────────────
        self.current_cost = self._calc_cost()
        total_saved = self._savings_fraction()

        # An action that did not lower the cost is treated like NOOP;
        # otherwise repeating it would be a free 0.0-reward move.
        if self.current_cost >= prev_cost:
            goal_already_met = total_saved >= self.target_savings
            return self._result(
                -self.W_HESITATION,
                truncated or goal_already_met,
                "No effect — penalised",
            )

        delta_pct      = (prev_cost - self.current_cost) / self.baseline_cost
        savings_reward = float(np.clip(delta_pct * self.W_SAVINGS, -5.0, 5.0))

        # ── 7. WIN CONDITION + BONUS ──────────────────────────────────────
        is_win = total_saved >= self.target_savings
        if is_win:
            savings_reward += self.W_WIN_BONUS
        return self._result(
            savings_reward,
            is_win or truncated,
            "Win!" if is_win else "Action Successful",
            win=is_win,
            savings_pct=round(total_saved * 100, 2),
        )
+# ─── SB3 Adapter (Gymnasium wrapper for PPO) ─────────────────────────────────
class SB3Adapter(gym.Env):
    """
    Wraps AWSCostEnv (dict API) in the Gymnasium 5-tuple API.

    terminated = the episode reached a terminal state: the savings target was
                 met (win) or the core ended the episode early (crash)
    truncated  = the core's MAX_STEPS limit was reached without a win
    """
    metadata = {"render_modes": []}

    def __init__(self):
        super().__init__()
        self.core = AWSCostEnv()
        self.action_space = spaces.Discrete(NUM_ACTIONS)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(OBS_DIM,), dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        """Reset the wrapped env; returns ``(observation, info)``."""
        super().reset(seed=seed)
        if seed is not None:
            # Resource generation draws from the global ``random`` module, so
            # it must be seeded too for ``reset(seed=...)`` to be reproducible.
            random.seed(seed)
        result = self.core.reset()
        return result["observation"], result["info"]

    def step(self, action):
        """Step the wrapped env; returns ``(obs, reward, terminated, truncated, info)``."""
        result = self.core.step(action)
        done   = result["done"]
        win    = result["info"].get("win", False)
        # Only a done caused by the step limit is a truncation. An early
        # done without a win is a real terminal state and must not be
        # reported as truncated.
        hit_limit  = self.core.current_step >= self.core.MAX_STEPS
        terminated = bool(done and (win or not hit_limit))
        truncated  = bool(done and not terminated)
        return (
            result["observation"],
            result["reward"],
            terminated,
            truncated,
            result["info"],
        )

    def render(self):
        """No rendering is implemented."""
        pass

cloud_arena/llm_training.py ADDED Viewed

	@@ -0,0 +1,271 @@

+# ============================================================
+# LLM RL Training — Gemma 7B (default model) + REINFORCE + LoRA
+# This is the LLM model, SEPARATE from the mathematical model.
+# Uses AWSCostEnv (llm_environment.py), NOT CloudArenaEnv.
+# ============================================================
+import os
+import re
+import json
+import time
+import warnings
+import numpy as np
+import torch
+import torch.nn.functional as F
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+from cloud_arena.llm_environment import SB3Adapter, Action, AWSCostEnv
+# ─── Constants ────────────────────────────────────────────────────────────────
# Short display names for the environment's integer action ids.
ACTION_NAMES = {
    0: "NOOP",
    1: "CHECK_DEPS",
    2: "RESIZE",
    3: "STOP",
    4: "DELETE",
}

# Torch device string used for the model and its input tensors.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def format_prompt(state_dict):
    """Render the environment state as the text prompt given to the LLM.

    Args:
        state_dict: Mapping with ``resources`` (each entry having ``name``,
            ``active``, ``is_prod``, ``cost_per_hr``, ``cpu_pct`` and
            ``dependencies``), ``baseline_cost``, ``current_cost`` and
            optionally ``savings_pct`` (defaults to 0.0).

    Returns:
        The prompt string. It ends with ``REASONING:`` and tells the model to
        finish with an ``ACTION: <number>`` line, which is the format the
        response parser looks for first.
    """
    resource_lines = []
    for r in state_dict["resources"]:
        status = "ACTIVE" if r["active"] else "STOPPED"
        tag = "PRODUCTION" if r["is_prod"] else "Temporary"
        resource_lines.append(
            f"  - {r['name']} [{status}] ({tag}): "
            f"Cost=${r['cost_per_hr']:.2f}/hr, CPU={r['cpu_pct']}%, "
            f"Deps={len(r['dependencies'])}\n"
        )
    resources_text = "".join(resource_lines)
    savings_pct = state_dict.get("savings_pct", 0.0)
    return (
        f"You are a Cloud FinOps AI. Reduce cloud cost by >=20% without breaking production.\n\n"
        f"Actions: 0=NOOP, 1=CHECK_DEPS, 2=RESIZE, 3=STOP, 4=DELETE\n\n"
        f"Resources:\n{resources_text}\n"
        f"Baseline: ${state_dict['baseline_cost']:.2f}/hr | "
        f"Current: ${state_dict['current_cost']:.2f}/hr | "
        f"Savings: {savings_pct:.1f}%\n\n"
        f"Rules:\n"
        f"- Never delete/stop prod resources or those with >=5 deps\n"
        f"- Temp resources with 0-1 deps are safe to delete\n"
        f"- RESIZE is always safe\n\n"
        # Without this instruction the model is never told the output format
        # the parser expects, and parsing falls back to guessing from digits.
        f"Give brief reasoning, then end with a final line of the form "
        f"'ACTION: <number>'.\n\n"
        f"REASONING:"
    )
def extract_action_and_reasoning(response_text):
    """Parse the chosen action id out of the model's free-text response.

    Resolution order:
      1. The first ``ACTION: <n>`` tag (case-insensitive, markdown ``*``
         allowed around the colon). The whole number is read, so
         ``ACTION: 12`` is rejected instead of being read as 1. A tagged
         value outside 0-4 yields the default.
      2. If there is no tag, the last standalone digit 0-4 within the final
         50 characters.
      3. Otherwise the default action 2 (RESIZE).

    Args:
        response_text: Raw decoded model output.

    Returns:
        ``(action, reasoning)`` where ``reasoning`` is the stripped response.
    """
    default_action = 2
    reasoning = response_text.strip()

    tagged = re.search(r'ACTION\s*:[\s*]*(\d+)', response_text, re.IGNORECASE)
    if tagged:
        parsed = int(tagged.group(1))
        return (parsed if 0 <= parsed <= 4 else default_action), reasoning

    trailing_digits = re.findall(r'\b([0-4])\b', response_text[-50:])
    if trailing_digits:
        return int(trailing_digits[-1]), reasoning
    return default_action, reasoning
def policy_gradient_step(model, tokenizer, prompt, response_text, reward, optimizer):
    """Apply one REINFORCE update for a single (prompt, response, reward) sample.

    The loss is ``-reward * mean(log p(response token))``, with gradients
    clipped to norm 1.0 before the optimizer step.

    Args:
        model: Causal LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Prompt text that produced the response.
        response_text: Decoded model response.
        reward: Scalar reward for the step.
        optimizer: Optimizer over the trainable parameters.

    Returns:
        The loss as a Python float, or 0.0 when no response tokens remain
        after tokenisation/truncation (no update is made in that case).
    """
    encodings = tokenizer(
        prompt + response_text, return_tensors="pt", truncation=True, max_length=512
    ).to(DEVICE)
    prompt_ids = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    )["input_ids"]
    prompt_len = prompt_ids.shape[1]

    input_ids = encodings["input_ids"]
    targets = input_ids[:, prompt_len:]
    if targets.shape[1] == 0:
        # Nothing to score (empty response or prompt filled the 512-token
        # window); skip the forward pass entirely.
        return 0.0

    # No ``labels`` argument: only the logits are needed, so there is no
    # point having the model compute its own language-modelling loss.
    outputs = model(**encodings)
    # Shift by one: logits at position t score the token at position t + 1.
    logits = outputs.logits[:, prompt_len - 1:-1, :]
    if logits.shape[1] == 0:
        return 0.0

    span = min(logits.shape[1], targets.shape[1])
    logits = logits[:, :span, :]
    targets = targets[:, :span]

    log_probs = F.log_softmax(logits, dim=-1)
    token_log_probs = log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)
    loss = -reward * token_log_probs.mean()

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    return loss.item()
def run_episode(model, tokenizer, env, is_training=False, optimizer=None,
                steps_per_episode=5, max_new_tokens=128):
    """Play one episode with the LLM as the policy, optionally training it.

    Each step formats the env state as a prompt, samples a response, parses
    an action from it and steps the env. When ``is_training`` is set and an
    optimizer is given, a policy-gradient update follows every step.

    Args:
        model: Causal LM used for generation (and updates).
        tokenizer: Tokenizer matching ``model``.
        env: Gymnasium-style env exposing ``core._get_internal_state()``.
        is_training: Whether to apply updates.
        optimizer: Optimizer used for updates; required for training.
        steps_per_episode: Maximum number of steps to play.
        max_new_tokens: Generation budget per step.

    Returns:
        ``(episode_reward, reasoning_log)`` — the summed reward and one log
        dict per step.
    """
    env.reset()
    state_dict = env.core._get_internal_state()
    episode_reward = 0.0
    reasoning_log = []
    done = False
    step_count = 0

    while not done and step_count < steps_per_episode:
        prompt = format_prompt(state_dict)
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=512
        ).to(DEVICE)
        prompt_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            # Pass the whole encoding so generate() receives the attention
            # mask instead of having to infer it.
            gen_outputs = model.generate(
                **inputs, max_new_tokens=max_new_tokens,
                do_sample=True, temperature=0.7, top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
            )
        # Keep only the newly generated tokens.
        response_ids = gen_outputs[0][prompt_len:]
        response_text = tokenizer.decode(response_ids, skip_special_tokens=True)
        action, reasoning = extract_action_and_reasoning(response_text)

        _next_obs, reward, terminated, truncated, next_info = env.step(action)
        done = terminated or truncated
        episode_reward += reward
        reasoning_log.append({
            "step": step_count + 1,
            "reasoning": reasoning[:300],
            "action": action,
            "action_name": ACTION_NAMES.get(action, "UNKNOWN"),
            "reward": round(reward, 4),
            "message": next_info.get("msg", ""),
        })

        if is_training and optimizer is not None:
            policy_gradient_step(model, tokenizer, prompt, response_text, reward, optimizer)

        state_dict = env.core._get_internal_state()
        step_count += 1
    return episode_reward, reasoning_log
def train_llm(model_name="google/gemma-7b-it",
              num_iterations=10, steps_per_episode=5, learning_rate=5e-5,
              progress_callback=None):
    """
    Full LLM RL training pipeline: load model + LoRA, evaluate, train, evaluate.

    Args:
        model_name: Hugging Face model id to load.
        num_iterations: Number of training episodes.
        steps_per_episode: Step budget for every episode.
        learning_rate: AdamW learning rate for the trainable parameters.
        progress_callback: Optional callable receiving the log-so-far
            (newline-joined) after every log line.

    Returns:
        (all_rewards, full_log, graph_path, log_text) where all_rewards is
        [pre-eval, one per iteration..., post-eval], full_log holds one dict
        per phase, graph_path is the saved plot and log_text is the joined
        log. The per-phase log is also written to
        outputs/llm_training_log.json.
    """
    hf_token = os.environ.get("HF_TOKEN")
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import get_peft_model, LoraConfig, TaskType

    log_lines = []

    def log(msg):
        print(msg)
        log_lines.append(msg)
        if progress_callback:
            progress_callback("\n".join(log_lines))

    log(f"🖥️  Device: {DEVICE}")
    log(f"🧠 Model: {model_name}")
    log(f"🔁 Iterations: {num_iterations}")
    log("📦 Loading model and tokenizer...")

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.bfloat16, token=hf_token,
    ).to(DEVICE)
    lora_config = LoraConfig(
        r=16, lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.0, bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    log(f"✅ Model loaded. Trainable: {trainable:,} / {total:,} params")

    optimizer = torch.optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate
    )
    env = SB3Adapter()
    all_rewards = []
    full_log = []

    # Pre-training eval
    log("\n▶ PRE-TRAINING EVAL")
    model.eval()
    pre_reward, pre_log_data = run_episode(model, tokenizer, env, steps_per_episode=steps_per_episode)
    all_rewards.append(pre_reward)
    full_log.append({"phase": "pre-training", "reward": pre_reward, "reasoning": pre_log_data})
    log(f"  Reward: {pre_reward:+.3f}")

    # Training
    log(f"\n▶ TRAINING ({num_iterations} iterations)")
    model.train()
    for i in range(num_iterations):
        reward, train_log_data = run_episode(
            model, tokenizer, env, is_training=True, optimizer=optimizer,
            steps_per_episode=steps_per_episode,
        )
        all_rewards.append(reward)
        full_log.append({"phase": f"training-{i+1}", "reward": reward, "reasoning": train_log_data})
        log(f"  Iter {i+1}/{num_iterations}: reward={reward:+.3f}")

    # Post-training eval
    log("\n▶ POST-TRAINING EVAL")
    model.eval()
    post_reward, post_log_data = run_episode(model, tokenizer, env, steps_per_episode=steps_per_episode)
    all_rewards.append(post_reward)
    full_log.append({"phase": "post-training", "reward": post_reward, "reasoning": post_log_data})
    log(f"  Reward: {post_reward:+.3f}")

    delta = all_rewards[-1] - all_rewards[0]
    log(f"\n✅ DONE | Pre: {all_rewards[0]:+.3f} → Post: {all_rewards[-1]:+.3f} | Δ={delta:+.3f}")

    # Save log. Create the directory here so the function also works when
    # the caller has not prepared ./outputs beforehand.
    os.makedirs("outputs", exist_ok=True)
    with open("outputs/llm_training_log.json", "w", encoding="utf-8") as f:
        json.dump(full_log, f, indent=2, default=str)

    # Generate graph
    graph_path = _generate_graph(all_rewards, num_iterations, model_name)
    return all_rewards, full_log, graph_path, "\n".join(log_lines)
def _generate_graph(all_rewards, num_iterations, model_name):
    """Plot per-episode rewards and a before/after comparison; return the PNG path.

    Args:
        all_rewards: Non-empty reward list: first entry is the pre-training
            eval, last is the post-training eval, the rest are training
            iterations.
        num_iterations: Kept for interface compatibility. The number of
            bars is taken from ``all_rewards`` so that labels, colours and
            bars always line up.
        model_name: Model id; the part after the last ``/`` goes in the title.

    Returns:
        Path of the saved image (``outputs/llm_training_results.png``).
    """
    n_bars = len(all_rewards)
    n_middle = max(n_bars - 2, 0)
    labels = (["Before"] + [f"Iter {i+1}" for i in range(n_middle)] + ["After"])[:n_bars]
    colors = (["#ef4444"] + ["#3b82f6"] * n_middle + ["#22c55e"])[:n_bars]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6), facecolor="#0e1117")
    for ax in (ax1, ax2):
        ax.set_facecolor("#0e1117")
        ax.tick_params(colors="#e6e6e6")
        ax.grid(axis="y", alpha=0.1, color="white")
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)
        for side in ("left", "bottom"):
            ax.spines[side].set_color("#333")

    # Left panel: one bar per episode.
    ax1.bar(range(n_bars), all_rewards, color=colors, edgecolor="white", lw=1.5, width=0.6)
    ax1.set_xticks(range(n_bars))
    ax1.set_xticklabels(labels, fontsize=8, color="#e6e6e6", rotation=45)
    ax1.set_title(f"LLM RL: {model_name.split('/')[-1]}", color="#e6e6e6", fontsize=13, fontweight="bold")
    ax1.set_ylabel("Reward", color="#e6e6e6")

    # Right panel: first vs last reward.
    comp = [all_rewards[0], all_rewards[-1]]
    ax2.bar(["Before", "After"], comp, color=["#ef4444", "#22c55e"], edgecolor="white", lw=2, width=0.5)
    ax2.set_title("Before vs After", color="#e6e6e6", fontsize=13, fontweight="bold")
    ax2.set_ylabel("Reward", color="#e6e6e6")

    fig.tight_layout()
    os.makedirs("outputs", exist_ok=True)
    path = "outputs/llm_training_results.png"
    fig.savefig(path, dpi=200, bbox_inches="tight", facecolor="#0e1117")
    # Close this figure explicitly rather than whichever one is current.
    plt.close(fig)
    return path

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-# ── Mathematical Model RL Dependencies ONLY ──
-# DO NOT add transformers/peft/trl here — those belong to the LLM model
 gymnasium>=0.29.0
 stable-baselines3>=2.3.0
 sb3-contrib>=2.3.0
@@ -7,3 +6,10 @@ numpy>=1.24.0
 torch>=2.0.0
 matplotlib>=3.7.0
 gradio>=4.0.0

+# ── Mathematical Model RL ──────────────────────────────────
 gymnasium>=0.29.0
 stable-baselines3>=2.3.0
 sb3-contrib>=2.3.0
 torch>=2.0.0
 matplotlib>=3.7.0
 gradio>=4.0.0
+# ── LLM Model RL (Gemma 7B + LoRA) ────────────────────────
+transformers>=4.40.0
+peft>=0.10.0
+accelerate>=0.30.0
+bitsandbytes>=0.43.0
+sentencepiece