Spaces:

Naseer-010
/

DIME

Configuration error

App Files Files Community

Naseer-010 commited on Apr 26

Commit

09fcbfb

1 Parent(s): b54ab02

fixing the reward tirage

Browse files

Files changed (2) hide show

inference.py +50 -18
train_grpo_unsloth.py +219 -160

inference.py CHANGED Viewed

@@ -90,33 +90,37 @@ Available commands:
 CRITICAL INCIDENT TRIAGE TREE (Follow strictly in order):
 1. OOM IMMINENT (Memory Leak): IF ANY 'mem_utilizations' > 0.92:
    IMMEDIATELY output: kubectl delete pod node-5 (or whichever node is leaking. Scaling does NOT fix memory leaks!)
-2. SPLIT-BRAIN (Disk I/O Bottleneck): IF node_0 'io_wait' > 0.80:
    Output: kubectl throttle ingress --rate=0.5 (Do NOT scale up; more workers will lock the DB disk further).
-3. HOT SHARD (Load Balancer Skew): IF one worker's CPU > 0.90 but the cluster average is low:
    Output: kubectl exec -it istio-proxy -- traffic shift --from=<high_cpu_node> --to=<low_cpu_node>
-4. RETRY STORM / THUNDERING HERD: IF 'p99_latency' > 100.0 AND traffic is spiking:
    Output: kubectl throttle ingress --rate=0.4 (Break the exponential retry loop).
-5. CONNECTION DEADLOCK (Zombie Node): IF a worker's CPU is incredibly low (< 0.10) BUT 'p99_latency' is huge:
    Output: kubectl exec -it istio-proxy -- traffic shift --from=<zombie_node> --to=<healthy_node>
-6. BLACK SWAN (Multi-Node Death): IF multiple nodes are in 'failed_nodes':
    Output: kubectl throttle ingress --rate=0.3 (Shed load to protect survivors while you recover).
-7. DATABASE SURVIVAL: IF node-0 (DB) cpu_load > 0.80:
    Output: kubectl throttle ingress --rate=0.7
-8. SAFE SCALING: IF avg worker CPU > 0.75 AND 'error_budget' > 20:
    Output: kubectl scale deployment frontend --replicas=10
-9. HEALTHY / FLAPPING TRAP: If metrics are stable or oscillating slightly:
    Output: no_op
 Respond using the following STRICT format. You must include the XML reasoning tags:
-<reasoning>Diagnose the telemetry. Identify which of the 9 Triage rules applies.</reasoning>
 <action>
 {"command": "your_kubectl_command_or_no_op_here"}
 </action>"""
@@ -649,16 +653,30 @@ def _get_direct_env():
     global _direct_env
     if _direct_env is None:
         from server.environment import DistributedInfraEnvironment
         _direct_env = DistributedInfraEnvironment()
     return _direct_env
 def _infraobs_to_dict(obs) -> dict:
     keys = [
-        "cpu_loads", "mem_utilizations", "queue_lengths", "failed_nodes",
-        "latency_ms", "request_rate", "io_wait", "p99_latency", "error_budget",
-        "step", "task_hint", "action_errors", "cloud_budget",
-        "task_score", "done", "reward", "uptime_pct",
     ]
     return {k: getattr(obs, k) for k in keys if hasattr(obs, k)}
@@ -669,6 +687,7 @@ def env_reset_direct(task_id: str) -> dict:
 def env_step_direct(action_dict: dict) -> dict:
     from server.models import InfraAction
     env = _get_direct_env()
     raw_cmd = action_dict.get("raw_command")
     if raw_cmd and raw_cmd != "no_op":
@@ -676,7 +695,10 @@ def env_step_direct(action_dict: dict) -> dict:
     else:
         act_type = action_dict.get("action_type", "no_op")
         kwargs: dict = {"action_type": act_type}
-        if act_type in ("restart_node", "query_logs") and action_dict.get("target") is not None:
             kwargs["target"] = int(action_dict["target"])
         elif act_type == "reroute_traffic":
             kwargs["from_node"] = int(action_dict.get("from_node", 0))
@@ -689,7 +711,13 @@ def env_step_direct(action_dict: dict) -> dict:
             action = InfraAction(action_type="no_op")
     obs = env.step(action)
     obs_dict = _infraobs_to_dict(obs)
-    return {"data": {"observation": obs_dict, "reward": getattr(obs, "reward", 0.0), "done": getattr(obs, "done", False)}}
 # ---------------------------------------------------------------------------
@@ -764,7 +792,11 @@ def run_task(
         done = False
         try:
-            result = env_step_direct(backend_action) if use_direct else env_step(env_url, backend_action)
             data_block = result.get("data", result)
             if "observation" in data_block and isinstance(
@@ -923,7 +955,7 @@ def main():
         print(f"  STRUCTURED LOGS: {log_dir}")
         print("==================================================")
-        use_direct = (mode == "local")
         for task_id in tasks:
             stats = run_task(
                 task_id,

 CRITICAL INCIDENT TRIAGE TREE (Follow strictly in order):
 1. OOM IMMINENT (Memory Leak): IF ANY 'mem_utilizations' > 0.92:
    IMMEDIATELY output: kubectl delete pod node-5 (or whichever node is leaking. Scaling does NOT fix memory leaks!)
+2. DB RECOVERY: IF node-0 is in 'failed_nodes':
+   Output: kubectl delete pod node-0
+   (The DB is a SPOF. If it's dead, ALL other actions are futile until it restarts.)
+3. SPLIT-BRAIN (Disk I/O Bottleneck): IF node_0 'io_wait' > 0.80:
    Output: kubectl throttle ingress --rate=0.5 (Do NOT scale up; more workers will lock the DB disk further).
+4. HOT SHARD (Load Balancer Skew): IF one worker's CPU > 0.90 but the cluster average is low:
    Output: kubectl exec -it istio-proxy -- traffic shift --from=<high_cpu_node> --to=<low_cpu_node>
+5. RETRY STORM / THUNDERING HERD: IF 'p99_latency' > 100.0 AND traffic is spiking:
    Output: kubectl throttle ingress --rate=0.4 (Break the exponential retry loop).
+6. CONNECTION DEADLOCK (Zombie Node): IF a worker's CPU is incredibly low (< 0.10) BUT 'p99_latency' is huge:
    Output: kubectl exec -it istio-proxy -- traffic shift --from=<zombie_node> --to=<healthy_node>
+7. BLACK SWAN (Multi-Node Death): IF multiple nodes are in 'failed_nodes' (but DB is alive):
    Output: kubectl throttle ingress --rate=0.3 (Shed load to protect survivors while you recover).
+8. DATABASE SURVIVAL: IF node-0 (DB) cpu_load > 0.80:
    Output: kubectl throttle ingress --rate=0.7
+9. SAFE SCALING: IF avg worker CPU > 0.75 AND 'error_budget' > 20:
    Output: kubectl scale deployment frontend --replicas=10
+10. HEALTHY / FLAPPING TRAP: If metrics are stable or oscillating slightly:
    Output: no_op
 Respond using the following STRICT format. You must include the XML reasoning tags:
+<reasoning>Diagnose the telemetry. Identify which of the 10 Triage rules applies.</reasoning>
 <action>
 {"command": "your_kubectl_command_or_no_op_here"}
 </action>"""
     global _direct_env
     if _direct_env is None:
         from server.environment import DistributedInfraEnvironment
         _direct_env = DistributedInfraEnvironment()
     return _direct_env
 def _infraobs_to_dict(obs) -> dict:
     keys = [
+        "cpu_loads",
+        "mem_utilizations",
+        "queue_lengths",
+        "failed_nodes",
+        "latency_ms",
+        "request_rate",
+        "io_wait",
+        "p99_latency",
+        "error_budget",
+        "step",
+        "task_hint",
+        "action_errors",
+        "cloud_budget",
+        "task_score",
+        "done",
+        "reward",
+        "uptime_pct",
     ]
     return {k: getattr(obs, k) for k in keys if hasattr(obs, k)}
 def env_step_direct(action_dict: dict) -> dict:
     from server.models import InfraAction
     env = _get_direct_env()
     raw_cmd = action_dict.get("raw_command")
     if raw_cmd and raw_cmd != "no_op":
     else:
         act_type = action_dict.get("action_type", "no_op")
         kwargs: dict = {"action_type": act_type}
+        if (
+            act_type in ("restart_node", "query_logs")
+            and action_dict.get("target") is not None
+        ):
             kwargs["target"] = int(action_dict["target"])
         elif act_type == "reroute_traffic":
             kwargs["from_node"] = int(action_dict.get("from_node", 0))
             action = InfraAction(action_type="no_op")
     obs = env.step(action)
     obs_dict = _infraobs_to_dict(obs)
+    return {
+        "data": {
+            "observation": obs_dict,
+            "reward": getattr(obs, "reward", 0.0),
+            "done": getattr(obs, "done", False),
+        }
+    }
 # ---------------------------------------------------------------------------
         done = False
         try:
+            result = (
+                env_step_direct(backend_action)
+                if use_direct
+                else env_step(env_url, backend_action)
+            )
             data_block = result.get("data", result)
             if "observation" in data_block and isinstance(
         print(f"  STRUCTURED LOGS: {log_dir}")
         print("==================================================")
+        use_direct = mode == "local"
         for task_id in tasks:
             stats = run_task(
                 task_id,

train_grpo_unsloth.py CHANGED Viewed

@@ -26,6 +26,7 @@ Post-training benchmark:
 # UNSLOTH_VLLM_STANDBY must be set before unsloth is imported.
 import os
 os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
 # ---------------------------------------------------------------------------
@@ -64,6 +65,7 @@ from server.models import InfraAction, InfraObservation
 from server.command_parser import parse_command, CommandParseError
 from server.rubrics import calculate_step_reward as _calculate_step_reward
 def _probe_rubrics() -> bool:
     """Return True if rubrics returns bounded rewards (main branch), False if -1000 (nithish)."""
     try:
@@ -75,33 +77,47 @@ def _probe_rubrics() -> bool:
     except Exception:
         return False
 _RUBRICS_BOUNDED = _probe_rubrics()
-print(f"[GRPO] rubrics version: {'main (bounded [-5,+5])' if _RUBRICS_BOUNDED else 'nithish (WARNING: -1000 cliff — using fallback)'}")
 # ---------------------------------------------------------------------------
 # Config
 # ---------------------------------------------------------------------------
-MODEL_NAME       = "unsloth/Qwen3-8B"
-MAX_SEQ_LENGTH   = 2048
-LORA_RANK        = 32
-OUTPUT_DIR       = "checkpoints/qwen3_grpo_unsloth"
-DATASET_EPISODES = 500      # env rollouts to build the training dataset
-MAX_STEPS        = 300      # GRPOTrainer update steps
-NUM_GENERATIONS  = 4        # G — completions per prompt; reward_env is CPU-bound, keep small
-MAX_COMPLETION_LENGTH = 512 # Qwen3 no-think response is ~60 tokens; 512 is a safe ceiling
-SAVE_STEPS       = 100
 ALL_TASKS = [
-    "traffic_spike", "node_failure", "cascading_failure", "flash_crowd",
-    "thundering_herd", "zombie_node", "hot_shard_skew", "memory_leak_slow_burn",
-    "split_brain_io_bottleneck", "black_swan_az_failure",
-    "retry_storm", "connection_pool_deadlock", "autoscaler_flapping_trap",
 ]
 # ---------------------------------------------------------------------------
-# System prompt — identical to inference.py + rule 8 (DB recovery)
 # ---------------------------------------------------------------------------
 SYSTEM_PROMPT = """You are an expert Site Reliability Engineer (SRE) managing a highly volatile Kubernetes cluster.
@@ -124,15 +140,16 @@ Available commands:
 CRITICAL INCIDENT TRIAGE TREE (Follow strictly in order):
 1. OOM IMMINENT: IF ANY 'mem_utilizations' > 0.92 → kubectl delete pod node-<leaking_node>
-2. SPLIT-BRAIN: IF node_0 'io_wait' > 0.80 → kubectl throttle ingress --rate=0.5
-3. HOT SHARD: IF one worker CPU > 0.90 but cluster average is low
    → kubectl exec -it istio-proxy -- traffic shift --from=<hot> --to=<cold>
-4. RETRY STORM: IF 'p99_latency' > 100.0 AND traffic spiking → kubectl throttle ingress --rate=0.4
-5. ZOMBIE NODE: IF worker CPU < 0.10 BUT 'p99_latency' is huge
    → kubectl exec -it istio-proxy -- traffic shift --from=<zombie> --to=<healthy>
-6. BLACK SWAN: IF multiple nodes in 'failed_nodes' → kubectl throttle ingress --rate=0.3
-7. DATABASE SURVIVAL: IF node-0 cpu_load > 0.80 → kubectl throttle ingress --rate=0.7
-8. DB RECOVERY: IF node-0 is in 'failed_nodes' → kubectl delete pod node-0
 9. SAFE SCALING: IF avg worker CPU > 0.75 AND 'error_budget' > 20
    → kubectl scale deployment frontend --replicas=10
 10. HEALTHY: If metrics are stable → no_op
@@ -147,26 +164,40 @@ Respond in this exact format:
 # Triage oracle — deterministic expected action for any observation
 # ---------------------------------------------------------------------------
 def _get_expected_action(obs: dict) -> str:
-    """Return the kubectl command the triage tree mandates, or 'no_op'."""
-    cpu   = obs.get("cpu_loads", [0.3] * 8)
-    mem   = obs.get("mem_utilizations", [0.2] * 8)
-    fail  = set(obs.get("failed_nodes", []))
-    io    = float(obs.get("io_wait", 0.0))
-    p99   = float(obs.get("p99_latency", 0.0))
-    rr    = float(obs.get("request_rate", 100.0))
-    bud   = float(obs.get("error_budget", 100.0))
-    # Rule 1: OOM
     for i, m in enumerate(mem):
         if float(m) > 0.92:
             return f"kubectl delete pod node-{i}"
-    # Rule 2: Split-brain
     if io > 0.80:
         return "kubectl throttle ingress --rate=0.5"
-    # Rule 3: Hot shard
     workers = [(i, float(c)) for i, c in enumerate(cpu[1:], 1) if float(c) >= 0]
     if workers:
         avg = sum(c for _, c in workers) / len(workers)
@@ -180,11 +211,11 @@ def _get_expected_action(obs: dict) -> str:
                 if dst is not None:
                     return f"kubectl exec -it istio-proxy -- traffic shift --from={i} --to={dst}"
-    # Rule 4: Retry storm
     if p99 > 100.0 and rr > 150:
         return "kubectl throttle ingress --rate=0.4"
-    # Rule 5: Zombie node
     for i, c in workers:
         if 0 <= c < 0.10 and p99 > 100.0:
             dst = next(
@@ -194,34 +225,42 @@ def _get_expected_action(obs: dict) -> str:
             if dst is not None:
                 return f"kubectl exec -it istio-proxy -- traffic shift --from={i} --to={dst}"
-    # Rule 6: Black swan
     if len(fail) >= 2:
         return "kubectl throttle ingress --rate=0.3"
-    # Rule 7: DB survival
     db_cpu = float(cpu[0]) if cpu and float(cpu[0]) >= 0 else 0.0
     if db_cpu > 0.80:
         return "kubectl throttle ingress --rate=0.7"
-    # Rule 8: DB recovery  ← this rule was MISSING from inference.py
-    if 0 in fail:
-        return "kubectl delete pod node-0"
     # Rule 9: Safe scaling
     if workers and sum(c for _, c in workers) / len(workers) > 0.75 and bud > 20:
         return "kubectl scale deployment frontend --replicas=10"
     return "no_op"
 # ---------------------------------------------------------------------------
 # Dataset collection
 # ---------------------------------------------------------------------------
 def _obs_to_dict(obs: InfraObservation) -> dict:
     keys = [
-        "cpu_loads", "mem_utilizations", "queue_lengths", "failed_nodes",
-        "latency_ms", "request_rate", "io_wait", "p99_latency", "error_budget",
-        "step", "task_hint", "action_errors", "cloud_budget",
     ]
     return {k: getattr(obs, k) for k in keys if hasattr(obs, k)}
@@ -231,11 +270,14 @@ def _heuristic_action(obs: InfraObservation) -> InfraAction:
     if random.random() < 0.30:
         atype = random.choice(["no_op", "restart_node", "throttle", "scale_up"])
         if atype == "restart_node":
-            return InfraAction(action_type="restart_node",
-                               target=random.randint(0, min(7, len(obs.cpu_loads) - 1)))
         if atype == "throttle":
-            return InfraAction(action_type="throttle",
-                               rate=random.choice([0.3, 0.5, 0.7]))
         if atype == "scale_up":
             return InfraAction(action_type="scale_up")
         return InfraAction(action_type="no_op")
@@ -270,21 +312,23 @@ def collect_dataset(n_episodes: int, tasks: List[str]) -> Dataset:
         for _ in range(20):
             d = _obs_to_dict(obs)
-            rows.append({
-                "prompt": [
-                    {"role": "system", "content": SYSTEM_PROMPT},
-                    {
-                        "role": "user",
-                        "content": (
-                            "/no_think\n"   # suppress Qwen3 <think> block — response is ~60 tokens, not ~600
-                            f"Current system state:\n{json.dumps(d)}\n"
-                            "Respond with the required XML and JSON format."
-                        ),
-                    },
-                ],
-                "obs_json": json.dumps(d),
-                "task": task,
-            })
             action = _heuristic_action(obs)
             try:
@@ -295,15 +339,17 @@ def collect_dataset(n_episodes: int, tasks: List[str]) -> Dataset:
                 break
         if (ep + 1) % 50 == 0:
-            print(f"  [dataset] episode {ep+1}/{n_episodes} → {len(rows)} rows")
     print(f"  [dataset] collected {len(rows)} total rows")
     return Dataset.from_list(rows)
 # ---------------------------------------------------------------------------
 # Helpers shared by reward functions
 # ---------------------------------------------------------------------------
 def _get_completion_text(comp) -> str:
     """
     Extract completion text from TRL GRPOTrainer's format.
@@ -341,7 +387,7 @@ def _restore_env_state(env: DistributedInfraEnvironment, obs: dict) -> None:
     sim = env.sim
     cpu_l = obs.get("cpu_loads", [])
     mem_l = obs.get("mem_utilizations", [])
-    q_l   = obs.get("queue_lengths", [])
     for i in range(min(len(cpu_l), len(sim.nodes))):
         if float(cpu_l[i]) >= 0:
@@ -355,10 +401,11 @@ def _restore_env_state(env: DistributedInfraEnvironment, obs: dict) -> None:
         if 0 <= idx < len(sim.nodes):
             sim.nodes[idx].is_failed = True
-    sim.latency_ms              = float(obs.get("latency_ms", 20.0))
-    sim.error_budget            = float(obs.get("error_budget", 100.0))
-    sim.last_trace_p99_latency  = float(obs.get("p99_latency", 0.0))
-    sim.last_trace_node_0_io    = float(obs.get("io_wait", 0.0))
 # ---------------------------------------------------------------------------
 # Reward functions (TRL GRPOTrainer signature)
@@ -371,6 +418,7 @@ def _restore_env_state(env: DistributedInfraEnvironment, obs: dict) -> None:
 #   len(obs_json)    == G  (same value repeated G times by TRL)
 # ---------------------------------------------------------------------------
 def reward_format(completions: List, **kwargs) -> List[float]:
     """
     Reward XML structure compliance (mirrors Unsloth's match_format_exactly /
@@ -383,18 +431,18 @@ def reward_format(completions: List, **kwargs) -> List[float]:
     scores = []
     for comp in completions:
         text = _get_completion_text(comp)
-        n_re  = text.count("<reasoning>")
         n_re_ = text.count("</reasoning>")
-        n_ac  = text.count("<action>")
         n_ac_ = text.count("</action>")
         if n_re == 1 and n_re_ == 1 and n_ac == 1 and n_ac_ == 1:
             scores.append(3.0)
         else:
             s = 0.0
-            s += 0.5 if n_re_  == 1 else -1.0
-            s += 0.5 if n_ac   == 1 else -1.0
-            s += 0.5 if n_ac_  == 1 else -1.0
             scores.append(s)
     return scores
@@ -431,26 +479,25 @@ def reward_env(
     **kwargs,
 ) -> List[float]:
     """
-    Environment simulation reward using the production-grade SRE reward function.
-    Uses calculate_step_reward() from server/rubrics.py (friend's improved version):
       - 7 components: uptime, DB CPU, memory cliff, p99 latency, load shedding,
         action efficiency, temporal friction
       - Bounded to [-5.0, +5.0] — no -1000 cliff, gradients always flow
-      - Action-aware: penalises unnecessary throttling and no-ops under load
-    Requires the updated rubrics.py (main branch) where calculate_step_reward
-    returns -5.0 for DB failure instead of -1000.
-    Range: [−5.0, +5.0]
     """
     scores = []
     for i, comp in enumerate(completions):
         try:
-            obs_data  = json.loads(obs_json[i]) if obs_json else {}
             task_name = task[i] if task else "traffic_spike"
         except (TypeError, IndexError, json.JSONDecodeError):
-            scores.append(-5.0)
             continue
         env = DistributedInfraEnvironment()
@@ -472,17 +519,17 @@ def reward_env(
             pass
         if _RUBRICS_BOUNDED:
-            # Main branch: full 7-component bounded reward [-5.0, +5.0]
-            scores.append(_calculate_step_reward(env.sim))
         else:
-            # Nithish branch fallback: simple 3-component formula [-2.5, +0.5]
-            sim   = env.sim
             nodes = sim.nodes
             alive = sum(1 for n in nodes if not n.is_failed)
-            r_up  = 0.5 * (alive / max(len(nodes), 1))
             r_lat = -0.5 * min((max(0.0, sim.latency_ms - 50.0) / 100.0) ** 2, 1.0)
-            r_db  = -2.0 if (nodes and nodes[0].is_failed) else 0.0
-            scores.append(r_up + r_lat + r_db)
     return scores
@@ -493,61 +540,62 @@ def reward_triage(
     **kwargs,
 ) -> List[float]:
     """
-    Triage oracle reward (mirrors Unsloth's check_answer / check_numbers pattern).
     Compares the model's action against the deterministic triage tree output.
-    +5.0 — exact command match with expected action
-    +2.0 — same action_type but different parameters (e.g., throttle at wrong rate)
-     0.0 — no_op when a specific action is expected
-    -2.0 — completely wrong action type
-    -1.0 — unnecessary action when system is healthy (expected no_op)
-    This reward is intentionally strong to bootstrap correct rule-following.
     """
     scores = []
     for i, comp in enumerate(completions):
         try:
             obs_data = json.loads(obs_json[i]) if obs_json else {}
         except (TypeError, IndexError, json.JSONDecodeError):
-            scores.append(-2.0)
             continue
-        expected  = _get_expected_action(obs_data)
         predicted = _extract_command(_get_completion_text(comp))
         if predicted is None:
-            scores.append(-2.0)
             continue
         if predicted.strip() == expected.strip():
-            scores.append(5.0)
             continue
         if expected == "no_op":
-            # Healthy system — penalise unnecessary intervention
-            scores.append(-1.0 if predicted != "no_op" else 0.0)
             continue
         if predicted == "no_op":
-            # Missed a required action
-            scores.append(-2.0)
             continue
         # Same action type, wrong parameters?
         try:
             act_p = parse_command(predicted)
             act_e = parse_command(expected)
-            scores.append(2.0 if act_p.action_type == act_e.action_type else -2.0)
         except CommandParseError:
-            scores.append(-2.0)
     return scores
 # ---------------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------------
 def main() -> None:
     # ---- Load model (Unsloth FastLanguageModel + LoRA + FP8) ----
     print(f"[GRPO] Loading {MODEL_NAME} ...")
@@ -555,45 +603,56 @@ def main() -> None:
     # compilation_config=0 → basic CUDA graphs only; skips piecewise graph-split that
     #                        crashes on A100 SM 8.0 (vLLM bug in _decompose_size_nodes)
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name          = MODEL_NAME,
-        max_seq_length      = MAX_SEQ_LENGTH,
-        load_in_4bit        = False,
-        fast_inference      = True,
-        max_lora_rank       = LORA_RANK,
-        load_in_fp8         = False,       # FP8 requires compute capability 8.9+; A100 is 8.0
-        compilation_config  = 0,           # avoid piecewise graph-split crash; still uses CUDA graphs
     )
     model = FastLanguageModel.get_peft_model(
         model,
-        r                     = LORA_RANK,
-        lora_alpha            = LORA_RANK * 2,   # 2× alpha speeds up training
-        target_modules        = [
-            "q_proj", "k_proj", "v_proj", "o_proj",
-            "gate_proj", "up_proj", "down_proj",  # include MLP for DIME reasoning
         ],
-        use_gradient_checkpointing = "unsloth",  # 30% memory reduction
-        random_state          = 3407,
     )
     # ---- Collect dataset ----
-    print(f"\n[GRPO] Collecting dataset ({DATASET_EPISODES} episodes, {len(ALL_TASKS)} tasks)...")
     dataset = collect_dataset(DATASET_EPISODES, ALL_TASKS)
     # Filter prompts that exceed 90th-percentile token length (avoids outlier OOM)
     print("[GRPO] Filtering dataset by prompt length...")
     prompt_lens = [
-        len(tokenizer.apply_chat_template(
-            row["prompt"], add_generation_prompt=True, tokenize=True
-        ))
         for row in dataset
     ]
-    max_prompt_len  = int(np.quantile(prompt_lens, 0.90)) + 1
-    max_comp_len    = MAX_SEQ_LENGTH - max_prompt_len
-    keep_idx        = [i for i, L in enumerate(prompt_lens) if L <= max_prompt_len]
-    dataset         = dataset.select(keep_idx)
-    print(f"[GRPO] Final dataset: {len(dataset)} rows | "
-          f"max_prompt={max_prompt_len} max_completion={max_comp_len}")
     # ---- Sleep vLLM engine if available (frees VRAM during training) ----
     if hasattr(model, "vllm_engine") and model.vllm_engine is not None:
@@ -607,40 +666,40 @@ def main() -> None:
     # PatchFastRL above also re-adds vllm_sampling_params as an alias, but the
     # native params are clearer and forward-compatible.
     training_args = TRLGRPOConfig(
-        temperature                  = 1.0,
-        top_k                        = 50,
-        top_p                        = 0.95,
-        min_p                        = 0.1,
-        learning_rate                = 5e-6,
-        weight_decay                 = 0.01,
-        warmup_ratio                 = 0.1,
-        lr_scheduler_type            = "cosine",
-        optim                        = "adamw_8bit",
-        logging_steps                = 5,
-        per_device_train_batch_size  = 1,     # keep small: reward_env is CPU-bound, not GPU-bound
-        gradient_accumulation_steps  = 4,     # effective batch = 4
-        num_generations              = NUM_GENERATIONS,
-        vllm_gpu_memory_utilization  = 0.7,   # 70% of remaining VRAM for KV cache → faster generation
-        max_prompt_length            = max_prompt_len,
-        max_completion_length        = MAX_COMPLETION_LENGTH,  # hard cap: prevents Qwen3 think-block bloat
-        max_steps                    = MAX_STEPS,
-        save_steps                   = SAVE_STEPS,
-        output_dir                   = OUTPUT_DIR,
-        report_to                    = "none",
     )
     # ---- Trainer ----
     trainer = GRPOTrainer(
-        model            = model,
-        processing_class = tokenizer,
-        reward_funcs     = [
-            reward_format,    # structural: <reasoning><action> tags          ← early signal
             reward_validity,  # syntactic: command parses without error        ← anti-hallucination
-            reward_env,       # semantic: env simulation, uptime+latency       ← main SRE signal
-            reward_triage,    # oracle: matches triage tree expected action    ← strong supervision
         ],
-        args             = training_args,
-        train_dataset    = dataset,
     )
     print("\n[GRPO] Training starts...")

 # UNSLOTH_VLLM_STANDBY must be set before unsloth is imported.
 import os
 os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
 # ---------------------------------------------------------------------------
 from server.command_parser import parse_command, CommandParseError
 from server.rubrics import calculate_step_reward as _calculate_step_reward
 def _probe_rubrics() -> bool:
     """Return True if rubrics returns bounded rewards (main branch), False if -1000 (nithish)."""
     try:
     except Exception:
         return False
 _RUBRICS_BOUNDED = _probe_rubrics()
+print(
+    f"[GRPO] rubrics version: {'main (bounded [-5,+5])' if _RUBRICS_BOUNDED else 'nithish (WARNING: -1000 cliff — using fallback)'}"
+)
 # ---------------------------------------------------------------------------
 # Config
 # ---------------------------------------------------------------------------
+MODEL_NAME = "unsloth/Qwen3-8B"
+MAX_SEQ_LENGTH = 2048
+LORA_RANK = 32
+OUTPUT_DIR = "checkpoints/qwen3_grpo_unsloth"
+DATASET_EPISODES = 500  # env rollouts to build the training dataset
+MAX_STEPS = 300  # GRPOTrainer update steps
+NUM_GENERATIONS = 4  # G — completions per prompt; reward_env is CPU-bound, keep small
+MAX_COMPLETION_LENGTH = (
+    512  # Qwen3 no-think response is ~60 tokens; 512 is a safe ceiling
+)
+SAVE_STEPS = 100
 ALL_TASKS = [
+    "traffic_spike",
+    "node_failure",
+    "cascading_failure",
+    "flash_crowd",
+    "thundering_herd",
+    "zombie_node",
+    "hot_shard_skew",
+    "memory_leak_slow_burn",
+    "split_brain_io_bottleneck",
+    "black_swan_az_failure",
+    "retry_storm",
+    "connection_pool_deadlock",
+    "autoscaler_flapping_trap",
 ]
 # ---------------------------------------------------------------------------
+# System prompt — shared with inference.py
 # ---------------------------------------------------------------------------
 SYSTEM_PROMPT = """You are an expert Site Reliability Engineer (SRE) managing a highly volatile Kubernetes cluster.
 CRITICAL INCIDENT TRIAGE TREE (Follow strictly in order):
 1. OOM IMMINENT: IF ANY 'mem_utilizations' > 0.92 → kubectl delete pod node-<leaking_node>
+2. DB RECOVERY: IF node-0 is in 'failed_nodes' → kubectl delete pod node-0
+   (The DB is a SPOF. If it's dead, ALL other actions are futile until it restarts.)
+3. SPLIT-BRAIN: IF node_0 'io_wait' > 0.80 → kubectl throttle ingress --rate=0.5
+4. HOT SHARD: IF one worker CPU > 0.90 but cluster average is low
    → kubectl exec -it istio-proxy -- traffic shift --from=<hot> --to=<cold>
+5. RETRY STORM: IF 'p99_latency' > 100.0 AND traffic spiking → kubectl throttle ingress --rate=0.4
+6. ZOMBIE NODE: IF worker CPU < 0.10 BUT 'p99_latency' is huge
    → kubectl exec -it istio-proxy -- traffic shift --from=<zombie> --to=<healthy>
+7. BLACK SWAN: IF multiple nodes in 'failed_nodes' (but DB is alive) → kubectl throttle ingress --rate=0.3
+8. DATABASE SURVIVAL: IF node-0 cpu_load > 0.80 → kubectl throttle ingress --rate=0.7
 9. SAFE SCALING: IF avg worker CPU > 0.75 AND 'error_budget' > 20
    → kubectl scale deployment frontend --replicas=10
 10. HEALTHY: If metrics are stable → no_op
 # Triage oracle — deterministic expected action for any observation
 # ---------------------------------------------------------------------------
 def _get_expected_action(obs: dict) -> str:
+    """Return the kubectl command the triage tree mandates, or 'no_op'.
+    Rule ordering is critical for RL convergence:
+      1. OOM — immediate life-or-death
+      2. DB Recovery — SPOF must be restored before anything else
+      3-6. Network/traffic rules
+      7. Black Swan — only fires if DB is alive
+      8-9. Proactive scaling
+      10. Healthy
+    """
+    cpu = obs.get("cpu_loads", [0.3] * 8)
+    mem = obs.get("mem_utilizations", [0.2] * 8)
+    fail = set(obs.get("failed_nodes", []))
+    io = float(obs.get("io_wait", 0.0))
+    p99 = float(obs.get("p99_latency", 0.0))
+    rr = float(obs.get("request_rate", 100.0))
+    bud = float(obs.get("error_budget", 100.0))
+    # Rule 1: OOM — instant kill prevention
     for i, m in enumerate(mem):
         if float(m) > 0.92:
             return f"kubectl delete pod node-{i}"
+    # Rule 2: DB RECOVERY — the DB is a SPOF; if it's dead, nothing else matters
+    if 0 in fail:
+        return "kubectl delete pod node-0"
+    # Rule 3: Split-brain
     if io > 0.80:
         return "kubectl throttle ingress --rate=0.5"
+    # Rule 4: Hot shard
     workers = [(i, float(c)) for i, c in enumerate(cpu[1:], 1) if float(c) >= 0]
     if workers:
         avg = sum(c for _, c in workers) / len(workers)
                 if dst is not None:
                     return f"kubectl exec -it istio-proxy -- traffic shift --from={i} --to={dst}"
+    # Rule 5: Retry storm
     if p99 > 100.0 and rr > 150:
         return "kubectl throttle ingress --rate=0.4"
+    # Rule 6: Zombie node
     for i, c in workers:
         if 0 <= c < 0.10 and p99 > 100.0:
             dst = next(
             if dst is not None:
                 return f"kubectl exec -it istio-proxy -- traffic shift --from={i} --to={dst}"
+    # Rule 7: Black swan (only fires when DB is alive — DB recovery is above)
     if len(fail) >= 2:
         return "kubectl throttle ingress --rate=0.3"
+    # Rule 8: DB survival (protect a living DB under load)
     db_cpu = float(cpu[0]) if cpu and float(cpu[0]) >= 0 else 0.0
     if db_cpu > 0.80:
         return "kubectl throttle ingress --rate=0.7"
     # Rule 9: Safe scaling
     if workers and sum(c for _, c in workers) / len(workers) > 0.75 and bud > 20:
         return "kubectl scale deployment frontend --replicas=10"
     return "no_op"
 # ---------------------------------------------------------------------------
 # Dataset collection
 # ---------------------------------------------------------------------------
 def _obs_to_dict(obs: InfraObservation) -> dict:
     keys = [
+        "cpu_loads",
+        "mem_utilizations",
+        "queue_lengths",
+        "failed_nodes",
+        "latency_ms",
+        "request_rate",
+        "io_wait",
+        "p99_latency",
+        "error_budget",
+        "step",
+        "task_hint",
+        "action_errors",
+        "cloud_budget",
     ]
     return {k: getattr(obs, k) for k in keys if hasattr(obs, k)}
     if random.random() < 0.30:
         atype = random.choice(["no_op", "restart_node", "throttle", "scale_up"])
         if atype == "restart_node":
+            return InfraAction(
+                action_type="restart_node",
+                target=random.randint(0, min(7, len(obs.cpu_loads) - 1)),
+            )
         if atype == "throttle":
+            return InfraAction(
+                action_type="throttle", rate=random.choice([0.3, 0.5, 0.7])
+            )
         if atype == "scale_up":
             return InfraAction(action_type="scale_up")
         return InfraAction(action_type="no_op")
         for _ in range(20):
             d = _obs_to_dict(obs)
+            rows.append(
+                {
+                    "prompt": [
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {
+                            "role": "user",
+                            "content": (
+                                "/no_think\n"  # suppress Qwen3 <think> block — response is ~60 tokens, not ~600
+                                f"Current system state:\n{json.dumps(d)}\n"
+                                "Respond with the required XML and JSON format."
+                            ),
+                        },
+                    ],
+                    "obs_json": json.dumps(d),
+                    "task": task,
+                }
+            )
             action = _heuristic_action(obs)
             try:
                 break
         if (ep + 1) % 50 == 0:
+            print(f"  [dataset] episode {ep + 1}/{n_episodes} → {len(rows)} rows")
     print(f"  [dataset] collected {len(rows)} total rows")
     return Dataset.from_list(rows)
 # ---------------------------------------------------------------------------
 # Helpers shared by reward functions
 # ---------------------------------------------------------------------------
 def _get_completion_text(comp) -> str:
     """
     Extract completion text from TRL GRPOTrainer's format.
     sim = env.sim
     cpu_l = obs.get("cpu_loads", [])
     mem_l = obs.get("mem_utilizations", [])
+    q_l = obs.get("queue_lengths", [])
     for i in range(min(len(cpu_l), len(sim.nodes))):
         if float(cpu_l[i]) >= 0:
         if 0 <= idx < len(sim.nodes):
             sim.nodes[idx].is_failed = True
+    sim.latency_ms = float(obs.get("latency_ms", 20.0))
+    sim.error_budget = float(obs.get("error_budget", 100.0))
+    sim.last_trace_p99_latency = float(obs.get("p99_latency", 0.0))
+    sim.last_trace_node_0_io = float(obs.get("io_wait", 0.0))
 # ---------------------------------------------------------------------------
 # Reward functions (TRL GRPOTrainer signature)
 #   len(obs_json)    == G  (same value repeated G times by TRL)
 # ---------------------------------------------------------------------------
 def reward_format(completions: List, **kwargs) -> List[float]:
     """
     Reward XML structure compliance (mirrors Unsloth's match_format_exactly /
     scores = []
     for comp in completions:
         text = _get_completion_text(comp)
+        n_re = text.count("<reasoning>")
         n_re_ = text.count("</reasoning>")
+        n_ac = text.count("<action>")
         n_ac_ = text.count("</action>")
         if n_re == 1 and n_re_ == 1 and n_ac == 1 and n_ac_ == 1:
             scores.append(3.0)
         else:
             s = 0.0
+            s += 0.5 if n_re_ == 1 else -1.0
+            s += 0.5 if n_ac == 1 else -1.0
+            s += 0.5 if n_ac_ == 1 else -1.0
             scores.append(s)
     return scores
     **kwargs,
 ) -> List[float]:
     """
+    Environment simulation reward — the PRIMARY training signal.
+    Uses calculate_step_reward() from server/rubrics.py:
       - 7 components: uptime, DB CPU, memory cliff, p99 latency, load shedding,
         action efficiency, temporal friction
       - Bounded to [-5.0, +5.0] — no -1000 cliff, gradients always flow
+    Output is scaled by 2× so the environment physics dominates over the
+    oracle (reward_triage) in the total reward signal.
+    Range: [−10.0, +10.0]  (2× the raw [-5, +5])
     """
     scores = []
     for i, comp in enumerate(completions):
         try:
+            obs_data = json.loads(obs_json[i]) if obs_json else {}
             task_name = task[i] if task else "traffic_spike"
         except (TypeError, IndexError, json.JSONDecodeError):
+            scores.append(-10.0)
             continue
         env = DistributedInfraEnvironment()
             pass
         if _RUBRICS_BOUNDED:
+            # Main branch: 2× scaled to dominate over oracle reward
+            scores.append(2.0 * _calculate_step_reward(env.sim))
         else:
+            # Nithish branch fallback: simple 3-component formula [-5.0, +1.0]
+            sim = env.sim
             nodes = sim.nodes
             alive = sum(1 for n in nodes if not n.is_failed)
+            r_up = 0.5 * (alive / max(len(nodes), 1))
             r_lat = -0.5 * min((max(0.0, sim.latency_ms - 50.0) / 100.0) ** 2, 1.0)
+            r_db = -2.0 if (nodes and nodes[0].is_failed) else 0.0
+            scores.append(2.0 * (r_up + r_lat + r_db))
     return scores
     **kwargs,
 ) -> List[float]:
     """
+    Triage oracle reward — gentle guidance, NOT the primary teacher.
     Compares the model's action against the deterministic triage tree output.
+    Kept intentionally weak so reward_env (physics) dominates learning.
+    +1.0 — exact command match with expected action
+    +0.5 — same action_type but different parameters
+     0.0 — no_op when a specific action is expected, or healthy system
+    -0.5 — completely wrong action type
+    -0.5 — unnecessary action when system is healthy (expected no_op)
     """
     scores = []
     for i, comp in enumerate(completions):
         try:
             obs_data = json.loads(obs_json[i]) if obs_json else {}
         except (TypeError, IndexError, json.JSONDecodeError):
+            scores.append(-0.5)
             continue
+        expected = _get_expected_action(obs_data)
         predicted = _extract_command(_get_completion_text(comp))
         if predicted is None:
+            scores.append(-0.5)
             continue
         if predicted.strip() == expected.strip():
+            scores.append(1.0)
             continue
         if expected == "no_op":
+            # Healthy system — mild penalty for unnecessary intervention
+            scores.append(-0.5 if predicted != "no_op" else 0.0)
             continue
         if predicted == "no_op":
+            # Missed a required action — mild penalty
+            scores.append(0.0)
             continue
         # Same action type, wrong parameters?
         try:
             act_p = parse_command(predicted)
             act_e = parse_command(expected)
+            scores.append(0.5 if act_p.action_type == act_e.action_type else -0.5)
         except CommandParseError:
+            scores.append(-0.5)
     return scores
 # ---------------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------------
 def main() -> None:
     # ---- Load model (Unsloth FastLanguageModel + LoRA + FP8) ----
     print(f"[GRPO] Loading {MODEL_NAME} ...")
     # compilation_config=0 → basic CUDA graphs only; skips piecewise graph-split that
     #                        crashes on A100 SM 8.0 (vLLM bug in _decompose_size_nodes)
     model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=MODEL_NAME,
+        max_seq_length=MAX_SEQ_LENGTH,
+        load_in_4bit=False,
+        fast_inference=True,
+        max_lora_rank=LORA_RANK,
+        load_in_fp8=False,  # FP8 requires compute capability 8.9+; A100 is 8.0
+        compilation_config=0,  # avoid piecewise graph-split crash; still uses CUDA graphs
     )
     model = FastLanguageModel.get_peft_model(
         model,
+        r=LORA_RANK,
+        lora_alpha=LORA_RANK * 2,  # 2× alpha speeds up training
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",  # include MLP for DIME reasoning
         ],
+        use_gradient_checkpointing="unsloth",  # 30% memory reduction
+        random_state=3407,
     )
     # ---- Collect dataset ----
+    print(
+        f"\n[GRPO] Collecting dataset ({DATASET_EPISODES} episodes, {len(ALL_TASKS)} tasks)..."
+    )
     dataset = collect_dataset(DATASET_EPISODES, ALL_TASKS)
     # Filter prompts that exceed 90th-percentile token length (avoids outlier OOM)
     print("[GRPO] Filtering dataset by prompt length...")
     prompt_lens = [
+        len(
+            tokenizer.apply_chat_template(
+                row["prompt"], add_generation_prompt=True, tokenize=True
+            )
+        )
         for row in dataset
     ]
+    max_prompt_len = int(np.quantile(prompt_lens, 0.90)) + 1
+    max_comp_len = MAX_SEQ_LENGTH - max_prompt_len
+    keep_idx = [i for i, L in enumerate(prompt_lens) if L <= max_prompt_len]
+    dataset = dataset.select(keep_idx)
+    print(
+        f"[GRPO] Final dataset: {len(dataset)} rows | "
+        f"max_prompt={max_prompt_len} max_completion={max_comp_len}"
+    )
     # ---- Sleep vLLM engine if available (frees VRAM during training) ----
     if hasattr(model, "vllm_engine") and model.vllm_engine is not None:
     # PatchFastRL above also re-adds vllm_sampling_params as an alias, but the
     # native params are clearer and forward-compatible.
     training_args = TRLGRPOConfig(
+        temperature=1.0,
+        top_k=50,
+        top_p=0.95,
+        min_p=0.1,
+        learning_rate=5e-6,
+        weight_decay=0.01,
+        warmup_ratio=0.1,
+        lr_scheduler_type="cosine",
+        optim="adamw_8bit",
+        logging_steps=5,
+        per_device_train_batch_size=1,  # keep small: reward_env is CPU-bound, not GPU-bound
+        gradient_accumulation_steps=4,  # effective batch = 4
+        num_generations=NUM_GENERATIONS,
+        vllm_gpu_memory_utilization=0.7,  # 70% of remaining VRAM for KV cache → faster generation
+        max_prompt_length=max_prompt_len,
+        max_completion_length=MAX_COMPLETION_LENGTH,  # hard cap: prevents Qwen3 think-block bloat
+        max_steps=MAX_STEPS,
+        save_steps=SAVE_STEPS,
+        output_dir=OUTPUT_DIR,
+        report_to="none",
     )
     # ---- Trainer ----
     trainer = GRPOTrainer(
+        model=model,
+        processing_class=tokenizer,
+        reward_funcs=[
+            reward_format,  # structural: <reasoning><action> tags          ← early signal
             reward_validity,  # syntactic: command parses without error        ← anti-hallucination
+            reward_env,  # semantic: env simulation, uptime+latency       ← main SRE signal
+            reward_triage,  # oracle: matches triage tree expected action    ← strong supervision
         ],
+        args=training_args,
+        train_dataset=dataset,
     )
     print("\n[GRPO] Training starts...")