Spaces:

scaler-hack
/

scaler-openenv

Sleeping

Hacktrix-121 committed on Apr 5

Commit

abacc33

1 Parent(s): d5d74e8

minor fix

Files changed (2) hide show

rl_agent.py CHANGED Viewed

@@ -650,13 +650,16 @@ class PPOTrainer:
     # ------------------------------------------------------------------
     def act(self, obs) -> Any:
-        """Greedy action for deployment / evaluation."""
         from adaptive_alert_triage.models import Action
         if not obs.alerts:
             raise ValueError("No alerts")
         s = encode_state(obs)
         probs, _ = self.net.forward(s)
-        a = int(np.argmax(probs))
         alert = _select_alert(obs, a)
         return Action(alert_id=alert.id, action_type=_ACTION_NAMES[a])

     # ------------------------------------------------------------------
     def act(self, obs) -> Any:
+        """Stochastic action matching training behavior."""
         from adaptive_alert_triage.models import Action
         if not obs.alerts:
             raise ValueError("No alerts")
         s = encode_state(obs)
         probs, _ = self.net.forward(s)
+        # Sample from policy distribution (same as training), NOT argmax!
+        # argmax collapses a learned distribution like [0.35, 0.25, 0.22, 0.18]
+        # into always picking the same action.
+        a = int(np.random.choice(4, p=probs))
         alert = _select_alert(obs, a)
         return Action(alert_id=alert.id, action_type=_ACTION_NAMES[a])

src/adaptive_alert_triage/server.py CHANGED Viewed

@@ -485,7 +485,11 @@ async def recommend():
             ppo.net.h, ppo.net.c = old_h, old_c
             # -----------------------------------------------------------------
-            idx   = int(np.argmax(probs))
             act   = _ACTION_NAMES[idx]
             conf  = round(float(probs[idx]) * 100, 1)
             return {

             ppo.net.h, ppo.net.c = old_h, old_c
             # -----------------------------------------------------------------
+            # CRITICAL: Use sampling (same as training), NOT argmax!
+            # argmax always picks the single highest prob, collapsing a
+            # balanced policy like [0.35, 0.25, 0.22, 0.18] into "always
+            # INVESTIGATE". Sampling reproduces the trained behavior.
+            idx   = int(np.random.choice(4, p=probs))
             act   = _ACTION_NAMES[idx]
             conf  = round(float(probs[idx]) * 100, 1)
             return {