Spaces:
Running
Running
Surface the measured supervisor result (AUC 0.99 on real DROID actions); sync supervisor.py (adds read-only drift_score)
Browse files
- app/supervisor.py +21 -1
- app/templates/index.html +11 -0
app/supervisor.py
CHANGED
|
@@ -14,7 +14,12 @@ What it checks, on every action:
|
|
| 14 |
- in-bounds: every dimension stays inside the action limits.
|
| 15 |
- drift (OOD): how far the action sits from the calibration set, as a per-dim
|
| 16 |
z-score pooled into one distance. This is a deliberately simple v0 (diagonal
|
| 17 |
-
Gaussian); it catches gross drift, not subtle correlated shifts.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
- jerk: how big the jump is from the last accepted action, against the
|
| 19 |
calibration jerk.
|
| 20 |
|
|
@@ -83,6 +88,21 @@ class Supervisor:
|
|
| 83 |
def _pooled_z(self, x, mean, std):
|
| 84 |
return float(np.sqrt(np.mean(((x - mean) / std) ** 2)))
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def _safe_out(self):
|
| 87 |
if self._last_safe is not None:
|
| 88 |
return np.clip(self._last_safe, self.cfg.action_low, self.cfg.action_high)
|
|
|
|
| 14 |
- in-bounds: every dimension stays inside the action limits.
|
| 15 |
- drift (OOD): how far the action sits from the calibration set, as a per-dim
|
| 16 |
z-score pooled into one distance. This is a deliberately simple v0 (diagonal
|
| 17 |
+
Gaussian); it catches gross drift, not subtle correlated shifts. The
|
| 18 |
+
drift_thresh isn't a guess: evaluate.py sweeps it against a labelled set
|
| 19 |
+
(real DROID actions + injected faults) to pick an operating point. On DROID
|
| 20 |
+
the detector scores AUC 0.99, and a threshold of ~2.2 catches 91% of faults
|
| 21 |
+
at a 1% false-positive rate; the shipped default (4.0) is conservative on
|
| 22 |
+
purpose, so tune it to your fleet with evaluate.py.
|
| 23 |
- jerk: how big the jump is from the last accepted action, against the
|
| 24 |
calibration jerk.
|
| 25 |
|
|
|
|
| 88 |
def _pooled_z(self, x, mean, std):
|
| 89 |
return float(np.sqrt(np.mean(((x - mean) / std) ** 2)))
|
| 90 |
|
| 91 |
+
def drift_score(self, action):
|
| 92 |
+
"""Pooled z-distance of an action from the calibration set, read-only.
|
| 93 |
+
|
| 94 |
+
Same quantity step() thresholds for drift (computed on the in-bounds
|
| 95 |
+
action), but with no side effects, so you can sweep a threshold over a
|
| 96 |
+
labelled set to get an ROC. Non-finite or wrong-shape actions score inf.
|
| 97 |
+
"""
|
| 98 |
+
if self._mean is None:
|
| 99 |
+
raise RuntimeError("calibrate() before scoring")
|
| 100 |
+
a = np.asarray(action, dtype=np.float64).reshape(-1)
|
| 101 |
+
if a.size != self._mean.size or not np.all(np.isfinite(a)):
|
| 102 |
+
return float("inf")
|
| 103 |
+
clipped = np.clip(a, self.cfg.action_low, self.cfg.action_high)
|
| 104 |
+
return self._pooled_z(clipped, self._mean, self._std)
|
| 105 |
+
|
| 106 |
def _safe_out(self):
|
| 107 |
if self._last_safe is not None:
|
| 108 |
return np.clip(self._last_safe, self.cfg.action_low, self.cfg.action_high)
|
app/templates/index.html
CHANGED
|
@@ -181,6 +181,17 @@
|
|
| 181 |
</div>
|
| 182 |
</div>
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
<div class="mt-6 grid gap-5 lg:grid-cols-[0.85fr_1.15fr]">
|
| 185 |
<!-- Scenario picker -->
|
| 186 |
<div class="ee-card p-6">
|
|
|
|
| 181 |
</div>
|
| 182 |
</div>
|
| 183 |
|
| 184 |
+
<div class="mt-4 flex flex-wrap items-center gap-2 text-xs">
|
| 185 |
+
<span class="ee-chip bg-emerald-50 text-emerald-700 ring-1 ring-inset ring-emerald-200/70 dark:bg-emerald-500/10 dark:text-emerald-300 dark:ring-emerald-400/20">
|
| 186 |
+
<span class="ee-light ee-light--go"></span> measured, not asserted
|
| 187 |
+
</span>
|
| 188 |
+
<span class="text-slate-500 dark:text-slate-400">
|
| 189 |
+
On <strong>real DROID robot actions</strong> + labelled faults: the drift detector scores
|
| 190 |
+
<span class="ee-mono font-semibold text-slate-700 dark:text-slate-200">AUC 0.99</span>, and tuned to a 1% false-alarm budget it catches
|
| 191 |
+
<span class="ee-mono font-semibold text-slate-700 dark:text-slate-200">91%</span> of faults. Eval in the repo.
|
| 192 |
+
</span>
|
| 193 |
+
</div>
|
| 194 |
+
|
| 195 |
<div class="mt-6 grid gap-5 lg:grid-cols-[0.85fr_1.15fr]">
|
| 196 |
<!-- Scenario picker -->
|
| 197 |
<div class="ee-card p-6">
|