Upload README.md with huggingface_hub
Browse files
README.md
CHANGED
|
@@ -44,14 +44,14 @@ flag = (pi >= pi_thresh) OR (pi >= pi_lower_bound AND toxic >= toxic_thresh)
|
|
| 44 |
|
| 45 |
| Parameter | Value |
|
| 46 |
|:----------|------:|
|
| 47 |
-
| `pi_thresh` | 0.
|
| 48 |
| `pi_lower_bound` | 0.50 |
|
| 49 |
-
| `toxic_thresh` | 0.
|
| 50 |
|
| 51 |
| Dataset | Recall | FPR |
|
| 52 |
|:--------|-------:|----:|
|
| 53 |
-
| test (262K) |
|
| 54 |
-
| customer_test (1.4M) |
|
| 55 |
|
| 56 |
### Thresholds at 1% FPR
|
| 57 |
|
|
@@ -66,6 +66,19 @@ flag = (pi >= pi_thresh) OR (pi >= pi_lower_bound AND toxic >= toxic_thresh)
|
|
| 66 |
| test (262K) | 75.26% | 1.008% |
|
| 67 |
| customer_test (1.4M) | 76.30% | 3.271% |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
---
|
| 70 |
|
| 71 |
## Comparison vs pi-mmbert-v2
|
|
@@ -73,7 +86,7 @@ flag = (pi >= pi_thresh) OR (pi >= pi_lower_bound AND toxic >= toxic_thresh)
|
|
| 73 |
| Test FPR | pi-mmbert-v2 Recall | pi-mmbert-v3.5 Recall | Δ |
|
| 74 |
|:---------|--------------------:|----------------------:|--:|
|
| 75 |
| 0.127% | 35.31% | **43.59%** | +8.28pp |
|
| 76 |
-
| 0.
|
| 77 |
| 1.008% | 67.13% | **75.26%** | +8.13pp |
|
| 78 |
|
| 79 |
> v2 is a single PI-head model; v3.5 adds a toxic head with tiered detection. The v2 thresholds were calibrated on the benign portion of the test set so that v2's FPR matches the v3.5 FPR exactly.
|
|
@@ -122,9 +135,9 @@ stride = max_length - chunk_overlap # 312
|
|
| 122 |
# pi_lower_bound = 0.5
|
| 123 |
# toxic_thresh = 1.00 # effectively disabled
|
| 124 |
# --- Tiered thresholds (0.5% FPR) ---
|
| 125 |
-
pi_thresh = 0.
|
| 126 |
pi_lower_bound = 0.5
|
| 127 |
-
toxic_thresh = 0.
|
| 128 |
# --- Tiered thresholds (1% FPR) ---
|
| 129 |
# pi_thresh = 0.982
|
| 130 |
# pi_lower_bound = 0.5
|
|
@@ -209,3 +222,9 @@ print(f"PI probability: {pi_prob:.4f}")
|
|
| 209 |
print(f"Toxic probability: {toxic_prob:.4f}")
|
| 210 |
print(f"Prompt injection detected? {'FLAG' if is_flagged else 'ALLOW'}")
|
| 211 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
| Parameter | Value |
|
| 46 |
|:----------|------:|
|
| 47 |
+
| `pi_thresh` | 0.984 |
|
| 48 |
| `pi_lower_bound` | 0.50 |
|
| 49 |
+
| `toxic_thresh` | 0.98 |
|
| 50 |
|
| 51 |
| Dataset | Recall | FPR |
|
| 52 |
|:--------|-------:|----:|
|
| 53 |
+
| test (262K) | 70.56% | 0.625% |
|
| 54 |
+
| customer_test (1.4M) | 73.34% | 2.691% |
|
| 55 |
|
| 56 |
### Thresholds at 1% FPR
|
| 57 |
|
|
|
|
| 66 |
| test (262K) | 75.26% | 1.008% |
|
| 67 |
| customer_test (1.4M) | 76.30% | 3.271% |
|
| 68 |
|
| 69 |
+
### Thresholds for POV
|
| 70 |
+
|
| 71 |
+
| Parameter | Value |
|
| 72 |
+
|:----------|------:|
|
| 73 |
+
| `pi_thresh` | 0.29 |
|
| 74 |
+
| `pi_lower_bound` | 0.10 |
|
| 75 |
+
| `toxic_thresh` | 0.89 |
|
| 76 |
+
|
| 77 |
+
| Dataset | Recall | FPR |
|
| 78 |
+
|:--------|-------:|----:|
|
| 79 |
+
| test (262K) | 97.34% | 9.336% |
|
| 80 |
+
| customer_test (1.4M) | 94.82% | 6.338% |
|
| 81 |
+
|
| 82 |
---
|
| 83 |
|
| 84 |
## Comparison vs pi-mmbert-v2
|
|
|
|
| 86 |
| Test FPR | pi-mmbert-v2 Recall | pi-mmbert-v3.5 Recall | Δ |
|
| 87 |
|:---------|--------------------:|----------------------:|--:|
|
| 88 |
| 0.127% | 35.31% | **43.59%** | +8.28pp |
|
| 89 |
+
| 0.625% | 60.46% | **70.56%** | +10.10pp |
|
| 90 |
| 1.008% | 67.13% | **75.26%** | +8.13pp |
|
| 91 |
|
| 92 |
> v2 is a single PI-head model; v3.5 adds a toxic head with tiered detection. The v2 thresholds were calibrated on the benign portion of the test set so that v2's FPR matches the v3.5 FPR exactly.
|
|
|
|
| 135 |
# pi_lower_bound = 0.5
|
| 136 |
# toxic_thresh = 1.00 # effectively disabled
|
| 137 |
# --- Tiered thresholds (0.5% FPR) ---
|
| 138 |
+
pi_thresh = 0.984
|
| 139 |
pi_lower_bound = 0.5
|
| 140 |
+
toxic_thresh = 0.98
|
| 141 |
# --- Tiered thresholds (1% FPR) ---
|
| 142 |
# pi_thresh = 0.982
|
| 143 |
# pi_lower_bound = 0.5
|
|
|
|
| 222 |
print(f"Toxic probability: {toxic_prob:.4f}")
|
| 223 |
print(f"Prompt injection detected? {'FLAG' if is_flagged else 'ALLOW'}")
|
| 224 |
```
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
## Author
|
| 229 |
+
|
| 230 |
+
**Karthick** — [karthkal@cisco.com](mailto:karthkal@cisco.com)
|