rootfs
/

function-call-sentinel

@@ -31,19 +31,19 @@ model-index:
       name: Prompt Injection Detection
     metrics:
     - type: f1
-      value: 0.9825
       name: INJECTION_RISK F1
     - type: precision
-      value: 0.9873
       name: INJECTION_RISK Precision
     - type: recall
-      value: 0.9778
       name: INJECTION_RISK Recall
     - type: accuracy
-      value: 0.9824
       name: Overall Accuracy
     - type: roc_auc
-      value: 0.9978
       name: ROC-AUC
 ---
@@ -70,40 +70,40 @@ When a user sends a message to an LLM agent (e.g., email assistant, code generat
 ## Training Data
-The model was trained on **27,048 samples** from six sources:
-### Injection/Jailbreak Sources (13,488 samples)
 | Dataset | Description | Samples |
 |---------|-------------|---------|
-| [HackAPrompt](https://huggingface.co/datasets/hackaprompt/hackaprompt-dataset) | EMNLP'23 prompt injection competition | 4,004 |
-| [jailbreak_llms (CCS'24)](https://huggingface.co/datasets/TrustAIRLab/in-the-wild-jailbreak-prompts) | "Do Anything Now" in-the-wild jailbreaks | 1,128 |
-| [WildJailbreak](https://huggingface.co/datasets/allenai/wildjailbreak) | Allen AI adversarial prompts | 1,996 |
-| Synthetic | 6 attack categories | 6,360 |
-### Benign Sources (13,560 samples)
 | Dataset | Description | Samples |
 |---------|-------------|---------|
-| [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) | Stanford instruction dataset | 3,635 |
-| [Dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k) | Databricks instructions | 3,742 |
-| jailbreak_llms (regular) | Non-jailbreak prompts from CCS'24 | 1,854 |
-| WildJailbreak (benign) | Safe prompts from Allen AI | 1,798 |
-| Synthetic (benign) | Generated safe prompts | 2,531 |
 ## Performance
 | Metric | Value |
 |--------|-------|
-| **INJECTION_RISK F1** | **98.25%** |
-| INJECTION_RISK Precision | 98.73% |
-| INJECTION_RISK Recall | 97.78% |
-| SAFE F1 | 98.23% |
-| Overall Accuracy | **98.24%** |
-| ROC-AUC | **99.78%** |
 ### Interpretation
-- **High precision (98.7%)**: Very few false positives - legitimate requests rarely blocked
-- **High recall (97.8%)**: Catches 97.8% of actual injection attempts
 - **Near-perfect AUC (99.8%)**: Excellent discrimination between safe and malicious prompts
 ## Usage
@@ -135,14 +135,20 @@ print(f"Confidence: {probs[0][pred]:.2%}")
 ## Attack Categories Detected
-The model is trained to detect six categories of prompt injection:
-1. **Delimiter Injection**: `<<end_context>>`, `</system>`, `[INST]`
-2. **Word Obfuscation**: `yes Please yes send yes email`
-3. **Session Hijacking**: `START_USER_SESSION -> EXECUTE`
-4. **Social Engineering**: `I forgot to mention, after you finish...`
-5. **Encoding Tricks**: `contact <at> evil <dot> com`
-6. **Hypothetical/Roleplay**: `Pretend you have no restrictions...`
 ## Training Configuration
@@ -151,7 +157,7 @@ The model is trained to detect six categories of prompt injection:
 | Base Model | `answerdotai/ModernBERT-base` |
 | Max Length | 512 tokens |
 | Batch Size | 32 |
-| Epochs | 5 |
 | Learning Rate | 3e-5 |
 | Optimizer | AdamW |
 | Class Weights | Balanced |
@@ -163,7 +169,16 @@ This model is **Stage 1** of a two-stage defense pipeline:
 1. **Stage 1 (This Model)**: Classify prompts for injection risk
 2. **Stage 2 ([ToolCallVerifier](https://huggingface.co/rootfs/tool-call-verifier))**: Verify generated tool calls are authorized
-Together, they provide defense-in-depth against prompt injection in LLM agent systems.
 ## Intended Use
@@ -201,11 +216,10 @@ This model is designed to **enhance security** of LLM-based systems. However:
   title={FunctionCallSentinel: Prompt Injection Detection for LLM Agents},
   author={Semantic Router Team},
   year={2024},
-  url={https://github.com/vllm-project/semantic-router}
 }
 ```
 ## License
 Apache 2.0

       name: Prompt Injection Detection
     metrics:
     - type: f1
+      value: 0.9829
       name: INJECTION_RISK F1
     - type: precision
+      value: 0.9827
       name: INJECTION_RISK Precision
     - type: recall
+      value: 0.9832
       name: INJECTION_RISK Recall
     - type: accuracy
+      value: 0.9828
       name: Overall Accuracy
     - type: roc_auc
+      value: 0.9982
       name: ROC-AUC
 ---
 ## Training Data
+The full dataset comprises **33,810 samples** (including the 6,762-sample held-out evaluation split) drawn from six sources:
+### Injection/Jailbreak Sources (~17,000 samples)
 | Dataset | Description | Samples |
 |---------|-------------|---------|
+| [HackAPrompt](https://huggingface.co/datasets/hackaprompt/hackaprompt-dataset) | EMNLP'23 prompt injection competition | ~5,000 |
+| [jailbreak_llms (CCS'24)](https://huggingface.co/datasets/TrustAIRLab/in-the-wild-jailbreak-prompts) | "Do Anything Now" in-the-wild jailbreaks | ~2,500 |
+| [WildJailbreak](https://huggingface.co/datasets/allenai/wildjailbreak) | Allen AI 262K adversarial safety dataset | ~5,000 |
+| Synthetic | 6 attack categories + LLMail patterns | ~4,500 |
+### Benign Sources (~17,000 samples)
 | Dataset | Description | Samples |
 |---------|-------------|---------|
+| [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) | Stanford instruction dataset | ~5,000 |
+| [Dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k) | Databricks instructions | ~5,000 |
+| jailbreak_llms (regular) | Non-jailbreak prompts from CCS'24 | ~2,500 |
+| WildJailbreak (benign) | Safe prompts from Allen AI | ~2,500 |
+| Synthetic (benign) | Generated safe prompts | ~2,000 |
 ## Performance
 | Metric | Value |
 |--------|-------|
+| **INJECTION_RISK F1** | **98.29%** |
+| INJECTION_RISK Precision | 98.27% |
+| INJECTION_RISK Recall | 98.32% |
+| SAFE F1 | 98.27% |
+| Overall Accuracy | **98.28%** |
+| ROC-AUC | **99.82%** |
 ### Interpretation
+- **High precision (98.3%)**: Very few false positives — legitimate requests are rarely blocked
+- **High recall (98.3%)**: Catches 98.3% of actual injection attempts
 - **Near-perfect AUC (99.8%)**: Excellent discrimination between safe and malicious prompts
 ## Usage
 ## Attack Categories Detected
+The model is trained to detect nine categories of prompt injection, grouped into direct jailbreaks and indirect injections:
+### Direct Jailbreaks
+1. **Roleplay/Persona**: "Pretend you're an AI with no restrictions..."
+2. **Hypothetical**: "In a fictional scenario where..."
+3. **Authority Override**: "As admin, I authorize you to..."
+### Indirect Injection (Hidden in Context)
+4. **Delimiter Injection**: `<<end_context>>`, `</system>`, `[INST]`
+5. **Word Obfuscation**: `yes Please yes send yes email`
+6. **Session Hijacking**: `START_USER_SESSION -> EXECUTE`
+7. **Social Engineering**: `I forgot to mention, after you finish...`
+8. **Encoding Tricks**: `contact <at> evil <dot> com`
+9. **XML/Template Injection**: `<execute_action>`, `{{user_request}}`
 ## Training Configuration
 | Base Model | `answerdotai/ModernBERT-base` |
 | Max Length | 512 tokens |
 | Batch Size | 32 |
+| Epochs | 5 (best @ epoch 4) |
 | Learning Rate | 3e-5 |
 | Optimizer | AdamW |
 | Class Weights | Balanced |
 1. **Stage 1 (This Model)**: Classify prompts for injection risk
 2. **Stage 2 ([ToolCallVerifier](https://huggingface.co/rootfs/tool-call-verifier))**: Verify generated tool calls are authorized
+### When to Use Each Stage
+| Scenario | Recommendation |
+|----------|----------------|
+| General chatbot | Stage 1 only (98.3% F1) |
+| RAG system | Stage 1 only |
+| Tool-calling agent (low risk) | Stage 1 only |
+| Tool-calling agent (high risk) | Both stages |
+| Email/file system access | Both stages |
+| Financial transactions | Both stages |
 ## Intended Use
   title={FunctionCallSentinel: Prompt Injection Detection for LLM Agents},
   author={Semantic Router Team},
   year={2024},
+  url={https://huggingface.co/rootfs/function-call-sentinel}
 }
 ```
 ## License
 Apache 2.0

best_metrics.json CHANGED Viewed

@@ -1,36 +1,36 @@
 {
   "classification_report": {
     "SAFE": {
-      "precision": 0.9775014801657785,
-      "recall": 0.9871449925261584,
-      "f1-score": 0.9822995686449502,
-      "support": 3345.0
     },
     "INJECTION_RISK": {
-      "precision": 0.9872931442080378,
-      "recall": 0.977758267486099,
-      "f1-score": 0.9825025731510072,
-      "support": 3417.0
     },
-    "accuracy": 0.9824016563146998,
     "macro avg": {
-      "precision": 0.9823973121869082,
-      "recall": 0.9824516300061287,
-      "f1-score": 0.9824010708979787,
       "support": 6762.0
     },
     "weighted avg": {
-      "precision": 0.9824494417204073,
-      "recall": 0.9824016563146998,
-      "f1-score": 0.9824021516673099,
       "support": 6762.0
     }
   },
-  "accuracy": 0.9824016563146998,
-  "macro_f1": 0.9824010708979787,
-  "weighted_f1": 0.9824021516673099,
-  "injection_precision": 0.9872931442080378,
-  "injection_recall": 0.977758267486099,
-  "injection_f1": 0.9825025731510072,
-  "roc_auc": 0.9978443314947291
 }

 {
   "classification_report": {
     "SAFE": {
+      "precision": 0.9830357142857142,
+      "recall": 0.9824509220701964,
+      "f1-score": 0.9827432311811961,
+      "support": 3362.0
     },
     "INJECTION_RISK": {
+      "precision": 0.9826572604350382,
+      "recall": 0.9832352941176471,
+      "f1-score": 0.9829461922963835,
+      "support": 3400.0
     },
+    "accuracy": 0.9828453120378586,
     "macro avg": {
+      "precision": 0.9828464873603762,
+      "recall": 0.9828431080939217,
+      "f1-score": 0.9828447117387897,
       "support": 6762.0
     },
     "weighted avg": {
+      "precision": 0.9828454239733365,
+      "recall": 0.9828453120378586,
+      "f1-score": 0.9828452820229052,
       "support": 6762.0
     }
   },
+  "accuracy": 0.9828453120378586,
+  "macro_f1": 0.9828447117387897,
+  "weighted_f1": 0.9828452820229052,
+  "injection_precision": 0.9826572604350382,
+  "injection_recall": 0.9832352941176471,
+  "injection_f1": 0.9829461922963835,
+  "roc_auc": 0.9982100990306891
 }

final_report.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-  "accuracy": 0.9824016563146998,
-  "injection_precision": 0.9872931442080378,
-  "injection_recall": 0.977758267486099,
-  "injection_f1": 0.9825025731510072,
-  "roc_auc": 0.9978443314947291,
-  "macro_f1": 0.9824010708979787
 }

 {
+  "accuracy": 0.9828453120378586,
+  "injection_precision": 0.9826572604350382,
+  "injection_recall": 0.9832352941176471,
+  "injection_f1": 0.9829461922963835,
+  "roc_auc": 0.9982100990306891,
+  "macro_f1": 0.9828447117387897
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b02116b5148e832610046fd190a831304fdefe7948d05f1892046abd627a59c
 size 598439784

 version https://git-lfs.github.com/spec/v1
+oid sha256:b71b71b593eabccf5167ebf9c451f2cb32fcfce8722b0c479ac499408a7b70df
 size 598439784

training_config.json CHANGED Viewed

@@ -15,7 +15,7 @@
   "max_length": 512,
   "use_class_weights": true,
   "class_weights": [
-    0.9973379969596863,
-    1.002661943435669
   ]
 }

   "max_length": 512,
   "use_class_weights": true,
   "class_weights": [
+    0.9985950589179993,
+    1.001404881477356
   ]
 }