Upload folder using huggingface_hub
Browse files- .gitattributes +4 -32
- README.md +182 -0
- bert/config.json +23 -0
- bert/model.safetensors +3 -0
- checkpoints/checkpoint-60/model.safetensors +3 -0
- checkpoints/checkpoint-60/optimizer.pt +3 -0
- checkpoints/checkpoint-60/rng_state.pth +3 -0
- checkpoints/checkpoint-60/scheduler.pt +3 -0
- checkpoints/checkpoint-60/trainer_state.json +102 -0
- checkpoints/checkpoint-60/training_args.bin +3 -0
- checkpoints/checkpoint-90/model.safetensors +3 -0
- checkpoints/checkpoint-90/optimizer.pt +3 -0
- checkpoints/checkpoint-90/rng_state.pth +3 -0
- checkpoints/checkpoint-90/scheduler.pt +3 -0
- checkpoints/checkpoint-90/trainer_state.json +136 -0
- checkpoints/checkpoint-90/training_args.bin +3 -0
- classifier_heads.pt +3 -0
- tokenizer/special_tokens_map.json +7 -0
- tokenizer/tokenizer.json +0 -0
- tokenizer/tokenizer_config.json +56 -0
- tokenizer/vocab.txt +0 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,7 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
checkpoints/checkpoint-60/rng_state.pth filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
checkpoints/checkpoint-90/rng_state.pth filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language: en
|
| 4 |
+
tags:
|
| 5 |
+
- text-classification
|
| 6 |
+
- ai-safety
|
| 7 |
+
- refusal-detection
|
| 8 |
+
- bert
|
| 9 |
+
- multi-task-learning
|
| 10 |
+
metrics:
|
| 11 |
+
- accuracy
|
| 12 |
+
- f1
|
| 13 |
+
widget:
|
| 14 |
+
- text: "I cannot access that file because it contains sensitive credentials."
|
| 15 |
+
example_title: "Refusal with Explanation"
|
| 16 |
+
- text: "Sure, let me read that file for you."
|
| 17 |
+
example_title: "Compliant Response"
|
| 18 |
+
- text: "I cannot do that."
|
| 19 |
+
example_title: "Refusal without Explanation"
|
| 20 |
+
- text: "I'll run that command now using the bash tool."
|
| 21 |
+
example_title: "Compliant Action"
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
# AI Safety Refusal Classifier (BERT)
|
| 25 |
+
|
| 26 |
+
## Model Description
|
| 27 |
+
|
| 28 |
+
This is a multi-task classifier built on DistilBERT, designed for AI safety applications, specifically for detecting:
|
| 29 |
+
1. **Refusal Detection**: Whether an AI assistant response refuses a request
|
| 30 |
+
2. **Explanation Detection**: Whether a refusal includes an explanation of why the request was declined
|
| 31 |
+
|
| 32 |
+
The model is intended for use in reinforcement learning training pipelines to provide reward signals based on assistant behavior when handling potentially dangerous or malicious requests.
|
| 33 |
+
|
| 34 |
+
## Model Architecture
|
| 35 |
+
|
| 36 |
+
- **Base Model**: DistilBERT (`distilbert-base-uncased`) - 66M parameters
|
| 37 |
+
- **Architecture**: Multi-task learning with shared encoder and two classification heads
|
| 38 |
+
- Refusal classification head: Binary (Refusal vs. Compliant)
|
| 39 |
+
- Explanation classification head: Binary (Has Explanation vs. No Explanation)
|
| 40 |
+
- **Training**: Fine-tuned on 300 diverse assistant responses using combined cross-entropy loss
|
| 41 |
+
|
| 42 |
+
## Performance
|
| 43 |
+
|
| 44 |
+
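Trained for 3 epochs on 300 examples with an 80/20 train/eval split (240 training, 60 evaluation examples). Metrics below are measured on the 60-example evaluation set: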
|
| 45 |
+
|
| 46 |
+
| Task | Accuracy | F1 Score |
|
| 47 |
+
|------|----------|----------|
|
| 48 |
+
| Refusal Detection | **98.3%** | 0.99 |
|
| 49 |
+
| Explanation Detection | **98.3%** | 0.98 |
|
| 50 |
+
|
| 51 |
+
**Test Examples**:
|
| 52 |
+
```text
|
| 53 |
+
# Perfect predictions on all test cases:
|
| 54 |
+
"I cannot access that file because it contains sensitive credentials."
|
| 55 |
+
→ Refusal: True, Explanation: True
|
| 56 |
+
|
| 57 |
+
"Sure, let me read that file for you."
|
| 58 |
+
→ Refusal: False, Explanation: False
|
| 59 |
+
|
| 60 |
+
"I cannot do that."
|
| 61 |
+
→ Refusal: True, Explanation: False
|
| 62 |
+
|
| 63 |
+
"I'll run that command now using the bash tool."
|
| 64 |
+
→ Refusal: False, Explanation: False
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Training Data
|
| 68 |
+
|
| 69 |
+
The model was trained on 300 synthetic assistant responses generated using Google Gemini 2.0 Flash from a topic graph of 1,111 malicious system actions:
|
| 70 |
+
|
| 71 |
+
- **120 Compliant Responses (40%)**: Assistant agrees to perform dangerous actions
|
| 72 |
+
- Examples: "I'll run that fork bomb...", "Executing the SQL injection..."
|
| 73 |
+
- **108 Refusals with Explanation (36%)**: Assistant refuses and explains security risks
|
| 74 |
+
- Examples: "I cannot do that as it will crash the system...", "I refuse to expose those files..."
|
| 75 |
+
- **72 Refusals without Explanation (24%)**: Brief refusals
|
| 76 |
+
- Examples: "I cannot do that.", "I'm unable to comply."
|
| 77 |
+
|
| 78 |
+
Training data was explicitly designed to avoid overfitting on specific phrases by using varied phrasing patterns across all categories.
|
| 79 |
+
|
| 80 |
+
## Usage
|
| 81 |
+
|
| 82 |
+
### Loading the Model
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
from train_refusal_classifier_bert import RefusalClassifierBERT
|
| 86 |
+
|
| 87 |
+
# Load from local directory
|
| 88 |
+
classifier = RefusalClassifierBERT.load("models/refusal_classifier_bert")
|
| 89 |
+
|
| 90 |
+
# Or from HuggingFace Hub
|
| 91 |
+
classifier = RefusalClassifierBERT.load("lukehinds/ai-safety-refusal-classifier")
|
| 92 |
+
|
| 93 |
+
# Classify a response
|
| 94 |
+
result = classifier.predict("I cannot access that file because it's sensitive.")
|
| 95 |
+
print(f"Refusal: {result['is_refusal']}")
|
| 96 |
+
print(f"Has Explanation: {result['has_explanation']}")
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
### Batch Prediction
|
| 100 |
+
|
| 101 |
+
```python
|
| 102 |
+
responses = [
|
| 103 |
+
"I'll run that command for you.",
|
| 104 |
+
"I cannot do that as it's a security risk.",
|
| 105 |
+
"I won't help with that."
|
| 106 |
+
]
|
| 107 |
+
|
| 108 |
+
results = classifier.predict_batch(responses)
|
| 109 |
+
for response, result in zip(responses, results):
|
| 110 |
+
print(f"{response[:50]}... → Refusal: {result['is_refusal']}, Explanation: {result['has_explanation']}")
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
### Integration with GRPO Training
|
| 114 |
+
|
| 115 |
+
```python
|
| 116 |
+
from hedgehog.rewards import RefusalRewardFunction
|
| 117 |
+
|
| 118 |
+
# Use classifier in reward function
|
| 119 |
+
reward_fn = RefusalRewardFunction(
|
| 120 |
+
classifier_path="lukehinds/ai-safety-refusal-classifier"
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
# Reward calculation during training
|
| 124 |
+
reward = reward_fn.compute(
|
| 125 |
+
trajectory=trajectory,
|
| 126 |
+
sandbox_result=sandbox_result
|
| 127 |
+
)
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
## Intended Use Cases
|
| 131 |
+
|
| 132 |
+
1. **RL Training for AI Safety**: Provide reward signals during GRPO/PPO training
|
| 133 |
+
2. **Response Filtering**: Identify refusals in AI assistant outputs
|
| 134 |
+
3. **Safety Evaluation**: Assess model behavior on adversarial inputs
|
| 135 |
+
4. **Dataset Annotation**: Label large-scale conversational datasets
|
| 136 |
+
|
| 137 |
+
## Limitations
|
| 138 |
+
|
| 139 |
+
1. **Domain-Specific**: Trained on security/system administration contexts; may not generalize to other domains
|
| 140 |
+
2. **Synthetic Data**: Training data generated by LLMs, not real user interactions
|
| 141 |
+
3. **Binary Classification**: Cannot detect partial refusals or ambiguous responses
|
| 142 |
+
4. **Context-Free**: Classifies single responses without conversation history
|
| 143 |
+
5. **English Only**: Trained exclusively on English text
|
| 144 |
+
|
| 145 |
+
## Training Details
|
| 146 |
+
|
| 147 |
+
- **Framework**: PyTorch 2.6.0 with Hugging Face Transformers
|
| 148 |
+
- **Optimizer**: AdamW (lr=2e-5, weight_decay=0.01)
|
| 149 |
+
- **Epochs**: 3
|
| 150 |
+
- **Batch Size**: 8
|
| 151 |
+
- **Max Sequence Length**: 128 tokens
|
| 152 |
+
- **Hardware**: Apple Silicon (MPS) / CUDA / CPU compatible
|
| 153 |
+
- **Training Time**: ~10 seconds on M-series Mac
|
| 154 |
+
|
| 155 |
+
## Ethical Considerations
|
| 156 |
+
|
| 157 |
+
This model is designed for **defensive AI safety research only**. It should not be used to:
|
| 158 |
+
- Bypass safety mechanisms in production AI systems
|
| 159 |
+
- Generate adversarial inputs for malicious purposes
|
| 160 |
+
- Evaluate proprietary models without authorization
|
| 161 |
+
|
| 162 |
+
The training data includes examples of dangerous actions (DoS attacks, data exfiltration, etc.) for educational purposes within controlled environments.
|
| 163 |
+
|
| 164 |
+
## Citation
|
| 165 |
+
|
| 166 |
+
```bibtex
|
| 167 |
+
@software{ai_safety_refusal_classifier,
|
| 168 |
+
author = {Luke Hinds},
|
| 169 |
+
title = {AI Safety Refusal Classifier},
|
| 170 |
+
year = {2026},
|
| 171 |
+
publisher = {Hugging Face},
|
| 172 |
+
url = {https://huggingface.co/lukehinds/ai-safety-refusal-classifier}
|
| 173 |
+
}
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
## Model Card Authors
|
| 177 |
+
|
| 178 |
+
Luke Hinds
|
| 179 |
+
|
| 180 |
+
## Model Card Contact
|
| 181 |
+
|
| 182 |
+
For questions or issues, please open an issue in the [hedgehog-workspace repository](https://github.com/lukehinds/hedgehog-workspace).
|
bert/config.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation": "gelu",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"DistilBertModel"
|
| 5 |
+
],
|
| 6 |
+
"attention_dropout": 0.1,
|
| 7 |
+
"dim": 768,
|
| 8 |
+
"dropout": 0.1,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"hidden_dim": 3072,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"max_position_embeddings": 512,
|
| 13 |
+
"model_type": "distilbert",
|
| 14 |
+
"n_heads": 12,
|
| 15 |
+
"n_layers": 6,
|
| 16 |
+
"pad_token_id": 0,
|
| 17 |
+
"qa_dropout": 0.1,
|
| 18 |
+
"seq_classif_dropout": 0.2,
|
| 19 |
+
"sinusoidal_pos_embds": false,
|
| 20 |
+
"tie_weights_": true,
|
| 21 |
+
"transformers_version": "4.57.6",
|
| 22 |
+
"vocab_size": 30522
|
| 23 |
+
}
|
bert/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:873c2a1de6fe216ea3e43e35b8bdbec1c66b30db1d7fc5345f21661147bedb9e
|
| 3 |
+
size 265462608
|
checkpoints/checkpoint-60/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:248903402e3e3ac96319c69317cceffbd859cf2b289151b23ca8b6809bda25bb
|
| 3 |
+
size 265475800
|
checkpoints/checkpoint-60/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dad0224511d202f3d7fad077d1629ef87c276bab040eec1e6c45ceaaf1d1fa10
|
| 3 |
+
size 531012363
|
checkpoints/checkpoint-60/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f1e0d31acc437fbbd16411d0d11d500c5f4dbbc7561671dab7dbf23eb0f2c43
|
| 3 |
+
size 14455
|
checkpoints/checkpoint-60/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47d52c8a2ef9c78a3023d585304545c100018e267748e40a5bd31bc1978980b1
|
| 3 |
+
size 1465
|
checkpoints/checkpoint-60/trainer_state.json
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": 60,
|
| 3 |
+
"best_metric": 0.9833333333333333,
|
| 4 |
+
"best_model_checkpoint": "models/refusal_classifier_bert/checkpoints/checkpoint-60",
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 60,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.3333333333333333,
|
| 14 |
+
"grad_norm": 8.963312149047852,
|
| 15 |
+
"learning_rate": 3.6000000000000003e-06,
|
| 16 |
+
"loss": 1.491,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.6666666666666666,
|
| 21 |
+
"grad_norm": 4.98006010055542,
|
| 22 |
+
"learning_rate": 7.600000000000001e-06,
|
| 23 |
+
"loss": 1.2758,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 1.0,
|
| 28 |
+
"grad_norm": 7.646855354309082,
|
| 29 |
+
"learning_rate": 1.16e-05,
|
| 30 |
+
"loss": 0.9726,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 1.0,
|
| 35 |
+
"eval_avg_accuracy": 0.925,
|
| 36 |
+
"eval_explanation_accuracy": 0.9333333333333333,
|
| 37 |
+
"eval_explanation_f1": 0.9166666666666666,
|
| 38 |
+
"eval_loss": 0.6576232314109802,
|
| 39 |
+
"eval_refusal_accuracy": 0.9166666666666666,
|
| 40 |
+
"eval_refusal_f1": 0.935064935064935,
|
| 41 |
+
"eval_runtime": 0.1395,
|
| 42 |
+
"eval_samples_per_second": 430.17,
|
| 43 |
+
"eval_steps_per_second": 57.356,
|
| 44 |
+
"step": 30
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 1.3333333333333333,
|
| 48 |
+
"grad_norm": 2.7140560150146484,
|
| 49 |
+
"learning_rate": 1.5600000000000003e-05,
|
| 50 |
+
"loss": 0.4181,
|
| 51 |
+
"step": 40
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 1.6666666666666665,
|
| 55 |
+
"grad_norm": 0.6150715351104736,
|
| 56 |
+
"learning_rate": 1.9600000000000002e-05,
|
| 57 |
+
"loss": 0.1069,
|
| 58 |
+
"step": 50
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 2.0,
|
| 62 |
+
"grad_norm": 0.23010976612567902,
|
| 63 |
+
"learning_rate": 1.55e-05,
|
| 64 |
+
"loss": 0.0198,
|
| 65 |
+
"step": 60
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 2.0,
|
| 69 |
+
"eval_avg_accuracy": 0.9833333333333333,
|
| 70 |
+
"eval_explanation_accuracy": 0.9833333333333333,
|
| 71 |
+
"eval_explanation_f1": 0.9803921568627451,
|
| 72 |
+
"eval_loss": 0.13632577657699585,
|
| 73 |
+
"eval_refusal_accuracy": 0.9833333333333333,
|
| 74 |
+
"eval_refusal_f1": 0.9859154929577465,
|
| 75 |
+
"eval_runtime": 0.1124,
|
| 76 |
+
"eval_samples_per_second": 533.836,
|
| 77 |
+
"eval_steps_per_second": 71.178,
|
| 78 |
+
"step": 60
|
| 79 |
+
}
|
| 80 |
+
],
|
| 81 |
+
"logging_steps": 10,
|
| 82 |
+
"max_steps": 90,
|
| 83 |
+
"num_input_tokens_seen": 0,
|
| 84 |
+
"num_train_epochs": 3,
|
| 85 |
+
"save_steps": 500,
|
| 86 |
+
"stateful_callbacks": {
|
| 87 |
+
"TrainerControl": {
|
| 88 |
+
"args": {
|
| 89 |
+
"should_epoch_stop": false,
|
| 90 |
+
"should_evaluate": false,
|
| 91 |
+
"should_log": false,
|
| 92 |
+
"should_save": true,
|
| 93 |
+
"should_training_stop": false
|
| 94 |
+
},
|
| 95 |
+
"attributes": {}
|
| 96 |
+
}
|
| 97 |
+
},
|
| 98 |
+
"total_flos": 0.0,
|
| 99 |
+
"train_batch_size": 8,
|
| 100 |
+
"trial_name": null,
|
| 101 |
+
"trial_params": null
|
| 102 |
+
}
|
checkpoints/checkpoint-60/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34669319253a265c3847504fd7d438ce9d84526fe32daa43722371009d67b2cc
|
| 3 |
+
size 5841
|
checkpoints/checkpoint-90/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f31ec4878c93589c8736687497a7edfbb6e055c54b4a4654559b7d268a4cc44a
|
| 3 |
+
size 265475800
|
checkpoints/checkpoint-90/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d70fda96581b51f78365dc64ae41b844be656e51a1e2db70e3b286083b26e4c
|
| 3 |
+
size 531012363
|
checkpoints/checkpoint-90/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:391d01d3aeb4a35151817d446e4ba0b9c8a04084ae1b1b66eda188a30729da0a
|
| 3 |
+
size 14455
|
checkpoints/checkpoint-90/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02e1a3558865afa5b8682d14031dd8d25fbecbdab6dd9f8b90f872c543ed1f05
|
| 3 |
+
size 1465
|
checkpoints/checkpoint-90/trainer_state.json
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": 60,
|
| 3 |
+
"best_metric": 0.9833333333333333,
|
| 4 |
+
"best_model_checkpoint": "models/refusal_classifier_bert/checkpoints/checkpoint-60",
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 90,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.3333333333333333,
|
| 14 |
+
"grad_norm": 8.963312149047852,
|
| 15 |
+
"learning_rate": 3.6000000000000003e-06,
|
| 16 |
+
"loss": 1.491,
|
| 17 |
+
"step": 10
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.6666666666666666,
|
| 21 |
+
"grad_norm": 4.98006010055542,
|
| 22 |
+
"learning_rate": 7.600000000000001e-06,
|
| 23 |
+
"loss": 1.2758,
|
| 24 |
+
"step": 20
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 1.0,
|
| 28 |
+
"grad_norm": 7.646855354309082,
|
| 29 |
+
"learning_rate": 1.16e-05,
|
| 30 |
+
"loss": 0.9726,
|
| 31 |
+
"step": 30
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 1.0,
|
| 35 |
+
"eval_avg_accuracy": 0.925,
|
| 36 |
+
"eval_explanation_accuracy": 0.9333333333333333,
|
| 37 |
+
"eval_explanation_f1": 0.9166666666666666,
|
| 38 |
+
"eval_loss": 0.6576232314109802,
|
| 39 |
+
"eval_refusal_accuracy": 0.9166666666666666,
|
| 40 |
+
"eval_refusal_f1": 0.935064935064935,
|
| 41 |
+
"eval_runtime": 0.1395,
|
| 42 |
+
"eval_samples_per_second": 430.17,
|
| 43 |
+
"eval_steps_per_second": 57.356,
|
| 44 |
+
"step": 30
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"epoch": 1.3333333333333333,
|
| 48 |
+
"grad_norm": 2.7140560150146484,
|
| 49 |
+
"learning_rate": 1.5600000000000003e-05,
|
| 50 |
+
"loss": 0.4181,
|
| 51 |
+
"step": 40
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"epoch": 1.6666666666666665,
|
| 55 |
+
"grad_norm": 0.6150715351104736,
|
| 56 |
+
"learning_rate": 1.9600000000000002e-05,
|
| 57 |
+
"loss": 0.1069,
|
| 58 |
+
"step": 50
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 2.0,
|
| 62 |
+
"grad_norm": 0.23010976612567902,
|
| 63 |
+
"learning_rate": 1.55e-05,
|
| 64 |
+
"loss": 0.0198,
|
| 65 |
+
"step": 60
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"epoch": 2.0,
|
| 69 |
+
"eval_avg_accuracy": 0.9833333333333333,
|
| 70 |
+
"eval_explanation_accuracy": 0.9833333333333333,
|
| 71 |
+
"eval_explanation_f1": 0.9803921568627451,
|
| 72 |
+
"eval_loss": 0.13632577657699585,
|
| 73 |
+
"eval_refusal_accuracy": 0.9833333333333333,
|
| 74 |
+
"eval_refusal_f1": 0.9859154929577465,
|
| 75 |
+
"eval_runtime": 0.1124,
|
| 76 |
+
"eval_samples_per_second": 533.836,
|
| 77 |
+
"eval_steps_per_second": 71.178,
|
| 78 |
+
"step": 60
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"epoch": 2.3333333333333335,
|
| 82 |
+
"grad_norm": 0.08604129403829575,
|
| 83 |
+
"learning_rate": 1.0500000000000001e-05,
|
| 84 |
+
"loss": 0.0076,
|
| 85 |
+
"step": 70
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"epoch": 2.6666666666666665,
|
| 89 |
+
"grad_norm": 0.0653013065457344,
|
| 90 |
+
"learning_rate": 5.500000000000001e-06,
|
| 91 |
+
"loss": 0.0056,
|
| 92 |
+
"step": 80
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"epoch": 3.0,
|
| 96 |
+
"grad_norm": 0.05867352709174156,
|
| 97 |
+
"learning_rate": 5.000000000000001e-07,
|
| 98 |
+
"loss": 0.0048,
|
| 99 |
+
"step": 90
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"epoch": 3.0,
|
| 103 |
+
"eval_avg_accuracy": 0.9833333333333333,
|
| 104 |
+
"eval_explanation_accuracy": 0.9833333333333333,
|
| 105 |
+
"eval_explanation_f1": 0.9803921568627451,
|
| 106 |
+
"eval_loss": 0.15067102015018463,
|
| 107 |
+
"eval_refusal_accuracy": 0.9833333333333333,
|
| 108 |
+
"eval_refusal_f1": 0.9859154929577465,
|
| 109 |
+
"eval_runtime": 0.1127,
|
| 110 |
+
"eval_samples_per_second": 532.343,
|
| 111 |
+
"eval_steps_per_second": 70.979,
|
| 112 |
+
"step": 90
|
| 113 |
+
}
|
| 114 |
+
],
|
| 115 |
+
"logging_steps": 10,
|
| 116 |
+
"max_steps": 90,
|
| 117 |
+
"num_input_tokens_seen": 0,
|
| 118 |
+
"num_train_epochs": 3,
|
| 119 |
+
"save_steps": 500,
|
| 120 |
+
"stateful_callbacks": {
|
| 121 |
+
"TrainerControl": {
|
| 122 |
+
"args": {
|
| 123 |
+
"should_epoch_stop": false,
|
| 124 |
+
"should_evaluate": false,
|
| 125 |
+
"should_log": false,
|
| 126 |
+
"should_save": true,
|
| 127 |
+
"should_training_stop": true
|
| 128 |
+
},
|
| 129 |
+
"attributes": {}
|
| 130 |
+
}
|
| 131 |
+
},
|
| 132 |
+
"total_flos": 0.0,
|
| 133 |
+
"train_batch_size": 8,
|
| 134 |
+
"trial_name": null,
|
| 135 |
+
"trial_params": null
|
| 136 |
+
}
|
checkpoints/checkpoint-90/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34669319253a265c3847504fd7d438ce9d84526fe32daa43722371009d67b2cc
|
| 3 |
+
size 5841
|
classifier_heads.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32e8c6b3b594fadc8b878b1975c3f81c2bef69bae5470d6e1030225975780859
|
| 3 |
+
size 14967
|
tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
tokenizer/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_lower_case": true,
|
| 47 |
+
"extra_special_tokens": {},
|
| 48 |
+
"mask_token": "[MASK]",
|
| 49 |
+
"model_max_length": 512,
|
| 50 |
+
"pad_token": "[PAD]",
|
| 51 |
+
"sep_token": "[SEP]",
|
| 52 |
+
"strip_accents": null,
|
| 53 |
+
"tokenize_chinese_chars": true,
|
| 54 |
+
"tokenizer_class": "DistilBertTokenizer",
|
| 55 |
+
"unk_token": "[UNK]"
|
| 56 |
+
}
|
tokenizer/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|