v5: hyperparameter tuning, F1 98.31%
Browse files
- README.md +64 -107
- config.cfg +5 -5
- meta.json +16 -16
- ner/model +2 -2
- transformer/model +2 -2
README.md
CHANGED
|
@@ -14,82 +14,80 @@ model-index:
|
|
| 14 |
name: Named Entity Recognition
|
| 15 |
metrics:
|
| 16 |
- type: f1
|
| 17 |
-
value: 0.
|
| 18 |
name: F1
|
| 19 |
- type: precision
|
| 20 |
-
value: 0.
|
| 21 |
name: Precision
|
| 22 |
- type: recall
|
| 23 |
-
value: 0.
|
| 24 |
name: Recall
|
| 25 |
---
|
| 26 |
|
| 27 |
# Cybersecurity NER Model
|
| 28 |
|
| 29 |
-
|
| 30 |
|
| 31 |
-
## Model
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
**Version:** v4
|
| 36 |
**Framework:** spaCy 3.8+
|
| 37 |
**Training Date:** 2025-12-29
|
| 38 |
-
**
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
| 46 |
-
|
|
| 47 |
-
| SECURITY_TOOL | 100
|
| 48 |
-
|
|
| 49 |
-
|
|
| 50 |
-
|
|
| 51 |
-
|
|
| 52 |
-
|
|
| 53 |
-
|
|
| 54 |
-
|
|
| 55 |
-
|
|
| 56 |
-
| ACRONYM |
|
|
|
|
| 57 |
|
| 58 |
## Performance
|
| 59 |
|
| 60 |
-
**
|
| 61 |
-
- F1:
|
| 62 |
-
- Precision: 97.
|
| 63 |
-
- Recall: 98.
|
| 64 |
-
- Inference
|
| 65 |
|
| 66 |
-
**
|
| 67 |
-
-
|
| 68 |
-
-
|
| 69 |
-
-
|
| 70 |
|
| 71 |
-
##
|
| 72 |
|
| 73 |
-
|
| 74 |
-
- **Split:** Stratified by entity type (80% train, 10% dev, 10% test)
|
| 75 |
-
- **Examples:** 1922 total
|
| 76 |
-
- **Transformer:** Domain-adapted RoBERTa (roberta-cybersecurity)
|
| 77 |
|
| 78 |
-
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
|
| 82 |
```bash
|
| 83 |
pip install "spacy>=3.7.0" "spacy-transformers>=1.3.0"
|
| 84 |
```
|
| 85 |
|
| 86 |
-
### Load Model
|
| 87 |
-
|
| 88 |
```python
|
| 89 |
import spacy
|
| 90 |
|
| 91 |
nlp = spacy.load("pki/ner-cybersecurity")
|
| 92 |
-
doc = nlp("CISO with CISSP
|
| 93 |
|
| 94 |
for ent in doc.ents:
|
| 95 |
print(f"{ent.text:20} | {ent.label_}")
|
|
@@ -103,86 +101,45 @@ Splunk | SECURITY_TOOL
|
|
| 103 |
ISO 27001 | FRAMEWORK
|
| 104 |
```
|
| 105 |
|
| 106 |
-
### FastAPI Service
|
| 107 |
-
|
| 108 |
-
```python
|
| 109 |
-
from fastapi import FastAPI
|
| 110 |
-
import spacy
|
| 111 |
-
|
| 112 |
-
app = FastAPI()
|
| 113 |
-
nlp = spacy.load("pki/ner-cybersecurity")
|
| 114 |
-
|
| 115 |
-
@app.post("/extract")
|
| 116 |
-
async def extract_entities(text: str):
|
| 117 |
-
doc = nlp(text)
|
| 118 |
-
return {
|
| 119 |
-
"entities": [
|
| 120 |
-
{
|
| 121 |
-
"text": ent.text,
|
| 122 |
-
"label": ent.label_,
|
| 123 |
-
"start": ent.start_char,
|
| 124 |
-
"end": ent.end_char
|
| 125 |
-
}
|
| 126 |
-
for ent in doc.ents
|
| 127 |
-
]
|
| 128 |
-
}
|
| 129 |
-
```
|
| 130 |
-
|
| 131 |
## Use Cases
|
| 132 |
|
| 133 |
-
-
|
| 134 |
-
-
|
| 135 |
-
-
|
| 136 |
-
-
|
| 137 |
|
| 138 |
-
## Training
|
| 139 |
|
| 140 |
```ini
|
| 141 |
-
[training]
|
| 142 |
max_steps = 8000
|
|
|
|
|
|
|
|
|
|
| 143 |
hidden_width = 128
|
| 144 |
-
|
| 145 |
batch_size = 128
|
| 146 |
-
|
| 147 |
-
[transformer]
|
| 148 |
-
name = roberta-cybersecurity (domain-adapted)
|
| 149 |
```
|
| 150 |
|
| 151 |
## Limitations
|
| 152 |
|
| 153 |
-
-
|
| 154 |
-
-
|
| 155 |
-
-
|
| 156 |
-
|
| 157 |
-
## Model Card Authors
|
| 158 |
-
|
| 159 |
-
PKI Team
|
| 160 |
-
|
| 161 |
-
## Citation
|
| 162 |
-
|
| 163 |
-
```bibtex
|
| 164 |
-
@misc{ner-cybersecurity-v4,
|
| 165 |
-
title={Cybersecurity NER Model v4},
|
| 166 |
-
author={PKI Team},
|
| 167 |
-
year={2025},
|
| 168 |
-
publisher={Hugging Face},
|
| 169 |
-
url={https://huggingface.co/pki/ner-cybersecurity}
|
| 170 |
-
}
|
| 171 |
-
```
|
| 172 |
|
| 173 |
## License
|
| 174 |
|
| 175 |
-
MIT
|
| 176 |
|
| 177 |
## Version History
|
| 178 |
|
| 179 |
-
| Version | Date | F1 |
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
| 182 |
| v3 | 2025-01 | 69.4% | 1000 | spaCy 3.x migration |
|
| 183 |
-
| v2 | 2024-12 | 99.5%* | 1805 | spaCy 2.x (*train accuracy
|
| 184 |
-
| v1 | 2024-11 | N/A | N/A | Initial Prodigy training |
|
| 185 |
|
| 186 |
## Contact
|
| 187 |
|
| 188 |
-
|
|
|
|
| 14 |
name: Named Entity Recognition
|
| 15 |
metrics:
|
| 16 |
- type: f1
|
| 17 |
+
value: 0.9831
|
| 18 |
name: F1
|
| 19 |
- type: precision
|
| 20 |
+
value: 0.9792
|
| 21 |
name: Precision
|
| 22 |
- type: recall
|
| 23 |
+
value: 0.9869
|
| 24 |
name: Recall
|
| 25 |
---
|
| 26 |
|
| 27 |
# Cybersecurity NER Model
|
| 28 |
|
| 29 |
+
NER model for the cybersecurity domain. F1: 98.31%.
|
| 30 |
|
| 31 |
+
## Model Details
|
| 32 |
|
| 33 |
+
**Version:** v5
|
|
|
|
|
|
|
| 34 |
**Framework:** spaCy 3.8+
|
| 35 |
**Training Date:** 2025-12-29
|
| 36 |
+
**Examples:** 1922 (stratified 80/10/10)
|
| 37 |
+
**Backbone:** Domain-adapted RoBERTa
|
| 38 |
+
|
| 39 |
+
## Entities (13)
|
| 40 |
+
|
| 41 |
+
| Entity | F1 | Examples |
|
| 42 |
+
|--------|-----|----------|
|
| 43 |
+
| CERTIFICATION | 100% | CISSP, OSCP, CEH |
|
| 44 |
+
| SECURITY_ROLE | 100% | CISO, SOC Analyst |
|
| 45 |
+
| SECURITY_TOOL | 100% | Splunk, Metasploit |
|
| 46 |
+
| ATTACK_TECHNIQUE | 100% | SQL Injection, XSS |
|
| 47 |
+
| FRAMEWORK | 100% | NIST CSF, ISO 27001 |
|
| 48 |
+
| THREAT_TYPE | 100% | APT, ransomware |
|
| 49 |
+
| AUDIT_TERM | 100% | Compliance, Audit |
|
| 50 |
+
| CVE | 100% | CVE-2021-44228 |
|
| 51 |
+
| SECURITY_DOMAIN | 99.10% | Cloud Security |
|
| 52 |
+
| TECHNICAL_SKILL | 95.30% | Incident Response |
|
| 53 |
+
| REGULATION | 94.44% | GDPR, HIPAA |
|
| 54 |
+
| ACRONYM | 88.89% | SIEM, EDR |
|
| 55 |
+
| CONTROL_ID | 0% | See "CONTROL_ID Handling" below |
|
| 56 |
|
| 57 |
## Performance
|
| 58 |
|
| 59 |
+
**Metrics:**
|
| 60 |
+
- F1: 98.31%
|
| 61 |
+
- Precision: 97.92%
|
| 62 |
+
- Recall: 98.69%
|
| 63 |
+
- Inference: ~60ms/doc
|
| 64 |
|
| 65 |
+
**v5 changes from v4:**
|
| 66 |
+
- Tuned hyperparameters (dropout 0.25, L2 0.02)
|
| 67 |
+
- Improved REGULATION (+6.64pp), ACRONYM (+22.22pp)
|
| 68 |
+
- Overall +0.25pp F1
|
| 69 |
|
| 70 |
+
## CONTROL_ID Handling
|
| 71 |
|
| 72 |
+
Model F1 for CONTROL_ID: 0% (insufficient training data: 25 examples).
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
**Solution:** Hybrid approach — in production, extract CONTROL_ID entities with regex patterns alongside the model.
|
| 75 |
+
|
| 76 |
+
Patterns: ISO 27001, NIST CSF, CIS Controls, SOC 2, PCI-DSS.
|
| 77 |
|
| 78 |
+
See service implementation for details.
|
| 79 |
+
|
| 80 |
+
## Usage
|
| 81 |
|
| 82 |
```bash
|
| 83 |
pip install "spacy>=3.7.0" "spacy-transformers>=1.3.0"
|
| 84 |
```
|
| 85 |
|
|
|
|
|
|
|
| 86 |
```python
|
| 87 |
import spacy
|
| 88 |
|
| 89 |
nlp = spacy.load("pki/ner-cybersecurity")
|
| 90 |
+
doc = nlp("CISO with CISSP, expert in Splunk and ISO 27001")
|
| 91 |
|
| 92 |
for ent in doc.ents:
|
| 93 |
print(f"{ent.text:20} | {ent.label_}")
|
|
|
|
| 101 |
ISO 27001 | FRAMEWORK
|
| 102 |
```
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
## Use Cases
|
| 105 |
|
| 106 |
+
- Job/CV matching
|
| 107 |
+
- Threat intelligence extraction
|
| 108 |
+
- Compliance documentation parsing
|
| 109 |
+
- Security policy analysis
|
| 110 |
|
| 111 |
+
## Training Config
|
| 112 |
|
| 113 |
```ini
|
|
|
|
| 114 |
max_steps = 8000
|
| 115 |
+
dropout = 0.25
|
| 116 |
+
L2 = 0.02
|
| 117 |
+
learning_rate = 0.00003
|
| 118 |
hidden_width = 128
|
| 119 |
+
maxout_pieces = 3
|
| 120 |
batch_size = 128
|
|
|
|
|
|
|
|
|
|
| 121 |
```
|
| 122 |
|
| 123 |
## Limitations
|
| 124 |
|
| 125 |
+
- ACRONYM: Lower F1 (88.89%) - limited examples (46)
|
| 126 |
+
- CONTROL_ID: Requires hybrid regex approach
|
| 127 |
+
- Domain-specific: Optimized for cybersecurity text
|
| 128 |
+
- Context-dependent ambiguity on some terms
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
## License
|
| 131 |
|
| 132 |
+
MIT
|
| 133 |
|
| 134 |
## Version History
|
| 135 |
|
| 136 |
+
| Version | Date | F1 | Examples | Notes |
|
| 137 |
+
|---------|------|-----|----------|-------|
|
| 138 |
+
| v5 | 2025-12-29 | 98.31% | 1922 | Hyperparameter tuning |
|
| 139 |
+
| v4 | 2025-12-29 | 98.06% | 1922 | Stratified split, domain RoBERTa |
|
| 140 |
| v3 | 2025-01 | 69.4% | 1000 | spaCy 3.x migration |
|
| 141 |
+
| v2 | 2024-12 | 99.5%* | 1805 | spaCy 2.x (*train accuracy) |
|
|
|
|
| 142 |
|
| 143 |
## Contact
|
| 144 |
|
| 145 |
+
Issues: report them on the model repository (https://huggingface.co/pki/ner-cybersecurity).
|
config.cfg
CHANGED
|
@@ -33,7 +33,7 @@ update_with_oracle_cut_size = 100
|
|
| 33 |
state_type = "ner"
|
| 34 |
extra_state_tokens = false
|
| 35 |
hidden_width = 128
|
| 36 |
-
maxout_pieces =
|
| 37 |
use_upper = true
|
| 38 |
nO = null
|
| 39 |
|
|
@@ -88,9 +88,9 @@ dev_corpus = "corpora.dev"
|
|
| 88 |
train_corpus = "corpora.train"
|
| 89 |
seed = ${system.seed}
|
| 90 |
gpu_allocator = ${system.gpu_allocator}
|
| 91 |
-
dropout = 0.
|
| 92 |
accumulate_gradient = 3
|
| 93 |
-
patience =
|
| 94 |
max_epochs = 0
|
| 95 |
max_steps = 8000
|
| 96 |
eval_frequency = 200
|
|
@@ -115,7 +115,7 @@ progress_bar = true
|
|
| 115 |
beta1 = 0.9
|
| 116 |
beta2 = 0.999
|
| 117 |
L2_is_weight_decay = true
|
| 118 |
-
L2 = 0.
|
| 119 |
grad_clip = 1.0
|
| 120 |
use_averages = false
|
| 121 |
eps = 0.00000001
|
|
@@ -124,7 +124,7 @@ eps = 0.00000001
|
|
| 124 |
@schedules = "warmup_linear.v1"
|
| 125 |
warmup_steps = 500
|
| 126 |
total_steps = 8000
|
| 127 |
-
initial_rate = 0.
|
| 128 |
|
| 129 |
[training.score_weights]
|
| 130 |
ents_f = 1.0
|
|
|
|
| 33 |
state_type = "ner"
|
| 34 |
extra_state_tokens = false
|
| 35 |
hidden_width = 128
|
| 36 |
+
maxout_pieces = 3
|
| 37 |
use_upper = true
|
| 38 |
nO = null
|
| 39 |
|
|
|
|
| 88 |
train_corpus = "corpora.train"
|
| 89 |
seed = ${system.seed}
|
| 90 |
gpu_allocator = ${system.gpu_allocator}
|
| 91 |
+
dropout = 0.25
|
| 92 |
accumulate_gradient = 3
|
| 93 |
+
patience = 2000
|
| 94 |
max_epochs = 0
|
| 95 |
max_steps = 8000
|
| 96 |
eval_frequency = 200
|
|
|
|
| 115 |
beta1 = 0.9
|
| 116 |
beta2 = 0.999
|
| 117 |
L2_is_weight_decay = true
|
| 118 |
+
L2 = 0.02
|
| 119 |
grad_clip = 1.0
|
| 120 |
use_averages = false
|
| 121 |
eps = 0.00000001
|
|
|
|
| 124 |
@schedules = "warmup_linear.v1"
|
| 125 |
warmup_steps = 500
|
| 126 |
total_steps = 8000
|
| 127 |
+
initial_rate = 0.00003
|
| 128 |
|
| 129 |
[training.score_weights]
|
| 130 |
ents_f = 1.0
|
meta.json
CHANGED
|
@@ -48,9 +48,9 @@
|
|
| 48 |
|
| 49 |
],
|
| 50 |
"performance":{
|
| 51 |
-
"ents_f":0.
|
| 52 |
-
"ents_p":0.
|
| 53 |
-
"ents_r":0.
|
| 54 |
"ents_per_type":{
|
| 55 |
"SECURITY_ROLE":{
|
| 56 |
"p":1.0,
|
|
@@ -58,19 +58,19 @@
|
|
| 58 |
"f":1.0
|
| 59 |
},
|
| 60 |
"SECURITY_TOOL":{
|
| 61 |
-
"p":0
|
| 62 |
"r":1.0,
|
| 63 |
-
"f":0
|
| 64 |
},
|
| 65 |
"TECHNICAL_SKILL":{
|
| 66 |
-
"p":0.
|
| 67 |
-
"r":0.
|
| 68 |
-
"f":0.
|
| 69 |
},
|
| 70 |
"ATTACK_TECHNIQUE":{
|
| 71 |
-
"p":0
|
| 72 |
"r":1.0,
|
| 73 |
-
"f":0
|
| 74 |
},
|
| 75 |
"FRAMEWORK":{
|
| 76 |
"p":1.0,
|
|
@@ -88,9 +88,9 @@
|
|
| 88 |
"f":1.0
|
| 89 |
},
|
| 90 |
"REGULATION":{
|
| 91 |
-
"p":0.
|
| 92 |
"r":1.0,
|
| 93 |
-
"f":0.
|
| 94 |
},
|
| 95 |
"THREAT_TYPE":{
|
| 96 |
"p":1.0,
|
|
@@ -98,9 +98,9 @@
|
|
| 98 |
"f":1.0
|
| 99 |
},
|
| 100 |
"ACRONYM":{
|
| 101 |
-
"p":
|
| 102 |
"r":1.0,
|
| 103 |
-
"f":
|
| 104 |
},
|
| 105 |
"AUDIT_TERM":{
|
| 106 |
"p":1.0,
|
|
@@ -118,7 +118,7 @@
|
|
| 118 |
"f":1.0
|
| 119 |
}
|
| 120 |
},
|
| 121 |
-
"transformer_loss":
|
| 122 |
-
"ner_loss":
|
| 123 |
}
|
| 124 |
}
|
|
|
|
| 48 |
|
| 49 |
],
|
| 50 |
"performance":{
|
| 51 |
+
"ents_f":0.9830508475,
|
| 52 |
+
"ents_p":0.9792207792,
|
| 53 |
+
"ents_r":0.9869109948,
|
| 54 |
"ents_per_type":{
|
| 55 |
"SECURITY_ROLE":{
|
| 56 |
"p":1.0,
|
|
|
|
| 58 |
"f":1.0
|
| 59 |
},
|
| 60 |
"SECURITY_TOOL":{
|
| 61 |
+
"p":1.0,
|
| 62 |
"r":1.0,
|
| 63 |
+
"f":1.0
|
| 64 |
},
|
| 65 |
"TECHNICAL_SKILL":{
|
| 66 |
+
"p":0.9342105263,
|
| 67 |
+
"r":0.9726027397,
|
| 68 |
+
"f":0.9530201342
|
| 69 |
},
|
| 70 |
"ATTACK_TECHNIQUE":{
|
| 71 |
+
"p":1.0,
|
| 72 |
"r":1.0,
|
| 73 |
+
"f":1.0
|
| 74 |
},
|
| 75 |
"FRAMEWORK":{
|
| 76 |
"p":1.0,
|
|
|
|
| 88 |
"f":1.0
|
| 89 |
},
|
| 90 |
"REGULATION":{
|
| 91 |
+
"p":0.8947368421,
|
| 92 |
"r":1.0,
|
| 93 |
+
"f":0.9444444444
|
| 94 |
},
|
| 95 |
"THREAT_TYPE":{
|
| 96 |
"p":1.0,
|
|
|
|
| 98 |
"f":1.0
|
| 99 |
},
|
| 100 |
"ACRONYM":{
|
| 101 |
+
"p":0.8,
|
| 102 |
"r":1.0,
|
| 103 |
+
"f":0.8888888889
|
| 104 |
},
|
| 105 |
"AUDIT_TERM":{
|
| 106 |
"p":1.0,
|
|
|
|
| 118 |
"f":1.0
|
| 119 |
}
|
| 120 |
},
|
| 121 |
+
"transformer_loss":29.5171592474,
|
| 122 |
+
"ner_loss":20.9466311453
|
| 123 |
}
|
| 124 |
}
|
ner/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d6aa3567ecbde04a9c944d91a90fab7d2e0561e228c667e66a3482fcefcfa94
|
| 3 |
+
size 1018547
|
transformer/model
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09c07fd5b912da2969f0cecb91b599432839cd1b3faa7b80877203f886e82928
|
| 3 |
+
size 503478228
|