Upload PII-NER v1 model (F1=0.904, 40 entity types)
Browse files- README.md +175 -0
- config.json +207 -0
- model.safetensors +3 -0
- test_results.json +70 -0
- tokenizer.json +0 -0
- training_args.bin +3 -0
- training_config.json +21 -0
README.md
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
license: apache-2.0
|
| 4 |
+
language:
|
| 5 |
+
- en
|
| 6 |
+
tags:
|
| 7 |
+
- token-classification
|
| 8 |
+
- ner
|
| 9 |
+
- pii
|
| 10 |
+
- privacy
|
| 11 |
+
- deberta
|
| 12 |
+
- crf
|
| 13 |
+
datasets:
|
| 14 |
+
- ai4privacy/internationalised_pii_dataset
|
| 15 |
+
- gretelai/gretel-pii-masking-en-v1
|
| 16 |
+
pipeline_tag: token-classification
|
| 17 |
+
model-index:
|
| 18 |
+
- name: datafog-pii-ner-v1
|
| 19 |
+
results:
|
| 20 |
+
- task:
|
| 21 |
+
type: token-classification
|
| 22 |
+
name: Named Entity Recognition
|
| 23 |
+
metrics:
|
| 24 |
+
- type: f1
|
| 25 |
+
value: 0.904
|
| 26 |
+
name: Overall F1
|
| 27 |
+
- type: precision
|
| 28 |
+
value: 0.907
|
| 29 |
+
name: Overall Precision
|
| 30 |
+
- type: recall
|
| 31 |
+
value: 0.902
|
| 32 |
+
name: Overall Recall
|
| 33 |
+
---
|
| 34 |
+
|
| 35 |
+
# DataFog PII-NER v1
|
| 36 |
+
|
| 37 |
+
A token classification model for detecting **Personally Identifiable Information (PII)** in English text. Built on DeBERTa-v3-xsmall with character-level CNN features and a CRF decoding head for structured BIO tag prediction.
|
| 38 |
+
|
| 39 |
+
## Model Details
|
| 40 |
+
|
| 41 |
+
| Property | Value |
|
| 42 |
+
|----------|-------|
|
| 43 |
+
| Architecture | DeBERTa-v3-xsmall + CharCNN + CRF |
|
| 44 |
+
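| Parameters | ~71.6M total (70.7M in the DeBERTa-v3-xsmall backbone including embeddings, plus the CharCNN, fusion, and CRF layers) |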
|
| 45 |
+
| Labels | 89 BIO tags (44 entity types + `O`) |
|
| 46 |
+
| Max sequence length | 256 tokens |
|
| 47 |
+
| Training data | ~135K examples from 3 datasets |
|
| 48 |
+
| Training hardware | NVIDIA A100 (Colab), BF16 mixed precision |
|
| 49 |
+
| Framework | Transformers 5.0, PyTorch 2.x |
|
| 50 |
+
|
| 51 |
+
## Architecture
|
| 52 |
+
|
| 53 |
+
```
|
| 54 |
+
Input text
|
| 55 |
+
|
|
| 56 |
+
v
|
| 57 |
+
DeBERTa-v3-xsmall (70.7M pretrained params)
|
| 58 |
+
|
|
| 59 |
+
v
|
| 60 |
+
Character CNN (3/4/5-gram filters)
|
| 61 |
+
|
|
| 62 |
+
v
|
| 63 |
+
Gating Fusion (learned weighted combination)
|
| 64 |
+
|
|
| 65 |
+
v
|
| 66 |
+
CRF Head (sequence-level decoding)
|
| 67 |
+
|
|
| 68 |
+
v
|
| 69 |
+
89 BIO tag predictions
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
The CRF head enforces valid BIO tag sequences (e.g., I-PERSON can only follow B-PERSON or I-PERSON), which improves entity boundary detection compared to independent per-token classification.
|
| 73 |
+
|
| 74 |
+
## Supported Entity Types (44 types, 4 tiers)
|
| 75 |
+
|
| 76 |
+
### Tier 1 -- Critical PII
|
| 77 |
+
SSN, Credit Card, Bank Account, Passport Number, Drivers License, Tax ID
|
| 78 |
+
|
| 79 |
+
### Tier 2 -- High Sensitivity
|
| 80 |
+
Person, Email, Phone, Date of Birth, Street Address, IP Address
|
| 81 |
+
|
| 82 |
+
### Tier 3 -- Moderate Sensitivity
|
| 83 |
+
Username, Date, Location, Organization, URL, License Plate, Age, Nationality, Gender, Ethnicity, Religion, Marital Status
|
| 84 |
+
|
| 85 |
+
### Tier 4 -- Domain-Specific
|
| 86 |
+
Medical Record, Employee ID, Student ID, Account Number, PIN, Password, Biometric, Vehicle ID, Device ID, Crypto Wallet, IBAN, Swift Code, Insurance Number, Salary, Criminal Record, Political Affiliation, Sexual Orientation, Health Condition, Genetic Data, Trade Union
|
| 87 |
+
|
| 88 |
+
## Test Set Results
|
| 89 |
+
|
| 90 |
+
| Metric | Value |
|
| 91 |
+
|--------|-------|
|
| 92 |
+
| **Overall F1** | **0.904** |
|
| 93 |
+
| Overall Precision | 0.907 |
|
| 94 |
+
| Overall Recall | 0.902 |
|
| 95 |
+
|
| 96 |
+
### Tier Recall
|
| 97 |
+
|
| 98 |
+
| Tier | Recall | Target |
|
| 99 |
+
|------|--------|--------|
|
| 100 |
+
| Tier 1 (Critical) | 0.722 | 0.98 |
|
| 101 |
+
| Tier 2 (High) | 0.934 | 0.95 |
|
| 102 |
+
| Tier 3 (Moderate) | 0.919 | 0.90 |
|
| 103 |
+
| Tier 4 (Domain) | 0.866 | 0.85 |
|
| 104 |
+
|
| 105 |
+
### Per-Entity F1 and Recall (28 Types with Test Results)
|
| 106 |
+
|
| 107 |
+
| Entity Type | F1 | Recall |
|
| 108 |
+
|-------------|-----|--------|
|
| 109 |
+
| Biometric | 0.996 | 0.996 |
|
| 110 |
+
| URL | 0.994 | 0.995 |
|
| 111 |
+
| Email | 0.991 | 0.987 |
|
| 112 |
+
| IP Address | 0.988 | 0.992 |
|
| 113 |
+
| Date of Birth | 0.978 | 0.980 |
|
| 114 |
+
| Vehicle ID | 0.964 | 0.989 |
|
| 115 |
+
| Phone | 0.963 | 0.961 |
|
| 116 |
+
| Employee ID | 0.962 | 0.959 |
|
| 117 |
+
| License Plate | 0.960 | 0.952 |
|
| 118 |
+
| Gender | 0.952 | 0.949 |
|
| 119 |
+
| IBAN | 0.930 | 0.898 |
|
| 120 |
+
| Swift Code | 0.926 | 0.980 |
|
| 121 |
+
| Username | 0.924 | 0.912 |
|
| 122 |
+
| Location | 0.922 | 0.908 |
|
| 123 |
+
| Account Number | 0.908 | 0.917 |
|
| 124 |
+
| Organization | 0.898 | 0.903 |
|
| 125 |
+
| SSN | 0.891 | 0.858 |
|
| 126 |
+
| Drivers License | 0.885 | 0.881 |
|
| 127 |
+
| Password | 0.878 | 0.885 |
|
| 128 |
+
| Date | 0.875 | 0.869 |
|
| 129 |
+
| Person | 0.861 | 0.868 |
|
| 130 |
+
| Credit Card | 0.862 | 0.839 |
|
| 131 |
+
| Age | 0.851 | 0.861 |
|
| 132 |
+
| Street Address | 0.834 | 0.817 |
|
| 133 |
+
| Bank Account | 0.791 | 0.746 |
|
| 134 |
+
| Tax ID | 0.665 | 0.624 |
|
| 135 |
+
| Passport Number | 0.469 | 0.385 |
|
| 136 |
+
| PIN | 0.432 | 0.302 |
|
| 137 |
+
|
| 138 |
+
## Training Details
|
| 139 |
+
|
| 140 |
+
- **Backbone LR:** 2e-5 (with AdamW eps=1.0 to prevent NaN)
|
| 141 |
+
- **Head LR:** 1e-3 (50x the backbone learning rate)
|
| 142 |
+
- **Warmup:** 10% of steps
|
| 143 |
+
- **Epochs:** 10 (best checkpoint at epoch 5)
|
| 144 |
+
- **Effective batch size:** 32
|
| 145 |
+
- **Mixed precision:** BF16
|
| 146 |
+
|
| 147 |
+
## Training Data
|
| 148 |
+
|
| 149 |
+
Trained on a combined dataset of ~135K examples from:
|
| 150 |
+
- [AI4Privacy PII Dataset](https://huggingface.co/datasets/ai4privacy/internationalised_pii_dataset)
|
| 151 |
+
- [Nemotron PII](https://huggingface.co/datasets/ai4privacy/pii-masking-400k)
|
| 152 |
+
- [Gretel PII Masking](https://huggingface.co/datasets/gretelai/gretel-pii-masking-en-v1)
|
| 153 |
+
|
| 154 |
+
## Limitations
|
| 155 |
+
|
| 156 |
+
- Tier 1 recall (0.722) is below the 0.98 target -- critical PII types like SSN, Credit Card, and Passport Number need improvement
|
| 157 |
+
- Rare entity types (PIN, Passport Number, Tax ID) have low F1 due to limited training examples
|
| 158 |
+
- English-only
|
| 159 |
+
- Max 256 tokens per input (longer documents need chunking)
|
| 160 |
+
- Custom architecture requires the `datafog-pii-ner` package for loading (not a standard HuggingFace token classifier)
|
| 161 |
+
|
| 162 |
+
## Citation
|
| 163 |
+
|
| 164 |
+
```bibtex
|
| 165 |
+
@software{datafog_pii_ner_v1,
|
| 166 |
+
title={DataFog PII-NER v1: Token Classification for PII Detection},
|
| 167 |
+
author={DataFog},
|
| 168 |
+
year={2026},
|
| 169 |
+
url={https://github.com/DataFog/datafog-labs}
|
| 170 |
+
}
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
## License
|
| 174 |
+
|
| 175 |
+
Apache 2.0
|
config.json
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"PiiNerModel"
|
| 4 |
+
],
|
| 5 |
+
"backbone": "microsoft/deberta-v3-xsmall",
|
| 6 |
+
"char_cnn_filters": [
|
| 7 |
+
50,
|
| 8 |
+
50,
|
| 9 |
+
50
|
| 10 |
+
],
|
| 11 |
+
"char_cnn_widths": [
|
| 12 |
+
3,
|
| 13 |
+
4,
|
| 14 |
+
5
|
| 15 |
+
],
|
| 16 |
+
"char_embed_dim": 50,
|
| 17 |
+
"char_vocab_size": 256,
|
| 18 |
+
"dropout": 0.1,
|
| 19 |
+
"dtype": "float16",
|
| 20 |
+
"id2label": {
|
| 21 |
+
"0": "O",
|
| 22 |
+
"1": "B-SSN",
|
| 23 |
+
"2": "I-SSN",
|
| 24 |
+
"3": "B-CREDIT_CARD",
|
| 25 |
+
"4": "I-CREDIT_CARD",
|
| 26 |
+
"5": "B-BANK_ACCOUNT",
|
| 27 |
+
"6": "I-BANK_ACCOUNT",
|
| 28 |
+
"7": "B-PASSPORT_NUMBER",
|
| 29 |
+
"8": "I-PASSPORT_NUMBER",
|
| 30 |
+
"9": "B-DRIVERS_LICENSE",
|
| 31 |
+
"10": "I-DRIVERS_LICENSE",
|
| 32 |
+
"11": "B-TAX_ID",
|
| 33 |
+
"12": "I-TAX_ID",
|
| 34 |
+
"13": "B-PERSON",
|
| 35 |
+
"14": "I-PERSON",
|
| 36 |
+
"15": "B-EMAIL",
|
| 37 |
+
"16": "I-EMAIL",
|
| 38 |
+
"17": "B-PHONE",
|
| 39 |
+
"18": "I-PHONE",
|
| 40 |
+
"19": "B-DATE_OF_BIRTH",
|
| 41 |
+
"20": "I-DATE_OF_BIRTH",
|
| 42 |
+
"21": "B-STREET_ADDRESS",
|
| 43 |
+
"22": "I-STREET_ADDRESS",
|
| 44 |
+
"23": "B-IP_ADDRESS",
|
| 45 |
+
"24": "I-IP_ADDRESS",
|
| 46 |
+
"25": "B-USERNAME",
|
| 47 |
+
"26": "I-USERNAME",
|
| 48 |
+
"27": "B-DATE",
|
| 49 |
+
"28": "I-DATE",
|
| 50 |
+
"29": "B-LOCATION",
|
| 51 |
+
"30": "I-LOCATION",
|
| 52 |
+
"31": "B-ORGANIZATION",
|
| 53 |
+
"32": "I-ORGANIZATION",
|
| 54 |
+
"33": "B-URL",
|
| 55 |
+
"34": "I-URL",
|
| 56 |
+
"35": "B-LICENSE_PLATE",
|
| 57 |
+
"36": "I-LICENSE_PLATE",
|
| 58 |
+
"37": "B-AGE",
|
| 59 |
+
"38": "I-AGE",
|
| 60 |
+
"39": "B-NATIONALITY",
|
| 61 |
+
"40": "I-NATIONALITY",
|
| 62 |
+
"41": "B-GENDER",
|
| 63 |
+
"42": "I-GENDER",
|
| 64 |
+
"43": "B-ETHNICITY",
|
| 65 |
+
"44": "I-ETHNICITY",
|
| 66 |
+
"45": "B-RELIGION",
|
| 67 |
+
"46": "I-RELIGION",
|
| 68 |
+
"47": "B-MARITAL_STATUS",
|
| 69 |
+
"48": "I-MARITAL_STATUS",
|
| 70 |
+
"49": "B-MEDICAL_RECORD",
|
| 71 |
+
"50": "I-MEDICAL_RECORD",
|
| 72 |
+
"51": "B-EMPLOYEE_ID",
|
| 73 |
+
"52": "I-EMPLOYEE_ID",
|
| 74 |
+
"53": "B-STUDENT_ID",
|
| 75 |
+
"54": "I-STUDENT_ID",
|
| 76 |
+
"55": "B-ACCOUNT_NUMBER",
|
| 77 |
+
"56": "I-ACCOUNT_NUMBER",
|
| 78 |
+
"57": "B-PIN",
|
| 79 |
+
"58": "I-PIN",
|
| 80 |
+
"59": "B-PASSWORD",
|
| 81 |
+
"60": "I-PASSWORD",
|
| 82 |
+
"61": "B-BIOMETRIC",
|
| 83 |
+
"62": "I-BIOMETRIC",
|
| 84 |
+
"63": "B-VEHICLE_ID",
|
| 85 |
+
"64": "I-VEHICLE_ID",
|
| 86 |
+
"65": "B-DEVICE_ID",
|
| 87 |
+
"66": "I-DEVICE_ID",
|
| 88 |
+
"67": "B-CRYPTO_WALLET",
|
| 89 |
+
"68": "I-CRYPTO_WALLET",
|
| 90 |
+
"69": "B-IBAN",
|
| 91 |
+
"70": "I-IBAN",
|
| 92 |
+
"71": "B-SWIFT_CODE",
|
| 93 |
+
"72": "I-SWIFT_CODE",
|
| 94 |
+
"73": "B-INSURANCE_NUMBER",
|
| 95 |
+
"74": "I-INSURANCE_NUMBER",
|
| 96 |
+
"75": "B-SALARY",
|
| 97 |
+
"76": "I-SALARY",
|
| 98 |
+
"77": "B-CRIMINAL_RECORD",
|
| 99 |
+
"78": "I-CRIMINAL_RECORD",
|
| 100 |
+
"79": "B-POLITICAL_AFFILIATION",
|
| 101 |
+
"80": "I-POLITICAL_AFFILIATION",
|
| 102 |
+
"81": "B-SEXUAL_ORIENTATION",
|
| 103 |
+
"82": "I-SEXUAL_ORIENTATION",
|
| 104 |
+
"83": "B-HEALTH_CONDITION",
|
| 105 |
+
"84": "I-HEALTH_CONDITION",
|
| 106 |
+
"85": "B-GENETIC_DATA",
|
| 107 |
+
"86": "I-GENETIC_DATA",
|
| 108 |
+
"87": "B-TRADE_UNION",
|
| 109 |
+
"88": "I-TRADE_UNION"
|
| 110 |
+
},
|
| 111 |
+
"label2id": {
|
| 112 |
+
"O": 0,
|
| 113 |
+
"B-SSN": 1,
|
| 114 |
+
"I-SSN": 2,
|
| 115 |
+
"B-CREDIT_CARD": 3,
|
| 116 |
+
"I-CREDIT_CARD": 4,
|
| 117 |
+
"B-BANK_ACCOUNT": 5,
|
| 118 |
+
"I-BANK_ACCOUNT": 6,
|
| 119 |
+
"B-PASSPORT_NUMBER": 7,
|
| 120 |
+
"I-PASSPORT_NUMBER": 8,
|
| 121 |
+
"B-DRIVERS_LICENSE": 9,
|
| 122 |
+
"I-DRIVERS_LICENSE": 10,
|
| 123 |
+
"B-TAX_ID": 11,
|
| 124 |
+
"I-TAX_ID": 12,
|
| 125 |
+
"B-PERSON": 13,
|
| 126 |
+
"I-PERSON": 14,
|
| 127 |
+
"B-EMAIL": 15,
|
| 128 |
+
"I-EMAIL": 16,
|
| 129 |
+
"B-PHONE": 17,
|
| 130 |
+
"I-PHONE": 18,
|
| 131 |
+
"B-DATE_OF_BIRTH": 19,
|
| 132 |
+
"I-DATE_OF_BIRTH": 20,
|
| 133 |
+
"B-STREET_ADDRESS": 21,
|
| 134 |
+
"I-STREET_ADDRESS": 22,
|
| 135 |
+
"B-IP_ADDRESS": 23,
|
| 136 |
+
"I-IP_ADDRESS": 24,
|
| 137 |
+
"B-USERNAME": 25,
|
| 138 |
+
"I-USERNAME": 26,
|
| 139 |
+
"B-DATE": 27,
|
| 140 |
+
"I-DATE": 28,
|
| 141 |
+
"B-LOCATION": 29,
|
| 142 |
+
"I-LOCATION": 30,
|
| 143 |
+
"B-ORGANIZATION": 31,
|
| 144 |
+
"I-ORGANIZATION": 32,
|
| 145 |
+
"B-URL": 33,
|
| 146 |
+
"I-URL": 34,
|
| 147 |
+
"B-LICENSE_PLATE": 35,
|
| 148 |
+
"I-LICENSE_PLATE": 36,
|
| 149 |
+
"B-AGE": 37,
|
| 150 |
+
"I-AGE": 38,
|
| 151 |
+
"B-NATIONALITY": 39,
|
| 152 |
+
"I-NATIONALITY": 40,
|
| 153 |
+
"B-GENDER": 41,
|
| 154 |
+
"I-GENDER": 42,
|
| 155 |
+
"B-ETHNICITY": 43,
|
| 156 |
+
"I-ETHNICITY": 44,
|
| 157 |
+
"B-RELIGION": 45,
|
| 158 |
+
"I-RELIGION": 46,
|
| 159 |
+
"B-MARITAL_STATUS": 47,
|
| 160 |
+
"I-MARITAL_STATUS": 48,
|
| 161 |
+
"B-MEDICAL_RECORD": 49,
|
| 162 |
+
"I-MEDICAL_RECORD": 50,
|
| 163 |
+
"B-EMPLOYEE_ID": 51,
|
| 164 |
+
"I-EMPLOYEE_ID": 52,
|
| 165 |
+
"B-STUDENT_ID": 53,
|
| 166 |
+
"I-STUDENT_ID": 54,
|
| 167 |
+
"B-ACCOUNT_NUMBER": 55,
|
| 168 |
+
"I-ACCOUNT_NUMBER": 56,
|
| 169 |
+
"B-PIN": 57,
|
| 170 |
+
"I-PIN": 58,
|
| 171 |
+
"B-PASSWORD": 59,
|
| 172 |
+
"I-PASSWORD": 60,
|
| 173 |
+
"B-BIOMETRIC": 61,
|
| 174 |
+
"I-BIOMETRIC": 62,
|
| 175 |
+
"B-VEHICLE_ID": 63,
|
| 176 |
+
"I-VEHICLE_ID": 64,
|
| 177 |
+
"B-DEVICE_ID": 65,
|
| 178 |
+
"I-DEVICE_ID": 66,
|
| 179 |
+
"B-CRYPTO_WALLET": 67,
|
| 180 |
+
"I-CRYPTO_WALLET": 68,
|
| 181 |
+
"B-IBAN": 69,
|
| 182 |
+
"I-IBAN": 70,
|
| 183 |
+
"B-SWIFT_CODE": 71,
|
| 184 |
+
"I-SWIFT_CODE": 72,
|
| 185 |
+
"B-INSURANCE_NUMBER": 73,
|
| 186 |
+
"I-INSURANCE_NUMBER": 74,
|
| 187 |
+
"B-SALARY": 75,
|
| 188 |
+
"I-SALARY": 76,
|
| 189 |
+
"B-CRIMINAL_RECORD": 77,
|
| 190 |
+
"I-CRIMINAL_RECORD": 78,
|
| 191 |
+
"B-POLITICAL_AFFILIATION": 79,
|
| 192 |
+
"I-POLITICAL_AFFILIATION": 80,
|
| 193 |
+
"B-SEXUAL_ORIENTATION": 81,
|
| 194 |
+
"I-SEXUAL_ORIENTATION": 82,
|
| 195 |
+
"B-HEALTH_CONDITION": 83,
|
| 196 |
+
"I-HEALTH_CONDITION": 84,
|
| 197 |
+
"B-GENETIC_DATA": 85,
|
| 198 |
+
"I-GENETIC_DATA": 86,
|
| 199 |
+
"B-TRADE_UNION": 87,
|
| 200 |
+
"I-TRADE_UNION": 88
|
| 201 |
+
},
|
| 202 |
+
"max_char_len": 20,
|
| 203 |
+
"model_type": "pii_ner",
|
| 204 |
+
"transformers_version": "5.0.0",
|
| 205 |
+
"use_cache": false,
|
| 206 |
+
"num_labels": 89
|
| 207 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:888bf4c2b3f70475540abbfa30672ae2826cf4f1c3b09b02ec45c52fb56cd066
|
| 3 |
+
size 143144240
|
test_results.json
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eval_loss": 2.6675541400909424,
|
| 3 |
+
"eval_overall_f1": 0.9043580541465338,
|
| 4 |
+
"eval_overall_precision": 0.9072015660825143,
|
| 5 |
+
"eval_overall_recall": 0.9015323117921386,
|
| 6 |
+
"eval_type_account_number_f1": 0.907736863880844,
|
| 7 |
+
"eval_type_account_number_recall": 0.9172240802675585,
|
| 8 |
+
"eval_type_age_f1": 0.850771869639794,
|
| 9 |
+
"eval_type_age_recall": 0.8611111111111112,
|
| 10 |
+
"eval_type_bank_account_f1": 0.791044776119403,
|
| 11 |
+
"eval_type_bank_account_recall": 0.7464788732394366,
|
| 12 |
+
"eval_type_biometric_f1": 0.9962141698215252,
|
| 13 |
+
"eval_type_biometric_recall": 0.9956756756756757,
|
| 14 |
+
"eval_type_credit_card_f1": 0.8619495299356753,
|
| 15 |
+
"eval_type_credit_card_recall": 0.8391136801541426,
|
| 16 |
+
"eval_type_date_f1": 0.8748460591133004,
|
| 17 |
+
"eval_type_date_recall": 0.8694920440636474,
|
| 18 |
+
"eval_type_date_of_birth_f1": 0.9784499054820416,
|
| 19 |
+
"eval_type_date_of_birth_recall": 0.9803030303030303,
|
| 20 |
+
"eval_type_drivers_license_f1": 0.8849557522123893,
|
| 21 |
+
"eval_type_drivers_license_recall": 0.8807588075880759,
|
| 22 |
+
"eval_type_email_f1": 0.9905722429291821,
|
| 23 |
+
"eval_type_email_recall": 0.9873251748251748,
|
| 24 |
+
"eval_type_employee_id_f1": 0.9622997172478794,
|
| 25 |
+
"eval_type_employee_id_recall": 0.9586854460093897,
|
| 26 |
+
"eval_type_gender_f1": 0.9522240527182866,
|
| 27 |
+
"eval_type_gender_recall": 0.9490968801313628,
|
| 28 |
+
"eval_type_iban_f1": 0.9304556354916067,
|
| 29 |
+
"eval_type_iban_recall": 0.8981481481481481,
|
| 30 |
+
"eval_type_ip_address_f1": 0.987908643081057,
|
| 31 |
+
"eval_type_ip_address_recall": 0.9919064748201439,
|
| 32 |
+
"eval_type_license_plate_f1": 0.9595290654893304,
|
| 33 |
+
"eval_type_license_plate_recall": 0.9518248175182482,
|
| 34 |
+
"eval_type_location_f1": 0.9216815623965575,
|
| 35 |
+
"eval_type_location_recall": 0.9077986437141367,
|
| 36 |
+
"eval_type_organization_f1": 0.8982110448535131,
|
| 37 |
+
"eval_type_organization_recall": 0.9025635681533972,
|
| 38 |
+
"eval_type_passport_number_f1": 0.46875,
|
| 39 |
+
"eval_type_passport_number_recall": 0.38461538461538464,
|
| 40 |
+
"eval_type_password_f1": 0.8778082191780823,
|
| 41 |
+
"eval_type_password_recall": 0.8850828729281768,
|
| 42 |
+
"eval_type_person_f1": 0.8611754487550666,
|
| 43 |
+
"eval_type_person_recall": 0.867643841610151,
|
| 44 |
+
"eval_type_phone_f1": 0.9628220140515222,
|
| 45 |
+
"eval_type_phone_recall": 0.9608530528775927,
|
| 46 |
+
"eval_type_pin_f1": 0.43209876543209874,
|
| 47 |
+
"eval_type_pin_recall": 0.3017241379310345,
|
| 48 |
+
"eval_type_ssn_f1": 0.8910891089108911,
|
| 49 |
+
"eval_type_ssn_recall": 0.8583106267029973,
|
| 50 |
+
"eval_type_street_address_f1": 0.833598628627403,
|
| 51 |
+
"eval_type_street_address_recall": 0.8165027584552651,
|
| 52 |
+
"eval_type_swift_code_f1": 0.9259259259259259,
|
| 53 |
+
"eval_type_swift_code_recall": 0.9803921568627451,
|
| 54 |
+
"eval_type_tax_id_f1": 0.6650602409638554,
|
| 55 |
+
"eval_type_tax_id_recall": 0.6244343891402715,
|
| 56 |
+
"eval_type_url_f1": 0.9938633938100321,
|
| 57 |
+
"eval_type_url_recall": 0.9946595460614153,
|
| 58 |
+
"eval_type_username_f1": 0.9244669316949765,
|
| 59 |
+
"eval_type_username_recall": 0.9122681883024251,
|
| 60 |
+
"eval_type_vehicle_id_f1": 0.9642458100558661,
|
| 61 |
+
"eval_type_vehicle_id_recall": 0.9885452462772051,
|
| 62 |
+
"eval_tier_1_recall": 0.7222852935733847,
|
| 63 |
+
"eval_tier_2_recall": 0.934089055481893,
|
| 64 |
+
"eval_tier_3_recall": 0.918601849881968,
|
| 65 |
+
"eval_tier_4_recall": 0.8656847205124917,
|
| 66 |
+
"eval_runtime": 165.9273,
|
| 67 |
+
"eval_samples_per_second": 102.123,
|
| 68 |
+
"eval_steps_per_second": 3.194,
|
| 69 |
+
"epoch": 10.0
|
| 70 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8bed6f2f807b6b79aa6aa3df98784b3a76b787829f82f533f3a1e66be070a519
|
| 3 |
+
size 5265
|
training_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backbone": "microsoft/deberta-v3-xsmall",
|
| 3 |
+
"max_seq_len": 256,
|
| 4 |
+
"max_char_len": 20,
|
| 5 |
+
"dropout": 0.1,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"batch_size": 32,
|
| 8 |
+
"gradient_accumulation_steps": 1,
|
| 9 |
+
"lr_backbone": 2e-05,
|
| 10 |
+
"lr_head": 0.001,
|
| 11 |
+
"warmup_ratio": 0.1,
|
| 12 |
+
"weight_decay": 0.01,
|
| 13 |
+
"fp16": false,
|
| 14 |
+
"bf16": true,
|
| 15 |
+
"val_ratio": 0.1,
|
| 16 |
+
"test_ratio": 0.1,
|
| 17 |
+
"seed": 42,
|
| 18 |
+
"output_dir": "/content/pii_ner_v1_output",
|
| 19 |
+
"run_name": "pii-ner-v1-full",
|
| 20 |
+
"_try_bf16": true
|
| 21 |
+
}
|