scanpatch committed on
Commit
99a8c78
·
verified ·
1 Parent(s): 506e198

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - ru
5
+ - uk
6
+ tags:
7
+ - token-classification
8
+ - ner
9
+ - pii
10
+ - xlm-roberta
11
+ - transformers
12
+ library_name: transformers
13
+ license: apache-2.0
14
+ base_model: xlm-roberta-large
15
+ pipeline_tag: token-classification
16
+ model-index:
17
+ - name: pii-ner-nemotron
18
+ results:
19
+ - task:
20
+ type: token-classification
21
+ name: Named Entity Recognition
22
+ metrics:
23
+ - type: f1
24
+ value: 0.9768405285513023
25
+ ---
26
+
27
+ # pii-ner-nemotron
28
+
29
+ ## Model summary
30
+
31
+ A PII NER model trained on the Nemotron dataset for multilingual (English, Russian, and Ukrainian) PII entity extraction.
32
+
33
+ - **Base model:** `xlm-roberta-large`
34
+ - **Repository:** `scanpatch/pii-ner-nemotron`
35
+ - **Training run name:** `pii-ner-nemotron`
36
+ - **Export timestamp (UTC):** `2025-12-29T12:06:13.731145+00:00`
37
+
38
+ ## Labels
39
+
40
+ ### Entity types
41
+ - `address`
42
+ - `address_apartment`
43
+ - `address_building`
44
+ - `address_city`
45
+ - `address_country`
46
+ - `address_district`
47
+ - `address_geolocation`
48
+ - `address_house`
49
+ - `address_postal_code`
50
+ - `address_region`
51
+ - `address_street`
52
+ - `date`
53
+ - `document_number`
54
+ - `email`
55
+ - `first_name`
56
+ - `ip`
57
+ - `last_name`
58
+ - `middle_name`
59
+ - `military_individual_number`
60
+ - `mobile_phone`
61
+ - `name`
62
+ - `name_initials`
63
+ - `nickname`
64
+ - `organization`
65
+ - `snils`
66
+ - `tin`
67
+ - `vehicle_number`
68
+
69
+ ## Evaluation
70
+
71
+ | Metric | Value |
72
+ |---|---:|
73
+ | `test_f1` | `0.9768405285513023` |
74
+ | `test_precision` | `0.9734942064790006` |
75
+ | `test_recall` | `0.9802099354987895` |
76
+ | `test_accuracy` | `0.9977181928808507` |
77
+ | `train_runtime` (seconds) | `1693.5057` |
78
+ | `train_samples_per_second` | `238.116` |
79
+
80
+ ## How to use
81
+
82
+ ```python
83
+ from transformers import pipeline
84
+
85
+ ner = pipeline(
86
+ "token-classification",
87
+ model="scanpatch/pii-ner-nemotron",
88
+ aggregation_strategy="simple",
89
+ )
90
+
91
+ text = "Contact me at test@example.com and my phone is +380 67 123 45 67."
92
+ print(ner(text))
93
+ ```
config.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "XLMRobertaForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "id2label": {
14
+ "0": "O",
15
+ "1": "B-address",
16
+ "2": "I-address",
17
+ "3": "B-address_apartment",
18
+ "4": "I-address_apartment",
19
+ "5": "B-address_building",
20
+ "6": "I-address_building",
21
+ "7": "B-address_city",
22
+ "8": "I-address_city",
23
+ "9": "B-address_country",
24
+ "10": "I-address_country",
25
+ "11": "B-address_district",
26
+ "12": "I-address_district",
27
+ "13": "B-address_geolocation",
28
+ "14": "I-address_geolocation",
29
+ "15": "B-address_house",
30
+ "16": "I-address_house",
31
+ "17": "B-address_postal_code",
32
+ "18": "I-address_postal_code",
33
+ "19": "B-address_region",
34
+ "20": "I-address_region",
35
+ "21": "B-address_street",
36
+ "22": "I-address_street",
37
+ "23": "B-date",
38
+ "24": "I-date",
39
+ "25": "B-document_number",
40
+ "26": "I-document_number",
41
+ "27": "B-email",
42
+ "28": "I-email",
43
+ "29": "B-first_name",
44
+ "30": "I-first_name",
45
+ "31": "B-ip",
46
+ "32": "I-ip",
47
+ "33": "B-last_name",
48
+ "34": "I-last_name",
49
+ "35": "B-middle_name",
50
+ "36": "I-middle_name",
51
+ "37": "B-military_individual_number",
52
+ "38": "I-military_individual_number",
53
+ "39": "B-mobile_phone",
54
+ "40": "I-mobile_phone",
55
+ "41": "B-name",
56
+ "42": "I-name",
57
+ "43": "B-name_initials",
58
+ "44": "I-name_initials",
59
+ "45": "B-nickname",
60
+ "46": "I-nickname",
61
+ "47": "B-organization",
62
+ "48": "I-organization",
63
+ "49": "B-snils",
64
+ "50": "I-snils",
65
+ "51": "B-tin",
66
+ "52": "I-tin",
67
+ "53": "B-vehicle_number",
68
+ "54": "I-vehicle_number"
69
+ },
70
+ "initializer_range": 0.02,
71
+ "intermediate_size": 4096,
72
+ "label2id": {
73
+ "B-address": 1,
74
+ "B-address_apartment": 3,
75
+ "B-address_building": 5,
76
+ "B-address_city": 7,
77
+ "B-address_country": 9,
78
+ "B-address_district": 11,
79
+ "B-address_geolocation": 13,
80
+ "B-address_house": 15,
81
+ "B-address_postal_code": 17,
82
+ "B-address_region": 19,
83
+ "B-address_street": 21,
84
+ "B-date": 23,
85
+ "B-document_number": 25,
86
+ "B-email": 27,
87
+ "B-first_name": 29,
88
+ "B-ip": 31,
89
+ "B-last_name": 33,
90
+ "B-middle_name": 35,
91
+ "B-military_individual_number": 37,
92
+ "B-mobile_phone": 39,
93
+ "B-name": 41,
94
+ "B-name_initials": 43,
95
+ "B-nickname": 45,
96
+ "B-organization": 47,
97
+ "B-snils": 49,
98
+ "B-tin": 51,
99
+ "B-vehicle_number": 53,
100
+ "I-address": 2,
101
+ "I-address_apartment": 4,
102
+ "I-address_building": 6,
103
+ "I-address_city": 8,
104
+ "I-address_country": 10,
105
+ "I-address_district": 12,
106
+ "I-address_geolocation": 14,
107
+ "I-address_house": 16,
108
+ "I-address_postal_code": 18,
109
+ "I-address_region": 20,
110
+ "I-address_street": 22,
111
+ "I-date": 24,
112
+ "I-document_number": 26,
113
+ "I-email": 28,
114
+ "I-first_name": 30,
115
+ "I-ip": 32,
116
+ "I-last_name": 34,
117
+ "I-middle_name": 36,
118
+ "I-military_individual_number": 38,
119
+ "I-mobile_phone": 40,
120
+ "I-name": 42,
121
+ "I-name_initials": 44,
122
+ "I-nickname": 46,
123
+ "I-organization": 48,
124
+ "I-snils": 50,
125
+ "I-tin": 52,
126
+ "I-vehicle_number": 54,
127
+ "O": 0
128
+ },
129
+ "layer_norm_eps": 1e-05,
130
+ "max_position_embeddings": 514,
131
+ "model_type": "xlm-roberta",
132
+ "num_attention_heads": 16,
133
+ "num_hidden_layers": 24,
134
+ "output_past": true,
135
+ "pad_token_id": 1,
136
+ "position_embedding_type": "absolute",
137
+ "transformers_version": "4.57.3",
138
+ "type_vocab_size": 1,
139
+ "use_cache": true,
140
+ "vocab_size": 250002
141
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c20b6ccba7df24973121e7e344e535f4eed9824d73f74dbc5d3a6a8d886771a
3
+ size 2235637364
run_summary.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_name": "pii-ner-nemotron",
3
+ "base_model": "xlm-roberta-large",
4
+ "output_dir": "./pii_ner_outputs/nemotron",
5
+ "label_list": [
6
+ "O",
7
+ "B-address",
8
+ "I-address",
9
+ "B-address_apartment",
10
+ "I-address_apartment",
11
+ "B-address_building",
12
+ "I-address_building",
13
+ "B-address_city",
14
+ "I-address_city",
15
+ "B-address_country",
16
+ "I-address_country",
17
+ "B-address_district",
18
+ "I-address_district",
19
+ "B-address_geolocation",
20
+ "I-address_geolocation",
21
+ "B-address_house",
22
+ "I-address_house",
23
+ "B-address_postal_code",
24
+ "I-address_postal_code",
25
+ "B-address_region",
26
+ "I-address_region",
27
+ "B-address_street",
28
+ "I-address_street",
29
+ "B-date",
30
+ "I-date",
31
+ "B-document_number",
32
+ "I-document_number",
33
+ "B-email",
34
+ "I-email",
35
+ "B-first_name",
36
+ "I-first_name",
37
+ "B-ip",
38
+ "I-ip",
39
+ "B-last_name",
40
+ "I-last_name",
41
+ "B-middle_name",
42
+ "I-middle_name",
43
+ "B-military_individual_number",
44
+ "I-military_individual_number",
45
+ "B-mobile_phone",
46
+ "I-mobile_phone",
47
+ "B-name",
48
+ "I-name",
49
+ "B-name_initials",
50
+ "I-name_initials",
51
+ "B-nickname",
52
+ "I-nickname",
53
+ "B-organization",
54
+ "I-organization",
55
+ "B-snils",
56
+ "I-snils",
57
+ "B-tin",
58
+ "I-tin",
59
+ "B-vehicle_number",
60
+ "I-vehicle_number"
61
+ ],
62
+ "metrics": {
63
+ "train_runtime": 1693.5057,
64
+ "train_samples_per_second": 238.116,
65
+ "test_f1": 0.9768405285513023,
66
+ "test_precision": 0.9734942064790006,
67
+ "test_recall": 0.9802099354987895,
68
+ "test_accuracy": 0.9977181928808507
69
+ },
70
+ "timestamp_utc": "2025-12-29T12:06:13.731145+00:00"
71
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5c4016d8a8bd86269ac4af33cd40b85f148d2ffcc4f5fd5ad35225a007cf4c2
3
+ size 17082833
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 512,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e54f0d4c1a5b4d7460337789cf28ffb6bcd506d6d8374c066a1aace9b0e288a7
3
+ size 5841