PuxAI committed on
Commit
7e0284d
·
verified ·
1 Parent(s): 260eee0

Upload folder using huggingface_hub

Browse files
nemotron-pii-ready/TokenBased-CRF/checkpoint-63/config.json CHANGED
@@ -1,300 +1,40 @@
1
  {
 
2
  "architectures": [
3
  "TransformerCrfForTokenClassification"
4
  ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 50281,
8
- "classifier_activation": "gelu",
9
- "classifier_bias": false,
10
- "classifier_dropout": 0.0,
11
- "classifier_pooling": "mean",
12
- "cls_token_id": 50281,
13
- "decoder_bias": true,
14
- "deterministic_flash_attn": false,
15
  "dtype": "float32",
16
- "embedding_dropout": 0.0,
17
- "eos_token_id": 50282,
18
- "global_attn_every_n_layers": 3,
19
- "gradient_checkpointing": false,
20
- "hidden_activation": "gelu",
21
  "hidden_size": 768,
22
  "id2label": {
23
- "0": "O",
24
- "1": "B-ACCOUNT_NUMBER",
25
- "2": "I-ACCOUNT_NUMBER",
26
- "3": "B-AGE",
27
- "4": "I-AGE",
28
- "5": "B-API_KEY",
29
- "6": "I-API_KEY",
30
- "7": "B-BANK_ROUTING_NUMBER",
31
- "8": "I-BANK_ROUTING_NUMBER",
32
- "9": "B-BIOMETRIC_IDENTIFIER",
33
- "10": "I-BIOMETRIC_IDENTIFIER",
34
- "11": "B-BLOOD_TYPE",
35
- "12": "I-BLOOD_TYPE",
36
- "13": "B-CERTIFICATE_LICENSE_NUMBER",
37
- "14": "I-CERTIFICATE_LICENSE_NUMBER",
38
- "15": "B-CITY",
39
- "16": "I-CITY",
40
- "17": "B-COMPANY_NAME",
41
- "18": "I-COMPANY_NAME",
42
- "19": "B-COORDINATE",
43
- "20": "I-COORDINATE",
44
- "21": "B-COUNTRY",
45
- "22": "I-COUNTRY",
46
- "23": "B-COUNTY",
47
- "24": "I-COUNTY",
48
- "25": "B-CREDIT_DEBIT_CARD",
49
- "26": "I-CREDIT_DEBIT_CARD",
50
- "27": "B-CUSTOMER_ID",
51
- "28": "I-CUSTOMER_ID",
52
- "29": "B-CVV",
53
- "30": "I-CVV",
54
- "31": "B-DATE",
55
- "32": "I-DATE",
56
- "33": "B-DATE_OF_BIRTH",
57
- "34": "I-DATE_OF_BIRTH",
58
- "35": "B-DATE_TIME",
59
- "36": "I-DATE_TIME",
60
- "37": "B-DEVICE_IDENTIFIER",
61
- "38": "I-DEVICE_IDENTIFIER",
62
- "39": "B-EDUCATION_LEVEL",
63
- "40": "I-EDUCATION_LEVEL",
64
- "41": "B-EMAIL",
65
- "42": "I-EMAIL",
66
- "43": "B-EMPLOYEE_ID",
67
- "44": "I-EMPLOYEE_ID",
68
- "45": "B-EMPLOYMENT_STATUS",
69
- "46": "I-EMPLOYMENT_STATUS",
70
- "47": "B-FAX_NUMBER",
71
- "48": "I-FAX_NUMBER",
72
- "49": "B-FIRST_NAME",
73
- "50": "I-FIRST_NAME",
74
- "51": "B-GENDER",
75
- "52": "I-GENDER",
76
- "53": "B-HEALTH_PLAN_BENEFICIARY_NUMBER",
77
- "54": "I-HEALTH_PLAN_BENEFICIARY_NUMBER",
78
- "55": "B-HTTP_COOKIE",
79
- "56": "I-HTTP_COOKIE",
80
- "57": "B-IPV4",
81
- "58": "I-IPV4",
82
- "59": "B-IPV6",
83
- "60": "I-IPV6",
84
- "61": "B-LANGUAGE",
85
- "62": "I-LANGUAGE",
86
- "63": "B-LAST_NAME",
87
- "64": "I-LAST_NAME",
88
- "65": "B-LICENSE_PLATE",
89
- "66": "I-LICENSE_PLATE",
90
- "67": "B-MAC_ADDRESS",
91
- "68": "I-MAC_ADDRESS",
92
- "69": "B-MEDICAL_RECORD_NUMBER",
93
- "70": "I-MEDICAL_RECORD_NUMBER",
94
- "71": "B-OCCUPATION",
95
- "72": "I-OCCUPATION",
96
- "73": "B-PASSWORD",
97
- "74": "I-PASSWORD",
98
- "75": "B-PHONE_NUMBER",
99
- "76": "I-PHONE_NUMBER",
100
- "77": "B-PIN",
101
- "78": "I-PIN",
102
- "79": "B-POLITICAL_VIEW",
103
- "80": "I-POLITICAL_VIEW",
104
- "81": "B-POSTCODE",
105
- "82": "I-POSTCODE",
106
- "83": "B-RACE_ETHNICITY",
107
- "84": "I-RACE_ETHNICITY",
108
- "85": "B-RELIGIOUS_BELIEF",
109
- "86": "I-RELIGIOUS_BELIEF",
110
- "87": "B-SEXUALITY",
111
- "88": "I-SEXUALITY",
112
- "89": "B-SSN",
113
- "90": "I-SSN",
114
- "91": "B-STATE",
115
- "92": "I-STATE",
116
- "93": "B-STREET_ADDRESS",
117
- "94": "I-STREET_ADDRESS",
118
- "95": "B-SWIFT_BIC",
119
- "96": "I-SWIFT_BIC",
120
- "97": "B-TAX_ID",
121
- "98": "I-TAX_ID",
122
- "99": "B-TIME",
123
- "100": "I-TIME",
124
- "101": "B-UNIQUE_ID",
125
- "102": "I-UNIQUE_ID",
126
- "103": "B-URL",
127
- "104": "I-URL",
128
- "105": "B-USER_NAME",
129
- "106": "I-USER_NAME",
130
- "107": "B-VEHICLE_IDENTIFIER",
131
- "108": "I-VEHICLE_IDENTIFIER"
132
  },
133
- "initializer_cutoff_factor": 2.0,
134
  "initializer_range": 0.02,
135
- "intermediate_size": 1152,
 
136
  "label2id": {
137
- "B-ACCOUNT_NUMBER": 1,
138
- "B-AGE": 3,
139
- "B-API_KEY": 5,
140
- "B-BANK_ROUTING_NUMBER": 7,
141
- "B-BIOMETRIC_IDENTIFIER": 9,
142
- "B-BLOOD_TYPE": 11,
143
- "B-CERTIFICATE_LICENSE_NUMBER": 13,
144
- "B-CITY": 15,
145
- "B-COMPANY_NAME": 17,
146
- "B-COORDINATE": 19,
147
- "B-COUNTRY": 21,
148
- "B-COUNTY": 23,
149
- "B-CREDIT_DEBIT_CARD": 25,
150
- "B-CUSTOMER_ID": 27,
151
- "B-CVV": 29,
152
- "B-DATE": 31,
153
- "B-DATE_OF_BIRTH": 33,
154
- "B-DATE_TIME": 35,
155
- "B-DEVICE_IDENTIFIER": 37,
156
- "B-EDUCATION_LEVEL": 39,
157
- "B-EMAIL": 41,
158
- "B-EMPLOYEE_ID": 43,
159
- "B-EMPLOYMENT_STATUS": 45,
160
- "B-FAX_NUMBER": 47,
161
- "B-FIRST_NAME": 49,
162
- "B-GENDER": 51,
163
- "B-HEALTH_PLAN_BENEFICIARY_NUMBER": 53,
164
- "B-HTTP_COOKIE": 55,
165
- "B-IPV4": 57,
166
- "B-IPV6": 59,
167
- "B-LANGUAGE": 61,
168
- "B-LAST_NAME": 63,
169
- "B-LICENSE_PLATE": 65,
170
- "B-MAC_ADDRESS": 67,
171
- "B-MEDICAL_RECORD_NUMBER": 69,
172
- "B-OCCUPATION": 71,
173
- "B-PASSWORD": 73,
174
- "B-PHONE_NUMBER": 75,
175
- "B-PIN": 77,
176
- "B-POLITICAL_VIEW": 79,
177
- "B-POSTCODE": 81,
178
- "B-RACE_ETHNICITY": 83,
179
- "B-RELIGIOUS_BELIEF": 85,
180
- "B-SEXUALITY": 87,
181
- "B-SSN": 89,
182
- "B-STATE": 91,
183
- "B-STREET_ADDRESS": 93,
184
- "B-SWIFT_BIC": 95,
185
- "B-TAX_ID": 97,
186
- "B-TIME": 99,
187
- "B-UNIQUE_ID": 101,
188
- "B-URL": 103,
189
- "B-USER_NAME": 105,
190
- "B-VEHICLE_IDENTIFIER": 107,
191
- "I-ACCOUNT_NUMBER": 2,
192
- "I-AGE": 4,
193
- "I-API_KEY": 6,
194
- "I-BANK_ROUTING_NUMBER": 8,
195
- "I-BIOMETRIC_IDENTIFIER": 10,
196
- "I-BLOOD_TYPE": 12,
197
- "I-CERTIFICATE_LICENSE_NUMBER": 14,
198
- "I-CITY": 16,
199
- "I-COMPANY_NAME": 18,
200
- "I-COORDINATE": 20,
201
- "I-COUNTRY": 22,
202
- "I-COUNTY": 24,
203
- "I-CREDIT_DEBIT_CARD": 26,
204
- "I-CUSTOMER_ID": 28,
205
- "I-CVV": 30,
206
- "I-DATE": 32,
207
- "I-DATE_OF_BIRTH": 34,
208
- "I-DATE_TIME": 36,
209
- "I-DEVICE_IDENTIFIER": 38,
210
- "I-EDUCATION_LEVEL": 40,
211
- "I-EMAIL": 42,
212
- "I-EMPLOYEE_ID": 44,
213
- "I-EMPLOYMENT_STATUS": 46,
214
- "I-FAX_NUMBER": 48,
215
- "I-FIRST_NAME": 50,
216
- "I-GENDER": 52,
217
- "I-HEALTH_PLAN_BENEFICIARY_NUMBER": 54,
218
- "I-HTTP_COOKIE": 56,
219
- "I-IPV4": 58,
220
- "I-IPV6": 60,
221
- "I-LANGUAGE": 62,
222
- "I-LAST_NAME": 64,
223
- "I-LICENSE_PLATE": 66,
224
- "I-MAC_ADDRESS": 68,
225
- "I-MEDICAL_RECORD_NUMBER": 70,
226
- "I-OCCUPATION": 72,
227
- "I-PASSWORD": 74,
228
- "I-PHONE_NUMBER": 76,
229
- "I-PIN": 78,
230
- "I-POLITICAL_VIEW": 80,
231
- "I-POSTCODE": 82,
232
- "I-RACE_ETHNICITY": 84,
233
- "I-RELIGIOUS_BELIEF": 86,
234
- "I-SEXUALITY": 88,
235
- "I-SSN": 90,
236
- "I-STATE": 92,
237
- "I-STREET_ADDRESS": 94,
238
- "I-SWIFT_BIC": 96,
239
- "I-TAX_ID": 98,
240
- "I-TIME": 100,
241
- "I-UNIQUE_ID": 102,
242
- "I-URL": 104,
243
- "I-USER_NAME": 106,
244
- "I-VEHICLE_IDENTIFIER": 108,
245
  "O": 0
246
  },
247
- "layer_norm_eps": 1e-05,
248
- "layer_types": [
249
- "full_attention",
250
- "sliding_attention",
251
- "sliding_attention",
252
- "full_attention",
253
- "sliding_attention",
254
- "sliding_attention",
255
- "full_attention",
256
- "sliding_attention",
257
- "sliding_attention",
258
- "full_attention",
259
- "sliding_attention",
260
- "sliding_attention",
261
- "full_attention",
262
- "sliding_attention",
263
- "sliding_attention",
264
- "full_attention",
265
- "sliding_attention",
266
- "sliding_attention",
267
- "full_attention",
268
- "sliding_attention",
269
- "sliding_attention",
270
- "full_attention"
271
- ],
272
- "local_attention": 128,
273
- "max_position_embeddings": 8192,
274
- "mlp_bias": false,
275
- "mlp_dropout": 0.0,
276
- "model_type": "modernbert",
277
- "norm_bias": false,
278
- "norm_eps": 1e-05,
279
  "num_attention_heads": 12,
280
- "num_hidden_layers": 22,
281
- "pad_token_id": 50283,
282
- "position_embedding_type": "absolute",
283
- "rope_parameters": {
284
- "full_attention": {
285
- "rope_theta": 160000.0,
286
- "rope_type": "default"
287
- },
288
- "sliding_attention": {
289
- "rope_theta": 10000.0,
290
- "rope_type": "default"
291
- }
292
- },
293
- "sep_token_id": 50282,
294
- "sparse_pred_ignore_index": -100,
295
- "sparse_prediction": false,
296
  "tie_word_embeddings": true,
297
  "transformers_version": "5.3.0",
 
298
  "use_cache": false,
299
- "vocab_size": 50368
300
  }
 
1
  {
2
+ "add_cross_attention": false,
3
  "architectures": [
4
  "TransformerCrfForTokenClassification"
5
  ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "directionality": "bidi",
 
 
 
 
 
 
10
  "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
 
 
14
  "hidden_size": 768,
15
  "id2label": {
16
+ "0": "O"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
 
18
  "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "is_decoder": false,
21
  "label2id": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "O": 0
23
  },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
+ "pooler_fc_size": 768,
31
+ "pooler_num_attention_heads": 12,
32
+ "pooler_num_fc_layers": 3,
33
+ "pooler_size_per_head": 128,
34
+ "pooler_type": "first_token_transform",
 
 
 
 
 
 
 
 
 
35
  "tie_word_embeddings": true,
36
  "transformers_version": "5.3.0",
37
+ "type_vocab_size": 2,
38
  "use_cache": false,
39
+ "vocab_size": 119547
40
  }
nemotron-pii-ready/TokenBased-CRF/checkpoint-63/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e965726919d9abe4d6683e84e47fc3555c9bd708acfb3c10625b0fdd3fa9a33
3
- size 596455840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88eb408b27deb430fc1d251a404b9109b29c6dc6de065b842828dd099380174c
3
+ size 711441992
nemotron-pii-ready/TokenBased-CRF/checkpoint-63/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:225fe1de56f97502e28f204e618a2933d804abdb04c32a83b1a9996c126ac885
3
- size 1193000587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79148412b64f6a2290a7249211de11a1587d63b7a027284d03c3ba7d2f16c7e8
3
+ size 1418281163
nemotron-pii-ready/TokenBased-CRF/checkpoint-63/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:435aa54f58cd2b4f9671a7e040885474baabcfc06b399b72cba63fbdd8fc2368
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f39eab8ed980549bfffcd8b948e8852bef979d820a1c47a898b2c9f270cc3986
3
  size 14645
nemotron-pii-ready/TokenBased-CRF/checkpoint-63/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
nemotron-pii-ready/TokenBased-CRF/checkpoint-63/tokenizer_config.json CHANGED
@@ -1,17 +1,15 @@
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
4
- "clean_up_tokenization_spaces": true,
5
  "cls_token": "[CLS]",
 
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
- "model_input_names": [
9
- "input_ids",
10
- "attention_mask"
11
- ],
12
- "model_max_length": 8192,
13
  "pad_token": "[PAD]",
14
  "sep_token": "[SEP]",
15
- "tokenizer_class": "TokenizersBackend",
 
 
16
  "unk_token": "[UNK]"
17
  }
 
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
 
4
  "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
+ "model_max_length": 512,
 
 
 
 
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
  "unk_token": "[UNK]"
15
  }
nemotron-pii-ready/TokenBased-CRF/checkpoint-63/trainer_state.json CHANGED
@@ -11,10 +11,10 @@
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
- "eval_loss": 152.4741973876953,
15
- "eval_runtime": 3.3214,
16
- "eval_samples_per_second": 30.108,
17
- "eval_steps_per_second": 3.914,
18
  "step": 63
19
  }
20
  ],
@@ -35,7 +35,7 @@
35
  "attributes": {}
36
  }
37
  },
38
- "total_flos": 169616744448000.0,
39
  "train_batch_size": 8,
40
  "trial_name": null,
41
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
+ "eval_loss": 0.0,
15
+ "eval_runtime": 1.8457,
16
+ "eval_samples_per_second": 54.18,
17
+ "eval_steps_per_second": 7.043,
18
  "step": 63
19
  }
20
  ],
 
35
  "attributes": {}
36
  }
37
  },
38
+ "total_flos": 131554351104000.0,
39
  "train_batch_size": 8,
40
  "trial_name": null,
41
  "trial_params": null
nemotron-pii-ready/TokenBased-CRF/config.json CHANGED
@@ -1,300 +1,40 @@
1
  {
 
2
  "architectures": [
3
  "TransformerCrfForTokenClassification"
4
  ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 50281,
8
- "classifier_activation": "gelu",
9
- "classifier_bias": false,
10
- "classifier_dropout": 0.0,
11
- "classifier_pooling": "mean",
12
- "cls_token_id": 50281,
13
- "decoder_bias": true,
14
- "deterministic_flash_attn": false,
15
  "dtype": "float32",
16
- "embedding_dropout": 0.0,
17
- "eos_token_id": 50282,
18
- "global_attn_every_n_layers": 3,
19
- "gradient_checkpointing": false,
20
- "hidden_activation": "gelu",
21
  "hidden_size": 768,
22
  "id2label": {
23
- "0": "O",
24
- "1": "B-ACCOUNT_NUMBER",
25
- "2": "I-ACCOUNT_NUMBER",
26
- "3": "B-AGE",
27
- "4": "I-AGE",
28
- "5": "B-API_KEY",
29
- "6": "I-API_KEY",
30
- "7": "B-BANK_ROUTING_NUMBER",
31
- "8": "I-BANK_ROUTING_NUMBER",
32
- "9": "B-BIOMETRIC_IDENTIFIER",
33
- "10": "I-BIOMETRIC_IDENTIFIER",
34
- "11": "B-BLOOD_TYPE",
35
- "12": "I-BLOOD_TYPE",
36
- "13": "B-CERTIFICATE_LICENSE_NUMBER",
37
- "14": "I-CERTIFICATE_LICENSE_NUMBER",
38
- "15": "B-CITY",
39
- "16": "I-CITY",
40
- "17": "B-COMPANY_NAME",
41
- "18": "I-COMPANY_NAME",
42
- "19": "B-COORDINATE",
43
- "20": "I-COORDINATE",
44
- "21": "B-COUNTRY",
45
- "22": "I-COUNTRY",
46
- "23": "B-COUNTY",
47
- "24": "I-COUNTY",
48
- "25": "B-CREDIT_DEBIT_CARD",
49
- "26": "I-CREDIT_DEBIT_CARD",
50
- "27": "B-CUSTOMER_ID",
51
- "28": "I-CUSTOMER_ID",
52
- "29": "B-CVV",
53
- "30": "I-CVV",
54
- "31": "B-DATE",
55
- "32": "I-DATE",
56
- "33": "B-DATE_OF_BIRTH",
57
- "34": "I-DATE_OF_BIRTH",
58
- "35": "B-DATE_TIME",
59
- "36": "I-DATE_TIME",
60
- "37": "B-DEVICE_IDENTIFIER",
61
- "38": "I-DEVICE_IDENTIFIER",
62
- "39": "B-EDUCATION_LEVEL",
63
- "40": "I-EDUCATION_LEVEL",
64
- "41": "B-EMAIL",
65
- "42": "I-EMAIL",
66
- "43": "B-EMPLOYEE_ID",
67
- "44": "I-EMPLOYEE_ID",
68
- "45": "B-EMPLOYMENT_STATUS",
69
- "46": "I-EMPLOYMENT_STATUS",
70
- "47": "B-FAX_NUMBER",
71
- "48": "I-FAX_NUMBER",
72
- "49": "B-FIRST_NAME",
73
- "50": "I-FIRST_NAME",
74
- "51": "B-GENDER",
75
- "52": "I-GENDER",
76
- "53": "B-HEALTH_PLAN_BENEFICIARY_NUMBER",
77
- "54": "I-HEALTH_PLAN_BENEFICIARY_NUMBER",
78
- "55": "B-HTTP_COOKIE",
79
- "56": "I-HTTP_COOKIE",
80
- "57": "B-IPV4",
81
- "58": "I-IPV4",
82
- "59": "B-IPV6",
83
- "60": "I-IPV6",
84
- "61": "B-LANGUAGE",
85
- "62": "I-LANGUAGE",
86
- "63": "B-LAST_NAME",
87
- "64": "I-LAST_NAME",
88
- "65": "B-LICENSE_PLATE",
89
- "66": "I-LICENSE_PLATE",
90
- "67": "B-MAC_ADDRESS",
91
- "68": "I-MAC_ADDRESS",
92
- "69": "B-MEDICAL_RECORD_NUMBER",
93
- "70": "I-MEDICAL_RECORD_NUMBER",
94
- "71": "B-OCCUPATION",
95
- "72": "I-OCCUPATION",
96
- "73": "B-PASSWORD",
97
- "74": "I-PASSWORD",
98
- "75": "B-PHONE_NUMBER",
99
- "76": "I-PHONE_NUMBER",
100
- "77": "B-PIN",
101
- "78": "I-PIN",
102
- "79": "B-POLITICAL_VIEW",
103
- "80": "I-POLITICAL_VIEW",
104
- "81": "B-POSTCODE",
105
- "82": "I-POSTCODE",
106
- "83": "B-RACE_ETHNICITY",
107
- "84": "I-RACE_ETHNICITY",
108
- "85": "B-RELIGIOUS_BELIEF",
109
- "86": "I-RELIGIOUS_BELIEF",
110
- "87": "B-SEXUALITY",
111
- "88": "I-SEXUALITY",
112
- "89": "B-SSN",
113
- "90": "I-SSN",
114
- "91": "B-STATE",
115
- "92": "I-STATE",
116
- "93": "B-STREET_ADDRESS",
117
- "94": "I-STREET_ADDRESS",
118
- "95": "B-SWIFT_BIC",
119
- "96": "I-SWIFT_BIC",
120
- "97": "B-TAX_ID",
121
- "98": "I-TAX_ID",
122
- "99": "B-TIME",
123
- "100": "I-TIME",
124
- "101": "B-UNIQUE_ID",
125
- "102": "I-UNIQUE_ID",
126
- "103": "B-URL",
127
- "104": "I-URL",
128
- "105": "B-USER_NAME",
129
- "106": "I-USER_NAME",
130
- "107": "B-VEHICLE_IDENTIFIER",
131
- "108": "I-VEHICLE_IDENTIFIER"
132
  },
133
- "initializer_cutoff_factor": 2.0,
134
  "initializer_range": 0.02,
135
- "intermediate_size": 1152,
 
136
  "label2id": {
137
- "B-ACCOUNT_NUMBER": 1,
138
- "B-AGE": 3,
139
- "B-API_KEY": 5,
140
- "B-BANK_ROUTING_NUMBER": 7,
141
- "B-BIOMETRIC_IDENTIFIER": 9,
142
- "B-BLOOD_TYPE": 11,
143
- "B-CERTIFICATE_LICENSE_NUMBER": 13,
144
- "B-CITY": 15,
145
- "B-COMPANY_NAME": 17,
146
- "B-COORDINATE": 19,
147
- "B-COUNTRY": 21,
148
- "B-COUNTY": 23,
149
- "B-CREDIT_DEBIT_CARD": 25,
150
- "B-CUSTOMER_ID": 27,
151
- "B-CVV": 29,
152
- "B-DATE": 31,
153
- "B-DATE_OF_BIRTH": 33,
154
- "B-DATE_TIME": 35,
155
- "B-DEVICE_IDENTIFIER": 37,
156
- "B-EDUCATION_LEVEL": 39,
157
- "B-EMAIL": 41,
158
- "B-EMPLOYEE_ID": 43,
159
- "B-EMPLOYMENT_STATUS": 45,
160
- "B-FAX_NUMBER": 47,
161
- "B-FIRST_NAME": 49,
162
- "B-GENDER": 51,
163
- "B-HEALTH_PLAN_BENEFICIARY_NUMBER": 53,
164
- "B-HTTP_COOKIE": 55,
165
- "B-IPV4": 57,
166
- "B-IPV6": 59,
167
- "B-LANGUAGE": 61,
168
- "B-LAST_NAME": 63,
169
- "B-LICENSE_PLATE": 65,
170
- "B-MAC_ADDRESS": 67,
171
- "B-MEDICAL_RECORD_NUMBER": 69,
172
- "B-OCCUPATION": 71,
173
- "B-PASSWORD": 73,
174
- "B-PHONE_NUMBER": 75,
175
- "B-PIN": 77,
176
- "B-POLITICAL_VIEW": 79,
177
- "B-POSTCODE": 81,
178
- "B-RACE_ETHNICITY": 83,
179
- "B-RELIGIOUS_BELIEF": 85,
180
- "B-SEXUALITY": 87,
181
- "B-SSN": 89,
182
- "B-STATE": 91,
183
- "B-STREET_ADDRESS": 93,
184
- "B-SWIFT_BIC": 95,
185
- "B-TAX_ID": 97,
186
- "B-TIME": 99,
187
- "B-UNIQUE_ID": 101,
188
- "B-URL": 103,
189
- "B-USER_NAME": 105,
190
- "B-VEHICLE_IDENTIFIER": 107,
191
- "I-ACCOUNT_NUMBER": 2,
192
- "I-AGE": 4,
193
- "I-API_KEY": 6,
194
- "I-BANK_ROUTING_NUMBER": 8,
195
- "I-BIOMETRIC_IDENTIFIER": 10,
196
- "I-BLOOD_TYPE": 12,
197
- "I-CERTIFICATE_LICENSE_NUMBER": 14,
198
- "I-CITY": 16,
199
- "I-COMPANY_NAME": 18,
200
- "I-COORDINATE": 20,
201
- "I-COUNTRY": 22,
202
- "I-COUNTY": 24,
203
- "I-CREDIT_DEBIT_CARD": 26,
204
- "I-CUSTOMER_ID": 28,
205
- "I-CVV": 30,
206
- "I-DATE": 32,
207
- "I-DATE_OF_BIRTH": 34,
208
- "I-DATE_TIME": 36,
209
- "I-DEVICE_IDENTIFIER": 38,
210
- "I-EDUCATION_LEVEL": 40,
211
- "I-EMAIL": 42,
212
- "I-EMPLOYEE_ID": 44,
213
- "I-EMPLOYMENT_STATUS": 46,
214
- "I-FAX_NUMBER": 48,
215
- "I-FIRST_NAME": 50,
216
- "I-GENDER": 52,
217
- "I-HEALTH_PLAN_BENEFICIARY_NUMBER": 54,
218
- "I-HTTP_COOKIE": 56,
219
- "I-IPV4": 58,
220
- "I-IPV6": 60,
221
- "I-LANGUAGE": 62,
222
- "I-LAST_NAME": 64,
223
- "I-LICENSE_PLATE": 66,
224
- "I-MAC_ADDRESS": 68,
225
- "I-MEDICAL_RECORD_NUMBER": 70,
226
- "I-OCCUPATION": 72,
227
- "I-PASSWORD": 74,
228
- "I-PHONE_NUMBER": 76,
229
- "I-PIN": 78,
230
- "I-POLITICAL_VIEW": 80,
231
- "I-POSTCODE": 82,
232
- "I-RACE_ETHNICITY": 84,
233
- "I-RELIGIOUS_BELIEF": 86,
234
- "I-SEXUALITY": 88,
235
- "I-SSN": 90,
236
- "I-STATE": 92,
237
- "I-STREET_ADDRESS": 94,
238
- "I-SWIFT_BIC": 96,
239
- "I-TAX_ID": 98,
240
- "I-TIME": 100,
241
- "I-UNIQUE_ID": 102,
242
- "I-URL": 104,
243
- "I-USER_NAME": 106,
244
- "I-VEHICLE_IDENTIFIER": 108,
245
  "O": 0
246
  },
247
- "layer_norm_eps": 1e-05,
248
- "layer_types": [
249
- "full_attention",
250
- "sliding_attention",
251
- "sliding_attention",
252
- "full_attention",
253
- "sliding_attention",
254
- "sliding_attention",
255
- "full_attention",
256
- "sliding_attention",
257
- "sliding_attention",
258
- "full_attention",
259
- "sliding_attention",
260
- "sliding_attention",
261
- "full_attention",
262
- "sliding_attention",
263
- "sliding_attention",
264
- "full_attention",
265
- "sliding_attention",
266
- "sliding_attention",
267
- "full_attention",
268
- "sliding_attention",
269
- "sliding_attention",
270
- "full_attention"
271
- ],
272
- "local_attention": 128,
273
- "max_position_embeddings": 8192,
274
- "mlp_bias": false,
275
- "mlp_dropout": 0.0,
276
- "model_type": "modernbert",
277
- "norm_bias": false,
278
- "norm_eps": 1e-05,
279
  "num_attention_heads": 12,
280
- "num_hidden_layers": 22,
281
- "pad_token_id": 50283,
282
- "position_embedding_type": "absolute",
283
- "rope_parameters": {
284
- "full_attention": {
285
- "rope_theta": 160000.0,
286
- "rope_type": "default"
287
- },
288
- "sliding_attention": {
289
- "rope_theta": 10000.0,
290
- "rope_type": "default"
291
- }
292
- },
293
- "sep_token_id": 50282,
294
- "sparse_pred_ignore_index": -100,
295
- "sparse_prediction": false,
296
  "tie_word_embeddings": true,
297
  "transformers_version": "5.3.0",
 
298
  "use_cache": false,
299
- "vocab_size": 50368
300
  }
 
1
  {
2
+ "add_cross_attention": false,
3
  "architectures": [
4
  "TransformerCrfForTokenClassification"
5
  ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "directionality": "bidi",
 
 
 
 
 
 
10
  "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
 
 
14
  "hidden_size": 768,
15
  "id2label": {
16
+ "0": "O"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
 
18
  "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "is_decoder": false,
21
  "label2id": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "O": 0
23
  },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
+ "pooler_fc_size": 768,
31
+ "pooler_num_attention_heads": 12,
32
+ "pooler_num_fc_layers": 3,
33
+ "pooler_size_per_head": 128,
34
+ "pooler_type": "first_token_transform",
 
 
 
 
 
 
 
 
 
35
  "tie_word_embeddings": true,
36
  "transformers_version": "5.3.0",
37
+ "type_vocab_size": 2,
38
  "use_cache": false,
39
+ "vocab_size": 119547
40
  }
nemotron-pii-ready/TokenBased-CRF/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deb31bda4f103abbb7f87649c896bfd459bff3545202cb799a4d0ceaeec3965f
3
- size 596501775
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:746337f74f742feb89c64cbca117b2728aa8763b9ede955b5bed72d57f0ef114
3
+ size 711504083
nemotron-pii-ready/TokenBased-CRF/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
nemotron-pii-ready/TokenBased-CRF/tokenizer_config.json CHANGED
@@ -1,17 +1,15 @@
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
4
- "clean_up_tokenization_spaces": true,
5
  "cls_token": "[CLS]",
 
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
- "model_input_names": [
9
- "input_ids",
10
- "attention_mask"
11
- ],
12
- "model_max_length": 8192,
13
  "pad_token": "[PAD]",
14
  "sep_token": "[SEP]",
15
- "tokenizer_class": "TokenizersBackend",
 
 
16
  "unk_token": "[UNK]"
17
  }
 
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
 
4
  "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
+ "model_max_length": 512,
 
 
 
 
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
  "unk_token": "[UNK]"
15
  }