smoh committed on
Commit
0ad24a4
·
verified ·
1 Parent(s): bb193d1

Update to v1.3: best F1 (0.9071), early freeze + progressive tier weights

Browse files
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
config.json CHANGED
@@ -1,207 +1,205 @@
1
- {
2
- "architectures": [
3
- "PiiNerModel"
4
- ],
5
- "backbone": "microsoft/deberta-v3-xsmall",
6
- "char_cnn_filters": [
7
- 50,
8
- 50,
9
- 50
10
- ],
11
- "char_cnn_widths": [
12
- 3,
13
- 4,
14
- 5
15
- ],
16
- "char_embed_dim": 50,
17
- "char_vocab_size": 256,
18
- "dropout": 0.1,
19
- "dtype": "float16",
20
- "id2label": {
21
- "0": "O",
22
- "1": "B-SSN",
23
- "2": "I-SSN",
24
- "3": "B-CREDIT_CARD",
25
- "4": "I-CREDIT_CARD",
26
- "5": "B-BANK_ACCOUNT",
27
- "6": "I-BANK_ACCOUNT",
28
- "7": "B-PASSPORT_NUMBER",
29
- "8": "I-PASSPORT_NUMBER",
30
- "9": "B-DRIVERS_LICENSE",
31
- "10": "I-DRIVERS_LICENSE",
32
- "11": "B-TAX_ID",
33
- "12": "I-TAX_ID",
34
- "13": "B-PERSON",
35
- "14": "I-PERSON",
36
- "15": "B-EMAIL",
37
- "16": "I-EMAIL",
38
- "17": "B-PHONE",
39
- "18": "I-PHONE",
40
- "19": "B-DATE_OF_BIRTH",
41
- "20": "I-DATE_OF_BIRTH",
42
- "21": "B-STREET_ADDRESS",
43
- "22": "I-STREET_ADDRESS",
44
- "23": "B-IP_ADDRESS",
45
- "24": "I-IP_ADDRESS",
46
- "25": "B-USERNAME",
47
- "26": "I-USERNAME",
48
- "27": "B-DATE",
49
- "28": "I-DATE",
50
- "29": "B-LOCATION",
51
- "30": "I-LOCATION",
52
- "31": "B-ORGANIZATION",
53
- "32": "I-ORGANIZATION",
54
- "33": "B-URL",
55
- "34": "I-URL",
56
- "35": "B-LICENSE_PLATE",
57
- "36": "I-LICENSE_PLATE",
58
- "37": "B-AGE",
59
- "38": "I-AGE",
60
- "39": "B-NATIONALITY",
61
- "40": "I-NATIONALITY",
62
- "41": "B-GENDER",
63
- "42": "I-GENDER",
64
- "43": "B-ETHNICITY",
65
- "44": "I-ETHNICITY",
66
- "45": "B-RELIGION",
67
- "46": "I-RELIGION",
68
- "47": "B-MARITAL_STATUS",
69
- "48": "I-MARITAL_STATUS",
70
- "49": "B-MEDICAL_RECORD",
71
- "50": "I-MEDICAL_RECORD",
72
- "51": "B-EMPLOYEE_ID",
73
- "52": "I-EMPLOYEE_ID",
74
- "53": "B-STUDENT_ID",
75
- "54": "I-STUDENT_ID",
76
- "55": "B-ACCOUNT_NUMBER",
77
- "56": "I-ACCOUNT_NUMBER",
78
- "57": "B-PIN",
79
- "58": "I-PIN",
80
- "59": "B-PASSWORD",
81
- "60": "I-PASSWORD",
82
- "61": "B-BIOMETRIC",
83
- "62": "I-BIOMETRIC",
84
- "63": "B-VEHICLE_ID",
85
- "64": "I-VEHICLE_ID",
86
- "65": "B-DEVICE_ID",
87
- "66": "I-DEVICE_ID",
88
- "67": "B-CRYPTO_WALLET",
89
- "68": "I-CRYPTO_WALLET",
90
- "69": "B-IBAN",
91
- "70": "I-IBAN",
92
- "71": "B-SWIFT_CODE",
93
- "72": "I-SWIFT_CODE",
94
- "73": "B-INSURANCE_NUMBER",
95
- "74": "I-INSURANCE_NUMBER",
96
- "75": "B-SALARY",
97
- "76": "I-SALARY",
98
- "77": "B-CRIMINAL_RECORD",
99
- "78": "I-CRIMINAL_RECORD",
100
- "79": "B-POLITICAL_AFFILIATION",
101
- "80": "I-POLITICAL_AFFILIATION",
102
- "81": "B-SEXUAL_ORIENTATION",
103
- "82": "I-SEXUAL_ORIENTATION",
104
- "83": "B-HEALTH_CONDITION",
105
- "84": "I-HEALTH_CONDITION",
106
- "85": "B-GENETIC_DATA",
107
- "86": "I-GENETIC_DATA",
108
- "87": "B-TRADE_UNION",
109
- "88": "I-TRADE_UNION"
110
- },
111
- "label2id": {
112
- "O": 0,
113
- "B-SSN": 1,
114
- "I-SSN": 2,
115
- "B-CREDIT_CARD": 3,
116
- "I-CREDIT_CARD": 4,
117
- "B-BANK_ACCOUNT": 5,
118
- "I-BANK_ACCOUNT": 6,
119
- "B-PASSPORT_NUMBER": 7,
120
- "I-PASSPORT_NUMBER": 8,
121
- "B-DRIVERS_LICENSE": 9,
122
- "I-DRIVERS_LICENSE": 10,
123
- "B-TAX_ID": 11,
124
- "I-TAX_ID": 12,
125
- "B-PERSON": 13,
126
- "I-PERSON": 14,
127
- "B-EMAIL": 15,
128
- "I-EMAIL": 16,
129
- "B-PHONE": 17,
130
- "I-PHONE": 18,
131
- "B-DATE_OF_BIRTH": 19,
132
- "I-DATE_OF_BIRTH": 20,
133
- "B-STREET_ADDRESS": 21,
134
- "I-STREET_ADDRESS": 22,
135
- "B-IP_ADDRESS": 23,
136
- "I-IP_ADDRESS": 24,
137
- "B-USERNAME": 25,
138
- "I-USERNAME": 26,
139
- "B-DATE": 27,
140
- "I-DATE": 28,
141
- "B-LOCATION": 29,
142
- "I-LOCATION": 30,
143
- "B-ORGANIZATION": 31,
144
- "I-ORGANIZATION": 32,
145
- "B-URL": 33,
146
- "I-URL": 34,
147
- "B-LICENSE_PLATE": 35,
148
- "I-LICENSE_PLATE": 36,
149
- "B-AGE": 37,
150
- "I-AGE": 38,
151
- "B-NATIONALITY": 39,
152
- "I-NATIONALITY": 40,
153
- "B-GENDER": 41,
154
- "I-GENDER": 42,
155
- "B-ETHNICITY": 43,
156
- "I-ETHNICITY": 44,
157
- "B-RELIGION": 45,
158
- "I-RELIGION": 46,
159
- "B-MARITAL_STATUS": 47,
160
- "I-MARITAL_STATUS": 48,
161
- "B-MEDICAL_RECORD": 49,
162
- "I-MEDICAL_RECORD": 50,
163
- "B-EMPLOYEE_ID": 51,
164
- "I-EMPLOYEE_ID": 52,
165
- "B-STUDENT_ID": 53,
166
- "I-STUDENT_ID": 54,
167
- "B-ACCOUNT_NUMBER": 55,
168
- "I-ACCOUNT_NUMBER": 56,
169
- "B-PIN": 57,
170
- "I-PIN": 58,
171
- "B-PASSWORD": 59,
172
- "I-PASSWORD": 60,
173
- "B-BIOMETRIC": 61,
174
- "I-BIOMETRIC": 62,
175
- "B-VEHICLE_ID": 63,
176
- "I-VEHICLE_ID": 64,
177
- "B-DEVICE_ID": 65,
178
- "I-DEVICE_ID": 66,
179
- "B-CRYPTO_WALLET": 67,
180
- "I-CRYPTO_WALLET": 68,
181
- "B-IBAN": 69,
182
- "I-IBAN": 70,
183
- "B-SWIFT_CODE": 71,
184
- "I-SWIFT_CODE": 72,
185
- "B-INSURANCE_NUMBER": 73,
186
- "I-INSURANCE_NUMBER": 74,
187
- "B-SALARY": 75,
188
- "I-SALARY": 76,
189
- "B-CRIMINAL_RECORD": 77,
190
- "I-CRIMINAL_RECORD": 78,
191
- "B-POLITICAL_AFFILIATION": 79,
192
- "I-POLITICAL_AFFILIATION": 80,
193
- "B-SEXUAL_ORIENTATION": 81,
194
- "I-SEXUAL_ORIENTATION": 82,
195
- "B-HEALTH_CONDITION": 83,
196
- "I-HEALTH_CONDITION": 84,
197
- "B-GENETIC_DATA": 85,
198
- "I-GENETIC_DATA": 86,
199
- "B-TRADE_UNION": 87,
200
- "I-TRADE_UNION": 88
201
- },
202
- "max_char_len": 20,
203
- "model_type": "pii_ner",
204
- "transformers_version": "5.0.0",
205
- "use_cache": false,
206
- "num_labels": 89
207
- }
 
1
+ {
2
+ "architectures": [
3
+ "PiiNerModel"
4
+ ],
5
+ "backbone": "microsoft/deberta-v3-xsmall",
6
+ "char_cnn_filters": [
7
+ 50,
8
+ 50,
9
+ 50
10
+ ],
11
+ "char_cnn_widths": [
12
+ 3,
13
+ 4,
14
+ 5
15
+ ],
16
+ "char_embed_dim": 50,
17
+ "char_vocab_size": 256,
18
+ "dropout": 0.1,
19
+ "id2label": {
20
+ "0": "LABEL_0",
21
+ "1": "LABEL_1",
22
+ "2": "LABEL_2",
23
+ "3": "LABEL_3",
24
+ "4": "LABEL_4",
25
+ "5": "LABEL_5",
26
+ "6": "LABEL_6",
27
+ "7": "LABEL_7",
28
+ "8": "LABEL_8",
29
+ "9": "LABEL_9",
30
+ "10": "LABEL_10",
31
+ "11": "LABEL_11",
32
+ "12": "LABEL_12",
33
+ "13": "LABEL_13",
34
+ "14": "LABEL_14",
35
+ "15": "LABEL_15",
36
+ "16": "LABEL_16",
37
+ "17": "LABEL_17",
38
+ "18": "LABEL_18",
39
+ "19": "LABEL_19",
40
+ "20": "LABEL_20",
41
+ "21": "LABEL_21",
42
+ "22": "LABEL_22",
43
+ "23": "LABEL_23",
44
+ "24": "LABEL_24",
45
+ "25": "LABEL_25",
46
+ "26": "LABEL_26",
47
+ "27": "LABEL_27",
48
+ "28": "LABEL_28",
49
+ "29": "LABEL_29",
50
+ "30": "LABEL_30",
51
+ "31": "LABEL_31",
52
+ "32": "LABEL_32",
53
+ "33": "LABEL_33",
54
+ "34": "LABEL_34",
55
+ "35": "LABEL_35",
56
+ "36": "LABEL_36",
57
+ "37": "LABEL_37",
58
+ "38": "LABEL_38",
59
+ "39": "LABEL_39",
60
+ "40": "LABEL_40",
61
+ "41": "LABEL_41",
62
+ "42": "LABEL_42",
63
+ "43": "LABEL_43",
64
+ "44": "LABEL_44",
65
+ "45": "LABEL_45",
66
+ "46": "LABEL_46",
67
+ "47": "LABEL_47",
68
+ "48": "LABEL_48",
69
+ "49": "LABEL_49",
70
+ "50": "LABEL_50",
71
+ "51": "LABEL_51",
72
+ "52": "LABEL_52",
73
+ "53": "LABEL_53",
74
+ "54": "LABEL_54",
75
+ "55": "LABEL_55",
76
+ "56": "LABEL_56",
77
+ "57": "LABEL_57",
78
+ "58": "LABEL_58",
79
+ "59": "LABEL_59",
80
+ "60": "LABEL_60",
81
+ "61": "LABEL_61",
82
+ "62": "LABEL_62",
83
+ "63": "LABEL_63",
84
+ "64": "LABEL_64",
85
+ "65": "LABEL_65",
86
+ "66": "LABEL_66",
87
+ "67": "LABEL_67",
88
+ "68": "LABEL_68",
89
+ "69": "LABEL_69",
90
+ "70": "LABEL_70",
91
+ "71": "LABEL_71",
92
+ "72": "LABEL_72",
93
+ "73": "LABEL_73",
94
+ "74": "LABEL_74",
95
+ "75": "LABEL_75",
96
+ "76": "LABEL_76",
97
+ "77": "LABEL_77",
98
+ "78": "LABEL_78",
99
+ "79": "LABEL_79",
100
+ "80": "LABEL_80",
101
+ "81": "LABEL_81",
102
+ "82": "LABEL_82",
103
+ "83": "LABEL_83",
104
+ "84": "LABEL_84",
105
+ "85": "LABEL_85",
106
+ "86": "LABEL_86",
107
+ "87": "LABEL_87",
108
+ "88": "LABEL_88"
109
+ },
110
+ "label2id": {
111
+ "LABEL_0": 0,
112
+ "LABEL_1": 1,
113
+ "LABEL_10": 10,
114
+ "LABEL_11": 11,
115
+ "LABEL_12": 12,
116
+ "LABEL_13": 13,
117
+ "LABEL_14": 14,
118
+ "LABEL_15": 15,
119
+ "LABEL_16": 16,
120
+ "LABEL_17": 17,
121
+ "LABEL_18": 18,
122
+ "LABEL_19": 19,
123
+ "LABEL_2": 2,
124
+ "LABEL_20": 20,
125
+ "LABEL_21": 21,
126
+ "LABEL_22": 22,
127
+ "LABEL_23": 23,
128
+ "LABEL_24": 24,
129
+ "LABEL_25": 25,
130
+ "LABEL_26": 26,
131
+ "LABEL_27": 27,
132
+ "LABEL_28": 28,
133
+ "LABEL_29": 29,
134
+ "LABEL_3": 3,
135
+ "LABEL_30": 30,
136
+ "LABEL_31": 31,
137
+ "LABEL_32": 32,
138
+ "LABEL_33": 33,
139
+ "LABEL_34": 34,
140
+ "LABEL_35": 35,
141
+ "LABEL_36": 36,
142
+ "LABEL_37": 37,
143
+ "LABEL_38": 38,
144
+ "LABEL_39": 39,
145
+ "LABEL_4": 4,
146
+ "LABEL_40": 40,
147
+ "LABEL_41": 41,
148
+ "LABEL_42": 42,
149
+ "LABEL_43": 43,
150
+ "LABEL_44": 44,
151
+ "LABEL_45": 45,
152
+ "LABEL_46": 46,
153
+ "LABEL_47": 47,
154
+ "LABEL_48": 48,
155
+ "LABEL_49": 49,
156
+ "LABEL_5": 5,
157
+ "LABEL_50": 50,
158
+ "LABEL_51": 51,
159
+ "LABEL_52": 52,
160
+ "LABEL_53": 53,
161
+ "LABEL_54": 54,
162
+ "LABEL_55": 55,
163
+ "LABEL_56": 56,
164
+ "LABEL_57": 57,
165
+ "LABEL_58": 58,
166
+ "LABEL_59": 59,
167
+ "LABEL_6": 6,
168
+ "LABEL_60": 60,
169
+ "LABEL_61": 61,
170
+ "LABEL_62": 62,
171
+ "LABEL_63": 63,
172
+ "LABEL_64": 64,
173
+ "LABEL_65": 65,
174
+ "LABEL_66": 66,
175
+ "LABEL_67": 67,
176
+ "LABEL_68": 68,
177
+ "LABEL_69": 69,
178
+ "LABEL_7": 7,
179
+ "LABEL_70": 70,
180
+ "LABEL_71": 71,
181
+ "LABEL_72": 72,
182
+ "LABEL_73": 73,
183
+ "LABEL_74": 74,
184
+ "LABEL_75": 75,
185
+ "LABEL_76": 76,
186
+ "LABEL_77": 77,
187
+ "LABEL_78": 78,
188
+ "LABEL_79": 79,
189
+ "LABEL_8": 8,
190
+ "LABEL_80": 80,
191
+ "LABEL_81": 81,
192
+ "LABEL_82": 82,
193
+ "LABEL_83": 83,
194
+ "LABEL_84": 84,
195
+ "LABEL_85": 85,
196
+ "LABEL_86": 86,
197
+ "LABEL_87": 87,
198
+ "LABEL_88": 88,
199
+ "LABEL_9": 9
200
+ },
201
+ "max_char_len": 20,
202
+ "model_type": "pii_ner",
203
+ "torch_dtype": "float32",
204
+ "transformers_version": "4.49.0"
205
+ }
 
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:888bf4c2b3f70475540abbfa30672ae2826cf4f1c3b09b02ec45c52fb56cd066
3
- size 143144240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de2fdce39e2098e4936063aa68caa580739a451bfdb7988298d5870df64db5f9
3
+ size 284508924
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
test_results.json CHANGED
@@ -1,70 +1,70 @@
1
  {
2
- "eval_loss": 2.6675541400909424,
3
- "eval_overall_f1": 0.9043580541465338,
4
- "eval_overall_precision": 0.9072015660825143,
5
- "eval_overall_recall": 0.9015323117921386,
6
- "eval_type_account_number_f1": 0.907736863880844,
7
- "eval_type_account_number_recall": 0.9172240802675585,
8
- "eval_type_age_f1": 0.850771869639794,
9
- "eval_type_age_recall": 0.8611111111111112,
10
- "eval_type_bank_account_f1": 0.791044776119403,
11
- "eval_type_bank_account_recall": 0.7464788732394366,
12
- "eval_type_biometric_f1": 0.9962141698215252,
13
- "eval_type_biometric_recall": 0.9956756756756757,
14
- "eval_type_credit_card_f1": 0.8619495299356753,
15
- "eval_type_credit_card_recall": 0.8391136801541426,
16
- "eval_type_date_f1": 0.8748460591133004,
17
- "eval_type_date_recall": 0.8694920440636474,
18
- "eval_type_date_of_birth_f1": 0.9784499054820416,
19
- "eval_type_date_of_birth_recall": 0.9803030303030303,
20
- "eval_type_drivers_license_f1": 0.8849557522123893,
21
- "eval_type_drivers_license_recall": 0.8807588075880759,
22
- "eval_type_email_f1": 0.9905722429291821,
23
- "eval_type_email_recall": 0.9873251748251748,
24
- "eval_type_employee_id_f1": 0.9622997172478794,
25
- "eval_type_employee_id_recall": 0.9586854460093897,
26
- "eval_type_gender_f1": 0.9522240527182866,
27
- "eval_type_gender_recall": 0.9490968801313628,
28
- "eval_type_iban_f1": 0.9304556354916067,
29
- "eval_type_iban_recall": 0.8981481481481481,
30
- "eval_type_ip_address_f1": 0.987908643081057,
31
- "eval_type_ip_address_recall": 0.9919064748201439,
32
- "eval_type_license_plate_f1": 0.9595290654893304,
33
- "eval_type_license_plate_recall": 0.9518248175182482,
34
- "eval_type_location_f1": 0.9216815623965575,
35
- "eval_type_location_recall": 0.9077986437141367,
36
- "eval_type_organization_f1": 0.8982110448535131,
37
- "eval_type_organization_recall": 0.9025635681533972,
38
- "eval_type_passport_number_f1": 0.46875,
39
- "eval_type_passport_number_recall": 0.38461538461538464,
40
- "eval_type_password_f1": 0.8778082191780823,
41
- "eval_type_password_recall": 0.8850828729281768,
42
- "eval_type_person_f1": 0.8611754487550666,
43
- "eval_type_person_recall": 0.867643841610151,
44
- "eval_type_phone_f1": 0.9628220140515222,
45
- "eval_type_phone_recall": 0.9608530528775927,
46
- "eval_type_pin_f1": 0.43209876543209874,
47
- "eval_type_pin_recall": 0.3017241379310345,
48
- "eval_type_ssn_f1": 0.8910891089108911,
49
- "eval_type_ssn_recall": 0.8583106267029973,
50
- "eval_type_street_address_f1": 0.833598628627403,
51
- "eval_type_street_address_recall": 0.8165027584552651,
52
- "eval_type_swift_code_f1": 0.9259259259259259,
53
- "eval_type_swift_code_recall": 0.9803921568627451,
54
- "eval_type_tax_id_f1": 0.6650602409638554,
55
- "eval_type_tax_id_recall": 0.6244343891402715,
56
- "eval_type_url_f1": 0.9938633938100321,
57
- "eval_type_url_recall": 0.9946595460614153,
58
- "eval_type_username_f1": 0.9244669316949765,
59
- "eval_type_username_recall": 0.9122681883024251,
60
- "eval_type_vehicle_id_f1": 0.9642458100558661,
61
- "eval_type_vehicle_id_recall": 0.9885452462772051,
62
- "eval_tier_1_recall": 0.7222852935733847,
63
- "eval_tier_2_recall": 0.934089055481893,
64
- "eval_tier_3_recall": 0.918601849881968,
65
- "eval_tier_4_recall": 0.8656847205124917,
66
- "eval_runtime": 165.9273,
67
- "eval_samples_per_second": 102.123,
68
- "eval_steps_per_second": 3.194,
69
- "epoch": 10.0
70
  }
 
1
  {
2
+ "eval_loss": 2.7708330154418945,
3
+ "eval_overall_f1": 0.9070834490327528,
4
+ "eval_overall_precision": 0.8981291158796035,
5
+ "eval_overall_recall": 0.916218129365726,
6
+ "eval_type_account_number_f1": 0.9233963047539963,
7
+ "eval_type_account_number_recall": 0.9309334449560486,
8
+ "eval_type_age_f1": 0.8714524207011687,
9
+ "eval_type_age_recall": 0.9094076655052264,
10
+ "eval_type_bank_account_f1": 0.6451612903225806,
11
+ "eval_type_bank_account_recall": 0.8450704225352113,
12
+ "eval_type_biometric_f1": 0.9924487594390506,
13
+ "eval_type_biometric_recall": 0.9956709956709957,
14
+ "eval_type_credit_card_f1": 0.8369384359400999,
15
+ "eval_type_credit_card_recall": 0.9738625363020329,
16
+ "eval_type_date_f1": 0.8765927600046761,
17
+ "eval_type_date_recall": 0.8635700575815739,
18
+ "eval_type_date_of_birth_f1": 0.9811888638073739,
19
+ "eval_type_date_of_birth_recall": 0.9886277482941622,
20
+ "eval_type_drivers_license_f1": 0.8805031446540881,
21
+ "eval_type_drivers_license_recall": 0.9497964721845319,
22
+ "eval_type_email_f1": 0.9680042803638308,
23
+ "eval_type_email_recall": 0.9905825667980727,
24
+ "eval_type_employee_id_f1": 0.9399548532731377,
25
+ "eval_type_employee_id_recall": 0.9774647887323944,
26
+ "eval_type_gender_f1": 0.9461663947797716,
27
+ "eval_type_gender_recall": 0.9523809523809523,
28
+ "eval_type_iban_f1": 0.9346246973365617,
29
+ "eval_type_iban_recall": 0.8976744186046511,
30
+ "eval_type_ip_address_f1": 0.9883408071748879,
31
+ "eval_type_ip_address_recall": 0.9936880072137061,
32
+ "eval_type_license_plate_f1": 0.9519650655021834,
33
+ "eval_type_license_plate_recall": 0.956140350877193,
34
+ "eval_type_location_f1": 0.9284850028656945,
35
+ "eval_type_location_recall": 0.9524431669715182,
36
+ "eval_type_organization_f1": 0.9024039449352784,
37
+ "eval_type_organization_recall": 0.9171974522292994,
38
+ "eval_type_passport_number_f1": 0.4155844155844156,
39
+ "eval_type_passport_number_recall": 0.42105263157894735,
40
+ "eval_type_password_f1": 0.88,
41
+ "eval_type_password_recall": 0.8039867109634552,
42
+ "eval_type_person_f1": 0.8745558697701399,
43
+ "eval_type_person_recall": 0.8816520467836257,
44
+ "eval_type_phone_f1": 0.9660620245757754,
45
+ "eval_type_phone_recall": 0.9671939074399531,
46
+ "eval_type_pin_f1": 0.5393258426966292,
47
+ "eval_type_pin_recall": 0.41739130434782606,
48
+ "eval_type_ssn_f1": 0.9296320206584893,
49
+ "eval_type_ssn_recall": 0.9809264305177112,
50
+ "eval_type_street_address_f1": 0.8317064565447488,
51
+ "eval_type_street_address_recall": 0.8493874609656498,
52
+ "eval_type_swift_code_f1": 0.8727272727272728,
53
+ "eval_type_swift_code_recall": 0.9411764705882353,
54
+ "eval_type_tax_id_f1": 0.6733067729083665,
55
+ "eval_type_tax_id_recall": 0.7647058823529411,
56
+ "eval_type_url_f1": 0.9938436830835118,
57
+ "eval_type_url_recall": 0.99464237878382,
58
+ "eval_type_username_f1": 0.9303187546330616,
59
+ "eval_type_username_recall": 0.8977110157367668,
60
+ "eval_type_vehicle_id_f1": 0.975470621791215,
61
+ "eval_type_vehicle_id_recall": 0.9805045871559633,
62
+ "eval_tier_1_recall": 0.8225690625785625,
63
+ "eval_tier_2_recall": 0.9451886229158616,
64
+ "eval_tier_3_recall": 0.9304366300082938,
65
+ "eval_tier_4_recall": 0.8681003401274463,
66
+ "eval_runtime": 288.1903,
67
+ "eval_samples_per_second": 58.798,
68
+ "eval_steps_per_second": 7.353,
69
+ "epoch": 9.998447331862385
70
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "sp_model_kwargs": {},
55
+ "split_by_punct": false,
56
+ "tokenizer_class": "DebertaV2Tokenizer",
57
+ "unk_token": "[UNK]",
58
+ "vocab_type": "spm"
59
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bed6f2f807b6b79aa6aa3df98784b3a76b787829f82f533f3a1e66be070a519
3
- size 5265
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c28aa2c2854b95f527374e01ae0672e12a39b71596ad41164d950caa1578e7aa
3
+ size 5841
training_config.json CHANGED
@@ -1,21 +1,64 @@
1
  {
2
- "backbone": "microsoft/deberta-v3-xsmall",
3
- "max_seq_len": 256,
4
- "max_char_len": 20,
5
- "dropout": 0.1,
6
- "epochs": 10,
7
- "batch_size": 32,
8
- "gradient_accumulation_steps": 1,
9
- "lr_backbone": 2e-05,
10
- "lr_head": 0.001,
11
- "warmup_ratio": 0.1,
12
- "weight_decay": 0.01,
13
- "fp16": false,
14
- "bf16": true,
15
- "val_ratio": 0.1,
16
- "test_ratio": 0.1,
17
- "seed": 42,
18
- "output_dir": "/content/pii_ner_v1_output",
19
- "run_name": "pii-ner-v1-full",
20
- "_try_bf16": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  }
 
1
  {
2
+ "model": {
3
+ "backbone": "microsoft/deberta-v3-xsmall",
4
+ "char_embed_dim": 50,
5
+ "char_vocab_size": 256,
6
+ "char_cnn_filters": [
7
+ 50,
8
+ 50,
9
+ 50
10
+ ],
11
+ "char_cnn_widths": [
12
+ 3,
13
+ 4,
14
+ 5
15
+ ],
16
+ "max_char_len": 20,
17
+ "dropout": 0.1
18
+ },
19
+ "data": {
20
+ "max_seq_len": 256,
21
+ "val_ratio": 0.1,
22
+ "test_ratio": 0.1,
23
+ "seed": 42,
24
+ "oversample_tiers": [
25
+ 1
26
+ ],
27
+ "oversample_factor": 3
28
+ },
29
+ "training": {
30
+ "epochs": 10,
31
+ "batch_size": 8,
32
+ "gradient_accumulation_steps": 4,
33
+ "lr_backbone": 1e-05,
34
+ "lr_head": 0.001,
35
+ "lr_scheduler_type": "cosine",
36
+ "warmup_steps": 500,
37
+ "weight_decay": 0.01,
38
+ "eval_strategy": "epoch",
39
+ "save_strategy": "epoch",
40
+ "metric_for_best_model": "overall_f1",
41
+ "logging_steps": 50,
42
+ "dataloader_num_workers": 4,
43
+ "save_total_limit": 3,
44
+ "output_dir": "/home/ubuntu/datafog-labs/pii-ner-v1/runs/2026-02-06_v1.3",
45
+ "run_name": "pii-ner-v1.3-h100-2026-02-06",
46
+ "freeze_backbone_after_epoch": 3,
47
+ "tier_weights": {
48
+ "1": 3.0,
49
+ "2": 2.0,
50
+ "3": 1.5,
51
+ "4": 1.0
52
+ },
53
+ "tier_weights_after_epoch_2": {
54
+ "1": 2.0,
55
+ "2": 1.5,
56
+ "3": 1.25,
57
+ "4": 1.0
58
+ }
59
+ },
60
+ "wandb": {
61
+ "enabled": true,
62
+ "project": "datafog-pii-ner"
63
+ }
64
  }