keegan111 commited on
Commit
7eaf797
·
verified ·
1 Parent(s): a11c459

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. cross_cell_type_generization/L6/B cells/.DS_Store +0 -0
  2. cross_cell_type_generization/L6/B cells/all_results.json +8 -0
  3. cross_cell_type_generization/L6/B cells/config.json +25 -0
  4. cross_cell_type_generization/L6/B cells/eval_results.json +8 -0
  5. cross_cell_type_generization/L6/B cells/trainer_state.json +450 -0
  6. cross_cell_type_generization/L6/Fibroblasts/trainer_state.json +450 -0
  7. data_curation&frozen_layers/dataset1_all/F0/optimizer.pt +3 -0
  8. data_curation&frozen_layers/dataset3_2048/F2/rng_state.pth +3 -0
  9. data_curation&frozen_layers/dataset3_2048/F2/scheduler.pt +3 -0
  10. data_curation&frozen_layers/dataset3_2048/F4/predictions.pickle +3 -0
  11. data_curation&frozen_layers/dataset3_2048/F4/training_args.bin +3 -0
  12. scaling_performance/.DS_Store +0 -0
  13. scaling_performance/1000/.DS_Store +0 -0
  14. scaling_performance/1000/L2/.DS_Store +0 -0
  15. scaling_performance/1000/L2/all_results.json +8 -0
  16. scaling_performance/1000/L2/config.json +25 -0
  17. scaling_performance/1000/L2/eval_results.json +8 -0
  18. scaling_performance/1000/L2/trainer_state.json +712 -0
  19. scaling_performance/1000/L4/.DS_Store +0 -0
  20. scaling_performance/1000/L4/all_results.json +8 -0
  21. scaling_performance/1000/L4/config.json +25 -0
  22. scaling_performance/1000/L4/eval_results.json +8 -0
  23. scaling_performance/1000/L4/trainer_state.json +480 -0
  24. scaling_performance/1000/fine-tuned/.DS_Store +0 -0
  25. scaling_performance/1000/fine-tuned/all_results.json +8 -0
  26. scaling_performance/1000/fine-tuned/config.json +27 -0
  27. scaling_performance/1000/fine-tuned/eval_results.json +8 -0
  28. scaling_performance/1000/fine-tuned/trainer_state.json +1756 -0
  29. scaling_performance/2000/.DS_Store +0 -0
  30. scaling_performance/2000/L1/.DS_Store +0 -0
  31. scaling_performance/2000/L1/all_results.json +8 -0
  32. scaling_performance/2000/L1/config.json +25 -0
  33. scaling_performance/2000/L1/eval_results.json +8 -0
  34. scaling_performance/2000/L1/trainer_state.json +886 -0
  35. scaling_performance/2000/L2/.DS_Store +0 -0
  36. scaling_performance/2000/L2/all_results.json +8 -0
  37. scaling_performance/2000/L2/config.json +25 -0
  38. scaling_performance/2000/L2/eval_results.json +8 -0
  39. scaling_performance/2000/L2/trainer_state.json +596 -0
  40. scaling_performance/2000/L4/.DS_Store +0 -0
  41. scaling_performance/2000/L4/all_results.json +8 -0
  42. scaling_performance/2000/L4/config.json +25 -0
  43. scaling_performance/2000/L4/eval_results.json +8 -0
  44. scaling_performance/2000/L4/trainer_state.json +596 -0
  45. scaling_performance/2000/L6/.DS_Store +0 -0
  46. scaling_performance/2000/L6/all_results.json +8 -0
  47. scaling_performance/2000/L6/config.json +25 -0
  48. scaling_performance/2000/L6/eval_results.json +8 -0
  49. scaling_performance/2000/L6/trainer_state.json +596 -0
  50. scaling_performance/2000/fine-tuned/.DS_Store +0 -0
cross_cell_type_generization/L6/B cells/.DS_Store CHANGED
Binary files a/cross_cell_type_generization/L6/B cells/.DS_Store and b/cross_cell_type_generization/L6/B cells/.DS_Store differ
 
cross_cell_type_generization/L6/B cells/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.9055555555555556,
3
+ "test_loss": 0.26022690534591675,
4
+ "test_macro_f1": 0.881574364332985,
5
+ "test_runtime": 3.0564,
6
+ "test_samples_per_second": 294.462,
7
+ "test_steps_per_second": 2.945
8
+ }
cross_cell_type_generization/L6/B cells/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 6,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "problem_type": "single_label_classification",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.28.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
cross_cell_type_generization/L6/B cells/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.9055555555555556,
3
+ "test_loss": 0.26022690534591675,
4
+ "test_macro_f1": 0.881574364332985,
5
+ "test_runtime": 3.0564,
6
+ "test_samples_per_second": 294.462,
7
+ "test_steps_per_second": 2.945
8
+ }
cross_cell_type_generization/L6/B cells/trainer_state.json ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.40387290716171265,
3
+ "best_model_checkpoint": "/vsphhome/fengguoqing/Geneformer/models/data_diversity/L6/B cells/fold4/checkpoint-1122",
4
+ "epoch": 8.0,
5
+ "global_step": 1496,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.13,
12
+ "learning_rate": 2.5e-06,
13
+ "loss": 0.6926,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 0.27,
18
+ "learning_rate": 5e-06,
19
+ "loss": 0.6933,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 0.4,
24
+ "learning_rate": 7.5e-06,
25
+ "loss": 0.6935,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 0.53,
30
+ "learning_rate": 1e-05,
31
+ "loss": 0.6936,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.67,
36
+ "learning_rate": 1.25e-05,
37
+ "loss": 0.6928,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 0.8,
42
+ "learning_rate": 1.5e-05,
43
+ "loss": 0.692,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 0.94,
48
+ "learning_rate": 1.75e-05,
49
+ "loss": 0.6907,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "eval_accuracy": 0.5210482872472142,
55
+ "eval_loss": 0.6905837059020996,
56
+ "eval_macro_f1": 0.41147454390525884,
57
+ "eval_runtime": 17.4321,
58
+ "eval_samples_per_second": 277.992,
59
+ "eval_steps_per_second": 2.696,
60
+ "step": 187
61
+ },
62
+ {
63
+ "epoch": 1.07,
64
+ "learning_rate": 2e-05,
65
+ "loss": 0.6904,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 1.2,
70
+ "learning_rate": 2.25e-05,
71
+ "loss": 0.6898,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 1.34,
76
+ "learning_rate": 2.5e-05,
77
+ "loss": 0.6865,
78
+ "step": 250
79
+ },
80
+ {
81
+ "epoch": 1.47,
82
+ "learning_rate": 2.7500000000000004e-05,
83
+ "loss": 0.6869,
84
+ "step": 275
85
+ },
86
+ {
87
+ "epoch": 1.6,
88
+ "learning_rate": 3e-05,
89
+ "loss": 0.6814,
90
+ "step": 300
91
+ },
92
+ {
93
+ "epoch": 1.74,
94
+ "learning_rate": 3.2500000000000004e-05,
95
+ "loss": 0.6762,
96
+ "step": 325
97
+ },
98
+ {
99
+ "epoch": 1.87,
100
+ "learning_rate": 3.5e-05,
101
+ "loss": 0.6747,
102
+ "step": 350
103
+ },
104
+ {
105
+ "epoch": 2.0,
106
+ "eval_accuracy": 0.5984316962443252,
107
+ "eval_loss": 0.6632421016693115,
108
+ "eval_macro_f1": 0.5897740135113385,
109
+ "eval_runtime": 16.5825,
110
+ "eval_samples_per_second": 292.235,
111
+ "eval_steps_per_second": 2.834,
112
+ "step": 374
113
+ },
114
+ {
115
+ "epoch": 2.01,
116
+ "learning_rate": 3.7500000000000003e-05,
117
+ "loss": 0.6664,
118
+ "step": 375
119
+ },
120
+ {
121
+ "epoch": 2.14,
122
+ "learning_rate": 4e-05,
123
+ "loss": 0.6356,
124
+ "step": 400
125
+ },
126
+ {
127
+ "epoch": 2.27,
128
+ "learning_rate": 4.25e-05,
129
+ "loss": 0.6121,
130
+ "step": 425
131
+ },
132
+ {
133
+ "epoch": 2.41,
134
+ "learning_rate": 4.5e-05,
135
+ "loss": 0.5833,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 2.54,
140
+ "learning_rate": 4.75e-05,
141
+ "loss": 0.5619,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 2.67,
146
+ "learning_rate": 5e-05,
147
+ "loss": 0.5355,
148
+ "step": 500
149
+ },
150
+ {
151
+ "epoch": 2.81,
152
+ "learning_rate": 4.9858757062146896e-05,
153
+ "loss": 0.4877,
154
+ "step": 525
155
+ },
156
+ {
157
+ "epoch": 2.94,
158
+ "learning_rate": 4.971751412429379e-05,
159
+ "loss": 0.4662,
160
+ "step": 550
161
+ },
162
+ {
163
+ "epoch": 3.0,
164
+ "eval_accuracy": 0.7631035905901775,
165
+ "eval_loss": 0.48640862107276917,
166
+ "eval_macro_f1": 0.7547183753731113,
167
+ "eval_runtime": 17.7844,
168
+ "eval_samples_per_second": 272.486,
169
+ "eval_steps_per_second": 2.643,
170
+ "step": 561
171
+ },
172
+ {
173
+ "epoch": 3.07,
174
+ "learning_rate": 4.957627118644068e-05,
175
+ "loss": 0.4818,
176
+ "step": 575
177
+ },
178
+ {
179
+ "epoch": 3.21,
180
+ "learning_rate": 4.9435028248587575e-05,
181
+ "loss": 0.458,
182
+ "step": 600
183
+ },
184
+ {
185
+ "epoch": 3.34,
186
+ "learning_rate": 4.929378531073446e-05,
187
+ "loss": 0.424,
188
+ "step": 625
189
+ },
190
+ {
191
+ "epoch": 3.48,
192
+ "learning_rate": 4.915254237288136e-05,
193
+ "loss": 0.4373,
194
+ "step": 650
195
+ },
196
+ {
197
+ "epoch": 3.61,
198
+ "learning_rate": 4.9011299435028255e-05,
199
+ "loss": 0.4121,
200
+ "step": 675
201
+ },
202
+ {
203
+ "epoch": 3.74,
204
+ "learning_rate": 4.887005649717514e-05,
205
+ "loss": 0.397,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 3.88,
210
+ "learning_rate": 4.8728813559322034e-05,
211
+ "loss": 0.3926,
212
+ "step": 725
213
+ },
214
+ {
215
+ "epoch": 4.0,
216
+ "eval_accuracy": 0.7961205117622782,
217
+ "eval_loss": 0.42608675360679626,
218
+ "eval_macro_f1": 0.7956681990046659,
219
+ "eval_runtime": 17.1151,
220
+ "eval_samples_per_second": 283.141,
221
+ "eval_steps_per_second": 2.746,
222
+ "step": 748
223
+ },
224
+ {
225
+ "epoch": 4.01,
226
+ "learning_rate": 4.8587570621468934e-05,
227
+ "loss": 0.3983,
228
+ "step": 750
229
+ },
230
+ {
231
+ "epoch": 4.14,
232
+ "learning_rate": 4.844632768361582e-05,
233
+ "loss": 0.3468,
234
+ "step": 775
235
+ },
236
+ {
237
+ "epoch": 4.28,
238
+ "learning_rate": 4.8305084745762714e-05,
239
+ "loss": 0.3599,
240
+ "step": 800
241
+ },
242
+ {
243
+ "epoch": 4.41,
244
+ "learning_rate": 4.816384180790961e-05,
245
+ "loss": 0.3619,
246
+ "step": 825
247
+ },
248
+ {
249
+ "epoch": 4.55,
250
+ "learning_rate": 4.80225988700565e-05,
251
+ "loss": 0.3428,
252
+ "step": 850
253
+ },
254
+ {
255
+ "epoch": 4.68,
256
+ "learning_rate": 4.788135593220339e-05,
257
+ "loss": 0.3622,
258
+ "step": 875
259
+ },
260
+ {
261
+ "epoch": 4.81,
262
+ "learning_rate": 4.7740112994350286e-05,
263
+ "loss": 0.3451,
264
+ "step": 900
265
+ },
266
+ {
267
+ "epoch": 4.95,
268
+ "learning_rate": 4.759887005649718e-05,
269
+ "loss": 0.3714,
270
+ "step": 925
271
+ },
272
+ {
273
+ "epoch": 5.0,
274
+ "eval_accuracy": 0.7845645893520429,
275
+ "eval_loss": 0.4528542459011078,
276
+ "eval_macro_f1": 0.7833690643653564,
277
+ "eval_runtime": 17.1144,
278
+ "eval_samples_per_second": 283.153,
279
+ "eval_steps_per_second": 2.746,
280
+ "step": 935
281
+ },
282
+ {
283
+ "epoch": 5.08,
284
+ "learning_rate": 4.745762711864407e-05,
285
+ "loss": 0.339,
286
+ "step": 950
287
+ },
288
+ {
289
+ "epoch": 5.21,
290
+ "learning_rate": 4.7316384180790966e-05,
291
+ "loss": 0.2876,
292
+ "step": 975
293
+ },
294
+ {
295
+ "epoch": 5.35,
296
+ "learning_rate": 4.717514124293785e-05,
297
+ "loss": 0.355,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 5.48,
302
+ "learning_rate": 4.703389830508475e-05,
303
+ "loss": 0.3099,
304
+ "step": 1025
305
+ },
306
+ {
307
+ "epoch": 5.61,
308
+ "learning_rate": 4.689265536723164e-05,
309
+ "loss": 0.322,
310
+ "step": 1050
311
+ },
312
+ {
313
+ "epoch": 5.75,
314
+ "learning_rate": 4.675141242937853e-05,
315
+ "loss": 0.3246,
316
+ "step": 1075
317
+ },
318
+ {
319
+ "epoch": 5.88,
320
+ "learning_rate": 4.6610169491525425e-05,
321
+ "loss": 0.3719,
322
+ "step": 1100
323
+ },
324
+ {
325
+ "epoch": 6.0,
326
+ "eval_accuracy": 0.8186132893107718,
327
+ "eval_loss": 0.40387290716171265,
328
+ "eval_macro_f1": 0.8179837351502454,
329
+ "eval_runtime": 17.1237,
330
+ "eval_samples_per_second": 282.999,
331
+ "eval_steps_per_second": 2.745,
332
+ "step": 1122
333
+ },
334
+ {
335
+ "epoch": 6.02,
336
+ "learning_rate": 4.646892655367232e-05,
337
+ "loss": 0.2755,
338
+ "step": 1125
339
+ },
340
+ {
341
+ "epoch": 6.15,
342
+ "learning_rate": 4.632768361581921e-05,
343
+ "loss": 0.293,
344
+ "step": 1150
345
+ },
346
+ {
347
+ "epoch": 6.28,
348
+ "learning_rate": 4.6186440677966104e-05,
349
+ "loss": 0.3708,
350
+ "step": 1175
351
+ },
352
+ {
353
+ "epoch": 6.42,
354
+ "learning_rate": 4.6045197740113e-05,
355
+ "loss": 0.2885,
356
+ "step": 1200
357
+ },
358
+ {
359
+ "epoch": 6.55,
360
+ "learning_rate": 4.590395480225989e-05,
361
+ "loss": 0.2562,
362
+ "step": 1225
363
+ },
364
+ {
365
+ "epoch": 6.68,
366
+ "learning_rate": 4.5762711864406784e-05,
367
+ "loss": 0.2814,
368
+ "step": 1250
369
+ },
370
+ {
371
+ "epoch": 6.82,
372
+ "learning_rate": 4.562146892655367e-05,
373
+ "loss": 0.3036,
374
+ "step": 1275
375
+ },
376
+ {
377
+ "epoch": 6.95,
378
+ "learning_rate": 4.548022598870056e-05,
379
+ "loss": 0.278,
380
+ "step": 1300
381
+ },
382
+ {
383
+ "epoch": 7.0,
384
+ "eval_accuracy": 0.7992158481221626,
385
+ "eval_loss": 0.44179922342300415,
386
+ "eval_macro_f1": 0.7983638814289351,
387
+ "eval_runtime": 16.9988,
388
+ "eval_samples_per_second": 285.079,
389
+ "eval_steps_per_second": 2.765,
390
+ "step": 1309
391
+ },
392
+ {
393
+ "epoch": 7.09,
394
+ "learning_rate": 4.533898305084746e-05,
395
+ "loss": 0.2443,
396
+ "step": 1325
397
+ },
398
+ {
399
+ "epoch": 7.22,
400
+ "learning_rate": 4.519774011299435e-05,
401
+ "loss": 0.2577,
402
+ "step": 1350
403
+ },
404
+ {
405
+ "epoch": 7.35,
406
+ "learning_rate": 4.505649717514124e-05,
407
+ "loss": 0.2615,
408
+ "step": 1375
409
+ },
410
+ {
411
+ "epoch": 7.49,
412
+ "learning_rate": 4.491525423728814e-05,
413
+ "loss": 0.2527,
414
+ "step": 1400
415
+ },
416
+ {
417
+ "epoch": 7.62,
418
+ "learning_rate": 4.477401129943503e-05,
419
+ "loss": 0.3119,
420
+ "step": 1425
421
+ },
422
+ {
423
+ "epoch": 7.75,
424
+ "learning_rate": 4.463276836158192e-05,
425
+ "loss": 0.2463,
426
+ "step": 1450
427
+ },
428
+ {
429
+ "epoch": 7.89,
430
+ "learning_rate": 4.4491525423728816e-05,
431
+ "loss": 0.2449,
432
+ "step": 1475
433
+ },
434
+ {
435
+ "epoch": 8.0,
436
+ "eval_accuracy": 0.8196450680973999,
437
+ "eval_loss": 0.4183129370212555,
438
+ "eval_macro_f1": 0.8195520921951007,
439
+ "eval_runtime": 16.9753,
440
+ "eval_samples_per_second": 285.473,
441
+ "eval_steps_per_second": 2.769,
442
+ "step": 1496
443
+ }
444
+ ],
445
+ "max_steps": 9350,
446
+ "num_train_epochs": 50,
447
+ "total_flos": 6154099081543680.0,
448
+ "trial_name": null,
449
+ "trial_params": null
450
+ }
cross_cell_type_generization/L6/Fibroblasts/trainer_state.json ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.40387290716171265,
3
+ "best_model_checkpoint": "/vsphhome/fengguoqing/Geneformer/models/data_diversity/L6/Fibroblasts/fold4/checkpoint-1122",
4
+ "epoch": 8.0,
5
+ "global_step": 1496,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.13,
12
+ "learning_rate": 2.5e-06,
13
+ "loss": 0.6926,
14
+ "step": 25
15
+ },
16
+ {
17
+ "epoch": 0.27,
18
+ "learning_rate": 5e-06,
19
+ "loss": 0.6933,
20
+ "step": 50
21
+ },
22
+ {
23
+ "epoch": 0.4,
24
+ "learning_rate": 7.5e-06,
25
+ "loss": 0.6935,
26
+ "step": 75
27
+ },
28
+ {
29
+ "epoch": 0.53,
30
+ "learning_rate": 1e-05,
31
+ "loss": 0.6936,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.67,
36
+ "learning_rate": 1.25e-05,
37
+ "loss": 0.6928,
38
+ "step": 125
39
+ },
40
+ {
41
+ "epoch": 0.8,
42
+ "learning_rate": 1.5e-05,
43
+ "loss": 0.692,
44
+ "step": 150
45
+ },
46
+ {
47
+ "epoch": 0.94,
48
+ "learning_rate": 1.75e-05,
49
+ "loss": 0.6907,
50
+ "step": 175
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "eval_accuracy": 0.5210482872472142,
55
+ "eval_loss": 0.6905837059020996,
56
+ "eval_macro_f1": 0.41147454390525884,
57
+ "eval_runtime": 15.8319,
58
+ "eval_samples_per_second": 306.091,
59
+ "eval_steps_per_second": 2.969,
60
+ "step": 187
61
+ },
62
+ {
63
+ "epoch": 1.07,
64
+ "learning_rate": 2e-05,
65
+ "loss": 0.6904,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 1.2,
70
+ "learning_rate": 2.25e-05,
71
+ "loss": 0.6898,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 1.34,
76
+ "learning_rate": 2.5e-05,
77
+ "loss": 0.6865,
78
+ "step": 250
79
+ },
80
+ {
81
+ "epoch": 1.47,
82
+ "learning_rate": 2.7500000000000004e-05,
83
+ "loss": 0.6869,
84
+ "step": 275
85
+ },
86
+ {
87
+ "epoch": 1.6,
88
+ "learning_rate": 3e-05,
89
+ "loss": 0.6814,
90
+ "step": 300
91
+ },
92
+ {
93
+ "epoch": 1.74,
94
+ "learning_rate": 3.2500000000000004e-05,
95
+ "loss": 0.6762,
96
+ "step": 325
97
+ },
98
+ {
99
+ "epoch": 1.87,
100
+ "learning_rate": 3.5e-05,
101
+ "loss": 0.6747,
102
+ "step": 350
103
+ },
104
+ {
105
+ "epoch": 2.0,
106
+ "eval_accuracy": 0.5984316962443252,
107
+ "eval_loss": 0.6632421016693115,
108
+ "eval_macro_f1": 0.5897740135113385,
109
+ "eval_runtime": 16.4963,
110
+ "eval_samples_per_second": 293.762,
111
+ "eval_steps_per_second": 2.849,
112
+ "step": 374
113
+ },
114
+ {
115
+ "epoch": 2.01,
116
+ "learning_rate": 3.7500000000000003e-05,
117
+ "loss": 0.6664,
118
+ "step": 375
119
+ },
120
+ {
121
+ "epoch": 2.14,
122
+ "learning_rate": 4e-05,
123
+ "loss": 0.6356,
124
+ "step": 400
125
+ },
126
+ {
127
+ "epoch": 2.27,
128
+ "learning_rate": 4.25e-05,
129
+ "loss": 0.6121,
130
+ "step": 425
131
+ },
132
+ {
133
+ "epoch": 2.41,
134
+ "learning_rate": 4.5e-05,
135
+ "loss": 0.5833,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 2.54,
140
+ "learning_rate": 4.75e-05,
141
+ "loss": 0.5619,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 2.67,
146
+ "learning_rate": 5e-05,
147
+ "loss": 0.5355,
148
+ "step": 500
149
+ },
150
+ {
151
+ "epoch": 2.81,
152
+ "learning_rate": 4.9858757062146896e-05,
153
+ "loss": 0.4877,
154
+ "step": 525
155
+ },
156
+ {
157
+ "epoch": 2.94,
158
+ "learning_rate": 4.971751412429379e-05,
159
+ "loss": 0.4662,
160
+ "step": 550
161
+ },
162
+ {
163
+ "epoch": 3.0,
164
+ "eval_accuracy": 0.7631035905901775,
165
+ "eval_loss": 0.48640862107276917,
166
+ "eval_macro_f1": 0.7547183753731113,
167
+ "eval_runtime": 16.3862,
168
+ "eval_samples_per_second": 295.737,
169
+ "eval_steps_per_second": 2.868,
170
+ "step": 561
171
+ },
172
+ {
173
+ "epoch": 3.07,
174
+ "learning_rate": 4.957627118644068e-05,
175
+ "loss": 0.4818,
176
+ "step": 575
177
+ },
178
+ {
179
+ "epoch": 3.21,
180
+ "learning_rate": 4.9435028248587575e-05,
181
+ "loss": 0.458,
182
+ "step": 600
183
+ },
184
+ {
185
+ "epoch": 3.34,
186
+ "learning_rate": 4.929378531073446e-05,
187
+ "loss": 0.424,
188
+ "step": 625
189
+ },
190
+ {
191
+ "epoch": 3.48,
192
+ "learning_rate": 4.915254237288136e-05,
193
+ "loss": 0.4373,
194
+ "step": 650
195
+ },
196
+ {
197
+ "epoch": 3.61,
198
+ "learning_rate": 4.9011299435028255e-05,
199
+ "loss": 0.4121,
200
+ "step": 675
201
+ },
202
+ {
203
+ "epoch": 3.74,
204
+ "learning_rate": 4.887005649717514e-05,
205
+ "loss": 0.397,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 3.88,
210
+ "learning_rate": 4.8728813559322034e-05,
211
+ "loss": 0.3926,
212
+ "step": 725
213
+ },
214
+ {
215
+ "epoch": 4.0,
216
+ "eval_accuracy": 0.7961205117622782,
217
+ "eval_loss": 0.42608675360679626,
218
+ "eval_macro_f1": 0.7956681990046659,
219
+ "eval_runtime": 16.1617,
220
+ "eval_samples_per_second": 299.845,
221
+ "eval_steps_per_second": 2.908,
222
+ "step": 748
223
+ },
224
+ {
225
+ "epoch": 4.01,
226
+ "learning_rate": 4.8587570621468934e-05,
227
+ "loss": 0.3983,
228
+ "step": 750
229
+ },
230
+ {
231
+ "epoch": 4.14,
232
+ "learning_rate": 4.844632768361582e-05,
233
+ "loss": 0.3468,
234
+ "step": 775
235
+ },
236
+ {
237
+ "epoch": 4.28,
238
+ "learning_rate": 4.8305084745762714e-05,
239
+ "loss": 0.3599,
240
+ "step": 800
241
+ },
242
+ {
243
+ "epoch": 4.41,
244
+ "learning_rate": 4.816384180790961e-05,
245
+ "loss": 0.3619,
246
+ "step": 825
247
+ },
248
+ {
249
+ "epoch": 4.55,
250
+ "learning_rate": 4.80225988700565e-05,
251
+ "loss": 0.3428,
252
+ "step": 850
253
+ },
254
+ {
255
+ "epoch": 4.68,
256
+ "learning_rate": 4.788135593220339e-05,
257
+ "loss": 0.3622,
258
+ "step": 875
259
+ },
260
+ {
261
+ "epoch": 4.81,
262
+ "learning_rate": 4.7740112994350286e-05,
263
+ "loss": 0.3451,
264
+ "step": 900
265
+ },
266
+ {
267
+ "epoch": 4.95,
268
+ "learning_rate": 4.759887005649718e-05,
269
+ "loss": 0.3714,
270
+ "step": 925
271
+ },
272
+ {
273
+ "epoch": 5.0,
274
+ "eval_accuracy": 0.7845645893520429,
275
+ "eval_loss": 0.4528542459011078,
276
+ "eval_macro_f1": 0.7833690643653564,
277
+ "eval_runtime": 16.4437,
278
+ "eval_samples_per_second": 294.702,
279
+ "eval_steps_per_second": 2.858,
280
+ "step": 935
281
+ },
282
+ {
283
+ "epoch": 5.08,
284
+ "learning_rate": 4.745762711864407e-05,
285
+ "loss": 0.339,
286
+ "step": 950
287
+ },
288
+ {
289
+ "epoch": 5.21,
290
+ "learning_rate": 4.7316384180790966e-05,
291
+ "loss": 0.2876,
292
+ "step": 975
293
+ },
294
+ {
295
+ "epoch": 5.35,
296
+ "learning_rate": 4.717514124293785e-05,
297
+ "loss": 0.355,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 5.48,
302
+ "learning_rate": 4.703389830508475e-05,
303
+ "loss": 0.3099,
304
+ "step": 1025
305
+ },
306
+ {
307
+ "epoch": 5.61,
308
+ "learning_rate": 4.689265536723164e-05,
309
+ "loss": 0.322,
310
+ "step": 1050
311
+ },
312
+ {
313
+ "epoch": 5.75,
314
+ "learning_rate": 4.675141242937853e-05,
315
+ "loss": 0.3246,
316
+ "step": 1075
317
+ },
318
+ {
319
+ "epoch": 5.88,
320
+ "learning_rate": 4.6610169491525425e-05,
321
+ "loss": 0.3719,
322
+ "step": 1100
323
+ },
324
+ {
325
+ "epoch": 6.0,
326
+ "eval_accuracy": 0.8186132893107718,
327
+ "eval_loss": 0.40387290716171265,
328
+ "eval_macro_f1": 0.8179837351502454,
329
+ "eval_runtime": 15.9198,
330
+ "eval_samples_per_second": 304.4,
331
+ "eval_steps_per_second": 2.952,
332
+ "step": 1122
333
+ },
334
+ {
335
+ "epoch": 6.02,
336
+ "learning_rate": 4.646892655367232e-05,
337
+ "loss": 0.2755,
338
+ "step": 1125
339
+ },
340
+ {
341
+ "epoch": 6.15,
342
+ "learning_rate": 4.632768361581921e-05,
343
+ "loss": 0.293,
344
+ "step": 1150
345
+ },
346
+ {
347
+ "epoch": 6.28,
348
+ "learning_rate": 4.6186440677966104e-05,
349
+ "loss": 0.3708,
350
+ "step": 1175
351
+ },
352
+ {
353
+ "epoch": 6.42,
354
+ "learning_rate": 4.6045197740113e-05,
355
+ "loss": 0.2885,
356
+ "step": 1200
357
+ },
358
+ {
359
+ "epoch": 6.55,
360
+ "learning_rate": 4.590395480225989e-05,
361
+ "loss": 0.2562,
362
+ "step": 1225
363
+ },
364
+ {
365
+ "epoch": 6.68,
366
+ "learning_rate": 4.5762711864406784e-05,
367
+ "loss": 0.2814,
368
+ "step": 1250
369
+ },
370
+ {
371
+ "epoch": 6.82,
372
+ "learning_rate": 4.562146892655367e-05,
373
+ "loss": 0.3036,
374
+ "step": 1275
375
+ },
376
+ {
377
+ "epoch": 6.95,
378
+ "learning_rate": 4.548022598870056e-05,
379
+ "loss": 0.278,
380
+ "step": 1300
381
+ },
382
+ {
383
+ "epoch": 7.0,
384
+ "eval_accuracy": 0.7992158481221626,
385
+ "eval_loss": 0.44179922342300415,
386
+ "eval_macro_f1": 0.7983638814289351,
387
+ "eval_runtime": 16.9377,
388
+ "eval_samples_per_second": 286.107,
389
+ "eval_steps_per_second": 2.775,
390
+ "step": 1309
391
+ },
392
+ {
393
+ "epoch": 7.09,
394
+ "learning_rate": 4.533898305084746e-05,
395
+ "loss": 0.2443,
396
+ "step": 1325
397
+ },
398
+ {
399
+ "epoch": 7.22,
400
+ "learning_rate": 4.519774011299435e-05,
401
+ "loss": 0.2577,
402
+ "step": 1350
403
+ },
404
+ {
405
+ "epoch": 7.35,
406
+ "learning_rate": 4.505649717514124e-05,
407
+ "loss": 0.2615,
408
+ "step": 1375
409
+ },
410
+ {
411
+ "epoch": 7.49,
412
+ "learning_rate": 4.491525423728814e-05,
413
+ "loss": 0.2527,
414
+ "step": 1400
415
+ },
416
+ {
417
+ "epoch": 7.62,
418
+ "learning_rate": 4.477401129943503e-05,
419
+ "loss": 0.3119,
420
+ "step": 1425
421
+ },
422
+ {
423
+ "epoch": 7.75,
424
+ "learning_rate": 4.463276836158192e-05,
425
+ "loss": 0.2463,
426
+ "step": 1450
427
+ },
428
+ {
429
+ "epoch": 7.89,
430
+ "learning_rate": 4.4491525423728816e-05,
431
+ "loss": 0.2449,
432
+ "step": 1475
433
+ },
434
+ {
435
+ "epoch": 8.0,
436
+ "eval_accuracy": 0.8196450680973999,
437
+ "eval_loss": 0.4183129370212555,
438
+ "eval_macro_f1": 0.8195520921951007,
439
+ "eval_runtime": 16.081,
440
+ "eval_samples_per_second": 301.35,
441
+ "eval_steps_per_second": 2.923,
442
+ "step": 1496
443
+ }
444
+ ],
445
+ "max_steps": 9350,
446
+ "num_train_epochs": 50,
447
+ "total_flos": 6154099081543680.0,
448
+ "trial_name": null,
449
+ "trial_params": null
450
+ }
data_curation&frozen_layers/dataset1_all/F0/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e2d0e3aa5d2da2620967534f5f2e2c2256b221f18f54b08d3a18b5b4ce4645e
3
+ size 82168378
data_curation&frozen_layers/dataset3_2048/F2/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a75e0236ae48e131d17c55edac95710df7cda4d1d3ae117d50c03e177291089f
3
+ size 15006
data_curation&frozen_layers/dataset3_2048/F2/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5b52d243330319c0af9e8ee37eec3cd97862d9821d4bac5ea3efa228ba7e3a4
3
+ size 1064
data_curation&frozen_layers/dataset3_2048/F4/predictions.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:092e9fa3ba8aec8b46c4e4554cfbd6a55d1b7c2b48af1268d0a19547360a5793
3
+ size 78067
data_curation&frozen_layers/dataset3_2048/F4/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:495394a542cc7656b4fd829b32b9dab7bc9f591b59ad7220ce156679e0002696
3
+ size 4280
scaling_performance/.DS_Store CHANGED
Binary files a/scaling_performance/.DS_Store and b/scaling_performance/.DS_Store differ
 
scaling_performance/1000/.DS_Store CHANGED
Binary files a/scaling_performance/1000/.DS_Store and b/scaling_performance/1000/.DS_Store differ
 
scaling_performance/1000/L2/.DS_Store CHANGED
Binary files a/scaling_performance/1000/L2/.DS_Store and b/scaling_performance/1000/L2/.DS_Store differ
 
scaling_performance/1000/L2/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.545,
3
+ "test_loss": 0.6915959715843201,
4
+ "test_macro_f1": 0.3625442191166684,
5
+ "test_runtime": 0.4924,
6
+ "test_samples_per_second": 406.157,
7
+ "test_steps_per_second": 4.062
8
+ }
scaling_performance/1000/L2/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 2,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "problem_type": "single_label_classification",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.28.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
scaling_performance/1000/L2/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.545,
3
+ "test_loss": 0.6915959715843201,
4
+ "test_macro_f1": 0.3625442191166684,
5
+ "test_runtime": 0.4924,
6
+ "test_samples_per_second": 406.157,
7
+ "test_steps_per_second": 4.062
8
+ }
scaling_performance/1000/L2/trainer_state.json ADDED
@@ -0,0 +1,712 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6915959715843201,
3
+ "best_model_checkpoint": "/vsphhome/fengguoqing/Geneformer/models/5folds_allmodels/1000samples/L2/fold4/checkpoint-80",
4
+ "epoch": 12.0,
5
+ "global_step": 96,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 1.0000000000000001e-07,
13
+ "loss": 0.7159,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 2.0000000000000002e-07,
19
+ "loss": 0.701,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 3.0000000000000004e-07,
25
+ "loss": 0.7445,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 4.0000000000000003e-07,
31
+ "loss": 0.7041,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.62,
36
+ "learning_rate": 5.000000000000001e-07,
37
+ "loss": 0.6917,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 6.000000000000001e-07,
43
+ "loss": 0.7,
44
+ "step": 6
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 7.000000000000001e-07,
49
+ "loss": 0.6976,
50
+ "step": 7
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "learning_rate": 8.000000000000001e-07,
55
+ "loss": 0.731,
56
+ "step": 8
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_accuracy": 0.45,
61
+ "eval_loss": 0.7109375,
62
+ "eval_macro_f1": 0.3103448275862069,
63
+ "eval_runtime": 0.5076,
64
+ "eval_samples_per_second": 393.99,
65
+ "eval_steps_per_second": 3.94,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 1.12,
70
+ "learning_rate": 9e-07,
71
+ "loss": 0.7144,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 1.25,
76
+ "learning_rate": 1.0000000000000002e-06,
77
+ "loss": 0.7244,
78
+ "step": 10
79
+ },
80
+ {
81
+ "epoch": 1.38,
82
+ "learning_rate": 1.1e-06,
83
+ "loss": 0.7184,
84
+ "step": 11
85
+ },
86
+ {
87
+ "epoch": 1.5,
88
+ "learning_rate": 1.2000000000000002e-06,
89
+ "loss": 0.7062,
90
+ "step": 12
91
+ },
92
+ {
93
+ "epoch": 1.62,
94
+ "learning_rate": 1.3e-06,
95
+ "loss": 0.6956,
96
+ "step": 13
97
+ },
98
+ {
99
+ "epoch": 1.75,
100
+ "learning_rate": 1.4000000000000001e-06,
101
+ "loss": 0.7072,
102
+ "step": 14
103
+ },
104
+ {
105
+ "epoch": 1.88,
106
+ "learning_rate": 1.5e-06,
107
+ "loss": 0.6984,
108
+ "step": 15
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "learning_rate": 1.6000000000000001e-06,
113
+ "loss": 0.7097,
114
+ "step": 16
115
+ },
116
+ {
117
+ "epoch": 2.0,
118
+ "eval_accuracy": 0.455,
119
+ "eval_loss": 0.7094874382019043,
120
+ "eval_macro_f1": 0.32780364466097256,
121
+ "eval_runtime": 0.499,
122
+ "eval_samples_per_second": 400.839,
123
+ "eval_steps_per_second": 4.008,
124
+ "step": 16
125
+ },
126
+ {
127
+ "epoch": 2.12,
128
+ "learning_rate": 1.7000000000000002e-06,
129
+ "loss": 0.7078,
130
+ "step": 17
131
+ },
132
+ {
133
+ "epoch": 2.25,
134
+ "learning_rate": 1.8e-06,
135
+ "loss": 0.7055,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 2.38,
140
+ "learning_rate": 1.9e-06,
141
+ "loss": 0.698,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 2.5,
146
+ "learning_rate": 2.0000000000000003e-06,
147
+ "loss": 0.715,
148
+ "step": 20
149
+ },
150
+ {
151
+ "epoch": 2.62,
152
+ "learning_rate": 2.1000000000000002e-06,
153
+ "loss": 0.7083,
154
+ "step": 21
155
+ },
156
+ {
157
+ "epoch": 2.75,
158
+ "learning_rate": 2.2e-06,
159
+ "loss": 0.7027,
160
+ "step": 22
161
+ },
162
+ {
163
+ "epoch": 2.88,
164
+ "learning_rate": 2.3e-06,
165
+ "loss": 0.7082,
166
+ "step": 23
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "learning_rate": 2.4000000000000003e-06,
171
+ "loss": 0.7034,
172
+ "step": 24
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_accuracy": 0.455,
177
+ "eval_loss": 0.7071405649185181,
178
+ "eval_macro_f1": 0.32780364466097256,
179
+ "eval_runtime": 0.5786,
180
+ "eval_samples_per_second": 345.685,
181
+ "eval_steps_per_second": 3.457,
182
+ "step": 24
183
+ },
184
+ {
185
+ "epoch": 3.12,
186
+ "learning_rate": 2.5e-06,
187
+ "loss": 0.7044,
188
+ "step": 25
189
+ },
190
+ {
191
+ "epoch": 3.25,
192
+ "learning_rate": 2.6e-06,
193
+ "loss": 0.7111,
194
+ "step": 26
195
+ },
196
+ {
197
+ "epoch": 3.38,
198
+ "learning_rate": 2.7e-06,
199
+ "loss": 0.6988,
200
+ "step": 27
201
+ },
202
+ {
203
+ "epoch": 3.5,
204
+ "learning_rate": 2.8000000000000003e-06,
205
+ "loss": 0.6876,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 3.62,
210
+ "learning_rate": 2.9e-06,
211
+ "loss": 0.7027,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 3.75,
216
+ "learning_rate": 3e-06,
217
+ "loss": 0.7028,
218
+ "step": 30
219
+ },
220
+ {
221
+ "epoch": 3.88,
222
+ "learning_rate": 3.1e-06,
223
+ "loss": 0.7086,
224
+ "step": 31
225
+ },
226
+ {
227
+ "epoch": 4.0,
228
+ "learning_rate": 3.2000000000000003e-06,
229
+ "loss": 0.7079,
230
+ "step": 32
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_accuracy": 0.44,
235
+ "eval_loss": 0.7041958570480347,
236
+ "eval_macro_f1": 0.3333333333333333,
237
+ "eval_runtime": 0.5218,
238
+ "eval_samples_per_second": 383.289,
239
+ "eval_steps_per_second": 3.833,
240
+ "step": 32
241
+ },
242
+ {
243
+ "epoch": 4.12,
244
+ "learning_rate": 3.3e-06,
245
+ "loss": 0.7005,
246
+ "step": 33
247
+ },
248
+ {
249
+ "epoch": 4.25,
250
+ "learning_rate": 3.4000000000000005e-06,
251
+ "loss": 0.6968,
252
+ "step": 34
253
+ },
254
+ {
255
+ "epoch": 4.38,
256
+ "learning_rate": 3.5000000000000004e-06,
257
+ "loss": 0.7009,
258
+ "step": 35
259
+ },
260
+ {
261
+ "epoch": 4.5,
262
+ "learning_rate": 3.6e-06,
263
+ "loss": 0.7043,
264
+ "step": 36
265
+ },
266
+ {
267
+ "epoch": 4.62,
268
+ "learning_rate": 3.7e-06,
269
+ "loss": 0.6945,
270
+ "step": 37
271
+ },
272
+ {
273
+ "epoch": 4.75,
274
+ "learning_rate": 3.8e-06,
275
+ "loss": 0.6995,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 4.88,
280
+ "learning_rate": 3.9e-06,
281
+ "loss": 0.693,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 5.0,
286
+ "learning_rate": 4.000000000000001e-06,
287
+ "loss": 0.7036,
288
+ "step": 40
289
+ },
290
+ {
291
+ "epoch": 5.0,
292
+ "eval_accuracy": 0.46,
293
+ "eval_loss": 0.7008482217788696,
294
+ "eval_macro_f1": 0.3846153846153846,
295
+ "eval_runtime": 0.6101,
296
+ "eval_samples_per_second": 327.803,
297
+ "eval_steps_per_second": 3.278,
298
+ "step": 40
299
+ },
300
+ {
301
+ "epoch": 5.12,
302
+ "learning_rate": 4.1000000000000006e-06,
303
+ "loss": 0.6946,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 5.25,
308
+ "learning_rate": 4.2000000000000004e-06,
309
+ "loss": 0.6955,
310
+ "step": 42
311
+ },
312
+ {
313
+ "epoch": 5.38,
314
+ "learning_rate": 4.2999999999999995e-06,
315
+ "loss": 0.697,
316
+ "step": 43
317
+ },
318
+ {
319
+ "epoch": 5.5,
320
+ "learning_rate": 4.4e-06,
321
+ "loss": 0.7048,
322
+ "step": 44
323
+ },
324
+ {
325
+ "epoch": 5.62,
326
+ "learning_rate": 4.5e-06,
327
+ "loss": 0.6956,
328
+ "step": 45
329
+ },
330
+ {
331
+ "epoch": 5.75,
332
+ "learning_rate": 4.6e-06,
333
+ "loss": 0.6956,
334
+ "step": 46
335
+ },
336
+ {
337
+ "epoch": 5.88,
338
+ "learning_rate": 4.7e-06,
339
+ "loss": 0.691,
340
+ "step": 47
341
+ },
342
+ {
343
+ "epoch": 6.0,
344
+ "learning_rate": 4.800000000000001e-06,
345
+ "loss": 0.6909,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 6.0,
350
+ "eval_accuracy": 0.46,
351
+ "eval_loss": 0.6975666880607605,
352
+ "eval_macro_f1": 0.4375,
353
+ "eval_runtime": 0.5216,
354
+ "eval_samples_per_second": 383.451,
355
+ "eval_steps_per_second": 3.835,
356
+ "step": 48
357
+ },
358
+ {
359
+ "epoch": 6.12,
360
+ "learning_rate": 4.9000000000000005e-06,
361
+ "loss": 0.6932,
362
+ "step": 49
363
+ },
364
+ {
365
+ "epoch": 6.25,
366
+ "learning_rate": 5e-06,
367
+ "loss": 0.6907,
368
+ "step": 50
369
+ },
370
+ {
371
+ "epoch": 6.38,
372
+ "learning_rate": 5.1e-06,
373
+ "loss": 0.6925,
374
+ "step": 51
375
+ },
376
+ {
377
+ "epoch": 6.5,
378
+ "learning_rate": 5.2e-06,
379
+ "loss": 0.6979,
380
+ "step": 52
381
+ },
382
+ {
383
+ "epoch": 6.62,
384
+ "learning_rate": 5.3e-06,
385
+ "loss": 0.6935,
386
+ "step": 53
387
+ },
388
+ {
389
+ "epoch": 6.75,
390
+ "learning_rate": 5.4e-06,
391
+ "loss": 0.689,
392
+ "step": 54
393
+ },
394
+ {
395
+ "epoch": 6.88,
396
+ "learning_rate": 5.500000000000001e-06,
397
+ "loss": 0.6883,
398
+ "step": 55
399
+ },
400
+ {
401
+ "epoch": 7.0,
402
+ "learning_rate": 5.600000000000001e-06,
403
+ "loss": 0.6878,
404
+ "step": 56
405
+ },
406
+ {
407
+ "epoch": 7.0,
408
+ "eval_accuracy": 0.48,
409
+ "eval_loss": 0.6947869062423706,
410
+ "eval_macro_f1": 0.47743945332127424,
411
+ "eval_runtime": 0.5746,
412
+ "eval_samples_per_second": 348.056,
413
+ "eval_steps_per_second": 3.481,
414
+ "step": 56
415
+ },
416
+ {
417
+ "epoch": 7.12,
418
+ "learning_rate": 5.7000000000000005e-06,
419
+ "loss": 0.6902,
420
+ "step": 57
421
+ },
422
+ {
423
+ "epoch": 7.25,
424
+ "learning_rate": 5.8e-06,
425
+ "loss": 0.6918,
426
+ "step": 58
427
+ },
428
+ {
429
+ "epoch": 7.38,
430
+ "learning_rate": 5.9e-06,
431
+ "loss": 0.6876,
432
+ "step": 59
433
+ },
434
+ {
435
+ "epoch": 7.5,
436
+ "learning_rate": 6e-06,
437
+ "loss": 0.6932,
438
+ "step": 60
439
+ },
440
+ {
441
+ "epoch": 7.62,
442
+ "learning_rate": 6.1e-06,
443
+ "loss": 0.6856,
444
+ "step": 61
445
+ },
446
+ {
447
+ "epoch": 7.75,
448
+ "learning_rate": 6.2e-06,
449
+ "loss": 0.6851,
450
+ "step": 62
451
+ },
452
+ {
453
+ "epoch": 7.88,
454
+ "learning_rate": 6.300000000000001e-06,
455
+ "loss": 0.6837,
456
+ "step": 63
457
+ },
458
+ {
459
+ "epoch": 8.0,
460
+ "learning_rate": 6.4000000000000006e-06,
461
+ "loss": 0.687,
462
+ "step": 64
463
+ },
464
+ {
465
+ "epoch": 8.0,
466
+ "eval_accuracy": 0.54,
467
+ "eval_loss": 0.6931593418121338,
468
+ "eval_macro_f1": 0.4837840870833801,
469
+ "eval_runtime": 0.5564,
470
+ "eval_samples_per_second": 359.445,
471
+ "eval_steps_per_second": 3.594,
472
+ "step": 64
473
+ },
474
+ {
475
+ "epoch": 8.12,
476
+ "learning_rate": 6.5000000000000004e-06,
477
+ "loss": 0.6901,
478
+ "step": 65
479
+ },
480
+ {
481
+ "epoch": 8.25,
482
+ "learning_rate": 6.6e-06,
483
+ "loss": 0.6886,
484
+ "step": 66
485
+ },
486
+ {
487
+ "epoch": 8.38,
488
+ "learning_rate": 6.700000000000001e-06,
489
+ "loss": 0.6802,
490
+ "step": 67
491
+ },
492
+ {
493
+ "epoch": 8.5,
494
+ "learning_rate": 6.800000000000001e-06,
495
+ "loss": 0.679,
496
+ "step": 68
497
+ },
498
+ {
499
+ "epoch": 8.62,
500
+ "learning_rate": 6.900000000000001e-06,
501
+ "loss": 0.6916,
502
+ "step": 69
503
+ },
504
+ {
505
+ "epoch": 8.75,
506
+ "learning_rate": 7.000000000000001e-06,
507
+ "loss": 0.6878,
508
+ "step": 70
509
+ },
510
+ {
511
+ "epoch": 8.88,
512
+ "learning_rate": 7.1e-06,
513
+ "loss": 0.684,
514
+ "step": 71
515
+ },
516
+ {
517
+ "epoch": 9.0,
518
+ "learning_rate": 7.2e-06,
519
+ "loss": 0.6892,
520
+ "step": 72
521
+ },
522
+ {
523
+ "epoch": 9.0,
524
+ "eval_accuracy": 0.55,
525
+ "eval_loss": 0.6919929385185242,
526
+ "eval_macro_f1": 0.39999999999999997,
527
+ "eval_runtime": 0.5264,
528
+ "eval_samples_per_second": 379.923,
529
+ "eval_steps_per_second": 3.799,
530
+ "step": 72
531
+ },
532
+ {
533
+ "epoch": 9.12,
534
+ "learning_rate": 7.2999999999999996e-06,
535
+ "loss": 0.6799,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 9.25,
540
+ "learning_rate": 7.4e-06,
541
+ "loss": 0.6864,
542
+ "step": 74
543
+ },
544
+ {
545
+ "epoch": 9.38,
546
+ "learning_rate": 7.5e-06,
547
+ "loss": 0.682,
548
+ "step": 75
549
+ },
550
+ {
551
+ "epoch": 9.5,
552
+ "learning_rate": 7.6e-06,
553
+ "loss": 0.6829,
554
+ "step": 76
555
+ },
556
+ {
557
+ "epoch": 9.62,
558
+ "learning_rate": 7.7e-06,
559
+ "loss": 0.6891,
560
+ "step": 77
561
+ },
562
+ {
563
+ "epoch": 9.75,
564
+ "learning_rate": 7.8e-06,
565
+ "loss": 0.6714,
566
+ "step": 78
567
+ },
568
+ {
569
+ "epoch": 9.88,
570
+ "learning_rate": 7.9e-06,
571
+ "loss": 0.6854,
572
+ "step": 79
573
+ },
574
+ {
575
+ "epoch": 10.0,
576
+ "learning_rate": 8.000000000000001e-06,
577
+ "loss": 0.6885,
578
+ "step": 80
579
+ },
580
+ {
581
+ "epoch": 10.0,
582
+ "eval_accuracy": 0.545,
583
+ "eval_loss": 0.6915959715843201,
584
+ "eval_macro_f1": 0.3625442191166684,
585
+ "eval_runtime": 0.5616,
586
+ "eval_samples_per_second": 356.111,
587
+ "eval_steps_per_second": 3.561,
588
+ "step": 80
589
+ },
590
+ {
591
+ "epoch": 10.12,
592
+ "learning_rate": 8.1e-06,
593
+ "loss": 0.6902,
594
+ "step": 81
595
+ },
596
+ {
597
+ "epoch": 10.25,
598
+ "learning_rate": 8.200000000000001e-06,
599
+ "loss": 0.68,
600
+ "step": 82
601
+ },
602
+ {
603
+ "epoch": 10.38,
604
+ "learning_rate": 8.3e-06,
605
+ "loss": 0.6864,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 10.5,
610
+ "learning_rate": 8.400000000000001e-06,
611
+ "loss": 0.687,
612
+ "step": 84
613
+ },
614
+ {
615
+ "epoch": 10.62,
616
+ "learning_rate": 8.500000000000002e-06,
617
+ "loss": 0.681,
618
+ "step": 85
619
+ },
620
+ {
621
+ "epoch": 10.75,
622
+ "learning_rate": 8.599999999999999e-06,
623
+ "loss": 0.6713,
624
+ "step": 86
625
+ },
626
+ {
627
+ "epoch": 10.88,
628
+ "learning_rate": 8.7e-06,
629
+ "loss": 0.6792,
630
+ "step": 87
631
+ },
632
+ {
633
+ "epoch": 11.0,
634
+ "learning_rate": 8.8e-06,
635
+ "loss": 0.6705,
636
+ "step": 88
637
+ },
638
+ {
639
+ "epoch": 11.0,
640
+ "eval_accuracy": 0.54,
641
+ "eval_loss": 0.6916151642799377,
642
+ "eval_macro_f1": 0.35064935064935066,
643
+ "eval_runtime": 0.4888,
644
+ "eval_samples_per_second": 409.169,
645
+ "eval_steps_per_second": 4.092,
646
+ "step": 88
647
+ },
648
+ {
649
+ "epoch": 11.12,
650
+ "learning_rate": 8.9e-06,
651
+ "loss": 0.6946,
652
+ "step": 89
653
+ },
654
+ {
655
+ "epoch": 11.25,
656
+ "learning_rate": 9e-06,
657
+ "loss": 0.6807,
658
+ "step": 90
659
+ },
660
+ {
661
+ "epoch": 11.38,
662
+ "learning_rate": 9.100000000000001e-06,
663
+ "loss": 0.6819,
664
+ "step": 91
665
+ },
666
+ {
667
+ "epoch": 11.5,
668
+ "learning_rate": 9.2e-06,
669
+ "loss": 0.6822,
670
+ "step": 92
671
+ },
672
+ {
673
+ "epoch": 11.62,
674
+ "learning_rate": 9.3e-06,
675
+ "loss": 0.6774,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 11.75,
680
+ "learning_rate": 9.4e-06,
681
+ "loss": 0.6684,
682
+ "step": 94
683
+ },
684
+ {
685
+ "epoch": 11.88,
686
+ "learning_rate": 9.5e-06,
687
+ "loss": 0.6839,
688
+ "step": 95
689
+ },
690
+ {
691
+ "epoch": 12.0,
692
+ "learning_rate": 9.600000000000001e-06,
693
+ "loss": 0.6724,
694
+ "step": 96
695
+ },
696
+ {
697
+ "epoch": 12.0,
698
+ "eval_accuracy": 0.545,
699
+ "eval_loss": 0.691866934299469,
700
+ "eval_macro_f1": 0.35275080906148865,
701
+ "eval_runtime": 0.5673,
702
+ "eval_samples_per_second": 352.518,
703
+ "eval_steps_per_second": 3.525,
704
+ "step": 96
705
+ }
706
+ ],
707
+ "max_steps": 400,
708
+ "num_train_epochs": 50,
709
+ "total_flos": 132241607884800.0,
710
+ "trial_name": null,
711
+ "trial_params": null
712
+ }
scaling_performance/1000/L4/.DS_Store CHANGED
Binary files a/scaling_performance/1000/L4/.DS_Store and b/scaling_performance/1000/L4/.DS_Store differ
 
scaling_performance/1000/L4/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.545,
3
+ "test_loss": 0.6891449689865112,
4
+ "test_macro_f1": 0.35275080906148865,
5
+ "test_runtime": 0.5999,
6
+ "test_samples_per_second": 333.384,
7
+ "test_steps_per_second": 3.334
8
+ }
scaling_performance/1000/L4/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 4,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "problem_type": "single_label_classification",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.28.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
scaling_performance/1000/L4/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.545,
3
+ "test_loss": 0.6891449689865112,
4
+ "test_macro_f1": 0.35275080906148865,
5
+ "test_runtime": 0.5999,
6
+ "test_samples_per_second": 333.384,
7
+ "test_steps_per_second": 3.334
8
+ }
scaling_performance/1000/L4/trainer_state.json ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6891449689865112,
3
+ "best_model_checkpoint": "/vsphhome/fengguoqing/Geneformer/models/5folds_allmodels/1000samples/L4/fold4/checkpoint-48",
4
+ "epoch": 8.0,
5
+ "global_step": 64,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 1.0000000000000001e-07,
13
+ "loss": 0.6931,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 2.0000000000000002e-07,
19
+ "loss": 0.6891,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 3.0000000000000004e-07,
25
+ "loss": 0.69,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 4.0000000000000003e-07,
31
+ "loss": 0.6921,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.62,
36
+ "learning_rate": 5.000000000000001e-07,
37
+ "loss": 0.6961,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 6.000000000000001e-07,
43
+ "loss": 0.6871,
44
+ "step": 6
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 7.000000000000001e-07,
49
+ "loss": 0.6956,
50
+ "step": 7
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "learning_rate": 8.000000000000001e-07,
55
+ "loss": 0.6891,
56
+ "step": 8
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_accuracy": 0.535,
61
+ "eval_loss": 0.6912603974342346,
62
+ "eval_macro_f1": 0.4906760864207673,
63
+ "eval_runtime": 0.6002,
64
+ "eval_samples_per_second": 333.239,
65
+ "eval_steps_per_second": 3.332,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 1.12,
70
+ "learning_rate": 9e-07,
71
+ "loss": 0.6912,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 1.25,
76
+ "learning_rate": 1.0000000000000002e-06,
77
+ "loss": 0.6894,
78
+ "step": 10
79
+ },
80
+ {
81
+ "epoch": 1.38,
82
+ "learning_rate": 1.1e-06,
83
+ "loss": 0.6862,
84
+ "step": 11
85
+ },
86
+ {
87
+ "epoch": 1.5,
88
+ "learning_rate": 1.2000000000000002e-06,
89
+ "loss": 0.6888,
90
+ "step": 12
91
+ },
92
+ {
93
+ "epoch": 1.62,
94
+ "learning_rate": 1.3e-06,
95
+ "loss": 0.6906,
96
+ "step": 13
97
+ },
98
+ {
99
+ "epoch": 1.75,
100
+ "learning_rate": 1.4000000000000001e-06,
101
+ "loss": 0.6974,
102
+ "step": 14
103
+ },
104
+ {
105
+ "epoch": 1.88,
106
+ "learning_rate": 1.5e-06,
107
+ "loss": 0.6964,
108
+ "step": 15
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "learning_rate": 1.6000000000000001e-06,
113
+ "loss": 0.689,
114
+ "step": 16
115
+ },
116
+ {
117
+ "epoch": 2.0,
118
+ "eval_accuracy": 0.555,
119
+ "eval_loss": 0.6907957196235657,
120
+ "eval_macro_f1": 0.5024458420684835,
121
+ "eval_runtime": 0.5971,
122
+ "eval_samples_per_second": 334.937,
123
+ "eval_steps_per_second": 3.349,
124
+ "step": 16
125
+ },
126
+ {
127
+ "epoch": 2.12,
128
+ "learning_rate": 1.7000000000000002e-06,
129
+ "loss": 0.6874,
130
+ "step": 17
131
+ },
132
+ {
133
+ "epoch": 2.25,
134
+ "learning_rate": 1.8e-06,
135
+ "loss": 0.6979,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 2.38,
140
+ "learning_rate": 1.9e-06,
141
+ "loss": 0.6931,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 2.5,
146
+ "learning_rate": 2.0000000000000003e-06,
147
+ "loss": 0.6823,
148
+ "step": 20
149
+ },
150
+ {
151
+ "epoch": 2.62,
152
+ "learning_rate": 2.1000000000000002e-06,
153
+ "loss": 0.6901,
154
+ "step": 21
155
+ },
156
+ {
157
+ "epoch": 2.75,
158
+ "learning_rate": 2.2e-06,
159
+ "loss": 0.6894,
160
+ "step": 22
161
+ },
162
+ {
163
+ "epoch": 2.88,
164
+ "learning_rate": 2.3e-06,
165
+ "loss": 0.6887,
166
+ "step": 23
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "learning_rate": 2.4000000000000003e-06,
171
+ "loss": 0.693,
172
+ "step": 24
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_accuracy": 0.565,
177
+ "eval_loss": 0.690175473690033,
178
+ "eval_macro_f1": 0.4634763035367396,
179
+ "eval_runtime": 0.5949,
180
+ "eval_samples_per_second": 336.173,
181
+ "eval_steps_per_second": 3.362,
182
+ "step": 24
183
+ },
184
+ {
185
+ "epoch": 3.12,
186
+ "learning_rate": 2.5e-06,
187
+ "loss": 0.6931,
188
+ "step": 25
189
+ },
190
+ {
191
+ "epoch": 3.25,
192
+ "learning_rate": 2.6e-06,
193
+ "loss": 0.6882,
194
+ "step": 26
195
+ },
196
+ {
197
+ "epoch": 3.38,
198
+ "learning_rate": 2.7e-06,
199
+ "loss": 0.694,
200
+ "step": 27
201
+ },
202
+ {
203
+ "epoch": 3.5,
204
+ "learning_rate": 2.8000000000000003e-06,
205
+ "loss": 0.692,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 3.62,
210
+ "learning_rate": 2.9e-06,
211
+ "loss": 0.6899,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 3.75,
216
+ "learning_rate": 3e-06,
217
+ "loss": 0.6909,
218
+ "step": 30
219
+ },
220
+ {
221
+ "epoch": 3.88,
222
+ "learning_rate": 3.1e-06,
223
+ "loss": 0.6849,
224
+ "step": 31
225
+ },
226
+ {
227
+ "epoch": 4.0,
228
+ "learning_rate": 3.2000000000000003e-06,
229
+ "loss": 0.6811,
230
+ "step": 32
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_accuracy": 0.565,
235
+ "eval_loss": 0.6895776987075806,
236
+ "eval_macro_f1": 0.4238219808602934,
237
+ "eval_runtime": 0.5921,
238
+ "eval_samples_per_second": 337.774,
239
+ "eval_steps_per_second": 3.378,
240
+ "step": 32
241
+ },
242
+ {
243
+ "epoch": 4.12,
244
+ "learning_rate": 3.3e-06,
245
+ "loss": 0.6888,
246
+ "step": 33
247
+ },
248
+ {
249
+ "epoch": 4.25,
250
+ "learning_rate": 3.4000000000000005e-06,
251
+ "loss": 0.6896,
252
+ "step": 34
253
+ },
254
+ {
255
+ "epoch": 4.38,
256
+ "learning_rate": 3.5000000000000004e-06,
257
+ "loss": 0.6842,
258
+ "step": 35
259
+ },
260
+ {
261
+ "epoch": 4.5,
262
+ "learning_rate": 3.6e-06,
263
+ "loss": 0.6798,
264
+ "step": 36
265
+ },
266
+ {
267
+ "epoch": 4.62,
268
+ "learning_rate": 3.7e-06,
269
+ "loss": 0.6918,
270
+ "step": 37
271
+ },
272
+ {
273
+ "epoch": 4.75,
274
+ "learning_rate": 3.8e-06,
275
+ "loss": 0.6909,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 4.88,
280
+ "learning_rate": 3.9e-06,
281
+ "loss": 0.6875,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 5.0,
286
+ "learning_rate": 4.000000000000001e-06,
287
+ "loss": 0.6771,
288
+ "step": 40
289
+ },
290
+ {
291
+ "epoch": 5.0,
292
+ "eval_accuracy": 0.55,
293
+ "eval_loss": 0.6891635060310364,
294
+ "eval_macro_f1": 0.3647656691134952,
295
+ "eval_runtime": 0.6082,
296
+ "eval_samples_per_second": 328.828,
297
+ "eval_steps_per_second": 3.288,
298
+ "step": 40
299
+ },
300
+ {
301
+ "epoch": 5.12,
302
+ "learning_rate": 4.1000000000000006e-06,
303
+ "loss": 0.6995,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 5.25,
308
+ "learning_rate": 4.2000000000000004e-06,
309
+ "loss": 0.6806,
310
+ "step": 42
311
+ },
312
+ {
313
+ "epoch": 5.38,
314
+ "learning_rate": 4.2999999999999995e-06,
315
+ "loss": 0.6824,
316
+ "step": 43
317
+ },
318
+ {
319
+ "epoch": 5.5,
320
+ "learning_rate": 4.4e-06,
321
+ "loss": 0.6884,
322
+ "step": 44
323
+ },
324
+ {
325
+ "epoch": 5.62,
326
+ "learning_rate": 4.5e-06,
327
+ "loss": 0.6792,
328
+ "step": 45
329
+ },
330
+ {
331
+ "epoch": 5.75,
332
+ "learning_rate": 4.6e-06,
333
+ "loss": 0.6931,
334
+ "step": 46
335
+ },
336
+ {
337
+ "epoch": 5.88,
338
+ "learning_rate": 4.7e-06,
339
+ "loss": 0.6867,
340
+ "step": 47
341
+ },
342
+ {
343
+ "epoch": 6.0,
344
+ "learning_rate": 4.800000000000001e-06,
345
+ "loss": 0.6819,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 6.0,
350
+ "eval_accuracy": 0.545,
351
+ "eval_loss": 0.6891449689865112,
352
+ "eval_macro_f1": 0.35275080906148865,
353
+ "eval_runtime": 0.6013,
354
+ "eval_samples_per_second": 332.599,
355
+ "eval_steps_per_second": 3.326,
356
+ "step": 48
357
+ },
358
+ {
359
+ "epoch": 6.12,
360
+ "learning_rate": 4.9000000000000005e-06,
361
+ "loss": 0.6826,
362
+ "step": 49
363
+ },
364
+ {
365
+ "epoch": 6.25,
366
+ "learning_rate": 5e-06,
367
+ "loss": 0.6705,
368
+ "step": 50
369
+ },
370
+ {
371
+ "epoch": 6.38,
372
+ "learning_rate": 5.1e-06,
373
+ "loss": 0.6671,
374
+ "step": 51
375
+ },
376
+ {
377
+ "epoch": 6.5,
378
+ "learning_rate": 5.2e-06,
379
+ "loss": 0.7043,
380
+ "step": 52
381
+ },
382
+ {
383
+ "epoch": 6.62,
384
+ "learning_rate": 5.3e-06,
385
+ "loss": 0.6757,
386
+ "step": 53
387
+ },
388
+ {
389
+ "epoch": 6.75,
390
+ "learning_rate": 5.4e-06,
391
+ "loss": 0.6942,
392
+ "step": 54
393
+ },
394
+ {
395
+ "epoch": 6.88,
396
+ "learning_rate": 5.500000000000001e-06,
397
+ "loss": 0.6902,
398
+ "step": 55
399
+ },
400
+ {
401
+ "epoch": 7.0,
402
+ "learning_rate": 5.600000000000001e-06,
403
+ "loss": 0.7139,
404
+ "step": 56
405
+ },
406
+ {
407
+ "epoch": 7.0,
408
+ "eval_accuracy": 0.545,
409
+ "eval_loss": 0.6895077228546143,
410
+ "eval_macro_f1": 0.35275080906148865,
411
+ "eval_runtime": 0.5744,
412
+ "eval_samples_per_second": 348.173,
413
+ "eval_steps_per_second": 3.482,
414
+ "step": 56
415
+ },
416
+ {
417
+ "epoch": 7.12,
418
+ "learning_rate": 5.7000000000000005e-06,
419
+ "loss": 0.6857,
420
+ "step": 57
421
+ },
422
+ {
423
+ "epoch": 7.25,
424
+ "learning_rate": 5.8e-06,
425
+ "loss": 0.7153,
426
+ "step": 58
427
+ },
428
+ {
429
+ "epoch": 7.38,
430
+ "learning_rate": 5.9e-06,
431
+ "loss": 0.6892,
432
+ "step": 59
433
+ },
434
+ {
435
+ "epoch": 7.5,
436
+ "learning_rate": 6e-06,
437
+ "loss": 0.6819,
438
+ "step": 60
439
+ },
440
+ {
441
+ "epoch": 7.62,
442
+ "learning_rate": 6.1e-06,
443
+ "loss": 0.6698,
444
+ "step": 61
445
+ },
446
+ {
447
+ "epoch": 7.75,
448
+ "learning_rate": 6.2e-06,
449
+ "loss": 0.6628,
450
+ "step": 62
451
+ },
452
+ {
453
+ "epoch": 7.88,
454
+ "learning_rate": 6.300000000000001e-06,
455
+ "loss": 0.6882,
456
+ "step": 63
457
+ },
458
+ {
459
+ "epoch": 8.0,
460
+ "learning_rate": 6.4000000000000006e-06,
461
+ "loss": 0.6845,
462
+ "step": 64
463
+ },
464
+ {
465
+ "epoch": 8.0,
466
+ "eval_accuracy": 0.545,
467
+ "eval_loss": 0.6896821856498718,
468
+ "eval_macro_f1": 0.35275080906148865,
469
+ "eval_runtime": 0.5921,
470
+ "eval_samples_per_second": 337.784,
471
+ "eval_steps_per_second": 3.378,
472
+ "step": 64
473
+ }
474
+ ],
475
+ "max_steps": 400,
476
+ "num_train_epochs": 50,
477
+ "total_flos": 171067362508800.0,
478
+ "trial_name": null,
479
+ "trial_params": null
480
+ }
scaling_performance/1000/fine-tuned/.DS_Store CHANGED
Binary files a/scaling_performance/1000/fine-tuned/.DS_Store and b/scaling_performance/1000/fine-tuned/.DS_Store differ
 
scaling_performance/1000/fine-tuned/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.615,
3
+ "test_loss": 0.6587879657745361,
4
+ "test_macro_f1": 0.6042250263421655,
5
+ "test_runtime": 0.6844,
6
+ "test_samples_per_second": 292.222,
7
+ "test_steps_per_second": 2.922
8
+ }
scaling_performance/1000/fine-tuned/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/vsphhome/fengguoqing/Geneformer",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.02,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "relu",
10
+ "hidden_dropout_prob": 0.02,
11
+ "hidden_size": 256,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 512,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 2048,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 4,
18
+ "num_hidden_layers": 6,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "problem_type": "single_label_classification",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.28.0",
24
+ "type_vocab_size": 2,
25
+ "use_cache": true,
26
+ "vocab_size": 25426
27
+ }
scaling_performance/1000/fine-tuned/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.615,
3
+ "test_loss": 0.6587879657745361,
4
+ "test_macro_f1": 0.6042250263421655,
5
+ "test_runtime": 0.6844,
6
+ "test_samples_per_second": 292.222,
7
+ "test_steps_per_second": 2.922
8
+ }
scaling_performance/1000/fine-tuned/trainer_state.json ADDED
@@ -0,0 +1,1756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6587879657745361,
3
+ "best_model_checkpoint": "/vsphhome/fengguoqing/Geneformer/models/5folds_allmodels/1000samples/fine_tuned/fold0/checkpoint-224",
4
+ "epoch": 30.0,
5
+ "global_step": 240,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 1.0000000000000001e-07,
13
+ "loss": 0.6848,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 2.0000000000000002e-07,
19
+ "loss": 0.7017,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 3.0000000000000004e-07,
25
+ "loss": 0.7008,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 4.0000000000000003e-07,
31
+ "loss": 0.6882,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.62,
36
+ "learning_rate": 5.000000000000001e-07,
37
+ "loss": 0.6897,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 6.000000000000001e-07,
43
+ "loss": 0.6837,
44
+ "step": 6
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 7.000000000000001e-07,
49
+ "loss": 0.6949,
50
+ "step": 7
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "learning_rate": 8.000000000000001e-07,
55
+ "loss": 0.6921,
56
+ "step": 8
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_accuracy": 0.455,
61
+ "eval_loss": 0.6985742449760437,
62
+ "eval_macro_f1": 0.4285864066472701,
63
+ "eval_runtime": 0.767,
64
+ "eval_samples_per_second": 260.748,
65
+ "eval_steps_per_second": 2.607,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 1.12,
70
+ "learning_rate": 9e-07,
71
+ "loss": 0.687,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 1.25,
76
+ "learning_rate": 1.0000000000000002e-06,
77
+ "loss": 0.6932,
78
+ "step": 10
79
+ },
80
+ {
81
+ "epoch": 1.38,
82
+ "learning_rate": 1.1e-06,
83
+ "loss": 0.6838,
84
+ "step": 11
85
+ },
86
+ {
87
+ "epoch": 1.5,
88
+ "learning_rate": 1.2000000000000002e-06,
89
+ "loss": 0.6902,
90
+ "step": 12
91
+ },
92
+ {
93
+ "epoch": 1.62,
94
+ "learning_rate": 1.3e-06,
95
+ "loss": 0.6992,
96
+ "step": 13
97
+ },
98
+ {
99
+ "epoch": 1.75,
100
+ "learning_rate": 1.4000000000000001e-06,
101
+ "loss": 0.6965,
102
+ "step": 14
103
+ },
104
+ {
105
+ "epoch": 1.88,
106
+ "learning_rate": 1.5e-06,
107
+ "loss": 0.6888,
108
+ "step": 15
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "learning_rate": 1.6000000000000001e-06,
113
+ "loss": 0.6817,
114
+ "step": 16
115
+ },
116
+ {
117
+ "epoch": 2.0,
118
+ "eval_accuracy": 0.475,
119
+ "eval_loss": 0.6979620456695557,
120
+ "eval_macro_f1": 0.42495687176538244,
121
+ "eval_runtime": 0.7018,
122
+ "eval_samples_per_second": 284.975,
123
+ "eval_steps_per_second": 2.85,
124
+ "step": 16
125
+ },
126
+ {
127
+ "epoch": 2.12,
128
+ "learning_rate": 1.7000000000000002e-06,
129
+ "loss": 0.6905,
130
+ "step": 17
131
+ },
132
+ {
133
+ "epoch": 2.25,
134
+ "learning_rate": 1.8e-06,
135
+ "loss": 0.6945,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 2.38,
140
+ "learning_rate": 1.9e-06,
141
+ "loss": 0.6895,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 2.5,
146
+ "learning_rate": 2.0000000000000003e-06,
147
+ "loss": 0.6841,
148
+ "step": 20
149
+ },
150
+ {
151
+ "epoch": 2.62,
152
+ "learning_rate": 2.1000000000000002e-06,
153
+ "loss": 0.685,
154
+ "step": 21
155
+ },
156
+ {
157
+ "epoch": 2.75,
158
+ "learning_rate": 2.2e-06,
159
+ "loss": 0.6915,
160
+ "step": 22
161
+ },
162
+ {
163
+ "epoch": 2.88,
164
+ "learning_rate": 2.3e-06,
165
+ "loss": 0.6967,
166
+ "step": 23
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "learning_rate": 2.4000000000000003e-06,
171
+ "loss": 0.6912,
172
+ "step": 24
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_accuracy": 0.48,
177
+ "eval_loss": 0.6971560716629028,
178
+ "eval_macro_f1": 0.37492487077773773,
179
+ "eval_runtime": 0.7011,
180
+ "eval_samples_per_second": 285.285,
181
+ "eval_steps_per_second": 2.853,
182
+ "step": 24
183
+ },
184
+ {
185
+ "epoch": 3.12,
186
+ "learning_rate": 2.5e-06,
187
+ "loss": 0.684,
188
+ "step": 25
189
+ },
190
+ {
191
+ "epoch": 3.25,
192
+ "learning_rate": 2.6e-06,
193
+ "loss": 0.6841,
194
+ "step": 26
195
+ },
196
+ {
197
+ "epoch": 3.38,
198
+ "learning_rate": 2.7e-06,
199
+ "loss": 0.6929,
200
+ "step": 27
201
+ },
202
+ {
203
+ "epoch": 3.5,
204
+ "learning_rate": 2.8000000000000003e-06,
205
+ "loss": 0.6904,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 3.62,
210
+ "learning_rate": 2.9e-06,
211
+ "loss": 0.6837,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 3.75,
216
+ "learning_rate": 3e-06,
217
+ "loss": 0.6952,
218
+ "step": 30
219
+ },
220
+ {
221
+ "epoch": 3.88,
222
+ "learning_rate": 3.1e-06,
223
+ "loss": 0.686,
224
+ "step": 31
225
+ },
226
+ {
227
+ "epoch": 4.0,
228
+ "learning_rate": 3.2000000000000003e-06,
229
+ "loss": 0.6879,
230
+ "step": 32
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_accuracy": 0.505,
235
+ "eval_loss": 0.6962434649467468,
236
+ "eval_macro_f1": 0.35275080906148865,
237
+ "eval_runtime": 0.6739,
238
+ "eval_samples_per_second": 296.767,
239
+ "eval_steps_per_second": 2.968,
240
+ "step": 32
241
+ },
242
+ {
243
+ "epoch": 4.12,
244
+ "learning_rate": 3.3e-06,
245
+ "loss": 0.6858,
246
+ "step": 33
247
+ },
248
+ {
249
+ "epoch": 4.25,
250
+ "learning_rate": 3.4000000000000005e-06,
251
+ "loss": 0.6956,
252
+ "step": 34
253
+ },
254
+ {
255
+ "epoch": 4.38,
256
+ "learning_rate": 3.5000000000000004e-06,
257
+ "loss": 0.678,
258
+ "step": 35
259
+ },
260
+ {
261
+ "epoch": 4.5,
262
+ "learning_rate": 3.6e-06,
263
+ "loss": 0.6862,
264
+ "step": 36
265
+ },
266
+ {
267
+ "epoch": 4.62,
268
+ "learning_rate": 3.7e-06,
269
+ "loss": 0.69,
270
+ "step": 37
271
+ },
272
+ {
273
+ "epoch": 4.75,
274
+ "learning_rate": 3.8e-06,
275
+ "loss": 0.6754,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 4.88,
280
+ "learning_rate": 3.9e-06,
281
+ "loss": 0.6943,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 5.0,
286
+ "learning_rate": 4.000000000000001e-06,
287
+ "loss": 0.6874,
288
+ "step": 40
289
+ },
290
+ {
291
+ "epoch": 5.0,
292
+ "eval_accuracy": 0.545,
293
+ "eval_loss": 0.6953359246253967,
294
+ "eval_macro_f1": 0.3625442191166684,
295
+ "eval_runtime": 0.682,
296
+ "eval_samples_per_second": 293.25,
297
+ "eval_steps_per_second": 2.933,
298
+ "step": 40
299
+ },
300
+ {
301
+ "epoch": 5.12,
302
+ "learning_rate": 4.1000000000000006e-06,
303
+ "loss": 0.6939,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 5.25,
308
+ "learning_rate": 4.2000000000000004e-06,
309
+ "loss": 0.6865,
310
+ "step": 42
311
+ },
312
+ {
313
+ "epoch": 5.38,
314
+ "learning_rate": 4.2999999999999995e-06,
315
+ "loss": 0.6706,
316
+ "step": 43
317
+ },
318
+ {
319
+ "epoch": 5.5,
320
+ "learning_rate": 4.4e-06,
321
+ "loss": 0.6749,
322
+ "step": 44
323
+ },
324
+ {
325
+ "epoch": 5.62,
326
+ "learning_rate": 4.5e-06,
327
+ "loss": 0.6929,
328
+ "step": 45
329
+ },
330
+ {
331
+ "epoch": 5.75,
332
+ "learning_rate": 4.6e-06,
333
+ "loss": 0.684,
334
+ "step": 46
335
+ },
336
+ {
337
+ "epoch": 5.88,
338
+ "learning_rate": 4.7e-06,
339
+ "loss": 0.6802,
340
+ "step": 47
341
+ },
342
+ {
343
+ "epoch": 6.0,
344
+ "learning_rate": 4.800000000000001e-06,
345
+ "loss": 0.7163,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 6.0,
350
+ "eval_accuracy": 0.555,
351
+ "eval_loss": 0.6945347785949707,
352
+ "eval_macro_f1": 0.3669760660051922,
353
+ "eval_runtime": 0.7337,
354
+ "eval_samples_per_second": 272.601,
355
+ "eval_steps_per_second": 2.726,
356
+ "step": 48
357
+ },
358
+ {
359
+ "epoch": 6.12,
360
+ "learning_rate": 4.9000000000000005e-06,
361
+ "loss": 0.6936,
362
+ "step": 49
363
+ },
364
+ {
365
+ "epoch": 6.25,
366
+ "learning_rate": 5e-06,
367
+ "loss": 0.6854,
368
+ "step": 50
369
+ },
370
+ {
371
+ "epoch": 6.38,
372
+ "learning_rate": 5.1e-06,
373
+ "loss": 0.6865,
374
+ "step": 51
375
+ },
376
+ {
377
+ "epoch": 6.5,
378
+ "learning_rate": 5.2e-06,
379
+ "loss": 0.6784,
380
+ "step": 52
381
+ },
382
+ {
383
+ "epoch": 6.62,
384
+ "learning_rate": 5.3e-06,
385
+ "loss": 0.681,
386
+ "step": 53
387
+ },
388
+ {
389
+ "epoch": 6.75,
390
+ "learning_rate": 5.4e-06,
391
+ "loss": 0.6769,
392
+ "step": 54
393
+ },
394
+ {
395
+ "epoch": 6.88,
396
+ "learning_rate": 5.500000000000001e-06,
397
+ "loss": 0.6971,
398
+ "step": 55
399
+ },
400
+ {
401
+ "epoch": 7.0,
402
+ "learning_rate": 5.600000000000001e-06,
403
+ "loss": 0.6755,
404
+ "step": 56
405
+ },
406
+ {
407
+ "epoch": 7.0,
408
+ "eval_accuracy": 0.55,
409
+ "eval_loss": 0.6938488483428955,
410
+ "eval_macro_f1": 0.3548387096774194,
411
+ "eval_runtime": 0.6956,
412
+ "eval_samples_per_second": 287.509,
413
+ "eval_steps_per_second": 2.875,
414
+ "step": 56
415
+ },
416
+ {
417
+ "epoch": 7.12,
418
+ "learning_rate": 5.7000000000000005e-06,
419
+ "loss": 0.6804,
420
+ "step": 57
421
+ },
422
+ {
423
+ "epoch": 7.25,
424
+ "learning_rate": 5.8e-06,
425
+ "loss": 0.6875,
426
+ "step": 58
427
+ },
428
+ {
429
+ "epoch": 7.38,
430
+ "learning_rate": 5.9e-06,
431
+ "loss": 0.6561,
432
+ "step": 59
433
+ },
434
+ {
435
+ "epoch": 7.5,
436
+ "learning_rate": 6e-06,
437
+ "loss": 0.6871,
438
+ "step": 60
439
+ },
440
+ {
441
+ "epoch": 7.62,
442
+ "learning_rate": 6.1e-06,
443
+ "loss": 0.7011,
444
+ "step": 61
445
+ },
446
+ {
447
+ "epoch": 7.75,
448
+ "learning_rate": 6.2e-06,
449
+ "loss": 0.697,
450
+ "step": 62
451
+ },
452
+ {
453
+ "epoch": 7.88,
454
+ "learning_rate": 6.300000000000001e-06,
455
+ "loss": 0.6829,
456
+ "step": 63
457
+ },
458
+ {
459
+ "epoch": 8.0,
460
+ "learning_rate": 6.4000000000000006e-06,
461
+ "loss": 0.6857,
462
+ "step": 64
463
+ },
464
+ {
465
+ "epoch": 8.0,
466
+ "eval_accuracy": 0.55,
467
+ "eval_loss": 0.6929141283035278,
468
+ "eval_macro_f1": 0.3548387096774194,
469
+ "eval_runtime": 0.679,
470
+ "eval_samples_per_second": 294.572,
471
+ "eval_steps_per_second": 2.946,
472
+ "step": 64
473
+ },
474
+ {
475
+ "epoch": 8.12,
476
+ "learning_rate": 6.5000000000000004e-06,
477
+ "loss": 0.6794,
478
+ "step": 65
479
+ },
480
+ {
481
+ "epoch": 8.25,
482
+ "learning_rate": 6.6e-06,
483
+ "loss": 0.6768,
484
+ "step": 66
485
+ },
486
+ {
487
+ "epoch": 8.38,
488
+ "learning_rate": 6.700000000000001e-06,
489
+ "loss": 0.6813,
490
+ "step": 67
491
+ },
492
+ {
493
+ "epoch": 8.5,
494
+ "learning_rate": 6.800000000000001e-06,
495
+ "loss": 0.6926,
496
+ "step": 68
497
+ },
498
+ {
499
+ "epoch": 8.62,
500
+ "learning_rate": 6.900000000000001e-06,
501
+ "loss": 0.6782,
502
+ "step": 69
503
+ },
504
+ {
505
+ "epoch": 8.75,
506
+ "learning_rate": 7.000000000000001e-06,
507
+ "loss": 0.69,
508
+ "step": 70
509
+ },
510
+ {
511
+ "epoch": 8.88,
512
+ "learning_rate": 7.1e-06,
513
+ "loss": 0.6683,
514
+ "step": 71
515
+ },
516
+ {
517
+ "epoch": 9.0,
518
+ "learning_rate": 7.2e-06,
519
+ "loss": 0.703,
520
+ "step": 72
521
+ },
522
+ {
523
+ "epoch": 9.0,
524
+ "eval_accuracy": 0.555,
525
+ "eval_loss": 0.6919710040092468,
526
+ "eval_macro_f1": 0.3669760660051922,
527
+ "eval_runtime": 0.6934,
528
+ "eval_samples_per_second": 288.454,
529
+ "eval_steps_per_second": 2.885,
530
+ "step": 72
531
+ },
532
+ {
533
+ "epoch": 9.12,
534
+ "learning_rate": 7.2999999999999996e-06,
535
+ "loss": 0.7012,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 9.25,
540
+ "learning_rate": 7.4e-06,
541
+ "loss": 0.6872,
542
+ "step": 74
543
+ },
544
+ {
545
+ "epoch": 9.38,
546
+ "learning_rate": 7.5e-06,
547
+ "loss": 0.6639,
548
+ "step": 75
549
+ },
550
+ {
551
+ "epoch": 9.5,
552
+ "learning_rate": 7.6e-06,
553
+ "loss": 0.692,
554
+ "step": 76
555
+ },
556
+ {
557
+ "epoch": 9.62,
558
+ "learning_rate": 7.7e-06,
559
+ "loss": 0.6915,
560
+ "step": 77
561
+ },
562
+ {
563
+ "epoch": 9.75,
564
+ "learning_rate": 7.8e-06,
565
+ "loss": 0.6845,
566
+ "step": 78
567
+ },
568
+ {
569
+ "epoch": 9.88,
570
+ "learning_rate": 7.9e-06,
571
+ "loss": 0.6628,
572
+ "step": 79
573
+ },
574
+ {
575
+ "epoch": 10.0,
576
+ "learning_rate": 8.000000000000001e-06,
577
+ "loss": 0.6792,
578
+ "step": 80
579
+ },
580
+ {
581
+ "epoch": 10.0,
582
+ "eval_accuracy": 0.555,
583
+ "eval_loss": 0.6907283067703247,
584
+ "eval_macro_f1": 0.3856773080241587,
585
+ "eval_runtime": 0.6816,
586
+ "eval_samples_per_second": 293.434,
587
+ "eval_steps_per_second": 2.934,
588
+ "step": 80
589
+ },
590
+ {
591
+ "epoch": 10.12,
592
+ "learning_rate": 8.1e-06,
593
+ "loss": 0.6768,
594
+ "step": 81
595
+ },
596
+ {
597
+ "epoch": 10.25,
598
+ "learning_rate": 8.200000000000001e-06,
599
+ "loss": 0.6778,
600
+ "step": 82
601
+ },
602
+ {
603
+ "epoch": 10.38,
604
+ "learning_rate": 8.3e-06,
605
+ "loss": 0.6719,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 10.5,
610
+ "learning_rate": 8.400000000000001e-06,
611
+ "loss": 0.6957,
612
+ "step": 84
613
+ },
614
+ {
615
+ "epoch": 10.62,
616
+ "learning_rate": 8.500000000000002e-06,
617
+ "loss": 0.685,
618
+ "step": 85
619
+ },
620
+ {
621
+ "epoch": 10.75,
622
+ "learning_rate": 8.599999999999999e-06,
623
+ "loss": 0.6855,
624
+ "step": 86
625
+ },
626
+ {
627
+ "epoch": 10.88,
628
+ "learning_rate": 8.7e-06,
629
+ "loss": 0.6756,
630
+ "step": 87
631
+ },
632
+ {
633
+ "epoch": 11.0,
634
+ "learning_rate": 8.8e-06,
635
+ "loss": 0.6906,
636
+ "step": 88
637
+ },
638
+ {
639
+ "epoch": 11.0,
640
+ "eval_accuracy": 0.57,
641
+ "eval_loss": 0.6894752383232117,
642
+ "eval_macro_f1": 0.4188403838356535,
643
+ "eval_runtime": 0.689,
644
+ "eval_samples_per_second": 290.257,
645
+ "eval_steps_per_second": 2.903,
646
+ "step": 88
647
+ },
648
+ {
649
+ "epoch": 11.12,
650
+ "learning_rate": 8.9e-06,
651
+ "loss": 0.683,
652
+ "step": 89
653
+ },
654
+ {
655
+ "epoch": 11.25,
656
+ "learning_rate": 9e-06,
657
+ "loss": 0.675,
658
+ "step": 90
659
+ },
660
+ {
661
+ "epoch": 11.38,
662
+ "learning_rate": 9.100000000000001e-06,
663
+ "loss": 0.7001,
664
+ "step": 91
665
+ },
666
+ {
667
+ "epoch": 11.5,
668
+ "learning_rate": 9.2e-06,
669
+ "loss": 0.6714,
670
+ "step": 92
671
+ },
672
+ {
673
+ "epoch": 11.62,
674
+ "learning_rate": 9.3e-06,
675
+ "loss": 0.6846,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 11.75,
680
+ "learning_rate": 9.4e-06,
681
+ "loss": 0.6772,
682
+ "step": 94
683
+ },
684
+ {
685
+ "epoch": 11.88,
686
+ "learning_rate": 9.5e-06,
687
+ "loss": 0.6588,
688
+ "step": 95
689
+ },
690
+ {
691
+ "epoch": 12.0,
692
+ "learning_rate": 9.600000000000001e-06,
693
+ "loss": 0.6864,
694
+ "step": 96
695
+ },
696
+ {
697
+ "epoch": 12.0,
698
+ "eval_accuracy": 0.56,
699
+ "eval_loss": 0.6882457733154297,
700
+ "eval_macro_f1": 0.43524579643178024,
701
+ "eval_runtime": 0.6737,
702
+ "eval_samples_per_second": 296.852,
703
+ "eval_steps_per_second": 2.969,
704
+ "step": 96
705
+ },
706
+ {
707
+ "epoch": 12.12,
708
+ "learning_rate": 9.7e-06,
709
+ "loss": 0.6651,
710
+ "step": 97
711
+ },
712
+ {
713
+ "epoch": 12.25,
714
+ "learning_rate": 9.800000000000001e-06,
715
+ "loss": 0.6887,
716
+ "step": 98
717
+ },
718
+ {
719
+ "epoch": 12.38,
720
+ "learning_rate": 9.900000000000002e-06,
721
+ "loss": 0.6746,
722
+ "step": 99
723
+ },
724
+ {
725
+ "epoch": 12.5,
726
+ "learning_rate": 1e-05,
727
+ "loss": 0.6694,
728
+ "step": 100
729
+ },
730
+ {
731
+ "epoch": 12.62,
732
+ "learning_rate": 1.0100000000000002e-05,
733
+ "loss": 0.6768,
734
+ "step": 101
735
+ },
736
+ {
737
+ "epoch": 12.75,
738
+ "learning_rate": 1.02e-05,
739
+ "loss": 0.6619,
740
+ "step": 102
741
+ },
742
+ {
743
+ "epoch": 12.88,
744
+ "learning_rate": 1.03e-05,
745
+ "loss": 0.7057,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 13.0,
750
+ "learning_rate": 1.04e-05,
751
+ "loss": 0.6648,
752
+ "step": 104
753
+ },
754
+ {
755
+ "epoch": 13.0,
756
+ "eval_accuracy": 0.565,
757
+ "eval_loss": 0.6870580911636353,
758
+ "eval_macro_f1": 0.43825665859564167,
759
+ "eval_runtime": 0.7161,
760
+ "eval_samples_per_second": 279.296,
761
+ "eval_steps_per_second": 2.793,
762
+ "step": 104
763
+ },
764
+ {
765
+ "epoch": 13.12,
766
+ "learning_rate": 1.05e-05,
767
+ "loss": 0.6833,
768
+ "step": 105
769
+ },
770
+ {
771
+ "epoch": 13.25,
772
+ "learning_rate": 1.06e-05,
773
+ "loss": 0.6797,
774
+ "step": 106
775
+ },
776
+ {
777
+ "epoch": 13.38,
778
+ "learning_rate": 1.0700000000000001e-05,
779
+ "loss": 0.66,
780
+ "step": 107
781
+ },
782
+ {
783
+ "epoch": 13.5,
784
+ "learning_rate": 1.08e-05,
785
+ "loss": 0.6844,
786
+ "step": 108
787
+ },
788
+ {
789
+ "epoch": 13.62,
790
+ "learning_rate": 1.09e-05,
791
+ "loss": 0.6803,
792
+ "step": 109
793
+ },
794
+ {
795
+ "epoch": 13.75,
796
+ "learning_rate": 1.1000000000000001e-05,
797
+ "loss": 0.6712,
798
+ "step": 110
799
+ },
800
+ {
801
+ "epoch": 13.88,
802
+ "learning_rate": 1.11e-05,
803
+ "loss": 0.6683,
804
+ "step": 111
805
+ },
806
+ {
807
+ "epoch": 14.0,
808
+ "learning_rate": 1.1200000000000001e-05,
809
+ "loss": 0.6821,
810
+ "step": 112
811
+ },
812
+ {
813
+ "epoch": 14.0,
814
+ "eval_accuracy": 0.555,
815
+ "eval_loss": 0.6860873699188232,
816
+ "eval_macro_f1": 0.4388221570667423,
817
+ "eval_runtime": 0.6892,
818
+ "eval_samples_per_second": 290.191,
819
+ "eval_steps_per_second": 2.902,
820
+ "step": 112
821
+ },
822
+ {
823
+ "epoch": 14.12,
824
+ "learning_rate": 1.13e-05,
825
+ "loss": 0.6755,
826
+ "step": 113
827
+ },
828
+ {
829
+ "epoch": 14.25,
830
+ "learning_rate": 1.1400000000000001e-05,
831
+ "loss": 0.6666,
832
+ "step": 114
833
+ },
834
+ {
835
+ "epoch": 14.38,
836
+ "learning_rate": 1.1500000000000002e-05,
837
+ "loss": 0.6747,
838
+ "step": 115
839
+ },
840
+ {
841
+ "epoch": 14.5,
842
+ "learning_rate": 1.16e-05,
843
+ "loss": 0.6628,
844
+ "step": 116
845
+ },
846
+ {
847
+ "epoch": 14.62,
848
+ "learning_rate": 1.1700000000000001e-05,
849
+ "loss": 0.6699,
850
+ "step": 117
851
+ },
852
+ {
853
+ "epoch": 14.75,
854
+ "learning_rate": 1.18e-05,
855
+ "loss": 0.6717,
856
+ "step": 118
857
+ },
858
+ {
859
+ "epoch": 14.88,
860
+ "learning_rate": 1.19e-05,
861
+ "loss": 0.6764,
862
+ "step": 119
863
+ },
864
+ {
865
+ "epoch": 15.0,
866
+ "learning_rate": 1.2e-05,
867
+ "loss": 0.6721,
868
+ "step": 120
869
+ },
870
+ {
871
+ "epoch": 15.0,
872
+ "eval_accuracy": 0.555,
873
+ "eval_loss": 0.6848286390304565,
874
+ "eval_macro_f1": 0.4624143034037027,
875
+ "eval_runtime": 0.6968,
876
+ "eval_samples_per_second": 287.018,
877
+ "eval_steps_per_second": 2.87,
878
+ "step": 120
879
+ },
880
+ {
881
+ "epoch": 15.12,
882
+ "learning_rate": 1.2100000000000001e-05,
883
+ "loss": 0.6756,
884
+ "step": 121
885
+ },
886
+ {
887
+ "epoch": 15.25,
888
+ "learning_rate": 1.22e-05,
889
+ "loss": 0.6814,
890
+ "step": 122
891
+ },
892
+ {
893
+ "epoch": 15.38,
894
+ "learning_rate": 1.23e-05,
895
+ "loss": 0.6693,
896
+ "step": 123
897
+ },
898
+ {
899
+ "epoch": 15.5,
900
+ "learning_rate": 1.24e-05,
901
+ "loss": 0.6723,
902
+ "step": 124
903
+ },
904
+ {
905
+ "epoch": 15.62,
906
+ "learning_rate": 1.25e-05,
907
+ "loss": 0.6506,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 15.75,
912
+ "learning_rate": 1.2600000000000001e-05,
913
+ "loss": 0.6818,
914
+ "step": 126
915
+ },
916
+ {
917
+ "epoch": 15.88,
918
+ "learning_rate": 1.27e-05,
919
+ "loss": 0.6681,
920
+ "step": 127
921
+ },
922
+ {
923
+ "epoch": 16.0,
924
+ "learning_rate": 1.2800000000000001e-05,
925
+ "loss": 0.676,
926
+ "step": 128
927
+ },
928
+ {
929
+ "epoch": 16.0,
930
+ "eval_accuracy": 0.54,
931
+ "eval_loss": 0.6835575699806213,
932
+ "eval_macro_f1": 0.4875222816399287,
933
+ "eval_runtime": 0.6965,
934
+ "eval_samples_per_second": 287.142,
935
+ "eval_steps_per_second": 2.871,
936
+ "step": 128
937
+ },
938
+ {
939
+ "epoch": 16.12,
940
+ "learning_rate": 1.29e-05,
941
+ "loss": 0.662,
942
+ "step": 129
943
+ },
944
+ {
945
+ "epoch": 16.25,
946
+ "learning_rate": 1.3000000000000001e-05,
947
+ "loss": 0.6677,
948
+ "step": 130
949
+ },
950
+ {
951
+ "epoch": 16.38,
952
+ "learning_rate": 1.3100000000000002e-05,
953
+ "loss": 0.6641,
954
+ "step": 131
955
+ },
956
+ {
957
+ "epoch": 16.5,
958
+ "learning_rate": 1.32e-05,
959
+ "loss": 0.6832,
960
+ "step": 132
961
+ },
962
+ {
963
+ "epoch": 16.62,
964
+ "learning_rate": 1.3300000000000001e-05,
965
+ "loss": 0.6764,
966
+ "step": 133
967
+ },
968
+ {
969
+ "epoch": 16.75,
970
+ "learning_rate": 1.3400000000000002e-05,
971
+ "loss": 0.6583,
972
+ "step": 134
973
+ },
974
+ {
975
+ "epoch": 16.88,
976
+ "learning_rate": 1.3500000000000001e-05,
977
+ "loss": 0.659,
978
+ "step": 135
979
+ },
980
+ {
981
+ "epoch": 17.0,
982
+ "learning_rate": 1.3600000000000002e-05,
983
+ "loss": 0.6883,
984
+ "step": 136
985
+ },
986
+ {
987
+ "epoch": 17.0,
988
+ "eval_accuracy": 0.555,
989
+ "eval_loss": 0.6816694736480713,
990
+ "eval_macro_f1": 0.4676874308442238,
991
+ "eval_runtime": 0.6967,
992
+ "eval_samples_per_second": 287.071,
993
+ "eval_steps_per_second": 2.871,
994
+ "step": 136
995
+ },
996
+ {
997
+ "epoch": 17.12,
998
+ "learning_rate": 1.3700000000000001e-05,
999
+ "loss": 0.6548,
1000
+ "step": 137
1001
+ },
1002
+ {
1003
+ "epoch": 17.25,
1004
+ "learning_rate": 1.3800000000000002e-05,
1005
+ "loss": 0.6767,
1006
+ "step": 138
1007
+ },
1008
+ {
1009
+ "epoch": 17.38,
1010
+ "learning_rate": 1.3900000000000002e-05,
1011
+ "loss": 0.6723,
1012
+ "step": 139
1013
+ },
1014
+ {
1015
+ "epoch": 17.5,
1016
+ "learning_rate": 1.4000000000000001e-05,
1017
+ "loss": 0.6818,
1018
+ "step": 140
1019
+ },
1020
+ {
1021
+ "epoch": 17.62,
1022
+ "learning_rate": 1.4099999999999999e-05,
1023
+ "loss": 0.6456,
1024
+ "step": 141
1025
+ },
1026
+ {
1027
+ "epoch": 17.75,
1028
+ "learning_rate": 1.42e-05,
1029
+ "loss": 0.677,
1030
+ "step": 142
1031
+ },
1032
+ {
1033
+ "epoch": 17.88,
1034
+ "learning_rate": 1.43e-05,
1035
+ "loss": 0.6573,
1036
+ "step": 143
1037
+ },
1038
+ {
1039
+ "epoch": 18.0,
1040
+ "learning_rate": 1.44e-05,
1041
+ "loss": 0.6332,
1042
+ "step": 144
1043
+ },
1044
+ {
1045
+ "epoch": 18.0,
1046
+ "eval_accuracy": 0.57,
1047
+ "eval_loss": 0.6794130802154541,
1048
+ "eval_macro_f1": 0.5174503422735944,
1049
+ "eval_runtime": 0.692,
1050
+ "eval_samples_per_second": 289.012,
1051
+ "eval_steps_per_second": 2.89,
1052
+ "step": 144
1053
+ },
1054
+ {
1055
+ "epoch": 18.12,
1056
+ "learning_rate": 1.45e-05,
1057
+ "loss": 0.6403,
1058
+ "step": 145
1059
+ },
1060
+ {
1061
+ "epoch": 18.25,
1062
+ "learning_rate": 1.4599999999999999e-05,
1063
+ "loss": 0.6612,
1064
+ "step": 146
1065
+ },
1066
+ {
1067
+ "epoch": 18.38,
1068
+ "learning_rate": 1.47e-05,
1069
+ "loss": 0.6482,
1070
+ "step": 147
1071
+ },
1072
+ {
1073
+ "epoch": 18.5,
1074
+ "learning_rate": 1.48e-05,
1075
+ "loss": 0.6469,
1076
+ "step": 148
1077
+ },
1078
+ {
1079
+ "epoch": 18.62,
1080
+ "learning_rate": 1.49e-05,
1081
+ "loss": 0.6756,
1082
+ "step": 149
1083
+ },
1084
+ {
1085
+ "epoch": 18.75,
1086
+ "learning_rate": 1.5e-05,
1087
+ "loss": 0.6543,
1088
+ "step": 150
1089
+ },
1090
+ {
1091
+ "epoch": 18.88,
1092
+ "learning_rate": 1.51e-05,
1093
+ "loss": 0.6734,
1094
+ "step": 151
1095
+ },
1096
+ {
1097
+ "epoch": 19.0,
1098
+ "learning_rate": 1.52e-05,
1099
+ "loss": 0.6401,
1100
+ "step": 152
1101
+ },
1102
+ {
1103
+ "epoch": 19.0,
1104
+ "eval_accuracy": 0.55,
1105
+ "eval_loss": 0.6782782077789307,
1106
+ "eval_macro_f1": 0.5146154675870995,
1107
+ "eval_runtime": 0.6724,
1108
+ "eval_samples_per_second": 297.433,
1109
+ "eval_steps_per_second": 2.974,
1110
+ "step": 152
1111
+ },
1112
+ {
1113
+ "epoch": 19.12,
1114
+ "learning_rate": 1.53e-05,
1115
+ "loss": 0.6368,
1116
+ "step": 153
1117
+ },
1118
+ {
1119
+ "epoch": 19.25,
1120
+ "learning_rate": 1.54e-05,
1121
+ "loss": 0.618,
1122
+ "step": 154
1123
+ },
1124
+ {
1125
+ "epoch": 19.38,
1126
+ "learning_rate": 1.55e-05,
1127
+ "loss": 0.6665,
1128
+ "step": 155
1129
+ },
1130
+ {
1131
+ "epoch": 19.5,
1132
+ "learning_rate": 1.56e-05,
1133
+ "loss": 0.6675,
1134
+ "step": 156
1135
+ },
1136
+ {
1137
+ "epoch": 19.62,
1138
+ "learning_rate": 1.5700000000000002e-05,
1139
+ "loss": 0.6414,
1140
+ "step": 157
1141
+ },
1142
+ {
1143
+ "epoch": 19.75,
1144
+ "learning_rate": 1.58e-05,
1145
+ "loss": 0.6627,
1146
+ "step": 158
1147
+ },
1148
+ {
1149
+ "epoch": 19.88,
1150
+ "learning_rate": 1.59e-05,
1151
+ "loss": 0.6375,
1152
+ "step": 159
1153
+ },
1154
+ {
1155
+ "epoch": 20.0,
1156
+ "learning_rate": 1.6000000000000003e-05,
1157
+ "loss": 0.6576,
1158
+ "step": 160
1159
+ },
1160
+ {
1161
+ "epoch": 20.0,
1162
+ "eval_accuracy": 0.56,
1163
+ "eval_loss": 0.6768765449523926,
1164
+ "eval_macro_f1": 0.5331069609507639,
1165
+ "eval_runtime": 0.6907,
1166
+ "eval_samples_per_second": 289.541,
1167
+ "eval_steps_per_second": 2.895,
1168
+ "step": 160
1169
+ },
1170
+ {
1171
+ "epoch": 20.12,
1172
+ "learning_rate": 1.6100000000000002e-05,
1173
+ "loss": 0.6339,
1174
+ "step": 161
1175
+ },
1176
+ {
1177
+ "epoch": 20.25,
1178
+ "learning_rate": 1.62e-05,
1179
+ "loss": 0.6623,
1180
+ "step": 162
1181
+ },
1182
+ {
1183
+ "epoch": 20.38,
1184
+ "learning_rate": 1.63e-05,
1185
+ "loss": 0.6738,
1186
+ "step": 163
1187
+ },
1188
+ {
1189
+ "epoch": 20.5,
1190
+ "learning_rate": 1.6400000000000002e-05,
1191
+ "loss": 0.6148,
1192
+ "step": 164
1193
+ },
1194
+ {
1195
+ "epoch": 20.62,
1196
+ "learning_rate": 1.65e-05,
1197
+ "loss": 0.6195,
1198
+ "step": 165
1199
+ },
1200
+ {
1201
+ "epoch": 20.75,
1202
+ "learning_rate": 1.66e-05,
1203
+ "loss": 0.675,
1204
+ "step": 166
1205
+ },
1206
+ {
1207
+ "epoch": 20.88,
1208
+ "learning_rate": 1.6700000000000003e-05,
1209
+ "loss": 0.6345,
1210
+ "step": 167
1211
+ },
1212
+ {
1213
+ "epoch": 21.0,
1214
+ "learning_rate": 1.6800000000000002e-05,
1215
+ "loss": 0.6266,
1216
+ "step": 168
1217
+ },
1218
+ {
1219
+ "epoch": 21.0,
1220
+ "eval_accuracy": 0.565,
1221
+ "eval_loss": 0.6750273108482361,
1222
+ "eval_macro_f1": 0.5372217346206016,
1223
+ "eval_runtime": 0.6888,
1224
+ "eval_samples_per_second": 290.355,
1225
+ "eval_steps_per_second": 2.904,
1226
+ "step": 168
1227
+ },
1228
+ {
1229
+ "epoch": 21.12,
1230
+ "learning_rate": 1.69e-05,
1231
+ "loss": 0.622,
1232
+ "step": 169
1233
+ },
1234
+ {
1235
+ "epoch": 21.25,
1236
+ "learning_rate": 1.7000000000000003e-05,
1237
+ "loss": 0.6214,
1238
+ "step": 170
1239
+ },
1240
+ {
1241
+ "epoch": 21.38,
1242
+ "learning_rate": 1.7100000000000002e-05,
1243
+ "loss": 0.6917,
1244
+ "step": 171
1245
+ },
1246
+ {
1247
+ "epoch": 21.5,
1248
+ "learning_rate": 1.7199999999999998e-05,
1249
+ "loss": 0.6134,
1250
+ "step": 172
1251
+ },
1252
+ {
1253
+ "epoch": 21.62,
1254
+ "learning_rate": 1.73e-05,
1255
+ "loss": 0.6315,
1256
+ "step": 173
1257
+ },
1258
+ {
1259
+ "epoch": 21.75,
1260
+ "learning_rate": 1.74e-05,
1261
+ "loss": 0.6232,
1262
+ "step": 174
1263
+ },
1264
+ {
1265
+ "epoch": 21.88,
1266
+ "learning_rate": 1.75e-05,
1267
+ "loss": 0.6393,
1268
+ "step": 175
1269
+ },
1270
+ {
1271
+ "epoch": 22.0,
1272
+ "learning_rate": 1.76e-05,
1273
+ "loss": 0.6437,
1274
+ "step": 176
1275
+ },
1276
+ {
1277
+ "epoch": 22.0,
1278
+ "eval_accuracy": 0.575,
1279
+ "eval_loss": 0.677147626876831,
1280
+ "eval_macro_f1": 0.5523370638578011,
1281
+ "eval_runtime": 0.6959,
1282
+ "eval_samples_per_second": 287.392,
1283
+ "eval_steps_per_second": 2.874,
1284
+ "step": 176
1285
+ },
1286
+ {
1287
+ "epoch": 22.12,
1288
+ "learning_rate": 1.77e-05,
1289
+ "loss": 0.5981,
1290
+ "step": 177
1291
+ },
1292
+ {
1293
+ "epoch": 22.25,
1294
+ "learning_rate": 1.78e-05,
1295
+ "loss": 0.6213,
1296
+ "step": 178
1297
+ },
1298
+ {
1299
+ "epoch": 22.38,
1300
+ "learning_rate": 1.79e-05,
1301
+ "loss": 0.6262,
1302
+ "step": 179
1303
+ },
1304
+ {
1305
+ "epoch": 22.5,
1306
+ "learning_rate": 1.8e-05,
1307
+ "loss": 0.6094,
1308
+ "step": 180
1309
+ },
1310
+ {
1311
+ "epoch": 22.62,
1312
+ "learning_rate": 1.81e-05,
1313
+ "loss": 0.6472,
1314
+ "step": 181
1315
+ },
1316
+ {
1317
+ "epoch": 22.75,
1318
+ "learning_rate": 1.8200000000000002e-05,
1319
+ "loss": 0.6196,
1320
+ "step": 182
1321
+ },
1322
+ {
1323
+ "epoch": 22.88,
1324
+ "learning_rate": 1.83e-05,
1325
+ "loss": 0.6156,
1326
+ "step": 183
1327
+ },
1328
+ {
1329
+ "epoch": 23.0,
1330
+ "learning_rate": 1.84e-05,
1331
+ "loss": 0.6671,
1332
+ "step": 184
1333
+ },
1334
+ {
1335
+ "epoch": 23.0,
1336
+ "eval_accuracy": 0.565,
1337
+ "eval_loss": 0.6740648150444031,
1338
+ "eval_macro_f1": 0.5294117647058824,
1339
+ "eval_runtime": 0.7084,
1340
+ "eval_samples_per_second": 282.312,
1341
+ "eval_steps_per_second": 2.823,
1342
+ "step": 184
1343
+ },
1344
+ {
1345
+ "epoch": 23.12,
1346
+ "learning_rate": 1.85e-05,
1347
+ "loss": 0.6085,
1348
+ "step": 185
1349
+ },
1350
+ {
1351
+ "epoch": 23.25,
1352
+ "learning_rate": 1.86e-05,
1353
+ "loss": 0.6269,
1354
+ "step": 186
1355
+ },
1356
+ {
1357
+ "epoch": 23.38,
1358
+ "learning_rate": 1.87e-05,
1359
+ "loss": 0.6125,
1360
+ "step": 187
1361
+ },
1362
+ {
1363
+ "epoch": 23.5,
1364
+ "learning_rate": 1.88e-05,
1365
+ "loss": 0.6173,
1366
+ "step": 188
1367
+ },
1368
+ {
1369
+ "epoch": 23.62,
1370
+ "learning_rate": 1.8900000000000002e-05,
1371
+ "loss": 0.6425,
1372
+ "step": 189
1373
+ },
1374
+ {
1375
+ "epoch": 23.75,
1376
+ "learning_rate": 1.9e-05,
1377
+ "loss": 0.609,
1378
+ "step": 190
1379
+ },
1380
+ {
1381
+ "epoch": 23.88,
1382
+ "learning_rate": 1.91e-05,
1383
+ "loss": 0.6031,
1384
+ "step": 191
1385
+ },
1386
+ {
1387
+ "epoch": 24.0,
1388
+ "learning_rate": 1.9200000000000003e-05,
1389
+ "loss": 0.6229,
1390
+ "step": 192
1391
+ },
1392
+ {
1393
+ "epoch": 24.0,
1394
+ "eval_accuracy": 0.585,
1395
+ "eval_loss": 0.6694185137748718,
1396
+ "eval_macro_f1": 0.5718891038039975,
1397
+ "eval_runtime": 0.695,
1398
+ "eval_samples_per_second": 287.787,
1399
+ "eval_steps_per_second": 2.878,
1400
+ "step": 192
1401
+ },
1402
+ {
1403
+ "epoch": 24.12,
1404
+ "learning_rate": 1.93e-05,
1405
+ "loss": 0.612,
1406
+ "step": 193
1407
+ },
1408
+ {
1409
+ "epoch": 24.25,
1410
+ "learning_rate": 1.94e-05,
1411
+ "loss": 0.5779,
1412
+ "step": 194
1413
+ },
1414
+ {
1415
+ "epoch": 24.38,
1416
+ "learning_rate": 1.9500000000000003e-05,
1417
+ "loss": 0.6329,
1418
+ "step": 195
1419
+ },
1420
+ {
1421
+ "epoch": 24.5,
1422
+ "learning_rate": 1.9600000000000002e-05,
1423
+ "loss": 0.6185,
1424
+ "step": 196
1425
+ },
1426
+ {
1427
+ "epoch": 24.62,
1428
+ "learning_rate": 1.97e-05,
1429
+ "loss": 0.606,
1430
+ "step": 197
1431
+ },
1432
+ {
1433
+ "epoch": 24.75,
1434
+ "learning_rate": 1.9800000000000004e-05,
1435
+ "loss": 0.6113,
1436
+ "step": 198
1437
+ },
1438
+ {
1439
+ "epoch": 24.88,
1440
+ "learning_rate": 1.9900000000000003e-05,
1441
+ "loss": 0.6261,
1442
+ "step": 199
1443
+ },
1444
+ {
1445
+ "epoch": 25.0,
1446
+ "learning_rate": 2e-05,
1447
+ "loss": 0.6475,
1448
+ "step": 200
1449
+ },
1450
+ {
1451
+ "epoch": 25.0,
1452
+ "eval_accuracy": 0.585,
1453
+ "eval_loss": 0.6710052490234375,
1454
+ "eval_macro_f1": 0.5359888190076869,
1455
+ "eval_runtime": 0.7024,
1456
+ "eval_samples_per_second": 284.724,
1457
+ "eval_steps_per_second": 2.847,
1458
+ "step": 200
1459
+ },
1460
+ {
1461
+ "epoch": 25.12,
1462
+ "learning_rate": 2.01e-05,
1463
+ "loss": 0.605,
1464
+ "step": 201
1465
+ },
1466
+ {
1467
+ "epoch": 25.25,
1468
+ "learning_rate": 2.0200000000000003e-05,
1469
+ "loss": 0.6224,
1470
+ "step": 202
1471
+ },
1472
+ {
1473
+ "epoch": 25.38,
1474
+ "learning_rate": 2.0300000000000002e-05,
1475
+ "loss": 0.6304,
1476
+ "step": 203
1477
+ },
1478
+ {
1479
+ "epoch": 25.5,
1480
+ "learning_rate": 2.04e-05,
1481
+ "loss": 0.5808,
1482
+ "step": 204
1483
+ },
1484
+ {
1485
+ "epoch": 25.62,
1486
+ "learning_rate": 2.05e-05,
1487
+ "loss": 0.5741,
1488
+ "step": 205
1489
+ },
1490
+ {
1491
+ "epoch": 25.75,
1492
+ "learning_rate": 2.06e-05,
1493
+ "loss": 0.61,
1494
+ "step": 206
1495
+ },
1496
+ {
1497
+ "epoch": 25.88,
1498
+ "learning_rate": 2.07e-05,
1499
+ "loss": 0.6129,
1500
+ "step": 207
1501
+ },
1502
+ {
1503
+ "epoch": 26.0,
1504
+ "learning_rate": 2.08e-05,
1505
+ "loss": 0.5683,
1506
+ "step": 208
1507
+ },
1508
+ {
1509
+ "epoch": 26.0,
1510
+ "eval_accuracy": 0.625,
1511
+ "eval_loss": 0.6624136567115784,
1512
+ "eval_macro_f1": 0.6180387563341907,
1513
+ "eval_runtime": 0.688,
1514
+ "eval_samples_per_second": 290.698,
1515
+ "eval_steps_per_second": 2.907,
1516
+ "step": 208
1517
+ },
1518
+ {
1519
+ "epoch": 26.12,
1520
+ "learning_rate": 2.09e-05,
1521
+ "loss": 0.5728,
1522
+ "step": 209
1523
+ },
1524
+ {
1525
+ "epoch": 26.25,
1526
+ "learning_rate": 2.1e-05,
1527
+ "loss": 0.6,
1528
+ "step": 210
1529
+ },
1530
+ {
1531
+ "epoch": 26.38,
1532
+ "learning_rate": 2.11e-05,
1533
+ "loss": 0.5585,
1534
+ "step": 211
1535
+ },
1536
+ {
1537
+ "epoch": 26.5,
1538
+ "learning_rate": 2.12e-05,
1539
+ "loss": 0.583,
1540
+ "step": 212
1541
+ },
1542
+ {
1543
+ "epoch": 26.62,
1544
+ "learning_rate": 2.13e-05,
1545
+ "loss": 0.5753,
1546
+ "step": 213
1547
+ },
1548
+ {
1549
+ "epoch": 26.75,
1550
+ "learning_rate": 2.1400000000000002e-05,
1551
+ "loss": 0.6253,
1552
+ "step": 214
1553
+ },
1554
+ {
1555
+ "epoch": 26.88,
1556
+ "learning_rate": 2.15e-05,
1557
+ "loss": 0.5611,
1558
+ "step": 215
1559
+ },
1560
+ {
1561
+ "epoch": 27.0,
1562
+ "learning_rate": 2.16e-05,
1563
+ "loss": 0.6105,
1564
+ "step": 216
1565
+ },
1566
+ {
1567
+ "epoch": 27.0,
1568
+ "eval_accuracy": 0.585,
1569
+ "eval_loss": 0.6604620218276978,
1570
+ "eval_macro_f1": 0.5536554542765723,
1571
+ "eval_runtime": 0.6984,
1572
+ "eval_samples_per_second": 286.362,
1573
+ "eval_steps_per_second": 2.864,
1574
+ "step": 216
1575
+ },
1576
+ {
1577
+ "epoch": 27.12,
1578
+ "learning_rate": 2.1700000000000002e-05,
1579
+ "loss": 0.5816,
1580
+ "step": 217
1581
+ },
1582
+ {
1583
+ "epoch": 27.25,
1584
+ "learning_rate": 2.18e-05,
1585
+ "loss": 0.5959,
1586
+ "step": 218
1587
+ },
1588
+ {
1589
+ "epoch": 27.38,
1590
+ "learning_rate": 2.19e-05,
1591
+ "loss": 0.6061,
1592
+ "step": 219
1593
+ },
1594
+ {
1595
+ "epoch": 27.5,
1596
+ "learning_rate": 2.2000000000000003e-05,
1597
+ "loss": 0.5869,
1598
+ "step": 220
1599
+ },
1600
+ {
1601
+ "epoch": 27.62,
1602
+ "learning_rate": 2.2100000000000002e-05,
1603
+ "loss": 0.6022,
1604
+ "step": 221
1605
+ },
1606
+ {
1607
+ "epoch": 27.75,
1608
+ "learning_rate": 2.22e-05,
1609
+ "loss": 0.5463,
1610
+ "step": 222
1611
+ },
1612
+ {
1613
+ "epoch": 27.88,
1614
+ "learning_rate": 2.23e-05,
1615
+ "loss": 0.5492,
1616
+ "step": 223
1617
+ },
1618
+ {
1619
+ "epoch": 28.0,
1620
+ "learning_rate": 2.2400000000000002e-05,
1621
+ "loss": 0.5077,
1622
+ "step": 224
1623
+ },
1624
+ {
1625
+ "epoch": 28.0,
1626
+ "eval_accuracy": 0.615,
1627
+ "eval_loss": 0.6587879657745361,
1628
+ "eval_macro_f1": 0.6042250263421655,
1629
+ "eval_runtime": 0.6976,
1630
+ "eval_samples_per_second": 286.693,
1631
+ "eval_steps_per_second": 2.867,
1632
+ "step": 224
1633
+ },
1634
+ {
1635
+ "epoch": 28.12,
1636
+ "learning_rate": 2.25e-05,
1637
+ "loss": 0.5857,
1638
+ "step": 225
1639
+ },
1640
+ {
1641
+ "epoch": 28.25,
1642
+ "learning_rate": 2.26e-05,
1643
+ "loss": 0.5795,
1644
+ "step": 226
1645
+ },
1646
+ {
1647
+ "epoch": 28.38,
1648
+ "learning_rate": 2.2700000000000003e-05,
1649
+ "loss": 0.563,
1650
+ "step": 227
1651
+ },
1652
+ {
1653
+ "epoch": 28.5,
1654
+ "learning_rate": 2.2800000000000002e-05,
1655
+ "loss": 0.5889,
1656
+ "step": 228
1657
+ },
1658
+ {
1659
+ "epoch": 28.62,
1660
+ "learning_rate": 2.29e-05,
1661
+ "loss": 0.5617,
1662
+ "step": 229
1663
+ },
1664
+ {
1665
+ "epoch": 28.75,
1666
+ "learning_rate": 2.3000000000000003e-05,
1667
+ "loss": 0.5756,
1668
+ "step": 230
1669
+ },
1670
+ {
1671
+ "epoch": 28.88,
1672
+ "learning_rate": 2.3100000000000002e-05,
1673
+ "loss": 0.5581,
1674
+ "step": 231
1675
+ },
1676
+ {
1677
+ "epoch": 29.0,
1678
+ "learning_rate": 2.32e-05,
1679
+ "loss": 0.5557,
1680
+ "step": 232
1681
+ },
1682
+ {
1683
+ "epoch": 29.0,
1684
+ "eval_accuracy": 0.595,
1685
+ "eval_loss": 0.660551130771637,
1686
+ "eval_macro_f1": 0.5644107445349681,
1687
+ "eval_runtime": 0.6821,
1688
+ "eval_samples_per_second": 293.231,
1689
+ "eval_steps_per_second": 2.932,
1690
+ "step": 232
1691
+ },
1692
+ {
1693
+ "epoch": 29.12,
1694
+ "learning_rate": 2.3300000000000004e-05,
1695
+ "loss": 0.6115,
1696
+ "step": 233
1697
+ },
1698
+ {
1699
+ "epoch": 29.25,
1700
+ "learning_rate": 2.3400000000000003e-05,
1701
+ "loss": 0.5437,
1702
+ "step": 234
1703
+ },
1704
+ {
1705
+ "epoch": 29.38,
1706
+ "learning_rate": 2.35e-05,
1707
+ "loss": 0.5806,
1708
+ "step": 235
1709
+ },
1710
+ {
1711
+ "epoch": 29.5,
1712
+ "learning_rate": 2.36e-05,
1713
+ "loss": 0.5529,
1714
+ "step": 236
1715
+ },
1716
+ {
1717
+ "epoch": 29.62,
1718
+ "learning_rate": 2.37e-05,
1719
+ "loss": 0.5296,
1720
+ "step": 237
1721
+ },
1722
+ {
1723
+ "epoch": 29.75,
1724
+ "learning_rate": 2.38e-05,
1725
+ "loss": 0.5341,
1726
+ "step": 238
1727
+ },
1728
+ {
1729
+ "epoch": 29.88,
1730
+ "learning_rate": 2.39e-05,
1731
+ "loss": 0.5454,
1732
+ "step": 239
1733
+ },
1734
+ {
1735
+ "epoch": 30.0,
1736
+ "learning_rate": 2.4e-05,
1737
+ "loss": 0.5498,
1738
+ "step": 240
1739
+ },
1740
+ {
1741
+ "epoch": 30.0,
1742
+ "eval_accuracy": 0.63,
1743
+ "eval_loss": 0.6594185829162598,
1744
+ "eval_macro_f1": 0.6269785260610949,
1745
+ "eval_runtime": 0.6994,
1746
+ "eval_samples_per_second": 285.941,
1747
+ "eval_steps_per_second": 2.859,
1748
+ "step": 240
1749
+ }
1750
+ ],
1751
+ "max_steps": 400,
1752
+ "num_train_epochs": 50,
1753
+ "total_flos": 952401199104000.0,
1754
+ "trial_name": null,
1755
+ "trial_params": null
1756
+ }
scaling_performance/2000/.DS_Store CHANGED
Binary files a/scaling_performance/2000/.DS_Store and b/scaling_performance/2000/.DS_Store differ
 
scaling_performance/2000/L1/.DS_Store CHANGED
Binary files a/scaling_performance/2000/L1/.DS_Store and b/scaling_performance/2000/L1/.DS_Store differ
 
scaling_performance/2000/L1/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.5775,
3
+ "test_loss": 0.6810634136199951,
4
+ "test_macro_f1": 0.36608557844690964,
5
+ "test_runtime": 0.9129,
6
+ "test_samples_per_second": 438.178,
7
+ "test_steps_per_second": 4.382
8
+ }
scaling_performance/2000/L1/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 1,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "problem_type": "single_label_classification",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.28.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
scaling_performance/2000/L1/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.5775,
3
+ "test_loss": 0.6810634136199951,
4
+ "test_macro_f1": 0.36608557844690964,
5
+ "test_runtime": 0.9129,
6
+ "test_samples_per_second": 438.178,
7
+ "test_steps_per_second": 4.382
8
+ }
scaling_performance/2000/L1/trainer_state.json ADDED
@@ -0,0 +1,886 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6810634136199951,
3
+ "best_model_checkpoint": "./models/240626_geneformer_CellClassifier_PM25_Layers1_L2048_B26_LR5e-05_LSlinear_WU600_E20_Oadamw_F0_fold4/checkpoint-208",
4
+ "epoch": 15.0,
5
+ "global_step": 240,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 1.6666666666666668e-07,
13
+ "loss": 0.6939,
14
+ "step": 2
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 3.3333333333333335e-07,
19
+ "loss": 0.6887,
20
+ "step": 4
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 5.000000000000001e-07,
25
+ "loss": 0.6921,
26
+ "step": 6
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 6.666666666666667e-07,
31
+ "loss": 0.6899,
32
+ "step": 8
33
+ },
34
+ {
35
+ "epoch": 0.62,
36
+ "learning_rate": 8.333333333333333e-07,
37
+ "loss": 0.6913,
38
+ "step": 10
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 1.0000000000000002e-06,
43
+ "loss": 0.691,
44
+ "step": 12
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 1.1666666666666668e-06,
49
+ "loss": 0.6912,
50
+ "step": 14
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "learning_rate": 1.3333333333333334e-06,
55
+ "loss": 0.689,
56
+ "step": 16
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_accuracy": 0.505,
61
+ "eval_loss": 0.6925567388534546,
62
+ "eval_macro_f1": 0.49510403916768664,
63
+ "eval_runtime": 0.8698,
64
+ "eval_samples_per_second": 459.879,
65
+ "eval_steps_per_second": 4.599,
66
+ "step": 16
67
+ },
68
+ {
69
+ "epoch": 1.12,
70
+ "learning_rate": 1.5e-06,
71
+ "loss": 0.6906,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 1.25,
76
+ "learning_rate": 1.6666666666666667e-06,
77
+ "loss": 0.6896,
78
+ "step": 20
79
+ },
80
+ {
81
+ "epoch": 1.38,
82
+ "learning_rate": 1.8333333333333335e-06,
83
+ "loss": 0.6892,
84
+ "step": 22
85
+ },
86
+ {
87
+ "epoch": 1.5,
88
+ "learning_rate": 2.0000000000000003e-06,
89
+ "loss": 0.6984,
90
+ "step": 24
91
+ },
92
+ {
93
+ "epoch": 1.62,
94
+ "learning_rate": 2.166666666666667e-06,
95
+ "loss": 0.6923,
96
+ "step": 26
97
+ },
98
+ {
99
+ "epoch": 1.75,
100
+ "learning_rate": 2.3333333333333336e-06,
101
+ "loss": 0.6879,
102
+ "step": 28
103
+ },
104
+ {
105
+ "epoch": 1.88,
106
+ "learning_rate": 2.5e-06,
107
+ "loss": 0.6918,
108
+ "step": 30
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "learning_rate": 2.666666666666667e-06,
113
+ "loss": 0.6832,
114
+ "step": 32
115
+ },
116
+ {
117
+ "epoch": 2.0,
118
+ "eval_accuracy": 0.5375,
119
+ "eval_loss": 0.6915823221206665,
120
+ "eval_macro_f1": 0.513410793074652,
121
+ "eval_runtime": 1.0103,
122
+ "eval_samples_per_second": 395.928,
123
+ "eval_steps_per_second": 3.959,
124
+ "step": 32
125
+ },
126
+ {
127
+ "epoch": 2.12,
128
+ "learning_rate": 2.8333333333333335e-06,
129
+ "loss": 0.6928,
130
+ "step": 34
131
+ },
132
+ {
133
+ "epoch": 2.25,
134
+ "learning_rate": 3e-06,
135
+ "loss": 0.6909,
136
+ "step": 36
137
+ },
138
+ {
139
+ "epoch": 2.38,
140
+ "learning_rate": 3.166666666666667e-06,
141
+ "loss": 0.6901,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 2.5,
146
+ "learning_rate": 3.3333333333333333e-06,
147
+ "loss": 0.6891,
148
+ "step": 40
149
+ },
150
+ {
151
+ "epoch": 2.62,
152
+ "learning_rate": 3.5000000000000004e-06,
153
+ "loss": 0.6851,
154
+ "step": 42
155
+ },
156
+ {
157
+ "epoch": 2.75,
158
+ "learning_rate": 3.666666666666667e-06,
159
+ "loss": 0.6853,
160
+ "step": 44
161
+ },
162
+ {
163
+ "epoch": 2.88,
164
+ "learning_rate": 3.833333333333334e-06,
165
+ "loss": 0.6949,
166
+ "step": 46
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "learning_rate": 4.000000000000001e-06,
171
+ "loss": 0.6825,
172
+ "step": 48
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_accuracy": 0.5475,
177
+ "eval_loss": 0.689637303352356,
178
+ "eval_macro_f1": 0.49125506812544356,
179
+ "eval_runtime": 0.8008,
180
+ "eval_samples_per_second": 499.483,
181
+ "eval_steps_per_second": 4.995,
182
+ "step": 48
183
+ },
184
+ {
185
+ "epoch": 3.12,
186
+ "learning_rate": 4.166666666666667e-06,
187
+ "loss": 0.6895,
188
+ "step": 50
189
+ },
190
+ {
191
+ "epoch": 3.25,
192
+ "learning_rate": 4.333333333333334e-06,
193
+ "loss": 0.6837,
194
+ "step": 52
195
+ },
196
+ {
197
+ "epoch": 3.38,
198
+ "learning_rate": 4.5e-06,
199
+ "loss": 0.689,
200
+ "step": 54
201
+ },
202
+ {
203
+ "epoch": 3.5,
204
+ "learning_rate": 4.666666666666667e-06,
205
+ "loss": 0.6954,
206
+ "step": 56
207
+ },
208
+ {
209
+ "epoch": 3.62,
210
+ "learning_rate": 4.833333333333333e-06,
211
+ "loss": 0.6826,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 3.75,
216
+ "learning_rate": 5e-06,
217
+ "loss": 0.6898,
218
+ "step": 60
219
+ },
220
+ {
221
+ "epoch": 3.88,
222
+ "learning_rate": 5.166666666666667e-06,
223
+ "loss": 0.6897,
224
+ "step": 62
225
+ },
226
+ {
227
+ "epoch": 4.0,
228
+ "learning_rate": 5.333333333333334e-06,
229
+ "loss": 0.6791,
230
+ "step": 64
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_accuracy": 0.55,
235
+ "eval_loss": 0.6878230571746826,
236
+ "eval_macro_f1": 0.4642857142857143,
237
+ "eval_runtime": 0.7978,
238
+ "eval_samples_per_second": 501.403,
239
+ "eval_steps_per_second": 5.014,
240
+ "step": 64
241
+ },
242
+ {
243
+ "epoch": 4.12,
244
+ "learning_rate": 5.500000000000001e-06,
245
+ "loss": 0.6911,
246
+ "step": 66
247
+ },
248
+ {
249
+ "epoch": 4.25,
250
+ "learning_rate": 5.666666666666667e-06,
251
+ "loss": 0.685,
252
+ "step": 68
253
+ },
254
+ {
255
+ "epoch": 4.38,
256
+ "learning_rate": 5.833333333333334e-06,
257
+ "loss": 0.6865,
258
+ "step": 70
259
+ },
260
+ {
261
+ "epoch": 4.5,
262
+ "learning_rate": 6e-06,
263
+ "loss": 0.6938,
264
+ "step": 72
265
+ },
266
+ {
267
+ "epoch": 4.62,
268
+ "learning_rate": 6.166666666666667e-06,
269
+ "loss": 0.6845,
270
+ "step": 74
271
+ },
272
+ {
273
+ "epoch": 4.75,
274
+ "learning_rate": 6.333333333333334e-06,
275
+ "loss": 0.6838,
276
+ "step": 76
277
+ },
278
+ {
279
+ "epoch": 4.88,
280
+ "learning_rate": 6.5000000000000004e-06,
281
+ "loss": 0.6883,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 5.0,
286
+ "learning_rate": 6.666666666666667e-06,
287
+ "loss": 0.6804,
288
+ "step": 80
289
+ },
290
+ {
291
+ "epoch": 5.0,
292
+ "eval_accuracy": 0.57,
293
+ "eval_loss": 0.6861330270767212,
294
+ "eval_macro_f1": 0.4377431270635154,
295
+ "eval_runtime": 0.9894,
296
+ "eval_samples_per_second": 404.267,
297
+ "eval_steps_per_second": 4.043,
298
+ "step": 80
299
+ },
300
+ {
301
+ "epoch": 5.12,
302
+ "learning_rate": 6.833333333333333e-06,
303
+ "loss": 0.6921,
304
+ "step": 82
305
+ },
306
+ {
307
+ "epoch": 5.25,
308
+ "learning_rate": 7.000000000000001e-06,
309
+ "loss": 0.6867,
310
+ "step": 84
311
+ },
312
+ {
313
+ "epoch": 5.38,
314
+ "learning_rate": 7.166666666666667e-06,
315
+ "loss": 0.6789,
316
+ "step": 86
317
+ },
318
+ {
319
+ "epoch": 5.5,
320
+ "learning_rate": 7.333333333333334e-06,
321
+ "loss": 0.6912,
322
+ "step": 88
323
+ },
324
+ {
325
+ "epoch": 5.62,
326
+ "learning_rate": 7.5e-06,
327
+ "loss": 0.6885,
328
+ "step": 90
329
+ },
330
+ {
331
+ "epoch": 5.75,
332
+ "learning_rate": 7.666666666666667e-06,
333
+ "loss": 0.6824,
334
+ "step": 92
335
+ },
336
+ {
337
+ "epoch": 5.88,
338
+ "learning_rate": 7.833333333333333e-06,
339
+ "loss": 0.6915,
340
+ "step": 94
341
+ },
342
+ {
343
+ "epoch": 6.0,
344
+ "learning_rate": 8.000000000000001e-06,
345
+ "loss": 0.6669,
346
+ "step": 96
347
+ },
348
+ {
349
+ "epoch": 6.0,
350
+ "eval_accuracy": 0.5625,
351
+ "eval_loss": 0.685230016708374,
352
+ "eval_macro_f1": 0.3893449415952055,
353
+ "eval_runtime": 1.0307,
354
+ "eval_samples_per_second": 388.076,
355
+ "eval_steps_per_second": 3.881,
356
+ "step": 96
357
+ },
358
+ {
359
+ "epoch": 6.12,
360
+ "learning_rate": 8.166666666666668e-06,
361
+ "loss": 0.6902,
362
+ "step": 98
363
+ },
364
+ {
365
+ "epoch": 6.25,
366
+ "learning_rate": 8.333333333333334e-06,
367
+ "loss": 0.6888,
368
+ "step": 100
369
+ },
370
+ {
371
+ "epoch": 6.38,
372
+ "learning_rate": 8.500000000000002e-06,
373
+ "loss": 0.6718,
374
+ "step": 102
375
+ },
376
+ {
377
+ "epoch": 6.5,
378
+ "learning_rate": 8.666666666666668e-06,
379
+ "loss": 0.6953,
380
+ "step": 104
381
+ },
382
+ {
383
+ "epoch": 6.62,
384
+ "learning_rate": 8.833333333333334e-06,
385
+ "loss": 0.679,
386
+ "step": 106
387
+ },
388
+ {
389
+ "epoch": 6.75,
390
+ "learning_rate": 9e-06,
391
+ "loss": 0.6763,
392
+ "step": 108
393
+ },
394
+ {
395
+ "epoch": 6.88,
396
+ "learning_rate": 9.166666666666666e-06,
397
+ "loss": 0.6932,
398
+ "step": 110
399
+ },
400
+ {
401
+ "epoch": 7.0,
402
+ "learning_rate": 9.333333333333334e-06,
403
+ "loss": 0.672,
404
+ "step": 112
405
+ },
406
+ {
407
+ "epoch": 7.0,
408
+ "eval_accuracy": 0.5725,
409
+ "eval_loss": 0.6839529275894165,
410
+ "eval_macro_f1": 0.3640699523052464,
411
+ "eval_runtime": 0.8433,
412
+ "eval_samples_per_second": 474.35,
413
+ "eval_steps_per_second": 4.744,
414
+ "step": 112
415
+ },
416
+ {
417
+ "epoch": 7.12,
418
+ "learning_rate": 9.5e-06,
419
+ "loss": 0.6954,
420
+ "step": 114
421
+ },
422
+ {
423
+ "epoch": 7.25,
424
+ "learning_rate": 9.666666666666667e-06,
425
+ "loss": 0.6823,
426
+ "step": 116
427
+ },
428
+ {
429
+ "epoch": 7.38,
430
+ "learning_rate": 9.833333333333333e-06,
431
+ "loss": 0.6712,
432
+ "step": 118
433
+ },
434
+ {
435
+ "epoch": 7.5,
436
+ "learning_rate": 1e-05,
437
+ "loss": 0.696,
438
+ "step": 120
439
+ },
440
+ {
441
+ "epoch": 7.62,
442
+ "learning_rate": 1.0166666666666667e-05,
443
+ "loss": 0.6741,
444
+ "step": 122
445
+ },
446
+ {
447
+ "epoch": 7.75,
448
+ "learning_rate": 1.0333333333333333e-05,
449
+ "loss": 0.6594,
450
+ "step": 124
451
+ },
452
+ {
453
+ "epoch": 7.88,
454
+ "learning_rate": 1.05e-05,
455
+ "loss": 0.7076,
456
+ "step": 126
457
+ },
458
+ {
459
+ "epoch": 8.0,
460
+ "learning_rate": 1.0666666666666667e-05,
461
+ "loss": 0.6754,
462
+ "step": 128
463
+ },
464
+ {
465
+ "epoch": 8.0,
466
+ "eval_accuracy": 0.575,
467
+ "eval_loss": 0.6830737590789795,
468
+ "eval_macro_f1": 0.36507936507936506,
469
+ "eval_runtime": 0.8854,
470
+ "eval_samples_per_second": 451.794,
471
+ "eval_steps_per_second": 4.518,
472
+ "step": 128
473
+ },
474
+ {
475
+ "epoch": 8.12,
476
+ "learning_rate": 1.0833333333333334e-05,
477
+ "loss": 0.6895,
478
+ "step": 130
479
+ },
480
+ {
481
+ "epoch": 8.25,
482
+ "learning_rate": 1.1000000000000001e-05,
483
+ "loss": 0.6817,
484
+ "step": 132
485
+ },
486
+ {
487
+ "epoch": 8.38,
488
+ "learning_rate": 1.1166666666666668e-05,
489
+ "loss": 0.6737,
490
+ "step": 134
491
+ },
492
+ {
493
+ "epoch": 8.5,
494
+ "learning_rate": 1.1333333333333334e-05,
495
+ "loss": 0.6945,
496
+ "step": 136
497
+ },
498
+ {
499
+ "epoch": 8.62,
500
+ "learning_rate": 1.1500000000000002e-05,
501
+ "loss": 0.6698,
502
+ "step": 138
503
+ },
504
+ {
505
+ "epoch": 8.75,
506
+ "learning_rate": 1.1666666666666668e-05,
507
+ "loss": 0.6699,
508
+ "step": 140
509
+ },
510
+ {
511
+ "epoch": 8.88,
512
+ "learning_rate": 1.1833333333333334e-05,
513
+ "loss": 0.6993,
514
+ "step": 142
515
+ },
516
+ {
517
+ "epoch": 9.0,
518
+ "learning_rate": 1.2e-05,
519
+ "loss": 0.6833,
520
+ "step": 144
521
+ },
522
+ {
523
+ "epoch": 9.0,
524
+ "eval_accuracy": 0.575,
525
+ "eval_loss": 0.6830700039863586,
526
+ "eval_macro_f1": 0.36507936507936506,
527
+ "eval_runtime": 0.9198,
528
+ "eval_samples_per_second": 434.883,
529
+ "eval_steps_per_second": 4.349,
530
+ "step": 144
531
+ },
532
+ {
533
+ "epoch": 9.12,
534
+ "learning_rate": 1.2166666666666668e-05,
535
+ "loss": 0.6917,
536
+ "step": 146
537
+ },
538
+ {
539
+ "epoch": 9.25,
540
+ "learning_rate": 1.2333333333333334e-05,
541
+ "loss": 0.6684,
542
+ "step": 148
543
+ },
544
+ {
545
+ "epoch": 9.38,
546
+ "learning_rate": 1.25e-05,
547
+ "loss": 0.6738,
548
+ "step": 150
549
+ },
550
+ {
551
+ "epoch": 9.5,
552
+ "learning_rate": 1.2666666666666668e-05,
553
+ "loss": 0.7013,
554
+ "step": 152
555
+ },
556
+ {
557
+ "epoch": 9.62,
558
+ "learning_rate": 1.2833333333333333e-05,
559
+ "loss": 0.6761,
560
+ "step": 154
561
+ },
562
+ {
563
+ "epoch": 9.75,
564
+ "learning_rate": 1.3000000000000001e-05,
565
+ "loss": 0.6761,
566
+ "step": 156
567
+ },
568
+ {
569
+ "epoch": 9.88,
570
+ "learning_rate": 1.3166666666666665e-05,
571
+ "loss": 0.6897,
572
+ "step": 158
573
+ },
574
+ {
575
+ "epoch": 10.0,
576
+ "learning_rate": 1.3333333333333333e-05,
577
+ "loss": 0.6625,
578
+ "step": 160
579
+ },
580
+ {
581
+ "epoch": 10.0,
582
+ "eval_accuracy": 0.57,
583
+ "eval_loss": 0.6834887862205505,
584
+ "eval_macro_f1": 0.36836693474349086,
585
+ "eval_runtime": 0.842,
586
+ "eval_samples_per_second": 475.084,
587
+ "eval_steps_per_second": 4.751,
588
+ "step": 160
589
+ },
590
+ {
591
+ "epoch": 10.12,
592
+ "learning_rate": 1.3500000000000001e-05,
593
+ "loss": 0.689,
594
+ "step": 162
595
+ },
596
+ {
597
+ "epoch": 10.25,
598
+ "learning_rate": 1.3666666666666666e-05,
599
+ "loss": 0.6762,
600
+ "step": 164
601
+ },
602
+ {
603
+ "epoch": 10.38,
604
+ "learning_rate": 1.3833333333333334e-05,
605
+ "loss": 0.6749,
606
+ "step": 166
607
+ },
608
+ {
609
+ "epoch": 10.5,
610
+ "learning_rate": 1.4000000000000001e-05,
611
+ "loss": 0.6952,
612
+ "step": 168
613
+ },
614
+ {
615
+ "epoch": 10.62,
616
+ "learning_rate": 1.4166666666666668e-05,
617
+ "loss": 0.671,
618
+ "step": 170
619
+ },
620
+ {
621
+ "epoch": 10.75,
622
+ "learning_rate": 1.4333333333333334e-05,
623
+ "loss": 0.6755,
624
+ "step": 172
625
+ },
626
+ {
627
+ "epoch": 10.88,
628
+ "learning_rate": 1.45e-05,
629
+ "loss": 0.6946,
630
+ "step": 174
631
+ },
632
+ {
633
+ "epoch": 11.0,
634
+ "learning_rate": 1.4666666666666668e-05,
635
+ "loss": 0.6535,
636
+ "step": 176
637
+ },
638
+ {
639
+ "epoch": 11.0,
640
+ "eval_accuracy": 0.575,
641
+ "eval_loss": 0.6825265288352966,
642
+ "eval_macro_f1": 0.36507936507936506,
643
+ "eval_runtime": 0.7992,
644
+ "eval_samples_per_second": 500.47,
645
+ "eval_steps_per_second": 5.005,
646
+ "step": 176
647
+ },
648
+ {
649
+ "epoch": 11.12,
650
+ "learning_rate": 1.4833333333333336e-05,
651
+ "loss": 0.6953,
652
+ "step": 178
653
+ },
654
+ {
655
+ "epoch": 11.25,
656
+ "learning_rate": 1.5e-05,
657
+ "loss": 0.6763,
658
+ "step": 180
659
+ },
660
+ {
661
+ "epoch": 11.38,
662
+ "learning_rate": 1.5166666666666668e-05,
663
+ "loss": 0.6666,
664
+ "step": 182
665
+ },
666
+ {
667
+ "epoch": 11.5,
668
+ "learning_rate": 1.5333333333333334e-05,
669
+ "loss": 0.6878,
670
+ "step": 184
671
+ },
672
+ {
673
+ "epoch": 11.62,
674
+ "learning_rate": 1.55e-05,
675
+ "loss": 0.6762,
676
+ "step": 186
677
+ },
678
+ {
679
+ "epoch": 11.75,
680
+ "learning_rate": 1.5666666666666667e-05,
681
+ "loss": 0.6701,
682
+ "step": 188
683
+ },
684
+ {
685
+ "epoch": 11.88,
686
+ "learning_rate": 1.5833333333333333e-05,
687
+ "loss": 0.683,
688
+ "step": 190
689
+ },
690
+ {
691
+ "epoch": 12.0,
692
+ "learning_rate": 1.6000000000000003e-05,
693
+ "loss": 0.6597,
694
+ "step": 192
695
+ },
696
+ {
697
+ "epoch": 12.0,
698
+ "eval_accuracy": 0.5775,
699
+ "eval_loss": 0.6820663213729858,
700
+ "eval_macro_f1": 0.36608557844690964,
701
+ "eval_runtime": 0.8333,
702
+ "eval_samples_per_second": 480.024,
703
+ "eval_steps_per_second": 4.8,
704
+ "step": 192
705
+ },
706
+ {
707
+ "epoch": 12.12,
708
+ "learning_rate": 1.6166666666666665e-05,
709
+ "loss": 0.6837,
710
+ "step": 194
711
+ },
712
+ {
713
+ "epoch": 12.25,
714
+ "learning_rate": 1.6333333333333335e-05,
715
+ "loss": 0.6707,
716
+ "step": 196
717
+ },
718
+ {
719
+ "epoch": 12.38,
720
+ "learning_rate": 1.65e-05,
721
+ "loss": 0.6663,
722
+ "step": 198
723
+ },
724
+ {
725
+ "epoch": 12.5,
726
+ "learning_rate": 1.6666666666666667e-05,
727
+ "loss": 0.7006,
728
+ "step": 200
729
+ },
730
+ {
731
+ "epoch": 12.62,
732
+ "learning_rate": 1.6833333333333334e-05,
733
+ "loss": 0.6666,
734
+ "step": 202
735
+ },
736
+ {
737
+ "epoch": 12.75,
738
+ "learning_rate": 1.7000000000000003e-05,
739
+ "loss": 0.6649,
740
+ "step": 204
741
+ },
742
+ {
743
+ "epoch": 12.88,
744
+ "learning_rate": 1.7166666666666666e-05,
745
+ "loss": 0.6918,
746
+ "step": 206
747
+ },
748
+ {
749
+ "epoch": 13.0,
750
+ "learning_rate": 1.7333333333333336e-05,
751
+ "loss": 0.6747,
752
+ "step": 208
753
+ },
754
+ {
755
+ "epoch": 13.0,
756
+ "eval_accuracy": 0.5775,
757
+ "eval_loss": 0.6810634136199951,
758
+ "eval_macro_f1": 0.36608557844690964,
759
+ "eval_runtime": 0.8039,
760
+ "eval_samples_per_second": 497.582,
761
+ "eval_steps_per_second": 4.976,
762
+ "step": 208
763
+ },
764
+ {
765
+ "epoch": 13.12,
766
+ "learning_rate": 1.75e-05,
767
+ "loss": 0.6878,
768
+ "step": 210
769
+ },
770
+ {
771
+ "epoch": 13.25,
772
+ "learning_rate": 1.7666666666666668e-05,
773
+ "loss": 0.6655,
774
+ "step": 212
775
+ },
776
+ {
777
+ "epoch": 13.38,
778
+ "learning_rate": 1.7833333333333334e-05,
779
+ "loss": 0.6616,
780
+ "step": 214
781
+ },
782
+ {
783
+ "epoch": 13.5,
784
+ "learning_rate": 1.8e-05,
785
+ "loss": 0.6951,
786
+ "step": 216
787
+ },
788
+ {
789
+ "epoch": 13.62,
790
+ "learning_rate": 1.8166666666666667e-05,
791
+ "loss": 0.6714,
792
+ "step": 218
793
+ },
794
+ {
795
+ "epoch": 13.75,
796
+ "learning_rate": 1.8333333333333333e-05,
797
+ "loss": 0.6741,
798
+ "step": 220
799
+ },
800
+ {
801
+ "epoch": 13.88,
802
+ "learning_rate": 1.85e-05,
803
+ "loss": 0.6853,
804
+ "step": 222
805
+ },
806
+ {
807
+ "epoch": 14.0,
808
+ "learning_rate": 1.866666666666667e-05,
809
+ "loss": 0.6661,
810
+ "step": 224
811
+ },
812
+ {
813
+ "epoch": 14.0,
814
+ "eval_accuracy": 0.5775,
815
+ "eval_loss": 0.681349515914917,
816
+ "eval_macro_f1": 0.38696484116404134,
817
+ "eval_runtime": 0.8726,
818
+ "eval_samples_per_second": 458.379,
819
+ "eval_steps_per_second": 4.584,
820
+ "step": 224
821
+ },
822
+ {
823
+ "epoch": 14.12,
824
+ "learning_rate": 1.8833333333333335e-05,
825
+ "loss": 0.6902,
826
+ "step": 226
827
+ },
828
+ {
829
+ "epoch": 14.25,
830
+ "learning_rate": 1.9e-05,
831
+ "loss": 0.674,
832
+ "step": 228
833
+ },
834
+ {
835
+ "epoch": 14.38,
836
+ "learning_rate": 1.9166666666666667e-05,
837
+ "loss": 0.6707,
838
+ "step": 230
839
+ },
840
+ {
841
+ "epoch": 14.5,
842
+ "learning_rate": 1.9333333333333333e-05,
843
+ "loss": 0.6799,
844
+ "step": 232
845
+ },
846
+ {
847
+ "epoch": 14.62,
848
+ "learning_rate": 1.9500000000000003e-05,
849
+ "loss": 0.6607,
850
+ "step": 234
851
+ },
852
+ {
853
+ "epoch": 14.75,
854
+ "learning_rate": 1.9666666666666666e-05,
855
+ "loss": 0.6748,
856
+ "step": 236
857
+ },
858
+ {
859
+ "epoch": 14.88,
860
+ "learning_rate": 1.9833333333333335e-05,
861
+ "loss": 0.6771,
862
+ "step": 238
863
+ },
864
+ {
865
+ "epoch": 15.0,
866
+ "learning_rate": 2e-05,
867
+ "loss": 0.6543,
868
+ "step": 240
869
+ },
870
+ {
871
+ "epoch": 15.0,
872
+ "eval_accuracy": 0.5725,
873
+ "eval_loss": 0.6816413998603821,
874
+ "eval_macro_f1": 0.4076845140674928,
875
+ "eval_runtime": 0.8528,
876
+ "eval_samples_per_second": 469.044,
877
+ "eval_steps_per_second": 4.69,
878
+ "step": 240
879
+ }
880
+ ],
881
+ "max_steps": 320,
882
+ "num_train_epochs": 20,
883
+ "total_flos": 175154724864000.0,
884
+ "trial_name": null,
885
+ "trial_params": null
886
+ }
scaling_performance/2000/L2/.DS_Store CHANGED
Binary files a/scaling_performance/2000/L2/.DS_Store and b/scaling_performance/2000/L2/.DS_Store differ
 
scaling_performance/2000/L2/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.5775,
3
+ "test_loss": 0.6842468976974487,
4
+ "test_macro_f1": 0.38194285714285714,
5
+ "test_runtime": 1.0481,
6
+ "test_samples_per_second": 381.652,
7
+ "test_steps_per_second": 3.817
8
+ }
scaling_performance/2000/L2/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 2,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "problem_type": "single_label_classification",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.28.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
scaling_performance/2000/L2/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.5775,
3
+ "test_loss": 0.6842468976974487,
4
+ "test_macro_f1": 0.38194285714285714,
5
+ "test_runtime": 1.0481,
6
+ "test_samples_per_second": 381.652,
7
+ "test_steps_per_second": 3.817
8
+ }
scaling_performance/2000/L2/trainer_state.json ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6842468976974487,
3
+ "best_model_checkpoint": "./models/240626_geneformer_CellClassifier_PM25_Layers2_L2048_B26_LR5e-05_LSlinear_WU600_E20_Oadamw_F0_fold4/checkpoint-128",
4
+ "epoch": 10.0,
5
+ "global_step": 160,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 1.6666666666666668e-07,
13
+ "loss": 0.6937,
14
+ "step": 2
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 3.3333333333333335e-07,
19
+ "loss": 0.7018,
20
+ "step": 4
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 5.000000000000001e-07,
25
+ "loss": 0.7031,
26
+ "step": 6
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 6.666666666666667e-07,
31
+ "loss": 0.6949,
32
+ "step": 8
33
+ },
34
+ {
35
+ "epoch": 0.62,
36
+ "learning_rate": 8.333333333333333e-07,
37
+ "loss": 0.694,
38
+ "step": 10
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 1.0000000000000002e-06,
43
+ "loss": 0.7025,
44
+ "step": 12
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 1.1666666666666668e-06,
49
+ "loss": 0.6975,
50
+ "step": 14
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "learning_rate": 1.3333333333333334e-06,
55
+ "loss": 0.7029,
56
+ "step": 16
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_accuracy": 0.4575,
61
+ "eval_loss": 0.6974406242370605,
62
+ "eval_macro_f1": 0.4533144386710754,
63
+ "eval_runtime": 0.9243,
64
+ "eval_samples_per_second": 432.744,
65
+ "eval_steps_per_second": 4.327,
66
+ "step": 16
67
+ },
68
+ {
69
+ "epoch": 1.12,
70
+ "learning_rate": 1.5e-06,
71
+ "loss": 0.6934,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 1.25,
76
+ "learning_rate": 1.6666666666666667e-06,
77
+ "loss": 0.7034,
78
+ "step": 20
79
+ },
80
+ {
81
+ "epoch": 1.38,
82
+ "learning_rate": 1.8333333333333335e-06,
83
+ "loss": 0.6945,
84
+ "step": 22
85
+ },
86
+ {
87
+ "epoch": 1.5,
88
+ "learning_rate": 2.0000000000000003e-06,
89
+ "loss": 0.6894,
90
+ "step": 24
91
+ },
92
+ {
93
+ "epoch": 1.62,
94
+ "learning_rate": 2.166666666666667e-06,
95
+ "loss": 0.7029,
96
+ "step": 26
97
+ },
98
+ {
99
+ "epoch": 1.75,
100
+ "learning_rate": 2.3333333333333336e-06,
101
+ "loss": 0.7002,
102
+ "step": 28
103
+ },
104
+ {
105
+ "epoch": 1.88,
106
+ "learning_rate": 2.5e-06,
107
+ "loss": 0.6943,
108
+ "step": 30
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "learning_rate": 2.666666666666667e-06,
113
+ "loss": 0.6983,
114
+ "step": 32
115
+ },
116
+ {
117
+ "epoch": 2.0,
118
+ "eval_accuracy": 0.485,
119
+ "eval_loss": 0.695452094078064,
120
+ "eval_macro_f1": 0.4841746794871795,
121
+ "eval_runtime": 0.9144,
122
+ "eval_samples_per_second": 437.423,
123
+ "eval_steps_per_second": 4.374,
124
+ "step": 32
125
+ },
126
+ {
127
+ "epoch": 2.12,
128
+ "learning_rate": 2.8333333333333335e-06,
129
+ "loss": 0.6959,
130
+ "step": 34
131
+ },
132
+ {
133
+ "epoch": 2.25,
134
+ "learning_rate": 3e-06,
135
+ "loss": 0.6983,
136
+ "step": 36
137
+ },
138
+ {
139
+ "epoch": 2.38,
140
+ "learning_rate": 3.166666666666667e-06,
141
+ "loss": 0.6905,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 2.5,
146
+ "learning_rate": 3.3333333333333333e-06,
147
+ "loss": 0.6929,
148
+ "step": 40
149
+ },
150
+ {
151
+ "epoch": 2.62,
152
+ "learning_rate": 3.5000000000000004e-06,
153
+ "loss": 0.694,
154
+ "step": 42
155
+ },
156
+ {
157
+ "epoch": 2.75,
158
+ "learning_rate": 3.666666666666667e-06,
159
+ "loss": 0.6945,
160
+ "step": 44
161
+ },
162
+ {
163
+ "epoch": 2.88,
164
+ "learning_rate": 3.833333333333334e-06,
165
+ "loss": 0.6967,
166
+ "step": 46
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "learning_rate": 4.000000000000001e-06,
171
+ "loss": 0.6968,
172
+ "step": 48
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_accuracy": 0.495,
177
+ "eval_loss": 0.6920776963233948,
178
+ "eval_macro_f1": 0.47608673098869175,
179
+ "eval_runtime": 0.9068,
180
+ "eval_samples_per_second": 441.113,
181
+ "eval_steps_per_second": 4.411,
182
+ "step": 48
183
+ },
184
+ {
185
+ "epoch": 3.12,
186
+ "learning_rate": 4.166666666666667e-06,
187
+ "loss": 0.6908,
188
+ "step": 50
189
+ },
190
+ {
191
+ "epoch": 3.25,
192
+ "learning_rate": 4.333333333333334e-06,
193
+ "loss": 0.6944,
194
+ "step": 52
195
+ },
196
+ {
197
+ "epoch": 3.38,
198
+ "learning_rate": 4.5e-06,
199
+ "loss": 0.6914,
200
+ "step": 54
201
+ },
202
+ {
203
+ "epoch": 3.5,
204
+ "learning_rate": 4.666666666666667e-06,
205
+ "loss": 0.6911,
206
+ "step": 56
207
+ },
208
+ {
209
+ "epoch": 3.62,
210
+ "learning_rate": 4.833333333333333e-06,
211
+ "loss": 0.6893,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 3.75,
216
+ "learning_rate": 5e-06,
217
+ "loss": 0.6949,
218
+ "step": 60
219
+ },
220
+ {
221
+ "epoch": 3.88,
222
+ "learning_rate": 5.166666666666667e-06,
223
+ "loss": 0.6949,
224
+ "step": 62
225
+ },
226
+ {
227
+ "epoch": 4.0,
228
+ "learning_rate": 5.333333333333334e-06,
229
+ "loss": 0.6868,
230
+ "step": 64
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_accuracy": 0.5325,
235
+ "eval_loss": 0.6893474459648132,
236
+ "eval_macro_f1": 0.46399524188289587,
237
+ "eval_runtime": 0.9384,
238
+ "eval_samples_per_second": 426.254,
239
+ "eval_steps_per_second": 4.263,
240
+ "step": 64
241
+ },
242
+ {
243
+ "epoch": 4.12,
244
+ "learning_rate": 5.500000000000001e-06,
245
+ "loss": 0.6984,
246
+ "step": 66
247
+ },
248
+ {
249
+ "epoch": 4.25,
250
+ "learning_rate": 5.666666666666667e-06,
251
+ "loss": 0.6822,
252
+ "step": 68
253
+ },
254
+ {
255
+ "epoch": 4.38,
256
+ "learning_rate": 5.833333333333334e-06,
257
+ "loss": 0.6864,
258
+ "step": 70
259
+ },
260
+ {
261
+ "epoch": 4.5,
262
+ "learning_rate": 6e-06,
263
+ "loss": 0.7002,
264
+ "step": 72
265
+ },
266
+ {
267
+ "epoch": 4.62,
268
+ "learning_rate": 6.166666666666667e-06,
269
+ "loss": 0.6897,
270
+ "step": 74
271
+ },
272
+ {
273
+ "epoch": 4.75,
274
+ "learning_rate": 6.333333333333334e-06,
275
+ "loss": 0.6903,
276
+ "step": 76
277
+ },
278
+ {
279
+ "epoch": 4.88,
280
+ "learning_rate": 6.5000000000000004e-06,
281
+ "loss": 0.6941,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 5.0,
286
+ "learning_rate": 6.666666666666667e-06,
287
+ "loss": 0.6779,
288
+ "step": 80
289
+ },
290
+ {
291
+ "epoch": 5.0,
292
+ "eval_accuracy": 0.575,
293
+ "eval_loss": 0.6872670650482178,
294
+ "eval_macro_f1": 0.41748903508771934,
295
+ "eval_runtime": 0.928,
296
+ "eval_samples_per_second": 431.013,
297
+ "eval_steps_per_second": 4.31,
298
+ "step": 80
299
+ },
300
+ {
301
+ "epoch": 5.12,
302
+ "learning_rate": 6.833333333333333e-06,
303
+ "loss": 0.7031,
304
+ "step": 82
305
+ },
306
+ {
307
+ "epoch": 5.25,
308
+ "learning_rate": 7.000000000000001e-06,
309
+ "loss": 0.6892,
310
+ "step": 84
311
+ },
312
+ {
313
+ "epoch": 5.38,
314
+ "learning_rate": 7.166666666666667e-06,
315
+ "loss": 0.6823,
316
+ "step": 86
317
+ },
318
+ {
319
+ "epoch": 5.5,
320
+ "learning_rate": 7.333333333333334e-06,
321
+ "loss": 0.6936,
322
+ "step": 88
323
+ },
324
+ {
325
+ "epoch": 5.62,
326
+ "learning_rate": 7.5e-06,
327
+ "loss": 0.6907,
328
+ "step": 90
329
+ },
330
+ {
331
+ "epoch": 5.75,
332
+ "learning_rate": 7.666666666666667e-06,
333
+ "loss": 0.681,
334
+ "step": 92
335
+ },
336
+ {
337
+ "epoch": 5.88,
338
+ "learning_rate": 7.833333333333333e-06,
339
+ "loss": 0.6965,
340
+ "step": 94
341
+ },
342
+ {
343
+ "epoch": 6.0,
344
+ "learning_rate": 8.000000000000001e-06,
345
+ "loss": 0.6675,
346
+ "step": 96
347
+ },
348
+ {
349
+ "epoch": 6.0,
350
+ "eval_accuracy": 0.58,
351
+ "eval_loss": 0.6864795088768005,
352
+ "eval_macro_f1": 0.4159365874009178,
353
+ "eval_runtime": 0.9734,
354
+ "eval_samples_per_second": 410.925,
355
+ "eval_steps_per_second": 4.109,
356
+ "step": 96
357
+ },
358
+ {
359
+ "epoch": 6.12,
360
+ "learning_rate": 8.166666666666668e-06,
361
+ "loss": 0.6914,
362
+ "step": 98
363
+ },
364
+ {
365
+ "epoch": 6.25,
366
+ "learning_rate": 8.333333333333334e-06,
367
+ "loss": 0.6865,
368
+ "step": 100
369
+ },
370
+ {
371
+ "epoch": 6.38,
372
+ "learning_rate": 8.500000000000002e-06,
373
+ "loss": 0.6801,
374
+ "step": 102
375
+ },
376
+ {
377
+ "epoch": 6.5,
378
+ "learning_rate": 8.666666666666668e-06,
379
+ "loss": 0.699,
380
+ "step": 104
381
+ },
382
+ {
383
+ "epoch": 6.62,
384
+ "learning_rate": 8.833333333333334e-06,
385
+ "loss": 0.6816,
386
+ "step": 106
387
+ },
388
+ {
389
+ "epoch": 6.75,
390
+ "learning_rate": 9e-06,
391
+ "loss": 0.6803,
392
+ "step": 108
393
+ },
394
+ {
395
+ "epoch": 6.88,
396
+ "learning_rate": 9.166666666666666e-06,
397
+ "loss": 0.6934,
398
+ "step": 110
399
+ },
400
+ {
401
+ "epoch": 7.0,
402
+ "learning_rate": 9.333333333333334e-06,
403
+ "loss": 0.6765,
404
+ "step": 112
405
+ },
406
+ {
407
+ "epoch": 7.0,
408
+ "eval_accuracy": 0.5775,
409
+ "eval_loss": 0.6850780248641968,
410
+ "eval_macro_f1": 0.38696484116404134,
411
+ "eval_runtime": 0.9277,
412
+ "eval_samples_per_second": 431.159,
413
+ "eval_steps_per_second": 4.312,
414
+ "step": 112
415
+ },
416
+ {
417
+ "epoch": 7.12,
418
+ "learning_rate": 9.5e-06,
419
+ "loss": 0.6959,
420
+ "step": 114
421
+ },
422
+ {
423
+ "epoch": 7.25,
424
+ "learning_rate": 9.666666666666667e-06,
425
+ "loss": 0.6815,
426
+ "step": 116
427
+ },
428
+ {
429
+ "epoch": 7.38,
430
+ "learning_rate": 9.833333333333333e-06,
431
+ "loss": 0.6763,
432
+ "step": 118
433
+ },
434
+ {
435
+ "epoch": 7.5,
436
+ "learning_rate": 1e-05,
437
+ "loss": 0.698,
438
+ "step": 120
439
+ },
440
+ {
441
+ "epoch": 7.62,
442
+ "learning_rate": 1.0166666666666667e-05,
443
+ "loss": 0.6784,
444
+ "step": 122
445
+ },
446
+ {
447
+ "epoch": 7.75,
448
+ "learning_rate": 1.0333333333333333e-05,
449
+ "loss": 0.6672,
450
+ "step": 124
451
+ },
452
+ {
453
+ "epoch": 7.88,
454
+ "learning_rate": 1.05e-05,
455
+ "loss": 0.7066,
456
+ "step": 126
457
+ },
458
+ {
459
+ "epoch": 8.0,
460
+ "learning_rate": 1.0666666666666667e-05,
461
+ "loss": 0.6902,
462
+ "step": 128
463
+ },
464
+ {
465
+ "epoch": 8.0,
466
+ "eval_accuracy": 0.5775,
467
+ "eval_loss": 0.6842468976974487,
468
+ "eval_macro_f1": 0.38194285714285714,
469
+ "eval_runtime": 0.884,
470
+ "eval_samples_per_second": 452.506,
471
+ "eval_steps_per_second": 4.525,
472
+ "step": 128
473
+ },
474
+ {
475
+ "epoch": 8.12,
476
+ "learning_rate": 1.0833333333333334e-05,
477
+ "loss": 0.6862,
478
+ "step": 130
479
+ },
480
+ {
481
+ "epoch": 8.25,
482
+ "learning_rate": 1.1000000000000001e-05,
483
+ "loss": 0.6784,
484
+ "step": 132
485
+ },
486
+ {
487
+ "epoch": 8.38,
488
+ "learning_rate": 1.1166666666666668e-05,
489
+ "loss": 0.6774,
490
+ "step": 134
491
+ },
492
+ {
493
+ "epoch": 8.5,
494
+ "learning_rate": 1.1333333333333334e-05,
495
+ "loss": 0.7004,
496
+ "step": 136
497
+ },
498
+ {
499
+ "epoch": 8.62,
500
+ "learning_rate": 1.1500000000000002e-05,
501
+ "loss": 0.6755,
502
+ "step": 138
503
+ },
504
+ {
505
+ "epoch": 8.75,
506
+ "learning_rate": 1.1666666666666668e-05,
507
+ "loss": 0.68,
508
+ "step": 140
509
+ },
510
+ {
511
+ "epoch": 8.88,
512
+ "learning_rate": 1.1833333333333334e-05,
513
+ "loss": 0.7023,
514
+ "step": 142
515
+ },
516
+ {
517
+ "epoch": 9.0,
518
+ "learning_rate": 1.2e-05,
519
+ "loss": 0.6802,
520
+ "step": 144
521
+ },
522
+ {
523
+ "epoch": 9.0,
524
+ "eval_accuracy": 0.5775,
525
+ "eval_loss": 0.6846153140068054,
526
+ "eval_macro_f1": 0.38696484116404134,
527
+ "eval_runtime": 0.9145,
528
+ "eval_samples_per_second": 437.397,
529
+ "eval_steps_per_second": 4.374,
530
+ "step": 144
531
+ },
532
+ {
533
+ "epoch": 9.12,
534
+ "learning_rate": 1.2166666666666668e-05,
535
+ "loss": 0.6962,
536
+ "step": 146
537
+ },
538
+ {
539
+ "epoch": 9.25,
540
+ "learning_rate": 1.2333333333333334e-05,
541
+ "loss": 0.6717,
542
+ "step": 148
543
+ },
544
+ {
545
+ "epoch": 9.38,
546
+ "learning_rate": 1.25e-05,
547
+ "loss": 0.6783,
548
+ "step": 150
549
+ },
550
+ {
551
+ "epoch": 9.5,
552
+ "learning_rate": 1.2666666666666668e-05,
553
+ "loss": 0.7028,
554
+ "step": 152
555
+ },
556
+ {
557
+ "epoch": 9.62,
558
+ "learning_rate": 1.2833333333333333e-05,
559
+ "loss": 0.683,
560
+ "step": 154
561
+ },
562
+ {
563
+ "epoch": 9.75,
564
+ "learning_rate": 1.3000000000000001e-05,
565
+ "loss": 0.6835,
566
+ "step": 156
567
+ },
568
+ {
569
+ "epoch": 9.88,
570
+ "learning_rate": 1.3166666666666665e-05,
571
+ "loss": 0.687,
572
+ "step": 158
573
+ },
574
+ {
575
+ "epoch": 10.0,
576
+ "learning_rate": 1.3333333333333333e-05,
577
+ "loss": 0.6693,
578
+ "step": 160
579
+ },
580
+ {
581
+ "epoch": 10.0,
582
+ "eval_accuracy": 0.5875,
583
+ "eval_loss": 0.6855114698410034,
584
+ "eval_macro_f1": 0.4199077125906394,
585
+ "eval_runtime": 1.013,
586
+ "eval_samples_per_second": 394.875,
587
+ "eval_steps_per_second": 3.949,
588
+ "step": 160
589
+ }
590
+ ],
591
+ "max_steps": 320,
592
+ "num_train_epochs": 20,
593
+ "total_flos": 220402679808000.0,
594
+ "trial_name": null,
595
+ "trial_params": null
596
+ }
scaling_performance/2000/L4/.DS_Store CHANGED
Binary files a/scaling_performance/2000/L4/.DS_Store and b/scaling_performance/2000/L4/.DS_Store differ
 
scaling_performance/2000/L4/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.6975,
3
+ "test_loss": 0.5687375068664551,
4
+ "test_macro_f1": 0.6942881145029971,
5
+ "test_runtime": 1.6145,
6
+ "test_samples_per_second": 247.752,
7
+ "test_steps_per_second": 2.478
8
+ }
scaling_performance/2000/L4/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 4,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "problem_type": "single_label_classification",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.28.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
scaling_performance/2000/L4/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.6975,
3
+ "test_loss": 0.5687375068664551,
4
+ "test_macro_f1": 0.6942881145029971,
5
+ "test_runtime": 1.6145,
6
+ "test_samples_per_second": 247.752,
7
+ "test_steps_per_second": 2.478
8
+ }
scaling_performance/2000/L4/trainer_state.json ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6825920343399048,
3
+ "best_model_checkpoint": "./models/240626_geneformer_CellClassifier_PM25_Layers4_L2048_B26_LR5e-05_LSlinear_WU600_E20_Oadamw_F0_fold4/checkpoint-128",
4
+ "epoch": 10.0,
5
+ "global_step": 160,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 1.6666666666666668e-07,
13
+ "loss": 0.6905,
14
+ "step": 2
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 3.3333333333333335e-07,
19
+ "loss": 0.7029,
20
+ "step": 4
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 5.000000000000001e-07,
25
+ "loss": 0.6978,
26
+ "step": 6
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 6.666666666666667e-07,
31
+ "loss": 0.6968,
32
+ "step": 8
33
+ },
34
+ {
35
+ "epoch": 0.62,
36
+ "learning_rate": 8.333333333333333e-07,
37
+ "loss": 0.6954,
38
+ "step": 10
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 1.0000000000000002e-06,
43
+ "loss": 0.7,
44
+ "step": 12
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 1.1666666666666668e-06,
49
+ "loss": 0.697,
50
+ "step": 14
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "learning_rate": 1.3333333333333334e-06,
55
+ "loss": 0.6996,
56
+ "step": 16
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_accuracy": 0.5,
61
+ "eval_loss": 0.6958171129226685,
62
+ "eval_macro_f1": 0.493298877657014,
63
+ "eval_runtime": 1.1509,
64
+ "eval_samples_per_second": 347.552,
65
+ "eval_steps_per_second": 3.476,
66
+ "step": 16
67
+ },
68
+ {
69
+ "epoch": 1.12,
70
+ "learning_rate": 1.5e-06,
71
+ "loss": 0.69,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 1.25,
76
+ "learning_rate": 1.6666666666666667e-06,
77
+ "loss": 0.7017,
78
+ "step": 20
79
+ },
80
+ {
81
+ "epoch": 1.38,
82
+ "learning_rate": 1.8333333333333335e-06,
83
+ "loss": 0.6939,
84
+ "step": 22
85
+ },
86
+ {
87
+ "epoch": 1.5,
88
+ "learning_rate": 2.0000000000000003e-06,
89
+ "loss": 0.6928,
90
+ "step": 24
91
+ },
92
+ {
93
+ "epoch": 1.62,
94
+ "learning_rate": 2.166666666666667e-06,
95
+ "loss": 0.699,
96
+ "step": 26
97
+ },
98
+ {
99
+ "epoch": 1.75,
100
+ "learning_rate": 2.3333333333333336e-06,
101
+ "loss": 0.6927,
102
+ "step": 28
103
+ },
104
+ {
105
+ "epoch": 1.88,
106
+ "learning_rate": 2.5e-06,
107
+ "loss": 0.6986,
108
+ "step": 30
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "learning_rate": 2.666666666666667e-06,
113
+ "loss": 0.6999,
114
+ "step": 32
115
+ },
116
+ {
117
+ "epoch": 2.0,
118
+ "eval_accuracy": 0.51,
119
+ "eval_loss": 0.6933969259262085,
120
+ "eval_macro_f1": 0.5090057366166487,
121
+ "eval_runtime": 1.199,
122
+ "eval_samples_per_second": 333.601,
123
+ "eval_steps_per_second": 3.336,
124
+ "step": 32
125
+ },
126
+ {
127
+ "epoch": 2.12,
128
+ "learning_rate": 2.8333333333333335e-06,
129
+ "loss": 0.6941,
130
+ "step": 34
131
+ },
132
+ {
133
+ "epoch": 2.25,
134
+ "learning_rate": 3e-06,
135
+ "loss": 0.6918,
136
+ "step": 36
137
+ },
138
+ {
139
+ "epoch": 2.38,
140
+ "learning_rate": 3.166666666666667e-06,
141
+ "loss": 0.6899,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 2.5,
146
+ "learning_rate": 3.3333333333333333e-06,
147
+ "loss": 0.6906,
148
+ "step": 40
149
+ },
150
+ {
151
+ "epoch": 2.62,
152
+ "learning_rate": 3.5000000000000004e-06,
153
+ "loss": 0.6914,
154
+ "step": 42
155
+ },
156
+ {
157
+ "epoch": 2.75,
158
+ "learning_rate": 3.666666666666667e-06,
159
+ "loss": 0.6919,
160
+ "step": 44
161
+ },
162
+ {
163
+ "epoch": 2.88,
164
+ "learning_rate": 3.833333333333334e-06,
165
+ "loss": 0.7009,
166
+ "step": 46
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "learning_rate": 4.000000000000001e-06,
171
+ "loss": 0.6881,
172
+ "step": 48
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_accuracy": 0.545,
177
+ "eval_loss": 0.6895008087158203,
178
+ "eval_macro_f1": 0.4814814814814815,
179
+ "eval_runtime": 1.1808,
180
+ "eval_samples_per_second": 338.766,
181
+ "eval_steps_per_second": 3.388,
182
+ "step": 48
183
+ },
184
+ {
185
+ "epoch": 3.12,
186
+ "learning_rate": 4.166666666666667e-06,
187
+ "loss": 0.6938,
188
+ "step": 50
189
+ },
190
+ {
191
+ "epoch": 3.25,
192
+ "learning_rate": 4.333333333333334e-06,
193
+ "loss": 0.6876,
194
+ "step": 52
195
+ },
196
+ {
197
+ "epoch": 3.38,
198
+ "learning_rate": 4.5e-06,
199
+ "loss": 0.6855,
200
+ "step": 54
201
+ },
202
+ {
203
+ "epoch": 3.5,
204
+ "learning_rate": 4.666666666666667e-06,
205
+ "loss": 0.6973,
206
+ "step": 56
207
+ },
208
+ {
209
+ "epoch": 3.62,
210
+ "learning_rate": 4.833333333333333e-06,
211
+ "loss": 0.69,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 3.75,
216
+ "learning_rate": 5e-06,
217
+ "loss": 0.6902,
218
+ "step": 60
219
+ },
220
+ {
221
+ "epoch": 3.88,
222
+ "learning_rate": 5.166666666666667e-06,
223
+ "loss": 0.6913,
224
+ "step": 62
225
+ },
226
+ {
227
+ "epoch": 4.0,
228
+ "learning_rate": 5.333333333333334e-06,
229
+ "loss": 0.6799,
230
+ "step": 64
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_accuracy": 0.5625,
235
+ "eval_loss": 0.6866650581359863,
236
+ "eval_macro_f1": 0.41072976909025094,
237
+ "eval_runtime": 1.1114,
238
+ "eval_samples_per_second": 359.916,
239
+ "eval_steps_per_second": 3.599,
240
+ "step": 64
241
+ },
242
+ {
243
+ "epoch": 4.12,
244
+ "learning_rate": 5.500000000000001e-06,
245
+ "loss": 0.6976,
246
+ "step": 66
247
+ },
248
+ {
249
+ "epoch": 4.25,
250
+ "learning_rate": 5.666666666666667e-06,
251
+ "loss": 0.6823,
252
+ "step": 68
253
+ },
254
+ {
255
+ "epoch": 4.38,
256
+ "learning_rate": 5.833333333333334e-06,
257
+ "loss": 0.6863,
258
+ "step": 70
259
+ },
260
+ {
261
+ "epoch": 4.5,
262
+ "learning_rate": 6e-06,
263
+ "loss": 0.6955,
264
+ "step": 72
265
+ },
266
+ {
267
+ "epoch": 4.62,
268
+ "learning_rate": 6.166666666666667e-06,
269
+ "loss": 0.6842,
270
+ "step": 74
271
+ },
272
+ {
273
+ "epoch": 4.75,
274
+ "learning_rate": 6.333333333333334e-06,
275
+ "loss": 0.6873,
276
+ "step": 76
277
+ },
278
+ {
279
+ "epoch": 4.88,
280
+ "learning_rate": 6.5000000000000004e-06,
281
+ "loss": 0.6987,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 5.0,
286
+ "learning_rate": 6.666666666666667e-06,
287
+ "loss": 0.6779,
288
+ "step": 80
289
+ },
290
+ {
291
+ "epoch": 5.0,
292
+ "eval_accuracy": 0.5675,
293
+ "eval_loss": 0.6847481727600098,
294
+ "eval_macro_f1": 0.3673142857142857,
295
+ "eval_runtime": 1.2206,
296
+ "eval_samples_per_second": 327.716,
297
+ "eval_steps_per_second": 3.277,
298
+ "step": 80
299
+ },
300
+ {
301
+ "epoch": 5.12,
302
+ "learning_rate": 6.833333333333333e-06,
303
+ "loss": 0.695,
304
+ "step": 82
305
+ },
306
+ {
307
+ "epoch": 5.25,
308
+ "learning_rate": 7.000000000000001e-06,
309
+ "loss": 0.689,
310
+ "step": 84
311
+ },
312
+ {
313
+ "epoch": 5.38,
314
+ "learning_rate": 7.166666666666667e-06,
315
+ "loss": 0.6814,
316
+ "step": 86
317
+ },
318
+ {
319
+ "epoch": 5.5,
320
+ "learning_rate": 7.333333333333334e-06,
321
+ "loss": 0.693,
322
+ "step": 88
323
+ },
324
+ {
325
+ "epoch": 5.62,
326
+ "learning_rate": 7.5e-06,
327
+ "loss": 0.6876,
328
+ "step": 90
329
+ },
330
+ {
331
+ "epoch": 5.75,
332
+ "learning_rate": 7.666666666666667e-06,
333
+ "loss": 0.6822,
334
+ "step": 92
335
+ },
336
+ {
337
+ "epoch": 5.88,
338
+ "learning_rate": 7.833333333333333e-06,
339
+ "loss": 0.6933,
340
+ "step": 94
341
+ },
342
+ {
343
+ "epoch": 6.0,
344
+ "learning_rate": 8.000000000000001e-06,
345
+ "loss": 0.6579,
346
+ "step": 96
347
+ },
348
+ {
349
+ "epoch": 6.0,
350
+ "eval_accuracy": 0.57,
351
+ "eval_loss": 0.6842648386955261,
352
+ "eval_macro_f1": 0.3630573248407643,
353
+ "eval_runtime": 1.1578,
354
+ "eval_samples_per_second": 345.493,
355
+ "eval_steps_per_second": 3.455,
356
+ "step": 96
357
+ },
358
+ {
359
+ "epoch": 6.12,
360
+ "learning_rate": 8.166666666666668e-06,
361
+ "loss": 0.6891,
362
+ "step": 98
363
+ },
364
+ {
365
+ "epoch": 6.25,
366
+ "learning_rate": 8.333333333333334e-06,
367
+ "loss": 0.6925,
368
+ "step": 100
369
+ },
370
+ {
371
+ "epoch": 6.38,
372
+ "learning_rate": 8.500000000000002e-06,
373
+ "loss": 0.6739,
374
+ "step": 102
375
+ },
376
+ {
377
+ "epoch": 6.5,
378
+ "learning_rate": 8.666666666666668e-06,
379
+ "loss": 0.6974,
380
+ "step": 104
381
+ },
382
+ {
383
+ "epoch": 6.62,
384
+ "learning_rate": 8.833333333333334e-06,
385
+ "loss": 0.6804,
386
+ "step": 106
387
+ },
388
+ {
389
+ "epoch": 6.75,
390
+ "learning_rate": 9e-06,
391
+ "loss": 0.6762,
392
+ "step": 108
393
+ },
394
+ {
395
+ "epoch": 6.88,
396
+ "learning_rate": 9.166666666666666e-06,
397
+ "loss": 0.7002,
398
+ "step": 110
399
+ },
400
+ {
401
+ "epoch": 7.0,
402
+ "learning_rate": 9.333333333333334e-06,
403
+ "loss": 0.669,
404
+ "step": 112
405
+ },
406
+ {
407
+ "epoch": 7.0,
408
+ "eval_accuracy": 0.5725,
409
+ "eval_loss": 0.683117687702179,
410
+ "eval_macro_f1": 0.3640699523052464,
411
+ "eval_runtime": 1.117,
412
+ "eval_samples_per_second": 358.09,
413
+ "eval_steps_per_second": 3.581,
414
+ "step": 112
415
+ },
416
+ {
417
+ "epoch": 7.12,
418
+ "learning_rate": 9.5e-06,
419
+ "loss": 0.6975,
420
+ "step": 114
421
+ },
422
+ {
423
+ "epoch": 7.25,
424
+ "learning_rate": 9.666666666666667e-06,
425
+ "loss": 0.6847,
426
+ "step": 116
427
+ },
428
+ {
429
+ "epoch": 7.38,
430
+ "learning_rate": 9.833333333333333e-06,
431
+ "loss": 0.6715,
432
+ "step": 118
433
+ },
434
+ {
435
+ "epoch": 7.5,
436
+ "learning_rate": 1e-05,
437
+ "loss": 0.7029,
438
+ "step": 120
439
+ },
440
+ {
441
+ "epoch": 7.62,
442
+ "learning_rate": 1.0166666666666667e-05,
443
+ "loss": 0.6696,
444
+ "step": 122
445
+ },
446
+ {
447
+ "epoch": 7.75,
448
+ "learning_rate": 1.0333333333333333e-05,
449
+ "loss": 0.6602,
450
+ "step": 124
451
+ },
452
+ {
453
+ "epoch": 7.88,
454
+ "learning_rate": 1.05e-05,
455
+ "loss": 0.7126,
456
+ "step": 126
457
+ },
458
+ {
459
+ "epoch": 8.0,
460
+ "learning_rate": 1.0666666666666667e-05,
461
+ "loss": 0.6814,
462
+ "step": 128
463
+ },
464
+ {
465
+ "epoch": 8.0,
466
+ "eval_accuracy": 0.5775,
467
+ "eval_loss": 0.6825920343399048,
468
+ "eval_macro_f1": 0.36608557844690964,
469
+ "eval_runtime": 1.1048,
470
+ "eval_samples_per_second": 362.052,
471
+ "eval_steps_per_second": 3.621,
472
+ "step": 128
473
+ },
474
+ {
475
+ "epoch": 8.12,
476
+ "learning_rate": 1.0833333333333334e-05,
477
+ "loss": 0.6947,
478
+ "step": 130
479
+ },
480
+ {
481
+ "epoch": 8.25,
482
+ "learning_rate": 1.1000000000000001e-05,
483
+ "loss": 0.6809,
484
+ "step": 132
485
+ },
486
+ {
487
+ "epoch": 8.38,
488
+ "learning_rate": 1.1166666666666668e-05,
489
+ "loss": 0.6765,
490
+ "step": 134
491
+ },
492
+ {
493
+ "epoch": 8.5,
494
+ "learning_rate": 1.1333333333333334e-05,
495
+ "loss": 0.6964,
496
+ "step": 136
497
+ },
498
+ {
499
+ "epoch": 8.62,
500
+ "learning_rate": 1.1500000000000002e-05,
501
+ "loss": 0.6771,
502
+ "step": 138
503
+ },
504
+ {
505
+ "epoch": 8.75,
506
+ "learning_rate": 1.1666666666666668e-05,
507
+ "loss": 0.667,
508
+ "step": 140
509
+ },
510
+ {
511
+ "epoch": 8.88,
512
+ "learning_rate": 1.1833333333333334e-05,
513
+ "loss": 0.7035,
514
+ "step": 142
515
+ },
516
+ {
517
+ "epoch": 9.0,
518
+ "learning_rate": 1.2e-05,
519
+ "loss": 0.6773,
520
+ "step": 144
521
+ },
522
+ {
523
+ "epoch": 9.0,
524
+ "eval_accuracy": 0.5725,
525
+ "eval_loss": 0.6831978559494019,
526
+ "eval_macro_f1": 0.3640699523052464,
527
+ "eval_runtime": 1.1552,
528
+ "eval_samples_per_second": 346.251,
529
+ "eval_steps_per_second": 3.463,
530
+ "step": 144
531
+ },
532
+ {
533
+ "epoch": 9.12,
534
+ "learning_rate": 1.2166666666666668e-05,
535
+ "loss": 0.6942,
536
+ "step": 146
537
+ },
538
+ {
539
+ "epoch": 9.25,
540
+ "learning_rate": 1.2333333333333334e-05,
541
+ "loss": 0.6726,
542
+ "step": 148
543
+ },
544
+ {
545
+ "epoch": 9.38,
546
+ "learning_rate": 1.25e-05,
547
+ "loss": 0.6732,
548
+ "step": 150
549
+ },
550
+ {
551
+ "epoch": 9.5,
552
+ "learning_rate": 1.2666666666666668e-05,
553
+ "loss": 0.7068,
554
+ "step": 152
555
+ },
556
+ {
557
+ "epoch": 9.62,
558
+ "learning_rate": 1.2833333333333333e-05,
559
+ "loss": 0.6755,
560
+ "step": 154
561
+ },
562
+ {
563
+ "epoch": 9.75,
564
+ "learning_rate": 1.3000000000000001e-05,
565
+ "loss": 0.6788,
566
+ "step": 156
567
+ },
568
+ {
569
+ "epoch": 9.88,
570
+ "learning_rate": 1.3166666666666665e-05,
571
+ "loss": 0.691,
572
+ "step": 158
573
+ },
574
+ {
575
+ "epoch": 10.0,
576
+ "learning_rate": 1.3333333333333333e-05,
577
+ "loss": 0.669,
578
+ "step": 160
579
+ },
580
+ {
581
+ "epoch": 10.0,
582
+ "eval_accuracy": 0.565,
583
+ "eval_loss": 0.6841627359390259,
584
+ "eval_macro_f1": 0.3950771798080934,
585
+ "eval_runtime": 1.123,
586
+ "eval_samples_per_second": 356.196,
587
+ "eval_steps_per_second": 3.562,
588
+ "step": 160
589
+ }
590
+ ],
591
+ "max_steps": 320,
592
+ "num_train_epochs": 20,
593
+ "total_flos": 427668406272000.0,
594
+ "trial_name": null,
595
+ "trial_params": null
596
+ }
scaling_performance/2000/L6/.DS_Store CHANGED
Binary files a/scaling_performance/2000/L6/.DS_Store and b/scaling_performance/2000/L6/.DS_Store differ
 
scaling_performance/2000/L6/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.5725,
3
+ "test_loss": 0.6847579479217529,
4
+ "test_macro_f1": 0.3746285714285715,
5
+ "test_runtime": 1.2641,
6
+ "test_samples_per_second": 316.42,
7
+ "test_steps_per_second": 3.164
8
+ }
scaling_performance/2000/L6/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.02,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.02,
9
+ "hidden_size": 256,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 512,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 4,
16
+ "num_hidden_layers": 6,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "problem_type": "single_label_classification",
20
+ "torch_dtype": "float32",
21
+ "transformers_version": "4.28.0",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
scaling_performance/2000/L6/eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_accuracy": 0.5725,
3
+ "test_loss": 0.6847579479217529,
4
+ "test_macro_f1": 0.3746285714285715,
5
+ "test_runtime": 1.2641,
6
+ "test_samples_per_second": 316.42,
7
+ "test_steps_per_second": 3.164
8
+ }
scaling_performance/2000/L6/trainer_state.json ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6847579479217529,
3
+ "best_model_checkpoint": "./models/240626_geneformer_CellClassifier_PM25_Layers6_L2048_B26_LR5e-05_LSlinear_WU600_E20_Oadamw_F0_fold4/checkpoint-128",
4
+ "epoch": 10.0,
5
+ "global_step": 160,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 1.6666666666666668e-07,
13
+ "loss": 0.6906,
14
+ "step": 2
15
+ },
16
+ {
17
+ "epoch": 0.25,
18
+ "learning_rate": 3.3333333333333335e-07,
19
+ "loss": 0.702,
20
+ "step": 4
21
+ },
22
+ {
23
+ "epoch": 0.38,
24
+ "learning_rate": 5.000000000000001e-07,
25
+ "loss": 0.7029,
26
+ "step": 6
27
+ },
28
+ {
29
+ "epoch": 0.5,
30
+ "learning_rate": 6.666666666666667e-07,
31
+ "loss": 0.6979,
32
+ "step": 8
33
+ },
34
+ {
35
+ "epoch": 0.62,
36
+ "learning_rate": 8.333333333333333e-07,
37
+ "loss": 0.7027,
38
+ "step": 10
39
+ },
40
+ {
41
+ "epoch": 0.75,
42
+ "learning_rate": 1.0000000000000002e-06,
43
+ "loss": 0.7037,
44
+ "step": 12
45
+ },
46
+ {
47
+ "epoch": 0.88,
48
+ "learning_rate": 1.1666666666666668e-06,
49
+ "loss": 0.6946,
50
+ "step": 14
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "learning_rate": 1.3333333333333334e-06,
55
+ "loss": 0.6957,
56
+ "step": 16
57
+ },
58
+ {
59
+ "epoch": 1.0,
60
+ "eval_accuracy": 0.44,
61
+ "eval_loss": 0.6994468569755554,
62
+ "eval_macro_f1": 0.4141646615754786,
63
+ "eval_runtime": 1.3238,
64
+ "eval_samples_per_second": 302.171,
65
+ "eval_steps_per_second": 3.022,
66
+ "step": 16
67
+ },
68
+ {
69
+ "epoch": 1.12,
70
+ "learning_rate": 1.5e-06,
71
+ "loss": 0.6918,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 1.25,
76
+ "learning_rate": 1.6666666666666667e-06,
77
+ "loss": 0.7014,
78
+ "step": 20
79
+ },
80
+ {
81
+ "epoch": 1.38,
82
+ "learning_rate": 1.8333333333333335e-06,
83
+ "loss": 0.6956,
84
+ "step": 22
85
+ },
86
+ {
87
+ "epoch": 1.5,
88
+ "learning_rate": 2.0000000000000003e-06,
89
+ "loss": 0.6929,
90
+ "step": 24
91
+ },
92
+ {
93
+ "epoch": 1.62,
94
+ "learning_rate": 2.166666666666667e-06,
95
+ "loss": 0.698,
96
+ "step": 26
97
+ },
98
+ {
99
+ "epoch": 1.75,
100
+ "learning_rate": 2.3333333333333336e-06,
101
+ "loss": 0.6998,
102
+ "step": 28
103
+ },
104
+ {
105
+ "epoch": 1.88,
106
+ "learning_rate": 2.5e-06,
107
+ "loss": 0.6942,
108
+ "step": 30
109
+ },
110
+ {
111
+ "epoch": 2.0,
112
+ "learning_rate": 2.666666666666667e-06,
113
+ "loss": 0.6985,
114
+ "step": 32
115
+ },
116
+ {
117
+ "epoch": 2.0,
118
+ "eval_accuracy": 0.475,
119
+ "eval_loss": 0.6957302093505859,
120
+ "eval_macro_f1": 0.4745270743669302,
121
+ "eval_runtime": 1.4915,
122
+ "eval_samples_per_second": 268.182,
123
+ "eval_steps_per_second": 2.682,
124
+ "step": 32
125
+ },
126
+ {
127
+ "epoch": 2.12,
128
+ "learning_rate": 2.8333333333333335e-06,
129
+ "loss": 0.6932,
130
+ "step": 34
131
+ },
132
+ {
133
+ "epoch": 2.25,
134
+ "learning_rate": 3e-06,
135
+ "loss": 0.6924,
136
+ "step": 36
137
+ },
138
+ {
139
+ "epoch": 2.38,
140
+ "learning_rate": 3.166666666666667e-06,
141
+ "loss": 0.694,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 2.5,
146
+ "learning_rate": 3.3333333333333333e-06,
147
+ "loss": 0.6913,
148
+ "step": 40
149
+ },
150
+ {
151
+ "epoch": 2.62,
152
+ "learning_rate": 3.5000000000000004e-06,
153
+ "loss": 0.6875,
154
+ "step": 42
155
+ },
156
+ {
157
+ "epoch": 2.75,
158
+ "learning_rate": 3.666666666666667e-06,
159
+ "loss": 0.6913,
160
+ "step": 44
161
+ },
162
+ {
163
+ "epoch": 2.88,
164
+ "learning_rate": 3.833333333333334e-06,
165
+ "loss": 0.6968,
166
+ "step": 46
167
+ },
168
+ {
169
+ "epoch": 3.0,
170
+ "learning_rate": 4.000000000000001e-06,
171
+ "loss": 0.6905,
172
+ "step": 48
173
+ },
174
+ {
175
+ "epoch": 3.0,
176
+ "eval_accuracy": 0.57,
177
+ "eval_loss": 0.6904626488685608,
178
+ "eval_macro_f1": 0.4724573671942093,
179
+ "eval_runtime": 1.349,
180
+ "eval_samples_per_second": 296.513,
181
+ "eval_steps_per_second": 2.965,
182
+ "step": 48
183
+ },
184
+ {
185
+ "epoch": 3.12,
186
+ "learning_rate": 4.166666666666667e-06,
187
+ "loss": 0.6902,
188
+ "step": 50
189
+ },
190
+ {
191
+ "epoch": 3.25,
192
+ "learning_rate": 4.333333333333334e-06,
193
+ "loss": 0.6873,
194
+ "step": 52
195
+ },
196
+ {
197
+ "epoch": 3.38,
198
+ "learning_rate": 4.5e-06,
199
+ "loss": 0.6838,
200
+ "step": 54
201
+ },
202
+ {
203
+ "epoch": 3.5,
204
+ "learning_rate": 4.666666666666667e-06,
205
+ "loss": 0.6996,
206
+ "step": 56
207
+ },
208
+ {
209
+ "epoch": 3.62,
210
+ "learning_rate": 4.833333333333333e-06,
211
+ "loss": 0.6857,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 3.75,
216
+ "learning_rate": 5e-06,
217
+ "loss": 0.69,
218
+ "step": 60
219
+ },
220
+ {
221
+ "epoch": 3.88,
222
+ "learning_rate": 5.166666666666667e-06,
223
+ "loss": 0.6969,
224
+ "step": 62
225
+ },
226
+ {
227
+ "epoch": 4.0,
228
+ "learning_rate": 5.333333333333334e-06,
229
+ "loss": 0.6771,
230
+ "step": 64
231
+ },
232
+ {
233
+ "epoch": 4.0,
234
+ "eval_accuracy": 0.5725,
235
+ "eval_loss": 0.6874645352363586,
236
+ "eval_macro_f1": 0.38949829969921185,
237
+ "eval_runtime": 1.2979,
238
+ "eval_samples_per_second": 308.193,
239
+ "eval_steps_per_second": 3.082,
240
+ "step": 64
241
+ },
242
+ {
243
+ "epoch": 4.12,
244
+ "learning_rate": 5.500000000000001e-06,
245
+ "loss": 0.6962,
246
+ "step": 66
247
+ },
248
+ {
249
+ "epoch": 4.25,
250
+ "learning_rate": 5.666666666666667e-06,
251
+ "loss": 0.6745,
252
+ "step": 68
253
+ },
254
+ {
255
+ "epoch": 4.38,
256
+ "learning_rate": 5.833333333333334e-06,
257
+ "loss": 0.6831,
258
+ "step": 70
259
+ },
260
+ {
261
+ "epoch": 4.5,
262
+ "learning_rate": 6e-06,
263
+ "loss": 0.6989,
264
+ "step": 72
265
+ },
266
+ {
267
+ "epoch": 4.62,
268
+ "learning_rate": 6.166666666666667e-06,
269
+ "loss": 0.684,
270
+ "step": 74
271
+ },
272
+ {
273
+ "epoch": 4.75,
274
+ "learning_rate": 6.333333333333334e-06,
275
+ "loss": 0.6797,
276
+ "step": 76
277
+ },
278
+ {
279
+ "epoch": 4.88,
280
+ "learning_rate": 6.5000000000000004e-06,
281
+ "loss": 0.6998,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 5.0,
286
+ "learning_rate": 6.666666666666667e-06,
287
+ "loss": 0.6734,
288
+ "step": 80
289
+ },
290
+ {
291
+ "epoch": 5.0,
292
+ "eval_accuracy": 0.5725,
293
+ "eval_loss": 0.6859826445579529,
294
+ "eval_macro_f1": 0.37970998721331994,
295
+ "eval_runtime": 1.544,
296
+ "eval_samples_per_second": 259.064,
297
+ "eval_steps_per_second": 2.591,
298
+ "step": 80
299
+ },
300
+ {
301
+ "epoch": 5.12,
302
+ "learning_rate": 6.833333333333333e-06,
303
+ "loss": 0.6978,
304
+ "step": 82
305
+ },
306
+ {
307
+ "epoch": 5.25,
308
+ "learning_rate": 7.000000000000001e-06,
309
+ "loss": 0.6867,
310
+ "step": 84
311
+ },
312
+ {
313
+ "epoch": 5.38,
314
+ "learning_rate": 7.166666666666667e-06,
315
+ "loss": 0.68,
316
+ "step": 86
317
+ },
318
+ {
319
+ "epoch": 5.5,
320
+ "learning_rate": 7.333333333333334e-06,
321
+ "loss": 0.6922,
322
+ "step": 88
323
+ },
324
+ {
325
+ "epoch": 5.62,
326
+ "learning_rate": 7.5e-06,
327
+ "loss": 0.6833,
328
+ "step": 90
329
+ },
330
+ {
331
+ "epoch": 5.75,
332
+ "learning_rate": 7.666666666666667e-06,
333
+ "loss": 0.6832,
334
+ "step": 92
335
+ },
336
+ {
337
+ "epoch": 5.88,
338
+ "learning_rate": 7.833333333333333e-06,
339
+ "loss": 0.6961,
340
+ "step": 94
341
+ },
342
+ {
343
+ "epoch": 6.0,
344
+ "learning_rate": 8.000000000000001e-06,
345
+ "loss": 0.6562,
346
+ "step": 96
347
+ },
348
+ {
349
+ "epoch": 6.0,
350
+ "eval_accuracy": 0.5725,
351
+ "eval_loss": 0.6861733794212341,
352
+ "eval_macro_f1": 0.37970998721331994,
353
+ "eval_runtime": 1.424,
354
+ "eval_samples_per_second": 280.898,
355
+ "eval_steps_per_second": 2.809,
356
+ "step": 96
357
+ },
358
+ {
359
+ "epoch": 6.12,
360
+ "learning_rate": 8.166666666666668e-06,
361
+ "loss": 0.6936,
362
+ "step": 98
363
+ },
364
+ {
365
+ "epoch": 6.25,
366
+ "learning_rate": 8.333333333333334e-06,
367
+ "loss": 0.6943,
368
+ "step": 100
369
+ },
370
+ {
371
+ "epoch": 6.38,
372
+ "learning_rate": 8.500000000000002e-06,
373
+ "loss": 0.6689,
374
+ "step": 102
375
+ },
376
+ {
377
+ "epoch": 6.5,
378
+ "learning_rate": 8.666666666666668e-06,
379
+ "loss": 0.6919,
380
+ "step": 104
381
+ },
382
+ {
383
+ "epoch": 6.62,
384
+ "learning_rate": 8.833333333333334e-06,
385
+ "loss": 0.674,
386
+ "step": 106
387
+ },
388
+ {
389
+ "epoch": 6.75,
390
+ "learning_rate": 9e-06,
391
+ "loss": 0.6754,
392
+ "step": 108
393
+ },
394
+ {
395
+ "epoch": 6.88,
396
+ "learning_rate": 9.166666666666666e-06,
397
+ "loss": 0.6999,
398
+ "step": 110
399
+ },
400
+ {
401
+ "epoch": 7.0,
402
+ "learning_rate": 9.333333333333334e-06,
403
+ "loss": 0.6714,
404
+ "step": 112
405
+ },
406
+ {
407
+ "epoch": 7.0,
408
+ "eval_accuracy": 0.57,
409
+ "eval_loss": 0.6851080060005188,
410
+ "eval_macro_f1": 0.3735431235431235,
411
+ "eval_runtime": 1.4333,
412
+ "eval_samples_per_second": 279.083,
413
+ "eval_steps_per_second": 2.791,
414
+ "step": 112
415
+ },
416
+ {
417
+ "epoch": 7.12,
418
+ "learning_rate": 9.5e-06,
419
+ "loss": 0.694,
420
+ "step": 114
421
+ },
422
+ {
423
+ "epoch": 7.25,
424
+ "learning_rate": 9.666666666666667e-06,
425
+ "loss": 0.6837,
426
+ "step": 116
427
+ },
428
+ {
429
+ "epoch": 7.38,
430
+ "learning_rate": 9.833333333333333e-06,
431
+ "loss": 0.6692,
432
+ "step": 118
433
+ },
434
+ {
435
+ "epoch": 7.5,
436
+ "learning_rate": 1e-05,
437
+ "loss": 0.7002,
438
+ "step": 120
439
+ },
440
+ {
441
+ "epoch": 7.62,
442
+ "learning_rate": 1.0166666666666667e-05,
443
+ "loss": 0.6706,
444
+ "step": 122
445
+ },
446
+ {
447
+ "epoch": 7.75,
448
+ "learning_rate": 1.0333333333333333e-05,
449
+ "loss": 0.6592,
450
+ "step": 124
451
+ },
452
+ {
453
+ "epoch": 7.88,
454
+ "learning_rate": 1.05e-05,
455
+ "loss": 0.7135,
456
+ "step": 126
457
+ },
458
+ {
459
+ "epoch": 8.0,
460
+ "learning_rate": 1.0666666666666667e-05,
461
+ "loss": 0.6858,
462
+ "step": 128
463
+ },
464
+ {
465
+ "epoch": 8.0,
466
+ "eval_accuracy": 0.5725,
467
+ "eval_loss": 0.6847579479217529,
468
+ "eval_macro_f1": 0.3746285714285715,
469
+ "eval_runtime": 1.2984,
470
+ "eval_samples_per_second": 308.064,
471
+ "eval_steps_per_second": 3.081,
472
+ "step": 128
473
+ },
474
+ {
475
+ "epoch": 8.12,
476
+ "learning_rate": 1.0833333333333334e-05,
477
+ "loss": 0.6926,
478
+ "step": 130
479
+ },
480
+ {
481
+ "epoch": 8.25,
482
+ "learning_rate": 1.1000000000000001e-05,
483
+ "loss": 0.6814,
484
+ "step": 132
485
+ },
486
+ {
487
+ "epoch": 8.38,
488
+ "learning_rate": 1.1166666666666668e-05,
489
+ "loss": 0.669,
490
+ "step": 134
491
+ },
492
+ {
493
+ "epoch": 8.5,
494
+ "learning_rate": 1.1333333333333334e-05,
495
+ "loss": 0.6969,
496
+ "step": 136
497
+ },
498
+ {
499
+ "epoch": 8.62,
500
+ "learning_rate": 1.1500000000000002e-05,
501
+ "loss": 0.6732,
502
+ "step": 138
503
+ },
504
+ {
505
+ "epoch": 8.75,
506
+ "learning_rate": 1.1666666666666668e-05,
507
+ "loss": 0.6704,
508
+ "step": 140
509
+ },
510
+ {
511
+ "epoch": 8.88,
512
+ "learning_rate": 1.1833333333333334e-05,
513
+ "loss": 0.6989,
514
+ "step": 142
515
+ },
516
+ {
517
+ "epoch": 9.0,
518
+ "learning_rate": 1.2e-05,
519
+ "loss": 0.6831,
520
+ "step": 144
521
+ },
522
+ {
523
+ "epoch": 9.0,
524
+ "eval_accuracy": 0.575,
525
+ "eval_loss": 0.6859992742538452,
526
+ "eval_macro_f1": 0.38581596155930487,
527
+ "eval_runtime": 1.4972,
528
+ "eval_samples_per_second": 267.159,
529
+ "eval_steps_per_second": 2.672,
530
+ "step": 144
531
+ },
532
+ {
533
+ "epoch": 9.12,
534
+ "learning_rate": 1.2166666666666668e-05,
535
+ "loss": 0.6949,
536
+ "step": 146
537
+ },
538
+ {
539
+ "epoch": 9.25,
540
+ "learning_rate": 1.2333333333333334e-05,
541
+ "loss": 0.6692,
542
+ "step": 148
543
+ },
544
+ {
545
+ "epoch": 9.38,
546
+ "learning_rate": 1.25e-05,
547
+ "loss": 0.6792,
548
+ "step": 150
549
+ },
550
+ {
551
+ "epoch": 9.5,
552
+ "learning_rate": 1.2666666666666668e-05,
553
+ "loss": 0.6972,
554
+ "step": 152
555
+ },
556
+ {
557
+ "epoch": 9.62,
558
+ "learning_rate": 1.2833333333333333e-05,
559
+ "loss": 0.6766,
560
+ "step": 154
561
+ },
562
+ {
563
+ "epoch": 9.75,
564
+ "learning_rate": 1.3000000000000001e-05,
565
+ "loss": 0.678,
566
+ "step": 156
567
+ },
568
+ {
569
+ "epoch": 9.88,
570
+ "learning_rate": 1.3166666666666665e-05,
571
+ "loss": 0.6887,
572
+ "step": 158
573
+ },
574
+ {
575
+ "epoch": 10.0,
576
+ "learning_rate": 1.3333333333333333e-05,
577
+ "loss": 0.6676,
578
+ "step": 160
579
+ },
580
+ {
581
+ "epoch": 10.0,
582
+ "eval_accuracy": 0.575,
583
+ "eval_loss": 0.6874198913574219,
584
+ "eval_macro_f1": 0.42159164370045255,
585
+ "eval_runtime": 1.4475,
586
+ "eval_samples_per_second": 276.347,
587
+ "eval_steps_per_second": 2.763,
588
+ "step": 160
589
+ }
590
+ ],
591
+ "max_steps": 320,
592
+ "num_train_epochs": 20,
593
+ "total_flos": 634934132736000.0,
594
+ "trial_name": null,
595
+ "trial_params": null
596
+ }
scaling_performance/2000/fine-tuned/.DS_Store CHANGED
Binary files a/scaling_performance/2000/fine-tuned/.DS_Store and b/scaling_performance/2000/fine-tuned/.DS_Store differ