Emil7018 commited on
Commit
dd2f84b
·
verified ·
1 Parent(s): c52d45c

Training in progress, epoch 1

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf658bcb3d6e926ee8e4606c9b8822870aeb69f28f4d04ca4eeb4ef7b23b422c
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d0ed4dbafa14012febf6308c9744c75bd232a710108fe5bac07bdb489e756bd
3
  size 598898116
run-0/checkpoint-1272/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b907c506df32ce38d7a707871e5d9a0026c32cf7bae7ea2a4214047a26467f97
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33e6dbc017e7f64ef220016fdf9f9fba336aee3835bc3d931dd5126c94babbfe
3
  size 598898116
run-0/checkpoint-1272/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33a3708f7d8d8c9159f3183f4973427edd1e5707aed0c0628abddd827ec93310
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:535b6a8cdba5634d3fa7bc11df77c8659c420cab7e6b5f4eefa671b370f15ad7
3
  size 1197886411
run-0/checkpoint-1272/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd8d98d53657814f4efe9b4118b1c951c2a1001b48ef247f242d5ab768eba07d
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:353485bb1bbc64112c48bfd9c1dfdca238051fa65ea0255593541aaa6b2ac608
3
  size 1383
run-0/checkpoint-1272/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 1272,
3
- "best_metric": 0.9419235697866993,
4
  "best_model_checkpoint": "ModernBERT-base-distilled/run-0/checkpoint-1272",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
@@ -11,74 +11,70 @@
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
- "grad_norm": 7.658935546875,
15
  "learning_rate": 1.5015723270440253e-05,
16
- "loss": 4.0156,
17
  "step": 318
18
  },
19
  {
20
  "epoch": 1.0,
21
- "eval_accuracy": 0.8916129032258064,
22
- "eval_f1": 0.8897418223159607,
23
- "eval_loss": 2.2384302616119385,
24
- "eval_model_preparation_time": 0.0029,
25
- "eval_runtime": 17.8082,
26
- "eval_samples_per_second": 174.077,
27
- "eval_steps_per_second": 3.65,
28
  "step": 318
29
  },
30
  {
31
  "epoch": 2.0,
32
- "grad_norm": 6.194184303283691,
33
  "learning_rate": 1.0015723270440252e-05,
34
- "loss": 1.6502,
35
  "step": 636
36
  },
37
  {
38
  "epoch": 2.0,
39
- "eval_accuracy": 0.9319354838709677,
40
- "eval_f1": 0.9309773954881618,
41
- "eval_loss": 1.2816612720489502,
42
- "eval_model_preparation_time": 0.0029,
43
- "eval_runtime": 17.5976,
44
- "eval_samples_per_second": 176.161,
45
- "eval_steps_per_second": 3.694,
46
  "step": 636
47
  },
48
  {
49
  "epoch": 3.0,
50
- "grad_norm": 5.453166484832764,
51
  "learning_rate": 5.015723270440252e-06,
52
- "loss": 1.0071,
53
  "step": 954
54
  },
55
  {
56
  "epoch": 3.0,
57
- "eval_accuracy": 0.942258064516129,
58
- "eval_f1": 0.9412083289174236,
59
- "eval_loss": 1.0026001930236816,
60
- "eval_model_preparation_time": 0.0029,
61
- "eval_runtime": 17.5687,
62
- "eval_samples_per_second": 176.45,
63
- "eval_steps_per_second": 3.7,
64
  "step": 954
65
  },
66
  {
67
  "epoch": 4.0,
68
- "grad_norm": 3.245698928833008,
69
  "learning_rate": 1.5723270440251573e-08,
70
- "loss": 0.7999,
71
  "step": 1272
72
  },
73
  {
74
  "epoch": 4.0,
75
- "eval_accuracy": 0.9429032258064516,
76
- "eval_f1": 0.9419235697866993,
77
- "eval_loss": 0.9279481172561646,
78
- "eval_model_preparation_time": 0.0029,
79
- "eval_runtime": 17.6313,
80
- "eval_samples_per_second": 175.824,
81
- "eval_steps_per_second": 3.687,
82
  "step": 1272
83
  }
84
  ],
@@ -108,12 +104,12 @@
108
  "attributes": {}
109
  }
110
  },
111
- "total_flos": 901477353837708.0,
112
  "train_batch_size": 48,
113
  "trial_name": null,
114
  "trial_params": {
115
- "alpha": 0.0728812223134534,
116
  "num_train_epochs": 4,
117
- "temperature": 15
118
  }
119
  }
 
1
  {
2
  "best_global_step": 1272,
3
+ "best_metric": 0.9489170488645581,
4
  "best_model_checkpoint": "ModernBERT-base-distilled/run-0/checkpoint-1272",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
+ "grad_norm": 7.095359802246094,
15
  "learning_rate": 1.5015723270440253e-05,
16
+ "loss": 4.2443,
17
  "step": 318
18
  },
19
  {
20
  "epoch": 1.0,
21
+ "eval_accuracy": 0.9070967741935484,
22
+ "eval_f1": 0.9051373620810793,
23
+ "eval_loss": 2.06101655960083,
24
+ "eval_runtime": 16.6455,
25
+ "eval_samples_per_second": 186.236,
26
+ "eval_steps_per_second": 3.905,
 
27
  "step": 318
28
  },
29
  {
30
  "epoch": 2.0,
31
+ "grad_norm": 6.176209449768066,
32
  "learning_rate": 1.0015723270440252e-05,
33
+ "loss": 1.4635,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 2.0,
38
+ "eval_accuracy": 0.9425806451612904,
39
+ "eval_f1": 0.9414720571915698,
40
+ "eval_loss": 1.1604701280593872,
41
+ "eval_runtime": 16.5771,
42
+ "eval_samples_per_second": 187.005,
43
+ "eval_steps_per_second": 3.921,
 
44
  "step": 636
45
  },
46
  {
47
  "epoch": 3.0,
48
+ "grad_norm": 4.269500255584717,
49
  "learning_rate": 5.015723270440252e-06,
50
+ "loss": 0.8556,
51
  "step": 954
52
  },
53
  {
54
  "epoch": 3.0,
55
+ "eval_accuracy": 0.9490322580645161,
56
+ "eval_f1": 0.9483440674472702,
57
+ "eval_loss": 0.8985261917114258,
58
+ "eval_runtime": 16.5526,
59
+ "eval_samples_per_second": 187.281,
60
+ "eval_steps_per_second": 3.927,
 
61
  "step": 954
62
  },
63
  {
64
  "epoch": 4.0,
65
+ "grad_norm": 2.268059015274048,
66
  "learning_rate": 1.5723270440251573e-08,
67
+ "loss": 0.6622,
68
  "step": 1272
69
  },
70
  {
71
  "epoch": 4.0,
72
+ "eval_accuracy": 0.9496774193548387,
73
+ "eval_f1": 0.9489170488645581,
74
+ "eval_loss": 0.8302651047706604,
75
+ "eval_runtime": 16.8761,
76
+ "eval_samples_per_second": 183.692,
77
+ "eval_steps_per_second": 3.852,
 
78
  "step": 1272
79
  }
80
  ],
 
104
  "attributes": {}
105
  }
106
  },
107
+ "total_flos": 1007104147373484.0,
108
  "train_batch_size": 48,
109
  "trial_name": null,
110
  "trial_params": {
111
+ "alpha": 0.362336107066899,
112
  "num_train_epochs": 4,
113
+ "temperature": 10
114
  }
115
  }
run-0/checkpoint-1272/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:080327bf308da3daf1342344694cc74997103a3dbc9b586e2e8dc0503c895f98
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e57990bfcffc7508b1aa41cf9c14f9a33fd6a76ae0e5b564184c21c4e65e1398
3
  size 5905
run-0/checkpoint-954/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57f80ba04e05067b7be0cc159a0131656a1feb8b2d3f603b75429e0d8a499f84
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:556f7f3f069edc811e32fdb21dbb7851b0b99ce22f87a1fe6ba9e29fd7412872
3
  size 598898116
run-0/checkpoint-954/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16657c9327a7680545b663c22830919c89a19c29ac5a37ebe774ec442158e1b1
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4874410d1e9e2085b42708e088a24661bb2a1629bceff5d3bf81b4484d2e7bb7
3
  size 1197886411
run-0/checkpoint-954/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09a405e0a62be2573bcce9ada4e872e06fcdfbb19dcc19f52fadc704a3877584
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f8d06ed524870ae790a674ab8105f40437ad4925ef629d9d365a68268f03ed3
3
  size 1383
run-0/checkpoint-954/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 954,
3
- "best_metric": 0.9412083289174236,
4
  "best_model_checkpoint": "ModernBERT-base-distilled/run-0/checkpoint-954",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
@@ -11,56 +11,53 @@
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
- "grad_norm": 7.658935546875,
15
  "learning_rate": 1.5015723270440253e-05,
16
- "loss": 4.0156,
17
  "step": 318
18
  },
19
  {
20
  "epoch": 1.0,
21
- "eval_accuracy": 0.8916129032258064,
22
- "eval_f1": 0.8897418223159607,
23
- "eval_loss": 2.2384302616119385,
24
- "eval_model_preparation_time": 0.0029,
25
- "eval_runtime": 17.8082,
26
- "eval_samples_per_second": 174.077,
27
- "eval_steps_per_second": 3.65,
28
  "step": 318
29
  },
30
  {
31
  "epoch": 2.0,
32
- "grad_norm": 6.194184303283691,
33
  "learning_rate": 1.0015723270440252e-05,
34
- "loss": 1.6502,
35
  "step": 636
36
  },
37
  {
38
  "epoch": 2.0,
39
- "eval_accuracy": 0.9319354838709677,
40
- "eval_f1": 0.9309773954881618,
41
- "eval_loss": 1.2816612720489502,
42
- "eval_model_preparation_time": 0.0029,
43
- "eval_runtime": 17.5976,
44
- "eval_samples_per_second": 176.161,
45
- "eval_steps_per_second": 3.694,
46
  "step": 636
47
  },
48
  {
49
  "epoch": 3.0,
50
- "grad_norm": 5.453166484832764,
51
  "learning_rate": 5.015723270440252e-06,
52
- "loss": 1.0071,
53
  "step": 954
54
  },
55
  {
56
  "epoch": 3.0,
57
- "eval_accuracy": 0.942258064516129,
58
- "eval_f1": 0.9412083289174236,
59
- "eval_loss": 1.0026001930236816,
60
- "eval_model_preparation_time": 0.0029,
61
- "eval_runtime": 17.5687,
62
- "eval_samples_per_second": 176.45,
63
- "eval_steps_per_second": 3.7,
64
  "step": 954
65
  }
66
  ],
@@ -90,12 +87,12 @@
90
  "attributes": {}
91
  }
92
  },
93
- "total_flos": 676280902103244.0,
94
  "train_batch_size": 48,
95
  "trial_name": null,
96
  "trial_params": {
97
- "alpha": 0.0728812223134534,
98
  "num_train_epochs": 4,
99
- "temperature": 15
100
  }
101
  }
 
1
  {
2
  "best_global_step": 954,
3
+ "best_metric": 0.9483440674472702,
4
  "best_model_checkpoint": "ModernBERT-base-distilled/run-0/checkpoint-954",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
+ "grad_norm": 7.095359802246094,
15
  "learning_rate": 1.5015723270440253e-05,
16
+ "loss": 4.2443,
17
  "step": 318
18
  },
19
  {
20
  "epoch": 1.0,
21
+ "eval_accuracy": 0.9070967741935484,
22
+ "eval_f1": 0.9051373620810793,
23
+ "eval_loss": 2.06101655960083,
24
+ "eval_runtime": 16.6455,
25
+ "eval_samples_per_second": 186.236,
26
+ "eval_steps_per_second": 3.905,
 
27
  "step": 318
28
  },
29
  {
30
  "epoch": 2.0,
31
+ "grad_norm": 6.176209449768066,
32
  "learning_rate": 1.0015723270440252e-05,
33
+ "loss": 1.4635,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 2.0,
38
+ "eval_accuracy": 0.9425806451612904,
39
+ "eval_f1": 0.9414720571915698,
40
+ "eval_loss": 1.1604701280593872,
41
+ "eval_runtime": 16.5771,
42
+ "eval_samples_per_second": 187.005,
43
+ "eval_steps_per_second": 3.921,
 
44
  "step": 636
45
  },
46
  {
47
  "epoch": 3.0,
48
+ "grad_norm": 4.269500255584717,
49
  "learning_rate": 5.015723270440252e-06,
50
+ "loss": 0.8556,
51
  "step": 954
52
  },
53
  {
54
  "epoch": 3.0,
55
+ "eval_accuracy": 0.9490322580645161,
56
+ "eval_f1": 0.9483440674472702,
57
+ "eval_loss": 0.8985261917114258,
58
+ "eval_runtime": 16.5526,
59
+ "eval_samples_per_second": 187.281,
60
+ "eval_steps_per_second": 3.927,
 
61
  "step": 954
62
  }
63
  ],
 
87
  "attributes": {}
88
  }
89
  },
90
+ "total_flos": 781907695639020.0,
91
  "train_batch_size": 48,
92
  "trial_name": null,
93
  "trial_params": {
94
+ "alpha": 0.362336107066899,
95
  "num_train_epochs": 4,
96
+ "temperature": 10
97
  }
98
  }
run-0/checkpoint-954/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:080327bf308da3daf1342344694cc74997103a3dbc9b586e2e8dc0503c895f98
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e57990bfcffc7508b1aa41cf9c14f9a33fd6a76ae0e5b564184c21c4e65e1398
3
  size 5905
run-1/checkpoint-318/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbe5fc7ba60f16342a2cea677db69c8ae5f868fc72eeac6a4b913d0533dcab28
3
  size 598898116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d0ed4dbafa14012febf6308c9744c75bd232a710108fe5bac07bdb489e756bd
3
  size 598898116
run-1/checkpoint-318/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b11b582e5ffc77ee3ed977efe9243dc7503ddbfa0daca77e34d8be765bdba076
3
  size 1197886411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1a1f3fe531a83a4ec091d28753109f9c0da4c20e130b3462720cecb12f2e7e9
3
  size 1197886411
run-1/checkpoint-318/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c75430e9735ac2401cc5dd787d1ea32f7e26f6e97fda72fd2cd23a50888689e7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f5820fad84020b09d881b4b6d6ce78d3731e06e52d3401636c1604c2a2630f2
3
  size 1465
run-1/checkpoint-318/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 318,
3
- "best_metric": 0.8534546726797415,
4
  "best_model_checkpoint": "ModernBERT-base-distilled/run-1/checkpoint-318",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
@@ -11,26 +11,26 @@
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
- "grad_norm": 7.506621360778809,
15
- "learning_rate": 1.3354297693920338e-05,
16
- "loss": 3.6598,
17
  "step": 318
18
  },
19
  {
20
  "epoch": 1.0,
21
- "eval_accuracy": 0.8554838709677419,
22
- "eval_f1": 0.8534546726797415,
23
- "eval_loss": 2.181856393814087,
24
- "eval_runtime": 16.5545,
25
- "eval_samples_per_second": 187.26,
26
- "eval_steps_per_second": 3.926,
27
  "step": 318
28
  }
29
  ],
30
  "logging_steps": 500,
31
- "max_steps": 954,
32
  "num_input_tokens_seen": 0,
33
- "num_train_epochs": 3,
34
  "save_steps": 500,
35
  "stateful_callbacks": {
36
  "EarlyStoppingCallback": {
@@ -57,8 +57,8 @@
57
  "train_batch_size": 48,
58
  "trial_name": null,
59
  "trial_params": {
60
- "alpha": 0.008550262771937045,
61
- "num_train_epochs": 3,
62
- "temperature": 20
63
  }
64
  }
 
1
  {
2
  "best_global_step": 318,
3
+ "best_metric": 0.9026327188116668,
4
  "best_model_checkpoint": "ModernBERT-base-distilled/run-1/checkpoint-318",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 1.0,
14
+ "grad_norm": 9.185647964477539,
15
+ "learning_rate": 1.5015723270440253e-05,
16
+ "loss": 5.2764,
17
  "step": 318
18
  },
19
  {
20
  "epoch": 1.0,
21
+ "eval_accuracy": 0.9051612903225806,
22
+ "eval_f1": 0.9026327188116668,
23
+ "eval_loss": 2.4359920024871826,
24
+ "eval_runtime": 16.8274,
25
+ "eval_samples_per_second": 184.223,
26
+ "eval_steps_per_second": 3.863,
27
  "step": 318
28
  }
29
  ],
30
  "logging_steps": 500,
31
+ "max_steps": 1272,
32
  "num_input_tokens_seen": 0,
33
+ "num_train_epochs": 4,
34
  "save_steps": 500,
35
  "stateful_callbacks": {
36
  "EarlyStoppingCallback": {
 
57
  "train_batch_size": 48,
58
  "trial_name": null,
59
  "trial_params": {
60
+ "alpha": 0.1772263993418024,
61
+ "num_train_epochs": 4,
62
+ "temperature": 9
63
  }
64
  }
run-1/checkpoint-318/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4d1f48d83021324c19290eb8526bd37db5d8c6569e5318a75916976ce927e32
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758afc652d099f48a625a20bc0a6608ca13318a564c979a2c3084b7d294ae3f3
3
  size 5905
runs/Oct11_19-55-54_cd07aeb3aeb4/events.out.tfevents.1760213687.cd07aeb3aeb4.2948.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4325dfbf291448b9a1983f1cfad1c5aec6e8ea31c0fdf2bbb3ee2ec29d3515ed
3
- size 28072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1096071cb8f68624662034c2365584aa867e8f143e61a07502f0b65d16341fb
3
+ size 29586
runs/Oct11_19-55-54_cd07aeb3aeb4/events.out.tfevents.1760214697.cd07aeb3aeb4.2948.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af9663bb3be7e58d752f689c4a9c6f6a950849257d681b9eca968e5ed7459922
3
+ size 13790
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e57990bfcffc7508b1aa41cf9c14f9a33fd6a76ae0e5b564184c21c4e65e1398
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758afc652d099f48a625a20bc0a6608ca13318a564c979a2c3084b7d294ae3f3
3
  size 5905