heavyhelium commited on
Commit
c7dfdac
·
verified ·
1 Parent(s): c3dd923

Training in progress, epoch 5, checkpoint

Browse files
checkpoint-235/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6f6a8505c36e37b1b5fcd38d4455a343639019d4e6f70f57493b8dc37fed351
3
  size 54221200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0554aaea26bf82ed76dceefda2b717a8ddc384a4e0e7fa9c30a0c2105cb23815
3
  size 54221200
checkpoint-235/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5853af63ceaea0b2d7b3ed21127b9b6e8ae5ecbece3d5738b4661f18ada1d05
3
  size 108567563
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db8bdbc0b2d87c1f3d78389504db5af7bacf4489334138009d4ac77628ee068
3
  size 108567563
checkpoint-235/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 235,
3
- "best_metric": 0.65996599659966,
4
  "best_model_checkpoint": "models/electra-small-touche-base-binary/trainer/checkpoint-235",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
@@ -11,218 +11,218 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.2127659574468085,
14
- "grad_norm": 0.7140693068504333,
15
  "learning_rate": 1.125e-05,
16
- "loss": 0.69647216796875,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.425531914893617,
21
- "grad_norm": 1.1431223154067993,
22
  "learning_rate": 2.3749999999999998e-05,
23
- "loss": 0.6931365966796875,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.6382978723404256,
28
- "grad_norm": 0.6381158828735352,
29
  "learning_rate": 2.928909952606635e-05,
30
- "loss": 0.69478759765625,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.851063829787234,
35
- "grad_norm": 1.9318883419036865,
36
  "learning_rate": 2.7867298578199053e-05,
37
- "loss": 0.6951080322265625,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 1.0,
42
  "eval_accuracy": 0.555,
43
  "eval_fallacy_f1": 0.3597122302158273,
44
- "eval_loss": 0.6898486614227295,
45
  "eval_macro_f1": 0.5093580308167259,
46
- "eval_runtime": 0.1669,
47
- "eval_samples_per_second": 1198.414,
48
- "eval_steps_per_second": 23.968,
49
  "step": 47
50
  },
51
  {
52
  "epoch": 1.0638297872340425,
53
- "grad_norm": 0.6443229913711548,
54
  "learning_rate": 2.6445497630331753e-05,
55
  "loss": 0.6960784912109375,
56
  "step": 50
57
  },
58
  {
59
  "epoch": 1.2765957446808511,
60
- "grad_norm": 1.3156065940856934,
61
  "learning_rate": 2.5023696682464456e-05,
62
  "loss": 0.6871307373046875,
63
  "step": 60
64
  },
65
  {
66
  "epoch": 1.4893617021276595,
67
- "grad_norm": 0.9663533568382263,
68
  "learning_rate": 2.360189573459716e-05,
69
  "loss": 0.69168701171875,
70
  "step": 70
71
  },
72
  {
73
  "epoch": 1.702127659574468,
74
- "grad_norm": 0.7499622702598572,
75
  "learning_rate": 2.2180094786729858e-05,
76
- "loss": 0.6835113525390625,
77
  "step": 80
78
  },
79
  {
80
  "epoch": 1.9148936170212765,
81
- "grad_norm": 1.1466385126113892,
82
  "learning_rate": 2.075829383886256e-05,
83
- "loss": 0.6876190185546875,
84
  "step": 90
85
  },
86
  {
87
  "epoch": 2.0,
88
  "eval_accuracy": 0.64,
89
  "eval_fallacy_f1": 0.64,
90
- "eval_loss": 0.6786279082298279,
91
  "eval_macro_f1": 0.64,
92
- "eval_runtime": 0.2441,
93
- "eval_samples_per_second": 819.224,
94
- "eval_steps_per_second": 16.384,
95
  "step": 94
96
  },
97
  {
98
  "epoch": 2.127659574468085,
99
- "grad_norm": 0.7131018042564392,
100
  "learning_rate": 1.933649289099526e-05,
101
- "loss": 0.6760498046875,
102
  "step": 100
103
  },
104
  {
105
  "epoch": 2.3404255319148937,
106
- "grad_norm": 0.8485779762268066,
107
  "learning_rate": 1.791469194312796e-05,
108
- "loss": 0.67723388671875,
109
  "step": 110
110
  },
111
  {
112
  "epoch": 2.5531914893617023,
113
- "grad_norm": 1.0046956539154053,
114
  "learning_rate": 1.6492890995260666e-05,
115
- "loss": 0.6680267333984375,
116
  "step": 120
117
  },
118
  {
119
  "epoch": 2.7659574468085104,
120
- "grad_norm": 1.3051499128341675,
121
  "learning_rate": 1.5071090047393365e-05,
122
- "loss": 0.6558563232421875,
123
  "step": 130
124
  },
125
  {
126
  "epoch": 2.978723404255319,
127
- "grad_norm": 1.3270968198776245,
128
  "learning_rate": 1.3649289099526066e-05,
129
- "loss": 0.6429000854492187,
130
  "step": 140
131
  },
132
  {
133
  "epoch": 3.0,
134
  "eval_accuracy": 0.63,
135
  "eval_fallacy_f1": 0.5432098765432098,
136
- "eval_loss": 0.6571618914604187,
137
  "eval_macro_f1": 0.6161427533976553,
138
- "eval_runtime": 0.1787,
139
- "eval_samples_per_second": 1119.018,
140
- "eval_steps_per_second": 22.38,
141
  "step": 141
142
  },
143
  {
144
  "epoch": 3.1914893617021276,
145
- "grad_norm": 1.5919787883758545,
146
  "learning_rate": 1.2227488151658767e-05,
147
- "loss": 0.6370620727539062,
148
  "step": 150
149
  },
150
  {
151
  "epoch": 3.404255319148936,
152
- "grad_norm": 1.217308759689331,
153
  "learning_rate": 1.080568720379147e-05,
154
- "loss": 0.6328628540039063,
155
  "step": 160
156
  },
157
  {
158
  "epoch": 3.617021276595745,
159
- "grad_norm": 2.489976406097412,
160
  "learning_rate": 9.383886255924171e-06,
161
- "loss": 0.6132949829101563,
162
  "step": 170
163
  },
164
  {
165
  "epoch": 3.829787234042553,
166
- "grad_norm": 1.557442307472229,
167
  "learning_rate": 7.962085308056872e-06,
168
- "loss": 0.6257461547851563,
169
  "step": 180
170
  },
171
  {
172
  "epoch": 4.0,
173
  "eval_accuracy": 0.645,
174
  "eval_fallacy_f1": 0.6697674418604651,
175
- "eval_loss": 0.6352868676185608,
176
  "eval_macro_f1": 0.6429918290383407,
177
- "eval_runtime": 0.1618,
178
- "eval_samples_per_second": 1236.069,
179
- "eval_steps_per_second": 24.721,
180
  "step": 188
181
  },
182
  {
183
  "epoch": 4.042553191489362,
184
- "grad_norm": 2.8439695835113525,
185
  "learning_rate": 6.5402843601895735e-06,
186
- "loss": 0.60234375,
187
  "step": 190
188
  },
189
  {
190
  "epoch": 4.25531914893617,
191
- "grad_norm": 1.99105703830719,
192
  "learning_rate": 5.1184834123222755e-06,
193
  "loss": 0.6104934692382813,
194
  "step": 200
195
  },
196
  {
197
  "epoch": 4.468085106382979,
198
- "grad_norm": 1.5985376834869385,
199
  "learning_rate": 3.696682464454976e-06,
200
- "loss": 0.5836532592773438,
201
  "step": 210
202
  },
203
  {
204
  "epoch": 4.680851063829787,
205
- "grad_norm": 1.6915240287780762,
206
  "learning_rate": 2.274881516587678e-06,
207
- "loss": 0.5967620849609375,
208
  "step": 220
209
  },
210
  {
211
  "epoch": 4.8936170212765955,
212
- "grad_norm": 1.8389923572540283,
213
  "learning_rate": 8.530805687203791e-07,
214
- "loss": 0.5855438232421875,
215
  "step": 230
216
  },
217
  {
218
  "epoch": 5.0,
219
- "eval_accuracy": 0.66,
220
- "eval_fallacy_f1": 0.6633663366336634,
221
- "eval_loss": 0.6288647651672363,
222
- "eval_macro_f1": 0.65996599659966,
223
- "eval_runtime": 0.1577,
224
- "eval_samples_per_second": 1268.019,
225
- "eval_steps_per_second": 25.36,
226
  "step": 235
227
  }
228
  ],
 
1
  {
2
  "best_global_step": 235,
3
+ "best_metric": 0.6649246080368083,
4
  "best_model_checkpoint": "models/electra-small-touche-base-binary/trainer/checkpoint-235",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.2127659574468085,
14
+ "grad_norm": 0.7142515182495117,
15
  "learning_rate": 1.125e-05,
16
+ "loss": 0.6964691162109375,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.425531914893617,
21
+ "grad_norm": 1.143205165863037,
22
  "learning_rate": 2.3749999999999998e-05,
23
+ "loss": 0.6931427001953125,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.6382978723404256,
28
+ "grad_norm": 0.6381425857543945,
29
  "learning_rate": 2.928909952606635e-05,
30
+ "loss": 0.6947906494140625,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.851063829787234,
35
+ "grad_norm": 1.9318283796310425,
36
  "learning_rate": 2.7867298578199053e-05,
37
+ "loss": 0.695111083984375,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 1.0,
42
  "eval_accuracy": 0.555,
43
  "eval_fallacy_f1": 0.3597122302158273,
44
+ "eval_loss": 0.6898462176322937,
45
  "eval_macro_f1": 0.5093580308167259,
46
+ "eval_runtime": 0.2531,
47
+ "eval_samples_per_second": 790.252,
48
+ "eval_steps_per_second": 15.805,
49
  "step": 47
50
  },
51
  {
52
  "epoch": 1.0638297872340425,
53
+ "grad_norm": 0.644514262676239,
54
  "learning_rate": 2.6445497630331753e-05,
55
  "loss": 0.6960784912109375,
56
  "step": 50
57
  },
58
  {
59
  "epoch": 1.2765957446808511,
60
+ "grad_norm": 1.315447211265564,
61
  "learning_rate": 2.5023696682464456e-05,
62
  "loss": 0.6871307373046875,
63
  "step": 60
64
  },
65
  {
66
  "epoch": 1.4893617021276595,
67
+ "grad_norm": 0.9659499526023865,
68
  "learning_rate": 2.360189573459716e-05,
69
  "loss": 0.69168701171875,
70
  "step": 70
71
  },
72
  {
73
  "epoch": 1.702127659574468,
74
+ "grad_norm": 0.750037670135498,
75
  "learning_rate": 2.2180094786729858e-05,
76
+ "loss": 0.683502197265625,
77
  "step": 80
78
  },
79
  {
80
  "epoch": 1.9148936170212765,
81
+ "grad_norm": 1.1467636823654175,
82
  "learning_rate": 2.075829383886256e-05,
83
+ "loss": 0.6876312255859375,
84
  "step": 90
85
  },
86
  {
87
  "epoch": 2.0,
88
  "eval_accuracy": 0.64,
89
  "eval_fallacy_f1": 0.64,
90
+ "eval_loss": 0.6786401271820068,
91
  "eval_macro_f1": 0.64,
92
+ "eval_runtime": 0.1813,
93
+ "eval_samples_per_second": 1103.211,
94
+ "eval_steps_per_second": 22.064,
95
  "step": 94
96
  },
97
  {
98
  "epoch": 2.127659574468085,
99
+ "grad_norm": 0.713529646396637,
100
  "learning_rate": 1.933649289099526e-05,
101
+ "loss": 0.6760345458984375,
102
  "step": 100
103
  },
104
  {
105
  "epoch": 2.3404255319148937,
106
+ "grad_norm": 0.8489329814910889,
107
  "learning_rate": 1.791469194312796e-05,
108
+ "loss": 0.6772491455078125,
109
  "step": 110
110
  },
111
  {
112
  "epoch": 2.5531914893617023,
113
+ "grad_norm": 1.0044538974761963,
114
  "learning_rate": 1.6492890995260666e-05,
115
+ "loss": 0.6680450439453125,
116
  "step": 120
117
  },
118
  {
119
  "epoch": 2.7659574468085104,
120
+ "grad_norm": 1.3042397499084473,
121
  "learning_rate": 1.5071090047393365e-05,
122
+ "loss": 0.6558624267578125,
123
  "step": 130
124
  },
125
  {
126
  "epoch": 2.978723404255319,
127
+ "grad_norm": 1.3250319957733154,
128
  "learning_rate": 1.3649289099526066e-05,
129
+ "loss": 0.64290771484375,
130
  "step": 140
131
  },
132
  {
133
  "epoch": 3.0,
134
  "eval_accuracy": 0.63,
135
  "eval_fallacy_f1": 0.5432098765432098,
136
+ "eval_loss": 0.6571679711341858,
137
  "eval_macro_f1": 0.6161427533976553,
138
+ "eval_runtime": 0.2009,
139
+ "eval_samples_per_second": 995.653,
140
+ "eval_steps_per_second": 19.913,
141
  "step": 141
142
  },
143
  {
144
  "epoch": 3.1914893617021276,
145
+ "grad_norm": 1.5915395021438599,
146
  "learning_rate": 1.2227488151658767e-05,
147
+ "loss": 0.6370498657226562,
148
  "step": 150
149
  },
150
  {
151
  "epoch": 3.404255319148936,
152
+ "grad_norm": 1.217298150062561,
153
  "learning_rate": 1.080568720379147e-05,
154
+ "loss": 0.6329071044921875,
155
  "step": 160
156
  },
157
  {
158
  "epoch": 3.617021276595745,
159
+ "grad_norm": 2.48813533782959,
160
  "learning_rate": 9.383886255924171e-06,
161
+ "loss": 0.613275146484375,
162
  "step": 170
163
  },
164
  {
165
  "epoch": 3.829787234042553,
166
+ "grad_norm": 1.5584394931793213,
167
  "learning_rate": 7.962085308056872e-06,
168
+ "loss": 0.625762939453125,
169
  "step": 180
170
  },
171
  {
172
  "epoch": 4.0,
173
  "eval_accuracy": 0.645,
174
  "eval_fallacy_f1": 0.6697674418604651,
175
+ "eval_loss": 0.6353307962417603,
176
  "eval_macro_f1": 0.6429918290383407,
177
+ "eval_runtime": 0.182,
178
+ "eval_samples_per_second": 1098.706,
179
+ "eval_steps_per_second": 21.974,
180
  "step": 188
181
  },
182
  {
183
  "epoch": 4.042553191489362,
184
+ "grad_norm": 2.846815824508667,
185
  "learning_rate": 6.5402843601895735e-06,
186
+ "loss": 0.6023529052734375,
187
  "step": 190
188
  },
189
  {
190
  "epoch": 4.25531914893617,
191
+ "grad_norm": 1.9856239557266235,
192
  "learning_rate": 5.1184834123222755e-06,
193
  "loss": 0.6104934692382813,
194
  "step": 200
195
  },
196
  {
197
  "epoch": 4.468085106382979,
198
+ "grad_norm": 1.5990350246429443,
199
  "learning_rate": 3.696682464454976e-06,
200
+ "loss": 0.5836715698242188,
201
  "step": 210
202
  },
203
  {
204
  "epoch": 4.680851063829787,
205
+ "grad_norm": 1.6923744678497314,
206
  "learning_rate": 2.274881516587678e-06,
207
+ "loss": 0.59677734375,
208
  "step": 220
209
  },
210
  {
211
  "epoch": 4.8936170212765955,
212
+ "grad_norm": 1.8375948667526245,
213
  "learning_rate": 8.530805687203791e-07,
214
+ "loss": 0.5855560302734375,
215
  "step": 230
216
  },
217
  {
218
  "epoch": 5.0,
219
+ "eval_accuracy": 0.665,
220
+ "eval_fallacy_f1": 0.6699507389162561,
221
+ "eval_loss": 0.6288989186286926,
222
+ "eval_macro_f1": 0.6649246080368083,
223
+ "eval_runtime": 0.1834,
224
+ "eval_samples_per_second": 1090.393,
225
+ "eval_steps_per_second": 21.808,
226
  "step": 235
227
  }
228
  ],
checkpoint-235/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feed2b83d33087270d16ed7f3e10fa99ab935ebc4c954f924db9b9235a327ec4
3
  size 5393
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad6d6165191c810bbf9da8a352ad8d8ac94cb786f3d098baa4a10209e0effe11
3
  size 5393