fpadovani commited on
Commit
0ea4d07
·
verified ·
1 Parent(s): 139d0cc

Training in progress, step 5000, checkpoint

Browse files
checkpoint-5000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b544459240e5e208332928bf2093f0a0787011a109f9e06e2bc28f80b2d8be4
3
  size 435544704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64be429d4a72fd31456f44a30f45de122b90022e77ab700f128f25886ab6e4dc
3
  size 435544704
checkpoint-5000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:731a3c20c771bc5b68306323670efa35d01cf22d3bada6f61b3089bc53750c36
3
  size 871183627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09f19174e57d6e789bf42f977c6c09e5479c494f23988c035d4f2f916ebbb5af
3
  size 871183627
checkpoint-5000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb8d4244b55585fd41e23990cfe6cd14f3fc7dd7407ea6827a54539f144a2ba8
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3e8d3f76a297f56524742c12a38646813fa7217b2c78b65c170a2e22d70cff8
3
  size 14645
checkpoint-5000/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 4788,
3
- "best_metric": 4.389559268951416,
4
  "best_model_checkpoint": null,
5
  "epoch": 12.531328320802006,
6
  "eval_steps": 500,
@@ -11,189 +11,189 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.002506265664160401,
14
- "grad_norm": 15.276525497436523,
15
  "learning_rate": 0.0,
16
- "loss": 10.3822,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 1.0,
21
- "grad_norm": 1.2039393186569214,
22
  "learning_rate": 7.960000000000001e-05,
23
- "loss": 6.9837,
24
  "step": 399
25
  },
26
  {
27
  "epoch": 1.0,
28
- "eval_loss": 5.983016014099121,
29
- "eval_runtime": 15.8842,
30
- "eval_samples_per_second": 1611.54,
31
- "eval_steps_per_second": 6.296,
32
  "step": 399
33
  },
34
  {
35
  "epoch": 2.0,
36
- "grad_norm": 1.1639982461929321,
37
  "learning_rate": 9.601876675603218e-05,
38
- "loss": 5.6173,
39
  "step": 798
40
  },
41
  {
42
  "epoch": 2.0,
43
- "eval_loss": 5.357014179229736,
44
- "eval_runtime": 20.233,
45
- "eval_samples_per_second": 1265.164,
46
- "eval_steps_per_second": 4.942,
47
  "step": 798
48
  },
49
  {
50
  "epoch": 3.0,
51
- "grad_norm": 1.0922253131866455,
52
  "learning_rate": 9.067024128686328e-05,
53
- "loss": 5.1654,
54
  "step": 1197
55
  },
56
  {
57
  "epoch": 3.0,
58
- "eval_loss": 5.061666488647461,
59
- "eval_runtime": 21.5628,
60
- "eval_samples_per_second": 1187.138,
61
- "eval_steps_per_second": 4.638,
62
  "step": 1197
63
  },
64
  {
65
  "epoch": 4.0,
66
- "grad_norm": 1.1415351629257202,
67
  "learning_rate": 8.532171581769438e-05,
68
- "loss": 4.9033,
69
  "step": 1596
70
  },
71
  {
72
  "epoch": 4.0,
73
- "eval_loss": 4.871504306793213,
74
- "eval_runtime": 21.5888,
75
- "eval_samples_per_second": 1185.71,
76
- "eval_steps_per_second": 4.632,
77
  "step": 1596
78
  },
79
  {
80
  "epoch": 5.0,
81
- "grad_norm": 1.2201212644577026,
82
  "learning_rate": 7.997319034852548e-05,
83
- "loss": 4.7113,
84
  "step": 1995
85
  },
86
  {
87
  "epoch": 5.0,
88
- "eval_loss": 4.739400386810303,
89
- "eval_runtime": 21.6571,
90
- "eval_samples_per_second": 1181.971,
91
- "eval_steps_per_second": 4.617,
92
  "step": 1995
93
  },
94
  {
95
  "epoch": 6.0,
96
- "grad_norm": 1.3053981065750122,
97
  "learning_rate": 7.462466487935658e-05,
98
- "loss": 4.5573,
99
  "step": 2394
100
  },
101
  {
102
  "epoch": 6.0,
103
- "eval_loss": 4.6380228996276855,
104
- "eval_runtime": 20.1975,
105
- "eval_samples_per_second": 1267.387,
106
- "eval_steps_per_second": 4.951,
107
  "step": 2394
108
  },
109
  {
110
  "epoch": 7.0,
111
- "grad_norm": 1.306992769241333,
112
  "learning_rate": 6.927613941018766e-05,
113
- "loss": 4.4278,
114
  "step": 2793
115
  },
116
  {
117
  "epoch": 7.0,
118
- "eval_loss": 4.5619025230407715,
119
- "eval_runtime": 20.2525,
120
- "eval_samples_per_second": 1263.94,
121
- "eval_steps_per_second": 4.938,
122
  "step": 2793
123
  },
124
  {
125
  "epoch": 8.0,
126
- "grad_norm": 1.4036524295806885,
127
  "learning_rate": 6.392761394101878e-05,
128
- "loss": 4.3164,
129
  "step": 3192
130
  },
131
  {
132
  "epoch": 8.0,
133
- "eval_loss": 4.505926132202148,
134
- "eval_runtime": 20.1856,
135
- "eval_samples_per_second": 1268.13,
136
- "eval_steps_per_second": 4.954,
137
  "step": 3192
138
  },
139
  {
140
  "epoch": 9.0,
141
- "grad_norm": 1.4330567121505737,
142
  "learning_rate": 5.8579088471849864e-05,
143
  "loss": 4.2176,
144
  "step": 3591
145
  },
146
  {
147
  "epoch": 9.0,
148
- "eval_loss": 4.462268352508545,
149
- "eval_runtime": 20.1962,
150
- "eval_samples_per_second": 1267.466,
151
- "eval_steps_per_second": 4.951,
152
  "step": 3591
153
  },
154
  {
155
  "epoch": 10.0,
156
- "grad_norm": 1.5486063957214355,
157
  "learning_rate": 5.3230563002680965e-05,
158
- "loss": 4.1304,
159
  "step": 3990
160
  },
161
  {
162
  "epoch": 10.0,
163
- "eval_loss": 4.426018238067627,
164
- "eval_runtime": 20.2771,
165
- "eval_samples_per_second": 1262.41,
166
- "eval_steps_per_second": 4.932,
167
  "step": 3990
168
  },
169
  {
170
  "epoch": 11.0,
171
- "grad_norm": 1.620924472808838,
172
  "learning_rate": 4.7882037533512065e-05,
173
- "loss": 4.0533,
174
  "step": 4389
175
  },
176
  {
177
  "epoch": 11.0,
178
- "eval_loss": 4.405181884765625,
179
- "eval_runtime": 20.264,
180
- "eval_samples_per_second": 1263.225,
181
- "eval_steps_per_second": 4.935,
182
  "step": 4389
183
  },
184
  {
185
  "epoch": 12.0,
186
- "grad_norm": 1.7080353498458862,
187
  "learning_rate": 4.2533512064343165e-05,
188
- "loss": 3.9848,
189
  "step": 4788
190
  },
191
  {
192
  "epoch": 12.0,
193
- "eval_loss": 4.389559268951416,
194
- "eval_runtime": 20.213,
195
- "eval_samples_per_second": 1266.415,
196
- "eval_steps_per_second": 4.947,
197
  "step": 4788
198
  }
199
  ],
 
1
  {
2
  "best_global_step": 4788,
3
+ "best_metric": 4.387991428375244,
4
  "best_model_checkpoint": null,
5
  "epoch": 12.531328320802006,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.002506265664160401,
14
+ "grad_norm": 15.287896156311035,
15
  "learning_rate": 0.0,
16
+ "loss": 10.3873,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 1.0,
21
+ "grad_norm": 1.349755048751831,
22
  "learning_rate": 7.960000000000001e-05,
23
+ "loss": 6.9819,
24
  "step": 399
25
  },
26
  {
27
  "epoch": 1.0,
28
+ "eval_loss": 5.9839253425598145,
29
+ "eval_runtime": 40.2685,
30
+ "eval_samples_per_second": 635.682,
31
+ "eval_steps_per_second": 2.483,
32
  "step": 399
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "grad_norm": 1.2756340503692627,
37
  "learning_rate": 9.601876675603218e-05,
38
+ "loss": 5.6158,
39
  "step": 798
40
  },
41
  {
42
  "epoch": 2.0,
43
+ "eval_loss": 5.355185031890869,
44
+ "eval_runtime": 40.338,
45
+ "eval_samples_per_second": 634.588,
46
+ "eval_steps_per_second": 2.479,
47
  "step": 798
48
  },
49
  {
50
  "epoch": 3.0,
51
+ "grad_norm": 1.1477612257003784,
52
  "learning_rate": 9.067024128686328e-05,
53
+ "loss": 5.164,
54
  "step": 1197
55
  },
56
  {
57
  "epoch": 3.0,
58
+ "eval_loss": 5.061825275421143,
59
+ "eval_runtime": 40.9407,
60
+ "eval_samples_per_second": 625.245,
61
+ "eval_steps_per_second": 2.443,
62
  "step": 1197
63
  },
64
  {
65
  "epoch": 4.0,
66
+ "grad_norm": 1.1439207792282104,
67
  "learning_rate": 8.532171581769438e-05,
68
+ "loss": 4.902,
69
  "step": 1596
70
  },
71
  {
72
  "epoch": 4.0,
73
+ "eval_loss": 4.870055675506592,
74
+ "eval_runtime": 40.5657,
75
+ "eval_samples_per_second": 631.026,
76
+ "eval_steps_per_second": 2.465,
77
  "step": 1596
78
  },
79
  {
80
  "epoch": 5.0,
81
+ "grad_norm": 1.2400050163269043,
82
  "learning_rate": 7.997319034852548e-05,
83
+ "loss": 4.7103,
84
  "step": 1995
85
  },
86
  {
87
  "epoch": 5.0,
88
+ "eval_loss": 4.737980842590332,
89
+ "eval_runtime": 40.84,
90
+ "eval_samples_per_second": 626.787,
91
+ "eval_steps_per_second": 2.449,
92
  "step": 1995
93
  },
94
  {
95
  "epoch": 6.0,
96
+ "grad_norm": 1.3254481554031372,
97
  "learning_rate": 7.462466487935658e-05,
98
+ "loss": 4.5566,
99
  "step": 2394
100
  },
101
  {
102
  "epoch": 6.0,
103
+ "eval_loss": 4.6372785568237305,
104
+ "eval_runtime": 40.6525,
105
+ "eval_samples_per_second": 629.678,
106
+ "eval_steps_per_second": 2.46,
107
  "step": 2394
108
  },
109
  {
110
  "epoch": 7.0,
111
+ "grad_norm": 1.3006181716918945,
112
  "learning_rate": 6.927613941018766e-05,
113
+ "loss": 4.4277,
114
  "step": 2793
115
  },
116
  {
117
  "epoch": 7.0,
118
+ "eval_loss": 4.560876846313477,
119
+ "eval_runtime": 40.4604,
120
+ "eval_samples_per_second": 632.667,
121
+ "eval_steps_per_second": 2.472,
122
  "step": 2793
123
  },
124
  {
125
  "epoch": 8.0,
126
+ "grad_norm": 1.420179009437561,
127
  "learning_rate": 6.392761394101878e-05,
128
+ "loss": 4.3162,
129
  "step": 3192
130
  },
131
  {
132
  "epoch": 8.0,
133
+ "eval_loss": 4.503915786743164,
134
+ "eval_runtime": 39.8308,
135
+ "eval_samples_per_second": 642.669,
136
+ "eval_steps_per_second": 2.511,
137
  "step": 3192
138
  },
139
  {
140
  "epoch": 9.0,
141
+ "grad_norm": 1.4465044736862183,
142
  "learning_rate": 5.8579088471849864e-05,
143
  "loss": 4.2176,
144
  "step": 3591
145
  },
146
  {
147
  "epoch": 9.0,
148
+ "eval_loss": 4.462299346923828,
149
+ "eval_runtime": 41.0625,
150
+ "eval_samples_per_second": 623.391,
151
+ "eval_steps_per_second": 2.435,
152
  "step": 3591
153
  },
154
  {
155
  "epoch": 10.0,
156
+ "grad_norm": 1.5573575496673584,
157
  "learning_rate": 5.3230563002680965e-05,
158
+ "loss": 4.1306,
159
  "step": 3990
160
  },
161
  {
162
  "epoch": 10.0,
163
+ "eval_loss": 4.426837921142578,
164
+ "eval_runtime": 39.558,
165
+ "eval_samples_per_second": 647.1,
166
+ "eval_steps_per_second": 2.528,
167
  "step": 3990
168
  },
169
  {
170
  "epoch": 11.0,
171
+ "grad_norm": 1.59477961063385,
172
  "learning_rate": 4.7882037533512065e-05,
173
+ "loss": 4.0536,
174
  "step": 4389
175
  },
176
  {
177
  "epoch": 11.0,
178
+ "eval_loss": 4.405690670013428,
179
+ "eval_runtime": 41.0885,
180
+ "eval_samples_per_second": 622.996,
181
+ "eval_steps_per_second": 2.434,
182
  "step": 4389
183
  },
184
  {
185
  "epoch": 12.0,
186
+ "grad_norm": 1.7376188039779663,
187
  "learning_rate": 4.2533512064343165e-05,
188
+ "loss": 3.9853,
189
  "step": 4788
190
  },
191
  {
192
  "epoch": 12.0,
193
+ "eval_loss": 4.387991428375244,
194
+ "eval_runtime": 40.4482,
195
+ "eval_samples_per_second": 632.858,
196
+ "eval_steps_per_second": 2.472,
197
  "step": 4788
198
  }
199
  ],
checkpoint-5000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3d99a586d2a38e89becddbee0adce6692398fb4ff8b6a71d5f8e545b03eeef6
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4d8e95126c79934c43d708d7208a8515199b20fd582881a681e358089ed0c56
3
  size 5905