File size: 7,073 Bytes
9aceba2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
[
  {
    "loss": 1.4019,
    "grad_norm": 0.6986930966377258,
    "learning_rate": 2.25e-05,
    "entropy": 1.199985957145691,
    "num_tokens": 647929.0,
    "mean_token_accuracy": 0.6914159089326859,
    "epoch": 0.19607843137254902,
    "step": 10
  },
  {
    "loss": 1.3734,
    "grad_norm": 0.18671388924121857,
    "learning_rate": 4.75e-05,
    "entropy": 1.325679862499237,
    "num_tokens": 1300854.0,
    "mean_token_accuracy": 0.6935183644294739,
    "epoch": 0.39215686274509803,
    "step": 20
  },
  {
    "loss": 1.3317,
    "grad_norm": 0.18022498488426208,
    "learning_rate": 4.9819267987317665e-05,
    "entropy": 1.4015559554100037,
    "num_tokens": 1947357.0,
    "mean_token_accuracy": 0.6976928591728211,
    "epoch": 0.5882352941176471,
    "step": 30
  },
  {
    "loss": 1.2446,
    "grad_norm": 0.18924492597579956,
    "learning_rate": 4.9197869469162815e-05,
    "entropy": 1.219199287891388,
    "num_tokens": 2595628.0,
    "mean_token_accuracy": 0.7132954180240632,
    "epoch": 0.7843137254901961,
    "step": 40
  },
  {
    "loss": 1.2139,
    "grad_norm": 0.23499619960784912,
    "learning_rate": 4.814465939707259e-05,
    "entropy": 1.2155689418315887,
    "num_tokens": 3245496.0,
    "mean_token_accuracy": 0.7180316507816314,
    "epoch": 0.9803921568627451,
    "step": 50
  },
  {
    "loss": 1.1282,
    "grad_norm": 0.25000059604644775,
    "learning_rate": 4.6678432329734434e-05,
    "entropy": 1.1309684544801712,
    "num_tokens": 3876930.0,
    "mean_token_accuracy": 0.734208858013153,
    "epoch": 1.1764705882352942,
    "step": 60
  },
  {
    "loss": 1.1264,
    "grad_norm": 0.3251831531524658,
    "learning_rate": 4.482535312390058e-05,
    "entropy": 1.1345468521118165,
    "num_tokens": 4525171.0,
    "mean_token_accuracy": 0.7357972204685211,
    "epoch": 1.3725490196078431,
    "step": 70
  },
  {
    "loss": 1.0226,
    "grad_norm": 0.2027675360441208,
    "learning_rate": 4.2618490021899384e-05,
    "entropy": 1.0273457378149033,
    "num_tokens": 5172588.0,
    "mean_token_accuracy": 0.7597126334905624,
    "epoch": 1.5686274509803921,
    "step": 80
  },
  {
    "loss": 1.0134,
    "grad_norm": 0.2039007991552353,
    "learning_rate": 4.009722454806761e-05,
    "entropy": 1.0195463865995407,
    "num_tokens": 5819977.0,
    "mean_token_accuracy": 0.761520317196846,
    "epoch": 1.7647058823529411,
    "step": 90
  },
  {
    "loss": 0.9796,
    "grad_norm": 0.1867203265428543,
    "learning_rate": 3.730654874451569e-05,
    "entropy": 0.9862680763006211,
    "num_tokens": 6472798.0,
    "mean_token_accuracy": 0.7683205276727676,
    "epoch": 1.9607843137254903,
    "step": 100
  },
  {
    "loss": 0.9679,
    "grad_norm": 0.21607249975204468,
    "learning_rate": 3.429626228707034e-05,
    "entropy": 0.9721104234457016,
    "num_tokens": 7112529.0,
    "mean_token_accuracy": 0.7711095601320267,
    "epoch": 2.156862745098039,
    "step": 110
  },
  {
    "loss": 0.9666,
    "grad_norm": 0.23092280328273773,
    "learning_rate": 3.112008380887966e-05,
    "entropy": 0.9724053025245667,
    "num_tokens": 7758907.0,
    "mean_token_accuracy": 0.7709219127893447,
    "epoch": 2.3529411764705883,
    "step": 120
  },
  {
    "loss": 0.9683,
    "grad_norm": 0.20999926328659058,
    "learning_rate": 2.7834692290132052e-05,
    "entropy": 0.9730658024549484,
    "num_tokens": 8405368.0,
    "mean_token_accuracy": 0.769457995891571,
    "epoch": 2.549019607843137,
    "step": 130
  },
  {
    "loss": 0.9385,
    "grad_norm": 0.7688829898834229,
    "learning_rate": 2.449871562031194e-05,
    "entropy": 0.9452048629522324,
    "num_tokens": 9053347.0,
    "mean_token_accuracy": 0.7750110507011414,
    "epoch": 2.7450980392156863,
    "step": 140
  },
  {
    "loss": 0.9832,
    "grad_norm": 0.26651284098625183,
    "learning_rate": 2.1171684382123e-05,
    "entropy": 0.9904936224222183,
    "num_tokens": 9703053.0,
    "mean_token_accuracy": 0.7652157843112946,
    "epoch": 2.9411764705882355,
    "step": 150
  },
  {
    "loss": 0.9586,
    "grad_norm": 0.22833150625228882,
    "learning_rate": 1.7912969526829558e-05,
    "entropy": 0.9607909858226776,
    "num_tokens": 10340639.0,
    "mean_token_accuracy": 0.7710874438285827,
    "epoch": 3.1372549019607843,
    "step": 160
  },
  {
    "loss": 0.9382,
    "grad_norm": 0.2782645523548126,
    "learning_rate": 1.4780722898224707e-05,
    "entropy": 0.9437746345996857,
    "num_tokens": 10987284.0,
    "mean_token_accuracy": 0.7756666958332061,
    "epoch": 3.3333333333333335,
    "step": 170
  },
  {
    "loss": 0.9441,
    "grad_norm": 0.2571350038051605,
    "learning_rate": 1.1830839511600211e-05,
    "entropy": 0.9467583298683167,
    "num_tokens": 11638213.0,
    "mean_token_accuracy": 0.7747708618640899,
    "epoch": 3.5294117647058822,
    "step": 180
  },
  {
    "loss": 0.9175,
    "grad_norm": 0.26882949471473694,
    "learning_rate": 9.11596010587441e-06,
    "entropy": 0.923730057477951,
    "num_tokens": 12288333.0,
    "mean_token_accuracy": 0.7791785061359405,
    "epoch": 3.7254901960784315,
    "step": 190
  },
  {
    "loss": 0.91,
    "grad_norm": 0.22646215558052063,
    "learning_rate": 6.684531768359173e-06,
    "entropy": 0.9165982186794281,
    "num_tokens": 12936318.0,
    "mean_token_accuracy": 0.7812086254358291,
    "epoch": 3.9215686274509802,
    "step": 200
  },
  {
    "loss": 0.9193,
    "grad_norm": 0.2461111843585968,
    "learning_rate": 4.579943395339062e-06,
    "entropy": 0.9269091933965683,
    "num_tokens": 13570476.0,
    "mean_token_accuracy": 0.7785391122102737,
    "epoch": 4.117647058823529,
    "step": 210
  },
  {
    "loss": 0.9259,
    "grad_norm": 0.30823367834091187,
    "learning_rate": 2.8397514161892486e-06,
    "entropy": 0.9287486761808396,
    "num_tokens": 14217790.0,
    "mean_token_accuracy": 0.7776476472616196,
    "epoch": 4.313725490196078,
    "step": 220
  },
  {
    "loss": 0.8904,
    "grad_norm": 0.29259830713272095,
    "learning_rate": 1.4950095980035772e-06,
    "entropy": 0.8954251229763031,
    "num_tokens": 14867755.0,
    "mean_token_accuracy": 0.7863523244857789,
    "epoch": 4.509803921568627,
    "step": 230
  },
  {
    "loss": 0.9222,
    "grad_norm": 0.2476108819246292,
    "learning_rate": 5.697148903850868e-07,
    "entropy": 0.92759590446949,
    "num_tokens": 15512923.0,
    "mean_token_accuracy": 0.7787990540266037,
    "epoch": 4.705882352941177,
    "step": 240
  },
  {
    "loss": 0.9369,
    "grad_norm": 0.2094818353652954,
    "learning_rate": 8.037919931187244e-08,
    "entropy": 0.9443553179502487,
    "num_tokens": 16165639.0,
    "mean_token_accuracy": 0.7741728782653808,
    "epoch": 4.901960784313726,
    "step": 250
  },
  {
    "train_runtime": 4077.9021,
    "train_samples_per_second": 1.994,
    "train_steps_per_second": 0.063,
    "total_flos": 4.655305572168499e+18,
    "train_loss": 1.0347469572927437,
    "entropy": 0.9348823964595795,
    "num_tokens": 16478955.0,
    "mean_token_accuracy": 0.7756926536560058,
    "epoch": 5.0,
    "step": 255
  }
]