kiatkock commited on
Commit
b5cbde1
·
verified ·
1 Parent(s): 634be04

tickers_448_7_Channels_with_temporal_tape model training @ 2025-10-26 22:24:12

Browse files
checkpoint-3500/config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "time_moe_50m",
3
+ "apply_aux_loss": true,
4
+ "architectures": [
5
+ "TimeMoeForPrediction"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "Maple728/TimeMoE-50M--configuration_time_moe.TimeMoeConfig",
10
+ "AutoModelForCausalLM": "Maple728/TimeMoE-50M--modeling_time_moe.TimeMoeForPrediction"
11
+ },
12
+ "channel_configs": [
13
+ [
14
+ 63,
15
+ 1,
16
+ 1
17
+ ],
18
+ [
19
+ 6,
20
+ 1,
21
+ 4
22
+ ],
23
+ [
24
+ 6,
25
+ 1,
26
+ 5
27
+ ],
28
+ [
29
+ 10,
30
+ 1,
31
+ 1
32
+ ],
33
+ [
34
+ 5,
35
+ 1,
36
+ 1
37
+ ],
38
+ [
39
+ 5,
40
+ 1,
41
+ 1
42
+ ],
43
+ [
44
+ 5,
45
+ 1,
46
+ 2
47
+ ]
48
+ ],
49
+ "embedding_hidden_size": 128,
50
+ "hidden_act": "silu",
51
+ "hidden_size": 384,
52
+ "horizon_lengths": [
53
+ 1,
54
+ 8,
55
+ 32,
56
+ 64
57
+ ],
58
+ "initializer_range": 0.02,
59
+ "input_size": 42,
60
+ "intermediate_size": 1536,
61
+ "max_position_embeddings": 4096,
62
+ "model_type": "time_moe",
63
+ "num_attention_heads": 12,
64
+ "num_experts": 8,
65
+ "num_experts_per_tok": 2,
66
+ "num_hidden_layers": 12,
67
+ "num_key_value_heads": 12,
68
+ "rms_norm_eps": 1e-06,
69
+ "rope_theta": 10000,
70
+ "router_aux_loss_factor": 0.02,
71
+ "tie_word_embeddings": false,
72
+ "torch_dtype": "float32",
73
+ "transformers_version": "4.40.1",
74
+ "use_cache": true,
75
+ "use_dense": false
76
+ }
checkpoint-3500/generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.40.1"
4
+ }
checkpoint-3500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51824f4fd0ce52faeece495ec0c0e9db2f0e9626377b20dde7c45a4e8aa47567
3
+ size 523322016
checkpoint-3500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e7a9210a2e3d9b68dce4b4ce02bb781c9f29336c74599c640d292ad1e521808
3
+ size 1046999962
checkpoint-3500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459183a7b5bd3231b0f93a8128d45e78284d6bf4838a73e17fe0e52c0824ac88
3
+ size 14645
checkpoint-3500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:020e7fbbd2973980c721955674ee75fc4607df1b4013ba380b31bae440d53b26
3
+ size 1465
checkpoint-3500/trainer_state.json ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.3600683808326721,
3
+ "best_model_checkpoint": "/home/yinkiat/logs/time_moe_tickers_448_7_Channels_with_temporal_tape/checkpoint-3500",
4
+ "epoch": 0.7803137976200429,
5
+ "eval_steps": 500,
6
+ "global_step": 3500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.00022294679932001227,
13
+ "grad_norm": 0.6522400379180908,
14
+ "learning_rate": 9.999999846670801e-05,
15
+ "loss": 0.5519,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.011147339966000614,
20
+ "grad_norm": 0.12162817269563675,
21
+ "learning_rate": 9.999616686793398e-05,
22
+ "loss": 0.4283,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.022294679932001227,
27
+ "grad_norm": 0.19253581762313843,
28
+ "learning_rate": 9.998466864716803e-05,
29
+ "loss": 0.3917,
30
+ "step": 100
31
+ },
32
+ {
33
+ "epoch": 0.033442019898001836,
34
+ "grad_norm": 0.13114529848098755,
35
+ "learning_rate": 9.996550886363802e-05,
36
+ "loss": 0.3761,
37
+ "step": 150
38
+ },
39
+ {
40
+ "epoch": 0.044589359864002455,
41
+ "grad_norm": 0.11506406962871552,
42
+ "learning_rate": 9.993869339270242e-05,
43
+ "loss": 0.3669,
44
+ "step": 200
45
+ },
46
+ {
47
+ "epoch": 0.05573669983000307,
48
+ "grad_norm": 0.1872943639755249,
49
+ "learning_rate": 9.990423045734056e-05,
50
+ "loss": 0.3619,
51
+ "step": 250
52
+ },
53
+ {
54
+ "epoch": 0.06688403979600367,
55
+ "grad_norm": 0.09673713147640228,
56
+ "learning_rate": 9.986213062563104e-05,
57
+ "loss": 0.3613,
58
+ "step": 300
59
+ },
60
+ {
61
+ "epoch": 0.07803137976200429,
62
+ "grad_norm": 0.2162093222141266,
63
+ "learning_rate": 9.981240680751106e-05,
64
+ "loss": 0.3609,
65
+ "step": 350
66
+ },
67
+ {
68
+ "epoch": 0.08917871972800491,
69
+ "grad_norm": 0.20694172382354736,
70
+ "learning_rate": 9.975507425081755e-05,
71
+ "loss": 0.3592,
72
+ "step": 400
73
+ },
74
+ {
75
+ "epoch": 0.10032605969400551,
76
+ "grad_norm": 0.3145907521247864,
77
+ "learning_rate": 9.969015053661142e-05,
78
+ "loss": 0.3573,
79
+ "step": 450
80
+ },
81
+ {
82
+ "epoch": 0.11147339966000613,
83
+ "grad_norm": 0.15271279215812683,
84
+ "learning_rate": 9.961765557378634e-05,
85
+ "loss": 0.3574,
86
+ "step": 500
87
+ },
88
+ {
89
+ "epoch": 0.11147339966000613,
90
+ "eval_loss": 0.36369073390960693,
91
+ "eval_runtime": 1369.4679,
92
+ "eval_samples_per_second": 305.045,
93
+ "eval_steps_per_second": 9.533,
94
+ "step": 500
95
+ },
96
+ {
97
+ "epoch": 0.12262073962600674,
98
+ "grad_norm": 0.2306584119796753,
99
+ "learning_rate": 9.953761159296364e-05,
100
+ "loss": 0.3573,
101
+ "step": 550
102
+ },
103
+ {
104
+ "epoch": 0.13376807959200734,
105
+ "grad_norm": 0.19413670897483826,
106
+ "learning_rate": 9.945004313967528e-05,
107
+ "loss": 0.3566,
108
+ "step": 600
109
+ },
110
+ {
111
+ "epoch": 0.14491541955800796,
112
+ "grad_norm": 0.23593685030937195,
113
+ "learning_rate": 9.935497706683698e-05,
114
+ "loss": 0.3577,
115
+ "step": 650
116
+ },
117
+ {
118
+ "epoch": 0.15606275952400858,
119
+ "grad_norm": 0.12937627732753754,
120
+ "learning_rate": 9.92524425265137e-05,
121
+ "loss": 0.3541,
122
+ "step": 700
123
+ },
124
+ {
125
+ "epoch": 0.1672100994900092,
126
+ "grad_norm": 0.14659036695957184,
127
+ "learning_rate": 9.91424709609802e-05,
128
+ "loss": 0.3553,
129
+ "step": 750
130
+ },
131
+ {
132
+ "epoch": 0.17835743945600982,
133
+ "grad_norm": 0.16453103721141815,
134
+ "learning_rate": 9.902509609307918e-05,
135
+ "loss": 0.356,
136
+ "step": 800
137
+ },
138
+ {
139
+ "epoch": 0.1895047794220104,
140
+ "grad_norm": 0.08065643161535263,
141
+ "learning_rate": 9.890035391588028e-05,
142
+ "loss": 0.3552,
143
+ "step": 850
144
+ },
145
+ {
146
+ "epoch": 0.20065211938801103,
147
+ "grad_norm": 0.1387893557548523,
148
+ "learning_rate": 9.876828268164265e-05,
149
+ "loss": 0.3559,
150
+ "step": 900
151
+ },
152
+ {
153
+ "epoch": 0.21179945935401165,
154
+ "grad_norm": 0.15444770455360413,
155
+ "learning_rate": 9.86289228900849e-05,
156
+ "loss": 0.3528,
157
+ "step": 950
158
+ },
159
+ {
160
+ "epoch": 0.22294679932001227,
161
+ "grad_norm": 0.11623840034008026,
162
+ "learning_rate": 9.848231727596589e-05,
163
+ "loss": 0.3535,
164
+ "step": 1000
165
+ },
166
+ {
167
+ "epoch": 0.22294679932001227,
168
+ "eval_loss": 0.3608478009700775,
169
+ "eval_runtime": 1366.4982,
170
+ "eval_samples_per_second": 305.708,
171
+ "eval_steps_per_second": 9.554,
172
+ "step": 1000
173
+ },
174
+ {
175
+ "epoch": 0.2340941392860129,
176
+ "grad_norm": 0.25702351331710815,
177
+ "learning_rate": 9.832851079598007e-05,
178
+ "loss": 0.3508,
179
+ "step": 1050
180
+ },
181
+ {
182
+ "epoch": 0.24524147925201348,
183
+ "grad_norm": 0.20600661635398865,
184
+ "learning_rate": 9.816755061497149e-05,
185
+ "loss": 0.355,
186
+ "step": 1100
187
+ },
188
+ {
189
+ "epoch": 0.2563888192180141,
190
+ "grad_norm": 0.16554264724254608,
191
+ "learning_rate": 9.79994860914706e-05,
192
+ "loss": 0.3527,
193
+ "step": 1150
194
+ },
195
+ {
196
+ "epoch": 0.2675361591840147,
197
+ "grad_norm": 0.38233572244644165,
198
+ "learning_rate": 9.782436876255859e-05,
199
+ "loss": 0.3558,
200
+ "step": 1200
201
+ },
202
+ {
203
+ "epoch": 0.27868349915001533,
204
+ "grad_norm": 0.15968042612075806,
205
+ "learning_rate": 9.764225232806334e-05,
206
+ "loss": 0.3534,
207
+ "step": 1250
208
+ },
209
+ {
210
+ "epoch": 0.2898308391160159,
211
+ "grad_norm": 0.08807487785816193,
212
+ "learning_rate": 9.745319263409241e-05,
213
+ "loss": 0.3494,
214
+ "step": 1300
215
+ },
216
+ {
217
+ "epoch": 0.3009781790820166,
218
+ "grad_norm": 0.1922980546951294,
219
+ "learning_rate": 9.725724765590786e-05,
220
+ "loss": 0.353,
221
+ "step": 1350
222
+ },
223
+ {
224
+ "epoch": 0.31212551904801716,
225
+ "grad_norm": 0.11445993185043335,
226
+ "learning_rate": 9.7054477480148e-05,
227
+ "loss": 0.3535,
228
+ "step": 1400
229
+ },
230
+ {
231
+ "epoch": 0.32327285901401775,
232
+ "grad_norm": 0.22301775217056274,
233
+ "learning_rate": 9.684494428640185e-05,
234
+ "loss": 0.3531,
235
+ "step": 1450
236
+ },
237
+ {
238
+ "epoch": 0.3344201989800184,
239
+ "grad_norm": 0.19347846508026123,
240
+ "learning_rate": 9.662871232814171e-05,
241
+ "loss": 0.3515,
242
+ "step": 1500
243
+ },
244
+ {
245
+ "epoch": 0.3344201989800184,
246
+ "eval_loss": 0.36202892661094666,
247
+ "eval_runtime": 1359.8166,
248
+ "eval_samples_per_second": 307.21,
249
+ "eval_steps_per_second": 9.601,
250
+ "step": 1500
251
+ },
252
+ {
253
+ "epoch": 0.345567538946019,
254
+ "grad_norm": 0.20853643119335175,
255
+ "learning_rate": 9.640584791301984e-05,
256
+ "loss": 0.3514,
257
+ "step": 1550
258
+ },
259
+ {
260
+ "epoch": 0.35671487891201964,
261
+ "grad_norm": 0.20620982348918915,
262
+ "learning_rate": 9.617641938253508e-05,
263
+ "loss": 0.3528,
264
+ "step": 1600
265
+ },
266
+ {
267
+ "epoch": 0.36786221887802023,
268
+ "grad_norm": 0.12390906363725662,
269
+ "learning_rate": 9.594049709107604e-05,
270
+ "loss": 0.3489,
271
+ "step": 1650
272
+ },
273
+ {
274
+ "epoch": 0.3790095588440208,
275
+ "grad_norm": 0.11034612357616425,
276
+ "learning_rate": 9.569815338434672e-05,
277
+ "loss": 0.3511,
278
+ "step": 1700
279
+ },
280
+ {
281
+ "epoch": 0.39015689881002147,
282
+ "grad_norm": 0.2769007086753845,
283
+ "learning_rate": 9.54494625771818e-05,
284
+ "loss": 0.3506,
285
+ "step": 1750
286
+ },
287
+ {
288
+ "epoch": 0.40130423877602206,
289
+ "grad_norm": 0.24118147790431976,
290
+ "learning_rate": 9.519450093075788e-05,
291
+ "loss": 0.3506,
292
+ "step": 1800
293
+ },
294
+ {
295
+ "epoch": 0.4124515787420227,
296
+ "grad_norm": 0.10841376334428787,
297
+ "learning_rate": 9.493334662920794e-05,
298
+ "loss": 0.3491,
299
+ "step": 1850
300
+ },
301
+ {
302
+ "epoch": 0.4235989187080233,
303
+ "grad_norm": 0.14958825707435608,
304
+ "learning_rate": 9.46660797556462e-05,
305
+ "loss": 0.3501,
306
+ "step": 1900
307
+ },
308
+ {
309
+ "epoch": 0.4347462586740239,
310
+ "grad_norm": 0.19621045887470245,
311
+ "learning_rate": 9.43927822676105e-05,
312
+ "loss": 0.3502,
313
+ "step": 1950
314
+ },
315
+ {
316
+ "epoch": 0.44589359864002454,
317
+ "grad_norm": 0.15458981692790985,
318
+ "learning_rate": 9.411353797193005e-05,
319
+ "loss": 0.3526,
320
+ "step": 2000
321
+ },
322
+ {
323
+ "epoch": 0.44589359864002454,
324
+ "eval_loss": 0.36221134662628174,
325
+ "eval_runtime": 1367.0864,
326
+ "eval_samples_per_second": 305.576,
327
+ "eval_steps_per_second": 9.55,
328
+ "step": 2000
329
+ },
330
+ {
331
+ "epoch": 0.4570409386060251,
332
+ "grad_norm": 0.10762229561805725,
333
+ "learning_rate": 9.382843249902597e-05,
334
+ "loss": 0.3476,
335
+ "step": 2050
336
+ },
337
+ {
338
+ "epoch": 0.4681882785720258,
339
+ "grad_norm": 0.27003607153892517,
340
+ "learning_rate": 9.353755327665268e-05,
341
+ "loss": 0.3506,
342
+ "step": 2100
343
+ },
344
+ {
345
+ "epoch": 0.47933561853802636,
346
+ "grad_norm": 0.19240695238113403,
347
+ "learning_rate": 9.324098950308817e-05,
348
+ "loss": 0.3486,
349
+ "step": 2150
350
+ },
351
+ {
352
+ "epoch": 0.49048295850402696,
353
+ "grad_norm": 0.1518525779247284,
354
+ "learning_rate": 9.293883211978123e-05,
355
+ "loss": 0.3496,
356
+ "step": 2200
357
+ },
358
+ {
359
+ "epoch": 0.5016302984700276,
360
+ "grad_norm": 0.14137060940265656,
361
+ "learning_rate": 9.263117378346424e-05,
362
+ "loss": 0.3502,
363
+ "step": 2250
364
+ },
365
+ {
366
+ "epoch": 0.5127776384360282,
367
+ "grad_norm": 0.10448052734136581,
368
+ "learning_rate": 9.231810883773999e-05,
369
+ "loss": 0.3508,
370
+ "step": 2300
371
+ },
372
+ {
373
+ "epoch": 0.5239249784020288,
374
+ "grad_norm": 0.16013021767139435,
375
+ "learning_rate": 9.199973328415102e-05,
376
+ "loss": 0.3486,
377
+ "step": 2350
378
+ },
379
+ {
380
+ "epoch": 0.5350723183680294,
381
+ "grad_norm": 0.15149140357971191,
382
+ "learning_rate": 9.167614475274082e-05,
383
+ "loss": 0.3503,
384
+ "step": 2400
385
+ },
386
+ {
387
+ "epoch": 0.5462196583340301,
388
+ "grad_norm": 0.1527142971754074,
389
+ "learning_rate": 9.134744247211547e-05,
390
+ "loss": 0.3483,
391
+ "step": 2450
392
+ },
393
+ {
394
+ "epoch": 0.5573669983000307,
395
+ "grad_norm": 0.19500622153282166,
396
+ "learning_rate": 9.101372723901513e-05,
397
+ "loss": 0.3496,
398
+ "step": 2500
399
+ },
400
+ {
401
+ "epoch": 0.5573669983000307,
402
+ "eval_loss": 0.36044490337371826,
403
+ "eval_runtime": 1365.782,
404
+ "eval_samples_per_second": 305.868,
405
+ "eval_steps_per_second": 9.559,
406
+ "step": 2500
407
+ },
408
+ {
409
+ "epoch": 0.5685143382660313,
410
+ "grad_norm": 0.23164886236190796,
411
+ "learning_rate": 9.067510138740467e-05,
412
+ "loss": 0.3496,
413
+ "step": 2550
414
+ },
415
+ {
416
+ "epoch": 0.5796616782320319,
417
+ "grad_norm": 0.2949928045272827,
418
+ "learning_rate": 9.033166875709292e-05,
419
+ "loss": 0.3463,
420
+ "step": 2600
421
+ },
422
+ {
423
+ "epoch": 0.5908090181980324,
424
+ "grad_norm": 0.15740317106246948,
425
+ "learning_rate": 8.998353466189007e-05,
426
+ "loss": 0.3486,
427
+ "step": 2650
428
+ },
429
+ {
430
+ "epoch": 0.6019563581640331,
431
+ "grad_norm": 0.10845978558063507,
432
+ "learning_rate": 8.963080585731323e-05,
433
+ "loss": 0.3457,
434
+ "step": 2700
435
+ },
436
+ {
437
+ "epoch": 0.6131036981300337,
438
+ "grad_norm": 0.13455775380134583,
439
+ "learning_rate": 8.927359050784974e-05,
440
+ "loss": 0.3472,
441
+ "step": 2750
442
+ },
443
+ {
444
+ "epoch": 0.6242510380960343,
445
+ "grad_norm": 0.1484765261411667,
446
+ "learning_rate": 8.891199815378839e-05,
447
+ "loss": 0.3484,
448
+ "step": 2800
449
+ },
450
+ {
451
+ "epoch": 0.6353983780620349,
452
+ "grad_norm": 0.13684849441051483,
453
+ "learning_rate": 8.854613967762898e-05,
454
+ "loss": 0.3474,
455
+ "step": 2850
456
+ },
457
+ {
458
+ "epoch": 0.6465457180280355,
459
+ "grad_norm": 0.20448821783065796,
460
+ "learning_rate": 8.817612727008003e-05,
461
+ "loss": 0.3474,
462
+ "step": 2900
463
+ },
464
+ {
465
+ "epoch": 0.6576930579940362,
466
+ "grad_norm": 0.12173973768949509,
467
+ "learning_rate": 8.78020743956555e-05,
468
+ "loss": 0.3453,
469
+ "step": 2950
470
+ },
471
+ {
472
+ "epoch": 0.6688403979600368,
473
+ "grad_norm": 0.09354478865861893,
474
+ "learning_rate": 8.742409575788075e-05,
475
+ "loss": 0.3478,
476
+ "step": 3000
477
+ },
478
+ {
479
+ "epoch": 0.6688403979600368,
480
+ "eval_loss": 0.3610248863697052,
481
+ "eval_runtime": 1367.109,
482
+ "eval_samples_per_second": 305.571,
483
+ "eval_steps_per_second": 9.549,
484
+ "step": 3000
485
+ },
486
+ {
487
+ "epoch": 0.6799877379260374,
488
+ "grad_norm": 0.13489292562007904,
489
+ "learning_rate": 8.704230726411871e-05,
490
+ "loss": 0.3465,
491
+ "step": 3050
492
+ },
493
+ {
494
+ "epoch": 0.691135077892038,
495
+ "grad_norm": 0.3066965341567993,
496
+ "learning_rate": 8.665682599002684e-05,
497
+ "loss": 0.3478,
498
+ "step": 3100
499
+ },
500
+ {
501
+ "epoch": 0.7022824178580386,
502
+ "grad_norm": 0.16600176692008972,
503
+ "learning_rate": 8.626777014365575e-05,
504
+ "loss": 0.3477,
505
+ "step": 3150
506
+ },
507
+ {
508
+ "epoch": 0.7134297578240393,
509
+ "grad_norm": 0.14644253253936768,
510
+ "learning_rate": 8.587525902920062e-05,
511
+ "loss": 0.3469,
512
+ "step": 3200
513
+ },
514
+ {
515
+ "epoch": 0.7245770977900399,
516
+ "grad_norm": 0.2417059689760208,
517
+ "learning_rate": 8.547941301041661e-05,
518
+ "loss": 0.3429,
519
+ "step": 3250
520
+ },
521
+ {
522
+ "epoch": 0.7357244377560405,
523
+ "grad_norm": 0.1694221943616867,
524
+ "learning_rate": 8.508035347370913e-05,
525
+ "loss": 0.3475,
526
+ "step": 3300
527
+ },
528
+ {
529
+ "epoch": 0.746871777722041,
530
+ "grad_norm": 0.11019590497016907,
531
+ "learning_rate": 8.467820279091068e-05,
532
+ "loss": 0.3457,
533
+ "step": 3350
534
+ },
535
+ {
536
+ "epoch": 0.7580191176880416,
537
+ "grad_norm": 0.11622463166713715,
538
+ "learning_rate": 8.427308428175548e-05,
539
+ "loss": 0.3478,
540
+ "step": 3400
541
+ },
542
+ {
543
+ "epoch": 0.7691664576540423,
544
+ "grad_norm": 0.2572800815105438,
545
+ "learning_rate": 8.38651221760634e-05,
546
+ "loss": 0.3468,
547
+ "step": 3450
548
+ },
549
+ {
550
+ "epoch": 0.7803137976200429,
551
+ "grad_norm": 0.2664414644241333,
552
+ "learning_rate": 8.345444157564472e-05,
553
+ "loss": 0.3455,
554
+ "step": 3500
555
+ },
556
+ {
557
+ "epoch": 0.7803137976200429,
558
+ "eval_loss": 0.3600683808326721,
559
+ "eval_runtime": 1372.5301,
560
+ "eval_samples_per_second": 304.364,
561
+ "eval_steps_per_second": 9.512,
562
+ "step": 3500
563
+ }
564
+ ],
565
+ "logging_steps": 50,
566
+ "max_steps": 8970,
567
+ "num_input_tokens_seen": 0,
568
+ "num_train_epochs": 2,
569
+ "save_steps": 500,
570
+ "total_flos": 2.215247118336e+16,
571
+ "train_batch_size": 16,
572
+ "trial_name": null,
573
+ "trial_params": null
574
+ }
checkpoint-3500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6394692b4ac18512a7a049387e4f1b46a7d6b86d79f1e0f6ae7b1378b6f6630a
3
+ size 5585
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b8a9d93f7fe6d70ed5ed4f6a8cad2934f1493bad03eac64cc9928b29b5c3895
3
  size 523322016
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51824f4fd0ce52faeece495ec0c0e9db2f0e9626377b20dde7c45a4e8aa47567
3
  size 523322016
tb_logs/events.out.tfevents.1761467516.luyao1.803017.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8779df14ca538e09876a816c95cdd56d04605dec21fbe850dc4c552c6a2c9d12
3
+ size 29835
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3548dcc79e6ba8c12ab5ce14e8d5d0a8edd35cabeee7b91e3a58bdaef9c8b298
3
  size 5585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6394692b4ac18512a7a049387e4f1b46a7d6b86d79f1e0f6ae7b1378b6f6630a
3
  size 5585