minpeter commited on
Commit
67de511
·
verified ·
1 Parent(s): 3f8971b

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd13ee0cbea58cba9cc7fc01eb302046b4bff7c2c4bc96437408e51e0a258984
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4452d8ee5d1c4b3f248050b462aef67647d83e8aa2c819475c2561ad6988260f
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ac14285b251f49d934b2757ab7b76d640647a69a6604df05d4abcea37c572f2
3
  size 373225675
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abc52b4912bc5a69beadb79f29cd708ffaef6d5ab82fd19a670038c92f29c313
3
  size 373225675
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:793829d79b248c3a7b8954f2cd95073c2ba034f6ee2bb0edff8ce8fef88cb5ad
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a3c442dd05d519f184832cb8cb76be210b67395e80e365a8e2c8fc2a9d09440
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80968dab5c533e2a1cf2f64e5806de56ff6c85624fdd0d13e21f084017ee671b
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f438d73941ac2939699522d3048115527267b7c8c06f9f728e1517b0c3c16832
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6237006237006237,
6
  "eval_steps": 100,
7
- "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2132,6 +2132,714 @@
2132
  "eval_samples_per_second": 23.972,
2133
  "eval_steps_per_second": 2.996,
2134
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2135
  }
2136
  ],
2137
  "logging_steps": 1,
@@ -2151,7 +2859,7 @@
2151
  "attributes": {}
2152
  }
2153
  },
2154
- "total_flos": 7.64170916069376e+16,
2155
  "train_batch_size": 32,
2156
  "trial_name": null,
2157
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8316008316008316,
6
  "eval_steps": 100,
7
+ "global_step": 400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2132
  "eval_samples_per_second": 23.972,
2133
  "eval_steps_per_second": 2.996,
2134
  "step": 300
2135
+ },
2136
+ {
2137
+ "epoch": 0.6257796257796258,
2138
+ "grad_norm": 0.361328125,
2139
+ "learning_rate": 0.00034091217642278086,
2140
+ "loss": 4.7977,
2141
+ "step": 301
2142
+ },
2143
+ {
2144
+ "epoch": 0.6278586278586279,
2145
+ "grad_norm": 0.283203125,
2146
+ "learning_rate": 0.0003376502653976583,
2147
+ "loss": 5.6144,
2148
+ "step": 302
2149
+ },
2150
+ {
2151
+ "epoch": 0.6299376299376299,
2152
+ "grad_norm": 0.3125,
2153
+ "learning_rate": 0.0003343960602114349,
2154
+ "loss": 5.3662,
2155
+ "step": 303
2156
+ },
2157
+ {
2158
+ "epoch": 0.632016632016632,
2159
+ "grad_norm": 0.279296875,
2160
+ "learning_rate": 0.0003311497153231305,
2161
+ "loss": 5.5636,
2162
+ "step": 304
2163
+ },
2164
+ {
2165
+ "epoch": 0.6340956340956341,
2166
+ "grad_norm": 0.2890625,
2167
+ "learning_rate": 0.00032791138481868084,
2168
+ "loss": 5.5391,
2169
+ "step": 305
2170
+ },
2171
+ {
2172
+ "epoch": 0.6361746361746362,
2173
+ "grad_norm": 0.3203125,
2174
+ "learning_rate": 0.00032468122240362287,
2175
+ "loss": 5.4312,
2176
+ "step": 306
2177
+ },
2178
+ {
2179
+ "epoch": 0.6382536382536382,
2180
+ "grad_norm": 0.330078125,
2181
+ "learning_rate": 0.0003214593813958001,
2182
+ "loss": 5.501,
2183
+ "step": 307
2184
+ },
2185
+ {
2186
+ "epoch": 0.6403326403326404,
2187
+ "grad_norm": 0.298828125,
2188
+ "learning_rate": 0.000318246014718085,
2189
+ "loss": 5.4276,
2190
+ "step": 308
2191
+ },
2192
+ {
2193
+ "epoch": 0.6424116424116424,
2194
+ "grad_norm": 0.26953125,
2195
+ "learning_rate": 0.00031504127489112105,
2196
+ "loss": 5.3774,
2197
+ "step": 309
2198
+ },
2199
+ {
2200
+ "epoch": 0.6444906444906445,
2201
+ "grad_norm": 0.31640625,
2202
+ "learning_rate": 0.0003118453140260823,
2203
+ "loss": 5.1568,
2204
+ "step": 310
2205
+ },
2206
+ {
2207
+ "epoch": 0.6465696465696466,
2208
+ "grad_norm": 0.265625,
2209
+ "learning_rate": 0.0003086582838174551,
2210
+ "loss": 5.7294,
2211
+ "step": 311
2212
+ },
2213
+ {
2214
+ "epoch": 0.6486486486486487,
2215
+ "grad_norm": 0.28515625,
2216
+ "learning_rate": 0.000305480335535837,
2217
+ "loss": 5.5416,
2218
+ "step": 312
2219
+ },
2220
+ {
2221
+ "epoch": 0.6507276507276507,
2222
+ "grad_norm": 0.28125,
2223
+ "learning_rate": 0.00030231162002075673,
2224
+ "loss": 5.3863,
2225
+ "step": 313
2226
+ },
2227
+ {
2228
+ "epoch": 0.6528066528066528,
2229
+ "grad_norm": 0.318359375,
2230
+ "learning_rate": 0.0002991522876735154,
2231
+ "loss": 5.1067,
2232
+ "step": 314
2233
+ },
2234
+ {
2235
+ "epoch": 0.6548856548856549,
2236
+ "grad_norm": 0.275390625,
2237
+ "learning_rate": 0.0002960024884500467,
2238
+ "loss": 5.5995,
2239
+ "step": 315
2240
+ },
2241
+ {
2242
+ "epoch": 0.656964656964657,
2243
+ "grad_norm": 0.384765625,
2244
+ "learning_rate": 0.0002928623718538006,
2245
+ "loss": 5.5833,
2246
+ "step": 316
2247
+ },
2248
+ {
2249
+ "epoch": 0.659043659043659,
2250
+ "grad_norm": 0.37890625,
2251
+ "learning_rate": 0.0002897320869286462,
2252
+ "loss": 4.8974,
2253
+ "step": 317
2254
+ },
2255
+ {
2256
+ "epoch": 0.6611226611226612,
2257
+ "grad_norm": 0.275390625,
2258
+ "learning_rate": 0.0002866117822517982,
2259
+ "loss": 5.291,
2260
+ "step": 318
2261
+ },
2262
+ {
2263
+ "epoch": 0.6632016632016632,
2264
+ "grad_norm": 0.3125,
2265
+ "learning_rate": 0.000283501605926764,
2266
+ "loss": 5.5008,
2267
+ "step": 319
2268
+ },
2269
+ {
2270
+ "epoch": 0.6652806652806653,
2271
+ "grad_norm": 0.271484375,
2272
+ "learning_rate": 0.0002804017055763149,
2273
+ "loss": 5.5551,
2274
+ "step": 320
2275
+ },
2276
+ {
2277
+ "epoch": 0.6673596673596673,
2278
+ "grad_norm": 0.296875,
2279
+ "learning_rate": 0.00027731222833547844,
2280
+ "loss": 5.6069,
2281
+ "step": 321
2282
+ },
2283
+ {
2284
+ "epoch": 0.6694386694386695,
2285
+ "grad_norm": 0.326171875,
2286
+ "learning_rate": 0.00027423332084455543,
2287
+ "loss": 5.4693,
2288
+ "step": 322
2289
+ },
2290
+ {
2291
+ "epoch": 0.6715176715176715,
2292
+ "grad_norm": 0.3828125,
2293
+ "learning_rate": 0.0002711651292421593,
2294
+ "loss": 5.2478,
2295
+ "step": 323
2296
+ },
2297
+ {
2298
+ "epoch": 0.6735966735966736,
2299
+ "grad_norm": 0.35546875,
2300
+ "learning_rate": 0.0002681077991582797,
2301
+ "loss": 5.3848,
2302
+ "step": 324
2303
+ },
2304
+ {
2305
+ "epoch": 0.6756756756756757,
2306
+ "grad_norm": 0.341796875,
2307
+ "learning_rate": 0.00026506147570737093,
2308
+ "loss": 5.4962,
2309
+ "step": 325
2310
+ },
2311
+ {
2312
+ "epoch": 0.6777546777546778,
2313
+ "grad_norm": 0.423828125,
2314
+ "learning_rate": 0.0002620263034814632,
2315
+ "loss": 5.1192,
2316
+ "step": 326
2317
+ },
2318
+ {
2319
+ "epoch": 0.6798336798336798,
2320
+ "grad_norm": 0.30078125,
2321
+ "learning_rate": 0.0002590024265433002,
2322
+ "loss": 5.2134,
2323
+ "step": 327
2324
+ },
2325
+ {
2326
+ "epoch": 0.681912681912682,
2327
+ "grad_norm": 0.58203125,
2328
+ "learning_rate": 0.00025598998841950106,
2329
+ "loss": 5.4984,
2330
+ "step": 328
2331
+ },
2332
+ {
2333
+ "epoch": 0.683991683991684,
2334
+ "grad_norm": 0.271484375,
2335
+ "learning_rate": 0.00025298913209374806,
2336
+ "loss": 5.4985,
2337
+ "step": 329
2338
+ },
2339
+ {
2340
+ "epoch": 0.6860706860706861,
2341
+ "grad_norm": 0.328125,
2342
+ "learning_rate": 0.0002500000000000001,
2343
+ "loss": 5.2018,
2344
+ "step": 330
2345
+ },
2346
+ {
2347
+ "epoch": 0.6881496881496881,
2348
+ "grad_norm": 0.5390625,
2349
+ "learning_rate": 0.0002470227340157316,
2350
+ "loss": 4.8436,
2351
+ "step": 331
2352
+ },
2353
+ {
2354
+ "epoch": 0.6902286902286903,
2355
+ "grad_norm": 0.33203125,
2356
+ "learning_rate": 0.00024405747545519962,
2357
+ "loss": 5.2047,
2358
+ "step": 332
2359
+ },
2360
+ {
2361
+ "epoch": 0.6923076923076923,
2362
+ "grad_norm": 0.349609375,
2363
+ "learning_rate": 0.00024110436506273432,
2364
+ "loss": 5.4741,
2365
+ "step": 333
2366
+ },
2367
+ {
2368
+ "epoch": 0.6943866943866944,
2369
+ "grad_norm": 0.40625,
2370
+ "learning_rate": 0.00023816354300606107,
2371
+ "loss": 4.9758,
2372
+ "step": 334
2373
+ },
2374
+ {
2375
+ "epoch": 0.6964656964656964,
2376
+ "grad_norm": 0.296875,
2377
+ "learning_rate": 0.0002352351488696457,
2378
+ "loss": 5.4983,
2379
+ "step": 335
2380
+ },
2381
+ {
2382
+ "epoch": 0.6985446985446986,
2383
+ "grad_norm": 0.283203125,
2384
+ "learning_rate": 0.0002323193216480698,
2385
+ "loss": 5.2683,
2386
+ "step": 336
2387
+ },
2388
+ {
2389
+ "epoch": 0.7006237006237006,
2390
+ "grad_norm": 0.306640625,
2391
+ "learning_rate": 0.00022941619973943362,
2392
+ "loss": 5.3248,
2393
+ "step": 337
2394
+ },
2395
+ {
2396
+ "epoch": 0.7027027027027027,
2397
+ "grad_norm": 0.2412109375,
2398
+ "learning_rate": 0.00022652592093878665,
2399
+ "loss": 5.6097,
2400
+ "step": 338
2401
+ },
2402
+ {
2403
+ "epoch": 0.7047817047817048,
2404
+ "grad_norm": 0.341796875,
2405
+ "learning_rate": 0.00022364862243158767,
2406
+ "loss": 5.3526,
2407
+ "step": 339
2408
+ },
2409
+ {
2410
+ "epoch": 0.7068607068607069,
2411
+ "grad_norm": 0.27734375,
2412
+ "learning_rate": 0.0002207844407871929,
2413
+ "loss": 5.4411,
2414
+ "step": 340
2415
+ },
2416
+ {
2417
+ "epoch": 0.7089397089397089,
2418
+ "grad_norm": 2.046875,
2419
+ "learning_rate": 0.0002179335119523745,
2420
+ "loss": 5.9201,
2421
+ "step": 341
2422
+ },
2423
+ {
2424
+ "epoch": 0.7110187110187111,
2425
+ "grad_norm": 0.3828125,
2426
+ "learning_rate": 0.0002150959712448669,
2427
+ "loss": 5.0856,
2428
+ "step": 342
2429
+ },
2430
+ {
2431
+ "epoch": 0.7130977130977131,
2432
+ "grad_norm": 0.267578125,
2433
+ "learning_rate": 0.000212271953346945,
2434
+ "loss": 5.4956,
2435
+ "step": 343
2436
+ },
2437
+ {
2438
+ "epoch": 0.7151767151767152,
2439
+ "grad_norm": 0.302734375,
2440
+ "learning_rate": 0.0002094615922990309,
2441
+ "loss": 5.3681,
2442
+ "step": 344
2443
+ },
2444
+ {
2445
+ "epoch": 0.7172557172557172,
2446
+ "grad_norm": 0.333984375,
2447
+ "learning_rate": 0.00020666502149333215,
2448
+ "loss": 5.342,
2449
+ "step": 345
2450
+ },
2451
+ {
2452
+ "epoch": 0.7193347193347194,
2453
+ "grad_norm": 0.31640625,
2454
+ "learning_rate": 0.00020388237366751006,
2455
+ "loss": 5.3868,
2456
+ "step": 346
2457
+ },
2458
+ {
2459
+ "epoch": 0.7214137214137214,
2460
+ "grad_norm": 0.42578125,
2461
+ "learning_rate": 0.00020111378089837957,
2462
+ "loss": 5.0104,
2463
+ "step": 347
2464
+ },
2465
+ {
2466
+ "epoch": 0.7234927234927235,
2467
+ "grad_norm": 0.2890625,
2468
+ "learning_rate": 0.00019835937459564064,
2469
+ "loss": 5.2236,
2470
+ "step": 348
2471
+ },
2472
+ {
2473
+ "epoch": 0.7255717255717256,
2474
+ "grad_norm": 0.296875,
2475
+ "learning_rate": 0.00019561928549563967,
2476
+ "loss": 5.4828,
2477
+ "step": 349
2478
+ },
2479
+ {
2480
+ "epoch": 0.7276507276507277,
2481
+ "grad_norm": 0.267578125,
2482
+ "learning_rate": 0.00019289364365516608,
2483
+ "loss": 5.7081,
2484
+ "step": 350
2485
+ },
2486
+ {
2487
+ "epoch": 0.7297297297297297,
2488
+ "grad_norm": 0.353515625,
2489
+ "learning_rate": 0.0001901825784452777,
2490
+ "loss": 5.3906,
2491
+ "step": 351
2492
+ },
2493
+ {
2494
+ "epoch": 0.7318087318087318,
2495
+ "grad_norm": 0.291015625,
2496
+ "learning_rate": 0.00018748621854516078,
2497
+ "loss": 5.1981,
2498
+ "step": 352
2499
+ },
2500
+ {
2501
+ "epoch": 0.7338877338877339,
2502
+ "grad_norm": 0.283203125,
2503
+ "learning_rate": 0.0001848046919360225,
2504
+ "loss": 5.4607,
2505
+ "step": 353
2506
+ },
2507
+ {
2508
+ "epoch": 0.735966735966736,
2509
+ "grad_norm": 0.259765625,
2510
+ "learning_rate": 0.0001821381258950161,
2511
+ "loss": 5.3527,
2512
+ "step": 354
2513
+ },
2514
+ {
2515
+ "epoch": 0.738045738045738,
2516
+ "grad_norm": 0.31640625,
2517
+ "learning_rate": 0.00017948664698919987,
2518
+ "loss": 5.4137,
2519
+ "step": 355
2520
+ },
2521
+ {
2522
+ "epoch": 0.7401247401247402,
2523
+ "grad_norm": 0.251953125,
2524
+ "learning_rate": 0.0001768503810695295,
2525
+ "loss": 5.3231,
2526
+ "step": 356
2527
+ },
2528
+ {
2529
+ "epoch": 0.7422037422037422,
2530
+ "grad_norm": 0.27734375,
2531
+ "learning_rate": 0.00017422945326488553,
2532
+ "loss": 5.3483,
2533
+ "step": 357
2534
+ },
2535
+ {
2536
+ "epoch": 0.7442827442827443,
2537
+ "grad_norm": 0.33984375,
2538
+ "learning_rate": 0.00017162398797613282,
2539
+ "loss": 5.2544,
2540
+ "step": 358
2541
+ },
2542
+ {
2543
+ "epoch": 0.7463617463617463,
2544
+ "grad_norm": 0.296875,
2545
+ "learning_rate": 0.00016903410887021675,
2546
+ "loss": 5.3142,
2547
+ "step": 359
2548
+ },
2549
+ {
2550
+ "epoch": 0.7484407484407485,
2551
+ "grad_norm": 0.263671875,
2552
+ "learning_rate": 0.00016645993887429345,
2553
+ "loss": 5.3094,
2554
+ "step": 360
2555
+ },
2556
+ {
2557
+ "epoch": 0.7505197505197505,
2558
+ "grad_norm": 0.37890625,
2559
+ "learning_rate": 0.00016390160016989486,
2560
+ "loss": 5.4099,
2561
+ "step": 361
2562
+ },
2563
+ {
2564
+ "epoch": 0.7525987525987526,
2565
+ "grad_norm": 0.3515625,
2566
+ "learning_rate": 0.00016135921418712956,
2567
+ "loss": 4.6715,
2568
+ "step": 362
2569
+ },
2570
+ {
2571
+ "epoch": 0.7546777546777547,
2572
+ "grad_norm": 0.2734375,
2573
+ "learning_rate": 0.00015883290159891906,
2574
+ "loss": 5.2596,
2575
+ "step": 363
2576
+ },
2577
+ {
2578
+ "epoch": 0.7567567567567568,
2579
+ "grad_norm": 0.314453125,
2580
+ "learning_rate": 0.0001563227823152708,
2581
+ "loss": 5.411,
2582
+ "step": 364
2583
+ },
2584
+ {
2585
+ "epoch": 0.7588357588357588,
2586
+ "grad_norm": 0.259765625,
2587
+ "learning_rate": 0.00015382897547758513,
2588
+ "loss": 5.3466,
2589
+ "step": 365
2590
+ },
2591
+ {
2592
+ "epoch": 0.760914760914761,
2593
+ "grad_norm": 0.25390625,
2594
+ "learning_rate": 0.0001513515994530023,
2595
+ "loss": 5.3548,
2596
+ "step": 366
2597
+ },
2598
+ {
2599
+ "epoch": 0.762993762993763,
2600
+ "grad_norm": 0.349609375,
2601
+ "learning_rate": 0.00014889077182878268,
2602
+ "loss": 5.1248,
2603
+ "step": 367
2604
+ },
2605
+ {
2606
+ "epoch": 0.7650727650727651,
2607
+ "grad_norm": 0.310546875,
2608
+ "learning_rate": 0.00014644660940672628,
2609
+ "loss": 4.9604,
2610
+ "step": 368
2611
+ },
2612
+ {
2613
+ "epoch": 0.7671517671517671,
2614
+ "grad_norm": 0.306640625,
2615
+ "learning_rate": 0.00014401922819762863,
2616
+ "loss": 5.2443,
2617
+ "step": 369
2618
+ },
2619
+ {
2620
+ "epoch": 0.7692307692307693,
2621
+ "grad_norm": 0.341796875,
2622
+ "learning_rate": 0.00014160874341577446,
2623
+ "loss": 4.8705,
2624
+ "step": 370
2625
+ },
2626
+ {
2627
+ "epoch": 0.7713097713097713,
2628
+ "grad_norm": 0.3046875,
2629
+ "learning_rate": 0.00013921526947346903,
2630
+ "loss": 5.2561,
2631
+ "step": 371
2632
+ },
2633
+ {
2634
+ "epoch": 0.7733887733887734,
2635
+ "grad_norm": 0.625,
2636
+ "learning_rate": 0.0001368389199756075,
2637
+ "loss": 5.5852,
2638
+ "step": 372
2639
+ },
2640
+ {
2641
+ "epoch": 0.7754677754677755,
2642
+ "grad_norm": 0.27734375,
2643
+ "learning_rate": 0.0001344798077142836,
2644
+ "loss": 5.2821,
2645
+ "step": 373
2646
+ },
2647
+ {
2648
+ "epoch": 0.7775467775467776,
2649
+ "grad_norm": 0.34375,
2650
+ "learning_rate": 0.0001321380446634342,
2651
+ "loss": 5.0683,
2652
+ "step": 374
2653
+ },
2654
+ {
2655
+ "epoch": 0.7796257796257796,
2656
+ "grad_norm": 0.27734375,
2657
+ "learning_rate": 0.00012981374197352664,
2658
+ "loss": 5.1648,
2659
+ "step": 375
2660
+ },
2661
+ {
2662
+ "epoch": 0.7817047817047817,
2663
+ "grad_norm": 0.33203125,
2664
+ "learning_rate": 0.0001275070099662815,
2665
+ "loss": 5.2669,
2666
+ "step": 376
2667
+ },
2668
+ {
2669
+ "epoch": 0.7837837837837838,
2670
+ "grad_norm": 0.28125,
2671
+ "learning_rate": 0.00012521795812943704,
2672
+ "loss": 5.4582,
2673
+ "step": 377
2674
+ },
2675
+ {
2676
+ "epoch": 0.7858627858627859,
2677
+ "grad_norm": 0.267578125,
2678
+ "learning_rate": 0.00012294669511155192,
2679
+ "loss": 5.3977,
2680
+ "step": 378
2681
+ },
2682
+ {
2683
+ "epoch": 0.7879417879417879,
2684
+ "grad_norm": 0.33203125,
2685
+ "learning_rate": 0.00012069332871684874,
2686
+ "loss": 4.7924,
2687
+ "step": 379
2688
+ },
2689
+ {
2690
+ "epoch": 0.7900207900207901,
2691
+ "grad_norm": 0.287109375,
2692
+ "learning_rate": 0.00011845796590009682,
2693
+ "loss": 5.4676,
2694
+ "step": 380
2695
+ },
2696
+ {
2697
+ "epoch": 0.7920997920997921,
2698
+ "grad_norm": 0.2421875,
2699
+ "learning_rate": 0.00011624071276153569,
2700
+ "loss": 5.6005,
2701
+ "step": 381
2702
+ },
2703
+ {
2704
+ "epoch": 0.7941787941787942,
2705
+ "grad_norm": 0.28125,
2706
+ "learning_rate": 0.00011404167454183955,
2707
+ "loss": 5.3724,
2708
+ "step": 382
2709
+ },
2710
+ {
2711
+ "epoch": 0.7962577962577962,
2712
+ "grad_norm": 0.451171875,
2713
+ "learning_rate": 0.00011186095561712129,
2714
+ "loss": 5.0408,
2715
+ "step": 383
2716
+ },
2717
+ {
2718
+ "epoch": 0.7983367983367984,
2719
+ "grad_norm": 0.251953125,
2720
+ "learning_rate": 0.000109698659493979,
2721
+ "loss": 5.41,
2722
+ "step": 384
2723
+ },
2724
+ {
2725
+ "epoch": 0.8004158004158004,
2726
+ "grad_norm": 0.30859375,
2727
+ "learning_rate": 0.0001075548888045827,
2728
+ "loss": 4.975,
2729
+ "step": 385
2730
+ },
2731
+ {
2732
+ "epoch": 0.8024948024948025,
2733
+ "grad_norm": 0.345703125,
2734
+ "learning_rate": 0.00010542974530180327,
2735
+ "loss": 5.4077,
2736
+ "step": 386
2737
+ },
2738
+ {
2739
+ "epoch": 0.8045738045738046,
2740
+ "grad_norm": 0.26171875,
2741
+ "learning_rate": 0.00010332332985438247,
2742
+ "loss": 5.1575,
2743
+ "step": 387
2744
+ },
2745
+ {
2746
+ "epoch": 0.8066528066528067,
2747
+ "grad_norm": 0.259765625,
2748
+ "learning_rate": 0.00010123574244214551,
2749
+ "loss": 5.6162,
2750
+ "step": 388
2751
+ },
2752
+ {
2753
+ "epoch": 0.8087318087318087,
2754
+ "grad_norm": 0.87890625,
2755
+ "learning_rate": 9.916708215125586e-05,
2756
+ "loss": 5.2549,
2757
+ "step": 389
2758
+ },
2759
+ {
2760
+ "epoch": 0.8108108108108109,
2761
+ "grad_norm": 0.439453125,
2762
+ "learning_rate": 9.711744716951093e-05,
2763
+ "loss": 4.9944,
2764
+ "step": 390
2765
+ },
2766
+ {
2767
+ "epoch": 0.8128898128898129,
2768
+ "grad_norm": 0.470703125,
2769
+ "learning_rate": 9.508693478168346e-05,
2770
+ "loss": 5.1125,
2771
+ "step": 391
2772
+ },
2773
+ {
2774
+ "epoch": 0.814968814968815,
2775
+ "grad_norm": 0.283203125,
2776
+ "learning_rate": 9.307564136490254e-05,
2777
+ "loss": 4.9202,
2778
+ "step": 392
2779
+ },
2780
+ {
2781
+ "epoch": 0.817047817047817,
2782
+ "grad_norm": 0.255859375,
2783
+ "learning_rate": 9.108366238407967e-05,
2784
+ "loss": 5.3454,
2785
+ "step": 393
2786
+ },
2787
+ {
2788
+ "epoch": 0.8191268191268192,
2789
+ "grad_norm": 0.2734375,
2790
+ "learning_rate": 8.911109238737747e-05,
2791
+ "loss": 5.283,
2792
+ "step": 394
2793
+ },
2794
+ {
2795
+ "epoch": 0.8212058212058212,
2796
+ "grad_norm": 0.40234375,
2797
+ "learning_rate": 8.715802500172215e-05,
2798
+ "loss": 4.732,
2799
+ "step": 395
2800
+ },
2801
+ {
2802
+ "epoch": 0.8232848232848233,
2803
+ "grad_norm": 0.28125,
2804
+ "learning_rate": 8.522455292835934e-05,
2805
+ "loss": 4.9245,
2806
+ "step": 396
2807
+ },
2808
+ {
2809
+ "epoch": 0.8253638253638254,
2810
+ "grad_norm": 0.259765625,
2811
+ "learning_rate": 8.331076793845421e-05,
2812
+ "loss": 4.9568,
2813
+ "step": 397
2814
+ },
2815
+ {
2816
+ "epoch": 0.8274428274428275,
2817
+ "grad_norm": 0.263671875,
2818
+ "learning_rate": 8.141676086873573e-05,
2819
+ "loss": 5.3107,
2820
+ "step": 398
2821
+ },
2822
+ {
2823
+ "epoch": 0.8295218295218295,
2824
+ "grad_norm": 0.33203125,
2825
+ "learning_rate": 7.954262161718479e-05,
2826
+ "loss": 4.9182,
2827
+ "step": 399
2828
+ },
2829
+ {
2830
+ "epoch": 0.8316008316008316,
2831
+ "grad_norm": 0.26171875,
2832
+ "learning_rate": 7.768843913876755e-05,
2833
+ "loss": 5.3271,
2834
+ "step": 400
2835
+ },
2836
+ {
2837
+ "epoch": 0.8316008316008316,
2838
+ "eval_loss": 4.718051910400391,
2839
+ "eval_runtime": 0.7111,
2840
+ "eval_samples_per_second": 22.501,
2841
+ "eval_steps_per_second": 2.813,
2842
+ "step": 400
2843
  }
2844
  ],
2845
  "logging_steps": 1,
 
2859
  "attributes": {}
2860
  }
2861
  },
2862
+ "total_flos": 1.018894554759168e+17,
2863
  "train_batch_size": 32,
2864
  "trial_name": null,
2865
  "trial_params": null