CocoRoF commited on
Commit
ee965d4
·
verified ·
1 Parent(s): c892087

Training in progress, step 1855, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5fee08fae4a270923d847f749b15fcd42f7381884da39c605eb17b3e0a9ce81
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8e57e667b960b56e96b51410b7155677ee5f6f65335d22becf61dd69a8772b6
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f39ccf0568d82c2eefebb237f16c6e23c73c67d0f03adba0677b59e01b1871d
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:836b2f95bd092426379cd5f1f88e3a5b0be2151ea5112d892b65276f45920021
3
  size 2375752250
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eebbda9031bbe83261af4068018fa312b4ce10a163bb58ed929c991e6ab70844
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35cfcc0427b746466615d2216d048b8c86e2bc0218bb45a5ecc4e1ea12650eab
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8084210526315789,
5
  "eval_steps": 500,
6
- "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2131,6 +2131,503 @@
2131
  "eval_samples_per_second": 602.367,
2132
  "eval_steps_per_second": 37.648,
2133
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2134
  }
2135
  ],
2136
  "logging_steps": 5,
@@ -2145,12 +2642,12 @@
2145
  "should_evaluate": false,
2146
  "should_log": false,
2147
  "should_save": true,
2148
- "should_training_stop": false
2149
  },
2150
  "attributes": {}
2151
  }
2152
  },
2153
- "total_flos": 6.498536989183181e+18,
2154
  "train_batch_size": 4,
2155
  "trial_name": null,
2156
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9997473684210526,
5
  "eval_steps": 500,
6
+ "global_step": 1855,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2131
  "eval_samples_per_second": 602.367,
2132
  "eval_steps_per_second": 37.648,
2133
  "step": 1500
2134
+ },
2135
+ {
2136
+ "epoch": 0.8111157894736842,
2137
+ "grad_norm": 113.875,
2138
+ "learning_rate": 2.0970641102456563e-06,
2139
+ "loss": 47.3825,
2140
+ "step": 1505
2141
+ },
2142
+ {
2143
+ "epoch": 0.8138105263157894,
2144
+ "grad_norm": 112.9375,
2145
+ "learning_rate": 2.0671060515278613e-06,
2146
+ "loss": 46.2206,
2147
+ "step": 1510
2148
+ },
2149
+ {
2150
+ "epoch": 0.8165052631578947,
2151
+ "grad_norm": 109.6875,
2152
+ "learning_rate": 2.037147992810066e-06,
2153
+ "loss": 47.1773,
2154
+ "step": 1515
2155
+ },
2156
+ {
2157
+ "epoch": 0.8192,
2158
+ "grad_norm": 106.1875,
2159
+ "learning_rate": 2.007189934092271e-06,
2160
+ "loss": 46.7185,
2161
+ "step": 1520
2162
+ },
2163
+ {
2164
+ "epoch": 0.8218947368421052,
2165
+ "grad_norm": 107.25,
2166
+ "learning_rate": 1.977231875374476e-06,
2167
+ "loss": 47.2794,
2168
+ "step": 1525
2169
+ },
2170
+ {
2171
+ "epoch": 0.8245894736842105,
2172
+ "grad_norm": 113.125,
2173
+ "learning_rate": 1.9472738166566808e-06,
2174
+ "loss": 46.1662,
2175
+ "step": 1530
2176
+ },
2177
+ {
2178
+ "epoch": 0.8272842105263158,
2179
+ "grad_norm": 106.1875,
2180
+ "learning_rate": 1.9173157579388858e-06,
2181
+ "loss": 47.7226,
2182
+ "step": 1535
2183
+ },
2184
+ {
2185
+ "epoch": 0.829978947368421,
2186
+ "grad_norm": 111.375,
2187
+ "learning_rate": 1.8873576992210907e-06,
2188
+ "loss": 46.4276,
2189
+ "step": 1540
2190
+ },
2191
+ {
2192
+ "epoch": 0.8326736842105263,
2193
+ "grad_norm": 114.4375,
2194
+ "learning_rate": 1.8573996405032955e-06,
2195
+ "loss": 46.6981,
2196
+ "step": 1545
2197
+ },
2198
+ {
2199
+ "epoch": 0.8353684210526315,
2200
+ "grad_norm": 104.0625,
2201
+ "learning_rate": 1.8274415817855005e-06,
2202
+ "loss": 46.4666,
2203
+ "step": 1550
2204
+ },
2205
+ {
2206
+ "epoch": 0.8380631578947368,
2207
+ "grad_norm": 106.6875,
2208
+ "learning_rate": 1.7974835230677055e-06,
2209
+ "loss": 45.6698,
2210
+ "step": 1555
2211
+ },
2212
+ {
2213
+ "epoch": 0.8407578947368421,
2214
+ "grad_norm": 108.625,
2215
+ "learning_rate": 1.7675254643499104e-06,
2216
+ "loss": 48.0453,
2217
+ "step": 1560
2218
+ },
2219
+ {
2220
+ "epoch": 0.8434526315789473,
2221
+ "grad_norm": 107.1875,
2222
+ "learning_rate": 1.7375674056321152e-06,
2223
+ "loss": 46.3295,
2224
+ "step": 1565
2225
+ },
2226
+ {
2227
+ "epoch": 0.8461473684210526,
2228
+ "grad_norm": 114.125,
2229
+ "learning_rate": 1.7076093469143202e-06,
2230
+ "loss": 46.0131,
2231
+ "step": 1570
2232
+ },
2233
+ {
2234
+ "epoch": 0.8488421052631578,
2235
+ "grad_norm": 98.4375,
2236
+ "learning_rate": 1.6776512881965251e-06,
2237
+ "loss": 47.3054,
2238
+ "step": 1575
2239
+ },
2240
+ {
2241
+ "epoch": 0.8515368421052631,
2242
+ "grad_norm": 109.5,
2243
+ "learning_rate": 1.64769322947873e-06,
2244
+ "loss": 46.6946,
2245
+ "step": 1580
2246
+ },
2247
+ {
2248
+ "epoch": 0.8542315789473685,
2249
+ "grad_norm": 107.375,
2250
+ "learning_rate": 1.6177351707609349e-06,
2251
+ "loss": 46.8322,
2252
+ "step": 1585
2253
+ },
2254
+ {
2255
+ "epoch": 0.8569263157894736,
2256
+ "grad_norm": 111.5625,
2257
+ "learning_rate": 1.5877771120431399e-06,
2258
+ "loss": 47.2501,
2259
+ "step": 1590
2260
+ },
2261
+ {
2262
+ "epoch": 0.859621052631579,
2263
+ "grad_norm": 109.4375,
2264
+ "learning_rate": 1.5578190533253446e-06,
2265
+ "loss": 47.1194,
2266
+ "step": 1595
2267
+ },
2268
+ {
2269
+ "epoch": 0.8623157894736843,
2270
+ "grad_norm": 110.625,
2271
+ "learning_rate": 1.5278609946075496e-06,
2272
+ "loss": 45.8835,
2273
+ "step": 1600
2274
+ },
2275
+ {
2276
+ "epoch": 0.8650105263157895,
2277
+ "grad_norm": 104.3125,
2278
+ "learning_rate": 1.4979029358897546e-06,
2279
+ "loss": 46.3103,
2280
+ "step": 1605
2281
+ },
2282
+ {
2283
+ "epoch": 0.8677052631578948,
2284
+ "grad_norm": 105.4375,
2285
+ "learning_rate": 1.4679448771719593e-06,
2286
+ "loss": 45.329,
2287
+ "step": 1610
2288
+ },
2289
+ {
2290
+ "epoch": 0.8704,
2291
+ "grad_norm": 108.75,
2292
+ "learning_rate": 1.4379868184541643e-06,
2293
+ "loss": 46.3264,
2294
+ "step": 1615
2295
+ },
2296
+ {
2297
+ "epoch": 0.8730947368421053,
2298
+ "grad_norm": 105.6875,
2299
+ "learning_rate": 1.4080287597363693e-06,
2300
+ "loss": 45.5464,
2301
+ "step": 1620
2302
+ },
2303
+ {
2304
+ "epoch": 0.8757894736842106,
2305
+ "grad_norm": 111.5625,
2306
+ "learning_rate": 1.378070701018574e-06,
2307
+ "loss": 45.2749,
2308
+ "step": 1625
2309
+ },
2310
+ {
2311
+ "epoch": 0.8784842105263158,
2312
+ "grad_norm": 106.125,
2313
+ "learning_rate": 1.348112642300779e-06,
2314
+ "loss": 46.0788,
2315
+ "step": 1630
2316
+ },
2317
+ {
2318
+ "epoch": 0.8811789473684211,
2319
+ "grad_norm": 105.375,
2320
+ "learning_rate": 1.318154583582984e-06,
2321
+ "loss": 46.3733,
2322
+ "step": 1635
2323
+ },
2324
+ {
2325
+ "epoch": 0.8838736842105264,
2326
+ "grad_norm": 102.9375,
2327
+ "learning_rate": 1.288196524865189e-06,
2328
+ "loss": 45.4384,
2329
+ "step": 1640
2330
+ },
2331
+ {
2332
+ "epoch": 0.8865684210526316,
2333
+ "grad_norm": 110.4375,
2334
+ "learning_rate": 1.2582384661473938e-06,
2335
+ "loss": 46.3224,
2336
+ "step": 1645
2337
+ },
2338
+ {
2339
+ "epoch": 0.8892631578947369,
2340
+ "grad_norm": 104.5,
2341
+ "learning_rate": 1.2282804074295987e-06,
2342
+ "loss": 44.5758,
2343
+ "step": 1650
2344
+ },
2345
+ {
2346
+ "epoch": 0.8919578947368421,
2347
+ "grad_norm": 107.1875,
2348
+ "learning_rate": 1.1983223487118035e-06,
2349
+ "loss": 46.2027,
2350
+ "step": 1655
2351
+ },
2352
+ {
2353
+ "epoch": 0.8946526315789474,
2354
+ "grad_norm": 105.375,
2355
+ "learning_rate": 1.1683642899940085e-06,
2356
+ "loss": 45.8361,
2357
+ "step": 1660
2358
+ },
2359
+ {
2360
+ "epoch": 0.8973473684210527,
2361
+ "grad_norm": 103.1875,
2362
+ "learning_rate": 1.1384062312762134e-06,
2363
+ "loss": 46.9477,
2364
+ "step": 1665
2365
+ },
2366
+ {
2367
+ "epoch": 0.9000421052631579,
2368
+ "grad_norm": 105.9375,
2369
+ "learning_rate": 1.1084481725584182e-06,
2370
+ "loss": 46.2056,
2371
+ "step": 1670
2372
+ },
2373
+ {
2374
+ "epoch": 0.9027368421052632,
2375
+ "grad_norm": 105.0625,
2376
+ "learning_rate": 1.0784901138406232e-06,
2377
+ "loss": 46.3934,
2378
+ "step": 1675
2379
+ },
2380
+ {
2381
+ "epoch": 0.9054315789473685,
2382
+ "grad_norm": 110.5,
2383
+ "learning_rate": 1.0485320551228282e-06,
2384
+ "loss": 45.8651,
2385
+ "step": 1680
2386
+ },
2387
+ {
2388
+ "epoch": 0.9081263157894737,
2389
+ "grad_norm": 106.1875,
2390
+ "learning_rate": 1.018573996405033e-06,
2391
+ "loss": 44.2944,
2392
+ "step": 1685
2393
+ },
2394
+ {
2395
+ "epoch": 0.910821052631579,
2396
+ "grad_norm": 100.6875,
2397
+ "learning_rate": 9.88615937687238e-07,
2398
+ "loss": 45.6455,
2399
+ "step": 1690
2400
+ },
2401
+ {
2402
+ "epoch": 0.9135157894736842,
2403
+ "grad_norm": 102.625,
2404
+ "learning_rate": 9.586578789694429e-07,
2405
+ "loss": 45.1027,
2406
+ "step": 1695
2407
+ },
2408
+ {
2409
+ "epoch": 0.9162105263157895,
2410
+ "grad_norm": 108.125,
2411
+ "learning_rate": 9.286998202516478e-07,
2412
+ "loss": 46.1671,
2413
+ "step": 1700
2414
+ },
2415
+ {
2416
+ "epoch": 0.9189052631578948,
2417
+ "grad_norm": 108.4375,
2418
+ "learning_rate": 8.987417615338527e-07,
2419
+ "loss": 44.9381,
2420
+ "step": 1705
2421
+ },
2422
+ {
2423
+ "epoch": 0.9216,
2424
+ "grad_norm": 98.875,
2425
+ "learning_rate": 8.687837028160576e-07,
2426
+ "loss": 45.2336,
2427
+ "step": 1710
2428
+ },
2429
+ {
2430
+ "epoch": 0.9242947368421053,
2431
+ "grad_norm": 112.6875,
2432
+ "learning_rate": 8.388256440982626e-07,
2433
+ "loss": 45.8469,
2434
+ "step": 1715
2435
+ },
2436
+ {
2437
+ "epoch": 0.9269894736842105,
2438
+ "grad_norm": 108.0,
2439
+ "learning_rate": 8.088675853804674e-07,
2440
+ "loss": 45.2639,
2441
+ "step": 1720
2442
+ },
2443
+ {
2444
+ "epoch": 0.9296842105263158,
2445
+ "grad_norm": 107.9375,
2446
+ "learning_rate": 7.789095266626723e-07,
2447
+ "loss": 45.5577,
2448
+ "step": 1725
2449
+ },
2450
+ {
2451
+ "epoch": 0.9323789473684211,
2452
+ "grad_norm": 100.6875,
2453
+ "learning_rate": 7.489514679448773e-07,
2454
+ "loss": 44.7973,
2455
+ "step": 1730
2456
+ },
2457
+ {
2458
+ "epoch": 0.9350736842105263,
2459
+ "grad_norm": 110.0,
2460
+ "learning_rate": 7.189934092270822e-07,
2461
+ "loss": 45.1726,
2462
+ "step": 1735
2463
+ },
2464
+ {
2465
+ "epoch": 0.9377684210526316,
2466
+ "grad_norm": 104.125,
2467
+ "learning_rate": 6.89035350509287e-07,
2468
+ "loss": 44.1223,
2469
+ "step": 1740
2470
+ },
2471
+ {
2472
+ "epoch": 0.9404631578947369,
2473
+ "grad_norm": 101.5,
2474
+ "learning_rate": 6.59077291791492e-07,
2475
+ "loss": 45.1792,
2476
+ "step": 1745
2477
+ },
2478
+ {
2479
+ "epoch": 0.9431578947368421,
2480
+ "grad_norm": 103.5625,
2481
+ "learning_rate": 6.291192330736969e-07,
2482
+ "loss": 46.004,
2483
+ "step": 1750
2484
+ },
2485
+ {
2486
+ "epoch": 0.9458526315789474,
2487
+ "grad_norm": 108.0,
2488
+ "learning_rate": 5.991611743559017e-07,
2489
+ "loss": 44.949,
2490
+ "step": 1755
2491
+ },
2492
+ {
2493
+ "epoch": 0.9485473684210526,
2494
+ "grad_norm": 103.5625,
2495
+ "learning_rate": 5.692031156381067e-07,
2496
+ "loss": 45.3909,
2497
+ "step": 1760
2498
+ },
2499
+ {
2500
+ "epoch": 0.9512421052631579,
2501
+ "grad_norm": 105.375,
2502
+ "learning_rate": 5.392450569203116e-07,
2503
+ "loss": 45.4842,
2504
+ "step": 1765
2505
+ },
2506
+ {
2507
+ "epoch": 0.9539368421052632,
2508
+ "grad_norm": 109.625,
2509
+ "learning_rate": 5.092869982025165e-07,
2510
+ "loss": 44.3717,
2511
+ "step": 1770
2512
+ },
2513
+ {
2514
+ "epoch": 0.9566315789473684,
2515
+ "grad_norm": 98.9375,
2516
+ "learning_rate": 4.793289394847214e-07,
2517
+ "loss": 45.0351,
2518
+ "step": 1775
2519
+ },
2520
+ {
2521
+ "epoch": 0.9593263157894737,
2522
+ "grad_norm": 104.0,
2523
+ "learning_rate": 4.4937088076692636e-07,
2524
+ "loss": 44.7382,
2525
+ "step": 1780
2526
+ },
2527
+ {
2528
+ "epoch": 0.962021052631579,
2529
+ "grad_norm": 104.875,
2530
+ "learning_rate": 4.194128220491313e-07,
2531
+ "loss": 44.2659,
2532
+ "step": 1785
2533
+ },
2534
+ {
2535
+ "epoch": 0.9647157894736842,
2536
+ "grad_norm": 107.6875,
2537
+ "learning_rate": 3.8945476333133616e-07,
2538
+ "loss": 44.9655,
2539
+ "step": 1790
2540
+ },
2541
+ {
2542
+ "epoch": 0.9674105263157895,
2543
+ "grad_norm": 108.5625,
2544
+ "learning_rate": 3.594967046135411e-07,
2545
+ "loss": 46.1188,
2546
+ "step": 1795
2547
+ },
2548
+ {
2549
+ "epoch": 0.9701052631578947,
2550
+ "grad_norm": 104.625,
2551
+ "learning_rate": 3.29538645895746e-07,
2552
+ "loss": 44.0362,
2553
+ "step": 1800
2554
+ },
2555
+ {
2556
+ "epoch": 0.9728,
2557
+ "grad_norm": 99.1875,
2558
+ "learning_rate": 2.995805871779509e-07,
2559
+ "loss": 44.2291,
2560
+ "step": 1805
2561
+ },
2562
+ {
2563
+ "epoch": 0.9754947368421053,
2564
+ "grad_norm": 107.75,
2565
+ "learning_rate": 2.696225284601558e-07,
2566
+ "loss": 44.9852,
2567
+ "step": 1810
2568
+ },
2569
+ {
2570
+ "epoch": 0.9781894736842105,
2571
+ "grad_norm": 108.625,
2572
+ "learning_rate": 2.396644697423607e-07,
2573
+ "loss": 44.8573,
2574
+ "step": 1815
2575
+ },
2576
+ {
2577
+ "epoch": 0.9808842105263158,
2578
+ "grad_norm": 108.125,
2579
+ "learning_rate": 2.0970641102456564e-07,
2580
+ "loss": 44.9958,
2581
+ "step": 1820
2582
+ },
2583
+ {
2584
+ "epoch": 0.983578947368421,
2585
+ "grad_norm": 107.6875,
2586
+ "learning_rate": 1.7974835230677054e-07,
2587
+ "loss": 43.81,
2588
+ "step": 1825
2589
+ },
2590
+ {
2591
+ "epoch": 0.9862736842105263,
2592
+ "grad_norm": 106.0625,
2593
+ "learning_rate": 1.4979029358897544e-07,
2594
+ "loss": 44.0496,
2595
+ "step": 1830
2596
+ },
2597
+ {
2598
+ "epoch": 0.9889684210526316,
2599
+ "grad_norm": 104.9375,
2600
+ "learning_rate": 1.1983223487118036e-07,
2601
+ "loss": 43.5615,
2602
+ "step": 1835
2603
+ },
2604
+ {
2605
+ "epoch": 0.9916631578947368,
2606
+ "grad_norm": 106.5625,
2607
+ "learning_rate": 8.987417615338527e-08,
2608
+ "loss": 44.9755,
2609
+ "step": 1840
2610
+ },
2611
+ {
2612
+ "epoch": 0.9943578947368421,
2613
+ "grad_norm": 108.0,
2614
+ "learning_rate": 5.991611743559018e-08,
2615
+ "loss": 42.8911,
2616
+ "step": 1845
2617
+ },
2618
+ {
2619
+ "epoch": 0.9970526315789474,
2620
+ "grad_norm": 104.1875,
2621
+ "learning_rate": 2.995805871779509e-08,
2622
+ "loss": 44.0027,
2623
+ "step": 1850
2624
+ },
2625
+ {
2626
+ "epoch": 0.9997473684210526,
2627
+ "grad_norm": 110.8125,
2628
+ "learning_rate": 0.0,
2629
+ "loss": 44.4761,
2630
+ "step": 1855
2631
  }
2632
  ],
2633
  "logging_steps": 5,
 
2642
  "should_evaluate": false,
2643
  "should_log": false,
2644
  "should_save": true,
2645
+ "should_training_stop": true
2646
  },
2647
  "attributes": {}
2648
  }
2649
  },
2650
+ "total_flos": 8.0365240766232e+18,
2651
  "train_batch_size": 4,
2652
  "trial_name": null,
2653
  "trial_params": null