alessandronascimento commited on
Commit
16414f7
·
verified ·
1 Parent(s): 6afff68

Training in progress, epoch 5, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a231763b5faedaca18c9c191c07586b3550c6053a1eddc5ce3eee6f01c3b980
3
  size 852404428
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f873b0e350ecae608ae2a7c6d5064a22a73d12e41a6adc123d385a128fd6ea9
3
  size 852404428
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db9aea2220afea8c93b03bf9c5dc034e1c9c9fdd3ba5f8b9d73dc58e1a4f72ef
3
  size 1705187266
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148de1705a5a44a135e6a613648d0a1b113e2d7acafe2bee645417f3d4b152db
3
  size 1705187266
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66de308632efa7bbf1e857a5abd292ee172491d8a7d235c0db723e6bccfc78f2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc39a926741056c260a97c86cef4ca60de38e0b73eeb05622c76a983f99f2c84
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ac3e56631fd633144f95d1e4f53fccbe783b424998d247387be1c2ff61fe767
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ccdb98bd899bb8d04e411fe55a1b125ef862b3eaa3dc4b4994d14e842f255ee
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.8315503597259521,
3
  "best_model_checkpoint": "./results/checkpoint-63554",
4
- "epoch": 4.999996066352762,
5
  "eval_steps": 500,
6
- "global_step": 158885,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2266,6 +2266,462 @@
2266
  "eval_samples_per_second": 54.646,
2267
  "eval_steps_per_second": 0.854,
2268
  "step": 158885
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2269
  }
2270
  ],
2271
  "logging_steps": 500,
@@ -2294,7 +2750,7 @@
2294
  "attributes": {}
2295
  }
2296
  },
2297
- "total_flos": 3.216354771107147e+18,
2298
  "train_batch_size": 4,
2299
  "trial_name": null,
2300
  "trial_params": null
 
1
  {
2
  "best_metric": 0.8315503597259521,
3
  "best_model_checkpoint": "./results/checkpoint-63554",
4
+ "epoch": 5.999996066352762,
5
  "eval_steps": 500,
6
+ "global_step": 190662,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2266
  "eval_samples_per_second": 54.646,
2267
  "eval_steps_per_second": 0.854,
2268
  "step": 158885
2269
+ },
2270
+ {
2271
+ "epoch": 5.003618955459312,
2272
+ "grad_norm": 93093.9921875,
2273
+ "learning_rate": 1.0088047426114604e-05,
2274
+ "loss": 0.4392,
2275
+ "step": 159000
2276
+ },
2277
+ {
2278
+ "epoch": 5.019353544412844,
2279
+ "grad_norm": 101550.875,
2280
+ "learning_rate": 1.0038303518828214e-05,
2281
+ "loss": 0.4355,
2282
+ "step": 159500
2283
+ },
2284
+ {
2285
+ "epoch": 5.035088133366376,
2286
+ "grad_norm": 87909.828125,
2287
+ "learning_rate": 9.988558663700153e-06,
2288
+ "loss": 0.4364,
2289
+ "step": 160000
2290
+ },
2291
+ {
2292
+ "epoch": 5.050822722319908,
2293
+ "grad_norm": 91443.0078125,
2294
+ "learning_rate": 9.938814091694261e-06,
2295
+ "loss": 0.4357,
2296
+ "step": 160500
2297
+ },
2298
+ {
2299
+ "epoch": 5.0665573112734394,
2300
+ "grad_norm": 113526.984375,
2301
+ "learning_rate": 9.889071033767369e-06,
2302
+ "loss": 0.4365,
2303
+ "step": 161000
2304
+ },
2305
+ {
2306
+ "epoch": 5.082291900226972,
2307
+ "grad_norm": 96782.1640625,
2308
+ "learning_rate": 9.839330720838837e-06,
2309
+ "loss": 0.4369,
2310
+ "step": 161500
2311
+ },
2312
+ {
2313
+ "epoch": 5.098026489180503,
2314
+ "grad_norm": 94661.75,
2315
+ "learning_rate": 9.789594383760112e-06,
2316
+ "loss": 0.4345,
2317
+ "step": 162000
2318
+ },
2319
+ {
2320
+ "epoch": 5.1137610781340355,
2321
+ "grad_norm": 88317.609375,
2322
+ "learning_rate": 9.73986325328424e-06,
2323
+ "loss": 0.4334,
2324
+ "step": 162500
2325
+ },
2326
+ {
2327
+ "epoch": 5.129495667087567,
2328
+ "grad_norm": 85447.578125,
2329
+ "learning_rate": 9.690138560035441e-06,
2330
+ "loss": 0.4383,
2331
+ "step": 163000
2332
+ },
2333
+ {
2334
+ "epoch": 5.145230256041099,
2335
+ "grad_norm": 88336.3125,
2336
+ "learning_rate": 9.64042153447863e-06,
2337
+ "loss": 0.4336,
2338
+ "step": 163500
2339
+ },
2340
+ {
2341
+ "epoch": 5.160964844994631,
2342
+ "grad_norm": 106095.5,
2343
+ "learning_rate": 9.59071340688899e-06,
2344
+ "loss": 0.4359,
2345
+ "step": 164000
2346
+ },
2347
+ {
2348
+ "epoch": 5.176699433948162,
2349
+ "grad_norm": 99273.8203125,
2350
+ "learning_rate": 9.541015407321514e-06,
2351
+ "loss": 0.4351,
2352
+ "step": 164500
2353
+ },
2354
+ {
2355
+ "epoch": 5.192434022901694,
2356
+ "grad_norm": 90265.3125,
2357
+ "learning_rate": 9.491328765580572e-06,
2358
+ "loss": 0.4408,
2359
+ "step": 165000
2360
+ },
2361
+ {
2362
+ "epoch": 5.208168611855226,
2363
+ "grad_norm": 79781.9140625,
2364
+ "learning_rate": 9.441654711189482e-06,
2365
+ "loss": 0.4333,
2366
+ "step": 165500
2367
+ },
2368
+ {
2369
+ "epoch": 5.223903200808758,
2370
+ "grad_norm": 94186.921875,
2371
+ "learning_rate": 9.391994473360074e-06,
2372
+ "loss": 0.4368,
2373
+ "step": 166000
2374
+ },
2375
+ {
2376
+ "epoch": 5.239637789762289,
2377
+ "grad_norm": 92022.53125,
2378
+ "learning_rate": 9.342349280962287e-06,
2379
+ "loss": 0.4363,
2380
+ "step": 166500
2381
+ },
2382
+ {
2383
+ "epoch": 5.255372378715822,
2384
+ "grad_norm": 99226.21875,
2385
+ "learning_rate": 9.292720362493748e-06,
2386
+ "loss": 0.434,
2387
+ "step": 167000
2388
+ },
2389
+ {
2390
+ "epoch": 5.271106967669353,
2391
+ "grad_norm": 84206.7421875,
2392
+ "learning_rate": 9.24310894604938e-06,
2393
+ "loss": 0.4369,
2394
+ "step": 167500
2395
+ },
2396
+ {
2397
+ "epoch": 5.2868415566228855,
2398
+ "grad_norm": 99916.9921875,
2399
+ "learning_rate": 9.193516259291002e-06,
2400
+ "loss": 0.4353,
2401
+ "step": 168000
2402
+ },
2403
+ {
2404
+ "epoch": 5.302576145576417,
2405
+ "grad_norm": 101395.203125,
2406
+ "learning_rate": 9.143943529416966e-06,
2407
+ "loss": 0.4351,
2408
+ "step": 168500
2409
+ },
2410
+ {
2411
+ "epoch": 5.318310734529949,
2412
+ "grad_norm": 106872.765625,
2413
+ "learning_rate": 9.09439198313177e-06,
2414
+ "loss": 0.4349,
2415
+ "step": 169000
2416
+ },
2417
+ {
2418
+ "epoch": 5.334045323483481,
2419
+ "grad_norm": 85570.0859375,
2420
+ "learning_rate": 9.044862846615724e-06,
2421
+ "loss": 0.4342,
2422
+ "step": 169500
2423
+ },
2424
+ {
2425
+ "epoch": 5.349779912437013,
2426
+ "grad_norm": 89842.7421875,
2427
+ "learning_rate": 8.995357345494588e-06,
2428
+ "loss": 0.4376,
2429
+ "step": 170000
2430
+ },
2431
+ {
2432
+ "epoch": 5.365514501390544,
2433
+ "grad_norm": 87297.84375,
2434
+ "learning_rate": 8.94587670480925e-06,
2435
+ "loss": 0.4342,
2436
+ "step": 170500
2437
+ },
2438
+ {
2439
+ "epoch": 5.381249090344076,
2440
+ "grad_norm": 83102.15625,
2441
+ "learning_rate": 8.896422148985418e-06,
2442
+ "loss": 0.4337,
2443
+ "step": 171000
2444
+ },
2445
+ {
2446
+ "epoch": 5.396983679297608,
2447
+ "grad_norm": 105961.4296875,
2448
+ "learning_rate": 8.846994901803313e-06,
2449
+ "loss": 0.4356,
2450
+ "step": 171500
2451
+ },
2452
+ {
2453
+ "epoch": 5.412718268251139,
2454
+ "grad_norm": 88251.3046875,
2455
+ "learning_rate": 8.797596186367387e-06,
2456
+ "loss": 0.4358,
2457
+ "step": 172000
2458
+ },
2459
+ {
2460
+ "epoch": 5.428452857204672,
2461
+ "grad_norm": 89459.875,
2462
+ "learning_rate": 8.748227225076064e-06,
2463
+ "loss": 0.4347,
2464
+ "step": 172500
2465
+ },
2466
+ {
2467
+ "epoch": 5.444187446158203,
2468
+ "grad_norm": 85620.671875,
2469
+ "learning_rate": 8.698889239591477e-06,
2470
+ "loss": 0.4356,
2471
+ "step": 173000
2472
+ },
2473
+ {
2474
+ "epoch": 5.4599220351117355,
2475
+ "grad_norm": 90309.140625,
2476
+ "learning_rate": 8.649583450809254e-06,
2477
+ "loss": 0.4331,
2478
+ "step": 173500
2479
+ },
2480
+ {
2481
+ "epoch": 5.475656624065267,
2482
+ "grad_norm": 89333.3671875,
2483
+ "learning_rate": 8.600311078828291e-06,
2484
+ "loss": 0.4369,
2485
+ "step": 174000
2486
+ },
2487
+ {
2488
+ "epoch": 5.491391213018799,
2489
+ "grad_norm": 99571.203125,
2490
+ "learning_rate": 8.55107334292057e-06,
2491
+ "loss": 0.4334,
2492
+ "step": 174500
2493
+ },
2494
+ {
2495
+ "epoch": 5.507125801972331,
2496
+ "grad_norm": 103962.2734375,
2497
+ "learning_rate": 8.501871461500981e-06,
2498
+ "loss": 0.4339,
2499
+ "step": 175000
2500
+ },
2501
+ {
2502
+ "epoch": 5.522860390925863,
2503
+ "grad_norm": 90352.78125,
2504
+ "learning_rate": 8.452706652097187e-06,
2505
+ "loss": 0.4311,
2506
+ "step": 175500
2507
+ },
2508
+ {
2509
+ "epoch": 5.538594979879394,
2510
+ "grad_norm": 85944.9609375,
2511
+ "learning_rate": 8.403580131319469e-06,
2512
+ "loss": 0.4325,
2513
+ "step": 176000
2514
+ },
2515
+ {
2516
+ "epoch": 5.554329568832927,
2517
+ "grad_norm": 81867.296875,
2518
+ "learning_rate": 8.354493114830642e-06,
2519
+ "loss": 0.4332,
2520
+ "step": 176500
2521
+ },
2522
+ {
2523
+ "epoch": 5.570064157786458,
2524
+ "grad_norm": 100460.6171875,
2525
+ "learning_rate": 8.305446817315961e-06,
2526
+ "loss": 0.4351,
2527
+ "step": 177000
2528
+ },
2529
+ {
2530
+ "epoch": 5.585798746739989,
2531
+ "grad_norm": 92339.1953125,
2532
+ "learning_rate": 8.256442452453073e-06,
2533
+ "loss": 0.4329,
2534
+ "step": 177500
2535
+ },
2536
+ {
2537
+ "epoch": 5.601533335693522,
2538
+ "grad_norm": 89528.234375,
2539
+ "learning_rate": 8.207481232881975e-06,
2540
+ "loss": 0.4326,
2541
+ "step": 178000
2542
+ },
2543
+ {
2544
+ "epoch": 5.617267924647053,
2545
+ "grad_norm": 98558.296875,
2546
+ "learning_rate": 8.15856437017501e-06,
2547
+ "loss": 0.4347,
2548
+ "step": 178500
2549
+ },
2550
+ {
2551
+ "epoch": 5.6330025136005855,
2552
+ "grad_norm": 96948.5546875,
2553
+ "learning_rate": 8.109693074806891e-06,
2554
+ "loss": 0.4348,
2555
+ "step": 179000
2556
+ },
2557
+ {
2558
+ "epoch": 5.648737102554117,
2559
+ "grad_norm": 99710.2890625,
2560
+ "learning_rate": 8.060868556124735e-06,
2561
+ "loss": 0.4342,
2562
+ "step": 179500
2563
+ },
2564
+ {
2565
+ "epoch": 5.664471691507649,
2566
+ "grad_norm": 103463.359375,
2567
+ "learning_rate": 8.012092022318148e-06,
2568
+ "loss": 0.4304,
2569
+ "step": 180000
2570
+ },
2571
+ {
2572
+ "epoch": 5.680206280461181,
2573
+ "grad_norm": 87654.609375,
2574
+ "learning_rate": 7.963364680389322e-06,
2575
+ "loss": 0.4281,
2576
+ "step": 180500
2577
+ },
2578
+ {
2579
+ "epoch": 5.695940869414713,
2580
+ "grad_norm": 81925.9609375,
2581
+ "learning_rate": 7.914687736123171e-06,
2582
+ "loss": 0.435,
2583
+ "step": 181000
2584
+ },
2585
+ {
2586
+ "epoch": 5.711675458368244,
2587
+ "grad_norm": 89614.90625,
2588
+ "learning_rate": 7.866062394057486e-06,
2589
+ "loss": 0.4346,
2590
+ "step": 181500
2591
+ },
2592
+ {
2593
+ "epoch": 5.727410047321777,
2594
+ "grad_norm": 93440.1171875,
2595
+ "learning_rate": 7.817489857453146e-06,
2596
+ "loss": 0.4336,
2597
+ "step": 182000
2598
+ },
2599
+ {
2600
+ "epoch": 5.743144636275308,
2601
+ "grad_norm": 93339.640625,
2602
+ "learning_rate": 7.768971328264314e-06,
2603
+ "loss": 0.4325,
2604
+ "step": 182500
2605
+ },
2606
+ {
2607
+ "epoch": 5.75887922522884,
2608
+ "grad_norm": 107675.4296875,
2609
+ "learning_rate": 7.720508007108721e-06,
2610
+ "loss": 0.4275,
2611
+ "step": 183000
2612
+ },
2613
+ {
2614
+ "epoch": 5.774613814182372,
2615
+ "grad_norm": 91257.9609375,
2616
+ "learning_rate": 7.672101093237936e-06,
2617
+ "loss": 0.4318,
2618
+ "step": 183500
2619
+ },
2620
+ {
2621
+ "epoch": 5.790348403135903,
2622
+ "grad_norm": 93339.4453125,
2623
+ "learning_rate": 7.623751784507707e-06,
2624
+ "loss": 0.4306,
2625
+ "step": 184000
2626
+ },
2627
+ {
2628
+ "epoch": 5.8060829920894355,
2629
+ "grad_norm": 95532.234375,
2630
+ "learning_rate": 7.575461277348304e-06,
2631
+ "loss": 0.431,
2632
+ "step": 184500
2633
+ },
2634
+ {
2635
+ "epoch": 5.821817581042967,
2636
+ "grad_norm": 95540.015625,
2637
+ "learning_rate": 7.527230766734925e-06,
2638
+ "loss": 0.4297,
2639
+ "step": 185000
2640
+ },
2641
+ {
2642
+ "epoch": 5.837552169996499,
2643
+ "grad_norm": 99399.796875,
2644
+ "learning_rate": 7.479061446158119e-06,
2645
+ "loss": 0.4273,
2646
+ "step": 185500
2647
+ },
2648
+ {
2649
+ "epoch": 5.853286758950031,
2650
+ "grad_norm": 99299.875,
2651
+ "learning_rate": 7.4309545075942494e-06,
2652
+ "loss": 0.4295,
2653
+ "step": 186000
2654
+ },
2655
+ {
2656
+ "epoch": 5.869021347903563,
2657
+ "grad_norm": 95743.7265625,
2658
+ "learning_rate": 7.38291114147601e-06,
2659
+ "loss": 0.4305,
2660
+ "step": 186500
2661
+ },
2662
+ {
2663
+ "epoch": 5.884755936857094,
2664
+ "grad_norm": 97193.7421875,
2665
+ "learning_rate": 7.334932536662957e-06,
2666
+ "loss": 0.432,
2667
+ "step": 187000
2668
+ },
2669
+ {
2670
+ "epoch": 5.900490525810627,
2671
+ "grad_norm": 88953.2421875,
2672
+ "learning_rate": 7.2870198804120874e-06,
2673
+ "loss": 0.4309,
2674
+ "step": 187500
2675
+ },
2676
+ {
2677
+ "epoch": 5.916225114764158,
2678
+ "grad_norm": 101634.828125,
2679
+ "learning_rate": 7.239174358348464e-06,
2680
+ "loss": 0.4289,
2681
+ "step": 188000
2682
+ },
2683
+ {
2684
+ "epoch": 5.93195970371769,
2685
+ "grad_norm": 94037.0,
2686
+ "learning_rate": 7.191397154435893e-06,
2687
+ "loss": 0.4262,
2688
+ "step": 188500
2689
+ },
2690
+ {
2691
+ "epoch": 5.947694292671222,
2692
+ "grad_norm": 91032.2578125,
2693
+ "learning_rate": 7.143689450947593e-06,
2694
+ "loss": 0.4303,
2695
+ "step": 189000
2696
+ },
2697
+ {
2698
+ "epoch": 5.963428881624754,
2699
+ "grad_norm": 100366.0,
2700
+ "learning_rate": 7.096052428436962e-06,
2701
+ "loss": 0.431,
2702
+ "step": 189500
2703
+ },
2704
+ {
2705
+ "epoch": 5.9791634705782855,
2706
+ "grad_norm": 86355.7421875,
2707
+ "learning_rate": 7.048487265708357e-06,
2708
+ "loss": 0.4309,
2709
+ "step": 190000
2710
+ },
2711
+ {
2712
+ "epoch": 5.994898059531817,
2713
+ "grad_norm": 88552.546875,
2714
+ "learning_rate": 7.000995139787929e-06,
2715
+ "loss": 0.4284,
2716
+ "step": 190500
2717
+ },
2718
+ {
2719
+ "epoch": 5.999996066352762,
2720
+ "eval_loss": 0.8569527268409729,
2721
+ "eval_runtime": 4685.7619,
2722
+ "eval_samples_per_second": 54.61,
2723
+ "eval_steps_per_second": 0.853,
2724
+ "step": 190662
2725
  }
2726
  ],
2727
  "logging_steps": 500,
 
2750
  "attributes": {}
2751
  }
2752
  },
2753
+ "total_flos": 3.8596166159870546e+18,
2754
  "train_batch_size": 4,
2755
  "trial_name": null,
2756
  "trial_params": null