ljcamargo commited on
Commit
bab085d
·
verified ·
1 Parent(s): 04d2aa1

Training in progress, step 3750, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca1d58e2cd18a1fc09601b5143bb78b5307a273e6e2f1750228c18041d7fe77e
3
  size 3809184360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb01dfc1c2e8ff7808bbee052468667857cd0cc81291988921d045c2d7906b09
3
  size 3809184360
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d9b3bc32af7a9c7224769cd6c49a9680aa7f8795c0d4e082d32018b4c599ff6
3
  size 2458291491
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21ad5221e1f7ef28d1eefb19cd0fee3fef5ece52f6f27106c14b5c6e4c86bc91
3
  size 2458291491
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8841fa7aad843ffcf2b76d868211d81553f8ce267313764fca81be4f5b42b4d
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fdbd0a64c232d94ea2546e029d229e872e7990a092a5c5c86566d2492a53f3b
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c493c177a6a7b8f4b53d9f320f001568436248d37644f960edb69a4818ec2df
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a6a4e6bbe43acf16ab43cab9cd60b8f70df5b08ce4b3f2a9b7397dc1ce58b0
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9333333333333333,
6
  "eval_steps": 500,
7
- "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2458,6 +2458,181 @@
2458
  "learning_rate": 3.436657681940701e-06,
2459
  "loss": 0.1441,
2460
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2461
  }
2462
  ],
2463
  "logging_steps": 10,
@@ -2472,12 +2647,12 @@
2472
  "should_evaluate": false,
2473
  "should_log": false,
2474
  "should_save": true,
2475
- "should_training_stop": false
2476
  },
2477
  "attributes": {}
2478
  }
2479
  },
2480
- "total_flos": 6.32834922129408e+16,
2481
  "train_batch_size": 2,
2482
  "trial_name": null,
2483
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 3750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2458
  "learning_rate": 3.436657681940701e-06,
2459
  "loss": 0.1441,
2460
  "step": 3500
2461
+ },
2462
+ {
2463
+ "epoch": 0.936,
2464
+ "grad_norm": 9.182714462280273,
2465
+ "learning_rate": 3.30188679245283e-06,
2466
+ "loss": 0.2332,
2467
+ "step": 3510
2468
+ },
2469
+ {
2470
+ "epoch": 0.9386666666666666,
2471
+ "grad_norm": 6.365874290466309,
2472
+ "learning_rate": 3.1671159029649594e-06,
2473
+ "loss": 0.1443,
2474
+ "step": 3520
2475
+ },
2476
+ {
2477
+ "epoch": 0.9413333333333334,
2478
+ "grad_norm": 6.266571521759033,
2479
+ "learning_rate": 3.032345013477089e-06,
2480
+ "loss": 0.1959,
2481
+ "step": 3530
2482
+ },
2483
+ {
2484
+ "epoch": 0.944,
2485
+ "grad_norm": 7.494802474975586,
2486
+ "learning_rate": 2.8975741239892183e-06,
2487
+ "loss": 0.149,
2488
+ "step": 3540
2489
+ },
2490
+ {
2491
+ "epoch": 0.9466666666666667,
2492
+ "grad_norm": 5.22160005569458,
2493
+ "learning_rate": 2.762803234501348e-06,
2494
+ "loss": 0.3431,
2495
+ "step": 3550
2496
+ },
2497
+ {
2498
+ "epoch": 0.9493333333333334,
2499
+ "grad_norm": 11.847735404968262,
2500
+ "learning_rate": 2.628032345013477e-06,
2501
+ "loss": 0.2068,
2502
+ "step": 3560
2503
+ },
2504
+ {
2505
+ "epoch": 0.952,
2506
+ "grad_norm": 41.45210647583008,
2507
+ "learning_rate": 2.4932614555256068e-06,
2508
+ "loss": 0.2057,
2509
+ "step": 3570
2510
+ },
2511
+ {
2512
+ "epoch": 0.9546666666666667,
2513
+ "grad_norm": 8.89501953125,
2514
+ "learning_rate": 2.358490566037736e-06,
2515
+ "loss": 0.5128,
2516
+ "step": 3580
2517
+ },
2518
+ {
2519
+ "epoch": 0.9573333333333334,
2520
+ "grad_norm": 6.3149261474609375,
2521
+ "learning_rate": 2.223719676549865e-06,
2522
+ "loss": 0.1869,
2523
+ "step": 3590
2524
+ },
2525
+ {
2526
+ "epoch": 0.96,
2527
+ "grad_norm": 5.511444091796875,
2528
+ "learning_rate": 2.088948787061995e-06,
2529
+ "loss": 0.2311,
2530
+ "step": 3600
2531
+ },
2532
+ {
2533
+ "epoch": 0.9626666666666667,
2534
+ "grad_norm": 6.782158851623535,
2535
+ "learning_rate": 1.954177897574124e-06,
2536
+ "loss": 0.1655,
2537
+ "step": 3610
2538
+ },
2539
+ {
2540
+ "epoch": 0.9653333333333334,
2541
+ "grad_norm": 6.828353404998779,
2542
+ "learning_rate": 1.8194070080862537e-06,
2543
+ "loss": 0.1694,
2544
+ "step": 3620
2545
+ },
2546
+ {
2547
+ "epoch": 0.968,
2548
+ "grad_norm": 2.4872541427612305,
2549
+ "learning_rate": 1.6846361185983827e-06,
2550
+ "loss": 0.1647,
2551
+ "step": 3630
2552
+ },
2553
+ {
2554
+ "epoch": 0.9706666666666667,
2555
+ "grad_norm": 8.890005111694336,
2556
+ "learning_rate": 1.5498652291105121e-06,
2557
+ "loss": 0.1979,
2558
+ "step": 3640
2559
+ },
2560
+ {
2561
+ "epoch": 0.9733333333333334,
2562
+ "grad_norm": 7.4598259925842285,
2563
+ "learning_rate": 1.4150943396226415e-06,
2564
+ "loss": 0.3526,
2565
+ "step": 3650
2566
+ },
2567
+ {
2568
+ "epoch": 0.976,
2569
+ "grad_norm": 4.237139701843262,
2570
+ "learning_rate": 1.280323450134771e-06,
2571
+ "loss": 0.2159,
2572
+ "step": 3660
2573
+ },
2574
+ {
2575
+ "epoch": 0.9786666666666667,
2576
+ "grad_norm": 5.643311500549316,
2577
+ "learning_rate": 1.1455525606469004e-06,
2578
+ "loss": 0.1425,
2579
+ "step": 3670
2580
+ },
2581
+ {
2582
+ "epoch": 0.9813333333333333,
2583
+ "grad_norm": 7.4330267906188965,
2584
+ "learning_rate": 1.0107816711590296e-06,
2585
+ "loss": 0.1761,
2586
+ "step": 3680
2587
+ },
2588
+ {
2589
+ "epoch": 0.984,
2590
+ "grad_norm": 12.03699779510498,
2591
+ "learning_rate": 8.76010781671159e-07,
2592
+ "loss": 0.2607,
2593
+ "step": 3690
2594
+ },
2595
+ {
2596
+ "epoch": 0.9866666666666667,
2597
+ "grad_norm": 6.911093235015869,
2598
+ "learning_rate": 7.412398921832885e-07,
2599
+ "loss": 0.1755,
2600
+ "step": 3700
2601
+ },
2602
+ {
2603
+ "epoch": 0.9893333333333333,
2604
+ "grad_norm": 6.668974876403809,
2605
+ "learning_rate": 6.064690026954178e-07,
2606
+ "loss": 0.2031,
2607
+ "step": 3710
2608
+ },
2609
+ {
2610
+ "epoch": 0.992,
2611
+ "grad_norm": 11.474651336669922,
2612
+ "learning_rate": 4.7169811320754717e-07,
2613
+ "loss": 0.2236,
2614
+ "step": 3720
2615
+ },
2616
+ {
2617
+ "epoch": 0.9946666666666667,
2618
+ "grad_norm": 9.00444507598877,
2619
+ "learning_rate": 3.369272237196766e-07,
2620
+ "loss": 0.1306,
2621
+ "step": 3730
2622
+ },
2623
+ {
2624
+ "epoch": 0.9973333333333333,
2625
+ "grad_norm": 52.68935012817383,
2626
+ "learning_rate": 2.0215633423180594e-07,
2627
+ "loss": 0.231,
2628
+ "step": 3740
2629
+ },
2630
+ {
2631
+ "epoch": 1.0,
2632
+ "grad_norm": 5.777242183685303,
2633
+ "learning_rate": 6.738544474393531e-08,
2634
+ "loss": 0.1794,
2635
+ "step": 3750
2636
  }
2637
  ],
2638
  "logging_steps": 10,
 
2647
  "should_evaluate": false,
2648
  "should_log": false,
2649
  "should_save": true,
2650
+ "should_training_stop": true
2651
  },
2652
  "attributes": {}
2653
  }
2654
  },
2655
+ "total_flos": 6.77976396217344e+16,
2656
  "train_batch_size": 2,
2657
  "trial_name": null,
2658
  "trial_params": null