rovdetection commited on
Commit
0073a61
·
verified ·
1 Parent(s): af65f4d

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea03fbd5faff9829b79932a9492534fbbbe2845de9ce69e896c0b8d109c1a825
3
  size 9446744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c4bc2b38adc32706e2acf860899364c389c44795cde5d43402ad1b3e24719a4
3
  size 9446744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ca4bfe766f5a9ce1a39e0d776749658d826fc560902b47178ff40c41d18b94a
3
  size 4879947
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9401bbac3284c4bb6a11169e446baad7a8e6fc00ab949bbe417b3d7a375a77
3
  size 4879947
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd840ceb0cbd2bc41560fadd05ab11cb9d3690eebf99ba42e453854e5f372ed8
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2194fb161e52a47a7f6b1734e178985577fd22e6aae4a22215e086c0248266b
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e68bcbbf919727508b1f5613e7b10a32a3e07fdef6c3370ef48c8724f2e31e4
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:431821c493d4002c62a876cbdeb3eade105892abe1c599865b041dfe28827339
3
  size 14917
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:861ce13e6ca091acee9a68ebfc5ca38479baf4b537c37b3949f071f77b81e9f0
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ac1c46a2776d12775d23d0f587efc112188137ce2140da35bc15d301c9f620e
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5bce3ff1203929d6808ab229d6e6d4d185a3da8ef87a3b682b0eec04e6bacf2
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad46a212d4576c083702df279951b960843d734a5cd61ac93041cad4b1712452
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.295723189340211,
6
  "eval_steps": 500,
7
- "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2508,6 +2508,506 @@
2508
  "mean_token_accuracy": 0.6467559643089771,
2509
  "num_tokens": 14861262.0,
2510
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2511
  }
2512
  ],
2513
  "logging_steps": 10,
@@ -2527,7 +3027,7 @@
2527
  "attributes": {}
2528
  }
2529
  },
2530
- "total_flos": 1.2204939073814528e+17,
2531
  "train_batch_size": 2,
2532
  "trial_name": null,
2533
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.154738878143133,
6
  "eval_steps": 500,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2508
  "mean_token_accuracy": 0.6467559643089771,
2509
  "num_tokens": 14861262.0,
2510
  "step": 2500
2511
+ },
2512
+ {
2513
+ "entropy": 1.8234948687255383,
2514
+ "epoch": 4.312916398022781,
2515
+ "grad_norm": 0.778538167476654,
2516
+ "learning_rate": 9.964e-05,
2517
+ "loss": 1.8733020782470704,
2518
+ "mean_token_accuracy": 0.6553889319300652,
2519
+ "num_tokens": 14920923.0,
2520
+ "step": 2510
2521
+ },
2522
+ {
2523
+ "entropy": 1.812998068332672,
2524
+ "epoch": 4.330109606705351,
2525
+ "grad_norm": 0.7861834764480591,
2526
+ "learning_rate": 9.924e-05,
2527
+ "loss": 1.8699317932128907,
2528
+ "mean_token_accuracy": 0.6555795632302761,
2529
+ "num_tokens": 14978173.0,
2530
+ "step": 2520
2531
+ },
2532
+ {
2533
+ "entropy": 1.8013822883367538,
2534
+ "epoch": 4.347302815387922,
2535
+ "grad_norm": 0.751916229724884,
2536
+ "learning_rate": 9.884e-05,
2537
+ "loss": 1.8372121810913087,
2538
+ "mean_token_accuracy": 0.664341426640749,
2539
+ "num_tokens": 15034480.0,
2540
+ "step": 2530
2541
+ },
2542
+ {
2543
+ "entropy": 1.7700918450951577,
2544
+ "epoch": 4.364496024070492,
2545
+ "grad_norm": 0.7365695834159851,
2546
+ "learning_rate": 9.844000000000001e-05,
2547
+ "loss": 1.8166645050048829,
2548
+ "mean_token_accuracy": 0.6654425717890262,
2549
+ "num_tokens": 15093226.0,
2550
+ "step": 2540
2551
+ },
2552
+ {
2553
+ "entropy": 1.7808674454689026,
2554
+ "epoch": 4.381689232753063,
2555
+ "grad_norm": 0.7306393980979919,
2556
+ "learning_rate": 9.804e-05,
2557
+ "loss": 1.8363780975341797,
2558
+ "mean_token_accuracy": 0.6601886965334416,
2559
+ "num_tokens": 15149937.0,
2560
+ "step": 2550
2561
+ },
2562
+ {
2563
+ "entropy": 1.7890540674328803,
2564
+ "epoch": 4.398882441435633,
2565
+ "grad_norm": 0.7466715574264526,
2566
+ "learning_rate": 9.764000000000001e-05,
2567
+ "loss": 1.847653579711914,
2568
+ "mean_token_accuracy": 0.6586611110717058,
2569
+ "num_tokens": 15210500.0,
2570
+ "step": 2560
2571
+ },
2572
+ {
2573
+ "entropy": 1.7866264268755914,
2574
+ "epoch": 4.416075650118203,
2575
+ "grad_norm": 0.7825273871421814,
2576
+ "learning_rate": 9.724000000000001e-05,
2577
+ "loss": 1.82576904296875,
2578
+ "mean_token_accuracy": 0.6592508733272553,
2579
+ "num_tokens": 15268262.0,
2580
+ "step": 2570
2581
+ },
2582
+ {
2583
+ "entropy": 1.8321722269058227,
2584
+ "epoch": 4.433268858800774,
2585
+ "grad_norm": 0.7158058285713196,
2586
+ "learning_rate": 9.684000000000001e-05,
2587
+ "loss": 1.8807327270507812,
2588
+ "mean_token_accuracy": 0.6545467376708984,
2589
+ "num_tokens": 15330745.0,
2590
+ "step": 2580
2591
+ },
2592
+ {
2593
+ "entropy": 1.739266212284565,
2594
+ "epoch": 4.450462067483344,
2595
+ "grad_norm": 0.7281847596168518,
2596
+ "learning_rate": 9.644e-05,
2597
+ "loss": 1.7686588287353515,
2598
+ "mean_token_accuracy": 0.6666045777499676,
2599
+ "num_tokens": 15391266.0,
2600
+ "step": 2590
2601
+ },
2602
+ {
2603
+ "entropy": 1.8295569285750388,
2604
+ "epoch": 4.467655276165915,
2605
+ "grad_norm": 0.7166727781295776,
2606
+ "learning_rate": 9.604000000000001e-05,
2607
+ "loss": 1.9156217575073242,
2608
+ "mean_token_accuracy": 0.655017600953579,
2609
+ "num_tokens": 15449819.0,
2610
+ "step": 2600
2611
+ },
2612
+ {
2613
+ "entropy": 1.8236071288585662,
2614
+ "epoch": 4.484848484848484,
2615
+ "grad_norm": 0.6946532726287842,
2616
+ "learning_rate": 9.564000000000001e-05,
2617
+ "loss": 1.9035514831542968,
2618
+ "mean_token_accuracy": 0.649907086789608,
2619
+ "num_tokens": 15513231.0,
2620
+ "step": 2610
2621
+ },
2622
+ {
2623
+ "entropy": 1.7869442969560623,
2624
+ "epoch": 4.502041693531055,
2625
+ "grad_norm": 0.7257023453712463,
2626
+ "learning_rate": 9.524e-05,
2627
+ "loss": 1.841336441040039,
2628
+ "mean_token_accuracy": 0.6655759517103433,
2629
+ "num_tokens": 15568973.0,
2630
+ "step": 2620
2631
+ },
2632
+ {
2633
+ "entropy": 1.7462848544120788,
2634
+ "epoch": 4.519234902213626,
2635
+ "grad_norm": 0.7239391803741455,
2636
+ "learning_rate": 9.484e-05,
2637
+ "loss": 1.7989360809326171,
2638
+ "mean_token_accuracy": 0.6646886244416237,
2639
+ "num_tokens": 15627655.0,
2640
+ "step": 2630
2641
+ },
2642
+ {
2643
+ "entropy": 1.7926493644714356,
2644
+ "epoch": 4.536428110896196,
2645
+ "grad_norm": 0.7628325819969177,
2646
+ "learning_rate": 9.444000000000001e-05,
2647
+ "loss": 1.8627632141113282,
2648
+ "mean_token_accuracy": 0.654141866415739,
2649
+ "num_tokens": 15687626.0,
2650
+ "step": 2640
2651
+ },
2652
+ {
2653
+ "entropy": 1.7928333327174186,
2654
+ "epoch": 4.553621319578767,
2655
+ "grad_norm": 0.629107654094696,
2656
+ "learning_rate": 9.404e-05,
2657
+ "loss": 1.8784042358398438,
2658
+ "mean_token_accuracy": 0.6618591919541359,
2659
+ "num_tokens": 15750035.0,
2660
+ "step": 2650
2661
+ },
2662
+ {
2663
+ "entropy": 1.7438783437013625,
2664
+ "epoch": 4.570814528261336,
2665
+ "grad_norm": 0.6948845982551575,
2666
+ "learning_rate": 9.364e-05,
2667
+ "loss": 1.7456579208374023,
2668
+ "mean_token_accuracy": 0.6722261719405651,
2669
+ "num_tokens": 15809533.0,
2670
+ "step": 2660
2671
+ },
2672
+ {
2673
+ "entropy": 1.7451874181628226,
2674
+ "epoch": 4.588007736943907,
2675
+ "grad_norm": 0.7213107943534851,
2676
+ "learning_rate": 9.324000000000001e-05,
2677
+ "loss": 1.8111917495727539,
2678
+ "mean_token_accuracy": 0.6621977139264346,
2679
+ "num_tokens": 15866570.0,
2680
+ "step": 2670
2681
+ },
2682
+ {
2683
+ "entropy": 1.806991095095873,
2684
+ "epoch": 4.6052009456264775,
2685
+ "grad_norm": 0.9146936535835266,
2686
+ "learning_rate": 9.284e-05,
2687
+ "loss": 1.8761199951171874,
2688
+ "mean_token_accuracy": 0.6552402298897505,
2689
+ "num_tokens": 15923681.0,
2690
+ "step": 2680
2691
+ },
2692
+ {
2693
+ "entropy": 1.854476225376129,
2694
+ "epoch": 4.622394154309048,
2695
+ "grad_norm": 0.675061047077179,
2696
+ "learning_rate": 9.244e-05,
2697
+ "loss": 1.8601364135742187,
2698
+ "mean_token_accuracy": 0.656403211131692,
2699
+ "num_tokens": 15979879.0,
2700
+ "step": 2690
2701
+ },
2702
+ {
2703
+ "entropy": 1.8345128282904626,
2704
+ "epoch": 4.639587362991619,
2705
+ "grad_norm": 0.7702699303627014,
2706
+ "learning_rate": 9.204e-05,
2707
+ "loss": 1.9170707702636718,
2708
+ "mean_token_accuracy": 0.6507652081549168,
2709
+ "num_tokens": 16040136.0,
2710
+ "step": 2700
2711
+ },
2712
+ {
2713
+ "entropy": 1.8444690719246863,
2714
+ "epoch": 4.656780571674188,
2715
+ "grad_norm": 0.7249677181243896,
2716
+ "learning_rate": 9.164000000000001e-05,
2717
+ "loss": 1.9021928787231446,
2718
+ "mean_token_accuracy": 0.6553504541516304,
2719
+ "num_tokens": 16097652.0,
2720
+ "step": 2710
2721
+ },
2722
+ {
2723
+ "entropy": 1.8083212688565253,
2724
+ "epoch": 4.673973780356759,
2725
+ "grad_norm": 0.7018275260925293,
2726
+ "learning_rate": 9.124e-05,
2727
+ "loss": 1.87921199798584,
2728
+ "mean_token_accuracy": 0.6609590038657188,
2729
+ "num_tokens": 16159014.0,
2730
+ "step": 2720
2731
+ },
2732
+ {
2733
+ "entropy": 1.793540646135807,
2734
+ "epoch": 4.6911669890393295,
2735
+ "grad_norm": 0.731863796710968,
2736
+ "learning_rate": 9.084e-05,
2737
+ "loss": 1.847224807739258,
2738
+ "mean_token_accuracy": 0.6638176888227463,
2739
+ "num_tokens": 16223636.0,
2740
+ "step": 2730
2741
+ },
2742
+ {
2743
+ "entropy": 1.7947301134467124,
2744
+ "epoch": 4.7083601977219,
2745
+ "grad_norm": 0.7208489775657654,
2746
+ "learning_rate": 9.044000000000001e-05,
2747
+ "loss": 1.8400375366210937,
2748
+ "mean_token_accuracy": 0.6600434482097626,
2749
+ "num_tokens": 16281647.0,
2750
+ "step": 2740
2751
+ },
2752
+ {
2753
+ "entropy": 1.8043948471546174,
2754
+ "epoch": 4.725553406404471,
2755
+ "grad_norm": 0.7633848190307617,
2756
+ "learning_rate": 9.004e-05,
2757
+ "loss": 1.8509382247924804,
2758
+ "mean_token_accuracy": 0.6632162068039179,
2759
+ "num_tokens": 16340706.0,
2760
+ "step": 2750
2761
+ },
2762
+ {
2763
+ "entropy": 1.8240734949707984,
2764
+ "epoch": 4.74274661508704,
2765
+ "grad_norm": 0.7516812086105347,
2766
+ "learning_rate": 8.964e-05,
2767
+ "loss": 1.9139686584472657,
2768
+ "mean_token_accuracy": 0.6504824224859476,
2769
+ "num_tokens": 16398077.0,
2770
+ "step": 2760
2771
+ },
2772
+ {
2773
+ "entropy": 1.7775158017873764,
2774
+ "epoch": 4.759939823769611,
2775
+ "grad_norm": 0.7677133679389954,
2776
+ "learning_rate": 8.924e-05,
2777
+ "loss": 1.8351661682128906,
2778
+ "mean_token_accuracy": 0.6568478621542454,
2779
+ "num_tokens": 16458898.0,
2780
+ "step": 2770
2781
+ },
2782
+ {
2783
+ "entropy": 1.8671277523040772,
2784
+ "epoch": 4.7771330324521815,
2785
+ "grad_norm": 0.750451385974884,
2786
+ "learning_rate": 8.884e-05,
2787
+ "loss": 1.9589305877685548,
2788
+ "mean_token_accuracy": 0.6506143860518933,
2789
+ "num_tokens": 16519496.0,
2790
+ "step": 2780
2791
+ },
2792
+ {
2793
+ "entropy": 1.7745324671268463,
2794
+ "epoch": 4.794326241134752,
2795
+ "grad_norm": 0.8302338719367981,
2796
+ "learning_rate": 8.844e-05,
2797
+ "loss": 1.8637496948242187,
2798
+ "mean_token_accuracy": 0.6621543657034635,
2799
+ "num_tokens": 16579080.0,
2800
+ "step": 2790
2801
+ },
2802
+ {
2803
+ "entropy": 1.73246541172266,
2804
+ "epoch": 4.811519449817322,
2805
+ "grad_norm": 0.778176486492157,
2806
+ "learning_rate": 8.804e-05,
2807
+ "loss": 1.752696418762207,
2808
+ "mean_token_accuracy": 0.6727286443114281,
2809
+ "num_tokens": 16640932.0,
2810
+ "step": 2800
2811
+ },
2812
+ {
2813
+ "entropy": 1.8060437709093093,
2814
+ "epoch": 4.828712658499892,
2815
+ "grad_norm": 0.9019444584846497,
2816
+ "learning_rate": 8.764e-05,
2817
+ "loss": 1.9031681060791015,
2818
+ "mean_token_accuracy": 0.6563040159642697,
2819
+ "num_tokens": 16702244.0,
2820
+ "step": 2810
2821
+ },
2822
+ {
2823
+ "entropy": 1.8732322439551354,
2824
+ "epoch": 4.845905867182463,
2825
+ "grad_norm": 0.7397829294204712,
2826
+ "learning_rate": 8.724e-05,
2827
+ "loss": 1.9326038360595703,
2828
+ "mean_token_accuracy": 0.6478111572563648,
2829
+ "num_tokens": 16764555.0,
2830
+ "step": 2820
2831
+ },
2832
+ {
2833
+ "entropy": 1.842681024968624,
2834
+ "epoch": 4.863099075865033,
2835
+ "grad_norm": 0.8511717915534973,
2836
+ "learning_rate": 8.684e-05,
2837
+ "loss": 1.9107376098632813,
2838
+ "mean_token_accuracy": 0.6531910292804242,
2839
+ "num_tokens": 16821936.0,
2840
+ "step": 2830
2841
+ },
2842
+ {
2843
+ "entropy": 1.7571960732340812,
2844
+ "epoch": 4.880292284547604,
2845
+ "grad_norm": 0.7064304947853088,
2846
+ "learning_rate": 8.643999999999999e-05,
2847
+ "loss": 1.7985404968261718,
2848
+ "mean_token_accuracy": 0.6667480751872062,
2849
+ "num_tokens": 16882205.0,
2850
+ "step": 2840
2851
+ },
2852
+ {
2853
+ "entropy": 1.8695308573544025,
2854
+ "epoch": 4.897485493230175,
2855
+ "grad_norm": 0.7386742234230042,
2856
+ "learning_rate": 8.604000000000001e-05,
2857
+ "loss": 1.9543342590332031,
2858
+ "mean_token_accuracy": 0.6496741093695164,
2859
+ "num_tokens": 16939799.0,
2860
+ "step": 2850
2861
+ },
2862
+ {
2863
+ "entropy": 1.7877972453832627,
2864
+ "epoch": 4.914678701912744,
2865
+ "grad_norm": 0.7687976956367493,
2866
+ "learning_rate": 8.564000000000001e-05,
2867
+ "loss": 1.7994373321533204,
2868
+ "mean_token_accuracy": 0.6637697361409665,
2869
+ "num_tokens": 16997716.0,
2870
+ "step": 2860
2871
+ },
2872
+ {
2873
+ "entropy": 1.761916320025921,
2874
+ "epoch": 4.931871910595315,
2875
+ "grad_norm": 0.7507193088531494,
2876
+ "learning_rate": 8.524e-05,
2877
+ "loss": 1.788670539855957,
2878
+ "mean_token_accuracy": 0.6648910716176033,
2879
+ "num_tokens": 17057260.0,
2880
+ "step": 2870
2881
+ },
2882
+ {
2883
+ "entropy": 1.804823537170887,
2884
+ "epoch": 4.949065119277885,
2885
+ "grad_norm": 0.727188229560852,
2886
+ "learning_rate": 8.484000000000001e-05,
2887
+ "loss": 1.855522346496582,
2888
+ "mean_token_accuracy": 0.657912939786911,
2889
+ "num_tokens": 17116073.0,
2890
+ "step": 2880
2891
+ },
2892
+ {
2893
+ "entropy": 1.8259041801095008,
2894
+ "epoch": 4.966258327960456,
2895
+ "grad_norm": 0.7195336818695068,
2896
+ "learning_rate": 8.444000000000001e-05,
2897
+ "loss": 1.8942272186279296,
2898
+ "mean_token_accuracy": 0.6546841934323311,
2899
+ "num_tokens": 17174141.0,
2900
+ "step": 2890
2901
+ },
2902
+ {
2903
+ "entropy": 1.7153871595859527,
2904
+ "epoch": 4.983451536643026,
2905
+ "grad_norm": 0.7093940377235413,
2906
+ "learning_rate": 8.404e-05,
2907
+ "loss": 1.7350996017456055,
2908
+ "mean_token_accuracy": 0.6728265054523945,
2909
+ "num_tokens": 17233307.0,
2910
+ "step": 2900
2911
+ },
2912
+ {
2913
+ "entropy": 1.7630670566063422,
2914
+ "epoch": 5.0,
2915
+ "grad_norm": 0.979345440864563,
2916
+ "learning_rate": 8.364e-05,
2917
+ "loss": 1.8098876953125,
2918
+ "mean_token_accuracy": 0.6604567510741097,
2919
+ "num_tokens": 17289810.0,
2920
+ "step": 2910
2921
+ },
2922
+ {
2923
+ "entropy": 1.8877688512206077,
2924
+ "epoch": 5.017193208682571,
2925
+ "grad_norm": 0.8140257596969604,
2926
+ "learning_rate": 8.324000000000001e-05,
2927
+ "loss": 1.9562681198120118,
2928
+ "mean_token_accuracy": 0.6476880256086588,
2929
+ "num_tokens": 17349922.0,
2930
+ "step": 2920
2931
+ },
2932
+ {
2933
+ "entropy": 1.6694072388112544,
2934
+ "epoch": 5.034386417365141,
2935
+ "grad_norm": 0.7486578226089478,
2936
+ "learning_rate": 8.284000000000001e-05,
2937
+ "loss": 1.71788330078125,
2938
+ "mean_token_accuracy": 0.6781885512173176,
2939
+ "num_tokens": 17409363.0,
2940
+ "step": 2930
2941
+ },
2942
+ {
2943
+ "entropy": 1.8061093628406524,
2944
+ "epoch": 5.051579626047711,
2945
+ "grad_norm": 0.8148984313011169,
2946
+ "learning_rate": 8.244e-05,
2947
+ "loss": 1.8484228134155274,
2948
+ "mean_token_accuracy": 0.6591597832739353,
2949
+ "num_tokens": 17468218.0,
2950
+ "step": 2940
2951
+ },
2952
+ {
2953
+ "entropy": 1.7561381176114081,
2954
+ "epoch": 5.068772834730281,
2955
+ "grad_norm": 0.7412339448928833,
2956
+ "learning_rate": 8.204000000000001e-05,
2957
+ "loss": 1.8109855651855469,
2958
+ "mean_token_accuracy": 0.6648329850286245,
2959
+ "num_tokens": 17529603.0,
2960
+ "step": 2950
2961
+ },
2962
+ {
2963
+ "entropy": 1.7058369636535644,
2964
+ "epoch": 5.085966043412852,
2965
+ "grad_norm": 0.7845883369445801,
2966
+ "learning_rate": 8.164000000000001e-05,
2967
+ "loss": 1.7577402114868164,
2968
+ "mean_token_accuracy": 0.675883399322629,
2969
+ "num_tokens": 17587275.0,
2970
+ "step": 2960
2971
+ },
2972
+ {
2973
+ "entropy": 1.7319279327988624,
2974
+ "epoch": 5.1031592520954225,
2975
+ "grad_norm": 0.7546029090881348,
2976
+ "learning_rate": 8.124e-05,
2977
+ "loss": 1.8096488952636718,
2978
+ "mean_token_accuracy": 0.668717809766531,
2979
+ "num_tokens": 17647368.0,
2980
+ "step": 2970
2981
+ },
2982
+ {
2983
+ "entropy": 1.7872621923685075,
2984
+ "epoch": 5.120352460777993,
2985
+ "grad_norm": 0.7214957475662231,
2986
+ "learning_rate": 8.084e-05,
2987
+ "loss": 1.7827239990234376,
2988
+ "mean_token_accuracy": 0.663322826102376,
2989
+ "num_tokens": 17708210.0,
2990
+ "step": 2980
2991
+ },
2992
+ {
2993
+ "entropy": 1.7479579642415046,
2994
+ "epoch": 5.137545669460563,
2995
+ "grad_norm": 0.6938044428825378,
2996
+ "learning_rate": 8.044000000000001e-05,
2997
+ "loss": 1.837489700317383,
2998
+ "mean_token_accuracy": 0.666904554143548,
2999
+ "num_tokens": 17770498.0,
3000
+ "step": 2990
3001
+ },
3002
+ {
3003
+ "entropy": 1.760008592903614,
3004
+ "epoch": 5.154738878143133,
3005
+ "grad_norm": 0.7440096139907837,
3006
+ "learning_rate": 8.004e-05,
3007
+ "loss": 1.7957250595092773,
3008
+ "mean_token_accuracy": 0.6704145818948746,
3009
+ "num_tokens": 17831493.0,
3010
+ "step": 3000
3011
  }
3012
  ],
3013
  "logging_steps": 10,
 
3027
  "attributes": {}
3028
  }
3029
  },
3030
+ "total_flos": 1.4643251157506458e+17,
3031
  "train_batch_size": 2,
3032
  "trial_name": null,
3033
  "trial_params": null