qwen_binary_sft / trainer_state.json
jinqij's picture
Upload folder using huggingface_hub
b865b15 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 5624,
"global_step": 16872,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 64.30284881591797,
"learning_rate": 2.9620853080568726e-09,
"loss": 3.7905,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 74.81798553466797,
"learning_rate": 9.478672985781992e-08,
"loss": 4.4137,
"step": 32
},
{
"epoch": 0.01,
"grad_norm": 101.4776840209961,
"learning_rate": 1.8957345971563984e-07,
"loss": 4.2954,
"step": 64
},
{
"epoch": 0.02,
"grad_norm": 68.84349822998047,
"learning_rate": 2.843601895734597e-07,
"loss": 3.376,
"step": 96
},
{
"epoch": 0.02,
"grad_norm": 42.47146224975586,
"learning_rate": 3.791469194312797e-07,
"loss": 1.962,
"step": 128
},
{
"epoch": 0.03,
"grad_norm": 46.63043212890625,
"learning_rate": 4.7393364928909956e-07,
"loss": 1.168,
"step": 160
},
{
"epoch": 0.03,
"grad_norm": 2.709139347076416,
"learning_rate": 5.687203791469194e-07,
"loss": 0.4681,
"step": 192
},
{
"epoch": 0.04,
"grad_norm": 3.304001808166504,
"learning_rate": 6.635071090047394e-07,
"loss": 0.4059,
"step": 224
},
{
"epoch": 0.05,
"grad_norm": 2.5251598358154297,
"learning_rate": 7.582938388625594e-07,
"loss": 0.3999,
"step": 256
},
{
"epoch": 0.05,
"grad_norm": 1.9685628414154053,
"learning_rate": 8.530805687203792e-07,
"loss": 0.3904,
"step": 288
},
{
"epoch": 0.06,
"grad_norm": 1.7696113586425781,
"learning_rate": 9.478672985781991e-07,
"loss": 0.3769,
"step": 320
},
{
"epoch": 0.06,
"grad_norm": 1.8694322109222412,
"learning_rate": 1.042654028436019e-06,
"loss": 0.3718,
"step": 352
},
{
"epoch": 0.07,
"grad_norm": 1.7985926866531372,
"learning_rate": 1.1374407582938388e-06,
"loss": 0.3569,
"step": 384
},
{
"epoch": 0.07,
"grad_norm": 1.9462366104125977,
"learning_rate": 1.2322274881516587e-06,
"loss": 0.3432,
"step": 416
},
{
"epoch": 0.08,
"grad_norm": 1.916548490524292,
"learning_rate": 1.3270142180094788e-06,
"loss": 0.331,
"step": 448
},
{
"epoch": 0.09,
"grad_norm": 1.2683026790618896,
"learning_rate": 1.4218009478672987e-06,
"loss": 0.334,
"step": 480
},
{
"epoch": 0.09,
"grad_norm": 1.4642925262451172,
"learning_rate": 1.5165876777251187e-06,
"loss": 0.3386,
"step": 512
},
{
"epoch": 0.1,
"grad_norm": 1.3890858888626099,
"learning_rate": 1.6113744075829384e-06,
"loss": 0.3073,
"step": 544
},
{
"epoch": 0.1,
"grad_norm": 1.4565709829330444,
"learning_rate": 1.7061611374407585e-06,
"loss": 0.3263,
"step": 576
},
{
"epoch": 0.11,
"grad_norm": 1.349706768989563,
"learning_rate": 1.8009478672985784e-06,
"loss": 0.3165,
"step": 608
},
{
"epoch": 0.11,
"grad_norm": 1.7937098741531372,
"learning_rate": 1.8957345971563982e-06,
"loss": 0.3052,
"step": 640
},
{
"epoch": 0.12,
"grad_norm": 1.616604208946228,
"learning_rate": 1.990521327014218e-06,
"loss": 0.3138,
"step": 672
},
{
"epoch": 0.13,
"grad_norm": 1.3185547590255737,
"learning_rate": 2.085308056872038e-06,
"loss": 0.297,
"step": 704
},
{
"epoch": 0.13,
"grad_norm": 1.8201861381530762,
"learning_rate": 2.180094786729858e-06,
"loss": 0.3072,
"step": 736
},
{
"epoch": 0.14,
"grad_norm": 1.1981289386749268,
"learning_rate": 2.2748815165876777e-06,
"loss": 0.2957,
"step": 768
},
{
"epoch": 0.14,
"grad_norm": 1.7006583213806152,
"learning_rate": 2.369668246445498e-06,
"loss": 0.2941,
"step": 800
},
{
"epoch": 0.15,
"grad_norm": 2.1674447059631348,
"learning_rate": 2.4644549763033174e-06,
"loss": 0.2798,
"step": 832
},
{
"epoch": 0.15,
"grad_norm": 1.3364813327789307,
"learning_rate": 2.5592417061611373e-06,
"loss": 0.2996,
"step": 864
},
{
"epoch": 0.16,
"grad_norm": 1.2685174942016602,
"learning_rate": 2.6540284360189576e-06,
"loss": 0.3027,
"step": 896
},
{
"epoch": 0.17,
"grad_norm": 1.3952556848526,
"learning_rate": 2.7488151658767775e-06,
"loss": 0.2985,
"step": 928
},
{
"epoch": 0.17,
"grad_norm": 1.251714825630188,
"learning_rate": 2.8436018957345973e-06,
"loss": 0.2905,
"step": 960
},
{
"epoch": 0.18,
"grad_norm": 1.553152322769165,
"learning_rate": 2.938388625592417e-06,
"loss": 0.278,
"step": 992
},
{
"epoch": 0.18,
"grad_norm": 1.380920171737671,
"learning_rate": 3.0331753554502375e-06,
"loss": 0.2813,
"step": 1024
},
{
"epoch": 0.19,
"grad_norm": 1.5351643562316895,
"learning_rate": 3.1279620853080574e-06,
"loss": 0.2805,
"step": 1056
},
{
"epoch": 0.19,
"grad_norm": 1.4867887496948242,
"learning_rate": 3.222748815165877e-06,
"loss": 0.2767,
"step": 1088
},
{
"epoch": 0.2,
"grad_norm": 1.317229986190796,
"learning_rate": 3.3175355450236967e-06,
"loss": 0.2859,
"step": 1120
},
{
"epoch": 0.2,
"grad_norm": 1.8770791292190552,
"learning_rate": 3.412322274881517e-06,
"loss": 0.2875,
"step": 1152
},
{
"epoch": 0.21,
"grad_norm": 1.4476697444915771,
"learning_rate": 3.507109004739337e-06,
"loss": 0.2884,
"step": 1184
},
{
"epoch": 0.22,
"grad_norm": 1.351965069770813,
"learning_rate": 3.6018957345971567e-06,
"loss": 0.2802,
"step": 1216
},
{
"epoch": 0.22,
"grad_norm": 1.4647209644317627,
"learning_rate": 3.6966824644549766e-06,
"loss": 0.2703,
"step": 1248
},
{
"epoch": 0.23,
"grad_norm": 1.901773452758789,
"learning_rate": 3.7914691943127964e-06,
"loss": 0.2815,
"step": 1280
},
{
"epoch": 0.23,
"grad_norm": 1.139844536781311,
"learning_rate": 3.886255924170616e-06,
"loss": 0.2658,
"step": 1312
},
{
"epoch": 0.24,
"grad_norm": 1.1863261461257935,
"learning_rate": 3.981042654028436e-06,
"loss": 0.2707,
"step": 1344
},
{
"epoch": 0.24,
"grad_norm": 1.2720916271209717,
"learning_rate": 4.075829383886256e-06,
"loss": 0.2646,
"step": 1376
},
{
"epoch": 0.25,
"grad_norm": 1.6161096096038818,
"learning_rate": 4.170616113744076e-06,
"loss": 0.2748,
"step": 1408
},
{
"epoch": 0.26,
"grad_norm": 1.4303381443023682,
"learning_rate": 4.265402843601897e-06,
"loss": 0.2691,
"step": 1440
},
{
"epoch": 0.26,
"grad_norm": 1.401880145072937,
"learning_rate": 4.360189573459716e-06,
"loss": 0.2699,
"step": 1472
},
{
"epoch": 0.27,
"grad_norm": 1.3495467901229858,
"learning_rate": 4.4549763033175355e-06,
"loss": 0.2772,
"step": 1504
},
{
"epoch": 0.27,
"grad_norm": 1.3915464878082275,
"learning_rate": 4.549763033175355e-06,
"loss": 0.2752,
"step": 1536
},
{
"epoch": 0.28,
"grad_norm": 1.3412673473358154,
"learning_rate": 4.644549763033176e-06,
"loss": 0.2751,
"step": 1568
},
{
"epoch": 0.28,
"grad_norm": 1.3777296543121338,
"learning_rate": 4.739336492890996e-06,
"loss": 0.2717,
"step": 1600
},
{
"epoch": 0.29,
"grad_norm": 1.2612873315811157,
"learning_rate": 4.834123222748816e-06,
"loss": 0.2678,
"step": 1632
},
{
"epoch": 0.3,
"grad_norm": 1.2989717721939087,
"learning_rate": 4.928909952606635e-06,
"loss": 0.2778,
"step": 1664
},
{
"epoch": 0.3,
"grad_norm": 1.3525702953338623,
"learning_rate": 4.999996575341721e-06,
"loss": 0.2719,
"step": 1696
},
{
"epoch": 0.31,
"grad_norm": 1.4678899049758911,
"learning_rate": 4.999914384012144e-06,
"loss": 0.2755,
"step": 1728
},
{
"epoch": 0.31,
"grad_norm": 1.2093278169631958,
"learning_rate": 4.999722607745944e-06,
"loss": 0.2755,
"step": 1760
},
{
"epoch": 0.32,
"grad_norm": 1.4915423393249512,
"learning_rate": 4.999421254949728e-06,
"loss": 0.2686,
"step": 1792
},
{
"epoch": 0.32,
"grad_norm": 1.1101861000061035,
"learning_rate": 4.999010338833436e-06,
"loss": 0.2594,
"step": 1824
},
{
"epoch": 0.33,
"grad_norm": 1.3432806730270386,
"learning_rate": 4.9984898774097735e-06,
"loss": 0.2658,
"step": 1856
},
{
"epoch": 0.34,
"grad_norm": 1.2808105945587158,
"learning_rate": 4.997859893493414e-06,
"loss": 0.2632,
"step": 1888
},
{
"epoch": 0.34,
"grad_norm": 1.3815045356750488,
"learning_rate": 4.997120414700003e-06,
"loss": 0.2557,
"step": 1920
},
{
"epoch": 0.35,
"grad_norm": 1.4393643140792847,
"learning_rate": 4.996271473444944e-06,
"loss": 0.263,
"step": 1952
},
{
"epoch": 0.35,
"grad_norm": 1.138375163078308,
"learning_rate": 4.995313106941982e-06,
"loss": 0.2805,
"step": 1984
},
{
"epoch": 0.36,
"grad_norm": 1.6412934064865112,
"learning_rate": 4.994245357201568e-06,
"loss": 0.2641,
"step": 2016
},
{
"epoch": 0.36,
"grad_norm": 1.465922236442566,
"learning_rate": 4.9930682710290205e-06,
"loss": 0.2637,
"step": 2048
},
{
"epoch": 0.37,
"grad_norm": 1.4526797533035278,
"learning_rate": 4.991781900022471e-06,
"loss": 0.2596,
"step": 2080
},
{
"epoch": 0.38,
"grad_norm": 1.504759669303894,
"learning_rate": 4.990386300570607e-06,
"loss": 0.2633,
"step": 2112
},
{
"epoch": 0.38,
"grad_norm": 1.5599263906478882,
"learning_rate": 4.988881533850192e-06,
"loss": 0.2658,
"step": 2144
},
{
"epoch": 0.39,
"grad_norm": 1.1662814617156982,
"learning_rate": 4.987267665823392e-06,
"loss": 0.2694,
"step": 2176
},
{
"epoch": 0.39,
"grad_norm": 1.3952819108963013,
"learning_rate": 4.98554476723488e-06,
"loss": 0.2449,
"step": 2208
},
{
"epoch": 0.4,
"grad_norm": 1.2887946367263794,
"learning_rate": 4.983712913608736e-06,
"loss": 0.2651,
"step": 2240
},
{
"epoch": 0.4,
"grad_norm": 1.5893690586090088,
"learning_rate": 4.981772185245135e-06,
"loss": 0.2568,
"step": 2272
},
{
"epoch": 0.41,
"grad_norm": 1.228550672531128,
"learning_rate": 4.979722667216829e-06,
"loss": 0.2667,
"step": 2304
},
{
"epoch": 0.42,
"grad_norm": 1.2756662368774414,
"learning_rate": 4.977564449365415e-06,
"loss": 0.2508,
"step": 2336
},
{
"epoch": 0.42,
"grad_norm": 1.5225822925567627,
"learning_rate": 4.975297626297399e-06,
"loss": 0.2691,
"step": 2368
},
{
"epoch": 0.43,
"grad_norm": 1.2656946182250977,
"learning_rate": 4.972922297380052e-06,
"loss": 0.2704,
"step": 2400
},
{
"epoch": 0.43,
"grad_norm": 1.3268104791641235,
"learning_rate": 4.970438566737043e-06,
"loss": 0.2577,
"step": 2432
},
{
"epoch": 0.44,
"grad_norm": 1.5536099672317505,
"learning_rate": 4.96784654324389e-06,
"loss": 0.2578,
"step": 2464
},
{
"epoch": 0.44,
"grad_norm": 1.1516194343566895,
"learning_rate": 4.965146340523175e-06,
"loss": 0.2446,
"step": 2496
},
{
"epoch": 0.45,
"grad_norm": 1.1923089027404785,
"learning_rate": 4.962338076939569e-06,
"loss": 0.2569,
"step": 2528
},
{
"epoch": 0.46,
"grad_norm": 1.124197006225586,
"learning_rate": 4.959421875594643e-06,
"loss": 0.2625,
"step": 2560
},
{
"epoch": 0.46,
"grad_norm": 1.680388331413269,
"learning_rate": 4.95639786432147e-06,
"loss": 0.264,
"step": 2592
},
{
"epoch": 0.47,
"grad_norm": 1.3039462566375732,
"learning_rate": 4.953266175679023e-06,
"loss": 0.2624,
"step": 2624
},
{
"epoch": 0.47,
"grad_norm": 1.109054684638977,
"learning_rate": 4.9500269469463655e-06,
"loss": 0.2548,
"step": 2656
},
{
"epoch": 0.48,
"grad_norm": 1.2704750299453735,
"learning_rate": 4.94668032011663e-06,
"loss": 0.2569,
"step": 2688
},
{
"epoch": 0.48,
"grad_norm": 1.1952179670333862,
"learning_rate": 4.943226441890794e-06,
"loss": 0.2599,
"step": 2720
},
{
"epoch": 0.49,
"grad_norm": 1.2229312658309937,
"learning_rate": 4.939665463671255e-06,
"loss": 0.2577,
"step": 2752
},
{
"epoch": 0.5,
"grad_norm": 1.3956924676895142,
"learning_rate": 4.935997541555188e-06,
"loss": 0.2642,
"step": 2784
},
{
"epoch": 0.5,
"grad_norm": 1.116629958152771,
"learning_rate": 4.932222836327703e-06,
"loss": 0.2587,
"step": 2816
},
{
"epoch": 0.51,
"grad_norm": 1.1389435529708862,
"learning_rate": 4.928341513454801e-06,
"loss": 0.2566,
"step": 2848
},
{
"epoch": 0.51,
"grad_norm": 1.3800580501556396,
"learning_rate": 4.9243537430761155e-06,
"loss": 0.2579,
"step": 2880
},
{
"epoch": 0.52,
"grad_norm": 1.3852914571762085,
"learning_rate": 4.920259699997461e-06,
"loss": 0.2666,
"step": 2912
},
{
"epoch": 0.52,
"grad_norm": 1.31257963180542,
"learning_rate": 4.916059563683162e-06,
"loss": 0.2547,
"step": 2944
},
{
"epoch": 0.53,
"grad_norm": 1.599116563796997,
"learning_rate": 4.911753518248194e-06,
"loss": 0.2612,
"step": 2976
},
{
"epoch": 0.53,
"grad_norm": 1.2397140264511108,
"learning_rate": 4.907341752450105e-06,
"loss": 0.2589,
"step": 3008
},
{
"epoch": 0.54,
"grad_norm": 1.3178327083587646,
"learning_rate": 4.9028244596807525e-06,
"loss": 0.2605,
"step": 3040
},
{
"epoch": 0.55,
"grad_norm": 1.7413417100906372,
"learning_rate": 4.898201837957811e-06,
"loss": 0.2565,
"step": 3072
},
{
"epoch": 0.55,
"grad_norm": 1.314085602760315,
"learning_rate": 4.893474089916105e-06,
"loss": 0.2498,
"step": 3104
},
{
"epoch": 0.56,
"grad_norm": 1.1399492025375366,
"learning_rate": 4.888641422798719e-06,
"loss": 0.2647,
"step": 3136
},
{
"epoch": 0.56,
"grad_norm": 1.3332985639572144,
"learning_rate": 4.883704048447916e-06,
"loss": 0.2594,
"step": 3168
},
{
"epoch": 0.57,
"grad_norm": 1.3460063934326172,
"learning_rate": 4.87866218329585e-06,
"loss": 0.2571,
"step": 3200
},
{
"epoch": 0.57,
"grad_norm": 1.5006327629089355,
"learning_rate": 4.87351604835508e-06,
"loss": 0.2458,
"step": 3232
},
{
"epoch": 0.58,
"grad_norm": 1.1781283617019653,
"learning_rate": 4.868265869208879e-06,
"loss": 0.2452,
"step": 3264
},
{
"epoch": 0.59,
"grad_norm": 1.117686152458191,
"learning_rate": 4.862911876001348e-06,
"loss": 0.2469,
"step": 3296
},
{
"epoch": 0.59,
"grad_norm": 0.9969549775123596,
"learning_rate": 4.857454303427328e-06,
"loss": 0.2453,
"step": 3328
},
{
"epoch": 0.6,
"grad_norm": 1.4894945621490479,
"learning_rate": 4.851893390722109e-06,
"loss": 0.2457,
"step": 3360
},
{
"epoch": 0.6,
"grad_norm": 1.106041431427002,
"learning_rate": 4.846229381650946e-06,
"loss": 0.2474,
"step": 3392
},
{
"epoch": 0.61,
"grad_norm": 1.035601019859314,
"learning_rate": 4.840462524498372e-06,
"loss": 0.2593,
"step": 3424
},
{
"epoch": 0.61,
"grad_norm": 1.7077690362930298,
"learning_rate": 4.834593072057313e-06,
"loss": 0.2506,
"step": 3456
},
{
"epoch": 0.62,
"grad_norm": 1.1017436981201172,
"learning_rate": 4.8286212816180124e-06,
"loss": 0.2506,
"step": 3488
},
{
"epoch": 0.63,
"grad_norm": 1.2720685005187988,
"learning_rate": 4.8225474149567434e-06,
"loss": 0.2567,
"step": 3520
},
{
"epoch": 0.63,
"grad_norm": 1.328189730644226,
"learning_rate": 4.816371738324343e-06,
"loss": 0.2531,
"step": 3552
},
{
"epoch": 0.64,
"grad_norm": 1.2597825527191162,
"learning_rate": 4.810094522434534e-06,
"loss": 0.246,
"step": 3584
},
{
"epoch": 0.64,
"grad_norm": 1.244281530380249,
"learning_rate": 4.803716042452063e-06,
"loss": 0.2433,
"step": 3616
},
{
"epoch": 0.65,
"grad_norm": 1.4658986330032349,
"learning_rate": 4.797236577980634e-06,
"loss": 0.2496,
"step": 3648
},
{
"epoch": 0.65,
"grad_norm": 1.4121670722961426,
"learning_rate": 4.7906564130506575e-06,
"loss": 0.2531,
"step": 3680
},
{
"epoch": 0.66,
"grad_norm": 1.1751240491867065,
"learning_rate": 4.783975836106791e-06,
"loss": 0.2515,
"step": 3712
},
{
"epoch": 0.67,
"grad_norm": 1.2011898756027222,
"learning_rate": 4.777195139995308e-06,
"loss": 0.2453,
"step": 3744
},
{
"epoch": 0.67,
"grad_norm": 1.5764689445495605,
"learning_rate": 4.770314621951245e-06,
"loss": 0.2496,
"step": 3776
},
{
"epoch": 0.68,
"grad_norm": 1.4584077596664429,
"learning_rate": 4.763334583585388e-06,
"loss": 0.2392,
"step": 3808
},
{
"epoch": 0.68,
"grad_norm": 1.0098185539245605,
"learning_rate": 4.756255330871039e-06,
"loss": 0.2393,
"step": 3840
},
{
"epoch": 0.69,
"grad_norm": 1.3514459133148193,
"learning_rate": 4.749077174130609e-06,
"loss": 0.2572,
"step": 3872
},
{
"epoch": 0.69,
"grad_norm": 1.3888107538223267,
"learning_rate": 4.741800428022014e-06,
"loss": 0.2383,
"step": 3904
},
{
"epoch": 0.7,
"grad_norm": 1.3402737379074097,
"learning_rate": 4.734425411524884e-06,
"loss": 0.2556,
"step": 3936
},
{
"epoch": 0.71,
"grad_norm": 1.2175307273864746,
"learning_rate": 4.726952447926576e-06,
"loss": 0.2555,
"step": 3968
},
{
"epoch": 0.71,
"grad_norm": 1.386852502822876,
"learning_rate": 4.719381864808005e-06,
"loss": 0.2503,
"step": 4000
},
{
"epoch": 0.72,
"grad_norm": 1.2774380445480347,
"learning_rate": 4.711713994029284e-06,
"loss": 0.2503,
"step": 4032
},
{
"epoch": 0.72,
"grad_norm": 1.177322268486023,
"learning_rate": 4.703949171715179e-06,
"loss": 0.2574,
"step": 4064
},
{
"epoch": 0.73,
"grad_norm": 1.269942283630371,
"learning_rate": 4.69608773824037e-06,
"loss": 0.2529,
"step": 4096
},
{
"epoch": 0.73,
"grad_norm": 1.2209409475326538,
"learning_rate": 4.688130038214534e-06,
"loss": 0.2536,
"step": 4128
},
{
"epoch": 0.74,
"grad_norm": 1.4368942975997925,
"learning_rate": 4.6800764204672385e-06,
"loss": 0.2378,
"step": 4160
},
{
"epoch": 0.75,
"grad_norm": 1.6493048667907715,
"learning_rate": 4.671927238032651e-06,
"loss": 0.2538,
"step": 4192
},
{
"epoch": 0.75,
"grad_norm": 1.038549542427063,
"learning_rate": 4.6636828481340594e-06,
"loss": 0.2501,
"step": 4224
},
{
"epoch": 0.76,
"grad_norm": 1.343204379081726,
"learning_rate": 4.655343612168219e-06,
"loss": 0.251,
"step": 4256
},
{
"epoch": 0.76,
"grad_norm": 1.4020464420318604,
"learning_rate": 4.646909895689508e-06,
"loss": 0.2564,
"step": 4288
},
{
"epoch": 0.77,
"grad_norm": 1.1331307888031006,
"learning_rate": 4.638382068393899e-06,
"loss": 0.2505,
"step": 4320
},
{
"epoch": 0.77,
"grad_norm": 1.3825620412826538,
"learning_rate": 4.629760504102761e-06,
"loss": 0.2513,
"step": 4352
},
{
"epoch": 0.78,
"grad_norm": 1.310570478439331,
"learning_rate": 4.621045580746467e-06,
"loss": 0.2464,
"step": 4384
},
{
"epoch": 0.79,
"grad_norm": 1.15547776222229,
"learning_rate": 4.61223768034783e-06,
"loss": 0.2515,
"step": 4416
},
{
"epoch": 0.79,
"grad_norm": 1.340010404586792,
"learning_rate": 4.603337189005354e-06,
"loss": 0.2473,
"step": 4448
},
{
"epoch": 0.8,
"grad_norm": 1.2413158416748047,
"learning_rate": 4.594344496876313e-06,
"loss": 0.2354,
"step": 4480
},
{
"epoch": 0.8,
"grad_norm": 1.2394189834594727,
"learning_rate": 4.585259998159646e-06,
"loss": 0.2512,
"step": 4512
},
{
"epoch": 0.81,
"grad_norm": 1.2866027355194092,
"learning_rate": 4.576084091078677e-06,
"loss": 0.2364,
"step": 4544
},
{
"epoch": 0.81,
"grad_norm": 1.1080009937286377,
"learning_rate": 4.5668171778636585e-06,
"loss": 0.2432,
"step": 4576
},
{
"epoch": 0.82,
"grad_norm": 1.2469310760498047,
"learning_rate": 4.5574596647341414e-06,
"loss": 0.256,
"step": 4608
},
{
"epoch": 0.83,
"grad_norm": 1.0387507677078247,
"learning_rate": 4.548011961881167e-06,
"loss": 0.232,
"step": 4640
},
{
"epoch": 0.83,
"grad_norm": 1.2382770776748657,
"learning_rate": 4.538474483449286e-06,
"loss": 0.2552,
"step": 4672
},
{
"epoch": 0.84,
"grad_norm": 1.2282336950302124,
"learning_rate": 4.528847647518403e-06,
"loss": 0.2525,
"step": 4704
},
{
"epoch": 0.84,
"grad_norm": 1.4016482830047607,
"learning_rate": 4.5191318760854526e-06,
"loss": 0.2582,
"step": 4736
},
{
"epoch": 0.85,
"grad_norm": 1.3214083909988403,
"learning_rate": 4.509327595045898e-06,
"loss": 0.2578,
"step": 4768
},
{
"epoch": 0.85,
"grad_norm": 0.9114232063293457,
"learning_rate": 4.499435234175065e-06,
"loss": 0.2533,
"step": 4800
},
{
"epoch": 0.86,
"grad_norm": 1.172450065612793,
"learning_rate": 4.4894552271093e-06,
"loss": 0.264,
"step": 4832
},
{
"epoch": 0.86,
"grad_norm": 1.249770998954773,
"learning_rate": 4.4793880113269595e-06,
"loss": 0.2389,
"step": 4864
},
{
"epoch": 0.87,
"grad_norm": 1.0912755727767944,
"learning_rate": 4.469234028129241e-06,
"loss": 0.2456,
"step": 4896
},
{
"epoch": 0.88,
"grad_norm": 1.1503956317901611,
"learning_rate": 4.458993722620827e-06,
"loss": 0.2562,
"step": 4928
},
{
"epoch": 0.88,
"grad_norm": 1.1564654111862183,
"learning_rate": 4.448667543690384e-06,
"loss": 0.25,
"step": 4960
},
{
"epoch": 0.89,
"grad_norm": 1.271000862121582,
"learning_rate": 4.438255943990879e-06,
"loss": 0.243,
"step": 4992
},
{
"epoch": 0.89,
"grad_norm": 1.0601048469543457,
"learning_rate": 4.427759379919739e-06,
"loss": 0.2397,
"step": 5024
},
{
"epoch": 0.9,
"grad_norm": 1.214858889579773,
"learning_rate": 4.417178311598845e-06,
"loss": 0.2442,
"step": 5056
},
{
"epoch": 0.9,
"grad_norm": 1.0516908168792725,
"learning_rate": 4.406513202854363e-06,
"loss": 0.2467,
"step": 5088
},
{
"epoch": 0.91,
"grad_norm": 1.326076865196228,
"learning_rate": 4.3957645211964065e-06,
"loss": 0.2488,
"step": 5120
},
{
"epoch": 0.92,
"grad_norm": 1.173823356628418,
"learning_rate": 4.384932737798554e-06,
"loss": 0.241,
"step": 5152
},
{
"epoch": 0.92,
"grad_norm": 1.4526327848434448,
"learning_rate": 4.3740183274771845e-06,
"loss": 0.2553,
"step": 5184
},
{
"epoch": 0.93,
"grad_norm": 1.2346609830856323,
"learning_rate": 4.363021768670668e-06,
"loss": 0.242,
"step": 5216
},
{
"epoch": 0.93,
"grad_norm": 0.8957495093345642,
"learning_rate": 4.351943543418392e-06,
"loss": 0.2444,
"step": 5248
},
{
"epoch": 0.94,
"grad_norm": 1.097772479057312,
"learning_rate": 4.340784137339632e-06,
"loss": 0.2531,
"step": 5280
},
{
"epoch": 0.94,
"grad_norm": 1.1537779569625854,
"learning_rate": 4.329544039612264e-06,
"loss": 0.2507,
"step": 5312
},
{
"epoch": 0.95,
"grad_norm": 1.1922253370285034,
"learning_rate": 4.318223742951321e-06,
"loss": 0.2335,
"step": 5344
},
{
"epoch": 0.96,
"grad_norm": 1.1036819219589233,
"learning_rate": 4.306823743587394e-06,
"loss": 0.2465,
"step": 5376
},
{
"epoch": 0.96,
"grad_norm": 1.229779839515686,
"learning_rate": 4.295344541244879e-06,
"loss": 0.2403,
"step": 5408
},
{
"epoch": 0.97,
"grad_norm": 1.4036519527435303,
"learning_rate": 4.283786639120074e-06,
"loss": 0.254,
"step": 5440
},
{
"epoch": 0.97,
"grad_norm": 0.9732062816619873,
"learning_rate": 4.272150543859117e-06,
"loss": 0.2517,
"step": 5472
},
{
"epoch": 0.98,
"grad_norm": 1.3309801816940308,
"learning_rate": 4.260436765535784e-06,
"loss": 0.25,
"step": 5504
},
{
"epoch": 0.98,
"grad_norm": 1.3353493213653564,
"learning_rate": 4.2486458176291176e-06,
"loss": 0.2482,
"step": 5536
},
{
"epoch": 0.99,
"grad_norm": 1.6585358381271362,
"learning_rate": 4.236778217000934e-06,
"loss": 0.248,
"step": 5568
},
{
"epoch": 1.0,
"grad_norm": 0.9717461466789246,
"learning_rate": 4.224834483873152e-06,
"loss": 0.2366,
"step": 5600
},
{
"epoch": 1.0,
"grad_norm": 0.9571962356567383,
"learning_rate": 4.2128151418049976e-06,
"loss": 0.2404,
"step": 5632
},
{
"epoch": 1.01,
"grad_norm": 1.0692377090454102,
"learning_rate": 4.200720717670048e-06,
"loss": 0.2135,
"step": 5664
},
{
"epoch": 1.01,
"grad_norm": 1.1159001588821411,
"learning_rate": 4.188551741633144e-06,
"loss": 0.1854,
"step": 5696
},
{
"epoch": 1.02,
"grad_norm": 1.4514949321746826,
"learning_rate": 4.176308747127136e-06,
"loss": 0.2095,
"step": 5728
},
{
"epoch": 1.02,
"grad_norm": 1.4603676795959473,
"learning_rate": 4.1639922708295176e-06,
"loss": 0.2015,
"step": 5760
},
{
"epoch": 1.03,
"grad_norm": 1.1802875995635986,
"learning_rate": 4.151602852638888e-06,
"loss": 0.222,
"step": 5792
},
{
"epoch": 1.04,
"grad_norm": 1.2036052942276,
"learning_rate": 4.139141035651288e-06,
"loss": 0.2093,
"step": 5824
},
{
"epoch": 1.04,
"grad_norm": 1.1690653562545776,
"learning_rate": 4.126607366136395e-06,
"loss": 0.1925,
"step": 5856
},
{
"epoch": 1.05,
"grad_norm": 0.9996016621589661,
"learning_rate": 4.114002393513577e-06,
"loss": 0.206,
"step": 5888
},
{
"epoch": 1.05,
"grad_norm": 1.1670773029327393,
"learning_rate": 4.101326670327807e-06,
"loss": 0.2097,
"step": 5920
},
{
"epoch": 1.06,
"grad_norm": 0.8733654022216797,
"learning_rate": 4.0885807522254435e-06,
"loss": 0.2015,
"step": 5952
},
{
"epoch": 1.06,
"grad_norm": 1.2280749082565308,
"learning_rate": 4.075765197929872e-06,
"loss": 0.2108,
"step": 5984
},
{
"epoch": 1.07,
"grad_norm": 1.1926356554031372,
"learning_rate": 4.0628805692170105e-06,
"loss": 0.2047,
"step": 6016
},
{
"epoch": 1.08,
"grad_norm": 1.0048396587371826,
"learning_rate": 4.049927430890693e-06,
"loss": 0.2077,
"step": 6048
},
{
"epoch": 1.08,
"grad_norm": 1.026442050933838,
"learning_rate": 4.0369063507578995e-06,
"loss": 0.2051,
"step": 6080
},
{
"epoch": 1.09,
"grad_norm": 1.1310842037200928,
"learning_rate": 4.023817899603875e-06,
"loss": 0.2055,
"step": 6112
},
{
"epoch": 1.09,
"grad_norm": 1.1275712251663208,
"learning_rate": 4.010662651167106e-06,
"loss": 0.1965,
"step": 6144
},
{
"epoch": 1.1,
"grad_norm": 1.1789113283157349,
"learning_rate": 3.997441182114164e-06,
"loss": 0.2118,
"step": 6176
},
{
"epoch": 1.1,
"grad_norm": 1.3836599588394165,
"learning_rate": 3.984154072014438e-06,
"loss": 0.2056,
"step": 6208
},
{
"epoch": 1.11,
"grad_norm": 1.1013050079345703,
"learning_rate": 3.970801903314722e-06,
"loss": 0.2109,
"step": 6240
},
{
"epoch": 1.12,
"grad_norm": 1.1018249988555908,
"learning_rate": 3.957385261313685e-06,
"loss": 0.202,
"step": 6272
},
{
"epoch": 1.12,
"grad_norm": 1.3906185626983643,
"learning_rate": 3.943904734136213e-06,
"loss": 0.2065,
"step": 6304
},
{
"epoch": 1.13,
"grad_norm": 1.2197610139846802,
"learning_rate": 3.930360912707632e-06,
"loss": 0.2096,
"step": 6336
},
{
"epoch": 1.13,
"grad_norm": 1.0342845916748047,
"learning_rate": 3.916754390727795e-06,
"loss": 0.2024,
"step": 6368
},
{
"epoch": 1.14,
"grad_norm": 1.236260175704956,
"learning_rate": 3.90308576464507e-06,
"loss": 0.216,
"step": 6400
},
{
"epoch": 1.14,
"grad_norm": 1.3906086683273315,
"learning_rate": 3.889355633630186e-06,
"loss": 0.2153,
"step": 6432
},
{
"epoch": 1.15,
"grad_norm": 1.2441129684448242,
"learning_rate": 3.875564599549968e-06,
"loss": 0.2092,
"step": 6464
},
{
"epoch": 1.16,
"grad_norm": 1.2320338487625122,
"learning_rate": 3.861713266940959e-06,
"loss": 0.2038,
"step": 6496
},
{
"epoch": 1.16,
"grad_norm": 1.6422646045684814,
"learning_rate": 3.847802242982915e-06,
"loss": 0.205,
"step": 6528
},
{
"epoch": 1.17,
"grad_norm": 1.1179068088531494,
"learning_rate": 3.83383213747219e-06,
"loss": 0.2162,
"step": 6560
},
{
"epoch": 1.17,
"grad_norm": 1.0986745357513428,
"learning_rate": 3.8198035627950084e-06,
"loss": 0.1956,
"step": 6592
},
{
"epoch": 1.18,
"grad_norm": 1.340859055519104,
"learning_rate": 3.8057171339006138e-06,
"loss": 0.2093,
"step": 6624
},
{
"epoch": 1.18,
"grad_norm": 1.7803446054458618,
"learning_rate": 3.791573468274323e-06,
"loss": 0.2133,
"step": 6656
},
{
"epoch": 1.19,
"grad_norm": 1.022388219833374,
"learning_rate": 3.777373185910448e-06,
"loss": 0.2182,
"step": 6688
},
{
"epoch": 1.19,
"grad_norm": 1.0795223712921143,
"learning_rate": 3.7631169092851226e-06,
"loss": 0.2051,
"step": 6720
},
{
"epoch": 1.2,
"grad_norm": 1.0785856246948242,
"learning_rate": 3.7488052633290174e-06,
"loss": 0.2047,
"step": 6752
},
{
"epoch": 1.21,
"grad_norm": 1.0391508340835571,
"learning_rate": 3.7344388753999434e-06,
"loss": 0.2081,
"step": 6784
},
{
"epoch": 1.21,
"grad_norm": 1.3015925884246826,
"learning_rate": 3.720018375255352e-06,
"loss": 0.2013,
"step": 6816
},
{
"epoch": 1.22,
"grad_norm": 1.2382066249847412,
"learning_rate": 3.7055443950247276e-06,
"loss": 0.2037,
"step": 6848
},
{
"epoch": 1.22,
"grad_norm": 1.1386123895645142,
"learning_rate": 3.691017569181882e-06,
"loss": 0.2046,
"step": 6880
},
{
"epoch": 1.23,
"grad_norm": 0.9857081770896912,
"learning_rate": 3.6764385345171393e-06,
"loss": 0.207,
"step": 6912
},
{
"epoch": 1.23,
"grad_norm": 1.1276394128799438,
"learning_rate": 3.661807930109422e-06,
"loss": 0.2134,
"step": 6944
},
{
"epoch": 1.24,
"grad_norm": 1.1821982860565186,
"learning_rate": 3.647126397298234e-06,
"loss": 0.2162,
"step": 6976
},
{
"epoch": 1.25,
"grad_norm": 1.218800663948059,
"learning_rate": 3.632394579655555e-06,
"loss": 0.2023,
"step": 7008
},
{
"epoch": 1.25,
"grad_norm": 1.083310842514038,
"learning_rate": 3.6176131229576193e-06,
"loss": 0.1999,
"step": 7040
},
{
"epoch": 1.26,
"grad_norm": 1.0640002489089966,
"learning_rate": 3.602782675156617e-06,
"loss": 0.2125,
"step": 7072
},
{
"epoch": 1.26,
"grad_norm": 1.1672149896621704,
"learning_rate": 3.5879038863522843e-06,
"loss": 0.2157,
"step": 7104
},
{
"epoch": 1.27,
"grad_norm": 1.1732845306396484,
"learning_rate": 3.572977408763407e-06,
"loss": 0.2082,
"step": 7136
},
{
"epoch": 1.27,
"grad_norm": 1.1544941663742065,
"learning_rate": 3.5580038966992344e-06,
"loss": 0.2067,
"step": 7168
},
{
"epoch": 1.28,
"grad_norm": 1.2914546728134155,
"learning_rate": 3.5429840065307924e-06,
"loss": 0.2019,
"step": 7200
},
{
"epoch": 1.29,
"grad_norm": 1.0473650693893433,
"learning_rate": 3.527918396662115e-06,
"loss": 0.1952,
"step": 7232
},
{
"epoch": 1.29,
"grad_norm": 1.2211614847183228,
"learning_rate": 3.512807727501379e-06,
"loss": 0.2093,
"step": 7264
},
{
"epoch": 1.3,
"grad_norm": 1.1035760641098022,
"learning_rate": 3.4976526614319573e-06,
"loss": 0.2007,
"step": 7296
},
{
"epoch": 1.3,
"grad_norm": 1.2120308876037598,
"learning_rate": 3.4824538627833825e-06,
"loss": 0.2205,
"step": 7328
},
{
"epoch": 1.31,
"grad_norm": 0.8647122979164124,
"learning_rate": 3.4672119978022277e-06,
"loss": 0.2063,
"step": 7360
},
{
"epoch": 1.31,
"grad_norm": 1.1142189502716064,
"learning_rate": 3.4519277346228953e-06,
"loss": 0.2075,
"step": 7392
},
{
"epoch": 1.32,
"grad_norm": 1.3183207511901855,
"learning_rate": 3.436601743238335e-06,
"loss": 0.2094,
"step": 7424
},
{
"epoch": 1.33,
"grad_norm": 1.0320820808410645,
"learning_rate": 3.421234695470673e-06,
"loss": 0.2029,
"step": 7456
},
{
"epoch": 1.33,
"grad_norm": 1.3065481185913086,
"learning_rate": 3.4058272649417607e-06,
"loss": 0.2127,
"step": 7488
},
{
"epoch": 1.34,
"grad_norm": 1.209372639656067,
"learning_rate": 3.3903801270436465e-06,
"loss": 0.2015,
"step": 7520
},
{
"epoch": 1.34,
"grad_norm": 1.0177247524261475,
"learning_rate": 3.374893958908971e-06,
"loss": 0.2075,
"step": 7552
},
{
"epoch": 1.35,
"grad_norm": 1.4457709789276123,
"learning_rate": 3.3593694393812827e-06,
"loss": 0.2098,
"step": 7584
},
{
"epoch": 1.35,
"grad_norm": 1.1711078882217407,
"learning_rate": 3.3438072489852837e-06,
"loss": 0.2088,
"step": 7616
},
{
"epoch": 1.36,
"grad_norm": 1.1928409337997437,
"learning_rate": 3.3282080698969953e-06,
"loss": 0.1918,
"step": 7648
},
{
"epoch": 1.37,
"grad_norm": 0.9215808510780334,
"learning_rate": 3.3125725859138548e-06,
"loss": 0.2106,
"step": 7680
},
{
"epoch": 1.37,
"grad_norm": 1.3021633625030518,
"learning_rate": 3.2969014824247436e-06,
"loss": 0.2018,
"step": 7712
},
{
"epoch": 1.38,
"grad_norm": 1.1597398519515991,
"learning_rate": 3.28119544637994e-06,
"loss": 0.2035,
"step": 7744
},
{
"epoch": 1.38,
"grad_norm": 1.2015706300735474,
"learning_rate": 3.265455166261009e-06,
"loss": 0.2027,
"step": 7776
},
{
"epoch": 1.39,
"grad_norm": 1.1449264287948608,
"learning_rate": 3.2496813320506183e-06,
"loss": 0.2165,
"step": 7808
},
{
"epoch": 1.39,
"grad_norm": 1.1332265138626099,
"learning_rate": 3.2338746352022965e-06,
"loss": 0.2006,
"step": 7840
},
{
"epoch": 1.4,
"grad_norm": 1.430891990661621,
"learning_rate": 3.2180357686101226e-06,
"loss": 0.2102,
"step": 7872
},
{
"epoch": 1.41,
"grad_norm": 1.4063985347747803,
"learning_rate": 3.2021654265783505e-06,
"loss": 0.196,
"step": 7904
},
{
"epoch": 1.41,
"grad_norm": 1.3970558643341064,
"learning_rate": 3.1862643047909746e-06,
"loss": 0.2161,
"step": 7936
},
{
"epoch": 1.42,
"grad_norm": 1.3233983516693115,
"learning_rate": 3.170333100281236e-06,
"loss": 0.1921,
"step": 7968
},
{
"epoch": 1.42,
"grad_norm": 1.2325806617736816,
"learning_rate": 3.154372511401064e-06,
"loss": 0.2042,
"step": 8000
},
{
"epoch": 1.43,
"grad_norm": 1.0558186769485474,
"learning_rate": 3.1383832377904676e-06,
"loss": 0.2056,
"step": 8032
},
{
"epoch": 1.43,
"grad_norm": 1.189503788948059,
"learning_rate": 3.1223659803468653e-06,
"loss": 0.203,
"step": 8064
},
{
"epoch": 1.44,
"grad_norm": 1.1646627187728882,
"learning_rate": 3.1063214411943576e-06,
"loss": 0.2088,
"step": 8096
},
{
"epoch": 1.45,
"grad_norm": 1.1149977445602417,
"learning_rate": 3.0902503236529533e-06,
"loss": 0.2081,
"step": 8128
},
{
"epoch": 1.45,
"grad_norm": 1.5566740036010742,
"learning_rate": 3.074153332207738e-06,
"loss": 0.2141,
"step": 8160
},
{
"epoch": 1.46,
"grad_norm": 1.304262638092041,
"learning_rate": 3.058031172477992e-06,
"loss": 0.2006,
"step": 8192
},
{
"epoch": 1.46,
"grad_norm": 1.1247010231018066,
"learning_rate": 3.041884551186258e-06,
"loss": 0.2109,
"step": 8224
},
{
"epoch": 1.47,
"grad_norm": 1.4587311744689941,
"learning_rate": 3.0257141761273627e-06,
"loss": 0.2016,
"step": 8256
},
{
"epoch": 1.47,
"grad_norm": 1.1545838117599487,
"learning_rate": 3.0095207561373935e-06,
"loss": 0.183,
"step": 8288
},
{
"epoch": 1.48,
"grad_norm": 1.3221790790557861,
"learning_rate": 2.9933050010626208e-06,
"loss": 0.1985,
"step": 8320
},
{
"epoch": 1.49,
"grad_norm": 1.1717700958251953,
"learning_rate": 2.9770676217283844e-06,
"loss": 0.2113,
"step": 8352
},
{
"epoch": 1.49,
"grad_norm": 1.0709022283554077,
"learning_rate": 2.960809329907934e-06,
"loss": 0.2012,
"step": 8384
},
{
"epoch": 1.5,
"grad_norm": 1.4594495296478271,
"learning_rate": 2.944530838291229e-06,
"loss": 0.2039,
"step": 8416
},
{
"epoch": 1.5,
"grad_norm": 1.3135212659835815,
"learning_rate": 2.928232860453694e-06,
"loss": 0.206,
"step": 8448
},
{
"epoch": 1.51,
"grad_norm": 1.316394329071045,
"learning_rate": 2.911916110824945e-06,
"loss": 0.212,
"step": 8480
},
{
"epoch": 1.51,
"grad_norm": 1.3602452278137207,
"learning_rate": 2.895581304657465e-06,
"loss": 0.2068,
"step": 8512
},
{
"epoch": 1.52,
"grad_norm": 1.2073897123336792,
"learning_rate": 2.8792291579952553e-06,
"loss": 0.2098,
"step": 8544
},
{
"epoch": 1.52,
"grad_norm": 1.2983072996139526,
"learning_rate": 2.8628603876424467e-06,
"loss": 0.2086,
"step": 8576
},
{
"epoch": 1.53,
"grad_norm": 1.0781196355819702,
"learning_rate": 2.846475711131877e-06,
"loss": 0.201,
"step": 8608
},
{
"epoch": 1.54,
"grad_norm": 1.1917251348495483,
"learning_rate": 2.8300758466936366e-06,
"loss": 0.1982,
"step": 8640
},
{
"epoch": 1.54,
"grad_norm": 1.2894983291625977,
"learning_rate": 2.813661513223588e-06,
"loss": 0.1943,
"step": 8672
},
{
"epoch": 1.55,
"grad_norm": 1.2249202728271484,
"learning_rate": 2.7972334302518504e-06,
"loss": 0.2145,
"step": 8704
},
{
"epoch": 1.55,
"grad_norm": 1.1947064399719238,
"learning_rate": 2.7807923179112576e-06,
"loss": 0.2003,
"step": 8736
},
{
"epoch": 1.56,
"grad_norm": 1.0660251379013062,
"learning_rate": 2.764338896905792e-06,
"loss": 0.1984,
"step": 8768
},
{
"epoch": 1.56,
"grad_norm": 1.0243247747421265,
"learning_rate": 2.7478738884789934e-06,
"loss": 0.2036,
"step": 8800
},
{
"epoch": 1.57,
"grad_norm": 1.286199927330017,
"learning_rate": 2.731398014382341e-06,
"loss": 0.2027,
"step": 8832
},
{
"epoch": 1.58,
"grad_norm": 1.1617448329925537,
"learning_rate": 2.714911996843617e-06,
"loss": 0.2162,
"step": 8864
},
{
"epoch": 1.58,
"grad_norm": 1.1921496391296387,
"learning_rate": 2.6984165585352435e-06,
"loss": 0.2124,
"step": 8896
},
{
"epoch": 1.59,
"grad_norm": 1.2066140174865723,
"learning_rate": 2.6819124225426085e-06,
"loss": 0.199,
"step": 8928
},
{
"epoch": 1.59,
"grad_norm": 1.0459320545196533,
"learning_rate": 2.665400312332368e-06,
"loss": 0.2072,
"step": 8960
},
{
"epoch": 1.6,
"grad_norm": 1.2983636856079102,
"learning_rate": 2.648880951720729e-06,
"loss": 0.2024,
"step": 8992
},
{
"epoch": 1.6,
"grad_norm": 1.0876768827438354,
"learning_rate": 2.6323550648417267e-06,
"loss": 0.2143,
"step": 9024
},
{
"epoch": 1.61,
"grad_norm": 1.0047022104263306,
"learning_rate": 2.6158233761154744e-06,
"loss": 0.2043,
"step": 9056
},
{
"epoch": 1.62,
"grad_norm": 0.9878237247467041,
"learning_rate": 2.5992866102164146e-06,
"loss": 0.1991,
"step": 9088
},
{
"epoch": 1.62,
"grad_norm": 0.9894827604293823,
"learning_rate": 2.58274549204155e-06,
"loss": 0.1979,
"step": 9120
},
{
"epoch": 1.63,
"grad_norm": 0.9374232292175293,
"learning_rate": 2.5662007466786674e-06,
"loss": 0.2055,
"step": 9152
},
{
"epoch": 1.63,
"grad_norm": 1.259948492050171,
"learning_rate": 2.5496530993745518e-06,
"loss": 0.2057,
"step": 9184
},
{
"epoch": 1.64,
"grad_norm": 0.958737850189209,
"learning_rate": 2.533103275503197e-06,
"loss": 0.2029,
"step": 9216
},
{
"epoch": 1.64,
"grad_norm": 1.079717755317688,
"learning_rate": 2.5165520005340082e-06,
"loss": 0.2049,
"step": 9248
},
{
"epoch": 1.65,
"grad_norm": 1.1001982688903809,
"learning_rate": 2.5e-06,
"loss": 0.211,
"step": 9280
},
{
"epoch": 1.66,
"grad_norm": 1.2408779859542847,
"learning_rate": 2.4834479994659926e-06,
"loss": 0.2028,
"step": 9312
},
{
"epoch": 1.66,
"grad_norm": 1.0395313501358032,
"learning_rate": 2.4668967244968035e-06,
"loss": 0.1988,
"step": 9344
},
{
"epoch": 1.67,
"grad_norm": 1.0080056190490723,
"learning_rate": 2.4503469006254487e-06,
"loss": 0.1988,
"step": 9376
},
{
"epoch": 1.67,
"grad_norm": 1.4669593572616577,
"learning_rate": 2.4337992533213334e-06,
"loss": 0.1942,
"step": 9408
},
{
"epoch": 1.68,
"grad_norm": 1.3393524885177612,
"learning_rate": 2.4172545079584508e-06,
"loss": 0.1964,
"step": 9440
},
{
"epoch": 1.68,
"grad_norm": 0.9786149263381958,
"learning_rate": 2.4007133897835863e-06,
"loss": 0.1984,
"step": 9472
},
{
"epoch": 1.69,
"grad_norm": 1.1999776363372803,
"learning_rate": 2.3841766238845264e-06,
"loss": 0.2102,
"step": 9504
},
{
"epoch": 1.7,
"grad_norm": 1.3276174068450928,
"learning_rate": 2.367644935158274e-06,
"loss": 0.1941,
"step": 9536
},
{
"epoch": 1.7,
"grad_norm": 1.0124472379684448,
"learning_rate": 2.3511190482792713e-06,
"loss": 0.199,
"step": 9568
},
{
"epoch": 1.71,
"grad_norm": 1.258489966392517,
"learning_rate": 2.3345996876676334e-06,
"loss": 0.2008,
"step": 9600
},
{
"epoch": 1.71,
"grad_norm": 1.1993016004562378,
"learning_rate": 2.318087577457392e-06,
"loss": 0.2154,
"step": 9632
},
{
"epoch": 1.72,
"grad_norm": 1.250908613204956,
"learning_rate": 2.3015834414647573e-06,
"loss": 0.2068,
"step": 9664
},
{
"epoch": 1.72,
"grad_norm": 1.211915373802185,
"learning_rate": 2.2850880031563845e-06,
"loss": 0.1946,
"step": 9696
},
{
"epoch": 1.73,
"grad_norm": 1.0278340578079224,
"learning_rate": 2.26860198561766e-06,
"loss": 0.1948,
"step": 9728
},
{
"epoch": 1.74,
"grad_norm": 1.2455780506134033,
"learning_rate": 2.2521261115210074e-06,
"loss": 0.197,
"step": 9760
},
{
"epoch": 1.74,
"grad_norm": 1.2321908473968506,
"learning_rate": 2.2356611030942084e-06,
"loss": 0.2075,
"step": 9792
},
{
"epoch": 1.75,
"grad_norm": 1.0618436336517334,
"learning_rate": 2.219207682088743e-06,
"loss": 0.1931,
"step": 9824
},
{
"epoch": 1.75,
"grad_norm": 1.36842942237854,
"learning_rate": 2.20276656974815e-06,
"loss": 0.1999,
"step": 9856
},
{
"epoch": 1.76,
"grad_norm": 1.033603310585022,
"learning_rate": 2.186338486776412e-06,
"loss": 0.2028,
"step": 9888
},
{
"epoch": 1.76,
"grad_norm": 1.303781270980835,
"learning_rate": 2.169924153306363e-06,
"loss": 0.214,
"step": 9920
},
{
"epoch": 1.77,
"grad_norm": 1.2051355838775635,
"learning_rate": 2.153524288868124e-06,
"loss": 0.2091,
"step": 9952
},
{
"epoch": 1.78,
"grad_norm": 0.9946267604827881,
"learning_rate": 2.137139612357554e-06,
"loss": 0.1942,
"step": 9984
},
{
"epoch": 1.78,
"grad_norm": 1.283492088317871,
"learning_rate": 2.120770842004746e-06,
"loss": 0.1971,
"step": 10016
},
{
"epoch": 1.79,
"grad_norm": 0.9329233169555664,
"learning_rate": 2.1044186953425358e-06,
"loss": 0.203,
"step": 10048
},
{
"epoch": 1.79,
"grad_norm": 1.082767367362976,
"learning_rate": 2.0880838891750553e-06,
"loss": 0.2012,
"step": 10080
},
{
"epoch": 1.8,
"grad_norm": 1.1007740497589111,
"learning_rate": 2.0717671395463063e-06,
"loss": 0.2028,
"step": 10112
},
{
"epoch": 1.8,
"grad_norm": 1.2502014636993408,
"learning_rate": 2.0554691617087725e-06,
"loss": 0.2121,
"step": 10144
},
{
"epoch": 1.81,
"grad_norm": 1.073034405708313,
"learning_rate": 2.0391906700920667e-06,
"loss": 0.1994,
"step": 10176
},
{
"epoch": 1.82,
"grad_norm": 0.9409189820289612,
"learning_rate": 2.0229323782716156e-06,
"loss": 0.2054,
"step": 10208
},
{
"epoch": 1.82,
"grad_norm": 1.197383999824524,
"learning_rate": 2.0066949989373797e-06,
"loss": 0.1946,
"step": 10240
},
{
"epoch": 1.83,
"grad_norm": 1.261612892150879,
"learning_rate": 1.9904792438626074e-06,
"loss": 0.2038,
"step": 10272
},
{
"epoch": 1.83,
"grad_norm": 1.4839472770690918,
"learning_rate": 1.9742858238726377e-06,
"loss": 0.2067,
"step": 10304
},
{
"epoch": 1.84,
"grad_norm": 1.0103521347045898,
"learning_rate": 1.9581154488137425e-06,
"loss": 0.2104,
"step": 10336
},
{
"epoch": 1.84,
"grad_norm": 1.2776283025741577,
"learning_rate": 1.9419688275220085e-06,
"loss": 0.196,
"step": 10368
},
{
"epoch": 1.85,
"grad_norm": 1.0784910917282104,
"learning_rate": 1.9258466677922624e-06,
"loss": 0.1975,
"step": 10400
},
{
"epoch": 1.85,
"grad_norm": 0.9643808007240295,
"learning_rate": 1.909749676347047e-06,
"loss": 0.2111,
"step": 10432
},
{
"epoch": 1.86,
"grad_norm": 1.3432437181472778,
"learning_rate": 1.8936785588056428e-06,
"loss": 0.1923,
"step": 10464
},
{
"epoch": 1.87,
"grad_norm": 1.133470892906189,
"learning_rate": 1.8776340196531351e-06,
"loss": 0.2016,
"step": 10496
},
{
"epoch": 1.87,
"grad_norm": 1.0897003412246704,
"learning_rate": 1.8616167622095328e-06,
"loss": 0.193,
"step": 10528
},
{
"epoch": 1.88,
"grad_norm": 0.9629374146461487,
"learning_rate": 1.8456274885989374e-06,
"loss": 0.1937,
"step": 10560
},
{
"epoch": 1.88,
"grad_norm": 1.406630039215088,
"learning_rate": 1.829666899718765e-06,
"loss": 0.1997,
"step": 10592
},
{
"epoch": 1.89,
"grad_norm": 1.165366291999817,
"learning_rate": 1.8137356952090258e-06,
"loss": 0.1976,
"step": 10624
},
{
"epoch": 1.89,
"grad_norm": 1.2674609422683716,
"learning_rate": 1.7978345734216502e-06,
"loss": 0.1908,
"step": 10656
},
{
"epoch": 1.9,
"grad_norm": 1.2211626768112183,
"learning_rate": 1.7819642313898783e-06,
"loss": 0.1984,
"step": 10688
},
{
"epoch": 1.91,
"grad_norm": 1.1066653728485107,
"learning_rate": 1.766125364797704e-06,
"loss": 0.2035,
"step": 10720
},
{
"epoch": 1.91,
"grad_norm": 1.539157748222351,
"learning_rate": 1.7503186679493821e-06,
"loss": 0.201,
"step": 10752
},
{
"epoch": 1.92,
"grad_norm": 1.1897879838943481,
"learning_rate": 1.7345448337389918e-06,
"loss": 0.194,
"step": 10784
},
{
"epoch": 1.92,
"grad_norm": 1.162571668624878,
"learning_rate": 1.7188045536200604e-06,
"loss": 0.1899,
"step": 10816
},
{
"epoch": 1.93,
"grad_norm": 1.0616639852523804,
"learning_rate": 1.7030985175752574e-06,
"loss": 0.1978,
"step": 10848
},
{
"epoch": 1.93,
"grad_norm": 1.0391641855239868,
"learning_rate": 1.687427414086146e-06,
"loss": 0.2017,
"step": 10880
},
{
"epoch": 1.94,
"grad_norm": 1.0870113372802734,
"learning_rate": 1.6717919301030055e-06,
"loss": 0.2012,
"step": 10912
},
{
"epoch": 1.95,
"grad_norm": 1.143446922302246,
"learning_rate": 1.6561927510147172e-06,
"loss": 0.1911,
"step": 10944
},
{
"epoch": 1.95,
"grad_norm": 1.015080213546753,
"learning_rate": 1.6406305606187183e-06,
"loss": 0.198,
"step": 10976
},
{
"epoch": 1.96,
"grad_norm": 0.9722278714179993,
"learning_rate": 1.6251060410910301e-06,
"loss": 0.1862,
"step": 11008
},
{
"epoch": 1.96,
"grad_norm": 0.9311158061027527,
"learning_rate": 1.6096198729563539e-06,
"loss": 0.198,
"step": 11040
},
{
"epoch": 1.97,
"grad_norm": 1.3127020597457886,
"learning_rate": 1.5941727350582399e-06,
"loss": 0.2,
"step": 11072
},
{
"epoch": 1.97,
"grad_norm": 0.9450055956840515,
"learning_rate": 1.5787653045293278e-06,
"loss": 0.2015,
"step": 11104
},
{
"epoch": 1.98,
"grad_norm": 1.1553057432174683,
"learning_rate": 1.5633982567616657e-06,
"loss": 0.2068,
"step": 11136
},
{
"epoch": 1.99,
"grad_norm": 0.8617095351219177,
"learning_rate": 1.548072265377105e-06,
"loss": 0.2014,
"step": 11168
},
{
"epoch": 1.99,
"grad_norm": 0.9857013821601868,
"learning_rate": 1.532788002197773e-06,
"loss": 0.2031,
"step": 11200
},
{
"epoch": 2.0,
"grad_norm": 1.0158861875534058,
"learning_rate": 1.5175461372166177e-06,
"loss": 0.1941,
"step": 11232
},
{
"epoch": 2.0,
"grad_norm": 1.1121022701263428,
"learning_rate": 1.5023473385680438e-06,
"loss": 0.1708,
"step": 11264
},
{
"epoch": 2.01,
"grad_norm": 1.1300593614578247,
"learning_rate": 1.4871922724986215e-06,
"loss": 0.1504,
"step": 11296
},
{
"epoch": 2.01,
"grad_norm": 1.232246994972229,
"learning_rate": 1.4720816033378856e-06,
"loss": 0.151,
"step": 11328
},
{
"epoch": 2.02,
"grad_norm": 1.2618398666381836,
"learning_rate": 1.4570159934692085e-06,
"loss": 0.1421,
"step": 11360
},
{
"epoch": 2.03,
"grad_norm": 1.275038242340088,
"learning_rate": 1.4419961033007669e-06,
"loss": 0.1457,
"step": 11392
},
{
"epoch": 2.03,
"grad_norm": 1.079405426979065,
"learning_rate": 1.427022591236594e-06,
"loss": 0.144,
"step": 11424
},
{
"epoch": 2.04,
"grad_norm": 1.1695400476455688,
"learning_rate": 1.4120961136477168e-06,
"loss": 0.1531,
"step": 11456
},
{
"epoch": 2.04,
"grad_norm": 1.008547306060791,
"learning_rate": 1.3972173248433832e-06,
"loss": 0.1453,
"step": 11488
},
{
"epoch": 2.05,
"grad_norm": 1.0746265649795532,
"learning_rate": 1.3823868770423815e-06,
"loss": 0.1446,
"step": 11520
},
{
"epoch": 2.05,
"grad_norm": 1.1596614122390747,
"learning_rate": 1.3676054203444462e-06,
"loss": 0.1477,
"step": 11552
},
{
"epoch": 2.06,
"grad_norm": 1.1029706001281738,
"learning_rate": 1.3528736027017663e-06,
"loss": 0.1477,
"step": 11584
},
{
"epoch": 2.07,
"grad_norm": 0.9398396015167236,
"learning_rate": 1.3381920698905788e-06,
"loss": 0.1477,
"step": 11616
},
{
"epoch": 2.07,
"grad_norm": 1.0209776163101196,
"learning_rate": 1.3235614654828604e-06,
"loss": 0.1448,
"step": 11648
},
{
"epoch": 2.08,
"grad_norm": 0.9415841102600098,
"learning_rate": 1.3089824308181187e-06,
"loss": 0.1481,
"step": 11680
},
{
"epoch": 2.08,
"grad_norm": 1.088069200515747,
"learning_rate": 1.2944556049752726e-06,
"loss": 0.149,
"step": 11712
},
{
"epoch": 2.09,
"grad_norm": 1.3269786834716797,
"learning_rate": 1.2799816247446494e-06,
"loss": 0.1497,
"step": 11744
},
{
"epoch": 2.09,
"grad_norm": 0.9119545817375183,
"learning_rate": 1.265561124600057e-06,
"loss": 0.1467,
"step": 11776
},
{
"epoch": 2.1,
"grad_norm": 1.0677683353424072,
"learning_rate": 1.251194736670983e-06,
"loss": 0.1448,
"step": 11808
},
{
"epoch": 2.11,
"grad_norm": 1.0756884813308716,
"learning_rate": 1.2368830907148778e-06,
"loss": 0.1363,
"step": 11840
},
{
"epoch": 2.11,
"grad_norm": 1.0578961372375488,
"learning_rate": 1.2226268140895528e-06,
"loss": 0.1527,
"step": 11872
},
{
"epoch": 2.12,
"grad_norm": 1.0562700033187866,
"learning_rate": 1.2084265317256772e-06,
"loss": 0.1449,
"step": 11904
},
{
"epoch": 2.12,
"grad_norm": 1.0958082675933838,
"learning_rate": 1.1942828660993869e-06,
"loss": 0.1474,
"step": 11936
},
{
"epoch": 2.13,
"grad_norm": 0.9672511219978333,
"learning_rate": 1.1801964372049932e-06,
"loss": 0.1459,
"step": 11968
},
{
"epoch": 2.13,
"grad_norm": 1.1125974655151367,
"learning_rate": 1.1661678625278106e-06,
"loss": 0.1483,
"step": 12000
},
{
"epoch": 2.14,
"grad_norm": 1.227283239364624,
"learning_rate": 1.152197757017086e-06,
"loss": 0.1453,
"step": 12032
},
{
"epoch": 2.15,
"grad_norm": 1.3206896781921387,
"learning_rate": 1.1382867330590414e-06,
"loss": 0.1425,
"step": 12064
},
{
"epoch": 2.15,
"grad_norm": 0.9912922978401184,
"learning_rate": 1.1244354004500335e-06,
"loss": 0.1529,
"step": 12096
},
{
"epoch": 2.16,
"grad_norm": 0.8996345400810242,
"learning_rate": 1.110644366369815e-06,
"loss": 0.1437,
"step": 12128
},
{
"epoch": 2.16,
"grad_norm": 0.9117119312286377,
"learning_rate": 1.0969142353549315e-06,
"loss": 0.1429,
"step": 12160
},
{
"epoch": 2.17,
"grad_norm": 1.1916334629058838,
"learning_rate": 1.0832456092722063e-06,
"loss": 0.1509,
"step": 12192
},
{
"epoch": 2.17,
"grad_norm": 1.1345574855804443,
"learning_rate": 1.0696390872923696e-06,
"loss": 0.1547,
"step": 12224
},
{
"epoch": 2.18,
"grad_norm": 1.3311399221420288,
"learning_rate": 1.0560952658637869e-06,
"loss": 0.1428,
"step": 12256
},
{
"epoch": 2.18,
"grad_norm": 1.0195939540863037,
"learning_rate": 1.042614738686315e-06,
"loss": 0.1447,
"step": 12288
},
{
"epoch": 2.19,
"grad_norm": 1.1453065872192383,
"learning_rate": 1.029198096685278e-06,
"loss": 0.1384,
"step": 12320
},
{
"epoch": 2.2,
"grad_norm": 1.1899516582489014,
"learning_rate": 1.0158459279855632e-06,
"loss": 0.1433,
"step": 12352
},
{
"epoch": 2.2,
"grad_norm": 1.060065507888794,
"learning_rate": 1.0025588178858372e-06,
"loss": 0.1456,
"step": 12384
},
{
"epoch": 2.21,
"grad_norm": 1.1489146947860718,
"learning_rate": 9.893373488328953e-07,
"loss": 0.1433,
"step": 12416
},
{
"epoch": 2.21,
"grad_norm": 1.3114417791366577,
"learning_rate": 9.761821003961246e-07,
"loss": 0.1467,
"step": 12448
},
{
"epoch": 2.22,
"grad_norm": 1.3255183696746826,
"learning_rate": 9.630936492421005e-07,
"loss": 0.1463,
"step": 12480
},
{
"epoch": 2.22,
"grad_norm": 1.2642742395401,
"learning_rate": 9.500725691093085e-07,
"loss": 0.1525,
"step": 12512
},
{
"epoch": 2.23,
"grad_norm": 1.2281138896942139,
"learning_rate": 9.371194307829895e-07,
"loss": 0.1383,
"step": 12544
},
{
"epoch": 2.24,
"grad_norm": 0.9627875089645386,
"learning_rate": 9.242348020701295e-07,
"loss": 0.1642,
"step": 12576
},
{
"epoch": 2.24,
"grad_norm": 1.1187249422073364,
"learning_rate": 9.114192477745568e-07,
"loss": 0.1439,
"step": 12608
},
{
"epoch": 2.25,
"grad_norm": 1.0410840511322021,
"learning_rate": 8.986733296721931e-07,
"loss": 0.142,
"step": 12640
},
{
"epoch": 2.25,
"grad_norm": 1.2345024347305298,
"learning_rate": 8.859976064864235e-07,
"loss": 0.1512,
"step": 12672
},
{
"epoch": 2.26,
"grad_norm": 1.6558443307876587,
"learning_rate": 8.733926338636056e-07,
"loss": 0.1363,
"step": 12704
},
{
"epoch": 2.26,
"grad_norm": 1.3118538856506348,
"learning_rate": 8.608589643487128e-07,
"loss": 0.1471,
"step": 12736
},
{
"epoch": 2.27,
"grad_norm": 1.1155567169189453,
"learning_rate": 8.483971473611133e-07,
"loss": 0.1396,
"step": 12768
},
{
"epoch": 2.28,
"grad_norm": 1.0880179405212402,
"learning_rate": 8.360077291704821e-07,
"loss": 0.1413,
"step": 12800
},
{
"epoch": 2.28,
"grad_norm": 0.9752321839332581,
"learning_rate": 8.236912528728647e-07,
"loss": 0.146,
"step": 12832
},
{
"epoch": 2.29,
"grad_norm": 0.9778379201889038,
"learning_rate": 8.114482583668576e-07,
"loss": 0.1403,
"step": 12864
},
{
"epoch": 2.29,
"grad_norm": 0.8760839700698853,
"learning_rate": 7.99279282329952e-07,
"loss": 0.148,
"step": 12896
},
{
"epoch": 2.3,
"grad_norm": 1.187658667564392,
"learning_rate": 7.871848581950039e-07,
"loss": 0.132,
"step": 12928
},
{
"epoch": 2.3,
"grad_norm": 0.9668059349060059,
"learning_rate": 7.751655161268481e-07,
"loss": 0.1424,
"step": 12960
},
{
"epoch": 2.31,
"grad_norm": 1.1318392753601074,
"learning_rate": 7.632217829990668e-07,
"loss": 0.1516,
"step": 12992
},
{
"epoch": 2.32,
"grad_norm": 1.3520994186401367,
"learning_rate": 7.513541823708828e-07,
"loss": 0.1495,
"step": 13024
},
{
"epoch": 2.32,
"grad_norm": 1.3352413177490234,
"learning_rate": 7.395632344642173e-07,
"loss": 0.1446,
"step": 13056
},
{
"epoch": 2.33,
"grad_norm": 1.0273305177688599,
"learning_rate": 7.278494561408833e-07,
"loss": 0.1391,
"step": 13088
},
{
"epoch": 2.33,
"grad_norm": 1.2872681617736816,
"learning_rate": 7.162133608799271e-07,
"loss": 0.1391,
"step": 13120
},
{
"epoch": 2.34,
"grad_norm": 1.0563528537750244,
"learning_rate": 7.046554587551216e-07,
"loss": 0.1521,
"step": 13152
},
{
"epoch": 2.34,
"grad_norm": 1.1487845182418823,
"learning_rate": 6.931762564126074e-07,
"loss": 0.1411,
"step": 13184
},
{
"epoch": 2.35,
"grad_norm": 1.058159351348877,
"learning_rate": 6.817762570486791e-07,
"loss": 0.1424,
"step": 13216
},
{
"epoch": 2.36,
"grad_norm": 1.249377727508545,
"learning_rate": 6.704559603877367e-07,
"loss": 0.1448,
"step": 13248
},
{
"epoch": 2.36,
"grad_norm": 0.9334893226623535,
"learning_rate": 6.592158626603689e-07,
"loss": 0.1384,
"step": 13280
},
{
"epoch": 2.37,
"grad_norm": 1.5639148950576782,
"learning_rate": 6.480564565816091e-07,
"loss": 0.1426,
"step": 13312
},
{
"epoch": 2.37,
"grad_norm": 1.0596867799758911,
"learning_rate": 6.369782313293335e-07,
"loss": 0.1358,
"step": 13344
},
{
"epoch": 2.38,
"grad_norm": 0.8567415475845337,
"learning_rate": 6.259816725228158e-07,
"loss": 0.1465,
"step": 13376
},
{
"epoch": 2.38,
"grad_norm": 1.1086764335632324,
"learning_rate": 6.150672622014459e-07,
"loss": 0.1538,
"step": 13408
},
{
"epoch": 2.39,
"grad_norm": 1.1636631488800049,
"learning_rate": 6.042354788035943e-07,
"loss": 0.1389,
"step": 13440
},
{
"epoch": 2.4,
"grad_norm": 1.274596929550171,
"learning_rate": 5.934867971456384e-07,
"loss": 0.1464,
"step": 13472
},
{
"epoch": 2.4,
"grad_norm": 1.173563003540039,
"learning_rate": 5.828216884011553e-07,
"loss": 0.1435,
"step": 13504
},
{
"epoch": 2.41,
"grad_norm": 1.0788921117782593,
"learning_rate": 5.722406200802613e-07,
"loss": 0.145,
"step": 13536
},
{
"epoch": 2.41,
"grad_norm": 1.1613490581512451,
"learning_rate": 5.617440560091212e-07,
"loss": 0.1474,
"step": 13568
},
{
"epoch": 2.42,
"grad_norm": 0.9075080156326294,
"learning_rate": 5.513324563096167e-07,
"loss": 0.1423,
"step": 13600
},
{
"epoch": 2.42,
"grad_norm": 1.1296495199203491,
"learning_rate": 5.41006277379173e-07,
"loss": 0.1506,
"step": 13632
},
{
"epoch": 2.43,
"grad_norm": 1.2199699878692627,
"learning_rate": 5.307659718707603e-07,
"loss": 0.1459,
"step": 13664
},
{
"epoch": 2.44,
"grad_norm": 1.2415364980697632,
"learning_rate": 5.20611988673041e-07,
"loss": 0.1459,
"step": 13696
},
{
"epoch": 2.44,
"grad_norm": 0.9814534187316895,
"learning_rate": 5.105447728907012e-07,
"loss": 0.1405,
"step": 13728
},
{
"epoch": 2.45,
"grad_norm": 1.1437889337539673,
"learning_rate": 5.00564765824936e-07,
"loss": 0.147,
"step": 13760
},
{
"epoch": 2.45,
"grad_norm": 1.1459242105484009,
"learning_rate": 4.906724049541023e-07,
"loss": 0.1454,
"step": 13792
},
{
"epoch": 2.46,
"grad_norm": 1.093807578086853,
"learning_rate": 4.808681239145479e-07,
"loss": 0.1448,
"step": 13824
},
{
"epoch": 2.46,
"grad_norm": 1.1457182168960571,
"learning_rate": 4.711523524815978e-07,
"loss": 0.1391,
"step": 13856
},
{
"epoch": 2.47,
"grad_norm": 1.0422513484954834,
"learning_rate": 4.615255165507146e-07,
"loss": 0.1435,
"step": 13888
},
{
"epoch": 2.48,
"grad_norm": 1.1171213388442993,
"learning_rate": 4.5198803811883326e-07,
"loss": 0.1545,
"step": 13920
},
{
"epoch": 2.48,
"grad_norm": 1.3410863876342773,
"learning_rate": 4.4254033526585917e-07,
"loss": 0.1526,
"step": 13952
},
{
"epoch": 2.49,
"grad_norm": 0.9821498394012451,
"learning_rate": 4.331828221363424e-07,
"loss": 0.1407,
"step": 13984
},
{
"epoch": 2.49,
"grad_norm": 1.2533886432647705,
"learning_rate": 4.239159089213246e-07,
"loss": 0.1358,
"step": 14016
},
{
"epoch": 2.5,
"grad_norm": 0.9848787784576416,
"learning_rate": 4.147400018403544e-07,
"loss": 0.1449,
"step": 14048
},
{
"epoch": 2.5,
"grad_norm": 0.9092288613319397,
"learning_rate": 4.056555031236878e-07,
"loss": 0.1433,
"step": 14080
},
{
"epoch": 2.51,
"grad_norm": 1.3839354515075684,
"learning_rate": 3.966628109946469e-07,
"loss": 0.1494,
"step": 14112
},
{
"epoch": 2.51,
"grad_norm": 0.9744381904602051,
"learning_rate": 3.877623196521707e-07,
"loss": 0.1426,
"step": 14144
},
{
"epoch": 2.52,
"grad_norm": 1.165624976158142,
"learning_rate": 3.7895441925353356e-07,
"loss": 0.1418,
"step": 14176
},
{
"epoch": 2.53,
"grad_norm": 0.9751477241516113,
"learning_rate": 3.702394958972391e-07,
"loss": 0.1479,
"step": 14208
},
{
"epoch": 2.53,
"grad_norm": 1.0645439624786377,
"learning_rate": 3.616179316061011e-07,
"loss": 0.1373,
"step": 14240
},
{
"epoch": 2.54,
"grad_norm": 0.9858296513557434,
"learning_rate": 3.5309010431049284e-07,
"loss": 0.1367,
"step": 14272
},
{
"epoch": 2.54,
"grad_norm": 1.0521584749221802,
"learning_rate": 3.44656387831781e-07,
"loss": 0.1421,
"step": 14304
},
{
"epoch": 2.55,
"grad_norm": 1.2546510696411133,
"learning_rate": 3.363171518659408e-07,
"loss": 0.1384,
"step": 14336
},
{
"epoch": 2.55,
"grad_norm": 1.0795034170150757,
"learning_rate": 3.280727619673496e-07,
"loss": 0.1463,
"step": 14368
},
{
"epoch": 2.56,
"grad_norm": 1.1764500141143799,
"learning_rate": 3.199235795327615e-07,
"loss": 0.1499,
"step": 14400
},
{
"epoch": 2.57,
"grad_norm": 1.1234067678451538,
"learning_rate": 3.1186996178546674e-07,
"loss": 0.1497,
"step": 14432
},
{
"epoch": 2.57,
"grad_norm": 0.9825364947319031,
"learning_rate": 3.039122617596302e-07,
"loss": 0.1514,
"step": 14464
},
{
"epoch": 2.58,
"grad_norm": 1.263085961341858,
"learning_rate": 2.960508282848215e-07,
"loss": 0.1476,
"step": 14496
},
{
"epoch": 2.58,
"grad_norm": 1.084181308746338,
"learning_rate": 2.8828600597071597e-07,
"loss": 0.1308,
"step": 14528
},
{
"epoch": 2.59,
"grad_norm": 1.1697498559951782,
"learning_rate": 2.8061813519199536e-07,
"loss": 0.1348,
"step": 14560
},
{
"epoch": 2.59,
"grad_norm": 1.3982306718826294,
"learning_rate": 2.7304755207342467e-07,
"loss": 0.1455,
"step": 14592
},
{
"epoch": 2.6,
"grad_norm": 1.1802705526351929,
"learning_rate": 2.655745884751157e-07,
"loss": 0.1437,
"step": 14624
},
{
"epoch": 2.61,
"grad_norm": 1.0200531482696533,
"learning_rate": 2.581995719779856e-07,
"loss": 0.1394,
"step": 14656
},
{
"epoch": 2.61,
"grad_norm": 1.1693042516708374,
"learning_rate": 2.5092282586939187e-07,
"loss": 0.151,
"step": 14688
},
{
"epoch": 2.62,
"grad_norm": 1.116024374961853,
"learning_rate": 2.437446691289616e-07,
"loss": 0.1478,
"step": 14720
},
{
"epoch": 2.62,
"grad_norm": 1.05259370803833,
"learning_rate": 2.3666541641461231e-07,
"loss": 0.1436,
"step": 14752
},
{
"epoch": 2.63,
"grad_norm": 1.0545703172683716,
"learning_rate": 2.2968537804875485e-07,
"loss": 0.1379,
"step": 14784
},
{
"epoch": 2.63,
"grad_norm": 1.0618197917938232,
"learning_rate": 2.228048600046928e-07,
"loss": 0.1409,
"step": 14816
},
{
"epoch": 2.64,
"grad_norm": 1.389092206954956,
"learning_rate": 2.1602416389320922e-07,
"loss": 0.1499,
"step": 14848
},
{
"epoch": 2.65,
"grad_norm": 1.0467108488082886,
"learning_rate": 2.0934358694934347e-07,
"loss": 0.1406,
"step": 14880
},
{
"epoch": 2.65,
"grad_norm": 1.1932706832885742,
"learning_rate": 2.0276342201936637e-07,
"loss": 0.1468,
"step": 14912
},
{
"epoch": 2.66,
"grad_norm": 1.2286850214004517,
"learning_rate": 1.9628395754793777e-07,
"loss": 0.1457,
"step": 14944
},
{
"epoch": 2.66,
"grad_norm": 0.9705607891082764,
"learning_rate": 1.899054775654663e-07,
"loss": 0.1439,
"step": 14976
},
{
"epoch": 2.67,
"grad_norm": 0.9110348224639893,
"learning_rate": 1.8362826167565796e-07,
"loss": 0.1439,
"step": 15008
},
{
"epoch": 2.67,
"grad_norm": 0.9858996272087097,
"learning_rate": 1.774525850432568e-07,
"loss": 0.1528,
"step": 15040
},
{
"epoch": 2.68,
"grad_norm": 1.1253962516784668,
"learning_rate": 1.7137871838198817e-07,
"loss": 0.1408,
"step": 15072
},
{
"epoch": 2.69,
"grad_norm": 1.5971510410308838,
"learning_rate": 1.654069279426873e-07,
"loss": 0.1497,
"step": 15104
},
{
"epoch": 2.69,
"grad_norm": 0.8475412130355835,
"learning_rate": 1.5953747550162907e-07,
"loss": 0.1456,
"step": 15136
},
{
"epoch": 2.7,
"grad_norm": 0.9866968989372253,
"learning_rate": 1.537706183490545e-07,
"loss": 0.1349,
"step": 15168
},
{
"epoch": 2.7,
"grad_norm": 1.1067461967468262,
"learning_rate": 1.481066092778913e-07,
"loss": 0.1457,
"step": 15200
},
{
"epoch": 2.71,
"grad_norm": 1.1080329418182373,
"learning_rate": 1.4254569657267235e-07,
"loss": 0.146,
"step": 15232
},
{
"epoch": 2.71,
"grad_norm": 0.992157518863678,
"learning_rate": 1.370881239986524e-07,
"loss": 0.1439,
"step": 15264
},
{
"epoch": 2.72,
"grad_norm": 1.032788872718811,
"learning_rate": 1.3173413079112128e-07,
"loss": 0.1369,
"step": 15296
},
{
"epoch": 2.73,
"grad_norm": 0.9706469774246216,
"learning_rate": 1.264839516449204e-07,
"loss": 0.136,
"step": 15328
},
{
"epoch": 2.73,
"grad_norm": 1.1187324523925781,
"learning_rate": 1.2133781670415013e-07,
"loss": 0.1359,
"step": 15360
},
{
"epoch": 2.74,
"grad_norm": 1.1595239639282227,
"learning_rate": 1.1629595155208424e-07,
"loss": 0.1401,
"step": 15392
},
{
"epoch": 2.74,
"grad_norm": 1.087785243988037,
"learning_rate": 1.1135857720128151e-07,
"loss": 0.1358,
"step": 15424
},
{
"epoch": 2.75,
"grad_norm": 1.0765306949615479,
"learning_rate": 1.0652591008389557e-07,
"loss": 0.1438,
"step": 15456
},
{
"epoch": 2.75,
"grad_norm": 1.1016918420791626,
"learning_rate": 1.0179816204218928e-07,
"loss": 0.1373,
"step": 15488
},
{
"epoch": 2.76,
"grad_norm": 1.0536975860595703,
"learning_rate": 9.717554031924842e-08,
"loss": 0.1349,
"step": 15520
},
{
"epoch": 2.77,
"grad_norm": 0.8933613300323486,
"learning_rate": 9.265824754989467e-08,
"loss": 0.1316,
"step": 15552
},
{
"epoch": 2.77,
"grad_norm": 0.9983497858047485,
"learning_rate": 8.824648175180722e-08,
"loss": 0.1346,
"step": 15584
},
{
"epoch": 2.78,
"grad_norm": 1.0600674152374268,
"learning_rate": 8.394043631683862e-08,
"loss": 0.1533,
"step": 15616
},
{
"epoch": 2.78,
"grad_norm": 1.0515772104263306,
"learning_rate": 7.974030000253986e-08,
"loss": 0.139,
"step": 15648
},
{
"epoch": 2.79,
"grad_norm": 1.4163565635681152,
"learning_rate": 7.564625692388499e-08,
"loss": 0.1323,
"step": 15680
},
{
"epoch": 2.79,
"grad_norm": 1.0619480609893799,
"learning_rate": 7.165848654519969e-08,
"loss": 0.1373,
"step": 15712
},
{
"epoch": 2.8,
"grad_norm": 0.9783567786216736,
"learning_rate": 6.777716367229764e-08,
"loss": 0.1525,
"step": 15744
},
{
"epoch": 2.81,
"grad_norm": 1.0433095693588257,
"learning_rate": 6.400245844481262e-08,
"loss": 0.1409,
"step": 15776
},
{
"epoch": 2.81,
"grad_norm": 1.258354663848877,
"learning_rate": 6.033453632874498e-08,
"loss": 0.1402,
"step": 15808
},
{
"epoch": 2.82,
"grad_norm": 1.1823972463607788,
"learning_rate": 5.677355810920604e-08,
"loss": 0.1418,
"step": 15840
},
{
"epoch": 2.82,
"grad_norm": 1.2745051383972168,
"learning_rate": 5.3319679883370724e-08,
"loss": 0.1471,
"step": 15872
},
{
"epoch": 2.83,
"grad_norm": 1.238215684890747,
"learning_rate": 4.9973053053634365e-08,
"loss": 0.1426,
"step": 15904
},
{
"epoch": 2.83,
"grad_norm": 1.0394669771194458,
"learning_rate": 4.6733824320976674e-08,
"loss": 0.1335,
"step": 15936
},
{
"epoch": 2.84,
"grad_norm": 1.2872110605239868,
"learning_rate": 4.360213567853072e-08,
"loss": 0.1544,
"step": 15968
},
{
"epoch": 2.84,
"grad_norm": 1.2786180973052979,
"learning_rate": 4.057812440535797e-08,
"loss": 0.1461,
"step": 16000
},
{
"epoch": 2.85,
"grad_norm": 1.169412612915039,
"learning_rate": 3.766192306043165e-08,
"loss": 0.1413,
"step": 16032
},
{
"epoch": 2.86,
"grad_norm": 1.2436180114746094,
"learning_rate": 3.485365947682562e-08,
"loss": 0.1357,
"step": 16064
},
{
"epoch": 2.86,
"grad_norm": 1.1065387725830078,
"learning_rate": 3.215345675611076e-08,
"loss": 0.1472,
"step": 16096
},
{
"epoch": 2.87,
"grad_norm": 1.00310218334198,
"learning_rate": 2.9561433262957072e-08,
"loss": 0.1499,
"step": 16128
},
{
"epoch": 2.87,
"grad_norm": 0.9328859448432922,
"learning_rate": 2.7077702619948963e-08,
"loss": 0.1376,
"step": 16160
},
{
"epoch": 2.88,
"grad_norm": 1.122558832168579,
"learning_rate": 2.4702373702600868e-08,
"loss": 0.1461,
"step": 16192
},
{
"epoch": 2.88,
"grad_norm": 1.1374387741088867,
"learning_rate": 2.2435550634585522e-08,
"loss": 0.1427,
"step": 16224
},
{
"epoch": 2.89,
"grad_norm": 1.102001428604126,
"learning_rate": 2.027733278317151e-08,
"loss": 0.1402,
"step": 16256
},
{
"epoch": 2.9,
"grad_norm": 1.3255332708358765,
"learning_rate": 1.822781475486507e-08,
"loss": 0.1427,
"step": 16288
},
{
"epoch": 2.9,
"grad_norm": 1.1129345893859863,
"learning_rate": 1.628708639126425e-08,
"loss": 0.1443,
"step": 16320
},
{
"epoch": 2.91,
"grad_norm": 0.9628246426582336,
"learning_rate": 1.4455232765120397e-08,
"loss": 0.1425,
"step": 16352
},
{
"epoch": 2.91,
"grad_norm": 1.2334058284759521,
"learning_rate": 1.273233417660863e-08,
"loss": 0.134,
"step": 16384
},
{
"epoch": 2.92,
"grad_norm": 1.1855486631393433,
"learning_rate": 1.1118466149808994e-08,
"loss": 0.1403,
"step": 16416
},
{
"epoch": 2.92,
"grad_norm": 1.0412129163742065,
"learning_rate": 9.61369942939383e-09,
"loss": 0.1369,
"step": 16448
},
{
"epoch": 2.93,
"grad_norm": 1.2178360223770142,
"learning_rate": 8.218099977528871e-09,
"loss": 0.1346,
"step": 16480
},
{
"epoch": 2.94,
"grad_norm": 1.1358833312988281,
"learning_rate": 6.9317289709799896e-09,
"loss": 0.1504,
"step": 16512
},
{
"epoch": 2.94,
"grad_norm": 1.1772416830062866,
"learning_rate": 5.754642798432297e-09,
"loss": 0.144,
"step": 16544
},
{
"epoch": 2.95,
"grad_norm": 1.4765156507492065,
"learning_rate": 4.686893058018227e-09,
"loss": 0.1531,
"step": 16576
},
{
"epoch": 2.95,
"grad_norm": 1.0203588008880615,
"learning_rate": 3.728526555056289e-09,
"loss": 0.1439,
"step": 16608
},
{
"epoch": 2.96,
"grad_norm": 1.053261637687683,
"learning_rate": 2.879585299997434e-09,
"loss": 0.1438,
"step": 16640
},
{
"epoch": 2.96,
"grad_norm": 1.2388105392456055,
"learning_rate": 2.1401065065859704e-09,
"loss": 0.145,
"step": 16672
},
{
"epoch": 2.97,
"grad_norm": 0.9954524040222168,
"learning_rate": 1.5101225902267036e-09,
"loss": 0.147,
"step": 16704
},
{
"epoch": 2.98,
"grad_norm": 1.2384732961654663,
"learning_rate": 9.89661166564404e-10,
"loss": 0.1492,
"step": 16736
},
{
"epoch": 2.98,
"grad_norm": 0.8787427544593811,
"learning_rate": 5.787450502728331e-10,
"loss": 0.1299,
"step": 16768
},
{
"epoch": 2.99,
"grad_norm": 0.9824705719947815,
"learning_rate": 2.7739225405609694e-10,
"loss": 0.1428,
"step": 16800
},
{
"epoch": 2.99,
"grad_norm": 1.0306531190872192,
"learning_rate": 8.561598785705727e-11,
"loss": 0.1434,
"step": 16832
},
{
"epoch": 3.0,
"grad_norm": 1.0343343019485474,
"learning_rate": 3.424658279460591e-12,
"loss": 0.1521,
"step": 16864
}
],
"logging_steps": 32,
"max_steps": 16872,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5624,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}