{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 5624, "global_step": 16872, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 64.30284881591797, "learning_rate": 2.9620853080568726e-09, "loss": 3.7905, "step": 1 }, { "epoch": 0.01, "grad_norm": 74.81798553466797, "learning_rate": 9.478672985781992e-08, "loss": 4.4137, "step": 32 }, { "epoch": 0.01, "grad_norm": 101.4776840209961, "learning_rate": 1.8957345971563984e-07, "loss": 4.2954, "step": 64 }, { "epoch": 0.02, "grad_norm": 68.84349822998047, "learning_rate": 2.843601895734597e-07, "loss": 3.376, "step": 96 }, { "epoch": 0.02, "grad_norm": 42.47146224975586, "learning_rate": 3.791469194312797e-07, "loss": 1.962, "step": 128 }, { "epoch": 0.03, "grad_norm": 46.63043212890625, "learning_rate": 4.7393364928909956e-07, "loss": 1.168, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.709139347076416, "learning_rate": 5.687203791469194e-07, "loss": 0.4681, "step": 192 }, { "epoch": 0.04, "grad_norm": 3.304001808166504, "learning_rate": 6.635071090047394e-07, "loss": 0.4059, "step": 224 }, { "epoch": 0.05, "grad_norm": 2.5251598358154297, "learning_rate": 7.582938388625594e-07, "loss": 0.3999, "step": 256 }, { "epoch": 0.05, "grad_norm": 1.9685628414154053, "learning_rate": 8.530805687203792e-07, "loss": 0.3904, "step": 288 }, { "epoch": 0.06, "grad_norm": 1.7696113586425781, "learning_rate": 9.478672985781991e-07, "loss": 0.3769, "step": 320 }, { "epoch": 0.06, "grad_norm": 1.8694322109222412, "learning_rate": 1.042654028436019e-06, "loss": 0.3718, "step": 352 }, { "epoch": 0.07, "grad_norm": 1.7985926866531372, "learning_rate": 1.1374407582938388e-06, "loss": 0.3569, "step": 384 }, { "epoch": 0.07, "grad_norm": 1.9462366104125977, "learning_rate": 1.2322274881516587e-06, "loss": 0.3432, "step": 416 }, { "epoch": 0.08, "grad_norm": 1.916548490524292, "learning_rate": 1.3270142180094788e-06, "loss": 0.331, "step": 448 }, { "epoch": 0.09, "grad_norm": 1.2683026790618896, "learning_rate": 1.4218009478672987e-06, "loss": 0.334, "step": 480 }, { "epoch": 0.09, "grad_norm": 1.4642925262451172, "learning_rate": 1.5165876777251187e-06, "loss": 0.3386, "step": 512 }, { "epoch": 0.1, "grad_norm": 1.3890858888626099, "learning_rate": 1.6113744075829384e-06, "loss": 0.3073, "step": 544 }, { "epoch": 0.1, "grad_norm": 1.4565709829330444, "learning_rate": 1.7061611374407585e-06, "loss": 0.3263, "step": 576 }, { "epoch": 0.11, "grad_norm": 1.349706768989563, "learning_rate": 1.8009478672985784e-06, "loss": 0.3165, "step": 608 }, { "epoch": 0.11, "grad_norm": 1.7937098741531372, "learning_rate": 1.8957345971563982e-06, "loss": 0.3052, "step": 640 }, { "epoch": 0.12, "grad_norm": 1.616604208946228, "learning_rate": 1.990521327014218e-06, "loss": 0.3138, "step": 672 }, { "epoch": 0.13, "grad_norm": 1.3185547590255737, "learning_rate": 2.085308056872038e-06, "loss": 0.297, "step": 704 }, { "epoch": 0.13, "grad_norm": 1.8201861381530762, "learning_rate": 2.180094786729858e-06, "loss": 0.3072, "step": 736 }, { "epoch": 0.14, "grad_norm": 1.1981289386749268, "learning_rate": 2.2748815165876777e-06, "loss": 0.2957, "step": 768 }, { "epoch": 0.14, "grad_norm": 1.7006583213806152, "learning_rate": 2.369668246445498e-06, "loss": 0.2941, "step": 800 }, { "epoch": 0.15, "grad_norm": 2.1674447059631348, "learning_rate": 2.4644549763033174e-06, "loss": 0.2798, "step": 832 }, { "epoch": 0.15, "grad_norm": 1.3364813327789307, "learning_rate": 2.5592417061611373e-06, "loss": 0.2996, "step": 864 }, { "epoch": 0.16, "grad_norm": 1.2685174942016602, "learning_rate": 2.6540284360189576e-06, "loss": 0.3027, "step": 896 }, { "epoch": 0.17, "grad_norm": 1.3952556848526, "learning_rate": 2.7488151658767775e-06, "loss": 0.2985, "step": 928 }, { "epoch": 0.17, "grad_norm": 1.251714825630188, "learning_rate": 2.8436018957345973e-06, "loss": 0.2905, "step": 960 }, { "epoch": 0.18, "grad_norm": 1.553152322769165, "learning_rate": 2.938388625592417e-06, "loss": 0.278, "step": 992 }, { "epoch": 0.18, "grad_norm": 1.380920171737671, "learning_rate": 3.0331753554502375e-06, "loss": 0.2813, "step": 1024 }, { "epoch": 0.19, "grad_norm": 1.5351643562316895, "learning_rate": 3.1279620853080574e-06, "loss": 0.2805, "step": 1056 }, { "epoch": 0.19, "grad_norm": 1.4867887496948242, "learning_rate": 3.222748815165877e-06, "loss": 0.2767, "step": 1088 }, { "epoch": 0.2, "grad_norm": 1.317229986190796, "learning_rate": 3.3175355450236967e-06, "loss": 0.2859, "step": 1120 }, { "epoch": 0.2, "grad_norm": 1.8770791292190552, "learning_rate": 3.412322274881517e-06, "loss": 0.2875, "step": 1152 }, { "epoch": 0.21, "grad_norm": 1.4476697444915771, "learning_rate": 3.507109004739337e-06, "loss": 0.2884, "step": 1184 }, { "epoch": 0.22, "grad_norm": 1.351965069770813, "learning_rate": 3.6018957345971567e-06, "loss": 0.2802, "step": 1216 }, { "epoch": 0.22, "grad_norm": 1.4647209644317627, "learning_rate": 3.6966824644549766e-06, "loss": 0.2703, "step": 1248 }, { "epoch": 0.23, "grad_norm": 1.901773452758789, "learning_rate": 3.7914691943127964e-06, "loss": 0.2815, "step": 1280 }, { "epoch": 0.23, "grad_norm": 1.139844536781311, "learning_rate": 3.886255924170616e-06, "loss": 0.2658, "step": 1312 }, { "epoch": 0.24, "grad_norm": 1.1863261461257935, "learning_rate": 3.981042654028436e-06, "loss": 0.2707, "step": 1344 }, { "epoch": 0.24, "grad_norm": 1.2720916271209717, "learning_rate": 4.075829383886256e-06, "loss": 0.2646, "step": 1376 }, { "epoch": 0.25, "grad_norm": 1.6161096096038818, "learning_rate": 4.170616113744076e-06, "loss": 0.2748, "step": 1408 }, { "epoch": 0.26, "grad_norm": 1.4303381443023682, "learning_rate": 4.265402843601897e-06, "loss": 0.2691, "step": 1440 }, { "epoch": 0.26, "grad_norm": 1.401880145072937, "learning_rate": 4.360189573459716e-06, "loss": 0.2699, "step": 1472 }, { "epoch": 0.27, "grad_norm": 1.3495467901229858, "learning_rate": 4.4549763033175355e-06, "loss": 0.2772, "step": 1504 }, { "epoch": 0.27, "grad_norm": 1.3915464878082275, "learning_rate": 4.549763033175355e-06, "loss": 0.2752, "step": 1536 }, { "epoch": 0.28, "grad_norm": 1.3412673473358154, "learning_rate": 4.644549763033176e-06, "loss": 0.2751, "step": 1568 }, { "epoch": 0.28, "grad_norm": 1.3777296543121338, "learning_rate": 4.739336492890996e-06, "loss": 0.2717, "step": 1600 }, { "epoch": 0.29, "grad_norm": 1.2612873315811157, "learning_rate": 4.834123222748816e-06, "loss": 0.2678, "step": 1632 }, { "epoch": 0.3, "grad_norm": 1.2989717721939087, "learning_rate": 4.928909952606635e-06, "loss": 0.2778, "step": 1664 }, { "epoch": 0.3, "grad_norm": 1.3525702953338623, "learning_rate": 4.999996575341721e-06, "loss": 0.2719, "step": 1696 }, { "epoch": 0.31, "grad_norm": 1.4678899049758911, "learning_rate": 4.999914384012144e-06, "loss": 0.2755, "step": 1728 }, { "epoch": 0.31, "grad_norm": 1.2093278169631958, "learning_rate": 4.999722607745944e-06, "loss": 0.2755, "step": 1760 }, { "epoch": 0.32, "grad_norm": 1.4915423393249512, "learning_rate": 4.999421254949728e-06, "loss": 0.2686, "step": 1792 }, { "epoch": 0.32, "grad_norm": 1.1101861000061035, "learning_rate": 4.999010338833436e-06, "loss": 0.2594, "step": 1824 }, { "epoch": 0.33, "grad_norm": 1.3432806730270386, "learning_rate": 4.9984898774097735e-06, "loss": 0.2658, "step": 1856 }, { "epoch": 0.34, "grad_norm": 1.2808105945587158, "learning_rate": 4.997859893493414e-06, "loss": 0.2632, "step": 1888 }, { "epoch": 0.34, "grad_norm": 1.3815045356750488, "learning_rate": 4.997120414700003e-06, "loss": 0.2557, "step": 1920 }, { "epoch": 0.35, "grad_norm": 1.4393643140792847, "learning_rate": 4.996271473444944e-06, "loss": 0.263, "step": 1952 }, { "epoch": 0.35, "grad_norm": 1.138375163078308, "learning_rate": 4.995313106941982e-06, "loss": 0.2805, "step": 1984 }, { "epoch": 0.36, "grad_norm": 1.6412934064865112, "learning_rate": 4.994245357201568e-06, "loss": 0.2641, "step": 2016 }, { "epoch": 0.36, "grad_norm": 1.465922236442566, "learning_rate": 4.9930682710290205e-06, "loss": 0.2637, "step": 2048 }, { "epoch": 0.37, "grad_norm": 1.4526797533035278, "learning_rate": 4.991781900022471e-06, "loss": 0.2596, "step": 2080 }, { "epoch": 0.38, "grad_norm": 1.504759669303894, "learning_rate": 4.990386300570607e-06, "loss": 0.2633, "step": 2112 }, { "epoch": 0.38, "grad_norm": 1.5599263906478882, "learning_rate": 4.988881533850192e-06, "loss": 0.2658, "step": 2144 }, { "epoch": 0.39, "grad_norm": 1.1662814617156982, "learning_rate": 4.987267665823392e-06, "loss": 0.2694, "step": 2176 }, { "epoch": 0.39, "grad_norm": 1.3952819108963013, "learning_rate": 4.98554476723488e-06, "loss": 0.2449, "step": 2208 }, { "epoch": 0.4, "grad_norm": 1.2887946367263794, "learning_rate": 4.983712913608736e-06, "loss": 0.2651, "step": 2240 }, { "epoch": 0.4, "grad_norm": 1.5893690586090088, "learning_rate": 4.981772185245135e-06, "loss": 0.2568, "step": 2272 }, { "epoch": 0.41, "grad_norm": 1.228550672531128, "learning_rate": 4.979722667216829e-06, "loss": 0.2667, "step": 2304 }, { "epoch": 0.42, "grad_norm": 1.2756662368774414, "learning_rate": 4.977564449365415e-06, "loss": 0.2508, "step": 2336 }, { "epoch": 0.42, "grad_norm": 1.5225822925567627, "learning_rate": 4.975297626297399e-06, "loss": 0.2691, "step": 2368 }, { "epoch": 0.43, "grad_norm": 1.2656946182250977, "learning_rate": 4.972922297380052e-06, "loss": 0.2704, "step": 2400 }, { "epoch": 0.43, "grad_norm": 1.3268104791641235, "learning_rate": 4.970438566737043e-06, "loss": 0.2577, "step": 2432 }, { "epoch": 0.44, "grad_norm": 1.5536099672317505, "learning_rate": 4.96784654324389e-06, "loss": 0.2578, "step": 2464 }, { "epoch": 0.44, "grad_norm": 1.1516194343566895, "learning_rate": 4.965146340523175e-06, "loss": 0.2446, "step": 2496 }, { "epoch": 0.45, "grad_norm": 1.1923089027404785, "learning_rate": 4.962338076939569e-06, "loss": 0.2569, "step": 2528 }, { "epoch": 0.46, "grad_norm": 1.124197006225586, "learning_rate": 4.959421875594643e-06, "loss": 0.2625, "step": 2560 }, { "epoch": 0.46, "grad_norm": 1.680388331413269, "learning_rate": 4.95639786432147e-06, "loss": 0.264, "step": 2592 }, { "epoch": 0.47, "grad_norm": 1.3039462566375732, "learning_rate": 4.953266175679023e-06, "loss": 0.2624, "step": 2624 }, { "epoch": 0.47, "grad_norm": 1.109054684638977, "learning_rate": 4.9500269469463655e-06, "loss": 0.2548, "step": 2656 }, { "epoch": 0.48, "grad_norm": 1.2704750299453735, "learning_rate": 4.94668032011663e-06, "loss": 0.2569, "step": 2688 }, { "epoch": 0.48, "grad_norm": 1.1952179670333862, "learning_rate": 4.943226441890794e-06, "loss": 0.2599, "step": 2720 }, { "epoch": 0.49, "grad_norm": 1.2229312658309937, "learning_rate": 4.939665463671255e-06, "loss": 0.2577, "step": 2752 }, { "epoch": 0.5, "grad_norm": 1.3956924676895142, "learning_rate": 4.935997541555188e-06, "loss": 0.2642, "step": 2784 }, { "epoch": 0.5, "grad_norm": 1.116629958152771, "learning_rate": 4.932222836327703e-06, "loss": 0.2587, "step": 2816 }, { "epoch": 0.51, "grad_norm": 1.1389435529708862, "learning_rate": 4.928341513454801e-06, "loss": 0.2566, "step": 2848 }, { "epoch": 0.51, "grad_norm": 1.3800580501556396, "learning_rate": 4.9243537430761155e-06, "loss": 0.2579, "step": 2880 }, { "epoch": 0.52, "grad_norm": 1.3852914571762085, "learning_rate": 4.920259699997461e-06, "loss": 0.2666, "step": 2912 }, { "epoch": 0.52, "grad_norm": 1.31257963180542, "learning_rate": 4.916059563683162e-06, "loss": 0.2547, "step": 2944 }, { "epoch": 0.53, "grad_norm": 1.599116563796997, "learning_rate": 4.911753518248194e-06, "loss": 0.2612, "step": 2976 }, { "epoch": 0.53, "grad_norm": 1.2397140264511108, "learning_rate": 4.907341752450105e-06, "loss": 0.2589, "step": 3008 }, { "epoch": 0.54, "grad_norm": 1.3178327083587646, "learning_rate": 4.9028244596807525e-06, "loss": 0.2605, "step": 3040 }, { "epoch": 0.55, "grad_norm": 1.7413417100906372, "learning_rate": 4.898201837957811e-06, "loss": 0.2565, "step": 3072 }, { "epoch": 0.55, "grad_norm": 1.314085602760315, "learning_rate": 4.893474089916105e-06, "loss": 0.2498, "step": 3104 }, { "epoch": 0.56, "grad_norm": 1.1399492025375366, "learning_rate": 4.888641422798719e-06, "loss": 0.2647, "step": 3136 }, { "epoch": 0.56, "grad_norm": 1.3332985639572144, "learning_rate": 4.883704048447916e-06, "loss": 0.2594, "step": 3168 }, { "epoch": 0.57, "grad_norm": 1.3460063934326172, "learning_rate": 4.87866218329585e-06, "loss": 0.2571, "step": 3200 }, { "epoch": 0.57, "grad_norm": 1.5006327629089355, "learning_rate": 4.87351604835508e-06, "loss": 0.2458, "step": 3232 }, { "epoch": 0.58, "grad_norm": 1.1781283617019653, "learning_rate": 4.868265869208879e-06, "loss": 0.2452, "step": 3264 }, { "epoch": 0.59, "grad_norm": 1.117686152458191, "learning_rate": 4.862911876001348e-06, "loss": 0.2469, "step": 3296 }, { "epoch": 0.59, "grad_norm": 0.9969549775123596, "learning_rate": 4.857454303427328e-06, "loss": 0.2453, "step": 3328 }, { "epoch": 0.6, "grad_norm": 1.4894945621490479, "learning_rate": 4.851893390722109e-06, "loss": 0.2457, "step": 3360 }, { "epoch": 0.6, "grad_norm": 1.106041431427002, "learning_rate": 4.846229381650946e-06, "loss": 0.2474, "step": 3392 }, { "epoch": 0.61, "grad_norm": 1.035601019859314, "learning_rate": 4.840462524498372e-06, "loss": 0.2593, "step": 3424 }, { "epoch": 0.61, "grad_norm": 1.7077690362930298, "learning_rate": 4.834593072057313e-06, "loss": 0.2506, "step": 3456 }, { "epoch": 0.62, "grad_norm": 1.1017436981201172, "learning_rate": 4.8286212816180124e-06, "loss": 0.2506, "step": 3488 }, { "epoch": 0.63, "grad_norm": 1.2720685005187988, "learning_rate": 4.8225474149567434e-06, "loss": 0.2567, "step": 3520 }, { "epoch": 0.63, "grad_norm": 1.328189730644226, "learning_rate": 4.816371738324343e-06, "loss": 0.2531, "step": 3552 }, { "epoch": 0.64, "grad_norm": 1.2597825527191162, "learning_rate": 4.810094522434534e-06, "loss": 0.246, "step": 3584 }, { "epoch": 0.64, "grad_norm": 1.244281530380249, "learning_rate": 4.803716042452063e-06, "loss": 0.2433, "step": 3616 }, { "epoch": 0.65, "grad_norm": 1.4658986330032349, "learning_rate": 4.797236577980634e-06, "loss": 0.2496, "step": 3648 }, { "epoch": 0.65, "grad_norm": 1.4121670722961426, "learning_rate": 4.7906564130506575e-06, "loss": 0.2531, "step": 3680 }, { "epoch": 0.66, "grad_norm": 1.1751240491867065, "learning_rate": 4.783975836106791e-06, "loss": 0.2515, "step": 3712 }, { "epoch": 0.67, "grad_norm": 1.2011898756027222, "learning_rate": 4.777195139995308e-06, "loss": 0.2453, "step": 3744 }, { "epoch": 0.67, "grad_norm": 1.5764689445495605, "learning_rate": 4.770314621951245e-06, "loss": 0.2496, "step": 3776 }, { "epoch": 0.68, "grad_norm": 1.4584077596664429, "learning_rate": 4.763334583585388e-06, "loss": 0.2392, "step": 3808 }, { "epoch": 0.68, "grad_norm": 1.0098185539245605, "learning_rate": 4.756255330871039e-06, "loss": 0.2393, "step": 3840 }, { "epoch": 0.69, "grad_norm": 1.3514459133148193, "learning_rate": 4.749077174130609e-06, "loss": 0.2572, "step": 3872 }, { "epoch": 0.69, "grad_norm": 1.3888107538223267, "learning_rate": 4.741800428022014e-06, "loss": 0.2383, "step": 3904 }, { "epoch": 0.7, "grad_norm": 1.3402737379074097, "learning_rate": 4.734425411524884e-06, "loss": 0.2556, "step": 3936 }, { "epoch": 0.71, "grad_norm": 1.2175307273864746, "learning_rate": 4.726952447926576e-06, "loss": 0.2555, "step": 3968 }, { "epoch": 0.71, "grad_norm": 1.386852502822876, "learning_rate": 4.719381864808005e-06, "loss": 0.2503, "step": 4000 }, { "epoch": 0.72, "grad_norm": 1.2774380445480347, "learning_rate": 4.711713994029284e-06, "loss": 0.2503, "step": 4032 }, { "epoch": 0.72, "grad_norm": 1.177322268486023, "learning_rate": 4.703949171715179e-06, "loss": 0.2574, "step": 4064 }, { "epoch": 0.73, "grad_norm": 1.269942283630371, "learning_rate": 4.69608773824037e-06, "loss": 0.2529, "step": 4096 }, { "epoch": 0.73, "grad_norm": 1.2209409475326538, "learning_rate": 4.688130038214534e-06, "loss": 0.2536, "step": 4128 }, { "epoch": 0.74, "grad_norm": 1.4368942975997925, "learning_rate": 4.6800764204672385e-06, "loss": 0.2378, "step": 4160 }, { "epoch": 0.75, "grad_norm": 1.6493048667907715, "learning_rate": 4.671927238032651e-06, "loss": 0.2538, "step": 4192 }, { "epoch": 0.75, "grad_norm": 1.038549542427063, "learning_rate": 4.6636828481340594e-06, "loss": 0.2501, "step": 4224 }, { "epoch": 0.76, "grad_norm": 1.343204379081726, "learning_rate": 4.655343612168219e-06, "loss": 0.251, "step": 4256 }, { "epoch": 0.76, "grad_norm": 1.4020464420318604, "learning_rate": 4.646909895689508e-06, "loss": 0.2564, "step": 4288 }, { "epoch": 0.77, "grad_norm": 1.1331307888031006, "learning_rate": 4.638382068393899e-06, "loss": 0.2505, "step": 4320 }, { "epoch": 0.77, "grad_norm": 1.3825620412826538, "learning_rate": 4.629760504102761e-06, "loss": 0.2513, "step": 4352 }, { "epoch": 0.78, "grad_norm": 1.310570478439331, "learning_rate": 4.621045580746467e-06, "loss": 0.2464, "step": 4384 }, { "epoch": 0.79, "grad_norm": 1.15547776222229, "learning_rate": 4.61223768034783e-06, "loss": 0.2515, "step": 4416 }, { "epoch": 0.79, "grad_norm": 1.340010404586792, "learning_rate": 4.603337189005354e-06, "loss": 0.2473, "step": 4448 }, { "epoch": 0.8, "grad_norm": 1.2413158416748047, "learning_rate": 4.594344496876313e-06, "loss": 0.2354, "step": 4480 }, { "epoch": 0.8, "grad_norm": 1.2394189834594727, "learning_rate": 4.585259998159646e-06, "loss": 0.2512, "step": 4512 }, { "epoch": 0.81, "grad_norm": 1.2866027355194092, "learning_rate": 4.576084091078677e-06, "loss": 0.2364, "step": 4544 }, { "epoch": 0.81, "grad_norm": 1.1080009937286377, "learning_rate": 4.5668171778636585e-06, "loss": 0.2432, "step": 4576 }, { "epoch": 0.82, "grad_norm": 1.2469310760498047, "learning_rate": 4.5574596647341414e-06, "loss": 0.256, "step": 4608 }, { "epoch": 0.83, "grad_norm": 1.0387507677078247, "learning_rate": 4.548011961881167e-06, "loss": 0.232, "step": 4640 }, { "epoch": 0.83, "grad_norm": 1.2382770776748657, "learning_rate": 4.538474483449286e-06, "loss": 0.2552, "step": 4672 }, { "epoch": 0.84, "grad_norm": 1.2282336950302124, "learning_rate": 4.528847647518403e-06, "loss": 0.2525, "step": 4704 }, { "epoch": 0.84, "grad_norm": 1.4016482830047607, "learning_rate": 4.5191318760854526e-06, "loss": 0.2582, "step": 4736 }, { "epoch": 0.85, "grad_norm": 1.3214083909988403, "learning_rate": 4.509327595045898e-06, "loss": 0.2578, "step": 4768 }, { "epoch": 0.85, "grad_norm": 0.9114232063293457, "learning_rate": 4.499435234175065e-06, "loss": 0.2533, "step": 4800 }, { "epoch": 0.86, "grad_norm": 1.172450065612793, "learning_rate": 4.4894552271093e-06, "loss": 0.264, "step": 4832 }, { "epoch": 0.86, "grad_norm": 1.249770998954773, "learning_rate": 4.4793880113269595e-06, "loss": 0.2389, "step": 4864 }, { "epoch": 0.87, "grad_norm": 1.0912755727767944, "learning_rate": 4.469234028129241e-06, "loss": 0.2456, "step": 4896 }, { "epoch": 0.88, "grad_norm": 1.1503956317901611, "learning_rate": 4.458993722620827e-06, "loss": 0.2562, "step": 4928 }, { "epoch": 0.88, "grad_norm": 1.1564654111862183, "learning_rate": 4.448667543690384e-06, "loss": 0.25, "step": 4960 }, { "epoch": 0.89, "grad_norm": 1.271000862121582, "learning_rate": 4.438255943990879e-06, "loss": 0.243, "step": 4992 }, { "epoch": 0.89, "grad_norm": 1.0601048469543457, "learning_rate": 4.427759379919739e-06, "loss": 0.2397, "step": 5024 }, { "epoch": 0.9, "grad_norm": 1.214858889579773, "learning_rate": 4.417178311598845e-06, "loss": 0.2442, "step": 5056 }, { "epoch": 0.9, "grad_norm": 1.0516908168792725, "learning_rate": 4.406513202854363e-06, "loss": 0.2467, "step": 5088 }, { "epoch": 0.91, "grad_norm": 1.326076865196228, "learning_rate": 4.3957645211964065e-06, "loss": 0.2488, "step": 5120 }, { "epoch": 0.92, "grad_norm": 1.173823356628418, "learning_rate": 4.384932737798554e-06, "loss": 0.241, "step": 5152 }, { "epoch": 0.92, "grad_norm": 1.4526327848434448, "learning_rate": 4.3740183274771845e-06, "loss": 0.2553, "step": 5184 }, { "epoch": 0.93, "grad_norm": 1.2346609830856323, "learning_rate": 4.363021768670668e-06, "loss": 0.242, "step": 5216 }, { "epoch": 0.93, "grad_norm": 0.8957495093345642, "learning_rate": 4.351943543418392e-06, "loss": 0.2444, "step": 5248 }, { "epoch": 0.94, "grad_norm": 1.097772479057312, "learning_rate": 4.340784137339632e-06, "loss": 0.2531, "step": 5280 }, { "epoch": 0.94, "grad_norm": 1.1537779569625854, "learning_rate": 4.329544039612264e-06, "loss": 0.2507, "step": 5312 }, { "epoch": 0.95, "grad_norm": 1.1922253370285034, "learning_rate": 4.318223742951321e-06, "loss": 0.2335, "step": 5344 }, { "epoch": 0.96, "grad_norm": 1.1036819219589233, "learning_rate": 4.306823743587394e-06, "loss": 0.2465, "step": 5376 }, { "epoch": 0.96, "grad_norm": 1.229779839515686, "learning_rate": 4.295344541244879e-06, "loss": 0.2403, "step": 5408 }, { "epoch": 0.97, "grad_norm": 1.4036519527435303, "learning_rate": 4.283786639120074e-06, "loss": 0.254, "step": 5440 }, { "epoch": 0.97, "grad_norm": 0.9732062816619873, "learning_rate": 4.272150543859117e-06, "loss": 0.2517, "step": 5472 }, { "epoch": 0.98, "grad_norm": 1.3309801816940308, "learning_rate": 4.260436765535784e-06, "loss": 0.25, "step": 5504 }, { "epoch": 0.98, "grad_norm": 1.3353493213653564, "learning_rate": 4.2486458176291176e-06, "loss": 0.2482, "step": 5536 }, { "epoch": 0.99, "grad_norm": 1.6585358381271362, "learning_rate": 4.236778217000934e-06, "loss": 0.248, "step": 5568 }, { "epoch": 1.0, "grad_norm": 0.9717461466789246, "learning_rate": 4.224834483873152e-06, "loss": 0.2366, "step": 5600 }, { "epoch": 1.0, "grad_norm": 0.9571962356567383, "learning_rate": 4.2128151418049976e-06, "loss": 0.2404, "step": 5632 }, { "epoch": 1.01, "grad_norm": 1.0692377090454102, "learning_rate": 4.200720717670048e-06, "loss": 0.2135, "step": 5664 }, { "epoch": 1.01, "grad_norm": 1.1159001588821411, "learning_rate": 4.188551741633144e-06, "loss": 0.1854, "step": 5696 }, { "epoch": 1.02, "grad_norm": 1.4514949321746826, "learning_rate": 4.176308747127136e-06, "loss": 0.2095, "step": 5728 }, { "epoch": 1.02, "grad_norm": 1.4603676795959473, "learning_rate": 4.1639922708295176e-06, "loss": 0.2015, "step": 5760 }, { "epoch": 1.03, "grad_norm": 1.1802875995635986, "learning_rate": 4.151602852638888e-06, "loss": 0.222, "step": 5792 }, { "epoch": 1.04, "grad_norm": 1.2036052942276, "learning_rate": 4.139141035651288e-06, "loss": 0.2093, "step": 5824 }, { "epoch": 1.04, "grad_norm": 1.1690653562545776, "learning_rate": 4.126607366136395e-06, "loss": 0.1925, "step": 5856 }, { "epoch": 1.05, "grad_norm": 0.9996016621589661, "learning_rate": 4.114002393513577e-06, "loss": 0.206, "step": 5888 }, { "epoch": 1.05, "grad_norm": 1.1670773029327393, "learning_rate": 4.101326670327807e-06, "loss": 0.2097, "step": 5920 }, { "epoch": 1.06, "grad_norm": 0.8733654022216797, "learning_rate": 4.0885807522254435e-06, "loss": 0.2015, "step": 5952 }, { "epoch": 1.06, "grad_norm": 1.2280749082565308, "learning_rate": 4.075765197929872e-06, "loss": 0.2108, "step": 5984 }, { "epoch": 1.07, "grad_norm": 1.1926356554031372, "learning_rate": 4.0628805692170105e-06, "loss": 0.2047, "step": 6016 }, { "epoch": 1.08, "grad_norm": 1.0048396587371826, "learning_rate": 4.049927430890693e-06, "loss": 0.2077, "step": 6048 }, { "epoch": 1.08, "grad_norm": 1.026442050933838, "learning_rate": 4.0369063507578995e-06, "loss": 0.2051, "step": 6080 }, { "epoch": 1.09, "grad_norm": 1.1310842037200928, "learning_rate": 4.023817899603875e-06, "loss": 0.2055, "step": 6112 }, { "epoch": 1.09, "grad_norm": 1.1275712251663208, "learning_rate": 4.010662651167106e-06, "loss": 0.1965, "step": 6144 }, { "epoch": 1.1, "grad_norm": 1.1789113283157349, "learning_rate": 3.997441182114164e-06, "loss": 0.2118, "step": 6176 }, { "epoch": 1.1, "grad_norm": 1.3836599588394165, "learning_rate": 3.984154072014438e-06, "loss": 0.2056, "step": 6208 }, { "epoch": 1.11, "grad_norm": 1.1013050079345703, "learning_rate": 3.970801903314722e-06, "loss": 0.2109, "step": 6240 }, { "epoch": 1.12, "grad_norm": 1.1018249988555908, "learning_rate": 3.957385261313685e-06, "loss": 0.202, "step": 6272 }, { "epoch": 1.12, "grad_norm": 1.3906185626983643, "learning_rate": 3.943904734136213e-06, "loss": 0.2065, "step": 6304 }, { "epoch": 1.13, "grad_norm": 1.2197610139846802, "learning_rate": 3.930360912707632e-06, "loss": 0.2096, "step": 6336 }, { "epoch": 1.13, "grad_norm": 1.0342845916748047, "learning_rate": 3.916754390727795e-06, "loss": 0.2024, "step": 6368 }, { "epoch": 1.14, "grad_norm": 1.236260175704956, "learning_rate": 3.90308576464507e-06, "loss": 0.216, "step": 6400 }, { "epoch": 1.14, "grad_norm": 1.3906086683273315, "learning_rate": 3.889355633630186e-06, "loss": 0.2153, "step": 6432 }, { "epoch": 1.15, "grad_norm": 1.2441129684448242, "learning_rate": 3.875564599549968e-06, "loss": 0.2092, "step": 6464 }, { "epoch": 1.16, "grad_norm": 1.2320338487625122, "learning_rate": 3.861713266940959e-06, "loss": 0.2038, "step": 6496 }, { "epoch": 1.16, "grad_norm": 1.6422646045684814, "learning_rate": 3.847802242982915e-06, "loss": 0.205, "step": 6528 }, { "epoch": 1.17, "grad_norm": 1.1179068088531494, "learning_rate": 3.83383213747219e-06, "loss": 0.2162, "step": 6560 }, { "epoch": 1.17, "grad_norm": 1.0986745357513428, "learning_rate": 3.8198035627950084e-06, "loss": 0.1956, "step": 6592 }, { "epoch": 1.18, "grad_norm": 1.340859055519104, "learning_rate": 3.8057171339006138e-06, "loss": 0.2093, "step": 6624 }, { "epoch": 1.18, "grad_norm": 1.7803446054458618, "learning_rate": 3.791573468274323e-06, "loss": 0.2133, "step": 6656 }, { "epoch": 1.19, "grad_norm": 1.022388219833374, "learning_rate": 3.777373185910448e-06, "loss": 0.2182, "step": 6688 }, { "epoch": 1.19, "grad_norm": 1.0795223712921143, "learning_rate": 3.7631169092851226e-06, "loss": 0.2051, "step": 6720 }, { "epoch": 1.2, "grad_norm": 1.0785856246948242, "learning_rate": 3.7488052633290174e-06, "loss": 0.2047, "step": 6752 }, { "epoch": 1.21, "grad_norm": 1.0391508340835571, "learning_rate": 3.7344388753999434e-06, "loss": 0.2081, "step": 6784 }, { "epoch": 1.21, "grad_norm": 1.3015925884246826, "learning_rate": 3.720018375255352e-06, "loss": 0.2013, "step": 6816 }, { "epoch": 1.22, "grad_norm": 1.2382066249847412, "learning_rate": 3.7055443950247276e-06, "loss": 0.2037, "step": 6848 }, { "epoch": 1.22, "grad_norm": 1.1386123895645142, "learning_rate": 3.691017569181882e-06, "loss": 0.2046, "step": 6880 }, { "epoch": 1.23, "grad_norm": 0.9857081770896912, "learning_rate": 3.6764385345171393e-06, "loss": 0.207, "step": 6912 }, { "epoch": 1.23, "grad_norm": 1.1276394128799438, "learning_rate": 3.661807930109422e-06, "loss": 0.2134, "step": 6944 }, { "epoch": 1.24, "grad_norm": 1.1821982860565186, "learning_rate": 3.647126397298234e-06, "loss": 0.2162, "step": 6976 }, { "epoch": 1.25, "grad_norm": 1.218800663948059, "learning_rate": 3.632394579655555e-06, "loss": 0.2023, "step": 7008 }, { "epoch": 1.25, "grad_norm": 1.083310842514038, "learning_rate": 3.6176131229576193e-06, "loss": 0.1999, "step": 7040 }, { "epoch": 1.26, "grad_norm": 1.0640002489089966, "learning_rate": 3.602782675156617e-06, "loss": 0.2125, "step": 7072 }, { "epoch": 1.26, "grad_norm": 1.1672149896621704, "learning_rate": 3.5879038863522843e-06, "loss": 0.2157, "step": 7104 }, { "epoch": 1.27, "grad_norm": 1.1732845306396484, "learning_rate": 3.572977408763407e-06, "loss": 0.2082, "step": 7136 }, { "epoch": 1.27, "grad_norm": 1.1544941663742065, "learning_rate": 3.5580038966992344e-06, "loss": 0.2067, "step": 7168 }, { "epoch": 1.28, "grad_norm": 1.2914546728134155, "learning_rate": 3.5429840065307924e-06, "loss": 0.2019, "step": 7200 }, { "epoch": 1.29, "grad_norm": 1.0473650693893433, "learning_rate": 3.527918396662115e-06, "loss": 0.1952, "step": 7232 }, { "epoch": 1.29, "grad_norm": 1.2211614847183228, "learning_rate": 3.512807727501379e-06, "loss": 0.2093, "step": 7264 }, { "epoch": 1.3, "grad_norm": 1.1035760641098022, "learning_rate": 3.4976526614319573e-06, "loss": 0.2007, "step": 7296 }, { "epoch": 1.3, "grad_norm": 1.2120308876037598, "learning_rate": 3.4824538627833825e-06, "loss": 0.2205, "step": 7328 }, { "epoch": 1.31, "grad_norm": 0.8647122979164124, "learning_rate": 3.4672119978022277e-06, "loss": 0.2063, "step": 7360 }, { "epoch": 1.31, "grad_norm": 1.1142189502716064, "learning_rate": 3.4519277346228953e-06, "loss": 0.2075, "step": 7392 }, { "epoch": 1.32, "grad_norm": 1.3183207511901855, "learning_rate": 3.436601743238335e-06, "loss": 0.2094, "step": 7424 }, { "epoch": 1.33, "grad_norm": 1.0320820808410645, "learning_rate": 3.421234695470673e-06, "loss": 0.2029, "step": 7456 }, { "epoch": 1.33, "grad_norm": 1.3065481185913086, "learning_rate": 3.4058272649417607e-06, "loss": 0.2127, "step": 7488 }, { "epoch": 1.34, "grad_norm": 1.209372639656067, "learning_rate": 3.3903801270436465e-06, "loss": 0.2015, "step": 7520 }, { "epoch": 1.34, "grad_norm": 1.0177247524261475, "learning_rate": 3.374893958908971e-06, "loss": 0.2075, "step": 7552 }, { "epoch": 1.35, "grad_norm": 1.4457709789276123, "learning_rate": 3.3593694393812827e-06, "loss": 0.2098, "step": 7584 }, { "epoch": 1.35, "grad_norm": 1.1711078882217407, "learning_rate": 3.3438072489852837e-06, "loss": 0.2088, "step": 7616 }, { "epoch": 1.36, "grad_norm": 1.1928409337997437, "learning_rate": 3.3282080698969953e-06, "loss": 0.1918, "step": 7648 }, { "epoch": 1.37, "grad_norm": 0.9215808510780334, "learning_rate": 3.3125725859138548e-06, "loss": 0.2106, "step": 7680 }, { "epoch": 1.37, "grad_norm": 1.3021633625030518, "learning_rate": 3.2969014824247436e-06, "loss": 0.2018, "step": 7712 }, { "epoch": 1.38, "grad_norm": 1.1597398519515991, "learning_rate": 3.28119544637994e-06, "loss": 0.2035, "step": 7744 }, { "epoch": 1.38, "grad_norm": 1.2015706300735474, "learning_rate": 3.265455166261009e-06, "loss": 0.2027, "step": 7776 }, { "epoch": 1.39, "grad_norm": 1.1449264287948608, "learning_rate": 3.2496813320506183e-06, "loss": 0.2165, "step": 7808 }, { "epoch": 1.39, "grad_norm": 1.1332265138626099, "learning_rate": 3.2338746352022965e-06, "loss": 0.2006, "step": 7840 }, { "epoch": 1.4, "grad_norm": 1.430891990661621, "learning_rate": 3.2180357686101226e-06, "loss": 0.2102, "step": 7872 }, { "epoch": 1.41, "grad_norm": 1.4063985347747803, "learning_rate": 3.2021654265783505e-06, "loss": 0.196, "step": 7904 }, { "epoch": 1.41, "grad_norm": 1.3970558643341064, "learning_rate": 3.1862643047909746e-06, "loss": 0.2161, "step": 7936 }, { "epoch": 1.42, "grad_norm": 1.3233983516693115, "learning_rate": 3.170333100281236e-06, "loss": 0.1921, "step": 7968 }, { "epoch": 1.42, "grad_norm": 1.2325806617736816, "learning_rate": 3.154372511401064e-06, "loss": 0.2042, "step": 8000 }, { "epoch": 1.43, "grad_norm": 1.0558186769485474, "learning_rate": 3.1383832377904676e-06, "loss": 0.2056, "step": 8032 }, { "epoch": 1.43, "grad_norm": 1.189503788948059, "learning_rate": 3.1223659803468653e-06, "loss": 0.203, "step": 8064 }, { "epoch": 1.44, "grad_norm": 1.1646627187728882, "learning_rate": 3.1063214411943576e-06, "loss": 0.2088, "step": 8096 }, { "epoch": 1.45, "grad_norm": 1.1149977445602417, "learning_rate": 3.0902503236529533e-06, "loss": 0.2081, "step": 8128 }, { "epoch": 1.45, "grad_norm": 1.5566740036010742, "learning_rate": 3.074153332207738e-06, "loss": 0.2141, "step": 8160 }, { "epoch": 1.46, "grad_norm": 1.304262638092041, "learning_rate": 3.058031172477992e-06, "loss": 0.2006, "step": 8192 }, { "epoch": 1.46, "grad_norm": 1.1247010231018066, "learning_rate": 3.041884551186258e-06, "loss": 0.2109, "step": 8224 }, { "epoch": 1.47, "grad_norm": 1.4587311744689941, "learning_rate": 3.0257141761273627e-06, "loss": 0.2016, "step": 8256 }, { "epoch": 1.47, "grad_norm": 1.1545838117599487, "learning_rate": 3.0095207561373935e-06, "loss": 0.183, "step": 8288 }, { "epoch": 1.48, "grad_norm": 1.3221790790557861, "learning_rate": 2.9933050010626208e-06, "loss": 0.1985, "step": 8320 }, { "epoch": 1.49, "grad_norm": 1.1717700958251953, "learning_rate": 2.9770676217283844e-06, "loss": 0.2113, "step": 8352 }, { "epoch": 1.49, "grad_norm": 1.0709022283554077, "learning_rate": 2.960809329907934e-06, "loss": 0.2012, "step": 8384 }, { "epoch": 1.5, "grad_norm": 1.4594495296478271, "learning_rate": 2.944530838291229e-06, "loss": 0.2039, "step": 8416 }, { "epoch": 1.5, "grad_norm": 1.3135212659835815, "learning_rate": 2.928232860453694e-06, "loss": 0.206, "step": 8448 }, { "epoch": 1.51, "grad_norm": 1.316394329071045, "learning_rate": 2.911916110824945e-06, "loss": 0.212, "step": 8480 }, { "epoch": 1.51, "grad_norm": 1.3602452278137207, "learning_rate": 2.895581304657465e-06, "loss": 0.2068, "step": 8512 }, { "epoch": 1.52, "grad_norm": 1.2073897123336792, "learning_rate": 2.8792291579952553e-06, "loss": 0.2098, "step": 8544 }, { "epoch": 1.52, "grad_norm": 1.2983072996139526, "learning_rate": 2.8628603876424467e-06, "loss": 0.2086, "step": 8576 }, { "epoch": 1.53, "grad_norm": 1.0781196355819702, "learning_rate": 2.846475711131877e-06, "loss": 0.201, "step": 8608 }, { "epoch": 1.54, "grad_norm": 1.1917251348495483, "learning_rate": 2.8300758466936366e-06, "loss": 0.1982, "step": 8640 }, { "epoch": 1.54, "grad_norm": 1.2894983291625977, "learning_rate": 2.813661513223588e-06, "loss": 0.1943, "step": 8672 }, { "epoch": 1.55, "grad_norm": 1.2249202728271484, "learning_rate": 2.7972334302518504e-06, "loss": 0.2145, "step": 8704 }, { "epoch": 1.55, "grad_norm": 1.1947064399719238, "learning_rate": 2.7807923179112576e-06, "loss": 0.2003, "step": 8736 }, { "epoch": 1.56, "grad_norm": 1.0660251379013062, "learning_rate": 2.764338896905792e-06, "loss": 0.1984, "step": 8768 }, { "epoch": 1.56, "grad_norm": 1.0243247747421265, "learning_rate": 2.7478738884789934e-06, "loss": 0.2036, "step": 8800 }, { "epoch": 1.57, "grad_norm": 1.286199927330017, "learning_rate": 2.731398014382341e-06, "loss": 0.2027, "step": 8832 }, { "epoch": 1.58, "grad_norm": 1.1617448329925537, "learning_rate": 2.714911996843617e-06, "loss": 0.2162, "step": 8864 }, { "epoch": 1.58, "grad_norm": 1.1921496391296387, "learning_rate": 2.6984165585352435e-06, "loss": 0.2124, "step": 8896 }, { "epoch": 1.59, "grad_norm": 1.2066140174865723, "learning_rate": 2.6819124225426085e-06, "loss": 0.199, "step": 8928 }, { "epoch": 1.59, "grad_norm": 1.0459320545196533, "learning_rate": 2.665400312332368e-06, "loss": 0.2072, "step": 8960 }, { "epoch": 1.6, "grad_norm": 1.2983636856079102, "learning_rate": 2.648880951720729e-06, "loss": 0.2024, "step": 8992 }, { "epoch": 1.6, "grad_norm": 1.0876768827438354, "learning_rate": 2.6323550648417267e-06, "loss": 0.2143, "step": 9024 }, { "epoch": 1.61, "grad_norm": 1.0047022104263306, "learning_rate": 2.6158233761154744e-06, "loss": 0.2043, "step": 9056 }, { "epoch": 1.62, "grad_norm": 0.9878237247467041, "learning_rate": 2.5992866102164146e-06, "loss": 0.1991, "step": 9088 }, { "epoch": 1.62, "grad_norm": 0.9894827604293823, "learning_rate": 2.58274549204155e-06, "loss": 0.1979, "step": 9120 }, { "epoch": 1.63, "grad_norm": 0.9374232292175293, "learning_rate": 2.5662007466786674e-06, "loss": 0.2055, "step": 9152 }, { "epoch": 1.63, "grad_norm": 1.259948492050171, "learning_rate": 2.5496530993745518e-06, "loss": 0.2057, "step": 9184 }, { "epoch": 1.64, "grad_norm": 0.958737850189209, "learning_rate": 2.533103275503197e-06, "loss": 0.2029, "step": 9216 }, { "epoch": 1.64, "grad_norm": 1.079717755317688, "learning_rate": 2.5165520005340082e-06, "loss": 0.2049, "step": 9248 }, { "epoch": 1.65, "grad_norm": 1.1001982688903809, "learning_rate": 2.5e-06, "loss": 0.211, "step": 9280 }, { "epoch": 1.66, "grad_norm": 1.2408779859542847, "learning_rate": 2.4834479994659926e-06, "loss": 0.2028, "step": 9312 }, { "epoch": 1.66, "grad_norm": 1.0395313501358032, "learning_rate": 2.4668967244968035e-06, "loss": 0.1988, "step": 9344 }, { "epoch": 1.67, "grad_norm": 1.0080056190490723, "learning_rate": 2.4503469006254487e-06, "loss": 0.1988, "step": 9376 }, { "epoch": 1.67, "grad_norm": 1.4669593572616577, "learning_rate": 2.4337992533213334e-06, "loss": 0.1942, "step": 9408 }, { "epoch": 1.68, "grad_norm": 1.3393524885177612, "learning_rate": 2.4172545079584508e-06, "loss": 0.1964, "step": 9440 }, { "epoch": 1.68, "grad_norm": 0.9786149263381958, "learning_rate": 2.4007133897835863e-06, "loss": 0.1984, "step": 9472 }, { "epoch": 1.69, "grad_norm": 1.1999776363372803, "learning_rate": 2.3841766238845264e-06, "loss": 0.2102, "step": 9504 }, { "epoch": 1.7, "grad_norm": 1.3276174068450928, "learning_rate": 2.367644935158274e-06, "loss": 0.1941, "step": 9536 }, { "epoch": 1.7, "grad_norm": 1.0124472379684448, "learning_rate": 2.3511190482792713e-06, "loss": 0.199, "step": 9568 }, { "epoch": 1.71, "grad_norm": 1.258489966392517, "learning_rate": 2.3345996876676334e-06, "loss": 0.2008, "step": 9600 }, { "epoch": 1.71, "grad_norm": 1.1993016004562378, "learning_rate": 2.318087577457392e-06, "loss": 0.2154, "step": 9632 }, { "epoch": 1.72, "grad_norm": 1.250908613204956, "learning_rate": 2.3015834414647573e-06, "loss": 0.2068, "step": 9664 }, { "epoch": 1.72, "grad_norm": 1.211915373802185, "learning_rate": 2.2850880031563845e-06, "loss": 0.1946, "step": 9696 }, { "epoch": 1.73, "grad_norm": 1.0278340578079224, "learning_rate": 2.26860198561766e-06, "loss": 0.1948, "step": 9728 }, { "epoch": 1.74, "grad_norm": 1.2455780506134033, "learning_rate": 2.2521261115210074e-06, "loss": 0.197, "step": 9760 }, { "epoch": 1.74, "grad_norm": 1.2321908473968506, "learning_rate": 2.2356611030942084e-06, "loss": 0.2075, "step": 9792 }, { "epoch": 1.75, "grad_norm": 1.0618436336517334, "learning_rate": 2.219207682088743e-06, "loss": 0.1931, "step": 9824 }, { "epoch": 1.75, "grad_norm": 1.36842942237854, "learning_rate": 2.20276656974815e-06, "loss": 0.1999, "step": 9856 }, { "epoch": 1.76, "grad_norm": 1.033603310585022, "learning_rate": 2.186338486776412e-06, "loss": 0.2028, "step": 9888 }, { "epoch": 1.76, "grad_norm": 1.303781270980835, "learning_rate": 2.169924153306363e-06, "loss": 0.214, "step": 9920 }, { "epoch": 1.77, "grad_norm": 1.2051355838775635, "learning_rate": 2.153524288868124e-06, "loss": 0.2091, "step": 9952 }, { "epoch": 1.78, "grad_norm": 0.9946267604827881, "learning_rate": 2.137139612357554e-06, "loss": 0.1942, "step": 9984 }, { "epoch": 1.78, "grad_norm": 1.283492088317871, "learning_rate": 2.120770842004746e-06, "loss": 0.1971, "step": 10016 }, { "epoch": 1.79, "grad_norm": 0.9329233169555664, "learning_rate": 2.1044186953425358e-06, "loss": 0.203, "step": 10048 }, { "epoch": 1.79, "grad_norm": 1.082767367362976, "learning_rate": 2.0880838891750553e-06, "loss": 0.2012, "step": 10080 }, { "epoch": 1.8, "grad_norm": 1.1007740497589111, "learning_rate": 2.0717671395463063e-06, "loss": 0.2028, "step": 10112 }, { "epoch": 1.8, "grad_norm": 1.2502014636993408, "learning_rate": 2.0554691617087725e-06, "loss": 0.2121, "step": 10144 }, { "epoch": 1.81, "grad_norm": 1.073034405708313, "learning_rate": 2.0391906700920667e-06, "loss": 0.1994, "step": 10176 }, { "epoch": 1.82, "grad_norm": 0.9409189820289612, "learning_rate": 2.0229323782716156e-06, "loss": 0.2054, "step": 10208 }, { "epoch": 1.82, "grad_norm": 1.197383999824524, "learning_rate": 2.0066949989373797e-06, "loss": 0.1946, "step": 10240 }, { "epoch": 1.83, "grad_norm": 1.261612892150879, "learning_rate": 1.9904792438626074e-06, "loss": 0.2038, "step": 10272 }, { "epoch": 1.83, "grad_norm": 1.4839472770690918, "learning_rate": 1.9742858238726377e-06, "loss": 0.2067, "step": 10304 }, { "epoch": 1.84, "grad_norm": 1.0103521347045898, "learning_rate": 1.9581154488137425e-06, "loss": 0.2104, "step": 10336 }, { "epoch": 1.84, "grad_norm": 1.2776283025741577, "learning_rate": 1.9419688275220085e-06, "loss": 0.196, "step": 10368 }, { "epoch": 1.85, "grad_norm": 1.0784910917282104, "learning_rate": 1.9258466677922624e-06, "loss": 0.1975, "step": 10400 }, { "epoch": 1.85, "grad_norm": 0.9643808007240295, "learning_rate": 1.909749676347047e-06, "loss": 0.2111, "step": 10432 }, { "epoch": 1.86, "grad_norm": 1.3432437181472778, "learning_rate": 1.8936785588056428e-06, "loss": 0.1923, "step": 10464 }, { "epoch": 1.87, "grad_norm": 1.133470892906189, "learning_rate": 1.8776340196531351e-06, "loss": 0.2016, "step": 10496 }, { "epoch": 1.87, "grad_norm": 1.0897003412246704, "learning_rate": 1.8616167622095328e-06, "loss": 0.193, "step": 10528 }, { "epoch": 1.88, "grad_norm": 0.9629374146461487, "learning_rate": 1.8456274885989374e-06, "loss": 0.1937, "step": 10560 }, { "epoch": 1.88, "grad_norm": 1.406630039215088, "learning_rate": 1.829666899718765e-06, "loss": 0.1997, "step": 10592 }, { "epoch": 1.89, "grad_norm": 1.165366291999817, "learning_rate": 1.8137356952090258e-06, "loss": 0.1976, "step": 10624 }, { "epoch": 1.89, "grad_norm": 1.2674609422683716, "learning_rate": 1.7978345734216502e-06, "loss": 0.1908, "step": 10656 }, { "epoch": 1.9, "grad_norm": 1.2211626768112183, "learning_rate": 1.7819642313898783e-06, "loss": 0.1984, "step": 10688 }, { "epoch": 1.91, "grad_norm": 1.1066653728485107, "learning_rate": 1.766125364797704e-06, "loss": 0.2035, "step": 10720 }, { "epoch": 1.91, "grad_norm": 1.539157748222351, "learning_rate": 1.7503186679493821e-06, "loss": 0.201, "step": 10752 }, { "epoch": 1.92, "grad_norm": 1.1897879838943481, "learning_rate": 1.7345448337389918e-06, "loss": 0.194, "step": 10784 }, { "epoch": 1.92, "grad_norm": 1.162571668624878, "learning_rate": 1.7188045536200604e-06, "loss": 0.1899, "step": 10816 }, { "epoch": 1.93, "grad_norm": 1.0616639852523804, "learning_rate": 1.7030985175752574e-06, "loss": 0.1978, "step": 10848 }, { "epoch": 1.93, "grad_norm": 1.0391641855239868, "learning_rate": 1.687427414086146e-06, "loss": 0.2017, "step": 10880 }, { "epoch": 1.94, "grad_norm": 1.0870113372802734, "learning_rate": 1.6717919301030055e-06, "loss": 0.2012, "step": 10912 }, { "epoch": 1.95, "grad_norm": 1.143446922302246, "learning_rate": 1.6561927510147172e-06, "loss": 0.1911, "step": 10944 }, { "epoch": 1.95, "grad_norm": 1.015080213546753, "learning_rate": 1.6406305606187183e-06, "loss": 0.198, "step": 10976 }, { "epoch": 1.96, "grad_norm": 0.9722278714179993, "learning_rate": 1.6251060410910301e-06, "loss": 0.1862, "step": 11008 }, { "epoch": 1.96, "grad_norm": 0.9311158061027527, "learning_rate": 1.6096198729563539e-06, "loss": 0.198, "step": 11040 }, { "epoch": 1.97, "grad_norm": 1.3127020597457886, "learning_rate": 1.5941727350582399e-06, "loss": 0.2, "step": 11072 }, { "epoch": 1.97, "grad_norm": 0.9450055956840515, "learning_rate": 1.5787653045293278e-06, "loss": 0.2015, "step": 11104 }, { "epoch": 1.98, "grad_norm": 1.1553057432174683, "learning_rate": 1.5633982567616657e-06, "loss": 0.2068, "step": 11136 }, { "epoch": 1.99, "grad_norm": 0.8617095351219177, "learning_rate": 1.548072265377105e-06, "loss": 0.2014, "step": 11168 }, { "epoch": 1.99, "grad_norm": 0.9857013821601868, "learning_rate": 1.532788002197773e-06, "loss": 0.2031, "step": 11200 }, { "epoch": 2.0, "grad_norm": 1.0158861875534058, "learning_rate": 1.5175461372166177e-06, "loss": 0.1941, "step": 11232 }, { "epoch": 2.0, "grad_norm": 1.1121022701263428, "learning_rate": 1.5023473385680438e-06, "loss": 0.1708, "step": 11264 }, { "epoch": 2.01, "grad_norm": 1.1300593614578247, "learning_rate": 1.4871922724986215e-06, "loss": 0.1504, "step": 11296 }, { "epoch": 2.01, "grad_norm": 1.232246994972229, "learning_rate": 1.4720816033378856e-06, "loss": 0.151, "step": 11328 }, { "epoch": 2.02, "grad_norm": 1.2618398666381836, "learning_rate": 1.4570159934692085e-06, "loss": 0.1421, "step": 11360 }, { "epoch": 2.03, "grad_norm": 1.275038242340088, "learning_rate": 1.4419961033007669e-06, "loss": 0.1457, "step": 11392 }, { "epoch": 2.03, "grad_norm": 1.079405426979065, "learning_rate": 1.427022591236594e-06, "loss": 0.144, "step": 11424 }, { "epoch": 2.04, "grad_norm": 1.1695400476455688, "learning_rate": 1.4120961136477168e-06, "loss": 0.1531, "step": 11456 }, { "epoch": 2.04, "grad_norm": 1.008547306060791, "learning_rate": 1.3972173248433832e-06, "loss": 0.1453, "step": 11488 }, { "epoch": 2.05, "grad_norm": 1.0746265649795532, "learning_rate": 1.3823868770423815e-06, "loss": 0.1446, "step": 11520 }, { "epoch": 2.05, "grad_norm": 1.1596614122390747, "learning_rate": 1.3676054203444462e-06, "loss": 0.1477, "step": 11552 }, { "epoch": 2.06, "grad_norm": 1.1029706001281738, "learning_rate": 1.3528736027017663e-06, "loss": 0.1477, "step": 11584 }, { "epoch": 2.07, "grad_norm": 0.9398396015167236, "learning_rate": 1.3381920698905788e-06, "loss": 0.1477, "step": 11616 }, { "epoch": 2.07, "grad_norm": 1.0209776163101196, "learning_rate": 1.3235614654828604e-06, "loss": 0.1448, "step": 11648 }, { "epoch": 2.08, "grad_norm": 0.9415841102600098, "learning_rate": 1.3089824308181187e-06, "loss": 0.1481, "step": 11680 }, { "epoch": 2.08, "grad_norm": 1.088069200515747, "learning_rate": 1.2944556049752726e-06, "loss": 0.149, "step": 11712 }, { "epoch": 2.09, "grad_norm": 1.3269786834716797, "learning_rate": 1.2799816247446494e-06, "loss": 0.1497, "step": 11744 }, { "epoch": 2.09, "grad_norm": 0.9119545817375183, "learning_rate": 1.265561124600057e-06, "loss": 0.1467, "step": 11776 }, { "epoch": 2.1, "grad_norm": 1.0677683353424072, "learning_rate": 1.251194736670983e-06, "loss": 0.1448, "step": 11808 }, { "epoch": 2.11, "grad_norm": 1.0756884813308716, "learning_rate": 1.2368830907148778e-06, "loss": 0.1363, "step": 11840 }, { "epoch": 2.11, "grad_norm": 1.0578961372375488, "learning_rate": 1.2226268140895528e-06, "loss": 0.1527, "step": 11872 }, { "epoch": 2.12, "grad_norm": 1.0562700033187866, "learning_rate": 1.2084265317256772e-06, "loss": 0.1449, "step": 11904 }, { "epoch": 2.12, "grad_norm": 1.0958082675933838, "learning_rate": 1.1942828660993869e-06, "loss": 0.1474, "step": 11936 }, { "epoch": 2.13, "grad_norm": 0.9672511219978333, "learning_rate": 1.1801964372049932e-06, "loss": 0.1459, "step": 11968 }, { "epoch": 2.13, "grad_norm": 1.1125974655151367, "learning_rate": 1.1661678625278106e-06, "loss": 0.1483, "step": 12000 }, { "epoch": 2.14, "grad_norm": 1.227283239364624, "learning_rate": 1.152197757017086e-06, "loss": 0.1453, "step": 12032 }, { "epoch": 2.15, "grad_norm": 1.3206896781921387, "learning_rate": 1.1382867330590414e-06, "loss": 0.1425, "step": 12064 }, { "epoch": 2.15, "grad_norm": 0.9912922978401184, "learning_rate": 1.1244354004500335e-06, "loss": 0.1529, "step": 12096 }, { "epoch": 2.16, "grad_norm": 0.8996345400810242, "learning_rate": 1.110644366369815e-06, "loss": 0.1437, "step": 12128 }, { "epoch": 2.16, "grad_norm": 0.9117119312286377, "learning_rate": 1.0969142353549315e-06, "loss": 0.1429, "step": 12160 }, { "epoch": 2.17, "grad_norm": 1.1916334629058838, "learning_rate": 1.0832456092722063e-06, "loss": 0.1509, "step": 12192 }, { "epoch": 2.17, "grad_norm": 1.1345574855804443, "learning_rate": 1.0696390872923696e-06, "loss": 0.1547, "step": 12224 }, { "epoch": 2.18, "grad_norm": 1.3311399221420288, "learning_rate": 1.0560952658637869e-06, "loss": 0.1428, "step": 12256 }, { "epoch": 2.18, "grad_norm": 1.0195939540863037, "learning_rate": 1.042614738686315e-06, "loss": 0.1447, "step": 12288 }, { "epoch": 2.19, "grad_norm": 1.1453065872192383, "learning_rate": 1.029198096685278e-06, "loss": 0.1384, "step": 12320 }, { "epoch": 2.2, "grad_norm": 1.1899516582489014, "learning_rate": 1.0158459279855632e-06, "loss": 0.1433, "step": 12352 }, { "epoch": 2.2, "grad_norm": 1.060065507888794, "learning_rate": 1.0025588178858372e-06, "loss": 0.1456, "step": 12384 }, { "epoch": 2.21, "grad_norm": 1.1489146947860718, "learning_rate": 9.893373488328953e-07, "loss": 0.1433, "step": 12416 }, { "epoch": 2.21, "grad_norm": 1.3114417791366577, "learning_rate": 9.761821003961246e-07, "loss": 0.1467, "step": 12448 }, { "epoch": 2.22, "grad_norm": 1.3255183696746826, "learning_rate": 9.630936492421005e-07, "loss": 0.1463, "step": 12480 }, { "epoch": 2.22, "grad_norm": 1.2642742395401, "learning_rate": 9.500725691093085e-07, "loss": 0.1525, "step": 12512 }, { "epoch": 2.23, "grad_norm": 1.2281138896942139, "learning_rate": 9.371194307829895e-07, "loss": 0.1383, "step": 12544 }, { "epoch": 2.24, "grad_norm": 0.9627875089645386, "learning_rate": 9.242348020701295e-07, "loss": 0.1642, "step": 12576 }, { "epoch": 2.24, "grad_norm": 1.1187249422073364, "learning_rate": 9.114192477745568e-07, "loss": 0.1439, "step": 12608 }, { "epoch": 2.25, "grad_norm": 1.0410840511322021, "learning_rate": 8.986733296721931e-07, "loss": 0.142, "step": 12640 }, { "epoch": 2.25, "grad_norm": 1.2345024347305298, "learning_rate": 8.859976064864235e-07, "loss": 0.1512, "step": 12672 }, { "epoch": 2.26, "grad_norm": 1.6558443307876587, "learning_rate": 8.733926338636056e-07, "loss": 0.1363, "step": 12704 }, { "epoch": 2.26, "grad_norm": 1.3118538856506348, "learning_rate": 8.608589643487128e-07, "loss": 0.1471, "step": 12736 }, { "epoch": 2.27, "grad_norm": 1.1155567169189453, "learning_rate": 8.483971473611133e-07, "loss": 0.1396, "step": 12768 }, { "epoch": 2.28, "grad_norm": 1.0880179405212402, "learning_rate": 8.360077291704821e-07, "loss": 0.1413, "step": 12800 }, { "epoch": 2.28, "grad_norm": 0.9752321839332581, "learning_rate": 8.236912528728647e-07, "loss": 0.146, "step": 12832 }, { "epoch": 2.29, "grad_norm": 0.9778379201889038, "learning_rate": 8.114482583668576e-07, "loss": 0.1403, "step": 12864 }, { "epoch": 2.29, "grad_norm": 0.8760839700698853, "learning_rate": 7.99279282329952e-07, "loss": 0.148, "step": 12896 }, { "epoch": 2.3, "grad_norm": 1.187658667564392, "learning_rate": 7.871848581950039e-07, "loss": 0.132, "step": 12928 }, { "epoch": 2.3, "grad_norm": 0.9668059349060059, "learning_rate": 7.751655161268481e-07, "loss": 0.1424, "step": 12960 }, { "epoch": 2.31, "grad_norm": 1.1318392753601074, "learning_rate": 7.632217829990668e-07, "loss": 0.1516, "step": 12992 }, { "epoch": 2.32, "grad_norm": 1.3520994186401367, "learning_rate": 7.513541823708828e-07, "loss": 0.1495, "step": 13024 }, { "epoch": 2.32, "grad_norm": 1.3352413177490234, "learning_rate": 7.395632344642173e-07, "loss": 0.1446, "step": 13056 }, { "epoch": 2.33, "grad_norm": 1.0273305177688599, "learning_rate": 7.278494561408833e-07, "loss": 0.1391, "step": 13088 }, { "epoch": 2.33, "grad_norm": 1.2872681617736816, "learning_rate": 7.162133608799271e-07, "loss": 0.1391, "step": 13120 }, { "epoch": 2.34, "grad_norm": 1.0563528537750244, "learning_rate": 7.046554587551216e-07, "loss": 0.1521, "step": 13152 }, { "epoch": 2.34, "grad_norm": 1.1487845182418823, "learning_rate": 6.931762564126074e-07, "loss": 0.1411, "step": 13184 }, { "epoch": 2.35, "grad_norm": 1.058159351348877, "learning_rate": 6.817762570486791e-07, "loss": 0.1424, "step": 13216 }, { "epoch": 2.36, "grad_norm": 1.249377727508545, "learning_rate": 6.704559603877367e-07, "loss": 0.1448, "step": 13248 }, { "epoch": 2.36, "grad_norm": 0.9334893226623535, "learning_rate": 6.592158626603689e-07, "loss": 0.1384, "step": 13280 }, { "epoch": 2.37, "grad_norm": 1.5639148950576782, "learning_rate": 6.480564565816091e-07, "loss": 0.1426, "step": 13312 }, { "epoch": 2.37, "grad_norm": 1.0596867799758911, "learning_rate": 6.369782313293335e-07, "loss": 0.1358, "step": 13344 }, { "epoch": 2.38, "grad_norm": 0.8567415475845337, "learning_rate": 6.259816725228158e-07, "loss": 0.1465, "step": 13376 }, { "epoch": 2.38, "grad_norm": 1.1086764335632324, "learning_rate": 6.150672622014459e-07, "loss": 0.1538, "step": 13408 }, { "epoch": 2.39, "grad_norm": 1.1636631488800049, "learning_rate": 6.042354788035943e-07, "loss": 0.1389, "step": 13440 }, { "epoch": 2.4, "grad_norm": 1.274596929550171, "learning_rate": 5.934867971456384e-07, "loss": 0.1464, "step": 13472 }, { "epoch": 2.4, "grad_norm": 1.173563003540039, "learning_rate": 5.828216884011553e-07, "loss": 0.1435, "step": 13504 }, { "epoch": 2.41, "grad_norm": 1.0788921117782593, "learning_rate": 5.722406200802613e-07, "loss": 0.145, "step": 13536 }, { "epoch": 2.41, "grad_norm": 1.1613490581512451, "learning_rate": 5.617440560091212e-07, "loss": 0.1474, "step": 13568 }, { "epoch": 2.42, "grad_norm": 0.9075080156326294, "learning_rate": 5.513324563096167e-07, "loss": 0.1423, "step": 13600 }, { "epoch": 2.42, "grad_norm": 1.1296495199203491, "learning_rate": 5.41006277379173e-07, "loss": 0.1506, "step": 13632 }, { "epoch": 2.43, "grad_norm": 1.2199699878692627, "learning_rate": 5.307659718707603e-07, "loss": 0.1459, "step": 13664 }, { "epoch": 2.44, "grad_norm": 1.2415364980697632, "learning_rate": 5.20611988673041e-07, "loss": 0.1459, "step": 13696 }, { "epoch": 2.44, "grad_norm": 0.9814534187316895, "learning_rate": 5.105447728907012e-07, "loss": 0.1405, "step": 13728 }, { "epoch": 2.45, "grad_norm": 1.1437889337539673, "learning_rate": 5.00564765824936e-07, "loss": 0.147, "step": 13760 }, { "epoch": 2.45, "grad_norm": 1.1459242105484009, "learning_rate": 4.906724049541023e-07, "loss": 0.1454, "step": 13792 }, { "epoch": 2.46, "grad_norm": 1.093807578086853, "learning_rate": 4.808681239145479e-07, "loss": 0.1448, "step": 13824 }, { "epoch": 2.46, "grad_norm": 1.1457182168960571, "learning_rate": 4.711523524815978e-07, "loss": 0.1391, "step": 13856 }, { "epoch": 2.47, "grad_norm": 1.0422513484954834, "learning_rate": 4.615255165507146e-07, "loss": 0.1435, "step": 13888 }, { "epoch": 2.48, "grad_norm": 1.1171213388442993, "learning_rate": 4.5198803811883326e-07, "loss": 0.1545, "step": 13920 }, { "epoch": 2.48, "grad_norm": 1.3410863876342773, "learning_rate": 4.4254033526585917e-07, "loss": 0.1526, "step": 13952 }, { "epoch": 2.49, "grad_norm": 0.9821498394012451, "learning_rate": 4.331828221363424e-07, "loss": 0.1407, "step": 13984 }, { "epoch": 2.49, "grad_norm": 1.2533886432647705, "learning_rate": 4.239159089213246e-07, "loss": 0.1358, "step": 14016 }, { "epoch": 2.5, "grad_norm": 0.9848787784576416, "learning_rate": 4.147400018403544e-07, "loss": 0.1449, "step": 14048 }, { "epoch": 2.5, "grad_norm": 0.9092288613319397, "learning_rate": 4.056555031236878e-07, "loss": 0.1433, "step": 14080 }, { "epoch": 2.51, "grad_norm": 1.3839354515075684, "learning_rate": 3.966628109946469e-07, "loss": 0.1494, "step": 14112 }, { "epoch": 2.51, "grad_norm": 0.9744381904602051, "learning_rate": 3.877623196521707e-07, "loss": 0.1426, "step": 14144 }, { "epoch": 2.52, "grad_norm": 1.165624976158142, "learning_rate": 3.7895441925353356e-07, "loss": 0.1418, "step": 14176 }, { "epoch": 2.53, "grad_norm": 0.9751477241516113, "learning_rate": 3.702394958972391e-07, "loss": 0.1479, "step": 14208 }, { "epoch": 2.53, "grad_norm": 1.0645439624786377, "learning_rate": 3.616179316061011e-07, "loss": 0.1373, "step": 14240 }, { "epoch": 2.54, "grad_norm": 0.9858296513557434, "learning_rate": 3.5309010431049284e-07, "loss": 0.1367, "step": 14272 }, { "epoch": 2.54, "grad_norm": 1.0521584749221802, "learning_rate": 3.44656387831781e-07, "loss": 0.1421, "step": 14304 }, { "epoch": 2.55, "grad_norm": 1.2546510696411133, "learning_rate": 3.363171518659408e-07, "loss": 0.1384, "step": 14336 }, { "epoch": 2.55, "grad_norm": 1.0795034170150757, "learning_rate": 3.280727619673496e-07, "loss": 0.1463, "step": 14368 }, { "epoch": 2.56, "grad_norm": 1.1764500141143799, "learning_rate": 3.199235795327615e-07, "loss": 0.1499, "step": 14400 }, { "epoch": 2.57, "grad_norm": 1.1234067678451538, "learning_rate": 3.1186996178546674e-07, "loss": 0.1497, "step": 14432 }, { "epoch": 2.57, "grad_norm": 0.9825364947319031, "learning_rate": 3.039122617596302e-07, "loss": 0.1514, "step": 14464 }, { "epoch": 2.58, "grad_norm": 1.263085961341858, "learning_rate": 2.960508282848215e-07, "loss": 0.1476, "step": 14496 }, { "epoch": 2.58, "grad_norm": 1.084181308746338, "learning_rate": 2.8828600597071597e-07, "loss": 0.1308, "step": 14528 }, { "epoch": 2.59, "grad_norm": 1.1697498559951782, "learning_rate": 2.8061813519199536e-07, "loss": 0.1348, "step": 14560 }, { "epoch": 2.59, "grad_norm": 1.3982306718826294, "learning_rate": 2.7304755207342467e-07, "loss": 0.1455, "step": 14592 }, { "epoch": 2.6, "grad_norm": 1.1802705526351929, "learning_rate": 2.655745884751157e-07, "loss": 0.1437, "step": 14624 }, { "epoch": 2.61, "grad_norm": 1.0200531482696533, "learning_rate": 2.581995719779856e-07, "loss": 0.1394, "step": 14656 }, { "epoch": 2.61, "grad_norm": 1.1693042516708374, "learning_rate": 2.5092282586939187e-07, "loss": 0.151, "step": 14688 }, { "epoch": 2.62, "grad_norm": 1.116024374961853, "learning_rate": 2.437446691289616e-07, "loss": 0.1478, "step": 14720 }, { "epoch": 2.62, "grad_norm": 1.05259370803833, "learning_rate": 2.3666541641461231e-07, "loss": 0.1436, "step": 14752 }, { "epoch": 2.63, "grad_norm": 1.0545703172683716, "learning_rate": 2.2968537804875485e-07, "loss": 0.1379, "step": 14784 }, { "epoch": 2.63, "grad_norm": 1.0618197917938232, "learning_rate": 2.228048600046928e-07, "loss": 0.1409, "step": 14816 }, { "epoch": 2.64, "grad_norm": 1.389092206954956, "learning_rate": 2.1602416389320922e-07, "loss": 0.1499, "step": 14848 }, { "epoch": 2.65, "grad_norm": 1.0467108488082886, "learning_rate": 2.0934358694934347e-07, "loss": 0.1406, "step": 14880 }, { "epoch": 2.65, "grad_norm": 1.1932706832885742, "learning_rate": 2.0276342201936637e-07, "loss": 0.1468, "step": 14912 }, { "epoch": 2.66, "grad_norm": 1.2286850214004517, "learning_rate": 1.9628395754793777e-07, "loss": 0.1457, "step": 14944 }, { "epoch": 2.66, "grad_norm": 0.9705607891082764, "learning_rate": 1.899054775654663e-07, "loss": 0.1439, "step": 14976 }, { "epoch": 2.67, "grad_norm": 0.9110348224639893, "learning_rate": 1.8362826167565796e-07, "loss": 0.1439, "step": 15008 }, { "epoch": 2.67, "grad_norm": 0.9858996272087097, "learning_rate": 1.774525850432568e-07, "loss": 0.1528, "step": 15040 }, { "epoch": 2.68, "grad_norm": 1.1253962516784668, "learning_rate": 1.7137871838198817e-07, "loss": 0.1408, "step": 15072 }, { "epoch": 2.69, "grad_norm": 1.5971510410308838, "learning_rate": 1.654069279426873e-07, "loss": 0.1497, "step": 15104 }, { "epoch": 2.69, "grad_norm": 0.8475412130355835, "learning_rate": 1.5953747550162907e-07, "loss": 0.1456, "step": 15136 }, { "epoch": 2.7, "grad_norm": 0.9866968989372253, "learning_rate": 1.537706183490545e-07, "loss": 0.1349, "step": 15168 }, { "epoch": 2.7, "grad_norm": 1.1067461967468262, "learning_rate": 1.481066092778913e-07, "loss": 0.1457, "step": 15200 }, { "epoch": 2.71, "grad_norm": 1.1080329418182373, "learning_rate": 1.4254569657267235e-07, "loss": 0.146, "step": 15232 }, { "epoch": 2.71, "grad_norm": 0.992157518863678, "learning_rate": 1.370881239986524e-07, "loss": 0.1439, "step": 15264 }, { "epoch": 2.72, "grad_norm": 1.032788872718811, "learning_rate": 1.3173413079112128e-07, "loss": 0.1369, "step": 15296 }, { "epoch": 2.73, "grad_norm": 0.9706469774246216, "learning_rate": 1.264839516449204e-07, "loss": 0.136, "step": 15328 }, { "epoch": 2.73, "grad_norm": 1.1187324523925781, "learning_rate": 1.2133781670415013e-07, "loss": 0.1359, "step": 15360 }, { "epoch": 2.74, "grad_norm": 1.1595239639282227, "learning_rate": 1.1629595155208424e-07, "loss": 0.1401, "step": 15392 }, { "epoch": 2.74, "grad_norm": 1.087785243988037, "learning_rate": 1.1135857720128151e-07, "loss": 0.1358, "step": 15424 }, { "epoch": 2.75, "grad_norm": 1.0765306949615479, "learning_rate": 1.0652591008389557e-07, "loss": 0.1438, "step": 15456 }, { "epoch": 2.75, "grad_norm": 1.1016918420791626, "learning_rate": 1.0179816204218928e-07, "loss": 0.1373, "step": 15488 }, { "epoch": 2.76, "grad_norm": 1.0536975860595703, "learning_rate": 9.717554031924842e-08, "loss": 0.1349, "step": 15520 }, { "epoch": 2.77, "grad_norm": 0.8933613300323486, "learning_rate": 9.265824754989467e-08, "loss": 0.1316, "step": 15552 }, { "epoch": 2.77, "grad_norm": 0.9983497858047485, "learning_rate": 8.824648175180722e-08, "loss": 0.1346, "step": 15584 }, { "epoch": 2.78, "grad_norm": 1.0600674152374268, "learning_rate": 8.394043631683862e-08, "loss": 0.1533, "step": 15616 }, { "epoch": 2.78, "grad_norm": 1.0515772104263306, "learning_rate": 7.974030000253986e-08, "loss": 0.139, "step": 15648 }, { "epoch": 2.79, "grad_norm": 1.4163565635681152, "learning_rate": 7.564625692388499e-08, "loss": 0.1323, "step": 15680 }, { "epoch": 2.79, "grad_norm": 1.0619480609893799, "learning_rate": 7.165848654519969e-08, "loss": 0.1373, "step": 15712 }, { "epoch": 2.8, "grad_norm": 0.9783567786216736, "learning_rate": 6.777716367229764e-08, "loss": 0.1525, "step": 15744 }, { "epoch": 2.81, "grad_norm": 1.0433095693588257, "learning_rate": 6.400245844481262e-08, "loss": 0.1409, "step": 15776 }, { "epoch": 2.81, "grad_norm": 1.258354663848877, "learning_rate": 6.033453632874498e-08, "loss": 0.1402, "step": 15808 }, { "epoch": 2.82, "grad_norm": 1.1823972463607788, "learning_rate": 5.677355810920604e-08, "loss": 0.1418, "step": 15840 }, { "epoch": 2.82, "grad_norm": 1.2745051383972168, "learning_rate": 5.3319679883370724e-08, "loss": 0.1471, "step": 15872 }, { "epoch": 2.83, "grad_norm": 1.238215684890747, "learning_rate": 4.9973053053634365e-08, "loss": 0.1426, "step": 15904 }, { "epoch": 2.83, "grad_norm": 1.0394669771194458, "learning_rate": 4.6733824320976674e-08, "loss": 0.1335, "step": 15936 }, { "epoch": 2.84, "grad_norm": 1.2872110605239868, "learning_rate": 4.360213567853072e-08, "loss": 0.1544, "step": 15968 }, { "epoch": 2.84, "grad_norm": 1.2786180973052979, "learning_rate": 4.057812440535797e-08, "loss": 0.1461, "step": 16000 }, { "epoch": 2.85, "grad_norm": 1.169412612915039, "learning_rate": 3.766192306043165e-08, "loss": 0.1413, "step": 16032 }, { "epoch": 2.86, "grad_norm": 1.2436180114746094, "learning_rate": 3.485365947682562e-08, "loss": 0.1357, "step": 16064 }, { "epoch": 2.86, "grad_norm": 1.1065387725830078, "learning_rate": 3.215345675611076e-08, "loss": 0.1472, "step": 16096 }, { "epoch": 2.87, "grad_norm": 1.00310218334198, "learning_rate": 2.9561433262957072e-08, "loss": 0.1499, "step": 16128 }, { "epoch": 2.87, "grad_norm": 0.9328859448432922, "learning_rate": 2.7077702619948963e-08, "loss": 0.1376, "step": 16160 }, { "epoch": 2.88, "grad_norm": 1.122558832168579, "learning_rate": 2.4702373702600868e-08, "loss": 0.1461, "step": 16192 }, { "epoch": 2.88, "grad_norm": 1.1374387741088867, "learning_rate": 2.2435550634585522e-08, "loss": 0.1427, "step": 16224 }, { "epoch": 2.89, "grad_norm": 1.102001428604126, "learning_rate": 2.027733278317151e-08, "loss": 0.1402, "step": 16256 }, { "epoch": 2.9, "grad_norm": 1.3255332708358765, "learning_rate": 1.822781475486507e-08, "loss": 0.1427, "step": 16288 }, { "epoch": 2.9, "grad_norm": 1.1129345893859863, "learning_rate": 1.628708639126425e-08, "loss": 0.1443, "step": 16320 }, { "epoch": 2.91, "grad_norm": 0.9628246426582336, "learning_rate": 1.4455232765120397e-08, "loss": 0.1425, "step": 16352 }, { "epoch": 2.91, "grad_norm": 1.2334058284759521, "learning_rate": 1.273233417660863e-08, "loss": 0.134, "step": 16384 }, { "epoch": 2.92, "grad_norm": 1.1855486631393433, "learning_rate": 1.1118466149808994e-08, "loss": 0.1403, "step": 16416 }, { "epoch": 2.92, "grad_norm": 1.0412129163742065, "learning_rate": 9.61369942939383e-09, "loss": 0.1369, "step": 16448 }, { "epoch": 2.93, "grad_norm": 1.2178360223770142, "learning_rate": 8.218099977528871e-09, "loss": 0.1346, "step": 16480 }, { "epoch": 2.94, "grad_norm": 1.1358833312988281, "learning_rate": 6.9317289709799896e-09, "loss": 0.1504, "step": 16512 }, { "epoch": 2.94, "grad_norm": 1.1772416830062866, "learning_rate": 5.754642798432297e-09, "loss": 0.144, "step": 16544 }, { "epoch": 2.95, "grad_norm": 1.4765156507492065, "learning_rate": 4.686893058018227e-09, "loss": 0.1531, "step": 16576 }, { "epoch": 2.95, "grad_norm": 1.0203588008880615, "learning_rate": 3.728526555056289e-09, "loss": 0.1439, "step": 16608 }, { "epoch": 2.96, "grad_norm": 1.053261637687683, "learning_rate": 2.879585299997434e-09, "loss": 0.1438, "step": 16640 }, { "epoch": 2.96, "grad_norm": 1.2388105392456055, "learning_rate": 2.1401065065859704e-09, "loss": 0.145, "step": 16672 }, { "epoch": 2.97, "grad_norm": 0.9954524040222168, "learning_rate": 1.5101225902267036e-09, "loss": 0.147, "step": 16704 }, { "epoch": 2.98, "grad_norm": 1.2384732961654663, "learning_rate": 9.89661166564404e-10, "loss": 0.1492, "step": 16736 }, { "epoch": 2.98, "grad_norm": 0.8787427544593811, "learning_rate": 5.787450502728331e-10, "loss": 0.1299, "step": 16768 }, { "epoch": 2.99, "grad_norm": 0.9824705719947815, "learning_rate": 2.7739225405609694e-10, "loss": 0.1428, "step": 16800 }, { "epoch": 2.99, "grad_norm": 1.0306531190872192, "learning_rate": 8.561598785705727e-11, "loss": 0.1434, "step": 16832 }, { "epoch": 3.0, "grad_norm": 1.0343343019485474, "learning_rate": 3.424658279460591e-12, "loss": 0.1521, "step": 16864 } ], "logging_steps": 32, "max_steps": 16872, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5624, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }