{ "best_global_step": 3200, "best_metric": 9.030352592468262, "best_model_checkpoint": "./qwen3moe_tinystories_sft/checkpoint-3200", "epoch": 0.8136024153821707, "eval_steps": 100, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025425075480692836, "grad_norm": 17874.9453125, "learning_rate": 0.0, "loss": 5.9147, "step": 1 }, { "epoch": 0.0005085015096138567, "grad_norm": 17123.0625, "learning_rate": 1.2690355329949238e-07, "loss": 5.9008, "step": 2 }, { "epoch": 0.000762752264420785, "grad_norm": 17448.5234375, "learning_rate": 2.5380710659898475e-07, "loss": 5.91, "step": 3 }, { "epoch": 0.0010170030192277134, "grad_norm": 18966.7109375, "learning_rate": 3.807106598984772e-07, "loss": 5.9248, "step": 4 }, { "epoch": 0.0012712537740346417, "grad_norm": 17238.7421875, "learning_rate": 5.076142131979695e-07, "loss": 5.9275, "step": 5 }, { "epoch": 0.00152550452884157, "grad_norm": 18854.775390625, "learning_rate": 6.345177664974619e-07, "loss": 5.9297, "step": 6 }, { "epoch": 0.0017797552836484984, "grad_norm": 18089.51171875, "learning_rate": 7.614213197969544e-07, "loss": 5.9428, "step": 7 }, { "epoch": 0.002034006038455427, "grad_norm": 17900.16796875, "learning_rate": 8.883248730964468e-07, "loss": 5.9239, "step": 8 }, { "epoch": 0.002288256793262355, "grad_norm": 18685.57421875, "learning_rate": 1.015228426395939e-06, "loss": 5.9148, "step": 9 }, { "epoch": 0.0025425075480692834, "grad_norm": 18439.728515625, "learning_rate": 1.1421319796954315e-06, "loss": 5.9132, "step": 10 }, { "epoch": 0.002796758302876212, "grad_norm": 18592.375, "learning_rate": 1.2690355329949238e-06, "loss": 5.9367, "step": 11 }, { "epoch": 0.00305100905768314, "grad_norm": 18665.373046875, "learning_rate": 1.3959390862944163e-06, "loss": 5.9124, "step": 12 }, { "epoch": 0.0033052598124900683, "grad_norm": 18325.0234375, "learning_rate": 1.5228426395939088e-06, "loss": 5.9293, "step": 13 }, { "epoch": 0.003559510567296997, "grad_norm": 18158.3203125, "learning_rate": 1.6497461928934011e-06, "loss": 5.9115, "step": 14 }, { "epoch": 0.003813761322103925, "grad_norm": 18622.93359375, "learning_rate": 1.7766497461928936e-06, "loss": 5.9264, "step": 15 }, { "epoch": 0.004068012076910854, "grad_norm": 17586.322265625, "learning_rate": 1.903553299492386e-06, "loss": 5.9095, "step": 16 }, { "epoch": 0.004322262831717782, "grad_norm": 18946.81640625, "learning_rate": 2.030456852791878e-06, "loss": 5.9144, "step": 17 }, { "epoch": 0.00457651358652471, "grad_norm": 17792.5234375, "learning_rate": 2.1573604060913707e-06, "loss": 5.9174, "step": 18 }, { "epoch": 0.004830764341331639, "grad_norm": 18125.546875, "learning_rate": 2.284263959390863e-06, "loss": 5.9149, "step": 19 }, { "epoch": 0.005085015096138567, "grad_norm": 19357.408203125, "learning_rate": 2.4111675126903553e-06, "loss": 5.921, "step": 20 }, { "epoch": 0.005339265850945495, "grad_norm": 18574.943359375, "learning_rate": 2.5380710659898476e-06, "loss": 5.8929, "step": 21 }, { "epoch": 0.005593516605752424, "grad_norm": 19672.7578125, "learning_rate": 2.6649746192893404e-06, "loss": 5.9338, "step": 22 }, { "epoch": 0.005847767360559352, "grad_norm": 19014.935546875, "learning_rate": 2.7918781725888327e-06, "loss": 5.9172, "step": 23 }, { "epoch": 0.00610201811536628, "grad_norm": 18349.404296875, "learning_rate": 2.918781725888325e-06, "loss": 5.9241, "step": 24 }, { "epoch": 0.006356268870173209, "grad_norm": 19636.85546875, "learning_rate": 3.0456852791878177e-06, "loss": 5.9213, "step": 25 }, { "epoch": 0.006610519624980137, "grad_norm": 19006.40234375, "learning_rate": 3.1725888324873095e-06, "loss": 5.909, "step": 26 }, { "epoch": 0.006864770379787065, "grad_norm": 18447.80859375, "learning_rate": 3.2994923857868023e-06, "loss": 5.9011, "step": 27 }, { "epoch": 0.007119021134593994, "grad_norm": 18491.94921875, "learning_rate": 3.4263959390862946e-06, "loss": 5.9242, "step": 28 }, { "epoch": 0.007373271889400922, "grad_norm": 18376.041015625, "learning_rate": 3.5532994923857873e-06, "loss": 5.9213, "step": 29 }, { "epoch": 0.00762752264420785, "grad_norm": 18408.8359375, "learning_rate": 3.680203045685279e-06, "loss": 5.8996, "step": 30 }, { "epoch": 0.007881773399014778, "grad_norm": 17806.453125, "learning_rate": 3.807106598984772e-06, "loss": 5.9194, "step": 31 }, { "epoch": 0.008136024153821707, "grad_norm": 19786.220703125, "learning_rate": 3.934010152284264e-06, "loss": 5.93, "step": 32 }, { "epoch": 0.008390274908628636, "grad_norm": 19096.904296875, "learning_rate": 4.060913705583756e-06, "loss": 5.922, "step": 33 }, { "epoch": 0.008644525663435564, "grad_norm": 19384.958984375, "learning_rate": 4.187817258883249e-06, "loss": 5.9004, "step": 34 }, { "epoch": 0.008898776418242492, "grad_norm": 18427.359375, "learning_rate": 4.3147208121827415e-06, "loss": 5.9233, "step": 35 }, { "epoch": 0.00915302717304942, "grad_norm": 19002.884765625, "learning_rate": 4.441624365482234e-06, "loss": 5.9167, "step": 36 }, { "epoch": 0.009407277927856348, "grad_norm": 17368.919921875, "learning_rate": 4.568527918781726e-06, "loss": 5.9175, "step": 37 }, { "epoch": 0.009661528682663277, "grad_norm": 18952.740234375, "learning_rate": 4.695431472081218e-06, "loss": 5.9195, "step": 38 }, { "epoch": 0.009915779437470205, "grad_norm": 20026.333984375, "learning_rate": 4.822335025380711e-06, "loss": 5.9014, "step": 39 }, { "epoch": 0.010170030192277133, "grad_norm": 18163.779296875, "learning_rate": 4.949238578680204e-06, "loss": 5.9036, "step": 40 }, { "epoch": 0.010424280947084062, "grad_norm": 18968.326171875, "learning_rate": 5.076142131979695e-06, "loss": 5.9211, "step": 41 }, { "epoch": 0.01067853170189099, "grad_norm": 18830.423828125, "learning_rate": 5.203045685279188e-06, "loss": 5.9474, "step": 42 }, { "epoch": 0.010932782456697918, "grad_norm": 17818.533203125, "learning_rate": 5.329949238578681e-06, "loss": 5.9221, "step": 43 }, { "epoch": 0.011187033211504847, "grad_norm": 17642.869140625, "learning_rate": 5.456852791878172e-06, "loss": 5.9297, "step": 44 }, { "epoch": 0.011441283966311775, "grad_norm": 17480.1171875, "learning_rate": 5.583756345177665e-06, "loss": 5.9168, "step": 45 }, { "epoch": 0.011695534721118703, "grad_norm": 17172.3515625, "learning_rate": 5.710659898477158e-06, "loss": 5.913, "step": 46 }, { "epoch": 0.011949785475925631, "grad_norm": 17409.3046875, "learning_rate": 5.83756345177665e-06, "loss": 5.9063, "step": 47 }, { "epoch": 0.01220403623073256, "grad_norm": 17079.10546875, "learning_rate": 5.964467005076142e-06, "loss": 5.9117, "step": 48 }, { "epoch": 0.012458286985539488, "grad_norm": 17173.505859375, "learning_rate": 6.091370558375635e-06, "loss": 5.8861, "step": 49 }, { "epoch": 0.012712537740346417, "grad_norm": 16594.908203125, "learning_rate": 6.218274111675127e-06, "loss": 5.9148, "step": 50 }, { "epoch": 0.012966788495153345, "grad_norm": 16416.0859375, "learning_rate": 6.345177664974619e-06, "loss": 5.9133, "step": 51 }, { "epoch": 0.013221039249960273, "grad_norm": 16740.3359375, "learning_rate": 6.472081218274112e-06, "loss": 5.9171, "step": 52 }, { "epoch": 0.013475290004767201, "grad_norm": 16300.2470703125, "learning_rate": 6.5989847715736045e-06, "loss": 5.9075, "step": 53 }, { "epoch": 0.01372954075957413, "grad_norm": 15715.08984375, "learning_rate": 6.725888324873096e-06, "loss": 5.9175, "step": 54 }, { "epoch": 0.013983791514381057, "grad_norm": 14736.154296875, "learning_rate": 6.852791878172589e-06, "loss": 5.9288, "step": 55 }, { "epoch": 0.014238042269187987, "grad_norm": 15556.548828125, "learning_rate": 6.9796954314720814e-06, "loss": 5.9124, "step": 56 }, { "epoch": 0.014492293023994915, "grad_norm": 13768.126953125, "learning_rate": 7.106598984771575e-06, "loss": 5.9091, "step": 57 }, { "epoch": 0.014746543778801843, "grad_norm": 14840.53515625, "learning_rate": 7.233502538071066e-06, "loss": 5.9124, "step": 58 }, { "epoch": 0.015000794533608771, "grad_norm": 15261.126953125, "learning_rate": 7.360406091370558e-06, "loss": 5.9162, "step": 59 }, { "epoch": 0.0152550452884157, "grad_norm": 14289.3095703125, "learning_rate": 7.4873096446700515e-06, "loss": 5.9206, "step": 60 }, { "epoch": 0.01550929604322263, "grad_norm": 14206.84375, "learning_rate": 7.614213197969544e-06, "loss": 5.9084, "step": 61 }, { "epoch": 0.015763546798029555, "grad_norm": 13316.6611328125, "learning_rate": 7.741116751269035e-06, "loss": 5.9273, "step": 62 }, { "epoch": 0.016017797552836485, "grad_norm": 13425.056640625, "learning_rate": 7.868020304568528e-06, "loss": 5.9029, "step": 63 }, { "epoch": 0.016272048307643415, "grad_norm": 13660.4619140625, "learning_rate": 7.994923857868022e-06, "loss": 5.9054, "step": 64 }, { "epoch": 0.01652629906245034, "grad_norm": 13360.3232421875, "learning_rate": 8.121827411167512e-06, "loss": 5.905, "step": 65 }, { "epoch": 0.01678054981725727, "grad_norm": 13411.6171875, "learning_rate": 8.248730964467004e-06, "loss": 5.9125, "step": 66 }, { "epoch": 0.017034800572064197, "grad_norm": 13302.5546875, "learning_rate": 8.375634517766498e-06, "loss": 5.9007, "step": 67 }, { "epoch": 0.017289051326871127, "grad_norm": 13521.859375, "learning_rate": 8.50253807106599e-06, "loss": 5.916, "step": 68 }, { "epoch": 0.017543302081678053, "grad_norm": 13756.3876953125, "learning_rate": 8.629441624365483e-06, "loss": 5.8867, "step": 69 }, { "epoch": 0.017797552836484983, "grad_norm": 13790.0654296875, "learning_rate": 8.756345177664975e-06, "loss": 5.9113, "step": 70 }, { "epoch": 0.018051803591291913, "grad_norm": 13175.75390625, "learning_rate": 8.883248730964468e-06, "loss": 5.8928, "step": 71 }, { "epoch": 0.01830605434609884, "grad_norm": 13981.7958984375, "learning_rate": 9.01015228426396e-06, "loss": 5.904, "step": 72 }, { "epoch": 0.01856030510090577, "grad_norm": 13376.328125, "learning_rate": 9.137055837563452e-06, "loss": 5.9054, "step": 73 }, { "epoch": 0.018814555855712695, "grad_norm": 13306.1318359375, "learning_rate": 9.263959390862944e-06, "loss": 5.9076, "step": 74 }, { "epoch": 0.019068806610519625, "grad_norm": 13207.986328125, "learning_rate": 9.390862944162437e-06, "loss": 5.9027, "step": 75 }, { "epoch": 0.019323057365326555, "grad_norm": 13185.794921875, "learning_rate": 9.517766497461929e-06, "loss": 5.9094, "step": 76 }, { "epoch": 0.01957730812013348, "grad_norm": 13483.1728515625, "learning_rate": 9.644670050761421e-06, "loss": 5.925, "step": 77 }, { "epoch": 0.01983155887494041, "grad_norm": 13370.5615234375, "learning_rate": 9.771573604060914e-06, "loss": 5.9001, "step": 78 }, { "epoch": 0.020085809629747337, "grad_norm": 13176.5107421875, "learning_rate": 9.898477157360408e-06, "loss": 5.9185, "step": 79 }, { "epoch": 0.020340060384554267, "grad_norm": 13421.5244140625, "learning_rate": 1.0025380710659898e-05, "loss": 5.9031, "step": 80 }, { "epoch": 0.020594311139361193, "grad_norm": 13590.021484375, "learning_rate": 1.015228426395939e-05, "loss": 5.9013, "step": 81 }, { "epoch": 0.020848561894168123, "grad_norm": 13200.2705078125, "learning_rate": 1.0279187817258885e-05, "loss": 5.897, "step": 82 }, { "epoch": 0.021102812648975053, "grad_norm": 13252.2431640625, "learning_rate": 1.0406091370558377e-05, "loss": 5.8837, "step": 83 }, { "epoch": 0.02135706340378198, "grad_norm": 13280.3037109375, "learning_rate": 1.0532994923857867e-05, "loss": 5.9044, "step": 84 }, { "epoch": 0.02161131415858891, "grad_norm": 13695.7080078125, "learning_rate": 1.0659898477157361e-05, "loss": 5.9125, "step": 85 }, { "epoch": 0.021865564913395835, "grad_norm": 13352.21484375, "learning_rate": 1.0786802030456854e-05, "loss": 5.8973, "step": 86 }, { "epoch": 0.022119815668202765, "grad_norm": 13334.6044921875, "learning_rate": 1.0913705583756344e-05, "loss": 5.9082, "step": 87 }, { "epoch": 0.022374066423009695, "grad_norm": 13765.154296875, "learning_rate": 1.1040609137055838e-05, "loss": 5.9042, "step": 88 }, { "epoch": 0.02262831717781662, "grad_norm": 13421.01171875, "learning_rate": 1.116751269035533e-05, "loss": 5.8932, "step": 89 }, { "epoch": 0.02288256793262355, "grad_norm": 13485.15234375, "learning_rate": 1.1294416243654823e-05, "loss": 5.9081, "step": 90 }, { "epoch": 0.023136818687430477, "grad_norm": 13717.1845703125, "learning_rate": 1.1421319796954315e-05, "loss": 5.9179, "step": 91 }, { "epoch": 0.023391069442237407, "grad_norm": 13565.94140625, "learning_rate": 1.1548223350253808e-05, "loss": 5.9076, "step": 92 }, { "epoch": 0.023645320197044337, "grad_norm": 13472.171875, "learning_rate": 1.16751269035533e-05, "loss": 5.8887, "step": 93 }, { "epoch": 0.023899570951851263, "grad_norm": 13683.9931640625, "learning_rate": 1.1802030456852794e-05, "loss": 5.9068, "step": 94 }, { "epoch": 0.024153821706658193, "grad_norm": 13872.4140625, "learning_rate": 1.1928934010152284e-05, "loss": 5.8972, "step": 95 }, { "epoch": 0.02440807246146512, "grad_norm": 13557.8515625, "learning_rate": 1.2055837563451777e-05, "loss": 5.8923, "step": 96 }, { "epoch": 0.02466232321627205, "grad_norm": 13712.21484375, "learning_rate": 1.218274111675127e-05, "loss": 5.9164, "step": 97 }, { "epoch": 0.024916573971078975, "grad_norm": 13596.0732421875, "learning_rate": 1.2309644670050761e-05, "loss": 5.8984, "step": 98 }, { "epoch": 0.025170824725885905, "grad_norm": 13513.576171875, "learning_rate": 1.2436548223350254e-05, "loss": 5.887, "step": 99 }, { "epoch": 0.025425075480692835, "grad_norm": 13328.6044921875, "learning_rate": 1.2563451776649746e-05, "loss": 5.9013, "step": 100 }, { "epoch": 0.025425075480692835, "eval_loss": 11.887640953063965, "eval_runtime": 697.8119, "eval_samples_per_second": 151.883, "eval_steps_per_second": 9.494, "step": 100 }, { "epoch": 0.02567932623549976, "grad_norm": 13536.2607421875, "learning_rate": 1.2690355329949238e-05, "loss": 5.8976, "step": 101 }, { "epoch": 0.02593357699030669, "grad_norm": 13625.7626953125, "learning_rate": 1.281725888324873e-05, "loss": 5.8834, "step": 102 }, { "epoch": 0.026187827745113617, "grad_norm": 13483.0322265625, "learning_rate": 1.2944162436548224e-05, "loss": 5.8999, "step": 103 }, { "epoch": 0.026442078499920547, "grad_norm": 13512.1435546875, "learning_rate": 1.3071065989847717e-05, "loss": 5.8884, "step": 104 }, { "epoch": 0.026696329254727477, "grad_norm": 13751.4580078125, "learning_rate": 1.3197969543147209e-05, "loss": 5.8908, "step": 105 }, { "epoch": 0.026950580009534403, "grad_norm": 13559.2783203125, "learning_rate": 1.3324873096446703e-05, "loss": 5.9082, "step": 106 }, { "epoch": 0.027204830764341333, "grad_norm": 13575.0419921875, "learning_rate": 1.3451776649746192e-05, "loss": 5.8927, "step": 107 }, { "epoch": 0.02745908151914826, "grad_norm": 13510.3466796875, "learning_rate": 1.3578680203045684e-05, "loss": 5.8836, "step": 108 }, { "epoch": 0.02771333227395519, "grad_norm": 13477.1591796875, "learning_rate": 1.3705583756345178e-05, "loss": 5.8889, "step": 109 }, { "epoch": 0.027967583028762115, "grad_norm": 13679.9140625, "learning_rate": 1.383248730964467e-05, "loss": 5.9003, "step": 110 }, { "epoch": 0.028221833783569045, "grad_norm": 13792.7548828125, "learning_rate": 1.3959390862944163e-05, "loss": 5.8894, "step": 111 }, { "epoch": 0.028476084538375974, "grad_norm": 13725.0146484375, "learning_rate": 1.4086294416243657e-05, "loss": 5.8858, "step": 112 }, { "epoch": 0.0287303352931829, "grad_norm": 13686.3720703125, "learning_rate": 1.421319796954315e-05, "loss": 5.8788, "step": 113 }, { "epoch": 0.02898458604798983, "grad_norm": 13671.60546875, "learning_rate": 1.4340101522842641e-05, "loss": 5.8878, "step": 114 }, { "epoch": 0.029238836802796757, "grad_norm": 13452.1201171875, "learning_rate": 1.4467005076142132e-05, "loss": 5.8957, "step": 115 }, { "epoch": 0.029493087557603687, "grad_norm": 13602.41015625, "learning_rate": 1.4593908629441624e-05, "loss": 5.8766, "step": 116 }, { "epoch": 0.029747338312410616, "grad_norm": 13690.53125, "learning_rate": 1.4720812182741117e-05, "loss": 5.8951, "step": 117 }, { "epoch": 0.030001589067217543, "grad_norm": 13700.568359375, "learning_rate": 1.484771573604061e-05, "loss": 5.8945, "step": 118 }, { "epoch": 0.030255839822024472, "grad_norm": 13505.111328125, "learning_rate": 1.4974619289340103e-05, "loss": 5.8965, "step": 119 }, { "epoch": 0.0305100905768314, "grad_norm": 13736.1865234375, "learning_rate": 1.5101522842639595e-05, "loss": 5.8742, "step": 120 }, { "epoch": 0.03076434133163833, "grad_norm": 13643.3935546875, "learning_rate": 1.5228426395939088e-05, "loss": 5.9014, "step": 121 }, { "epoch": 0.03101859208644526, "grad_norm": 13563.412109375, "learning_rate": 1.535532994923858e-05, "loss": 5.8933, "step": 122 }, { "epoch": 0.03127284284125219, "grad_norm": 13571.185546875, "learning_rate": 1.548223350253807e-05, "loss": 5.8736, "step": 123 }, { "epoch": 0.03152709359605911, "grad_norm": 13560.033203125, "learning_rate": 1.5609137055837564e-05, "loss": 5.9019, "step": 124 }, { "epoch": 0.03178134435086604, "grad_norm": 13774.8857421875, "learning_rate": 1.5736040609137055e-05, "loss": 5.9036, "step": 125 }, { "epoch": 0.03203559510567297, "grad_norm": 13625.09765625, "learning_rate": 1.586294416243655e-05, "loss": 5.8879, "step": 126 }, { "epoch": 0.0322898458604799, "grad_norm": 13534.6162109375, "learning_rate": 1.5989847715736043e-05, "loss": 5.8661, "step": 127 }, { "epoch": 0.03254409661528683, "grad_norm": 13809.35546875, "learning_rate": 1.6116751269035534e-05, "loss": 5.895, "step": 128 }, { "epoch": 0.03279834737009375, "grad_norm": 13707.41796875, "learning_rate": 1.6243654822335024e-05, "loss": 5.8883, "step": 129 }, { "epoch": 0.03305259812490068, "grad_norm": 13843.4951171875, "learning_rate": 1.6370558375634518e-05, "loss": 5.9049, "step": 130 }, { "epoch": 0.03330684887970761, "grad_norm": 13867.34765625, "learning_rate": 1.649746192893401e-05, "loss": 5.8972, "step": 131 }, { "epoch": 0.03356109963451454, "grad_norm": 13773.0732421875, "learning_rate": 1.6624365482233503e-05, "loss": 5.8697, "step": 132 }, { "epoch": 0.033815350389321465, "grad_norm": 13810.62890625, "learning_rate": 1.6751269035532997e-05, "loss": 5.8848, "step": 133 }, { "epoch": 0.034069601144128395, "grad_norm": 13859.7099609375, "learning_rate": 1.6878172588832487e-05, "loss": 5.8775, "step": 134 }, { "epoch": 0.034323851898935324, "grad_norm": 13713.4169921875, "learning_rate": 1.700507614213198e-05, "loss": 5.8831, "step": 135 }, { "epoch": 0.034578102653742254, "grad_norm": 13702.9951171875, "learning_rate": 1.7131979695431472e-05, "loss": 5.8658, "step": 136 }, { "epoch": 0.034832353408549184, "grad_norm": 13757.0, "learning_rate": 1.7258883248730966e-05, "loss": 5.8831, "step": 137 }, { "epoch": 0.03508660416335611, "grad_norm": 14029.185546875, "learning_rate": 1.7385786802030457e-05, "loss": 5.8856, "step": 138 }, { "epoch": 0.03534085491816304, "grad_norm": 13854.5400390625, "learning_rate": 1.751269035532995e-05, "loss": 5.903, "step": 139 }, { "epoch": 0.035595105672969966, "grad_norm": 13903.626953125, "learning_rate": 1.763959390862944e-05, "loss": 5.8816, "step": 140 }, { "epoch": 0.035849356427776896, "grad_norm": 13783.7431640625, "learning_rate": 1.7766497461928935e-05, "loss": 5.8934, "step": 141 }, { "epoch": 0.036103607182583826, "grad_norm": 13807.6279296875, "learning_rate": 1.789340101522843e-05, "loss": 5.8794, "step": 142 }, { "epoch": 0.03635785793739075, "grad_norm": 13903.8896484375, "learning_rate": 1.802030456852792e-05, "loss": 5.8909, "step": 143 }, { "epoch": 0.03661210869219768, "grad_norm": 13844.205078125, "learning_rate": 1.814720812182741e-05, "loss": 5.8787, "step": 144 }, { "epoch": 0.03686635944700461, "grad_norm": 13829.861328125, "learning_rate": 1.8274111675126904e-05, "loss": 5.8848, "step": 145 }, { "epoch": 0.03712061020181154, "grad_norm": 13927.4833984375, "learning_rate": 1.8401015228426395e-05, "loss": 5.8872, "step": 146 }, { "epoch": 0.03737486095661847, "grad_norm": 14146.7177734375, "learning_rate": 1.852791878172589e-05, "loss": 5.9012, "step": 147 }, { "epoch": 0.03762911171142539, "grad_norm": 13803.76171875, "learning_rate": 1.8654822335025383e-05, "loss": 5.8809, "step": 148 }, { "epoch": 0.03788336246623232, "grad_norm": 13910.3681640625, "learning_rate": 1.8781725888324874e-05, "loss": 5.8893, "step": 149 }, { "epoch": 0.03813761322103925, "grad_norm": 14064.740234375, "learning_rate": 1.8908629441624368e-05, "loss": 5.8773, "step": 150 }, { "epoch": 0.03839186397584618, "grad_norm": 13910.2197265625, "learning_rate": 1.9035532994923858e-05, "loss": 5.8726, "step": 151 }, { "epoch": 0.03864611473065311, "grad_norm": 13988.330078125, "learning_rate": 1.916243654822335e-05, "loss": 5.8794, "step": 152 }, { "epoch": 0.03890036548546003, "grad_norm": 13807.6494140625, "learning_rate": 1.9289340101522843e-05, "loss": 5.8654, "step": 153 }, { "epoch": 0.03915461624026696, "grad_norm": 14099.2060546875, "learning_rate": 1.9416243654822337e-05, "loss": 5.8787, "step": 154 }, { "epoch": 0.03940886699507389, "grad_norm": 14038.322265625, "learning_rate": 1.9543147208121827e-05, "loss": 5.8783, "step": 155 }, { "epoch": 0.03966311774988082, "grad_norm": 13967.1875, "learning_rate": 1.967005076142132e-05, "loss": 5.8966, "step": 156 }, { "epoch": 0.03991736850468775, "grad_norm": 13959.806640625, "learning_rate": 1.9796954314720815e-05, "loss": 5.86, "step": 157 }, { "epoch": 0.040171619259494674, "grad_norm": 14113.689453125, "learning_rate": 1.9923857868020303e-05, "loss": 5.8542, "step": 158 }, { "epoch": 0.040425870014301604, "grad_norm": 14134.0732421875, "learning_rate": 2.0050761421319797e-05, "loss": 5.8784, "step": 159 }, { "epoch": 0.040680120769108534, "grad_norm": 13855.0908203125, "learning_rate": 2.017766497461929e-05, "loss": 5.8766, "step": 160 }, { "epoch": 0.040934371523915464, "grad_norm": 13932.7353515625, "learning_rate": 2.030456852791878e-05, "loss": 5.8655, "step": 161 }, { "epoch": 0.04118862227872239, "grad_norm": 14085.6005859375, "learning_rate": 2.0431472081218275e-05, "loss": 5.8788, "step": 162 }, { "epoch": 0.041442873033529316, "grad_norm": 13949.18359375, "learning_rate": 2.055837563451777e-05, "loss": 5.8905, "step": 163 }, { "epoch": 0.041697123788336246, "grad_norm": 14003.30859375, "learning_rate": 2.068527918781726e-05, "loss": 5.8823, "step": 164 }, { "epoch": 0.041951374543143176, "grad_norm": 14008.98046875, "learning_rate": 2.0812182741116754e-05, "loss": 5.8612, "step": 165 }, { "epoch": 0.042205625297950106, "grad_norm": 14037.0654296875, "learning_rate": 2.0939086294416244e-05, "loss": 5.8743, "step": 166 }, { "epoch": 0.04245987605275703, "grad_norm": 13932.53125, "learning_rate": 2.1065989847715735e-05, "loss": 5.8667, "step": 167 }, { "epoch": 0.04271412680756396, "grad_norm": 13870.0458984375, "learning_rate": 2.119289340101523e-05, "loss": 5.8795, "step": 168 }, { "epoch": 0.04296837756237089, "grad_norm": 13954.0673828125, "learning_rate": 2.1319796954314723e-05, "loss": 5.8477, "step": 169 }, { "epoch": 0.04322262831717782, "grad_norm": 13942.2314453125, "learning_rate": 2.1446700507614213e-05, "loss": 5.881, "step": 170 }, { "epoch": 0.04347687907198475, "grad_norm": 13986.8291015625, "learning_rate": 2.1573604060913707e-05, "loss": 5.8705, "step": 171 }, { "epoch": 0.04373112982679167, "grad_norm": 14061.9111328125, "learning_rate": 2.17005076142132e-05, "loss": 5.865, "step": 172 }, { "epoch": 0.0439853805815986, "grad_norm": 14076.083984375, "learning_rate": 2.182741116751269e-05, "loss": 5.8634, "step": 173 }, { "epoch": 0.04423963133640553, "grad_norm": 14042.853515625, "learning_rate": 2.1954314720812183e-05, "loss": 5.8763, "step": 174 }, { "epoch": 0.04449388209121246, "grad_norm": 13883.662109375, "learning_rate": 2.2081218274111677e-05, "loss": 5.8849, "step": 175 }, { "epoch": 0.04474813284601939, "grad_norm": 13985.61328125, "learning_rate": 2.2208121827411167e-05, "loss": 5.8639, "step": 176 }, { "epoch": 0.04500238360082631, "grad_norm": 14063.1689453125, "learning_rate": 2.233502538071066e-05, "loss": 5.8698, "step": 177 }, { "epoch": 0.04525663435563324, "grad_norm": 14112.0966796875, "learning_rate": 2.2461928934010155e-05, "loss": 5.8915, "step": 178 }, { "epoch": 0.04551088511044017, "grad_norm": 14043.98828125, "learning_rate": 2.2588832487309646e-05, "loss": 5.8587, "step": 179 }, { "epoch": 0.0457651358652471, "grad_norm": 14180.72265625, "learning_rate": 2.2715736040609136e-05, "loss": 5.8631, "step": 180 }, { "epoch": 0.04601938662005403, "grad_norm": 14142.736328125, "learning_rate": 2.284263959390863e-05, "loss": 5.8681, "step": 181 }, { "epoch": 0.046273637374860954, "grad_norm": 14137.55859375, "learning_rate": 2.296954314720812e-05, "loss": 5.8552, "step": 182 }, { "epoch": 0.046527888129667884, "grad_norm": 14159.0224609375, "learning_rate": 2.3096446700507615e-05, "loss": 5.8597, "step": 183 }, { "epoch": 0.046782138884474814, "grad_norm": 14298.9453125, "learning_rate": 2.322335025380711e-05, "loss": 5.8619, "step": 184 }, { "epoch": 0.047036389639281743, "grad_norm": 14007.9853515625, "learning_rate": 2.33502538071066e-05, "loss": 5.8654, "step": 185 }, { "epoch": 0.04729064039408867, "grad_norm": 14221.35546875, "learning_rate": 2.3477157360406094e-05, "loss": 5.8506, "step": 186 }, { "epoch": 0.047544891148895596, "grad_norm": 13967.9560546875, "learning_rate": 2.3604060913705588e-05, "loss": 5.8619, "step": 187 }, { "epoch": 0.047799141903702526, "grad_norm": 14121.439453125, "learning_rate": 2.3730964467005075e-05, "loss": 5.8651, "step": 188 }, { "epoch": 0.048053392658509456, "grad_norm": 13945.2744140625, "learning_rate": 2.385786802030457e-05, "loss": 5.874, "step": 189 }, { "epoch": 0.048307643413316385, "grad_norm": 13990.255859375, "learning_rate": 2.3984771573604063e-05, "loss": 5.8624, "step": 190 }, { "epoch": 0.04856189416812331, "grad_norm": 13951.9560546875, "learning_rate": 2.4111675126903553e-05, "loss": 5.8442, "step": 191 }, { "epoch": 0.04881614492293024, "grad_norm": 14099.447265625, "learning_rate": 2.4238578680203047e-05, "loss": 5.8675, "step": 192 }, { "epoch": 0.04907039567773717, "grad_norm": 13915.048828125, "learning_rate": 2.436548223350254e-05, "loss": 5.8706, "step": 193 }, { "epoch": 0.0493246464325441, "grad_norm": 14257.0009765625, "learning_rate": 2.4492385786802032e-05, "loss": 5.8689, "step": 194 }, { "epoch": 0.04957889718735103, "grad_norm": 13921.3193359375, "learning_rate": 2.4619289340101523e-05, "loss": 5.8521, "step": 195 }, { "epoch": 0.04983314794215795, "grad_norm": 13949.6025390625, "learning_rate": 2.4746192893401017e-05, "loss": 5.8449, "step": 196 }, { "epoch": 0.05008739869696488, "grad_norm": 14410.4150390625, "learning_rate": 2.4873096446700507e-05, "loss": 5.8856, "step": 197 }, { "epoch": 0.05034164945177181, "grad_norm": 14101.67578125, "learning_rate": 2.5e-05, "loss": 5.8372, "step": 198 }, { "epoch": 0.05059590020657874, "grad_norm": 14241.529296875, "learning_rate": 2.5126903553299492e-05, "loss": 5.8823, "step": 199 }, { "epoch": 0.05085015096138567, "grad_norm": 14324.90234375, "learning_rate": 2.5253807106598986e-05, "loss": 5.8547, "step": 200 }, { "epoch": 0.05085015096138567, "eval_loss": 11.810378074645996, "eval_runtime": 695.8663, "eval_samples_per_second": 152.308, "eval_steps_per_second": 9.521, "step": 200 }, { "epoch": 0.05110440171619259, "grad_norm": 13824.125, "learning_rate": 2.5380710659898476e-05, "loss": 5.8447, "step": 201 }, { "epoch": 0.05135865247099952, "grad_norm": 14183.697265625, "learning_rate": 2.5507614213197974e-05, "loss": 5.8576, "step": 202 }, { "epoch": 0.05161290322580645, "grad_norm": 13871.00390625, "learning_rate": 2.563451776649746e-05, "loss": 5.8533, "step": 203 }, { "epoch": 0.05186715398061338, "grad_norm": 14473.0146484375, "learning_rate": 2.576142131979696e-05, "loss": 5.8639, "step": 204 }, { "epoch": 0.05212140473542031, "grad_norm": 14182.2587890625, "learning_rate": 2.588832487309645e-05, "loss": 5.8501, "step": 205 }, { "epoch": 0.052375655490227234, "grad_norm": 14335.306640625, "learning_rate": 2.6015228426395936e-05, "loss": 5.8581, "step": 206 }, { "epoch": 0.052629906245034164, "grad_norm": 14553.783203125, "learning_rate": 2.6142131979695434e-05, "loss": 5.8572, "step": 207 }, { "epoch": 0.052884156999841093, "grad_norm": 14136.1455078125, "learning_rate": 2.6269035532994924e-05, "loss": 5.8586, "step": 208 }, { "epoch": 0.05313840775464802, "grad_norm": 14284.5458984375, "learning_rate": 2.6395939086294418e-05, "loss": 5.8548, "step": 209 }, { "epoch": 0.05339265850945495, "grad_norm": 14215.7578125, "learning_rate": 2.652284263959391e-05, "loss": 5.8581, "step": 210 }, { "epoch": 0.053646909264261876, "grad_norm": 14402.2216796875, "learning_rate": 2.6649746192893406e-05, "loss": 5.8437, "step": 211 }, { "epoch": 0.053901160019068806, "grad_norm": 14041.2705078125, "learning_rate": 2.6776649746192893e-05, "loss": 5.8627, "step": 212 }, { "epoch": 0.054155410773875735, "grad_norm": 14536.2783203125, "learning_rate": 2.6903553299492384e-05, "loss": 5.8507, "step": 213 }, { "epoch": 0.054409661528682665, "grad_norm": 14204.775390625, "learning_rate": 2.703045685279188e-05, "loss": 5.8558, "step": 214 }, { "epoch": 0.054663912283489595, "grad_norm": 14513.5751953125, "learning_rate": 2.715736040609137e-05, "loss": 5.8297, "step": 215 }, { "epoch": 0.05491816303829652, "grad_norm": 14284.46875, "learning_rate": 2.7284263959390866e-05, "loss": 5.8549, "step": 216 }, { "epoch": 0.05517241379310345, "grad_norm": 14067.05078125, "learning_rate": 2.7411167512690357e-05, "loss": 5.8432, "step": 217 }, { "epoch": 0.05542666454791038, "grad_norm": 14304.111328125, "learning_rate": 2.753807106598985e-05, "loss": 5.8596, "step": 218 }, { "epoch": 0.05568091530271731, "grad_norm": 14343.162109375, "learning_rate": 2.766497461928934e-05, "loss": 5.834, "step": 219 }, { "epoch": 0.05593516605752423, "grad_norm": 14471.5126953125, "learning_rate": 2.7791878172588832e-05, "loss": 5.8478, "step": 220 }, { "epoch": 0.05618941681233116, "grad_norm": 14145.4619140625, "learning_rate": 2.7918781725888326e-05, "loss": 5.8566, "step": 221 }, { "epoch": 0.05644366756713809, "grad_norm": 14756.76171875, "learning_rate": 2.8045685279187816e-05, "loss": 5.847, "step": 222 }, { "epoch": 0.05669791832194502, "grad_norm": 14184.3203125, "learning_rate": 2.8172588832487314e-05, "loss": 5.8627, "step": 223 }, { "epoch": 0.05695216907675195, "grad_norm": 14302.7763671875, "learning_rate": 2.82994923857868e-05, "loss": 5.8573, "step": 224 }, { "epoch": 0.05720641983155887, "grad_norm": 14140.478515625, "learning_rate": 2.84263959390863e-05, "loss": 5.8599, "step": 225 }, { "epoch": 0.0574606705863658, "grad_norm": 14565.8662109375, "learning_rate": 2.855329949238579e-05, "loss": 5.8474, "step": 226 }, { "epoch": 0.05771492134117273, "grad_norm": 14593.7666015625, "learning_rate": 2.8680203045685283e-05, "loss": 5.8386, "step": 227 }, { "epoch": 0.05796917209597966, "grad_norm": 14264.8037109375, "learning_rate": 2.8807106598984774e-05, "loss": 5.8382, "step": 228 }, { "epoch": 0.05822342285078659, "grad_norm": 14515.216796875, "learning_rate": 2.8934010152284264e-05, "loss": 5.8271, "step": 229 }, { "epoch": 0.058477673605593514, "grad_norm": 14455.34375, "learning_rate": 2.9060913705583758e-05, "loss": 5.8454, "step": 230 }, { "epoch": 0.05873192436040044, "grad_norm": 14383.3017578125, "learning_rate": 2.918781725888325e-05, "loss": 5.8468, "step": 231 }, { "epoch": 0.05898617511520737, "grad_norm": 14204.083984375, "learning_rate": 2.9314720812182743e-05, "loss": 5.8546, "step": 232 }, { "epoch": 0.0592404258700143, "grad_norm": 14394.791015625, "learning_rate": 2.9441624365482233e-05, "loss": 5.8401, "step": 233 }, { "epoch": 0.05949467662482123, "grad_norm": 14483.0283203125, "learning_rate": 2.956852791878173e-05, "loss": 5.8406, "step": 234 }, { "epoch": 0.059748927379628156, "grad_norm": 14330.568359375, "learning_rate": 2.969543147208122e-05, "loss": 5.843, "step": 235 }, { "epoch": 0.060003178134435085, "grad_norm": 14366.0263671875, "learning_rate": 2.982233502538071e-05, "loss": 5.8554, "step": 236 }, { "epoch": 0.060257428889242015, "grad_norm": 14203.2294921875, "learning_rate": 2.9949238578680206e-05, "loss": 5.829, "step": 237 }, { "epoch": 0.060511679644048945, "grad_norm": 14442.908203125, "learning_rate": 3.0076142131979696e-05, "loss": 5.8308, "step": 238 }, { "epoch": 0.060765930398855875, "grad_norm": 14376.8310546875, "learning_rate": 3.020304568527919e-05, "loss": 5.8311, "step": 239 }, { "epoch": 0.0610201811536628, "grad_norm": 14524.6455078125, "learning_rate": 3.032994923857868e-05, "loss": 5.8133, "step": 240 }, { "epoch": 0.06127443190846973, "grad_norm": 16003.4541015625, "learning_rate": 3.0456852791878175e-05, "loss": 5.8434, "step": 241 }, { "epoch": 0.06152868266327666, "grad_norm": 16413.31640625, "learning_rate": 3.0583756345177666e-05, "loss": 5.8513, "step": 242 }, { "epoch": 0.06178293341808359, "grad_norm": 15272.3359375, "learning_rate": 3.071065989847716e-05, "loss": 5.8448, "step": 243 }, { "epoch": 0.06203718417289052, "grad_norm": 14435.873046875, "learning_rate": 3.0837563451776654e-05, "loss": 5.8461, "step": 244 }, { "epoch": 0.06229143492769744, "grad_norm": 16821.875, "learning_rate": 3.096446700507614e-05, "loss": 5.8183, "step": 245 }, { "epoch": 0.06254568568250438, "grad_norm": 17576.55078125, "learning_rate": 3.1091370558375635e-05, "loss": 5.8332, "step": 246 }, { "epoch": 0.0627999364373113, "grad_norm": 14018.4365234375, "learning_rate": 3.121827411167513e-05, "loss": 5.8088, "step": 247 }, { "epoch": 0.06305418719211822, "grad_norm": 21211.966796875, "learning_rate": 3.134517766497462e-05, "loss": 5.8292, "step": 248 }, { "epoch": 0.06330843794692516, "grad_norm": 18285.978515625, "learning_rate": 3.147208121827411e-05, "loss": 5.829, "step": 249 }, { "epoch": 0.06356268870173208, "grad_norm": 15779.1826171875, "learning_rate": 3.1598984771573604e-05, "loss": 5.8409, "step": 250 }, { "epoch": 0.06381693945653902, "grad_norm": 29363.07421875, "learning_rate": 3.17258883248731e-05, "loss": 5.8376, "step": 251 }, { "epoch": 0.06407119021134594, "grad_norm": 14751.14453125, "learning_rate": 3.185279187817259e-05, "loss": 5.8504, "step": 252 }, { "epoch": 0.06432544096615286, "grad_norm": 22211.19140625, "learning_rate": 3.1979695431472086e-05, "loss": 5.8253, "step": 253 }, { "epoch": 0.0645796917209598, "grad_norm": 16582.279296875, "learning_rate": 3.210659898477157e-05, "loss": 5.8189, "step": 254 }, { "epoch": 0.06483394247576672, "grad_norm": 28961.861328125, "learning_rate": 3.223350253807107e-05, "loss": 5.8252, "step": 255 }, { "epoch": 0.06508819323057366, "grad_norm": 22928.2421875, "learning_rate": 3.236040609137056e-05, "loss": 5.8319, "step": 256 }, { "epoch": 0.06534244398538058, "grad_norm": 21412.84765625, "learning_rate": 3.248730964467005e-05, "loss": 5.8437, "step": 257 }, { "epoch": 0.0655966947401875, "grad_norm": 21025.75, "learning_rate": 3.261421319796954e-05, "loss": 5.8195, "step": 258 }, { "epoch": 0.06585094549499444, "grad_norm": 14630.908203125, "learning_rate": 3.2741116751269036e-05, "loss": 5.8212, "step": 259 }, { "epoch": 0.06610519624980137, "grad_norm": 25747.43359375, "learning_rate": 3.286802030456853e-05, "loss": 5.8387, "step": 260 }, { "epoch": 0.0663594470046083, "grad_norm": 19133.5546875, "learning_rate": 3.299492385786802e-05, "loss": 5.8295, "step": 261 }, { "epoch": 0.06661369775941522, "grad_norm": 18813.26171875, "learning_rate": 3.312182741116752e-05, "loss": 5.8433, "step": 262 }, { "epoch": 0.06686794851422215, "grad_norm": 19891.6015625, "learning_rate": 3.3248730964467006e-05, "loss": 5.8359, "step": 263 }, { "epoch": 0.06712219926902908, "grad_norm": 14633.3525390625, "learning_rate": 3.33756345177665e-05, "loss": 5.8482, "step": 264 }, { "epoch": 0.06737645002383601, "grad_norm": 22824.740234375, "learning_rate": 3.3502538071065994e-05, "loss": 5.8368, "step": 265 }, { "epoch": 0.06763070077864293, "grad_norm": 17525.8125, "learning_rate": 3.362944162436548e-05, "loss": 5.8235, "step": 266 }, { "epoch": 0.06788495153344987, "grad_norm": 16179.4912109375, "learning_rate": 3.3756345177664975e-05, "loss": 5.8129, "step": 267 }, { "epoch": 0.06813920228825679, "grad_norm": 19221.17578125, "learning_rate": 3.388324873096447e-05, "loss": 5.8252, "step": 268 }, { "epoch": 0.06839345304306373, "grad_norm": 14516.4716796875, "learning_rate": 3.401015228426396e-05, "loss": 5.8114, "step": 269 }, { "epoch": 0.06864770379787065, "grad_norm": 18901.623046875, "learning_rate": 3.413705583756345e-05, "loss": 5.8357, "step": 270 }, { "epoch": 0.06890195455267757, "grad_norm": 17530.751953125, "learning_rate": 3.4263959390862944e-05, "loss": 5.8415, "step": 271 }, { "epoch": 0.06915620530748451, "grad_norm": 16299.2001953125, "learning_rate": 3.439086294416244e-05, "loss": 5.8054, "step": 272 }, { "epoch": 0.06941045606229143, "grad_norm": 18370.28515625, "learning_rate": 3.451776649746193e-05, "loss": 5.8022, "step": 273 }, { "epoch": 0.06966470681709837, "grad_norm": 14381.2001953125, "learning_rate": 3.4644670050761426e-05, "loss": 5.8151, "step": 274 }, { "epoch": 0.06991895757190529, "grad_norm": 18153.146484375, "learning_rate": 3.477157360406091e-05, "loss": 5.8362, "step": 275 }, { "epoch": 0.07017320832671221, "grad_norm": 15234.4326171875, "learning_rate": 3.489847715736041e-05, "loss": 5.8043, "step": 276 }, { "epoch": 0.07042745908151915, "grad_norm": 17595.009765625, "learning_rate": 3.50253807106599e-05, "loss": 5.8226, "step": 277 }, { "epoch": 0.07068170983632607, "grad_norm": 17185.26171875, "learning_rate": 3.5152284263959395e-05, "loss": 5.8155, "step": 278 }, { "epoch": 0.07093596059113301, "grad_norm": 16285.2021484375, "learning_rate": 3.527918781725888e-05, "loss": 5.8099, "step": 279 }, { "epoch": 0.07119021134593993, "grad_norm": 16419.501953125, "learning_rate": 3.5406091370558376e-05, "loss": 5.8287, "step": 280 }, { "epoch": 0.07144446210074686, "grad_norm": 15122.501953125, "learning_rate": 3.553299492385787e-05, "loss": 5.8258, "step": 281 }, { "epoch": 0.07169871285555379, "grad_norm": 16796.24609375, "learning_rate": 3.565989847715736e-05, "loss": 5.8221, "step": 282 }, { "epoch": 0.07195296361036072, "grad_norm": 14491.029296875, "learning_rate": 3.578680203045686e-05, "loss": 5.815, "step": 283 }, { "epoch": 0.07220721436516765, "grad_norm": 15623.6533203125, "learning_rate": 3.5913705583756346e-05, "loss": 5.8259, "step": 284 }, { "epoch": 0.07246146511997457, "grad_norm": 14359.7822265625, "learning_rate": 3.604060913705584e-05, "loss": 5.8075, "step": 285 }, { "epoch": 0.0727157158747815, "grad_norm": 15124.78515625, "learning_rate": 3.6167512690355334e-05, "loss": 5.8165, "step": 286 }, { "epoch": 0.07296996662958843, "grad_norm": 14818.9287109375, "learning_rate": 3.629441624365482e-05, "loss": 5.8249, "step": 287 }, { "epoch": 0.07322421738439536, "grad_norm": 16652.857421875, "learning_rate": 3.6421319796954315e-05, "loss": 5.818, "step": 288 }, { "epoch": 0.0734784681392023, "grad_norm": 14468.9892578125, "learning_rate": 3.654822335025381e-05, "loss": 5.8158, "step": 289 }, { "epoch": 0.07373271889400922, "grad_norm": 16769.716796875, "learning_rate": 3.66751269035533e-05, "loss": 5.8083, "step": 290 }, { "epoch": 0.07398696964881614, "grad_norm": 14478.5517578125, "learning_rate": 3.680203045685279e-05, "loss": 5.8264, "step": 291 }, { "epoch": 0.07424122040362308, "grad_norm": 16448.939453125, "learning_rate": 3.692893401015229e-05, "loss": 5.82, "step": 292 }, { "epoch": 0.07449547115843, "grad_norm": 14361.9794921875, "learning_rate": 3.705583756345178e-05, "loss": 5.8203, "step": 293 }, { "epoch": 0.07474972191323694, "grad_norm": 16790.470703125, "learning_rate": 3.7182741116751265e-05, "loss": 5.8109, "step": 294 }, { "epoch": 0.07500397266804386, "grad_norm": 14581.0, "learning_rate": 3.7309644670050766e-05, "loss": 5.8106, "step": 295 }, { "epoch": 0.07525822342285078, "grad_norm": 16166.630859375, "learning_rate": 3.743654822335025e-05, "loss": 5.822, "step": 296 }, { "epoch": 0.07551247417765772, "grad_norm": 14668.458984375, "learning_rate": 3.756345177664975e-05, "loss": 5.8238, "step": 297 }, { "epoch": 0.07576672493246464, "grad_norm": 15926.828125, "learning_rate": 3.769035532994924e-05, "loss": 5.8182, "step": 298 }, { "epoch": 0.07602097568727158, "grad_norm": 16209.4697265625, "learning_rate": 3.7817258883248735e-05, "loss": 5.8131, "step": 299 }, { "epoch": 0.0762752264420785, "grad_norm": 14617.1806640625, "learning_rate": 3.794416243654822e-05, "loss": 5.7984, "step": 300 }, { "epoch": 0.0762752264420785, "eval_loss": 11.709574699401855, "eval_runtime": 696.5225, "eval_samples_per_second": 152.164, "eval_steps_per_second": 9.512, "step": 300 }, { "epoch": 0.07652947719688542, "grad_norm": 16631.537109375, "learning_rate": 3.8071065989847716e-05, "loss": 5.8042, "step": 301 }, { "epoch": 0.07678372795169236, "grad_norm": 14410.109375, "learning_rate": 3.819796954314721e-05, "loss": 5.8009, "step": 302 }, { "epoch": 0.07703797870649928, "grad_norm": 16206.8203125, "learning_rate": 3.83248730964467e-05, "loss": 5.7978, "step": 303 }, { "epoch": 0.07729222946130622, "grad_norm": 15047.525390625, "learning_rate": 3.84517766497462e-05, "loss": 5.8107, "step": 304 }, { "epoch": 0.07754648021611314, "grad_norm": 14869.10546875, "learning_rate": 3.8578680203045685e-05, "loss": 5.8066, "step": 305 }, { "epoch": 0.07780073097092007, "grad_norm": 15287.2099609375, "learning_rate": 3.870558375634518e-05, "loss": 5.7939, "step": 306 }, { "epoch": 0.078054981725727, "grad_norm": 14399.8154296875, "learning_rate": 3.8832487309644673e-05, "loss": 5.7954, "step": 307 }, { "epoch": 0.07830923248053392, "grad_norm": 14655.740234375, "learning_rate": 3.895939086294416e-05, "loss": 5.7993, "step": 308 }, { "epoch": 0.07856348323534086, "grad_norm": 15159.48828125, "learning_rate": 3.9086294416243655e-05, "loss": 5.8116, "step": 309 }, { "epoch": 0.07881773399014778, "grad_norm": 15555.21875, "learning_rate": 3.921319796954315e-05, "loss": 5.8081, "step": 310 }, { "epoch": 0.07907198474495471, "grad_norm": 15971.390625, "learning_rate": 3.934010152284264e-05, "loss": 5.8004, "step": 311 }, { "epoch": 0.07932623549976164, "grad_norm": 14717.396484375, "learning_rate": 3.946700507614213e-05, "loss": 5.8227, "step": 312 }, { "epoch": 0.07958048625456857, "grad_norm": 15724.9521484375, "learning_rate": 3.959390862944163e-05, "loss": 5.7976, "step": 313 }, { "epoch": 0.0798347370093755, "grad_norm": 15999.9755859375, "learning_rate": 3.972081218274112e-05, "loss": 5.8175, "step": 314 }, { "epoch": 0.08008898776418243, "grad_norm": 14571.6796875, "learning_rate": 3.9847715736040605e-05, "loss": 5.7995, "step": 315 }, { "epoch": 0.08034323851898935, "grad_norm": 15078.56640625, "learning_rate": 3.9974619289340106e-05, "loss": 5.8054, "step": 316 }, { "epoch": 0.08059748927379629, "grad_norm": 16242.9296875, "learning_rate": 4.010152284263959e-05, "loss": 5.8063, "step": 317 }, { "epoch": 0.08085174002860321, "grad_norm": 15402.4443359375, "learning_rate": 4.022842639593909e-05, "loss": 5.8046, "step": 318 }, { "epoch": 0.08110599078341015, "grad_norm": 14665.2529296875, "learning_rate": 4.035532994923858e-05, "loss": 5.8062, "step": 319 }, { "epoch": 0.08136024153821707, "grad_norm": 15843.931640625, "learning_rate": 4.0482233502538075e-05, "loss": 5.8005, "step": 320 }, { "epoch": 0.08161449229302399, "grad_norm": 16756.453125, "learning_rate": 4.060913705583756e-05, "loss": 5.7895, "step": 321 }, { "epoch": 0.08186874304783093, "grad_norm": 16084.455078125, "learning_rate": 4.073604060913706e-05, "loss": 5.8098, "step": 322 }, { "epoch": 0.08212299380263785, "grad_norm": 14420.24609375, "learning_rate": 4.086294416243655e-05, "loss": 5.7863, "step": 323 }, { "epoch": 0.08237724455744477, "grad_norm": 15253.8271484375, "learning_rate": 4.098984771573604e-05, "loss": 5.7787, "step": 324 }, { "epoch": 0.08263149531225171, "grad_norm": 18756.3671875, "learning_rate": 4.111675126903554e-05, "loss": 5.8293, "step": 325 }, { "epoch": 0.08288574606705863, "grad_norm": 17335.005859375, "learning_rate": 4.1243654822335025e-05, "loss": 5.8052, "step": 326 }, { "epoch": 0.08313999682186557, "grad_norm": 14691.3466796875, "learning_rate": 4.137055837563452e-05, "loss": 5.7827, "step": 327 }, { "epoch": 0.08339424757667249, "grad_norm": 15806.7529296875, "learning_rate": 4.1497461928934013e-05, "loss": 5.7762, "step": 328 }, { "epoch": 0.08364849833147942, "grad_norm": 22169.841796875, "learning_rate": 4.162436548223351e-05, "loss": 5.778, "step": 329 }, { "epoch": 0.08390274908628635, "grad_norm": 21508.794921875, "learning_rate": 4.1751269035532995e-05, "loss": 5.8074, "step": 330 }, { "epoch": 0.08415699984109327, "grad_norm": 15525.013671875, "learning_rate": 4.187817258883249e-05, "loss": 5.7905, "step": 331 }, { "epoch": 0.08441125059590021, "grad_norm": 14844.3486328125, "learning_rate": 4.200507614213198e-05, "loss": 5.7763, "step": 332 }, { "epoch": 0.08466550135070713, "grad_norm": 16572.13671875, "learning_rate": 4.213197969543147e-05, "loss": 5.781, "step": 333 }, { "epoch": 0.08491975210551406, "grad_norm": 18925.337890625, "learning_rate": 4.225888324873097e-05, "loss": 5.7868, "step": 334 }, { "epoch": 0.085174002860321, "grad_norm": 19097.6875, "learning_rate": 4.238578680203046e-05, "loss": 5.7849, "step": 335 }, { "epoch": 0.08542825361512792, "grad_norm": 18242.955078125, "learning_rate": 4.251269035532995e-05, "loss": 5.7796, "step": 336 }, { "epoch": 0.08568250436993485, "grad_norm": 19598.6796875, "learning_rate": 4.2639593908629446e-05, "loss": 5.7923, "step": 337 }, { "epoch": 0.08593675512474178, "grad_norm": 19591.587890625, "learning_rate": 4.276649746192893e-05, "loss": 5.7912, "step": 338 }, { "epoch": 0.0861910058795487, "grad_norm": 15452.560546875, "learning_rate": 4.289340101522843e-05, "loss": 5.7846, "step": 339 }, { "epoch": 0.08644525663435564, "grad_norm": 15059.263671875, "learning_rate": 4.302030456852792e-05, "loss": 5.7758, "step": 340 }, { "epoch": 0.08669950738916256, "grad_norm": 21027.1640625, "learning_rate": 4.3147208121827415e-05, "loss": 5.8121, "step": 341 }, { "epoch": 0.0869537581439695, "grad_norm": 19786.572265625, "learning_rate": 4.32741116751269e-05, "loss": 5.7721, "step": 342 }, { "epoch": 0.08720800889877642, "grad_norm": 14951.8037109375, "learning_rate": 4.34010152284264e-05, "loss": 5.782, "step": 343 }, { "epoch": 0.08746225965358334, "grad_norm": 22819.787109375, "learning_rate": 4.352791878172589e-05, "loss": 5.7658, "step": 344 }, { "epoch": 0.08771651040839028, "grad_norm": 21210.58984375, "learning_rate": 4.365482233502538e-05, "loss": 5.7992, "step": 345 }, { "epoch": 0.0879707611631972, "grad_norm": 14568.482421875, "learning_rate": 4.378172588832488e-05, "loss": 5.7944, "step": 346 }, { "epoch": 0.08822501191800414, "grad_norm": 21739.427734375, "learning_rate": 4.3908629441624365e-05, "loss": 5.7758, "step": 347 }, { "epoch": 0.08847926267281106, "grad_norm": 20614.376953125, "learning_rate": 4.403553299492386e-05, "loss": 5.7728, "step": 348 }, { "epoch": 0.08873351342761798, "grad_norm": 14787.8486328125, "learning_rate": 4.416243654822335e-05, "loss": 5.786, "step": 349 }, { "epoch": 0.08898776418242492, "grad_norm": 17875.984375, "learning_rate": 4.428934010152285e-05, "loss": 5.7926, "step": 350 }, { "epoch": 0.08924201493723184, "grad_norm": 22780.046875, "learning_rate": 4.4416243654822335e-05, "loss": 5.7632, "step": 351 }, { "epoch": 0.08949626569203878, "grad_norm": 15378.287109375, "learning_rate": 4.454314720812183e-05, "loss": 5.7809, "step": 352 }, { "epoch": 0.0897505164468457, "grad_norm": 18587.49609375, "learning_rate": 4.467005076142132e-05, "loss": 5.7907, "step": 353 }, { "epoch": 0.09000476720165262, "grad_norm": 25359.814453125, "learning_rate": 4.479695431472081e-05, "loss": 5.7719, "step": 354 }, { "epoch": 0.09025901795645956, "grad_norm": 15609.775390625, "learning_rate": 4.492385786802031e-05, "loss": 5.784, "step": 355 }, { "epoch": 0.09051326871126648, "grad_norm": 16369.25, "learning_rate": 4.50507614213198e-05, "loss": 5.7869, "step": 356 }, { "epoch": 0.09076751946607342, "grad_norm": 23553.107421875, "learning_rate": 4.517766497461929e-05, "loss": 5.76, "step": 357 }, { "epoch": 0.09102177022088034, "grad_norm": 16909.330078125, "learning_rate": 4.5304568527918786e-05, "loss": 5.769, "step": 358 }, { "epoch": 0.09127602097568727, "grad_norm": 14716.7490234375, "learning_rate": 4.543147208121827e-05, "loss": 5.7636, "step": 359 }, { "epoch": 0.0915302717304942, "grad_norm": 15915.0244140625, "learning_rate": 4.555837563451777e-05, "loss": 5.7797, "step": 360 }, { "epoch": 0.09178452248530113, "grad_norm": 20231.58203125, "learning_rate": 4.568527918781726e-05, "loss": 5.7885, "step": 361 }, { "epoch": 0.09203877324010806, "grad_norm": 19425.763671875, "learning_rate": 4.5812182741116755e-05, "loss": 5.7585, "step": 362 }, { "epoch": 0.09229302399491499, "grad_norm": 15632.22265625, "learning_rate": 4.593908629441624e-05, "loss": 5.7596, "step": 363 }, { "epoch": 0.09254727474972191, "grad_norm": 14821.1826171875, "learning_rate": 4.606598984771574e-05, "loss": 5.7629, "step": 364 }, { "epoch": 0.09280152550452885, "grad_norm": 14788.517578125, "learning_rate": 4.619289340101523e-05, "loss": 5.7517, "step": 365 }, { "epoch": 0.09305577625933577, "grad_norm": 16733.14453125, "learning_rate": 4.631979695431472e-05, "loss": 5.7765, "step": 366 }, { "epoch": 0.0933100270141427, "grad_norm": 17748.39453125, "learning_rate": 4.644670050761422e-05, "loss": 5.7574, "step": 367 }, { "epoch": 0.09356427776894963, "grad_norm": 17039.28125, "learning_rate": 4.6573604060913705e-05, "loss": 5.7625, "step": 368 }, { "epoch": 0.09381852852375655, "grad_norm": 17125.658203125, "learning_rate": 4.67005076142132e-05, "loss": 5.7632, "step": 369 }, { "epoch": 0.09407277927856349, "grad_norm": 19509.609375, "learning_rate": 4.682741116751269e-05, "loss": 5.7818, "step": 370 }, { "epoch": 0.09432703003337041, "grad_norm": 19239.0546875, "learning_rate": 4.695431472081219e-05, "loss": 5.7627, "step": 371 }, { "epoch": 0.09458128078817735, "grad_norm": 17571.00390625, "learning_rate": 4.7081218274111674e-05, "loss": 5.7619, "step": 372 }, { "epoch": 0.09483553154298427, "grad_norm": 16918.115234375, "learning_rate": 4.7208121827411175e-05, "loss": 5.7548, "step": 373 }, { "epoch": 0.09508978229779119, "grad_norm": 19990.57421875, "learning_rate": 4.733502538071066e-05, "loss": 5.7612, "step": 374 }, { "epoch": 0.09534403305259813, "grad_norm": 23999.90234375, "learning_rate": 4.746192893401015e-05, "loss": 5.7643, "step": 375 }, { "epoch": 0.09559828380740505, "grad_norm": 19003.72265625, "learning_rate": 4.758883248730965e-05, "loss": 5.7416, "step": 376 }, { "epoch": 0.09585253456221199, "grad_norm": 16454.0859375, "learning_rate": 4.771573604060914e-05, "loss": 5.7641, "step": 377 }, { "epoch": 0.09610678531701891, "grad_norm": 17433.04296875, "learning_rate": 4.784263959390863e-05, "loss": 5.7706, "step": 378 }, { "epoch": 0.09636103607182583, "grad_norm": 20504.951171875, "learning_rate": 4.7969543147208126e-05, "loss": 5.7641, "step": 379 }, { "epoch": 0.09661528682663277, "grad_norm": 18765.4765625, "learning_rate": 4.809644670050762e-05, "loss": 5.7462, "step": 380 }, { "epoch": 0.0968695375814397, "grad_norm": 16072.2783203125, "learning_rate": 4.822335025380711e-05, "loss": 5.7729, "step": 381 }, { "epoch": 0.09712378833624662, "grad_norm": 15258.67578125, "learning_rate": 4.83502538071066e-05, "loss": 5.7606, "step": 382 }, { "epoch": 0.09737803909105355, "grad_norm": 14945.3935546875, "learning_rate": 4.8477157360406095e-05, "loss": 5.7608, "step": 383 }, { "epoch": 0.09763228984586048, "grad_norm": 15470.05859375, "learning_rate": 4.860406091370558e-05, "loss": 5.7531, "step": 384 }, { "epoch": 0.09788654060066741, "grad_norm": 20388.533203125, "learning_rate": 4.873096446700508e-05, "loss": 5.7696, "step": 385 }, { "epoch": 0.09814079135547434, "grad_norm": 28126.078125, "learning_rate": 4.885786802030457e-05, "loss": 5.7731, "step": 386 }, { "epoch": 0.09839504211028126, "grad_norm": 15670.330078125, "learning_rate": 4.8984771573604064e-05, "loss": 5.7566, "step": 387 }, { "epoch": 0.0986492928650882, "grad_norm": 17388.93359375, "learning_rate": 4.911167512690356e-05, "loss": 5.7731, "step": 388 }, { "epoch": 0.09890354361989512, "grad_norm": 28166.693359375, "learning_rate": 4.9238578680203045e-05, "loss": 5.7375, "step": 389 }, { "epoch": 0.09915779437470205, "grad_norm": 17543.197265625, "learning_rate": 4.936548223350254e-05, "loss": 5.7589, "step": 390 }, { "epoch": 0.09941204512950898, "grad_norm": 15108.068359375, "learning_rate": 4.949238578680203e-05, "loss": 5.7438, "step": 391 }, { "epoch": 0.0996662958843159, "grad_norm": 14917.2275390625, "learning_rate": 4.961928934010153e-05, "loss": 5.7403, "step": 392 }, { "epoch": 0.09992054663912284, "grad_norm": 14791.08984375, "learning_rate": 4.9746192893401014e-05, "loss": 5.7605, "step": 393 }, { "epoch": 0.10017479739392976, "grad_norm": 16332.1171875, "learning_rate": 4.9873096446700515e-05, "loss": 5.755, "step": 394 }, { "epoch": 0.1004290481487367, "grad_norm": 24996.771484375, "learning_rate": 5e-05, "loss": 5.7636, "step": 395 }, { "epoch": 0.10068329890354362, "grad_norm": 28145.7265625, "learning_rate": 4.999999014971934e-05, "loss": 5.7464, "step": 396 }, { "epoch": 0.10093754965835054, "grad_norm": 18033.251953125, "learning_rate": 4.999996059888515e-05, "loss": 5.7638, "step": 397 }, { "epoch": 0.10119180041315748, "grad_norm": 17646.732421875, "learning_rate": 4.999991134752069e-05, "loss": 5.7582, "step": 398 }, { "epoch": 0.1014460511679644, "grad_norm": 24271.55078125, "learning_rate": 4.9999842395664773e-05, "loss": 5.7309, "step": 399 }, { "epoch": 0.10170030192277134, "grad_norm": 28549.97265625, "learning_rate": 4.999975374337176e-05, "loss": 5.7474, "step": 400 }, { "epoch": 0.10170030192277134, "eval_loss": 11.57848072052002, "eval_runtime": 697.3055, "eval_samples_per_second": 151.994, "eval_steps_per_second": 9.501, "step": 400 }, { "epoch": 0.10195455267757826, "grad_norm": 17738.744140625, "learning_rate": 4.999964539071148e-05, "loss": 5.7357, "step": 401 }, { "epoch": 0.10220880343238518, "grad_norm": 15361.2373046875, "learning_rate": 4.999951733776933e-05, "loss": 5.7243, "step": 402 }, { "epoch": 0.10246305418719212, "grad_norm": 15122.3681640625, "learning_rate": 4.9999369584646226e-05, "loss": 5.7351, "step": 403 }, { "epoch": 0.10271730494199904, "grad_norm": 15341.8037109375, "learning_rate": 4.99992021314586e-05, "loss": 5.7437, "step": 404 }, { "epoch": 0.10297155569680598, "grad_norm": 19395.509765625, "learning_rate": 4.99990149783384e-05, "loss": 5.7491, "step": 405 }, { "epoch": 0.1032258064516129, "grad_norm": 26739.3359375, "learning_rate": 4.9998808125433106e-05, "loss": 5.7481, "step": 406 }, { "epoch": 0.10348005720641983, "grad_norm": 17126.6171875, "learning_rate": 4.9998581572905724e-05, "loss": 5.7338, "step": 407 }, { "epoch": 0.10373430796122676, "grad_norm": 15103.6669921875, "learning_rate": 4.9998335320934795e-05, "loss": 5.7518, "step": 408 }, { "epoch": 0.10398855871603369, "grad_norm": 16252.2412109375, "learning_rate": 4.999806936971435e-05, "loss": 5.7407, "step": 409 }, { "epoch": 0.10424280947084062, "grad_norm": 24540.556640625, "learning_rate": 4.999778371945399e-05, "loss": 5.7342, "step": 410 }, { "epoch": 0.10449706022564755, "grad_norm": 24937.2578125, "learning_rate": 4.9997478370378794e-05, "loss": 5.7225, "step": 411 }, { "epoch": 0.10475131098045447, "grad_norm": 17110.91796875, "learning_rate": 4.9997153322729386e-05, "loss": 5.7338, "step": 412 }, { "epoch": 0.1050055617352614, "grad_norm": 16506.455078125, "learning_rate": 4.999680857676192e-05, "loss": 5.7277, "step": 413 }, { "epoch": 0.10525981249006833, "grad_norm": 24584.044921875, "learning_rate": 4.9996444132748055e-05, "loss": 5.7373, "step": 414 }, { "epoch": 0.10551406324487526, "grad_norm": 26646.958984375, "learning_rate": 4.9996059990974984e-05, "loss": 5.7243, "step": 415 }, { "epoch": 0.10576831399968219, "grad_norm": 15127.9755859375, "learning_rate": 4.999565615174542e-05, "loss": 5.7263, "step": 416 }, { "epoch": 0.10602256475448911, "grad_norm": 22097.779296875, "learning_rate": 4.99952326153776e-05, "loss": 5.7323, "step": 417 }, { "epoch": 0.10627681550929605, "grad_norm": 32168.458984375, "learning_rate": 4.9994789382205275e-05, "loss": 5.7075, "step": 418 }, { "epoch": 0.10653106626410297, "grad_norm": 15955.8134765625, "learning_rate": 4.9994326452577735e-05, "loss": 5.7404, "step": 419 }, { "epoch": 0.1067853170189099, "grad_norm": 51767.4296875, "learning_rate": 4.999384382685975e-05, "loss": 5.7317, "step": 420 }, { "epoch": 0.10703956777371683, "grad_norm": 18034.890625, "learning_rate": 4.9993341505431675e-05, "loss": 5.7169, "step": 421 }, { "epoch": 0.10729381852852375, "grad_norm": 40395.14453125, "learning_rate": 4.999281948868932e-05, "loss": 5.7429, "step": 422 }, { "epoch": 0.10754806928333069, "grad_norm": 38049.34765625, "learning_rate": 4.9992277777044075e-05, "loss": 5.7306, "step": 423 }, { "epoch": 0.10780232003813761, "grad_norm": 24377.923828125, "learning_rate": 4.9991716370922804e-05, "loss": 5.7364, "step": 424 }, { "epoch": 0.10805657079294455, "grad_norm": 31733.537109375, "learning_rate": 4.9991135270767904e-05, "loss": 5.729, "step": 425 }, { "epoch": 0.10831082154775147, "grad_norm": 24497.560546875, "learning_rate": 4.9990534477037296e-05, "loss": 5.7321, "step": 426 }, { "epoch": 0.1085650723025584, "grad_norm": 21727.189453125, "learning_rate": 4.9989913990204436e-05, "loss": 5.7215, "step": 427 }, { "epoch": 0.10881932305736533, "grad_norm": 36596.0859375, "learning_rate": 4.9989273810758265e-05, "loss": 5.72, "step": 428 }, { "epoch": 0.10907357381217225, "grad_norm": 26537.92578125, "learning_rate": 4.998861393920326e-05, "loss": 5.7302, "step": 429 }, { "epoch": 0.10932782456697919, "grad_norm": 32205.70703125, "learning_rate": 4.998793437605942e-05, "loss": 5.7199, "step": 430 }, { "epoch": 0.10958207532178611, "grad_norm": 28683.19140625, "learning_rate": 4.9987235121862255e-05, "loss": 5.7394, "step": 431 }, { "epoch": 0.10983632607659304, "grad_norm": 17488.791015625, "learning_rate": 4.998651617716279e-05, "loss": 5.7219, "step": 432 }, { "epoch": 0.11009057683139997, "grad_norm": 27071.734375, "learning_rate": 4.9985777542527566e-05, "loss": 5.7032, "step": 433 }, { "epoch": 0.1103448275862069, "grad_norm": 16953.912109375, "learning_rate": 4.9985019218538656e-05, "loss": 5.7104, "step": 434 }, { "epoch": 0.11059907834101383, "grad_norm": 18923.18359375, "learning_rate": 4.998424120579363e-05, "loss": 5.7327, "step": 435 }, { "epoch": 0.11085332909582075, "grad_norm": 18261.357421875, "learning_rate": 4.998344350490558e-05, "loss": 5.7221, "step": 436 }, { "epoch": 0.11110757985062768, "grad_norm": 15947.138671875, "learning_rate": 4.99826261165031e-05, "loss": 5.7246, "step": 437 }, { "epoch": 0.11136183060543461, "grad_norm": 18423.921875, "learning_rate": 4.998178904123033e-05, "loss": 5.7102, "step": 438 }, { "epoch": 0.11161608136024154, "grad_norm": 15425.244140625, "learning_rate": 4.99809322797469e-05, "loss": 5.7016, "step": 439 }, { "epoch": 0.11187033211504846, "grad_norm": 22775.279296875, "learning_rate": 4.9980055832727946e-05, "loss": 5.7221, "step": 440 }, { "epoch": 0.1121245828698554, "grad_norm": 15630.93359375, "learning_rate": 4.997915970086413e-05, "loss": 5.7345, "step": 441 }, { "epoch": 0.11237883362466232, "grad_norm": 16381.5498046875, "learning_rate": 4.9978243884861635e-05, "loss": 5.7293, "step": 442 }, { "epoch": 0.11263308437946926, "grad_norm": 16565.13671875, "learning_rate": 4.997730838544214e-05, "loss": 5.694, "step": 443 }, { "epoch": 0.11288733513427618, "grad_norm": 15240.77734375, "learning_rate": 4.997635320334283e-05, "loss": 5.7241, "step": 444 }, { "epoch": 0.1131415858890831, "grad_norm": 15775.3681640625, "learning_rate": 4.9975378339316434e-05, "loss": 5.7336, "step": 445 }, { "epoch": 0.11339583664389004, "grad_norm": 15154.955078125, "learning_rate": 4.997438379413114e-05, "loss": 5.726, "step": 446 }, { "epoch": 0.11365008739869696, "grad_norm": 15382.392578125, "learning_rate": 4.997336956857068e-05, "loss": 5.7209, "step": 447 }, { "epoch": 0.1139043381535039, "grad_norm": 15969.6162109375, "learning_rate": 4.997233566343429e-05, "loss": 5.7175, "step": 448 }, { "epoch": 0.11415858890831082, "grad_norm": 15737.732421875, "learning_rate": 4.997128207953671e-05, "loss": 5.6985, "step": 449 }, { "epoch": 0.11441283966311774, "grad_norm": 15486.15234375, "learning_rate": 4.99702088177082e-05, "loss": 5.7084, "step": 450 }, { "epoch": 0.11466709041792468, "grad_norm": 15467.587890625, "learning_rate": 4.9969115878794484e-05, "loss": 5.7265, "step": 451 }, { "epoch": 0.1149213411727316, "grad_norm": 15145.166015625, "learning_rate": 4.996800326365685e-05, "loss": 5.7209, "step": 452 }, { "epoch": 0.11517559192753854, "grad_norm": 15714.81640625, "learning_rate": 4.9966870973172046e-05, "loss": 5.7124, "step": 453 }, { "epoch": 0.11542984268234546, "grad_norm": 18364.576171875, "learning_rate": 4.996571900823236e-05, "loss": 5.701, "step": 454 }, { "epoch": 0.11568409343715239, "grad_norm": 16856.05859375, "learning_rate": 4.996454736974555e-05, "loss": 5.7009, "step": 455 }, { "epoch": 0.11593834419195932, "grad_norm": 15237.99609375, "learning_rate": 4.9963356058634903e-05, "loss": 5.7102, "step": 456 }, { "epoch": 0.11619259494676625, "grad_norm": 16812.8671875, "learning_rate": 4.996214507583919e-05, "loss": 5.6973, "step": 457 }, { "epoch": 0.11644684570157318, "grad_norm": 17960.55078125, "learning_rate": 4.99609144223127e-05, "loss": 5.7145, "step": 458 }, { "epoch": 0.1167010964563801, "grad_norm": 16421.55078125, "learning_rate": 4.9959664099025216e-05, "loss": 5.7148, "step": 459 }, { "epoch": 0.11695534721118703, "grad_norm": 15507.2197265625, "learning_rate": 4.995839410696202e-05, "loss": 5.6977, "step": 460 }, { "epoch": 0.11720959796599396, "grad_norm": 18020.625, "learning_rate": 4.995710444712389e-05, "loss": 5.7285, "step": 461 }, { "epoch": 0.11746384872080089, "grad_norm": 18671.88671875, "learning_rate": 4.995579512052712e-05, "loss": 5.7145, "step": 462 }, { "epoch": 0.11771809947560782, "grad_norm": 15958.7607421875, "learning_rate": 4.995446612820346e-05, "loss": 5.7144, "step": 463 }, { "epoch": 0.11797235023041475, "grad_norm": 16075.9072265625, "learning_rate": 4.9953117471200215e-05, "loss": 5.7147, "step": 464 }, { "epoch": 0.11822660098522167, "grad_norm": 21117.54296875, "learning_rate": 4.995174915058015e-05, "loss": 5.6987, "step": 465 }, { "epoch": 0.1184808517400286, "grad_norm": 21167.033203125, "learning_rate": 4.9950361167421526e-05, "loss": 5.6985, "step": 466 }, { "epoch": 0.11873510249483553, "grad_norm": 15609.7109375, "learning_rate": 4.994895352281811e-05, "loss": 5.7058, "step": 467 }, { "epoch": 0.11898935324964247, "grad_norm": 19494.00390625, "learning_rate": 4.994752621787915e-05, "loss": 5.6978, "step": 468 }, { "epoch": 0.11924360400444939, "grad_norm": 23788.21875, "learning_rate": 4.9946079253729406e-05, "loss": 5.6764, "step": 469 }, { "epoch": 0.11949785475925631, "grad_norm": 15217.6826171875, "learning_rate": 4.99446126315091e-05, "loss": 5.6901, "step": 470 }, { "epoch": 0.11975210551406325, "grad_norm": 34201.64453125, "learning_rate": 4.9943126352373984e-05, "loss": 5.6901, "step": 471 }, { "epoch": 0.12000635626887017, "grad_norm": 16188.8125, "learning_rate": 4.994162041749527e-05, "loss": 5.7206, "step": 472 }, { "epoch": 0.12026060702367711, "grad_norm": 19348.451171875, "learning_rate": 4.994009482805967e-05, "loss": 5.6751, "step": 473 }, { "epoch": 0.12051485777848403, "grad_norm": 26459.375, "learning_rate": 4.993854958526938e-05, "loss": 5.6943, "step": 474 }, { "epoch": 0.12076910853329095, "grad_norm": 15800.505859375, "learning_rate": 4.9936984690342094e-05, "loss": 5.6905, "step": 475 }, { "epoch": 0.12102335928809789, "grad_norm": 17625.212890625, "learning_rate": 4.9935400144510966e-05, "loss": 5.6914, "step": 476 }, { "epoch": 0.12127761004290481, "grad_norm": 27727.19140625, "learning_rate": 4.993379594902468e-05, "loss": 5.7044, "step": 477 }, { "epoch": 0.12153186079771175, "grad_norm": 17069.87109375, "learning_rate": 4.993217210514734e-05, "loss": 5.6939, "step": 478 }, { "epoch": 0.12178611155251867, "grad_norm": 15412.1865234375, "learning_rate": 4.993052861415862e-05, "loss": 5.7013, "step": 479 }, { "epoch": 0.1220403623073256, "grad_norm": 20495.89453125, "learning_rate": 4.992886547735359e-05, "loss": 5.7095, "step": 480 }, { "epoch": 0.12229461306213253, "grad_norm": 20597.22265625, "learning_rate": 4.9927182696042856e-05, "loss": 5.6795, "step": 481 }, { "epoch": 0.12254886381693945, "grad_norm": 15255.236328125, "learning_rate": 4.992548027155248e-05, "loss": 5.6751, "step": 482 }, { "epoch": 0.12280311457174639, "grad_norm": 25595.765625, "learning_rate": 4.9923758205224025e-05, "loss": 5.6862, "step": 483 }, { "epoch": 0.12305736532655331, "grad_norm": 19672.923828125, "learning_rate": 4.99220164984145e-05, "loss": 5.6854, "step": 484 }, { "epoch": 0.12331161608136024, "grad_norm": 15898.150390625, "learning_rate": 4.992025515249642e-05, "loss": 5.7032, "step": 485 }, { "epoch": 0.12356586683616717, "grad_norm": 25745.66015625, "learning_rate": 4.9918474168857755e-05, "loss": 5.6798, "step": 486 }, { "epoch": 0.1238201175909741, "grad_norm": 18262.912109375, "learning_rate": 4.991667354890196e-05, "loss": 5.6727, "step": 487 }, { "epoch": 0.12407436834578103, "grad_norm": 15679.765625, "learning_rate": 4.9914853294047986e-05, "loss": 5.6857, "step": 488 }, { "epoch": 0.12432861910058796, "grad_norm": 22240.96875, "learning_rate": 4.9913013405730215e-05, "loss": 5.6853, "step": 489 }, { "epoch": 0.12458286985539488, "grad_norm": 19653.525390625, "learning_rate": 4.991115388539852e-05, "loss": 5.6615, "step": 490 }, { "epoch": 0.12483712061020182, "grad_norm": 15283.314453125, "learning_rate": 4.990927473451825e-05, "loss": 5.6852, "step": 491 }, { "epoch": 0.12509137136500875, "grad_norm": 22386.185546875, "learning_rate": 4.9907375954570225e-05, "loss": 5.6877, "step": 492 }, { "epoch": 0.12534562211981568, "grad_norm": 20030.65625, "learning_rate": 4.990545754705071e-05, "loss": 5.6858, "step": 493 }, { "epoch": 0.1255998728746226, "grad_norm": 15557.42578125, "learning_rate": 4.990351951347147e-05, "loss": 5.6791, "step": 494 }, { "epoch": 0.12585412362942952, "grad_norm": 22766.5703125, "learning_rate": 4.9901561855359705e-05, "loss": 5.6886, "step": 495 }, { "epoch": 0.12610837438423644, "grad_norm": 20646.341796875, "learning_rate": 4.98995845742581e-05, "loss": 5.6811, "step": 496 }, { "epoch": 0.1263626251390434, "grad_norm": 16207.798828125, "learning_rate": 4.989758767172479e-05, "loss": 5.6696, "step": 497 }, { "epoch": 0.12661687589385032, "grad_norm": 29511.078125, "learning_rate": 4.989557114933339e-05, "loss": 5.6652, "step": 498 }, { "epoch": 0.12687112664865724, "grad_norm": 15871.7978515625, "learning_rate": 4.989353500867296e-05, "loss": 5.6692, "step": 499 }, { "epoch": 0.12712537740346416, "grad_norm": 18247.537109375, "learning_rate": 4.9891479251348026e-05, "loss": 5.6786, "step": 500 }, { "epoch": 0.12712537740346416, "eval_loss": 11.436968803405762, "eval_runtime": 696.683, "eval_samples_per_second": 152.129, "eval_steps_per_second": 9.509, "step": 500 }, { "epoch": 0.12737962815827109, "grad_norm": 21737.20703125, "learning_rate": 4.988940387897857e-05, "loss": 5.66, "step": 501 }, { "epoch": 0.12763387891307804, "grad_norm": 16418.75, "learning_rate": 4.988730889320004e-05, "loss": 5.6799, "step": 502 }, { "epoch": 0.12788812966788496, "grad_norm": 15836.2626953125, "learning_rate": 4.9885194295663306e-05, "loss": 5.6765, "step": 503 }, { "epoch": 0.12814238042269188, "grad_norm": 21452.65234375, "learning_rate": 4.988306008803475e-05, "loss": 5.6771, "step": 504 }, { "epoch": 0.1283966311774988, "grad_norm": 20613.23828125, "learning_rate": 4.988090627199615e-05, "loss": 5.6607, "step": 505 }, { "epoch": 0.12865088193230573, "grad_norm": 15403.5283203125, "learning_rate": 4.987873284924478e-05, "loss": 5.6664, "step": 506 }, { "epoch": 0.12890513268711268, "grad_norm": 18133.78515625, "learning_rate": 4.987653982149334e-05, "loss": 5.6795, "step": 507 }, { "epoch": 0.1291593834419196, "grad_norm": 20240.896484375, "learning_rate": 4.987432719046998e-05, "loss": 5.6629, "step": 508 }, { "epoch": 0.12941363419672652, "grad_norm": 15481.91015625, "learning_rate": 4.987209495791831e-05, "loss": 5.6612, "step": 509 }, { "epoch": 0.12966788495153345, "grad_norm": 27840.04296875, "learning_rate": 4.9869843125597374e-05, "loss": 5.6792, "step": 510 }, { "epoch": 0.12992213570634037, "grad_norm": 19821.810546875, "learning_rate": 4.9867571695281666e-05, "loss": 5.664, "step": 511 }, { "epoch": 0.13017638646114732, "grad_norm": 16410.453125, "learning_rate": 4.986528066876113e-05, "loss": 5.6731, "step": 512 }, { "epoch": 0.13043063721595424, "grad_norm": 29050.4296875, "learning_rate": 4.9862970047841144e-05, "loss": 5.6657, "step": 513 }, { "epoch": 0.13068488797076117, "grad_norm": 16172.8349609375, "learning_rate": 4.9860639834342525e-05, "loss": 5.6749, "step": 514 }, { "epoch": 0.1309391387255681, "grad_norm": 18856.3359375, "learning_rate": 4.985829003010154e-05, "loss": 5.6627, "step": 515 }, { "epoch": 0.131193389480375, "grad_norm": 24609.775390625, "learning_rate": 4.985592063696988e-05, "loss": 5.6709, "step": 516 }, { "epoch": 0.13144764023518196, "grad_norm": 16182.5205078125, "learning_rate": 4.985353165681469e-05, "loss": 5.6843, "step": 517 }, { "epoch": 0.13170189098998888, "grad_norm": 15324.4169921875, "learning_rate": 4.985112309151853e-05, "loss": 5.6688, "step": 518 }, { "epoch": 0.1319561417447958, "grad_norm": 15727.365234375, "learning_rate": 4.984869494297941e-05, "loss": 5.6662, "step": 519 }, { "epoch": 0.13221039249960273, "grad_norm": 15892.57421875, "learning_rate": 4.9846247213110765e-05, "loss": 5.6601, "step": 520 }, { "epoch": 0.13246464325440965, "grad_norm": 15805.169921875, "learning_rate": 4.984377990384145e-05, "loss": 5.6494, "step": 521 }, { "epoch": 0.1327188940092166, "grad_norm": 15867.2119140625, "learning_rate": 4.984129301711578e-05, "loss": 5.6411, "step": 522 }, { "epoch": 0.13297314476402353, "grad_norm": 17240.55078125, "learning_rate": 4.9838786554893455e-05, "loss": 5.6732, "step": 523 }, { "epoch": 0.13322739551883045, "grad_norm": 18650.443359375, "learning_rate": 4.9836260519149644e-05, "loss": 5.6509, "step": 524 }, { "epoch": 0.13348164627363737, "grad_norm": 16894.62890625, "learning_rate": 4.983371491187492e-05, "loss": 5.6669, "step": 525 }, { "epoch": 0.1337358970284443, "grad_norm": 15476.4794921875, "learning_rate": 4.9831149735075255e-05, "loss": 5.6341, "step": 526 }, { "epoch": 0.13399014778325122, "grad_norm": 15612.296875, "learning_rate": 4.982856499077209e-05, "loss": 5.6663, "step": 527 }, { "epoch": 0.13424439853805817, "grad_norm": 16043.396484375, "learning_rate": 4.982596068100225e-05, "loss": 5.6415, "step": 528 }, { "epoch": 0.1344986492928651, "grad_norm": 16745.125, "learning_rate": 4.982333680781799e-05, "loss": 5.6387, "step": 529 }, { "epoch": 0.13475290004767201, "grad_norm": 18327.9921875, "learning_rate": 4.982069337328698e-05, "loss": 5.6466, "step": 530 }, { "epoch": 0.13500715080247894, "grad_norm": 20842.2109375, "learning_rate": 4.9818030379492314e-05, "loss": 5.6564, "step": 531 }, { "epoch": 0.13526140155728586, "grad_norm": 18361.029296875, "learning_rate": 4.9815347828532486e-05, "loss": 5.6416, "step": 532 }, { "epoch": 0.1355156523120928, "grad_norm": 15737.955078125, "learning_rate": 4.981264572252141e-05, "loss": 5.6446, "step": 533 }, { "epoch": 0.13576990306689973, "grad_norm": 15640.228515625, "learning_rate": 4.9809924063588394e-05, "loss": 5.6594, "step": 534 }, { "epoch": 0.13602415382170666, "grad_norm": 19447.421875, "learning_rate": 4.980718285387818e-05, "loss": 5.6464, "step": 535 }, { "epoch": 0.13627840457651358, "grad_norm": 23074.4921875, "learning_rate": 4.9804422095550894e-05, "loss": 5.6481, "step": 536 }, { "epoch": 0.1365326553313205, "grad_norm": 16168.689453125, "learning_rate": 4.9801641790782085e-05, "loss": 5.6358, "step": 537 }, { "epoch": 0.13678690608612745, "grad_norm": 17012.951171875, "learning_rate": 4.979884194176268e-05, "loss": 5.6407, "step": 538 }, { "epoch": 0.13704115684093437, "grad_norm": 22454.2265625, "learning_rate": 4.979602255069904e-05, "loss": 5.6222, "step": 539 }, { "epoch": 0.1372954075957413, "grad_norm": 17975.060546875, "learning_rate": 4.97931836198129e-05, "loss": 5.6552, "step": 540 }, { "epoch": 0.13754965835054822, "grad_norm": 15872.36328125, "learning_rate": 4.97903251513414e-05, "loss": 5.6584, "step": 541 }, { "epoch": 0.13780390910535514, "grad_norm": 18451.474609375, "learning_rate": 4.978744714753708e-05, "loss": 5.629, "step": 542 }, { "epoch": 0.1380581598601621, "grad_norm": 23676.044921875, "learning_rate": 4.978454961066787e-05, "loss": 5.6582, "step": 543 }, { "epoch": 0.13831241061496902, "grad_norm": 19392.4296875, "learning_rate": 4.97816325430171e-05, "loss": 5.6415, "step": 544 }, { "epoch": 0.13856666136977594, "grad_norm": 16136.1005859375, "learning_rate": 4.977869594688348e-05, "loss": 5.6457, "step": 545 }, { "epoch": 0.13882091212458286, "grad_norm": 16434.310546875, "learning_rate": 4.977573982458111e-05, "loss": 5.6549, "step": 546 }, { "epoch": 0.13907516287938979, "grad_norm": 19780.55078125, "learning_rate": 4.9772764178439485e-05, "loss": 5.6132, "step": 547 }, { "epoch": 0.13932941363419674, "grad_norm": 21970.705078125, "learning_rate": 4.976976901080348e-05, "loss": 5.6392, "step": 548 }, { "epoch": 0.13958366438900366, "grad_norm": 18257.62109375, "learning_rate": 4.976675432403336e-05, "loss": 5.6445, "step": 549 }, { "epoch": 0.13983791514381058, "grad_norm": 16311.8681640625, "learning_rate": 4.9763720120504756e-05, "loss": 5.6347, "step": 550 }, { "epoch": 0.1400921658986175, "grad_norm": 16021.2177734375, "learning_rate": 4.976066640260869e-05, "loss": 5.6239, "step": 551 }, { "epoch": 0.14034641665342443, "grad_norm": 16831.166015625, "learning_rate": 4.975759317275157e-05, "loss": 5.632, "step": 552 }, { "epoch": 0.14060066740823138, "grad_norm": 18917.677734375, "learning_rate": 4.975450043335517e-05, "loss": 5.6209, "step": 553 }, { "epoch": 0.1408549181630383, "grad_norm": 19774.138671875, "learning_rate": 4.975138818685662e-05, "loss": 5.6466, "step": 554 }, { "epoch": 0.14110916891784522, "grad_norm": 18575.240234375, "learning_rate": 4.974825643570845e-05, "loss": 5.608, "step": 555 }, { "epoch": 0.14136341967265215, "grad_norm": 17651.923828125, "learning_rate": 4.974510518237856e-05, "loss": 5.6164, "step": 556 }, { "epoch": 0.14161767042745907, "grad_norm": 17001.119140625, "learning_rate": 4.97419344293502e-05, "loss": 5.6189, "step": 557 }, { "epoch": 0.14187192118226602, "grad_norm": 17398.4296875, "learning_rate": 4.973874417912199e-05, "loss": 5.62, "step": 558 }, { "epoch": 0.14212617193707294, "grad_norm": 18886.431640625, "learning_rate": 4.9735534434207925e-05, "loss": 5.6276, "step": 559 }, { "epoch": 0.14238042269187987, "grad_norm": 20584.83203125, "learning_rate": 4.9732305197137356e-05, "loss": 5.6137, "step": 560 }, { "epoch": 0.1426346734466868, "grad_norm": 19928.982421875, "learning_rate": 4.972905647045499e-05, "loss": 5.6381, "step": 561 }, { "epoch": 0.1428889242014937, "grad_norm": 17919.140625, "learning_rate": 4.9725788256720905e-05, "loss": 5.6481, "step": 562 }, { "epoch": 0.14314317495630066, "grad_norm": 18179.376953125, "learning_rate": 4.9722500558510524e-05, "loss": 5.6181, "step": 563 }, { "epoch": 0.14339742571110758, "grad_norm": 17842.84375, "learning_rate": 4.9719193378414616e-05, "loss": 5.6353, "step": 564 }, { "epoch": 0.1436516764659145, "grad_norm": 18230.892578125, "learning_rate": 4.9715866719039326e-05, "loss": 5.621, "step": 565 }, { "epoch": 0.14390592722072143, "grad_norm": 18977.90625, "learning_rate": 4.971252058300614e-05, "loss": 5.6392, "step": 566 }, { "epoch": 0.14416017797552835, "grad_norm": 19686.40234375, "learning_rate": 4.970915497295187e-05, "loss": 5.5983, "step": 567 }, { "epoch": 0.1444144287303353, "grad_norm": 18930.306640625, "learning_rate": 4.970576989152871e-05, "loss": 5.6122, "step": 568 }, { "epoch": 0.14466867948514223, "grad_norm": 18535.255859375, "learning_rate": 4.970236534140417e-05, "loss": 5.6109, "step": 569 }, { "epoch": 0.14492293023994915, "grad_norm": 19047.4296875, "learning_rate": 4.9698941325261104e-05, "loss": 5.6177, "step": 570 }, { "epoch": 0.14517718099475607, "grad_norm": 18814.443359375, "learning_rate": 4.969549784579773e-05, "loss": 5.6271, "step": 571 }, { "epoch": 0.145431431749563, "grad_norm": 16480.15234375, "learning_rate": 4.969203490572759e-05, "loss": 5.6281, "step": 572 }, { "epoch": 0.14568568250436995, "grad_norm": 15779.3984375, "learning_rate": 4.9688552507779554e-05, "loss": 5.6277, "step": 573 }, { "epoch": 0.14593993325917687, "grad_norm": 16809.806640625, "learning_rate": 4.9685050654697806e-05, "loss": 5.6211, "step": 574 }, { "epoch": 0.1461941840139838, "grad_norm": 20341.103515625, "learning_rate": 4.968152934924192e-05, "loss": 5.5933, "step": 575 }, { "epoch": 0.14644843476879071, "grad_norm": 18689.234375, "learning_rate": 4.967798859418674e-05, "loss": 5.6168, "step": 576 }, { "epoch": 0.14670268552359764, "grad_norm": 16318.267578125, "learning_rate": 4.9674428392322476e-05, "loss": 5.6036, "step": 577 }, { "epoch": 0.1469569362784046, "grad_norm": 29769.20703125, "learning_rate": 4.967084874645463e-05, "loss": 5.6077, "step": 578 }, { "epoch": 0.1472111870332115, "grad_norm": 20526.70703125, "learning_rate": 4.966724965940407e-05, "loss": 5.6155, "step": 579 }, { "epoch": 0.14746543778801843, "grad_norm": 16122.1318359375, "learning_rate": 4.966363113400693e-05, "loss": 5.6045, "step": 580 }, { "epoch": 0.14771968854282536, "grad_norm": 28428.771484375, "learning_rate": 4.965999317311469e-05, "loss": 5.6197, "step": 581 }, { "epoch": 0.14797393929763228, "grad_norm": 19143.50390625, "learning_rate": 4.965633577959417e-05, "loss": 5.6081, "step": 582 }, { "epoch": 0.14822819005243923, "grad_norm": 16851.91796875, "learning_rate": 4.9652658956327457e-05, "loss": 5.6062, "step": 583 }, { "epoch": 0.14848244080724615, "grad_norm": 24752.48828125, "learning_rate": 4.964896270621198e-05, "loss": 5.6142, "step": 584 }, { "epoch": 0.14873669156205307, "grad_norm": 19609.951171875, "learning_rate": 4.964524703216046e-05, "loss": 5.6065, "step": 585 }, { "epoch": 0.14899094231686, "grad_norm": 16254.486328125, "learning_rate": 4.9641511937100934e-05, "loss": 5.5968, "step": 586 }, { "epoch": 0.14924519307166692, "grad_norm": 36469.53125, "learning_rate": 4.963775742397674e-05, "loss": 5.6088, "step": 587 }, { "epoch": 0.14949944382647387, "grad_norm": 17407.080078125, "learning_rate": 4.963398349574653e-05, "loss": 5.5943, "step": 588 }, { "epoch": 0.1497536945812808, "grad_norm": 19350.60546875, "learning_rate": 4.963019015538422e-05, "loss": 5.6002, "step": 589 }, { "epoch": 0.15000794533608772, "grad_norm": 31245.01171875, "learning_rate": 4.962637740587907e-05, "loss": 5.6126, "step": 590 }, { "epoch": 0.15026219609089464, "grad_norm": 15731.158203125, "learning_rate": 4.962254525023561e-05, "loss": 5.5871, "step": 591 }, { "epoch": 0.15051644684570156, "grad_norm": 22137.224609375, "learning_rate": 4.961869369147365e-05, "loss": 5.5978, "step": 592 }, { "epoch": 0.1507706976005085, "grad_norm": 18919.173828125, "learning_rate": 4.961482273262831e-05, "loss": 5.6055, "step": 593 }, { "epoch": 0.15102494835531544, "grad_norm": 15994.861328125, "learning_rate": 4.9610932376750006e-05, "loss": 5.5934, "step": 594 }, { "epoch": 0.15127919911012236, "grad_norm": 19860.427734375, "learning_rate": 4.960702262690441e-05, "loss": 5.5961, "step": 595 }, { "epoch": 0.15153344986492928, "grad_norm": 19515.59375, "learning_rate": 4.9603093486172504e-05, "loss": 5.5927, "step": 596 }, { "epoch": 0.1517877006197362, "grad_norm": 16144.533203125, "learning_rate": 4.959914495765052e-05, "loss": 5.5984, "step": 597 }, { "epoch": 0.15204195137454316, "grad_norm": 15955.1083984375, "learning_rate": 4.959517704445001e-05, "loss": 5.5973, "step": 598 }, { "epoch": 0.15229620212935008, "grad_norm": 17576.66796875, "learning_rate": 4.959118974969777e-05, "loss": 5.5894, "step": 599 }, { "epoch": 0.152550452884157, "grad_norm": 18451.59375, "learning_rate": 4.958718307653588e-05, "loss": 5.5885, "step": 600 }, { "epoch": 0.152550452884157, "eval_loss": 11.28321647644043, "eval_runtime": 699.4596, "eval_samples_per_second": 151.526, "eval_steps_per_second": 9.472, "step": 600 }, { "epoch": 0.15280470363896392, "grad_norm": 15883.5673828125, "learning_rate": 4.958315702812168e-05, "loss": 5.6017, "step": 601 }, { "epoch": 0.15305895439377085, "grad_norm": 16934.130859375, "learning_rate": 4.957911160762779e-05, "loss": 5.6034, "step": 602 }, { "epoch": 0.1533132051485778, "grad_norm": 19183.017578125, "learning_rate": 4.9575046818242106e-05, "loss": 5.5867, "step": 603 }, { "epoch": 0.15356745590338472, "grad_norm": 17251.52734375, "learning_rate": 4.9570962663167756e-05, "loss": 5.5952, "step": 604 }, { "epoch": 0.15382170665819164, "grad_norm": 16172.880859375, "learning_rate": 4.956685914562315e-05, "loss": 5.6117, "step": 605 }, { "epoch": 0.15407595741299857, "grad_norm": 16140.353515625, "learning_rate": 4.9562736268841946e-05, "loss": 5.5772, "step": 606 }, { "epoch": 0.1543302081678055, "grad_norm": 19752.396484375, "learning_rate": 4.955859403607308e-05, "loss": 5.5945, "step": 607 }, { "epoch": 0.15458445892261244, "grad_norm": 20558.962890625, "learning_rate": 4.955443245058071e-05, "loss": 5.5899, "step": 608 }, { "epoch": 0.15483870967741936, "grad_norm": 15940.642578125, "learning_rate": 4.9550251515644275e-05, "loss": 5.5884, "step": 609 }, { "epoch": 0.15509296043222628, "grad_norm": 23089.677734375, "learning_rate": 4.954605123455842e-05, "loss": 5.5997, "step": 610 }, { "epoch": 0.1553472111870332, "grad_norm": 21694.041015625, "learning_rate": 4.954183161063309e-05, "loss": 5.5975, "step": 611 }, { "epoch": 0.15560146194184013, "grad_norm": 16205.421875, "learning_rate": 4.953759264719342e-05, "loss": 5.5904, "step": 612 }, { "epoch": 0.15585571269664708, "grad_norm": 30056.099609375, "learning_rate": 4.9533334347579816e-05, "loss": 5.5811, "step": 613 }, { "epoch": 0.156109963451454, "grad_norm": 17744.61328125, "learning_rate": 4.952905671514792e-05, "loss": 5.586, "step": 614 }, { "epoch": 0.15636421420626093, "grad_norm": 17406.412109375, "learning_rate": 4.9524759753268594e-05, "loss": 5.5827, "step": 615 }, { "epoch": 0.15661846496106785, "grad_norm": 21392.123046875, "learning_rate": 4.952044346532795e-05, "loss": 5.5851, "step": 616 }, { "epoch": 0.15687271571587477, "grad_norm": 16311.4326171875, "learning_rate": 4.9516107854727304e-05, "loss": 5.5871, "step": 617 }, { "epoch": 0.15712696647068172, "grad_norm": 18318.0546875, "learning_rate": 4.951175292488323e-05, "loss": 5.5831, "step": 618 }, { "epoch": 0.15738121722548865, "grad_norm": 19625.283203125, "learning_rate": 4.950737867922751e-05, "loss": 5.5946, "step": 619 }, { "epoch": 0.15763546798029557, "grad_norm": 16345.9580078125, "learning_rate": 4.950298512120714e-05, "loss": 5.5791, "step": 620 }, { "epoch": 0.1578897187351025, "grad_norm": 17236.556640625, "learning_rate": 4.9498572254284336e-05, "loss": 5.596, "step": 621 }, { "epoch": 0.15814396948990941, "grad_norm": 21715.25390625, "learning_rate": 4.949414008193655e-05, "loss": 5.5954, "step": 622 }, { "epoch": 0.15839822024471636, "grad_norm": 17622.6328125, "learning_rate": 4.9489688607656424e-05, "loss": 5.5868, "step": 623 }, { "epoch": 0.1586524709995233, "grad_norm": 16723.8359375, "learning_rate": 4.948521783495183e-05, "loss": 5.5918, "step": 624 }, { "epoch": 0.1589067217543302, "grad_norm": 24881.146484375, "learning_rate": 4.948072776734583e-05, "loss": 5.5659, "step": 625 }, { "epoch": 0.15916097250913713, "grad_norm": 18340.728515625, "learning_rate": 4.94762184083767e-05, "loss": 5.581, "step": 626 }, { "epoch": 0.15941522326394406, "grad_norm": 17637.662109375, "learning_rate": 4.947168976159792e-05, "loss": 5.5728, "step": 627 }, { "epoch": 0.159669474018751, "grad_norm": 31284.01171875, "learning_rate": 4.946714183057815e-05, "loss": 5.5648, "step": 628 }, { "epoch": 0.15992372477355793, "grad_norm": 16909.30078125, "learning_rate": 4.946257461890128e-05, "loss": 5.5673, "step": 629 }, { "epoch": 0.16017797552836485, "grad_norm": 18615.958984375, "learning_rate": 4.9457988130166365e-05, "loss": 5.5708, "step": 630 }, { "epoch": 0.16043222628317177, "grad_norm": 25193.091796875, "learning_rate": 4.9453382367987664e-05, "loss": 5.5711, "step": 631 }, { "epoch": 0.1606864770379787, "grad_norm": 16814.205078125, "learning_rate": 4.944875733599462e-05, "loss": 5.574, "step": 632 }, { "epoch": 0.16094072779278565, "grad_norm": 18124.3515625, "learning_rate": 4.944411303783187e-05, "loss": 5.5659, "step": 633 }, { "epoch": 0.16119497854759257, "grad_norm": 25308.673828125, "learning_rate": 4.943944947715922e-05, "loss": 5.5784, "step": 634 }, { "epoch": 0.1614492293023995, "grad_norm": 17068.517578125, "learning_rate": 4.9434766657651644e-05, "loss": 5.5731, "step": 635 }, { "epoch": 0.16170348005720642, "grad_norm": 18465.509765625, "learning_rate": 4.9430064582999335e-05, "loss": 5.5724, "step": 636 }, { "epoch": 0.16195773081201334, "grad_norm": 20113.205078125, "learning_rate": 4.942534325690762e-05, "loss": 5.5759, "step": 637 }, { "epoch": 0.1622119815668203, "grad_norm": 16281.3671875, "learning_rate": 4.942060268309701e-05, "loss": 5.5619, "step": 638 }, { "epoch": 0.1624662323216272, "grad_norm": 16662.591796875, "learning_rate": 4.941584286530319e-05, "loss": 5.5623, "step": 639 }, { "epoch": 0.16272048307643414, "grad_norm": 17126.716796875, "learning_rate": 4.941106380727699e-05, "loss": 5.5664, "step": 640 }, { "epoch": 0.16297473383124106, "grad_norm": 16530.728515625, "learning_rate": 4.9406265512784435e-05, "loss": 5.5784, "step": 641 }, { "epoch": 0.16322898458604798, "grad_norm": 16175.728515625, "learning_rate": 4.9401447985606676e-05, "loss": 5.5666, "step": 642 }, { "epoch": 0.1634832353408549, "grad_norm": 16602.26953125, "learning_rate": 4.939661122954003e-05, "loss": 5.571, "step": 643 }, { "epoch": 0.16373748609566185, "grad_norm": 16899.26953125, "learning_rate": 4.939175524839598e-05, "loss": 5.5559, "step": 644 }, { "epoch": 0.16399173685046878, "grad_norm": 16559.48046875, "learning_rate": 4.938688004600113e-05, "loss": 5.5655, "step": 645 }, { "epoch": 0.1642459876052757, "grad_norm": 16872.763671875, "learning_rate": 4.938198562619727e-05, "loss": 5.5592, "step": 646 }, { "epoch": 0.16450023836008262, "grad_norm": 17657.427734375, "learning_rate": 4.93770719928413e-05, "loss": 5.5664, "step": 647 }, { "epoch": 0.16475448911488955, "grad_norm": 17201.142578125, "learning_rate": 4.937213914980528e-05, "loss": 5.5816, "step": 648 }, { "epoch": 0.1650087398696965, "grad_norm": 16415.009765625, "learning_rate": 4.93671871009764e-05, "loss": 5.5573, "step": 649 }, { "epoch": 0.16526299062450342, "grad_norm": 16179.7470703125, "learning_rate": 4.936221585025698e-05, "loss": 5.544, "step": 650 }, { "epoch": 0.16551724137931034, "grad_norm": 17267.3203125, "learning_rate": 4.935722540156448e-05, "loss": 5.5482, "step": 651 }, { "epoch": 0.16577149213411727, "grad_norm": 16624.052734375, "learning_rate": 4.935221575883149e-05, "loss": 5.5636, "step": 652 }, { "epoch": 0.1660257428889242, "grad_norm": 16263.875, "learning_rate": 4.9347186926005714e-05, "loss": 5.5569, "step": 653 }, { "epoch": 0.16627999364373114, "grad_norm": 19820.095703125, "learning_rate": 4.934213890704999e-05, "loss": 5.5588, "step": 654 }, { "epoch": 0.16653424439853806, "grad_norm": 18276.126953125, "learning_rate": 4.9337071705942276e-05, "loss": 5.5559, "step": 655 }, { "epoch": 0.16678849515334498, "grad_norm": 16186.74609375, "learning_rate": 4.9331985326675624e-05, "loss": 5.5613, "step": 656 }, { "epoch": 0.1670427459081519, "grad_norm": 19869.337890625, "learning_rate": 4.932687977325823e-05, "loss": 5.5524, "step": 657 }, { "epoch": 0.16729699666295883, "grad_norm": 19918.806640625, "learning_rate": 4.932175504971337e-05, "loss": 5.5572, "step": 658 }, { "epoch": 0.16755124741776578, "grad_norm": 16860.58203125, "learning_rate": 4.9316611160079454e-05, "loss": 5.5482, "step": 659 }, { "epoch": 0.1678054981725727, "grad_norm": 31688.140625, "learning_rate": 4.931144810840999e-05, "loss": 5.551, "step": 660 }, { "epoch": 0.16805974892737963, "grad_norm": 16726.48828125, "learning_rate": 4.930626589877355e-05, "loss": 5.5557, "step": 661 }, { "epoch": 0.16831399968218655, "grad_norm": 20100.140625, "learning_rate": 4.930106453525386e-05, "loss": 5.5503, "step": 662 }, { "epoch": 0.16856825043699347, "grad_norm": 21077.978515625, "learning_rate": 4.92958440219497e-05, "loss": 5.5375, "step": 663 }, { "epoch": 0.16882250119180042, "grad_norm": 16542.46484375, "learning_rate": 4.9290604362974946e-05, "loss": 5.5482, "step": 664 }, { "epoch": 0.16907675194660735, "grad_norm": 17726.416015625, "learning_rate": 4.928534556245857e-05, "loss": 5.5473, "step": 665 }, { "epoch": 0.16933100270141427, "grad_norm": 20504.6953125, "learning_rate": 4.928006762454463e-05, "loss": 5.5525, "step": 666 }, { "epoch": 0.1695852534562212, "grad_norm": 17580.134765625, "learning_rate": 4.927477055339227e-05, "loss": 5.5115, "step": 667 }, { "epoch": 0.16983950421102811, "grad_norm": 16410.330078125, "learning_rate": 4.9269454353175674e-05, "loss": 5.5393, "step": 668 }, { "epoch": 0.17009375496583506, "grad_norm": 22899.498046875, "learning_rate": 4.926411902808415e-05, "loss": 5.5336, "step": 669 }, { "epoch": 0.170348005720642, "grad_norm": 19626.82421875, "learning_rate": 4.925876458232204e-05, "loss": 5.551, "step": 670 }, { "epoch": 0.1706022564754489, "grad_norm": 17094.5546875, "learning_rate": 4.925339102010877e-05, "loss": 5.5443, "step": 671 }, { "epoch": 0.17085650723025583, "grad_norm": 29964.884765625, "learning_rate": 4.9247998345678836e-05, "loss": 5.5488, "step": 672 }, { "epoch": 0.17111075798506276, "grad_norm": 17078.453125, "learning_rate": 4.924258656328178e-05, "loss": 5.53, "step": 673 }, { "epoch": 0.1713650087398697, "grad_norm": 18604.427734375, "learning_rate": 4.9237155677182215e-05, "loss": 5.5466, "step": 674 }, { "epoch": 0.17161925949467663, "grad_norm": 20238.46484375, "learning_rate": 4.923170569165979e-05, "loss": 5.5507, "step": 675 }, { "epoch": 0.17187351024948355, "grad_norm": 16794.333984375, "learning_rate": 4.9226236611009214e-05, "loss": 5.5359, "step": 676 }, { "epoch": 0.17212776100429047, "grad_norm": 17136.896484375, "learning_rate": 4.922074843954026e-05, "loss": 5.536, "step": 677 }, { "epoch": 0.1723820117590974, "grad_norm": 17338.365234375, "learning_rate": 4.921524118157772e-05, "loss": 5.5381, "step": 678 }, { "epoch": 0.17263626251390435, "grad_norm": 16591.2109375, "learning_rate": 4.920971484146144e-05, "loss": 5.5322, "step": 679 }, { "epoch": 0.17289051326871127, "grad_norm": 16398.8046875, "learning_rate": 4.9204169423546304e-05, "loss": 5.549, "step": 680 }, { "epoch": 0.1731447640235182, "grad_norm": 17364.615234375, "learning_rate": 4.9198604932202216e-05, "loss": 5.5261, "step": 681 }, { "epoch": 0.17339901477832512, "grad_norm": 17418.7265625, "learning_rate": 4.919302137181413e-05, "loss": 5.547, "step": 682 }, { "epoch": 0.17365326553313204, "grad_norm": 16392.10546875, "learning_rate": 4.918741874678201e-05, "loss": 5.5165, "step": 683 }, { "epoch": 0.173907516287939, "grad_norm": 17120.955078125, "learning_rate": 4.918179706152086e-05, "loss": 5.5222, "step": 684 }, { "epoch": 0.1741617670427459, "grad_norm": 17222.70703125, "learning_rate": 4.917615632046068e-05, "loss": 5.5374, "step": 685 }, { "epoch": 0.17441601779755284, "grad_norm": 16485.86328125, "learning_rate": 4.917049652804651e-05, "loss": 5.5285, "step": 686 }, { "epoch": 0.17467026855235976, "grad_norm": 19400.7421875, "learning_rate": 4.916481768873839e-05, "loss": 5.5386, "step": 687 }, { "epoch": 0.17492451930716668, "grad_norm": 17421.396484375, "learning_rate": 4.915911980701137e-05, "loss": 5.516, "step": 688 }, { "epoch": 0.17517877006197363, "grad_norm": 17308.685546875, "learning_rate": 4.915340288735552e-05, "loss": 5.5295, "step": 689 }, { "epoch": 0.17543302081678055, "grad_norm": 22447.404296875, "learning_rate": 4.9147666934275895e-05, "loss": 5.5153, "step": 690 }, { "epoch": 0.17568727157158748, "grad_norm": 17880.169921875, "learning_rate": 4.9141911952292554e-05, "loss": 5.5266, "step": 691 }, { "epoch": 0.1759415223263944, "grad_norm": 16937.1796875, "learning_rate": 4.9136137945940544e-05, "loss": 5.5306, "step": 692 }, { "epoch": 0.17619577308120132, "grad_norm": 22388.53125, "learning_rate": 4.913034491976992e-05, "loss": 5.5206, "step": 693 }, { "epoch": 0.17645002383600827, "grad_norm": 18653.53125, "learning_rate": 4.9124532878345724e-05, "loss": 5.5387, "step": 694 }, { "epoch": 0.1767042745908152, "grad_norm": 17489.583984375, "learning_rate": 4.911870182624796e-05, "loss": 5.5089, "step": 695 }, { "epoch": 0.17695852534562212, "grad_norm": 32763.01171875, "learning_rate": 4.911285176807164e-05, "loss": 5.529, "step": 696 }, { "epoch": 0.17721277610042904, "grad_norm": 17255.34765625, "learning_rate": 4.910698270842674e-05, "loss": 5.5427, "step": 697 }, { "epoch": 0.17746702685523597, "grad_norm": 18857.74609375, "learning_rate": 4.910109465193821e-05, "loss": 5.5177, "step": 698 }, { "epoch": 0.17772127761004292, "grad_norm": 22253.734375, "learning_rate": 4.909518760324595e-05, "loss": 5.5126, "step": 699 }, { "epoch": 0.17797552836484984, "grad_norm": 16930.79296875, "learning_rate": 4.908926156700488e-05, "loss": 5.5265, "step": 700 }, { "epoch": 0.17797552836484984, "eval_loss": 11.12784481048584, "eval_runtime": 699.3397, "eval_samples_per_second": 151.552, "eval_steps_per_second": 9.473, "step": 700 }, { "epoch": 0.17822977911965676, "grad_norm": 17669.5, "learning_rate": 4.9083316547884826e-05, "loss": 5.5158, "step": 701 }, { "epoch": 0.17848402987446368, "grad_norm": 18947.248046875, "learning_rate": 4.907735255057061e-05, "loss": 5.5145, "step": 702 }, { "epoch": 0.1787382806292706, "grad_norm": 16640.62109375, "learning_rate": 4.9071369579761995e-05, "loss": 5.5266, "step": 703 }, { "epoch": 0.17899253138407756, "grad_norm": 16972.955078125, "learning_rate": 4.906536764017369e-05, "loss": 5.524, "step": 704 }, { "epoch": 0.17924678213888448, "grad_norm": 19407.259765625, "learning_rate": 4.905934673653536e-05, "loss": 5.5118, "step": 705 }, { "epoch": 0.1795010328936914, "grad_norm": 18232.404296875, "learning_rate": 4.905330687359161e-05, "loss": 5.5218, "step": 706 }, { "epoch": 0.17975528364849833, "grad_norm": 17059.677734375, "learning_rate": 4.904724805610199e-05, "loss": 5.5465, "step": 707 }, { "epoch": 0.18000953440330525, "grad_norm": 27981.82421875, "learning_rate": 4.9041170288840985e-05, "loss": 5.5318, "step": 708 }, { "epoch": 0.1802637851581122, "grad_norm": 19111.18359375, "learning_rate": 4.9035073576598014e-05, "loss": 5.507, "step": 709 }, { "epoch": 0.18051803591291912, "grad_norm": 17588.126953125, "learning_rate": 4.902895792417742e-05, "loss": 5.5193, "step": 710 }, { "epoch": 0.18077228666772605, "grad_norm": 27783.1328125, "learning_rate": 4.902282333639847e-05, "loss": 5.5129, "step": 711 }, { "epoch": 0.18102653742253297, "grad_norm": 17371.529296875, "learning_rate": 4.901666981809537e-05, "loss": 5.5128, "step": 712 }, { "epoch": 0.1812807881773399, "grad_norm": 18173.57421875, "learning_rate": 4.9010497374117214e-05, "loss": 5.5234, "step": 713 }, { "epoch": 0.18153503893214684, "grad_norm": 20874.427734375, "learning_rate": 4.900430600932804e-05, "loss": 5.5069, "step": 714 }, { "epoch": 0.18178928968695376, "grad_norm": 17087.595703125, "learning_rate": 4.899809572860677e-05, "loss": 5.5097, "step": 715 }, { "epoch": 0.1820435404417607, "grad_norm": 17009.755859375, "learning_rate": 4.899186653684726e-05, "loss": 5.5087, "step": 716 }, { "epoch": 0.1822977911965676, "grad_norm": 19133.095703125, "learning_rate": 4.8985618438958254e-05, "loss": 5.5119, "step": 717 }, { "epoch": 0.18255204195137453, "grad_norm": 16642.587890625, "learning_rate": 4.8979351439863376e-05, "loss": 5.5097, "step": 718 }, { "epoch": 0.18280629270618148, "grad_norm": 16945.041015625, "learning_rate": 4.897306554450117e-05, "loss": 5.5119, "step": 719 }, { "epoch": 0.1830605434609884, "grad_norm": 17844.716796875, "learning_rate": 4.896676075782506e-05, "loss": 5.5076, "step": 720 }, { "epoch": 0.18331479421579533, "grad_norm": 17752.22265625, "learning_rate": 4.896043708480337e-05, "loss": 5.5019, "step": 721 }, { "epoch": 0.18356904497060225, "grad_norm": 16694.708984375, "learning_rate": 4.895409453041928e-05, "loss": 5.5121, "step": 722 }, { "epoch": 0.18382329572540917, "grad_norm": 16469.119140625, "learning_rate": 4.894773309967088e-05, "loss": 5.5073, "step": 723 }, { "epoch": 0.18407754648021613, "grad_norm": 16523.3828125, "learning_rate": 4.894135279757111e-05, "loss": 5.5003, "step": 724 }, { "epoch": 0.18433179723502305, "grad_norm": 16605.66015625, "learning_rate": 4.89349536291478e-05, "loss": 5.5058, "step": 725 }, { "epoch": 0.18458604798982997, "grad_norm": 16735.14453125, "learning_rate": 4.892853559944363e-05, "loss": 5.5068, "step": 726 }, { "epoch": 0.1848402987446369, "grad_norm": 16979.35546875, "learning_rate": 4.8922098713516165e-05, "loss": 5.5005, "step": 727 }, { "epoch": 0.18509454949944382, "grad_norm": 17449.349609375, "learning_rate": 4.89156429764378e-05, "loss": 5.499, "step": 728 }, { "epoch": 0.18534880025425077, "grad_norm": 17190.453125, "learning_rate": 4.8909168393295803e-05, "loss": 5.4871, "step": 729 }, { "epoch": 0.1856030510090577, "grad_norm": 16658.19921875, "learning_rate": 4.8902674969192294e-05, "loss": 5.5104, "step": 730 }, { "epoch": 0.1858573017638646, "grad_norm": 16575.8046875, "learning_rate": 4.889616270924425e-05, "loss": 5.5139, "step": 731 }, { "epoch": 0.18611155251867154, "grad_norm": 17014.2421875, "learning_rate": 4.888963161858346e-05, "loss": 5.4892, "step": 732 }, { "epoch": 0.18636580327347846, "grad_norm": 17797.419921875, "learning_rate": 4.888308170235657e-05, "loss": 5.4889, "step": 733 }, { "epoch": 0.1866200540282854, "grad_norm": 18417.068359375, "learning_rate": 4.887651296572508e-05, "loss": 5.4938, "step": 734 }, { "epoch": 0.18687430478309233, "grad_norm": 17704.82421875, "learning_rate": 4.886992541386528e-05, "loss": 5.4811, "step": 735 }, { "epoch": 0.18712855553789925, "grad_norm": 16696.72265625, "learning_rate": 4.886331905196831e-05, "loss": 5.4995, "step": 736 }, { "epoch": 0.18738280629270618, "grad_norm": 18708.583984375, "learning_rate": 4.8856693885240154e-05, "loss": 5.4957, "step": 737 }, { "epoch": 0.1876370570475131, "grad_norm": 22231.39453125, "learning_rate": 4.8850049918901574e-05, "loss": 5.5008, "step": 738 }, { "epoch": 0.18789130780232005, "grad_norm": 16901.51171875, "learning_rate": 4.884338715818817e-05, "loss": 5.4764, "step": 739 }, { "epoch": 0.18814555855712697, "grad_norm": 20277.150390625, "learning_rate": 4.883670560835034e-05, "loss": 5.4936, "step": 740 }, { "epoch": 0.1883998093119339, "grad_norm": 22048.296875, "learning_rate": 4.88300052746533e-05, "loss": 5.5044, "step": 741 }, { "epoch": 0.18865406006674082, "grad_norm": 17260.599609375, "learning_rate": 4.882328616237707e-05, "loss": 5.4832, "step": 742 }, { "epoch": 0.18890831082154774, "grad_norm": 39408.87890625, "learning_rate": 4.8816548276816446e-05, "loss": 5.5015, "step": 743 }, { "epoch": 0.1891625615763547, "grad_norm": 16681.390625, "learning_rate": 4.880979162328105e-05, "loss": 5.4792, "step": 744 }, { "epoch": 0.18941681233116162, "grad_norm": 23736.94921875, "learning_rate": 4.8803016207095263e-05, "loss": 5.4793, "step": 745 }, { "epoch": 0.18967106308596854, "grad_norm": 17365.373046875, "learning_rate": 4.879622203359828e-05, "loss": 5.4833, "step": 746 }, { "epoch": 0.18992531384077546, "grad_norm": 38777.96875, "learning_rate": 4.8789409108144046e-05, "loss": 5.4903, "step": 747 }, { "epoch": 0.19017956459558238, "grad_norm": 17310.361328125, "learning_rate": 4.878257743610131e-05, "loss": 5.4888, "step": 748 }, { "epoch": 0.19043381535038933, "grad_norm": 24678.96484375, "learning_rate": 4.877572702285358e-05, "loss": 5.4846, "step": 749 }, { "epoch": 0.19068806610519626, "grad_norm": 20161.830078125, "learning_rate": 4.8768857873799136e-05, "loss": 5.4817, "step": 750 }, { "epoch": 0.19094231686000318, "grad_norm": 36119.22265625, "learning_rate": 4.876196999435101e-05, "loss": 5.4972, "step": 751 }, { "epoch": 0.1911965676148101, "grad_norm": 21399.68359375, "learning_rate": 4.875506338993703e-05, "loss": 5.4785, "step": 752 }, { "epoch": 0.19145081836961703, "grad_norm": 22758.49609375, "learning_rate": 4.8748138065999736e-05, "loss": 5.4763, "step": 753 }, { "epoch": 0.19170506912442398, "grad_norm": 23260.380859375, "learning_rate": 4.874119402799644e-05, "loss": 5.475, "step": 754 }, { "epoch": 0.1919593198792309, "grad_norm": 17307.568359375, "learning_rate": 4.873423128139921e-05, "loss": 5.4839, "step": 755 }, { "epoch": 0.19221357063403782, "grad_norm": 29400.138671875, "learning_rate": 4.8727249831694845e-05, "loss": 5.462, "step": 756 }, { "epoch": 0.19246782138884475, "grad_norm": 16735.525390625, "learning_rate": 4.872024968438487e-05, "loss": 5.4587, "step": 757 }, { "epoch": 0.19272207214365167, "grad_norm": 20396.298828125, "learning_rate": 4.871323084498557e-05, "loss": 5.481, "step": 758 }, { "epoch": 0.1929763228984586, "grad_norm": 18180.109375, "learning_rate": 4.870619331902795e-05, "loss": 5.4751, "step": 759 }, { "epoch": 0.19323057365326554, "grad_norm": 25730.244140625, "learning_rate": 4.869913711205773e-05, "loss": 5.4655, "step": 760 }, { "epoch": 0.19348482440807246, "grad_norm": 19287.75390625, "learning_rate": 4.869206222963537e-05, "loss": 5.497, "step": 761 }, { "epoch": 0.1937390751628794, "grad_norm": 20065.8671875, "learning_rate": 4.868496867733603e-05, "loss": 5.4752, "step": 762 }, { "epoch": 0.1939933259176863, "grad_norm": 20584.716796875, "learning_rate": 4.867785646074959e-05, "loss": 5.4681, "step": 763 }, { "epoch": 0.19424757667249323, "grad_norm": 17356.533203125, "learning_rate": 4.8670725585480635e-05, "loss": 5.4707, "step": 764 }, { "epoch": 0.19450182742730018, "grad_norm": 22042.802734375, "learning_rate": 4.866357605714845e-05, "loss": 5.4789, "step": 765 }, { "epoch": 0.1947560781821071, "grad_norm": 17293.966796875, "learning_rate": 4.8656407881387035e-05, "loss": 5.4921, "step": 766 }, { "epoch": 0.19501032893691403, "grad_norm": 18169.630859375, "learning_rate": 4.864922106384506e-05, "loss": 5.4772, "step": 767 }, { "epoch": 0.19526457969172095, "grad_norm": 16894.482421875, "learning_rate": 4.8642015610185914e-05, "loss": 5.4545, "step": 768 }, { "epoch": 0.19551883044652787, "grad_norm": 19278.734375, "learning_rate": 4.863479152608764e-05, "loss": 5.4621, "step": 769 }, { "epoch": 0.19577308120133483, "grad_norm": 16807.013671875, "learning_rate": 4.8627548817242995e-05, "loss": 5.4468, "step": 770 }, { "epoch": 0.19602733195614175, "grad_norm": 17927.306640625, "learning_rate": 4.8620287489359384e-05, "loss": 5.4683, "step": 771 }, { "epoch": 0.19628158271094867, "grad_norm": 17142.498046875, "learning_rate": 4.8613007548158906e-05, "loss": 5.4564, "step": 772 }, { "epoch": 0.1965358334657556, "grad_norm": 17235.14453125, "learning_rate": 4.860570899937831e-05, "loss": 5.4768, "step": 773 }, { "epoch": 0.19679008422056252, "grad_norm": 17203.421875, "learning_rate": 4.8598391848769006e-05, "loss": 5.494, "step": 774 }, { "epoch": 0.19704433497536947, "grad_norm": 17139.166015625, "learning_rate": 4.85910561020971e-05, "loss": 5.4797, "step": 775 }, { "epoch": 0.1972985857301764, "grad_norm": 17136.216796875, "learning_rate": 4.858370176514331e-05, "loss": 5.4656, "step": 776 }, { "epoch": 0.1975528364849833, "grad_norm": 17079.115234375, "learning_rate": 4.857632884370301e-05, "loss": 5.4574, "step": 777 }, { "epoch": 0.19780708723979024, "grad_norm": 17595.861328125, "learning_rate": 4.856893734358625e-05, "loss": 5.4625, "step": 778 }, { "epoch": 0.19806133799459716, "grad_norm": 16878.646484375, "learning_rate": 4.856152727061768e-05, "loss": 5.4655, "step": 779 }, { "epoch": 0.1983155887494041, "grad_norm": 17753.927734375, "learning_rate": 4.85540986306366e-05, "loss": 5.4699, "step": 780 }, { "epoch": 0.19856983950421103, "grad_norm": 17242.048828125, "learning_rate": 4.8546651429496967e-05, "loss": 5.4609, "step": 781 }, { "epoch": 0.19882409025901795, "grad_norm": 17300.095703125, "learning_rate": 4.8539185673067325e-05, "loss": 5.4773, "step": 782 }, { "epoch": 0.19907834101382488, "grad_norm": 17454.78515625, "learning_rate": 4.853170136723086e-05, "loss": 5.4481, "step": 783 }, { "epoch": 0.1993325917686318, "grad_norm": 17033.931640625, "learning_rate": 4.8524198517885376e-05, "loss": 5.4591, "step": 784 }, { "epoch": 0.19958684252343875, "grad_norm": 17601.669921875, "learning_rate": 4.851667713094329e-05, "loss": 5.4621, "step": 785 }, { "epoch": 0.19984109327824567, "grad_norm": 16838.68359375, "learning_rate": 4.850913721233162e-05, "loss": 5.4574, "step": 786 }, { "epoch": 0.2000953440330526, "grad_norm": 16886.314453125, "learning_rate": 4.850157876799198e-05, "loss": 5.4516, "step": 787 }, { "epoch": 0.20034959478785952, "grad_norm": 16944.869140625, "learning_rate": 4.849400180388062e-05, "loss": 5.4362, "step": 788 }, { "epoch": 0.20060384554266644, "grad_norm": 17570.51171875, "learning_rate": 4.848640632596834e-05, "loss": 5.4597, "step": 789 }, { "epoch": 0.2008580962974734, "grad_norm": 16958.255859375, "learning_rate": 4.8478792340240543e-05, "loss": 5.4461, "step": 790 }, { "epoch": 0.20111234705228032, "grad_norm": 17096.994140625, "learning_rate": 4.847115985269723e-05, "loss": 5.4676, "step": 791 }, { "epoch": 0.20136659780708724, "grad_norm": 16994.615234375, "learning_rate": 4.846350886935298e-05, "loss": 5.4394, "step": 792 }, { "epoch": 0.20162084856189416, "grad_norm": 17317.64453125, "learning_rate": 4.845583939623692e-05, "loss": 5.4537, "step": 793 }, { "epoch": 0.20187509931670108, "grad_norm": 17152.9921875, "learning_rate": 4.844815143939277e-05, "loss": 5.4515, "step": 794 }, { "epoch": 0.20212935007150803, "grad_norm": 17149.2421875, "learning_rate": 4.8440445004878836e-05, "loss": 5.4507, "step": 795 }, { "epoch": 0.20238360082631496, "grad_norm": 17196.46484375, "learning_rate": 4.8432720098767934e-05, "loss": 5.4573, "step": 796 }, { "epoch": 0.20263785158112188, "grad_norm": 17182.818359375, "learning_rate": 4.842497672714749e-05, "loss": 5.4417, "step": 797 }, { "epoch": 0.2028921023359288, "grad_norm": 17190.388671875, "learning_rate": 4.841721489611942e-05, "loss": 5.4481, "step": 798 }, { "epoch": 0.20314635309073573, "grad_norm": 17173.533203125, "learning_rate": 4.8409434611800254e-05, "loss": 5.434, "step": 799 }, { "epoch": 0.20340060384554268, "grad_norm": 17155.787109375, "learning_rate": 4.8401635880321007e-05, "loss": 5.4492, "step": 800 }, { "epoch": 0.20340060384554268, "eval_loss": 10.975968360900879, "eval_runtime": 698.7587, "eval_samples_per_second": 151.678, "eval_steps_per_second": 9.481, "step": 800 }, { "epoch": 0.2036548546003496, "grad_norm": 17709.8515625, "learning_rate": 4.839381870782726e-05, "loss": 5.4467, "step": 801 }, { "epoch": 0.20390910535515652, "grad_norm": 18139.349609375, "learning_rate": 4.8385983100479135e-05, "loss": 5.4396, "step": 802 }, { "epoch": 0.20416335610996345, "grad_norm": 19168.87890625, "learning_rate": 4.837812906445126e-05, "loss": 5.4391, "step": 803 }, { "epoch": 0.20441760686477037, "grad_norm": 18207.087890625, "learning_rate": 4.8370256605932784e-05, "loss": 5.4577, "step": 804 }, { "epoch": 0.20467185761957732, "grad_norm": 17704.251953125, "learning_rate": 4.8362365731127385e-05, "loss": 5.4406, "step": 805 }, { "epoch": 0.20492610837438424, "grad_norm": 18517.12890625, "learning_rate": 4.835445644625325e-05, "loss": 5.4563, "step": 806 }, { "epoch": 0.20518035912919116, "grad_norm": 16896.431640625, "learning_rate": 4.834652875754307e-05, "loss": 5.438, "step": 807 }, { "epoch": 0.2054346098839981, "grad_norm": 17488.44140625, "learning_rate": 4.833858267124405e-05, "loss": 5.4463, "step": 808 }, { "epoch": 0.205688860638805, "grad_norm": 17255.595703125, "learning_rate": 4.833061819361787e-05, "loss": 5.4575, "step": 809 }, { "epoch": 0.20594311139361196, "grad_norm": 17746.630859375, "learning_rate": 4.832263533094073e-05, "loss": 5.4447, "step": 810 }, { "epoch": 0.20619736214841888, "grad_norm": 18005.53125, "learning_rate": 4.831463408950331e-05, "loss": 5.442, "step": 811 }, { "epoch": 0.2064516129032258, "grad_norm": 17274.60546875, "learning_rate": 4.830661447561074e-05, "loss": 5.4466, "step": 812 }, { "epoch": 0.20670586365803273, "grad_norm": 19063.30078125, "learning_rate": 4.8298576495582694e-05, "loss": 5.4185, "step": 813 }, { "epoch": 0.20696011441283965, "grad_norm": 17205.919921875, "learning_rate": 4.8290520155753254e-05, "loss": 5.4361, "step": 814 }, { "epoch": 0.2072143651676466, "grad_norm": 18312.072265625, "learning_rate": 4.8282445462471004e-05, "loss": 5.424, "step": 815 }, { "epoch": 0.20746861592245353, "grad_norm": 18321.888671875, "learning_rate": 4.827435242209898e-05, "loss": 5.4184, "step": 816 }, { "epoch": 0.20772286667726045, "grad_norm": 17325.552734375, "learning_rate": 4.826624104101469e-05, "loss": 5.4168, "step": 817 }, { "epoch": 0.20797711743206737, "grad_norm": 18686.46484375, "learning_rate": 4.825811132561008e-05, "loss": 5.4188, "step": 818 }, { "epoch": 0.2082313681868743, "grad_norm": 17174.45703125, "learning_rate": 4.8249963282291544e-05, "loss": 5.459, "step": 819 }, { "epoch": 0.20848561894168124, "grad_norm": 18713.06640625, "learning_rate": 4.824179691747992e-05, "loss": 5.4098, "step": 820 }, { "epoch": 0.20873986969648817, "grad_norm": 17252.634765625, "learning_rate": 4.8233612237610493e-05, "loss": 5.4444, "step": 821 }, { "epoch": 0.2089941204512951, "grad_norm": 17764.0, "learning_rate": 4.822540924913298e-05, "loss": 5.4287, "step": 822 }, { "epoch": 0.209248371206102, "grad_norm": 17277.396484375, "learning_rate": 4.82171879585115e-05, "loss": 5.4396, "step": 823 }, { "epoch": 0.20950262196090894, "grad_norm": 17452.599609375, "learning_rate": 4.820894837222464e-05, "loss": 5.4151, "step": 824 }, { "epoch": 0.2097568727157159, "grad_norm": 17822.01953125, "learning_rate": 4.820069049676537e-05, "loss": 5.4145, "step": 825 }, { "epoch": 0.2100111234705228, "grad_norm": 17212.953125, "learning_rate": 4.819241433864107e-05, "loss": 5.422, "step": 826 }, { "epoch": 0.21026537422532973, "grad_norm": 18727.654296875, "learning_rate": 4.818411990437355e-05, "loss": 5.4338, "step": 827 }, { "epoch": 0.21051962498013665, "grad_norm": 17617.97265625, "learning_rate": 4.817580720049901e-05, "loss": 5.4423, "step": 828 }, { "epoch": 0.21077387573494358, "grad_norm": 17747.548828125, "learning_rate": 4.8167476233568045e-05, "loss": 5.4114, "step": 829 }, { "epoch": 0.21102812648975053, "grad_norm": 18010.123046875, "learning_rate": 4.815912701014563e-05, "loss": 5.4305, "step": 830 }, { "epoch": 0.21128237724455745, "grad_norm": 17344.083984375, "learning_rate": 4.815075953681117e-05, "loss": 5.4259, "step": 831 }, { "epoch": 0.21153662799936437, "grad_norm": 18157.705078125, "learning_rate": 4.8142373820158396e-05, "loss": 5.4213, "step": 832 }, { "epoch": 0.2117908787541713, "grad_norm": 17346.078125, "learning_rate": 4.813396986679546e-05, "loss": 5.441, "step": 833 }, { "epoch": 0.21204512950897822, "grad_norm": 18123.140625, "learning_rate": 4.8125547683344854e-05, "loss": 5.4167, "step": 834 }, { "epoch": 0.21229938026378517, "grad_norm": 17636.365234375, "learning_rate": 4.8117107276443446e-05, "loss": 5.4146, "step": 835 }, { "epoch": 0.2125536310185921, "grad_norm": 17534.755859375, "learning_rate": 4.8108648652742475e-05, "loss": 5.4249, "step": 836 }, { "epoch": 0.21280788177339902, "grad_norm": 18610.494140625, "learning_rate": 4.810017181890752e-05, "loss": 5.4308, "step": 837 }, { "epoch": 0.21306213252820594, "grad_norm": 17490.263671875, "learning_rate": 4.809167678161852e-05, "loss": 5.4097, "step": 838 }, { "epoch": 0.21331638328301286, "grad_norm": 17355.6171875, "learning_rate": 4.8083163547569754e-05, "loss": 5.4133, "step": 839 }, { "epoch": 0.2135706340378198, "grad_norm": 18585.685546875, "learning_rate": 4.8074632123469834e-05, "loss": 5.4228, "step": 840 }, { "epoch": 0.21382488479262673, "grad_norm": 18207.43359375, "learning_rate": 4.806608251604173e-05, "loss": 5.4165, "step": 841 }, { "epoch": 0.21407913554743366, "grad_norm": 17172.05859375, "learning_rate": 4.8057514732022716e-05, "loss": 5.4087, "step": 842 }, { "epoch": 0.21433338630224058, "grad_norm": 17985.322265625, "learning_rate": 4.8048928778164395e-05, "loss": 5.4057, "step": 843 }, { "epoch": 0.2145876370570475, "grad_norm": 17661.4453125, "learning_rate": 4.8040324661232686e-05, "loss": 5.4098, "step": 844 }, { "epoch": 0.21484188781185445, "grad_norm": 17584.203125, "learning_rate": 4.8031702388007845e-05, "loss": 5.411, "step": 845 }, { "epoch": 0.21509613856666138, "grad_norm": 19419.599609375, "learning_rate": 4.80230619652844e-05, "loss": 5.4085, "step": 846 }, { "epoch": 0.2153503893214683, "grad_norm": 18347.896484375, "learning_rate": 4.801440339987121e-05, "loss": 5.4191, "step": 847 }, { "epoch": 0.21560464007627522, "grad_norm": 17328.365234375, "learning_rate": 4.800572669859141e-05, "loss": 5.3976, "step": 848 }, { "epoch": 0.21585889083108215, "grad_norm": 19423.048828125, "learning_rate": 4.7997031868282435e-05, "loss": 5.4107, "step": 849 }, { "epoch": 0.2161131415858891, "grad_norm": 18428.134765625, "learning_rate": 4.7988318915796016e-05, "loss": 5.4035, "step": 850 }, { "epoch": 0.21636739234069602, "grad_norm": 17473.404296875, "learning_rate": 4.797958784799815e-05, "loss": 5.3878, "step": 851 }, { "epoch": 0.21662164309550294, "grad_norm": 18494.755859375, "learning_rate": 4.7970838671769114e-05, "loss": 5.3993, "step": 852 }, { "epoch": 0.21687589385030986, "grad_norm": 17283.00390625, "learning_rate": 4.796207139400345e-05, "loss": 5.3892, "step": 853 }, { "epoch": 0.2171301446051168, "grad_norm": 18038.091796875, "learning_rate": 4.795328602160998e-05, "loss": 5.4199, "step": 854 }, { "epoch": 0.21738439535992374, "grad_norm": 18823.369140625, "learning_rate": 4.7944482561511773e-05, "loss": 5.3995, "step": 855 }, { "epoch": 0.21763864611473066, "grad_norm": 17727.009765625, "learning_rate": 4.793566102064614e-05, "loss": 5.4035, "step": 856 }, { "epoch": 0.21789289686953758, "grad_norm": 17558.423828125, "learning_rate": 4.792682140596467e-05, "loss": 5.4095, "step": 857 }, { "epoch": 0.2181471476243445, "grad_norm": 17932.189453125, "learning_rate": 4.791796372443317e-05, "loss": 5.4084, "step": 858 }, { "epoch": 0.21840139837915143, "grad_norm": 17523.052734375, "learning_rate": 4.790908798303169e-05, "loss": 5.3872, "step": 859 }, { "epoch": 0.21865564913395838, "grad_norm": 18066.94921875, "learning_rate": 4.790019418875452e-05, "loss": 5.3969, "step": 860 }, { "epoch": 0.2189098998887653, "grad_norm": 17449.490234375, "learning_rate": 4.789128234861017e-05, "loss": 5.3993, "step": 861 }, { "epoch": 0.21916415064357223, "grad_norm": 17495.359375, "learning_rate": 4.7882352469621354e-05, "loss": 5.3987, "step": 862 }, { "epoch": 0.21941840139837915, "grad_norm": 17641.064453125, "learning_rate": 4.787340455882504e-05, "loss": 5.3996, "step": 863 }, { "epoch": 0.21967265215318607, "grad_norm": 17604.541015625, "learning_rate": 4.786443862327237e-05, "loss": 5.4015, "step": 864 }, { "epoch": 0.21992690290799302, "grad_norm": 17249.251953125, "learning_rate": 4.78554546700287e-05, "loss": 5.3957, "step": 865 }, { "epoch": 0.22018115366279994, "grad_norm": 17524.5703125, "learning_rate": 4.7846452706173596e-05, "loss": 5.3888, "step": 866 }, { "epoch": 0.22043540441760687, "grad_norm": 17364.328125, "learning_rate": 4.7837432738800796e-05, "loss": 5.3971, "step": 867 }, { "epoch": 0.2206896551724138, "grad_norm": 17521.37109375, "learning_rate": 4.782839477501825e-05, "loss": 5.3782, "step": 868 }, { "epoch": 0.2209439059272207, "grad_norm": 18890.900390625, "learning_rate": 4.781933882194807e-05, "loss": 5.384, "step": 869 }, { "epoch": 0.22119815668202766, "grad_norm": 17461.412109375, "learning_rate": 4.781026488672655e-05, "loss": 5.3917, "step": 870 }, { "epoch": 0.2214524074368346, "grad_norm": 17433.080078125, "learning_rate": 4.780117297650415e-05, "loss": 5.3781, "step": 871 }, { "epoch": 0.2217066581916415, "grad_norm": 17750.408203125, "learning_rate": 4.779206309844551e-05, "loss": 5.3757, "step": 872 }, { "epoch": 0.22196090894644843, "grad_norm": 17979.791015625, "learning_rate": 4.7782935259729414e-05, "loss": 5.3908, "step": 873 }, { "epoch": 0.22221515970125535, "grad_norm": 18263.474609375, "learning_rate": 4.77737894675488e-05, "loss": 5.3898, "step": 874 }, { "epoch": 0.22246941045606228, "grad_norm": 18163.5, "learning_rate": 4.776462572911076e-05, "loss": 5.4258, "step": 875 }, { "epoch": 0.22272366121086923, "grad_norm": 17572.2890625, "learning_rate": 4.7755444051636525e-05, "loss": 5.3994, "step": 876 }, { "epoch": 0.22297791196567615, "grad_norm": 17499.98046875, "learning_rate": 4.774624444236147e-05, "loss": 5.3757, "step": 877 }, { "epoch": 0.22323216272048307, "grad_norm": 17581.466796875, "learning_rate": 4.773702690853508e-05, "loss": 5.3822, "step": 878 }, { "epoch": 0.22348641347529, "grad_norm": 17665.419921875, "learning_rate": 4.772779145742099e-05, "loss": 5.3816, "step": 879 }, { "epoch": 0.22374066423009692, "grad_norm": 17547.91015625, "learning_rate": 4.771853809629694e-05, "loss": 5.3914, "step": 880 }, { "epoch": 0.22399491498490387, "grad_norm": 17665.396484375, "learning_rate": 4.7709266832454786e-05, "loss": 5.3713, "step": 881 }, { "epoch": 0.2242491657397108, "grad_norm": 17600.853515625, "learning_rate": 4.769997767320049e-05, "loss": 5.3654, "step": 882 }, { "epoch": 0.22450341649451772, "grad_norm": 17626.947265625, "learning_rate": 4.769067062585412e-05, "loss": 5.3778, "step": 883 }, { "epoch": 0.22475766724932464, "grad_norm": 18446.50390625, "learning_rate": 4.768134569774984e-05, "loss": 5.3867, "step": 884 }, { "epoch": 0.22501191800413156, "grad_norm": 18473.2109375, "learning_rate": 4.76720028962359e-05, "loss": 5.367, "step": 885 }, { "epoch": 0.2252661687589385, "grad_norm": 17785.484375, "learning_rate": 4.766264222867463e-05, "loss": 5.3794, "step": 886 }, { "epoch": 0.22552041951374543, "grad_norm": 17738.54296875, "learning_rate": 4.7653263702442464e-05, "loss": 5.3989, "step": 887 }, { "epoch": 0.22577467026855236, "grad_norm": 19106.841796875, "learning_rate": 4.764386732492988e-05, "loss": 5.3801, "step": 888 }, { "epoch": 0.22602892102335928, "grad_norm": 18218.19140625, "learning_rate": 4.7634453103541434e-05, "loss": 5.3671, "step": 889 }, { "epoch": 0.2262831717781662, "grad_norm": 17934.93359375, "learning_rate": 4.7625021045695736e-05, "loss": 5.3913, "step": 890 }, { "epoch": 0.22653742253297315, "grad_norm": 17768.75390625, "learning_rate": 4.761557115882549e-05, "loss": 5.3628, "step": 891 }, { "epoch": 0.22679167328778008, "grad_norm": 17855.341796875, "learning_rate": 4.760610345037738e-05, "loss": 5.3907, "step": 892 }, { "epoch": 0.227045924042587, "grad_norm": 17917.2421875, "learning_rate": 4.759661792781219e-05, "loss": 5.3769, "step": 893 }, { "epoch": 0.22730017479739392, "grad_norm": 17846.27734375, "learning_rate": 4.7587114598604745e-05, "loss": 5.3572, "step": 894 }, { "epoch": 0.22755442555220085, "grad_norm": 17893.140625, "learning_rate": 4.757759347024384e-05, "loss": 5.3806, "step": 895 }, { "epoch": 0.2278086763070078, "grad_norm": 18742.2890625, "learning_rate": 4.7568054550232376e-05, "loss": 5.3574, "step": 896 }, { "epoch": 0.22806292706181472, "grad_norm": 17868.220703125, "learning_rate": 4.755849784608721e-05, "loss": 5.3726, "step": 897 }, { "epoch": 0.22831717781662164, "grad_norm": 17623.63671875, "learning_rate": 4.754892336533926e-05, "loss": 5.3723, "step": 898 }, { "epoch": 0.22857142857142856, "grad_norm": 18265.275390625, "learning_rate": 4.7539331115533416e-05, "loss": 5.367, "step": 899 }, { "epoch": 0.2288256793262355, "grad_norm": 18189.677734375, "learning_rate": 4.7529721104228594e-05, "loss": 5.3674, "step": 900 }, { "epoch": 0.2288256793262355, "eval_loss": 10.820060729980469, "eval_runtime": 700.6181, "eval_samples_per_second": 151.275, "eval_steps_per_second": 9.456, "step": 900 }, { "epoch": 0.22907993008104244, "grad_norm": 17588.166015625, "learning_rate": 4.75200933389977e-05, "loss": 5.3686, "step": 901 }, { "epoch": 0.22933418083584936, "grad_norm": 17979.3203125, "learning_rate": 4.751044782742762e-05, "loss": 5.3782, "step": 902 }, { "epoch": 0.22958843159065628, "grad_norm": 17750.35546875, "learning_rate": 4.750078457711924e-05, "loss": 5.3605, "step": 903 }, { "epoch": 0.2298426823454632, "grad_norm": 18194.66796875, "learning_rate": 4.749110359568741e-05, "loss": 5.3513, "step": 904 }, { "epoch": 0.23009693310027013, "grad_norm": 18587.244140625, "learning_rate": 4.748140489076098e-05, "loss": 5.372, "step": 905 }, { "epoch": 0.23035118385507708, "grad_norm": 18668.67578125, "learning_rate": 4.747168846998273e-05, "loss": 5.3629, "step": 906 }, { "epoch": 0.230605434609884, "grad_norm": 18011.33203125, "learning_rate": 4.746195434100943e-05, "loss": 5.3521, "step": 907 }, { "epoch": 0.23085968536469093, "grad_norm": 17774.375, "learning_rate": 4.745220251151178e-05, "loss": 5.3644, "step": 908 }, { "epoch": 0.23111393611949785, "grad_norm": 18431.01171875, "learning_rate": 4.7442432989174447e-05, "loss": 5.3618, "step": 909 }, { "epoch": 0.23136818687430477, "grad_norm": 21850.416015625, "learning_rate": 4.743264578169603e-05, "loss": 5.3504, "step": 910 }, { "epoch": 0.23162243762911172, "grad_norm": 22610.974609375, "learning_rate": 4.742284089678908e-05, "loss": 5.3578, "step": 911 }, { "epoch": 0.23187668838391864, "grad_norm": 17902.734375, "learning_rate": 4.741301834218006e-05, "loss": 5.3586, "step": 912 }, { "epoch": 0.23213093913872557, "grad_norm": 32920.03515625, "learning_rate": 4.740317812560935e-05, "loss": 5.3584, "step": 913 }, { "epoch": 0.2323851898935325, "grad_norm": 20057.27734375, "learning_rate": 4.739332025483127e-05, "loss": 5.3523, "step": 914 }, { "epoch": 0.2326394406483394, "grad_norm": 19211.392578125, "learning_rate": 4.7383444737614056e-05, "loss": 5.3464, "step": 915 }, { "epoch": 0.23289369140314636, "grad_norm": 29184.7421875, "learning_rate": 4.7373551581739825e-05, "loss": 5.3691, "step": 916 }, { "epoch": 0.2331479421579533, "grad_norm": 19907.173828125, "learning_rate": 4.736364079500461e-05, "loss": 5.3562, "step": 917 }, { "epoch": 0.2334021929127602, "grad_norm": 18084.181640625, "learning_rate": 4.735371238521833e-05, "loss": 5.3514, "step": 918 }, { "epoch": 0.23365644366756713, "grad_norm": 23599.41796875, "learning_rate": 4.73437663602048e-05, "loss": 5.3381, "step": 919 }, { "epoch": 0.23391069442237405, "grad_norm": 20313.943359375, "learning_rate": 4.7333802727801706e-05, "loss": 5.3468, "step": 920 }, { "epoch": 0.234164945177181, "grad_norm": 18963.916015625, "learning_rate": 4.7323821495860616e-05, "loss": 5.3427, "step": 921 }, { "epoch": 0.23441919593198793, "grad_norm": 29278.365234375, "learning_rate": 4.731382267224697e-05, "loss": 5.3646, "step": 922 }, { "epoch": 0.23467344668679485, "grad_norm": 18982.904296875, "learning_rate": 4.730380626484005e-05, "loss": 5.3521, "step": 923 }, { "epoch": 0.23492769744160177, "grad_norm": 19954.646484375, "learning_rate": 4.7293772281533024e-05, "loss": 5.3511, "step": 924 }, { "epoch": 0.2351819481964087, "grad_norm": 20905.38671875, "learning_rate": 4.7283720730232895e-05, "loss": 5.3481, "step": 925 }, { "epoch": 0.23543619895121565, "grad_norm": 17828.63671875, "learning_rate": 4.727365161886051e-05, "loss": 5.3532, "step": 926 }, { "epoch": 0.23569044970602257, "grad_norm": 18717.716796875, "learning_rate": 4.7263564955350546e-05, "loss": 5.3525, "step": 927 }, { "epoch": 0.2359447004608295, "grad_norm": 18535.908203125, "learning_rate": 4.725346074765154e-05, "loss": 5.3403, "step": 928 }, { "epoch": 0.23619895121563642, "grad_norm": 17812.9609375, "learning_rate": 4.7243339003725816e-05, "loss": 5.3547, "step": 929 }, { "epoch": 0.23645320197044334, "grad_norm": 18247.92578125, "learning_rate": 4.723319973154954e-05, "loss": 5.3259, "step": 930 }, { "epoch": 0.2367074527252503, "grad_norm": 18077.51953125, "learning_rate": 4.7223042939112686e-05, "loss": 5.3402, "step": 931 }, { "epoch": 0.2369617034800572, "grad_norm": 18649.927734375, "learning_rate": 4.721286863441905e-05, "loss": 5.3482, "step": 932 }, { "epoch": 0.23721595423486413, "grad_norm": 17952.345703125, "learning_rate": 4.720267682548618e-05, "loss": 5.3352, "step": 933 }, { "epoch": 0.23747020498967106, "grad_norm": 18010.802734375, "learning_rate": 4.719246752034548e-05, "loss": 5.3444, "step": 934 }, { "epoch": 0.23772445574447798, "grad_norm": 17835.041015625, "learning_rate": 4.71822407270421e-05, "loss": 5.3614, "step": 935 }, { "epoch": 0.23797870649928493, "grad_norm": 17767.880859375, "learning_rate": 4.7171996453634984e-05, "loss": 5.3437, "step": 936 }, { "epoch": 0.23823295725409185, "grad_norm": 17838.28515625, "learning_rate": 4.716173470819684e-05, "loss": 5.3455, "step": 937 }, { "epoch": 0.23848720800889878, "grad_norm": 17781.654296875, "learning_rate": 4.715145549881417e-05, "loss": 5.3349, "step": 938 }, { "epoch": 0.2387414587637057, "grad_norm": 17865.822265625, "learning_rate": 4.714115883358722e-05, "loss": 5.3388, "step": 939 }, { "epoch": 0.23899570951851262, "grad_norm": 18114.009765625, "learning_rate": 4.713084472062998e-05, "loss": 5.3403, "step": 940 }, { "epoch": 0.23924996027331957, "grad_norm": 18377.095703125, "learning_rate": 4.7120513168070215e-05, "loss": 5.3401, "step": 941 }, { "epoch": 0.2395042110281265, "grad_norm": 18223.400390625, "learning_rate": 4.711016418404941e-05, "loss": 5.3463, "step": 942 }, { "epoch": 0.23975846178293342, "grad_norm": 17863.875, "learning_rate": 4.709979777672281e-05, "loss": 5.3475, "step": 943 }, { "epoch": 0.24001271253774034, "grad_norm": 18238.734375, "learning_rate": 4.708941395425936e-05, "loss": 5.331, "step": 944 }, { "epoch": 0.24026696329254726, "grad_norm": 19287.94921875, "learning_rate": 4.707901272484177e-05, "loss": 5.3254, "step": 945 }, { "epoch": 0.24052121404735421, "grad_norm": 18871.19921875, "learning_rate": 4.706859409666642e-05, "loss": 5.3527, "step": 946 }, { "epoch": 0.24077546480216114, "grad_norm": 17860.0859375, "learning_rate": 4.7058158077943424e-05, "loss": 5.348, "step": 947 }, { "epoch": 0.24102971555696806, "grad_norm": 20285.310546875, "learning_rate": 4.7047704676896606e-05, "loss": 5.3342, "step": 948 }, { "epoch": 0.24128396631177498, "grad_norm": 19997.982421875, "learning_rate": 4.703723390176349e-05, "loss": 5.3364, "step": 949 }, { "epoch": 0.2415382170665819, "grad_norm": 18071.662109375, "learning_rate": 4.702674576079527e-05, "loss": 5.3293, "step": 950 }, { "epoch": 0.24179246782138886, "grad_norm": 23144.951171875, "learning_rate": 4.7016240262256825e-05, "loss": 5.332, "step": 951 }, { "epoch": 0.24204671857619578, "grad_norm": 19710.435546875, "learning_rate": 4.700571741442674e-05, "loss": 5.3289, "step": 952 }, { "epoch": 0.2423009693310027, "grad_norm": 18603.517578125, "learning_rate": 4.699517722559726e-05, "loss": 5.3384, "step": 953 }, { "epoch": 0.24255522008580963, "grad_norm": 22146.8046875, "learning_rate": 4.698461970407429e-05, "loss": 5.3405, "step": 954 }, { "epoch": 0.24280947084061655, "grad_norm": 19674.232421875, "learning_rate": 4.697404485817737e-05, "loss": 5.3255, "step": 955 }, { "epoch": 0.2430637215954235, "grad_norm": 18092.431640625, "learning_rate": 4.696345269623974e-05, "loss": 5.3357, "step": 956 }, { "epoch": 0.24331797235023042, "grad_norm": 21211.541015625, "learning_rate": 4.695284322660825e-05, "loss": 5.3314, "step": 957 }, { "epoch": 0.24357222310503734, "grad_norm": 20317.283203125, "learning_rate": 4.694221645764341e-05, "loss": 5.3308, "step": 958 }, { "epoch": 0.24382647385984427, "grad_norm": 17944.79296875, "learning_rate": 4.6931572397719346e-05, "loss": 5.3376, "step": 959 }, { "epoch": 0.2440807246146512, "grad_norm": 19708.265625, "learning_rate": 4.6920911055223814e-05, "loss": 5.3377, "step": 960 }, { "epoch": 0.24433497536945814, "grad_norm": 18920.71875, "learning_rate": 4.69102324385582e-05, "loss": 5.3346, "step": 961 }, { "epoch": 0.24458922612426506, "grad_norm": 18248.060546875, "learning_rate": 4.689953655613748e-05, "loss": 5.3216, "step": 962 }, { "epoch": 0.244843476879072, "grad_norm": 20516.634765625, "learning_rate": 4.6888823416390264e-05, "loss": 5.3206, "step": 963 }, { "epoch": 0.2450977276338789, "grad_norm": 18636.779296875, "learning_rate": 4.687809302775874e-05, "loss": 5.3257, "step": 964 }, { "epoch": 0.24535197838868583, "grad_norm": 18972.439453125, "learning_rate": 4.6867345398698694e-05, "loss": 5.3068, "step": 965 }, { "epoch": 0.24560622914349278, "grad_norm": 20794.03515625, "learning_rate": 4.68565805376795e-05, "loss": 5.316, "step": 966 }, { "epoch": 0.2458604798982997, "grad_norm": 18309.11328125, "learning_rate": 4.684579845318411e-05, "loss": 5.3131, "step": 967 }, { "epoch": 0.24611473065310663, "grad_norm": 18749.728515625, "learning_rate": 4.6834999153709055e-05, "loss": 5.3225, "step": 968 }, { "epoch": 0.24636898140791355, "grad_norm": 20605.408203125, "learning_rate": 4.682418264776442e-05, "loss": 5.3263, "step": 969 }, { "epoch": 0.24662323216272047, "grad_norm": 18425.98046875, "learning_rate": 4.6813348943873844e-05, "loss": 5.3088, "step": 970 }, { "epoch": 0.24687748291752742, "grad_norm": 18449.224609375, "learning_rate": 4.680249805057455e-05, "loss": 5.3222, "step": 971 }, { "epoch": 0.24713173367233435, "grad_norm": 19419.236328125, "learning_rate": 4.6791629976417264e-05, "loss": 5.3248, "step": 972 }, { "epoch": 0.24738598442714127, "grad_norm": 18711.662109375, "learning_rate": 4.678074472996628e-05, "loss": 5.3164, "step": 973 }, { "epoch": 0.2476402351819482, "grad_norm": 18136.14453125, "learning_rate": 4.676984231979944e-05, "loss": 5.3279, "step": 974 }, { "epoch": 0.24789448593675512, "grad_norm": 18540.59765625, "learning_rate": 4.675892275450805e-05, "loss": 5.3159, "step": 975 }, { "epoch": 0.24814873669156207, "grad_norm": 18179.595703125, "learning_rate": 4.6747986042697e-05, "loss": 5.3181, "step": 976 }, { "epoch": 0.248402987446369, "grad_norm": 17983.767578125, "learning_rate": 4.673703219298465e-05, "loss": 5.2956, "step": 977 }, { "epoch": 0.2486572382011759, "grad_norm": 18342.99609375, "learning_rate": 4.6726061214002894e-05, "loss": 5.3249, "step": 978 }, { "epoch": 0.24891148895598283, "grad_norm": 18580.19140625, "learning_rate": 4.671507311439709e-05, "loss": 5.3274, "step": 979 }, { "epoch": 0.24916573971078976, "grad_norm": 18269.54296875, "learning_rate": 4.670406790282612e-05, "loss": 5.3286, "step": 980 }, { "epoch": 0.2494199904655967, "grad_norm": 18964.98828125, "learning_rate": 4.669304558796233e-05, "loss": 5.3143, "step": 981 }, { "epoch": 0.24967424122040363, "grad_norm": 18295.220703125, "learning_rate": 4.668200617849157e-05, "loss": 5.3232, "step": 982 }, { "epoch": 0.24992849197521055, "grad_norm": 18181.0546875, "learning_rate": 4.667094968311311e-05, "loss": 5.3061, "step": 983 }, { "epoch": 0.2501827427300175, "grad_norm": 18935.896484375, "learning_rate": 4.665987611053975e-05, "loss": 5.3209, "step": 984 }, { "epoch": 0.2504369934848244, "grad_norm": 18704.439453125, "learning_rate": 4.6648785469497696e-05, "loss": 5.3037, "step": 985 }, { "epoch": 0.25069124423963135, "grad_norm": 18550.984375, "learning_rate": 4.663767776872663e-05, "loss": 5.329, "step": 986 }, { "epoch": 0.25094549499443825, "grad_norm": 20367.5078125, "learning_rate": 4.662655301697966e-05, "loss": 5.3121, "step": 987 }, { "epoch": 0.2511997457492452, "grad_norm": 20519.23046875, "learning_rate": 4.6615411223023346e-05, "loss": 5.3071, "step": 988 }, { "epoch": 0.25145399650405215, "grad_norm": 18400.6875, "learning_rate": 4.660425239563767e-05, "loss": 5.3133, "step": 989 }, { "epoch": 0.25170824725885904, "grad_norm": 24820.783203125, "learning_rate": 4.659307654361605e-05, "loss": 5.2959, "step": 990 }, { "epoch": 0.251962498013666, "grad_norm": 20000.869140625, "learning_rate": 4.658188367576529e-05, "loss": 5.3138, "step": 991 }, { "epoch": 0.2522167487684729, "grad_norm": 20044.357421875, "learning_rate": 4.657067380090563e-05, "loss": 5.3042, "step": 992 }, { "epoch": 0.25247099952327984, "grad_norm": 24468.15625, "learning_rate": 4.65594469278707e-05, "loss": 5.2929, "step": 993 }, { "epoch": 0.2527252502780868, "grad_norm": 18265.525390625, "learning_rate": 4.6548203065507533e-05, "loss": 5.2895, "step": 994 }, { "epoch": 0.2529795010328937, "grad_norm": 20103.119140625, "learning_rate": 4.653694222267655e-05, "loss": 5.319, "step": 995 }, { "epoch": 0.25323375178770063, "grad_norm": 18747.2890625, "learning_rate": 4.6525664408251526e-05, "loss": 5.2967, "step": 996 }, { "epoch": 0.25348800254250753, "grad_norm": 18531.0703125, "learning_rate": 4.651436963111966e-05, "loss": 5.2981, "step": 997 }, { "epoch": 0.2537422532973145, "grad_norm": 18618.912109375, "learning_rate": 4.650305790018147e-05, "loss": 5.3137, "step": 998 }, { "epoch": 0.25399650405212143, "grad_norm": 18419.814453125, "learning_rate": 4.649172922435086e-05, "loss": 5.2909, "step": 999 }, { "epoch": 0.2542507548069283, "grad_norm": 20947.517578125, "learning_rate": 4.648038361255508e-05, "loss": 5.297, "step": 1000 }, { "epoch": 0.2542507548069283, "eval_loss": 10.676921844482422, "eval_runtime": 698.7233, "eval_samples_per_second": 151.685, "eval_steps_per_second": 9.482, "step": 1000 }, { "epoch": 0.2545050055617353, "grad_norm": 18704.73828125, "learning_rate": 4.646902107373473e-05, "loss": 5.2888, "step": 1001 }, { "epoch": 0.25475925631654217, "grad_norm": 19256.822265625, "learning_rate": 4.645764161684375e-05, "loss": 5.2966, "step": 1002 }, { "epoch": 0.2550135070713491, "grad_norm": 20010.93359375, "learning_rate": 4.6446245250849396e-05, "loss": 5.3064, "step": 1003 }, { "epoch": 0.25526775782615607, "grad_norm": 18103.59375, "learning_rate": 4.6434831984732264e-05, "loss": 5.2922, "step": 1004 }, { "epoch": 0.25552200858096297, "grad_norm": 19176.099609375, "learning_rate": 4.642340182748627e-05, "loss": 5.2599, "step": 1005 }, { "epoch": 0.2557762593357699, "grad_norm": 18458.234375, "learning_rate": 4.6411954788118624e-05, "loss": 5.2888, "step": 1006 }, { "epoch": 0.2560305100905768, "grad_norm": 20568.380859375, "learning_rate": 4.640049087564986e-05, "loss": 5.2903, "step": 1007 }, { "epoch": 0.25628476084538376, "grad_norm": 18988.369140625, "learning_rate": 4.638901009911379e-05, "loss": 5.2875, "step": 1008 }, { "epoch": 0.2565390116001907, "grad_norm": 18686.9375, "learning_rate": 4.637751246755753e-05, "loss": 5.3128, "step": 1009 }, { "epoch": 0.2567932623549976, "grad_norm": 18832.705078125, "learning_rate": 4.636599799004148e-05, "loss": 5.2974, "step": 1010 }, { "epoch": 0.25704751310980456, "grad_norm": 18556.197265625, "learning_rate": 4.6354466675639285e-05, "loss": 5.2934, "step": 1011 }, { "epoch": 0.25730176386461145, "grad_norm": 19354.244140625, "learning_rate": 4.634291853343789e-05, "loss": 5.2936, "step": 1012 }, { "epoch": 0.2575560146194184, "grad_norm": 18408.103515625, "learning_rate": 4.633135357253751e-05, "loss": 5.2797, "step": 1013 }, { "epoch": 0.25781026537422536, "grad_norm": 19626.26953125, "learning_rate": 4.631977180205156e-05, "loss": 5.2847, "step": 1014 }, { "epoch": 0.25806451612903225, "grad_norm": 18935.166015625, "learning_rate": 4.630817323110676e-05, "loss": 5.2829, "step": 1015 }, { "epoch": 0.2583187668838392, "grad_norm": 18956.142578125, "learning_rate": 4.629655786884302e-05, "loss": 5.2931, "step": 1016 }, { "epoch": 0.2585730176386461, "grad_norm": 22038.21875, "learning_rate": 4.6284925724413534e-05, "loss": 5.2939, "step": 1017 }, { "epoch": 0.25882726839345305, "grad_norm": 18735.51171875, "learning_rate": 4.627327680698468e-05, "loss": 5.2861, "step": 1018 }, { "epoch": 0.25908151914826, "grad_norm": 18768.82421875, "learning_rate": 4.626161112573606e-05, "loss": 5.2874, "step": 1019 }, { "epoch": 0.2593357699030669, "grad_norm": 19256.115234375, "learning_rate": 4.6249928689860504e-05, "loss": 5.2698, "step": 1020 }, { "epoch": 0.25959002065787384, "grad_norm": 18492.08984375, "learning_rate": 4.6238229508564036e-05, "loss": 5.2819, "step": 1021 }, { "epoch": 0.25984427141268074, "grad_norm": 20447.416015625, "learning_rate": 4.6226513591065856e-05, "loss": 5.2845, "step": 1022 }, { "epoch": 0.2600985221674877, "grad_norm": 18602.169921875, "learning_rate": 4.6214780946598386e-05, "loss": 5.2835, "step": 1023 }, { "epoch": 0.26035277292229464, "grad_norm": 20133.9140625, "learning_rate": 4.620303158440721e-05, "loss": 5.2857, "step": 1024 }, { "epoch": 0.26060702367710153, "grad_norm": 18633.75390625, "learning_rate": 4.619126551375109e-05, "loss": 5.2786, "step": 1025 }, { "epoch": 0.2608612744319085, "grad_norm": 18813.966796875, "learning_rate": 4.617948274390194e-05, "loss": 5.2519, "step": 1026 }, { "epoch": 0.2611155251867154, "grad_norm": 18847.447265625, "learning_rate": 4.616768328414487e-05, "loss": 5.2768, "step": 1027 }, { "epoch": 0.26136977594152233, "grad_norm": 18288.798828125, "learning_rate": 4.6155867143778096e-05, "loss": 5.2885, "step": 1028 }, { "epoch": 0.2616240266963293, "grad_norm": 18775.12890625, "learning_rate": 4.614403433211303e-05, "loss": 5.2616, "step": 1029 }, { "epoch": 0.2618782774511362, "grad_norm": 18532.947265625, "learning_rate": 4.613218485847416e-05, "loss": 5.2927, "step": 1030 }, { "epoch": 0.2621325282059431, "grad_norm": 19250.541015625, "learning_rate": 4.612031873219916e-05, "loss": 5.2784, "step": 1031 }, { "epoch": 0.26238677896075, "grad_norm": 19046.58984375, "learning_rate": 4.6108435962638805e-05, "loss": 5.2806, "step": 1032 }, { "epoch": 0.262641029715557, "grad_norm": 18557.748046875, "learning_rate": 4.6096536559156976e-05, "loss": 5.2698, "step": 1033 }, { "epoch": 0.2628952804703639, "grad_norm": 18484.72265625, "learning_rate": 4.6084620531130665e-05, "loss": 5.2833, "step": 1034 }, { "epoch": 0.2631495312251708, "grad_norm": 18384.3515625, "learning_rate": 4.6072687887949986e-05, "loss": 5.2771, "step": 1035 }, { "epoch": 0.26340378197997777, "grad_norm": 18433.171875, "learning_rate": 4.606073863901811e-05, "loss": 5.2523, "step": 1036 }, { "epoch": 0.26365803273478466, "grad_norm": 18639.447265625, "learning_rate": 4.6048772793751324e-05, "loss": 5.2709, "step": 1037 }, { "epoch": 0.2639122834895916, "grad_norm": 18573.83984375, "learning_rate": 4.603679036157899e-05, "loss": 5.2708, "step": 1038 }, { "epoch": 0.26416653424439857, "grad_norm": 18574.572265625, "learning_rate": 4.602479135194352e-05, "loss": 5.2711, "step": 1039 }, { "epoch": 0.26442078499920546, "grad_norm": 18720.8046875, "learning_rate": 4.601277577430041e-05, "loss": 5.2689, "step": 1040 }, { "epoch": 0.2646750357540124, "grad_norm": 19043.833984375, "learning_rate": 4.6000743638118206e-05, "loss": 5.2603, "step": 1041 }, { "epoch": 0.2649292865088193, "grad_norm": 18659.544921875, "learning_rate": 4.598869495287849e-05, "loss": 5.2842, "step": 1042 }, { "epoch": 0.26518353726362626, "grad_norm": 19066.189453125, "learning_rate": 4.5976629728075913e-05, "loss": 5.2879, "step": 1043 }, { "epoch": 0.2654377880184332, "grad_norm": 18618.068359375, "learning_rate": 4.596454797321813e-05, "loss": 5.2625, "step": 1044 }, { "epoch": 0.2656920387732401, "grad_norm": 18596.009765625, "learning_rate": 4.595244969782585e-05, "loss": 5.2554, "step": 1045 }, { "epoch": 0.26594628952804705, "grad_norm": 19555.931640625, "learning_rate": 4.594033491143277e-05, "loss": 5.2776, "step": 1046 }, { "epoch": 0.26620054028285395, "grad_norm": 18995.42578125, "learning_rate": 4.592820362358562e-05, "loss": 5.2615, "step": 1047 }, { "epoch": 0.2664547910376609, "grad_norm": 18991.349609375, "learning_rate": 4.591605584384413e-05, "loss": 5.2549, "step": 1048 }, { "epoch": 0.2667090417924678, "grad_norm": 18617.767578125, "learning_rate": 4.590389158178102e-05, "loss": 5.2746, "step": 1049 }, { "epoch": 0.26696329254727474, "grad_norm": 18747.291015625, "learning_rate": 4.5891710846982e-05, "loss": 5.263, "step": 1050 }, { "epoch": 0.2672175433020817, "grad_norm": 18721.935546875, "learning_rate": 4.587951364904576e-05, "loss": 5.262, "step": 1051 }, { "epoch": 0.2674717940568886, "grad_norm": 18589.1875, "learning_rate": 4.586729999758398e-05, "loss": 5.2491, "step": 1052 }, { "epoch": 0.26772604481169554, "grad_norm": 18682.291015625, "learning_rate": 4.585506990222127e-05, "loss": 5.2676, "step": 1053 }, { "epoch": 0.26798029556650244, "grad_norm": 18647.712890625, "learning_rate": 4.584282337259524e-05, "loss": 5.27, "step": 1054 }, { "epoch": 0.2682345463213094, "grad_norm": 18895.015625, "learning_rate": 4.583056041835643e-05, "loss": 5.2683, "step": 1055 }, { "epoch": 0.26848879707611634, "grad_norm": 18877.19921875, "learning_rate": 4.58182810491683e-05, "loss": 5.2757, "step": 1056 }, { "epoch": 0.26874304783092323, "grad_norm": 18682.20703125, "learning_rate": 4.580598527470729e-05, "loss": 5.2627, "step": 1057 }, { "epoch": 0.2689972985857302, "grad_norm": 18800.166015625, "learning_rate": 4.5793673104662746e-05, "loss": 5.2508, "step": 1058 }, { "epoch": 0.2692515493405371, "grad_norm": 18938.908203125, "learning_rate": 4.578134454873692e-05, "loss": 5.2507, "step": 1059 }, { "epoch": 0.26950580009534403, "grad_norm": 23650.009765625, "learning_rate": 4.5768999616645006e-05, "loss": 5.2504, "step": 1060 }, { "epoch": 0.269760050850151, "grad_norm": 19527.74609375, "learning_rate": 4.5756638318115074e-05, "loss": 5.2633, "step": 1061 }, { "epoch": 0.2700143016049579, "grad_norm": 19006.771484375, "learning_rate": 4.574426066288812e-05, "loss": 5.2599, "step": 1062 }, { "epoch": 0.2702685523597648, "grad_norm": 19969.28125, "learning_rate": 4.5731866660717997e-05, "loss": 5.2504, "step": 1063 }, { "epoch": 0.2705228031145717, "grad_norm": 20093.740234375, "learning_rate": 4.571945632137147e-05, "loss": 5.2544, "step": 1064 }, { "epoch": 0.27077705386937867, "grad_norm": 18873.5078125, "learning_rate": 4.570702965462817e-05, "loss": 5.2385, "step": 1065 }, { "epoch": 0.2710313046241856, "grad_norm": 25937.205078125, "learning_rate": 4.5694586670280566e-05, "loss": 5.264, "step": 1066 }, { "epoch": 0.2712855553789925, "grad_norm": 20998.25390625, "learning_rate": 4.568212737813403e-05, "loss": 5.2587, "step": 1067 }, { "epoch": 0.27153980613379947, "grad_norm": 20501.58203125, "learning_rate": 4.566965178800676e-05, "loss": 5.2149, "step": 1068 }, { "epoch": 0.27179405688860636, "grad_norm": 28938.4921875, "learning_rate": 4.56571599097298e-05, "loss": 5.2487, "step": 1069 }, { "epoch": 0.2720483076434133, "grad_norm": 19291.228515625, "learning_rate": 4.5644651753147015e-05, "loss": 5.2612, "step": 1070 }, { "epoch": 0.27230255839822026, "grad_norm": 21341.572265625, "learning_rate": 4.5632127328115146e-05, "loss": 5.2482, "step": 1071 }, { "epoch": 0.27255680915302716, "grad_norm": 20495.69140625, "learning_rate": 4.561958664450369e-05, "loss": 5.2533, "step": 1072 }, { "epoch": 0.2728110599078341, "grad_norm": 19533.490234375, "learning_rate": 4.5607029712195004e-05, "loss": 5.2511, "step": 1073 }, { "epoch": 0.273065310662641, "grad_norm": 19501.73046875, "learning_rate": 4.559445654108424e-05, "loss": 5.2442, "step": 1074 }, { "epoch": 0.27331956141744795, "grad_norm": 18751.208984375, "learning_rate": 4.5581867141079315e-05, "loss": 5.2471, "step": 1075 }, { "epoch": 0.2735738121722549, "grad_norm": 23359.640625, "learning_rate": 4.556926152210097e-05, "loss": 5.2689, "step": 1076 }, { "epoch": 0.2738280629270618, "grad_norm": 20858.423828125, "learning_rate": 4.555663969408273e-05, "loss": 5.2448, "step": 1077 }, { "epoch": 0.27408231368186875, "grad_norm": 20229.2734375, "learning_rate": 4.5544001666970845e-05, "loss": 5.2411, "step": 1078 }, { "epoch": 0.27433656443667565, "grad_norm": 22775.314453125, "learning_rate": 4.5531347450724396e-05, "loss": 5.232, "step": 1079 }, { "epoch": 0.2745908151914826, "grad_norm": 18757.3671875, "learning_rate": 4.551867705531519e-05, "loss": 5.246, "step": 1080 }, { "epoch": 0.27484506594628955, "grad_norm": 20306.38671875, "learning_rate": 4.550599049072776e-05, "loss": 5.2362, "step": 1081 }, { "epoch": 0.27509931670109644, "grad_norm": 18759.568359375, "learning_rate": 4.549328776695941e-05, "loss": 5.227, "step": 1082 }, { "epoch": 0.2753535674559034, "grad_norm": 22638.13671875, "learning_rate": 4.548056889402019e-05, "loss": 5.2403, "step": 1083 }, { "epoch": 0.2756078182107103, "grad_norm": 19456.306640625, "learning_rate": 4.5467833881932835e-05, "loss": 5.2447, "step": 1084 }, { "epoch": 0.27586206896551724, "grad_norm": 20195.033203125, "learning_rate": 4.5455082740732835e-05, "loss": 5.2475, "step": 1085 }, { "epoch": 0.2761163197203242, "grad_norm": 19552.236328125, "learning_rate": 4.5442315480468365e-05, "loss": 5.241, "step": 1086 }, { "epoch": 0.2763705704751311, "grad_norm": 19486.474609375, "learning_rate": 4.542953211120033e-05, "loss": 5.2518, "step": 1087 }, { "epoch": 0.27662482122993803, "grad_norm": 19451.001953125, "learning_rate": 4.541673264300229e-05, "loss": 5.2324, "step": 1088 }, { "epoch": 0.27687907198474493, "grad_norm": 19479.142578125, "learning_rate": 4.540391708596053e-05, "loss": 5.2207, "step": 1089 }, { "epoch": 0.2771333227395519, "grad_norm": 21534.453125, "learning_rate": 4.539108545017399e-05, "loss": 5.2264, "step": 1090 }, { "epoch": 0.27738757349435883, "grad_norm": 18740.244140625, "learning_rate": 4.537823774575428e-05, "loss": 5.2392, "step": 1091 }, { "epoch": 0.2776418242491657, "grad_norm": 19559.61328125, "learning_rate": 4.5365373982825695e-05, "loss": 5.2292, "step": 1092 }, { "epoch": 0.2778960750039727, "grad_norm": 19390.04296875, "learning_rate": 4.5352494171525155e-05, "loss": 5.2284, "step": 1093 }, { "epoch": 0.27815032575877957, "grad_norm": 22003.71875, "learning_rate": 4.5339598322002255e-05, "loss": 5.2337, "step": 1094 }, { "epoch": 0.2784045765135865, "grad_norm": 18859.396484375, "learning_rate": 4.532668644441919e-05, "loss": 5.2227, "step": 1095 }, { "epoch": 0.27865882726839347, "grad_norm": 20484.791015625, "learning_rate": 4.5313758548950837e-05, "loss": 5.2304, "step": 1096 }, { "epoch": 0.27891307802320037, "grad_norm": 18884.150390625, "learning_rate": 4.530081464578465e-05, "loss": 5.2221, "step": 1097 }, { "epoch": 0.2791673287780073, "grad_norm": 22696.0078125, "learning_rate": 4.5287854745120726e-05, "loss": 5.2395, "step": 1098 }, { "epoch": 0.2794215795328142, "grad_norm": 19195.8359375, "learning_rate": 4.527487885717175e-05, "loss": 5.2301, "step": 1099 }, { "epoch": 0.27967583028762116, "grad_norm": 20194.6171875, "learning_rate": 4.526188699216301e-05, "loss": 5.2476, "step": 1100 }, { "epoch": 0.27967583028762116, "eval_loss": 10.538366317749023, "eval_runtime": 698.9295, "eval_samples_per_second": 151.64, "eval_steps_per_second": 9.479, "step": 1100 }, { "epoch": 0.2799300810424281, "grad_norm": 18768.498046875, "learning_rate": 4.524887916033241e-05, "loss": 5.2272, "step": 1101 }, { "epoch": 0.280184331797235, "grad_norm": 20945.138671875, "learning_rate": 4.523585537193039e-05, "loss": 5.2288, "step": 1102 }, { "epoch": 0.28043858255204196, "grad_norm": 19218.41796875, "learning_rate": 4.5222815637219984e-05, "loss": 5.2257, "step": 1103 }, { "epoch": 0.28069283330684885, "grad_norm": 19959.6171875, "learning_rate": 4.5209759966476814e-05, "loss": 5.2119, "step": 1104 }, { "epoch": 0.2809470840616558, "grad_norm": 18951.61328125, "learning_rate": 4.519668836998904e-05, "loss": 5.2283, "step": 1105 }, { "epoch": 0.28120133481646276, "grad_norm": 20073.236328125, "learning_rate": 4.518360085805735e-05, "loss": 5.2236, "step": 1106 }, { "epoch": 0.28145558557126965, "grad_norm": 19036.421875, "learning_rate": 4.517049744099503e-05, "loss": 5.2235, "step": 1107 }, { "epoch": 0.2817098363260766, "grad_norm": 19566.7265625, "learning_rate": 4.515737812912785e-05, "loss": 5.2267, "step": 1108 }, { "epoch": 0.2819640870808835, "grad_norm": 19226.5078125, "learning_rate": 4.5144242932794114e-05, "loss": 5.227, "step": 1109 }, { "epoch": 0.28221833783569045, "grad_norm": 19637.94140625, "learning_rate": 4.513109186234467e-05, "loss": 5.2195, "step": 1110 }, { "epoch": 0.2824725885904974, "grad_norm": 19243.708984375, "learning_rate": 4.511792492814284e-05, "loss": 5.2085, "step": 1111 }, { "epoch": 0.2827268393453043, "grad_norm": 18864.64453125, "learning_rate": 4.5104742140564484e-05, "loss": 5.2228, "step": 1112 }, { "epoch": 0.28298109010011124, "grad_norm": 19017.24609375, "learning_rate": 4.509154350999791e-05, "loss": 5.2061, "step": 1113 }, { "epoch": 0.28323534085491814, "grad_norm": 19696.044921875, "learning_rate": 4.507832904684395e-05, "loss": 5.2219, "step": 1114 }, { "epoch": 0.2834895916097251, "grad_norm": 18992.564453125, "learning_rate": 4.50650987615159e-05, "loss": 5.2164, "step": 1115 }, { "epoch": 0.28374384236453204, "grad_norm": 19503.03125, "learning_rate": 4.505185266443952e-05, "loss": 5.2318, "step": 1116 }, { "epoch": 0.28399809311933893, "grad_norm": 19167.876953125, "learning_rate": 4.5038590766053025e-05, "loss": 5.2245, "step": 1117 }, { "epoch": 0.2842523438741459, "grad_norm": 19578.37890625, "learning_rate": 4.5025313076807084e-05, "loss": 5.2269, "step": 1118 }, { "epoch": 0.2845065946289528, "grad_norm": 19599.34375, "learning_rate": 4.501201960716483e-05, "loss": 5.2073, "step": 1119 }, { "epoch": 0.28476084538375973, "grad_norm": 19364.05859375, "learning_rate": 4.499871036760182e-05, "loss": 5.2067, "step": 1120 }, { "epoch": 0.2850150961385667, "grad_norm": 19509.134765625, "learning_rate": 4.498538536860601e-05, "loss": 5.215, "step": 1121 }, { "epoch": 0.2852693468933736, "grad_norm": 18876.78515625, "learning_rate": 4.497204462067781e-05, "loss": 5.2048, "step": 1122 }, { "epoch": 0.2855235976481805, "grad_norm": 19477.646484375, "learning_rate": 4.4958688134330034e-05, "loss": 5.2243, "step": 1123 }, { "epoch": 0.2857778484029874, "grad_norm": 19461.203125, "learning_rate": 4.494531592008789e-05, "loss": 5.2169, "step": 1124 }, { "epoch": 0.2860320991577944, "grad_norm": 19409.619140625, "learning_rate": 4.493192798848898e-05, "loss": 5.213, "step": 1125 }, { "epoch": 0.2862863499126013, "grad_norm": 19104.021484375, "learning_rate": 4.491852435008329e-05, "loss": 5.2026, "step": 1126 }, { "epoch": 0.2865406006674082, "grad_norm": 19291.71484375, "learning_rate": 4.49051050154332e-05, "loss": 5.2102, "step": 1127 }, { "epoch": 0.28679485142221517, "grad_norm": 19089.091796875, "learning_rate": 4.489166999511344e-05, "loss": 5.199, "step": 1128 }, { "epoch": 0.28704910217702206, "grad_norm": 19295.65234375, "learning_rate": 4.487821929971111e-05, "loss": 5.1955, "step": 1129 }, { "epoch": 0.287303352931829, "grad_norm": 19036.705078125, "learning_rate": 4.486475293982566e-05, "loss": 5.1987, "step": 1130 }, { "epoch": 0.28755760368663597, "grad_norm": 19167.36328125, "learning_rate": 4.485127092606889e-05, "loss": 5.2207, "step": 1131 }, { "epoch": 0.28781185444144286, "grad_norm": 19268.767578125, "learning_rate": 4.483777326906491e-05, "loss": 5.2053, "step": 1132 }, { "epoch": 0.2880661051962498, "grad_norm": 19050.767578125, "learning_rate": 4.482425997945019e-05, "loss": 5.2184, "step": 1133 }, { "epoch": 0.2883203559510567, "grad_norm": 19141.060546875, "learning_rate": 4.4810731067873515e-05, "loss": 5.1966, "step": 1134 }, { "epoch": 0.28857460670586366, "grad_norm": 18897.35546875, "learning_rate": 4.4797186544995954e-05, "loss": 5.2057, "step": 1135 }, { "epoch": 0.2888288574606706, "grad_norm": 19116.806640625, "learning_rate": 4.47836264214909e-05, "loss": 5.2033, "step": 1136 }, { "epoch": 0.2890831082154775, "grad_norm": 19259.021484375, "learning_rate": 4.477005070804404e-05, "loss": 5.2097, "step": 1137 }, { "epoch": 0.28933735897028445, "grad_norm": 19018.482421875, "learning_rate": 4.475645941535334e-05, "loss": 5.2091, "step": 1138 }, { "epoch": 0.28959160972509135, "grad_norm": 19302.724609375, "learning_rate": 4.474285255412904e-05, "loss": 5.2009, "step": 1139 }, { "epoch": 0.2898458604798983, "grad_norm": 19230.236328125, "learning_rate": 4.4729230135093645e-05, "loss": 5.1969, "step": 1140 }, { "epoch": 0.29010011123470525, "grad_norm": 19891.123046875, "learning_rate": 4.471559216898195e-05, "loss": 5.1995, "step": 1141 }, { "epoch": 0.29035436198951214, "grad_norm": 19324.708984375, "learning_rate": 4.470193866654096e-05, "loss": 5.2058, "step": 1142 }, { "epoch": 0.2906086127443191, "grad_norm": 19443.1171875, "learning_rate": 4.4688269638529945e-05, "loss": 5.2078, "step": 1143 }, { "epoch": 0.290862863499126, "grad_norm": 19438.568359375, "learning_rate": 4.46745850957204e-05, "loss": 5.1818, "step": 1144 }, { "epoch": 0.29111711425393294, "grad_norm": 19019.572265625, "learning_rate": 4.466088504889607e-05, "loss": 5.2027, "step": 1145 }, { "epoch": 0.2913713650087399, "grad_norm": 20348.5703125, "learning_rate": 4.4647169508852885e-05, "loss": 5.2083, "step": 1146 }, { "epoch": 0.2916256157635468, "grad_norm": 19141.072265625, "learning_rate": 4.4633438486398996e-05, "loss": 5.2125, "step": 1147 }, { "epoch": 0.29187986651835374, "grad_norm": 19181.828125, "learning_rate": 4.461969199235477e-05, "loss": 5.21, "step": 1148 }, { "epoch": 0.29213411727316063, "grad_norm": 19225.615234375, "learning_rate": 4.4605930037552746e-05, "loss": 5.2035, "step": 1149 }, { "epoch": 0.2923883680279676, "grad_norm": 19277.08203125, "learning_rate": 4.4592152632837646e-05, "loss": 5.2011, "step": 1150 }, { "epoch": 0.29264261878277453, "grad_norm": 19206.990234375, "learning_rate": 4.4578359789066384e-05, "loss": 5.1859, "step": 1151 }, { "epoch": 0.29289686953758143, "grad_norm": 19416.390625, "learning_rate": 4.4564551517108034e-05, "loss": 5.1956, "step": 1152 }, { "epoch": 0.2931511202923884, "grad_norm": 19266.19140625, "learning_rate": 4.455072782784381e-05, "loss": 5.1942, "step": 1153 }, { "epoch": 0.2934053710471953, "grad_norm": 19319.84765625, "learning_rate": 4.4536888732167105e-05, "loss": 5.1801, "step": 1154 }, { "epoch": 0.2936596218020022, "grad_norm": 19243.646484375, "learning_rate": 4.452303424098342e-05, "loss": 5.2009, "step": 1155 }, { "epoch": 0.2939138725568092, "grad_norm": 19192.30078125, "learning_rate": 4.4509164365210424e-05, "loss": 5.195, "step": 1156 }, { "epoch": 0.29416812331161607, "grad_norm": 19331.48828125, "learning_rate": 4.4495279115777874e-05, "loss": 5.1922, "step": 1157 }, { "epoch": 0.294422374066423, "grad_norm": 19333.892578125, "learning_rate": 4.448137850362768e-05, "loss": 5.1939, "step": 1158 }, { "epoch": 0.2946766248212299, "grad_norm": 20555.83984375, "learning_rate": 4.446746253971381e-05, "loss": 5.1916, "step": 1159 }, { "epoch": 0.29493087557603687, "grad_norm": 19440.171875, "learning_rate": 4.4453531235002375e-05, "loss": 5.1783, "step": 1160 }, { "epoch": 0.2951851263308438, "grad_norm": 19383.220703125, "learning_rate": 4.4439584600471546e-05, "loss": 5.1831, "step": 1161 }, { "epoch": 0.2954393770856507, "grad_norm": 19301.896484375, "learning_rate": 4.4425622647111586e-05, "loss": 5.192, "step": 1162 }, { "epoch": 0.29569362784045766, "grad_norm": 19287.884765625, "learning_rate": 4.441164538592483e-05, "loss": 5.1934, "step": 1163 }, { "epoch": 0.29594787859526456, "grad_norm": 19611.091796875, "learning_rate": 4.439765282792567e-05, "loss": 5.1989, "step": 1164 }, { "epoch": 0.2962021293500715, "grad_norm": 19401.994140625, "learning_rate": 4.4383644984140565e-05, "loss": 5.1804, "step": 1165 }, { "epoch": 0.29645638010487846, "grad_norm": 20315.923828125, "learning_rate": 4.4369621865608e-05, "loss": 5.1868, "step": 1166 }, { "epoch": 0.29671063085968535, "grad_norm": 19917.0078125, "learning_rate": 4.4355583483378514e-05, "loss": 5.1973, "step": 1167 }, { "epoch": 0.2969648816144923, "grad_norm": 19760.30859375, "learning_rate": 4.434152984851466e-05, "loss": 5.1819, "step": 1168 }, { "epoch": 0.2972191323692992, "grad_norm": 20807.515625, "learning_rate": 4.432746097209103e-05, "loss": 5.1769, "step": 1169 }, { "epoch": 0.29747338312410615, "grad_norm": 19250.61328125, "learning_rate": 4.4313376865194204e-05, "loss": 5.1855, "step": 1170 }, { "epoch": 0.2977276338789131, "grad_norm": 19682.90625, "learning_rate": 4.4299277538922776e-05, "loss": 5.167, "step": 1171 }, { "epoch": 0.29798188463372, "grad_norm": 19363.853515625, "learning_rate": 4.428516300438733e-05, "loss": 5.19, "step": 1172 }, { "epoch": 0.29823613538852695, "grad_norm": 19313.884765625, "learning_rate": 4.4271033272710444e-05, "loss": 5.1887, "step": 1173 }, { "epoch": 0.29849038614333384, "grad_norm": 19418.591796875, "learning_rate": 4.425688835502666e-05, "loss": 5.1811, "step": 1174 }, { "epoch": 0.2987446368981408, "grad_norm": 19325.841796875, "learning_rate": 4.424272826248248e-05, "loss": 5.1706, "step": 1175 }, { "epoch": 0.29899888765294774, "grad_norm": 19616.07421875, "learning_rate": 4.4228553006236395e-05, "loss": 5.18, "step": 1176 }, { "epoch": 0.29925313840775464, "grad_norm": 19263.18359375, "learning_rate": 4.4214362597458813e-05, "loss": 5.1958, "step": 1177 }, { "epoch": 0.2995073891625616, "grad_norm": 19283.693359375, "learning_rate": 4.420015704733209e-05, "loss": 5.185, "step": 1178 }, { "epoch": 0.2997616399173685, "grad_norm": 19800.904296875, "learning_rate": 4.418593636705054e-05, "loss": 5.175, "step": 1179 }, { "epoch": 0.30001589067217543, "grad_norm": 19762.6953125, "learning_rate": 4.417170056782035e-05, "loss": 5.1804, "step": 1180 }, { "epoch": 0.3002701414269824, "grad_norm": 24224.61328125, "learning_rate": 4.4157449660859665e-05, "loss": 5.1558, "step": 1181 }, { "epoch": 0.3005243921817893, "grad_norm": 19985.32421875, "learning_rate": 4.414318365739852e-05, "loss": 5.1691, "step": 1182 }, { "epoch": 0.30077864293659623, "grad_norm": 19891.27734375, "learning_rate": 4.412890256867884e-05, "loss": 5.1742, "step": 1183 }, { "epoch": 0.3010328936914031, "grad_norm": 19686.4296875, "learning_rate": 4.411460640595445e-05, "loss": 5.1642, "step": 1184 }, { "epoch": 0.3012871444462101, "grad_norm": 19818.65234375, "learning_rate": 4.410029518049105e-05, "loss": 5.1734, "step": 1185 }, { "epoch": 0.301541395201017, "grad_norm": 19511.919921875, "learning_rate": 4.4085968903566186e-05, "loss": 5.1815, "step": 1186 }, { "epoch": 0.3017956459558239, "grad_norm": 19616.92578125, "learning_rate": 4.407162758646931e-05, "loss": 5.1736, "step": 1187 }, { "epoch": 0.30204989671063087, "grad_norm": 19353.744140625, "learning_rate": 4.405727124050169e-05, "loss": 5.1773, "step": 1188 }, { "epoch": 0.30230414746543777, "grad_norm": 20175.39453125, "learning_rate": 4.4042899876976465e-05, "loss": 5.1626, "step": 1189 }, { "epoch": 0.3025583982202447, "grad_norm": 20587.6328125, "learning_rate": 4.402851350721856e-05, "loss": 5.1821, "step": 1190 }, { "epoch": 0.30281264897505167, "grad_norm": 19521.3203125, "learning_rate": 4.401411214256479e-05, "loss": 5.1803, "step": 1191 }, { "epoch": 0.30306689972985856, "grad_norm": 22237.19921875, "learning_rate": 4.399969579436374e-05, "loss": 5.1714, "step": 1192 }, { "epoch": 0.3033211504846655, "grad_norm": 20076.607421875, "learning_rate": 4.398526447397581e-05, "loss": 5.187, "step": 1193 }, { "epoch": 0.3035754012394724, "grad_norm": 20269.62890625, "learning_rate": 4.397081819277321e-05, "loss": 5.1684, "step": 1194 }, { "epoch": 0.30382965199427936, "grad_norm": 19471.693359375, "learning_rate": 4.395635696213993e-05, "loss": 5.1591, "step": 1195 }, { "epoch": 0.3040839027490863, "grad_norm": 21970.646484375, "learning_rate": 4.394188079347176e-05, "loss": 5.1679, "step": 1196 }, { "epoch": 0.3043381535038932, "grad_norm": 20995.275390625, "learning_rate": 4.3927389698176237e-05, "loss": 5.1672, "step": 1197 }, { "epoch": 0.30459240425870016, "grad_norm": 21099.685546875, "learning_rate": 4.3912883687672654e-05, "loss": 5.1701, "step": 1198 }, { "epoch": 0.30484665501350705, "grad_norm": 23570.384765625, "learning_rate": 4.3898362773392095e-05, "loss": 5.1486, "step": 1199 }, { "epoch": 0.305100905768314, "grad_norm": 24977.611328125, "learning_rate": 4.388382696677735e-05, "loss": 5.1557, "step": 1200 }, { "epoch": 0.305100905768314, "eval_loss": 10.403265953063965, "eval_runtime": 699.978, "eval_samples_per_second": 151.413, "eval_steps_per_second": 9.465, "step": 1200 }, { "epoch": 0.30535515652312095, "grad_norm": 20541.41015625, "learning_rate": 4.3869276279282976e-05, "loss": 5.1478, "step": 1201 }, { "epoch": 0.30560940727792785, "grad_norm": 19590.240234375, "learning_rate": 4.3854710722375237e-05, "loss": 5.1466, "step": 1202 }, { "epoch": 0.3058636580327348, "grad_norm": 21604.7265625, "learning_rate": 4.384013030753211e-05, "loss": 5.1583, "step": 1203 }, { "epoch": 0.3061179087875417, "grad_norm": 20013.283203125, "learning_rate": 4.382553504624331e-05, "loss": 5.1588, "step": 1204 }, { "epoch": 0.30637215954234864, "grad_norm": 20195.771484375, "learning_rate": 4.3810924950010195e-05, "loss": 5.1697, "step": 1205 }, { "epoch": 0.3066264102971556, "grad_norm": 19981.65625, "learning_rate": 4.3796300030345876e-05, "loss": 5.1526, "step": 1206 }, { "epoch": 0.3068806610519625, "grad_norm": 20020.1875, "learning_rate": 4.3781660298775116e-05, "loss": 5.1384, "step": 1207 }, { "epoch": 0.30713491180676944, "grad_norm": 20344.783203125, "learning_rate": 4.3767005766834346e-05, "loss": 5.1641, "step": 1208 }, { "epoch": 0.30738916256157633, "grad_norm": 19750.4453125, "learning_rate": 4.3752336446071677e-05, "loss": 5.1328, "step": 1209 }, { "epoch": 0.3076434133163833, "grad_norm": 22305.890625, "learning_rate": 4.373765234804684e-05, "loss": 5.1588, "step": 1210 }, { "epoch": 0.30789766407119024, "grad_norm": 19771.02734375, "learning_rate": 4.372295348433125e-05, "loss": 5.142, "step": 1211 }, { "epoch": 0.30815191482599713, "grad_norm": 20072.134765625, "learning_rate": 4.370823986650795e-05, "loss": 5.1655, "step": 1212 }, { "epoch": 0.3084061655808041, "grad_norm": 19482.8203125, "learning_rate": 4.369351150617158e-05, "loss": 5.1608, "step": 1213 }, { "epoch": 0.308660416335611, "grad_norm": 20071.876953125, "learning_rate": 4.367876841492844e-05, "loss": 5.1387, "step": 1214 }, { "epoch": 0.3089146670904179, "grad_norm": 19533.78125, "learning_rate": 4.3664010604396404e-05, "loss": 5.1422, "step": 1215 }, { "epoch": 0.3091689178452249, "grad_norm": 20021.111328125, "learning_rate": 4.3649238086204955e-05, "loss": 5.1504, "step": 1216 }, { "epoch": 0.3094231686000318, "grad_norm": 19802.34375, "learning_rate": 4.363445087199518e-05, "loss": 5.1623, "step": 1217 }, { "epoch": 0.3096774193548387, "grad_norm": 21246.015625, "learning_rate": 4.361964897341973e-05, "loss": 5.1468, "step": 1218 }, { "epoch": 0.3099316701096456, "grad_norm": 19995.50390625, "learning_rate": 4.360483240214284e-05, "loss": 5.164, "step": 1219 }, { "epoch": 0.31018592086445257, "grad_norm": 22513.32421875, "learning_rate": 4.359000116984029e-05, "loss": 5.1565, "step": 1220 }, { "epoch": 0.3104401716192595, "grad_norm": 19838.423828125, "learning_rate": 4.357515528819942e-05, "loss": 5.1462, "step": 1221 }, { "epoch": 0.3106944223740664, "grad_norm": 20792.4765625, "learning_rate": 4.356029476891914e-05, "loss": 5.13, "step": 1222 }, { "epoch": 0.31094867312887337, "grad_norm": 19451.251953125, "learning_rate": 4.354541962370985e-05, "loss": 5.1464, "step": 1223 }, { "epoch": 0.31120292388368026, "grad_norm": 20367.1484375, "learning_rate": 4.353052986429351e-05, "loss": 5.1608, "step": 1224 }, { "epoch": 0.3114571746384872, "grad_norm": 19801.705078125, "learning_rate": 4.351562550240359e-05, "loss": 5.1471, "step": 1225 }, { "epoch": 0.31171142539329416, "grad_norm": 21052.32421875, "learning_rate": 4.3500706549785056e-05, "loss": 5.1335, "step": 1226 }, { "epoch": 0.31196567614810106, "grad_norm": 19839.130859375, "learning_rate": 4.3485773018194365e-05, "loss": 5.1526, "step": 1227 }, { "epoch": 0.312219926902908, "grad_norm": 19741.162109375, "learning_rate": 4.347082491939949e-05, "loss": 5.1347, "step": 1228 }, { "epoch": 0.3124741776577149, "grad_norm": 19714.0703125, "learning_rate": 4.345586226517987e-05, "loss": 5.1496, "step": 1229 }, { "epoch": 0.31272842841252185, "grad_norm": 19635.5390625, "learning_rate": 4.3440885067326405e-05, "loss": 5.1477, "step": 1230 }, { "epoch": 0.3129826791673288, "grad_norm": 19685.90234375, "learning_rate": 4.342589333764146e-05, "loss": 5.1358, "step": 1231 }, { "epoch": 0.3132369299221357, "grad_norm": 20508.802734375, "learning_rate": 4.3410887087938865e-05, "loss": 5.1542, "step": 1232 }, { "epoch": 0.31349118067694265, "grad_norm": 19723.970703125, "learning_rate": 4.339586633004388e-05, "loss": 5.1351, "step": 1233 }, { "epoch": 0.31374543143174954, "grad_norm": 19800.984375, "learning_rate": 4.3380831075793194e-05, "loss": 5.1356, "step": 1234 }, { "epoch": 0.3139996821865565, "grad_norm": 19727.21484375, "learning_rate": 4.336578133703493e-05, "loss": 5.1421, "step": 1235 }, { "epoch": 0.31425393294136345, "grad_norm": 19545.599609375, "learning_rate": 4.335071712562862e-05, "loss": 5.1404, "step": 1236 }, { "epoch": 0.31450818369617034, "grad_norm": 19616.6796875, "learning_rate": 4.333563845344518e-05, "loss": 5.1315, "step": 1237 }, { "epoch": 0.3147624344509773, "grad_norm": 19754.3984375, "learning_rate": 4.3320545332366976e-05, "loss": 5.1533, "step": 1238 }, { "epoch": 0.3150166852057842, "grad_norm": 19678.0078125, "learning_rate": 4.330543777428771e-05, "loss": 5.1401, "step": 1239 }, { "epoch": 0.31527093596059114, "grad_norm": 19605.453125, "learning_rate": 4.329031579111248e-05, "loss": 5.1304, "step": 1240 }, { "epoch": 0.3155251867153981, "grad_norm": 19767.70703125, "learning_rate": 4.327517939475774e-05, "loss": 5.1347, "step": 1241 }, { "epoch": 0.315779437470205, "grad_norm": 20732.2265625, "learning_rate": 4.3260028597151315e-05, "loss": 5.1308, "step": 1242 }, { "epoch": 0.31603368822501193, "grad_norm": 19815.3515625, "learning_rate": 4.3244863410232383e-05, "loss": 5.1272, "step": 1243 }, { "epoch": 0.31628793897981883, "grad_norm": 19953.953125, "learning_rate": 4.322968384595143e-05, "loss": 5.1406, "step": 1244 }, { "epoch": 0.3165421897346258, "grad_norm": 20573.7421875, "learning_rate": 4.3214489916270316e-05, "loss": 5.1237, "step": 1245 }, { "epoch": 0.31679644048943273, "grad_norm": 19799.109375, "learning_rate": 4.3199281633162196e-05, "loss": 5.1371, "step": 1246 }, { "epoch": 0.3170506912442396, "grad_norm": 19982.7734375, "learning_rate": 4.318405900861152e-05, "loss": 5.136, "step": 1247 }, { "epoch": 0.3173049419990466, "grad_norm": 20054.2734375, "learning_rate": 4.3168822054614075e-05, "loss": 5.1295, "step": 1248 }, { "epoch": 0.31755919275385347, "grad_norm": 19859.669921875, "learning_rate": 4.315357078317692e-05, "loss": 5.1281, "step": 1249 }, { "epoch": 0.3178134435086604, "grad_norm": 19729.4375, "learning_rate": 4.3138305206318395e-05, "loss": 5.1198, "step": 1250 }, { "epoch": 0.31806769426346737, "grad_norm": 20013.509765625, "learning_rate": 4.312302533606813e-05, "loss": 5.1245, "step": 1251 }, { "epoch": 0.31832194501827427, "grad_norm": 19770.373046875, "learning_rate": 4.310773118446699e-05, "loss": 5.1169, "step": 1252 }, { "epoch": 0.3185761957730812, "grad_norm": 19846.98046875, "learning_rate": 4.309242276356711e-05, "loss": 5.1464, "step": 1253 }, { "epoch": 0.3188304465278881, "grad_norm": 19756.466796875, "learning_rate": 4.307710008543187e-05, "loss": 5.137, "step": 1254 }, { "epoch": 0.31908469728269506, "grad_norm": 19736.759765625, "learning_rate": 4.30617631621359e-05, "loss": 5.1307, "step": 1255 }, { "epoch": 0.319338948037502, "grad_norm": 19902.673828125, "learning_rate": 4.304641200576502e-05, "loss": 5.1296, "step": 1256 }, { "epoch": 0.3195931987923089, "grad_norm": 19743.802734375, "learning_rate": 4.3031046628416306e-05, "loss": 5.1295, "step": 1257 }, { "epoch": 0.31984744954711586, "grad_norm": 20099.115234375, "learning_rate": 4.301566704219801e-05, "loss": 5.1267, "step": 1258 }, { "epoch": 0.32010170030192275, "grad_norm": 19752.916015625, "learning_rate": 4.3000273259229583e-05, "loss": 5.1342, "step": 1259 }, { "epoch": 0.3203559510567297, "grad_norm": 20736.10546875, "learning_rate": 4.298486529164168e-05, "loss": 5.1347, "step": 1260 }, { "epoch": 0.32061020181153665, "grad_norm": 19977.865234375, "learning_rate": 4.2969443151576126e-05, "loss": 5.113, "step": 1261 }, { "epoch": 0.32086445256634355, "grad_norm": 19813.3125, "learning_rate": 4.2954006851185915e-05, "loss": 5.1096, "step": 1262 }, { "epoch": 0.3211187033211505, "grad_norm": 19996.478515625, "learning_rate": 4.293855640263519e-05, "loss": 5.1153, "step": 1263 }, { "epoch": 0.3213729540759574, "grad_norm": 19746.96484375, "learning_rate": 4.292309181809926e-05, "loss": 5.1153, "step": 1264 }, { "epoch": 0.32162720483076435, "grad_norm": 19771.24609375, "learning_rate": 4.290761310976456e-05, "loss": 5.1201, "step": 1265 }, { "epoch": 0.3218814555855713, "grad_norm": 19841.5625, "learning_rate": 4.2892120289828664e-05, "loss": 5.1215, "step": 1266 }, { "epoch": 0.3221357063403782, "grad_norm": 19981.927734375, "learning_rate": 4.287661337050026e-05, "loss": 5.1351, "step": 1267 }, { "epoch": 0.32238995709518514, "grad_norm": 19845.1484375, "learning_rate": 4.286109236399914e-05, "loss": 5.1024, "step": 1268 }, { "epoch": 0.32264420784999204, "grad_norm": 19998.423828125, "learning_rate": 4.284555728255622e-05, "loss": 5.1124, "step": 1269 }, { "epoch": 0.322898458604799, "grad_norm": 19892.732421875, "learning_rate": 4.283000813841349e-05, "loss": 5.1113, "step": 1270 }, { "epoch": 0.32315270935960594, "grad_norm": 21815.525390625, "learning_rate": 4.2814444943824014e-05, "loss": 5.1093, "step": 1271 }, { "epoch": 0.32340696011441283, "grad_norm": 20048.7109375, "learning_rate": 4.279886771105195e-05, "loss": 5.1214, "step": 1272 }, { "epoch": 0.3236612108692198, "grad_norm": 20031.232421875, "learning_rate": 4.27832764523725e-05, "loss": 5.1167, "step": 1273 }, { "epoch": 0.3239154616240267, "grad_norm": 20546.447265625, "learning_rate": 4.2767671180071935e-05, "loss": 5.105, "step": 1274 }, { "epoch": 0.32416971237883363, "grad_norm": 20309.35546875, "learning_rate": 4.275205190644756e-05, "loss": 5.1325, "step": 1275 }, { "epoch": 0.3244239631336406, "grad_norm": 20087.349609375, "learning_rate": 4.273641864380769e-05, "loss": 5.1169, "step": 1276 }, { "epoch": 0.3246782138884475, "grad_norm": 19972.22265625, "learning_rate": 4.272077140447172e-05, "loss": 5.1187, "step": 1277 }, { "epoch": 0.3249324646432544, "grad_norm": 20051.26171875, "learning_rate": 4.2705110200769996e-05, "loss": 5.1063, "step": 1278 }, { "epoch": 0.3251867153980613, "grad_norm": 20097.4765625, "learning_rate": 4.2689435045043925e-05, "loss": 5.1235, "step": 1279 }, { "epoch": 0.32544096615286827, "grad_norm": 20417.92578125, "learning_rate": 4.267374594964586e-05, "loss": 5.107, "step": 1280 }, { "epoch": 0.3256952169076752, "grad_norm": 21291.435546875, "learning_rate": 4.2658042926939175e-05, "loss": 5.1075, "step": 1281 }, { "epoch": 0.3259494676624821, "grad_norm": 19855.05078125, "learning_rate": 4.2642325989298194e-05, "loss": 5.1059, "step": 1282 }, { "epoch": 0.32620371841728907, "grad_norm": 20626.611328125, "learning_rate": 4.262659514910823e-05, "loss": 5.1122, "step": 1283 }, { "epoch": 0.32645796917209596, "grad_norm": 19825.59765625, "learning_rate": 4.261085041876552e-05, "loss": 5.1066, "step": 1284 }, { "epoch": 0.3267122199269029, "grad_norm": 20555.48046875, "learning_rate": 4.259509181067728e-05, "loss": 5.0968, "step": 1285 }, { "epoch": 0.3269664706817098, "grad_norm": 19933.44921875, "learning_rate": 4.2579319337261644e-05, "loss": 5.1054, "step": 1286 }, { "epoch": 0.32722072143651676, "grad_norm": 20873.533203125, "learning_rate": 4.256353301094767e-05, "loss": 5.0964, "step": 1287 }, { "epoch": 0.3274749721913237, "grad_norm": 19889.3125, "learning_rate": 4.254773284417534e-05, "loss": 5.1, "step": 1288 }, { "epoch": 0.3277292229461306, "grad_norm": 21461.48828125, "learning_rate": 4.253191884939554e-05, "loss": 5.1067, "step": 1289 }, { "epoch": 0.32798347370093756, "grad_norm": 20288.03515625, "learning_rate": 4.251609103907006e-05, "loss": 5.0986, "step": 1290 }, { "epoch": 0.32823772445574445, "grad_norm": 20197.896484375, "learning_rate": 4.250024942567156e-05, "loss": 5.1063, "step": 1291 }, { "epoch": 0.3284919752105514, "grad_norm": 20193.87890625, "learning_rate": 4.2484394021683596e-05, "loss": 5.1088, "step": 1292 }, { "epoch": 0.32874622596535835, "grad_norm": 20334.603515625, "learning_rate": 4.2468524839600566e-05, "loss": 5.0992, "step": 1293 }, { "epoch": 0.32900047672016525, "grad_norm": 20315.34375, "learning_rate": 4.245264189192776e-05, "loss": 5.1087, "step": 1294 }, { "epoch": 0.3292547274749722, "grad_norm": 20287.775390625, "learning_rate": 4.243674519118129e-05, "loss": 5.0836, "step": 1295 }, { "epoch": 0.3295089782297791, "grad_norm": 20205.9453125, "learning_rate": 4.242083474988812e-05, "loss": 5.0961, "step": 1296 }, { "epoch": 0.32976322898458604, "grad_norm": 20001.400390625, "learning_rate": 4.240491058058601e-05, "loss": 5.0987, "step": 1297 }, { "epoch": 0.330017479739393, "grad_norm": 20167.29296875, "learning_rate": 4.2388972695823594e-05, "loss": 5.0903, "step": 1298 }, { "epoch": 0.3302717304941999, "grad_norm": 20238.060546875, "learning_rate": 4.237302110816027e-05, "loss": 5.1022, "step": 1299 }, { "epoch": 0.33052598124900684, "grad_norm": 20412.294921875, "learning_rate": 4.235705583016625e-05, "loss": 5.0886, "step": 1300 }, { "epoch": 0.33052598124900684, "eval_loss": 10.275012016296387, "eval_runtime": 699.146, "eval_samples_per_second": 151.594, "eval_steps_per_second": 9.476, "step": 1300 }, { "epoch": 0.33078023200381373, "grad_norm": 20100.3828125, "learning_rate": 4.234107687442252e-05, "loss": 5.0906, "step": 1301 }, { "epoch": 0.3310344827586207, "grad_norm": 20233.59765625, "learning_rate": 4.232508425352087e-05, "loss": 5.1064, "step": 1302 }, { "epoch": 0.33128873351342764, "grad_norm": 20024.859375, "learning_rate": 4.230907798006384e-05, "loss": 5.1014, "step": 1303 }, { "epoch": 0.33154298426823453, "grad_norm": 20425.4375, "learning_rate": 4.2293058066664734e-05, "loss": 5.0751, "step": 1304 }, { "epoch": 0.3317972350230415, "grad_norm": 20071.107421875, "learning_rate": 4.227702452594759e-05, "loss": 5.0973, "step": 1305 }, { "epoch": 0.3320514857778484, "grad_norm": 20287.640625, "learning_rate": 4.2260977370547225e-05, "loss": 5.094, "step": 1306 }, { "epoch": 0.3323057365326553, "grad_norm": 20121.212890625, "learning_rate": 4.2244916613109135e-05, "loss": 5.0985, "step": 1307 }, { "epoch": 0.3325599872874623, "grad_norm": 20336.966796875, "learning_rate": 4.222884226628957e-05, "loss": 5.0827, "step": 1308 }, { "epoch": 0.3328142380422692, "grad_norm": 20287.17578125, "learning_rate": 4.2212754342755464e-05, "loss": 5.1025, "step": 1309 }, { "epoch": 0.3330684887970761, "grad_norm": 20628.662109375, "learning_rate": 4.219665285518447e-05, "loss": 5.0942, "step": 1310 }, { "epoch": 0.333322739551883, "grad_norm": 20041.59375, "learning_rate": 4.218053781626493e-05, "loss": 5.0862, "step": 1311 }, { "epoch": 0.33357699030668997, "grad_norm": 20977.61328125, "learning_rate": 4.216440923869584e-05, "loss": 5.0848, "step": 1312 }, { "epoch": 0.3338312410614969, "grad_norm": 20465.666015625, "learning_rate": 4.214826713518689e-05, "loss": 5.0858, "step": 1313 }, { "epoch": 0.3340854918163038, "grad_norm": 20362.083984375, "learning_rate": 4.213211151845842e-05, "loss": 5.1007, "step": 1314 }, { "epoch": 0.33433974257111077, "grad_norm": 20163.548828125, "learning_rate": 4.211594240124141e-05, "loss": 5.1008, "step": 1315 }, { "epoch": 0.33459399332591766, "grad_norm": 20307.2890625, "learning_rate": 4.209975979627751e-05, "loss": 5.0741, "step": 1316 }, { "epoch": 0.3348482440807246, "grad_norm": 20222.36328125, "learning_rate": 4.208356371631894e-05, "loss": 5.0628, "step": 1317 }, { "epoch": 0.33510249483553156, "grad_norm": 20340.34375, "learning_rate": 4.2067354174128606e-05, "loss": 5.0871, "step": 1318 }, { "epoch": 0.33535674559033846, "grad_norm": 20454.966796875, "learning_rate": 4.205113118247999e-05, "loss": 5.0944, "step": 1319 }, { "epoch": 0.3356109963451454, "grad_norm": 20080.46484375, "learning_rate": 4.203489475415714e-05, "loss": 5.1005, "step": 1320 }, { "epoch": 0.3358652470999523, "grad_norm": 20567.080078125, "learning_rate": 4.2018644901954765e-05, "loss": 5.0768, "step": 1321 }, { "epoch": 0.33611949785475925, "grad_norm": 20374.1875, "learning_rate": 4.20023816386781e-05, "loss": 5.1077, "step": 1322 }, { "epoch": 0.3363737486095662, "grad_norm": 20328.609375, "learning_rate": 4.198610497714296e-05, "loss": 5.0923, "step": 1323 }, { "epoch": 0.3366279993643731, "grad_norm": 20210.048828125, "learning_rate": 4.196981493017572e-05, "loss": 5.0756, "step": 1324 }, { "epoch": 0.33688225011918005, "grad_norm": 20224.494140625, "learning_rate": 4.19535115106133e-05, "loss": 5.0841, "step": 1325 }, { "epoch": 0.33713650087398694, "grad_norm": 20655.853515625, "learning_rate": 4.193719473130317e-05, "loss": 5.07, "step": 1326 }, { "epoch": 0.3373907516287939, "grad_norm": 20672.8046875, "learning_rate": 4.1920864605103304e-05, "loss": 5.075, "step": 1327 }, { "epoch": 0.33764500238360085, "grad_norm": 20118.31640625, "learning_rate": 4.190452114488222e-05, "loss": 5.0704, "step": 1328 }, { "epoch": 0.33789925313840774, "grad_norm": 20429.97265625, "learning_rate": 4.1888164363518926e-05, "loss": 5.0864, "step": 1329 }, { "epoch": 0.3381535038932147, "grad_norm": 19991.333984375, "learning_rate": 4.187179427390293e-05, "loss": 5.0901, "step": 1330 }, { "epoch": 0.3384077546480216, "grad_norm": 20147.642578125, "learning_rate": 4.1855410888934244e-05, "loss": 5.0817, "step": 1331 }, { "epoch": 0.33866200540282854, "grad_norm": 20130.478515625, "learning_rate": 4.183901422152332e-05, "loss": 5.071, "step": 1332 }, { "epoch": 0.3389162561576355, "grad_norm": 20706.8828125, "learning_rate": 4.182260428459113e-05, "loss": 5.0844, "step": 1333 }, { "epoch": 0.3391705069124424, "grad_norm": 20779.1875, "learning_rate": 4.1806181091069046e-05, "loss": 5.066, "step": 1334 }, { "epoch": 0.33942475766724933, "grad_norm": 20123.12109375, "learning_rate": 4.178974465389893e-05, "loss": 5.0754, "step": 1335 }, { "epoch": 0.33967900842205623, "grad_norm": 20410.642578125, "learning_rate": 4.177329498603305e-05, "loss": 5.0684, "step": 1336 }, { "epoch": 0.3399332591768632, "grad_norm": 20151.8515625, "learning_rate": 4.175683210043413e-05, "loss": 5.0753, "step": 1337 }, { "epoch": 0.34018750993167013, "grad_norm": 20529.083984375, "learning_rate": 4.174035601007528e-05, "loss": 5.0828, "step": 1338 }, { "epoch": 0.340441760686477, "grad_norm": 20459.279296875, "learning_rate": 4.1723866727940036e-05, "loss": 5.0828, "step": 1339 }, { "epoch": 0.340696011441284, "grad_norm": 21032.62890625, "learning_rate": 4.170736426702232e-05, "loss": 5.0667, "step": 1340 }, { "epoch": 0.34095026219609087, "grad_norm": 20187.595703125, "learning_rate": 4.1690848640326444e-05, "loss": 5.0701, "step": 1341 }, { "epoch": 0.3412045129508978, "grad_norm": 20655.5234375, "learning_rate": 4.167431986086708e-05, "loss": 5.0878, "step": 1342 }, { "epoch": 0.34145876370570477, "grad_norm": 20282.087890625, "learning_rate": 4.16577779416693e-05, "loss": 5.0669, "step": 1343 }, { "epoch": 0.34171301446051167, "grad_norm": 20813.08984375, "learning_rate": 4.164122289576849e-05, "loss": 5.0834, "step": 1344 }, { "epoch": 0.3419672652153186, "grad_norm": 20602.091796875, "learning_rate": 4.16246547362104e-05, "loss": 5.0712, "step": 1345 }, { "epoch": 0.3422215159701255, "grad_norm": 20427.025390625, "learning_rate": 4.160807347605112e-05, "loss": 5.0769, "step": 1346 }, { "epoch": 0.34247576672493246, "grad_norm": 21232.017578125, "learning_rate": 4.1591479128357054e-05, "loss": 5.0635, "step": 1347 }, { "epoch": 0.3427300174797394, "grad_norm": 20302.38671875, "learning_rate": 4.157487170620491e-05, "loss": 5.0631, "step": 1348 }, { "epoch": 0.3429842682345463, "grad_norm": 20684.921875, "learning_rate": 4.155825122268172e-05, "loss": 5.0664, "step": 1349 }, { "epoch": 0.34323851898935326, "grad_norm": 20399.96484375, "learning_rate": 4.154161769088479e-05, "loss": 5.058, "step": 1350 }, { "epoch": 0.34349276974416015, "grad_norm": 20703.484375, "learning_rate": 4.152497112392173e-05, "loss": 5.0605, "step": 1351 }, { "epoch": 0.3437470204989671, "grad_norm": 20663.806640625, "learning_rate": 4.1508311534910394e-05, "loss": 5.0588, "step": 1352 }, { "epoch": 0.34400127125377405, "grad_norm": 20704.51953125, "learning_rate": 4.149163893697893e-05, "loss": 5.0786, "step": 1353 }, { "epoch": 0.34425552200858095, "grad_norm": 21822.69140625, "learning_rate": 4.147495334326569e-05, "loss": 5.0637, "step": 1354 }, { "epoch": 0.3445097727633879, "grad_norm": 21407.75390625, "learning_rate": 4.145825476691932e-05, "loss": 5.0656, "step": 1355 }, { "epoch": 0.3447640235181948, "grad_norm": 21587.5703125, "learning_rate": 4.144154322109867e-05, "loss": 5.0581, "step": 1356 }, { "epoch": 0.34501827427300175, "grad_norm": 20367.529296875, "learning_rate": 4.142481871897281e-05, "loss": 5.0524, "step": 1357 }, { "epoch": 0.3452725250278087, "grad_norm": 25677.013671875, "learning_rate": 4.1408081273721023e-05, "loss": 5.0546, "step": 1358 }, { "epoch": 0.3455267757826156, "grad_norm": 22312.373046875, "learning_rate": 4.1391330898532794e-05, "loss": 5.07, "step": 1359 }, { "epoch": 0.34578102653742254, "grad_norm": 22724.1171875, "learning_rate": 4.137456760660779e-05, "loss": 5.0625, "step": 1360 }, { "epoch": 0.34603527729222944, "grad_norm": 20273.291015625, "learning_rate": 4.1357791411155865e-05, "loss": 5.0526, "step": 1361 }, { "epoch": 0.3462895280470364, "grad_norm": 27532.322265625, "learning_rate": 4.134100232539704e-05, "loss": 5.072, "step": 1362 }, { "epoch": 0.34654377880184334, "grad_norm": 23138.115234375, "learning_rate": 4.132420036256148e-05, "loss": 5.0542, "step": 1363 }, { "epoch": 0.34679802955665023, "grad_norm": 21453.220703125, "learning_rate": 4.130738553588953e-05, "loss": 5.061, "step": 1364 }, { "epoch": 0.3470522803114572, "grad_norm": 20350.22265625, "learning_rate": 4.129055785863163e-05, "loss": 5.0696, "step": 1365 }, { "epoch": 0.3473065310662641, "grad_norm": 22019.0078125, "learning_rate": 4.1273717344048375e-05, "loss": 5.0538, "step": 1366 }, { "epoch": 0.34756078182107103, "grad_norm": 21566.451171875, "learning_rate": 4.125686400541047e-05, "loss": 5.0669, "step": 1367 }, { "epoch": 0.347815032575878, "grad_norm": 20886.859375, "learning_rate": 4.123999785599873e-05, "loss": 5.0543, "step": 1368 }, { "epoch": 0.3480692833306849, "grad_norm": 20517.46484375, "learning_rate": 4.1223118909104055e-05, "loss": 5.0622, "step": 1369 }, { "epoch": 0.3483235340854918, "grad_norm": 21522.505859375, "learning_rate": 4.1206227178027426e-05, "loss": 5.048, "step": 1370 }, { "epoch": 0.3485777848402987, "grad_norm": 20795.326171875, "learning_rate": 4.118932267607991e-05, "loss": 5.0577, "step": 1371 }, { "epoch": 0.34883203559510567, "grad_norm": 21447.560546875, "learning_rate": 4.117240541658264e-05, "loss": 5.0348, "step": 1372 }, { "epoch": 0.3490862863499126, "grad_norm": 20870.556640625, "learning_rate": 4.11554754128668e-05, "loss": 5.0614, "step": 1373 }, { "epoch": 0.3493405371047195, "grad_norm": 24859.7109375, "learning_rate": 4.11385326782736e-05, "loss": 5.0334, "step": 1374 }, { "epoch": 0.34959478785952647, "grad_norm": 21025.8984375, "learning_rate": 4.11215772261543e-05, "loss": 5.0341, "step": 1375 }, { "epoch": 0.34984903861433336, "grad_norm": 20546.03125, "learning_rate": 4.110460906987018e-05, "loss": 5.0444, "step": 1376 }, { "epoch": 0.3501032893691403, "grad_norm": 20573.177734375, "learning_rate": 4.108762822279253e-05, "loss": 5.0456, "step": 1377 }, { "epoch": 0.35035754012394726, "grad_norm": 21535.056640625, "learning_rate": 4.107063469830263e-05, "loss": 5.0614, "step": 1378 }, { "epoch": 0.35061179087875416, "grad_norm": 20411.982421875, "learning_rate": 4.1053628509791766e-05, "loss": 5.0659, "step": 1379 }, { "epoch": 0.3508660416335611, "grad_norm": 21210.41015625, "learning_rate": 4.1036609670661196e-05, "loss": 5.0465, "step": 1380 }, { "epoch": 0.351120292388368, "grad_norm": 20453.76171875, "learning_rate": 4.101957819432215e-05, "loss": 5.0596, "step": 1381 }, { "epoch": 0.35137454314317496, "grad_norm": 21562.908203125, "learning_rate": 4.10025340941958e-05, "loss": 5.0398, "step": 1382 }, { "epoch": 0.3516287938979819, "grad_norm": 20612.826171875, "learning_rate": 4.098547738371329e-05, "loss": 5.0448, "step": 1383 }, { "epoch": 0.3518830446527888, "grad_norm": 20957.23828125, "learning_rate": 4.09684080763157e-05, "loss": 5.0404, "step": 1384 }, { "epoch": 0.35213729540759575, "grad_norm": 21259.423828125, "learning_rate": 4.095132618545401e-05, "loss": 5.0433, "step": 1385 }, { "epoch": 0.35239154616240265, "grad_norm": 21018.7890625, "learning_rate": 4.093423172458914e-05, "loss": 5.0539, "step": 1386 }, { "epoch": 0.3526457969172096, "grad_norm": 20516.033203125, "learning_rate": 4.0917124707191915e-05, "loss": 5.0491, "step": 1387 }, { "epoch": 0.35290004767201655, "grad_norm": 20672.13671875, "learning_rate": 4.0900005146743035e-05, "loss": 5.0401, "step": 1388 }, { "epoch": 0.35315429842682344, "grad_norm": 20577.83203125, "learning_rate": 4.0882873056733116e-05, "loss": 5.0303, "step": 1389 }, { "epoch": 0.3534085491816304, "grad_norm": 20534.044921875, "learning_rate": 4.086572845066262e-05, "loss": 5.0558, "step": 1390 }, { "epoch": 0.3536627999364373, "grad_norm": 20514.53125, "learning_rate": 4.084857134204187e-05, "loss": 5.0317, "step": 1391 }, { "epoch": 0.35391705069124424, "grad_norm": 20635.29296875, "learning_rate": 4.0831401744391087e-05, "loss": 5.0513, "step": 1392 }, { "epoch": 0.3541713014460512, "grad_norm": 20619.875, "learning_rate": 4.081421967124026e-05, "loss": 5.0604, "step": 1393 }, { "epoch": 0.3544255522008581, "grad_norm": 20880.6640625, "learning_rate": 4.079702513612927e-05, "loss": 5.0263, "step": 1394 }, { "epoch": 0.35467980295566504, "grad_norm": 20556.16015625, "learning_rate": 4.077981815260779e-05, "loss": 5.0342, "step": 1395 }, { "epoch": 0.35493405371047193, "grad_norm": 20635.166015625, "learning_rate": 4.0762598734235314e-05, "loss": 5.0356, "step": 1396 }, { "epoch": 0.3551883044652789, "grad_norm": 20488.4921875, "learning_rate": 4.0745366894581126e-05, "loss": 5.0288, "step": 1397 }, { "epoch": 0.35544255522008583, "grad_norm": 20495.84765625, "learning_rate": 4.072812264722431e-05, "loss": 5.0337, "step": 1398 }, { "epoch": 0.3556968059748927, "grad_norm": 20507.818359375, "learning_rate": 4.071086600575371e-05, "loss": 5.0293, "step": 1399 }, { "epoch": 0.3559510567296997, "grad_norm": 20516.603515625, "learning_rate": 4.069359698376795e-05, "loss": 5.0313, "step": 1400 }, { "epoch": 0.3559510567296997, "eval_loss": 10.153711318969727, "eval_runtime": 696.649, "eval_samples_per_second": 152.137, "eval_steps_per_second": 9.51, "step": 1400 }, { "epoch": 0.3562053074845066, "grad_norm": 21380.99609375, "learning_rate": 4.0676315594875416e-05, "loss": 5.0426, "step": 1401 }, { "epoch": 0.3564595582393135, "grad_norm": 24253.44140625, "learning_rate": 4.0659021852694226e-05, "loss": 5.0363, "step": 1402 }, { "epoch": 0.3567138089941205, "grad_norm": 20590.5078125, "learning_rate": 4.0641715770852215e-05, "loss": 5.0254, "step": 1403 }, { "epoch": 0.35696805974892737, "grad_norm": 21081.021484375, "learning_rate": 4.0624397362987e-05, "loss": 5.0305, "step": 1404 }, { "epoch": 0.3572223105037343, "grad_norm": 20629.21484375, "learning_rate": 4.060706664274585e-05, "loss": 5.0423, "step": 1405 }, { "epoch": 0.3574765612585412, "grad_norm": 21007.060546875, "learning_rate": 4.058972362378578e-05, "loss": 5.0327, "step": 1406 }, { "epoch": 0.35773081201334817, "grad_norm": 20600.265625, "learning_rate": 4.057236831977346e-05, "loss": 5.0392, "step": 1407 }, { "epoch": 0.3579850627681551, "grad_norm": 20932.49609375, "learning_rate": 4.0555000744385274e-05, "loss": 5.0392, "step": 1408 }, { "epoch": 0.358239313522962, "grad_norm": 20723.2265625, "learning_rate": 4.053762091130725e-05, "loss": 5.037, "step": 1409 }, { "epoch": 0.35849356427776896, "grad_norm": 20771.119140625, "learning_rate": 4.052022883423509e-05, "loss": 5.0302, "step": 1410 }, { "epoch": 0.35874781503257586, "grad_norm": 20619.994140625, "learning_rate": 4.050282452687415e-05, "loss": 5.0469, "step": 1411 }, { "epoch": 0.3590020657873828, "grad_norm": 20759.640625, "learning_rate": 4.04854080029394e-05, "loss": 5.0293, "step": 1412 }, { "epoch": 0.35925631654218976, "grad_norm": 20662.76953125, "learning_rate": 4.0467979276155464e-05, "loss": 5.0319, "step": 1413 }, { "epoch": 0.35951056729699665, "grad_norm": 20976.1875, "learning_rate": 4.045053836025656e-05, "loss": 5.0432, "step": 1414 }, { "epoch": 0.3597648180518036, "grad_norm": 20431.185546875, "learning_rate": 4.043308526898654e-05, "loss": 5.0419, "step": 1415 }, { "epoch": 0.3600190688066105, "grad_norm": 20750.216796875, "learning_rate": 4.041562001609881e-05, "loss": 5.0167, "step": 1416 }, { "epoch": 0.36027331956141745, "grad_norm": 20800.796875, "learning_rate": 4.0398142615356396e-05, "loss": 5.0224, "step": 1417 }, { "epoch": 0.3605275703162244, "grad_norm": 20754.34765625, "learning_rate": 4.038065308053187e-05, "loss": 5.0426, "step": 1418 }, { "epoch": 0.3607818210710313, "grad_norm": 20627.396484375, "learning_rate": 4.036315142540739e-05, "loss": 5.0126, "step": 1419 }, { "epoch": 0.36103607182583825, "grad_norm": 24564.541015625, "learning_rate": 4.0345637663774635e-05, "loss": 5.0212, "step": 1420 }, { "epoch": 0.36129032258064514, "grad_norm": 37871.4765625, "learning_rate": 4.032811180943487e-05, "loss": 5.0426, "step": 1421 }, { "epoch": 0.3615445733354521, "grad_norm": 20797.484375, "learning_rate": 4.0310573876198846e-05, "loss": 5.0191, "step": 1422 }, { "epoch": 0.36179882409025904, "grad_norm": 21103.12109375, "learning_rate": 4.0293023877886846e-05, "loss": 5.0256, "step": 1423 }, { "epoch": 0.36205307484506594, "grad_norm": 20829.919921875, "learning_rate": 4.027546182832866e-05, "loss": 5.0255, "step": 1424 }, { "epoch": 0.3623073255998729, "grad_norm": 21079.755859375, "learning_rate": 4.0257887741363585e-05, "loss": 5.0259, "step": 1425 }, { "epoch": 0.3625615763546798, "grad_norm": 20816.08984375, "learning_rate": 4.02403016308404e-05, "loss": 5.034, "step": 1426 }, { "epoch": 0.36281582710948673, "grad_norm": 21079.033203125, "learning_rate": 4.022270351061735e-05, "loss": 5.0219, "step": 1427 }, { "epoch": 0.3630700778642937, "grad_norm": 21122.498046875, "learning_rate": 4.020509339456214e-05, "loss": 5.0042, "step": 1428 }, { "epoch": 0.3633243286191006, "grad_norm": 20782.990234375, "learning_rate": 4.0187471296551956e-05, "loss": 5.0208, "step": 1429 }, { "epoch": 0.36357857937390753, "grad_norm": 20848.810546875, "learning_rate": 4.0169837230473386e-05, "loss": 5.0285, "step": 1430 }, { "epoch": 0.3638328301287144, "grad_norm": 20720.689453125, "learning_rate": 4.0152191210222485e-05, "loss": 5.0277, "step": 1431 }, { "epoch": 0.3640870808835214, "grad_norm": 20801.99609375, "learning_rate": 4.013453324970471e-05, "loss": 5.0248, "step": 1432 }, { "epoch": 0.3643413316383283, "grad_norm": 20609.95703125, "learning_rate": 4.011686336283492e-05, "loss": 5.0061, "step": 1433 }, { "epoch": 0.3645955823931352, "grad_norm": 21219.537109375, "learning_rate": 4.0099181563537395e-05, "loss": 5.0149, "step": 1434 }, { "epoch": 0.36484983314794217, "grad_norm": 20856.98828125, "learning_rate": 4.008148786574579e-05, "loss": 5.0149, "step": 1435 }, { "epoch": 0.36510408390274907, "grad_norm": 20848.314453125, "learning_rate": 4.006378228340313e-05, "loss": 5.0069, "step": 1436 }, { "epoch": 0.365358334657556, "grad_norm": 21045.689453125, "learning_rate": 4.0046064830461816e-05, "loss": 5.0127, "step": 1437 }, { "epoch": 0.36561258541236297, "grad_norm": 20940.07421875, "learning_rate": 4.002833552088359e-05, "loss": 5.0187, "step": 1438 }, { "epoch": 0.36586683616716986, "grad_norm": 20717.06640625, "learning_rate": 4.001059436863955e-05, "loss": 5.0132, "step": 1439 }, { "epoch": 0.3661210869219768, "grad_norm": 21017.23828125, "learning_rate": 3.999284138771013e-05, "loss": 4.994, "step": 1440 }, { "epoch": 0.3663753376767837, "grad_norm": 20876.576171875, "learning_rate": 3.997507659208507e-05, "loss": 5.017, "step": 1441 }, { "epoch": 0.36662958843159066, "grad_norm": 21044.259765625, "learning_rate": 3.995729999576343e-05, "loss": 5.0022, "step": 1442 }, { "epoch": 0.3668838391863976, "grad_norm": 20672.845703125, "learning_rate": 3.9939511612753564e-05, "loss": 5.0136, "step": 1443 }, { "epoch": 0.3671380899412045, "grad_norm": 20896.17578125, "learning_rate": 3.9921711457073125e-05, "loss": 5.0031, "step": 1444 }, { "epoch": 0.36739234069601145, "grad_norm": 21340.353515625, "learning_rate": 3.9903899542749026e-05, "loss": 5.0142, "step": 1445 }, { "epoch": 0.36764659145081835, "grad_norm": 20945.185546875, "learning_rate": 3.988607588381746e-05, "loss": 5.0071, "step": 1446 }, { "epoch": 0.3679008422056253, "grad_norm": 21055.23828125, "learning_rate": 3.986824049432387e-05, "loss": 4.9991, "step": 1447 }, { "epoch": 0.36815509296043225, "grad_norm": 20799.30859375, "learning_rate": 3.985039338832295e-05, "loss": 5.0117, "step": 1448 }, { "epoch": 0.36840934371523915, "grad_norm": 20687.353515625, "learning_rate": 3.983253457987861e-05, "loss": 4.9938, "step": 1449 }, { "epoch": 0.3686635944700461, "grad_norm": 20659.6875, "learning_rate": 3.981466408306399e-05, "loss": 4.9841, "step": 1450 }, { "epoch": 0.368917845224853, "grad_norm": 20908.5078125, "learning_rate": 3.979678191196146e-05, "loss": 5.0204, "step": 1451 }, { "epoch": 0.36917209597965994, "grad_norm": 20579.8984375, "learning_rate": 3.9778888080662555e-05, "loss": 4.9961, "step": 1452 }, { "epoch": 0.3694263467344669, "grad_norm": 20834.25390625, "learning_rate": 3.976098260326802e-05, "loss": 5.0062, "step": 1453 }, { "epoch": 0.3696805974892738, "grad_norm": 20722.501953125, "learning_rate": 3.9743065493887774e-05, "loss": 5.0081, "step": 1454 }, { "epoch": 0.36993484824408074, "grad_norm": 20788.0703125, "learning_rate": 3.97251367666409e-05, "loss": 5.0124, "step": 1455 }, { "epoch": 0.37018909899888763, "grad_norm": 20767.16015625, "learning_rate": 3.970719643565565e-05, "loss": 5.0078, "step": 1456 }, { "epoch": 0.3704433497536946, "grad_norm": 20994.658203125, "learning_rate": 3.968924451506939e-05, "loss": 5.0139, "step": 1457 }, { "epoch": 0.37069760050850153, "grad_norm": 20918.130859375, "learning_rate": 3.9671281019028645e-05, "loss": 5.0012, "step": 1458 }, { "epoch": 0.37095185126330843, "grad_norm": 20856.056640625, "learning_rate": 3.9653305961689044e-05, "loss": 5.0008, "step": 1459 }, { "epoch": 0.3712061020181154, "grad_norm": 21184.412109375, "learning_rate": 3.9635319357215365e-05, "loss": 5.0242, "step": 1460 }, { "epoch": 0.3714603527729223, "grad_norm": 20952.85546875, "learning_rate": 3.961732121978142e-05, "loss": 5.0063, "step": 1461 }, { "epoch": 0.3717146035277292, "grad_norm": 20994.27734375, "learning_rate": 3.959931156357016e-05, "loss": 4.9989, "step": 1462 }, { "epoch": 0.3719688542825362, "grad_norm": 20795.455078125, "learning_rate": 3.9581290402773605e-05, "loss": 4.9806, "step": 1463 }, { "epoch": 0.37222310503734307, "grad_norm": 21110.5234375, "learning_rate": 3.956325775159282e-05, "loss": 4.9924, "step": 1464 }, { "epoch": 0.37247735579215, "grad_norm": 21154.40625, "learning_rate": 3.954521362423795e-05, "loss": 4.9912, "step": 1465 }, { "epoch": 0.3727316065469569, "grad_norm": 21014.50390625, "learning_rate": 3.952715803492818e-05, "loss": 5.0042, "step": 1466 }, { "epoch": 0.37298585730176387, "grad_norm": 21070.060546875, "learning_rate": 3.95090909978917e-05, "loss": 5.0039, "step": 1467 }, { "epoch": 0.3732401080565708, "grad_norm": 21246.404296875, "learning_rate": 3.9491012527365753e-05, "loss": 5.0018, "step": 1468 }, { "epoch": 0.3734943588113777, "grad_norm": 21604.51953125, "learning_rate": 3.9472922637596576e-05, "loss": 5.0051, "step": 1469 }, { "epoch": 0.37374860956618466, "grad_norm": 20936.728515625, "learning_rate": 3.945482134283941e-05, "loss": 4.9825, "step": 1470 }, { "epoch": 0.37400286032099156, "grad_norm": 20951.236328125, "learning_rate": 3.943670865735849e-05, "loss": 4.9922, "step": 1471 }, { "epoch": 0.3742571110757985, "grad_norm": 20979.51171875, "learning_rate": 3.9418584595427e-05, "loss": 4.997, "step": 1472 }, { "epoch": 0.37451136183060546, "grad_norm": 21166.302734375, "learning_rate": 3.9400449171327115e-05, "loss": 4.9898, "step": 1473 }, { "epoch": 0.37476561258541236, "grad_norm": 20928.44140625, "learning_rate": 3.938230239934997e-05, "loss": 4.9839, "step": 1474 }, { "epoch": 0.3750198633402193, "grad_norm": 21149.673828125, "learning_rate": 3.93641442937956e-05, "loss": 4.9893, "step": 1475 }, { "epoch": 0.3752741140950262, "grad_norm": 20992.947265625, "learning_rate": 3.934597486897303e-05, "loss": 4.9882, "step": 1476 }, { "epoch": 0.37552836484983315, "grad_norm": 21158.515625, "learning_rate": 3.932779413920017e-05, "loss": 5.0042, "step": 1477 }, { "epoch": 0.3757826156046401, "grad_norm": 20933.39453125, "learning_rate": 3.9309602118803824e-05, "loss": 4.9999, "step": 1478 }, { "epoch": 0.376036866359447, "grad_norm": 21171.5234375, "learning_rate": 3.9291398822119725e-05, "loss": 5.0124, "step": 1479 }, { "epoch": 0.37629111711425395, "grad_norm": 21032.017578125, "learning_rate": 3.927318426349248e-05, "loss": 5.0032, "step": 1480 }, { "epoch": 0.37654536786906084, "grad_norm": 20867.19140625, "learning_rate": 3.925495845727557e-05, "loss": 4.9924, "step": 1481 }, { "epoch": 0.3767996186238678, "grad_norm": 21033.208984375, "learning_rate": 3.923672141783133e-05, "loss": 4.9901, "step": 1482 }, { "epoch": 0.37705386937867474, "grad_norm": 21061.41015625, "learning_rate": 3.9218473159530975e-05, "loss": 4.9939, "step": 1483 }, { "epoch": 0.37730812013348164, "grad_norm": 21026.09765625, "learning_rate": 3.920021369675453e-05, "loss": 4.9824, "step": 1484 }, { "epoch": 0.3775623708882886, "grad_norm": 21149.673828125, "learning_rate": 3.9181943043890865e-05, "loss": 4.982, "step": 1485 }, { "epoch": 0.3778166216430955, "grad_norm": 20940.236328125, "learning_rate": 3.916366121533767e-05, "loss": 4.9815, "step": 1486 }, { "epoch": 0.37807087239790244, "grad_norm": 21251.8515625, "learning_rate": 3.914536822550141e-05, "loss": 4.9913, "step": 1487 }, { "epoch": 0.3783251231527094, "grad_norm": 20905.994140625, "learning_rate": 3.9127064088797406e-05, "loss": 4.9715, "step": 1488 }, { "epoch": 0.3785793739075163, "grad_norm": 21418.8125, "learning_rate": 3.910874881964971e-05, "loss": 4.9821, "step": 1489 }, { "epoch": 0.37883362466232323, "grad_norm": 21253.865234375, "learning_rate": 3.909042243249117e-05, "loss": 4.984, "step": 1490 }, { "epoch": 0.3790878754171301, "grad_norm": 20961.84375, "learning_rate": 3.9072084941763395e-05, "loss": 4.9671, "step": 1491 }, { "epoch": 0.3793421261719371, "grad_norm": 21087.693359375, "learning_rate": 3.905373636191673e-05, "loss": 4.9792, "step": 1492 }, { "epoch": 0.37959637692674403, "grad_norm": 20970.341796875, "learning_rate": 3.903537670741026e-05, "loss": 4.9965, "step": 1493 }, { "epoch": 0.3798506276815509, "grad_norm": 21103.857421875, "learning_rate": 3.901700599271184e-05, "loss": 4.9863, "step": 1494 }, { "epoch": 0.3801048784363579, "grad_norm": 20976.533203125, "learning_rate": 3.8998624232297975e-05, "loss": 4.9838, "step": 1495 }, { "epoch": 0.38035912919116477, "grad_norm": 20968.71875, "learning_rate": 3.89802314406539e-05, "loss": 4.978, "step": 1496 }, { "epoch": 0.3806133799459717, "grad_norm": 21097.630859375, "learning_rate": 3.896182763227358e-05, "loss": 4.9771, "step": 1497 }, { "epoch": 0.38086763070077867, "grad_norm": 21137.41796875, "learning_rate": 3.89434128216596e-05, "loss": 4.9686, "step": 1498 }, { "epoch": 0.38112188145558556, "grad_norm": 20979.57421875, "learning_rate": 3.892498702332326e-05, "loss": 4.97, "step": 1499 }, { "epoch": 0.3813761322103925, "grad_norm": 21282.443359375, "learning_rate": 3.890655025178449e-05, "loss": 4.9794, "step": 1500 }, { "epoch": 0.3813761322103925, "eval_loss": 10.037830352783203, "eval_runtime": 699.4502, "eval_samples_per_second": 151.528, "eval_steps_per_second": 9.472, "step": 1500 }, { "epoch": 0.3816303829651994, "grad_norm": 21108.099609375, "learning_rate": 3.888810252157189e-05, "loss": 4.9901, "step": 1501 }, { "epoch": 0.38188463372000636, "grad_norm": 21029.072265625, "learning_rate": 3.886964384722268e-05, "loss": 4.9658, "step": 1502 }, { "epoch": 0.3821388844748133, "grad_norm": 20969.4140625, "learning_rate": 3.885117424328271e-05, "loss": 4.9668, "step": 1503 }, { "epoch": 0.3823931352296202, "grad_norm": 21315.923828125, "learning_rate": 3.8832693724306444e-05, "loss": 4.9728, "step": 1504 }, { "epoch": 0.38264738598442716, "grad_norm": 21029.814453125, "learning_rate": 3.881420230485696e-05, "loss": 4.9772, "step": 1505 }, { "epoch": 0.38290163673923405, "grad_norm": 20990.447265625, "learning_rate": 3.879569999950589e-05, "loss": 4.9651, "step": 1506 }, { "epoch": 0.383155887494041, "grad_norm": 21186.33984375, "learning_rate": 3.877718682283347e-05, "loss": 4.9908, "step": 1507 }, { "epoch": 0.38341013824884795, "grad_norm": 21121.689453125, "learning_rate": 3.8758662789428515e-05, "loss": 4.9748, "step": 1508 }, { "epoch": 0.38366438900365485, "grad_norm": 21179.07421875, "learning_rate": 3.8740127913888356e-05, "loss": 4.9754, "step": 1509 }, { "epoch": 0.3839186397584618, "grad_norm": 21273.123046875, "learning_rate": 3.872158221081891e-05, "loss": 4.9709, "step": 1510 }, { "epoch": 0.3841728905132687, "grad_norm": 21022.14453125, "learning_rate": 3.870302569483459e-05, "loss": 4.9545, "step": 1511 }, { "epoch": 0.38442714126807565, "grad_norm": 21070.720703125, "learning_rate": 3.868445838055836e-05, "loss": 4.9764, "step": 1512 }, { "epoch": 0.3846813920228826, "grad_norm": 20866.03125, "learning_rate": 3.866588028262169e-05, "loss": 4.9572, "step": 1513 }, { "epoch": 0.3849356427776895, "grad_norm": 21212.82421875, "learning_rate": 3.864729141566452e-05, "loss": 4.9774, "step": 1514 }, { "epoch": 0.38518989353249644, "grad_norm": 21061.98828125, "learning_rate": 3.8628691794335294e-05, "loss": 4.9789, "step": 1515 }, { "epoch": 0.38544414428730334, "grad_norm": 21465.978515625, "learning_rate": 3.861008143329095e-05, "loss": 4.9567, "step": 1516 }, { "epoch": 0.3856983950421103, "grad_norm": 21075.642578125, "learning_rate": 3.859146034719684e-05, "loss": 4.9722, "step": 1517 }, { "epoch": 0.3859526457969172, "grad_norm": 21462.427734375, "learning_rate": 3.857282855072683e-05, "loss": 4.9852, "step": 1518 }, { "epoch": 0.38620689655172413, "grad_norm": 21176.634765625, "learning_rate": 3.855418605856318e-05, "loss": 4.9805, "step": 1519 }, { "epoch": 0.3864611473065311, "grad_norm": 21378.16015625, "learning_rate": 3.8535532885396585e-05, "loss": 4.9789, "step": 1520 }, { "epoch": 0.386715398061338, "grad_norm": 21267.939453125, "learning_rate": 3.851686904592617e-05, "loss": 4.9743, "step": 1521 }, { "epoch": 0.38696964881614493, "grad_norm": 21190.212890625, "learning_rate": 3.8498194554859476e-05, "loss": 4.9752, "step": 1522 }, { "epoch": 0.3872238995709518, "grad_norm": 21358.046875, "learning_rate": 3.84795094269124e-05, "loss": 4.9709, "step": 1523 }, { "epoch": 0.3874781503257588, "grad_norm": 21233.3203125, "learning_rate": 3.846081367680924e-05, "loss": 4.9649, "step": 1524 }, { "epoch": 0.3877324010805657, "grad_norm": 21131.3046875, "learning_rate": 3.844210731928268e-05, "loss": 4.9719, "step": 1525 }, { "epoch": 0.3879866518353726, "grad_norm": 21464.201171875, "learning_rate": 3.842339036907375e-05, "loss": 4.9638, "step": 1526 }, { "epoch": 0.38824090259017957, "grad_norm": 21253.546875, "learning_rate": 3.840466284093183e-05, "loss": 4.9535, "step": 1527 }, { "epoch": 0.38849515334498647, "grad_norm": 21201.64453125, "learning_rate": 3.838592474961461e-05, "loss": 4.9583, "step": 1528 }, { "epoch": 0.3887494040997934, "grad_norm": 21423.244140625, "learning_rate": 3.836717610988815e-05, "loss": 4.9622, "step": 1529 }, { "epoch": 0.38900365485460037, "grad_norm": 21263.2265625, "learning_rate": 3.834841693652679e-05, "loss": 4.9578, "step": 1530 }, { "epoch": 0.38925790560940726, "grad_norm": 21437.376953125, "learning_rate": 3.832964724431318e-05, "loss": 4.9623, "step": 1531 }, { "epoch": 0.3895121563642142, "grad_norm": 21214.513671875, "learning_rate": 3.8310867048038256e-05, "loss": 4.9606, "step": 1532 }, { "epoch": 0.3897664071190211, "grad_norm": 21181.826171875, "learning_rate": 3.829207636250124e-05, "loss": 4.962, "step": 1533 }, { "epoch": 0.39002065787382806, "grad_norm": 21346.517578125, "learning_rate": 3.8273275202509616e-05, "loss": 4.9658, "step": 1534 }, { "epoch": 0.390274908628635, "grad_norm": 21517.15234375, "learning_rate": 3.8254463582879105e-05, "loss": 4.9629, "step": 1535 }, { "epoch": 0.3905291593834419, "grad_norm": 21421.583984375, "learning_rate": 3.823564151843371e-05, "loss": 4.9644, "step": 1536 }, { "epoch": 0.39078341013824885, "grad_norm": 21565.421875, "learning_rate": 3.821680902400562e-05, "loss": 4.9637, "step": 1537 }, { "epoch": 0.39103766089305575, "grad_norm": 21515.404296875, "learning_rate": 3.8197966114435265e-05, "loss": 4.9657, "step": 1538 }, { "epoch": 0.3912919116478627, "grad_norm": 21260.091796875, "learning_rate": 3.81791128045713e-05, "loss": 4.9557, "step": 1539 }, { "epoch": 0.39154616240266965, "grad_norm": 21503.732421875, "learning_rate": 3.816024910927054e-05, "loss": 4.9738, "step": 1540 }, { "epoch": 0.39180041315747655, "grad_norm": 21256.716796875, "learning_rate": 3.8141375043397996e-05, "loss": 4.9709, "step": 1541 }, { "epoch": 0.3920546639122835, "grad_norm": 21197.525390625, "learning_rate": 3.812249062182687e-05, "loss": 4.9463, "step": 1542 }, { "epoch": 0.3923089146670904, "grad_norm": 21073.4140625, "learning_rate": 3.8103595859438495e-05, "loss": 4.9453, "step": 1543 }, { "epoch": 0.39256316542189734, "grad_norm": 21267.0703125, "learning_rate": 3.808469077112238e-05, "loss": 4.9545, "step": 1544 }, { "epoch": 0.3928174161767043, "grad_norm": 21215.130859375, "learning_rate": 3.8065775371776166e-05, "loss": 4.9625, "step": 1545 }, { "epoch": 0.3930716669315112, "grad_norm": 21569.37890625, "learning_rate": 3.8046849676305587e-05, "loss": 4.9701, "step": 1546 }, { "epoch": 0.39332591768631814, "grad_norm": 21321.40625, "learning_rate": 3.802791369962453e-05, "loss": 4.9533, "step": 1547 }, { "epoch": 0.39358016844112503, "grad_norm": 21263.0625, "learning_rate": 3.800896745665498e-05, "loss": 4.9449, "step": 1548 }, { "epoch": 0.393834419195932, "grad_norm": 21279.13671875, "learning_rate": 3.799001096232699e-05, "loss": 4.9602, "step": 1549 }, { "epoch": 0.39408866995073893, "grad_norm": 21376.54296875, "learning_rate": 3.7971044231578706e-05, "loss": 4.947, "step": 1550 }, { "epoch": 0.39434292070554583, "grad_norm": 21252.875, "learning_rate": 3.795206727935633e-05, "loss": 4.947, "step": 1551 }, { "epoch": 0.3945971714603528, "grad_norm": 21414.9765625, "learning_rate": 3.793308012061414e-05, "loss": 4.954, "step": 1552 }, { "epoch": 0.3948514222151597, "grad_norm": 21248.7109375, "learning_rate": 3.7914082770314436e-05, "loss": 4.9502, "step": 1553 }, { "epoch": 0.3951056729699666, "grad_norm": 21366.3125, "learning_rate": 3.789507524342756e-05, "loss": 4.9418, "step": 1554 }, { "epoch": 0.3953599237247736, "grad_norm": 21357.92578125, "learning_rate": 3.787605755493186e-05, "loss": 4.9515, "step": 1555 }, { "epoch": 0.39561417447958047, "grad_norm": 21427.646484375, "learning_rate": 3.785702971981372e-05, "loss": 4.956, "step": 1556 }, { "epoch": 0.3958684252343874, "grad_norm": 21325.81640625, "learning_rate": 3.783799175306747e-05, "loss": 4.9588, "step": 1557 }, { "epoch": 0.3961226759891943, "grad_norm": 21325.96875, "learning_rate": 3.7818943669695496e-05, "loss": 4.9478, "step": 1558 }, { "epoch": 0.39637692674400127, "grad_norm": 21331.484375, "learning_rate": 3.779988548470809e-05, "loss": 4.9434, "step": 1559 }, { "epoch": 0.3966311774988082, "grad_norm": 21457.83203125, "learning_rate": 3.7780817213123534e-05, "loss": 4.9562, "step": 1560 }, { "epoch": 0.3968854282536151, "grad_norm": 21139.501953125, "learning_rate": 3.7761738869968043e-05, "loss": 4.9551, "step": 1561 }, { "epoch": 0.39713967900842206, "grad_norm": 21467.375, "learning_rate": 3.7742650470275806e-05, "loss": 4.942, "step": 1562 }, { "epoch": 0.39739392976322896, "grad_norm": 21406.478515625, "learning_rate": 3.772355202908889e-05, "loss": 4.9522, "step": 1563 }, { "epoch": 0.3976481805180359, "grad_norm": 21372.96875, "learning_rate": 3.77044435614573e-05, "loss": 4.9374, "step": 1564 }, { "epoch": 0.39790243127284286, "grad_norm": 21493.8125, "learning_rate": 3.7685325082438943e-05, "loss": 4.9512, "step": 1565 }, { "epoch": 0.39815668202764976, "grad_norm": 21326.302734375, "learning_rate": 3.76661966070996e-05, "loss": 4.9539, "step": 1566 }, { "epoch": 0.3984109327824567, "grad_norm": 21405.16796875, "learning_rate": 3.764705815051295e-05, "loss": 4.9479, "step": 1567 }, { "epoch": 0.3986651835372636, "grad_norm": 21500.7734375, "learning_rate": 3.762790972776052e-05, "loss": 4.9305, "step": 1568 }, { "epoch": 0.39891943429207055, "grad_norm": 21364.056640625, "learning_rate": 3.76087513539317e-05, "loss": 4.9309, "step": 1569 }, { "epoch": 0.3991736850468775, "grad_norm": 21395.208984375, "learning_rate": 3.758958304412372e-05, "loss": 4.9331, "step": 1570 }, { "epoch": 0.3994279358016844, "grad_norm": 21496.416015625, "learning_rate": 3.7570404813441626e-05, "loss": 4.942, "step": 1571 }, { "epoch": 0.39968218655649135, "grad_norm": 21445.732421875, "learning_rate": 3.7551216676998304e-05, "loss": 4.9369, "step": 1572 }, { "epoch": 0.39993643731129824, "grad_norm": 21446.796875, "learning_rate": 3.753201864991444e-05, "loss": 4.9327, "step": 1573 }, { "epoch": 0.4001906880661052, "grad_norm": 21526.1015625, "learning_rate": 3.7512810747318506e-05, "loss": 4.9376, "step": 1574 }, { "epoch": 0.40044493882091214, "grad_norm": 21728.33203125, "learning_rate": 3.749359298434677e-05, "loss": 4.9299, "step": 1575 }, { "epoch": 0.40069918957571904, "grad_norm": 21494.74609375, "learning_rate": 3.747436537614324e-05, "loss": 4.9459, "step": 1576 }, { "epoch": 0.400953440330526, "grad_norm": 21572.361328125, "learning_rate": 3.745512793785972e-05, "loss": 4.9343, "step": 1577 }, { "epoch": 0.4012076910853329, "grad_norm": 21482.47265625, "learning_rate": 3.743588068465573e-05, "loss": 4.9482, "step": 1578 }, { "epoch": 0.40146194184013984, "grad_norm": 21443.158203125, "learning_rate": 3.741662363169856e-05, "loss": 4.9344, "step": 1579 }, { "epoch": 0.4017161925949468, "grad_norm": 21497.869140625, "learning_rate": 3.739735679416317e-05, "loss": 4.9174, "step": 1580 }, { "epoch": 0.4019704433497537, "grad_norm": 21446.015625, "learning_rate": 3.737808018723229e-05, "loss": 4.9439, "step": 1581 }, { "epoch": 0.40222469410456063, "grad_norm": 21972.1796875, "learning_rate": 3.73587938260963e-05, "loss": 4.9502, "step": 1582 }, { "epoch": 0.4024789448593675, "grad_norm": 21463.888671875, "learning_rate": 3.733949772595329e-05, "loss": 4.9278, "step": 1583 }, { "epoch": 0.4027331956141745, "grad_norm": 21688.326171875, "learning_rate": 3.732019190200902e-05, "loss": 4.9353, "step": 1584 }, { "epoch": 0.40298744636898143, "grad_norm": 21537.6015625, "learning_rate": 3.730087636947692e-05, "loss": 4.9182, "step": 1585 }, { "epoch": 0.4032416971237883, "grad_norm": 21491.390625, "learning_rate": 3.728155114357805e-05, "loss": 4.9369, "step": 1586 }, { "epoch": 0.4034959478785953, "grad_norm": 23047.10546875, "learning_rate": 3.7262216239541135e-05, "loss": 4.9517, "step": 1587 }, { "epoch": 0.40375019863340217, "grad_norm": 21860.529296875, "learning_rate": 3.72428716726025e-05, "loss": 4.9283, "step": 1588 }, { "epoch": 0.4040044493882091, "grad_norm": 21623.978515625, "learning_rate": 3.722351745800611e-05, "loss": 4.9307, "step": 1589 }, { "epoch": 0.40425870014301607, "grad_norm": 21743.81640625, "learning_rate": 3.720415361100352e-05, "loss": 4.9369, "step": 1590 }, { "epoch": 0.40451295089782296, "grad_norm": 21573.21484375, "learning_rate": 3.718478014685387e-05, "loss": 4.9247, "step": 1591 }, { "epoch": 0.4047672016526299, "grad_norm": 21771.232421875, "learning_rate": 3.716539708082389e-05, "loss": 4.9362, "step": 1592 }, { "epoch": 0.4050214524074368, "grad_norm": 21642.990234375, "learning_rate": 3.7146004428187864e-05, "loss": 4.9422, "step": 1593 }, { "epoch": 0.40527570316224376, "grad_norm": 21645.24609375, "learning_rate": 3.7126602204227636e-05, "loss": 4.9285, "step": 1594 }, { "epoch": 0.4055299539170507, "grad_norm": 21890.04296875, "learning_rate": 3.710719042423262e-05, "loss": 4.9309, "step": 1595 }, { "epoch": 0.4057842046718576, "grad_norm": 21528.333984375, "learning_rate": 3.70877691034997e-05, "loss": 4.9295, "step": 1596 }, { "epoch": 0.40603845542666456, "grad_norm": 21905.609375, "learning_rate": 3.706833825733333e-05, "loss": 4.9203, "step": 1597 }, { "epoch": 0.40629270618147145, "grad_norm": 22114.712890625, "learning_rate": 3.704889790104545e-05, "loss": 4.9165, "step": 1598 }, { "epoch": 0.4065469569362784, "grad_norm": 21702.681640625, "learning_rate": 3.7029448049955496e-05, "loss": 4.9239, "step": 1599 }, { "epoch": 0.40680120769108535, "grad_norm": 21679.171875, "learning_rate": 3.7009988719390395e-05, "loss": 4.9188, "step": 1600 }, { "epoch": 0.40680120769108535, "eval_loss": 9.927664756774902, "eval_runtime": 699.0872, "eval_samples_per_second": 151.606, "eval_steps_per_second": 9.477, "step": 1600 }, { "epoch": 0.40705545844589225, "grad_norm": 21939.634765625, "learning_rate": 3.699051992468453e-05, "loss": 4.9212, "step": 1601 }, { "epoch": 0.4073097092006992, "grad_norm": 21534.181640625, "learning_rate": 3.6971041681179743e-05, "loss": 4.9232, "step": 1602 }, { "epoch": 0.4075639599555061, "grad_norm": 21565.546875, "learning_rate": 3.695155400422534e-05, "loss": 4.9285, "step": 1603 }, { "epoch": 0.40781821071031304, "grad_norm": 21511.466796875, "learning_rate": 3.693205690917804e-05, "loss": 4.9153, "step": 1604 }, { "epoch": 0.40807246146512, "grad_norm": 21691.6640625, "learning_rate": 3.6912550411401984e-05, "loss": 4.9164, "step": 1605 }, { "epoch": 0.4083267122199269, "grad_norm": 21525.65234375, "learning_rate": 3.6893034526268747e-05, "loss": 4.9253, "step": 1606 }, { "epoch": 0.40858096297473384, "grad_norm": 21488.17578125, "learning_rate": 3.687350926915728e-05, "loss": 4.9109, "step": 1607 }, { "epoch": 0.40883521372954074, "grad_norm": 21611.3828125, "learning_rate": 3.6853974655453914e-05, "loss": 4.9272, "step": 1608 }, { "epoch": 0.4090894644843477, "grad_norm": 21736.541015625, "learning_rate": 3.683443070055237e-05, "loss": 4.9366, "step": 1609 }, { "epoch": 0.40934371523915464, "grad_norm": 21618.369140625, "learning_rate": 3.681487741985373e-05, "loss": 4.9196, "step": 1610 }, { "epoch": 0.40959796599396153, "grad_norm": 21595.90625, "learning_rate": 3.6795314828766405e-05, "loss": 4.9213, "step": 1611 }, { "epoch": 0.4098522167487685, "grad_norm": 21640.361328125, "learning_rate": 3.677574294270617e-05, "loss": 4.9271, "step": 1612 }, { "epoch": 0.4101064675035754, "grad_norm": 21779.6015625, "learning_rate": 3.675616177709609e-05, "loss": 4.9261, "step": 1613 }, { "epoch": 0.41036071825838233, "grad_norm": 21612.134765625, "learning_rate": 3.673657134736658e-05, "loss": 4.9255, "step": 1614 }, { "epoch": 0.4106149690131893, "grad_norm": 21874.701171875, "learning_rate": 3.6716971668955344e-05, "loss": 4.9146, "step": 1615 }, { "epoch": 0.4108692197679962, "grad_norm": 21585.548828125, "learning_rate": 3.669736275730735e-05, "loss": 4.9197, "step": 1616 }, { "epoch": 0.4111234705228031, "grad_norm": 21718.55078125, "learning_rate": 3.667774462787487e-05, "loss": 4.9247, "step": 1617 }, { "epoch": 0.41137772127761, "grad_norm": 21887.751953125, "learning_rate": 3.665811729611744e-05, "loss": 4.922, "step": 1618 }, { "epoch": 0.41163197203241697, "grad_norm": 21929.798828125, "learning_rate": 3.663848077750182e-05, "loss": 4.9263, "step": 1619 }, { "epoch": 0.4118862227872239, "grad_norm": 21495.984375, "learning_rate": 3.661883508750203e-05, "loss": 4.9005, "step": 1620 }, { "epoch": 0.4121404735420308, "grad_norm": 21489.89453125, "learning_rate": 3.6599180241599335e-05, "loss": 4.9052, "step": 1621 }, { "epoch": 0.41239472429683777, "grad_norm": 21656.25390625, "learning_rate": 3.657951625528217e-05, "loss": 4.8971, "step": 1622 }, { "epoch": 0.41264897505164466, "grad_norm": 21682.71875, "learning_rate": 3.655984314404621e-05, "loss": 4.9104, "step": 1623 }, { "epoch": 0.4129032258064516, "grad_norm": 21776.6171875, "learning_rate": 3.654016092339432e-05, "loss": 4.9059, "step": 1624 }, { "epoch": 0.41315747656125856, "grad_norm": 21720.146484375, "learning_rate": 3.652046960883651e-05, "loss": 4.9184, "step": 1625 }, { "epoch": 0.41341172731606546, "grad_norm": 21610.6875, "learning_rate": 3.650076921588998e-05, "loss": 4.915, "step": 1626 }, { "epoch": 0.4136659780708724, "grad_norm": 21891.4453125, "learning_rate": 3.648105976007909e-05, "loss": 4.9165, "step": 1627 }, { "epoch": 0.4139202288256793, "grad_norm": 21603.302734375, "learning_rate": 3.646134125693534e-05, "loss": 4.9212, "step": 1628 }, { "epoch": 0.41417447958048625, "grad_norm": 21650.87109375, "learning_rate": 3.644161372199735e-05, "loss": 4.9054, "step": 1629 }, { "epoch": 0.4144287303352932, "grad_norm": 21877.341796875, "learning_rate": 3.6421877170810836e-05, "loss": 4.913, "step": 1630 }, { "epoch": 0.4146829810901001, "grad_norm": 21647.607421875, "learning_rate": 3.6402131618928675e-05, "loss": 4.903, "step": 1631 }, { "epoch": 0.41493723184490705, "grad_norm": 22283.400390625, "learning_rate": 3.638237708191079e-05, "loss": 4.9145, "step": 1632 }, { "epoch": 0.41519148259971395, "grad_norm": 21666.384765625, "learning_rate": 3.636261357532421e-05, "loss": 4.9037, "step": 1633 }, { "epoch": 0.4154457333545209, "grad_norm": 21531.67578125, "learning_rate": 3.634284111474301e-05, "loss": 4.9037, "step": 1634 }, { "epoch": 0.41569998410932785, "grad_norm": 21632.896484375, "learning_rate": 3.632305971574834e-05, "loss": 4.9046, "step": 1635 }, { "epoch": 0.41595423486413474, "grad_norm": 21658.61328125, "learning_rate": 3.630326939392838e-05, "loss": 4.9074, "step": 1636 }, { "epoch": 0.4162084856189417, "grad_norm": 21766.416015625, "learning_rate": 3.628347016487836e-05, "loss": 4.8985, "step": 1637 }, { "epoch": 0.4164627363737486, "grad_norm": 21578.001953125, "learning_rate": 3.626366204420051e-05, "loss": 4.9071, "step": 1638 }, { "epoch": 0.41671698712855554, "grad_norm": 21805.52734375, "learning_rate": 3.624384504750407e-05, "loss": 4.8926, "step": 1639 }, { "epoch": 0.4169712378833625, "grad_norm": 21873.283203125, "learning_rate": 3.622401919040528e-05, "loss": 4.9103, "step": 1640 }, { "epoch": 0.4172254886381694, "grad_norm": 21696.416015625, "learning_rate": 3.620418448852737e-05, "loss": 4.9028, "step": 1641 }, { "epoch": 0.41747973939297633, "grad_norm": 21697.595703125, "learning_rate": 3.618434095750051e-05, "loss": 4.8983, "step": 1642 }, { "epoch": 0.41773399014778323, "grad_norm": 21964.34765625, "learning_rate": 3.616448861296187e-05, "loss": 4.9248, "step": 1643 }, { "epoch": 0.4179882409025902, "grad_norm": 21953.5625, "learning_rate": 3.6144627470555534e-05, "loss": 4.9121, "step": 1644 }, { "epoch": 0.41824249165739713, "grad_norm": 21809.197265625, "learning_rate": 3.612475754593253e-05, "loss": 4.9091, "step": 1645 }, { "epoch": 0.418496742412204, "grad_norm": 21916.5078125, "learning_rate": 3.6104878854750787e-05, "loss": 4.9011, "step": 1646 }, { "epoch": 0.418750993167011, "grad_norm": 21816.2734375, "learning_rate": 3.608499141267519e-05, "loss": 4.8906, "step": 1647 }, { "epoch": 0.41900524392181787, "grad_norm": 21631.5546875, "learning_rate": 3.606509523537748e-05, "loss": 4.9051, "step": 1648 }, { "epoch": 0.4192594946766248, "grad_norm": 21687.5234375, "learning_rate": 3.604519033853628e-05, "loss": 4.8971, "step": 1649 }, { "epoch": 0.4195137454314318, "grad_norm": 21873.890625, "learning_rate": 3.60252767378371e-05, "loss": 4.9144, "step": 1650 }, { "epoch": 0.41976799618623867, "grad_norm": 21899.76953125, "learning_rate": 3.600535444897231e-05, "loss": 4.8783, "step": 1651 }, { "epoch": 0.4200222469410456, "grad_norm": 21633.9375, "learning_rate": 3.5985423487641115e-05, "loss": 4.8773, "step": 1652 }, { "epoch": 0.4202764976958525, "grad_norm": 32092.7109375, "learning_rate": 3.596548386954956e-05, "loss": 4.9003, "step": 1653 }, { "epoch": 0.42053074845065946, "grad_norm": 21750.923828125, "learning_rate": 3.594553561041053e-05, "loss": 4.9002, "step": 1654 }, { "epoch": 0.4207849992054664, "grad_norm": 21614.08203125, "learning_rate": 3.592557872594368e-05, "loss": 4.881, "step": 1655 }, { "epoch": 0.4210392499602733, "grad_norm": 21781.92578125, "learning_rate": 3.590561323187548e-05, "loss": 4.8867, "step": 1656 }, { "epoch": 0.42129350071508026, "grad_norm": 21969.76953125, "learning_rate": 3.58856391439392e-05, "loss": 4.8942, "step": 1657 }, { "epoch": 0.42154775146988716, "grad_norm": 22522.05859375, "learning_rate": 3.586565647787488e-05, "loss": 4.8803, "step": 1658 }, { "epoch": 0.4218020022246941, "grad_norm": 22621.322265625, "learning_rate": 3.584566524942928e-05, "loss": 4.9009, "step": 1659 }, { "epoch": 0.42205625297950106, "grad_norm": 21819.041015625, "learning_rate": 3.582566547435596e-05, "loss": 4.8935, "step": 1660 }, { "epoch": 0.42231050373430795, "grad_norm": 23410.6796875, "learning_rate": 3.580565716841517e-05, "loss": 4.8864, "step": 1661 }, { "epoch": 0.4225647544891149, "grad_norm": 22864.201171875, "learning_rate": 3.578564034737394e-05, "loss": 4.9081, "step": 1662 }, { "epoch": 0.4228190052439218, "grad_norm": 22266.919921875, "learning_rate": 3.576561502700594e-05, "loss": 4.8892, "step": 1663 }, { "epoch": 0.42307325599872875, "grad_norm": 26095.134765625, "learning_rate": 3.57455812230916e-05, "loss": 4.8954, "step": 1664 }, { "epoch": 0.4233275067535357, "grad_norm": 22740.720703125, "learning_rate": 3.5725538951417973e-05, "loss": 4.9033, "step": 1665 }, { "epoch": 0.4235817575083426, "grad_norm": 22555.861328125, "learning_rate": 3.570548822777885e-05, "loss": 4.8892, "step": 1666 }, { "epoch": 0.42383600826314954, "grad_norm": 22874.27734375, "learning_rate": 3.568542906797463e-05, "loss": 4.8997, "step": 1667 }, { "epoch": 0.42409025901795644, "grad_norm": 21495.142578125, "learning_rate": 3.5665361487812406e-05, "loss": 4.8847, "step": 1668 }, { "epoch": 0.4243445097727634, "grad_norm": 22216.201171875, "learning_rate": 3.5645285503105866e-05, "loss": 4.8894, "step": 1669 }, { "epoch": 0.42459876052757034, "grad_norm": 21716.841796875, "learning_rate": 3.562520112967533e-05, "loss": 4.8733, "step": 1670 }, { "epoch": 0.42485301128237724, "grad_norm": 22870.220703125, "learning_rate": 3.560510838334774e-05, "loss": 4.8936, "step": 1671 }, { "epoch": 0.4251072620371842, "grad_norm": 22410.798828125, "learning_rate": 3.5585007279956645e-05, "loss": 4.8818, "step": 1672 }, { "epoch": 0.4253615127919911, "grad_norm": 21816.15234375, "learning_rate": 3.5564897835342145e-05, "loss": 4.8735, "step": 1673 }, { "epoch": 0.42561576354679803, "grad_norm": 22911.541015625, "learning_rate": 3.554478006535096e-05, "loss": 4.8915, "step": 1674 }, { "epoch": 0.425870014301605, "grad_norm": 21875.9296875, "learning_rate": 3.552465398583631e-05, "loss": 4.9038, "step": 1675 }, { "epoch": 0.4261242650564119, "grad_norm": 22147.525390625, "learning_rate": 3.550451961265803e-05, "loss": 4.8788, "step": 1676 }, { "epoch": 0.42637851581121883, "grad_norm": 22352.337890625, "learning_rate": 3.548437696168243e-05, "loss": 4.9178, "step": 1677 }, { "epoch": 0.4266327665660257, "grad_norm": 21874.51953125, "learning_rate": 3.546422604878239e-05, "loss": 4.8719, "step": 1678 }, { "epoch": 0.4268870173208327, "grad_norm": 24230.541015625, "learning_rate": 3.544406688983728e-05, "loss": 4.8812, "step": 1679 }, { "epoch": 0.4271412680756396, "grad_norm": 22601.33984375, "learning_rate": 3.542389950073297e-05, "loss": 4.8804, "step": 1680 }, { "epoch": 0.4273955188304465, "grad_norm": 21862.162109375, "learning_rate": 3.5403723897361806e-05, "loss": 4.8903, "step": 1681 }, { "epoch": 0.42764976958525347, "grad_norm": 23466.4921875, "learning_rate": 3.538354009562263e-05, "loss": 4.8762, "step": 1682 }, { "epoch": 0.42790402034006036, "grad_norm": 22503.498046875, "learning_rate": 3.536334811142071e-05, "loss": 4.895, "step": 1683 }, { "epoch": 0.4281582710948673, "grad_norm": 22191.65625, "learning_rate": 3.534314796066781e-05, "loss": 4.8867, "step": 1684 }, { "epoch": 0.42841252184967427, "grad_norm": 22442.6640625, "learning_rate": 3.5322939659282084e-05, "loss": 4.8732, "step": 1685 }, { "epoch": 0.42866677260448116, "grad_norm": 22191.251953125, "learning_rate": 3.530272322318814e-05, "loss": 4.8897, "step": 1686 }, { "epoch": 0.4289210233592881, "grad_norm": 22047.107421875, "learning_rate": 3.5282498668316965e-05, "loss": 4.8758, "step": 1687 }, { "epoch": 0.429175274114095, "grad_norm": 22093.28125, "learning_rate": 3.5262266010605974e-05, "loss": 4.8955, "step": 1688 }, { "epoch": 0.42942952486890196, "grad_norm": 22349.4921875, "learning_rate": 3.5242025265998955e-05, "loss": 4.8688, "step": 1689 }, { "epoch": 0.4296837756237089, "grad_norm": 21869.244140625, "learning_rate": 3.522177645044607e-05, "loss": 4.8715, "step": 1690 }, { "epoch": 0.4299380263785158, "grad_norm": 21924.68359375, "learning_rate": 3.520151957990385e-05, "loss": 4.8713, "step": 1691 }, { "epoch": 0.43019227713332275, "grad_norm": 21897.029296875, "learning_rate": 3.518125467033515e-05, "loss": 4.871, "step": 1692 }, { "epoch": 0.43044652788812965, "grad_norm": 22098.861328125, "learning_rate": 3.516098173770916e-05, "loss": 4.8773, "step": 1693 }, { "epoch": 0.4307007786429366, "grad_norm": 33320.83203125, "learning_rate": 3.5140700798001436e-05, "loss": 4.8591, "step": 1694 }, { "epoch": 0.43095502939774355, "grad_norm": 22015.482421875, "learning_rate": 3.5120411867193795e-05, "loss": 4.8804, "step": 1695 }, { "epoch": 0.43120928015255044, "grad_norm": 22019.46484375, "learning_rate": 3.510011496127438e-05, "loss": 4.8766, "step": 1696 }, { "epoch": 0.4314635309073574, "grad_norm": 21862.5546875, "learning_rate": 3.50798100962376e-05, "loss": 4.8634, "step": 1697 }, { "epoch": 0.4317177816621643, "grad_norm": 22089.81640625, "learning_rate": 3.505949728808415e-05, "loss": 4.8711, "step": 1698 }, { "epoch": 0.43197203241697124, "grad_norm": 22051.53515625, "learning_rate": 3.5039176552820975e-05, "loss": 4.8763, "step": 1699 }, { "epoch": 0.4322262831717782, "grad_norm": 22092.03515625, "learning_rate": 3.501884790646128e-05, "loss": 4.8664, "step": 1700 }, { "epoch": 0.4322262831717782, "eval_loss": 9.825671195983887, "eval_runtime": 699.7335, "eval_samples_per_second": 151.466, "eval_steps_per_second": 9.468, "step": 1700 }, { "epoch": 0.4324805339265851, "grad_norm": 21989.029296875, "learning_rate": 3.499851136502449e-05, "loss": 4.8864, "step": 1701 }, { "epoch": 0.43273478468139204, "grad_norm": 22010.939453125, "learning_rate": 3.497816694453624e-05, "loss": 4.8646, "step": 1702 }, { "epoch": 0.43298903543619893, "grad_norm": 22087.806640625, "learning_rate": 3.495781466102841e-05, "loss": 4.86, "step": 1703 }, { "epoch": 0.4332432861910059, "grad_norm": 22119.8359375, "learning_rate": 3.493745453053906e-05, "loss": 4.8739, "step": 1704 }, { "epoch": 0.43349753694581283, "grad_norm": 22071.79296875, "learning_rate": 3.491708656911242e-05, "loss": 4.8769, "step": 1705 }, { "epoch": 0.43375178770061973, "grad_norm": 21819.615234375, "learning_rate": 3.489671079279889e-05, "loss": 4.864, "step": 1706 }, { "epoch": 0.4340060384554267, "grad_norm": 22032.8984375, "learning_rate": 3.487632721765506e-05, "loss": 4.8787, "step": 1707 }, { "epoch": 0.4342602892102336, "grad_norm": 21978.28125, "learning_rate": 3.4855935859743634e-05, "loss": 4.8666, "step": 1708 }, { "epoch": 0.4345145399650405, "grad_norm": 21969.259765625, "learning_rate": 3.483553673513346e-05, "loss": 4.8635, "step": 1709 }, { "epoch": 0.4347687907198475, "grad_norm": 22161.021484375, "learning_rate": 3.481512985989951e-05, "loss": 4.8755, "step": 1710 }, { "epoch": 0.43502304147465437, "grad_norm": 21971.896484375, "learning_rate": 3.4794715250122854e-05, "loss": 4.8616, "step": 1711 }, { "epoch": 0.4352772922294613, "grad_norm": 22054.76171875, "learning_rate": 3.477429292189067e-05, "loss": 4.8706, "step": 1712 }, { "epoch": 0.4355315429842682, "grad_norm": 22108.994140625, "learning_rate": 3.475386289129621e-05, "loss": 4.8798, "step": 1713 }, { "epoch": 0.43578579373907517, "grad_norm": 22375.142578125, "learning_rate": 3.473342517443878e-05, "loss": 4.8703, "step": 1714 }, { "epoch": 0.4360400444938821, "grad_norm": 21990.71484375, "learning_rate": 3.471297978742379e-05, "loss": 4.8656, "step": 1715 }, { "epoch": 0.436294295248689, "grad_norm": 22038.552734375, "learning_rate": 3.469252674636264e-05, "loss": 4.8728, "step": 1716 }, { "epoch": 0.43654854600349596, "grad_norm": 21873.314453125, "learning_rate": 3.46720660673728e-05, "loss": 4.8584, "step": 1717 }, { "epoch": 0.43680279675830286, "grad_norm": 21927.455078125, "learning_rate": 3.465159776657774e-05, "loss": 4.8366, "step": 1718 }, { "epoch": 0.4370570475131098, "grad_norm": 22044.982421875, "learning_rate": 3.4631121860106926e-05, "loss": 4.8651, "step": 1719 }, { "epoch": 0.43731129826791676, "grad_norm": 21996.814453125, "learning_rate": 3.461063836409585e-05, "loss": 4.852, "step": 1720 }, { "epoch": 0.43756554902272365, "grad_norm": 22560.287109375, "learning_rate": 3.459014729468597e-05, "loss": 4.8768, "step": 1721 }, { "epoch": 0.4378197997775306, "grad_norm": 22825.607421875, "learning_rate": 3.4569648668024704e-05, "loss": 4.8617, "step": 1722 }, { "epoch": 0.4380740505323375, "grad_norm": 22108.912109375, "learning_rate": 3.454914250026542e-05, "loss": 4.8654, "step": 1723 }, { "epoch": 0.43832830128714445, "grad_norm": 22084.595703125, "learning_rate": 3.452862880756743e-05, "loss": 4.8596, "step": 1724 }, { "epoch": 0.4385825520419514, "grad_norm": 22076.978515625, "learning_rate": 3.450810760609601e-05, "loss": 4.8691, "step": 1725 }, { "epoch": 0.4388368027967583, "grad_norm": 22177.564453125, "learning_rate": 3.448757891202232e-05, "loss": 4.8576, "step": 1726 }, { "epoch": 0.43909105355156525, "grad_norm": 22192.54296875, "learning_rate": 3.446704274152343e-05, "loss": 4.8428, "step": 1727 }, { "epoch": 0.43934530430637214, "grad_norm": 22073.0859375, "learning_rate": 3.44464991107823e-05, "loss": 4.8525, "step": 1728 }, { "epoch": 0.4395995550611791, "grad_norm": 21942.03515625, "learning_rate": 3.442594803598778e-05, "loss": 4.8546, "step": 1729 }, { "epoch": 0.43985380581598604, "grad_norm": 21952.177734375, "learning_rate": 3.440538953333456e-05, "loss": 4.8412, "step": 1730 }, { "epoch": 0.44010805657079294, "grad_norm": 22270.20703125, "learning_rate": 3.4384823619023224e-05, "loss": 4.8562, "step": 1731 }, { "epoch": 0.4403623073255999, "grad_norm": 22065.0703125, "learning_rate": 3.436425030926017e-05, "loss": 4.8524, "step": 1732 }, { "epoch": 0.4406165580804068, "grad_norm": 22071.935546875, "learning_rate": 3.434366962025761e-05, "loss": 4.852, "step": 1733 }, { "epoch": 0.44087080883521373, "grad_norm": 22116.904296875, "learning_rate": 3.432308156823361e-05, "loss": 4.8633, "step": 1734 }, { "epoch": 0.4411250595900207, "grad_norm": 22021.4140625, "learning_rate": 3.4302486169412004e-05, "loss": 4.854, "step": 1735 }, { "epoch": 0.4413793103448276, "grad_norm": 22173.978515625, "learning_rate": 3.428188344002244e-05, "loss": 4.8527, "step": 1736 }, { "epoch": 0.44163356109963453, "grad_norm": 22006.560546875, "learning_rate": 3.426127339630032e-05, "loss": 4.862, "step": 1737 }, { "epoch": 0.4418878118544414, "grad_norm": 22062.224609375, "learning_rate": 3.424065605448683e-05, "loss": 4.8532, "step": 1738 }, { "epoch": 0.4421420626092484, "grad_norm": 22106.29296875, "learning_rate": 3.4220031430828895e-05, "loss": 4.8619, "step": 1739 }, { "epoch": 0.4423963133640553, "grad_norm": 21951.20703125, "learning_rate": 3.419939954157917e-05, "loss": 4.8424, "step": 1740 }, { "epoch": 0.4426505641188622, "grad_norm": 22118.111328125, "learning_rate": 3.4178760402996066e-05, "loss": 4.8613, "step": 1741 }, { "epoch": 0.4429048148736692, "grad_norm": 22128.494140625, "learning_rate": 3.415811403134369e-05, "loss": 4.8609, "step": 1742 }, { "epoch": 0.44315906562847607, "grad_norm": 22074.3671875, "learning_rate": 3.4137460442891824e-05, "loss": 4.856, "step": 1743 }, { "epoch": 0.443413316383283, "grad_norm": 22072.138671875, "learning_rate": 3.411679965391598e-05, "loss": 4.8403, "step": 1744 }, { "epoch": 0.44366756713808997, "grad_norm": 22199.03125, "learning_rate": 3.4096131680697305e-05, "loss": 4.8395, "step": 1745 }, { "epoch": 0.44392181789289686, "grad_norm": 22152.58984375, "learning_rate": 3.407545653952264e-05, "loss": 4.8591, "step": 1746 }, { "epoch": 0.4441760686477038, "grad_norm": 22074.8359375, "learning_rate": 3.405477424668445e-05, "loss": 4.8495, "step": 1747 }, { "epoch": 0.4444303194025107, "grad_norm": 22346.208984375, "learning_rate": 3.4034084818480865e-05, "loss": 4.8546, "step": 1748 }, { "epoch": 0.44468457015731766, "grad_norm": 22437.583984375, "learning_rate": 3.40133882712156e-05, "loss": 4.8456, "step": 1749 }, { "epoch": 0.44493882091212456, "grad_norm": 22261.560546875, "learning_rate": 3.3992684621198006e-05, "loss": 4.8663, "step": 1750 }, { "epoch": 0.4451930716669315, "grad_norm": 22399.09765625, "learning_rate": 3.397197388474302e-05, "loss": 4.8698, "step": 1751 }, { "epoch": 0.44544732242173846, "grad_norm": 22262.453125, "learning_rate": 3.395125607817118e-05, "loss": 4.854, "step": 1752 }, { "epoch": 0.44570157317654535, "grad_norm": 22260.6484375, "learning_rate": 3.393053121780857e-05, "loss": 4.8502, "step": 1753 }, { "epoch": 0.4459558239313523, "grad_norm": 22395.189453125, "learning_rate": 3.390979931998685e-05, "loss": 4.8319, "step": 1754 }, { "epoch": 0.4462100746861592, "grad_norm": 22180.734375, "learning_rate": 3.388906040104322e-05, "loss": 4.8537, "step": 1755 }, { "epoch": 0.44646432544096615, "grad_norm": 22464.212890625, "learning_rate": 3.386831447732041e-05, "loss": 4.8603, "step": 1756 }, { "epoch": 0.4467185761957731, "grad_norm": 22336.3828125, "learning_rate": 3.384756156516667e-05, "loss": 4.846, "step": 1757 }, { "epoch": 0.44697282695058, "grad_norm": 22041.79296875, "learning_rate": 3.3826801680935785e-05, "loss": 4.8461, "step": 1758 }, { "epoch": 0.44722707770538694, "grad_norm": 22224.439453125, "learning_rate": 3.380603484098698e-05, "loss": 4.8454, "step": 1759 }, { "epoch": 0.44748132846019384, "grad_norm": 23013.74609375, "learning_rate": 3.3785261061685015e-05, "loss": 4.8579, "step": 1760 }, { "epoch": 0.4477355792150008, "grad_norm": 22225.5390625, "learning_rate": 3.376448035940007e-05, "loss": 4.8504, "step": 1761 }, { "epoch": 0.44798982996980774, "grad_norm": 22198.61328125, "learning_rate": 3.374369275050783e-05, "loss": 4.8444, "step": 1762 }, { "epoch": 0.44824408072461464, "grad_norm": 22202.001953125, "learning_rate": 3.372289825138938e-05, "loss": 4.8501, "step": 1763 }, { "epoch": 0.4484983314794216, "grad_norm": 22426.322265625, "learning_rate": 3.370209687843126e-05, "loss": 4.8384, "step": 1764 }, { "epoch": 0.4487525822342285, "grad_norm": 22443.2109375, "learning_rate": 3.3681288648025415e-05, "loss": 4.8439, "step": 1765 }, { "epoch": 0.44900683298903543, "grad_norm": 22218.0078125, "learning_rate": 3.36604735765692e-05, "loss": 4.8364, "step": 1766 }, { "epoch": 0.4492610837438424, "grad_norm": 52952.296875, "learning_rate": 3.363965168046537e-05, "loss": 4.8266, "step": 1767 }, { "epoch": 0.4495153344986493, "grad_norm": 22943.0078125, "learning_rate": 3.361882297612202e-05, "loss": 4.8386, "step": 1768 }, { "epoch": 0.4497695852534562, "grad_norm": 22271.9609375, "learning_rate": 3.359798747995266e-05, "loss": 4.8391, "step": 1769 }, { "epoch": 0.4500238360082631, "grad_norm": 25515.71875, "learning_rate": 3.357714520837612e-05, "loss": 4.85, "step": 1770 }, { "epoch": 0.4502780867630701, "grad_norm": 22855.2578125, "learning_rate": 3.3556296177816575e-05, "loss": 4.8389, "step": 1771 }, { "epoch": 0.450532337517877, "grad_norm": 23352.083984375, "learning_rate": 3.353544040470353e-05, "loss": 4.8464, "step": 1772 }, { "epoch": 0.4507865882726839, "grad_norm": 22517.65625, "learning_rate": 3.351457790547182e-05, "loss": 4.8351, "step": 1773 }, { "epoch": 0.45104083902749087, "grad_norm": 24770.787109375, "learning_rate": 3.3493708696561545e-05, "loss": 4.8365, "step": 1774 }, { "epoch": 0.45129508978229776, "grad_norm": 23343.32421875, "learning_rate": 3.34728327944181e-05, "loss": 4.8194, "step": 1775 }, { "epoch": 0.4515493405371047, "grad_norm": 36990.63671875, "learning_rate": 3.345195021549219e-05, "loss": 4.8467, "step": 1776 }, { "epoch": 0.45180359129191167, "grad_norm": 22373.421875, "learning_rate": 3.3431060976239736e-05, "loss": 4.8285, "step": 1777 }, { "epoch": 0.45205784204671856, "grad_norm": 25395.486328125, "learning_rate": 3.341016509312194e-05, "loss": 4.8273, "step": 1778 }, { "epoch": 0.4523120928015255, "grad_norm": 22469.193359375, "learning_rate": 3.3389262582605224e-05, "loss": 4.8463, "step": 1779 }, { "epoch": 0.4525663435563324, "grad_norm": 23005.87890625, "learning_rate": 3.3368353461161235e-05, "loss": 4.8415, "step": 1780 }, { "epoch": 0.45282059431113936, "grad_norm": 22361.6015625, "learning_rate": 3.3347437745266824e-05, "loss": 4.8372, "step": 1781 }, { "epoch": 0.4530748450659463, "grad_norm": 23105.833984375, "learning_rate": 3.332651545140405e-05, "loss": 4.8492, "step": 1782 }, { "epoch": 0.4533290958207532, "grad_norm": 22482.94921875, "learning_rate": 3.330558659606015e-05, "loss": 4.8373, "step": 1783 }, { "epoch": 0.45358334657556015, "grad_norm": 22494.169921875, "learning_rate": 3.3284651195727545e-05, "loss": 4.8322, "step": 1784 }, { "epoch": 0.45383759733036705, "grad_norm": 22108.89453125, "learning_rate": 3.326370926690378e-05, "loss": 4.8374, "step": 1785 }, { "epoch": 0.454091848085174, "grad_norm": 22578.302734375, "learning_rate": 3.3242760826091566e-05, "loss": 4.8284, "step": 1786 }, { "epoch": 0.45434609883998095, "grad_norm": 22233.822265625, "learning_rate": 3.3221805889798754e-05, "loss": 4.8246, "step": 1787 }, { "epoch": 0.45460034959478784, "grad_norm": 22614.87109375, "learning_rate": 3.3200844474538306e-05, "loss": 4.8344, "step": 1788 }, { "epoch": 0.4548546003495948, "grad_norm": 22526.244140625, "learning_rate": 3.317987659682828e-05, "loss": 4.8355, "step": 1789 }, { "epoch": 0.4551088511044017, "grad_norm": 22276.392578125, "learning_rate": 3.3158902273191847e-05, "loss": 4.8219, "step": 1790 }, { "epoch": 0.45536310185920864, "grad_norm": 22414.005859375, "learning_rate": 3.3137921520157225e-05, "loss": 4.8356, "step": 1791 }, { "epoch": 0.4556173526140156, "grad_norm": 22496.921875, "learning_rate": 3.3116934354257735e-05, "loss": 4.8294, "step": 1792 }, { "epoch": 0.4558716033688225, "grad_norm": 22519.23046875, "learning_rate": 3.309594079203173e-05, "loss": 4.8446, "step": 1793 }, { "epoch": 0.45612585412362944, "grad_norm": 22497.74609375, "learning_rate": 3.307494085002261e-05, "loss": 4.8341, "step": 1794 }, { "epoch": 0.45638010487843633, "grad_norm": 22519.412109375, "learning_rate": 3.305393454477879e-05, "loss": 4.8129, "step": 1795 }, { "epoch": 0.4566343556332433, "grad_norm": 22552.716796875, "learning_rate": 3.303292189285373e-05, "loss": 4.8511, "step": 1796 }, { "epoch": 0.45688860638805023, "grad_norm": 22368.73828125, "learning_rate": 3.301190291080585e-05, "loss": 4.8399, "step": 1797 }, { "epoch": 0.45714285714285713, "grad_norm": 22402.310546875, "learning_rate": 3.29908776151986e-05, "loss": 4.8448, "step": 1798 }, { "epoch": 0.4573971078976641, "grad_norm": 22370.611328125, "learning_rate": 3.296984602260037e-05, "loss": 4.8192, "step": 1799 }, { "epoch": 0.457651358652471, "grad_norm": 22556.884765625, "learning_rate": 3.294880814958453e-05, "loss": 4.8415, "step": 1800 }, { "epoch": 0.457651358652471, "eval_loss": 9.730688095092773, "eval_runtime": 697.963, "eval_samples_per_second": 151.85, "eval_steps_per_second": 9.492, "step": 1800 }, { "epoch": 0.4579056094072779, "grad_norm": 22225.671875, "learning_rate": 3.292776401272941e-05, "loss": 4.8105, "step": 1801 }, { "epoch": 0.4581598601620849, "grad_norm": 22374.408203125, "learning_rate": 3.2906713628618234e-05, "loss": 4.8204, "step": 1802 }, { "epoch": 0.45841411091689177, "grad_norm": 22469.703125, "learning_rate": 3.28856570138392e-05, "loss": 4.8198, "step": 1803 }, { "epoch": 0.4586683616716987, "grad_norm": 22390.865234375, "learning_rate": 3.2864594184985396e-05, "loss": 4.8329, "step": 1804 }, { "epoch": 0.4589226124265056, "grad_norm": 22345.4609375, "learning_rate": 3.2843525158654794e-05, "loss": 4.8311, "step": 1805 }, { "epoch": 0.45917686318131257, "grad_norm": 22406.984375, "learning_rate": 3.282244995145025e-05, "loss": 4.8197, "step": 1806 }, { "epoch": 0.4594311139361195, "grad_norm": 22343.69140625, "learning_rate": 3.2801368579979525e-05, "loss": 4.8246, "step": 1807 }, { "epoch": 0.4596853646909264, "grad_norm": 22558.763671875, "learning_rate": 3.278028106085519e-05, "loss": 4.8176, "step": 1808 }, { "epoch": 0.45993961544573336, "grad_norm": 48236.984375, "learning_rate": 3.27591874106947e-05, "loss": 4.8308, "step": 1809 }, { "epoch": 0.46019386620054026, "grad_norm": 22446.171875, "learning_rate": 3.273808764612032e-05, "loss": 4.8223, "step": 1810 }, { "epoch": 0.4604481169553472, "grad_norm": 22481.224609375, "learning_rate": 3.271698178375913e-05, "loss": 4.8325, "step": 1811 }, { "epoch": 0.46070236771015416, "grad_norm": 22303.732421875, "learning_rate": 3.269586984024303e-05, "loss": 4.8268, "step": 1812 }, { "epoch": 0.46095661846496105, "grad_norm": 22489.181640625, "learning_rate": 3.267475183220871e-05, "loss": 4.8312, "step": 1813 }, { "epoch": 0.461210869219768, "grad_norm": 22360.875, "learning_rate": 3.265362777629763e-05, "loss": 4.81, "step": 1814 }, { "epoch": 0.4614651199745749, "grad_norm": 22496.4765625, "learning_rate": 3.263249768915602e-05, "loss": 4.8245, "step": 1815 }, { "epoch": 0.46171937072938185, "grad_norm": 22410.193359375, "learning_rate": 3.261136158743486e-05, "loss": 4.8223, "step": 1816 }, { "epoch": 0.4619736214841888, "grad_norm": 22436.291015625, "learning_rate": 3.259021948778988e-05, "loss": 4.8258, "step": 1817 }, { "epoch": 0.4622278722389957, "grad_norm": 22534.212890625, "learning_rate": 3.2569071406881526e-05, "loss": 4.8136, "step": 1818 }, { "epoch": 0.46248212299380265, "grad_norm": 23084.279296875, "learning_rate": 3.254791736137495e-05, "loss": 4.8089, "step": 1819 }, { "epoch": 0.46273637374860954, "grad_norm": 22466.341796875, "learning_rate": 3.252675736794003e-05, "loss": 4.8326, "step": 1820 }, { "epoch": 0.4629906245034165, "grad_norm": 22379.46875, "learning_rate": 3.250559144325132e-05, "loss": 4.8125, "step": 1821 }, { "epoch": 0.46324487525822344, "grad_norm": 22613.451171875, "learning_rate": 3.2484419603988026e-05, "loss": 4.8062, "step": 1822 }, { "epoch": 0.46349912601303034, "grad_norm": 22390.404296875, "learning_rate": 3.2463241866834047e-05, "loss": 4.8103, "step": 1823 }, { "epoch": 0.4637533767678373, "grad_norm": 22634.107421875, "learning_rate": 3.2442058248477905e-05, "loss": 4.8151, "step": 1824 }, { "epoch": 0.4640076275226442, "grad_norm": 22431.23046875, "learning_rate": 3.2420868765612765e-05, "loss": 4.8303, "step": 1825 }, { "epoch": 0.46426187827745113, "grad_norm": 22459.298828125, "learning_rate": 3.239967343493643e-05, "loss": 4.8005, "step": 1826 }, { "epoch": 0.4645161290322581, "grad_norm": 22400.962890625, "learning_rate": 3.237847227315129e-05, "loss": 4.8165, "step": 1827 }, { "epoch": 0.464770379787065, "grad_norm": 22365.119140625, "learning_rate": 3.235726529696433e-05, "loss": 4.8205, "step": 1828 }, { "epoch": 0.46502463054187193, "grad_norm": 22584.58984375, "learning_rate": 3.233605252308713e-05, "loss": 4.827, "step": 1829 }, { "epoch": 0.4652788812966788, "grad_norm": 22684.552734375, "learning_rate": 3.231483396823583e-05, "loss": 4.8157, "step": 1830 }, { "epoch": 0.4655331320514858, "grad_norm": 22396.828125, "learning_rate": 3.229360964913112e-05, "loss": 4.8146, "step": 1831 }, { "epoch": 0.4657873828062927, "grad_norm": 22500.2578125, "learning_rate": 3.2272379582498265e-05, "loss": 4.8154, "step": 1832 }, { "epoch": 0.4660416335610996, "grad_norm": 22640.51171875, "learning_rate": 3.2251143785066996e-05, "loss": 4.8118, "step": 1833 }, { "epoch": 0.4662958843159066, "grad_norm": 22688.171875, "learning_rate": 3.222990227357163e-05, "loss": 4.8047, "step": 1834 }, { "epoch": 0.46655013507071347, "grad_norm": 22385.5625, "learning_rate": 3.2208655064750945e-05, "loss": 4.8106, "step": 1835 }, { "epoch": 0.4668043858255204, "grad_norm": 22749.375, "learning_rate": 3.218740217534822e-05, "loss": 4.8108, "step": 1836 }, { "epoch": 0.46705863658032737, "grad_norm": 22512.611328125, "learning_rate": 3.21661436221112e-05, "loss": 4.8068, "step": 1837 }, { "epoch": 0.46731288733513426, "grad_norm": 22616.384765625, "learning_rate": 3.214487942179212e-05, "loss": 4.7913, "step": 1838 }, { "epoch": 0.4675671380899412, "grad_norm": 22521.5, "learning_rate": 3.2123609591147624e-05, "loss": 4.7978, "step": 1839 }, { "epoch": 0.4678213888447481, "grad_norm": 22541.130859375, "learning_rate": 3.2102334146938836e-05, "loss": 4.8018, "step": 1840 }, { "epoch": 0.46807563959955506, "grad_norm": 22782.498046875, "learning_rate": 3.2081053105931274e-05, "loss": 4.8019, "step": 1841 }, { "epoch": 0.468329890354362, "grad_norm": 22556.59765625, "learning_rate": 3.2059766484894874e-05, "loss": 4.8125, "step": 1842 }, { "epoch": 0.4685841411091689, "grad_norm": 22595.67578125, "learning_rate": 3.203847430060398e-05, "loss": 4.8114, "step": 1843 }, { "epoch": 0.46883839186397586, "grad_norm": 22675.302734375, "learning_rate": 3.2017176569837305e-05, "loss": 4.8037, "step": 1844 }, { "epoch": 0.46909264261878275, "grad_norm": 22500.16796875, "learning_rate": 3.1995873309377946e-05, "loss": 4.8086, "step": 1845 }, { "epoch": 0.4693468933735897, "grad_norm": 22399.734375, "learning_rate": 3.1974564536013344e-05, "loss": 4.8002, "step": 1846 }, { "epoch": 0.46960114412839665, "grad_norm": 22821.880859375, "learning_rate": 3.195325026653528e-05, "loss": 4.8165, "step": 1847 }, { "epoch": 0.46985539488320355, "grad_norm": 22599.626953125, "learning_rate": 3.1931930517739904e-05, "loss": 4.7915, "step": 1848 }, { "epoch": 0.4701096456380105, "grad_norm": 22832.888671875, "learning_rate": 3.191060530642763e-05, "loss": 4.8171, "step": 1849 }, { "epoch": 0.4703638963928174, "grad_norm": 22452.3046875, "learning_rate": 3.188927464940323e-05, "loss": 4.8034, "step": 1850 }, { "epoch": 0.47061814714762434, "grad_norm": 22611.71875, "learning_rate": 3.1867938563475716e-05, "loss": 4.8099, "step": 1851 }, { "epoch": 0.4708723979024313, "grad_norm": 22378.44921875, "learning_rate": 3.1846597065458414e-05, "loss": 4.795, "step": 1852 }, { "epoch": 0.4711266486572382, "grad_norm": 22750.693359375, "learning_rate": 3.1825250172168904e-05, "loss": 4.809, "step": 1853 }, { "epoch": 0.47138089941204514, "grad_norm": 22607.263671875, "learning_rate": 3.180389790042902e-05, "loss": 4.8168, "step": 1854 }, { "epoch": 0.47163515016685204, "grad_norm": 22431.84765625, "learning_rate": 3.178254026706481e-05, "loss": 4.8026, "step": 1855 }, { "epoch": 0.471889400921659, "grad_norm": 22443.013671875, "learning_rate": 3.1761177288906594e-05, "loss": 4.7883, "step": 1856 }, { "epoch": 0.47214365167646594, "grad_norm": 22635.654296875, "learning_rate": 3.1739808982788875e-05, "loss": 4.8036, "step": 1857 }, { "epoch": 0.47239790243127283, "grad_norm": 22704.373046875, "learning_rate": 3.171843536555035e-05, "loss": 4.8044, "step": 1858 }, { "epoch": 0.4726521531860798, "grad_norm": 22658.59375, "learning_rate": 3.169705645403391e-05, "loss": 4.791, "step": 1859 }, { "epoch": 0.4729064039408867, "grad_norm": 22853.2890625, "learning_rate": 3.167567226508663e-05, "loss": 4.8033, "step": 1860 }, { "epoch": 0.4731606546956936, "grad_norm": 22567.544921875, "learning_rate": 3.1654282815559714e-05, "loss": 4.8012, "step": 1861 }, { "epoch": 0.4734149054505006, "grad_norm": 22865.716796875, "learning_rate": 3.163288812230852e-05, "loss": 4.8013, "step": 1862 }, { "epoch": 0.4736691562053075, "grad_norm": 22588.98046875, "learning_rate": 3.1611488202192586e-05, "loss": 4.7938, "step": 1863 }, { "epoch": 0.4739234069601144, "grad_norm": 22473.080078125, "learning_rate": 3.159008307207549e-05, "loss": 4.8004, "step": 1864 }, { "epoch": 0.4741776577149213, "grad_norm": 22534.927734375, "learning_rate": 3.156867274882497e-05, "loss": 4.7811, "step": 1865 }, { "epoch": 0.47443190846972827, "grad_norm": 22676.81640625, "learning_rate": 3.1547257249312856e-05, "loss": 4.7965, "step": 1866 }, { "epoch": 0.4746861592245352, "grad_norm": 22663.265625, "learning_rate": 3.152583659041501e-05, "loss": 4.8092, "step": 1867 }, { "epoch": 0.4749404099793421, "grad_norm": 22629.4296875, "learning_rate": 3.1504410789011424e-05, "loss": 4.8125, "step": 1868 }, { "epoch": 0.47519466073414907, "grad_norm": 22685.26953125, "learning_rate": 3.148297986198609e-05, "loss": 4.8051, "step": 1869 }, { "epoch": 0.47544891148895596, "grad_norm": 22487.66796875, "learning_rate": 3.146154382622707e-05, "loss": 4.7956, "step": 1870 }, { "epoch": 0.4757031622437629, "grad_norm": 22590.541015625, "learning_rate": 3.1440102698626435e-05, "loss": 4.7848, "step": 1871 }, { "epoch": 0.47595741299856986, "grad_norm": 22767.001953125, "learning_rate": 3.1418656496080286e-05, "loss": 4.8052, "step": 1872 }, { "epoch": 0.47621166375337676, "grad_norm": 22659.193359375, "learning_rate": 3.139720523548869e-05, "loss": 4.7967, "step": 1873 }, { "epoch": 0.4764659145081837, "grad_norm": 22626.56640625, "learning_rate": 3.137574893375575e-05, "loss": 4.7849, "step": 1874 }, { "epoch": 0.4767201652629906, "grad_norm": 22797.703125, "learning_rate": 3.135428760778949e-05, "loss": 4.7939, "step": 1875 }, { "epoch": 0.47697441601779755, "grad_norm": 22589.9765625, "learning_rate": 3.133282127450193e-05, "loss": 4.7823, "step": 1876 }, { "epoch": 0.4772286667726045, "grad_norm": 22667.11328125, "learning_rate": 3.131134995080902e-05, "loss": 4.7886, "step": 1877 }, { "epoch": 0.4774829175274114, "grad_norm": 22582.736328125, "learning_rate": 3.1289873653630646e-05, "loss": 4.773, "step": 1878 }, { "epoch": 0.47773716828221835, "grad_norm": 22789.642578125, "learning_rate": 3.126839239989061e-05, "loss": 4.7998, "step": 1879 }, { "epoch": 0.47799141903702524, "grad_norm": 22740.98828125, "learning_rate": 3.124690620651661e-05, "loss": 4.7894, "step": 1880 }, { "epoch": 0.4782456697918322, "grad_norm": 22670.673828125, "learning_rate": 3.122541509044027e-05, "loss": 4.7907, "step": 1881 }, { "epoch": 0.47849992054663915, "grad_norm": 22658.84765625, "learning_rate": 3.120391906859707e-05, "loss": 4.8005, "step": 1882 }, { "epoch": 0.47875417130144604, "grad_norm": 22749.806640625, "learning_rate": 3.118241815792635e-05, "loss": 4.7983, "step": 1883 }, { "epoch": 0.479008422056253, "grad_norm": 22623.4375, "learning_rate": 3.116091237537131e-05, "loss": 4.7853, "step": 1884 }, { "epoch": 0.4792626728110599, "grad_norm": 22501.96484375, "learning_rate": 3.113940173787899e-05, "loss": 4.7945, "step": 1885 }, { "epoch": 0.47951692356586684, "grad_norm": 22562.8203125, "learning_rate": 3.1117886262400254e-05, "loss": 4.7908, "step": 1886 }, { "epoch": 0.4797711743206738, "grad_norm": 22628.876953125, "learning_rate": 3.109636596588978e-05, "loss": 4.7882, "step": 1887 }, { "epoch": 0.4800254250754807, "grad_norm": 22786.875, "learning_rate": 3.1074840865306056e-05, "loss": 4.7961, "step": 1888 }, { "epoch": 0.48027967583028763, "grad_norm": 22741.66796875, "learning_rate": 3.105331097761133e-05, "loss": 4.7988, "step": 1889 }, { "epoch": 0.48053392658509453, "grad_norm": 22783.935546875, "learning_rate": 3.1031776319771645e-05, "loss": 4.7738, "step": 1890 }, { "epoch": 0.4807881773399015, "grad_norm": 22623.73828125, "learning_rate": 3.101023690875679e-05, "loss": 4.7721, "step": 1891 }, { "epoch": 0.48104242809470843, "grad_norm": 22722.255859375, "learning_rate": 3.0988692761540314e-05, "loss": 4.7854, "step": 1892 }, { "epoch": 0.4812966788495153, "grad_norm": 22793.50390625, "learning_rate": 3.096714389509947e-05, "loss": 4.7948, "step": 1893 }, { "epoch": 0.4815509296043223, "grad_norm": 22652.376953125, "learning_rate": 3.094559032641527e-05, "loss": 4.7883, "step": 1894 }, { "epoch": 0.48180518035912917, "grad_norm": 22778.07421875, "learning_rate": 3.0924032072472395e-05, "loss": 4.7888, "step": 1895 }, { "epoch": 0.4820594311139361, "grad_norm": 22720.45703125, "learning_rate": 3.090246915025924e-05, "loss": 4.7835, "step": 1896 }, { "epoch": 0.48231368186874307, "grad_norm": 22859.427734375, "learning_rate": 3.088090157676787e-05, "loss": 4.7792, "step": 1897 }, { "epoch": 0.48256793262354997, "grad_norm": 22847.28125, "learning_rate": 3.085932936899402e-05, "loss": 4.7792, "step": 1898 }, { "epoch": 0.4828221833783569, "grad_norm": 22616.79296875, "learning_rate": 3.083775254393707e-05, "loss": 4.7692, "step": 1899 }, { "epoch": 0.4830764341331638, "grad_norm": 22816.8984375, "learning_rate": 3.081617111860004e-05, "loss": 4.7693, "step": 1900 }, { "epoch": 0.4830764341331638, "eval_loss": 9.641205787658691, "eval_runtime": 695.8747, "eval_samples_per_second": 152.306, "eval_steps_per_second": 9.52, "step": 1900 }, { "epoch": 0.48333068488797076, "grad_norm": 22595.158203125, "learning_rate": 3.0794585109989583e-05, "loss": 4.7814, "step": 1901 }, { "epoch": 0.4835849356427777, "grad_norm": 22686.86328125, "learning_rate": 3.077299453511596e-05, "loss": 4.7804, "step": 1902 }, { "epoch": 0.4838391863975846, "grad_norm": 22691.53125, "learning_rate": 3.0751399410993026e-05, "loss": 4.785, "step": 1903 }, { "epoch": 0.48409343715239156, "grad_norm": 22758.580078125, "learning_rate": 3.072979975463822e-05, "loss": 4.7882, "step": 1904 }, { "epoch": 0.48434768790719845, "grad_norm": 22638.390625, "learning_rate": 3.070819558307256e-05, "loss": 4.7726, "step": 1905 }, { "epoch": 0.4846019386620054, "grad_norm": 22842.73046875, "learning_rate": 3.068658691332063e-05, "loss": 4.7815, "step": 1906 }, { "epoch": 0.48485618941681236, "grad_norm": 22637.6171875, "learning_rate": 3.066497376241052e-05, "loss": 4.7837, "step": 1907 }, { "epoch": 0.48511044017161925, "grad_norm": 22845.419921875, "learning_rate": 3.0643356147373906e-05, "loss": 4.7967, "step": 1908 }, { "epoch": 0.4853646909264262, "grad_norm": 22819.27734375, "learning_rate": 3.062173408524593e-05, "loss": 4.7798, "step": 1909 }, { "epoch": 0.4856189416812331, "grad_norm": 22656.298828125, "learning_rate": 3.0600107593065274e-05, "loss": 4.7841, "step": 1910 }, { "epoch": 0.48587319243604005, "grad_norm": 22886.1484375, "learning_rate": 3.05784766878741e-05, "loss": 4.7769, "step": 1911 }, { "epoch": 0.486127443190847, "grad_norm": 22839.2421875, "learning_rate": 3.055684138671805e-05, "loss": 4.7807, "step": 1912 }, { "epoch": 0.4863816939456539, "grad_norm": 22790.19921875, "learning_rate": 3.053520170664623e-05, "loss": 4.7665, "step": 1913 }, { "epoch": 0.48663594470046084, "grad_norm": 22900.73046875, "learning_rate": 3.051355766471118e-05, "loss": 4.7709, "step": 1914 }, { "epoch": 0.48689019545526774, "grad_norm": 22733.4375, "learning_rate": 3.0491909277968895e-05, "loss": 4.7778, "step": 1915 }, { "epoch": 0.4871444462100747, "grad_norm": 22799.046875, "learning_rate": 3.0470256563478793e-05, "loss": 4.7734, "step": 1916 }, { "epoch": 0.48739869696488164, "grad_norm": 22931.326171875, "learning_rate": 3.04485995383037e-05, "loss": 4.775, "step": 1917 }, { "epoch": 0.48765294771968853, "grad_norm": 22791.875, "learning_rate": 3.0426938219509837e-05, "loss": 4.7782, "step": 1918 }, { "epoch": 0.4879071984744955, "grad_norm": 22681.470703125, "learning_rate": 3.0405272624166807e-05, "loss": 4.7911, "step": 1919 }, { "epoch": 0.4881614492293024, "grad_norm": 22781.81640625, "learning_rate": 3.0383602769347595e-05, "loss": 4.7706, "step": 1920 }, { "epoch": 0.48841569998410933, "grad_norm": 22732.724609375, "learning_rate": 3.0361928672128526e-05, "loss": 4.7766, "step": 1921 }, { "epoch": 0.4886699507389163, "grad_norm": 22938.74609375, "learning_rate": 3.0340250349589266e-05, "loss": 4.7704, "step": 1922 }, { "epoch": 0.4889242014937232, "grad_norm": 22578.44140625, "learning_rate": 3.0318567818812836e-05, "loss": 4.7681, "step": 1923 }, { "epoch": 0.4891784522485301, "grad_norm": 22930.767578125, "learning_rate": 3.029688109688555e-05, "loss": 4.7608, "step": 1924 }, { "epoch": 0.489432703003337, "grad_norm": 22787.478515625, "learning_rate": 3.0275190200897035e-05, "loss": 4.7816, "step": 1925 }, { "epoch": 0.489686953758144, "grad_norm": 23046.08984375, "learning_rate": 3.0253495147940197e-05, "loss": 4.7814, "step": 1926 }, { "epoch": 0.4899412045129509, "grad_norm": 22750.03515625, "learning_rate": 3.023179595511123e-05, "loss": 4.776, "step": 1927 }, { "epoch": 0.4901954552677578, "grad_norm": 22935.765625, "learning_rate": 3.0210092639509586e-05, "loss": 4.7705, "step": 1928 }, { "epoch": 0.49044970602256477, "grad_norm": 23020.4921875, "learning_rate": 3.0188385218237957e-05, "loss": 4.786, "step": 1929 }, { "epoch": 0.49070395677737166, "grad_norm": 23021.29296875, "learning_rate": 3.0166673708402287e-05, "loss": 4.7778, "step": 1930 }, { "epoch": 0.4909582075321786, "grad_norm": 23370.517578125, "learning_rate": 3.014495812711174e-05, "loss": 4.76, "step": 1931 }, { "epoch": 0.49121245828698556, "grad_norm": 22602.65234375, "learning_rate": 3.012323849147866e-05, "loss": 4.7669, "step": 1932 }, { "epoch": 0.49146670904179246, "grad_norm": 23087.5390625, "learning_rate": 3.010151481861862e-05, "loss": 4.7733, "step": 1933 }, { "epoch": 0.4917209597965994, "grad_norm": 22796.1328125, "learning_rate": 3.0079787125650372e-05, "loss": 4.7806, "step": 1934 }, { "epoch": 0.4919752105514063, "grad_norm": 23238.685546875, "learning_rate": 3.0058055429695812e-05, "loss": 4.7664, "step": 1935 }, { "epoch": 0.49222946130621326, "grad_norm": 22716.732421875, "learning_rate": 3.0036319747880003e-05, "loss": 4.7426, "step": 1936 }, { "epoch": 0.4924837120610202, "grad_norm": 23319.64453125, "learning_rate": 3.0014580097331168e-05, "loss": 4.7681, "step": 1937 }, { "epoch": 0.4927379628158271, "grad_norm": 23150.38671875, "learning_rate": 2.9992836495180608e-05, "loss": 4.7724, "step": 1938 }, { "epoch": 0.49299221357063405, "grad_norm": 22830.548828125, "learning_rate": 2.997108895856281e-05, "loss": 4.7625, "step": 1939 }, { "epoch": 0.49324646432544095, "grad_norm": 22915.86328125, "learning_rate": 2.9949337504615287e-05, "loss": 4.7656, "step": 1940 }, { "epoch": 0.4935007150802479, "grad_norm": 22902.11328125, "learning_rate": 2.9927582150478688e-05, "loss": 4.7688, "step": 1941 }, { "epoch": 0.49375496583505485, "grad_norm": 22863.533203125, "learning_rate": 2.9905822913296722e-05, "loss": 4.7676, "step": 1942 }, { "epoch": 0.49400921658986174, "grad_norm": 22825.634765625, "learning_rate": 2.9884059810216147e-05, "loss": 4.7644, "step": 1943 }, { "epoch": 0.4942634673446687, "grad_norm": 22819.275390625, "learning_rate": 2.9862292858386782e-05, "loss": 4.7773, "step": 1944 }, { "epoch": 0.4945177180994756, "grad_norm": 23065.78125, "learning_rate": 2.9840522074961484e-05, "loss": 4.7668, "step": 1945 }, { "epoch": 0.49477196885428254, "grad_norm": 23035.5078125, "learning_rate": 2.9818747477096103e-05, "loss": 4.77, "step": 1946 }, { "epoch": 0.4950262196090895, "grad_norm": 23054.912109375, "learning_rate": 2.979696908194952e-05, "loss": 4.7739, "step": 1947 }, { "epoch": 0.4952804703638964, "grad_norm": 22977.8828125, "learning_rate": 2.9775186906683593e-05, "loss": 4.7662, "step": 1948 }, { "epoch": 0.49553472111870334, "grad_norm": 22949.826171875, "learning_rate": 2.9753400968463173e-05, "loss": 4.7534, "step": 1949 }, { "epoch": 0.49578897187351023, "grad_norm": 23086.3046875, "learning_rate": 2.9731611284456068e-05, "loss": 4.7669, "step": 1950 }, { "epoch": 0.4960432226283172, "grad_norm": 22937.015625, "learning_rate": 2.9709817871833033e-05, "loss": 4.7744, "step": 1951 }, { "epoch": 0.49629747338312413, "grad_norm": 22889.5703125, "learning_rate": 2.968802074776777e-05, "loss": 4.7539, "step": 1952 }, { "epoch": 0.496551724137931, "grad_norm": 22840.66796875, "learning_rate": 2.9666219929436896e-05, "loss": 4.7608, "step": 1953 }, { "epoch": 0.496805974892738, "grad_norm": 22922.9453125, "learning_rate": 2.964441543401995e-05, "loss": 4.7586, "step": 1954 }, { "epoch": 0.4970602256475449, "grad_norm": 22957.939453125, "learning_rate": 2.9622607278699365e-05, "loss": 4.7624, "step": 1955 }, { "epoch": 0.4973144764023518, "grad_norm": 23087.447265625, "learning_rate": 2.9600795480660466e-05, "loss": 4.7625, "step": 1956 }, { "epoch": 0.4975687271571588, "grad_norm": 22927.603515625, "learning_rate": 2.9578980057091414e-05, "loss": 4.7593, "step": 1957 }, { "epoch": 0.49782297791196567, "grad_norm": 22810.44921875, "learning_rate": 2.9557161025183278e-05, "loss": 4.7634, "step": 1958 }, { "epoch": 0.4980772286667726, "grad_norm": 23032.84375, "learning_rate": 2.953533840212993e-05, "loss": 4.7505, "step": 1959 }, { "epoch": 0.4983314794215795, "grad_norm": 22964.78125, "learning_rate": 2.951351220512809e-05, "loss": 4.7563, "step": 1960 }, { "epoch": 0.49858573017638647, "grad_norm": 23036.1796875, "learning_rate": 2.94916824513773e-05, "loss": 4.7644, "step": 1961 }, { "epoch": 0.4988399809311934, "grad_norm": 22814.412109375, "learning_rate": 2.9469849158079886e-05, "loss": 4.7651, "step": 1962 }, { "epoch": 0.4990942316860003, "grad_norm": 23342.337890625, "learning_rate": 2.944801234244098e-05, "loss": 4.7539, "step": 1963 }, { "epoch": 0.49934848244080726, "grad_norm": 23009.666015625, "learning_rate": 2.9426172021668475e-05, "loss": 4.7464, "step": 1964 }, { "epoch": 0.49960273319561416, "grad_norm": 22917.50390625, "learning_rate": 2.9404328212973044e-05, "loss": 4.7566, "step": 1965 }, { "epoch": 0.4998569839504211, "grad_norm": 23165.1015625, "learning_rate": 2.938248093356809e-05, "loss": 4.7487, "step": 1966 }, { "epoch": 0.500111234705228, "grad_norm": 22979.5546875, "learning_rate": 2.9360630200669766e-05, "loss": 4.7718, "step": 1967 }, { "epoch": 0.500365485460035, "grad_norm": 22906.5234375, "learning_rate": 2.933877603149694e-05, "loss": 4.7536, "step": 1968 }, { "epoch": 0.5006197362148419, "grad_norm": 22827.6875, "learning_rate": 2.9316918443271176e-05, "loss": 4.7588, "step": 1969 }, { "epoch": 0.5008739869696488, "grad_norm": 23026.64453125, "learning_rate": 2.9295057453216758e-05, "loss": 4.7543, "step": 1970 }, { "epoch": 0.5011282377244557, "grad_norm": 23014.537109375, "learning_rate": 2.9273193078560636e-05, "loss": 4.7461, "step": 1971 }, { "epoch": 0.5013824884792627, "grad_norm": 22945.01953125, "learning_rate": 2.925132533653242e-05, "loss": 4.7447, "step": 1972 }, { "epoch": 0.5016367392340696, "grad_norm": 23150.68359375, "learning_rate": 2.9229454244364397e-05, "loss": 4.7381, "step": 1973 }, { "epoch": 0.5018909899888765, "grad_norm": 23043.513671875, "learning_rate": 2.9207579819291453e-05, "loss": 4.7532, "step": 1974 }, { "epoch": 0.5021452407436835, "grad_norm": 22943.0625, "learning_rate": 2.9185702078551148e-05, "loss": 4.7308, "step": 1975 }, { "epoch": 0.5023994914984904, "grad_norm": 22949.34375, "learning_rate": 2.916382103938363e-05, "loss": 4.7531, "step": 1976 }, { "epoch": 0.5026537422532973, "grad_norm": 22955.57421875, "learning_rate": 2.9141936719031644e-05, "loss": 4.7441, "step": 1977 }, { "epoch": 0.5029079930081043, "grad_norm": 23065.19140625, "learning_rate": 2.9120049134740522e-05, "loss": 4.7714, "step": 1978 }, { "epoch": 0.5031622437629112, "grad_norm": 23003.017578125, "learning_rate": 2.909815830375818e-05, "loss": 4.7425, "step": 1979 }, { "epoch": 0.5034164945177181, "grad_norm": 23466.689453125, "learning_rate": 2.9076264243335083e-05, "loss": 4.7448, "step": 1980 }, { "epoch": 0.503670745272525, "grad_norm": 23075.076171875, "learning_rate": 2.9054366970724235e-05, "loss": 4.7547, "step": 1981 }, { "epoch": 0.503924996027332, "grad_norm": 23075.654296875, "learning_rate": 2.9032466503181193e-05, "loss": 4.755, "step": 1982 }, { "epoch": 0.5041792467821389, "grad_norm": 23305.966796875, "learning_rate": 2.9010562857964e-05, "loss": 4.7668, "step": 1983 }, { "epoch": 0.5044334975369458, "grad_norm": 22911.794921875, "learning_rate": 2.8988656052333235e-05, "loss": 4.7482, "step": 1984 }, { "epoch": 0.5046877482917528, "grad_norm": 23034.783203125, "learning_rate": 2.896674610355194e-05, "loss": 4.7502, "step": 1985 }, { "epoch": 0.5049419990465597, "grad_norm": 22959.732421875, "learning_rate": 2.8944833028885654e-05, "loss": 4.7473, "step": 1986 }, { "epoch": 0.5051962498013666, "grad_norm": 23156.033203125, "learning_rate": 2.8922916845602377e-05, "loss": 4.7306, "step": 1987 }, { "epoch": 0.5054505005561736, "grad_norm": 23070.759765625, "learning_rate": 2.8900997570972545e-05, "loss": 4.7554, "step": 1988 }, { "epoch": 0.5057047513109805, "grad_norm": 23160.287109375, "learning_rate": 2.8879075222269036e-05, "loss": 4.7437, "step": 1989 }, { "epoch": 0.5059590020657874, "grad_norm": 22863.69140625, "learning_rate": 2.8857149816767158e-05, "loss": 4.7479, "step": 1990 }, { "epoch": 0.5062132528205943, "grad_norm": 23054.64453125, "learning_rate": 2.883522137174462e-05, "loss": 4.7566, "step": 1991 }, { "epoch": 0.5064675035754013, "grad_norm": 22964.81640625, "learning_rate": 2.8813289904481535e-05, "loss": 4.7401, "step": 1992 }, { "epoch": 0.5067217543302082, "grad_norm": 23167.4296875, "learning_rate": 2.8791355432260392e-05, "loss": 4.7464, "step": 1993 }, { "epoch": 0.5069760050850151, "grad_norm": 22849.697265625, "learning_rate": 2.8769417972366037e-05, "loss": 4.7356, "step": 1994 }, { "epoch": 0.5072302558398221, "grad_norm": 23022.30078125, "learning_rate": 2.8747477542085686e-05, "loss": 4.7413, "step": 1995 }, { "epoch": 0.507484506594629, "grad_norm": 22905.880859375, "learning_rate": 2.87255341587089e-05, "loss": 4.7432, "step": 1996 }, { "epoch": 0.5077387573494359, "grad_norm": 23229.41796875, "learning_rate": 2.8703587839527546e-05, "loss": 4.7723, "step": 1997 }, { "epoch": 0.5079930081042429, "grad_norm": 22982.7421875, "learning_rate": 2.8681638601835815e-05, "loss": 4.7399, "step": 1998 }, { "epoch": 0.5082472588590498, "grad_norm": 23071.857421875, "learning_rate": 2.8659686462930212e-05, "loss": 4.733, "step": 1999 }, { "epoch": 0.5085015096138567, "grad_norm": 23090.21484375, "learning_rate": 2.8637731440109507e-05, "loss": 4.7424, "step": 2000 }, { "epoch": 0.5085015096138567, "eval_loss": 9.557955741882324, "eval_runtime": 699.1214, "eval_samples_per_second": 151.599, "eval_steps_per_second": 9.476, "step": 2000 }, { "epoch": 0.5087557603686635, "grad_norm": 23007.51953125, "learning_rate": 2.8615773550674744e-05, "loss": 4.7398, "step": 2001 }, { "epoch": 0.5090100111234706, "grad_norm": 23056.48046875, "learning_rate": 2.859381281192925e-05, "loss": 4.7519, "step": 2002 }, { "epoch": 0.5092642618782774, "grad_norm": 23053.72265625, "learning_rate": 2.857184924117856e-05, "loss": 4.7312, "step": 2003 }, { "epoch": 0.5095185126330843, "grad_norm": 23077.6171875, "learning_rate": 2.8549882855730485e-05, "loss": 4.7256, "step": 2004 }, { "epoch": 0.5097727633878913, "grad_norm": 23031.88671875, "learning_rate": 2.8527913672895012e-05, "loss": 4.7345, "step": 2005 }, { "epoch": 0.5100270141426982, "grad_norm": 23169.986328125, "learning_rate": 2.8505941709984348e-05, "loss": 4.7439, "step": 2006 }, { "epoch": 0.5102812648975051, "grad_norm": 23364.912109375, "learning_rate": 2.8483966984312906e-05, "loss": 4.7317, "step": 2007 }, { "epoch": 0.5105355156523121, "grad_norm": 23049.017578125, "learning_rate": 2.8461989513197252e-05, "loss": 4.7378, "step": 2008 }, { "epoch": 0.510789766407119, "grad_norm": 22993.38671875, "learning_rate": 2.8440009313956133e-05, "loss": 4.7347, "step": 2009 }, { "epoch": 0.5110440171619259, "grad_norm": 23220.9453125, "learning_rate": 2.841802640391045e-05, "loss": 4.7469, "step": 2010 }, { "epoch": 0.5112982679167328, "grad_norm": 23500.806640625, "learning_rate": 2.83960408003832e-05, "loss": 4.7359, "step": 2011 }, { "epoch": 0.5115525186715398, "grad_norm": 23078.642578125, "learning_rate": 2.8374052520699557e-05, "loss": 4.7269, "step": 2012 }, { "epoch": 0.5118067694263467, "grad_norm": 23080.47265625, "learning_rate": 2.8352061582186777e-05, "loss": 4.7359, "step": 2013 }, { "epoch": 0.5120610201811536, "grad_norm": 23217.416015625, "learning_rate": 2.8330068002174202e-05, "loss": 4.723, "step": 2014 }, { "epoch": 0.5123152709359606, "grad_norm": 23020.74609375, "learning_rate": 2.830807179799328e-05, "loss": 4.7409, "step": 2015 }, { "epoch": 0.5125695216907675, "grad_norm": 23209.0859375, "learning_rate": 2.82860729869775e-05, "loss": 4.7317, "step": 2016 }, { "epoch": 0.5128237724455744, "grad_norm": 23137.6796875, "learning_rate": 2.826407158646243e-05, "loss": 4.746, "step": 2017 }, { "epoch": 0.5130780232003814, "grad_norm": 22964.87109375, "learning_rate": 2.8242067613785673e-05, "loss": 4.7285, "step": 2018 }, { "epoch": 0.5133322739551883, "grad_norm": 23120.8515625, "learning_rate": 2.822006108628683e-05, "loss": 4.7229, "step": 2019 }, { "epoch": 0.5135865247099952, "grad_norm": 23373.91015625, "learning_rate": 2.819805202130756e-05, "loss": 4.7279, "step": 2020 }, { "epoch": 0.5138407754648021, "grad_norm": 23226.46875, "learning_rate": 2.8176040436191487e-05, "loss": 4.7329, "step": 2021 }, { "epoch": 0.5140950262196091, "grad_norm": 23186.08203125, "learning_rate": 2.8154026348284247e-05, "loss": 4.7457, "step": 2022 }, { "epoch": 0.514349276974416, "grad_norm": 23213.958984375, "learning_rate": 2.8132009774933427e-05, "loss": 4.7295, "step": 2023 }, { "epoch": 0.5146035277292229, "grad_norm": 23086.5078125, "learning_rate": 2.810999073348859e-05, "loss": 4.7355, "step": 2024 }, { "epoch": 0.5148577784840299, "grad_norm": 23512.626953125, "learning_rate": 2.808796924130122e-05, "loss": 4.7262, "step": 2025 }, { "epoch": 0.5151120292388368, "grad_norm": 23135.099609375, "learning_rate": 2.8065945315724756e-05, "loss": 4.7418, "step": 2026 }, { "epoch": 0.5153662799936437, "grad_norm": 23408.845703125, "learning_rate": 2.8043918974114547e-05, "loss": 4.7315, "step": 2027 }, { "epoch": 0.5156205307484507, "grad_norm": 23213.99609375, "learning_rate": 2.8021890233827842e-05, "loss": 4.7351, "step": 2028 }, { "epoch": 0.5158747815032576, "grad_norm": 35557.0625, "learning_rate": 2.7999859112223785e-05, "loss": 4.7523, "step": 2029 }, { "epoch": 0.5161290322580645, "grad_norm": 23306.736328125, "learning_rate": 2.79778256266634e-05, "loss": 4.725, "step": 2030 }, { "epoch": 0.5163832830128714, "grad_norm": 23075.65625, "learning_rate": 2.7955789794509556e-05, "loss": 4.7246, "step": 2031 }, { "epoch": 0.5166375337676784, "grad_norm": 23116.28125, "learning_rate": 2.7933751633126987e-05, "loss": 4.7287, "step": 2032 }, { "epoch": 0.5168917845224853, "grad_norm": 23078.94921875, "learning_rate": 2.7911711159882266e-05, "loss": 4.729, "step": 2033 }, { "epoch": 0.5171460352772922, "grad_norm": 23201.748046875, "learning_rate": 2.7889668392143777e-05, "loss": 4.7299, "step": 2034 }, { "epoch": 0.5174002860320992, "grad_norm": 23212.55078125, "learning_rate": 2.7867623347281713e-05, "loss": 4.7207, "step": 2035 }, { "epoch": 0.5176545367869061, "grad_norm": 23330.412109375, "learning_rate": 2.7845576042668066e-05, "loss": 4.7213, "step": 2036 }, { "epoch": 0.517908787541713, "grad_norm": 23131.947265625, "learning_rate": 2.78235264956766e-05, "loss": 4.7056, "step": 2037 }, { "epoch": 0.51816303829652, "grad_norm": 24849.046875, "learning_rate": 2.7801474723682873e-05, "loss": 4.721, "step": 2038 }, { "epoch": 0.5184172890513269, "grad_norm": 23961.619140625, "learning_rate": 2.777942074406416e-05, "loss": 4.716, "step": 2039 }, { "epoch": 0.5186715398061338, "grad_norm": 28028.6328125, "learning_rate": 2.7757364574199496e-05, "loss": 4.7289, "step": 2040 }, { "epoch": 0.5189257905609407, "grad_norm": 26758.71875, "learning_rate": 2.7735306231469644e-05, "loss": 4.7226, "step": 2041 }, { "epoch": 0.5191800413157477, "grad_norm": 23318.28125, "learning_rate": 2.7713245733257053e-05, "loss": 4.7258, "step": 2042 }, { "epoch": 0.5194342920705546, "grad_norm": 24078.271484375, "learning_rate": 2.769118309694591e-05, "loss": 4.7304, "step": 2043 }, { "epoch": 0.5196885428253615, "grad_norm": 23060.48046875, "learning_rate": 2.7669118339922072e-05, "loss": 4.7105, "step": 2044 }, { "epoch": 0.5199427935801685, "grad_norm": 25068.29296875, "learning_rate": 2.764705147957305e-05, "loss": 4.7283, "step": 2045 }, { "epoch": 0.5201970443349754, "grad_norm": 24059.552734375, "learning_rate": 2.762498253328803e-05, "loss": 4.7194, "step": 2046 }, { "epoch": 0.5204512950897823, "grad_norm": 23689.7578125, "learning_rate": 2.7602911518457835e-05, "loss": 4.7216, "step": 2047 }, { "epoch": 0.5207055458445893, "grad_norm": 24054.283203125, "learning_rate": 2.7580838452474923e-05, "loss": 4.7143, "step": 2048 }, { "epoch": 0.5209597965993962, "grad_norm": 23281.78125, "learning_rate": 2.7558763352733362e-05, "loss": 4.7101, "step": 2049 }, { "epoch": 0.5212140473542031, "grad_norm": 24276.9296875, "learning_rate": 2.7536686236628834e-05, "loss": 4.7236, "step": 2050 }, { "epoch": 0.52146829810901, "grad_norm": 23157.599609375, "learning_rate": 2.7514607121558594e-05, "loss": 4.729, "step": 2051 }, { "epoch": 0.521722548863817, "grad_norm": 25382.3515625, "learning_rate": 2.7492526024921484e-05, "loss": 4.7223, "step": 2052 }, { "epoch": 0.5219767996186239, "grad_norm": 23496.029296875, "learning_rate": 2.7470442964117897e-05, "loss": 4.7195, "step": 2053 }, { "epoch": 0.5222310503734308, "grad_norm": 23880.427734375, "learning_rate": 2.7448357956549793e-05, "loss": 4.7315, "step": 2054 }, { "epoch": 0.5224853011282378, "grad_norm": 23273.0703125, "learning_rate": 2.7426271019620654e-05, "loss": 4.7207, "step": 2055 }, { "epoch": 0.5227395518830447, "grad_norm": 24034.41015625, "learning_rate": 2.7404182170735464e-05, "loss": 4.7312, "step": 2056 }, { "epoch": 0.5229938026378516, "grad_norm": 23682.8515625, "learning_rate": 2.7382091427300748e-05, "loss": 4.7211, "step": 2057 }, { "epoch": 0.5232480533926586, "grad_norm": 23399.884765625, "learning_rate": 2.7359998806724506e-05, "loss": 4.7079, "step": 2058 }, { "epoch": 0.5235023041474655, "grad_norm": 23222.86328125, "learning_rate": 2.7337904326416214e-05, "loss": 4.7023, "step": 2059 }, { "epoch": 0.5237565549022724, "grad_norm": 23315.37890625, "learning_rate": 2.7315808003786826e-05, "loss": 4.728, "step": 2060 }, { "epoch": 0.5240108056570792, "grad_norm": 23392.298828125, "learning_rate": 2.7293709856248734e-05, "loss": 4.7225, "step": 2061 }, { "epoch": 0.5242650564118863, "grad_norm": 23439.419921875, "learning_rate": 2.7271609901215778e-05, "loss": 4.7041, "step": 2062 }, { "epoch": 0.5245193071666931, "grad_norm": 24017.865234375, "learning_rate": 2.724950815610321e-05, "loss": 4.7206, "step": 2063 }, { "epoch": 0.5247735579215, "grad_norm": 23258.296875, "learning_rate": 2.7227404638327712e-05, "loss": 4.7215, "step": 2064 }, { "epoch": 0.525027808676307, "grad_norm": 23284.140625, "learning_rate": 2.7205299365307345e-05, "loss": 4.7116, "step": 2065 }, { "epoch": 0.525282059431114, "grad_norm": 23141.248046875, "learning_rate": 2.7183192354461573e-05, "loss": 4.7155, "step": 2066 }, { "epoch": 0.5255363101859208, "grad_norm": 23496.93359375, "learning_rate": 2.7161083623211203e-05, "loss": 4.7094, "step": 2067 }, { "epoch": 0.5257905609407278, "grad_norm": 23329.158203125, "learning_rate": 2.7138973188978416e-05, "loss": 4.7132, "step": 2068 }, { "epoch": 0.5260448116955347, "grad_norm": 23593.390625, "learning_rate": 2.7116861069186726e-05, "loss": 4.7269, "step": 2069 }, { "epoch": 0.5262990624503416, "grad_norm": 23326.470703125, "learning_rate": 2.7094747281260992e-05, "loss": 4.7281, "step": 2070 }, { "epoch": 0.5265533132051485, "grad_norm": 23733.70703125, "learning_rate": 2.7072631842627367e-05, "loss": 4.7078, "step": 2071 }, { "epoch": 0.5268075639599555, "grad_norm": 23178.4921875, "learning_rate": 2.705051477071332e-05, "loss": 4.7005, "step": 2072 }, { "epoch": 0.5270618147147624, "grad_norm": 23485.16015625, "learning_rate": 2.702839608294758e-05, "loss": 4.7227, "step": 2073 }, { "epoch": 0.5273160654695693, "grad_norm": 23290.314453125, "learning_rate": 2.7006275796760194e-05, "loss": 4.719, "step": 2074 }, { "epoch": 0.5275703162243763, "grad_norm": 23742.23828125, "learning_rate": 2.6984153929582433e-05, "loss": 4.7034, "step": 2075 }, { "epoch": 0.5278245669791832, "grad_norm": 23395.416015625, "learning_rate": 2.696203049884683e-05, "loss": 4.7251, "step": 2076 }, { "epoch": 0.5280788177339901, "grad_norm": 23555.201171875, "learning_rate": 2.6939905521987137e-05, "loss": 4.7158, "step": 2077 }, { "epoch": 0.5283330684887971, "grad_norm": 23185.91796875, "learning_rate": 2.6917779016438342e-05, "loss": 4.7085, "step": 2078 }, { "epoch": 0.528587319243604, "grad_norm": 23298.71484375, "learning_rate": 2.689565099963662e-05, "loss": 4.7195, "step": 2079 }, { "epoch": 0.5288415699984109, "grad_norm": 23305.232421875, "learning_rate": 2.6873521489019348e-05, "loss": 4.7213, "step": 2080 }, { "epoch": 0.5290958207532178, "grad_norm": 23419.66015625, "learning_rate": 2.6851390502025082e-05, "loss": 4.7204, "step": 2081 }, { "epoch": 0.5293500715080248, "grad_norm": 23082.498046875, "learning_rate": 2.6829258056093526e-05, "loss": 4.7096, "step": 2082 }, { "epoch": 0.5296043222628317, "grad_norm": 23548.970703125, "learning_rate": 2.680712416866556e-05, "loss": 4.7176, "step": 2083 }, { "epoch": 0.5298585730176386, "grad_norm": 23188.61328125, "learning_rate": 2.678498885718316e-05, "loss": 4.7045, "step": 2084 }, { "epoch": 0.5301128237724456, "grad_norm": 23513.1328125, "learning_rate": 2.6762852139089467e-05, "loss": 4.7036, "step": 2085 }, { "epoch": 0.5303670745272525, "grad_norm": 23257.73828125, "learning_rate": 2.6740714031828725e-05, "loss": 4.7096, "step": 2086 }, { "epoch": 0.5306213252820594, "grad_norm": 23426.67578125, "learning_rate": 2.6718574552846225e-05, "loss": 4.7137, "step": 2087 }, { "epoch": 0.5308755760368664, "grad_norm": 23352.6015625, "learning_rate": 2.6696433719588398e-05, "loss": 4.7228, "step": 2088 }, { "epoch": 0.5311298267916733, "grad_norm": 23303.849609375, "learning_rate": 2.6674291549502706e-05, "loss": 4.7145, "step": 2089 }, { "epoch": 0.5313840775464802, "grad_norm": 23343.2109375, "learning_rate": 2.6652148060037685e-05, "loss": 4.6971, "step": 2090 }, { "epoch": 0.5316383283012871, "grad_norm": 23663.53515625, "learning_rate": 2.6630003268642902e-05, "loss": 4.7119, "step": 2091 }, { "epoch": 0.5318925790560941, "grad_norm": 23375.068359375, "learning_rate": 2.6607857192768943e-05, "loss": 4.7125, "step": 2092 }, { "epoch": 0.532146829810901, "grad_norm": 23589.341796875, "learning_rate": 2.6585709849867414e-05, "loss": 4.7132, "step": 2093 }, { "epoch": 0.5324010805657079, "grad_norm": 23355.55078125, "learning_rate": 2.6563561257390925e-05, "loss": 4.706, "step": 2094 }, { "epoch": 0.5326553313205149, "grad_norm": 23303.205078125, "learning_rate": 2.654141143279305e-05, "loss": 4.7029, "step": 2095 }, { "epoch": 0.5329095820753218, "grad_norm": 23292.84765625, "learning_rate": 2.6519260393528366e-05, "loss": 4.7012, "step": 2096 }, { "epoch": 0.5331638328301287, "grad_norm": 23259.251953125, "learning_rate": 2.6497108157052386e-05, "loss": 4.6946, "step": 2097 }, { "epoch": 0.5334180835849356, "grad_norm": 23252.619140625, "learning_rate": 2.6474954740821555e-05, "loss": 4.6993, "step": 2098 }, { "epoch": 0.5336723343397426, "grad_norm": 23247.013671875, "learning_rate": 2.6452800162293273e-05, "loss": 4.6996, "step": 2099 }, { "epoch": 0.5339265850945495, "grad_norm": 33386.3359375, "learning_rate": 2.6430644438925844e-05, "loss": 4.7113, "step": 2100 }, { "epoch": 0.5339265850945495, "eval_loss": 9.482353210449219, "eval_runtime": 696.41, "eval_samples_per_second": 152.189, "eval_steps_per_second": 9.513, "step": 2100 }, { "epoch": 0.5341808358493564, "grad_norm": 24149.486328125, "learning_rate": 2.6408487588178477e-05, "loss": 4.7019, "step": 2101 }, { "epoch": 0.5344350866041634, "grad_norm": 23557.75, "learning_rate": 2.6386329627511265e-05, "loss": 4.7102, "step": 2102 }, { "epoch": 0.5346893373589703, "grad_norm": 23583.0234375, "learning_rate": 2.636417057438519e-05, "loss": 4.6826, "step": 2103 }, { "epoch": 0.5349435881137772, "grad_norm": 24047.4375, "learning_rate": 2.634201044626206e-05, "loss": 4.7044, "step": 2104 }, { "epoch": 0.5351978388685842, "grad_norm": 25081.236328125, "learning_rate": 2.631984926060457e-05, "loss": 4.6972, "step": 2105 }, { "epoch": 0.5354520896233911, "grad_norm": 23678.958984375, "learning_rate": 2.6297687034876238e-05, "loss": 4.7079, "step": 2106 }, { "epoch": 0.535706340378198, "grad_norm": 23832.02734375, "learning_rate": 2.6275523786541377e-05, "loss": 4.7141, "step": 2107 }, { "epoch": 0.5359605911330049, "grad_norm": 23217.287109375, "learning_rate": 2.6253359533065135e-05, "loss": 4.6954, "step": 2108 }, { "epoch": 0.5362148418878119, "grad_norm": 24535.060546875, "learning_rate": 2.6231194291913447e-05, "loss": 4.7029, "step": 2109 }, { "epoch": 0.5364690926426188, "grad_norm": 23473.38671875, "learning_rate": 2.6209028080553005e-05, "loss": 4.7215, "step": 2110 }, { "epoch": 0.5367233433974257, "grad_norm": 23935.1796875, "learning_rate": 2.61868609164513e-05, "loss": 4.7037, "step": 2111 }, { "epoch": 0.5369775941522327, "grad_norm": 23354.58203125, "learning_rate": 2.6164692817076535e-05, "loss": 4.7077, "step": 2112 }, { "epoch": 0.5372318449070396, "grad_norm": 24832.46484375, "learning_rate": 2.6142523799897683e-05, "loss": 4.7068, "step": 2113 }, { "epoch": 0.5374860956618465, "grad_norm": 23532.0859375, "learning_rate": 2.612035388238443e-05, "loss": 4.6967, "step": 2114 }, { "epoch": 0.5377403464166535, "grad_norm": 23633.271484375, "learning_rate": 2.6098183082007155e-05, "loss": 4.6981, "step": 2115 }, { "epoch": 0.5379945971714604, "grad_norm": 23384.09765625, "learning_rate": 2.6076011416236956e-05, "loss": 4.7102, "step": 2116 }, { "epoch": 0.5382488479262673, "grad_norm": 24732.24609375, "learning_rate": 2.6053838902545608e-05, "loss": 4.704, "step": 2117 }, { "epoch": 0.5385030986810742, "grad_norm": 23493.09765625, "learning_rate": 2.6031665558405536e-05, "loss": 4.6917, "step": 2118 }, { "epoch": 0.5387573494358812, "grad_norm": 23800.6171875, "learning_rate": 2.6009491401289842e-05, "loss": 4.6953, "step": 2119 }, { "epoch": 0.5390116001906881, "grad_norm": 23385.017578125, "learning_rate": 2.598731644867226e-05, "loss": 4.695, "step": 2120 }, { "epoch": 0.539265850945495, "grad_norm": 25875.88671875, "learning_rate": 2.5965140718027152e-05, "loss": 4.6892, "step": 2121 }, { "epoch": 0.539520101700302, "grad_norm": 23469.125, "learning_rate": 2.5942964226829488e-05, "loss": 4.6889, "step": 2122 }, { "epoch": 0.5397743524551089, "grad_norm": 23864.87109375, "learning_rate": 2.592078699255484e-05, "loss": 4.6852, "step": 2123 }, { "epoch": 0.5400286032099157, "grad_norm": 23716.55078125, "learning_rate": 2.5898609032679366e-05, "loss": 4.7008, "step": 2124 }, { "epoch": 0.5402828539647228, "grad_norm": 24383.921875, "learning_rate": 2.58764303646798e-05, "loss": 4.7043, "step": 2125 }, { "epoch": 0.5405371047195296, "grad_norm": 23633.931640625, "learning_rate": 2.5854251006033426e-05, "loss": 4.6915, "step": 2126 }, { "epoch": 0.5407913554743365, "grad_norm": 23681.015625, "learning_rate": 2.5832070974218083e-05, "loss": 4.7042, "step": 2127 }, { "epoch": 0.5410456062291434, "grad_norm": 23592.380859375, "learning_rate": 2.5809890286712136e-05, "loss": 4.6928, "step": 2128 }, { "epoch": 0.5412998569839504, "grad_norm": 23555.6015625, "learning_rate": 2.578770896099445e-05, "loss": 4.6773, "step": 2129 }, { "epoch": 0.5415541077387573, "grad_norm": 23653.02734375, "learning_rate": 2.5765527014544416e-05, "loss": 4.7001, "step": 2130 }, { "epoch": 0.5418083584935642, "grad_norm": 23540.796875, "learning_rate": 2.5743344464841912e-05, "loss": 4.6952, "step": 2131 }, { "epoch": 0.5420626092483712, "grad_norm": 23544.94921875, "learning_rate": 2.5721161329367278e-05, "loss": 4.6916, "step": 2132 }, { "epoch": 0.5423168600031781, "grad_norm": 23536.80859375, "learning_rate": 2.5698977625601323e-05, "loss": 4.6977, "step": 2133 }, { "epoch": 0.542571110757985, "grad_norm": 23597.537109375, "learning_rate": 2.567679337102531e-05, "loss": 4.6946, "step": 2134 }, { "epoch": 0.542825361512792, "grad_norm": 23443.845703125, "learning_rate": 2.5654608583120922e-05, "loss": 4.7006, "step": 2135 }, { "epoch": 0.5430796122675989, "grad_norm": 23682.0, "learning_rate": 2.5632423279370272e-05, "loss": 4.6903, "step": 2136 }, { "epoch": 0.5433338630224058, "grad_norm": 23529.369140625, "learning_rate": 2.5610237477255878e-05, "loss": 4.7025, "step": 2137 }, { "epoch": 0.5435881137772127, "grad_norm": 23552.083984375, "learning_rate": 2.558805119426065e-05, "loss": 4.7028, "step": 2138 }, { "epoch": 0.5438423645320197, "grad_norm": 23398.064453125, "learning_rate": 2.5565864447867878e-05, "loss": 4.6849, "step": 2139 }, { "epoch": 0.5440966152868266, "grad_norm": 23539.12109375, "learning_rate": 2.5543677255561222e-05, "loss": 4.676, "step": 2140 }, { "epoch": 0.5443508660416335, "grad_norm": 23525.49609375, "learning_rate": 2.5521489634824674e-05, "loss": 4.6899, "step": 2141 }, { "epoch": 0.5446051167964405, "grad_norm": 23321.806640625, "learning_rate": 2.5499301603142588e-05, "loss": 4.6916, "step": 2142 }, { "epoch": 0.5448593675512474, "grad_norm": 23877.974609375, "learning_rate": 2.547711317799963e-05, "loss": 4.6895, "step": 2143 }, { "epoch": 0.5451136183060543, "grad_norm": 23400.06640625, "learning_rate": 2.5454924376880772e-05, "loss": 4.6848, "step": 2144 }, { "epoch": 0.5453678690608613, "grad_norm": 23665.232421875, "learning_rate": 2.5432735217271297e-05, "loss": 4.6847, "step": 2145 }, { "epoch": 0.5456221198156682, "grad_norm": 23365.0, "learning_rate": 2.541054571665675e-05, "loss": 4.6838, "step": 2146 }, { "epoch": 0.5458763705704751, "grad_norm": 23369.833984375, "learning_rate": 2.538835589252296e-05, "loss": 4.6734, "step": 2147 }, { "epoch": 0.546130621325282, "grad_norm": 23808.841796875, "learning_rate": 2.5366165762356008e-05, "loss": 4.6907, "step": 2148 }, { "epoch": 0.546384872080089, "grad_norm": 23655.90625, "learning_rate": 2.5343975343642217e-05, "loss": 4.6985, "step": 2149 }, { "epoch": 0.5466391228348959, "grad_norm": 23475.595703125, "learning_rate": 2.532178465386813e-05, "loss": 4.6877, "step": 2150 }, { "epoch": 0.5468933735897028, "grad_norm": 23527.767578125, "learning_rate": 2.5299593710520515e-05, "loss": 4.699, "step": 2151 }, { "epoch": 0.5471476243445098, "grad_norm": 23683.05859375, "learning_rate": 2.527740253108632e-05, "loss": 4.6851, "step": 2152 }, { "epoch": 0.5474018750993167, "grad_norm": 23423.20703125, "learning_rate": 2.5255211133052703e-05, "loss": 4.6853, "step": 2153 }, { "epoch": 0.5476561258541236, "grad_norm": 23770.255859375, "learning_rate": 2.5233019533906994e-05, "loss": 4.6836, "step": 2154 }, { "epoch": 0.5479103766089306, "grad_norm": 23272.328125, "learning_rate": 2.521082775113665e-05, "loss": 4.6704, "step": 2155 }, { "epoch": 0.5481646273637375, "grad_norm": 23532.994140625, "learning_rate": 2.518863580222931e-05, "loss": 4.6863, "step": 2156 }, { "epoch": 0.5484188781185444, "grad_norm": 23402.43359375, "learning_rate": 2.516644370467272e-05, "loss": 4.683, "step": 2157 }, { "epoch": 0.5486731288733513, "grad_norm": 23518.423828125, "learning_rate": 2.5144251475954754e-05, "loss": 4.6881, "step": 2158 }, { "epoch": 0.5489273796281583, "grad_norm": 23648.654296875, "learning_rate": 2.512205913356339e-05, "loss": 4.6834, "step": 2159 }, { "epoch": 0.5491816303829652, "grad_norm": 23744.015625, "learning_rate": 2.5099866694986685e-05, "loss": 4.6927, "step": 2160 }, { "epoch": 0.5494358811377721, "grad_norm": 23478.8515625, "learning_rate": 2.5077674177712785e-05, "loss": 4.6796, "step": 2161 }, { "epoch": 0.5496901318925791, "grad_norm": 23574.271484375, "learning_rate": 2.5055481599229886e-05, "loss": 4.685, "step": 2162 }, { "epoch": 0.549944382647386, "grad_norm": 23522.96484375, "learning_rate": 2.5033288977026237e-05, "loss": 4.6783, "step": 2163 }, { "epoch": 0.5501986334021929, "grad_norm": 23506.69140625, "learning_rate": 2.5011096328590132e-05, "loss": 4.6865, "step": 2164 }, { "epoch": 0.5504528841569999, "grad_norm": 23553.56640625, "learning_rate": 2.4988903671409873e-05, "loss": 4.681, "step": 2165 }, { "epoch": 0.5507071349118068, "grad_norm": 23498.599609375, "learning_rate": 2.4966711022973773e-05, "loss": 4.6759, "step": 2166 }, { "epoch": 0.5509613856666137, "grad_norm": 23617.935546875, "learning_rate": 2.4944518400770123e-05, "loss": 4.6804, "step": 2167 }, { "epoch": 0.5512156364214206, "grad_norm": 23562.78515625, "learning_rate": 2.492232582228722e-05, "loss": 4.684, "step": 2168 }, { "epoch": 0.5514698871762276, "grad_norm": 23549.787109375, "learning_rate": 2.4900133305013325e-05, "loss": 4.6769, "step": 2169 }, { "epoch": 0.5517241379310345, "grad_norm": 23639.8984375, "learning_rate": 2.4877940866436613e-05, "loss": 4.6659, "step": 2170 }, { "epoch": 0.5519783886858414, "grad_norm": 23550.84765625, "learning_rate": 2.485574852404525e-05, "loss": 4.6738, "step": 2171 }, { "epoch": 0.5522326394406484, "grad_norm": 23430.025390625, "learning_rate": 2.4833556295327285e-05, "loss": 4.675, "step": 2172 }, { "epoch": 0.5524868901954553, "grad_norm": 24781.015625, "learning_rate": 2.481136419777069e-05, "loss": 4.6809, "step": 2173 }, { "epoch": 0.5527411409502622, "grad_norm": 23808.51953125, "learning_rate": 2.4789172248863352e-05, "loss": 4.6757, "step": 2174 }, { "epoch": 0.5529953917050692, "grad_norm": 23670.490234375, "learning_rate": 2.476698046609302e-05, "loss": 4.6832, "step": 2175 }, { "epoch": 0.5532496424598761, "grad_norm": 23626.3671875, "learning_rate": 2.4744788866947293e-05, "loss": 4.68, "step": 2176 }, { "epoch": 0.553503893214683, "grad_norm": 23613.134765625, "learning_rate": 2.4722597468913687e-05, "loss": 4.6705, "step": 2177 }, { "epoch": 0.5537581439694899, "grad_norm": 23579.19140625, "learning_rate": 2.4700406289479498e-05, "loss": 4.6899, "step": 2178 }, { "epoch": 0.5540123947242969, "grad_norm": 23817.73046875, "learning_rate": 2.4678215346131874e-05, "loss": 4.677, "step": 2179 }, { "epoch": 0.5542666454791038, "grad_norm": 23665.787109375, "learning_rate": 2.465602465635779e-05, "loss": 4.6795, "step": 2180 }, { "epoch": 0.5545208962339107, "grad_norm": 24260.82421875, "learning_rate": 2.4633834237643998e-05, "loss": 4.693, "step": 2181 }, { "epoch": 0.5547751469887177, "grad_norm": 23447.123046875, "learning_rate": 2.4611644107477043e-05, "loss": 4.666, "step": 2182 }, { "epoch": 0.5550293977435246, "grad_norm": 23537.998046875, "learning_rate": 2.458945428334325e-05, "loss": 4.6793, "step": 2183 }, { "epoch": 0.5552836484983315, "grad_norm": 23895.3984375, "learning_rate": 2.456726478272871e-05, "loss": 4.7007, "step": 2184 }, { "epoch": 0.5555378992531385, "grad_norm": 23540.876953125, "learning_rate": 2.4545075623119227e-05, "loss": 4.6602, "step": 2185 }, { "epoch": 0.5557921500079454, "grad_norm": 24107.580078125, "learning_rate": 2.4522886822000373e-05, "loss": 4.6786, "step": 2186 }, { "epoch": 0.5560464007627522, "grad_norm": 23539.541015625, "learning_rate": 2.450069839685742e-05, "loss": 4.6771, "step": 2187 }, { "epoch": 0.5563006515175591, "grad_norm": 23844.34765625, "learning_rate": 2.4478510365175328e-05, "loss": 4.6693, "step": 2188 }, { "epoch": 0.5565549022723661, "grad_norm": 23687.580078125, "learning_rate": 2.4456322744438784e-05, "loss": 4.6809, "step": 2189 }, { "epoch": 0.556809153027173, "grad_norm": 23721.546875, "learning_rate": 2.4434135552132128e-05, "loss": 4.6725, "step": 2190 }, { "epoch": 0.5570634037819799, "grad_norm": 23811.8125, "learning_rate": 2.4411948805739353e-05, "loss": 4.6776, "step": 2191 }, { "epoch": 0.5573176545367869, "grad_norm": 23693.94921875, "learning_rate": 2.438976252274413e-05, "loss": 4.6757, "step": 2192 }, { "epoch": 0.5575719052915938, "grad_norm": 23756.41796875, "learning_rate": 2.4367576720629737e-05, "loss": 4.6709, "step": 2193 }, { "epoch": 0.5578261560464007, "grad_norm": 23525.6171875, "learning_rate": 2.4345391416879084e-05, "loss": 4.6678, "step": 2194 }, { "epoch": 0.5580804068012077, "grad_norm": 23918.93359375, "learning_rate": 2.4323206628974697e-05, "loss": 4.6649, "step": 2195 }, { "epoch": 0.5583346575560146, "grad_norm": 23451.505859375, "learning_rate": 2.4301022374398687e-05, "loss": 4.678, "step": 2196 }, { "epoch": 0.5585889083108215, "grad_norm": 23600.818359375, "learning_rate": 2.4278838670632738e-05, "loss": 4.6644, "step": 2197 }, { "epoch": 0.5588431590656284, "grad_norm": 23666.921875, "learning_rate": 2.4256655535158097e-05, "loss": 4.6787, "step": 2198 }, { "epoch": 0.5590974098204354, "grad_norm": 23449.609375, "learning_rate": 2.423447298545559e-05, "loss": 4.6602, "step": 2199 }, { "epoch": 0.5593516605752423, "grad_norm": 23701.328125, "learning_rate": 2.421229103900556e-05, "loss": 4.6676, "step": 2200 }, { "epoch": 0.5593516605752423, "eval_loss": 9.412834167480469, "eval_runtime": 696.9193, "eval_samples_per_second": 152.078, "eval_steps_per_second": 9.506, "step": 2200 }, { "epoch": 0.5596059113300492, "grad_norm": 23569.12890625, "learning_rate": 2.4190109713287873e-05, "loss": 4.6554, "step": 2201 }, { "epoch": 0.5598601620848562, "grad_norm": 23646.833984375, "learning_rate": 2.416792902578192e-05, "loss": 4.6494, "step": 2202 }, { "epoch": 0.5601144128396631, "grad_norm": 23752.40625, "learning_rate": 2.4145748993966576e-05, "loss": 4.682, "step": 2203 }, { "epoch": 0.56036866359447, "grad_norm": 23682.833984375, "learning_rate": 2.4123569635320205e-05, "loss": 4.6673, "step": 2204 }, { "epoch": 0.560622914349277, "grad_norm": 23659.744140625, "learning_rate": 2.410139096732064e-05, "loss": 4.6584, "step": 2205 }, { "epoch": 0.5608771651040839, "grad_norm": 23427.720703125, "learning_rate": 2.407921300744517e-05, "loss": 4.6631, "step": 2206 }, { "epoch": 0.5611314158588908, "grad_norm": 23747.083984375, "learning_rate": 2.4057035773170515e-05, "loss": 4.6799, "step": 2207 }, { "epoch": 0.5613856666136977, "grad_norm": 23551.533203125, "learning_rate": 2.4034859281972854e-05, "loss": 4.6625, "step": 2208 }, { "epoch": 0.5616399173685047, "grad_norm": 23538.8203125, "learning_rate": 2.4012683551327743e-05, "loss": 4.6762, "step": 2209 }, { "epoch": 0.5618941681233116, "grad_norm": 23587.6953125, "learning_rate": 2.3990508598710153e-05, "loss": 4.6676, "step": 2210 }, { "epoch": 0.5621484188781185, "grad_norm": 23740.1640625, "learning_rate": 2.3968334441594466e-05, "loss": 4.6661, "step": 2211 }, { "epoch": 0.5624026696329255, "grad_norm": 23712.07421875, "learning_rate": 2.3946161097454405e-05, "loss": 4.675, "step": 2212 }, { "epoch": 0.5626569203877324, "grad_norm": 23351.328125, "learning_rate": 2.3923988583763046e-05, "loss": 4.6619, "step": 2213 }, { "epoch": 0.5629111711425393, "grad_norm": 23578.640625, "learning_rate": 2.3901816917992854e-05, "loss": 4.661, "step": 2214 }, { "epoch": 0.5631654218973463, "grad_norm": 23539.26171875, "learning_rate": 2.387964611761558e-05, "loss": 4.661, "step": 2215 }, { "epoch": 0.5634196726521532, "grad_norm": 23750.275390625, "learning_rate": 2.3857476200102316e-05, "loss": 4.685, "step": 2216 }, { "epoch": 0.5636739234069601, "grad_norm": 23637.703125, "learning_rate": 2.3835307182923468e-05, "loss": 4.6693, "step": 2217 }, { "epoch": 0.563928174161767, "grad_norm": 23817.5859375, "learning_rate": 2.3813139083548715e-05, "loss": 4.656, "step": 2218 }, { "epoch": 0.564182424916574, "grad_norm": 23694.33984375, "learning_rate": 2.3790971919446994e-05, "loss": 4.6586, "step": 2219 }, { "epoch": 0.5644366756713809, "grad_norm": 23545.91015625, "learning_rate": 2.3768805708086556e-05, "loss": 4.6573, "step": 2220 }, { "epoch": 0.5646909264261878, "grad_norm": 23494.041015625, "learning_rate": 2.3746640466934868e-05, "loss": 4.656, "step": 2221 }, { "epoch": 0.5649451771809948, "grad_norm": 23604.33203125, "learning_rate": 2.3724476213458622e-05, "loss": 4.6702, "step": 2222 }, { "epoch": 0.5651994279358017, "grad_norm": 23475.966796875, "learning_rate": 2.3702312965123768e-05, "loss": 4.6617, "step": 2223 }, { "epoch": 0.5654536786906086, "grad_norm": 23563.611328125, "learning_rate": 2.3680150739395433e-05, "loss": 4.6688, "step": 2224 }, { "epoch": 0.5657079294454156, "grad_norm": 23558.9921875, "learning_rate": 2.3657989553737943e-05, "loss": 4.6619, "step": 2225 }, { "epoch": 0.5659621802002225, "grad_norm": 23713.333984375, "learning_rate": 2.3635829425614816e-05, "loss": 4.6547, "step": 2226 }, { "epoch": 0.5662164309550294, "grad_norm": 23621.0625, "learning_rate": 2.3613670372488737e-05, "loss": 4.6585, "step": 2227 }, { "epoch": 0.5664706817098363, "grad_norm": 23559.63671875, "learning_rate": 2.3591512411821533e-05, "loss": 4.6608, "step": 2228 }, { "epoch": 0.5667249324646433, "grad_norm": 23587.990234375, "learning_rate": 2.3569355561074162e-05, "loss": 4.6704, "step": 2229 }, { "epoch": 0.5669791832194502, "grad_norm": 23695.6875, "learning_rate": 2.3547199837706733e-05, "loss": 4.6537, "step": 2230 }, { "epoch": 0.5672334339742571, "grad_norm": 23656.74609375, "learning_rate": 2.3525045259178457e-05, "loss": 4.6547, "step": 2231 }, { "epoch": 0.5674876847290641, "grad_norm": 23655.390625, "learning_rate": 2.3502891842947623e-05, "loss": 4.6549, "step": 2232 }, { "epoch": 0.567741935483871, "grad_norm": 23649.732421875, "learning_rate": 2.348073960647164e-05, "loss": 4.6609, "step": 2233 }, { "epoch": 0.5679961862386779, "grad_norm": 23770.0625, "learning_rate": 2.3458588567206956e-05, "loss": 4.6475, "step": 2234 }, { "epoch": 0.5682504369934849, "grad_norm": 24422.041015625, "learning_rate": 2.343643874260908e-05, "loss": 4.6458, "step": 2235 }, { "epoch": 0.5685046877482918, "grad_norm": 23832.166015625, "learning_rate": 2.3414290150132588e-05, "loss": 4.6625, "step": 2236 }, { "epoch": 0.5687589385030987, "grad_norm": 23755.197265625, "learning_rate": 2.3392142807231066e-05, "loss": 4.6631, "step": 2237 }, { "epoch": 0.5690131892579056, "grad_norm": 23709.3046875, "learning_rate": 2.33699967313571e-05, "loss": 4.6583, "step": 2238 }, { "epoch": 0.5692674400127126, "grad_norm": 23829.67578125, "learning_rate": 2.3347851939962317e-05, "loss": 4.6642, "step": 2239 }, { "epoch": 0.5695216907675195, "grad_norm": 23508.193359375, "learning_rate": 2.3325708450497297e-05, "loss": 4.6493, "step": 2240 }, { "epoch": 0.5697759415223264, "grad_norm": 23656.46484375, "learning_rate": 2.3303566280411604e-05, "loss": 4.6611, "step": 2241 }, { "epoch": 0.5700301922771334, "grad_norm": 23611.6328125, "learning_rate": 2.328142544715378e-05, "loss": 4.659, "step": 2242 }, { "epoch": 0.5702844430319403, "grad_norm": 23784.775390625, "learning_rate": 2.325928596817129e-05, "loss": 4.647, "step": 2243 }, { "epoch": 0.5705386937867472, "grad_norm": 23664.802734375, "learning_rate": 2.323714786091053e-05, "loss": 4.6583, "step": 2244 }, { "epoch": 0.5707929445415542, "grad_norm": 23850.833984375, "learning_rate": 2.3215011142816843e-05, "loss": 4.6484, "step": 2245 }, { "epoch": 0.571047195296361, "grad_norm": 23581.072265625, "learning_rate": 2.3192875831334453e-05, "loss": 4.6491, "step": 2246 }, { "epoch": 0.571301446051168, "grad_norm": 23552.564453125, "learning_rate": 2.3170741943906476e-05, "loss": 4.6525, "step": 2247 }, { "epoch": 0.5715556968059748, "grad_norm": 23436.001953125, "learning_rate": 2.3148609497974927e-05, "loss": 4.6426, "step": 2248 }, { "epoch": 0.5718099475607819, "grad_norm": 23562.5, "learning_rate": 2.312647851098066e-05, "loss": 4.6552, "step": 2249 }, { "epoch": 0.5720641983155887, "grad_norm": 23684.80859375, "learning_rate": 2.3104349000363383e-05, "loss": 4.656, "step": 2250 }, { "epoch": 0.5723184490703956, "grad_norm": 23678.734375, "learning_rate": 2.308222098356166e-05, "loss": 4.6475, "step": 2251 }, { "epoch": 0.5725726998252026, "grad_norm": 23693.77734375, "learning_rate": 2.306009447801287e-05, "loss": 4.6478, "step": 2252 }, { "epoch": 0.5728269505800095, "grad_norm": 23781.884765625, "learning_rate": 2.3037969501153173e-05, "loss": 4.6622, "step": 2253 }, { "epoch": 0.5730812013348164, "grad_norm": 23766.623046875, "learning_rate": 2.3015846070417572e-05, "loss": 4.65, "step": 2254 }, { "epoch": 0.5733354520896234, "grad_norm": 23729.931640625, "learning_rate": 2.2993724203239815e-05, "loss": 4.6452, "step": 2255 }, { "epoch": 0.5735897028444303, "grad_norm": 23766.7109375, "learning_rate": 2.297160391705242e-05, "loss": 4.6533, "step": 2256 }, { "epoch": 0.5738439535992372, "grad_norm": 23791.107421875, "learning_rate": 2.294948522928669e-05, "loss": 4.649, "step": 2257 }, { "epoch": 0.5740982043540441, "grad_norm": 23727.583984375, "learning_rate": 2.292736815737264e-05, "loss": 4.6383, "step": 2258 }, { "epoch": 0.5743524551088511, "grad_norm": 23779.15234375, "learning_rate": 2.2905252718739017e-05, "loss": 4.6527, "step": 2259 }, { "epoch": 0.574606705863658, "grad_norm": 23620.60546875, "learning_rate": 2.2883138930813276e-05, "loss": 4.6423, "step": 2260 }, { "epoch": 0.5748609566184649, "grad_norm": 23726.921875, "learning_rate": 2.286102681102159e-05, "loss": 4.6445, "step": 2261 }, { "epoch": 0.5751152073732719, "grad_norm": 23875.283203125, "learning_rate": 2.2838916376788806e-05, "loss": 4.6501, "step": 2262 }, { "epoch": 0.5753694581280788, "grad_norm": 23707.943359375, "learning_rate": 2.281680764553843e-05, "loss": 4.6516, "step": 2263 }, { "epoch": 0.5756237088828857, "grad_norm": 23859.169921875, "learning_rate": 2.279470063469266e-05, "loss": 4.6586, "step": 2264 }, { "epoch": 0.5758779596376927, "grad_norm": 23808.572265625, "learning_rate": 2.27725953616723e-05, "loss": 4.6685, "step": 2265 }, { "epoch": 0.5761322103924996, "grad_norm": 23589.271484375, "learning_rate": 2.2750491843896795e-05, "loss": 4.6517, "step": 2266 }, { "epoch": 0.5763864611473065, "grad_norm": 23687.83203125, "learning_rate": 2.272839009878423e-05, "loss": 4.6475, "step": 2267 }, { "epoch": 0.5766407119021134, "grad_norm": 23941.404296875, "learning_rate": 2.2706290143751275e-05, "loss": 4.656, "step": 2268 }, { "epoch": 0.5768949626569204, "grad_norm": 23688.3046875, "learning_rate": 2.2684191996213173e-05, "loss": 4.6569, "step": 2269 }, { "epoch": 0.5771492134117273, "grad_norm": 23800.994140625, "learning_rate": 2.266209567358379e-05, "loss": 4.6521, "step": 2270 }, { "epoch": 0.5774034641665342, "grad_norm": 23917.234375, "learning_rate": 2.26400011932755e-05, "loss": 4.6535, "step": 2271 }, { "epoch": 0.5776577149213412, "grad_norm": 23950.013671875, "learning_rate": 2.2617908572699255e-05, "loss": 4.6477, "step": 2272 }, { "epoch": 0.5779119656761481, "grad_norm": 23824.01171875, "learning_rate": 2.259581782926454e-05, "loss": 4.6452, "step": 2273 }, { "epoch": 0.578166216430955, "grad_norm": 23571.326171875, "learning_rate": 2.257372898037936e-05, "loss": 4.6371, "step": 2274 }, { "epoch": 0.578420467185762, "grad_norm": 23664.302734375, "learning_rate": 2.2551642043450206e-05, "loss": 4.6404, "step": 2275 }, { "epoch": 0.5786747179405689, "grad_norm": 23698.400390625, "learning_rate": 2.2529557035882106e-05, "loss": 4.6437, "step": 2276 }, { "epoch": 0.5789289686953758, "grad_norm": 24018.16796875, "learning_rate": 2.2507473975078525e-05, "loss": 4.6531, "step": 2277 }, { "epoch": 0.5791832194501827, "grad_norm": 23694.31640625, "learning_rate": 2.2485392878441408e-05, "loss": 4.6532, "step": 2278 }, { "epoch": 0.5794374702049897, "grad_norm": 31096.650390625, "learning_rate": 2.2463313763371172e-05, "loss": 4.6481, "step": 2279 }, { "epoch": 0.5796917209597966, "grad_norm": 24021.9453125, "learning_rate": 2.2441236647266643e-05, "loss": 4.6544, "step": 2280 }, { "epoch": 0.5799459717146035, "grad_norm": 23882.1328125, "learning_rate": 2.241916154752508e-05, "loss": 4.6595, "step": 2281 }, { "epoch": 0.5802002224694105, "grad_norm": 24133.59765625, "learning_rate": 2.239708848154217e-05, "loss": 4.6477, "step": 2282 }, { "epoch": 0.5804544732242174, "grad_norm": 23939.361328125, "learning_rate": 2.2375017466711974e-05, "loss": 4.6486, "step": 2283 }, { "epoch": 0.5807087239790243, "grad_norm": 24304.78125, "learning_rate": 2.2352948520426952e-05, "loss": 4.6377, "step": 2284 }, { "epoch": 0.5809629747338313, "grad_norm": 24256.896484375, "learning_rate": 2.233088166007793e-05, "loss": 4.634, "step": 2285 }, { "epoch": 0.5812172254886382, "grad_norm": 23963.439453125, "learning_rate": 2.2308816903054093e-05, "loss": 4.6451, "step": 2286 }, { "epoch": 0.5814714762434451, "grad_norm": 23782.306640625, "learning_rate": 2.228675426674295e-05, "loss": 4.6485, "step": 2287 }, { "epoch": 0.581725726998252, "grad_norm": 24005.71484375, "learning_rate": 2.2264693768530365e-05, "loss": 4.6496, "step": 2288 }, { "epoch": 0.581979977753059, "grad_norm": 23960.724609375, "learning_rate": 2.224263542580051e-05, "loss": 4.6512, "step": 2289 }, { "epoch": 0.5822342285078659, "grad_norm": 23995.58203125, "learning_rate": 2.222057925593585e-05, "loss": 4.6439, "step": 2290 }, { "epoch": 0.5824884792626728, "grad_norm": 23967.404296875, "learning_rate": 2.2198525276317132e-05, "loss": 4.6247, "step": 2291 }, { "epoch": 0.5827427300174798, "grad_norm": 23795.951171875, "learning_rate": 2.2176473504323403e-05, "loss": 4.6459, "step": 2292 }, { "epoch": 0.5829969807722867, "grad_norm": 23767.240234375, "learning_rate": 2.2154423957331943e-05, "loss": 4.6333, "step": 2293 }, { "epoch": 0.5832512315270936, "grad_norm": 23764.513671875, "learning_rate": 2.2132376652718293e-05, "loss": 4.6286, "step": 2294 }, { "epoch": 0.5835054822819006, "grad_norm": 24018.6171875, "learning_rate": 2.2110331607856232e-05, "loss": 4.6403, "step": 2295 }, { "epoch": 0.5837597330367075, "grad_norm": 23816.189453125, "learning_rate": 2.2088288840117747e-05, "loss": 4.6459, "step": 2296 }, { "epoch": 0.5840139837915144, "grad_norm": 23951.2734375, "learning_rate": 2.2066248366873016e-05, "loss": 4.6483, "step": 2297 }, { "epoch": 0.5842682345463213, "grad_norm": 23919.416015625, "learning_rate": 2.204421020549045e-05, "loss": 4.6433, "step": 2298 }, { "epoch": 0.5845224853011283, "grad_norm": 23633.701171875, "learning_rate": 2.202217437333661e-05, "loss": 4.629, "step": 2299 }, { "epoch": 0.5847767360559352, "grad_norm": 23658.61328125, "learning_rate": 2.2000140887776217e-05, "loss": 4.6164, "step": 2300 }, { "epoch": 0.5847767360559352, "eval_loss": 9.34914493560791, "eval_runtime": 696.6936, "eval_samples_per_second": 152.127, "eval_steps_per_second": 9.509, "step": 2300 }, { "epoch": 0.5850309868107421, "grad_norm": 23826.490234375, "learning_rate": 2.1978109766172163e-05, "loss": 4.6445, "step": 2301 }, { "epoch": 0.5852852375655491, "grad_norm": 24745.111328125, "learning_rate": 2.195608102588546e-05, "loss": 4.656, "step": 2302 }, { "epoch": 0.585539488320356, "grad_norm": 24077.908203125, "learning_rate": 2.193405468427525e-05, "loss": 4.625, "step": 2303 }, { "epoch": 0.5857937390751629, "grad_norm": 24552.796875, "learning_rate": 2.1912030758698787e-05, "loss": 4.6509, "step": 2304 }, { "epoch": 0.5860479898299699, "grad_norm": 23921.84765625, "learning_rate": 2.1890009266511426e-05, "loss": 4.6403, "step": 2305 }, { "epoch": 0.5863022405847768, "grad_norm": 24942.193359375, "learning_rate": 2.1867990225066575e-05, "loss": 4.6431, "step": 2306 }, { "epoch": 0.5865564913395837, "grad_norm": 23944.13671875, "learning_rate": 2.184597365171576e-05, "loss": 4.6406, "step": 2307 }, { "epoch": 0.5868107420943905, "grad_norm": 24154.078125, "learning_rate": 2.1823959563808516e-05, "loss": 4.6455, "step": 2308 }, { "epoch": 0.5870649928491976, "grad_norm": 23738.748046875, "learning_rate": 2.1801947978692442e-05, "loss": 4.6272, "step": 2309 }, { "epoch": 0.5873192436040044, "grad_norm": 24209.685546875, "learning_rate": 2.1779938913713175e-05, "loss": 4.6309, "step": 2310 }, { "epoch": 0.5875734943588113, "grad_norm": 24121.2421875, "learning_rate": 2.175793238621434e-05, "loss": 4.6309, "step": 2311 }, { "epoch": 0.5878277451136183, "grad_norm": 23901.662109375, "learning_rate": 2.173592841353757e-05, "loss": 4.6364, "step": 2312 }, { "epoch": 0.5880819958684252, "grad_norm": 24193.826171875, "learning_rate": 2.1713927013022504e-05, "loss": 4.6363, "step": 2313 }, { "epoch": 0.5883362466232321, "grad_norm": 23835.875, "learning_rate": 2.1691928202006727e-05, "loss": 4.6468, "step": 2314 }, { "epoch": 0.5885904973780391, "grad_norm": 24479.708984375, "learning_rate": 2.1669931997825797e-05, "loss": 4.6484, "step": 2315 }, { "epoch": 0.588844748132846, "grad_norm": 23919.068359375, "learning_rate": 2.164793841781323e-05, "loss": 4.6271, "step": 2316 }, { "epoch": 0.5890989988876529, "grad_norm": 24080.228515625, "learning_rate": 2.162594747930045e-05, "loss": 4.6392, "step": 2317 }, { "epoch": 0.5893532496424598, "grad_norm": 23908.3125, "learning_rate": 2.1603959199616802e-05, "loss": 4.6322, "step": 2318 }, { "epoch": 0.5896075003972668, "grad_norm": 23747.232421875, "learning_rate": 2.1581973596089557e-05, "loss": 4.6348, "step": 2319 }, { "epoch": 0.5898617511520737, "grad_norm": 24052.70703125, "learning_rate": 2.1559990686043873e-05, "loss": 4.6429, "step": 2320 }, { "epoch": 0.5901160019068806, "grad_norm": 23932.85546875, "learning_rate": 2.1538010486802747e-05, "loss": 4.6391, "step": 2321 }, { "epoch": 0.5903702526616876, "grad_norm": 23887.466796875, "learning_rate": 2.15160330156871e-05, "loss": 4.6404, "step": 2322 }, { "epoch": 0.5906245034164945, "grad_norm": 23930.689453125, "learning_rate": 2.149405829001566e-05, "loss": 4.6201, "step": 2323 }, { "epoch": 0.5908787541713014, "grad_norm": 23932.43359375, "learning_rate": 2.1472086327105e-05, "loss": 4.6257, "step": 2324 }, { "epoch": 0.5911330049261084, "grad_norm": 24049.189453125, "learning_rate": 2.1450117144269518e-05, "loss": 4.6346, "step": 2325 }, { "epoch": 0.5913872556809153, "grad_norm": 23789.107421875, "learning_rate": 2.142815075882144e-05, "loss": 4.6313, "step": 2326 }, { "epoch": 0.5916415064357222, "grad_norm": 24165.654296875, "learning_rate": 2.140618718807076e-05, "loss": 4.6325, "step": 2327 }, { "epoch": 0.5918957571905291, "grad_norm": 23930.60546875, "learning_rate": 2.1384226449325258e-05, "loss": 4.623, "step": 2328 }, { "epoch": 0.5921500079453361, "grad_norm": 30069.75, "learning_rate": 2.13622685598905e-05, "loss": 4.6395, "step": 2329 }, { "epoch": 0.592404258700143, "grad_norm": 24049.90234375, "learning_rate": 2.1340313537069794e-05, "loss": 4.6289, "step": 2330 }, { "epoch": 0.5926585094549499, "grad_norm": 24269.625, "learning_rate": 2.1318361398164184e-05, "loss": 4.6249, "step": 2331 }, { "epoch": 0.5929127602097569, "grad_norm": 23968.32421875, "learning_rate": 2.1296412160472463e-05, "loss": 4.6224, "step": 2332 }, { "epoch": 0.5931670109645638, "grad_norm": 24813.8671875, "learning_rate": 2.1274465841291113e-05, "loss": 4.6206, "step": 2333 }, { "epoch": 0.5934212617193707, "grad_norm": 23804.875, "learning_rate": 2.1252522457914316e-05, "loss": 4.6267, "step": 2334 }, { "epoch": 0.5936755124741776, "grad_norm": 24078.123046875, "learning_rate": 2.1230582027633966e-05, "loss": 4.6244, "step": 2335 }, { "epoch": 0.5939297632289846, "grad_norm": 24245.869140625, "learning_rate": 2.1208644567739617e-05, "loss": 4.626, "step": 2336 }, { "epoch": 0.5941840139837915, "grad_norm": 24020.359375, "learning_rate": 2.1186710095518464e-05, "loss": 4.6199, "step": 2337 }, { "epoch": 0.5944382647385984, "grad_norm": 24579.013671875, "learning_rate": 2.1164778628255387e-05, "loss": 4.6299, "step": 2338 }, { "epoch": 0.5946925154934054, "grad_norm": 23744.48828125, "learning_rate": 2.1142850183232848e-05, "loss": 4.6126, "step": 2339 }, { "epoch": 0.5949467662482123, "grad_norm": 24056.154296875, "learning_rate": 2.112092477773097e-05, "loss": 4.6177, "step": 2340 }, { "epoch": 0.5952010170030192, "grad_norm": 24197.67578125, "learning_rate": 2.109900242902746e-05, "loss": 4.6194, "step": 2341 }, { "epoch": 0.5954552677578262, "grad_norm": 23995.0859375, "learning_rate": 2.1077083154397632e-05, "loss": 4.6236, "step": 2342 }, { "epoch": 0.5957095185126331, "grad_norm": 24290.8359375, "learning_rate": 2.1055166971114345e-05, "loss": 4.6327, "step": 2343 }, { "epoch": 0.59596376926744, "grad_norm": 23867.078125, "learning_rate": 2.1033253896448062e-05, "loss": 4.6153, "step": 2344 }, { "epoch": 0.5962180200222469, "grad_norm": 24057.21484375, "learning_rate": 2.101134394766677e-05, "loss": 4.6227, "step": 2345 }, { "epoch": 0.5964722707770539, "grad_norm": 24038.43359375, "learning_rate": 2.0989437142035998e-05, "loss": 4.6253, "step": 2346 }, { "epoch": 0.5967265215318608, "grad_norm": 24079.84375, "learning_rate": 2.0967533496818812e-05, "loss": 4.6344, "step": 2347 }, { "epoch": 0.5969807722866677, "grad_norm": 24216.71875, "learning_rate": 2.0945633029275768e-05, "loss": 4.6171, "step": 2348 }, { "epoch": 0.5972350230414747, "grad_norm": 23881.865234375, "learning_rate": 2.092373575666492e-05, "loss": 4.6144, "step": 2349 }, { "epoch": 0.5974892737962816, "grad_norm": 24141.783203125, "learning_rate": 2.0901841696241824e-05, "loss": 4.6191, "step": 2350 }, { "epoch": 0.5977435245510885, "grad_norm": 23909.884765625, "learning_rate": 2.0879950865259483e-05, "loss": 4.6283, "step": 2351 }, { "epoch": 0.5979977753058955, "grad_norm": 24007.158203125, "learning_rate": 2.085806328096836e-05, "loss": 4.6127, "step": 2352 }, { "epoch": 0.5982520260607024, "grad_norm": 23884.1640625, "learning_rate": 2.0836178960616374e-05, "loss": 4.6076, "step": 2353 }, { "epoch": 0.5985062768155093, "grad_norm": 23840.015625, "learning_rate": 2.081429792144886e-05, "loss": 4.6197, "step": 2354 }, { "epoch": 0.5987605275703162, "grad_norm": 23980.75390625, "learning_rate": 2.0792420180708556e-05, "loss": 4.629, "step": 2355 }, { "epoch": 0.5990147783251232, "grad_norm": 23877.373046875, "learning_rate": 2.0770545755635612e-05, "loss": 4.6283, "step": 2356 }, { "epoch": 0.5992690290799301, "grad_norm": 24055.482421875, "learning_rate": 2.0748674663467584e-05, "loss": 4.6246, "step": 2357 }, { "epoch": 0.599523279834737, "grad_norm": 23922.23046875, "learning_rate": 2.0726806921439377e-05, "loss": 4.6318, "step": 2358 }, { "epoch": 0.599777530589544, "grad_norm": 24015.642578125, "learning_rate": 2.0704942546783244e-05, "loss": 4.6179, "step": 2359 }, { "epoch": 0.6000317813443509, "grad_norm": 24127.0625, "learning_rate": 2.0683081556728834e-05, "loss": 4.6268, "step": 2360 }, { "epoch": 0.6002860320991578, "grad_norm": 23787.287109375, "learning_rate": 2.0661223968503073e-05, "loss": 4.6286, "step": 2361 }, { "epoch": 0.6005402828539648, "grad_norm": 24056.884765625, "learning_rate": 2.0639369799330236e-05, "loss": 4.6387, "step": 2362 }, { "epoch": 0.6007945336087717, "grad_norm": 23954.59765625, "learning_rate": 2.061751906643192e-05, "loss": 4.6202, "step": 2363 }, { "epoch": 0.6010487843635786, "grad_norm": 23982.9140625, "learning_rate": 2.059567178702697e-05, "loss": 4.6083, "step": 2364 }, { "epoch": 0.6013030351183855, "grad_norm": 23947.17578125, "learning_rate": 2.0573827978331528e-05, "loss": 4.6392, "step": 2365 }, { "epoch": 0.6015572858731925, "grad_norm": 24015.712890625, "learning_rate": 2.0551987657559023e-05, "loss": 4.6405, "step": 2366 }, { "epoch": 0.6018115366279994, "grad_norm": 24191.13671875, "learning_rate": 2.053015084192012e-05, "loss": 4.6193, "step": 2367 }, { "epoch": 0.6020657873828063, "grad_norm": 23959.859375, "learning_rate": 2.0508317548622703e-05, "loss": 4.6161, "step": 2368 }, { "epoch": 0.6023200381376133, "grad_norm": 23834.861328125, "learning_rate": 2.0486487794871915e-05, "loss": 4.6132, "step": 2369 }, { "epoch": 0.6025742888924202, "grad_norm": 23883.919921875, "learning_rate": 2.046466159787008e-05, "loss": 4.6239, "step": 2370 }, { "epoch": 0.602828539647227, "grad_norm": 24051.052734375, "learning_rate": 2.044283897481673e-05, "loss": 4.6122, "step": 2371 }, { "epoch": 0.603082790402034, "grad_norm": 23979.1796875, "learning_rate": 2.0421019942908588e-05, "loss": 4.616, "step": 2372 }, { "epoch": 0.603337041156841, "grad_norm": 23985.74609375, "learning_rate": 2.039920451933955e-05, "loss": 4.61, "step": 2373 }, { "epoch": 0.6035912919116478, "grad_norm": 24028.240234375, "learning_rate": 2.0377392721300634e-05, "loss": 4.6134, "step": 2374 }, { "epoch": 0.6038455426664547, "grad_norm": 24225.2109375, "learning_rate": 2.0355584565980055e-05, "loss": 4.6203, "step": 2375 }, { "epoch": 0.6040997934212617, "grad_norm": 24082.880859375, "learning_rate": 2.033378007056311e-05, "loss": 4.6098, "step": 2376 }, { "epoch": 0.6043540441760686, "grad_norm": 24132.091796875, "learning_rate": 2.0311979252232233e-05, "loss": 4.6122, "step": 2377 }, { "epoch": 0.6046082949308755, "grad_norm": 23865.46484375, "learning_rate": 2.029018212816697e-05, "loss": 4.61, "step": 2378 }, { "epoch": 0.6048625456856825, "grad_norm": 24046.63671875, "learning_rate": 2.026838871554394e-05, "loss": 4.6025, "step": 2379 }, { "epoch": 0.6051167964404894, "grad_norm": 24040.93359375, "learning_rate": 2.0246599031536826e-05, "loss": 4.6109, "step": 2380 }, { "epoch": 0.6053710471952963, "grad_norm": 24000.0078125, "learning_rate": 2.022481309331641e-05, "loss": 4.6087, "step": 2381 }, { "epoch": 0.6056252979501033, "grad_norm": 23814.85546875, "learning_rate": 2.0203030918050485e-05, "loss": 4.6103, "step": 2382 }, { "epoch": 0.6058795487049102, "grad_norm": 24202.513671875, "learning_rate": 2.0181252522903896e-05, "loss": 4.6042, "step": 2383 }, { "epoch": 0.6061337994597171, "grad_norm": 23995.8203125, "learning_rate": 2.015947792503852e-05, "loss": 4.6267, "step": 2384 }, { "epoch": 0.606388050214524, "grad_norm": 24072.720703125, "learning_rate": 2.013770714161322e-05, "loss": 4.6166, "step": 2385 }, { "epoch": 0.606642300969331, "grad_norm": 24216.36328125, "learning_rate": 2.0115940189783862e-05, "loss": 4.6323, "step": 2386 }, { "epoch": 0.6068965517241379, "grad_norm": 23951.4140625, "learning_rate": 2.0094177086703284e-05, "loss": 4.6179, "step": 2387 }, { "epoch": 0.6071508024789448, "grad_norm": 24178.369140625, "learning_rate": 2.0072417849521318e-05, "loss": 4.6158, "step": 2388 }, { "epoch": 0.6074050532337518, "grad_norm": 24516.453125, "learning_rate": 2.0050662495384726e-05, "loss": 4.611, "step": 2389 }, { "epoch": 0.6076593039885587, "grad_norm": 23935.572265625, "learning_rate": 2.0028911041437198e-05, "loss": 4.6106, "step": 2390 }, { "epoch": 0.6079135547433656, "grad_norm": 24013.7578125, "learning_rate": 2.0007163504819394e-05, "loss": 4.603, "step": 2391 }, { "epoch": 0.6081678054981726, "grad_norm": 23881.341796875, "learning_rate": 1.9985419902668845e-05, "loss": 4.6108, "step": 2392 }, { "epoch": 0.6084220562529795, "grad_norm": 24132.826171875, "learning_rate": 1.9963680252119992e-05, "loss": 4.6052, "step": 2393 }, { "epoch": 0.6086763070077864, "grad_norm": 23897.7109375, "learning_rate": 1.9941944570304193e-05, "loss": 4.6072, "step": 2394 }, { "epoch": 0.6089305577625933, "grad_norm": 24085.8671875, "learning_rate": 1.9920212874349637e-05, "loss": 4.6108, "step": 2395 }, { "epoch": 0.6091848085174003, "grad_norm": 23989.02734375, "learning_rate": 1.9898485181381377e-05, "loss": 4.6097, "step": 2396 }, { "epoch": 0.6094390592722072, "grad_norm": 23890.033203125, "learning_rate": 1.9876761508521342e-05, "loss": 4.608, "step": 2397 }, { "epoch": 0.6096933100270141, "grad_norm": 24097.001953125, "learning_rate": 1.9855041872888273e-05, "loss": 4.6056, "step": 2398 }, { "epoch": 0.6099475607818211, "grad_norm": 23937.810546875, "learning_rate": 1.9833326291597712e-05, "loss": 4.6072, "step": 2399 }, { "epoch": 0.610201811536628, "grad_norm": 24209.03515625, "learning_rate": 1.981161478176205e-05, "loss": 4.5977, "step": 2400 }, { "epoch": 0.610201811536628, "eval_loss": 9.29190731048584, "eval_runtime": 696.0967, "eval_samples_per_second": 152.258, "eval_steps_per_second": 9.517, "step": 2400 }, { "epoch": 0.6104560622914349, "grad_norm": 24077.4140625, "learning_rate": 1.978990736049043e-05, "loss": 4.6121, "step": 2401 }, { "epoch": 0.6107103130462419, "grad_norm": 24136.462890625, "learning_rate": 1.9768204044888778e-05, "loss": 4.6078, "step": 2402 }, { "epoch": 0.6109645638010488, "grad_norm": 23773.525390625, "learning_rate": 1.974650485205981e-05, "loss": 4.5966, "step": 2403 }, { "epoch": 0.6112188145558557, "grad_norm": 23942.267578125, "learning_rate": 1.9724809799102974e-05, "loss": 4.6134, "step": 2404 }, { "epoch": 0.6114730653106626, "grad_norm": 24046.46484375, "learning_rate": 1.970311890311445e-05, "loss": 4.6057, "step": 2405 }, { "epoch": 0.6117273160654696, "grad_norm": 24032.455078125, "learning_rate": 1.968143218118717e-05, "loss": 4.6163, "step": 2406 }, { "epoch": 0.6119815668202765, "grad_norm": 24019.90234375, "learning_rate": 1.9659749650410737e-05, "loss": 4.6133, "step": 2407 }, { "epoch": 0.6122358175750834, "grad_norm": 23899.46484375, "learning_rate": 1.9638071327871483e-05, "loss": 4.6102, "step": 2408 }, { "epoch": 0.6124900683298904, "grad_norm": 23958.88671875, "learning_rate": 1.9616397230652407e-05, "loss": 4.6039, "step": 2409 }, { "epoch": 0.6127443190846973, "grad_norm": 24100.830078125, "learning_rate": 1.9594727375833195e-05, "loss": 4.6165, "step": 2410 }, { "epoch": 0.6129985698395042, "grad_norm": 24041.77734375, "learning_rate": 1.957306178049016e-05, "loss": 4.6006, "step": 2411 }, { "epoch": 0.6132528205943112, "grad_norm": 24166.205078125, "learning_rate": 1.9551400461696308e-05, "loss": 4.5937, "step": 2412 }, { "epoch": 0.6135070713491181, "grad_norm": 24076.693359375, "learning_rate": 1.9529743436521212e-05, "loss": 4.618, "step": 2413 }, { "epoch": 0.613761322103925, "grad_norm": 24092.552734375, "learning_rate": 1.9508090722031104e-05, "loss": 4.5888, "step": 2414 }, { "epoch": 0.6140155728587319, "grad_norm": 23962.7265625, "learning_rate": 1.9486442335288826e-05, "loss": 4.6182, "step": 2415 }, { "epoch": 0.6142698236135389, "grad_norm": 24037.544921875, "learning_rate": 1.946479829335378e-05, "loss": 4.6133, "step": 2416 }, { "epoch": 0.6145240743683458, "grad_norm": 23970.84765625, "learning_rate": 1.9443158613281953e-05, "loss": 4.6103, "step": 2417 }, { "epoch": 0.6147783251231527, "grad_norm": 24136.70703125, "learning_rate": 1.94215233121259e-05, "loss": 4.6152, "step": 2418 }, { "epoch": 0.6150325758779597, "grad_norm": 23995.876953125, "learning_rate": 1.9399892406934728e-05, "loss": 4.5922, "step": 2419 }, { "epoch": 0.6152868266327666, "grad_norm": 24161.4453125, "learning_rate": 1.9378265914754083e-05, "loss": 4.6051, "step": 2420 }, { "epoch": 0.6155410773875735, "grad_norm": 24075.326171875, "learning_rate": 1.9356643852626104e-05, "loss": 4.6037, "step": 2421 }, { "epoch": 0.6157953281423805, "grad_norm": 24321.203125, "learning_rate": 1.9335026237589487e-05, "loss": 4.6047, "step": 2422 }, { "epoch": 0.6160495788971874, "grad_norm": 24204.9921875, "learning_rate": 1.931341308667938e-05, "loss": 4.5921, "step": 2423 }, { "epoch": 0.6163038296519943, "grad_norm": 24214.46875, "learning_rate": 1.9291804416927436e-05, "loss": 4.6085, "step": 2424 }, { "epoch": 0.6165580804068012, "grad_norm": 24089.654296875, "learning_rate": 1.9270200245361783e-05, "loss": 4.5945, "step": 2425 }, { "epoch": 0.6168123311616082, "grad_norm": 24021.005859375, "learning_rate": 1.9248600589006986e-05, "loss": 4.593, "step": 2426 }, { "epoch": 0.6170665819164151, "grad_norm": 24244.95703125, "learning_rate": 1.9227005464884043e-05, "loss": 4.596, "step": 2427 }, { "epoch": 0.617320832671222, "grad_norm": 24159.51171875, "learning_rate": 1.9205414890010426e-05, "loss": 4.6018, "step": 2428 }, { "epoch": 0.617575083426029, "grad_norm": 24049.142578125, "learning_rate": 1.9183828881399968e-05, "loss": 4.6034, "step": 2429 }, { "epoch": 0.6178293341808359, "grad_norm": 24109.869140625, "learning_rate": 1.9162247456062938e-05, "loss": 4.5999, "step": 2430 }, { "epoch": 0.6180835849356427, "grad_norm": 23990.083984375, "learning_rate": 1.914067063100599e-05, "loss": 4.6131, "step": 2431 }, { "epoch": 0.6183378356904498, "grad_norm": 24034.0859375, "learning_rate": 1.911909842323214e-05, "loss": 4.6053, "step": 2432 }, { "epoch": 0.6185920864452567, "grad_norm": 24245.017578125, "learning_rate": 1.9097530849740763e-05, "loss": 4.5911, "step": 2433 }, { "epoch": 0.6188463372000635, "grad_norm": 24006.162109375, "learning_rate": 1.907596792752761e-05, "loss": 4.6051, "step": 2434 }, { "epoch": 0.6191005879548704, "grad_norm": 24283.794921875, "learning_rate": 1.905440967358474e-05, "loss": 4.589, "step": 2435 }, { "epoch": 0.6193548387096774, "grad_norm": 24029.14453125, "learning_rate": 1.903285610490053e-05, "loss": 4.6032, "step": 2436 }, { "epoch": 0.6196090894644843, "grad_norm": 24080.90234375, "learning_rate": 1.9011307238459698e-05, "loss": 4.5921, "step": 2437 }, { "epoch": 0.6198633402192912, "grad_norm": 23884.517578125, "learning_rate": 1.8989763091243214e-05, "loss": 4.5807, "step": 2438 }, { "epoch": 0.6201175909740982, "grad_norm": 24080.244140625, "learning_rate": 1.8968223680228357e-05, "loss": 4.5966, "step": 2439 }, { "epoch": 0.6203718417289051, "grad_norm": 24096.248046875, "learning_rate": 1.8946689022388672e-05, "loss": 4.6143, "step": 2440 }, { "epoch": 0.620626092483712, "grad_norm": 24167.19921875, "learning_rate": 1.892515913469395e-05, "loss": 4.5989, "step": 2441 }, { "epoch": 0.620880343238519, "grad_norm": 24043.50390625, "learning_rate": 1.890363403411022e-05, "loss": 4.6023, "step": 2442 }, { "epoch": 0.6211345939933259, "grad_norm": 24128.74609375, "learning_rate": 1.8882113737599752e-05, "loss": 4.609, "step": 2443 }, { "epoch": 0.6213888447481328, "grad_norm": 24154.384765625, "learning_rate": 1.8860598262121015e-05, "loss": 4.5962, "step": 2444 }, { "epoch": 0.6216430955029397, "grad_norm": 23937.044921875, "learning_rate": 1.883908762462869e-05, "loss": 4.5945, "step": 2445 }, { "epoch": 0.6218973462577467, "grad_norm": 24179.6875, "learning_rate": 1.8817581842073653e-05, "loss": 4.5945, "step": 2446 }, { "epoch": 0.6221515970125536, "grad_norm": 24050.8125, "learning_rate": 1.8796080931402934e-05, "loss": 4.604, "step": 2447 }, { "epoch": 0.6224058477673605, "grad_norm": 23989.33984375, "learning_rate": 1.8774584909559728e-05, "loss": 4.5998, "step": 2448 }, { "epoch": 0.6226600985221675, "grad_norm": 24366.857421875, "learning_rate": 1.8753093793483388e-05, "loss": 4.6063, "step": 2449 }, { "epoch": 0.6229143492769744, "grad_norm": 24138.767578125, "learning_rate": 1.87316076001094e-05, "loss": 4.598, "step": 2450 }, { "epoch": 0.6231686000317813, "grad_norm": 24208.26171875, "learning_rate": 1.8710126346369367e-05, "loss": 4.5969, "step": 2451 }, { "epoch": 0.6234228507865883, "grad_norm": 24129.83203125, "learning_rate": 1.868865004919098e-05, "loss": 4.6048, "step": 2452 }, { "epoch": 0.6236771015413952, "grad_norm": 24142.56640625, "learning_rate": 1.8667178725498074e-05, "loss": 4.6057, "step": 2453 }, { "epoch": 0.6239313522962021, "grad_norm": 24080.140625, "learning_rate": 1.864571239221051e-05, "loss": 4.5938, "step": 2454 }, { "epoch": 0.624185603051009, "grad_norm": 24112.578125, "learning_rate": 1.862425106624425e-05, "loss": 4.6078, "step": 2455 }, { "epoch": 0.624439853805816, "grad_norm": 24273.203125, "learning_rate": 1.8602794764511312e-05, "loss": 4.6271, "step": 2456 }, { "epoch": 0.6246941045606229, "grad_norm": 24075.630859375, "learning_rate": 1.8581343503919726e-05, "loss": 4.5954, "step": 2457 }, { "epoch": 0.6249483553154298, "grad_norm": 24070.806640625, "learning_rate": 1.8559897301373567e-05, "loss": 4.6036, "step": 2458 }, { "epoch": 0.6252026060702368, "grad_norm": 24103.83203125, "learning_rate": 1.8538456173772938e-05, "loss": 4.5907, "step": 2459 }, { "epoch": 0.6254568568250437, "grad_norm": 24324.216796875, "learning_rate": 1.8517020138013912e-05, "loss": 4.5965, "step": 2460 }, { "epoch": 0.6257111075798506, "grad_norm": 24115.67578125, "learning_rate": 1.8495589210988575e-05, "loss": 4.594, "step": 2461 }, { "epoch": 0.6259653583346576, "grad_norm": 24061.4140625, "learning_rate": 1.847416340958499e-05, "loss": 4.5945, "step": 2462 }, { "epoch": 0.6262196090894645, "grad_norm": 24268.869140625, "learning_rate": 1.8452742750687156e-05, "loss": 4.589, "step": 2463 }, { "epoch": 0.6264738598442714, "grad_norm": 24175.830078125, "learning_rate": 1.8431327251175028e-05, "loss": 4.5912, "step": 2464 }, { "epoch": 0.6267281105990783, "grad_norm": 23971.470703125, "learning_rate": 1.840991692792451e-05, "loss": 4.5901, "step": 2465 }, { "epoch": 0.6269823613538853, "grad_norm": 24081.029296875, "learning_rate": 1.8388511797807423e-05, "loss": 4.5883, "step": 2466 }, { "epoch": 0.6272366121086922, "grad_norm": 24787.693359375, "learning_rate": 1.8367111877691473e-05, "loss": 4.5839, "step": 2467 }, { "epoch": 0.6274908628634991, "grad_norm": 24246.01171875, "learning_rate": 1.83457171844403e-05, "loss": 4.5712, "step": 2468 }, { "epoch": 0.6277451136183061, "grad_norm": 24131.01953125, "learning_rate": 1.8324327734913385e-05, "loss": 4.5999, "step": 2469 }, { "epoch": 0.627999364373113, "grad_norm": 24299.576171875, "learning_rate": 1.830294354596609e-05, "loss": 4.6027, "step": 2470 }, { "epoch": 0.6282536151279199, "grad_norm": 24074.599609375, "learning_rate": 1.8281564634449652e-05, "loss": 4.5765, "step": 2471 }, { "epoch": 0.6285078658827269, "grad_norm": 24136.794921875, "learning_rate": 1.826019101721113e-05, "loss": 4.5973, "step": 2472 }, { "epoch": 0.6287621166375338, "grad_norm": 24252.568359375, "learning_rate": 1.8238822711093405e-05, "loss": 4.5895, "step": 2473 }, { "epoch": 0.6290163673923407, "grad_norm": 24157.841796875, "learning_rate": 1.8217459732935194e-05, "loss": 4.5756, "step": 2474 }, { "epoch": 0.6292706181471476, "grad_norm": 24179.923828125, "learning_rate": 1.8196102099570995e-05, "loss": 4.5823, "step": 2475 }, { "epoch": 0.6295248689019546, "grad_norm": 24248.7578125, "learning_rate": 1.81747498278311e-05, "loss": 4.5988, "step": 2476 }, { "epoch": 0.6297791196567615, "grad_norm": 24326.951171875, "learning_rate": 1.815340293454159e-05, "loss": 4.5746, "step": 2477 }, { "epoch": 0.6300333704115684, "grad_norm": 24214.560546875, "learning_rate": 1.8132061436524296e-05, "loss": 4.5948, "step": 2478 }, { "epoch": 0.6302876211663754, "grad_norm": 24183.5625, "learning_rate": 1.8110725350596787e-05, "loss": 4.597, "step": 2479 }, { "epoch": 0.6305418719211823, "grad_norm": 24230.060546875, "learning_rate": 1.808939469357237e-05, "loss": 4.5812, "step": 2480 }, { "epoch": 0.6307961226759892, "grad_norm": 24158.80078125, "learning_rate": 1.8068069482260102e-05, "loss": 4.5957, "step": 2481 }, { "epoch": 0.6310503734307962, "grad_norm": 24142.5546875, "learning_rate": 1.8046749733464723e-05, "loss": 4.5926, "step": 2482 }, { "epoch": 0.6313046241856031, "grad_norm": 24118.13671875, "learning_rate": 1.8025435463986662e-05, "loss": 4.5935, "step": 2483 }, { "epoch": 0.63155887494041, "grad_norm": 24302.646484375, "learning_rate": 1.8004126690622063e-05, "loss": 4.5845, "step": 2484 }, { "epoch": 0.6318131256952169, "grad_norm": 24153.447265625, "learning_rate": 1.7982823430162697e-05, "loss": 4.5869, "step": 2485 }, { "epoch": 0.6320673764500239, "grad_norm": 24060.43359375, "learning_rate": 1.7961525699396025e-05, "loss": 4.5937, "step": 2486 }, { "epoch": 0.6323216272048308, "grad_norm": 24141.41015625, "learning_rate": 1.7940233515105128e-05, "loss": 4.5852, "step": 2487 }, { "epoch": 0.6325758779596377, "grad_norm": 24227.083984375, "learning_rate": 1.7918946894068736e-05, "loss": 4.5877, "step": 2488 }, { "epoch": 0.6328301287144447, "grad_norm": 24059.630859375, "learning_rate": 1.789766585306117e-05, "loss": 4.5876, "step": 2489 }, { "epoch": 0.6330843794692516, "grad_norm": 24174.29296875, "learning_rate": 1.7876390408852385e-05, "loss": 4.599, "step": 2490 }, { "epoch": 0.6333386302240585, "grad_norm": 24081.818359375, "learning_rate": 1.785512057820789e-05, "loss": 4.5783, "step": 2491 }, { "epoch": 0.6335928809788655, "grad_norm": 24321.181640625, "learning_rate": 1.7833856377888796e-05, "loss": 4.592, "step": 2492 }, { "epoch": 0.6338471317336724, "grad_norm": 24287.48828125, "learning_rate": 1.7812597824651783e-05, "loss": 4.6032, "step": 2493 }, { "epoch": 0.6341013824884792, "grad_norm": 24185.6796875, "learning_rate": 1.779134493524906e-05, "loss": 4.5765, "step": 2494 }, { "epoch": 0.6343556332432861, "grad_norm": 24248.607421875, "learning_rate": 1.777009772642837e-05, "loss": 4.5691, "step": 2495 }, { "epoch": 0.6346098839980931, "grad_norm": 24190.3125, "learning_rate": 1.7748856214933006e-05, "loss": 4.5883, "step": 2496 }, { "epoch": 0.6348641347529, "grad_norm": 24106.21875, "learning_rate": 1.772762041750175e-05, "loss": 4.59, "step": 2497 }, { "epoch": 0.6351183855077069, "grad_norm": 24152.291015625, "learning_rate": 1.770639035086888e-05, "loss": 4.5899, "step": 2498 }, { "epoch": 0.635372636262514, "grad_norm": 23959.857421875, "learning_rate": 1.7685166031764178e-05, "loss": 4.5926, "step": 2499 }, { "epoch": 0.6356268870173208, "grad_norm": 24178.6875, "learning_rate": 1.7663947476912886e-05, "loss": 4.5936, "step": 2500 }, { "epoch": 0.6356268870173208, "eval_loss": 9.240402221679688, "eval_runtime": 696.0918, "eval_samples_per_second": 152.259, "eval_steps_per_second": 9.517, "step": 2500 }, { "epoch": 0.6358811377721277, "grad_norm": 24244.623046875, "learning_rate": 1.7642734703035675e-05, "loss": 4.5803, "step": 2501 }, { "epoch": 0.6361353885269347, "grad_norm": 24223.193359375, "learning_rate": 1.7621527726848717e-05, "loss": 4.5943, "step": 2502 }, { "epoch": 0.6363896392817416, "grad_norm": 24211.23828125, "learning_rate": 1.7600326565063576e-05, "loss": 4.5932, "step": 2503 }, { "epoch": 0.6366438900365485, "grad_norm": 24327.18359375, "learning_rate": 1.7579131234387238e-05, "loss": 4.581, "step": 2504 }, { "epoch": 0.6368981407913554, "grad_norm": 24221.48828125, "learning_rate": 1.7557941751522107e-05, "loss": 4.578, "step": 2505 }, { "epoch": 0.6371523915461624, "grad_norm": 24290.9765625, "learning_rate": 1.7536758133165963e-05, "loss": 4.5766, "step": 2506 }, { "epoch": 0.6374066423009693, "grad_norm": 24239.48046875, "learning_rate": 1.7515580396011976e-05, "loss": 4.588, "step": 2507 }, { "epoch": 0.6376608930557762, "grad_norm": 24361.490234375, "learning_rate": 1.7494408556748683e-05, "loss": 4.589, "step": 2508 }, { "epoch": 0.6379151438105832, "grad_norm": 24069.62890625, "learning_rate": 1.7473242632059972e-05, "loss": 4.5666, "step": 2509 }, { "epoch": 0.6381693945653901, "grad_norm": 24544.166015625, "learning_rate": 1.7452082638625057e-05, "loss": 4.6005, "step": 2510 }, { "epoch": 0.638423645320197, "grad_norm": 24267.517578125, "learning_rate": 1.7430928593118483e-05, "loss": 4.5908, "step": 2511 }, { "epoch": 0.638677896075004, "grad_norm": 24144.861328125, "learning_rate": 1.7409780512210126e-05, "loss": 4.5816, "step": 2512 }, { "epoch": 0.6389321468298109, "grad_norm": 24160.880859375, "learning_rate": 1.738863841256515e-05, "loss": 4.5771, "step": 2513 }, { "epoch": 0.6391863975846178, "grad_norm": 24155.625, "learning_rate": 1.7367502310843986e-05, "loss": 4.5903, "step": 2514 }, { "epoch": 0.6394406483394247, "grad_norm": 24405.29296875, "learning_rate": 1.7346372223702378e-05, "loss": 4.5811, "step": 2515 }, { "epoch": 0.6396948990942317, "grad_norm": 24368.484375, "learning_rate": 1.7325248167791294e-05, "loss": 4.5793, "step": 2516 }, { "epoch": 0.6399491498490386, "grad_norm": 24171.306640625, "learning_rate": 1.730413015975697e-05, "loss": 4.5761, "step": 2517 }, { "epoch": 0.6402034006038455, "grad_norm": 24201.806640625, "learning_rate": 1.7283018216240874e-05, "loss": 4.5724, "step": 2518 }, { "epoch": 0.6404576513586525, "grad_norm": 24133.904296875, "learning_rate": 1.726191235387969e-05, "loss": 4.5737, "step": 2519 }, { "epoch": 0.6407119021134594, "grad_norm": 24377.591796875, "learning_rate": 1.72408125893053e-05, "loss": 4.5854, "step": 2520 }, { "epoch": 0.6409661528682663, "grad_norm": 24438.13671875, "learning_rate": 1.7219718939144812e-05, "loss": 4.5793, "step": 2521 }, { "epoch": 0.6412204036230733, "grad_norm": 24163.6640625, "learning_rate": 1.7198631420020484e-05, "loss": 4.5784, "step": 2522 }, { "epoch": 0.6414746543778802, "grad_norm": 24440.03125, "learning_rate": 1.7177550048549746e-05, "loss": 4.5765, "step": 2523 }, { "epoch": 0.6417289051326871, "grad_norm": 24253.587890625, "learning_rate": 1.7156474841345212e-05, "loss": 4.5788, "step": 2524 }, { "epoch": 0.641983155887494, "grad_norm": 24234.193359375, "learning_rate": 1.713540581501461e-05, "loss": 4.5801, "step": 2525 }, { "epoch": 0.642237406642301, "grad_norm": 24537.63671875, "learning_rate": 1.7114342986160797e-05, "loss": 4.5821, "step": 2526 }, { "epoch": 0.6424916573971079, "grad_norm": 24128.314453125, "learning_rate": 1.709328637138177e-05, "loss": 4.5654, "step": 2527 }, { "epoch": 0.6427459081519148, "grad_norm": 24724.302734375, "learning_rate": 1.7072235987270603e-05, "loss": 4.5805, "step": 2528 }, { "epoch": 0.6430001589067218, "grad_norm": 24352.306640625, "learning_rate": 1.7051191850415467e-05, "loss": 4.5767, "step": 2529 }, { "epoch": 0.6432544096615287, "grad_norm": 24312.7578125, "learning_rate": 1.7030153977399635e-05, "loss": 4.5926, "step": 2530 }, { "epoch": 0.6435086604163356, "grad_norm": 24119.224609375, "learning_rate": 1.700912238480141e-05, "loss": 4.5677, "step": 2531 }, { "epoch": 0.6437629111711426, "grad_norm": 24259.05859375, "learning_rate": 1.698809708919415e-05, "loss": 4.5843, "step": 2532 }, { "epoch": 0.6440171619259495, "grad_norm": 24297.130859375, "learning_rate": 1.696707810714627e-05, "loss": 4.5754, "step": 2533 }, { "epoch": 0.6442714126807564, "grad_norm": 24270.455078125, "learning_rate": 1.6946065455221213e-05, "loss": 4.5669, "step": 2534 }, { "epoch": 0.6445256634355633, "grad_norm": 24254.466796875, "learning_rate": 1.6925059149977395e-05, "loss": 4.5824, "step": 2535 }, { "epoch": 0.6447799141903703, "grad_norm": 24486.9921875, "learning_rate": 1.6904059207968277e-05, "loss": 4.5811, "step": 2536 }, { "epoch": 0.6450341649451772, "grad_norm": 24020.109375, "learning_rate": 1.6883065645742274e-05, "loss": 4.5757, "step": 2537 }, { "epoch": 0.6452884156999841, "grad_norm": 24878.609375, "learning_rate": 1.6862078479842778e-05, "loss": 4.5947, "step": 2538 }, { "epoch": 0.6455426664547911, "grad_norm": 24334.31640625, "learning_rate": 1.684109772680816e-05, "loss": 4.5691, "step": 2539 }, { "epoch": 0.645796917209598, "grad_norm": 24240.169921875, "learning_rate": 1.6820123403171723e-05, "loss": 4.5729, "step": 2540 }, { "epoch": 0.6460511679644049, "grad_norm": 24486.2265625, "learning_rate": 1.6799155525461707e-05, "loss": 4.5685, "step": 2541 }, { "epoch": 0.6463054187192119, "grad_norm": 24304.26953125, "learning_rate": 1.677819411020125e-05, "loss": 4.5695, "step": 2542 }, { "epoch": 0.6465596694740188, "grad_norm": 24576.150390625, "learning_rate": 1.675723917390844e-05, "loss": 4.5747, "step": 2543 }, { "epoch": 0.6468139202288257, "grad_norm": 24128.58984375, "learning_rate": 1.6736290733096235e-05, "loss": 4.5709, "step": 2544 }, { "epoch": 0.6470681709836326, "grad_norm": 24498.22265625, "learning_rate": 1.671534880427246e-05, "loss": 4.5801, "step": 2545 }, { "epoch": 0.6473224217384396, "grad_norm": 24382.521484375, "learning_rate": 1.669441340393985e-05, "loss": 4.5552, "step": 2546 }, { "epoch": 0.6475766724932465, "grad_norm": 24435.91796875, "learning_rate": 1.667348454859596e-05, "loss": 4.5658, "step": 2547 }, { "epoch": 0.6478309232480534, "grad_norm": 24412.626953125, "learning_rate": 1.6652562254733185e-05, "loss": 4.5683, "step": 2548 }, { "epoch": 0.6480851740028604, "grad_norm": 24386.279296875, "learning_rate": 1.6631646538838774e-05, "loss": 4.5748, "step": 2549 }, { "epoch": 0.6483394247576673, "grad_norm": 24362.896484375, "learning_rate": 1.6610737417394785e-05, "loss": 4.5627, "step": 2550 }, { "epoch": 0.6485936755124742, "grad_norm": 24008.484375, "learning_rate": 1.658983490687806e-05, "loss": 4.5683, "step": 2551 }, { "epoch": 0.6488479262672812, "grad_norm": 24282.52734375, "learning_rate": 1.656893902376027e-05, "loss": 4.5629, "step": 2552 }, { "epoch": 0.6491021770220881, "grad_norm": 24190.306640625, "learning_rate": 1.654804978450782e-05, "loss": 4.562, "step": 2553 }, { "epoch": 0.649356427776895, "grad_norm": 24304.244140625, "learning_rate": 1.6527167205581903e-05, "loss": 4.5686, "step": 2554 }, { "epoch": 0.6496106785317018, "grad_norm": 24525.248046875, "learning_rate": 1.6506291303438464e-05, "loss": 4.5531, "step": 2555 }, { "epoch": 0.6498649292865089, "grad_norm": 24319.943359375, "learning_rate": 1.648542209452819e-05, "loss": 4.5697, "step": 2556 }, { "epoch": 0.6501191800413157, "grad_norm": 24227.51953125, "learning_rate": 1.6464559595296462e-05, "loss": 4.5717, "step": 2557 }, { "epoch": 0.6503734307961226, "grad_norm": 24229.46484375, "learning_rate": 1.6443703822183428e-05, "loss": 4.5778, "step": 2558 }, { "epoch": 0.6506276815509296, "grad_norm": 24172.23046875, "learning_rate": 1.6422854791623886e-05, "loss": 4.559, "step": 2559 }, { "epoch": 0.6508819323057365, "grad_norm": 24447.97265625, "learning_rate": 1.640201252004734e-05, "loss": 4.5815, "step": 2560 }, { "epoch": 0.6511361830605434, "grad_norm": 24136.083984375, "learning_rate": 1.638117702387798e-05, "loss": 4.5608, "step": 2561 }, { "epoch": 0.6513904338153504, "grad_norm": 24441.25, "learning_rate": 1.636034831953464e-05, "loss": 4.5791, "step": 2562 }, { "epoch": 0.6516446845701573, "grad_norm": 24246.384765625, "learning_rate": 1.6339526423430797e-05, "loss": 4.5755, "step": 2563 }, { "epoch": 0.6518989353249642, "grad_norm": 24261.48828125, "learning_rate": 1.631871135197459e-05, "loss": 4.573, "step": 2564 }, { "epoch": 0.6521531860797711, "grad_norm": 24247.67578125, "learning_rate": 1.6297903121568747e-05, "loss": 4.5796, "step": 2565 }, { "epoch": 0.6524074368345781, "grad_norm": 24322.1015625, "learning_rate": 1.6277101748610622e-05, "loss": 4.5577, "step": 2566 }, { "epoch": 0.652661687589385, "grad_norm": 24325.9609375, "learning_rate": 1.6256307249492177e-05, "loss": 4.5744, "step": 2567 }, { "epoch": 0.6529159383441919, "grad_norm": 24275.09375, "learning_rate": 1.6235519640599938e-05, "loss": 4.5701, "step": 2568 }, { "epoch": 0.6531701890989989, "grad_norm": 24240.07421875, "learning_rate": 1.6214738938314994e-05, "loss": 4.5682, "step": 2569 }, { "epoch": 0.6534244398538058, "grad_norm": 24224.517578125, "learning_rate": 1.6193965159013023e-05, "loss": 4.5723, "step": 2570 }, { "epoch": 0.6536786906086127, "grad_norm": 24453.654296875, "learning_rate": 1.6173198319064227e-05, "loss": 4.5777, "step": 2571 }, { "epoch": 0.6539329413634196, "grad_norm": 24286.72265625, "learning_rate": 1.6152438434833337e-05, "loss": 4.5645, "step": 2572 }, { "epoch": 0.6541871921182266, "grad_norm": 24249.177734375, "learning_rate": 1.61316855226796e-05, "loss": 4.5678, "step": 2573 }, { "epoch": 0.6544414428730335, "grad_norm": 24488.66015625, "learning_rate": 1.611093959895679e-05, "loss": 4.5705, "step": 2574 }, { "epoch": 0.6546956936278404, "grad_norm": 24407.724609375, "learning_rate": 1.609020068001316e-05, "loss": 4.5644, "step": 2575 }, { "epoch": 0.6549499443826474, "grad_norm": 24337.296875, "learning_rate": 1.606946878219143e-05, "loss": 4.566, "step": 2576 }, { "epoch": 0.6552041951374543, "grad_norm": 24338.869140625, "learning_rate": 1.6048743921828825e-05, "loss": 4.5549, "step": 2577 }, { "epoch": 0.6554584458922612, "grad_norm": 24311.572265625, "learning_rate": 1.6028026115256984e-05, "loss": 4.5816, "step": 2578 }, { "epoch": 0.6557126966470682, "grad_norm": 24314.982421875, "learning_rate": 1.6007315378801997e-05, "loss": 4.5662, "step": 2579 }, { "epoch": 0.6559669474018751, "grad_norm": 24301.125, "learning_rate": 1.5986611728784404e-05, "loss": 4.5642, "step": 2580 }, { "epoch": 0.656221198156682, "grad_norm": 24435.6171875, "learning_rate": 1.5965915181519144e-05, "loss": 4.5719, "step": 2581 }, { "epoch": 0.6564754489114889, "grad_norm": 24352.328125, "learning_rate": 1.5945225753315544e-05, "loss": 4.5722, "step": 2582 }, { "epoch": 0.6567296996662959, "grad_norm": 24420.17578125, "learning_rate": 1.592454346047737e-05, "loss": 4.5657, "step": 2583 }, { "epoch": 0.6569839504211028, "grad_norm": 24249.03515625, "learning_rate": 1.5903868319302704e-05, "loss": 4.5693, "step": 2584 }, { "epoch": 0.6572382011759097, "grad_norm": 24325.220703125, "learning_rate": 1.5883200346084032e-05, "loss": 4.5522, "step": 2585 }, { "epoch": 0.6574924519307167, "grad_norm": 24389.767578125, "learning_rate": 1.5862539557108182e-05, "loss": 4.5815, "step": 2586 }, { "epoch": 0.6577467026855236, "grad_norm": 24442.4453125, "learning_rate": 1.584188596865632e-05, "loss": 4.5644, "step": 2587 }, { "epoch": 0.6580009534403305, "grad_norm": 24160.30078125, "learning_rate": 1.5821239597003933e-05, "loss": 4.5477, "step": 2588 }, { "epoch": 0.6582552041951375, "grad_norm": 24312.27734375, "learning_rate": 1.5800600458420834e-05, "loss": 4.5595, "step": 2589 }, { "epoch": 0.6585094549499444, "grad_norm": 24372.12890625, "learning_rate": 1.5779968569171118e-05, "loss": 4.5732, "step": 2590 }, { "epoch": 0.6587637057047513, "grad_norm": 24259.5546875, "learning_rate": 1.5759343945513173e-05, "loss": 4.5705, "step": 2591 }, { "epoch": 0.6590179564595582, "grad_norm": 24217.7421875, "learning_rate": 1.5738726603699684e-05, "loss": 4.5595, "step": 2592 }, { "epoch": 0.6592722072143652, "grad_norm": 24322.662109375, "learning_rate": 1.571811655997757e-05, "loss": 4.5703, "step": 2593 }, { "epoch": 0.6595264579691721, "grad_norm": 24315.576171875, "learning_rate": 1.5697513830587995e-05, "loss": 4.5585, "step": 2594 }, { "epoch": 0.659780708723979, "grad_norm": 24202.53515625, "learning_rate": 1.56769184317664e-05, "loss": 4.5604, "step": 2595 }, { "epoch": 0.660034959478786, "grad_norm": 24433.025390625, "learning_rate": 1.5656330379742397e-05, "loss": 4.5706, "step": 2596 }, { "epoch": 0.6602892102335929, "grad_norm": 24239.861328125, "learning_rate": 1.5635749690739838e-05, "loss": 4.5695, "step": 2597 }, { "epoch": 0.6605434609883998, "grad_norm": 24277.451171875, "learning_rate": 1.561517638097678e-05, "loss": 4.5684, "step": 2598 }, { "epoch": 0.6607977117432068, "grad_norm": 24537.86328125, "learning_rate": 1.5594610466665442e-05, "loss": 4.5681, "step": 2599 }, { "epoch": 0.6610519624980137, "grad_norm": 24327.55078125, "learning_rate": 1.5574051964012226e-05, "loss": 4.5717, "step": 2600 }, { "epoch": 0.6610519624980137, "eval_loss": 9.194845199584961, "eval_runtime": 695.2131, "eval_samples_per_second": 152.451, "eval_steps_per_second": 9.529, "step": 2600 }, { "epoch": 0.6613062132528206, "grad_norm": 24295.04296875, "learning_rate": 1.55535008892177e-05, "loss": 4.5415, "step": 2601 }, { "epoch": 0.6615604640076275, "grad_norm": 24349.5859375, "learning_rate": 1.5532957258476577e-05, "loss": 4.5631, "step": 2602 }, { "epoch": 0.6618147147624345, "grad_norm": 24385.060546875, "learning_rate": 1.551242108797769e-05, "loss": 4.5622, "step": 2603 }, { "epoch": 0.6620689655172414, "grad_norm": 24276.966796875, "learning_rate": 1.549189239390399e-05, "loss": 4.5616, "step": 2604 }, { "epoch": 0.6623232162720483, "grad_norm": 24408.85546875, "learning_rate": 1.5471371192432577e-05, "loss": 4.5501, "step": 2605 }, { "epoch": 0.6625774670268553, "grad_norm": 24281.455078125, "learning_rate": 1.54508574997346e-05, "loss": 4.5605, "step": 2606 }, { "epoch": 0.6628317177816622, "grad_norm": 24378.90625, "learning_rate": 1.5430351331975305e-05, "loss": 4.5622, "step": 2607 }, { "epoch": 0.6630859685364691, "grad_norm": 24312.474609375, "learning_rate": 1.5409852705314037e-05, "loss": 4.5591, "step": 2608 }, { "epoch": 0.6633402192912761, "grad_norm": 24265.546875, "learning_rate": 1.5389361635904153e-05, "loss": 4.5532, "step": 2609 }, { "epoch": 0.663594470046083, "grad_norm": 24375.109375, "learning_rate": 1.5368878139893076e-05, "loss": 4.5521, "step": 2610 }, { "epoch": 0.6638487208008899, "grad_norm": 24323.583984375, "learning_rate": 1.534840223342227e-05, "loss": 4.5615, "step": 2611 }, { "epoch": 0.6641029715556968, "grad_norm": 24380.0703125, "learning_rate": 1.532793393262721e-05, "loss": 4.5621, "step": 2612 }, { "epoch": 0.6643572223105038, "grad_norm": 24382.95703125, "learning_rate": 1.530747325363736e-05, "loss": 4.5606, "step": 2613 }, { "epoch": 0.6646114730653107, "grad_norm": 24253.33984375, "learning_rate": 1.5287020212576216e-05, "loss": 4.5492, "step": 2614 }, { "epoch": 0.6648657238201175, "grad_norm": 24287.13671875, "learning_rate": 1.5266574825561224e-05, "loss": 4.5605, "step": 2615 }, { "epoch": 0.6651199745749246, "grad_norm": 24491.078125, "learning_rate": 1.5246137108703801e-05, "loss": 4.5533, "step": 2616 }, { "epoch": 0.6653742253297315, "grad_norm": 24305.341796875, "learning_rate": 1.5225707078109336e-05, "loss": 4.5504, "step": 2617 }, { "epoch": 0.6656284760845383, "grad_norm": 24401.724609375, "learning_rate": 1.5205284749877153e-05, "loss": 4.5652, "step": 2618 }, { "epoch": 0.6658827268393454, "grad_norm": 24451.150390625, "learning_rate": 1.5184870140100493e-05, "loss": 4.5577, "step": 2619 }, { "epoch": 0.6661369775941522, "grad_norm": 24463.044921875, "learning_rate": 1.5164463264866546e-05, "loss": 4.5466, "step": 2620 }, { "epoch": 0.6663912283489591, "grad_norm": 24340.640625, "learning_rate": 1.5144064140256374e-05, "loss": 4.5506, "step": 2621 }, { "epoch": 0.666645479103766, "grad_norm": 24280.7734375, "learning_rate": 1.5123672782344943e-05, "loss": 4.575, "step": 2622 }, { "epoch": 0.666899729858573, "grad_norm": 24522.998046875, "learning_rate": 1.510328920720111e-05, "loss": 4.5535, "step": 2623 }, { "epoch": 0.6671539806133799, "grad_norm": 24471.98828125, "learning_rate": 1.5082913430887591e-05, "loss": 4.5568, "step": 2624 }, { "epoch": 0.6674082313681868, "grad_norm": 24511.259765625, "learning_rate": 1.506254546946094e-05, "loss": 4.5615, "step": 2625 }, { "epoch": 0.6676624821229938, "grad_norm": 24492.615234375, "learning_rate": 1.5042185338971588e-05, "loss": 4.5573, "step": 2626 }, { "epoch": 0.6679167328778007, "grad_norm": 24245.654296875, "learning_rate": 1.502183305546376e-05, "loss": 4.5409, "step": 2627 }, { "epoch": 0.6681709836326076, "grad_norm": 24393.26953125, "learning_rate": 1.5001488634975514e-05, "loss": 4.5431, "step": 2628 }, { "epoch": 0.6684252343874146, "grad_norm": 24628.59765625, "learning_rate": 1.4981152093538723e-05, "loss": 4.5683, "step": 2629 }, { "epoch": 0.6686794851422215, "grad_norm": 24390.611328125, "learning_rate": 1.4960823447179029e-05, "loss": 4.5587, "step": 2630 }, { "epoch": 0.6689337358970284, "grad_norm": 24350.181640625, "learning_rate": 1.4940502711915852e-05, "loss": 4.5523, "step": 2631 }, { "epoch": 0.6691879866518353, "grad_norm": 24268.439453125, "learning_rate": 1.4920189903762403e-05, "loss": 4.5557, "step": 2632 }, { "epoch": 0.6694422374066423, "grad_norm": 24424.794921875, "learning_rate": 1.4899885038725628e-05, "loss": 4.5592, "step": 2633 }, { "epoch": 0.6696964881614492, "grad_norm": 24477.333984375, "learning_rate": 1.4879588132806205e-05, "loss": 4.5689, "step": 2634 }, { "epoch": 0.6699507389162561, "grad_norm": 24517.03125, "learning_rate": 1.4859299201998572e-05, "loss": 4.5476, "step": 2635 }, { "epoch": 0.6702049896710631, "grad_norm": 24364.654296875, "learning_rate": 1.483901826229085e-05, "loss": 4.5441, "step": 2636 }, { "epoch": 0.67045924042587, "grad_norm": 24339.876953125, "learning_rate": 1.4818745329664868e-05, "loss": 4.5553, "step": 2637 }, { "epoch": 0.6707134911806769, "grad_norm": 24150.935546875, "learning_rate": 1.4798480420096156e-05, "loss": 4.537, "step": 2638 }, { "epoch": 0.6709677419354839, "grad_norm": 24323.576171875, "learning_rate": 1.4778223549553929e-05, "loss": 4.5627, "step": 2639 }, { "epoch": 0.6712219926902908, "grad_norm": 24327.634765625, "learning_rate": 1.4757974734001051e-05, "loss": 4.5621, "step": 2640 }, { "epoch": 0.6714762434450977, "grad_norm": 24511.54296875, "learning_rate": 1.4737733989394025e-05, "loss": 4.5444, "step": 2641 }, { "epoch": 0.6717304941999046, "grad_norm": 24349.28515625, "learning_rate": 1.4717501331683037e-05, "loss": 4.5447, "step": 2642 }, { "epoch": 0.6719847449547116, "grad_norm": 24370.83984375, "learning_rate": 1.4697276776811871e-05, "loss": 4.5309, "step": 2643 }, { "epoch": 0.6722389957095185, "grad_norm": 24459.435546875, "learning_rate": 1.4677060340717913e-05, "loss": 4.5427, "step": 2644 }, { "epoch": 0.6724932464643254, "grad_norm": 24393.083984375, "learning_rate": 1.4656852039332194e-05, "loss": 4.5578, "step": 2645 }, { "epoch": 0.6727474972191324, "grad_norm": 24398.8359375, "learning_rate": 1.4636651888579294e-05, "loss": 4.5465, "step": 2646 }, { "epoch": 0.6730017479739393, "grad_norm": 24499.3359375, "learning_rate": 1.4616459904377377e-05, "loss": 4.5464, "step": 2647 }, { "epoch": 0.6732559987287462, "grad_norm": 24575.685546875, "learning_rate": 1.4596276102638196e-05, "loss": 4.5551, "step": 2648 }, { "epoch": 0.6735102494835532, "grad_norm": 24342.509765625, "learning_rate": 1.457610049926704e-05, "loss": 4.5442, "step": 2649 }, { "epoch": 0.6737645002383601, "grad_norm": 24487.666015625, "learning_rate": 1.4555933110162719e-05, "loss": 4.5456, "step": 2650 }, { "epoch": 0.674018750993167, "grad_norm": 24358.662109375, "learning_rate": 1.4535773951217612e-05, "loss": 4.5463, "step": 2651 }, { "epoch": 0.6742730017479739, "grad_norm": 24565.26953125, "learning_rate": 1.451562303831758e-05, "loss": 4.5506, "step": 2652 }, { "epoch": 0.6745272525027809, "grad_norm": 24423.33984375, "learning_rate": 1.449548038734198e-05, "loss": 4.5644, "step": 2653 }, { "epoch": 0.6747815032575878, "grad_norm": 24559.173828125, "learning_rate": 1.4475346014163698e-05, "loss": 4.5515, "step": 2654 }, { "epoch": 0.6750357540123947, "grad_norm": 24355.234375, "learning_rate": 1.445521993464905e-05, "loss": 4.5475, "step": 2655 }, { "epoch": 0.6752900047672017, "grad_norm": 24532.427734375, "learning_rate": 1.443510216465786e-05, "loss": 4.5595, "step": 2656 }, { "epoch": 0.6755442555220086, "grad_norm": 24590.47265625, "learning_rate": 1.4414992720043357e-05, "loss": 4.5485, "step": 2657 }, { "epoch": 0.6757985062768155, "grad_norm": 24426.86328125, "learning_rate": 1.4394891616652261e-05, "loss": 4.5451, "step": 2658 }, { "epoch": 0.6760527570316225, "grad_norm": 24510.662109375, "learning_rate": 1.437479887032467e-05, "loss": 4.5413, "step": 2659 }, { "epoch": 0.6763070077864294, "grad_norm": 48439.36328125, "learning_rate": 1.4354714496894142e-05, "loss": 4.5473, "step": 2660 }, { "epoch": 0.6765612585412363, "grad_norm": 24661.67578125, "learning_rate": 1.4334638512187602e-05, "loss": 4.5498, "step": 2661 }, { "epoch": 0.6768155092960432, "grad_norm": 24507.7578125, "learning_rate": 1.4314570932025365e-05, "loss": 4.5628, "step": 2662 }, { "epoch": 0.6770697600508502, "grad_norm": 24600.8125, "learning_rate": 1.4294511772221156e-05, "loss": 4.5441, "step": 2663 }, { "epoch": 0.6773240108056571, "grad_norm": 24368.47265625, "learning_rate": 1.4274461048582036e-05, "loss": 4.5402, "step": 2664 }, { "epoch": 0.677578261560464, "grad_norm": 24495.548828125, "learning_rate": 1.4254418776908412e-05, "loss": 4.5376, "step": 2665 }, { "epoch": 0.677832512315271, "grad_norm": 24475.486328125, "learning_rate": 1.4234384972994055e-05, "loss": 4.554, "step": 2666 }, { "epoch": 0.6780867630700779, "grad_norm": 24396.6171875, "learning_rate": 1.4214359652626064e-05, "loss": 4.5461, "step": 2667 }, { "epoch": 0.6783410138248848, "grad_norm": 24350.16796875, "learning_rate": 1.4194342831584829e-05, "loss": 4.5499, "step": 2668 }, { "epoch": 0.6785952645796918, "grad_norm": 24518.283203125, "learning_rate": 1.4174334525644045e-05, "loss": 4.5446, "step": 2669 }, { "epoch": 0.6788495153344987, "grad_norm": 24546.09375, "learning_rate": 1.4154334750570727e-05, "loss": 4.5571, "step": 2670 }, { "epoch": 0.6791037660893056, "grad_norm": 24416.140625, "learning_rate": 1.4134343522125138e-05, "loss": 4.5519, "step": 2671 }, { "epoch": 0.6793580168441125, "grad_norm": 24436.51171875, "learning_rate": 1.41143608560608e-05, "loss": 4.5402, "step": 2672 }, { "epoch": 0.6796122675989195, "grad_norm": 24666.904296875, "learning_rate": 1.4094386768124527e-05, "loss": 4.5506, "step": 2673 }, { "epoch": 0.6798665183537264, "grad_norm": 24486.943359375, "learning_rate": 1.4074421274056337e-05, "loss": 4.5514, "step": 2674 }, { "epoch": 0.6801207691085333, "grad_norm": 24392.087890625, "learning_rate": 1.4054464389589478e-05, "loss": 4.5483, "step": 2675 }, { "epoch": 0.6803750198633403, "grad_norm": 24516.591796875, "learning_rate": 1.4034516130450431e-05, "loss": 4.5409, "step": 2676 }, { "epoch": 0.6806292706181472, "grad_norm": 24615.484375, "learning_rate": 1.401457651235889e-05, "loss": 4.5404, "step": 2677 }, { "epoch": 0.680883521372954, "grad_norm": 24461.681640625, "learning_rate": 1.3994645551027692e-05, "loss": 4.5538, "step": 2678 }, { "epoch": 0.681137772127761, "grad_norm": 24424.5, "learning_rate": 1.3974723262162902e-05, "loss": 4.532, "step": 2679 }, { "epoch": 0.681392022882568, "grad_norm": 24341.431640625, "learning_rate": 1.3954809661463731e-05, "loss": 4.5371, "step": 2680 }, { "epoch": 0.6816462736373748, "grad_norm": 24641.0390625, "learning_rate": 1.3934904764622525e-05, "loss": 4.5517, "step": 2681 }, { "epoch": 0.6819005243921817, "grad_norm": 24360.1015625, "learning_rate": 1.3915008587324812e-05, "loss": 4.5391, "step": 2682 }, { "epoch": 0.6821547751469887, "grad_norm": 24652.677734375, "learning_rate": 1.3895121145249218e-05, "loss": 4.5477, "step": 2683 }, { "epoch": 0.6824090259017956, "grad_norm": 24644.857421875, "learning_rate": 1.387524245406748e-05, "loss": 4.543, "step": 2684 }, { "epoch": 0.6826632766566025, "grad_norm": 24428.333984375, "learning_rate": 1.3855372529444477e-05, "loss": 4.5368, "step": 2685 }, { "epoch": 0.6829175274114095, "grad_norm": 24593.79296875, "learning_rate": 1.383551138703813e-05, "loss": 4.536, "step": 2686 }, { "epoch": 0.6831717781662164, "grad_norm": 24470.54296875, "learning_rate": 1.3815659042499495e-05, "loss": 4.5488, "step": 2687 }, { "epoch": 0.6834260289210233, "grad_norm": 24542.51953125, "learning_rate": 1.3795815511472634e-05, "loss": 4.5381, "step": 2688 }, { "epoch": 0.6836802796758303, "grad_norm": 24658.890625, "learning_rate": 1.3775980809594725e-05, "loss": 4.5445, "step": 2689 }, { "epoch": 0.6839345304306372, "grad_norm": 24542.85546875, "learning_rate": 1.3756154952495932e-05, "loss": 4.5614, "step": 2690 }, { "epoch": 0.6841887811854441, "grad_norm": 24484.181640625, "learning_rate": 1.3736337955799495e-05, "loss": 4.5477, "step": 2691 }, { "epoch": 0.684443031940251, "grad_norm": 30898.072265625, "learning_rate": 1.3716529835121644e-05, "loss": 4.5439, "step": 2692 }, { "epoch": 0.684697282695058, "grad_norm": 26642.958984375, "learning_rate": 1.3696730606071617e-05, "loss": 4.5383, "step": 2693 }, { "epoch": 0.6849515334498649, "grad_norm": 24576.06640625, "learning_rate": 1.3676940284251666e-05, "loss": 4.5404, "step": 2694 }, { "epoch": 0.6852057842046718, "grad_norm": 24803.51953125, "learning_rate": 1.3657158885256998e-05, "loss": 4.5414, "step": 2695 }, { "epoch": 0.6854600349594788, "grad_norm": 25177.0390625, "learning_rate": 1.3637386424675793e-05, "loss": 4.5463, "step": 2696 }, { "epoch": 0.6857142857142857, "grad_norm": 24413.080078125, "learning_rate": 1.3617622918089215e-05, "loss": 4.5353, "step": 2697 }, { "epoch": 0.6859685364690926, "grad_norm": 25677.380859375, "learning_rate": 1.3597868381071327e-05, "loss": 4.5294, "step": 2698 }, { "epoch": 0.6862227872238996, "grad_norm": 24625.978515625, "learning_rate": 1.3578122829189168e-05, "loss": 4.541, "step": 2699 }, { "epoch": 0.6864770379787065, "grad_norm": 24854.724609375, "learning_rate": 1.355838627800266e-05, "loss": 4.5493, "step": 2700 }, { "epoch": 0.6864770379787065, "eval_loss": 9.15518856048584, "eval_runtime": 695.3568, "eval_samples_per_second": 152.42, "eval_steps_per_second": 9.527, "step": 2700 }, { "epoch": 0.6867312887335134, "grad_norm": 25059.322265625, "learning_rate": 1.3538658743064667e-05, "loss": 4.548, "step": 2701 }, { "epoch": 0.6869855394883203, "grad_norm": 24568.916015625, "learning_rate": 1.3518940239920916e-05, "loss": 4.5444, "step": 2702 }, { "epoch": 0.6872397902431273, "grad_norm": 25235.271484375, "learning_rate": 1.3499230784110024e-05, "loss": 4.5602, "step": 2703 }, { "epoch": 0.6874940409979342, "grad_norm": 24682.8671875, "learning_rate": 1.3479530391163504e-05, "loss": 4.5261, "step": 2704 }, { "epoch": 0.6877482917527411, "grad_norm": 24491.275390625, "learning_rate": 1.3459839076605696e-05, "loss": 4.5383, "step": 2705 }, { "epoch": 0.6880025425075481, "grad_norm": 24557.62109375, "learning_rate": 1.344015685595379e-05, "loss": 4.5407, "step": 2706 }, { "epoch": 0.688256793262355, "grad_norm": 24460.927734375, "learning_rate": 1.3420483744717838e-05, "loss": 4.5492, "step": 2707 }, { "epoch": 0.6885110440171619, "grad_norm": 24695.630859375, "learning_rate": 1.340081975840067e-05, "loss": 4.5299, "step": 2708 }, { "epoch": 0.6887652947719689, "grad_norm": 24426.888671875, "learning_rate": 1.3381164912497962e-05, "loss": 4.5339, "step": 2709 }, { "epoch": 0.6890195455267758, "grad_norm": 24603.654296875, "learning_rate": 1.3361519222498187e-05, "loss": 4.5303, "step": 2710 }, { "epoch": 0.6892737962815827, "grad_norm": 24566.6015625, "learning_rate": 1.3341882703882572e-05, "loss": 4.5382, "step": 2711 }, { "epoch": 0.6895280470363896, "grad_norm": 24491.1953125, "learning_rate": 1.3322255372125131e-05, "loss": 4.5327, "step": 2712 }, { "epoch": 0.6897822977911966, "grad_norm": 24574.552734375, "learning_rate": 1.3302637242692656e-05, "loss": 4.5427, "step": 2713 }, { "epoch": 0.6900365485460035, "grad_norm": 24334.91015625, "learning_rate": 1.328302833104467e-05, "loss": 4.5375, "step": 2714 }, { "epoch": 0.6902907993008104, "grad_norm": 24664.884765625, "learning_rate": 1.326342865263342e-05, "loss": 4.5488, "step": 2715 }, { "epoch": 0.6905450500556174, "grad_norm": 24596.033203125, "learning_rate": 1.324383822290392e-05, "loss": 4.5357, "step": 2716 }, { "epoch": 0.6907993008104243, "grad_norm": 24578.140625, "learning_rate": 1.3224257057293848e-05, "loss": 4.5223, "step": 2717 }, { "epoch": 0.6910535515652312, "grad_norm": 24637.0625, "learning_rate": 1.3204685171233602e-05, "loss": 4.5457, "step": 2718 }, { "epoch": 0.6913078023200382, "grad_norm": 24653.853515625, "learning_rate": 1.3185122580146274e-05, "loss": 4.5372, "step": 2719 }, { "epoch": 0.6915620530748451, "grad_norm": 24514.70703125, "learning_rate": 1.3165569299447633e-05, "loss": 4.5363, "step": 2720 }, { "epoch": 0.691816303829652, "grad_norm": 24774.013671875, "learning_rate": 1.3146025344546087e-05, "loss": 4.5438, "step": 2721 }, { "epoch": 0.6920705545844589, "grad_norm": 24428.32421875, "learning_rate": 1.3126490730842727e-05, "loss": 4.5334, "step": 2722 }, { "epoch": 0.6923248053392659, "grad_norm": 24422.35546875, "learning_rate": 1.310696547373126e-05, "loss": 4.5392, "step": 2723 }, { "epoch": 0.6925790560940728, "grad_norm": 24679.43359375, "learning_rate": 1.3087449588598016e-05, "loss": 4.5539, "step": 2724 }, { "epoch": 0.6928333068488797, "grad_norm": 25650.291015625, "learning_rate": 1.3067943090821971e-05, "loss": 4.5398, "step": 2725 }, { "epoch": 0.6930875576036867, "grad_norm": 24690.90625, "learning_rate": 1.3048445995774672e-05, "loss": 4.5331, "step": 2726 }, { "epoch": 0.6933418083584936, "grad_norm": 24565.85546875, "learning_rate": 1.302895831882026e-05, "loss": 4.5292, "step": 2727 }, { "epoch": 0.6935960591133005, "grad_norm": 24578.10546875, "learning_rate": 1.3009480075315481e-05, "loss": 4.5213, "step": 2728 }, { "epoch": 0.6938503098681075, "grad_norm": 24551.95703125, "learning_rate": 1.2990011280609607e-05, "loss": 4.5345, "step": 2729 }, { "epoch": 0.6941045606229144, "grad_norm": 24626.541015625, "learning_rate": 1.2970551950044507e-05, "loss": 4.5317, "step": 2730 }, { "epoch": 0.6943588113777213, "grad_norm": 24581.595703125, "learning_rate": 1.295110209895455e-05, "loss": 4.5413, "step": 2731 }, { "epoch": 0.6946130621325282, "grad_norm": 24518.07421875, "learning_rate": 1.2931661742666676e-05, "loss": 4.5164, "step": 2732 }, { "epoch": 0.6948673128873352, "grad_norm": 24475.59375, "learning_rate": 1.291223089650031e-05, "loss": 4.5473, "step": 2733 }, { "epoch": 0.6951215636421421, "grad_norm": 24565.189453125, "learning_rate": 1.2892809575767389e-05, "loss": 4.525, "step": 2734 }, { "epoch": 0.695375814396949, "grad_norm": 24648.650390625, "learning_rate": 1.2873397795772363e-05, "loss": 4.5428, "step": 2735 }, { "epoch": 0.695630065151756, "grad_norm": 24591.53515625, "learning_rate": 1.2853995571812146e-05, "loss": 4.5326, "step": 2736 }, { "epoch": 0.6958843159065629, "grad_norm": 24548.4140625, "learning_rate": 1.2834602919176117e-05, "loss": 4.5234, "step": 2737 }, { "epoch": 0.6961385666613698, "grad_norm": 24423.3125, "learning_rate": 1.2815219853146137e-05, "loss": 4.5396, "step": 2738 }, { "epoch": 0.6963928174161768, "grad_norm": 24610.857421875, "learning_rate": 1.2795846388996482e-05, "loss": 4.5322, "step": 2739 }, { "epoch": 0.6966470681709837, "grad_norm": 24601.65625, "learning_rate": 1.2776482541993884e-05, "loss": 4.5336, "step": 2740 }, { "epoch": 0.6969013189257905, "grad_norm": 24320.134765625, "learning_rate": 1.27571283273975e-05, "loss": 4.5347, "step": 2741 }, { "epoch": 0.6971555696805974, "grad_norm": 24588.26171875, "learning_rate": 1.273778376045887e-05, "loss": 4.5438, "step": 2742 }, { "epoch": 0.6974098204354044, "grad_norm": 24585.6328125, "learning_rate": 1.271844885642195e-05, "loss": 4.5348, "step": 2743 }, { "epoch": 0.6976640711902113, "grad_norm": 24780.6953125, "learning_rate": 1.2699123630523086e-05, "loss": 4.5626, "step": 2744 }, { "epoch": 0.6979183219450182, "grad_norm": 24794.88671875, "learning_rate": 1.2679808097990986e-05, "loss": 4.5423, "step": 2745 }, { "epoch": 0.6981725726998252, "grad_norm": 24593.896484375, "learning_rate": 1.2660502274046714e-05, "loss": 4.5297, "step": 2746 }, { "epoch": 0.6984268234546321, "grad_norm": 24634.080078125, "learning_rate": 1.2641206173903708e-05, "loss": 4.5079, "step": 2747 }, { "epoch": 0.698681074209439, "grad_norm": 24642.599609375, "learning_rate": 1.2621919812767724e-05, "loss": 4.5281, "step": 2748 }, { "epoch": 0.698935324964246, "grad_norm": 24419.669921875, "learning_rate": 1.260264320583683e-05, "loss": 4.5357, "step": 2749 }, { "epoch": 0.6991895757190529, "grad_norm": 24614.642578125, "learning_rate": 1.2583376368301442e-05, "loss": 4.5272, "step": 2750 }, { "epoch": 0.6994438264738598, "grad_norm": 24744.416015625, "learning_rate": 1.256411931534427e-05, "loss": 4.5245, "step": 2751 }, { "epoch": 0.6996980772286667, "grad_norm": 24517.322265625, "learning_rate": 1.2544872062140281e-05, "loss": 4.5282, "step": 2752 }, { "epoch": 0.6999523279834737, "grad_norm": 24756.703125, "learning_rate": 1.2525634623856763e-05, "loss": 4.5379, "step": 2753 }, { "epoch": 0.7002065787382806, "grad_norm": 24605.791015625, "learning_rate": 1.2506407015653244e-05, "loss": 4.5336, "step": 2754 }, { "epoch": 0.7004608294930875, "grad_norm": 24491.95703125, "learning_rate": 1.2487189252681491e-05, "loss": 4.5312, "step": 2755 }, { "epoch": 0.7007150802478945, "grad_norm": 24499.0859375, "learning_rate": 1.246798135008556e-05, "loss": 4.5382, "step": 2756 }, { "epoch": 0.7009693310027014, "grad_norm": 24607.48828125, "learning_rate": 1.2448783323001703e-05, "loss": 4.526, "step": 2757 }, { "epoch": 0.7012235817575083, "grad_norm": 24581.46484375, "learning_rate": 1.242959518655838e-05, "loss": 4.5362, "step": 2758 }, { "epoch": 0.7014778325123153, "grad_norm": 24611.47265625, "learning_rate": 1.2410416955876294e-05, "loss": 4.5352, "step": 2759 }, { "epoch": 0.7017320832671222, "grad_norm": 24762.625, "learning_rate": 1.2391248646068304e-05, "loss": 4.5494, "step": 2760 }, { "epoch": 0.7019863340219291, "grad_norm": 24322.44140625, "learning_rate": 1.2372090272239483e-05, "loss": 4.5271, "step": 2761 }, { "epoch": 0.702240584776736, "grad_norm": 24410.3125, "learning_rate": 1.2352941849487048e-05, "loss": 4.5409, "step": 2762 }, { "epoch": 0.702494835531543, "grad_norm": 24564.642578125, "learning_rate": 1.23338033929004e-05, "loss": 4.5246, "step": 2763 }, { "epoch": 0.7027490862863499, "grad_norm": 24566.39453125, "learning_rate": 1.2314674917561067e-05, "loss": 4.5204, "step": 2764 }, { "epoch": 0.7030033370411568, "grad_norm": 24650.7421875, "learning_rate": 1.2295556438542702e-05, "loss": 4.5454, "step": 2765 }, { "epoch": 0.7032575877959638, "grad_norm": 24607.2578125, "learning_rate": 1.2276447970911118e-05, "loss": 4.5204, "step": 2766 }, { "epoch": 0.7035118385507707, "grad_norm": 24574.49609375, "learning_rate": 1.2257349529724208e-05, "loss": 4.5153, "step": 2767 }, { "epoch": 0.7037660893055776, "grad_norm": 24731.025390625, "learning_rate": 1.2238261130031959e-05, "loss": 4.5328, "step": 2768 }, { "epoch": 0.7040203400603846, "grad_norm": 24577.15625, "learning_rate": 1.2219182786876482e-05, "loss": 4.5213, "step": 2769 }, { "epoch": 0.7042745908151915, "grad_norm": 24677.474609375, "learning_rate": 1.220011451529192e-05, "loss": 4.5264, "step": 2770 }, { "epoch": 0.7045288415699984, "grad_norm": 24732.84375, "learning_rate": 1.2181056330304505e-05, "loss": 4.5402, "step": 2771 }, { "epoch": 0.7047830923248053, "grad_norm": 24607.443359375, "learning_rate": 1.2162008246932527e-05, "loss": 4.5284, "step": 2772 }, { "epoch": 0.7050373430796123, "grad_norm": 24714.83984375, "learning_rate": 1.2142970280186295e-05, "loss": 4.5423, "step": 2773 }, { "epoch": 0.7052915938344192, "grad_norm": 24522.263671875, "learning_rate": 1.212394244506814e-05, "loss": 4.5264, "step": 2774 }, { "epoch": 0.7055458445892261, "grad_norm": 24505.8671875, "learning_rate": 1.210492475657245e-05, "loss": 4.5334, "step": 2775 }, { "epoch": 0.7058000953440331, "grad_norm": 24583.64453125, "learning_rate": 1.2085917229685573e-05, "loss": 4.5403, "step": 2776 }, { "epoch": 0.70605434609884, "grad_norm": 24423.26171875, "learning_rate": 1.2066919879385864e-05, "loss": 4.5262, "step": 2777 }, { "epoch": 0.7063085968536469, "grad_norm": 24572.904296875, "learning_rate": 1.2047932720643676e-05, "loss": 4.5344, "step": 2778 }, { "epoch": 0.7065628476084539, "grad_norm": 24621.18359375, "learning_rate": 1.2028955768421307e-05, "loss": 4.5229, "step": 2779 }, { "epoch": 0.7068170983632608, "grad_norm": 24553.146484375, "learning_rate": 1.2009989037673017e-05, "loss": 4.5353, "step": 2780 }, { "epoch": 0.7070713491180677, "grad_norm": 24619.4765625, "learning_rate": 1.1991032543345019e-05, "loss": 4.5441, "step": 2781 }, { "epoch": 0.7073255998728746, "grad_norm": 24470.681640625, "learning_rate": 1.1972086300375468e-05, "loss": 4.5282, "step": 2782 }, { "epoch": 0.7075798506276816, "grad_norm": 24457.095703125, "learning_rate": 1.1953150323694413e-05, "loss": 4.5171, "step": 2783 }, { "epoch": 0.7078341013824885, "grad_norm": 24458.388671875, "learning_rate": 1.1934224628223841e-05, "loss": 4.5305, "step": 2784 }, { "epoch": 0.7080883521372954, "grad_norm": 24587.169921875, "learning_rate": 1.1915309228877622e-05, "loss": 4.5183, "step": 2785 }, { "epoch": 0.7083426028921024, "grad_norm": 24627.361328125, "learning_rate": 1.1896404140561504e-05, "loss": 4.5284, "step": 2786 }, { "epoch": 0.7085968536469093, "grad_norm": 24575.806640625, "learning_rate": 1.1877509378173137e-05, "loss": 4.5147, "step": 2787 }, { "epoch": 0.7088511044017162, "grad_norm": 24511.853515625, "learning_rate": 1.1858624956602013e-05, "loss": 4.5349, "step": 2788 }, { "epoch": 0.7091053551565232, "grad_norm": 24568.107421875, "learning_rate": 1.1839750890729467e-05, "loss": 4.5262, "step": 2789 }, { "epoch": 0.7093596059113301, "grad_norm": 24516.11328125, "learning_rate": 1.1820887195428707e-05, "loss": 4.5246, "step": 2790 }, { "epoch": 0.709613856666137, "grad_norm": 24561.26953125, "learning_rate": 1.1802033885564732e-05, "loss": 4.5291, "step": 2791 }, { "epoch": 0.7098681074209439, "grad_norm": 24440.9921875, "learning_rate": 1.1783190975994388e-05, "loss": 4.5282, "step": 2792 }, { "epoch": 0.7101223581757509, "grad_norm": 24644.482421875, "learning_rate": 1.1764358481566293e-05, "loss": 4.5257, "step": 2793 }, { "epoch": 0.7103766089305578, "grad_norm": 24571.787109375, "learning_rate": 1.1745536417120895e-05, "loss": 4.5246, "step": 2794 }, { "epoch": 0.7106308596853647, "grad_norm": 24504.5703125, "learning_rate": 1.1726724797490396e-05, "loss": 4.5357, "step": 2795 }, { "epoch": 0.7108851104401717, "grad_norm": 24616.7890625, "learning_rate": 1.1707923637498763e-05, "loss": 4.5271, "step": 2796 }, { "epoch": 0.7111393611949786, "grad_norm": 24491.08984375, "learning_rate": 1.1689132951961751e-05, "loss": 4.5203, "step": 2797 }, { "epoch": 0.7113936119497855, "grad_norm": 24620.1015625, "learning_rate": 1.1670352755686834e-05, "loss": 4.5337, "step": 2798 }, { "epoch": 0.7116478627045923, "grad_norm": 24555.818359375, "learning_rate": 1.1651583063473217e-05, "loss": 4.5284, "step": 2799 }, { "epoch": 0.7119021134593994, "grad_norm": 24425.498046875, "learning_rate": 1.163282389011186e-05, "loss": 4.5173, "step": 2800 }, { "epoch": 0.7119021134593994, "eval_loss": 9.120746612548828, "eval_runtime": 698.1799, "eval_samples_per_second": 151.803, "eval_steps_per_second": 9.489, "step": 2800 }, { "epoch": 0.7121563642142063, "grad_norm": 24571.57421875, "learning_rate": 1.1614075250385392e-05, "loss": 4.5257, "step": 2801 }, { "epoch": 0.7124106149690131, "grad_norm": 24655.33203125, "learning_rate": 1.1595337159068173e-05, "loss": 4.5185, "step": 2802 }, { "epoch": 0.7126648657238202, "grad_norm": 24661.380859375, "learning_rate": 1.1576609630926246e-05, "loss": 4.5329, "step": 2803 }, { "epoch": 0.712919116478627, "grad_norm": 24512.60546875, "learning_rate": 1.155789268071732e-05, "loss": 4.5347, "step": 2804 }, { "epoch": 0.7131733672334339, "grad_norm": 24715.060546875, "learning_rate": 1.1539186323190757e-05, "loss": 4.5342, "step": 2805 }, { "epoch": 0.713427617988241, "grad_norm": 24664.0546875, "learning_rate": 1.1520490573087605e-05, "loss": 4.5177, "step": 2806 }, { "epoch": 0.7136818687430478, "grad_norm": 24562.830078125, "learning_rate": 1.1501805445140531e-05, "loss": 4.5267, "step": 2807 }, { "epoch": 0.7139361194978547, "grad_norm": 24562.740234375, "learning_rate": 1.1483130954073826e-05, "loss": 4.5263, "step": 2808 }, { "epoch": 0.7141903702526616, "grad_norm": 24506.6171875, "learning_rate": 1.1464467114603419e-05, "loss": 4.5133, "step": 2809 }, { "epoch": 0.7144446210074686, "grad_norm": 24463.29296875, "learning_rate": 1.1445813941436833e-05, "loss": 4.5253, "step": 2810 }, { "epoch": 0.7146988717622755, "grad_norm": 24612.748046875, "learning_rate": 1.1427171449273175e-05, "loss": 4.5164, "step": 2811 }, { "epoch": 0.7149531225170824, "grad_norm": 24495.36328125, "learning_rate": 1.1408539652803157e-05, "loss": 4.518, "step": 2812 }, { "epoch": 0.7152073732718894, "grad_norm": 24609.59765625, "learning_rate": 1.138991856670906e-05, "loss": 4.531, "step": 2813 }, { "epoch": 0.7154616240266963, "grad_norm": 24586.6640625, "learning_rate": 1.1371308205664705e-05, "loss": 4.5214, "step": 2814 }, { "epoch": 0.7157158747815032, "grad_norm": 24626.869140625, "learning_rate": 1.1352708584335486e-05, "loss": 4.5195, "step": 2815 }, { "epoch": 0.7159701255363102, "grad_norm": 24586.49609375, "learning_rate": 1.133411971737832e-05, "loss": 4.5166, "step": 2816 }, { "epoch": 0.7162243762911171, "grad_norm": 24509.5234375, "learning_rate": 1.1315541619441636e-05, "loss": 4.5202, "step": 2817 }, { "epoch": 0.716478627045924, "grad_norm": 24551.453125, "learning_rate": 1.1296974305165414e-05, "loss": 4.526, "step": 2818 }, { "epoch": 0.7167328778007309, "grad_norm": 24518.50390625, "learning_rate": 1.1278417789181104e-05, "loss": 4.5134, "step": 2819 }, { "epoch": 0.7169871285555379, "grad_norm": 24575.814453125, "learning_rate": 1.125987208611165e-05, "loss": 4.526, "step": 2820 }, { "epoch": 0.7172413793103448, "grad_norm": 24721.21875, "learning_rate": 1.1241337210571498e-05, "loss": 4.5257, "step": 2821 }, { "epoch": 0.7174956300651517, "grad_norm": 24604.32421875, "learning_rate": 1.122281317716653e-05, "loss": 4.5222, "step": 2822 }, { "epoch": 0.7177498808199587, "grad_norm": 24642.431640625, "learning_rate": 1.1204300000494117e-05, "loss": 4.5227, "step": 2823 }, { "epoch": 0.7180041315747656, "grad_norm": 24788.953125, "learning_rate": 1.118579769514304e-05, "loss": 4.5049, "step": 2824 }, { "epoch": 0.7182583823295725, "grad_norm": 24547.365234375, "learning_rate": 1.1167306275693553e-05, "loss": 4.4986, "step": 2825 }, { "epoch": 0.7185126330843795, "grad_norm": 24592.56640625, "learning_rate": 1.1148825756717296e-05, "loss": 4.5227, "step": 2826 }, { "epoch": 0.7187668838391864, "grad_norm": 24771.935546875, "learning_rate": 1.1130356152777324e-05, "loss": 4.5322, "step": 2827 }, { "epoch": 0.7190211345939933, "grad_norm": 24512.740234375, "learning_rate": 1.1111897478428118e-05, "loss": 4.5186, "step": 2828 }, { "epoch": 0.7192753853488002, "grad_norm": 24559.509765625, "learning_rate": 1.1093449748215522e-05, "loss": 4.514, "step": 2829 }, { "epoch": 0.7195296361036072, "grad_norm": 24490.0625, "learning_rate": 1.1075012976676748e-05, "loss": 4.5148, "step": 2830 }, { "epoch": 0.7197838868584141, "grad_norm": 24711.111328125, "learning_rate": 1.1056587178340408e-05, "loss": 4.518, "step": 2831 }, { "epoch": 0.720038137613221, "grad_norm": 24496.298828125, "learning_rate": 1.1038172367726424e-05, "loss": 4.5162, "step": 2832 }, { "epoch": 0.720292388368028, "grad_norm": 24688.23046875, "learning_rate": 1.10197685593461e-05, "loss": 4.515, "step": 2833 }, { "epoch": 0.7205466391228349, "grad_norm": 24696.15625, "learning_rate": 1.100137576770203e-05, "loss": 4.5215, "step": 2834 }, { "epoch": 0.7208008898776418, "grad_norm": 24514.96484375, "learning_rate": 1.0982994007288166e-05, "loss": 4.5088, "step": 2835 }, { "epoch": 0.7210551406324488, "grad_norm": 24648.576171875, "learning_rate": 1.0964623292589728e-05, "loss": 4.5073, "step": 2836 }, { "epoch": 0.7213093913872557, "grad_norm": 24753.626953125, "learning_rate": 1.0946263638083276e-05, "loss": 4.5111, "step": 2837 }, { "epoch": 0.7215636421420626, "grad_norm": 24724.482421875, "learning_rate": 1.0927915058236615e-05, "loss": 4.52, "step": 2838 }, { "epoch": 0.7218178928968695, "grad_norm": 24680.513671875, "learning_rate": 1.090957756750883e-05, "loss": 4.5266, "step": 2839 }, { "epoch": 0.7220721436516765, "grad_norm": 24543.712890625, "learning_rate": 1.0891251180350295e-05, "loss": 4.5166, "step": 2840 }, { "epoch": 0.7223263944064834, "grad_norm": 24585.509765625, "learning_rate": 1.0872935911202603e-05, "loss": 4.5157, "step": 2841 }, { "epoch": 0.7225806451612903, "grad_norm": 24542.85546875, "learning_rate": 1.0854631774498591e-05, "loss": 4.523, "step": 2842 }, { "epoch": 0.7228348959160973, "grad_norm": 24622.171875, "learning_rate": 1.0836338784662348e-05, "loss": 4.5346, "step": 2843 }, { "epoch": 0.7230891466709042, "grad_norm": 24633.845703125, "learning_rate": 1.0818056956109138e-05, "loss": 4.5032, "step": 2844 }, { "epoch": 0.7233433974257111, "grad_norm": 24724.28515625, "learning_rate": 1.0799786303245465e-05, "loss": 4.513, "step": 2845 }, { "epoch": 0.7235976481805181, "grad_norm": 24725.634765625, "learning_rate": 1.0781526840469022e-05, "loss": 4.5248, "step": 2846 }, { "epoch": 0.723851898935325, "grad_norm": 24581.431640625, "learning_rate": 1.0763278582168671e-05, "loss": 4.5124, "step": 2847 }, { "epoch": 0.7241061496901319, "grad_norm": 24616.732421875, "learning_rate": 1.0745041542724431e-05, "loss": 4.52, "step": 2848 }, { "epoch": 0.7243604004449388, "grad_norm": 24578.033203125, "learning_rate": 1.0726815736507526e-05, "loss": 4.5095, "step": 2849 }, { "epoch": 0.7246146511997458, "grad_norm": 24537.380859375, "learning_rate": 1.0708601177880284e-05, "loss": 4.5105, "step": 2850 }, { "epoch": 0.7248689019545527, "grad_norm": 24634.66015625, "learning_rate": 1.0690397881196182e-05, "loss": 4.5236, "step": 2851 }, { "epoch": 0.7251231527093596, "grad_norm": 24801.009765625, "learning_rate": 1.0672205860799841e-05, "loss": 4.5062, "step": 2852 }, { "epoch": 0.7253774034641666, "grad_norm": 24844.7421875, "learning_rate": 1.0654025131026976e-05, "loss": 4.5098, "step": 2853 }, { "epoch": 0.7256316542189735, "grad_norm": 24626.2578125, "learning_rate": 1.06358557062044e-05, "loss": 4.5281, "step": 2854 }, { "epoch": 0.7258859049737804, "grad_norm": 24679.78515625, "learning_rate": 1.0617697600650033e-05, "loss": 4.5121, "step": 2855 }, { "epoch": 0.7261401557285874, "grad_norm": 24709.298828125, "learning_rate": 1.0599550828672886e-05, "loss": 4.5254, "step": 2856 }, { "epoch": 0.7263944064833943, "grad_norm": 24571.041015625, "learning_rate": 1.0581415404573008e-05, "loss": 4.5176, "step": 2857 }, { "epoch": 0.7266486572382012, "grad_norm": 24846.720703125, "learning_rate": 1.0563291342641515e-05, "loss": 4.5203, "step": 2858 }, { "epoch": 0.726902907993008, "grad_norm": 24602.583984375, "learning_rate": 1.054517865716059e-05, "loss": 4.5012, "step": 2859 }, { "epoch": 0.7271571587478151, "grad_norm": 24707.646484375, "learning_rate": 1.052707736240343e-05, "loss": 4.5106, "step": 2860 }, { "epoch": 0.727411409502622, "grad_norm": 24561.548828125, "learning_rate": 1.0508987472634249e-05, "loss": 4.5209, "step": 2861 }, { "epoch": 0.7276656602574288, "grad_norm": 24707.3828125, "learning_rate": 1.0490909002108303e-05, "loss": 4.4947, "step": 2862 }, { "epoch": 0.7279199110122359, "grad_norm": 24711.82421875, "learning_rate": 1.0472841965071831e-05, "loss": 4.5269, "step": 2863 }, { "epoch": 0.7281741617670427, "grad_norm": 24861.1484375, "learning_rate": 1.0454786375762049e-05, "loss": 4.5303, "step": 2864 }, { "epoch": 0.7284284125218496, "grad_norm": 24730.208984375, "learning_rate": 1.0436742248407177e-05, "loss": 4.5274, "step": 2865 }, { "epoch": 0.7286826632766567, "grad_norm": 24674.330078125, "learning_rate": 1.04187095972264e-05, "loss": 4.5066, "step": 2866 }, { "epoch": 0.7289369140314635, "grad_norm": 24659.705078125, "learning_rate": 1.0400688436429837e-05, "loss": 4.5086, "step": 2867 }, { "epoch": 0.7291911647862704, "grad_norm": 24667.701171875, "learning_rate": 1.0382678780218585e-05, "loss": 4.5005, "step": 2868 }, { "epoch": 0.7294454155410773, "grad_norm": 24320.171875, "learning_rate": 1.0364680642784646e-05, "loss": 4.52, "step": 2869 }, { "epoch": 0.7296996662958843, "grad_norm": 24739.58984375, "learning_rate": 1.034669403831095e-05, "loss": 4.5138, "step": 2870 }, { "epoch": 0.7299539170506912, "grad_norm": 24748.58984375, "learning_rate": 1.0328718980971361e-05, "loss": 4.5272, "step": 2871 }, { "epoch": 0.7302081678054981, "grad_norm": 24510.892578125, "learning_rate": 1.0310755484930621e-05, "loss": 4.507, "step": 2872 }, { "epoch": 0.7304624185603051, "grad_norm": 24731.041015625, "learning_rate": 1.0292803564344358e-05, "loss": 4.5131, "step": 2873 }, { "epoch": 0.730716669315112, "grad_norm": 24497.859375, "learning_rate": 1.0274863233359106e-05, "loss": 4.5071, "step": 2874 }, { "epoch": 0.7309709200699189, "grad_norm": 24641.732421875, "learning_rate": 1.0256934506112228e-05, "loss": 4.5205, "step": 2875 }, { "epoch": 0.7312251708247259, "grad_norm": 24699.0546875, "learning_rate": 1.0239017396731978e-05, "loss": 4.5069, "step": 2876 }, { "epoch": 0.7314794215795328, "grad_norm": 24471.77734375, "learning_rate": 1.0221111919337451e-05, "loss": 4.5017, "step": 2877 }, { "epoch": 0.7317336723343397, "grad_norm": 24584.58203125, "learning_rate": 1.0203218088038546e-05, "loss": 4.5266, "step": 2878 }, { "epoch": 0.7319879230891466, "grad_norm": 24751.947265625, "learning_rate": 1.0185335916936006e-05, "loss": 4.5191, "step": 2879 }, { "epoch": 0.7322421738439536, "grad_norm": 24693.267578125, "learning_rate": 1.0167465420121394e-05, "loss": 4.5181, "step": 2880 }, { "epoch": 0.7324964245987605, "grad_norm": 24565.5859375, "learning_rate": 1.0149606611677057e-05, "loss": 4.5209, "step": 2881 }, { "epoch": 0.7327506753535674, "grad_norm": 24728.08984375, "learning_rate": 1.0131759505676128e-05, "loss": 4.5137, "step": 2882 }, { "epoch": 0.7330049261083744, "grad_norm": 24549.197265625, "learning_rate": 1.0113924116182545e-05, "loss": 4.5203, "step": 2883 }, { "epoch": 0.7332591768631813, "grad_norm": 24702.048828125, "learning_rate": 1.0096100457250982e-05, "loss": 4.5081, "step": 2884 }, { "epoch": 0.7335134276179882, "grad_norm": 24607.986328125, "learning_rate": 1.0078288542926881e-05, "loss": 4.5141, "step": 2885 }, { "epoch": 0.7337676783727952, "grad_norm": 24852.66015625, "learning_rate": 1.0060488387246433e-05, "loss": 4.4966, "step": 2886 }, { "epoch": 0.7340219291276021, "grad_norm": 24571.57421875, "learning_rate": 1.0042700004236574e-05, "loss": 4.5055, "step": 2887 }, { "epoch": 0.734276179882409, "grad_norm": 24748.0625, "learning_rate": 1.0024923407914937e-05, "loss": 4.5174, "step": 2888 }, { "epoch": 0.7345304306372159, "grad_norm": 24663.998046875, "learning_rate": 1.0007158612289875e-05, "loss": 4.5018, "step": 2889 }, { "epoch": 0.7347846813920229, "grad_norm": 24664.279296875, "learning_rate": 9.989405631360454e-06, "loss": 4.518, "step": 2890 }, { "epoch": 0.7350389321468298, "grad_norm": 24721.41796875, "learning_rate": 9.971664479116424e-06, "loss": 4.5194, "step": 2891 }, { "epoch": 0.7352931829016367, "grad_norm": 24771.279296875, "learning_rate": 9.953935169538195e-06, "loss": 4.5195, "step": 2892 }, { "epoch": 0.7355474336564437, "grad_norm": 24779.580078125, "learning_rate": 9.93621771659688e-06, "loss": 4.5154, "step": 2893 }, { "epoch": 0.7358016844112506, "grad_norm": 24772.01171875, "learning_rate": 9.918512134254224e-06, "loss": 4.5082, "step": 2894 }, { "epoch": 0.7360559351660575, "grad_norm": 24735.28515625, "learning_rate": 9.900818436462608e-06, "loss": 4.5153, "step": 2895 }, { "epoch": 0.7363101859208645, "grad_norm": 24682.697265625, "learning_rate": 9.88313663716508e-06, "loss": 4.5095, "step": 2896 }, { "epoch": 0.7365644366756714, "grad_norm": 24713.830078125, "learning_rate": 9.865466750295299e-06, "loss": 4.5085, "step": 2897 }, { "epoch": 0.7368186874304783, "grad_norm": 24520.05078125, "learning_rate": 9.847808789777516e-06, "loss": 4.5033, "step": 2898 }, { "epoch": 0.7370729381852852, "grad_norm": 24690.03125, "learning_rate": 9.830162769526616e-06, "loss": 4.5131, "step": 2899 }, { "epoch": 0.7373271889400922, "grad_norm": 24717.259765625, "learning_rate": 9.812528703448053e-06, "loss": 4.5158, "step": 2900 }, { "epoch": 0.7373271889400922, "eval_loss": 9.091280937194824, "eval_runtime": 699.6815, "eval_samples_per_second": 151.477, "eval_steps_per_second": 9.469, "step": 2900 }, { "epoch": 0.7375814396948991, "grad_norm": 24643.61328125, "learning_rate": 9.79490660543786e-06, "loss": 4.5028, "step": 2901 }, { "epoch": 0.737835690449706, "grad_norm": 24695.962890625, "learning_rate": 9.77729648938266e-06, "loss": 4.5053, "step": 2902 }, { "epoch": 0.738089941204513, "grad_norm": 24660.46484375, "learning_rate": 9.759698369159608e-06, "loss": 4.5258, "step": 2903 }, { "epoch": 0.7383441919593199, "grad_norm": 24826.34375, "learning_rate": 9.742112258636415e-06, "loss": 4.5041, "step": 2904 }, { "epoch": 0.7385984427141268, "grad_norm": 24553.201171875, "learning_rate": 9.72453817167135e-06, "loss": 4.5012, "step": 2905 }, { "epoch": 0.7388526934689338, "grad_norm": 24939.783203125, "learning_rate": 9.706976122113162e-06, "loss": 4.519, "step": 2906 }, { "epoch": 0.7391069442237407, "grad_norm": 24790.2109375, "learning_rate": 9.689426123801157e-06, "loss": 4.5105, "step": 2907 }, { "epoch": 0.7393611949785476, "grad_norm": 24654.67578125, "learning_rate": 9.671888190565132e-06, "loss": 4.509, "step": 2908 }, { "epoch": 0.7396154457333545, "grad_norm": 24797.306640625, "learning_rate": 9.654362336225368e-06, "loss": 4.5131, "step": 2909 }, { "epoch": 0.7398696964881615, "grad_norm": 24856.2265625, "learning_rate": 9.636848574592616e-06, "loss": 4.4883, "step": 2910 }, { "epoch": 0.7401239472429684, "grad_norm": 24707.658203125, "learning_rate": 9.619346919468136e-06, "loss": 4.5157, "step": 2911 }, { "epoch": 0.7403781979977753, "grad_norm": 24921.486328125, "learning_rate": 9.601857384643617e-06, "loss": 4.5139, "step": 2912 }, { "epoch": 0.7406324487525823, "grad_norm": 25025.767578125, "learning_rate": 9.584379983901193e-06, "loss": 4.509, "step": 2913 }, { "epoch": 0.7408866995073892, "grad_norm": 24735.580078125, "learning_rate": 9.566914731013469e-06, "loss": 4.507, "step": 2914 }, { "epoch": 0.7411409502621961, "grad_norm": 24881.625, "learning_rate": 9.549461639743445e-06, "loss": 4.5001, "step": 2915 }, { "epoch": 0.7413952010170031, "grad_norm": 24822.53125, "learning_rate": 9.53202072384454e-06, "loss": 4.5144, "step": 2916 }, { "epoch": 0.74164945177181, "grad_norm": 24711.216796875, "learning_rate": 9.5145919970606e-06, "loss": 4.5049, "step": 2917 }, { "epoch": 0.7419037025266169, "grad_norm": 24874.71875, "learning_rate": 9.497175473125854e-06, "loss": 4.511, "step": 2918 }, { "epoch": 0.7421579532814238, "grad_norm": 24836.255859375, "learning_rate": 9.479771165764916e-06, "loss": 4.5082, "step": 2919 }, { "epoch": 0.7424122040362308, "grad_norm": 24491.640625, "learning_rate": 9.462379088692752e-06, "loss": 4.4925, "step": 2920 }, { "epoch": 0.7426664547910377, "grad_norm": 24887.7265625, "learning_rate": 9.444999255614734e-06, "loss": 4.4999, "step": 2921 }, { "epoch": 0.7429207055458446, "grad_norm": 24762.064453125, "learning_rate": 9.427631680226548e-06, "loss": 4.505, "step": 2922 }, { "epoch": 0.7431749563006516, "grad_norm": 24669.28515625, "learning_rate": 9.410276376214225e-06, "loss": 4.498, "step": 2923 }, { "epoch": 0.7434292070554585, "grad_norm": 24822.990234375, "learning_rate": 9.392933357254151e-06, "loss": 4.5174, "step": 2924 }, { "epoch": 0.7436834578102653, "grad_norm": 24809.12890625, "learning_rate": 9.37560263701301e-06, "loss": 4.5119, "step": 2925 }, { "epoch": 0.7439377085650724, "grad_norm": 24701.10546875, "learning_rate": 9.358284229147785e-06, "loss": 4.5113, "step": 2926 }, { "epoch": 0.7441919593198792, "grad_norm": 24921.001953125, "learning_rate": 9.34097814730578e-06, "loss": 4.5091, "step": 2927 }, { "epoch": 0.7444462100746861, "grad_norm": 24737.771484375, "learning_rate": 9.323684405124586e-06, "loss": 4.5099, "step": 2928 }, { "epoch": 0.744700460829493, "grad_norm": 24839.58984375, "learning_rate": 9.306403016232042e-06, "loss": 4.5022, "step": 2929 }, { "epoch": 0.7449547115843, "grad_norm": 24923.19921875, "learning_rate": 9.289133994246288e-06, "loss": 4.5225, "step": 2930 }, { "epoch": 0.7452089623391069, "grad_norm": 24689.068359375, "learning_rate": 9.271877352775693e-06, "loss": 4.507, "step": 2931 }, { "epoch": 0.7454632130939138, "grad_norm": 24819.75390625, "learning_rate": 9.25463310541887e-06, "loss": 4.511, "step": 2932 }, { "epoch": 0.7457174638487208, "grad_norm": 24796.404296875, "learning_rate": 9.237401265764687e-06, "loss": 4.5036, "step": 2933 }, { "epoch": 0.7459717146035277, "grad_norm": 24705.724609375, "learning_rate": 9.220181847392215e-06, "loss": 4.5241, "step": 2934 }, { "epoch": 0.7462259653583346, "grad_norm": 24488.142578125, "learning_rate": 9.202974863870734e-06, "loss": 4.5104, "step": 2935 }, { "epoch": 0.7464802161131416, "grad_norm": 24748.849609375, "learning_rate": 9.185780328759746e-06, "loss": 4.487, "step": 2936 }, { "epoch": 0.7467344668679485, "grad_norm": 24702.623046875, "learning_rate": 9.168598255608917e-06, "loss": 4.5082, "step": 2937 }, { "epoch": 0.7469887176227554, "grad_norm": 24454.08203125, "learning_rate": 9.151428657958119e-06, "loss": 4.4932, "step": 2938 }, { "epoch": 0.7472429683775623, "grad_norm": 24816.328125, "learning_rate": 9.13427154933738e-06, "loss": 4.5216, "step": 2939 }, { "epoch": 0.7474972191323693, "grad_norm": 24859.419921875, "learning_rate": 9.117126943266887e-06, "loss": 4.5138, "step": 2940 }, { "epoch": 0.7477514698871762, "grad_norm": 24800.67578125, "learning_rate": 9.09999485325696e-06, "loss": 4.5021, "step": 2941 }, { "epoch": 0.7480057206419831, "grad_norm": 24711.828125, "learning_rate": 9.082875292808091e-06, "loss": 4.5023, "step": 2942 }, { "epoch": 0.7482599713967901, "grad_norm": 24823.349609375, "learning_rate": 9.065768275410865e-06, "loss": 4.5068, "step": 2943 }, { "epoch": 0.748514222151597, "grad_norm": 24625.912109375, "learning_rate": 9.048673814545994e-06, "loss": 4.4879, "step": 2944 }, { "epoch": 0.7487684729064039, "grad_norm": 24630.763671875, "learning_rate": 9.03159192368431e-06, "loss": 4.5103, "step": 2945 }, { "epoch": 0.7490227236612109, "grad_norm": 24870.470703125, "learning_rate": 9.014522616286717e-06, "loss": 4.514, "step": 2946 }, { "epoch": 0.7492769744160178, "grad_norm": 24770.916015625, "learning_rate": 8.997465905804205e-06, "loss": 4.4901, "step": 2947 }, { "epoch": 0.7495312251708247, "grad_norm": 24852.30859375, "learning_rate": 8.980421805677855e-06, "loss": 4.4983, "step": 2948 }, { "epoch": 0.7497854759256316, "grad_norm": 24658.908203125, "learning_rate": 8.963390329338808e-06, "loss": 4.5004, "step": 2949 }, { "epoch": 0.7500397266804386, "grad_norm": 24816.4453125, "learning_rate": 8.946371490208241e-06, "loss": 4.5079, "step": 2950 }, { "epoch": 0.7502939774352455, "grad_norm": 24840.474609375, "learning_rate": 8.929365301697373e-06, "loss": 4.495, "step": 2951 }, { "epoch": 0.7505482281900524, "grad_norm": 24781.185546875, "learning_rate": 8.912371777207478e-06, "loss": 4.496, "step": 2952 }, { "epoch": 0.7508024789448594, "grad_norm": 24763.560546875, "learning_rate": 8.89539093012983e-06, "loss": 4.5163, "step": 2953 }, { "epoch": 0.7510567296996663, "grad_norm": 24786.45703125, "learning_rate": 8.878422773845704e-06, "loss": 4.5086, "step": 2954 }, { "epoch": 0.7513109804544732, "grad_norm": 24827.353515625, "learning_rate": 8.86146732172641e-06, "loss": 4.4979, "step": 2955 }, { "epoch": 0.7515652312092802, "grad_norm": 24724.4140625, "learning_rate": 8.844524587133216e-06, "loss": 4.4954, "step": 2956 }, { "epoch": 0.7518194819640871, "grad_norm": 24726.474609375, "learning_rate": 8.827594583417365e-06, "loss": 4.5073, "step": 2957 }, { "epoch": 0.752073732718894, "grad_norm": 24840.26171875, "learning_rate": 8.81067732392009e-06, "loss": 4.475, "step": 2958 }, { "epoch": 0.7523279834737009, "grad_norm": 24783.5703125, "learning_rate": 8.793772821972582e-06, "loss": 4.5137, "step": 2959 }, { "epoch": 0.7525822342285079, "grad_norm": 24860.142578125, "learning_rate": 8.77688109089595e-06, "loss": 4.5024, "step": 2960 }, { "epoch": 0.7528364849833148, "grad_norm": 24594.4921875, "learning_rate": 8.760002144001272e-06, "loss": 4.4995, "step": 2961 }, { "epoch": 0.7530907357381217, "grad_norm": 24607.1484375, "learning_rate": 8.743135994589533e-06, "loss": 4.4845, "step": 2962 }, { "epoch": 0.7533449864929287, "grad_norm": 24703.84375, "learning_rate": 8.726282655951625e-06, "loss": 4.4897, "step": 2963 }, { "epoch": 0.7535992372477356, "grad_norm": 24817.990234375, "learning_rate": 8.70944214136838e-06, "loss": 4.4955, "step": 2964 }, { "epoch": 0.7538534880025425, "grad_norm": 24610.814453125, "learning_rate": 8.692614464110486e-06, "loss": 4.5021, "step": 2965 }, { "epoch": 0.7541077387573495, "grad_norm": 24879.591796875, "learning_rate": 8.675799637438522e-06, "loss": 4.5059, "step": 2966 }, { "epoch": 0.7543619895121564, "grad_norm": 24550.423828125, "learning_rate": 8.658997674602973e-06, "loss": 4.4913, "step": 2967 }, { "epoch": 0.7546162402669633, "grad_norm": 24834.60546875, "learning_rate": 8.642208588844139e-06, "loss": 4.5187, "step": 2968 }, { "epoch": 0.7548704910217702, "grad_norm": 24763.9765625, "learning_rate": 8.625432393392219e-06, "loss": 4.5108, "step": 2969 }, { "epoch": 0.7551247417765772, "grad_norm": 24771.94921875, "learning_rate": 8.60866910146721e-06, "loss": 4.5151, "step": 2970 }, { "epoch": 0.7553789925313841, "grad_norm": 24773.953125, "learning_rate": 8.591918726278982e-06, "loss": 4.509, "step": 2971 }, { "epoch": 0.755633243286191, "grad_norm": 24784.517578125, "learning_rate": 8.575181281027192e-06, "loss": 4.5014, "step": 2972 }, { "epoch": 0.755887494040998, "grad_norm": 24757.48828125, "learning_rate": 8.558456778901333e-06, "loss": 4.5124, "step": 2973 }, { "epoch": 0.7561417447958049, "grad_norm": 24694.72265625, "learning_rate": 8.541745233080687e-06, "loss": 4.5031, "step": 2974 }, { "epoch": 0.7563959955506118, "grad_norm": 24608.060546875, "learning_rate": 8.525046656734312e-06, "loss": 4.5034, "step": 2975 }, { "epoch": 0.7566502463054188, "grad_norm": 24574.517578125, "learning_rate": 8.508361063021084e-06, "loss": 4.4907, "step": 2976 }, { "epoch": 0.7569044970602257, "grad_norm": 24649.9609375, "learning_rate": 8.491688465089612e-06, "loss": 4.5132, "step": 2977 }, { "epoch": 0.7571587478150326, "grad_norm": 24596.609375, "learning_rate": 8.475028876078272e-06, "loss": 4.4972, "step": 2978 }, { "epoch": 0.7574129985698395, "grad_norm": 24690.931640625, "learning_rate": 8.458382309115212e-06, "loss": 4.5074, "step": 2979 }, { "epoch": 0.7576672493246465, "grad_norm": 24754.642578125, "learning_rate": 8.44174877731828e-06, "loss": 4.4943, "step": 2980 }, { "epoch": 0.7579215000794534, "grad_norm": 24791.7421875, "learning_rate": 8.425128293795095e-06, "loss": 4.5083, "step": 2981 }, { "epoch": 0.7581757508342603, "grad_norm": 24782.58203125, "learning_rate": 8.40852087164295e-06, "loss": 4.4886, "step": 2982 }, { "epoch": 0.7584300015890673, "grad_norm": 24663.4140625, "learning_rate": 8.391926523948883e-06, "loss": 4.5062, "step": 2983 }, { "epoch": 0.7586842523438742, "grad_norm": 24547.46875, "learning_rate": 8.375345263789608e-06, "loss": 4.4846, "step": 2984 }, { "epoch": 0.758938503098681, "grad_norm": 24608.7109375, "learning_rate": 8.358777104231516e-06, "loss": 4.4955, "step": 2985 }, { "epoch": 0.7591927538534881, "grad_norm": 24791.587890625, "learning_rate": 8.342222058330708e-06, "loss": 4.495, "step": 2986 }, { "epoch": 0.759447004608295, "grad_norm": 25059.408203125, "learning_rate": 8.32568013913293e-06, "loss": 4.5227, "step": 2987 }, { "epoch": 0.7597012553631018, "grad_norm": 24596.451171875, "learning_rate": 8.309151359673567e-06, "loss": 4.5026, "step": 2988 }, { "epoch": 0.7599555061179087, "grad_norm": 24864.330078125, "learning_rate": 8.29263573297769e-06, "loss": 4.4971, "step": 2989 }, { "epoch": 0.7602097568727157, "grad_norm": 24871.751953125, "learning_rate": 8.276133272059968e-06, "loss": 4.5126, "step": 2990 }, { "epoch": 0.7604640076275226, "grad_norm": 24624.2265625, "learning_rate": 8.259643989924718e-06, "loss": 4.4852, "step": 2991 }, { "epoch": 0.7607182583823295, "grad_norm": 24952.94921875, "learning_rate": 8.243167899565874e-06, "loss": 4.515, "step": 2992 }, { "epoch": 0.7609725091371365, "grad_norm": 24933.1796875, "learning_rate": 8.22670501396695e-06, "loss": 4.5153, "step": 2993 }, { "epoch": 0.7612267598919434, "grad_norm": 24772.37109375, "learning_rate": 8.210255346101073e-06, "loss": 4.4977, "step": 2994 }, { "epoch": 0.7614810106467503, "grad_norm": 24786.470703125, "learning_rate": 8.19381890893096e-06, "loss": 4.4987, "step": 2995 }, { "epoch": 0.7617352614015573, "grad_norm": 24628.978515625, "learning_rate": 8.177395715408881e-06, "loss": 4.4863, "step": 2996 }, { "epoch": 0.7619895121563642, "grad_norm": 24777.875, "learning_rate": 8.16098577847668e-06, "loss": 4.4865, "step": 2997 }, { "epoch": 0.7622437629111711, "grad_norm": 24735.517578125, "learning_rate": 8.144589111065767e-06, "loss": 4.4895, "step": 2998 }, { "epoch": 0.762498013665978, "grad_norm": 24796.296875, "learning_rate": 8.12820572609708e-06, "loss": 4.5068, "step": 2999 }, { "epoch": 0.762752264420785, "grad_norm": 24632.31640625, "learning_rate": 8.111835636481083e-06, "loss": 4.5016, "step": 3000 }, { "epoch": 0.762752264420785, "eval_loss": 9.066652297973633, "eval_runtime": 700.0909, "eval_samples_per_second": 151.389, "eval_steps_per_second": 9.463, "step": 3000 }, { "epoch": 0.7630065151755919, "grad_norm": 24812.98046875, "learning_rate": 8.095478855117786e-06, "loss": 4.4949, "step": 3001 }, { "epoch": 0.7632607659303988, "grad_norm": 24744.34765625, "learning_rate": 8.079135394896704e-06, "loss": 4.513, "step": 3002 }, { "epoch": 0.7635150166852058, "grad_norm": 24719.552734375, "learning_rate": 8.062805268696836e-06, "loss": 4.4922, "step": 3003 }, { "epoch": 0.7637692674400127, "grad_norm": 24782.23046875, "learning_rate": 8.046488489386703e-06, "loss": 4.4881, "step": 3004 }, { "epoch": 0.7640235181948196, "grad_norm": 24790.525390625, "learning_rate": 8.030185069824286e-06, "loss": 4.5201, "step": 3005 }, { "epoch": 0.7642777689496266, "grad_norm": 24780.771484375, "learning_rate": 8.013895022857041e-06, "loss": 4.4881, "step": 3006 }, { "epoch": 0.7645320197044335, "grad_norm": 24717.3828125, "learning_rate": 7.9976183613219e-06, "loss": 4.4951, "step": 3007 }, { "epoch": 0.7647862704592404, "grad_norm": 24726.548828125, "learning_rate": 7.98135509804524e-06, "loss": 4.4978, "step": 3008 }, { "epoch": 0.7650405212140473, "grad_norm": 24805.623046875, "learning_rate": 7.965105245842861e-06, "loss": 4.5157, "step": 3009 }, { "epoch": 0.7652947719688543, "grad_norm": 24728.56640625, "learning_rate": 7.948868817520028e-06, "loss": 4.5039, "step": 3010 }, { "epoch": 0.7655490227236612, "grad_norm": 24716.193359375, "learning_rate": 7.932645825871397e-06, "loss": 4.5065, "step": 3011 }, { "epoch": 0.7658032734784681, "grad_norm": 24814.25390625, "learning_rate": 7.916436283681064e-06, "loss": 4.5049, "step": 3012 }, { "epoch": 0.7660575242332751, "grad_norm": 24650.6328125, "learning_rate": 7.9002402037225e-06, "loss": 4.4915, "step": 3013 }, { "epoch": 0.766311774988082, "grad_norm": 24666.185546875, "learning_rate": 7.884057598758594e-06, "loss": 4.4985, "step": 3014 }, { "epoch": 0.7665660257428889, "grad_norm": 24725.87109375, "learning_rate": 7.867888481541592e-06, "loss": 4.5065, "step": 3015 }, { "epoch": 0.7668202764976959, "grad_norm": 24871.794921875, "learning_rate": 7.851732864813116e-06, "loss": 4.4943, "step": 3016 }, { "epoch": 0.7670745272525028, "grad_norm": 24606.994140625, "learning_rate": 7.835590761304168e-06, "loss": 4.4942, "step": 3017 }, { "epoch": 0.7673287780073097, "grad_norm": 24957.333984375, "learning_rate": 7.819462183735083e-06, "loss": 4.507, "step": 3018 }, { "epoch": 0.7675830287621166, "grad_norm": 24821.9296875, "learning_rate": 7.803347144815531e-06, "loss": 4.4932, "step": 3019 }, { "epoch": 0.7678372795169236, "grad_norm": 24766.341796875, "learning_rate": 7.787245657244543e-06, "loss": 4.4862, "step": 3020 }, { "epoch": 0.7680915302717305, "grad_norm": 24696.41796875, "learning_rate": 7.771157733710435e-06, "loss": 4.4828, "step": 3021 }, { "epoch": 0.7683457810265374, "grad_norm": 24908.318359375, "learning_rate": 7.755083386890864e-06, "loss": 4.5207, "step": 3022 }, { "epoch": 0.7686000317813444, "grad_norm": 24870.7265625, "learning_rate": 7.739022629452777e-06, "loss": 4.4882, "step": 3023 }, { "epoch": 0.7688542825361513, "grad_norm": 24655.09375, "learning_rate": 7.72297547405241e-06, "loss": 4.4829, "step": 3024 }, { "epoch": 0.7691085332909582, "grad_norm": 25012.5, "learning_rate": 7.70694193333527e-06, "loss": 4.5009, "step": 3025 }, { "epoch": 0.7693627840457652, "grad_norm": 24816.1953125, "learning_rate": 7.690922019936164e-06, "loss": 4.4962, "step": 3026 }, { "epoch": 0.7696170348005721, "grad_norm": 24783.46484375, "learning_rate": 7.67491574647914e-06, "loss": 4.4841, "step": 3027 }, { "epoch": 0.769871285555379, "grad_norm": 24930.853515625, "learning_rate": 7.658923125577483e-06, "loss": 4.4999, "step": 3028 }, { "epoch": 0.7701255363101859, "grad_norm": 24910.69921875, "learning_rate": 7.642944169833763e-06, "loss": 4.4916, "step": 3029 }, { "epoch": 0.7703797870649929, "grad_norm": 24702.07421875, "learning_rate": 7.626978891839742e-06, "loss": 4.4825, "step": 3030 }, { "epoch": 0.7706340378197998, "grad_norm": 24853.23046875, "learning_rate": 7.61102730417641e-06, "loss": 4.4849, "step": 3031 }, { "epoch": 0.7708882885746067, "grad_norm": 24865.048828125, "learning_rate": 7.595089419413989e-06, "loss": 4.4957, "step": 3032 }, { "epoch": 0.7711425393294137, "grad_norm": 24514.96875, "learning_rate": 7.579165250111894e-06, "loss": 4.4942, "step": 3033 }, { "epoch": 0.7713967900842206, "grad_norm": 24968.3828125, "learning_rate": 7.563254808818712e-06, "loss": 4.5064, "step": 3034 }, { "epoch": 0.7716510408390275, "grad_norm": 24628.16015625, "learning_rate": 7.547358108072244e-06, "loss": 4.5035, "step": 3035 }, { "epoch": 0.7719052915938344, "grad_norm": 24699.173828125, "learning_rate": 7.53147516039944e-06, "loss": 4.4921, "step": 3036 }, { "epoch": 0.7721595423486414, "grad_norm": 24687.427734375, "learning_rate": 7.515605978316412e-06, "loss": 4.4798, "step": 3037 }, { "epoch": 0.7724137931034483, "grad_norm": 24806.796875, "learning_rate": 7.499750574328446e-06, "loss": 4.4803, "step": 3038 }, { "epoch": 0.7726680438582552, "grad_norm": 24806.646484375, "learning_rate": 7.483908960929947e-06, "loss": 4.4885, "step": 3039 }, { "epoch": 0.7729222946130622, "grad_norm": 24857.46484375, "learning_rate": 7.4680811506044615e-06, "loss": 4.4924, "step": 3040 }, { "epoch": 0.7731765453678691, "grad_norm": 24780.79296875, "learning_rate": 7.452267155824666e-06, "loss": 4.5027, "step": 3041 }, { "epoch": 0.773430796122676, "grad_norm": 24662.123046875, "learning_rate": 7.436466989052332e-06, "loss": 4.4883, "step": 3042 }, { "epoch": 0.773685046877483, "grad_norm": 24766.751953125, "learning_rate": 7.420680662738363e-06, "loss": 4.5038, "step": 3043 }, { "epoch": 0.7739392976322899, "grad_norm": 24775.390625, "learning_rate": 7.40490818932272e-06, "loss": 4.4829, "step": 3044 }, { "epoch": 0.7741935483870968, "grad_norm": 24670.78515625, "learning_rate": 7.389149581234484e-06, "loss": 4.4984, "step": 3045 }, { "epoch": 0.7744477991419036, "grad_norm": 24701.46484375, "learning_rate": 7.3734048508917826e-06, "loss": 4.5015, "step": 3046 }, { "epoch": 0.7747020498967107, "grad_norm": 24848.77734375, "learning_rate": 7.357674010701807e-06, "loss": 4.5075, "step": 3047 }, { "epoch": 0.7749563006515175, "grad_norm": 24823.4453125, "learning_rate": 7.341957073060832e-06, "loss": 4.4872, "step": 3048 }, { "epoch": 0.7752105514063244, "grad_norm": 24722.748046875, "learning_rate": 7.326254050354151e-06, "loss": 4.4839, "step": 3049 }, { "epoch": 0.7754648021611315, "grad_norm": 24862.36328125, "learning_rate": 7.310564954956084e-06, "loss": 4.4994, "step": 3050 }, { "epoch": 0.7757190529159383, "grad_norm": 24736.33203125, "learning_rate": 7.2948897992300094e-06, "loss": 4.4816, "step": 3051 }, { "epoch": 0.7759733036707452, "grad_norm": 24862.775390625, "learning_rate": 7.2792285955282896e-06, "loss": 4.4932, "step": 3052 }, { "epoch": 0.7762275544255522, "grad_norm": 24790.2890625, "learning_rate": 7.263581356192306e-06, "loss": 4.496, "step": 3053 }, { "epoch": 0.7764818051803591, "grad_norm": 24886.8984375, "learning_rate": 7.247948093552448e-06, "loss": 4.4923, "step": 3054 }, { "epoch": 0.776736055935166, "grad_norm": 24792.615234375, "learning_rate": 7.232328819928069e-06, "loss": 4.5034, "step": 3055 }, { "epoch": 0.7769903066899729, "grad_norm": 24817.869140625, "learning_rate": 7.216723547627496e-06, "loss": 4.4993, "step": 3056 }, { "epoch": 0.7772445574447799, "grad_norm": 24770.5859375, "learning_rate": 7.201132288948051e-06, "loss": 4.4996, "step": 3057 }, { "epoch": 0.7774988081995868, "grad_norm": 24777.591796875, "learning_rate": 7.185555056175991e-06, "loss": 4.4889, "step": 3058 }, { "epoch": 0.7777530589543937, "grad_norm": 24638.62109375, "learning_rate": 7.169991861586514e-06, "loss": 4.4882, "step": 3059 }, { "epoch": 0.7780073097092007, "grad_norm": 24901.322265625, "learning_rate": 7.154442717443785e-06, "loss": 4.4843, "step": 3060 }, { "epoch": 0.7782615604640076, "grad_norm": 24750.486328125, "learning_rate": 7.138907636000866e-06, "loss": 4.4842, "step": 3061 }, { "epoch": 0.7785158112188145, "grad_norm": 24643.51953125, "learning_rate": 7.123386629499748e-06, "loss": 4.4899, "step": 3062 }, { "epoch": 0.7787700619736215, "grad_norm": 24659.4453125, "learning_rate": 7.107879710171339e-06, "loss": 4.4771, "step": 3063 }, { "epoch": 0.7790243127284284, "grad_norm": 24744.48828125, "learning_rate": 7.092386890235444e-06, "loss": 4.4763, "step": 3064 }, { "epoch": 0.7792785634832353, "grad_norm": 24783.048828125, "learning_rate": 7.076908181900741e-06, "loss": 4.4981, "step": 3065 }, { "epoch": 0.7795328142380422, "grad_norm": 24890.05078125, "learning_rate": 7.061443597364814e-06, "loss": 4.4992, "step": 3066 }, { "epoch": 0.7797870649928492, "grad_norm": 24672.982421875, "learning_rate": 7.045993148814095e-06, "loss": 4.4753, "step": 3067 }, { "epoch": 0.7800413157476561, "grad_norm": 24757.85546875, "learning_rate": 7.030556848423875e-06, "loss": 4.4962, "step": 3068 }, { "epoch": 0.780295566502463, "grad_norm": 24777.28125, "learning_rate": 7.0151347083583255e-06, "loss": 4.4848, "step": 3069 }, { "epoch": 0.78054981725727, "grad_norm": 24790.796875, "learning_rate": 6.9997267407704265e-06, "loss": 4.4847, "step": 3070 }, { "epoch": 0.7808040680120769, "grad_norm": 24711.962890625, "learning_rate": 6.984332957801998e-06, "loss": 4.488, "step": 3071 }, { "epoch": 0.7810583187668838, "grad_norm": 24655.369140625, "learning_rate": 6.968953371583697e-06, "loss": 4.4777, "step": 3072 }, { "epoch": 0.7813125695216908, "grad_norm": 24812.095703125, "learning_rate": 6.9535879942349755e-06, "loss": 4.4973, "step": 3073 }, { "epoch": 0.7815668202764977, "grad_norm": 24676.564453125, "learning_rate": 6.938236837864104e-06, "loss": 4.5004, "step": 3074 }, { "epoch": 0.7818210710313046, "grad_norm": 24703.044921875, "learning_rate": 6.922899914568126e-06, "loss": 4.4871, "step": 3075 }, { "epoch": 0.7820753217861115, "grad_norm": 24819.53125, "learning_rate": 6.907577236432896e-06, "loss": 4.4941, "step": 3076 }, { "epoch": 0.7823295725409185, "grad_norm": 24601.890625, "learning_rate": 6.892268815533021e-06, "loss": 4.4922, "step": 3077 }, { "epoch": 0.7825838232957254, "grad_norm": 24802.498046875, "learning_rate": 6.876974663931873e-06, "loss": 4.4829, "step": 3078 }, { "epoch": 0.7828380740505323, "grad_norm": 24886.794921875, "learning_rate": 6.861694793681603e-06, "loss": 4.4925, "step": 3079 }, { "epoch": 0.7830923248053393, "grad_norm": 24908.9609375, "learning_rate": 6.846429216823083e-06, "loss": 4.4968, "step": 3080 }, { "epoch": 0.7833465755601462, "grad_norm": 24722.369140625, "learning_rate": 6.831177945385925e-06, "loss": 4.4965, "step": 3081 }, { "epoch": 0.7836008263149531, "grad_norm": 24695.556640625, "learning_rate": 6.815940991388484e-06, "loss": 4.498, "step": 3082 }, { "epoch": 0.7838550770697601, "grad_norm": 24929.9296875, "learning_rate": 6.800718366837808e-06, "loss": 4.5075, "step": 3083 }, { "epoch": 0.784109327824567, "grad_norm": 24741.54296875, "learning_rate": 6.7855100837296765e-06, "loss": 4.4785, "step": 3084 }, { "epoch": 0.7843635785793739, "grad_norm": 24830.31640625, "learning_rate": 6.770316154048567e-06, "loss": 4.4762, "step": 3085 }, { "epoch": 0.7846178293341808, "grad_norm": 24922.259765625, "learning_rate": 6.755136589767624e-06, "loss": 4.4859, "step": 3086 }, { "epoch": 0.7848720800889878, "grad_norm": 24668.982421875, "learning_rate": 6.739971402848683e-06, "loss": 4.4886, "step": 3087 }, { "epoch": 0.7851263308437947, "grad_norm": 24701.212890625, "learning_rate": 6.724820605242263e-06, "loss": 4.4833, "step": 3088 }, { "epoch": 0.7853805815986016, "grad_norm": 24762.29296875, "learning_rate": 6.7096842088875285e-06, "loss": 4.4913, "step": 3089 }, { "epoch": 0.7856348323534086, "grad_norm": 24827.857421875, "learning_rate": 6.694562225712292e-06, "loss": 4.479, "step": 3090 }, { "epoch": 0.7858890831082155, "grad_norm": 24873.296875, "learning_rate": 6.679454667633025e-06, "loss": 4.4921, "step": 3091 }, { "epoch": 0.7861433338630224, "grad_norm": 24919.337890625, "learning_rate": 6.664361546554823e-06, "loss": 4.4928, "step": 3092 }, { "epoch": 0.7863975846178294, "grad_norm": 24770.166015625, "learning_rate": 6.6492828743713904e-06, "loss": 4.4951, "step": 3093 }, { "epoch": 0.7866518353726363, "grad_norm": 24725.470703125, "learning_rate": 6.634218662965072e-06, "loss": 4.4937, "step": 3094 }, { "epoch": 0.7869060861274432, "grad_norm": 24801.416015625, "learning_rate": 6.619168924206809e-06, "loss": 4.4901, "step": 3095 }, { "epoch": 0.7871603368822501, "grad_norm": 24770.974609375, "learning_rate": 6.6041336699561165e-06, "loss": 4.4758, "step": 3096 }, { "epoch": 0.7874145876370571, "grad_norm": 24936.57421875, "learning_rate": 6.5891129120611325e-06, "loss": 4.4823, "step": 3097 }, { "epoch": 0.787668838391864, "grad_norm": 25135.271484375, "learning_rate": 6.574106662358542e-06, "loss": 4.4974, "step": 3098 }, { "epoch": 0.7879230891466709, "grad_norm": 24939.65625, "learning_rate": 6.559114932673599e-06, "loss": 4.4753, "step": 3099 }, { "epoch": 0.7881773399014779, "grad_norm": 24745.9609375, "learning_rate": 6.544137734820136e-06, "loss": 4.4644, "step": 3100 }, { "epoch": 0.7881773399014779, "eval_loss": 9.046455383300781, "eval_runtime": 699.1793, "eval_samples_per_second": 151.586, "eval_steps_per_second": 9.475, "step": 3100 }, { "epoch": 0.7884315906562848, "grad_norm": 25031.130859375, "learning_rate": 6.529175080600516e-06, "loss": 4.4777, "step": 3101 }, { "epoch": 0.7886858414110917, "grad_norm": 25009.912109375, "learning_rate": 6.514226981805638e-06, "loss": 4.495, "step": 3102 }, { "epoch": 0.7889400921658987, "grad_norm": 24806.14453125, "learning_rate": 6.499293450214955e-06, "loss": 4.4785, "step": 3103 }, { "epoch": 0.7891943429207056, "grad_norm": 24932.083984375, "learning_rate": 6.484374497596413e-06, "loss": 4.5037, "step": 3104 }, { "epoch": 0.7894485936755125, "grad_norm": 24777.408203125, "learning_rate": 6.46947013570649e-06, "loss": 4.4774, "step": 3105 }, { "epoch": 0.7897028444303194, "grad_norm": 24946.05859375, "learning_rate": 6.454580376290148e-06, "loss": 4.4738, "step": 3106 }, { "epoch": 0.7899570951851264, "grad_norm": 24691.1796875, "learning_rate": 6.439705231080867e-06, "loss": 4.4738, "step": 3107 }, { "epoch": 0.7902113459399333, "grad_norm": 24852.51953125, "learning_rate": 6.424844711800584e-06, "loss": 4.4829, "step": 3108 }, { "epoch": 0.7904655966947401, "grad_norm": 24964.654296875, "learning_rate": 6.409998830159716e-06, "loss": 4.4898, "step": 3109 }, { "epoch": 0.7907198474495472, "grad_norm": 24723.6171875, "learning_rate": 6.395167597857169e-06, "loss": 4.4787, "step": 3110 }, { "epoch": 0.790974098204354, "grad_norm": 24948.20703125, "learning_rate": 6.380351026580275e-06, "loss": 4.4911, "step": 3111 }, { "epoch": 0.7912283489591609, "grad_norm": 24780.328125, "learning_rate": 6.365549128004822e-06, "loss": 4.4809, "step": 3112 }, { "epoch": 0.791482599713968, "grad_norm": 24843.33203125, "learning_rate": 6.350761913795048e-06, "loss": 4.4865, "step": 3113 }, { "epoch": 0.7917368504687748, "grad_norm": 24697.119140625, "learning_rate": 6.335989395603598e-06, "loss": 4.4911, "step": 3114 }, { "epoch": 0.7919911012235817, "grad_norm": 24744.1328125, "learning_rate": 6.321231585071563e-06, "loss": 4.4764, "step": 3115 }, { "epoch": 0.7922453519783886, "grad_norm": 24878.234375, "learning_rate": 6.306488493828416e-06, "loss": 4.4725, "step": 3116 }, { "epoch": 0.7924996027331956, "grad_norm": 24903.19140625, "learning_rate": 6.291760133492056e-06, "loss": 4.4926, "step": 3117 }, { "epoch": 0.7927538534880025, "grad_norm": 24664.0, "learning_rate": 6.2770465156687466e-06, "loss": 4.4831, "step": 3118 }, { "epoch": 0.7930081042428094, "grad_norm": 24822.92578125, "learning_rate": 6.262347651953162e-06, "loss": 4.4884, "step": 3119 }, { "epoch": 0.7932623549976164, "grad_norm": 24909.0, "learning_rate": 6.247663553928338e-06, "loss": 4.481, "step": 3120 }, { "epoch": 0.7935166057524233, "grad_norm": 24926.3125, "learning_rate": 6.232994233165653e-06, "loss": 4.4773, "step": 3121 }, { "epoch": 0.7937708565072302, "grad_norm": 24859.541015625, "learning_rate": 6.218339701224887e-06, "loss": 4.4863, "step": 3122 }, { "epoch": 0.7940251072620372, "grad_norm": 24714.427734375, "learning_rate": 6.203699969654131e-06, "loss": 4.49, "step": 3123 }, { "epoch": 0.7942793580168441, "grad_norm": 24748.134765625, "learning_rate": 6.189075049989809e-06, "loss": 4.4661, "step": 3124 }, { "epoch": 0.794533608771651, "grad_norm": 24945.236328125, "learning_rate": 6.174464953756706e-06, "loss": 4.5007, "step": 3125 }, { "epoch": 0.7947878595264579, "grad_norm": 24647.71484375, "learning_rate": 6.15986969246789e-06, "loss": 4.4893, "step": 3126 }, { "epoch": 0.7950421102812649, "grad_norm": 24716.8828125, "learning_rate": 6.145289277624761e-06, "loss": 4.4911, "step": 3127 }, { "epoch": 0.7952963610360718, "grad_norm": 24663.099609375, "learning_rate": 6.130723720717021e-06, "loss": 4.4836, "step": 3128 }, { "epoch": 0.7955506117908787, "grad_norm": 24772.46875, "learning_rate": 6.116173033222649e-06, "loss": 4.4936, "step": 3129 }, { "epoch": 0.7958048625456857, "grad_norm": 24830.66796875, "learning_rate": 6.1016372266079065e-06, "loss": 4.4754, "step": 3130 }, { "epoch": 0.7960591133004926, "grad_norm": 24866.197265625, "learning_rate": 6.087116312327348e-06, "loss": 4.489, "step": 3131 }, { "epoch": 0.7963133640552995, "grad_norm": 24880.685546875, "learning_rate": 6.072610301823775e-06, "loss": 4.4933, "step": 3132 }, { "epoch": 0.7965676148101065, "grad_norm": 24822.49609375, "learning_rate": 6.058119206528243e-06, "loss": 4.4874, "step": 3133 }, { "epoch": 0.7968218655649134, "grad_norm": 25177.599609375, "learning_rate": 6.043643037860072e-06, "loss": 4.4707, "step": 3134 }, { "epoch": 0.7970761163197203, "grad_norm": 24640.3671875, "learning_rate": 6.029181807226794e-06, "loss": 4.4792, "step": 3135 }, { "epoch": 0.7973303670745272, "grad_norm": 25037.650390625, "learning_rate": 6.0147355260241986e-06, "loss": 4.4933, "step": 3136 }, { "epoch": 0.7975846178293342, "grad_norm": 24882.111328125, "learning_rate": 6.000304205636265e-06, "loss": 4.4712, "step": 3137 }, { "epoch": 0.7978388685841411, "grad_norm": 24760.849609375, "learning_rate": 5.985887857435213e-06, "loss": 4.4829, "step": 3138 }, { "epoch": 0.798093119338948, "grad_norm": 24810.79296875, "learning_rate": 5.971486492781442e-06, "loss": 4.4848, "step": 3139 }, { "epoch": 0.798347370093755, "grad_norm": 24867.193359375, "learning_rate": 5.957100123023543e-06, "loss": 4.4876, "step": 3140 }, { "epoch": 0.7986016208485619, "grad_norm": 24791.056640625, "learning_rate": 5.942728759498309e-06, "loss": 4.4807, "step": 3141 }, { "epoch": 0.7988558716033688, "grad_norm": 24756.990234375, "learning_rate": 5.9283724135306946e-06, "loss": 4.4777, "step": 3142 }, { "epoch": 0.7991101223581758, "grad_norm": 24763.775390625, "learning_rate": 5.9140310964338145e-06, "loss": 4.4831, "step": 3143 }, { "epoch": 0.7993643731129827, "grad_norm": 24711.83984375, "learning_rate": 5.899704819508964e-06, "loss": 4.4806, "step": 3144 }, { "epoch": 0.7996186238677896, "grad_norm": 24668.044921875, "learning_rate": 5.885393594045552e-06, "loss": 4.4817, "step": 3145 }, { "epoch": 0.7998728746225965, "grad_norm": 24867.08203125, "learning_rate": 5.871097431321165e-06, "loss": 4.4777, "step": 3146 }, { "epoch": 0.8001271253774035, "grad_norm": 25030.48046875, "learning_rate": 5.856816342601481e-06, "loss": 4.4844, "step": 3147 }, { "epoch": 0.8003813761322104, "grad_norm": 24788.541015625, "learning_rate": 5.84255033914034e-06, "loss": 4.4786, "step": 3148 }, { "epoch": 0.8006356268870173, "grad_norm": 24939.25, "learning_rate": 5.828299432179652e-06, "loss": 4.4946, "step": 3149 }, { "epoch": 0.8008898776418243, "grad_norm": 25031.431640625, "learning_rate": 5.814063632949468e-06, "loss": 4.4854, "step": 3150 }, { "epoch": 0.8011441283966312, "grad_norm": 24842.267578125, "learning_rate": 5.799842952667911e-06, "loss": 4.4764, "step": 3151 }, { "epoch": 0.8013983791514381, "grad_norm": 24891.974609375, "learning_rate": 5.785637402541189e-06, "loss": 4.4838, "step": 3152 }, { "epoch": 0.8016526299062451, "grad_norm": 24802.287109375, "learning_rate": 5.77144699376361e-06, "loss": 4.4853, "step": 3153 }, { "epoch": 0.801906880661052, "grad_norm": 24971.9296875, "learning_rate": 5.757271737517525e-06, "loss": 4.4829, "step": 3154 }, { "epoch": 0.8021611314158589, "grad_norm": 24811.19921875, "learning_rate": 5.743111644973348e-06, "loss": 4.4875, "step": 3155 }, { "epoch": 0.8024153821706658, "grad_norm": 24843.765625, "learning_rate": 5.728966727289564e-06, "loss": 4.4785, "step": 3156 }, { "epoch": 0.8026696329254728, "grad_norm": 24808.470703125, "learning_rate": 5.714836995612671e-06, "loss": 4.4925, "step": 3157 }, { "epoch": 0.8029238836802797, "grad_norm": 24870.248046875, "learning_rate": 5.700722461077226e-06, "loss": 4.491, "step": 3158 }, { "epoch": 0.8031781344350866, "grad_norm": 24869.529296875, "learning_rate": 5.686623134805802e-06, "loss": 4.4811, "step": 3159 }, { "epoch": 0.8034323851898936, "grad_norm": 24906.11328125, "learning_rate": 5.672539027908977e-06, "loss": 4.4887, "step": 3160 }, { "epoch": 0.8036866359447005, "grad_norm": 24823.68359375, "learning_rate": 5.658470151485337e-06, "loss": 4.4793, "step": 3161 }, { "epoch": 0.8039408866995074, "grad_norm": 24841.802734375, "learning_rate": 5.64441651662149e-06, "loss": 4.4762, "step": 3162 }, { "epoch": 0.8041951374543144, "grad_norm": 24834.0, "learning_rate": 5.630378134392006e-06, "loss": 4.4692, "step": 3163 }, { "epoch": 0.8044493882091213, "grad_norm": 24878.4765625, "learning_rate": 5.616355015859437e-06, "loss": 4.4963, "step": 3164 }, { "epoch": 0.8047036389639282, "grad_norm": 24599.439453125, "learning_rate": 5.602347172074332e-06, "loss": 4.4856, "step": 3165 }, { "epoch": 0.804957889718735, "grad_norm": 24957.349609375, "learning_rate": 5.588354614075178e-06, "loss": 4.491, "step": 3166 }, { "epoch": 0.8052121404735421, "grad_norm": 24786.6328125, "learning_rate": 5.5743773528884185e-06, "loss": 4.4675, "step": 3167 }, { "epoch": 0.805466391228349, "grad_norm": 24639.76171875, "learning_rate": 5.560415399528457e-06, "loss": 4.4866, "step": 3168 }, { "epoch": 0.8057206419831558, "grad_norm": 24835.48046875, "learning_rate": 5.546468764997631e-06, "loss": 4.4809, "step": 3169 }, { "epoch": 0.8059748927379629, "grad_norm": 24779.017578125, "learning_rate": 5.532537460286194e-06, "loss": 4.4727, "step": 3170 }, { "epoch": 0.8062291434927698, "grad_norm": 24812.84765625, "learning_rate": 5.518621496372323e-06, "loss": 4.4769, "step": 3171 }, { "epoch": 0.8064833942475766, "grad_norm": 25013.306640625, "learning_rate": 5.5047208842221225e-06, "loss": 4.4864, "step": 3172 }, { "epoch": 0.8067376450023837, "grad_norm": 24901.984375, "learning_rate": 5.4908356347895816e-06, "loss": 4.4833, "step": 3173 }, { "epoch": 0.8069918957571905, "grad_norm": 24812.544921875, "learning_rate": 5.476965759016581e-06, "loss": 4.485, "step": 3174 }, { "epoch": 0.8072461465119974, "grad_norm": 24836.447265625, "learning_rate": 5.463111267832904e-06, "loss": 4.49, "step": 3175 }, { "epoch": 0.8075003972668043, "grad_norm": 24987.533203125, "learning_rate": 5.449272172156197e-06, "loss": 4.4879, "step": 3176 }, { "epoch": 0.8077546480216113, "grad_norm": 24953.025390625, "learning_rate": 5.435448482891975e-06, "loss": 4.4846, "step": 3177 }, { "epoch": 0.8080088987764182, "grad_norm": 24866.1796875, "learning_rate": 5.421640210933615e-06, "loss": 4.4769, "step": 3178 }, { "epoch": 0.8082631495312251, "grad_norm": 24921.35546875, "learning_rate": 5.407847367162358e-06, "loss": 4.4794, "step": 3179 }, { "epoch": 0.8085174002860321, "grad_norm": 24947.603515625, "learning_rate": 5.3940699624472595e-06, "loss": 4.4749, "step": 3180 }, { "epoch": 0.808771651040839, "grad_norm": 24988.021484375, "learning_rate": 5.380308007645235e-06, "loss": 4.4747, "step": 3181 }, { "epoch": 0.8090259017956459, "grad_norm": 24958.92578125, "learning_rate": 5.36656151360101e-06, "loss": 4.4825, "step": 3182 }, { "epoch": 0.8092801525504529, "grad_norm": 24902.267578125, "learning_rate": 5.352830491147121e-06, "loss": 4.4793, "step": 3183 }, { "epoch": 0.8095344033052598, "grad_norm": 24851.865234375, "learning_rate": 5.339114951103935e-06, "loss": 4.4864, "step": 3184 }, { "epoch": 0.8097886540600667, "grad_norm": 24732.71484375, "learning_rate": 5.3254149042796045e-06, "loss": 4.4716, "step": 3185 }, { "epoch": 0.8100429048148736, "grad_norm": 24873.80078125, "learning_rate": 5.31173036147006e-06, "loss": 4.479, "step": 3186 }, { "epoch": 0.8102971555696806, "grad_norm": 24935.572265625, "learning_rate": 5.298061333459048e-06, "loss": 4.4895, "step": 3187 }, { "epoch": 0.8105514063244875, "grad_norm": 24816.404296875, "learning_rate": 5.2844078310180536e-06, "loss": 4.4987, "step": 3188 }, { "epoch": 0.8108056570792944, "grad_norm": 24858.3984375, "learning_rate": 5.270769864906349e-06, "loss": 4.4749, "step": 3189 }, { "epoch": 0.8110599078341014, "grad_norm": 24886.70703125, "learning_rate": 5.257147445870963e-06, "loss": 4.4784, "step": 3190 }, { "epoch": 0.8113141585889083, "grad_norm": 24765.677734375, "learning_rate": 5.243540584646667e-06, "loss": 4.4639, "step": 3191 }, { "epoch": 0.8115684093437152, "grad_norm": 25030.677734375, "learning_rate": 5.22994929195596e-06, "loss": 4.4984, "step": 3192 }, { "epoch": 0.8118226600985222, "grad_norm": 24748.40625, "learning_rate": 5.216373578509104e-06, "loss": 4.4716, "step": 3193 }, { "epoch": 0.8120769108533291, "grad_norm": 24849.619140625, "learning_rate": 5.2028134550040545e-06, "loss": 4.4788, "step": 3194 }, { "epoch": 0.812331161608136, "grad_norm": 24659.5, "learning_rate": 5.1892689321264925e-06, "loss": 4.4666, "step": 3195 }, { "epoch": 0.8125854123629429, "grad_norm": 24920.703125, "learning_rate": 5.175740020549813e-06, "loss": 4.4742, "step": 3196 }, { "epoch": 0.8128396631177499, "grad_norm": 24827.90625, "learning_rate": 5.1622267309350995e-06, "loss": 4.478, "step": 3197 }, { "epoch": 0.8130939138725568, "grad_norm": 24984.4140625, "learning_rate": 5.1487290739311216e-06, "loss": 4.4702, "step": 3198 }, { "epoch": 0.8133481646273637, "grad_norm": 24786.87109375, "learning_rate": 5.135247060174339e-06, "loss": 4.4666, "step": 3199 }, { "epoch": 0.8136024153821707, "grad_norm": 24834.40625, "learning_rate": 5.121780700288892e-06, "loss": 4.4742, "step": 3200 }, { "epoch": 0.8136024153821707, "eval_loss": 9.030352592468262, "eval_runtime": 696.4169, "eval_samples_per_second": 152.188, "eval_steps_per_second": 9.513, "step": 3200 } ], "logging_steps": 1, "max_steps": 3933, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3225070809907200.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }