{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02869777005849354, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.739554011698708e-05, "grad_norm": 1.8802112340927124, "learning_rate": 5.9999999999999995e-05, "loss": 2.9438, "step": 1 }, { "epoch": 0.00011479108023397416, "grad_norm": 1.9408955574035645, "learning_rate": 0.00011999999999999999, "loss": 2.9429, "step": 2 }, { "epoch": 0.00017218662035096125, "grad_norm": 2.9192652702331543, "learning_rate": 0.00017999999999999998, "loss": 2.952, "step": 3 }, { "epoch": 0.00022958216046794832, "grad_norm": 2.3403642177581787, "learning_rate": 0.00023999999999999998, "loss": 2.9307, "step": 4 }, { "epoch": 0.00028697770058493544, "grad_norm": 2.134683847427368, "learning_rate": 0.0003, "loss": 2.8917, "step": 5 }, { "epoch": 0.0003443732407019225, "grad_norm": 1.5358260869979858, "learning_rate": 0.00035999999999999997, "loss": 2.9205, "step": 6 }, { "epoch": 0.0004017687808189096, "grad_norm": 0.9012013673782349, "learning_rate": 0.00041999999999999996, "loss": 2.8937, "step": 7 }, { "epoch": 0.00045916432093589664, "grad_norm": 0.9427694082260132, "learning_rate": 0.00047999999999999996, "loss": 2.904, "step": 8 }, { "epoch": 0.0005165598610528837, "grad_norm": 1.662156105041504, "learning_rate": 0.00054, "loss": 2.9114, "step": 9 }, { "epoch": 0.0005739554011698709, "grad_norm": 1.2877967357635498, "learning_rate": 0.0006, "loss": 2.9185, "step": 10 }, { "epoch": 0.000631350941286858, "grad_norm": 1.3717082738876343, "learning_rate": 0.0005999969170437548, "loss": 2.899, "step": 11 }, { "epoch": 0.000688746481403845, "grad_norm": 1.3706175088882446, "learning_rate": 0.0005999876683017478, "loss": 2.8522, "step": 12 }, { "epoch": 0.0007461420215208321, "grad_norm": 0.7431464791297913, "learning_rate": 0.0005999722541541584, "loss": 2.8894, "step": 13 }, { "epoch": 0.0008035375616378192, "grad_norm": 0.5839619040489197, "learning_rate": 0.0005999506752346019, "loss": 2.8866, "step": 14 }, { "epoch": 0.0008609331017548062, "grad_norm": 0.5229901671409607, "learning_rate": 0.0005999229324301031, "loss": 2.8608, "step": 15 }, { "epoch": 0.0009183286418717933, "grad_norm": 0.6879259943962097, "learning_rate": 0.00059988902688106, "loss": 2.8801, "step": 16 }, { "epoch": 0.0009757241819887805, "grad_norm": 0.4949502646923065, "learning_rate": 0.0005998489599811971, "loss": 2.8857, "step": 17 }, { "epoch": 0.0010331197221057674, "grad_norm": 0.5659216642379761, "learning_rate": 0.0005998027333775077, "loss": 2.8172, "step": 18 }, { "epoch": 0.0010905152622227546, "grad_norm": 0.43849167227745056, "learning_rate": 0.0005997503489701861, "loss": 2.8479, "step": 19 }, { "epoch": 0.0011479108023397418, "grad_norm": 0.5036750435829163, "learning_rate": 0.0005996918089125504, "loss": 2.8957, "step": 20 }, { "epoch": 0.0012053063424567287, "grad_norm": 0.40093106031417847, "learning_rate": 0.000599627115610953, "loss": 2.8951, "step": 21 }, { "epoch": 0.001262701882573716, "grad_norm": 0.3499244153499603, "learning_rate": 0.0005995562717246821, "loss": 2.8535, "step": 22 }, { "epoch": 0.0013200974226907029, "grad_norm": 0.3672889769077301, "learning_rate": 0.0005994792801658526, "loss": 2.8507, "step": 23 }, { "epoch": 0.00137749296280769, "grad_norm": 0.3307906985282898, "learning_rate": 0.0005993961440992859, "loss": 2.8597, "step": 24 }, { "epoch": 0.001434888502924677, "grad_norm": 0.33352652192115784, "learning_rate": 0.0005993068669423797, "loss": 2.8023, "step": 25 }, { "epoch": 0.0014922840430416642, "grad_norm": 0.30308255553245544, "learning_rate": 0.0005992114523649685, "loss": 2.864, "step": 26 }, { "epoch": 0.0015496795831586513, "grad_norm": 0.2800331711769104, "learning_rate": 0.000599109904289172, "loss": 2.8459, "step": 27 }, { "epoch": 0.0016070751232756383, "grad_norm": 0.2467849850654602, "learning_rate": 0.0005990022268892337, "loss": 2.8298, "step": 28 }, { "epoch": 0.0016644706633926255, "grad_norm": 0.25928932428359985, "learning_rate": 0.0005988884245913497, "loss": 2.8061, "step": 29 }, { "epoch": 0.0017218662035096124, "grad_norm": 0.2770285904407501, "learning_rate": 0.0005987685020734869, "loss": 2.8363, "step": 30 }, { "epoch": 0.0017792617436265996, "grad_norm": 0.2888840436935425, "learning_rate": 0.0005986424642651901, "loss": 2.847, "step": 31 }, { "epoch": 0.0018366572837435866, "grad_norm": 0.3389260172843933, "learning_rate": 0.0005985103163473802, "loss": 2.8185, "step": 32 }, { "epoch": 0.0018940528238605737, "grad_norm": 0.3043622672557831, "learning_rate": 0.0005983720637521404, "loss": 2.8073, "step": 33 }, { "epoch": 0.001951448363977561, "grad_norm": 0.2626359760761261, "learning_rate": 0.0005982277121624933, "loss": 2.8278, "step": 34 }, { "epoch": 0.002008843904094548, "grad_norm": 0.2601317763328552, "learning_rate": 0.0005980772675121675, "loss": 2.8293, "step": 35 }, { "epoch": 0.002066239444211535, "grad_norm": 0.2932066023349762, "learning_rate": 0.0005979207359853532, "loss": 2.842, "step": 36 }, { "epoch": 0.002123634984328522, "grad_norm": 0.3828963041305542, "learning_rate": 0.0005977581240164485, "loss": 2.8383, "step": 37 }, { "epoch": 0.002181030524445509, "grad_norm": 0.2928522527217865, "learning_rate": 0.0005975894382897944, "loss": 2.8291, "step": 38 }, { "epoch": 0.0022384260645624964, "grad_norm": 0.2287234663963318, "learning_rate": 0.0005974146857394005, "loss": 2.8422, "step": 39 }, { "epoch": 0.0022958216046794835, "grad_norm": 0.2722682058811188, "learning_rate": 0.0005972338735486597, "loss": 2.8217, "step": 40 }, { "epoch": 0.0023532171447964703, "grad_norm": 0.21170516312122345, "learning_rate": 0.0005970470091500531, "loss": 2.831, "step": 41 }, { "epoch": 0.0024106126849134575, "grad_norm": 0.22243160009384155, "learning_rate": 0.0005968541002248439, "loss": 2.862, "step": 42 }, { "epoch": 0.0024680082250304446, "grad_norm": 0.18485133349895477, "learning_rate": 0.0005966551547027627, "loss": 2.8531, "step": 43 }, { "epoch": 0.002525403765147432, "grad_norm": 0.21640127897262573, "learning_rate": 0.0005964501807616806, "loss": 2.8245, "step": 44 }, { "epoch": 0.0025827993052644185, "grad_norm": 0.2716100513935089, "learning_rate": 0.0005962391868272735, "loss": 2.8093, "step": 45 }, { "epoch": 0.0026401948453814057, "grad_norm": 0.19726517796516418, "learning_rate": 0.0005960221815726757, "loss": 2.8214, "step": 46 }, { "epoch": 0.002697590385498393, "grad_norm": 0.2424098700284958, "learning_rate": 0.0005957991739181231, "loss": 2.818, "step": 47 }, { "epoch": 0.00275498592561538, "grad_norm": 0.2414388209581375, "learning_rate": 0.0005955701730305872, "loss": 2.8491, "step": 48 }, { "epoch": 0.0028123814657323673, "grad_norm": 0.25403571128845215, "learning_rate": 0.0005953351883233972, "loss": 2.8321, "step": 49 }, { "epoch": 0.002869777005849354, "grad_norm": 0.30923786759376526, "learning_rate": 0.0005950942294558544, "loss": 2.8298, "step": 50 }, { "epoch": 0.002927172545966341, "grad_norm": 0.22294141352176666, "learning_rate": 0.0005948473063328338, "loss": 2.8015, "step": 51 }, { "epoch": 0.0029845680860833283, "grad_norm": 0.2882789075374603, "learning_rate": 0.0005945944291043779, "loss": 2.8256, "step": 52 }, { "epoch": 0.0030419636262003155, "grad_norm": 0.25416064262390137, "learning_rate": 0.0005943356081652793, "loss": 2.8211, "step": 53 }, { "epoch": 0.0030993591663173027, "grad_norm": 0.2488490343093872, "learning_rate": 0.0005940708541546529, "loss": 2.8618, "step": 54 }, { "epoch": 0.0031567547064342894, "grad_norm": 0.27515849471092224, "learning_rate": 0.000593800177955499, "loss": 2.802, "step": 55 }, { "epoch": 0.0032141502465512766, "grad_norm": 0.2030380666255951, "learning_rate": 0.0005935235906942563, "loss": 2.8229, "step": 56 }, { "epoch": 0.003271545786668264, "grad_norm": 0.2384052276611328, "learning_rate": 0.0005932411037403436, "loss": 2.8122, "step": 57 }, { "epoch": 0.003328941326785251, "grad_norm": 0.2543489336967468, "learning_rate": 0.000592952728705693, "loss": 2.8302, "step": 58 }, { "epoch": 0.003386336866902238, "grad_norm": 0.2387794405221939, "learning_rate": 0.000592658477444273, "loss": 2.835, "step": 59 }, { "epoch": 0.003443732407019225, "grad_norm": 0.2748169004917145, "learning_rate": 0.0005923583620516003, "loss": 2.834, "step": 60 }, { "epoch": 0.003501127947136212, "grad_norm": 0.2565017640590668, "learning_rate": 0.0005920523948642431, "loss": 2.8452, "step": 61 }, { "epoch": 0.0035585234872531992, "grad_norm": 0.25502678751945496, "learning_rate": 0.0005917405884593144, "loss": 2.8345, "step": 62 }, { "epoch": 0.0036159190273701864, "grad_norm": 0.22830121219158173, "learning_rate": 0.0005914229556539538, "loss": 2.7989, "step": 63 }, { "epoch": 0.003673314567487173, "grad_norm": 0.3146669268608093, "learning_rate": 0.0005910995095048024, "loss": 2.845, "step": 64 }, { "epoch": 0.0037307101076041603, "grad_norm": 0.2924383580684662, "learning_rate": 0.000590770263307464, "loss": 2.8303, "step": 65 }, { "epoch": 0.0037881056477211475, "grad_norm": 0.2577711343765259, "learning_rate": 0.0005904352305959605, "loss": 2.8156, "step": 66 }, { "epoch": 0.0038455011878381347, "grad_norm": 0.2631978988647461, "learning_rate": 0.0005900944251421745, "loss": 2.833, "step": 67 }, { "epoch": 0.003902896727955122, "grad_norm": 0.21994397044181824, "learning_rate": 0.000589747860955283, "loss": 2.8136, "step": 68 }, { "epoch": 0.003960292268072109, "grad_norm": 0.3000943064689636, "learning_rate": 0.0005893955522811827, "loss": 2.8415, "step": 69 }, { "epoch": 0.004017687808189096, "grad_norm": 0.24310976266860962, "learning_rate": 0.0005890375136019032, "loss": 2.8148, "step": 70 }, { "epoch": 0.004075083348306083, "grad_norm": 0.24616850912570953, "learning_rate": 0.0005886737596350122, "loss": 2.8329, "step": 71 }, { "epoch": 0.00413247888842307, "grad_norm": 0.2714521884918213, "learning_rate": 0.0005883043053330105, "loss": 2.8356, "step": 72 }, { "epoch": 0.004189874428540057, "grad_norm": 0.2601388096809387, "learning_rate": 0.0005879291658827176, "loss": 2.8228, "step": 73 }, { "epoch": 0.004247269968657044, "grad_norm": 0.22764116525650024, "learning_rate": 0.0005875483567046467, "loss": 2.801, "step": 74 }, { "epoch": 0.004304665508774032, "grad_norm": 0.22346433997154236, "learning_rate": 0.0005871618934523719, "loss": 2.7948, "step": 75 }, { "epoch": 0.004362061048891018, "grad_norm": 0.18839874863624573, "learning_rate": 0.0005867697920118835, "loss": 2.8341, "step": 76 }, { "epoch": 0.004419456589008005, "grad_norm": 0.25794312357902527, "learning_rate": 0.0005863720685009362, "loss": 2.815, "step": 77 }, { "epoch": 0.004476852129124993, "grad_norm": 0.2352106124162674, "learning_rate": 0.0005859687392683856, "loss": 2.8169, "step": 78 }, { "epoch": 0.0045342476692419795, "grad_norm": 0.28784099221229553, "learning_rate": 0.0005855598208935169, "loss": 2.8506, "step": 79 }, { "epoch": 0.004591643209358967, "grad_norm": 0.22999855875968933, "learning_rate": 0.0005851453301853628, "loss": 2.8377, "step": 80 }, { "epoch": 0.004649038749475954, "grad_norm": 0.21411263942718506, "learning_rate": 0.0005847252841820128, "loss": 2.8137, "step": 81 }, { "epoch": 0.0047064342895929406, "grad_norm": 0.2420736700296402, "learning_rate": 0.0005842997001499129, "loss": 2.7929, "step": 82 }, { "epoch": 0.004763829829709928, "grad_norm": 0.24426190555095673, "learning_rate": 0.0005838685955831558, "loss": 2.8273, "step": 83 }, { "epoch": 0.004821225369826915, "grad_norm": 0.20297811925411224, "learning_rate": 0.0005834319882027617, "loss": 2.7993, "step": 84 }, { "epoch": 0.0048786209099439025, "grad_norm": 0.2474389523267746, "learning_rate": 0.00058298989595595, "loss": 2.8252, "step": 85 }, { "epoch": 0.004936016450060889, "grad_norm": 0.22601982951164246, "learning_rate": 0.0005825423370154012, "loss": 2.8421, "step": 86 }, { "epoch": 0.004993411990177876, "grad_norm": 0.24997788667678833, "learning_rate": 0.0005820893297785106, "loss": 2.8485, "step": 87 }, { "epoch": 0.005050807530294864, "grad_norm": 0.19994623959064484, "learning_rate": 0.0005816308928666314, "loss": 2.8456, "step": 88 }, { "epoch": 0.00510820307041185, "grad_norm": 0.19206245243549347, "learning_rate": 0.0005811670451243093, "loss": 2.8035, "step": 89 }, { "epoch": 0.005165598610528837, "grad_norm": 0.2515026032924652, "learning_rate": 0.0005806978056185083, "loss": 2.8232, "step": 90 }, { "epoch": 0.005222994150645825, "grad_norm": 0.22921022772789001, "learning_rate": 0.0005802231936378267, "loss": 2.8366, "step": 91 }, { "epoch": 0.0052803896907628114, "grad_norm": 0.248809352517128, "learning_rate": 0.000579743228691704, "loss": 2.8331, "step": 92 }, { "epoch": 0.005337785230879799, "grad_norm": 0.18247073888778687, "learning_rate": 0.0005792579305096191, "loss": 2.8249, "step": 93 }, { "epoch": 0.005395180770996786, "grad_norm": 0.2440440058708191, "learning_rate": 0.0005787673190402799, "loss": 2.837, "step": 94 }, { "epoch": 0.0054525763111137725, "grad_norm": 0.21160444617271423, "learning_rate": 0.0005782714144508019, "loss": 2.7864, "step": 95 }, { "epoch": 0.00550997185123076, "grad_norm": 0.21344538033008575, "learning_rate": 0.0005777702371258806, "loss": 2.847, "step": 96 }, { "epoch": 0.005567367391347747, "grad_norm": 0.24861139059066772, "learning_rate": 0.0005772638076669529, "loss": 2.8267, "step": 97 }, { "epoch": 0.0056247629314647345, "grad_norm": 0.290520042181015, "learning_rate": 0.0005767521468913501, "loss": 2.827, "step": 98 }, { "epoch": 0.005682158471581721, "grad_norm": 0.20536312460899353, "learning_rate": 0.0005762352758314429, "loss": 2.8476, "step": 99 }, { "epoch": 0.005739554011698708, "grad_norm": 0.21782469749450684, "learning_rate": 0.000575713215733776, "loss": 2.844, "step": 100 }, { "epoch": 0.005739554011698708, "eval_loss": 2.7509028911590576, "eval_runtime": 85.2068, "eval_samples_per_second": 50.641, "eval_steps_per_second": 12.663, "step": 100 }, { "epoch": 0.005796949551815696, "grad_norm": 0.2523731291294098, "learning_rate": 0.0005751859880581954, "loss": 2.8125, "step": 101 }, { "epoch": 0.005854345091932682, "grad_norm": 0.30107325315475464, "learning_rate": 0.0005746536144769656, "loss": 2.8108, "step": 102 }, { "epoch": 0.00591174063204967, "grad_norm": 0.24103832244873047, "learning_rate": 0.0005741161168738794, "loss": 2.8282, "step": 103 }, { "epoch": 0.005969136172166657, "grad_norm": 0.31273001432418823, "learning_rate": 0.0005735735173433582, "loss": 2.8104, "step": 104 }, { "epoch": 0.006026531712283643, "grad_norm": 0.19059035181999207, "learning_rate": 0.0005730258381895433, "loss": 2.8186, "step": 105 }, { "epoch": 0.006083927252400631, "grad_norm": 0.25082021951675415, "learning_rate": 0.0005724731019253797, "loss": 2.8154, "step": 106 }, { "epoch": 0.006141322792517618, "grad_norm": 0.23254480957984924, "learning_rate": 0.0005719153312716904, "loss": 2.8121, "step": 107 }, { "epoch": 0.006198718332634605, "grad_norm": 0.24095705151557922, "learning_rate": 0.0005713525491562421, "loss": 2.8361, "step": 108 }, { "epoch": 0.006256113872751592, "grad_norm": 0.17760275304317474, "learning_rate": 0.0005707847787128034, "loss": 2.8396, "step": 109 }, { "epoch": 0.006313509412868579, "grad_norm": 0.20905229449272156, "learning_rate": 0.0005702120432801934, "loss": 2.8284, "step": 110 }, { "epoch": 0.0063709049529855665, "grad_norm": 0.19538630545139313, "learning_rate": 0.0005696343664013227, "loss": 2.8417, "step": 111 }, { "epoch": 0.006428300493102553, "grad_norm": 0.2408672571182251, "learning_rate": 0.0005690517718222248, "loss": 2.8416, "step": 112 }, { "epoch": 0.006485696033219541, "grad_norm": 0.19618412852287292, "learning_rate": 0.0005684642834910813, "loss": 2.8683, "step": 113 }, { "epoch": 0.006543091573336528, "grad_norm": 0.17854906618595123, "learning_rate": 0.0005678719255572363, "loss": 2.8232, "step": 114 }, { "epoch": 0.006600487113453514, "grad_norm": 0.2527766227722168, "learning_rate": 0.0005672747223702044, "loss": 2.8219, "step": 115 }, { "epoch": 0.006657882653570502, "grad_norm": 0.21465440094470978, "learning_rate": 0.0005666726984786695, "loss": 2.8308, "step": 116 }, { "epoch": 0.006715278193687489, "grad_norm": 0.2080729454755783, "learning_rate": 0.000566065878629476, "loss": 2.8369, "step": 117 }, { "epoch": 0.006772673733804476, "grad_norm": 0.18979360163211823, "learning_rate": 0.0005654542877666108, "loss": 2.7997, "step": 118 }, { "epoch": 0.006830069273921463, "grad_norm": 0.20258580148220062, "learning_rate": 0.0005648379510301792, "loss": 2.846, "step": 119 }, { "epoch": 0.00688746481403845, "grad_norm": 0.2112026810646057, "learning_rate": 0.0005642168937553701, "loss": 2.8521, "step": 120 }, { "epoch": 0.006944860354155437, "grad_norm": 0.25105029344558716, "learning_rate": 0.0005635911414714158, "loss": 2.8081, "step": 121 }, { "epoch": 0.007002255894272424, "grad_norm": 0.21830224990844727, "learning_rate": 0.0005629607199005416, "loss": 2.8161, "step": 122 }, { "epoch": 0.007059651434389411, "grad_norm": 0.19216330349445343, "learning_rate": 0.0005623256549569091, "loss": 2.805, "step": 123 }, { "epoch": 0.0071170469745063985, "grad_norm": 0.19969609379768372, "learning_rate": 0.000561685972745551, "loss": 2.7859, "step": 124 }, { "epoch": 0.007174442514623385, "grad_norm": 0.22093947231769562, "learning_rate": 0.0005610416995612973, "loss": 2.8194, "step": 125 }, { "epoch": 0.007231838054740373, "grad_norm": 0.2148187905550003, "learning_rate": 0.0005603928618876952, "loss": 2.8565, "step": 126 }, { "epoch": 0.0072892335948573595, "grad_norm": 0.18277674913406372, "learning_rate": 0.0005597394863959201, "loss": 2.8187, "step": 127 }, { "epoch": 0.007346629134974346, "grad_norm": 0.22607837617397308, "learning_rate": 0.0005590815999436795, "loss": 2.8607, "step": 128 }, { "epoch": 0.007404024675091334, "grad_norm": 0.22417186200618744, "learning_rate": 0.0005584192295741086, "loss": 2.8198, "step": 129 }, { "epoch": 0.007461420215208321, "grad_norm": 0.229670912027359, "learning_rate": 0.0005577524025146591, "loss": 2.8477, "step": 130 }, { "epoch": 0.007518815755325308, "grad_norm": 0.1985808163881302, "learning_rate": 0.0005570811461759794, "loss": 2.8058, "step": 131 }, { "epoch": 0.007576211295442295, "grad_norm": 0.22260330617427826, "learning_rate": 0.0005564054881507886, "loss": 2.8369, "step": 132 }, { "epoch": 0.007633606835559282, "grad_norm": 0.20925524830818176, "learning_rate": 0.0005557254562127417, "loss": 2.8205, "step": 133 }, { "epoch": 0.007691002375676269, "grad_norm": 0.26581674814224243, "learning_rate": 0.0005550410783152882, "loss": 2.8164, "step": 134 }, { "epoch": 0.007748397915793256, "grad_norm": 0.2182077318429947, "learning_rate": 0.0005543523825905229, "loss": 2.8279, "step": 135 }, { "epoch": 0.007805793455910244, "grad_norm": 0.24468722939491272, "learning_rate": 0.0005536593973480297, "loss": 2.8281, "step": 136 }, { "epoch": 0.007863188996027231, "grad_norm": 0.22021321952342987, "learning_rate": 0.0005529621510737175, "loss": 2.8028, "step": 137 }, { "epoch": 0.007920584536144217, "grad_norm": 0.20566654205322266, "learning_rate": 0.0005522606724286498, "loss": 2.7937, "step": 138 }, { "epoch": 0.007977980076261205, "grad_norm": 0.1960543841123581, "learning_rate": 0.0005515549902478665, "loss": 2.8089, "step": 139 }, { "epoch": 0.008035375616378192, "grad_norm": 0.2689999043941498, "learning_rate": 0.0005508451335391975, "loss": 2.7959, "step": 140 }, { "epoch": 0.008092771156495178, "grad_norm": 0.19776718318462372, "learning_rate": 0.0005501311314820721, "loss": 2.8442, "step": 141 }, { "epoch": 0.008150166696612166, "grad_norm": 0.2156287282705307, "learning_rate": 0.0005494130134263184, "loss": 2.8224, "step": 142 }, { "epoch": 0.008207562236729153, "grad_norm": 0.17528703808784485, "learning_rate": 0.0005486908088909568, "loss": 2.8659, "step": 143 }, { "epoch": 0.00826495777684614, "grad_norm": 0.1757359504699707, "learning_rate": 0.0005479645475629872, "loss": 2.8119, "step": 144 }, { "epoch": 0.008322353316963127, "grad_norm": 0.1916513890028, "learning_rate": 0.0005472342592961683, "loss": 2.8069, "step": 145 }, { "epoch": 0.008379748857080115, "grad_norm": 0.19162799417972565, "learning_rate": 0.0005464999741097901, "loss": 2.8211, "step": 146 }, { "epoch": 0.0084371443971971, "grad_norm": 0.1881379634141922, "learning_rate": 0.0005457617221874408, "loss": 2.7954, "step": 147 }, { "epoch": 0.008494539937314088, "grad_norm": 0.22305060923099518, "learning_rate": 0.0005450195338757654, "loss": 2.8447, "step": 148 }, { "epoch": 0.008551935477431076, "grad_norm": 0.25081732869148254, "learning_rate": 0.0005442734396832185, "loss": 2.8205, "step": 149 }, { "epoch": 0.008609331017548063, "grad_norm": 0.24046167731285095, "learning_rate": 0.00054352347027881, "loss": 2.8246, "step": 150 }, { "epoch": 0.00866672655766505, "grad_norm": 0.20985569059848785, "learning_rate": 0.0005427696564908447, "loss": 2.8384, "step": 151 }, { "epoch": 0.008724122097782037, "grad_norm": 0.18979063630104065, "learning_rate": 0.000542012029305655, "loss": 2.8261, "step": 152 }, { "epoch": 0.008781517637899024, "grad_norm": 0.21513347327709198, "learning_rate": 0.0005412506198663268, "loss": 2.8197, "step": 153 }, { "epoch": 0.00883891317801601, "grad_norm": 0.25432831048965454, "learning_rate": 0.0005404854594714204, "loss": 2.8091, "step": 154 }, { "epoch": 0.008896308718132998, "grad_norm": 0.261273592710495, "learning_rate": 0.0005397165795736823, "loss": 2.8324, "step": 155 }, { "epoch": 0.008953704258249985, "grad_norm": 0.22144336998462677, "learning_rate": 0.0005389440117787538, "loss": 2.8459, "step": 156 }, { "epoch": 0.009011099798366971, "grad_norm": 0.1860560178756714, "learning_rate": 0.000538167787843871, "loss": 2.8552, "step": 157 }, { "epoch": 0.009068495338483959, "grad_norm": 0.2402401566505432, "learning_rate": 0.0005373879396765593, "loss": 2.8229, "step": 158 }, { "epoch": 0.009125890878600947, "grad_norm": 0.2112584114074707, "learning_rate": 0.0005366044993333228, "loss": 2.823, "step": 159 }, { "epoch": 0.009183286418717934, "grad_norm": 0.24757996201515198, "learning_rate": 0.0005358174990183254, "loss": 2.8458, "step": 160 }, { "epoch": 0.00924068195883492, "grad_norm": 0.20984984934329987, "learning_rate": 0.0005350269710820675, "loss": 2.8375, "step": 161 }, { "epoch": 0.009298077498951908, "grad_norm": 0.22329501807689667, "learning_rate": 0.0005342329480200562, "loss": 2.815, "step": 162 }, { "epoch": 0.009355473039068895, "grad_norm": 0.26144203543663025, "learning_rate": 0.0005334354624714697, "loss": 2.8286, "step": 163 }, { "epoch": 0.009412868579185881, "grad_norm": 0.20015327632427216, "learning_rate": 0.0005326345472178154, "loss": 2.8304, "step": 164 }, { "epoch": 0.009470264119302869, "grad_norm": 0.29256758093833923, "learning_rate": 0.0005318302351815823, "loss": 2.7884, "step": 165 }, { "epoch": 0.009527659659419856, "grad_norm": 0.22914084792137146, "learning_rate": 0.000531022559424888, "loss": 2.8253, "step": 166 }, { "epoch": 0.009585055199536842, "grad_norm": 0.2677003741264343, "learning_rate": 0.0005302115531481195, "loss": 2.8084, "step": 167 }, { "epoch": 0.00964245073965383, "grad_norm": 0.2672327756881714, "learning_rate": 0.000529397249688568, "loss": 2.8351, "step": 168 }, { "epoch": 0.009699846279770817, "grad_norm": 0.21281464397907257, "learning_rate": 0.0005285796825190598, "loss": 2.8463, "step": 169 }, { "epoch": 0.009757241819887805, "grad_norm": 0.22858156263828278, "learning_rate": 0.0005277588852465788, "loss": 2.8156, "step": 170 }, { "epoch": 0.009814637360004791, "grad_norm": 0.20694582164287567, "learning_rate": 0.0005269348916108859, "loss": 2.8392, "step": 171 }, { "epoch": 0.009872032900121779, "grad_norm": 0.22438685595989227, "learning_rate": 0.0005261077354831322, "loss": 2.8336, "step": 172 }, { "epoch": 0.009929428440238766, "grad_norm": 0.2279587984085083, "learning_rate": 0.0005252774508644666, "loss": 2.7972, "step": 173 }, { "epoch": 0.009986823980355752, "grad_norm": 0.21278439462184906, "learning_rate": 0.0005244440718846375, "loss": 2.7946, "step": 174 }, { "epoch": 0.01004421952047274, "grad_norm": 0.23399871587753296, "learning_rate": 0.0005236076328005906, "loss": 2.8648, "step": 175 }, { "epoch": 0.010101615060589727, "grad_norm": 0.2649572193622589, "learning_rate": 0.0005227681679950607, "loss": 2.8453, "step": 176 }, { "epoch": 0.010159010600706713, "grad_norm": 0.21067285537719727, "learning_rate": 0.0005219257119751581, "loss": 2.8357, "step": 177 }, { "epoch": 0.0102164061408237, "grad_norm": 0.22862860560417175, "learning_rate": 0.0005210802993709497, "loss": 2.8235, "step": 178 }, { "epoch": 0.010273801680940688, "grad_norm": 0.22179283201694489, "learning_rate": 0.0005202319649340369, "loss": 2.82, "step": 179 }, { "epoch": 0.010331197221057674, "grad_norm": 0.16690605878829956, "learning_rate": 0.0005193807435361252, "loss": 2.8237, "step": 180 }, { "epoch": 0.010388592761174662, "grad_norm": 0.21572506427764893, "learning_rate": 0.0005185266701675927, "loss": 2.8403, "step": 181 }, { "epoch": 0.01044598830129165, "grad_norm": 0.1778525710105896, "learning_rate": 0.0005176697799360502, "loss": 2.8204, "step": 182 }, { "epoch": 0.010503383841408637, "grad_norm": 0.18887534737586975, "learning_rate": 0.0005168101080648989, "loss": 2.8146, "step": 183 }, { "epoch": 0.010560779381525623, "grad_norm": 0.18108077347278595, "learning_rate": 0.0005159476898918823, "loss": 2.853, "step": 184 }, { "epoch": 0.01061817492164261, "grad_norm": 0.1870754212141037, "learning_rate": 0.0005150825608676336, "loss": 2.8537, "step": 185 }, { "epoch": 0.010675570461759598, "grad_norm": 0.16484060883522034, "learning_rate": 0.0005142147565542188, "loss": 2.8194, "step": 186 }, { "epoch": 0.010732966001876584, "grad_norm": 0.18527449667453766, "learning_rate": 0.0005133443126236739, "loss": 2.8402, "step": 187 }, { "epoch": 0.010790361541993572, "grad_norm": 0.17674389481544495, "learning_rate": 0.0005124712648565398, "loss": 2.8412, "step": 188 }, { "epoch": 0.01084775708211056, "grad_norm": 0.2521503269672394, "learning_rate": 0.0005115956491403907, "loss": 2.8348, "step": 189 }, { "epoch": 0.010905152622227545, "grad_norm": 0.17621657252311707, "learning_rate": 0.000510717501468359, "loss": 2.8293, "step": 190 }, { "epoch": 0.010962548162344533, "grad_norm": 0.2621336579322815, "learning_rate": 0.0005098368579376563, "loss": 2.8164, "step": 191 }, { "epoch": 0.01101994370246152, "grad_norm": 0.18950189650058746, "learning_rate": 0.0005089537547480885, "loss": 2.7976, "step": 192 }, { "epoch": 0.011077339242578508, "grad_norm": 0.24857239425182343, "learning_rate": 0.0005080682282005692, "loss": 2.8323, "step": 193 }, { "epoch": 0.011134734782695494, "grad_norm": 0.16708490252494812, "learning_rate": 0.0005071803146956262, "loss": 2.801, "step": 194 }, { "epoch": 0.011192130322812481, "grad_norm": 0.24443359673023224, "learning_rate": 0.000506290050731906, "loss": 2.8121, "step": 195 }, { "epoch": 0.011249525862929469, "grad_norm": 0.2458924949169159, "learning_rate": 0.0005053974729046734, "loss": 2.8325, "step": 196 }, { "epoch": 0.011306921403046455, "grad_norm": 0.2034812569618225, "learning_rate": 0.0005045026179043067, "loss": 2.8123, "step": 197 }, { "epoch": 0.011364316943163442, "grad_norm": 0.2774895429611206, "learning_rate": 0.0005036055225147901, "loss": 2.8324, "step": 198 }, { "epoch": 0.01142171248328043, "grad_norm": 0.22201013565063477, "learning_rate": 0.0005027062236122014, "loss": 2.8195, "step": 199 }, { "epoch": 0.011479108023397416, "grad_norm": 0.1893691122531891, "learning_rate": 0.0005018047581631961, "loss": 2.8177, "step": 200 }, { "epoch": 0.011479108023397416, "eval_loss": 2.749150037765503, "eval_runtime": 85.2258, "eval_samples_per_second": 50.63, "eval_steps_per_second": 12.66, "step": 200 }, { "epoch": 0.011536503563514404, "grad_norm": 0.2689765691757202, "learning_rate": 0.0005009011632234881, "loss": 2.8438, "step": 201 }, { "epoch": 0.011593899103631391, "grad_norm": 0.2234533727169037, "learning_rate": 0.0004999954759363262, "loss": 2.8103, "step": 202 }, { "epoch": 0.011651294643748379, "grad_norm": 0.25140801072120667, "learning_rate": 0.0004990877335309675, "loss": 2.8178, "step": 203 }, { "epoch": 0.011708690183865365, "grad_norm": 0.3070688843727112, "learning_rate": 0.0004981779733211468, "loss": 2.8518, "step": 204 }, { "epoch": 0.011766085723982352, "grad_norm": 0.25637757778167725, "learning_rate": 0.0004972662327035431, "loss": 2.8578, "step": 205 }, { "epoch": 0.01182348126409934, "grad_norm": 0.2551119923591614, "learning_rate": 0.0004963525491562421, "loss": 2.8237, "step": 206 }, { "epoch": 0.011880876804216326, "grad_norm": 0.2416735738515854, "learning_rate": 0.0004954369602371958, "loss": 2.8195, "step": 207 }, { "epoch": 0.011938272344333313, "grad_norm": 0.3950039744377136, "learning_rate": 0.0004945195035826785, "loss": 2.8087, "step": 208 }, { "epoch": 0.011995667884450301, "grad_norm": 0.16370531916618347, "learning_rate": 0.00049360021690574, "loss": 2.8464, "step": 209 }, { "epoch": 0.012053063424567287, "grad_norm": 0.28070008754730225, "learning_rate": 0.0004926791379946549, "loss": 2.8377, "step": 210 }, { "epoch": 0.012110458964684274, "grad_norm": 0.1902085244655609, "learning_rate": 0.0004917563047113695, "loss": 2.8279, "step": 211 }, { "epoch": 0.012167854504801262, "grad_norm": 0.27748385071754456, "learning_rate": 0.0004908317549899456, "loss": 2.837, "step": 212 }, { "epoch": 0.012225250044918248, "grad_norm": 0.18437190353870392, "learning_rate": 0.0004899055268350012, "loss": 2.8301, "step": 213 }, { "epoch": 0.012282645585035236, "grad_norm": 0.22971947491168976, "learning_rate": 0.0004889776583201479, "loss": 2.8051, "step": 214 }, { "epoch": 0.012340041125152223, "grad_norm": 0.238089457154274, "learning_rate": 0.0004880481875864261, "loss": 2.8162, "step": 215 }, { "epoch": 0.01239743666526921, "grad_norm": 0.24253320693969727, "learning_rate": 0.0004871171528407371, "loss": 2.8181, "step": 216 }, { "epoch": 0.012454832205386197, "grad_norm": 0.2351958006620407, "learning_rate": 0.0004861845923542728, "loss": 2.8136, "step": 217 }, { "epoch": 0.012512227745503184, "grad_norm": 0.23203608393669128, "learning_rate": 0.0004852505444609422, "loss": 2.804, "step": 218 }, { "epoch": 0.012569623285620172, "grad_norm": 0.1896822452545166, "learning_rate": 0.00048431504755579575, "loss": 2.8118, "step": 219 }, { "epoch": 0.012627018825737158, "grad_norm": 0.18357349932193756, "learning_rate": 0.0004833781400934471, "loss": 2.8205, "step": 220 }, { "epoch": 0.012684414365854145, "grad_norm": 0.23723295331001282, "learning_rate": 0.00048243986058649246, "loss": 2.8291, "step": 221 }, { "epoch": 0.012741809905971133, "grad_norm": 0.1937919706106186, "learning_rate": 0.0004815002476039273, "loss": 2.8416, "step": 222 }, { "epoch": 0.012799205446088119, "grad_norm": 0.19754467904567719, "learning_rate": 0.0004805593397695613, "loss": 2.7963, "step": 223 }, { "epoch": 0.012856600986205106, "grad_norm": 0.1592610776424408, "learning_rate": 0.00047961717576043, "loss": 2.8264, "step": 224 }, { "epoch": 0.012913996526322094, "grad_norm": 0.2083783745765686, "learning_rate": 0.00047867379430520585, "loss": 2.8348, "step": 225 }, { "epoch": 0.012971392066439082, "grad_norm": 0.1895647495985031, "learning_rate": 0.00047772923418260525, "loss": 2.8212, "step": 226 }, { "epoch": 0.013028787606556068, "grad_norm": 0.2173570841550827, "learning_rate": 0.0004767835342197954, "loss": 2.8098, "step": 227 }, { "epoch": 0.013086183146673055, "grad_norm": 0.1693475991487503, "learning_rate": 0.0004758367332907978, "loss": 2.796, "step": 228 }, { "epoch": 0.013143578686790043, "grad_norm": 0.21635355055332184, "learning_rate": 0.00047488887031489017, "loss": 2.843, "step": 229 }, { "epoch": 0.013200974226907029, "grad_norm": 0.18521156907081604, "learning_rate": 0.0004739399842550068, "loss": 2.8296, "step": 230 }, { "epoch": 0.013258369767024016, "grad_norm": 0.22925664484500885, "learning_rate": 0.00047299011411613734, "loss": 2.8287, "step": 231 }, { "epoch": 0.013315765307141004, "grad_norm": 0.24881386756896973, "learning_rate": 0.00047203929894372264, "loss": 2.8257, "step": 232 }, { "epoch": 0.01337316084725799, "grad_norm": 0.20801618695259094, "learning_rate": 0.00047108757782205043, "loss": 2.8241, "step": 233 }, { "epoch": 0.013430556387374977, "grad_norm": 0.199665367603302, "learning_rate": 0.0004701349898726483, "loss": 2.7916, "step": 234 }, { "epoch": 0.013487951927491965, "grad_norm": 0.25221607089042664, "learning_rate": 0.00046918157425267584, "loss": 2.8233, "step": 235 }, { "epoch": 0.013545347467608953, "grad_norm": 0.1931813657283783, "learning_rate": 0.00046822737015331505, "loss": 2.8016, "step": 236 }, { "epoch": 0.013602743007725938, "grad_norm": 0.17353369295597076, "learning_rate": 0.00046727241679815894, "loss": 2.8125, "step": 237 }, { "epoch": 0.013660138547842926, "grad_norm": 0.22225958108901978, "learning_rate": 0.0004663167534415996, "loss": 2.824, "step": 238 }, { "epoch": 0.013717534087959914, "grad_norm": 0.17010116577148438, "learning_rate": 0.0004653604193672147, "loss": 2.8425, "step": 239 }, { "epoch": 0.0137749296280769, "grad_norm": 0.2103683203458786, "learning_rate": 0.00046440345388615225, "loss": 2.8641, "step": 240 }, { "epoch": 0.013832325168193887, "grad_norm": 0.17934557795524597, "learning_rate": 0.00046344589633551497, "loss": 2.8069, "step": 241 }, { "epoch": 0.013889720708310875, "grad_norm": 0.2116999328136444, "learning_rate": 0.0004624877860767434, "loss": 2.8601, "step": 242 }, { "epoch": 0.01394711624842786, "grad_norm": 0.20861205458641052, "learning_rate": 0.0004615291624939975, "loss": 2.8232, "step": 243 }, { "epoch": 0.014004511788544848, "grad_norm": 0.24393285810947418, "learning_rate": 0.0004605700649925381, "loss": 2.8041, "step": 244 }, { "epoch": 0.014061907328661836, "grad_norm": 0.2089577168226242, "learning_rate": 0.0004596105329971069, "loss": 2.8351, "step": 245 }, { "epoch": 0.014119302868778822, "grad_norm": 0.20232421159744263, "learning_rate": 0.00045865060595030616, "loss": 2.8171, "step": 246 }, { "epoch": 0.01417669840889581, "grad_norm": 0.22081732749938965, "learning_rate": 0.00045769032331097686, "loss": 2.8202, "step": 247 }, { "epoch": 0.014234093949012797, "grad_norm": 0.17081516981124878, "learning_rate": 0.00045672972455257723, "loss": 2.8358, "step": 248 }, { "epoch": 0.014291489489129785, "grad_norm": 0.3317008316516876, "learning_rate": 0.0004557688491615597, "loss": 2.8302, "step": 249 }, { "epoch": 0.01434888502924677, "grad_norm": 0.23239760100841522, "learning_rate": 0.0004548077366357483, "loss": 2.8191, "step": 250 }, { "epoch": 0.014406280569363758, "grad_norm": 0.22138993442058563, "learning_rate": 0.0004538464264827143, "loss": 2.8096, "step": 251 }, { "epoch": 0.014463676109480746, "grad_norm": 0.23655574023723602, "learning_rate": 0.000452884958218153, "loss": 2.8295, "step": 252 }, { "epoch": 0.014521071649597731, "grad_norm": 0.2227945327758789, "learning_rate": 0.000451923371364259, "loss": 2.8158, "step": 253 }, { "epoch": 0.014578467189714719, "grad_norm": 0.20443300902843475, "learning_rate": 0.0004509617054481017, "loss": 2.83, "step": 254 }, { "epoch": 0.014635862729831707, "grad_norm": 0.22221451997756958, "learning_rate": 0.00045, "loss": 2.8253, "step": 255 }, { "epoch": 0.014693258269948693, "grad_norm": 0.1941068023443222, "learning_rate": 0.00044903829455189825, "loss": 2.83, "step": 256 }, { "epoch": 0.01475065381006568, "grad_norm": 0.1914331614971161, "learning_rate": 0.0004480766286357409, "loss": 2.8162, "step": 257 }, { "epoch": 0.014808049350182668, "grad_norm": 0.21014779806137085, "learning_rate": 0.0004471150417818469, "loss": 2.7993, "step": 258 }, { "epoch": 0.014865444890299655, "grad_norm": 0.2057676762342453, "learning_rate": 0.00044615357351728566, "loss": 2.8223, "step": 259 }, { "epoch": 0.014922840430416641, "grad_norm": 0.19875939190387726, "learning_rate": 0.00044519226336425165, "loss": 2.8016, "step": 260 }, { "epoch": 0.014980235970533629, "grad_norm": 0.23691999912261963, "learning_rate": 0.0004442311508384402, "loss": 2.8373, "step": 261 }, { "epoch": 0.015037631510650616, "grad_norm": 0.1729947328567505, "learning_rate": 0.0004432702754474228, "loss": 2.8233, "step": 262 }, { "epoch": 0.015095027050767602, "grad_norm": 0.18821187317371368, "learning_rate": 0.00044230967668902306, "loss": 2.8128, "step": 263 }, { "epoch": 0.01515242259088459, "grad_norm": 0.2283882200717926, "learning_rate": 0.00044134939404969387, "loss": 2.8178, "step": 264 }, { "epoch": 0.015209818131001578, "grad_norm": 0.16724412143230438, "learning_rate": 0.000440389467002893, "loss": 2.8249, "step": 265 }, { "epoch": 0.015267213671118563, "grad_norm": 0.18209712207317352, "learning_rate": 0.00043942993500746183, "loss": 2.8095, "step": 266 }, { "epoch": 0.015324609211235551, "grad_norm": 0.1857995092868805, "learning_rate": 0.00043847083750600253, "loss": 2.806, "step": 267 }, { "epoch": 0.015382004751352539, "grad_norm": 0.20734605193138123, "learning_rate": 0.0004375122139232566, "loss": 2.8695, "step": 268 }, { "epoch": 0.015439400291469526, "grad_norm": 0.23138895630836487, "learning_rate": 0.00043655410366448495, "loss": 2.8033, "step": 269 }, { "epoch": 0.015496795831586512, "grad_norm": 0.20481987297534943, "learning_rate": 0.0004355965461138477, "loss": 2.8269, "step": 270 }, { "epoch": 0.0155541913717035, "grad_norm": 0.2318529337644577, "learning_rate": 0.00043463958063278524, "loss": 2.8332, "step": 271 }, { "epoch": 0.015611586911820487, "grad_norm": 0.2501411736011505, "learning_rate": 0.00043368324655840035, "loss": 2.8445, "step": 272 }, { "epoch": 0.015668982451937475, "grad_norm": 0.26137158274650574, "learning_rate": 0.0004327275832018411, "loss": 2.8279, "step": 273 }, { "epoch": 0.015726377992054463, "grad_norm": 0.19074887037277222, "learning_rate": 0.0004317726298466849, "loss": 2.8132, "step": 274 }, { "epoch": 0.015783773532171447, "grad_norm": 0.26000818610191345, "learning_rate": 0.0004308184257473241, "loss": 2.8091, "step": 275 }, { "epoch": 0.015841169072288434, "grad_norm": 0.16060984134674072, "learning_rate": 0.0004298650101273517, "loss": 2.8206, "step": 276 }, { "epoch": 0.015898564612405422, "grad_norm": 0.284445583820343, "learning_rate": 0.00042891242217794954, "loss": 2.7867, "step": 277 }, { "epoch": 0.01595596015252241, "grad_norm": 0.15903466939926147, "learning_rate": 0.0004279607010562773, "loss": 2.83, "step": 278 }, { "epoch": 0.016013355692639397, "grad_norm": 0.24330751597881317, "learning_rate": 0.0004270098858838626, "loss": 2.817, "step": 279 }, { "epoch": 0.016070751232756385, "grad_norm": 0.1687777042388916, "learning_rate": 0.0004260600157449931, "loss": 2.8112, "step": 280 }, { "epoch": 0.01612814677287337, "grad_norm": 0.18230785429477692, "learning_rate": 0.0004251111296851098, "loss": 2.8394, "step": 281 }, { "epoch": 0.016185542312990357, "grad_norm": 0.1889660507440567, "learning_rate": 0.00042416326670920217, "loss": 2.8109, "step": 282 }, { "epoch": 0.016242937853107344, "grad_norm": 0.16135123372077942, "learning_rate": 0.0004232164657802045, "loss": 2.7953, "step": 283 }, { "epoch": 0.016300333393224332, "grad_norm": 0.15787218511104584, "learning_rate": 0.00042227076581739467, "loss": 2.7921, "step": 284 }, { "epoch": 0.01635772893334132, "grad_norm": 0.16313977539539337, "learning_rate": 0.0004213262056947942, "loss": 2.8107, "step": 285 }, { "epoch": 0.016415124473458307, "grad_norm": 0.18806132674217224, "learning_rate": 0.0004203828242395699, "loss": 2.8451, "step": 286 }, { "epoch": 0.016472520013575295, "grad_norm": 0.17279674112796783, "learning_rate": 0.00041944066023043866, "loss": 2.8333, "step": 287 }, { "epoch": 0.01652991555369228, "grad_norm": 0.17451834678649902, "learning_rate": 0.00041849975239607255, "loss": 2.7798, "step": 288 }, { "epoch": 0.016587311093809266, "grad_norm": 0.1943039745092392, "learning_rate": 0.00041756013941350747, "loss": 2.8011, "step": 289 }, { "epoch": 0.016644706633926254, "grad_norm": 0.1578904092311859, "learning_rate": 0.0004166218599065528, "loss": 2.852, "step": 290 }, { "epoch": 0.01670210217404324, "grad_norm": 0.20066620409488678, "learning_rate": 0.0004156849524442042, "loss": 2.7876, "step": 291 }, { "epoch": 0.01675949771416023, "grad_norm": 0.18306495249271393, "learning_rate": 0.0004147494555390577, "loss": 2.817, "step": 292 }, { "epoch": 0.016816893254277217, "grad_norm": 0.1622687727212906, "learning_rate": 0.0004138154076457271, "loss": 2.815, "step": 293 }, { "epoch": 0.0168742887943942, "grad_norm": 0.2056518942117691, "learning_rate": 0.0004128828471592628, "loss": 2.8131, "step": 294 }, { "epoch": 0.01693168433451119, "grad_norm": 0.17123937606811523, "learning_rate": 0.00041195181241357383, "loss": 2.8025, "step": 295 }, { "epoch": 0.016989079874628176, "grad_norm": 0.2233334332704544, "learning_rate": 0.00041102234167985204, "loss": 2.8347, "step": 296 }, { "epoch": 0.017046475414745164, "grad_norm": 0.20740529894828796, "learning_rate": 0.0004100944731649987, "loss": 2.8099, "step": 297 }, { "epoch": 0.01710387095486215, "grad_norm": 0.20391066372394562, "learning_rate": 0.0004091682450100543, "loss": 2.8363, "step": 298 }, { "epoch": 0.01716126649497914, "grad_norm": 0.17306548357009888, "learning_rate": 0.0004082436952886305, "loss": 2.8211, "step": 299 }, { "epoch": 0.017218662035096127, "grad_norm": 0.24933576583862305, "learning_rate": 0.0004073208620053451, "loss": 2.8048, "step": 300 }, { "epoch": 0.017218662035096127, "eval_loss": 2.7432332038879395, "eval_runtime": 85.2508, "eval_samples_per_second": 50.615, "eval_steps_per_second": 12.657, "step": 300 }, { "epoch": 0.01727605757521311, "grad_norm": 0.231708824634552, "learning_rate": 0.00040639978309425995, "loss": 2.8025, "step": 301 }, { "epoch": 0.0173334531153301, "grad_norm": 0.15970614552497864, "learning_rate": 0.00040548049641732137, "loss": 2.8392, "step": 302 }, { "epoch": 0.017390848655447086, "grad_norm": 0.20457029342651367, "learning_rate": 0.0004045630397628042, "loss": 2.8247, "step": 303 }, { "epoch": 0.017448244195564074, "grad_norm": 0.1734900325536728, "learning_rate": 0.00040364745084375787, "loss": 2.7979, "step": 304 }, { "epoch": 0.01750563973568106, "grad_norm": 0.19265452027320862, "learning_rate": 0.00040273376729645685, "loss": 2.8033, "step": 305 }, { "epoch": 0.01756303527579805, "grad_norm": 0.19174844026565552, "learning_rate": 0.00040182202667885317, "loss": 2.8354, "step": 306 }, { "epoch": 0.017620430815915036, "grad_norm": 0.27793413400650024, "learning_rate": 0.00040091226646903245, "loss": 2.797, "step": 307 }, { "epoch": 0.01767782635603202, "grad_norm": 0.1806309074163437, "learning_rate": 0.00040000452406367367, "loss": 2.8046, "step": 308 }, { "epoch": 0.017735221896149008, "grad_norm": 0.2249089479446411, "learning_rate": 0.0003990988367765118, "loss": 2.8125, "step": 309 }, { "epoch": 0.017792617436265996, "grad_norm": 0.27839699387550354, "learning_rate": 0.00039819524183680384, "loss": 2.8183, "step": 310 }, { "epoch": 0.017850012976382983, "grad_norm": 0.1877232789993286, "learning_rate": 0.00039729377638779857, "loss": 2.7989, "step": 311 }, { "epoch": 0.01790740851649997, "grad_norm": 0.25160273909568787, "learning_rate": 0.00039639447748520985, "loss": 2.8536, "step": 312 }, { "epoch": 0.01796480405661696, "grad_norm": 0.23843353986740112, "learning_rate": 0.0003954973820956932, "loss": 2.8064, "step": 313 }, { "epoch": 0.018022199596733943, "grad_norm": 0.2549470365047455, "learning_rate": 0.00039460252709532656, "loss": 2.8415, "step": 314 }, { "epoch": 0.01807959513685093, "grad_norm": 0.39248892664909363, "learning_rate": 0.0003937099492680938, "loss": 2.8137, "step": 315 }, { "epoch": 0.018136990676967918, "grad_norm": 0.24034982919692993, "learning_rate": 0.0003928196853043737, "loss": 2.8301, "step": 316 }, { "epoch": 0.018194386217084905, "grad_norm": 0.29434794187545776, "learning_rate": 0.00039193177179943083, "loss": 2.8288, "step": 317 }, { "epoch": 0.018251781757201893, "grad_norm": 0.21636317670345306, "learning_rate": 0.0003910462452519114, "loss": 2.8121, "step": 318 }, { "epoch": 0.01830917729731888, "grad_norm": 0.2217407375574112, "learning_rate": 0.0003901631420623437, "loss": 2.8551, "step": 319 }, { "epoch": 0.01836657283743587, "grad_norm": 0.20126426219940186, "learning_rate": 0.0003892824985316409, "loss": 2.7812, "step": 320 }, { "epoch": 0.018423968377552852, "grad_norm": 0.20343463122844696, "learning_rate": 0.0003884043508596093, "loss": 2.7959, "step": 321 }, { "epoch": 0.01848136391766984, "grad_norm": 0.22265484929084778, "learning_rate": 0.00038752873514346015, "loss": 2.8254, "step": 322 }, { "epoch": 0.018538759457786828, "grad_norm": 0.20545947551727295, "learning_rate": 0.000386655687376326, "loss": 2.8166, "step": 323 }, { "epoch": 0.018596154997903815, "grad_norm": 0.17015507817268372, "learning_rate": 0.00038578524344578115, "loss": 2.806, "step": 324 }, { "epoch": 0.018653550538020803, "grad_norm": 0.19378258287906647, "learning_rate": 0.00038491743913236624, "loss": 2.7979, "step": 325 }, { "epoch": 0.01871094607813779, "grad_norm": 0.2112617790699005, "learning_rate": 0.0003840523101081177, "loss": 2.8149, "step": 326 }, { "epoch": 0.018768341618254775, "grad_norm": 0.18846029043197632, "learning_rate": 0.0003831898919351011, "loss": 2.8334, "step": 327 }, { "epoch": 0.018825737158371762, "grad_norm": 0.20672033727169037, "learning_rate": 0.00038233022006394976, "loss": 2.8061, "step": 328 }, { "epoch": 0.01888313269848875, "grad_norm": 0.2700256109237671, "learning_rate": 0.00038147332983240717, "loss": 2.8101, "step": 329 }, { "epoch": 0.018940528238605737, "grad_norm": 0.16990099847316742, "learning_rate": 0.00038061925646387467, "loss": 2.8227, "step": 330 }, { "epoch": 0.018997923778722725, "grad_norm": 0.2140357792377472, "learning_rate": 0.0003797680350659631, "loss": 2.8018, "step": 331 }, { "epoch": 0.019055319318839713, "grad_norm": 0.2538260221481323, "learning_rate": 0.0003789197006290502, "loss": 2.7725, "step": 332 }, { "epoch": 0.0191127148589567, "grad_norm": 0.1694011092185974, "learning_rate": 0.0003780742880248419, "loss": 2.7973, "step": 333 }, { "epoch": 0.019170110399073684, "grad_norm": 0.2092764526605606, "learning_rate": 0.0003772318320049391, "loss": 2.8256, "step": 334 }, { "epoch": 0.019227505939190672, "grad_norm": 0.22675682604312897, "learning_rate": 0.0003763923671994093, "loss": 2.8092, "step": 335 }, { "epoch": 0.01928490147930766, "grad_norm": 0.20571155846118927, "learning_rate": 0.0003755559281153625, "loss": 2.8176, "step": 336 }, { "epoch": 0.019342297019424647, "grad_norm": 0.18606650829315186, "learning_rate": 0.0003747225491355334, "loss": 2.8019, "step": 337 }, { "epoch": 0.019399692559541635, "grad_norm": 0.19859890639781952, "learning_rate": 0.00037389226451686763, "loss": 2.8036, "step": 338 }, { "epoch": 0.019457088099658622, "grad_norm": 0.1632896512746811, "learning_rate": 0.00037306510838911404, "loss": 2.797, "step": 339 }, { "epoch": 0.01951448363977561, "grad_norm": 0.17494754493236542, "learning_rate": 0.00037224111475342116, "loss": 2.8152, "step": 340 }, { "epoch": 0.019571879179892594, "grad_norm": 0.20659732818603516, "learning_rate": 0.00037142031748094016, "loss": 2.8061, "step": 341 }, { "epoch": 0.019629274720009582, "grad_norm": 0.18716713786125183, "learning_rate": 0.00037060275031143184, "loss": 2.8419, "step": 342 }, { "epoch": 0.01968667026012657, "grad_norm": 0.2575749158859253, "learning_rate": 0.0003697884468518805, "loss": 2.7814, "step": 343 }, { "epoch": 0.019744065800243557, "grad_norm": 0.19076134264469147, "learning_rate": 0.0003689774405751119, "loss": 2.797, "step": 344 }, { "epoch": 0.019801461340360545, "grad_norm": 0.19563442468643188, "learning_rate": 0.00036816976481841764, "loss": 2.8269, "step": 345 }, { "epoch": 0.019858856880477532, "grad_norm": 0.1790810525417328, "learning_rate": 0.0003673654527821846, "loss": 2.7856, "step": 346 }, { "epoch": 0.019916252420594516, "grad_norm": 0.2125868797302246, "learning_rate": 0.00036656453752853025, "loss": 2.7973, "step": 347 }, { "epoch": 0.019973647960711504, "grad_norm": 0.1454995572566986, "learning_rate": 0.00036576705197994376, "loss": 2.7869, "step": 348 }, { "epoch": 0.02003104350082849, "grad_norm": 0.2808379530906677, "learning_rate": 0.00036497302891793255, "loss": 2.7923, "step": 349 }, { "epoch": 0.02008843904094548, "grad_norm": 0.1776140034198761, "learning_rate": 0.0003641825009816745, "loss": 2.8194, "step": 350 }, { "epoch": 0.020145834581062467, "grad_norm": 0.22207793593406677, "learning_rate": 0.0003633955006666771, "loss": 2.8234, "step": 351 }, { "epoch": 0.020203230121179454, "grad_norm": 0.24642404913902283, "learning_rate": 0.0003626120603234406, "loss": 2.8351, "step": 352 }, { "epoch": 0.020260625661296442, "grad_norm": 0.24731726944446564, "learning_rate": 0.000361832212156129, "loss": 2.7983, "step": 353 }, { "epoch": 0.020318021201413426, "grad_norm": 0.21677981317043304, "learning_rate": 0.0003610559882212461, "loss": 2.8372, "step": 354 }, { "epoch": 0.020375416741530414, "grad_norm": 0.28350090980529785, "learning_rate": 0.00036028342042631755, "loss": 2.8138, "step": 355 }, { "epoch": 0.0204328122816474, "grad_norm": 0.22418756783008575, "learning_rate": 0.00035951454052857954, "loss": 2.7897, "step": 356 }, { "epoch": 0.02049020782176439, "grad_norm": 0.27765804529190063, "learning_rate": 0.000358749380133673, "loss": 2.8139, "step": 357 }, { "epoch": 0.020547603361881377, "grad_norm": 0.2694258391857147, "learning_rate": 0.000357987970694345, "loss": 2.7881, "step": 358 }, { "epoch": 0.020604998901998364, "grad_norm": 0.3746117055416107, "learning_rate": 0.00035723034350915525, "loss": 2.8108, "step": 359 }, { "epoch": 0.02066239444211535, "grad_norm": 0.22864773869514465, "learning_rate": 0.00035647652972119, "loss": 2.8102, "step": 360 }, { "epoch": 0.020719789982232336, "grad_norm": 0.2728801369667053, "learning_rate": 0.0003557265603167814, "loss": 2.8046, "step": 361 }, { "epoch": 0.020777185522349324, "grad_norm": 0.2561710774898529, "learning_rate": 0.0003549804661242345, "loss": 2.8242, "step": 362 }, { "epoch": 0.02083458106246631, "grad_norm": 0.26235631108283997, "learning_rate": 0.00035423827781255914, "loss": 2.847, "step": 363 }, { "epoch": 0.0208919766025833, "grad_norm": 0.24725806713104248, "learning_rate": 0.0003535000258902099, "loss": 2.7873, "step": 364 }, { "epoch": 0.020949372142700286, "grad_norm": 0.2562279999256134, "learning_rate": 0.0003527657407038317, "loss": 2.799, "step": 365 }, { "epoch": 0.021006767682817274, "grad_norm": 0.20368199050426483, "learning_rate": 0.00035203545243701266, "loss": 2.8011, "step": 366 }, { "epoch": 0.021064163222934258, "grad_norm": 0.25594958662986755, "learning_rate": 0.0003513091911090431, "loss": 2.8099, "step": 367 }, { "epoch": 0.021121558763051246, "grad_norm": 0.20084761083126068, "learning_rate": 0.00035058698657368154, "loss": 2.8249, "step": 368 }, { "epoch": 0.021178954303168233, "grad_norm": 0.24110020697116852, "learning_rate": 0.00034986886851792775, "loss": 2.8058, "step": 369 }, { "epoch": 0.02123634984328522, "grad_norm": 0.2016633003950119, "learning_rate": 0.0003491548664608024, "loss": 2.7935, "step": 370 }, { "epoch": 0.02129374538340221, "grad_norm": 0.2722468376159668, "learning_rate": 0.0003484450097521336, "loss": 2.8146, "step": 371 }, { "epoch": 0.021351140923519196, "grad_norm": 0.2089434564113617, "learning_rate": 0.0003477393275713501, "loss": 2.8231, "step": 372 }, { "epoch": 0.021408536463636184, "grad_norm": 0.24770453572273254, "learning_rate": 0.0003470378489262824, "loss": 2.7994, "step": 373 }, { "epoch": 0.021465932003753168, "grad_norm": 0.21104897558689117, "learning_rate": 0.00034634060265197026, "loss": 2.8189, "step": 374 }, { "epoch": 0.021523327543870156, "grad_norm": 0.23374824225902557, "learning_rate": 0.000345647617409477, "loss": 2.783, "step": 375 }, { "epoch": 0.021580723083987143, "grad_norm": 0.24334168434143066, "learning_rate": 0.00034495892168471176, "loss": 2.8092, "step": 376 }, { "epoch": 0.02163811862410413, "grad_norm": 0.22772932052612305, "learning_rate": 0.00034427454378725827, "loss": 2.8178, "step": 377 }, { "epoch": 0.02169551416422112, "grad_norm": 0.22545067965984344, "learning_rate": 0.00034359451184921125, "loss": 2.7961, "step": 378 }, { "epoch": 0.021752909704338106, "grad_norm": 0.2873929738998413, "learning_rate": 0.00034291885382402044, "loss": 2.8408, "step": 379 }, { "epoch": 0.02181030524445509, "grad_norm": 0.2099824994802475, "learning_rate": 0.00034224759748534083, "loss": 2.782, "step": 380 }, { "epoch": 0.021867700784572078, "grad_norm": 0.32221996784210205, "learning_rate": 0.0003415807704258913, "loss": 2.8337, "step": 381 }, { "epoch": 0.021925096324689065, "grad_norm": 0.2531490623950958, "learning_rate": 0.0003409184000563204, "loss": 2.8273, "step": 382 }, { "epoch": 0.021982491864806053, "grad_norm": 0.3075484037399292, "learning_rate": 0.00034026051360407973, "loss": 2.7805, "step": 383 }, { "epoch": 0.02203988740492304, "grad_norm": 0.2366313338279724, "learning_rate": 0.0003396071381123047, "loss": 2.8278, "step": 384 }, { "epoch": 0.022097282945040028, "grad_norm": 0.2348204106092453, "learning_rate": 0.00033895830043870266, "loss": 2.7922, "step": 385 }, { "epoch": 0.022154678485157016, "grad_norm": 0.28124627470970154, "learning_rate": 0.00033831402725444896, "loss": 2.8065, "step": 386 }, { "epoch": 0.022212074025274, "grad_norm": 0.1927008032798767, "learning_rate": 0.0003376743450430907, "loss": 2.7958, "step": 387 }, { "epoch": 0.022269469565390988, "grad_norm": 0.26325997710227966, "learning_rate": 0.0003370392800994583, "loss": 2.8313, "step": 388 }, { "epoch": 0.022326865105507975, "grad_norm": 0.23394963145256042, "learning_rate": 0.0003364088585285842, "loss": 2.8126, "step": 389 }, { "epoch": 0.022384260645624963, "grad_norm": 0.26055994629859924, "learning_rate": 0.00033578310624462983, "loss": 2.787, "step": 390 }, { "epoch": 0.02244165618574195, "grad_norm": 0.2207145392894745, "learning_rate": 0.0003351620489698208, "loss": 2.796, "step": 391 }, { "epoch": 0.022499051725858938, "grad_norm": 0.34231698513031006, "learning_rate": 0.0003345457122333891, "loss": 2.7951, "step": 392 }, { "epoch": 0.022556447265975922, "grad_norm": 0.22361671924591064, "learning_rate": 0.00033393412137052396, "loss": 2.8251, "step": 393 }, { "epoch": 0.02261384280609291, "grad_norm": 0.24573372304439545, "learning_rate": 0.0003333273015213304, "loss": 2.7899, "step": 394 }, { "epoch": 0.022671238346209897, "grad_norm": 0.22109688818454742, "learning_rate": 0.0003327252776297955, "loss": 2.8178, "step": 395 }, { "epoch": 0.022728633886326885, "grad_norm": 0.22289875149726868, "learning_rate": 0.00033212807444276364, "loss": 2.8053, "step": 396 }, { "epoch": 0.022786029426443873, "grad_norm": 0.21445147693157196, "learning_rate": 0.00033153571650891865, "loss": 2.7998, "step": 397 }, { "epoch": 0.02284342496656086, "grad_norm": 0.25061139464378357, "learning_rate": 0.00033094822817777514, "loss": 2.8055, "step": 398 }, { "epoch": 0.022900820506677848, "grad_norm": 0.24680854380130768, "learning_rate": 0.0003303656335986773, "loss": 2.8143, "step": 399 }, { "epoch": 0.022958216046794832, "grad_norm": 0.16644932329654694, "learning_rate": 0.0003297879567198065, "loss": 2.8192, "step": 400 }, { "epoch": 0.022958216046794832, "eval_loss": 2.738191604614258, "eval_runtime": 85.3252, "eval_samples_per_second": 50.571, "eval_steps_per_second": 12.646, "step": 400 }, { "epoch": 0.02301561158691182, "grad_norm": 0.2816384434700012, "learning_rate": 0.00032921522128719657, "loss": 2.8209, "step": 401 }, { "epoch": 0.023073007127028807, "grad_norm": 0.20395685732364655, "learning_rate": 0.00032864745084375783, "loss": 2.8021, "step": 402 }, { "epoch": 0.023130402667145795, "grad_norm": 0.24216794967651367, "learning_rate": 0.00032808466872830957, "loss": 2.8447, "step": 403 }, { "epoch": 0.023187798207262782, "grad_norm": 0.2526738941669464, "learning_rate": 0.00032752689807462017, "loss": 2.7906, "step": 404 }, { "epoch": 0.02324519374737977, "grad_norm": 0.21725283563137054, "learning_rate": 0.0003269741618104566, "loss": 2.7943, "step": 405 }, { "epoch": 0.023302589287496758, "grad_norm": 0.2765718102455139, "learning_rate": 0.00032642648265664175, "loss": 2.8109, "step": 406 }, { "epoch": 0.02335998482761374, "grad_norm": 0.20015880465507507, "learning_rate": 0.00032588388312612053, "loss": 2.8239, "step": 407 }, { "epoch": 0.02341738036773073, "grad_norm": 0.26865240931510925, "learning_rate": 0.0003253463855230344, "loss": 2.8279, "step": 408 }, { "epoch": 0.023474775907847717, "grad_norm": 0.23522211611270905, "learning_rate": 0.0003248140119418046, "loss": 2.8123, "step": 409 }, { "epoch": 0.023532171447964705, "grad_norm": 0.2388644963502884, "learning_rate": 0.0003242867842662239, "loss": 2.8057, "step": 410 }, { "epoch": 0.023589566988081692, "grad_norm": 0.18323197960853577, "learning_rate": 0.00032376472416855703, "loss": 2.8193, "step": 411 }, { "epoch": 0.02364696252819868, "grad_norm": 0.24734856188297272, "learning_rate": 0.00032324785310864983, "loss": 2.7924, "step": 412 }, { "epoch": 0.023704358068315664, "grad_norm": 0.1722363829612732, "learning_rate": 0.0003227361923330471, "loss": 2.8242, "step": 413 }, { "epoch": 0.02376175360843265, "grad_norm": 0.2052358090877533, "learning_rate": 0.00032222976287411934, "loss": 2.8129, "step": 414 }, { "epoch": 0.02381914914854964, "grad_norm": 0.2536105811595917, "learning_rate": 0.00032172858554919807, "loss": 2.8207, "step": 415 }, { "epoch": 0.023876544688666627, "grad_norm": 0.23084022104740143, "learning_rate": 0.00032123268095972005, "loss": 2.8156, "step": 416 }, { "epoch": 0.023933940228783614, "grad_norm": 0.28741586208343506, "learning_rate": 0.00032074206949038073, "loss": 2.8008, "step": 417 }, { "epoch": 0.023991335768900602, "grad_norm": 0.2419297993183136, "learning_rate": 0.0003202567713082959, "loss": 2.8112, "step": 418 }, { "epoch": 0.02404873130901759, "grad_norm": 0.19744537770748138, "learning_rate": 0.0003197768063621732, "loss": 2.7894, "step": 419 }, { "epoch": 0.024106126849134574, "grad_norm": 0.22780993580818176, "learning_rate": 0.0003193021943814916, "loss": 2.8019, "step": 420 }, { "epoch": 0.02416352238925156, "grad_norm": 0.2176397144794464, "learning_rate": 0.00031883295487569063, "loss": 2.8183, "step": 421 }, { "epoch": 0.02422091792936855, "grad_norm": 0.23891203105449677, "learning_rate": 0.00031836910713336857, "loss": 2.8022, "step": 422 }, { "epoch": 0.024278313469485537, "grad_norm": 0.18507017195224762, "learning_rate": 0.0003179106702214893, "loss": 2.8013, "step": 423 }, { "epoch": 0.024335709009602524, "grad_norm": 0.20408926904201508, "learning_rate": 0.0003174576629845987, "loss": 2.8085, "step": 424 }, { "epoch": 0.024393104549719512, "grad_norm": 0.18055075407028198, "learning_rate": 0.00031701010404404996, "loss": 2.8341, "step": 425 }, { "epoch": 0.024450500089836496, "grad_norm": 0.22974956035614014, "learning_rate": 0.0003165680117972382, "loss": 2.8044, "step": 426 }, { "epoch": 0.024507895629953484, "grad_norm": 0.17688511312007904, "learning_rate": 0.00031613140441684413, "loss": 2.7866, "step": 427 }, { "epoch": 0.02456529117007047, "grad_norm": 0.22350828349590302, "learning_rate": 0.000315700299850087, "loss": 2.7939, "step": 428 }, { "epoch": 0.02462268671018746, "grad_norm": 0.2138863056898117, "learning_rate": 0.0003152747158179871, "loss": 2.8112, "step": 429 }, { "epoch": 0.024680082250304446, "grad_norm": 0.1666262447834015, "learning_rate": 0.0003148546698146371, "loss": 2.8464, "step": 430 }, { "epoch": 0.024737477790421434, "grad_norm": 0.23217864334583282, "learning_rate": 0.00031444017910648293, "loss": 2.8154, "step": 431 }, { "epoch": 0.02479487333053842, "grad_norm": 0.23967209458351135, "learning_rate": 0.00031403126073161424, "loss": 2.8068, "step": 432 }, { "epoch": 0.024852268870655406, "grad_norm": 0.2363416850566864, "learning_rate": 0.0003136279314990637, "loss": 2.832, "step": 433 }, { "epoch": 0.024909664410772393, "grad_norm": 0.20204566419124603, "learning_rate": 0.00031323020798811643, "loss": 2.8118, "step": 434 }, { "epoch": 0.02496705995088938, "grad_norm": 0.2645012438297272, "learning_rate": 0.00031283810654762816, "loss": 2.7988, "step": 435 }, { "epoch": 0.02502445549100637, "grad_norm": 0.31096434593200684, "learning_rate": 0.0003124516432953532, "loss": 2.8021, "step": 436 }, { "epoch": 0.025081851031123356, "grad_norm": 0.25740697979927063, "learning_rate": 0.00031207083411728236, "loss": 2.828, "step": 437 }, { "epoch": 0.025139246571240344, "grad_norm": 0.24895477294921875, "learning_rate": 0.00031169569466698937, "loss": 2.8073, "step": 438 }, { "epoch": 0.02519664211135733, "grad_norm": 0.2860502004623413, "learning_rate": 0.00031132624036498774, "loss": 2.8275, "step": 439 }, { "epoch": 0.025254037651474315, "grad_norm": 0.3134096562862396, "learning_rate": 0.00031096248639809674, "loss": 2.816, "step": 440 }, { "epoch": 0.025311433191591303, "grad_norm": 0.2185070812702179, "learning_rate": 0.0003106044477188172, "loss": 2.7799, "step": 441 }, { "epoch": 0.02536882873170829, "grad_norm": 0.3582714796066284, "learning_rate": 0.0003102521390447169, "loss": 2.7923, "step": 442 }, { "epoch": 0.02542622427182528, "grad_norm": 0.19494207203388214, "learning_rate": 0.00030990557485782553, "loss": 2.7999, "step": 443 }, { "epoch": 0.025483619811942266, "grad_norm": 0.2574940025806427, "learning_rate": 0.0003095647694040394, "loss": 2.8087, "step": 444 }, { "epoch": 0.025541015352059254, "grad_norm": 0.17501215636730194, "learning_rate": 0.0003092297366925359, "loss": 2.7817, "step": 445 }, { "epoch": 0.025598410892176238, "grad_norm": 0.4073377251625061, "learning_rate": 0.0003089004904951976, "loss": 2.813, "step": 446 }, { "epoch": 0.025655806432293225, "grad_norm": 0.21654489636421204, "learning_rate": 0.000308577044346046, "loss": 2.8165, "step": 447 }, { "epoch": 0.025713201972410213, "grad_norm": 0.26500189304351807, "learning_rate": 0.0003082594115406856, "loss": 2.8229, "step": 448 }, { "epoch": 0.0257705975125272, "grad_norm": 0.188262477517128, "learning_rate": 0.00030794760513575675, "loss": 2.8112, "step": 449 }, { "epoch": 0.025827993052644188, "grad_norm": 0.3432970643043518, "learning_rate": 0.00030764163794839966, "loss": 2.8241, "step": 450 }, { "epoch": 0.025885388592761176, "grad_norm": 0.23415225744247437, "learning_rate": 0.0003073415225557269, "loss": 2.8039, "step": 451 }, { "epoch": 0.025942784132878163, "grad_norm": 0.2670385241508484, "learning_rate": 0.0003070472712943069, "loss": 2.8215, "step": 452 }, { "epoch": 0.026000179672995147, "grad_norm": 0.17434735596179962, "learning_rate": 0.00030675889625965646, "loss": 2.8352, "step": 453 }, { "epoch": 0.026057575213112135, "grad_norm": 0.2789264917373657, "learning_rate": 0.0003064764093057437, "loss": 2.7856, "step": 454 }, { "epoch": 0.026114970753229123, "grad_norm": 0.2666022479534149, "learning_rate": 0.0003061998220445009, "loss": 2.8063, "step": 455 }, { "epoch": 0.02617236629334611, "grad_norm": 0.22438260912895203, "learning_rate": 0.00030592914584534706, "loss": 2.7783, "step": 456 }, { "epoch": 0.026229761833463098, "grad_norm": 0.2177169770002365, "learning_rate": 0.00030566439183472063, "loss": 2.786, "step": 457 }, { "epoch": 0.026287157373580086, "grad_norm": 0.22771142423152924, "learning_rate": 0.000305405570895622, "loss": 2.7881, "step": 458 }, { "epoch": 0.02634455291369707, "grad_norm": 0.29228097200393677, "learning_rate": 0.00030515269366716613, "loss": 2.7876, "step": 459 }, { "epoch": 0.026401948453814057, "grad_norm": 0.18204721808433533, "learning_rate": 0.00030490577054414553, "loss": 2.8153, "step": 460 }, { "epoch": 0.026459343993931045, "grad_norm": 0.19830970466136932, "learning_rate": 0.0003046648116766027, "loss": 2.7884, "step": 461 }, { "epoch": 0.026516739534048032, "grad_norm": 0.17311398684978485, "learning_rate": 0.00030442982696941276, "loss": 2.8055, "step": 462 }, { "epoch": 0.02657413507416502, "grad_norm": 0.21194536983966827, "learning_rate": 0.0003042008260818768, "loss": 2.815, "step": 463 }, { "epoch": 0.026631530614282008, "grad_norm": 0.22366400063037872, "learning_rate": 0.0003039778184273243, "loss": 2.7994, "step": 464 }, { "epoch": 0.026688926154398995, "grad_norm": 0.17785237729549408, "learning_rate": 0.00030376081317272645, "loss": 2.8049, "step": 465 }, { "epoch": 0.02674632169451598, "grad_norm": 0.2285715490579605, "learning_rate": 0.00030354981923831934, "loss": 2.8105, "step": 466 }, { "epoch": 0.026803717234632967, "grad_norm": 0.17985928058624268, "learning_rate": 0.0003033448452972373, "loss": 2.8246, "step": 467 }, { "epoch": 0.026861112774749955, "grad_norm": 0.2026437669992447, "learning_rate": 0.000303145899775156, "loss": 2.8192, "step": 468 }, { "epoch": 0.026918508314866942, "grad_norm": 0.2605213522911072, "learning_rate": 0.0003029529908499469, "loss": 2.826, "step": 469 }, { "epoch": 0.02697590385498393, "grad_norm": 0.22592206299304962, "learning_rate": 0.00030276612645134017, "loss": 2.7987, "step": 470 }, { "epoch": 0.027033299395100917, "grad_norm": 0.2988434433937073, "learning_rate": 0.0003025853142605994, "loss": 2.826, "step": 471 }, { "epoch": 0.027090694935217905, "grad_norm": 0.2247052788734436, "learning_rate": 0.0003024105617102055, "loss": 2.815, "step": 472 }, { "epoch": 0.02714809047533489, "grad_norm": 0.26565778255462646, "learning_rate": 0.00030224187598355145, "loss": 2.8283, "step": 473 }, { "epoch": 0.027205486015451877, "grad_norm": 0.2834932804107666, "learning_rate": 0.00030207926401464675, "loss": 2.8088, "step": 474 }, { "epoch": 0.027262881555568864, "grad_norm": 0.2396688312292099, "learning_rate": 0.0003019227324878324, "loss": 2.8024, "step": 475 }, { "epoch": 0.027320277095685852, "grad_norm": 0.2600051760673523, "learning_rate": 0.0003017722878375066, "loss": 2.8258, "step": 476 }, { "epoch": 0.02737767263580284, "grad_norm": 0.26368406414985657, "learning_rate": 0.00030162793624785957, "loss": 2.7875, "step": 477 }, { "epoch": 0.027435068175919827, "grad_norm": 0.389852911233902, "learning_rate": 0.0003014896836526197, "loss": 2.8166, "step": 478 }, { "epoch": 0.02749246371603681, "grad_norm": 0.23984675109386444, "learning_rate": 0.0003013575357348098, "loss": 2.8025, "step": 479 }, { "epoch": 0.0275498592561538, "grad_norm": 0.24591901898384094, "learning_rate": 0.00030123149792651307, "loss": 2.7898, "step": 480 }, { "epoch": 0.027607254796270787, "grad_norm": 0.24797213077545166, "learning_rate": 0.00030111157540865026, "loss": 2.8291, "step": 481 }, { "epoch": 0.027664650336387774, "grad_norm": 0.2542579770088196, "learning_rate": 0.0003009977731107663, "loss": 2.7868, "step": 482 }, { "epoch": 0.027722045876504762, "grad_norm": 0.21780452132225037, "learning_rate": 0.00030089009571082794, "loss": 2.8051, "step": 483 }, { "epoch": 0.02777944141662175, "grad_norm": 0.2790198028087616, "learning_rate": 0.0003007885476350314, "loss": 2.8004, "step": 484 }, { "epoch": 0.027836836956738737, "grad_norm": 0.2793212831020355, "learning_rate": 0.00030069313305762025, "loss": 2.8077, "step": 485 }, { "epoch": 0.02789423249685572, "grad_norm": 0.2663847506046295, "learning_rate": 0.0003006038559007141, "loss": 2.805, "step": 486 }, { "epoch": 0.02795162803697271, "grad_norm": 0.2695571482181549, "learning_rate": 0.0003005207198341473, "loss": 2.8102, "step": 487 }, { "epoch": 0.028009023577089696, "grad_norm": 0.3027716875076294, "learning_rate": 0.0003004437282753177, "loss": 2.7944, "step": 488 }, { "epoch": 0.028066419117206684, "grad_norm": 0.25220444798469543, "learning_rate": 0.0003003728843890469, "loss": 2.781, "step": 489 }, { "epoch": 0.02812381465732367, "grad_norm": 0.2733742594718933, "learning_rate": 0.0003003081910874495, "loss": 2.8138, "step": 490 }, { "epoch": 0.02818121019744066, "grad_norm": 0.23873530328273773, "learning_rate": 0.00030024965102981387, "loss": 2.8017, "step": 491 }, { "epoch": 0.028238605737557643, "grad_norm": 0.29158100485801697, "learning_rate": 0.0003001972666224923, "loss": 2.8084, "step": 492 }, { "epoch": 0.02829600127767463, "grad_norm": 0.3079324960708618, "learning_rate": 0.00030015104001880274, "loss": 2.8061, "step": 493 }, { "epoch": 0.02835339681779162, "grad_norm": 0.2448122203350067, "learning_rate": 0.00030011097311893984, "loss": 2.7817, "step": 494 }, { "epoch": 0.028410792357908606, "grad_norm": 0.3495275378227234, "learning_rate": 0.00030007706756989683, "loss": 2.8053, "step": 495 }, { "epoch": 0.028468187898025594, "grad_norm": 0.19935691356658936, "learning_rate": 0.000300049324765398, "loss": 2.7985, "step": 496 }, { "epoch": 0.02852558343814258, "grad_norm": 0.30157798528671265, "learning_rate": 0.0003000277458458415, "loss": 2.8271, "step": 497 }, { "epoch": 0.02858297897825957, "grad_norm": 0.23343823850154877, "learning_rate": 0.00030001233169825214, "loss": 2.807, "step": 498 }, { "epoch": 0.028640374518376553, "grad_norm": 0.25404173135757446, "learning_rate": 0.0003000030829562451, "loss": 2.8072, "step": 499 }, { "epoch": 0.02869777005849354, "grad_norm": 0.28863540291786194, "learning_rate": 0.0003, "loss": 2.8088, "step": 500 }, { "epoch": 0.02869777005849354, "eval_loss": 2.735079288482666, "eval_runtime": 85.4355, "eval_samples_per_second": 50.506, "eval_steps_per_second": 12.629, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.69922551431168e+17, "train_batch_size": 22, "trial_name": null, "trial_params": null }