{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1079, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009267840593141798, "grad_norm": 356.406982421875, "learning_rate": 0.005, "loss": 15.9, "step": 1 }, { "epoch": 0.0018535681186283596, "grad_norm": 32.9332389831543, "learning_rate": 0.0049999894033994794, "loss": 13.6, "step": 2 }, { "epoch": 0.0027803521779425394, "grad_norm": 10.453313827514648, "learning_rate": 0.004999957613687751, "loss": 21.425, "step": 3 }, { "epoch": 0.0037071362372567192, "grad_norm": 3.510478973388672, "learning_rate": 0.004999904631134301, "loss": 15.225, "step": 4 }, { "epoch": 0.004633920296570899, "grad_norm": 35.607364654541016, "learning_rate": 0.004999830456188281, "loss": 18.325, "step": 5 }, { "epoch": 0.005560704355885079, "grad_norm": 4.46471643447876, "learning_rate": 0.004999735089478491, "loss": 19.7, "step": 6 }, { "epoch": 0.006487488415199258, "grad_norm": 1.207599401473999, "learning_rate": 0.004999618531813382, "loss": 14.125, "step": 7 }, { "epoch": 0.0074142724745134385, "grad_norm": 46.56653594970703, "learning_rate": 0.004999480784181046, "loss": 32.7, "step": 8 }, { "epoch": 0.008341056533827619, "grad_norm": 2.0620079040527344, "learning_rate": 0.004999321847749208, "loss": 13.4, "step": 9 }, { "epoch": 0.009267840593141797, "grad_norm": 3.376063823699951, "learning_rate": 0.0049991417238652155, "loss": 13.3, "step": 10 }, { "epoch": 0.010194624652455977, "grad_norm": 0.6672539710998535, "learning_rate": 0.004998940414056032, "loss": 13.4375, "step": 11 }, { "epoch": 0.011121408711770158, "grad_norm": 0.4186709523200989, "learning_rate": 0.004998717920028215, "loss": 12.6375, "step": 12 }, { "epoch": 0.012048192771084338, "grad_norm": 0.4992158114910126, "learning_rate": 0.00499847424366791, "loss": 11.6625, "step": 13 }, { "epoch": 0.012974976830398516, "grad_norm": 0.21440155804157257, "learning_rate": 0.004998209387040828, "loss": 10.5375, "step": 14 }, { "epoch": 0.013901760889712697, "grad_norm": 2.2223408222198486, "learning_rate": 0.004997923352392236, "loss": 11.6, "step": 15 }, { "epoch": 0.014828544949026877, "grad_norm": 1.4461462497711182, "learning_rate": 0.004997616142146927, "loss": 12.7125, "step": 16 }, { "epoch": 0.015755329008341055, "grad_norm": 1.9746646881103516, "learning_rate": 0.004997287758909209, "loss": 12.2125, "step": 17 }, { "epoch": 0.016682113067655237, "grad_norm": 8.858609199523926, "learning_rate": 0.004996938205462881, "loss": 14.0625, "step": 18 }, { "epoch": 0.017608897126969416, "grad_norm": 0.9914843440055847, "learning_rate": 0.004996567484771203, "loss": 11.35, "step": 19 }, { "epoch": 0.018535681186283594, "grad_norm": 0.8945605158805847, "learning_rate": 0.004996175599976878, "loss": 11.725, "step": 20 }, { "epoch": 0.019462465245597776, "grad_norm": 1.340647578239441, "learning_rate": 0.004995762554402026, "loss": 12.8875, "step": 21 }, { "epoch": 0.020389249304911955, "grad_norm": 0.6224690079689026, "learning_rate": 0.004995328351548148, "loss": 11.7, "step": 22 }, { "epoch": 0.021316033364226137, "grad_norm": 0.6904886960983276, "learning_rate": 0.004994872995096104, "loss": 10.6375, "step": 23 }, { "epoch": 0.022242817423540315, "grad_norm": 0.7552493214607239, "learning_rate": 0.004994396488906078, "loss": 13.275, "step": 24 }, { "epoch": 0.023169601482854494, "grad_norm": 0.1830722540616989, "learning_rate": 0.004993898837017547, "loss": 10.225, "step": 25 }, { "epoch": 0.024096385542168676, "grad_norm": 0.31753918528556824, "learning_rate": 0.004993380043649245, "loss": 10.0875, "step": 26 }, { "epoch": 0.025023169601482854, "grad_norm": 0.17651186883449554, "learning_rate": 0.00499284011319913, "loss": 9.675, "step": 27 }, { "epoch": 0.025949953660797033, "grad_norm": 0.1835695058107376, "learning_rate": 0.004992279050244343, "loss": 9.625, "step": 28 }, { "epoch": 0.026876737720111215, "grad_norm": 0.15531466901302338, "learning_rate": 0.004991696859541173, "loss": 9.525, "step": 29 }, { "epoch": 0.027803521779425393, "grad_norm": 0.1167324110865593, "learning_rate": 0.004991093546025012, "loss": 9.3375, "step": 30 }, { "epoch": 0.028730305838739572, "grad_norm": 0.06774014979600906, "learning_rate": 0.004990469114810318, "loss": 9.275, "step": 31 }, { "epoch": 0.029657089898053754, "grad_norm": 0.11318591982126236, "learning_rate": 0.004989823571190571, "loss": 9.2875, "step": 32 }, { "epoch": 0.030583873957367932, "grad_norm": 0.039967115968465805, "learning_rate": 0.004989156920638226, "loss": 9.225, "step": 33 }, { "epoch": 0.03151065801668211, "grad_norm": 0.07919777184724808, "learning_rate": 0.004988469168804664, "loss": 9.2375, "step": 34 }, { "epoch": 0.03243744207599629, "grad_norm": 0.04368596524000168, "learning_rate": 0.0049877603215201525, "loss": 9.1875, "step": 35 }, { "epoch": 0.033364226135310475, "grad_norm": 0.04921940341591835, "learning_rate": 0.004987030384793787, "loss": 9.1875, "step": 36 }, { "epoch": 0.03429101019462465, "grad_norm": 0.040833037346601486, "learning_rate": 0.0049862793648134465, "loss": 9.1625, "step": 37 }, { "epoch": 0.03521779425393883, "grad_norm": 0.03423991799354553, "learning_rate": 0.004985507267945738, "loss": 9.1125, "step": 38 }, { "epoch": 0.03614457831325301, "grad_norm": 0.04628804698586464, "learning_rate": 0.004984714100735943, "loss": 9.1375, "step": 39 }, { "epoch": 0.03707136237256719, "grad_norm": 0.02513456903398037, "learning_rate": 0.0049838998699079625, "loss": 9.125, "step": 40 }, { "epoch": 0.037998146431881374, "grad_norm": 0.04390294477343559, "learning_rate": 0.00498306458236426, "loss": 9.125, "step": 41 }, { "epoch": 0.03892493049119555, "grad_norm": 0.02223977819085121, "learning_rate": 0.004982208245185801, "loss": 9.1125, "step": 42 }, { "epoch": 0.03985171455050973, "grad_norm": 0.03464260324835777, "learning_rate": 0.004981330865631997, "loss": 9.1125, "step": 43 }, { "epoch": 0.04077849860982391, "grad_norm": 0.0259235929697752, "learning_rate": 0.00498043245114064, "loss": 9.0625, "step": 44 }, { "epoch": 0.04170528266913809, "grad_norm": 0.023725276812911034, "learning_rate": 0.004979513009327842, "loss": 9.1, "step": 45 }, { "epoch": 0.042632066728452274, "grad_norm": 0.022491367533802986, "learning_rate": 0.004978572547987968, "loss": 9.05, "step": 46 }, { "epoch": 0.04355885078776645, "grad_norm": 0.018162831664085388, "learning_rate": 0.004977611075093574, "loss": 9.0875, "step": 47 }, { "epoch": 0.04448563484708063, "grad_norm": 0.033248819410800934, "learning_rate": 0.004976628598795336, "loss": 9.025, "step": 48 }, { "epoch": 0.04541241890639481, "grad_norm": 0.015689486637711525, "learning_rate": 0.0049756251274219775, "loss": 9.0625, "step": 49 }, { "epoch": 0.04633920296570899, "grad_norm": 0.022721588611602783, "learning_rate": 0.00497460066948021, "loss": 9.0375, "step": 50 }, { "epoch": 0.047265987025023166, "grad_norm": 0.020086370408535004, "learning_rate": 0.00497355523365465, "loss": 9.0625, "step": 51 }, { "epoch": 0.04819277108433735, "grad_norm": 0.01713702268898487, "learning_rate": 0.00497248882880775, "loss": 9.0375, "step": 52 }, { "epoch": 0.04911955514365153, "grad_norm": 0.01819983310997486, "learning_rate": 0.004971401463979721, "loss": 9.0375, "step": 53 }, { "epoch": 0.05004633920296571, "grad_norm": 0.01858202926814556, "learning_rate": 0.004970293148388463, "loss": 9.0125, "step": 54 }, { "epoch": 0.05097312326227989, "grad_norm": 0.016383878886699677, "learning_rate": 0.004969163891429476, "loss": 9.0, "step": 55 }, { "epoch": 0.051899907321594066, "grad_norm": 0.01655055209994316, "learning_rate": 0.0049680137026757885, "loss": 9.025, "step": 56 }, { "epoch": 0.05282669138090825, "grad_norm": 0.01438821293413639, "learning_rate": 0.004966842591877872, "loss": 9.0, "step": 57 }, { "epoch": 0.05375347544022243, "grad_norm": 0.01816794089972973, "learning_rate": 0.004965650568963563, "loss": 9.0, "step": 58 }, { "epoch": 0.05468025949953661, "grad_norm": 0.017415305599570274, "learning_rate": 0.004964437644037973, "loss": 8.9625, "step": 59 }, { "epoch": 0.05560704355885079, "grad_norm": 0.017612161114811897, "learning_rate": 0.004963203827383406, "loss": 8.975, "step": 60 }, { "epoch": 0.056533827618164965, "grad_norm": 0.014700948260724545, "learning_rate": 0.0049619491294592725, "loss": 9.0, "step": 61 }, { "epoch": 0.057460611677479144, "grad_norm": 0.0167540330439806, "learning_rate": 0.004960673560901999, "loss": 8.9875, "step": 62 }, { "epoch": 0.05838739573679333, "grad_norm": 0.029445504769682884, "learning_rate": 0.004959377132524938, "loss": 8.9625, "step": 63 }, { "epoch": 0.05931417979610751, "grad_norm": 0.013282664120197296, "learning_rate": 0.004958059855318275, "loss": 8.9625, "step": 64 }, { "epoch": 0.060240963855421686, "grad_norm": 0.019158177077770233, "learning_rate": 0.00495672174044894, "loss": 8.9, "step": 65 }, { "epoch": 0.061167747914735865, "grad_norm": 0.02090335451066494, "learning_rate": 0.004955362799260506, "loss": 8.9125, "step": 66 }, { "epoch": 0.06209453197405004, "grad_norm": 0.019786162301898003, "learning_rate": 0.004953983043273102, "loss": 8.95, "step": 67 }, { "epoch": 0.06302131603336422, "grad_norm": 0.0192793570458889, "learning_rate": 0.004952582484183302, "loss": 8.925, "step": 68 }, { "epoch": 0.0639481000926784, "grad_norm": 0.029085692018270493, "learning_rate": 0.0049511611338640404, "loss": 8.9625, "step": 69 }, { "epoch": 0.06487488415199258, "grad_norm": 0.028297357261180878, "learning_rate": 0.004949719004364503, "loss": 8.925, "step": 70 }, { "epoch": 0.06580166821130677, "grad_norm": 0.013140903785824776, "learning_rate": 0.0049482561079100245, "loss": 8.925, "step": 71 }, { "epoch": 0.06672845227062095, "grad_norm": 0.016508571803569794, "learning_rate": 0.004946772456901989, "loss": 8.95, "step": 72 }, { "epoch": 0.06765523632993513, "grad_norm": 0.028362734243273735, "learning_rate": 0.004945268063917723, "loss": 8.9375, "step": 73 }, { "epoch": 0.0685820203892493, "grad_norm": 0.028645526617765427, "learning_rate": 0.004943742941710386, "loss": 8.9375, "step": 74 }, { "epoch": 0.06950880444856349, "grad_norm": 0.010765830054879189, "learning_rate": 0.004942197103208867, "loss": 8.925, "step": 75 }, { "epoch": 0.07043558850787766, "grad_norm": 0.022227909415960312, "learning_rate": 0.004940630561517674, "loss": 8.9375, "step": 76 }, { "epoch": 0.07136237256719184, "grad_norm": 0.020959695801138878, "learning_rate": 0.004939043329916819, "loss": 8.95, "step": 77 }, { "epoch": 0.07228915662650602, "grad_norm": 0.01679840497672558, "learning_rate": 0.00493743542186171, "loss": 8.925, "step": 78 }, { "epoch": 0.0732159406858202, "grad_norm": 0.01441862341016531, "learning_rate": 0.004935806850983033, "loss": 8.9125, "step": 79 }, { "epoch": 0.07414272474513438, "grad_norm": 0.014738287776708603, "learning_rate": 0.004934157631086642, "loss": 8.9, "step": 80 }, { "epoch": 0.07506950880444857, "grad_norm": 0.013974464498460293, "learning_rate": 0.004932487776153435, "loss": 8.875, "step": 81 }, { "epoch": 0.07599629286376275, "grad_norm": 0.014242907054722309, "learning_rate": 0.004930797300339241, "loss": 8.8875, "step": 82 }, { "epoch": 0.07692307692307693, "grad_norm": 0.014142482541501522, "learning_rate": 0.004929086217974697, "loss": 8.875, "step": 83 }, { "epoch": 0.0778498609823911, "grad_norm": 0.011345421895384789, "learning_rate": 0.0049273545435651305, "loss": 8.9, "step": 84 }, { "epoch": 0.07877664504170528, "grad_norm": 0.01937839388847351, "learning_rate": 0.004925602291790427, "loss": 8.875, "step": 85 }, { "epoch": 0.07970342910101946, "grad_norm": 0.019322404637932777, "learning_rate": 0.0049238294775049195, "loss": 8.875, "step": 86 }, { "epoch": 0.08063021316033364, "grad_norm": 0.02427850104868412, "learning_rate": 0.004922036115737251, "loss": 8.875, "step": 87 }, { "epoch": 0.08155699721964782, "grad_norm": 0.02773062139749527, "learning_rate": 0.0049202222216902505, "loss": 8.875, "step": 88 }, { "epoch": 0.082483781278962, "grad_norm": 0.022121064364910126, "learning_rate": 0.0049183878107408084, "loss": 8.875, "step": 89 }, { "epoch": 0.08341056533827618, "grad_norm": 0.014306942000985146, "learning_rate": 0.00491653289843974, "loss": 8.85, "step": 90 }, { "epoch": 0.08433734939759036, "grad_norm": 0.01174082513898611, "learning_rate": 0.004914657500511657, "loss": 8.85, "step": 91 }, { "epoch": 0.08526413345690455, "grad_norm": 0.017720786854624748, "learning_rate": 0.004912761632854833, "loss": 8.8625, "step": 92 }, { "epoch": 0.08619091751621873, "grad_norm": 0.023863809183239937, "learning_rate": 0.004910845311541071, "loss": 8.8625, "step": 93 }, { "epoch": 0.0871177015755329, "grad_norm": 0.034596893936395645, "learning_rate": 0.004908908552815563, "loss": 8.8625, "step": 94 }, { "epoch": 0.08804448563484708, "grad_norm": 0.04321544989943504, "learning_rate": 0.004906951373096757, "loss": 8.85, "step": 95 }, { "epoch": 0.08897126969416126, "grad_norm": 0.05180607736110687, "learning_rate": 0.004904973788976213, "loss": 8.8625, "step": 96 }, { "epoch": 0.08989805375347544, "grad_norm": 0.04927121847867966, "learning_rate": 0.004902975817218467, "loss": 8.825, "step": 97 }, { "epoch": 0.09082483781278962, "grad_norm": 0.030304012820124626, "learning_rate": 0.004900957474760885, "loss": 8.825, "step": 98 }, { "epoch": 0.0917516218721038, "grad_norm": 0.018640510737895966, "learning_rate": 0.004898918778713524, "loss": 8.8, "step": 99 }, { "epoch": 0.09267840593141798, "grad_norm": 0.033853888511657715, "learning_rate": 0.004896859746358979, "loss": 8.7875, "step": 100 }, { "epoch": 0.09360518999073215, "grad_norm": 0.04043276980519295, "learning_rate": 0.004894780395152247, "loss": 8.775, "step": 101 }, { "epoch": 0.09453197405004633, "grad_norm": 0.0534222349524498, "learning_rate": 0.004892680742720571, "loss": 8.7375, "step": 102 }, { "epoch": 0.09545875810936053, "grad_norm": 0.082061268389225, "learning_rate": 0.004890560806863293, "loss": 8.8, "step": 103 }, { "epoch": 0.0963855421686747, "grad_norm": 0.05508153885602951, "learning_rate": 0.004888420605551703, "loss": 8.775, "step": 104 }, { "epoch": 0.09731232622798888, "grad_norm": 0.04220907762646675, "learning_rate": 0.004886260156928888, "loss": 8.7625, "step": 105 }, { "epoch": 0.09823911028730306, "grad_norm": 0.04727254807949066, "learning_rate": 0.004884079479309578, "loss": 8.7875, "step": 106 }, { "epoch": 0.09916589434661724, "grad_norm": 0.04981837049126625, "learning_rate": 0.004881878591179988, "loss": 8.75, "step": 107 }, { "epoch": 0.10009267840593142, "grad_norm": 0.039716847240924835, "learning_rate": 0.004879657511197662, "loss": 8.675, "step": 108 }, { "epoch": 0.1010194624652456, "grad_norm": 0.028658628463745117, "learning_rate": 0.0048774162581913215, "loss": 8.675, "step": 109 }, { "epoch": 0.10194624652455977, "grad_norm": 0.03913936764001846, "learning_rate": 0.0048751548511606945, "loss": 8.6625, "step": 110 }, { "epoch": 0.10287303058387395, "grad_norm": 0.027623698115348816, "learning_rate": 0.004872873309276362, "loss": 8.6625, "step": 111 }, { "epoch": 0.10379981464318813, "grad_norm": 0.0399942547082901, "learning_rate": 0.004870571651879596, "loss": 8.6625, "step": 112 }, { "epoch": 0.10472659870250231, "grad_norm": 0.02140922099351883, "learning_rate": 0.00486824989848219, "loss": 8.5875, "step": 113 }, { "epoch": 0.1056533827618165, "grad_norm": 0.0371641181409359, "learning_rate": 0.0048659080687663, "loss": 8.6, "step": 114 }, { "epoch": 0.10658016682113068, "grad_norm": 0.018301891162991524, "learning_rate": 0.004863546182584273, "loss": 8.575, "step": 115 }, { "epoch": 0.10750695088044486, "grad_norm": 0.029274851083755493, "learning_rate": 0.0048611642599584795, "loss": 8.55, "step": 116 }, { "epoch": 0.10843373493975904, "grad_norm": 0.025735612958669662, "learning_rate": 0.004858762321081146, "loss": 8.525, "step": 117 }, { "epoch": 0.10936051899907322, "grad_norm": 0.036481715738773346, "learning_rate": 0.004856340386314182, "loss": 8.4875, "step": 118 }, { "epoch": 0.1102873030583874, "grad_norm": 0.11254877597093582, "learning_rate": 0.004853898476189007, "loss": 8.5375, "step": 119 }, { "epoch": 0.11121408711770157, "grad_norm": 0.19445450603961945, "learning_rate": 0.00485143661140638, "loss": 8.85, "step": 120 }, { "epoch": 0.11214087117701575, "grad_norm": 0.16596297919750214, "learning_rate": 0.004848954812836217, "loss": 8.7625, "step": 121 }, { "epoch": 0.11306765523632993, "grad_norm": 0.044869761914014816, "learning_rate": 0.004846453101517421, "loss": 8.5125, "step": 122 }, { "epoch": 0.11399443929564411, "grad_norm": 0.08229261636734009, "learning_rate": 0.0048439314986577, "loss": 8.6, "step": 123 }, { "epoch": 0.11492122335495829, "grad_norm": 0.04814854636788368, "learning_rate": 0.00484139002563339, "loss": 8.475, "step": 124 }, { "epoch": 0.11584800741427248, "grad_norm": 0.07902152091264725, "learning_rate": 0.004838828703989269, "loss": 8.55, "step": 125 }, { "epoch": 0.11677479147358666, "grad_norm": 0.02725468948483467, "learning_rate": 0.0048362475554383786, "loss": 8.4, "step": 126 }, { "epoch": 0.11770157553290084, "grad_norm": 0.05269164219498634, "learning_rate": 0.004833646601861841, "loss": 8.4375, "step": 127 }, { "epoch": 0.11862835959221502, "grad_norm": 0.03333018347620964, "learning_rate": 0.004831025865308667, "loss": 8.3625, "step": 128 }, { "epoch": 0.1195551436515292, "grad_norm": 0.040032755583524704, "learning_rate": 0.004828385367995575, "loss": 8.325, "step": 129 }, { "epoch": 0.12048192771084337, "grad_norm": 0.03257158771157265, "learning_rate": 0.004825725132306803, "loss": 8.2625, "step": 130 }, { "epoch": 0.12140871177015755, "grad_norm": 0.03259531036019325, "learning_rate": 0.0048230451807939135, "loss": 8.225, "step": 131 }, { "epoch": 0.12233549582947173, "grad_norm": 0.03383934497833252, "learning_rate": 0.004820345536175607, "loss": 8.2, "step": 132 }, { "epoch": 0.12326227988878591, "grad_norm": 0.02867773361504078, "learning_rate": 0.004817626221337529, "loss": 8.15, "step": 133 }, { "epoch": 0.12418906394810009, "grad_norm": 0.03943765163421631, "learning_rate": 0.004814887259332073, "loss": 8.125, "step": 134 }, { "epoch": 0.12511584800741427, "grad_norm": 0.034471139311790466, "learning_rate": 0.004812128673378188, "loss": 7.9875, "step": 135 }, { "epoch": 0.12604263206672844, "grad_norm": 0.03869534283876419, "learning_rate": 0.004809350486861181, "loss": 7.95, "step": 136 }, { "epoch": 0.12696941612604262, "grad_norm": 0.03380202502012253, "learning_rate": 0.0048065527233325175, "loss": 7.875, "step": 137 }, { "epoch": 0.1278962001853568, "grad_norm": 0.03459366410970688, "learning_rate": 0.004803735406509625, "loss": 7.7812, "step": 138 }, { "epoch": 0.12882298424467098, "grad_norm": 0.0600280836224556, "learning_rate": 0.0048008985602756874, "loss": 7.65, "step": 139 }, { "epoch": 0.12974976830398516, "grad_norm": 0.11870339512825012, "learning_rate": 0.004798042208679445, "loss": 7.6375, "step": 140 }, { "epoch": 0.13067655236329936, "grad_norm": 0.1849852204322815, "learning_rate": 0.0047951663759349915, "loss": 7.7, "step": 141 }, { "epoch": 0.13160333642261354, "grad_norm": 0.15893682837486267, "learning_rate": 0.0047922710864215685, "loss": 7.6375, "step": 142 }, { "epoch": 0.13253012048192772, "grad_norm": 0.10825814306735992, "learning_rate": 0.004789356364683356, "loss": 7.4437, "step": 143 }, { "epoch": 0.1334569045412419, "grad_norm": 0.12936848402023315, "learning_rate": 0.004786422235429268, "loss": 7.3688, "step": 144 }, { "epoch": 0.13438368860055608, "grad_norm": 0.07664606720209122, "learning_rate": 0.0047834687235327415, "loss": 7.2625, "step": 145 }, { "epoch": 0.13531047265987026, "grad_norm": 0.1079607829451561, "learning_rate": 0.0047804958540315235, "loss": 7.2125, "step": 146 }, { "epoch": 0.13623725671918444, "grad_norm": 0.04593510553240776, "learning_rate": 0.004777503652127464, "loss": 7.0687, "step": 147 }, { "epoch": 0.1371640407784986, "grad_norm": 0.06448942422866821, "learning_rate": 0.004774492143186296, "loss": 7.075, "step": 148 }, { "epoch": 0.1380908248378128, "grad_norm": 0.04284033551812172, "learning_rate": 0.004771461352737427, "loss": 6.9688, "step": 149 }, { "epoch": 0.13901760889712697, "grad_norm": 0.048541247844696045, "learning_rate": 0.004768411306473717, "loss": 6.9125, "step": 150 }, { "epoch": 0.13994439295644115, "grad_norm": 0.0369611531496048, "learning_rate": 0.004765342030251263, "loss": 6.8875, "step": 151 }, { "epoch": 0.14087117701575533, "grad_norm": 0.07809454202651978, "learning_rate": 0.004762253550089181, "loss": 6.8375, "step": 152 }, { "epoch": 0.1417979610750695, "grad_norm": 0.030714238062500954, "learning_rate": 0.004759145892169382, "loss": 6.8063, "step": 153 }, { "epoch": 0.14272474513438368, "grad_norm": 0.030746718868613243, "learning_rate": 0.004756019082836354, "loss": 6.7875, "step": 154 }, { "epoch": 0.14365152919369786, "grad_norm": 0.026088058948516846, "learning_rate": 0.004752873148596938, "loss": 6.7438, "step": 155 }, { "epoch": 0.14457831325301204, "grad_norm": 0.017927952110767365, "learning_rate": 0.004749708116120099, "loss": 6.7688, "step": 156 }, { "epoch": 0.14550509731232622, "grad_norm": 0.023661252111196518, "learning_rate": 0.004746524012236706, "loss": 6.725, "step": 157 }, { "epoch": 0.1464318813716404, "grad_norm": 0.018965313211083412, "learning_rate": 0.004743320863939299, "loss": 6.725, "step": 158 }, { "epoch": 0.14735866543095458, "grad_norm": 0.022316887974739075, "learning_rate": 0.004740098698381866, "loss": 6.675, "step": 159 }, { "epoch": 0.14828544949026876, "grad_norm": 0.019958553835749626, "learning_rate": 0.004736857542879608, "loss": 6.6875, "step": 160 }, { "epoch": 0.14921223354958293, "grad_norm": 0.016147589311003685, "learning_rate": 0.004733597424908707, "loss": 6.6875, "step": 161 }, { "epoch": 0.15013901760889714, "grad_norm": 0.020692575722932816, "learning_rate": 0.004730318372106099, "loss": 6.6438, "step": 162 }, { "epoch": 0.15106580166821132, "grad_norm": 0.014802551828324795, "learning_rate": 0.004727020412269234, "loss": 6.6312, "step": 163 }, { "epoch": 0.1519925857275255, "grad_norm": 0.01826154999434948, "learning_rate": 0.004723703573355842, "loss": 6.6375, "step": 164 }, { "epoch": 0.15291936978683968, "grad_norm": 0.014861056581139565, "learning_rate": 0.004720367883483697, "loss": 6.6562, "step": 165 }, { "epoch": 0.15384615384615385, "grad_norm": 0.0160931795835495, "learning_rate": 0.004717013370930377, "loss": 6.6, "step": 166 }, { "epoch": 0.15477293790546803, "grad_norm": 0.02078167535364628, "learning_rate": 0.004713640064133024, "loss": 6.6063, "step": 167 }, { "epoch": 0.1556997219647822, "grad_norm": 0.01577616296708584, "learning_rate": 0.004710247991688109, "loss": 6.5563, "step": 168 }, { "epoch": 0.1566265060240964, "grad_norm": 0.019711369648575783, "learning_rate": 0.0047068371823511795, "loss": 6.575, "step": 169 }, { "epoch": 0.15755329008341057, "grad_norm": 0.01820039190351963, "learning_rate": 0.004703407665036622, "loss": 6.5813, "step": 170 }, { "epoch": 0.15848007414272475, "grad_norm": 0.015363371931016445, "learning_rate": 0.004699959468817417, "loss": 6.5375, "step": 171 }, { "epoch": 0.15940685820203893, "grad_norm": 0.015872852876782417, "learning_rate": 0.004696492622924892, "loss": 6.5687, "step": 172 }, { "epoch": 0.1603336422613531, "grad_norm": 0.016906000673770905, "learning_rate": 0.004693007156748471, "loss": 6.5125, "step": 173 }, { "epoch": 0.16126042632066728, "grad_norm": 0.016961950808763504, "learning_rate": 0.0046895030998354275, "loss": 6.525, "step": 174 }, { "epoch": 0.16218721037998146, "grad_norm": 0.016262684017419815, "learning_rate": 0.004685980481890634, "loss": 6.5062, "step": 175 }, { "epoch": 0.16311399443929564, "grad_norm": 0.014922458678483963, "learning_rate": 0.004682439332776313, "loss": 6.4688, "step": 176 }, { "epoch": 0.16404077849860982, "grad_norm": 0.022018995136022568, "learning_rate": 0.004678879682511777, "loss": 6.5188, "step": 177 }, { "epoch": 0.164967562557924, "grad_norm": 0.014819780364632607, "learning_rate": 0.004675301561273179, "loss": 6.4437, "step": 178 }, { "epoch": 0.16589434661723818, "grad_norm": 0.0183818731456995, "learning_rate": 0.004671704999393256, "loss": 6.4563, "step": 179 }, { "epoch": 0.16682113067655235, "grad_norm": 0.020285405218601227, "learning_rate": 0.004668090027361074, "loss": 6.4563, "step": 180 }, { "epoch": 0.16774791473586653, "grad_norm": 0.0204929132014513, "learning_rate": 0.004664456675821761, "loss": 6.4813, "step": 181 }, { "epoch": 0.1686746987951807, "grad_norm": 0.022332845255732536, "learning_rate": 0.0046608049755762606, "loss": 6.4563, "step": 182 }, { "epoch": 0.1696014828544949, "grad_norm": 0.014836137183010578, "learning_rate": 0.004657134957581057, "loss": 6.4625, "step": 183 }, { "epoch": 0.1705282669138091, "grad_norm": 0.024512965232133865, "learning_rate": 0.0046534466529479235, "loss": 6.4563, "step": 184 }, { "epoch": 0.17145505097312327, "grad_norm": 0.025079630315303802, "learning_rate": 0.004649740092943651, "loss": 6.4188, "step": 185 }, { "epoch": 0.17238183503243745, "grad_norm": 0.032594986259937286, "learning_rate": 0.00464601530898979, "loss": 6.4125, "step": 186 }, { "epoch": 0.17330861909175163, "grad_norm": 0.028524870052933693, "learning_rate": 0.004642272332662377, "loss": 6.4125, "step": 187 }, { "epoch": 0.1742354031510658, "grad_norm": 0.02017652988433838, "learning_rate": 0.0046385111956916735, "loss": 6.3938, "step": 188 }, { "epoch": 0.17516218721038, "grad_norm": 0.023051844909787178, "learning_rate": 0.004634731929961891, "loss": 6.4062, "step": 189 }, { "epoch": 0.17608897126969417, "grad_norm": 0.025438351556658745, "learning_rate": 0.004630934567510925, "loss": 6.3812, "step": 190 }, { "epoch": 0.17701575532900835, "grad_norm": 0.037845317274332047, "learning_rate": 0.004627119140530083, "loss": 6.4062, "step": 191 }, { "epoch": 0.17794253938832252, "grad_norm": 0.05386321246623993, "learning_rate": 0.004623285681363807, "loss": 6.4062, "step": 192 }, { "epoch": 0.1788693234476367, "grad_norm": 0.0913223922252655, "learning_rate": 0.004619434222509408, "loss": 6.3875, "step": 193 }, { "epoch": 0.17979610750695088, "grad_norm": 0.1158546730875969, "learning_rate": 0.00461556479661678, "loss": 6.4563, "step": 194 }, { "epoch": 0.18072289156626506, "grad_norm": 0.08018877357244492, "learning_rate": 0.0046116774364881345, "loss": 6.375, "step": 195 }, { "epoch": 0.18164967562557924, "grad_norm": 0.03276560455560684, "learning_rate": 0.0046077721750777114, "loss": 6.3812, "step": 196 }, { "epoch": 0.18257645968489342, "grad_norm": 0.07004847377538681, "learning_rate": 0.0046038490454915065, "loss": 6.3875, "step": 197 }, { "epoch": 0.1835032437442076, "grad_norm": 0.03939942270517349, "learning_rate": 0.004599908080986991, "loss": 6.325, "step": 198 }, { "epoch": 0.18443002780352177, "grad_norm": 0.0445321649312973, "learning_rate": 0.004595949314972824, "loss": 6.3125, "step": 199 }, { "epoch": 0.18535681186283595, "grad_norm": 0.04666861146688461, "learning_rate": 0.004591972781008576, "loss": 6.3375, "step": 200 }, { "epoch": 0.18628359592215013, "grad_norm": 0.032554373145103455, "learning_rate": 0.0045879785128044425, "loss": 6.3187, "step": 201 }, { "epoch": 0.1872103799814643, "grad_norm": 0.03748049587011337, "learning_rate": 0.004583966544220952, "loss": 6.3313, "step": 202 }, { "epoch": 0.1881371640407785, "grad_norm": 0.02630574069917202, "learning_rate": 0.00457993690926869, "loss": 6.3563, "step": 203 }, { "epoch": 0.18906394810009267, "grad_norm": 0.04539572447538376, "learning_rate": 0.004575889642107998, "loss": 6.3063, "step": 204 }, { "epoch": 0.18999073215940684, "grad_norm": 0.02216522768139839, "learning_rate": 0.0045718247770487, "loss": 6.2812, "step": 205 }, { "epoch": 0.19091751621872105, "grad_norm": 0.05376052483916283, "learning_rate": 0.004567742348549793, "loss": 6.35, "step": 206 }, { "epoch": 0.19184430027803523, "grad_norm": 0.02676314301788807, "learning_rate": 0.004563642391219168, "loss": 6.3, "step": 207 }, { "epoch": 0.1927710843373494, "grad_norm": 0.039810191839933395, "learning_rate": 0.004559524939813316, "loss": 6.2875, "step": 208 }, { "epoch": 0.1936978683966636, "grad_norm": 0.03783705458045006, "learning_rate": 0.0045553900292370254, "loss": 6.2625, "step": 209 }, { "epoch": 0.19462465245597776, "grad_norm": 0.02999858744442463, "learning_rate": 0.004551237694543092, "loss": 6.2438, "step": 210 }, { "epoch": 0.19555143651529194, "grad_norm": 0.0282985121011734, "learning_rate": 0.004547067970932022, "loss": 6.2438, "step": 211 }, { "epoch": 0.19647822057460612, "grad_norm": 0.03198060020804405, "learning_rate": 0.004542880893751732, "loss": 6.2625, "step": 212 }, { "epoch": 0.1974050046339203, "grad_norm": 0.03950299322605133, "learning_rate": 0.00453867649849725, "loss": 6.2188, "step": 213 }, { "epoch": 0.19833178869323448, "grad_norm": 0.026990199461579323, "learning_rate": 0.004534454820810412, "loss": 6.2063, "step": 214 }, { "epoch": 0.19925857275254866, "grad_norm": 0.0420188382267952, "learning_rate": 0.004530215896479564, "loss": 6.2625, "step": 215 }, { "epoch": 0.20018535681186284, "grad_norm": 0.04251977428793907, "learning_rate": 0.004525959761439257, "loss": 6.2063, "step": 216 }, { "epoch": 0.20111214087117701, "grad_norm": 0.06442005932331085, "learning_rate": 0.0045216864517699405, "loss": 6.2125, "step": 217 }, { "epoch": 0.2020389249304912, "grad_norm": 0.05594475567340851, "learning_rate": 0.004517396003697659, "loss": 6.1562, "step": 218 }, { "epoch": 0.20296570898980537, "grad_norm": 0.038938529789447784, "learning_rate": 0.004513088453593744, "loss": 6.1937, "step": 219 }, { "epoch": 0.20389249304911955, "grad_norm": 0.057002611458301544, "learning_rate": 0.0045087638379745065, "loss": 6.175, "step": 220 }, { "epoch": 0.20481927710843373, "grad_norm": 0.047009214758872986, "learning_rate": 0.004504422193500925, "loss": 6.1688, "step": 221 }, { "epoch": 0.2057460611677479, "grad_norm": 0.05817709118127823, "learning_rate": 0.004500063556978336, "loss": 6.1375, "step": 222 }, { "epoch": 0.20667284522706209, "grad_norm": 0.05288264900445938, "learning_rate": 0.004495687965356126, "loss": 6.1688, "step": 223 }, { "epoch": 0.20759962928637626, "grad_norm": 0.03736674785614014, "learning_rate": 0.00449129545572741, "loss": 6.175, "step": 224 }, { "epoch": 0.20852641334569044, "grad_norm": 0.034431926906108856, "learning_rate": 0.004486886065328725, "loss": 6.1125, "step": 225 }, { "epoch": 0.20945319740500462, "grad_norm": 0.03445250913500786, "learning_rate": 0.004482459831539709, "loss": 6.1625, "step": 226 }, { "epoch": 0.21037998146431883, "grad_norm": 0.035410068929195404, "learning_rate": 0.004478016791882787, "loss": 6.0875, "step": 227 }, { "epoch": 0.211306765523633, "grad_norm": 0.026350026950240135, "learning_rate": 0.004473556984022854, "loss": 6.125, "step": 228 }, { "epoch": 0.21223354958294718, "grad_norm": 0.028956936672329903, "learning_rate": 0.0044690804457669505, "loss": 6.1063, "step": 229 }, { "epoch": 0.21316033364226136, "grad_norm": 0.03521239385008812, "learning_rate": 0.004464587215063946, "loss": 6.0875, "step": 230 }, { "epoch": 0.21408711770157554, "grad_norm": 0.04613986983895302, "learning_rate": 0.004460077330004218, "loss": 6.1312, "step": 231 }, { "epoch": 0.21501390176088972, "grad_norm": 0.05228109285235405, "learning_rate": 0.0044555508288193265, "loss": 6.1063, "step": 232 }, { "epoch": 0.2159406858202039, "grad_norm": 0.045205965638160706, "learning_rate": 0.004451007749881691, "loss": 6.1, "step": 233 }, { "epoch": 0.21686746987951808, "grad_norm": 0.028526296839118004, "learning_rate": 0.004446448131704267, "loss": 6.0813, "step": 234 }, { "epoch": 0.21779425393883226, "grad_norm": 0.027809731662273407, "learning_rate": 0.004441872012940214, "loss": 6.075, "step": 235 }, { "epoch": 0.21872103799814643, "grad_norm": 0.04913929104804993, "learning_rate": 0.004437279432382576, "loss": 6.075, "step": 236 }, { "epoch": 0.2196478220574606, "grad_norm": 0.046848297119140625, "learning_rate": 0.004432670428963946, "loss": 6.0938, "step": 237 }, { "epoch": 0.2205746061167748, "grad_norm": 0.0395938940346241, "learning_rate": 0.004428045041756137, "loss": 6.075, "step": 238 }, { "epoch": 0.22150139017608897, "grad_norm": 0.0638502761721611, "learning_rate": 0.004423403309969855, "loss": 6.025, "step": 239 }, { "epoch": 0.22242817423540315, "grad_norm": 0.06795669347047806, "learning_rate": 0.004418745272954361, "loss": 6.0438, "step": 240 }, { "epoch": 0.22335495829471733, "grad_norm": 0.052847135812044144, "learning_rate": 0.004414070970197141, "loss": 6.0625, "step": 241 }, { "epoch": 0.2242817423540315, "grad_norm": 0.04967901483178139, "learning_rate": 0.0044093804413235715, "loss": 6.0375, "step": 242 }, { "epoch": 0.22520852641334568, "grad_norm": 0.0682300478219986, "learning_rate": 0.004404673726096578, "loss": 6.0625, "step": 243 }, { "epoch": 0.22613531047265986, "grad_norm": 0.0553511306643486, "learning_rate": 0.00439995086441631, "loss": 5.9813, "step": 244 }, { "epoch": 0.22706209453197404, "grad_norm": 0.028195617720484734, "learning_rate": 0.004395211896319786, "loss": 6.025, "step": 245 }, { "epoch": 0.22798887859128822, "grad_norm": 0.04402211681008339, "learning_rate": 0.00439045686198057, "loss": 6.0125, "step": 246 }, { "epoch": 0.2289156626506024, "grad_norm": 0.03047800622880459, "learning_rate": 0.00438568580170842, "loss": 5.9938, "step": 247 }, { "epoch": 0.22984244670991658, "grad_norm": 0.03843539580702782, "learning_rate": 0.004380898755948953, "loss": 5.9813, "step": 248 }, { "epoch": 0.23076923076923078, "grad_norm": 0.0366608090698719, "learning_rate": 0.004376095765283298, "loss": 6.0, "step": 249 }, { "epoch": 0.23169601482854496, "grad_norm": 0.06157747656106949, "learning_rate": 0.004371276870427753, "loss": 6.025, "step": 250 }, { "epoch": 0.23262279888785914, "grad_norm": 0.055426549166440964, "learning_rate": 0.004366442112233441, "loss": 5.975, "step": 251 }, { "epoch": 0.23354958294717332, "grad_norm": 0.03506896272301674, "learning_rate": 0.004361591531685964, "loss": 5.9813, "step": 252 }, { "epoch": 0.2344763670064875, "grad_norm": 0.03997468575835228, "learning_rate": 0.004356725169905052, "loss": 5.95, "step": 253 }, { "epoch": 0.23540315106580167, "grad_norm": 0.06662409007549286, "learning_rate": 0.0043518430681442205, "loss": 5.9625, "step": 254 }, { "epoch": 0.23632993512511585, "grad_norm": 0.0542214997112751, "learning_rate": 0.004346945267790413, "loss": 5.9625, "step": 255 }, { "epoch": 0.23725671918443003, "grad_norm": 0.05418306961655617, "learning_rate": 0.004342031810363658, "loss": 5.9625, "step": 256 }, { "epoch": 0.2381835032437442, "grad_norm": 0.08298410475254059, "learning_rate": 0.004337102737516711, "loss": 5.9563, "step": 257 }, { "epoch": 0.2391102873030584, "grad_norm": 0.051485590636730194, "learning_rate": 0.004332158091034705, "loss": 5.9938, "step": 258 }, { "epoch": 0.24003707136237257, "grad_norm": 0.041104063391685486, "learning_rate": 0.004327197912834795, "loss": 5.9125, "step": 259 }, { "epoch": 0.24096385542168675, "grad_norm": 0.06750784069299698, "learning_rate": 0.0043222222449658025, "loss": 5.9563, "step": 260 }, { "epoch": 0.24189063948100092, "grad_norm": 0.05327602103352547, "learning_rate": 0.0043172311296078595, "loss": 5.8812, "step": 261 }, { "epoch": 0.2428174235403151, "grad_norm": 0.05027195066213608, "learning_rate": 0.00431222460907205, "loss": 5.9125, "step": 262 }, { "epoch": 0.24374420759962928, "grad_norm": 0.06142845377326012, "learning_rate": 0.004307202725800052, "loss": 5.9, "step": 263 }, { "epoch": 0.24467099165894346, "grad_norm": 0.06710369884967804, "learning_rate": 0.004302165522363779, "loss": 5.9437, "step": 264 }, { "epoch": 0.24559777571825764, "grad_norm": 0.06705372035503387, "learning_rate": 0.004297113041465017, "loss": 5.9062, "step": 265 }, { "epoch": 0.24652455977757182, "grad_norm": 0.06116189435124397, "learning_rate": 0.004292045325935063, "loss": 5.9, "step": 266 }, { "epoch": 0.247451343836886, "grad_norm": 0.054194726049900055, "learning_rate": 0.004286962418734364, "loss": 5.875, "step": 267 }, { "epoch": 0.24837812789620017, "grad_norm": 0.0627150684595108, "learning_rate": 0.004281864362952147, "loss": 5.8875, "step": 268 }, { "epoch": 0.24930491195551435, "grad_norm": 0.0440673902630806, "learning_rate": 0.004276751201806063, "loss": 5.8938, "step": 269 }, { "epoch": 0.25023169601482853, "grad_norm": 0.034663740545511246, "learning_rate": 0.004271622978641812, "loss": 5.8625, "step": 270 }, { "epoch": 0.2511584800741427, "grad_norm": 0.04779878258705139, "learning_rate": 0.004266479736932779, "loss": 5.8563, "step": 271 }, { "epoch": 0.2520852641334569, "grad_norm": 0.060510262846946716, "learning_rate": 0.004261321520279666, "loss": 5.8563, "step": 272 }, { "epoch": 0.25301204819277107, "grad_norm": 0.05226600542664528, "learning_rate": 0.004256148372410125, "loss": 5.8375, "step": 273 }, { "epoch": 0.25393883225208524, "grad_norm": 0.05810929834842682, "learning_rate": 0.004250960337178377, "loss": 5.8625, "step": 274 }, { "epoch": 0.2548656163113994, "grad_norm": 0.07357963919639587, "learning_rate": 0.004245757458564855, "loss": 5.8688, "step": 275 }, { "epoch": 0.2557924003707136, "grad_norm": 0.07380347698926926, "learning_rate": 0.004240539780675817, "loss": 5.8563, "step": 276 }, { "epoch": 0.2567191844300278, "grad_norm": 0.05101478099822998, "learning_rate": 0.0042353073477429835, "loss": 5.825, "step": 277 }, { "epoch": 0.25764596848934196, "grad_norm": 0.03864740952849388, "learning_rate": 0.004230060204123156, "loss": 5.8688, "step": 278 }, { "epoch": 0.25857275254865614, "grad_norm": 0.06766132265329361, "learning_rate": 0.004224798394297841, "loss": 5.85, "step": 279 }, { "epoch": 0.2594995366079703, "grad_norm": 0.06980055570602417, "learning_rate": 0.004219521962872876, "loss": 5.875, "step": 280 }, { "epoch": 0.26042632066728455, "grad_norm": 0.04153401404619217, "learning_rate": 0.004214230954578051, "loss": 5.8313, "step": 281 }, { "epoch": 0.26135310472659873, "grad_norm": 0.045340005308389664, "learning_rate": 0.004208925414266726, "loss": 5.8125, "step": 282 }, { "epoch": 0.2622798887859129, "grad_norm": 0.04986559599637985, "learning_rate": 0.004203605386915454, "loss": 5.825, "step": 283 }, { "epoch": 0.2632066728452271, "grad_norm": 0.04970383271574974, "learning_rate": 0.004198270917623599, "loss": 5.7688, "step": 284 }, { "epoch": 0.26413345690454126, "grad_norm": 0.05129897966980934, "learning_rate": 0.004192922051612953, "loss": 5.8, "step": 285 }, { "epoch": 0.26506024096385544, "grad_norm": 0.03994636610150337, "learning_rate": 0.004187558834227354, "loss": 5.8, "step": 286 }, { "epoch": 0.2659870250231696, "grad_norm": 0.05204310640692711, "learning_rate": 0.004182181310932297, "loss": 5.7938, "step": 287 }, { "epoch": 0.2669138090824838, "grad_norm": 0.03257805109024048, "learning_rate": 0.004176789527314558, "loss": 5.7562, "step": 288 }, { "epoch": 0.267840593141798, "grad_norm": 0.035661760717630386, "learning_rate": 0.004171383529081797, "loss": 5.7812, "step": 289 }, { "epoch": 0.26876737720111216, "grad_norm": 0.04478088766336441, "learning_rate": 0.004165963362062177, "loss": 5.7562, "step": 290 }, { "epoch": 0.26969416126042633, "grad_norm": 0.03838647902011871, "learning_rate": 0.004160529072203974, "loss": 5.7688, "step": 291 }, { "epoch": 0.2706209453197405, "grad_norm": 0.040849462151527405, "learning_rate": 0.004155080705575188, "loss": 5.7438, "step": 292 }, { "epoch": 0.2715477293790547, "grad_norm": 0.051210496574640274, "learning_rate": 0.004149618308363149, "loss": 5.7375, "step": 293 }, { "epoch": 0.27247451343836887, "grad_norm": 0.07401825487613678, "learning_rate": 0.00414414192687413, "loss": 5.7812, "step": 294 }, { "epoch": 0.27340129749768305, "grad_norm": 0.10748963057994843, "learning_rate": 0.004138651607532954, "loss": 5.75, "step": 295 }, { "epoch": 0.2743280815569972, "grad_norm": 0.07754500955343246, "learning_rate": 0.004133147396882597, "loss": 5.7562, "step": 296 }, { "epoch": 0.2752548656163114, "grad_norm": 0.04524754732847214, "learning_rate": 0.004127629341583795, "loss": 5.7375, "step": 297 }, { "epoch": 0.2761816496756256, "grad_norm": 0.06774584203958511, "learning_rate": 0.004122097488414652, "loss": 5.7375, "step": 298 }, { "epoch": 0.27710843373493976, "grad_norm": 0.050472185015678406, "learning_rate": 0.004116551884270237, "loss": 5.6937, "step": 299 }, { "epoch": 0.27803521779425394, "grad_norm": 0.040967270731925964, "learning_rate": 0.0041109925761621926, "loss": 5.7313, "step": 300 }, { "epoch": 0.2789620018535681, "grad_norm": 0.03739303722977638, "learning_rate": 0.004105419611218332, "loss": 5.7188, "step": 301 }, { "epoch": 0.2798887859128823, "grad_norm": 0.04636852815747261, "learning_rate": 0.004099833036682241, "loss": 5.725, "step": 302 }, { "epoch": 0.2808155699721965, "grad_norm": 0.08012169599533081, "learning_rate": 0.00409423289991288, "loss": 5.7313, "step": 303 }, { "epoch": 0.28174235403151066, "grad_norm": 0.05987093225121498, "learning_rate": 0.004088619248384178, "loss": 5.7125, "step": 304 }, { "epoch": 0.28266913809082483, "grad_norm": 0.07735589891672134, "learning_rate": 0.0040829921296846325, "loss": 5.7, "step": 305 }, { "epoch": 0.283595922150139, "grad_norm": 0.09283655136823654, "learning_rate": 0.004077351591516908, "loss": 5.675, "step": 306 }, { "epoch": 0.2845227062094532, "grad_norm": 0.09337766468524933, "learning_rate": 0.004071697681697427, "loss": 5.7375, "step": 307 }, { "epoch": 0.28544949026876737, "grad_norm": 0.06437985599040985, "learning_rate": 0.00406603044815597, "loss": 5.6875, "step": 308 }, { "epoch": 0.28637627432808155, "grad_norm": 0.04110102728009224, "learning_rate": 0.004060349938935264, "loss": 5.6937, "step": 309 }, { "epoch": 0.2873030583873957, "grad_norm": 0.06071547046303749, "learning_rate": 0.004054656202190578, "loss": 5.7375, "step": 310 }, { "epoch": 0.2882298424467099, "grad_norm": 0.05311071500182152, "learning_rate": 0.004048949286189315, "loss": 5.65, "step": 311 }, { "epoch": 0.2891566265060241, "grad_norm": 0.031259018927812576, "learning_rate": 0.004043229239310603, "loss": 5.6688, "step": 312 }, { "epoch": 0.29008341056533826, "grad_norm": 0.03335728868842125, "learning_rate": 0.0040374961100448845, "loss": 5.675, "step": 313 }, { "epoch": 0.29101019462465244, "grad_norm": 0.035077281296253204, "learning_rate": 0.004031749946993501, "loss": 5.675, "step": 314 }, { "epoch": 0.2919369786839666, "grad_norm": 0.030766339972615242, "learning_rate": 0.004025990798868291, "loss": 5.6688, "step": 315 }, { "epoch": 0.2928637627432808, "grad_norm": 0.03741341829299927, "learning_rate": 0.004020218714491166, "loss": 5.6625, "step": 316 }, { "epoch": 0.293790546802595, "grad_norm": 0.044073686003685, "learning_rate": 0.0040144337427937046, "loss": 5.6375, "step": 317 }, { "epoch": 0.29471733086190915, "grad_norm": 0.05024448409676552, "learning_rate": 0.004008635932816734, "loss": 5.6813, "step": 318 }, { "epoch": 0.29564411492122333, "grad_norm": 0.045678358525037766, "learning_rate": 0.004002825333709915, "loss": 5.5938, "step": 319 }, { "epoch": 0.2965708989805375, "grad_norm": 0.05762135609984398, "learning_rate": 0.003997001994731328, "loss": 5.6438, "step": 320 }, { "epoch": 0.2974976830398517, "grad_norm": 0.07177098840475082, "learning_rate": 0.003991165965247046, "loss": 5.6375, "step": 321 }, { "epoch": 0.29842446709916587, "grad_norm": 0.07682537287473679, "learning_rate": 0.003985317294730731, "loss": 5.675, "step": 322 }, { "epoch": 0.29935125115848005, "grad_norm": 0.08128990978002548, "learning_rate": 0.003979456032763201, "loss": 5.675, "step": 323 }, { "epoch": 0.3002780352177943, "grad_norm": 0.08135168999433517, "learning_rate": 0.003973582229032019, "loss": 5.7125, "step": 324 }, { "epoch": 0.30120481927710846, "grad_norm": 0.10176597535610199, "learning_rate": 0.003967695933331064, "loss": 5.6875, "step": 325 }, { "epoch": 0.30213160333642264, "grad_norm": 0.10529598593711853, "learning_rate": 0.003961797195560118, "loss": 5.675, "step": 326 }, { "epoch": 0.3030583873957368, "grad_norm": 0.06495360285043716, "learning_rate": 0.003955886065724433, "loss": 5.6312, "step": 327 }, { "epoch": 0.303985171455051, "grad_norm": 0.06810038536787033, "learning_rate": 0.003949962593934316, "loss": 5.6312, "step": 328 }, { "epoch": 0.3049119555143652, "grad_norm": 0.058491405099630356, "learning_rate": 0.003944026830404698, "loss": 5.5813, "step": 329 }, { "epoch": 0.30583873957367935, "grad_norm": 0.05078050121665001, "learning_rate": 0.003938078825454709, "loss": 5.575, "step": 330 }, { "epoch": 0.30676552363299353, "grad_norm": 0.06602590531110764, "learning_rate": 0.003932118629507257, "loss": 5.5875, "step": 331 }, { "epoch": 0.3076923076923077, "grad_norm": 0.0416589193046093, "learning_rate": 0.0039261462930885935, "loss": 5.6, "step": 332 }, { "epoch": 0.3086190917516219, "grad_norm": 0.04823141545057297, "learning_rate": 0.003920161866827889, "loss": 5.5813, "step": 333 }, { "epoch": 0.30954587581093607, "grad_norm": 0.03508712351322174, "learning_rate": 0.003914165401456804, "loss": 5.5875, "step": 334 }, { "epoch": 0.31047265987025024, "grad_norm": 0.03729189559817314, "learning_rate": 0.003908156947809056, "loss": 5.575, "step": 335 }, { "epoch": 0.3113994439295644, "grad_norm": 0.047349270433187485, "learning_rate": 0.0039021365568199917, "loss": 5.5625, "step": 336 }, { "epoch": 0.3123262279888786, "grad_norm": 0.04627249017357826, "learning_rate": 0.0038961042795261536, "loss": 5.5375, "step": 337 }, { "epoch": 0.3132530120481928, "grad_norm": 0.03604106232523918, "learning_rate": 0.0038900601670648484, "loss": 5.575, "step": 338 }, { "epoch": 0.31417979610750696, "grad_norm": 0.040808554738759995, "learning_rate": 0.0038840042706737112, "loss": 5.5563, "step": 339 }, { "epoch": 0.31510658016682114, "grad_norm": 0.027617141604423523, "learning_rate": 0.003877936641690275, "loss": 5.5813, "step": 340 }, { "epoch": 0.3160333642261353, "grad_norm": 0.03513359650969505, "learning_rate": 0.0038718573315515317, "loss": 5.5438, "step": 341 }, { "epoch": 0.3169601482854495, "grad_norm": 0.03978215530514717, "learning_rate": 0.0038657663917934983, "loss": 5.575, "step": 342 }, { "epoch": 0.3178869323447637, "grad_norm": 0.025322776287794113, "learning_rate": 0.0038596638740507785, "loss": 5.525, "step": 343 }, { "epoch": 0.31881371640407785, "grad_norm": 0.04898100346326828, "learning_rate": 0.0038535498300561266, "loss": 5.525, "step": 344 }, { "epoch": 0.31974050046339203, "grad_norm": 0.0469982884824276, "learning_rate": 0.003847424311640009, "loss": 5.5438, "step": 345 }, { "epoch": 0.3206672845227062, "grad_norm": 0.03919081762433052, "learning_rate": 0.0038412873707301615, "loss": 5.5312, "step": 346 }, { "epoch": 0.3215940685820204, "grad_norm": 0.04740371182560921, "learning_rate": 0.0038351390593511546, "loss": 5.5, "step": 347 }, { "epoch": 0.32252085264133457, "grad_norm": 0.05560089647769928, "learning_rate": 0.003828979429623947, "loss": 5.5125, "step": 348 }, { "epoch": 0.32344763670064874, "grad_norm": 0.060783710330724716, "learning_rate": 0.0038228085337654472, "loss": 5.5312, "step": 349 }, { "epoch": 0.3243744207599629, "grad_norm": 0.0725303441286087, "learning_rate": 0.00381662642408807, "loss": 5.5, "step": 350 }, { "epoch": 0.3253012048192771, "grad_norm": 0.07496823370456696, "learning_rate": 0.003810433152999293, "loss": 5.5, "step": 351 }, { "epoch": 0.3262279888785913, "grad_norm": 0.06248985975980759, "learning_rate": 0.0038042287730012114, "loss": 5.525, "step": 352 }, { "epoch": 0.32715477293790546, "grad_norm": 0.06995397806167603, "learning_rate": 0.003798013336690095, "loss": 5.5188, "step": 353 }, { "epoch": 0.32808155699721964, "grad_norm": 0.04727565497159958, "learning_rate": 0.0037917868967559387, "loss": 5.525, "step": 354 }, { "epoch": 0.3290083410565338, "grad_norm": 0.05960770696401596, "learning_rate": 0.0037855495059820215, "loss": 5.5, "step": 355 }, { "epoch": 0.329935125115848, "grad_norm": 0.049259670078754425, "learning_rate": 0.0037793012172444534, "loss": 5.4813, "step": 356 }, { "epoch": 0.33086190917516217, "grad_norm": 0.06020974740386009, "learning_rate": 0.003773042083511731, "loss": 5.4625, "step": 357 }, { "epoch": 0.33178869323447635, "grad_norm": 0.0410022996366024, "learning_rate": 0.003766772157844284, "loss": 5.4813, "step": 358 }, { "epoch": 0.33271547729379053, "grad_norm": 0.04682173952460289, "learning_rate": 0.003760491493394032, "loss": 5.5, "step": 359 }, { "epoch": 0.3336422613531047, "grad_norm": 0.055474553257226944, "learning_rate": 0.003754200143403929, "loss": 5.4938, "step": 360 }, { "epoch": 0.3345690454124189, "grad_norm": 0.04533625394105911, "learning_rate": 0.0037478981612075126, "loss": 5.4625, "step": 361 }, { "epoch": 0.33549582947173306, "grad_norm": 0.0564807690680027, "learning_rate": 0.0037415856002284524, "loss": 5.4188, "step": 362 }, { "epoch": 0.33642261353104724, "grad_norm": 0.056940093636512756, "learning_rate": 0.003735262513980099, "loss": 5.4313, "step": 363 }, { "epoch": 0.3373493975903614, "grad_norm": 0.03561275824904442, "learning_rate": 0.003728928956065027, "loss": 5.4313, "step": 364 }, { "epoch": 0.3382761816496756, "grad_norm": 0.04059695452451706, "learning_rate": 0.003722584980174583, "loss": 5.425, "step": 365 }, { "epoch": 0.3392029657089898, "grad_norm": 0.05738742649555206, "learning_rate": 0.0037162306400884307, "loss": 5.45, "step": 366 }, { "epoch": 0.340129749768304, "grad_norm": 0.057356227189302444, "learning_rate": 0.0037098659896740906, "loss": 5.45, "step": 367 }, { "epoch": 0.3410565338276182, "grad_norm": 0.049577098339796066, "learning_rate": 0.0037034910828864904, "loss": 5.4625, "step": 368 }, { "epoch": 0.34198331788693237, "grad_norm": 0.03639480471611023, "learning_rate": 0.003697105973767503, "loss": 5.3875, "step": 369 }, { "epoch": 0.34291010194624655, "grad_norm": 0.0382065586745739, "learning_rate": 0.003690710716445488, "loss": 5.4437, "step": 370 }, { "epoch": 0.3438368860055607, "grad_norm": 0.06564627587795258, "learning_rate": 0.0036843053651348357, "loss": 5.4062, "step": 371 }, { "epoch": 0.3447636700648749, "grad_norm": 0.08808669447898865, "learning_rate": 0.003677889974135504, "loss": 5.4062, "step": 372 }, { "epoch": 0.3456904541241891, "grad_norm": 0.05307735130190849, "learning_rate": 0.0036714645978325636, "loss": 5.4, "step": 373 }, { "epoch": 0.34661723818350326, "grad_norm": 0.05861683562397957, "learning_rate": 0.0036650292906957294, "loss": 5.4563, "step": 374 }, { "epoch": 0.34754402224281744, "grad_norm": 0.06583855301141739, "learning_rate": 0.003658584107278905, "loss": 5.3938, "step": 375 }, { "epoch": 0.3484708063021316, "grad_norm": 0.038819484412670135, "learning_rate": 0.0036521291022197184, "loss": 5.3625, "step": 376 }, { "epoch": 0.3493975903614458, "grad_norm": 0.0668378546833992, "learning_rate": 0.0036456643302390564, "loss": 5.3688, "step": 377 }, { "epoch": 0.35032437442076, "grad_norm": 0.06500761210918427, "learning_rate": 0.0036391898461406043, "loss": 5.3688, "step": 378 }, { "epoch": 0.35125115848007415, "grad_norm": 0.06566040962934494, "learning_rate": 0.003632705704810379, "loss": 5.3875, "step": 379 }, { "epoch": 0.35217794253938833, "grad_norm": 0.04046965390443802, "learning_rate": 0.0036262119612162657, "loss": 5.3563, "step": 380 }, { "epoch": 0.3531047265987025, "grad_norm": 0.04664246365427971, "learning_rate": 0.0036197086704075495, "loss": 5.35, "step": 381 }, { "epoch": 0.3540315106580167, "grad_norm": 0.06433206051588058, "learning_rate": 0.0036131958875144496, "loss": 5.3938, "step": 382 }, { "epoch": 0.35495829471733087, "grad_norm": 0.06552179157733917, "learning_rate": 0.003606673667747653, "loss": 5.375, "step": 383 }, { "epoch": 0.35588507877664505, "grad_norm": 0.0640706792473793, "learning_rate": 0.0036001420663978466, "loss": 5.3938, "step": 384 }, { "epoch": 0.3568118628359592, "grad_norm": 0.0631820559501648, "learning_rate": 0.003593601138835246, "loss": 5.3375, "step": 385 }, { "epoch": 0.3577386468952734, "grad_norm": 0.0694313570857048, "learning_rate": 0.0035870509405091272, "loss": 5.3812, "step": 386 }, { "epoch": 0.3586654309545876, "grad_norm": 0.05696525424718857, "learning_rate": 0.0035804915269473598, "loss": 5.3563, "step": 387 }, { "epoch": 0.35959221501390176, "grad_norm": 0.041316401213407516, "learning_rate": 0.0035739229537559316, "loss": 5.3313, "step": 388 }, { "epoch": 0.36051899907321594, "grad_norm": 0.05180737376213074, "learning_rate": 0.003567345276618479, "loss": 5.3625, "step": 389 }, { "epoch": 0.3614457831325301, "grad_norm": 0.06132522597908974, "learning_rate": 0.003560758551295816, "loss": 5.3375, "step": 390 }, { "epoch": 0.3623725671918443, "grad_norm": 0.0825105607509613, "learning_rate": 0.00355416283362546, "loss": 5.3625, "step": 391 }, { "epoch": 0.3632993512511585, "grad_norm": 0.09952400624752045, "learning_rate": 0.0035475581795211594, "loss": 5.375, "step": 392 }, { "epoch": 0.36422613531047265, "grad_norm": 0.11159048974514008, "learning_rate": 0.0035409446449724187, "loss": 5.3875, "step": 393 }, { "epoch": 0.36515291936978683, "grad_norm": 0.06153342127799988, "learning_rate": 0.0035343222860440247, "loss": 5.35, "step": 394 }, { "epoch": 0.366079703429101, "grad_norm": 0.055650901049375534, "learning_rate": 0.0035276911588755723, "loss": 5.2938, "step": 395 }, { "epoch": 0.3670064874884152, "grad_norm": 0.05008624121546745, "learning_rate": 0.003521051319680984, "loss": 5.3375, "step": 396 }, { "epoch": 0.36793327154772937, "grad_norm": 0.04708503931760788, "learning_rate": 0.0035144028247480405, "loss": 5.3438, "step": 397 }, { "epoch": 0.36886005560704355, "grad_norm": 0.041482266038656235, "learning_rate": 0.0035077457304378964, "loss": 5.2875, "step": 398 }, { "epoch": 0.3697868396663577, "grad_norm": 0.056157998740673065, "learning_rate": 0.003501080093184607, "loss": 5.3, "step": 399 }, { "epoch": 0.3707136237256719, "grad_norm": 0.047049764543771744, "learning_rate": 0.0034944059694946494, "loss": 5.3, "step": 400 }, { "epoch": 0.3716404077849861, "grad_norm": 0.0425553135573864, "learning_rate": 0.0034877234159464412, "loss": 5.325, "step": 401 }, { "epoch": 0.37256719184430026, "grad_norm": 0.036974068731069565, "learning_rate": 0.003481032489189862, "loss": 5.275, "step": 402 }, { "epoch": 0.37349397590361444, "grad_norm": 0.038740385323762894, "learning_rate": 0.003474333245945775, "loss": 5.2438, "step": 403 }, { "epoch": 0.3744207599629286, "grad_norm": 0.037295546382665634, "learning_rate": 0.0034676257430055436, "loss": 5.2688, "step": 404 }, { "epoch": 0.3753475440222428, "grad_norm": 0.04598161205649376, "learning_rate": 0.00346091003723055, "loss": 5.2812, "step": 405 }, { "epoch": 0.376274328081557, "grad_norm": 0.052688293159008026, "learning_rate": 0.003454186185551717, "loss": 5.2625, "step": 406 }, { "epoch": 0.37720111214087115, "grad_norm": 0.0431685745716095, "learning_rate": 0.0034474542449690203, "loss": 5.2313, "step": 407 }, { "epoch": 0.37812789620018533, "grad_norm": 0.047002580016851425, "learning_rate": 0.0034407142725510075, "loss": 5.25, "step": 408 }, { "epoch": 0.3790546802594995, "grad_norm": 0.034174490720033646, "learning_rate": 0.003433966325434315, "loss": 5.2438, "step": 409 }, { "epoch": 0.3799814643188137, "grad_norm": 0.037927597761154175, "learning_rate": 0.0034272104608231825, "loss": 5.2562, "step": 410 }, { "epoch": 0.3809082483781279, "grad_norm": 0.040478792041540146, "learning_rate": 0.003420446735988969, "loss": 5.25, "step": 411 }, { "epoch": 0.3818350324374421, "grad_norm": 0.043072253465652466, "learning_rate": 0.0034136752082696664, "loss": 5.1688, "step": 412 }, { "epoch": 0.3827618164967563, "grad_norm": 0.04011726379394531, "learning_rate": 0.003406895935069414, "loss": 5.2375, "step": 413 }, { "epoch": 0.38368860055607046, "grad_norm": 0.056565847247838974, "learning_rate": 0.0034001089738580127, "loss": 5.2562, "step": 414 }, { "epoch": 0.38461538461538464, "grad_norm": 0.045512937009334564, "learning_rate": 0.0033933143821704343, "loss": 5.25, "step": 415 }, { "epoch": 0.3855421686746988, "grad_norm": 0.05256471410393715, "learning_rate": 0.003386512217606339, "loss": 5.2375, "step": 416 }, { "epoch": 0.386468952734013, "grad_norm": 0.055981192737817764, "learning_rate": 0.0033797025378295826, "loss": 5.2438, "step": 417 }, { "epoch": 0.3873957367933272, "grad_norm": 0.06136908382177353, "learning_rate": 0.003372885400567731, "loss": 5.2375, "step": 418 }, { "epoch": 0.38832252085264135, "grad_norm": 0.07198972254991531, "learning_rate": 0.003366060863611567, "loss": 5.225, "step": 419 }, { "epoch": 0.38924930491195553, "grad_norm": 0.05037841945886612, "learning_rate": 0.003359228984814605, "loss": 5.1937, "step": 420 }, { "epoch": 0.3901760889712697, "grad_norm": 0.0768144503235817, "learning_rate": 0.0033523898220925974, "loss": 5.1875, "step": 421 }, { "epoch": 0.3911028730305839, "grad_norm": 0.08858561515808105, "learning_rate": 0.003345543433423044, "loss": 5.2625, "step": 422 }, { "epoch": 0.39202965708989806, "grad_norm": 0.10811244696378708, "learning_rate": 0.0033386898768447016, "loss": 5.2375, "step": 423 }, { "epoch": 0.39295644114921224, "grad_norm": 0.11364039778709412, "learning_rate": 0.003331829210457091, "loss": 5.2812, "step": 424 }, { "epoch": 0.3938832252085264, "grad_norm": 0.08991072326898575, "learning_rate": 0.0033249614924200054, "loss": 5.2188, "step": 425 }, { "epoch": 0.3948100092678406, "grad_norm": 0.0634012222290039, "learning_rate": 0.003318086780953016, "loss": 5.1813, "step": 426 }, { "epoch": 0.3957367933271548, "grad_norm": 0.07201571762561798, "learning_rate": 0.003311205134334979, "loss": 5.2, "step": 427 }, { "epoch": 0.39666357738646896, "grad_norm": 0.0652351826429367, "learning_rate": 0.0033043166109035446, "loss": 5.2, "step": 428 }, { "epoch": 0.39759036144578314, "grad_norm": 0.04549067094922066, "learning_rate": 0.0032974212690546558, "loss": 5.1875, "step": 429 }, { "epoch": 0.3985171455050973, "grad_norm": 0.06608382612466812, "learning_rate": 0.0032905191672420596, "loss": 5.2313, "step": 430 }, { "epoch": 0.3994439295644115, "grad_norm": 0.04941621795296669, "learning_rate": 0.003283610363976809, "loss": 5.1375, "step": 431 }, { "epoch": 0.40037071362372567, "grad_norm": 0.05331863835453987, "learning_rate": 0.0032766949178267657, "loss": 5.1188, "step": 432 }, { "epoch": 0.40129749768303985, "grad_norm": 0.04874474182724953, "learning_rate": 0.003269772887416106, "loss": 5.1562, "step": 433 }, { "epoch": 0.40222428174235403, "grad_norm": 0.05278300493955612, "learning_rate": 0.0032628443314248233, "loss": 5.1438, "step": 434 }, { "epoch": 0.4031510658016682, "grad_norm": 0.04638415202498436, "learning_rate": 0.003255909308588229, "loss": 5.1438, "step": 435 }, { "epoch": 0.4040778498609824, "grad_norm": 0.06462404876947403, "learning_rate": 0.003248967877696457, "loss": 5.1875, "step": 436 }, { "epoch": 0.40500463392029656, "grad_norm": 0.04122454300522804, "learning_rate": 0.0032420200975939633, "loss": 5.1375, "step": 437 }, { "epoch": 0.40593141797961074, "grad_norm": 0.05846314877271652, "learning_rate": 0.003235066027179028, "loss": 5.15, "step": 438 }, { "epoch": 0.4068582020389249, "grad_norm": 0.06503690779209137, "learning_rate": 0.0032281057254032563, "loss": 5.1375, "step": 439 }, { "epoch": 0.4077849860982391, "grad_norm": 0.05073606222867966, "learning_rate": 0.0032211392512710773, "loss": 5.0875, "step": 440 }, { "epoch": 0.4087117701575533, "grad_norm": 0.06046286225318909, "learning_rate": 0.003214166663839247, "loss": 5.1188, "step": 441 }, { "epoch": 0.40963855421686746, "grad_norm": 0.03978972136974335, "learning_rate": 0.003207188022216343, "loss": 5.125, "step": 442 }, { "epoch": 0.41056533827618164, "grad_norm": 0.04392355680465698, "learning_rate": 0.0032002033855622683, "loss": 5.125, "step": 443 }, { "epoch": 0.4114921223354958, "grad_norm": 0.039449259638786316, "learning_rate": 0.003193212813087745, "loss": 5.125, "step": 444 }, { "epoch": 0.41241890639481, "grad_norm": 0.04521370679140091, "learning_rate": 0.003186216364053818, "loss": 5.0813, "step": 445 }, { "epoch": 0.41334569045412417, "grad_norm": 0.06002253293991089, "learning_rate": 0.003179214097771346, "loss": 5.0875, "step": 446 }, { "epoch": 0.41427247451343835, "grad_norm": 0.07361883670091629, "learning_rate": 0.0031722060736005054, "loss": 5.1312, "step": 447 }, { "epoch": 0.4151992585727525, "grad_norm": 0.06389747560024261, "learning_rate": 0.0031651923509502817, "loss": 5.0875, "step": 448 }, { "epoch": 0.4161260426320667, "grad_norm": 0.07580303400754929, "learning_rate": 0.003158172989277968, "loss": 5.1438, "step": 449 }, { "epoch": 0.4170528266913809, "grad_norm": 0.06630785763263702, "learning_rate": 0.0031511480480886623, "loss": 5.125, "step": 450 }, { "epoch": 0.41797961075069506, "grad_norm": 0.05100114271044731, "learning_rate": 0.0031441175869347604, "loss": 5.0563, "step": 451 }, { "epoch": 0.41890639481000924, "grad_norm": 0.044168341904878616, "learning_rate": 0.003137081665415453, "loss": 5.1063, "step": 452 }, { "epoch": 0.4198331788693234, "grad_norm": 0.036300163716077805, "learning_rate": 0.0031300403431762202, "loss": 5.0938, "step": 453 }, { "epoch": 0.42075996292863765, "grad_norm": 0.03885301947593689, "learning_rate": 0.003122993679908325, "loss": 5.075, "step": 454 }, { "epoch": 0.42168674698795183, "grad_norm": 0.047411106526851654, "learning_rate": 0.0031159417353483075, "loss": 5.0813, "step": 455 }, { "epoch": 0.422613531047266, "grad_norm": 0.04042837396264076, "learning_rate": 0.00310888456927748, "loss": 5.025, "step": 456 }, { "epoch": 0.4235403151065802, "grad_norm": 0.0529557429254055, "learning_rate": 0.0031018222415214176, "loss": 5.0938, "step": 457 }, { "epoch": 0.42446709916589437, "grad_norm": 0.03582127019762993, "learning_rate": 0.003094754811949453, "loss": 5.05, "step": 458 }, { "epoch": 0.42539388322520855, "grad_norm": 0.04631989449262619, "learning_rate": 0.0030876823404741693, "loss": 5.0625, "step": 459 }, { "epoch": 0.4263206672845227, "grad_norm": 0.05943077430129051, "learning_rate": 0.0030806048870508896, "loss": 5.0375, "step": 460 }, { "epoch": 0.4272474513438369, "grad_norm": 0.04641159623861313, "learning_rate": 0.003073522511677171, "loss": 5.0687, "step": 461 }, { "epoch": 0.4281742354031511, "grad_norm": 0.04967037960886955, "learning_rate": 0.0030664352743922964, "loss": 5.05, "step": 462 }, { "epoch": 0.42910101946246526, "grad_norm": 0.05452379956841469, "learning_rate": 0.0030593432352767637, "loss": 5.0563, "step": 463 }, { "epoch": 0.43002780352177944, "grad_norm": 0.05275031551718712, "learning_rate": 0.003052246454451776, "loss": 5.05, "step": 464 }, { "epoch": 0.4309545875810936, "grad_norm": 0.0582866407930851, "learning_rate": 0.0030451449920787356, "loss": 5.0375, "step": 465 }, { "epoch": 0.4318813716404078, "grad_norm": 0.07089794427156448, "learning_rate": 0.00303803890835873, "loss": 5.0813, "step": 466 }, { "epoch": 0.432808155699722, "grad_norm": 0.05818159505724907, "learning_rate": 0.0030309282635320235, "loss": 5.025, "step": 467 }, { "epoch": 0.43373493975903615, "grad_norm": 0.05577028915286064, "learning_rate": 0.0030238131178775465, "loss": 5.0312, "step": 468 }, { "epoch": 0.43466172381835033, "grad_norm": 0.0684211254119873, "learning_rate": 0.0030166935317123824, "loss": 5.0, "step": 469 }, { "epoch": 0.4355885078776645, "grad_norm": 0.06801000237464905, "learning_rate": 0.0030095695653912617, "loss": 5.0687, "step": 470 }, { "epoch": 0.4365152919369787, "grad_norm": 0.07714419811964035, "learning_rate": 0.0030024412793060442, "loss": 5.05, "step": 471 }, { "epoch": 0.43744207599629287, "grad_norm": 0.07117122411727905, "learning_rate": 0.0029953087338852086, "loss": 5.0375, "step": 472 }, { "epoch": 0.43836886005560705, "grad_norm": 0.05810219794511795, "learning_rate": 0.002988171989593344, "loss": 5.0125, "step": 473 }, { "epoch": 0.4392956441149212, "grad_norm": 0.0630822405219078, "learning_rate": 0.002981031106930632, "loss": 4.9938, "step": 474 }, { "epoch": 0.4402224281742354, "grad_norm": 0.09144022315740585, "learning_rate": 0.002973886146432338, "loss": 5.05, "step": 475 }, { "epoch": 0.4411492122335496, "grad_norm": 0.07084767520427704, "learning_rate": 0.002966737168668295, "loss": 5.0062, "step": 476 }, { "epoch": 0.44207599629286376, "grad_norm": 0.048369865864515305, "learning_rate": 0.0029595842342423936, "loss": 4.9313, "step": 477 }, { "epoch": 0.44300278035217794, "grad_norm": 0.05783843249082565, "learning_rate": 0.002952427403792063, "loss": 4.9375, "step": 478 }, { "epoch": 0.4439295644114921, "grad_norm": 0.05991849675774574, "learning_rate": 0.002945266737987763, "loss": 4.9688, "step": 479 }, { "epoch": 0.4448563484708063, "grad_norm": 0.05597536638379097, "learning_rate": 0.0029381022975324645, "loss": 5.0, "step": 480 }, { "epoch": 0.4457831325301205, "grad_norm": 0.0695003792643547, "learning_rate": 0.0029309341431611397, "loss": 5.0125, "step": 481 }, { "epoch": 0.44670991658943465, "grad_norm": 0.08234460651874542, "learning_rate": 0.002923762335640242, "loss": 5.0125, "step": 482 }, { "epoch": 0.44763670064874883, "grad_norm": 0.07713950425386429, "learning_rate": 0.002916586935767195, "loss": 5.0125, "step": 483 }, { "epoch": 0.448563484708063, "grad_norm": 0.07240517437458038, "learning_rate": 0.002909408004369877, "loss": 5.0125, "step": 484 }, { "epoch": 0.4494902687673772, "grad_norm": 0.0547131672501564, "learning_rate": 0.0029022256023061004, "loss": 4.9625, "step": 485 }, { "epoch": 0.45041705282669137, "grad_norm": 0.045404303818941116, "learning_rate": 0.0028950397904631033, "loss": 5.0, "step": 486 }, { "epoch": 0.45134383688600554, "grad_norm": 0.05781068280339241, "learning_rate": 0.002887850629757026, "loss": 4.9563, "step": 487 }, { "epoch": 0.4522706209453197, "grad_norm": 0.048498354852199554, "learning_rate": 0.0028806581811324007, "loss": 4.925, "step": 488 }, { "epoch": 0.4531974050046339, "grad_norm": 0.039063528180122375, "learning_rate": 0.002873462505561632, "loss": 4.9688, "step": 489 }, { "epoch": 0.4541241890639481, "grad_norm": 0.038773953914642334, "learning_rate": 0.002866263664044479, "loss": 4.9437, "step": 490 }, { "epoch": 0.45505097312326226, "grad_norm": 0.058951422572135925, "learning_rate": 0.002859061717607539, "loss": 4.95, "step": 491 }, { "epoch": 0.45597775718257644, "grad_norm": 0.058964647352695465, "learning_rate": 0.0028518567273037327, "loss": 4.9313, "step": 492 }, { "epoch": 0.4569045412418906, "grad_norm": 0.05438453331589699, "learning_rate": 0.002844648754211783, "loss": 4.95, "step": 493 }, { "epoch": 0.4578313253012048, "grad_norm": 0.04710723087191582, "learning_rate": 0.002837437859435698, "loss": 4.9062, "step": 494 }, { "epoch": 0.458758109360519, "grad_norm": 0.0365031473338604, "learning_rate": 0.0028302241041042566, "loss": 4.9688, "step": 495 }, { "epoch": 0.45968489341983315, "grad_norm": 0.03951582312583923, "learning_rate": 0.0028230075493704838, "loss": 4.9563, "step": 496 }, { "epoch": 0.4606116774791474, "grad_norm": 0.04623804986476898, "learning_rate": 0.0028157882564111385, "loss": 4.9375, "step": 497 }, { "epoch": 0.46153846153846156, "grad_norm": 0.040012940764427185, "learning_rate": 0.002808566286426191, "loss": 4.925, "step": 498 }, { "epoch": 0.46246524559777574, "grad_norm": 0.04338626191020012, "learning_rate": 0.0028013417006383075, "loss": 4.95, "step": 499 }, { "epoch": 0.4633920296570899, "grad_norm": 0.0410669781267643, "learning_rate": 0.0027941145602923267, "loss": 4.9125, "step": 500 }, { "epoch": 0.4643188137164041, "grad_norm": 0.03322385624051094, "learning_rate": 0.0027868849266547437, "loss": 4.8875, "step": 501 }, { "epoch": 0.4652455977757183, "grad_norm": 0.036676980555057526, "learning_rate": 0.00277965286101319, "loss": 4.95, "step": 502 }, { "epoch": 0.46617238183503246, "grad_norm": 0.044222161173820496, "learning_rate": 0.0027724184246759147, "loss": 4.9125, "step": 503 }, { "epoch": 0.46709916589434664, "grad_norm": 0.06456394493579865, "learning_rate": 0.002765181678971263, "loss": 4.9062, "step": 504 }, { "epoch": 0.4680259499536608, "grad_norm": 0.0746362954378128, "learning_rate": 0.0027579426852471574, "loss": 4.8875, "step": 505 }, { "epoch": 0.468952734012975, "grad_norm": 0.08617927134037018, "learning_rate": 0.0027507015048705776, "loss": 4.8938, "step": 506 }, { "epoch": 0.46987951807228917, "grad_norm": 0.07306444644927979, "learning_rate": 0.00274345819922704, "loss": 4.9, "step": 507 }, { "epoch": 0.47080630213160335, "grad_norm": 0.04307616129517555, "learning_rate": 0.0027362128297200783, "loss": 4.9062, "step": 508 }, { "epoch": 0.4717330861909175, "grad_norm": 0.06619231402873993, "learning_rate": 0.0027289654577707214, "loss": 4.8938, "step": 509 }, { "epoch": 0.4726598702502317, "grad_norm": 0.07649318128824234, "learning_rate": 0.002721716144816973, "loss": 4.8938, "step": 510 }, { "epoch": 0.4735866543095459, "grad_norm": 0.0643559917807579, "learning_rate": 0.002714464952313292, "loss": 4.825, "step": 511 }, { "epoch": 0.47451343836886006, "grad_norm": 0.07730736583471298, "learning_rate": 0.0027072119417300713, "loss": 4.8812, "step": 512 }, { "epoch": 0.47544022242817424, "grad_norm": 0.08054769784212112, "learning_rate": 0.002699957174553115, "loss": 4.9062, "step": 513 }, { "epoch": 0.4763670064874884, "grad_norm": 0.06001604348421097, "learning_rate": 0.002692700712283119, "loss": 4.8938, "step": 514 }, { "epoch": 0.4772937905468026, "grad_norm": 0.04911705106496811, "learning_rate": 0.0026854426164351483, "loss": 4.8625, "step": 515 }, { "epoch": 0.4782205746061168, "grad_norm": 0.04762764275074005, "learning_rate": 0.002678182948538117, "loss": 4.8375, "step": 516 }, { "epoch": 0.47914735866543096, "grad_norm": 0.045550934970378876, "learning_rate": 0.002670921770134266, "loss": 4.8938, "step": 517 }, { "epoch": 0.48007414272474513, "grad_norm": 0.057238396257162094, "learning_rate": 0.00266365914277864, "loss": 4.8875, "step": 518 }, { "epoch": 0.4810009267840593, "grad_norm": 0.053200677037239075, "learning_rate": 0.002656395128038568, "loss": 4.8438, "step": 519 }, { "epoch": 0.4819277108433735, "grad_norm": 0.047585804015398026, "learning_rate": 0.00264912978749314, "loss": 4.8063, "step": 520 }, { "epoch": 0.48285449490268767, "grad_norm": 0.05673938989639282, "learning_rate": 0.0026418631827326857, "loss": 4.8875, "step": 521 }, { "epoch": 0.48378127896200185, "grad_norm": 0.05663244426250458, "learning_rate": 0.0026345953753582497, "loss": 4.9, "step": 522 }, { "epoch": 0.484708063021316, "grad_norm": 0.04882281646132469, "learning_rate": 0.0026273264269810743, "loss": 4.8313, "step": 523 }, { "epoch": 0.4856348470806302, "grad_norm": 0.0483589768409729, "learning_rate": 0.0026200563992220733, "loss": 4.8438, "step": 524 }, { "epoch": 0.4865616311399444, "grad_norm": 0.05800378695130348, "learning_rate": 0.00261278535371131, "loss": 4.8125, "step": 525 }, { "epoch": 0.48748841519925856, "grad_norm": 0.04723868519067764, "learning_rate": 0.002605513352087477, "loss": 4.7812, "step": 526 }, { "epoch": 0.48841519925857274, "grad_norm": 0.051099590957164764, "learning_rate": 0.0025982404559973704, "loss": 4.8125, "step": 527 }, { "epoch": 0.4893419833178869, "grad_norm": 0.05315464735031128, "learning_rate": 0.00259096672709537, "loss": 4.775, "step": 528 }, { "epoch": 0.4902687673772011, "grad_norm": 0.05382310971617699, "learning_rate": 0.002583692227042916, "loss": 4.7812, "step": 529 }, { "epoch": 0.4911955514365153, "grad_norm": 0.05870763957500458, "learning_rate": 0.002576417017507983, "loss": 4.8625, "step": 530 }, { "epoch": 0.49212233549582945, "grad_norm": 0.03859548643231392, "learning_rate": 0.0025691411601645657, "loss": 4.7938, "step": 531 }, { "epoch": 0.49304911955514363, "grad_norm": 0.05789710581302643, "learning_rate": 0.002561864716692145, "loss": 4.8438, "step": 532 }, { "epoch": 0.4939759036144578, "grad_norm": 0.04865971952676773, "learning_rate": 0.0025545877487751735, "loss": 4.7812, "step": 533 }, { "epoch": 0.494902687673772, "grad_norm": 0.05406877398490906, "learning_rate": 0.0025473103181025475, "loss": 4.8313, "step": 534 }, { "epoch": 0.49582947173308617, "grad_norm": 0.051227353513240814, "learning_rate": 0.002540032486367089, "loss": 4.7562, "step": 535 }, { "epoch": 0.49675625579240035, "grad_norm": 0.05123087763786316, "learning_rate": 0.002532754315265018, "loss": 4.8187, "step": 536 }, { "epoch": 0.4976830398517145, "grad_norm": 0.04913110285997391, "learning_rate": 0.0025254758664954306, "loss": 4.8125, "step": 537 }, { "epoch": 0.4986098239110287, "grad_norm": 0.04741792008280754, "learning_rate": 0.0025181972017597806, "loss": 4.7875, "step": 538 }, { "epoch": 0.4995366079703429, "grad_norm": 0.055246248841285706, "learning_rate": 0.0025109183827613474, "loss": 4.8063, "step": 539 }, { "epoch": 0.5004633920296571, "grad_norm": 0.037354640662670135, "learning_rate": 0.002503639471204722, "loss": 4.75, "step": 540 }, { "epoch": 0.5013901760889713, "grad_norm": 0.04416719824075699, "learning_rate": 0.002496360528795279, "loss": 4.7812, "step": 541 }, { "epoch": 0.5023169601482854, "grad_norm": 0.04072472080588341, "learning_rate": 0.0024890816172386527, "loss": 4.75, "step": 542 }, { "epoch": 0.5032437442075997, "grad_norm": 0.048542048782110214, "learning_rate": 0.002481802798240221, "loss": 4.7688, "step": 543 }, { "epoch": 0.5041705282669138, "grad_norm": 0.05309506133198738, "learning_rate": 0.0024745241335045695, "loss": 4.775, "step": 544 }, { "epoch": 0.505097312326228, "grad_norm": 0.037804365158081055, "learning_rate": 0.0024672456847349834, "loss": 4.75, "step": 545 }, { "epoch": 0.5060240963855421, "grad_norm": 0.045449260622262955, "learning_rate": 0.0024599675136329113, "loss": 4.7625, "step": 546 }, { "epoch": 0.5069508804448564, "grad_norm": 0.046078864485025406, "learning_rate": 0.002452689681897453, "loss": 4.7688, "step": 547 }, { "epoch": 0.5078776645041705, "grad_norm": 0.04518760368227959, "learning_rate": 0.002445412251224827, "loss": 4.7375, "step": 548 }, { "epoch": 0.5088044485634847, "grad_norm": 0.03942165523767471, "learning_rate": 0.002438135283307855, "loss": 4.75, "step": 549 }, { "epoch": 0.5097312326227988, "grad_norm": 0.045819394290447235, "learning_rate": 0.0024308588398354344, "loss": 4.7313, "step": 550 }, { "epoch": 0.5106580166821131, "grad_norm": 0.06149514392018318, "learning_rate": 0.002423582982492017, "loss": 4.7313, "step": 551 }, { "epoch": 0.5115848007414272, "grad_norm": 0.06028604507446289, "learning_rate": 0.002416307772957085, "loss": 4.7438, "step": 552 }, { "epoch": 0.5125115848007414, "grad_norm": 0.043709807097911835, "learning_rate": 0.002409033272904631, "loss": 4.7625, "step": 553 }, { "epoch": 0.5134383688600556, "grad_norm": 0.042988523840904236, "learning_rate": 0.00240175954400263, "loss": 4.7562, "step": 554 }, { "epoch": 0.5143651529193698, "grad_norm": 0.053336091339588165, "learning_rate": 0.002394486647912524, "loss": 4.6875, "step": 555 }, { "epoch": 0.5152919369786839, "grad_norm": 0.061223022639751434, "learning_rate": 0.00238721464628869, "loss": 4.725, "step": 556 }, { "epoch": 0.5162187210379982, "grad_norm": 0.0704147219657898, "learning_rate": 0.0023799436007779277, "loss": 4.6813, "step": 557 }, { "epoch": 0.5171455050973123, "grad_norm": 0.06097421795129776, "learning_rate": 0.002372673573018926, "loss": 4.7625, "step": 558 }, { "epoch": 0.5180722891566265, "grad_norm": 0.04165394976735115, "learning_rate": 0.0023654046246417513, "loss": 4.7125, "step": 559 }, { "epoch": 0.5189990732159406, "grad_norm": 0.040571633726358414, "learning_rate": 0.0023581368172673153, "loss": 4.7625, "step": 560 }, { "epoch": 0.5199258572752549, "grad_norm": 0.04544011875987053, "learning_rate": 0.0023508702125068608, "loss": 4.7625, "step": 561 }, { "epoch": 0.5208526413345691, "grad_norm": 0.04342002421617508, "learning_rate": 0.0023436048719614323, "loss": 4.7313, "step": 562 }, { "epoch": 0.5217794253938832, "grad_norm": 0.041976965963840485, "learning_rate": 0.00233634085722136, "loss": 4.7313, "step": 563 }, { "epoch": 0.5227062094531975, "grad_norm": 0.0512029230594635, "learning_rate": 0.0023290782298657346, "loss": 4.6937, "step": 564 }, { "epoch": 0.5236329935125116, "grad_norm": 0.06346142292022705, "learning_rate": 0.002321817051461883, "loss": 4.675, "step": 565 }, { "epoch": 0.5245597775718258, "grad_norm": 0.05272765830159187, "learning_rate": 0.002314557383564852, "loss": 4.75, "step": 566 }, { "epoch": 0.5254865616311399, "grad_norm": 0.038122035562992096, "learning_rate": 0.002307299287716881, "loss": 4.7125, "step": 567 }, { "epoch": 0.5264133456904542, "grad_norm": 0.042520515620708466, "learning_rate": 0.0023000428254468853, "loss": 4.6875, "step": 568 }, { "epoch": 0.5273401297497683, "grad_norm": 0.05327059328556061, "learning_rate": 0.0022927880582699284, "loss": 4.7438, "step": 569 }, { "epoch": 0.5282669138090825, "grad_norm": 0.10062926262617111, "learning_rate": 0.0022855350476867083, "loss": 5.4125, "step": 570 }, { "epoch": 0.5291936978683967, "grad_norm": 0.19139476120471954, "learning_rate": 0.002278283855183027, "loss": 5.9375, "step": 571 }, { "epoch": 0.5301204819277109, "grad_norm": 0.30302053689956665, "learning_rate": 0.002271034542229279, "loss": 6.1438, "step": 572 }, { "epoch": 0.531047265987025, "grad_norm": 0.3599642515182495, "learning_rate": 0.002263787170279922, "loss": 6.125, "step": 573 }, { "epoch": 0.5319740500463392, "grad_norm": 0.2241661548614502, "learning_rate": 0.00225654180077296, "loss": 5.9938, "step": 574 }, { "epoch": 0.5329008341056534, "grad_norm": 0.10801433026790619, "learning_rate": 0.0022492984951294225, "loss": 5.7938, "step": 575 }, { "epoch": 0.5338276181649676, "grad_norm": 0.11764154583215714, "learning_rate": 0.0022420573147528436, "loss": 5.7812, "step": 576 }, { "epoch": 0.5347544022242817, "grad_norm": 0.08790837973356247, "learning_rate": 0.002234818321028737, "loss": 5.7375, "step": 577 }, { "epoch": 0.535681186283596, "grad_norm": 0.06823479384183884, "learning_rate": 0.002227581575324086, "loss": 5.6438, "step": 578 }, { "epoch": 0.5366079703429101, "grad_norm": 0.0775035172700882, "learning_rate": 0.00222034713898681, "loss": 5.6375, "step": 579 }, { "epoch": 0.5375347544022243, "grad_norm": 0.05802862346172333, "learning_rate": 0.0022131150733452573, "loss": 5.5687, "step": 580 }, { "epoch": 0.5384615384615384, "grad_norm": 0.058500614017248154, "learning_rate": 0.0022058854397076734, "loss": 5.5438, "step": 581 }, { "epoch": 0.5393883225208527, "grad_norm": 0.055464208126068115, "learning_rate": 0.0021986582993616926, "loss": 5.5, "step": 582 }, { "epoch": 0.5403151065801668, "grad_norm": 0.041989766061306, "learning_rate": 0.0021914337135738086, "loss": 5.4563, "step": 583 }, { "epoch": 0.541241890639481, "grad_norm": 0.05176004022359848, "learning_rate": 0.0021842117435888625, "loss": 5.45, "step": 584 }, { "epoch": 0.5421686746987951, "grad_norm": 0.058837149292230606, "learning_rate": 0.0021769924506295168, "loss": 5.4563, "step": 585 }, { "epoch": 0.5430954587581094, "grad_norm": 0.04392680153250694, "learning_rate": 0.002169775895895745, "loss": 5.4062, "step": 586 }, { "epoch": 0.5440222428174235, "grad_norm": 0.05528188496828079, "learning_rate": 0.002162562140564302, "loss": 5.375, "step": 587 }, { "epoch": 0.5449490268767377, "grad_norm": 0.04781576246023178, "learning_rate": 0.002155351245788218, "loss": 5.3938, "step": 588 }, { "epoch": 0.5458758109360519, "grad_norm": 0.0435294434428215, "learning_rate": 0.002148143272696268, "loss": 5.3, "step": 589 }, { "epoch": 0.5468025949953661, "grad_norm": 0.04509313404560089, "learning_rate": 0.002140938282392461, "loss": 5.35, "step": 590 }, { "epoch": 0.5477293790546802, "grad_norm": 0.03679104149341583, "learning_rate": 0.002133736335955522, "loss": 5.2688, "step": 591 }, { "epoch": 0.5486561631139945, "grad_norm": 0.05090980976819992, "learning_rate": 0.0021265374944383682, "loss": 5.2812, "step": 592 }, { "epoch": 0.5495829471733086, "grad_norm": 0.03438156098127365, "learning_rate": 0.0021193418188675994, "loss": 5.2688, "step": 593 }, { "epoch": 0.5505097312326228, "grad_norm": 0.03302653878927231, "learning_rate": 0.002112149370242975, "loss": 5.25, "step": 594 }, { "epoch": 0.5514365152919369, "grad_norm": 0.039244670420885086, "learning_rate": 0.0021049602095368973, "loss": 5.2063, "step": 595 }, { "epoch": 0.5523632993512512, "grad_norm": 0.03585642948746681, "learning_rate": 0.0020977743976939005, "loss": 5.275, "step": 596 }, { "epoch": 0.5532900834105653, "grad_norm": 0.03510696068406105, "learning_rate": 0.0020905919956301236, "loss": 5.2438, "step": 597 }, { "epoch": 0.5542168674698795, "grad_norm": 0.03569590672850609, "learning_rate": 0.0020834130642328054, "loss": 5.175, "step": 598 }, { "epoch": 0.5551436515291936, "grad_norm": 0.030981766059994698, "learning_rate": 0.0020762376643597585, "loss": 5.2, "step": 599 }, { "epoch": 0.5560704355885079, "grad_norm": 0.04017426446080208, "learning_rate": 0.0020690658568388613, "loss": 5.15, "step": 600 }, { "epoch": 0.556997219647822, "grad_norm": 0.039772696793079376, "learning_rate": 0.0020618977024675356, "loss": 5.125, "step": 601 }, { "epoch": 0.5579240037071362, "grad_norm": 0.043551571667194366, "learning_rate": 0.002054733262012238, "loss": 5.1438, "step": 602 }, { "epoch": 0.5588507877664504, "grad_norm": 0.03988911956548691, "learning_rate": 0.0020475725962079373, "loss": 5.1688, "step": 603 }, { "epoch": 0.5597775718257646, "grad_norm": 0.03845544904470444, "learning_rate": 0.0020404157657576073, "loss": 5.1375, "step": 604 }, { "epoch": 0.5607043558850788, "grad_norm": 0.048617441207170486, "learning_rate": 0.002033262831331705, "loss": 5.15, "step": 605 }, { "epoch": 0.561631139944393, "grad_norm": 0.03950534015893936, "learning_rate": 0.0020261138535676614, "loss": 5.1312, "step": 606 }, { "epoch": 0.5625579240037072, "grad_norm": 0.04601586237549782, "learning_rate": 0.002018968893069368, "loss": 5.0687, "step": 607 }, { "epoch": 0.5634847080630213, "grad_norm": 0.048377152532339096, "learning_rate": 0.002011828010406656, "loss": 5.0625, "step": 608 }, { "epoch": 0.5644114921223355, "grad_norm": 0.04253297671675682, "learning_rate": 0.0020046912661147915, "loss": 5.1, "step": 609 }, { "epoch": 0.5653382761816497, "grad_norm": 0.04242146387696266, "learning_rate": 0.001997558720693956, "loss": 5.0813, "step": 610 }, { "epoch": 0.5662650602409639, "grad_norm": 0.042660947889089584, "learning_rate": 0.001990430434608739, "loss": 5.1188, "step": 611 }, { "epoch": 0.567191844300278, "grad_norm": 0.03864769637584686, "learning_rate": 0.0019833064682876177, "loss": 5.0625, "step": 612 }, { "epoch": 0.5681186283595923, "grad_norm": 0.03322991728782654, "learning_rate": 0.0019761868821224545, "loss": 5.0375, "step": 613 }, { "epoch": 0.5690454124189064, "grad_norm": 0.032155055552721024, "learning_rate": 0.001969071736467977, "loss": 5.0687, "step": 614 }, { "epoch": 0.5699721964782206, "grad_norm": 0.04553236439824104, "learning_rate": 0.0019619610916412704, "loss": 5.1, "step": 615 }, { "epoch": 0.5708989805375347, "grad_norm": 0.039135731756687164, "learning_rate": 0.001954855007921265, "loss": 5.025, "step": 616 }, { "epoch": 0.571825764596849, "grad_norm": 0.03503022342920303, "learning_rate": 0.0019477535455482242, "loss": 5.0312, "step": 617 }, { "epoch": 0.5727525486561631, "grad_norm": 0.02648424543440342, "learning_rate": 0.0019406567647232366, "loss": 5.0125, "step": 618 }, { "epoch": 0.5736793327154773, "grad_norm": 0.030889399349689484, "learning_rate": 0.0019335647256077037, "loss": 5.0312, "step": 619 }, { "epoch": 0.5746061167747915, "grad_norm": 0.028193505480885506, "learning_rate": 0.0019264774883228286, "loss": 5.0563, "step": 620 }, { "epoch": 0.5755329008341057, "grad_norm": 0.039721377193927765, "learning_rate": 0.0019193951129491112, "loss": 4.9563, "step": 621 }, { "epoch": 0.5764596848934198, "grad_norm": 0.0343133881688118, "learning_rate": 0.0019123176595258306, "loss": 5.0, "step": 622 }, { "epoch": 0.577386468952734, "grad_norm": 0.03925079479813576, "learning_rate": 0.0019052451880505472, "loss": 5.05, "step": 623 }, { "epoch": 0.5783132530120482, "grad_norm": 0.061298515647649765, "learning_rate": 0.0018981777584785823, "loss": 5.0, "step": 624 }, { "epoch": 0.5792400370713624, "grad_norm": 0.045300450176000595, "learning_rate": 0.0018911154307225204, "loss": 4.975, "step": 625 }, { "epoch": 0.5801668211306765, "grad_norm": 0.03320182114839554, "learning_rate": 0.0018840582646516924, "loss": 4.9938, "step": 626 }, { "epoch": 0.5810936051899908, "grad_norm": 0.04246627911925316, "learning_rate": 0.0018770063200916757, "loss": 4.9625, "step": 627 }, { "epoch": 0.5820203892493049, "grad_norm": 0.04181812331080437, "learning_rate": 0.0018699596568237799, "loss": 4.9875, "step": 628 }, { "epoch": 0.5829471733086191, "grad_norm": 0.038650691509246826, "learning_rate": 0.0018629183345845477, "loss": 4.9625, "step": 629 }, { "epoch": 0.5838739573679332, "grad_norm": 0.03198286145925522, "learning_rate": 0.0018558824130652399, "loss": 4.9125, "step": 630 }, { "epoch": 0.5848007414272475, "grad_norm": 0.030322790145874023, "learning_rate": 0.0018488519519113387, "loss": 4.9563, "step": 631 }, { "epoch": 0.5857275254865616, "grad_norm": 0.03637656942009926, "learning_rate": 0.0018418270107220325, "loss": 4.9625, "step": 632 }, { "epoch": 0.5866543095458758, "grad_norm": 0.03812320902943611, "learning_rate": 0.001834807649049719, "loss": 4.9062, "step": 633 }, { "epoch": 0.58758109360519, "grad_norm": 0.038305167108774185, "learning_rate": 0.001827793926399495, "loss": 4.9062, "step": 634 }, { "epoch": 0.5885078776645042, "grad_norm": 0.03868838772177696, "learning_rate": 0.0018207859022286543, "loss": 4.95, "step": 635 }, { "epoch": 0.5894346617238183, "grad_norm": 0.05012492835521698, "learning_rate": 0.0018137836359461822, "loss": 4.9125, "step": 636 }, { "epoch": 0.5903614457831325, "grad_norm": 0.04664972424507141, "learning_rate": 0.0018067871869122559, "loss": 4.9188, "step": 637 }, { "epoch": 0.5912882298424467, "grad_norm": 0.03777710720896721, "learning_rate": 0.0017997966144377327, "loss": 4.9, "step": 638 }, { "epoch": 0.5922150139017609, "grad_norm": 0.04331712797284126, "learning_rate": 0.0017928119777836581, "loss": 4.9062, "step": 639 }, { "epoch": 0.593141797961075, "grad_norm": 0.04469927027821541, "learning_rate": 0.0017858333361607537, "loss": 4.9188, "step": 640 }, { "epoch": 0.5940685820203893, "grad_norm": 0.02936607599258423, "learning_rate": 0.0017788607487289232, "loss": 4.9188, "step": 641 }, { "epoch": 0.5949953660797034, "grad_norm": 0.05327693372964859, "learning_rate": 0.0017718942745967442, "loss": 4.9437, "step": 642 }, { "epoch": 0.5959221501390176, "grad_norm": 0.04499313235282898, "learning_rate": 0.0017649339728209726, "loss": 4.9125, "step": 643 }, { "epoch": 0.5968489341983317, "grad_norm": 0.03915273770689964, "learning_rate": 0.0017579799024060366, "loss": 4.9437, "step": 644 }, { "epoch": 0.597775718257646, "grad_norm": 0.04526703059673309, "learning_rate": 0.0017510321223035436, "loss": 4.9062, "step": 645 }, { "epoch": 0.5987025023169601, "grad_norm": 0.05192454531788826, "learning_rate": 0.001744090691411771, "loss": 4.8438, "step": 646 }, { "epoch": 0.5996292863762743, "grad_norm": 0.06659810990095139, "learning_rate": 0.0017371556685751776, "loss": 4.875, "step": 647 }, { "epoch": 0.6005560704355886, "grad_norm": 0.026750769466161728, "learning_rate": 0.0017302271125838944, "loss": 4.8688, "step": 648 }, { "epoch": 0.6014828544949027, "grad_norm": 0.05911999195814133, "learning_rate": 0.0017233050821732344, "loss": 4.9125, "step": 649 }, { "epoch": 0.6024096385542169, "grad_norm": 0.046929407864809036, "learning_rate": 0.0017163896360231918, "loss": 4.8438, "step": 650 }, { "epoch": 0.603336422613531, "grad_norm": 0.0461881086230278, "learning_rate": 0.00170948083275794, "loss": 4.8812, "step": 651 }, { "epoch": 0.6042632066728453, "grad_norm": 0.041216105222702026, "learning_rate": 0.0017025787309453443, "loss": 4.8625, "step": 652 }, { "epoch": 0.6051899907321594, "grad_norm": 0.047575026750564575, "learning_rate": 0.001695683389096455, "loss": 4.8625, "step": 653 }, { "epoch": 0.6061167747914736, "grad_norm": 0.039946090430021286, "learning_rate": 0.001688794865665021, "loss": 4.8688, "step": 654 }, { "epoch": 0.6070435588507878, "grad_norm": 0.03767408803105354, "learning_rate": 0.0016819132190469843, "loss": 4.8563, "step": 655 }, { "epoch": 0.607970342910102, "grad_norm": 0.046980541199445724, "learning_rate": 0.0016750385075799952, "loss": 4.8438, "step": 656 }, { "epoch": 0.6088971269694161, "grad_norm": 0.04574093222618103, "learning_rate": 0.0016681707895429094, "loss": 4.8563, "step": 657 }, { "epoch": 0.6098239110287303, "grad_norm": 0.049847353249788284, "learning_rate": 0.0016613101231552987, "loss": 4.8563, "step": 658 }, { "epoch": 0.6107506950880445, "grad_norm": 0.04778432473540306, "learning_rate": 0.0016544565665769558, "loss": 4.8625, "step": 659 }, { "epoch": 0.6116774791473587, "grad_norm": 0.042641252279281616, "learning_rate": 0.001647610177907403, "loss": 4.875, "step": 660 }, { "epoch": 0.6126042632066728, "grad_norm": 0.03789043426513672, "learning_rate": 0.001640771015185395, "loss": 4.8563, "step": 661 }, { "epoch": 0.6135310472659871, "grad_norm": 0.0583447702229023, "learning_rate": 0.0016339391363884334, "loss": 4.7875, "step": 662 }, { "epoch": 0.6144578313253012, "grad_norm": 0.047329407185316086, "learning_rate": 0.0016271145994322693, "loss": 4.8313, "step": 663 }, { "epoch": 0.6153846153846154, "grad_norm": 0.051290351897478104, "learning_rate": 0.0016202974621704175, "loss": 4.8438, "step": 664 }, { "epoch": 0.6163113994439295, "grad_norm": 0.04638203606009483, "learning_rate": 0.0016134877823936607, "loss": 4.8187, "step": 665 }, { "epoch": 0.6172381835032438, "grad_norm": 0.0436415858566761, "learning_rate": 0.0016066856178295658, "loss": 4.8063, "step": 666 }, { "epoch": 0.6181649675625579, "grad_norm": 0.05077355355024338, "learning_rate": 0.0015998910261419874, "loss": 4.8063, "step": 667 }, { "epoch": 0.6190917516218721, "grad_norm": 0.05078209191560745, "learning_rate": 0.0015931040649305862, "loss": 4.8688, "step": 668 }, { "epoch": 0.6200185356811863, "grad_norm": 0.06357160210609436, "learning_rate": 0.0015863247917303337, "loss": 4.8313, "step": 669 }, { "epoch": 0.6209453197405005, "grad_norm": 0.03996184095740318, "learning_rate": 0.0015795532640110316, "loss": 4.8688, "step": 670 }, { "epoch": 0.6218721037998146, "grad_norm": 0.05953163281083107, "learning_rate": 0.0015727895391768176, "loss": 4.7938, "step": 671 }, { "epoch": 0.6227988878591288, "grad_norm": 0.05362982302904129, "learning_rate": 0.0015660336745656862, "loss": 4.7875, "step": 672 }, { "epoch": 0.623725671918443, "grad_norm": 0.03395141288638115, "learning_rate": 0.001559285727448993, "loss": 4.7875, "step": 673 }, { "epoch": 0.6246524559777572, "grad_norm": 0.06038745865225792, "learning_rate": 0.0015525457550309802, "loss": 4.775, "step": 674 }, { "epoch": 0.6255792400370713, "grad_norm": 0.04683006927371025, "learning_rate": 0.0015458138144482832, "loss": 4.8625, "step": 675 }, { "epoch": 0.6265060240963856, "grad_norm": 0.04466160014271736, "learning_rate": 0.0015390899627694505, "loss": 4.7812, "step": 676 }, { "epoch": 0.6274328081556997, "grad_norm": 0.054469116032123566, "learning_rate": 0.0015323742569944572, "loss": 4.775, "step": 677 }, { "epoch": 0.6283595922150139, "grad_norm": 0.05092649534344673, "learning_rate": 0.001525666754054226, "loss": 4.775, "step": 678 }, { "epoch": 0.629286376274328, "grad_norm": 0.04114770516753197, "learning_rate": 0.0015189675108101385, "loss": 4.8063, "step": 679 }, { "epoch": 0.6302131603336423, "grad_norm": 0.04045185446739197, "learning_rate": 0.0015122765840535602, "loss": 4.8063, "step": 680 }, { "epoch": 0.6311399443929564, "grad_norm": 0.04068306088447571, "learning_rate": 0.0015055940305053511, "loss": 4.7688, "step": 681 }, { "epoch": 0.6320667284522706, "grad_norm": 0.048991914838552475, "learning_rate": 0.0014989199068153936, "loss": 4.7812, "step": 682 }, { "epoch": 0.6329935125115848, "grad_norm": 0.04630220681428909, "learning_rate": 0.0014922542695621041, "loss": 4.8313, "step": 683 }, { "epoch": 0.633920296570899, "grad_norm": 0.05090312659740448, "learning_rate": 0.0014855971752519607, "loss": 4.75, "step": 684 }, { "epoch": 0.6348470806302131, "grad_norm": 0.03676120191812515, "learning_rate": 0.001478948680319016, "loss": 4.775, "step": 685 }, { "epoch": 0.6357738646895273, "grad_norm": 0.04959641024470329, "learning_rate": 0.001472308841124429, "loss": 4.8063, "step": 686 }, { "epoch": 0.6367006487488415, "grad_norm": 0.04228943958878517, "learning_rate": 0.0014656777139559754, "loss": 4.8125, "step": 687 }, { "epoch": 0.6376274328081557, "grad_norm": 0.04116208478808403, "learning_rate": 0.001459055355027582, "loss": 4.7562, "step": 688 }, { "epoch": 0.6385542168674698, "grad_norm": 0.05446736142039299, "learning_rate": 0.0014524418204788405, "loss": 4.75, "step": 689 }, { "epoch": 0.6394810009267841, "grad_norm": 0.04483804479241371, "learning_rate": 0.0014458371663745402, "loss": 4.7688, "step": 690 }, { "epoch": 0.6404077849860983, "grad_norm": 0.04954027384519577, "learning_rate": 0.0014392414487041838, "loss": 4.6937, "step": 691 }, { "epoch": 0.6413345690454124, "grad_norm": 0.043852776288986206, "learning_rate": 0.00143265472338152, "loss": 4.7938, "step": 692 }, { "epoch": 0.6422613531047267, "grad_norm": 0.046749938279390335, "learning_rate": 0.001426077046244068, "loss": 4.7688, "step": 693 }, { "epoch": 0.6431881371640408, "grad_norm": 0.05037090927362442, "learning_rate": 0.0014195084730526395, "loss": 4.7562, "step": 694 }, { "epoch": 0.644114921223355, "grad_norm": 0.0452822744846344, "learning_rate": 0.0014129490594908729, "loss": 4.8, "step": 695 }, { "epoch": 0.6450417052826691, "grad_norm": 0.03884583339095116, "learning_rate": 0.001406398861164754, "loss": 4.725, "step": 696 }, { "epoch": 0.6459684893419834, "grad_norm": 0.04877614974975586, "learning_rate": 0.0013998579336021535, "loss": 4.7063, "step": 697 }, { "epoch": 0.6468952734012975, "grad_norm": 0.043750159442424774, "learning_rate": 0.0013933263322523466, "loss": 4.7063, "step": 698 }, { "epoch": 0.6478220574606117, "grad_norm": 0.047424763441085815, "learning_rate": 0.0013868041124855508, "loss": 4.7562, "step": 699 }, { "epoch": 0.6487488415199258, "grad_norm": 0.044932421296834946, "learning_rate": 0.0013802913295924508, "loss": 4.6875, "step": 700 }, { "epoch": 0.6496756255792401, "grad_norm": 0.03677170351147652, "learning_rate": 0.0013737880387837348, "loss": 4.7688, "step": 701 }, { "epoch": 0.6506024096385542, "grad_norm": 0.049118272960186005, "learning_rate": 0.0013672942951896206, "loss": 4.7188, "step": 702 }, { "epoch": 0.6515291936978684, "grad_norm": 0.06206013634800911, "learning_rate": 0.0013608101538593964, "loss": 4.75, "step": 703 }, { "epoch": 0.6524559777571826, "grad_norm": 0.045777998864650726, "learning_rate": 0.0013543356697609439, "loss": 4.8063, "step": 704 }, { "epoch": 0.6533827618164968, "grad_norm": 0.06643692404031754, "learning_rate": 0.0013478708977802823, "loss": 4.7438, "step": 705 }, { "epoch": 0.6543095458758109, "grad_norm": 0.05065048485994339, "learning_rate": 0.0013414158927210946, "loss": 4.7375, "step": 706 }, { "epoch": 0.6552363299351252, "grad_norm": 0.047690439969301224, "learning_rate": 0.0013349707093042707, "loss": 4.75, "step": 707 }, { "epoch": 0.6561631139944393, "grad_norm": 0.05915187671780586, "learning_rate": 0.0013285354021674361, "loss": 4.675, "step": 708 }, { "epoch": 0.6570898980537535, "grad_norm": 0.04628239572048187, "learning_rate": 0.0013221100258644957, "loss": 4.7375, "step": 709 }, { "epoch": 0.6580166821130676, "grad_norm": 0.04324619472026825, "learning_rate": 0.0013156946348651644, "loss": 4.7, "step": 710 }, { "epoch": 0.6589434661723819, "grad_norm": 0.048746492713689804, "learning_rate": 0.0013092892835545123, "loss": 4.7438, "step": 711 }, { "epoch": 0.659870250231696, "grad_norm": 0.04211176931858063, "learning_rate": 0.001302894026232497, "loss": 4.7188, "step": 712 }, { "epoch": 0.6607970342910102, "grad_norm": 0.04411826655268669, "learning_rate": 0.0012965089171135097, "loss": 4.7375, "step": 713 }, { "epoch": 0.6617238183503243, "grad_norm": 0.049165111035108566, "learning_rate": 0.0012901340103259097, "loss": 4.7, "step": 714 }, { "epoch": 0.6626506024096386, "grad_norm": 0.04350108280777931, "learning_rate": 0.0012837693599115707, "loss": 4.6813, "step": 715 }, { "epoch": 0.6635773864689527, "grad_norm": 0.053538527339696884, "learning_rate": 0.001277415019825417, "loss": 4.7375, "step": 716 }, { "epoch": 0.6645041705282669, "grad_norm": 0.03999413177371025, "learning_rate": 0.0012710710439349739, "loss": 4.6625, "step": 717 }, { "epoch": 0.6654309545875811, "grad_norm": 0.05112524330615997, "learning_rate": 0.0012647374860199018, "loss": 4.7375, "step": 718 }, { "epoch": 0.6663577386468953, "grad_norm": 0.03731364756822586, "learning_rate": 0.0012584143997715486, "loss": 4.6625, "step": 719 }, { "epoch": 0.6672845227062094, "grad_norm": 0.036096684634685516, "learning_rate": 0.0012521018387924884, "loss": 4.7, "step": 720 }, { "epoch": 0.6682113067655236, "grad_norm": 0.040185850113630295, "learning_rate": 0.0012457998565960724, "loss": 4.7, "step": 721 }, { "epoch": 0.6691380908248378, "grad_norm": 0.03686061128973961, "learning_rate": 0.0012395085066059686, "loss": 4.7125, "step": 722 }, { "epoch": 0.670064874884152, "grad_norm": 0.04309338331222534, "learning_rate": 0.0012332278421557175, "loss": 4.6875, "step": 723 }, { "epoch": 0.6709916589434661, "grad_norm": 0.033990684896707535, "learning_rate": 0.0012269579164882706, "loss": 4.7, "step": 724 }, { "epoch": 0.6719184430027804, "grad_norm": 0.06331422179937363, "learning_rate": 0.0012206987827555469, "loss": 4.6875, "step": 725 }, { "epoch": 0.6728452270620945, "grad_norm": 0.05111413821578026, "learning_rate": 0.0012144504940179793, "loss": 4.6625, "step": 726 }, { "epoch": 0.6737720111214087, "grad_norm": 0.039602335542440414, "learning_rate": 0.0012082131032440616, "loss": 4.6562, "step": 727 }, { "epoch": 0.6746987951807228, "grad_norm": 0.0525193028151989, "learning_rate": 0.0012019866633099052, "loss": 4.6562, "step": 728 }, { "epoch": 0.6756255792400371, "grad_norm": 0.04521778225898743, "learning_rate": 0.001195771226998789, "loss": 4.675, "step": 729 }, { "epoch": 0.6765523632993512, "grad_norm": 0.042900171130895615, "learning_rate": 0.0011895668470007067, "loss": 4.675, "step": 730 }, { "epoch": 0.6774791473586654, "grad_norm": 0.046152036637067795, "learning_rate": 0.0011833735759119303, "loss": 4.6375, "step": 731 }, { "epoch": 0.6784059314179796, "grad_norm": 0.03777175024151802, "learning_rate": 0.0011771914662345527, "loss": 4.7125, "step": 732 }, { "epoch": 0.6793327154772938, "grad_norm": 0.04087323322892189, "learning_rate": 0.0011710205703760535, "loss": 4.6875, "step": 733 }, { "epoch": 0.680259499536608, "grad_norm": 0.03955033794045448, "learning_rate": 0.0011648609406488455, "loss": 4.6562, "step": 734 }, { "epoch": 0.6811862835959221, "grad_norm": 0.030934706330299377, "learning_rate": 0.001158712629269838, "loss": 4.6438, "step": 735 }, { "epoch": 0.6821130676552364, "grad_norm": 0.03988910838961601, "learning_rate": 0.0011525756883599915, "loss": 4.6438, "step": 736 }, { "epoch": 0.6830398517145505, "grad_norm": 0.03788105770945549, "learning_rate": 0.0011464501699438728, "loss": 4.65, "step": 737 }, { "epoch": 0.6839666357738647, "grad_norm": 0.04469624534249306, "learning_rate": 0.0011403361259492218, "loss": 4.6937, "step": 738 }, { "epoch": 0.6848934198331789, "grad_norm": 0.04028180614113808, "learning_rate": 0.001134233608206502, "loss": 4.65, "step": 739 }, { "epoch": 0.6858202038924931, "grad_norm": 0.04203322157263756, "learning_rate": 0.0011281426684484686, "loss": 4.65, "step": 740 }, { "epoch": 0.6867469879518072, "grad_norm": 0.045880451798439026, "learning_rate": 0.0011220633583097247, "loss": 4.65, "step": 741 }, { "epoch": 0.6876737720111215, "grad_norm": 0.0346485935151577, "learning_rate": 0.0011159957293262886, "loss": 4.6562, "step": 742 }, { "epoch": 0.6886005560704356, "grad_norm": 0.048363398760557175, "learning_rate": 0.0011099398329351515, "loss": 4.6438, "step": 743 }, { "epoch": 0.6895273401297498, "grad_norm": 0.0373103991150856, "learning_rate": 0.0011038957204738465, "loss": 4.6813, "step": 744 }, { "epoch": 0.6904541241890639, "grad_norm": 0.043777722865343094, "learning_rate": 0.001097863443180008, "loss": 4.6688, "step": 745 }, { "epoch": 0.6913809082483782, "grad_norm": 0.03708568960428238, "learning_rate": 0.0010918430521909442, "loss": 4.6688, "step": 746 }, { "epoch": 0.6923076923076923, "grad_norm": 0.04273151233792305, "learning_rate": 0.0010858345985431956, "loss": 4.6312, "step": 747 }, { "epoch": 0.6932344763670065, "grad_norm": 0.04535781592130661, "learning_rate": 0.0010798381331721108, "loss": 4.675, "step": 748 }, { "epoch": 0.6941612604263206, "grad_norm": 0.03782697021961212, "learning_rate": 0.0010738537069114062, "loss": 4.675, "step": 749 }, { "epoch": 0.6950880444856349, "grad_norm": 0.04372243955731392, "learning_rate": 0.0010678813704927434, "loss": 4.6625, "step": 750 }, { "epoch": 0.696014828544949, "grad_norm": 0.04960807040333748, "learning_rate": 0.0010619211745452912, "loss": 4.6375, "step": 751 }, { "epoch": 0.6969416126042632, "grad_norm": 0.040741242468357086, "learning_rate": 0.001055973169595303, "loss": 4.6375, "step": 752 }, { "epoch": 0.6978683966635774, "grad_norm": 0.04263027384877205, "learning_rate": 0.0010500374060656839, "loss": 4.5938, "step": 753 }, { "epoch": 0.6987951807228916, "grad_norm": 0.046234361827373505, "learning_rate": 0.001044113934275567, "loss": 4.6688, "step": 754 }, { "epoch": 0.6997219647822057, "grad_norm": 0.03574342280626297, "learning_rate": 0.0010382028044398823, "loss": 4.6375, "step": 755 }, { "epoch": 0.70064874884152, "grad_norm": 0.044964589178562164, "learning_rate": 0.0010323040666689366, "loss": 4.6312, "step": 756 }, { "epoch": 0.7015755329008341, "grad_norm": 0.037156179547309875, "learning_rate": 0.001026417770967982, "loss": 4.6188, "step": 757 }, { "epoch": 0.7025023169601483, "grad_norm": 0.046747058629989624, "learning_rate": 0.0010205439672368, "loss": 4.5875, "step": 758 }, { "epoch": 0.7034291010194624, "grad_norm": 0.042588070034980774, "learning_rate": 0.0010146827052692701, "loss": 4.6125, "step": 759 }, { "epoch": 0.7043558850787767, "grad_norm": 0.036094602197408676, "learning_rate": 0.0010088340347529552, "loss": 4.6625, "step": 760 }, { "epoch": 0.7052826691380908, "grad_norm": 0.03903704881668091, "learning_rate": 0.0010029980052686733, "loss": 4.5875, "step": 761 }, { "epoch": 0.706209453197405, "grad_norm": 0.045382946729660034, "learning_rate": 0.0009971746662900851, "loss": 4.6375, "step": 762 }, { "epoch": 0.7071362372567191, "grad_norm": 0.04216109961271286, "learning_rate": 0.0009913640671832663, "loss": 4.6063, "step": 763 }, { "epoch": 0.7080630213160334, "grad_norm": 0.044599149376153946, "learning_rate": 0.0009855662572062962, "loss": 4.625, "step": 764 }, { "epoch": 0.7089898053753475, "grad_norm": 0.0511021688580513, "learning_rate": 0.0009797812855088348, "loss": 4.5875, "step": 765 }, { "epoch": 0.7099165894346617, "grad_norm": 0.04359891265630722, "learning_rate": 0.0009740092011317095, "loss": 4.6688, "step": 766 }, { "epoch": 0.7108433734939759, "grad_norm": 0.047334376722574234, "learning_rate": 0.0009682500530064992, "loss": 4.5875, "step": 767 }, { "epoch": 0.7117701575532901, "grad_norm": 0.04199070855975151, "learning_rate": 0.0009625038899551161, "loss": 4.625, "step": 768 }, { "epoch": 0.7126969416126042, "grad_norm": 0.057890091091394424, "learning_rate": 0.0009567707606893971, "loss": 4.6125, "step": 769 }, { "epoch": 0.7136237256719185, "grad_norm": 0.04788359999656677, "learning_rate": 0.0009510507138106853, "loss": 4.5875, "step": 770 }, { "epoch": 0.7145505097312326, "grad_norm": 0.04499724879860878, "learning_rate": 0.0009453437978094223, "loss": 4.5938, "step": 771 }, { "epoch": 0.7154772937905468, "grad_norm": 0.04197373613715172, "learning_rate": 0.0009396500610647368, "loss": 4.6562, "step": 772 }, { "epoch": 0.7164040778498609, "grad_norm": 0.048124760389328, "learning_rate": 0.00093396955184403, "loss": 4.625, "step": 773 }, { "epoch": 0.7173308619091752, "grad_norm": 0.05138612538576126, "learning_rate": 0.000928302318302573, "loss": 4.575, "step": 774 }, { "epoch": 0.7182576459684893, "grad_norm": 0.044739775359630585, "learning_rate": 0.0009226484084830918, "loss": 4.625, "step": 775 }, { "epoch": 0.7191844300278035, "grad_norm": 0.04016095772385597, "learning_rate": 0.0009170078703153676, "loss": 4.6063, "step": 776 }, { "epoch": 0.7201112140871178, "grad_norm": 0.05538894608616829, "learning_rate": 0.000911380751615822, "loss": 4.625, "step": 777 }, { "epoch": 0.7210379981464319, "grad_norm": 0.04083118215203285, "learning_rate": 0.0009057671000871195, "loss": 4.6063, "step": 778 }, { "epoch": 0.7219647822057461, "grad_norm": 0.05446457862854004, "learning_rate": 0.0009001669633177587, "loss": 4.575, "step": 779 }, { "epoch": 0.7228915662650602, "grad_norm": 0.03577585890889168, "learning_rate": 0.0008945803887816678, "loss": 4.6, "step": 780 }, { "epoch": 0.7238183503243745, "grad_norm": 0.04933847859501839, "learning_rate": 0.0008890074238378073, "loss": 4.5875, "step": 781 }, { "epoch": 0.7247451343836886, "grad_norm": 0.03600107133388519, "learning_rate": 0.0008834481157297625, "loss": 4.5875, "step": 782 }, { "epoch": 0.7256719184430028, "grad_norm": 0.05166667327284813, "learning_rate": 0.0008779025115853482, "loss": 4.5938, "step": 783 }, { "epoch": 0.726598702502317, "grad_norm": 0.03323368355631828, "learning_rate": 0.0008723706584162044, "loss": 4.5563, "step": 784 }, { "epoch": 0.7275254865616312, "grad_norm": 0.04717453941702843, "learning_rate": 0.0008668526031174034, "loss": 4.6125, "step": 785 }, { "epoch": 0.7284522706209453, "grad_norm": 0.04695433750748634, "learning_rate": 0.0008613483924670457, "loss": 4.5875, "step": 786 }, { "epoch": 0.7293790546802595, "grad_norm": 0.04457440972328186, "learning_rate": 0.00085585807312587, "loss": 4.6, "step": 787 }, { "epoch": 0.7303058387395737, "grad_norm": 0.04753506928682327, "learning_rate": 0.0008503816916368512, "loss": 4.5687, "step": 788 }, { "epoch": 0.7312326227988879, "grad_norm": 0.04823901131749153, "learning_rate": 0.0008449192944248127, "loss": 4.5625, "step": 789 }, { "epoch": 0.732159406858202, "grad_norm": 0.041306272149086, "learning_rate": 0.0008394709277960255, "loss": 4.5563, "step": 790 }, { "epoch": 0.7330861909175163, "grad_norm": 0.054446831345558167, "learning_rate": 0.0008340366379378234, "loss": 4.55, "step": 791 }, { "epoch": 0.7340129749768304, "grad_norm": 0.03289240226149559, "learning_rate": 0.0008286164709182031, "loss": 4.575, "step": 792 }, { "epoch": 0.7349397590361446, "grad_norm": 0.04518633335828781, "learning_rate": 0.0008232104726854425, "loss": 4.6, "step": 793 }, { "epoch": 0.7358665430954587, "grad_norm": 0.03345628082752228, "learning_rate": 0.0008178186890677027, "loss": 4.55, "step": 794 }, { "epoch": 0.736793327154773, "grad_norm": 0.046789661049842834, "learning_rate": 0.0008124411657726471, "loss": 4.575, "step": 795 }, { "epoch": 0.7377201112140871, "grad_norm": 0.03443962708115578, "learning_rate": 0.0008070779483870469, "loss": 4.55, "step": 796 }, { "epoch": 0.7386468952734013, "grad_norm": 0.04330628737807274, "learning_rate": 0.0008017290823764014, "loss": 4.5563, "step": 797 }, { "epoch": 0.7395736793327155, "grad_norm": 0.032368697226047516, "learning_rate": 0.0007963946130845462, "loss": 4.5438, "step": 798 }, { "epoch": 0.7405004633920297, "grad_norm": 0.04270923137664795, "learning_rate": 0.0007910745857332749, "loss": 4.6, "step": 799 }, { "epoch": 0.7414272474513438, "grad_norm": 0.03373492881655693, "learning_rate": 0.0007857690454219494, "loss": 4.5687, "step": 800 }, { "epoch": 0.742354031510658, "grad_norm": 0.03647404536604881, "learning_rate": 0.0007804780371271248, "loss": 4.5125, "step": 801 }, { "epoch": 0.7432808155699722, "grad_norm": 0.037898655980825424, "learning_rate": 0.0007752016057021596, "loss": 4.5687, "step": 802 }, { "epoch": 0.7442075996292864, "grad_norm": 0.0339631550014019, "learning_rate": 0.0007699397958768451, "loss": 4.575, "step": 803 }, { "epoch": 0.7451343836886005, "grad_norm": 0.03792402520775795, "learning_rate": 0.0007646926522570166, "loss": 4.5687, "step": 804 }, { "epoch": 0.7460611677479148, "grad_norm": 0.03865986317396164, "learning_rate": 0.0007594602193241839, "loss": 4.5312, "step": 805 }, { "epoch": 0.7469879518072289, "grad_norm": 0.03740232065320015, "learning_rate": 0.0007542425414351462, "loss": 4.55, "step": 806 }, { "epoch": 0.7479147358665431, "grad_norm": 0.03663860633969307, "learning_rate": 0.0007490396628216237, "loss": 4.55, "step": 807 }, { "epoch": 0.7488415199258572, "grad_norm": 0.0422244630753994, "learning_rate": 0.0007438516275898762, "loss": 4.5563, "step": 808 }, { "epoch": 0.7497683039851715, "grad_norm": 0.03552339971065521, "learning_rate": 0.0007386784797203335, "loss": 4.5563, "step": 809 }, { "epoch": 0.7506950880444856, "grad_norm": 0.03856317326426506, "learning_rate": 0.0007335202630672222, "loss": 4.5188, "step": 810 }, { "epoch": 0.7516218721037998, "grad_norm": 0.03579216077923775, "learning_rate": 0.0007283770213581889, "loss": 4.525, "step": 811 }, { "epoch": 0.752548656163114, "grad_norm": 0.04030256345868111, "learning_rate": 0.0007232487981939371, "loss": 4.5563, "step": 812 }, { "epoch": 0.7534754402224282, "grad_norm": 0.03762529417872429, "learning_rate": 0.0007181356370478531, "loss": 4.55, "step": 813 }, { "epoch": 0.7544022242817423, "grad_norm": 0.03724801167845726, "learning_rate": 0.0007130375812656365, "loss": 4.5375, "step": 814 }, { "epoch": 0.7553290083410565, "grad_norm": 0.03805640712380409, "learning_rate": 0.000707954674064937, "loss": 4.575, "step": 815 }, { "epoch": 0.7562557924003707, "grad_norm": 0.0410294272005558, "learning_rate": 0.0007028869585349828, "loss": 4.5625, "step": 816 }, { "epoch": 0.7571825764596849, "grad_norm": 0.0386902280151844, "learning_rate": 0.0006978344776362214, "loss": 4.5188, "step": 817 }, { "epoch": 0.758109360518999, "grad_norm": 0.037720050662755966, "learning_rate": 0.000692797274199948, "loss": 4.55, "step": 818 }, { "epoch": 0.7590361445783133, "grad_norm": 0.029812660068273544, "learning_rate": 0.0006877753909279508, "loss": 4.475, "step": 819 }, { "epoch": 0.7599629286376274, "grad_norm": 0.04356846958398819, "learning_rate": 0.0006827688703921406, "loss": 4.4938, "step": 820 }, { "epoch": 0.7608897126969416, "grad_norm": 0.03893793001770973, "learning_rate": 0.0006777777550341977, "loss": 4.5188, "step": 821 }, { "epoch": 0.7618164967562558, "grad_norm": 0.0387520007789135, "learning_rate": 0.0006728020871652046, "loss": 4.5188, "step": 822 }, { "epoch": 0.76274328081557, "grad_norm": 0.0450495183467865, "learning_rate": 0.0006678419089652943, "loss": 4.5438, "step": 823 }, { "epoch": 0.7636700648748842, "grad_norm": 0.04003477469086647, "learning_rate": 0.0006628972624832891, "loss": 4.5813, "step": 824 }, { "epoch": 0.7645968489341983, "grad_norm": 0.05103557929396629, "learning_rate": 0.0006579681896363418, "loss": 4.5188, "step": 825 }, { "epoch": 0.7655236329935126, "grad_norm": 0.038706224411726, "learning_rate": 0.000653054732209587, "loss": 4.5188, "step": 826 }, { "epoch": 0.7664504170528267, "grad_norm": 0.04914843663573265, "learning_rate": 0.0006481569318557793, "loss": 4.525, "step": 827 }, { "epoch": 0.7673772011121409, "grad_norm": 0.03715524449944496, "learning_rate": 0.0006432748300949476, "loss": 4.5062, "step": 828 }, { "epoch": 0.768303985171455, "grad_norm": 0.03968851640820503, "learning_rate": 0.0006384084683140359, "loss": 4.5563, "step": 829 }, { "epoch": 0.7692307692307693, "grad_norm": 0.042003631591796875, "learning_rate": 0.000633557887766559, "loss": 4.5312, "step": 830 }, { "epoch": 0.7701575532900834, "grad_norm": 0.04498601332306862, "learning_rate": 0.000628723129572247, "loss": 4.5, "step": 831 }, { "epoch": 0.7710843373493976, "grad_norm": 0.039209991693496704, "learning_rate": 0.0006239042347167026, "loss": 4.5375, "step": 832 }, { "epoch": 0.7720111214087118, "grad_norm": 0.03667667508125305, "learning_rate": 0.0006191012440510469, "loss": 4.5375, "step": 833 }, { "epoch": 0.772937905468026, "grad_norm": 0.03756443038582802, "learning_rate": 0.0006143141982915801, "loss": 4.525, "step": 834 }, { "epoch": 0.7738646895273401, "grad_norm": 0.03308939188718796, "learning_rate": 0.0006095431380194299, "loss": 4.55, "step": 835 }, { "epoch": 0.7747914735866543, "grad_norm": 0.03881024196743965, "learning_rate": 0.0006047881036802141, "loss": 4.5375, "step": 836 }, { "epoch": 0.7757182576459685, "grad_norm": 0.03667169064283371, "learning_rate": 0.0006000491355836904, "loss": 4.5188, "step": 837 }, { "epoch": 0.7766450417052827, "grad_norm": 0.03264870494604111, "learning_rate": 0.0005953262739034218, "loss": 4.5188, "step": 838 }, { "epoch": 0.7775718257645968, "grad_norm": 0.0369790680706501, "learning_rate": 0.0005906195586764294, "loss": 4.5125, "step": 839 }, { "epoch": 0.7784986098239111, "grad_norm": 0.03252223879098892, "learning_rate": 0.0005859290298028596, "loss": 4.4813, "step": 840 }, { "epoch": 0.7794253938832252, "grad_norm": 0.03256712481379509, "learning_rate": 0.0005812547270456397, "loss": 4.5062, "step": 841 }, { "epoch": 0.7803521779425394, "grad_norm": 0.031595002859830856, "learning_rate": 0.0005765966900301462, "loss": 4.5188, "step": 842 }, { "epoch": 0.7812789620018535, "grad_norm": 0.0356653667986393, "learning_rate": 0.0005719549582438636, "loss": 4.5438, "step": 843 }, { "epoch": 0.7822057460611678, "grad_norm": 0.038195762783288956, "learning_rate": 0.0005673295710360555, "loss": 4.4875, "step": 844 }, { "epoch": 0.7831325301204819, "grad_norm": 0.02905537374317646, "learning_rate": 0.0005627205676174244, "loss": 4.525, "step": 845 }, { "epoch": 0.7840593141797961, "grad_norm": 0.03345280513167381, "learning_rate": 0.0005581279870597866, "loss": 4.4938, "step": 846 }, { "epoch": 0.7849860982391103, "grad_norm": 0.034679800271987915, "learning_rate": 0.0005535518682957341, "loss": 4.4938, "step": 847 }, { "epoch": 0.7859128822984245, "grad_norm": 0.03583706170320511, "learning_rate": 0.0005489922501183095, "loss": 4.5188, "step": 848 }, { "epoch": 0.7868396663577386, "grad_norm": 0.032523263245821, "learning_rate": 0.000544449171180674, "loss": 4.4938, "step": 849 }, { "epoch": 0.7877664504170528, "grad_norm": 0.03378100320696831, "learning_rate": 0.0005399226699957821, "loss": 4.5062, "step": 850 }, { "epoch": 0.788693234476367, "grad_norm": 0.03234217315912247, "learning_rate": 0.0005354127849360543, "loss": 4.45, "step": 851 }, { "epoch": 0.7896200185356812, "grad_norm": 0.03637991473078728, "learning_rate": 0.0005309195542330497, "loss": 4.5188, "step": 852 }, { "epoch": 0.7905468025949953, "grad_norm": 0.03120928816497326, "learning_rate": 0.0005264430159771455, "loss": 4.5, "step": 853 }, { "epoch": 0.7914735866543096, "grad_norm": 0.03429511934518814, "learning_rate": 0.0005219832081172124, "loss": 4.5312, "step": 854 }, { "epoch": 0.7924003707136237, "grad_norm": 0.029146216809749603, "learning_rate": 0.0005175401684602912, "loss": 4.4938, "step": 855 }, { "epoch": 0.7933271547729379, "grad_norm": 0.029695888981223106, "learning_rate": 0.0005131139346712758, "loss": 4.4875, "step": 856 }, { "epoch": 0.794253938832252, "grad_norm": 0.03263707831501961, "learning_rate": 0.0005087045442725904, "loss": 4.5312, "step": 857 }, { "epoch": 0.7951807228915663, "grad_norm": 0.028736894950270653, "learning_rate": 0.0005043120346438748, "loss": 4.525, "step": 858 }, { "epoch": 0.7961075069508804, "grad_norm": 0.030789796262979507, "learning_rate": 0.0004999364430216638, "loss": 4.5, "step": 859 }, { "epoch": 0.7970342910101946, "grad_norm": 0.04033099114894867, "learning_rate": 0.0004955778064990757, "loss": 4.5125, "step": 860 }, { "epoch": 0.7979610750695088, "grad_norm": 0.03556600585579872, "learning_rate": 0.0004912361620254932, "loss": 4.4813, "step": 861 }, { "epoch": 0.798887859128823, "grad_norm": 0.031120220199227333, "learning_rate": 0.00048691154640625566, "loss": 4.4688, "step": 862 }, { "epoch": 0.7998146431881371, "grad_norm": 0.03250223025679588, "learning_rate": 0.0004826039963023407, "loss": 4.4688, "step": 863 }, { "epoch": 0.8007414272474513, "grad_norm": 0.029799439013004303, "learning_rate": 0.0004783135482300596, "loss": 4.4875, "step": 864 }, { "epoch": 0.8016682113067656, "grad_norm": 0.030422599986195564, "learning_rate": 0.0004740402385607431, "loss": 4.4813, "step": 865 }, { "epoch": 0.8025949953660797, "grad_norm": 0.029015803709626198, "learning_rate": 0.0004697841035204356, "loss": 4.4938, "step": 866 }, { "epoch": 0.8035217794253939, "grad_norm": 0.031820014119148254, "learning_rate": 0.00046554517918958845, "loss": 4.5062, "step": 867 }, { "epoch": 0.8044485634847081, "grad_norm": 0.03146743401885033, "learning_rate": 0.00046132350150275005, "loss": 4.475, "step": 868 }, { "epoch": 0.8053753475440223, "grad_norm": 0.02848106250166893, "learning_rate": 0.0004571191062482677, "loss": 4.4875, "step": 869 }, { "epoch": 0.8063021316033364, "grad_norm": 0.031561560928821564, "learning_rate": 0.00045293202906797754, "loss": 4.4875, "step": 870 }, { "epoch": 0.8072289156626506, "grad_norm": 0.031885311007499695, "learning_rate": 0.0004487623054569084, "loss": 4.5062, "step": 871 }, { "epoch": 0.8081556997219648, "grad_norm": 0.03388173505663872, "learning_rate": 0.000444609970762975, "loss": 4.4813, "step": 872 }, { "epoch": 0.809082483781279, "grad_norm": 0.03390287980437279, "learning_rate": 0.00044047506018668415, "loss": 4.5, "step": 873 }, { "epoch": 0.8100092678405931, "grad_norm": 0.032265473157167435, "learning_rate": 0.0004363576087808313, "loss": 4.4938, "step": 874 }, { "epoch": 0.8109360518999074, "grad_norm": 0.03563728928565979, "learning_rate": 0.00043225765145020803, "loss": 4.5188, "step": 875 }, { "epoch": 0.8118628359592215, "grad_norm": 0.03663501888513565, "learning_rate": 0.0004281752229513006, "loss": 4.5188, "step": 876 }, { "epoch": 0.8127896200185357, "grad_norm": 0.03167020156979561, "learning_rate": 0.00042411035789200163, "loss": 4.4875, "step": 877 }, { "epoch": 0.8137164040778498, "grad_norm": 0.03226330131292343, "learning_rate": 0.0004200630907313108, "loss": 4.5062, "step": 878 }, { "epoch": 0.8146431881371641, "grad_norm": 0.029977647587656975, "learning_rate": 0.00041603345577904824, "loss": 4.4688, "step": 879 }, { "epoch": 0.8155699721964782, "grad_norm": 0.03339603170752525, "learning_rate": 0.0004120214871955577, "loss": 4.5125, "step": 880 }, { "epoch": 0.8164967562557924, "grad_norm": 0.031077727675437927, "learning_rate": 0.00040802721899142356, "loss": 4.4938, "step": 881 }, { "epoch": 0.8174235403151066, "grad_norm": 0.02900145947933197, "learning_rate": 0.0004040506850271761, "loss": 4.4375, "step": 882 }, { "epoch": 0.8183503243744208, "grad_norm": 0.029496431350708008, "learning_rate": 0.00040009191901301005, "loss": 4.4625, "step": 883 }, { "epoch": 0.8192771084337349, "grad_norm": 0.02934381552040577, "learning_rate": 0.00039615095450849374, "loss": 4.5062, "step": 884 }, { "epoch": 0.8202038924930491, "grad_norm": 0.030950119718909264, "learning_rate": 0.00039222782492228937, "loss": 4.5, "step": 885 }, { "epoch": 0.8211306765523633, "grad_norm": 0.029751867055892944, "learning_rate": 0.0003883225635118659, "loss": 4.4625, "step": 886 }, { "epoch": 0.8220574606116775, "grad_norm": 0.026806732639670372, "learning_rate": 0.0003844352033832199, "loss": 4.5125, "step": 887 }, { "epoch": 0.8229842446709916, "grad_norm": 0.03083191066980362, "learning_rate": 0.00038056577749059266, "loss": 4.4688, "step": 888 }, { "epoch": 0.8239110287303059, "grad_norm": 0.034451741725206375, "learning_rate": 0.0003767143186361935, "loss": 4.4563, "step": 889 }, { "epoch": 0.82483781278962, "grad_norm": 0.030912496149539948, "learning_rate": 0.0003728808594699179, "loss": 4.475, "step": 890 }, { "epoch": 0.8257645968489342, "grad_norm": 0.03567620739340782, "learning_rate": 0.00036906543248907495, "loss": 4.4938, "step": 891 }, { "epoch": 0.8266913809082483, "grad_norm": 0.03392716869711876, "learning_rate": 0.0003652680700381092, "loss": 4.45, "step": 892 }, { "epoch": 0.8276181649675626, "grad_norm": 0.032731059938669205, "learning_rate": 0.0003614888043083264, "loss": 4.4875, "step": 893 }, { "epoch": 0.8285449490268767, "grad_norm": 0.035781849175691605, "learning_rate": 0.00035772766733762284, "loss": 4.4625, "step": 894 }, { "epoch": 0.8294717330861909, "grad_norm": 0.02696853317320347, "learning_rate": 0.00035398469101020983, "loss": 4.4688, "step": 895 }, { "epoch": 0.830398517145505, "grad_norm": 0.033876750618219376, "learning_rate": 0.00035025990705634833, "loss": 4.5, "step": 896 }, { "epoch": 0.8313253012048193, "grad_norm": 0.03308440372347832, "learning_rate": 0.0003465533470520768, "loss": 4.5125, "step": 897 }, { "epoch": 0.8322520852641334, "grad_norm": 0.0284098070114851, "learning_rate": 0.0003428650424189428, "loss": 4.5, "step": 898 }, { "epoch": 0.8331788693234476, "grad_norm": 0.0362527072429657, "learning_rate": 0.0003391950244237396, "loss": 4.4813, "step": 899 }, { "epoch": 0.8341056533827618, "grad_norm": 0.03239575773477554, "learning_rate": 0.0003355433241782385, "loss": 4.4437, "step": 900 }, { "epoch": 0.835032437442076, "grad_norm": 0.028916817158460617, "learning_rate": 0.00033190997263892683, "loss": 4.5062, "step": 901 }, { "epoch": 0.8359592215013901, "grad_norm": 0.037763047963380814, "learning_rate": 0.0003282950006067439, "loss": 4.475, "step": 902 }, { "epoch": 0.8368860055607044, "grad_norm": 0.03783184662461281, "learning_rate": 0.000324698438726822, "loss": 4.4375, "step": 903 }, { "epoch": 0.8378127896200185, "grad_norm": 0.03236427158117294, "learning_rate": 0.00032112031748822407, "loss": 4.425, "step": 904 }, { "epoch": 0.8387395736793327, "grad_norm": 0.031087512150406837, "learning_rate": 0.00031756066722368775, "loss": 4.4875, "step": 905 }, { "epoch": 0.8396663577386468, "grad_norm": 0.02958965301513672, "learning_rate": 0.0003140195181093658, "loss": 4.475, "step": 906 }, { "epoch": 0.8405931417979611, "grad_norm": 0.028066281229257584, "learning_rate": 0.0003104969001645735, "loss": 4.4563, "step": 907 }, { "epoch": 0.8415199258572753, "grad_norm": 0.030324235558509827, "learning_rate": 0.00030699284325152955, "loss": 4.4437, "step": 908 }, { "epoch": 0.8424467099165894, "grad_norm": 0.03359181433916092, "learning_rate": 0.00030350737707510764, "loss": 4.4813, "step": 909 }, { "epoch": 0.8433734939759037, "grad_norm": 0.02781173586845398, "learning_rate": 0.0003000405311825824, "loss": 4.4437, "step": 910 }, { "epoch": 0.8443002780352178, "grad_norm": 0.03504948690533638, "learning_rate": 0.0002965923349633778, "loss": 4.45, "step": 911 }, { "epoch": 0.845227062094532, "grad_norm": 0.03041827119886875, "learning_rate": 0.00029316281764882074, "loss": 4.4563, "step": 912 }, { "epoch": 0.8461538461538461, "grad_norm": 0.03221605718135834, "learning_rate": 0.00028975200831189067, "loss": 4.475, "step": 913 }, { "epoch": 0.8470806302131604, "grad_norm": 0.03199669346213341, "learning_rate": 0.0002863599358669755, "loss": 4.4313, "step": 914 }, { "epoch": 0.8480074142724745, "grad_norm": 0.030510928481817245, "learning_rate": 0.0002829866290696234, "loss": 4.4, "step": 915 }, { "epoch": 0.8489341983317887, "grad_norm": 0.02957424893975258, "learning_rate": 0.0002796321165163032, "loss": 4.5062, "step": 916 }, { "epoch": 0.8498609823911029, "grad_norm": 0.0366031751036644, "learning_rate": 0.0002762964266441578, "loss": 4.4313, "step": 917 }, { "epoch": 0.8507877664504171, "grad_norm": 0.03369331359863281, "learning_rate": 0.0002729795877307659, "loss": 4.4437, "step": 918 }, { "epoch": 0.8517145505097312, "grad_norm": 0.03299278765916824, "learning_rate": 0.00026968162789390074, "loss": 4.4313, "step": 919 }, { "epoch": 0.8526413345690455, "grad_norm": 0.03193372115492821, "learning_rate": 0.0002664025750912932, "loss": 4.4625, "step": 920 }, { "epoch": 0.8535681186283596, "grad_norm": 0.029631877318024635, "learning_rate": 0.00026314245712039276, "loss": 4.4375, "step": 921 }, { "epoch": 0.8544949026876738, "grad_norm": 0.03459390997886658, "learning_rate": 0.00025990130161813427, "loss": 4.4688, "step": 922 }, { "epoch": 0.8554216867469879, "grad_norm": 0.0364365316927433, "learning_rate": 0.00025667913606070095, "loss": 4.4625, "step": 923 }, { "epoch": 0.8563484708063022, "grad_norm": 0.0323617160320282, "learning_rate": 0.000253475987763295, "loss": 4.425, "step": 924 }, { "epoch": 0.8572752548656163, "grad_norm": 0.02805604226887226, "learning_rate": 0.0002502918838799015, "loss": 4.4813, "step": 925 }, { "epoch": 0.8582020389249305, "grad_norm": 0.033434659242630005, "learning_rate": 0.0002471268514030628, "loss": 4.425, "step": 926 }, { "epoch": 0.8591288229842446, "grad_norm": 0.03157290443778038, "learning_rate": 0.00024398091716364617, "loss": 4.4313, "step": 927 }, { "epoch": 0.8600556070435589, "grad_norm": 0.029048243537545204, "learning_rate": 0.00024085410783061895, "loss": 4.4625, "step": 928 }, { "epoch": 0.860982391102873, "grad_norm": 0.0280530396848917, "learning_rate": 0.00023774644991081978, "loss": 4.4125, "step": 929 }, { "epoch": 0.8619091751621872, "grad_norm": 0.03451543301343918, "learning_rate": 0.00023465796974873722, "loss": 4.4875, "step": 930 }, { "epoch": 0.8628359592215014, "grad_norm": 0.030910175293684006, "learning_rate": 0.00023158869352628286, "loss": 4.45, "step": 931 }, { "epoch": 0.8637627432808156, "grad_norm": 0.03156379237771034, "learning_rate": 0.00022853864726257307, "loss": 4.4125, "step": 932 }, { "epoch": 0.8646895273401297, "grad_norm": 0.03295775502920151, "learning_rate": 0.00022550785681370368, "loss": 4.4313, "step": 933 }, { "epoch": 0.865616311399444, "grad_norm": 0.026067038998007774, "learning_rate": 0.00022249634787253615, "loss": 4.45, "step": 934 }, { "epoch": 0.8665430954587581, "grad_norm": 0.02678762935101986, "learning_rate": 0.00021950414596847684, "loss": 4.4563, "step": 935 }, { "epoch": 0.8674698795180723, "grad_norm": 0.028849739581346512, "learning_rate": 0.0002165312764672589, "loss": 4.4437, "step": 936 }, { "epoch": 0.8683966635773864, "grad_norm": 0.03232532739639282, "learning_rate": 0.0002135777645707318, "loss": 4.4, "step": 937 }, { "epoch": 0.8693234476367007, "grad_norm": 0.027282997965812683, "learning_rate": 0.0002106436353166441, "loss": 4.4625, "step": 938 }, { "epoch": 0.8702502316960148, "grad_norm": 0.026645608246326447, "learning_rate": 0.0002077289135784316, "loss": 4.4437, "step": 939 }, { "epoch": 0.871177015755329, "grad_norm": 0.02711557038128376, "learning_rate": 0.00020483362406500838, "loss": 4.4313, "step": 940 }, { "epoch": 0.8721037998146431, "grad_norm": 0.030816104263067245, "learning_rate": 0.0002019577913205553, "loss": 4.4625, "step": 941 }, { "epoch": 0.8730305838739574, "grad_norm": 0.026929127052426338, "learning_rate": 0.00019910143972431323, "loss": 4.4313, "step": 942 }, { "epoch": 0.8739573679332715, "grad_norm": 0.028096897527575493, "learning_rate": 0.0001962645934903748, "loss": 4.4875, "step": 943 }, { "epoch": 0.8748841519925857, "grad_norm": 0.029124116525053978, "learning_rate": 0.00019344727666748218, "loss": 4.4563, "step": 944 }, { "epoch": 0.8758109360518999, "grad_norm": 0.027243295684456825, "learning_rate": 0.00019064951313881918, "loss": 4.4375, "step": 945 }, { "epoch": 0.8767377201112141, "grad_norm": 0.028546737506985664, "learning_rate": 0.00018787132662181238, "loss": 4.45, "step": 946 }, { "epoch": 0.8776645041705282, "grad_norm": 0.026934707537293434, "learning_rate": 0.00018511274066792733, "loss": 4.425, "step": 947 }, { "epoch": 0.8785912882298424, "grad_norm": 0.03399607166647911, "learning_rate": 0.00018237377866247157, "loss": 4.4563, "step": 948 }, { "epoch": 0.8795180722891566, "grad_norm": 0.02882063016295433, "learning_rate": 0.000179654463824393, "loss": 4.4688, "step": 949 }, { "epoch": 0.8804448563484708, "grad_norm": 0.026831530034542084, "learning_rate": 0.00017695481920608713, "loss": 4.4188, "step": 950 }, { "epoch": 0.881371640407785, "grad_norm": 0.029771380126476288, "learning_rate": 0.00017427486769319738, "loss": 4.4062, "step": 951 }, { "epoch": 0.8822984244670992, "grad_norm": 0.025736462324857712, "learning_rate": 0.00017161463200442484, "loss": 4.4125, "step": 952 }, { "epoch": 0.8832252085264134, "grad_norm": 0.027890045195817947, "learning_rate": 0.0001689741346913337, "loss": 4.4625, "step": 953 }, { "epoch": 0.8841519925857275, "grad_norm": 0.028950916603207588, "learning_rate": 0.0001663533981381593, "loss": 4.4375, "step": 954 }, { "epoch": 0.8850787766450418, "grad_norm": 0.029823975637555122, "learning_rate": 0.00016375244456162119, "loss": 4.4688, "step": 955 }, { "epoch": 0.8860055607043559, "grad_norm": 0.02855784259736538, "learning_rate": 0.00016117129601073116, "loss": 4.4563, "step": 956 }, { "epoch": 0.8869323447636701, "grad_norm": 0.026093894615769386, "learning_rate": 0.00015860997436661028, "loss": 4.4875, "step": 957 }, { "epoch": 0.8878591288229842, "grad_norm": 0.02811110019683838, "learning_rate": 0.00015606850134229966, "loss": 4.4375, "step": 958 }, { "epoch": 0.8887859128822985, "grad_norm": 0.027288252487778664, "learning_rate": 0.00015354689848257942, "loss": 4.4188, "step": 959 }, { "epoch": 0.8897126969416126, "grad_norm": 0.02676665410399437, "learning_rate": 0.0001510451871637833, "loss": 4.4188, "step": 960 }, { "epoch": 0.8906394810009268, "grad_norm": 0.03431456908583641, "learning_rate": 0.00014856338859362052, "loss": 4.4188, "step": 961 }, { "epoch": 0.891566265060241, "grad_norm": 0.026652604341506958, "learning_rate": 0.0001461015238109925, "loss": 4.375, "step": 962 }, { "epoch": 0.8924930491195552, "grad_norm": 0.032444290816783905, "learning_rate": 0.00014365961368581842, "loss": 4.4313, "step": 963 }, { "epoch": 0.8934198331788693, "grad_norm": 0.02602170594036579, "learning_rate": 0.00014123767891885435, "loss": 4.375, "step": 964 }, { "epoch": 0.8943466172381835, "grad_norm": 0.026148205623030663, "learning_rate": 0.00013883574004152106, "loss": 4.425, "step": 965 }, { "epoch": 0.8952734012974977, "grad_norm": 0.028608886525034904, "learning_rate": 0.0001364538174157273, "loss": 4.3812, "step": 966 }, { "epoch": 0.8962001853568119, "grad_norm": 0.026529457420110703, "learning_rate": 0.00013409193123369996, "loss": 4.3812, "step": 967 }, { "epoch": 0.897126969416126, "grad_norm": 0.029828151687979698, "learning_rate": 0.00013175010151780965, "loss": 4.4188, "step": 968 }, { "epoch": 0.8980537534754403, "grad_norm": 0.03368750587105751, "learning_rate": 0.0001294283481204042, "loss": 4.4313, "step": 969 }, { "epoch": 0.8989805375347544, "grad_norm": 0.02840586192905903, "learning_rate": 0.00012712669072363763, "loss": 4.4375, "step": 970 }, { "epoch": 0.8999073215940686, "grad_norm": 0.030109241604804993, "learning_rate": 0.0001248451488393057, "loss": 4.4125, "step": 971 }, { "epoch": 0.9008341056533827, "grad_norm": 0.028758615255355835, "learning_rate": 0.00012258374180867837, "loss": 4.45, "step": 972 }, { "epoch": 0.901760889712697, "grad_norm": 0.02661893516778946, "learning_rate": 0.00012034248880233744, "loss": 4.4813, "step": 973 }, { "epoch": 0.9026876737720111, "grad_norm": 0.02796340361237526, "learning_rate": 0.00011812140882001277, "loss": 4.45, "step": 974 }, { "epoch": 0.9036144578313253, "grad_norm": 0.024077627807855606, "learning_rate": 0.00011592052069042208, "loss": 4.4625, "step": 975 }, { "epoch": 0.9045412418906394, "grad_norm": 0.02510063722729683, "learning_rate": 0.00011373984307111229, "loss": 4.4188, "step": 976 }, { "epoch": 0.9054680259499537, "grad_norm": 0.02504696324467659, "learning_rate": 0.00011157939444829762, "loss": 4.4437, "step": 977 }, { "epoch": 0.9063948100092678, "grad_norm": 0.026624388992786407, "learning_rate": 0.0001094391931367078, "loss": 4.4563, "step": 978 }, { "epoch": 0.907321594068582, "grad_norm": 0.02774794027209282, "learning_rate": 0.00010731925727942932, "loss": 4.4313, "step": 979 }, { "epoch": 0.9082483781278962, "grad_norm": 0.027720240876078606, "learning_rate": 0.00010521960484775273, "loss": 4.425, "step": 980 }, { "epoch": 0.9091751621872104, "grad_norm": 0.0258037019520998, "learning_rate": 0.00010314025364102087, "loss": 4.425, "step": 981 }, { "epoch": 0.9101019462465245, "grad_norm": 0.031181413680315018, "learning_rate": 0.00010108122128647645, "loss": 4.425, "step": 982 }, { "epoch": 0.9110287303058388, "grad_norm": 0.026958808302879333, "learning_rate": 9.904252523911473e-05, "loss": 4.425, "step": 983 }, { "epoch": 0.9119555143651529, "grad_norm": 0.0251258946955204, "learning_rate": 9.702418278153296e-05, "loss": 4.3938, "step": 984 }, { "epoch": 0.9128822984244671, "grad_norm": 0.026582978665828705, "learning_rate": 9.502621102378706e-05, "loss": 4.4062, "step": 985 }, { "epoch": 0.9138090824837812, "grad_norm": 0.028273189440369606, "learning_rate": 9.304862690324295e-05, "loss": 4.4, "step": 986 }, { "epoch": 0.9147358665430955, "grad_norm": 0.02678096853196621, "learning_rate": 9.109144718443679e-05, "loss": 4.4125, "step": 987 }, { "epoch": 0.9156626506024096, "grad_norm": 0.024335335940122604, "learning_rate": 8.915468845892894e-05, "loss": 4.4125, "step": 988 }, { "epoch": 0.9165894346617238, "grad_norm": 0.02453056164085865, "learning_rate": 8.72383671451668e-05, "loss": 4.4062, "step": 989 }, { "epoch": 0.917516218721038, "grad_norm": 0.025096192955970764, "learning_rate": 8.534249948834311e-05, "loss": 4.4437, "step": 990 }, { "epoch": 0.9184430027803522, "grad_norm": 0.025366991758346558, "learning_rate": 8.346710156026033e-05, "loss": 4.4062, "step": 991 }, { "epoch": 0.9193697868396663, "grad_norm": 0.02832290157675743, "learning_rate": 8.161218925919172e-05, "loss": 4.4625, "step": 992 }, { "epoch": 0.9202965708989805, "grad_norm": 0.027890915051102638, "learning_rate": 7.977777830974947e-05, "loss": 4.4375, "step": 993 }, { "epoch": 0.9212233549582948, "grad_norm": 0.027829816564917564, "learning_rate": 7.796388426274947e-05, "loss": 4.45, "step": 994 }, { "epoch": 0.9221501390176089, "grad_norm": 0.02420070953667164, "learning_rate": 7.61705224950801e-05, "loss": 4.4313, "step": 995 }, { "epoch": 0.9230769230769231, "grad_norm": 0.024921340867877007, "learning_rate": 7.43977082095726e-05, "loss": 4.4125, "step": 996 }, { "epoch": 0.9240037071362373, "grad_norm": 0.02533474750816822, "learning_rate": 7.264545643486997e-05, "loss": 4.4062, "step": 997 }, { "epoch": 0.9249304911955515, "grad_norm": 0.02694832719862461, "learning_rate": 7.091378202530224e-05, "loss": 4.375, "step": 998 }, { "epoch": 0.9258572752548656, "grad_norm": 0.024787478148937225, "learning_rate": 6.920269966075893e-05, "loss": 4.4125, "step": 999 }, { "epoch": 0.9267840593141798, "grad_norm": 0.02519523911178112, "learning_rate": 6.751222384656502e-05, "loss": 4.425, "step": 1000 }, { "epoch": 0.927710843373494, "grad_norm": 0.0249481238424778, "learning_rate": 6.584236891335804e-05, "loss": 4.45, "step": 1001 }, { "epoch": 0.9286376274328082, "grad_norm": 0.027095666155219078, "learning_rate": 6.419314901696671e-05, "loss": 4.4125, "step": 1002 }, { "epoch": 0.9295644114921223, "grad_norm": 0.026183003559708595, "learning_rate": 6.256457813828997e-05, "loss": 4.3938, "step": 1003 }, { "epoch": 0.9304911955514366, "grad_norm": 0.025982800871133804, "learning_rate": 6.095667008318068e-05, "loss": 4.4062, "step": 1004 }, { "epoch": 0.9314179796107507, "grad_norm": 0.027629397809505463, "learning_rate": 5.936943848232568e-05, "loss": 4.4625, "step": 1005 }, { "epoch": 0.9323447636700649, "grad_norm": 0.02437759004533291, "learning_rate": 5.78028967911326e-05, "loss": 4.425, "step": 1006 }, { "epoch": 0.933271547729379, "grad_norm": 0.024311203509569168, "learning_rate": 5.625705828961436e-05, "loss": 4.4375, "step": 1007 }, { "epoch": 0.9341983317886933, "grad_norm": 0.024223096668720245, "learning_rate": 5.473193608227789e-05, "loss": 4.4062, "step": 1008 }, { "epoch": 0.9351251158480074, "grad_norm": 0.023723123595118523, "learning_rate": 5.322754309801115e-05, "loss": 4.45, "step": 1009 }, { "epoch": 0.9360518999073216, "grad_norm": 0.02314998209476471, "learning_rate": 5.174389208997598e-05, "loss": 4.4188, "step": 1010 }, { "epoch": 0.9369786839666358, "grad_norm": 0.028589608147740364, "learning_rate": 5.0280995635497705e-05, "loss": 4.4375, "step": 1011 }, { "epoch": 0.93790546802595, "grad_norm": 0.023467648774385452, "learning_rate": 4.883886613595984e-05, "loss": 4.3938, "step": 1012 }, { "epoch": 0.9388322520852641, "grad_norm": 0.025684082880616188, "learning_rate": 4.74175158166984e-05, "loss": 4.4188, "step": 1013 }, { "epoch": 0.9397590361445783, "grad_norm": 0.028895532712340355, "learning_rate": 4.601695672689921e-05, "loss": 4.4375, "step": 1014 }, { "epoch": 0.9406858202038925, "grad_norm": 0.02598528377711773, "learning_rate": 4.463720073949351e-05, "loss": 4.4375, "step": 1015 }, { "epoch": 0.9416126042632067, "grad_norm": 0.025186927989125252, "learning_rate": 4.3278259551060015e-05, "loss": 4.4188, "step": 1016 }, { "epoch": 0.9425393883225208, "grad_norm": 0.02664157934486866, "learning_rate": 4.194014468172469e-05, "loss": 4.4313, "step": 1017 }, { "epoch": 0.943466172381835, "grad_norm": 0.02440650388598442, "learning_rate": 4.062286747506222e-05, "loss": 4.45, "step": 1018 }, { "epoch": 0.9443929564411492, "grad_norm": 0.022903352975845337, "learning_rate": 3.932643909800082e-05, "loss": 4.3875, "step": 1019 }, { "epoch": 0.9453197405004634, "grad_norm": 0.024947639554739, "learning_rate": 3.805087054072731e-05, "loss": 4.4375, "step": 1020 }, { "epoch": 0.9462465245597775, "grad_norm": 0.027093123644590378, "learning_rate": 3.6796172616594126e-05, "loss": 4.4188, "step": 1021 }, { "epoch": 0.9471733086190918, "grad_norm": 0.023427557200193405, "learning_rate": 3.5562355962027726e-05, "loss": 4.4625, "step": 1022 }, { "epoch": 0.9481000926784059, "grad_norm": 0.02435910701751709, "learning_rate": 3.434943103643728e-05, "loss": 4.4188, "step": 1023 }, { "epoch": 0.9490268767377201, "grad_norm": 0.025206558406352997, "learning_rate": 3.315740812212781e-05, "loss": 4.4062, "step": 1024 }, { "epoch": 0.9499536607970342, "grad_norm": 0.024215737357735634, "learning_rate": 3.198629732421188e-05, "loss": 4.4, "step": 1025 }, { "epoch": 0.9508804448563485, "grad_norm": 0.022633830085396767, "learning_rate": 3.0836108570524154e-05, "loss": 4.4062, "step": 1026 }, { "epoch": 0.9518072289156626, "grad_norm": 0.024218518286943436, "learning_rate": 2.9706851611537023e-05, "loss": 4.4938, "step": 1027 }, { "epoch": 0.9527340129749768, "grad_norm": 0.023550162091851234, "learning_rate": 2.8598536020278676e-05, "loss": 4.4, "step": 1028 }, { "epoch": 0.953660797034291, "grad_norm": 0.024799218401312828, "learning_rate": 2.7511171192250718e-05, "loss": 4.4375, "step": 1029 }, { "epoch": 0.9545875810936052, "grad_norm": 0.025713039562106133, "learning_rate": 2.6444766345350425e-05, "loss": 4.4062, "step": 1030 }, { "epoch": 0.9555143651529193, "grad_norm": 0.024386629462242126, "learning_rate": 2.539933051978971e-05, "loss": 4.4188, "step": 1031 }, { "epoch": 0.9564411492122336, "grad_norm": 0.025705767795443535, "learning_rate": 2.43748725780224e-05, "loss": 4.375, "step": 1032 }, { "epoch": 0.9573679332715477, "grad_norm": 0.026646282523870468, "learning_rate": 2.3371401204664577e-05, "loss": 4.45, "step": 1033 }, { "epoch": 0.9582947173308619, "grad_norm": 0.025327732786536217, "learning_rate": 2.238892490642547e-05, "loss": 4.4437, "step": 1034 }, { "epoch": 0.959221501390176, "grad_norm": 0.024950072169303894, "learning_rate": 2.142745201203139e-05, "loss": 4.45, "step": 1035 }, { "epoch": 0.9601482854494903, "grad_norm": 0.023224515840411186, "learning_rate": 2.048699067215831e-05, "loss": 4.4125, "step": 1036 }, { "epoch": 0.9610750695088045, "grad_norm": 0.024536075070500374, "learning_rate": 1.9567548859359963e-05, "loss": 4.45, "step": 1037 }, { "epoch": 0.9620018535681186, "grad_norm": 0.025291137397289276, "learning_rate": 1.866913436800316e-05, "loss": 4.4563, "step": 1038 }, { "epoch": 0.9629286376274329, "grad_norm": 0.023913368582725525, "learning_rate": 1.7791754814199255e-05, "loss": 4.4563, "step": 1039 }, { "epoch": 0.963855421686747, "grad_norm": 0.02541198581457138, "learning_rate": 1.693541763574058e-05, "loss": 4.45, "step": 1040 }, { "epoch": 0.9647822057460612, "grad_norm": 0.02386779710650444, "learning_rate": 1.6100130092037703e-05, "loss": 4.3812, "step": 1041 }, { "epoch": 0.9657089898053753, "grad_norm": 0.02432171255350113, "learning_rate": 1.528589926405727e-05, "loss": 4.4563, "step": 1042 }, { "epoch": 0.9666357738646896, "grad_norm": 0.026072759181261063, "learning_rate": 1.4492732054262603e-05, "loss": 4.4062, "step": 1043 }, { "epoch": 0.9675625579240037, "grad_norm": 0.02468552440404892, "learning_rate": 1.372063518655403e-05, "loss": 4.45, "step": 1044 }, { "epoch": 0.9684893419833179, "grad_norm": 0.023878788575530052, "learning_rate": 1.2969615206213369e-05, "loss": 4.4188, "step": 1045 }, { "epoch": 0.969416126042632, "grad_norm": 0.0231490395963192, "learning_rate": 1.223967847984786e-05, "loss": 4.4188, "step": 1046 }, { "epoch": 0.9703429101019463, "grad_norm": 0.024373695254325867, "learning_rate": 1.1530831195335767e-05, "loss": 4.4437, "step": 1047 }, { "epoch": 0.9712696941612604, "grad_norm": 0.02477751113474369, "learning_rate": 1.08430793617742e-05, "loss": 4.4188, "step": 1048 }, { "epoch": 0.9721964782205746, "grad_norm": 0.023831041529774666, "learning_rate": 1.0176428809428318e-05, "loss": 4.4813, "step": 1049 }, { "epoch": 0.9731232622798888, "grad_norm": 0.02483510971069336, "learning_rate": 9.530885189681649e-06, "loss": 4.4125, "step": 1050 }, { "epoch": 0.974050046339203, "grad_norm": 0.023760484531521797, "learning_rate": 8.906453974988626e-06, "loss": 4.4062, "step": 1051 }, { "epoch": 0.9749768303985171, "grad_norm": 0.02444753795862198, "learning_rate": 8.303140458827684e-06, "loss": 4.4062, "step": 1052 }, { "epoch": 0.9759036144578314, "grad_norm": 0.021337734535336494, "learning_rate": 7.720949755657125e-06, "loss": 4.4, "step": 1053 }, { "epoch": 0.9768303985171455, "grad_norm": 0.022071754559874535, "learning_rate": 7.159886800869875e-06, "loss": 4.425, "step": 1054 }, { "epoch": 0.9777571825764597, "grad_norm": 0.024915462359786034, "learning_rate": 6.6199563507549075e-06, "loss": 4.3938, "step": 1055 }, { "epoch": 0.9786839666357738, "grad_norm": 0.022235747426748276, "learning_rate": 6.1011629824533895e-06, "loss": 4.4, "step": 1056 }, { "epoch": 0.9796107506950881, "grad_norm": 0.02508777379989624, "learning_rate": 5.60351109392232e-06, "loss": 4.425, "step": 1057 }, { "epoch": 0.9805375347544022, "grad_norm": 0.02421114780008793, "learning_rate": 5.127004903896504e-06, "loss": 4.4688, "step": 1058 }, { "epoch": 0.9814643188137164, "grad_norm": 0.023330386728048325, "learning_rate": 4.6716484518524726e-06, "loss": 4.3875, "step": 1059 }, { "epoch": 0.9823911028730306, "grad_norm": 0.02507002279162407, "learning_rate": 4.237445597974343e-06, "loss": 4.4563, "step": 1060 }, { "epoch": 0.9833178869323448, "grad_norm": 0.023726079612970352, "learning_rate": 3.824400023121621e-06, "loss": 4.4688, "step": 1061 }, { "epoch": 0.9842446709916589, "grad_norm": 0.022975319996476173, "learning_rate": 3.4325152287975615e-06, "loss": 4.3938, "step": 1062 }, { "epoch": 0.9851714550509731, "grad_norm": 0.02411024458706379, "learning_rate": 3.061794537119467e-06, "loss": 4.4563, "step": 1063 }, { "epoch": 0.9860982391102873, "grad_norm": 0.022638075053691864, "learning_rate": 2.7122410907903794e-06, "loss": 4.4563, "step": 1064 }, { "epoch": 0.9870250231696015, "grad_norm": 0.023638809099793434, "learning_rate": 2.383857853073268e-06, "loss": 4.425, "step": 1065 }, { "epoch": 0.9879518072289156, "grad_norm": 0.02219136245548725, "learning_rate": 2.0766476077643813e-06, "loss": 4.4, "step": 1066 }, { "epoch": 0.9888785912882299, "grad_norm": 0.02723466046154499, "learning_rate": 1.7906129591713227e-06, "loss": 4.4437, "step": 1067 }, { "epoch": 0.989805375347544, "grad_norm": 0.024723384529352188, "learning_rate": 1.525756332090289e-06, "loss": 4.4, "step": 1068 }, { "epoch": 0.9907321594068582, "grad_norm": 0.023885123431682587, "learning_rate": 1.2820799717849775e-06, "loss": 4.4, "step": 1069 }, { "epoch": 0.9916589434661723, "grad_norm": 0.022805040702223778, "learning_rate": 1.059585943967989e-06, "loss": 4.4437, "step": 1070 }, { "epoch": 0.9925857275254866, "grad_norm": 0.023890964686870575, "learning_rate": 8.58276134784175e-07, "loss": 4.3812, "step": 1071 }, { "epoch": 0.9935125115848007, "grad_norm": 0.025231240317225456, "learning_rate": 6.781522507925964e-07, "loss": 4.3688, "step": 1072 }, { "epoch": 0.9944392956441149, "grad_norm": 0.021534454077482224, "learning_rate": 5.192158189543106e-07, "loss": 4.4938, "step": 1073 }, { "epoch": 0.995366079703429, "grad_norm": 0.023576676845550537, "learning_rate": 3.8146818661793925e-07, "loss": 4.4, "step": 1074 }, { "epoch": 0.9962928637627433, "grad_norm": 0.02641914412379265, "learning_rate": 2.6491052150884323e-07, "loss": 4.3625, "step": 1075 }, { "epoch": 0.9972196478220574, "grad_norm": 0.02341269887983799, "learning_rate": 1.6954381171885302e-07, "loss": 4.3812, "step": 1076 }, { "epoch": 0.9981464318813716, "grad_norm": 0.022809363901615143, "learning_rate": 9.536886569849746e-08, "loss": 4.4437, "step": 1077 }, { "epoch": 0.9990732159406858, "grad_norm": 0.023255689069628716, "learning_rate": 4.23863122495094e-08, "loss": 4.4437, "step": 1078 }, { "epoch": 1.0, "grad_norm": 0.03218919411301613, "learning_rate": 1.059660052010747e-08, "loss": 4.425, "step": 1079 } ], "logging_steps": 1, "max_steps": 1079, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.154917754792837e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }