{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4332965821389196, "eval_steps": 500, "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005512679162072767, "grad_norm": 12.100004196166992, "learning_rate": 2.9999999999999997e-05, "loss": 1.4741, "step": 1 }, { "epoch": 0.0011025358324145535, "grad_norm": 11.976073265075684, "learning_rate": 5.9999999999999995e-05, "loss": 1.4512, "step": 2 }, { "epoch": 0.0016538037486218302, "grad_norm": 4.930200576782227, "learning_rate": 8.999999999999999e-05, "loss": 1.3853, "step": 3 }, { "epoch": 0.002205071664829107, "grad_norm": 1.8625606298446655, "learning_rate": 0.00011999999999999999, "loss": 1.12, "step": 4 }, { "epoch": 0.0027563395810363835, "grad_norm": 1.4577418565750122, "learning_rate": 0.00015, "loss": 1.005, "step": 5 }, { "epoch": 0.0033076074972436605, "grad_norm": 1.1385219097137451, "learning_rate": 0.00017999999999999998, "loss": 0.8992, "step": 6 }, { "epoch": 0.003858875413450937, "grad_norm": 1.5815627574920654, "learning_rate": 0.00020999999999999998, "loss": 0.815, "step": 7 }, { "epoch": 0.004410143329658214, "grad_norm": 0.6205328702926636, "learning_rate": 0.00023999999999999998, "loss": 0.7967, "step": 8 }, { "epoch": 0.004961411245865491, "grad_norm": 1.6408820152282715, "learning_rate": 0.00027, "loss": 0.7702, "step": 9 }, { "epoch": 0.005512679162072767, "grad_norm": 0.8569570183753967, "learning_rate": 0.0003, "loss": 0.7845, "step": 10 }, { "epoch": 0.006063947078280044, "grad_norm": 0.67384272813797, "learning_rate": 0.0002999170812603648, "loss": 0.7192, "step": 11 }, { "epoch": 0.006615214994487321, "grad_norm": 2.0132830142974854, "learning_rate": 0.00029983416252072964, "loss": 0.7354, "step": 12 }, { "epoch": 0.007166482910694598, "grad_norm": 0.6772907972335815, "learning_rate": 0.0002997512437810945, "loss": 0.715, "step": 13 }, { "epoch": 0.007717750826901874, "grad_norm": 0.5798671245574951, "learning_rate": 0.00029966832504145936, "loss": 0.7477, "step": 14 }, { "epoch": 0.008269018743109152, "grad_norm": 0.49168965220451355, "learning_rate": 0.00029958540630182416, "loss": 0.713, "step": 15 }, { "epoch": 0.008820286659316428, "grad_norm": 0.478697806596756, "learning_rate": 0.000299502487562189, "loss": 0.6915, "step": 16 }, { "epoch": 0.009371554575523704, "grad_norm": 0.4884359538555145, "learning_rate": 0.0002994195688225539, "loss": 0.7305, "step": 17 }, { "epoch": 0.009922822491730982, "grad_norm": 0.4691940248012543, "learning_rate": 0.00029933665008291874, "loss": 0.6646, "step": 18 }, { "epoch": 0.010474090407938258, "grad_norm": 0.4946594834327698, "learning_rate": 0.00029925373134328354, "loss": 0.7137, "step": 19 }, { "epoch": 0.011025358324145534, "grad_norm": 0.4412364363670349, "learning_rate": 0.0002991708126036484, "loss": 0.7063, "step": 20 }, { "epoch": 0.011576626240352812, "grad_norm": 0.5092226266860962, "learning_rate": 0.0002990878938640132, "loss": 0.684, "step": 21 }, { "epoch": 0.012127894156560088, "grad_norm": 0.45330244302749634, "learning_rate": 0.00029900497512437807, "loss": 0.6677, "step": 22 }, { "epoch": 0.012679162072767364, "grad_norm": 0.4717816710472107, "learning_rate": 0.0002989220563847429, "loss": 0.6898, "step": 23 }, { "epoch": 0.013230429988974642, "grad_norm": 0.41348159313201904, "learning_rate": 0.0002988391376451078, "loss": 0.6735, "step": 24 }, { "epoch": 0.013781697905181918, "grad_norm": 0.44471853971481323, "learning_rate": 0.0002987562189054726, "loss": 0.6732, "step": 25 }, { "epoch": 0.014332965821389196, "grad_norm": 0.44660595059394836, "learning_rate": 0.00029867330016583745, "loss": 0.7058, "step": 26 }, { "epoch": 0.014884233737596472, "grad_norm": 0.3917936086654663, "learning_rate": 0.0002985903814262023, "loss": 0.6486, "step": 27 }, { "epoch": 0.015435501653803748, "grad_norm": 0.3844316899776459, "learning_rate": 0.00029850746268656717, "loss": 0.6726, "step": 28 }, { "epoch": 0.015986769570011026, "grad_norm": 0.38220199942588806, "learning_rate": 0.00029842454394693197, "loss": 0.6835, "step": 29 }, { "epoch": 0.016538037486218304, "grad_norm": 0.3823130428791046, "learning_rate": 0.00029834162520729683, "loss": 0.6818, "step": 30 }, { "epoch": 0.017089305402425578, "grad_norm": 0.3354315161705017, "learning_rate": 0.00029825870646766164, "loss": 0.6421, "step": 31 }, { "epoch": 0.017640573318632856, "grad_norm": 0.3261851966381073, "learning_rate": 0.0002981757877280265, "loss": 0.6254, "step": 32 }, { "epoch": 0.018191841234840134, "grad_norm": 0.3275938928127289, "learning_rate": 0.00029809286898839135, "loss": 0.6529, "step": 33 }, { "epoch": 0.018743109151047408, "grad_norm": 0.3375149667263031, "learning_rate": 0.0002980099502487562, "loss": 0.664, "step": 34 }, { "epoch": 0.019294377067254686, "grad_norm": 0.33320432901382446, "learning_rate": 0.000297927031509121, "loss": 0.6157, "step": 35 }, { "epoch": 0.019845644983461964, "grad_norm": 0.30827271938323975, "learning_rate": 0.0002978441127694859, "loss": 0.6418, "step": 36 }, { "epoch": 0.020396912899669238, "grad_norm": 0.3377619683742523, "learning_rate": 0.00029776119402985074, "loss": 0.6454, "step": 37 }, { "epoch": 0.020948180815876516, "grad_norm": 0.32735955715179443, "learning_rate": 0.0002976782752902156, "loss": 0.632, "step": 38 }, { "epoch": 0.021499448732083794, "grad_norm": 0.37884464859962463, "learning_rate": 0.0002975953565505804, "loss": 0.6223, "step": 39 }, { "epoch": 0.022050716648291068, "grad_norm": 0.3301836848258972, "learning_rate": 0.00029751243781094526, "loss": 0.6654, "step": 40 }, { "epoch": 0.022601984564498346, "grad_norm": 0.3196747303009033, "learning_rate": 0.00029742951907131006, "loss": 0.6445, "step": 41 }, { "epoch": 0.023153252480705624, "grad_norm": 0.3292658030986786, "learning_rate": 0.0002973466003316749, "loss": 0.6271, "step": 42 }, { "epoch": 0.023704520396912898, "grad_norm": 0.32541969418525696, "learning_rate": 0.0002972636815920398, "loss": 0.6217, "step": 43 }, { "epoch": 0.024255788313120176, "grad_norm": 0.3059806823730469, "learning_rate": 0.00029718076285240464, "loss": 0.6029, "step": 44 }, { "epoch": 0.024807056229327454, "grad_norm": 0.3427717983722687, "learning_rate": 0.00029709784411276945, "loss": 0.6523, "step": 45 }, { "epoch": 0.025358324145534728, "grad_norm": 0.33184289932250977, "learning_rate": 0.0002970149253731343, "loss": 0.6475, "step": 46 }, { "epoch": 0.025909592061742006, "grad_norm": 0.32376739382743835, "learning_rate": 0.00029693200663349917, "loss": 0.6588, "step": 47 }, { "epoch": 0.026460859977949284, "grad_norm": 0.30022457242012024, "learning_rate": 0.000296849087893864, "loss": 0.6316, "step": 48 }, { "epoch": 0.02701212789415656, "grad_norm": 0.3170008957386017, "learning_rate": 0.00029676616915422883, "loss": 0.5847, "step": 49 }, { "epoch": 0.027563395810363836, "grad_norm": 0.3455023765563965, "learning_rate": 0.0002966832504145937, "loss": 0.6668, "step": 50 }, { "epoch": 0.028114663726571114, "grad_norm": 0.3004387617111206, "learning_rate": 0.0002966003316749585, "loss": 0.6599, "step": 51 }, { "epoch": 0.02866593164277839, "grad_norm": 0.33361348509788513, "learning_rate": 0.00029651741293532335, "loss": 0.6502, "step": 52 }, { "epoch": 0.029217199558985666, "grad_norm": 0.34541115164756775, "learning_rate": 0.0002964344941956882, "loss": 0.6764, "step": 53 }, { "epoch": 0.029768467475192944, "grad_norm": 0.32801833748817444, "learning_rate": 0.00029635157545605307, "loss": 0.6347, "step": 54 }, { "epoch": 0.03031973539140022, "grad_norm": 0.30410563945770264, "learning_rate": 0.0002962686567164179, "loss": 0.6117, "step": 55 }, { "epoch": 0.030871003307607496, "grad_norm": 0.31390225887298584, "learning_rate": 0.00029618573797678274, "loss": 0.5973, "step": 56 }, { "epoch": 0.031422271223814774, "grad_norm": 0.34744319319725037, "learning_rate": 0.0002961028192371476, "loss": 0.6544, "step": 57 }, { "epoch": 0.03197353914002205, "grad_norm": 0.3452775180339813, "learning_rate": 0.0002960199004975124, "loss": 0.6234, "step": 58 }, { "epoch": 0.03252480705622933, "grad_norm": 0.34327036142349243, "learning_rate": 0.00029593698175787726, "loss": 0.6485, "step": 59 }, { "epoch": 0.03307607497243661, "grad_norm": 0.317579448223114, "learning_rate": 0.00029585406301824206, "loss": 0.6182, "step": 60 }, { "epoch": 0.03362734288864388, "grad_norm": 0.3586544692516327, "learning_rate": 0.0002957711442786069, "loss": 0.6149, "step": 61 }, { "epoch": 0.034178610804851156, "grad_norm": 0.3077372908592224, "learning_rate": 0.0002956882255389718, "loss": 0.5806, "step": 62 }, { "epoch": 0.034729878721058434, "grad_norm": 0.33191806077957153, "learning_rate": 0.00029560530679933664, "loss": 0.631, "step": 63 }, { "epoch": 0.03528114663726571, "grad_norm": 0.32726630568504333, "learning_rate": 0.00029552238805970145, "loss": 0.6364, "step": 64 }, { "epoch": 0.03583241455347299, "grad_norm": 0.3058015704154968, "learning_rate": 0.0002954394693200663, "loss": 0.6193, "step": 65 }, { "epoch": 0.03638368246968027, "grad_norm": 0.30789121985435486, "learning_rate": 0.00029535655058043116, "loss": 0.6322, "step": 66 }, { "epoch": 0.03693495038588754, "grad_norm": 0.33515268564224243, "learning_rate": 0.000295273631840796, "loss": 0.6581, "step": 67 }, { "epoch": 0.037486218302094816, "grad_norm": 0.3196898400783539, "learning_rate": 0.00029519071310116083, "loss": 0.6134, "step": 68 }, { "epoch": 0.038037486218302094, "grad_norm": 0.3255867660045624, "learning_rate": 0.0002951077943615257, "loss": 0.6176, "step": 69 }, { "epoch": 0.03858875413450937, "grad_norm": 0.3257988691329956, "learning_rate": 0.0002950248756218905, "loss": 0.6214, "step": 70 }, { "epoch": 0.03914002205071665, "grad_norm": 0.29037123918533325, "learning_rate": 0.00029494195688225535, "loss": 0.6098, "step": 71 }, { "epoch": 0.03969128996692393, "grad_norm": 0.3127928674221039, "learning_rate": 0.0002948590381426202, "loss": 0.6532, "step": 72 }, { "epoch": 0.0402425578831312, "grad_norm": 0.2821784019470215, "learning_rate": 0.00029477611940298507, "loss": 0.6101, "step": 73 }, { "epoch": 0.040793825799338476, "grad_norm": 0.2889716923236847, "learning_rate": 0.0002946932006633499, "loss": 0.6097, "step": 74 }, { "epoch": 0.041345093715545754, "grad_norm": 0.3002908527851105, "learning_rate": 0.00029461028192371473, "loss": 0.626, "step": 75 }, { "epoch": 0.04189636163175303, "grad_norm": 0.2943056523799896, "learning_rate": 0.0002945273631840796, "loss": 0.6061, "step": 76 }, { "epoch": 0.04244762954796031, "grad_norm": 0.31590160727500916, "learning_rate": 0.00029444444444444445, "loss": 0.6279, "step": 77 }, { "epoch": 0.04299889746416759, "grad_norm": 0.31002211570739746, "learning_rate": 0.00029436152570480926, "loss": 0.6066, "step": 78 }, { "epoch": 0.043550165380374865, "grad_norm": 0.27883172035217285, "learning_rate": 0.0002942786069651741, "loss": 0.6053, "step": 79 }, { "epoch": 0.044101433296582136, "grad_norm": 0.3098636567592621, "learning_rate": 0.0002941956882255389, "loss": 0.6041, "step": 80 }, { "epoch": 0.044652701212789414, "grad_norm": 0.31574317812919617, "learning_rate": 0.0002941127694859038, "loss": 0.6132, "step": 81 }, { "epoch": 0.04520396912899669, "grad_norm": 0.2871106266975403, "learning_rate": 0.00029402985074626864, "loss": 0.5759, "step": 82 }, { "epoch": 0.04575523704520397, "grad_norm": 0.2808583676815033, "learning_rate": 0.0002939469320066335, "loss": 0.583, "step": 83 }, { "epoch": 0.04630650496141125, "grad_norm": 0.29489415884017944, "learning_rate": 0.0002938640132669983, "loss": 0.6018, "step": 84 }, { "epoch": 0.046857772877618525, "grad_norm": 0.28468286991119385, "learning_rate": 0.00029378109452736316, "loss": 0.602, "step": 85 }, { "epoch": 0.047409040793825796, "grad_norm": 0.28690364956855774, "learning_rate": 0.000293698175787728, "loss": 0.5802, "step": 86 }, { "epoch": 0.047960308710033074, "grad_norm": 0.30015993118286133, "learning_rate": 0.0002936152570480929, "loss": 0.5889, "step": 87 }, { "epoch": 0.04851157662624035, "grad_norm": 0.3080478310585022, "learning_rate": 0.0002935323383084577, "loss": 0.6106, "step": 88 }, { "epoch": 0.04906284454244763, "grad_norm": 0.2852279245853424, "learning_rate": 0.00029344941956882254, "loss": 0.5902, "step": 89 }, { "epoch": 0.04961411245865491, "grad_norm": 0.2944631278514862, "learning_rate": 0.00029336650082918735, "loss": 0.6222, "step": 90 }, { "epoch": 0.050165380374862185, "grad_norm": 0.29476436972618103, "learning_rate": 0.0002932835820895522, "loss": 0.6151, "step": 91 }, { "epoch": 0.050716648291069456, "grad_norm": 0.2786809802055359, "learning_rate": 0.00029320066334991707, "loss": 0.5801, "step": 92 }, { "epoch": 0.051267916207276734, "grad_norm": 0.27844133973121643, "learning_rate": 0.0002931177446102819, "loss": 0.5708, "step": 93 }, { "epoch": 0.05181918412348401, "grad_norm": 0.2947113811969757, "learning_rate": 0.00029303482587064673, "loss": 0.5951, "step": 94 }, { "epoch": 0.05237045203969129, "grad_norm": 0.2926524877548218, "learning_rate": 0.0002929519071310116, "loss": 0.6281, "step": 95 }, { "epoch": 0.05292171995589857, "grad_norm": 0.27508488297462463, "learning_rate": 0.00029286898839137645, "loss": 0.5769, "step": 96 }, { "epoch": 0.053472987872105845, "grad_norm": 0.2983228862285614, "learning_rate": 0.0002927860696517413, "loss": 0.5808, "step": 97 }, { "epoch": 0.05402425578831312, "grad_norm": 0.28955212235450745, "learning_rate": 0.0002927031509121061, "loss": 0.6009, "step": 98 }, { "epoch": 0.054575523704520394, "grad_norm": 0.30267390608787537, "learning_rate": 0.0002926202321724709, "loss": 0.5938, "step": 99 }, { "epoch": 0.05512679162072767, "grad_norm": 0.2869952917098999, "learning_rate": 0.0002925373134328358, "loss": 0.5695, "step": 100 }, { "epoch": 0.05567805953693495, "grad_norm": 0.28908076882362366, "learning_rate": 0.00029245439469320064, "loss": 0.5904, "step": 101 }, { "epoch": 0.05622932745314223, "grad_norm": 0.2866143584251404, "learning_rate": 0.0002923714759535655, "loss": 0.5945, "step": 102 }, { "epoch": 0.056780595369349506, "grad_norm": 0.2788505554199219, "learning_rate": 0.0002922885572139303, "loss": 0.5861, "step": 103 }, { "epoch": 0.05733186328555678, "grad_norm": 0.2852947413921356, "learning_rate": 0.00029220563847429516, "loss": 0.6012, "step": 104 }, { "epoch": 0.057883131201764054, "grad_norm": 0.27692896127700806, "learning_rate": 0.00029212271973466, "loss": 0.5797, "step": 105 }, { "epoch": 0.05843439911797133, "grad_norm": 0.27395880222320557, "learning_rate": 0.0002920398009950249, "loss": 0.5854, "step": 106 }, { "epoch": 0.05898566703417861, "grad_norm": 0.2730069160461426, "learning_rate": 0.0002919568822553897, "loss": 0.5882, "step": 107 }, { "epoch": 0.05953693495038589, "grad_norm": 0.2808207869529724, "learning_rate": 0.00029187396351575454, "loss": 0.5868, "step": 108 }, { "epoch": 0.060088202866593166, "grad_norm": 0.26693934202194214, "learning_rate": 0.00029179104477611935, "loss": 0.5656, "step": 109 }, { "epoch": 0.06063947078280044, "grad_norm": 0.29277607798576355, "learning_rate": 0.0002917081260364842, "loss": 0.608, "step": 110 }, { "epoch": 0.061190738699007714, "grad_norm": 0.29922837018966675, "learning_rate": 0.00029162520729684907, "loss": 0.5952, "step": 111 }, { "epoch": 0.06174200661521499, "grad_norm": 0.26753753423690796, "learning_rate": 0.0002915422885572139, "loss": 0.5964, "step": 112 }, { "epoch": 0.06229327453142227, "grad_norm": 0.2910638451576233, "learning_rate": 0.00029145936981757873, "loss": 0.5822, "step": 113 }, { "epoch": 0.06284454244762955, "grad_norm": 0.3202199339866638, "learning_rate": 0.0002913764510779436, "loss": 0.5927, "step": 114 }, { "epoch": 0.06339581036383682, "grad_norm": 0.26713207364082336, "learning_rate": 0.00029129353233830845, "loss": 0.5698, "step": 115 }, { "epoch": 0.0639470782800441, "grad_norm": 0.3109968304634094, "learning_rate": 0.0002912106135986733, "loss": 0.5954, "step": 116 }, { "epoch": 0.06449834619625137, "grad_norm": 0.30233150720596313, "learning_rate": 0.0002911276948590381, "loss": 0.5941, "step": 117 }, { "epoch": 0.06504961411245866, "grad_norm": 0.28545138239860535, "learning_rate": 0.00029104477611940297, "loss": 0.5773, "step": 118 }, { "epoch": 0.06560088202866593, "grad_norm": 0.29633569717407227, "learning_rate": 0.0002909618573797678, "loss": 0.6014, "step": 119 }, { "epoch": 0.06615214994487321, "grad_norm": 0.29278406500816345, "learning_rate": 0.00029087893864013264, "loss": 0.6001, "step": 120 }, { "epoch": 0.06670341786108049, "grad_norm": 0.29871347546577454, "learning_rate": 0.0002907960199004975, "loss": 0.629, "step": 121 }, { "epoch": 0.06725468577728776, "grad_norm": 0.27272510528564453, "learning_rate": 0.00029071310116086235, "loss": 0.5502, "step": 122 }, { "epoch": 0.06780595369349504, "grad_norm": 0.2796414792537689, "learning_rate": 0.00029063018242122716, "loss": 0.5712, "step": 123 }, { "epoch": 0.06835722160970231, "grad_norm": 0.277700811624527, "learning_rate": 0.000290547263681592, "loss": 0.5654, "step": 124 }, { "epoch": 0.0689084895259096, "grad_norm": 0.2710396647453308, "learning_rate": 0.0002904643449419569, "loss": 0.5866, "step": 125 }, { "epoch": 0.06945975744211687, "grad_norm": 0.28910425305366516, "learning_rate": 0.00029038142620232174, "loss": 0.5679, "step": 126 }, { "epoch": 0.07001102535832414, "grad_norm": 0.2892954647541046, "learning_rate": 0.00029029850746268654, "loss": 0.5915, "step": 127 }, { "epoch": 0.07056229327453142, "grad_norm": 0.3241787552833557, "learning_rate": 0.0002902155887230514, "loss": 0.5818, "step": 128 }, { "epoch": 0.0711135611907387, "grad_norm": 0.29878735542297363, "learning_rate": 0.0002901326699834162, "loss": 0.5813, "step": 129 }, { "epoch": 0.07166482910694598, "grad_norm": 0.27833399176597595, "learning_rate": 0.00029004975124378106, "loss": 0.5865, "step": 130 }, { "epoch": 0.07221609702315325, "grad_norm": 0.3239665627479553, "learning_rate": 0.0002899668325041459, "loss": 0.5898, "step": 131 }, { "epoch": 0.07276736493936053, "grad_norm": 0.31001126766204834, "learning_rate": 0.0002898839137645108, "loss": 0.577, "step": 132 }, { "epoch": 0.0733186328555678, "grad_norm": 0.2673737704753876, "learning_rate": 0.0002898009950248756, "loss": 0.5684, "step": 133 }, { "epoch": 0.07386990077177508, "grad_norm": 0.3218002915382385, "learning_rate": 0.00028971807628524045, "loss": 0.5826, "step": 134 }, { "epoch": 0.07442116868798236, "grad_norm": 0.2867553234100342, "learning_rate": 0.00028963515754560525, "loss": 0.5679, "step": 135 }, { "epoch": 0.07497243660418963, "grad_norm": 0.2790491282939911, "learning_rate": 0.00028955223880597017, "loss": 0.5532, "step": 136 }, { "epoch": 0.07552370452039692, "grad_norm": 0.3101596534252167, "learning_rate": 0.00028946932006633497, "loss": 0.616, "step": 137 }, { "epoch": 0.07607497243660419, "grad_norm": 0.2670627534389496, "learning_rate": 0.00028938640132669983, "loss": 0.5147, "step": 138 }, { "epoch": 0.07662624035281147, "grad_norm": 0.28873148560523987, "learning_rate": 0.00028930348258706463, "loss": 0.5723, "step": 139 }, { "epoch": 0.07717750826901874, "grad_norm": 0.3042322099208832, "learning_rate": 0.0002892205638474295, "loss": 0.5483, "step": 140 }, { "epoch": 0.07772877618522601, "grad_norm": 0.30197396874427795, "learning_rate": 0.00028913764510779435, "loss": 0.5731, "step": 141 }, { "epoch": 0.0782800441014333, "grad_norm": 0.2676428258419037, "learning_rate": 0.0002890547263681592, "loss": 0.5384, "step": 142 }, { "epoch": 0.07883131201764057, "grad_norm": 0.2983885705471039, "learning_rate": 0.000288971807628524, "loss": 0.5777, "step": 143 }, { "epoch": 0.07938257993384785, "grad_norm": 0.3119770586490631, "learning_rate": 0.0002888888888888888, "loss": 0.5682, "step": 144 }, { "epoch": 0.07993384785005513, "grad_norm": 0.28664880990982056, "learning_rate": 0.0002888059701492537, "loss": 0.5875, "step": 145 }, { "epoch": 0.0804851157662624, "grad_norm": 0.2691631615161896, "learning_rate": 0.00028872305140961854, "loss": 0.5841, "step": 146 }, { "epoch": 0.08103638368246968, "grad_norm": 0.29469335079193115, "learning_rate": 0.0002886401326699834, "loss": 0.6111, "step": 147 }, { "epoch": 0.08158765159867695, "grad_norm": 0.27499398589134216, "learning_rate": 0.0002885572139303482, "loss": 0.5984, "step": 148 }, { "epoch": 0.08213891951488424, "grad_norm": 0.2869040369987488, "learning_rate": 0.00028847429519071306, "loss": 0.5862, "step": 149 }, { "epoch": 0.08269018743109151, "grad_norm": 0.25979968905448914, "learning_rate": 0.0002883913764510779, "loss": 0.5948, "step": 150 }, { "epoch": 0.08324145534729879, "grad_norm": 0.2581140398979187, "learning_rate": 0.0002883084577114428, "loss": 0.543, "step": 151 }, { "epoch": 0.08379272326350606, "grad_norm": 0.3241422474384308, "learning_rate": 0.0002882255389718076, "loss": 0.5584, "step": 152 }, { "epoch": 0.08434399117971333, "grad_norm": 0.3122616112232208, "learning_rate": 0.00028814262023217245, "loss": 0.6101, "step": 153 }, { "epoch": 0.08489525909592062, "grad_norm": 0.28104907274246216, "learning_rate": 0.00028805970149253725, "loss": 0.5721, "step": 154 }, { "epoch": 0.08544652701212789, "grad_norm": 0.32965442538261414, "learning_rate": 0.0002879767827529021, "loss": 0.5396, "step": 155 }, { "epoch": 0.08599779492833518, "grad_norm": 0.32811254262924194, "learning_rate": 0.00028789386401326697, "loss": 0.5819, "step": 156 }, { "epoch": 0.08654906284454245, "grad_norm": 0.3046472668647766, "learning_rate": 0.00028781094527363183, "loss": 0.5756, "step": 157 }, { "epoch": 0.08710033076074973, "grad_norm": 0.308413028717041, "learning_rate": 0.00028772802653399663, "loss": 0.611, "step": 158 }, { "epoch": 0.087651598676957, "grad_norm": 0.2636229693889618, "learning_rate": 0.0002876451077943615, "loss": 0.5608, "step": 159 }, { "epoch": 0.08820286659316427, "grad_norm": 0.29085874557495117, "learning_rate": 0.00028756218905472635, "loss": 0.553, "step": 160 }, { "epoch": 0.08875413450937156, "grad_norm": 0.2887280285358429, "learning_rate": 0.0002874792703150912, "loss": 0.5958, "step": 161 }, { "epoch": 0.08930540242557883, "grad_norm": 0.26728978753089905, "learning_rate": 0.000287396351575456, "loss": 0.5487, "step": 162 }, { "epoch": 0.08985667034178611, "grad_norm": 0.25967663526535034, "learning_rate": 0.0002873134328358209, "loss": 0.5657, "step": 163 }, { "epoch": 0.09040793825799338, "grad_norm": 0.2513408064842224, "learning_rate": 0.0002872305140961857, "loss": 0.5358, "step": 164 }, { "epoch": 0.09095920617420065, "grad_norm": 0.28536808490753174, "learning_rate": 0.00028714759535655054, "loss": 0.6057, "step": 165 }, { "epoch": 0.09151047409040794, "grad_norm": 0.28766608238220215, "learning_rate": 0.0002870646766169154, "loss": 0.6108, "step": 166 }, { "epoch": 0.09206174200661521, "grad_norm": 0.25628137588500977, "learning_rate": 0.00028698175787728026, "loss": 0.53, "step": 167 }, { "epoch": 0.0926130099228225, "grad_norm": 0.2983819246292114, "learning_rate": 0.00028689883913764506, "loss": 0.5997, "step": 168 }, { "epoch": 0.09316427783902977, "grad_norm": 0.27762502431869507, "learning_rate": 0.0002868159203980099, "loss": 0.5833, "step": 169 }, { "epoch": 0.09371554575523705, "grad_norm": 0.28496429324150085, "learning_rate": 0.0002867330016583748, "loss": 0.5863, "step": 170 }, { "epoch": 0.09426681367144432, "grad_norm": 0.26081910729408264, "learning_rate": 0.00028665008291873964, "loss": 0.5943, "step": 171 }, { "epoch": 0.09481808158765159, "grad_norm": 0.27544835209846497, "learning_rate": 0.00028656716417910444, "loss": 0.6175, "step": 172 }, { "epoch": 0.09536934950385888, "grad_norm": 0.2690446972846985, "learning_rate": 0.0002864842454394693, "loss": 0.5473, "step": 173 }, { "epoch": 0.09592061742006615, "grad_norm": 0.2816300690174103, "learning_rate": 0.0002864013266998341, "loss": 0.5908, "step": 174 }, { "epoch": 0.09647188533627343, "grad_norm": 0.26558321714401245, "learning_rate": 0.00028631840796019897, "loss": 0.5711, "step": 175 }, { "epoch": 0.0970231532524807, "grad_norm": 0.2692832946777344, "learning_rate": 0.0002862354892205638, "loss": 0.5731, "step": 176 }, { "epoch": 0.09757442116868799, "grad_norm": 0.2814270555973053, "learning_rate": 0.0002861525704809287, "loss": 0.5353, "step": 177 }, { "epoch": 0.09812568908489526, "grad_norm": 0.26562657952308655, "learning_rate": 0.0002860696517412935, "loss": 0.5955, "step": 178 }, { "epoch": 0.09867695700110253, "grad_norm": 0.2592059075832367, "learning_rate": 0.00028598673300165835, "loss": 0.5617, "step": 179 }, { "epoch": 0.09922822491730982, "grad_norm": 0.26579222083091736, "learning_rate": 0.0002859038142620232, "loss": 0.5725, "step": 180 }, { "epoch": 0.09977949283351709, "grad_norm": 0.2731139063835144, "learning_rate": 0.00028582089552238807, "loss": 0.5614, "step": 181 }, { "epoch": 0.10033076074972437, "grad_norm": 0.2470698207616806, "learning_rate": 0.00028573797678275287, "loss": 0.5347, "step": 182 }, { "epoch": 0.10088202866593164, "grad_norm": 0.24656972289085388, "learning_rate": 0.00028565505804311773, "loss": 0.5481, "step": 183 }, { "epoch": 0.10143329658213891, "grad_norm": 0.2857254445552826, "learning_rate": 0.00028557213930348254, "loss": 0.602, "step": 184 }, { "epoch": 0.1019845644983462, "grad_norm": 0.27286651730537415, "learning_rate": 0.0002854892205638474, "loss": 0.5585, "step": 185 }, { "epoch": 0.10253583241455347, "grad_norm": 0.2675493359565735, "learning_rate": 0.00028540630182421225, "loss": 0.567, "step": 186 }, { "epoch": 0.10308710033076075, "grad_norm": 0.26535746455192566, "learning_rate": 0.00028532338308457706, "loss": 0.5696, "step": 187 }, { "epoch": 0.10363836824696802, "grad_norm": 0.2633534371852875, "learning_rate": 0.0002852404643449419, "loss": 0.5326, "step": 188 }, { "epoch": 0.10418963616317531, "grad_norm": 0.2724531292915344, "learning_rate": 0.0002851575456053068, "loss": 0.5905, "step": 189 }, { "epoch": 0.10474090407938258, "grad_norm": 0.2680416405200958, "learning_rate": 0.00028507462686567164, "loss": 0.5924, "step": 190 }, { "epoch": 0.10529217199558985, "grad_norm": 0.28108882904052734, "learning_rate": 0.00028499170812603644, "loss": 0.5926, "step": 191 }, { "epoch": 0.10584343991179714, "grad_norm": 0.2787463366985321, "learning_rate": 0.0002849087893864013, "loss": 0.5699, "step": 192 }, { "epoch": 0.1063947078280044, "grad_norm": 0.2674010396003723, "learning_rate": 0.0002848258706467661, "loss": 0.587, "step": 193 }, { "epoch": 0.10694597574421169, "grad_norm": 0.27142807841300964, "learning_rate": 0.00028474295190713097, "loss": 0.5762, "step": 194 }, { "epoch": 0.10749724366041896, "grad_norm": 0.2817786633968353, "learning_rate": 0.0002846600331674958, "loss": 0.5672, "step": 195 }, { "epoch": 0.10804851157662625, "grad_norm": 0.250627338886261, "learning_rate": 0.0002845771144278607, "loss": 0.5425, "step": 196 }, { "epoch": 0.10859977949283352, "grad_norm": 0.2636951506137848, "learning_rate": 0.0002844941956882255, "loss": 0.579, "step": 197 }, { "epoch": 0.10915104740904079, "grad_norm": 0.2613438665866852, "learning_rate": 0.00028441127694859035, "loss": 0.5531, "step": 198 }, { "epoch": 0.10970231532524807, "grad_norm": 0.28677162528038025, "learning_rate": 0.0002843283582089552, "loss": 0.5875, "step": 199 }, { "epoch": 0.11025358324145534, "grad_norm": 0.2670292258262634, "learning_rate": 0.00028424543946932007, "loss": 0.5625, "step": 200 }, { "epoch": 0.11080485115766263, "grad_norm": 0.23815321922302246, "learning_rate": 0.00028416252072968487, "loss": 0.5484, "step": 201 }, { "epoch": 0.1113561190738699, "grad_norm": 0.2709272503852844, "learning_rate": 0.00028407960199004973, "loss": 0.5387, "step": 202 }, { "epoch": 0.11190738699007717, "grad_norm": 0.25918126106262207, "learning_rate": 0.00028399668325041453, "loss": 0.5686, "step": 203 }, { "epoch": 0.11245865490628446, "grad_norm": 0.27118560671806335, "learning_rate": 0.0002839137645107794, "loss": 0.5637, "step": 204 }, { "epoch": 0.11300992282249173, "grad_norm": 0.26395100355148315, "learning_rate": 0.00028383084577114425, "loss": 0.5499, "step": 205 }, { "epoch": 0.11356119073869901, "grad_norm": 0.272989422082901, "learning_rate": 0.0002837479270315091, "loss": 0.5606, "step": 206 }, { "epoch": 0.11411245865490628, "grad_norm": 0.2708880603313446, "learning_rate": 0.0002836650082918739, "loss": 0.534, "step": 207 }, { "epoch": 0.11466372657111357, "grad_norm": 0.28653857111930847, "learning_rate": 0.0002835820895522388, "loss": 0.5727, "step": 208 }, { "epoch": 0.11521499448732084, "grad_norm": 0.2767845392227173, "learning_rate": 0.00028349917081260364, "loss": 0.5664, "step": 209 }, { "epoch": 0.11576626240352811, "grad_norm": 0.27690836787223816, "learning_rate": 0.0002834162520729685, "loss": 0.5656, "step": 210 }, { "epoch": 0.1163175303197354, "grad_norm": 0.2831721007823944, "learning_rate": 0.0002833333333333333, "loss": 0.596, "step": 211 }, { "epoch": 0.11686879823594266, "grad_norm": 0.3024809658527374, "learning_rate": 0.00028325041459369816, "loss": 0.5849, "step": 212 }, { "epoch": 0.11742006615214995, "grad_norm": 0.2787605822086334, "learning_rate": 0.00028316749585406296, "loss": 0.5606, "step": 213 }, { "epoch": 0.11797133406835722, "grad_norm": 0.2734401226043701, "learning_rate": 0.0002830845771144278, "loss": 0.5524, "step": 214 }, { "epoch": 0.1185226019845645, "grad_norm": 0.2717944085597992, "learning_rate": 0.0002830016583747927, "loss": 0.5533, "step": 215 }, { "epoch": 0.11907386990077178, "grad_norm": 0.2634055018424988, "learning_rate": 0.00028291873963515754, "loss": 0.5552, "step": 216 }, { "epoch": 0.11962513781697905, "grad_norm": 0.27231520414352417, "learning_rate": 0.00028283582089552235, "loss": 0.5608, "step": 217 }, { "epoch": 0.12017640573318633, "grad_norm": 0.2709995210170746, "learning_rate": 0.0002827529021558872, "loss": 0.5608, "step": 218 }, { "epoch": 0.1207276736493936, "grad_norm": 0.24507290124893188, "learning_rate": 0.00028266998341625206, "loss": 0.5324, "step": 219 }, { "epoch": 0.12127894156560089, "grad_norm": 0.26341697573661804, "learning_rate": 0.0002825870646766169, "loss": 0.5686, "step": 220 }, { "epoch": 0.12183020948180816, "grad_norm": 0.2655317783355713, "learning_rate": 0.00028250414593698173, "loss": 0.5792, "step": 221 }, { "epoch": 0.12238147739801543, "grad_norm": 0.263235867023468, "learning_rate": 0.0002824212271973466, "loss": 0.5633, "step": 222 }, { "epoch": 0.12293274531422271, "grad_norm": 0.28087055683135986, "learning_rate": 0.0002823383084577114, "loss": 0.559, "step": 223 }, { "epoch": 0.12348401323042998, "grad_norm": 0.2734236717224121, "learning_rate": 0.00028225538971807625, "loss": 0.5772, "step": 224 }, { "epoch": 0.12403528114663727, "grad_norm": 0.2594766318798065, "learning_rate": 0.0002821724709784411, "loss": 0.5698, "step": 225 }, { "epoch": 0.12458654906284454, "grad_norm": 0.2490595132112503, "learning_rate": 0.00028208955223880597, "loss": 0.5419, "step": 226 }, { "epoch": 0.12513781697905182, "grad_norm": 0.25069767236709595, "learning_rate": 0.0002820066334991708, "loss": 0.531, "step": 227 }, { "epoch": 0.1256890848952591, "grad_norm": 0.2518230080604553, "learning_rate": 0.00028192371475953563, "loss": 0.5509, "step": 228 }, { "epoch": 0.12624035281146637, "grad_norm": 0.2488110512495041, "learning_rate": 0.0002818407960199005, "loss": 0.5341, "step": 229 }, { "epoch": 0.12679162072767364, "grad_norm": 0.26115381717681885, "learning_rate": 0.00028175787728026535, "loss": 0.5433, "step": 230 }, { "epoch": 0.12734288864388094, "grad_norm": 0.24792101979255676, "learning_rate": 0.00028167495854063016, "loss": 0.5672, "step": 231 }, { "epoch": 0.1278941565600882, "grad_norm": 0.2637925148010254, "learning_rate": 0.00028159203980099496, "loss": 0.5868, "step": 232 }, { "epoch": 0.12844542447629548, "grad_norm": 0.2799462676048279, "learning_rate": 0.0002815091210613598, "loss": 0.5514, "step": 233 }, { "epoch": 0.12899669239250275, "grad_norm": 0.2809968590736389, "learning_rate": 0.0002814262023217247, "loss": 0.5847, "step": 234 }, { "epoch": 0.12954796030871002, "grad_norm": 0.27108708024024963, "learning_rate": 0.00028134328358208954, "loss": 0.5718, "step": 235 }, { "epoch": 0.13009922822491732, "grad_norm": 0.2557702660560608, "learning_rate": 0.00028126036484245434, "loss": 0.575, "step": 236 }, { "epoch": 0.1306504961411246, "grad_norm": 0.2593226134777069, "learning_rate": 0.0002811774461028192, "loss": 0.5534, "step": 237 }, { "epoch": 0.13120176405733186, "grad_norm": 0.2657114565372467, "learning_rate": 0.00028109452736318406, "loss": 0.5605, "step": 238 }, { "epoch": 0.13175303197353913, "grad_norm": 0.25616228580474854, "learning_rate": 0.0002810116086235489, "loss": 0.5227, "step": 239 }, { "epoch": 0.13230429988974643, "grad_norm": 0.2749009430408478, "learning_rate": 0.0002809286898839137, "loss": 0.536, "step": 240 }, { "epoch": 0.1328555678059537, "grad_norm": 0.2617826759815216, "learning_rate": 0.0002808457711442786, "loss": 0.5602, "step": 241 }, { "epoch": 0.13340683572216097, "grad_norm": 0.2576202154159546, "learning_rate": 0.0002807628524046434, "loss": 0.5205, "step": 242 }, { "epoch": 0.13395810363836824, "grad_norm": 0.2764850854873657, "learning_rate": 0.00028067993366500825, "loss": 0.5752, "step": 243 }, { "epoch": 0.1345093715545755, "grad_norm": 0.2652502954006195, "learning_rate": 0.0002805970149253731, "loss": 0.5495, "step": 244 }, { "epoch": 0.1350606394707828, "grad_norm": 0.24600890278816223, "learning_rate": 0.00028051409618573797, "loss": 0.5146, "step": 245 }, { "epoch": 0.13561190738699008, "grad_norm": 0.253635048866272, "learning_rate": 0.0002804311774461028, "loss": 0.5483, "step": 246 }, { "epoch": 0.13616317530319735, "grad_norm": 0.24037104845046997, "learning_rate": 0.00028034825870646763, "loss": 0.5624, "step": 247 }, { "epoch": 0.13671444321940462, "grad_norm": 0.24676042795181274, "learning_rate": 0.0002802653399668325, "loss": 0.537, "step": 248 }, { "epoch": 0.1372657111356119, "grad_norm": 0.25283971428871155, "learning_rate": 0.00028018242122719735, "loss": 0.5705, "step": 249 }, { "epoch": 0.1378169790518192, "grad_norm": 0.2672947347164154, "learning_rate": 0.00028009950248756216, "loss": 0.5699, "step": 250 }, { "epoch": 0.13836824696802646, "grad_norm": 0.25930237770080566, "learning_rate": 0.000280016583747927, "loss": 0.5581, "step": 251 }, { "epoch": 0.13891951488423374, "grad_norm": 0.24674735963344574, "learning_rate": 0.0002799336650082918, "loss": 0.5282, "step": 252 }, { "epoch": 0.139470782800441, "grad_norm": 0.2826119065284729, "learning_rate": 0.0002798507462686567, "loss": 0.5261, "step": 253 }, { "epoch": 0.14002205071664828, "grad_norm": 0.290584534406662, "learning_rate": 0.00027976782752902154, "loss": 0.5245, "step": 254 }, { "epoch": 0.14057331863285558, "grad_norm": 0.25072574615478516, "learning_rate": 0.0002796849087893864, "loss": 0.5264, "step": 255 }, { "epoch": 0.14112458654906285, "grad_norm": 0.24929046630859375, "learning_rate": 0.0002796019900497512, "loss": 0.5698, "step": 256 }, { "epoch": 0.14167585446527012, "grad_norm": 0.24978522956371307, "learning_rate": 0.00027951907131011606, "loss": 0.5269, "step": 257 }, { "epoch": 0.1422271223814774, "grad_norm": 0.26195666193962097, "learning_rate": 0.0002794361525704809, "loss": 0.5801, "step": 258 }, { "epoch": 0.1427783902976847, "grad_norm": 0.27321335673332214, "learning_rate": 0.0002793532338308458, "loss": 0.5556, "step": 259 }, { "epoch": 0.14332965821389196, "grad_norm": 0.2694965898990631, "learning_rate": 0.0002792703150912106, "loss": 0.5715, "step": 260 }, { "epoch": 0.14388092613009923, "grad_norm": 0.2757553160190582, "learning_rate": 0.00027918739635157544, "loss": 0.5645, "step": 261 }, { "epoch": 0.1444321940463065, "grad_norm": 0.2602946162223816, "learning_rate": 0.00027910447761194025, "loss": 0.5703, "step": 262 }, { "epoch": 0.14498346196251377, "grad_norm": 0.24068838357925415, "learning_rate": 0.0002790215588723051, "loss": 0.5168, "step": 263 }, { "epoch": 0.14553472987872107, "grad_norm": 0.26140162348747253, "learning_rate": 0.00027893864013266997, "loss": 0.5271, "step": 264 }, { "epoch": 0.14608599779492834, "grad_norm": 0.26940983533859253, "learning_rate": 0.0002788557213930348, "loss": 0.5571, "step": 265 }, { "epoch": 0.1466372657111356, "grad_norm": 0.24524417519569397, "learning_rate": 0.00027877280265339963, "loss": 0.5227, "step": 266 }, { "epoch": 0.14718853362734288, "grad_norm": 0.2636984884738922, "learning_rate": 0.0002786898839137645, "loss": 0.5335, "step": 267 }, { "epoch": 0.14773980154355015, "grad_norm": 0.24600271880626678, "learning_rate": 0.00027860696517412935, "loss": 0.5601, "step": 268 }, { "epoch": 0.14829106945975745, "grad_norm": 0.24977444112300873, "learning_rate": 0.0002785240464344942, "loss": 0.5437, "step": 269 }, { "epoch": 0.14884233737596472, "grad_norm": 0.27960002422332764, "learning_rate": 0.000278441127694859, "loss": 0.548, "step": 270 }, { "epoch": 0.149393605292172, "grad_norm": 0.2514914870262146, "learning_rate": 0.00027835820895522387, "loss": 0.5335, "step": 271 }, { "epoch": 0.14994487320837926, "grad_norm": 0.2503030300140381, "learning_rate": 0.0002782752902155887, "loss": 0.5538, "step": 272 }, { "epoch": 0.15049614112458654, "grad_norm": 0.28311678767204285, "learning_rate": 0.00027819237147595354, "loss": 0.5649, "step": 273 }, { "epoch": 0.15104740904079383, "grad_norm": 0.27529653906822205, "learning_rate": 0.0002781094527363184, "loss": 0.5432, "step": 274 }, { "epoch": 0.1515986769570011, "grad_norm": 0.266111820936203, "learning_rate": 0.0002780265339966832, "loss": 0.5475, "step": 275 }, { "epoch": 0.15214994487320838, "grad_norm": 0.2525365352630615, "learning_rate": 0.00027794361525704806, "loss": 0.5252, "step": 276 }, { "epoch": 0.15270121278941565, "grad_norm": 0.2655681371688843, "learning_rate": 0.0002778606965174129, "loss": 0.5406, "step": 277 }, { "epoch": 0.15325248070562295, "grad_norm": 0.29118314385414124, "learning_rate": 0.0002777777777777778, "loss": 0.5324, "step": 278 }, { "epoch": 0.15380374862183022, "grad_norm": 0.2875930070877075, "learning_rate": 0.0002776948590381426, "loss": 0.5804, "step": 279 }, { "epoch": 0.1543550165380375, "grad_norm": 0.26764920353889465, "learning_rate": 0.00027761194029850744, "loss": 0.5391, "step": 280 }, { "epoch": 0.15490628445424476, "grad_norm": 0.2753891348838806, "learning_rate": 0.00027752902155887225, "loss": 0.5573, "step": 281 }, { "epoch": 0.15545755237045203, "grad_norm": 0.26174411177635193, "learning_rate": 0.0002774461028192371, "loss": 0.5543, "step": 282 }, { "epoch": 0.15600882028665933, "grad_norm": 0.25004303455352783, "learning_rate": 0.00027736318407960196, "loss": 0.5546, "step": 283 }, { "epoch": 0.1565600882028666, "grad_norm": 0.2634401023387909, "learning_rate": 0.0002772802653399668, "loss": 0.524, "step": 284 }, { "epoch": 0.15711135611907387, "grad_norm": 0.26751798391342163, "learning_rate": 0.00027719734660033163, "loss": 0.574, "step": 285 }, { "epoch": 0.15766262403528114, "grad_norm": 0.2556850016117096, "learning_rate": 0.0002771144278606965, "loss": 0.5533, "step": 286 }, { "epoch": 0.1582138919514884, "grad_norm": 0.2557762563228607, "learning_rate": 0.00027703150912106135, "loss": 0.546, "step": 287 }, { "epoch": 0.1587651598676957, "grad_norm": 0.25817009806632996, "learning_rate": 0.0002769485903814262, "loss": 0.5519, "step": 288 }, { "epoch": 0.15931642778390298, "grad_norm": 0.26580142974853516, "learning_rate": 0.000276865671641791, "loss": 0.5438, "step": 289 }, { "epoch": 0.15986769570011025, "grad_norm": 0.25780072808265686, "learning_rate": 0.00027678275290215587, "loss": 0.549, "step": 290 }, { "epoch": 0.16041896361631752, "grad_norm": 0.2627890706062317, "learning_rate": 0.0002766998341625207, "loss": 0.5565, "step": 291 }, { "epoch": 0.1609702315325248, "grad_norm": 0.26781341433525085, "learning_rate": 0.00027661691542288553, "loss": 0.542, "step": 292 }, { "epoch": 0.1615214994487321, "grad_norm": 0.253888338804245, "learning_rate": 0.0002765339966832504, "loss": 0.5424, "step": 293 }, { "epoch": 0.16207276736493936, "grad_norm": 0.2835153043270111, "learning_rate": 0.00027645107794361525, "loss": 0.5354, "step": 294 }, { "epoch": 0.16262403528114663, "grad_norm": 0.286640465259552, "learning_rate": 0.00027636815920398006, "loss": 0.5209, "step": 295 }, { "epoch": 0.1631753031973539, "grad_norm": 0.25742077827453613, "learning_rate": 0.0002762852404643449, "loss": 0.5198, "step": 296 }, { "epoch": 0.1637265711135612, "grad_norm": 0.24710626900196075, "learning_rate": 0.0002762023217247098, "loss": 0.5189, "step": 297 }, { "epoch": 0.16427783902976847, "grad_norm": 0.28113001585006714, "learning_rate": 0.00027611940298507464, "loss": 0.5519, "step": 298 }, { "epoch": 0.16482910694597575, "grad_norm": 0.2573966085910797, "learning_rate": 0.00027603648424543944, "loss": 0.5307, "step": 299 }, { "epoch": 0.16538037486218302, "grad_norm": 0.24416916072368622, "learning_rate": 0.0002759535655058043, "loss": 0.5519, "step": 300 }, { "epoch": 0.1659316427783903, "grad_norm": 0.25596654415130615, "learning_rate": 0.0002758706467661691, "loss": 0.5344, "step": 301 }, { "epoch": 0.16648291069459759, "grad_norm": 0.25158900022506714, "learning_rate": 0.00027578772802653396, "loss": 0.5399, "step": 302 }, { "epoch": 0.16703417861080486, "grad_norm": 0.24854016304016113, "learning_rate": 0.0002757048092868988, "loss": 0.5389, "step": 303 }, { "epoch": 0.16758544652701213, "grad_norm": 0.2592412233352661, "learning_rate": 0.0002756218905472637, "loss": 0.5584, "step": 304 }, { "epoch": 0.1681367144432194, "grad_norm": 0.2527318298816681, "learning_rate": 0.0002755389718076285, "loss": 0.5604, "step": 305 }, { "epoch": 0.16868798235942667, "grad_norm": 0.26560983061790466, "learning_rate": 0.00027545605306799335, "loss": 0.5561, "step": 306 }, { "epoch": 0.16923925027563397, "grad_norm": 0.2634880542755127, "learning_rate": 0.0002753731343283582, "loss": 0.5281, "step": 307 }, { "epoch": 0.16979051819184124, "grad_norm": 0.2732850909233093, "learning_rate": 0.00027529021558872306, "loss": 0.5398, "step": 308 }, { "epoch": 0.1703417861080485, "grad_norm": 0.23158006370067596, "learning_rate": 0.00027520729684908787, "loss": 0.5325, "step": 309 }, { "epoch": 0.17089305402425578, "grad_norm": 0.24649128317832947, "learning_rate": 0.00027512437810945273, "loss": 0.5381, "step": 310 }, { "epoch": 0.17144432194046305, "grad_norm": 0.2770949602127075, "learning_rate": 0.00027504145936981753, "loss": 0.5498, "step": 311 }, { "epoch": 0.17199558985667035, "grad_norm": 0.25388598442077637, "learning_rate": 0.0002749585406301824, "loss": 0.5389, "step": 312 }, { "epoch": 0.17254685777287762, "grad_norm": 0.2431599199771881, "learning_rate": 0.00027487562189054725, "loss": 0.5343, "step": 313 }, { "epoch": 0.1730981256890849, "grad_norm": 0.24289795756340027, "learning_rate": 0.0002747927031509121, "loss": 0.5073, "step": 314 }, { "epoch": 0.17364939360529216, "grad_norm": 0.2458408623933792, "learning_rate": 0.0002747097844112769, "loss": 0.5278, "step": 315 }, { "epoch": 0.17420066152149946, "grad_norm": 0.24127742648124695, "learning_rate": 0.0002746268656716418, "loss": 0.5345, "step": 316 }, { "epoch": 0.17475192943770673, "grad_norm": 0.26737701892852783, "learning_rate": 0.00027454394693200663, "loss": 0.5395, "step": 317 }, { "epoch": 0.175303197353914, "grad_norm": 0.26361507177352905, "learning_rate": 0.0002744610281923715, "loss": 0.5405, "step": 318 }, { "epoch": 0.17585446527012127, "grad_norm": 0.24210020899772644, "learning_rate": 0.0002743781094527363, "loss": 0.5268, "step": 319 }, { "epoch": 0.17640573318632854, "grad_norm": 0.2510232627391815, "learning_rate": 0.0002742951907131011, "loss": 0.5373, "step": 320 }, { "epoch": 0.17695700110253584, "grad_norm": 0.23939576745033264, "learning_rate": 0.00027421227197346596, "loss": 0.5561, "step": 321 }, { "epoch": 0.17750826901874311, "grad_norm": 0.273258239030838, "learning_rate": 0.0002741293532338308, "loss": 0.5507, "step": 322 }, { "epoch": 0.17805953693495039, "grad_norm": 0.23547501862049103, "learning_rate": 0.0002740464344941957, "loss": 0.5293, "step": 323 }, { "epoch": 0.17861080485115766, "grad_norm": 0.24796201288700104, "learning_rate": 0.0002739635157545605, "loss": 0.5378, "step": 324 }, { "epoch": 0.17916207276736493, "grad_norm": 0.23436011373996735, "learning_rate": 0.00027388059701492534, "loss": 0.5432, "step": 325 }, { "epoch": 0.17971334068357223, "grad_norm": 0.22892701625823975, "learning_rate": 0.0002737976782752902, "loss": 0.5221, "step": 326 }, { "epoch": 0.1802646085997795, "grad_norm": 0.23817826807498932, "learning_rate": 0.00027371475953565506, "loss": 0.5284, "step": 327 }, { "epoch": 0.18081587651598677, "grad_norm": 0.23703162372112274, "learning_rate": 0.00027363184079601987, "loss": 0.5223, "step": 328 }, { "epoch": 0.18136714443219404, "grad_norm": 0.24087084829807281, "learning_rate": 0.0002735489220563847, "loss": 0.5489, "step": 329 }, { "epoch": 0.1819184123484013, "grad_norm": 0.2529735267162323, "learning_rate": 0.00027346600331674953, "loss": 0.5485, "step": 330 }, { "epoch": 0.1824696802646086, "grad_norm": 0.23450088500976562, "learning_rate": 0.0002733830845771144, "loss": 0.4971, "step": 331 }, { "epoch": 0.18302094818081588, "grad_norm": 0.23895451426506042, "learning_rate": 0.00027330016583747925, "loss": 0.5165, "step": 332 }, { "epoch": 0.18357221609702315, "grad_norm": 0.24417142570018768, "learning_rate": 0.0002732172470978441, "loss": 0.5491, "step": 333 }, { "epoch": 0.18412348401323042, "grad_norm": 0.2527695596218109, "learning_rate": 0.0002731343283582089, "loss": 0.5255, "step": 334 }, { "epoch": 0.18467475192943772, "grad_norm": 0.24978198111057281, "learning_rate": 0.00027305140961857377, "loss": 0.5389, "step": 335 }, { "epoch": 0.185226019845645, "grad_norm": 0.2539977431297302, "learning_rate": 0.00027296849087893863, "loss": 0.5392, "step": 336 }, { "epoch": 0.18577728776185226, "grad_norm": 0.24033623933792114, "learning_rate": 0.0002728855721393035, "loss": 0.5356, "step": 337 }, { "epoch": 0.18632855567805953, "grad_norm": 0.24697022140026093, "learning_rate": 0.0002728026533996683, "loss": 0.5159, "step": 338 }, { "epoch": 0.1868798235942668, "grad_norm": 0.25741416215896606, "learning_rate": 0.00027271973466003315, "loss": 0.56, "step": 339 }, { "epoch": 0.1874310915104741, "grad_norm": 0.2324167639017105, "learning_rate": 0.00027263681592039796, "loss": 0.5379, "step": 340 }, { "epoch": 0.18798235942668137, "grad_norm": 0.24800144135951996, "learning_rate": 0.0002725538971807628, "loss": 0.5129, "step": 341 }, { "epoch": 0.18853362734288864, "grad_norm": 0.26905378699302673, "learning_rate": 0.0002724709784411277, "loss": 0.5226, "step": 342 }, { "epoch": 0.18908489525909591, "grad_norm": 0.25401249527931213, "learning_rate": 0.00027238805970149254, "loss": 0.5313, "step": 343 }, { "epoch": 0.18963616317530319, "grad_norm": 0.24307483434677124, "learning_rate": 0.00027230514096185734, "loss": 0.5427, "step": 344 }, { "epoch": 0.19018743109151048, "grad_norm": 0.25807374715805054, "learning_rate": 0.0002722222222222222, "loss": 0.524, "step": 345 }, { "epoch": 0.19073869900771775, "grad_norm": 0.2321993112564087, "learning_rate": 0.00027213930348258706, "loss": 0.5314, "step": 346 }, { "epoch": 0.19128996692392503, "grad_norm": 0.23558932542800903, "learning_rate": 0.0002720563847429519, "loss": 0.5223, "step": 347 }, { "epoch": 0.1918412348401323, "grad_norm": 0.25960054993629456, "learning_rate": 0.0002719734660033167, "loss": 0.5436, "step": 348 }, { "epoch": 0.19239250275633957, "grad_norm": 0.2273932248353958, "learning_rate": 0.0002718905472636816, "loss": 0.5048, "step": 349 }, { "epoch": 0.19294377067254687, "grad_norm": 0.2279786467552185, "learning_rate": 0.0002718076285240464, "loss": 0.5164, "step": 350 }, { "epoch": 0.19349503858875414, "grad_norm": 0.23833182454109192, "learning_rate": 0.00027172470978441125, "loss": 0.5378, "step": 351 }, { "epoch": 0.1940463065049614, "grad_norm": 0.2499193549156189, "learning_rate": 0.0002716417910447761, "loss": 0.5494, "step": 352 }, { "epoch": 0.19459757442116868, "grad_norm": 0.2734036147594452, "learning_rate": 0.00027155887230514097, "loss": 0.5391, "step": 353 }, { "epoch": 0.19514884233737598, "grad_norm": 0.25754764676094055, "learning_rate": 0.00027147595356550577, "loss": 0.5212, "step": 354 }, { "epoch": 0.19570011025358325, "grad_norm": 0.22964167594909668, "learning_rate": 0.00027139303482587063, "loss": 0.5301, "step": 355 }, { "epoch": 0.19625137816979052, "grad_norm": 0.24985463917255402, "learning_rate": 0.0002713101160862355, "loss": 0.5177, "step": 356 }, { "epoch": 0.1968026460859978, "grad_norm": 0.27296510338783264, "learning_rate": 0.00027122719734660035, "loss": 0.5443, "step": 357 }, { "epoch": 0.19735391400220506, "grad_norm": 0.2506982982158661, "learning_rate": 0.00027114427860696515, "loss": 0.5419, "step": 358 }, { "epoch": 0.19790518191841236, "grad_norm": 0.2600388526916504, "learning_rate": 0.00027106135986733, "loss": 0.5402, "step": 359 }, { "epoch": 0.19845644983461963, "grad_norm": 0.25040823221206665, "learning_rate": 0.0002709784411276948, "loss": 0.5463, "step": 360 }, { "epoch": 0.1990077177508269, "grad_norm": 0.25567591190338135, "learning_rate": 0.0002708955223880597, "loss": 0.5189, "step": 361 }, { "epoch": 0.19955898566703417, "grad_norm": 0.24336600303649902, "learning_rate": 0.00027081260364842454, "loss": 0.5393, "step": 362 }, { "epoch": 0.20011025358324144, "grad_norm": 0.23660831153392792, "learning_rate": 0.00027072968490878934, "loss": 0.5121, "step": 363 }, { "epoch": 0.20066152149944874, "grad_norm": 0.23589812219142914, "learning_rate": 0.0002706467661691542, "loss": 0.5016, "step": 364 }, { "epoch": 0.201212789415656, "grad_norm": 0.2517778277397156, "learning_rate": 0.000270563847429519, "loss": 0.5127, "step": 365 }, { "epoch": 0.20176405733186328, "grad_norm": 0.263662189245224, "learning_rate": 0.0002704809286898839, "loss": 0.5518, "step": 366 }, { "epoch": 0.20231532524807055, "grad_norm": 0.25211676955223083, "learning_rate": 0.0002703980099502487, "loss": 0.5362, "step": 367 }, { "epoch": 0.20286659316427783, "grad_norm": 0.22718675434589386, "learning_rate": 0.0002703150912106136, "loss": 0.5127, "step": 368 }, { "epoch": 0.20341786108048512, "grad_norm": 0.24481582641601562, "learning_rate": 0.0002702321724709784, "loss": 0.5084, "step": 369 }, { "epoch": 0.2039691289966924, "grad_norm": 0.2656586766242981, "learning_rate": 0.00027014925373134325, "loss": 0.5454, "step": 370 }, { "epoch": 0.20452039691289967, "grad_norm": 0.2491103559732437, "learning_rate": 0.0002700663349917081, "loss": 0.5412, "step": 371 }, { "epoch": 0.20507166482910694, "grad_norm": 0.252030611038208, "learning_rate": 0.00026998341625207296, "loss": 0.5761, "step": 372 }, { "epoch": 0.20562293274531424, "grad_norm": 0.24894152581691742, "learning_rate": 0.00026990049751243777, "loss": 0.5264, "step": 373 }, { "epoch": 0.2061742006615215, "grad_norm": 0.25231489539146423, "learning_rate": 0.00026981757877280263, "loss": 0.5295, "step": 374 }, { "epoch": 0.20672546857772878, "grad_norm": 0.25147655606269836, "learning_rate": 0.00026973466003316743, "loss": 0.5126, "step": 375 }, { "epoch": 0.20727673649393605, "grad_norm": 0.2379835844039917, "learning_rate": 0.0002696517412935323, "loss": 0.4937, "step": 376 }, { "epoch": 0.20782800441014332, "grad_norm": 0.24038439989089966, "learning_rate": 0.00026956882255389715, "loss": 0.5426, "step": 377 }, { "epoch": 0.20837927232635062, "grad_norm": 0.24591150879859924, "learning_rate": 0.000269485903814262, "loss": 0.5191, "step": 378 }, { "epoch": 0.2089305402425579, "grad_norm": 0.23723675310611725, "learning_rate": 0.0002694029850746268, "loss": 0.5247, "step": 379 }, { "epoch": 0.20948180815876516, "grad_norm": 0.2618078887462616, "learning_rate": 0.0002693200663349917, "loss": 0.5559, "step": 380 }, { "epoch": 0.21003307607497243, "grad_norm": 0.2556595504283905, "learning_rate": 0.00026923714759535653, "loss": 0.544, "step": 381 }, { "epoch": 0.2105843439911797, "grad_norm": 0.24010786414146423, "learning_rate": 0.0002691542288557214, "loss": 0.4958, "step": 382 }, { "epoch": 0.211135611907387, "grad_norm": 0.253151535987854, "learning_rate": 0.0002690713101160862, "loss": 0.5371, "step": 383 }, { "epoch": 0.21168687982359427, "grad_norm": 0.2715364694595337, "learning_rate": 0.00026898839137645106, "loss": 0.5788, "step": 384 }, { "epoch": 0.21223814773980154, "grad_norm": 0.2472977191209793, "learning_rate": 0.00026890547263681586, "loss": 0.5359, "step": 385 }, { "epoch": 0.2127894156560088, "grad_norm": 0.2925645411014557, "learning_rate": 0.0002688225538971807, "loss": 0.5373, "step": 386 }, { "epoch": 0.21334068357221608, "grad_norm": 0.23534104228019714, "learning_rate": 0.0002687396351575456, "loss": 0.5421, "step": 387 }, { "epoch": 0.21389195148842338, "grad_norm": 0.25397318601608276, "learning_rate": 0.00026865671641791044, "loss": 0.5538, "step": 388 }, { "epoch": 0.21444321940463065, "grad_norm": 0.26708152890205383, "learning_rate": 0.00026857379767827524, "loss": 0.5088, "step": 389 }, { "epoch": 0.21499448732083792, "grad_norm": 0.24131494760513306, "learning_rate": 0.0002684908789386401, "loss": 0.5215, "step": 390 }, { "epoch": 0.2155457552370452, "grad_norm": 0.25981369614601135, "learning_rate": 0.00026840796019900496, "loss": 0.5481, "step": 391 }, { "epoch": 0.2160970231532525, "grad_norm": 0.25831639766693115, "learning_rate": 0.0002683250414593698, "loss": 0.5352, "step": 392 }, { "epoch": 0.21664829106945976, "grad_norm": 0.24388836324214935, "learning_rate": 0.0002682421227197346, "loss": 0.5047, "step": 393 }, { "epoch": 0.21719955898566704, "grad_norm": 0.25614237785339355, "learning_rate": 0.0002681592039800995, "loss": 0.5236, "step": 394 }, { "epoch": 0.2177508269018743, "grad_norm": 0.23628944158554077, "learning_rate": 0.0002680762852404643, "loss": 0.5118, "step": 395 }, { "epoch": 0.21830209481808158, "grad_norm": 0.25390875339508057, "learning_rate": 0.00026799336650082915, "loss": 0.5231, "step": 396 }, { "epoch": 0.21885336273428888, "grad_norm": 0.27364251017570496, "learning_rate": 0.000267910447761194, "loss": 0.5573, "step": 397 }, { "epoch": 0.21940463065049615, "grad_norm": 0.25110650062561035, "learning_rate": 0.00026782752902155887, "loss": 0.5078, "step": 398 }, { "epoch": 0.21995589856670342, "grad_norm": 0.24438323080539703, "learning_rate": 0.0002677446102819237, "loss": 0.5026, "step": 399 }, { "epoch": 0.2205071664829107, "grad_norm": 0.23745465278625488, "learning_rate": 0.00026766169154228853, "loss": 0.5568, "step": 400 }, { "epoch": 0.22105843439911796, "grad_norm": 0.25559869408607483, "learning_rate": 0.0002675787728026534, "loss": 0.5286, "step": 401 }, { "epoch": 0.22160970231532526, "grad_norm": 0.24587516486644745, "learning_rate": 0.00026749585406301825, "loss": 0.5258, "step": 402 }, { "epoch": 0.22216097023153253, "grad_norm": 0.26151949167251587, "learning_rate": 0.00026741293532338306, "loss": 0.5426, "step": 403 }, { "epoch": 0.2227122381477398, "grad_norm": 0.2910129427909851, "learning_rate": 0.0002673300165837479, "loss": 0.5376, "step": 404 }, { "epoch": 0.22326350606394707, "grad_norm": 0.28276947140693665, "learning_rate": 0.0002672470978441127, "loss": 0.5271, "step": 405 }, { "epoch": 0.22381477398015434, "grad_norm": 0.25096046924591064, "learning_rate": 0.0002671641791044776, "loss": 0.5439, "step": 406 }, { "epoch": 0.22436604189636164, "grad_norm": 0.2461530715227127, "learning_rate": 0.00026708126036484244, "loss": 0.5239, "step": 407 }, { "epoch": 0.2249173098125689, "grad_norm": 0.2833070456981659, "learning_rate": 0.00026699834162520724, "loss": 0.531, "step": 408 }, { "epoch": 0.22546857772877618, "grad_norm": 0.24600760638713837, "learning_rate": 0.0002669154228855721, "loss": 0.5419, "step": 409 }, { "epoch": 0.22601984564498345, "grad_norm": 0.2620793581008911, "learning_rate": 0.00026683250414593696, "loss": 0.5033, "step": 410 }, { "epoch": 0.22657111356119075, "grad_norm": 0.27523407340049744, "learning_rate": 0.0002667495854063018, "loss": 0.5257, "step": 411 }, { "epoch": 0.22712238147739802, "grad_norm": 0.2630368769168854, "learning_rate": 0.0002666666666666666, "loss": 0.5156, "step": 412 }, { "epoch": 0.2276736493936053, "grad_norm": 0.24897338449954987, "learning_rate": 0.0002665837479270315, "loss": 0.5301, "step": 413 }, { "epoch": 0.22822491730981256, "grad_norm": 0.26213693618774414, "learning_rate": 0.0002665008291873963, "loss": 0.5563, "step": 414 }, { "epoch": 0.22877618522601983, "grad_norm": 0.23822888731956482, "learning_rate": 0.00026641791044776115, "loss": 0.5273, "step": 415 }, { "epoch": 0.22932745314222713, "grad_norm": 0.22970083355903625, "learning_rate": 0.000266334991708126, "loss": 0.5321, "step": 416 }, { "epoch": 0.2298787210584344, "grad_norm": 0.26430296897888184, "learning_rate": 0.00026625207296849087, "loss": 0.5539, "step": 417 }, { "epoch": 0.23042998897464168, "grad_norm": 0.25960785150527954, "learning_rate": 0.00026616915422885567, "loss": 0.5357, "step": 418 }, { "epoch": 0.23098125689084895, "grad_norm": 0.23449423909187317, "learning_rate": 0.00026608623548922053, "loss": 0.5143, "step": 419 }, { "epoch": 0.23153252480705622, "grad_norm": 0.2795349061489105, "learning_rate": 0.0002660033167495854, "loss": 0.5363, "step": 420 }, { "epoch": 0.23208379272326352, "grad_norm": 0.2637255787849426, "learning_rate": 0.00026592039800995025, "loss": 0.5607, "step": 421 }, { "epoch": 0.2326350606394708, "grad_norm": 0.23269203305244446, "learning_rate": 0.00026583747927031505, "loss": 0.5239, "step": 422 }, { "epoch": 0.23318632855567806, "grad_norm": 0.2501350939273834, "learning_rate": 0.0002657545605306799, "loss": 0.5303, "step": 423 }, { "epoch": 0.23373759647188533, "grad_norm": 0.25998207926750183, "learning_rate": 0.0002656716417910447, "loss": 0.5258, "step": 424 }, { "epoch": 0.2342888643880926, "grad_norm": 0.25762224197387695, "learning_rate": 0.0002655887230514096, "loss": 0.5427, "step": 425 }, { "epoch": 0.2348401323042999, "grad_norm": 0.2542650103569031, "learning_rate": 0.00026550580431177444, "loss": 0.5363, "step": 426 }, { "epoch": 0.23539140022050717, "grad_norm": 0.24817922711372375, "learning_rate": 0.0002654228855721393, "loss": 0.5294, "step": 427 }, { "epoch": 0.23594266813671444, "grad_norm": 0.23553630709648132, "learning_rate": 0.0002653399668325041, "loss": 0.5401, "step": 428 }, { "epoch": 0.2364939360529217, "grad_norm": 0.2774706184864044, "learning_rate": 0.00026525704809286896, "loss": 0.5352, "step": 429 }, { "epoch": 0.237045203969129, "grad_norm": 0.2383023351430893, "learning_rate": 0.0002651741293532338, "loss": 0.5243, "step": 430 }, { "epoch": 0.23759647188533628, "grad_norm": 0.23838096857070923, "learning_rate": 0.0002650912106135987, "loss": 0.5336, "step": 431 }, { "epoch": 0.23814773980154355, "grad_norm": 0.2416170984506607, "learning_rate": 0.0002650082918739635, "loss": 0.5044, "step": 432 }, { "epoch": 0.23869900771775082, "grad_norm": 0.24407121539115906, "learning_rate": 0.00026492537313432834, "loss": 0.5383, "step": 433 }, { "epoch": 0.2392502756339581, "grad_norm": 0.26349690556526184, "learning_rate": 0.00026484245439469315, "loss": 0.5553, "step": 434 }, { "epoch": 0.2398015435501654, "grad_norm": 0.27343693375587463, "learning_rate": 0.000264759535655058, "loss": 0.5593, "step": 435 }, { "epoch": 0.24035281146637266, "grad_norm": 0.22751976549625397, "learning_rate": 0.00026467661691542287, "loss": 0.5254, "step": 436 }, { "epoch": 0.24090407938257993, "grad_norm": 0.2342759519815445, "learning_rate": 0.0002645936981757877, "loss": 0.5076, "step": 437 }, { "epoch": 0.2414553472987872, "grad_norm": 0.25039923191070557, "learning_rate": 0.00026451077943615253, "loss": 0.4816, "step": 438 }, { "epoch": 0.24200661521499447, "grad_norm": 0.24585099518299103, "learning_rate": 0.0002644278606965174, "loss": 0.5132, "step": 439 }, { "epoch": 0.24255788313120177, "grad_norm": 0.24062813818454742, "learning_rate": 0.00026434494195688225, "loss": 0.5152, "step": 440 }, { "epoch": 0.24310915104740904, "grad_norm": 0.23549048602581024, "learning_rate": 0.0002642620232172471, "loss": 0.5201, "step": 441 }, { "epoch": 0.24366041896361632, "grad_norm": 0.24712547659873962, "learning_rate": 0.0002641791044776119, "loss": 0.5252, "step": 442 }, { "epoch": 0.2442116868798236, "grad_norm": 0.25113359093666077, "learning_rate": 0.00026409618573797677, "loss": 0.5593, "step": 443 }, { "epoch": 0.24476295479603086, "grad_norm": 0.24021007120609283, "learning_rate": 0.0002640132669983416, "loss": 0.5338, "step": 444 }, { "epoch": 0.24531422271223816, "grad_norm": 0.23334236443042755, "learning_rate": 0.00026393034825870643, "loss": 0.4842, "step": 445 }, { "epoch": 0.24586549062844543, "grad_norm": 0.25075432658195496, "learning_rate": 0.0002638474295190713, "loss": 0.5498, "step": 446 }, { "epoch": 0.2464167585446527, "grad_norm": 0.23466569185256958, "learning_rate": 0.00026376451077943615, "loss": 0.5125, "step": 447 }, { "epoch": 0.24696802646085997, "grad_norm": 0.23975308239459991, "learning_rate": 0.00026368159203980096, "loss": 0.5315, "step": 448 }, { "epoch": 0.24751929437706727, "grad_norm": 0.227213054895401, "learning_rate": 0.0002635986733001658, "loss": 0.4826, "step": 449 }, { "epoch": 0.24807056229327454, "grad_norm": 0.23588328063488007, "learning_rate": 0.0002635157545605307, "loss": 0.4902, "step": 450 }, { "epoch": 0.2486218302094818, "grad_norm": 0.24110263586044312, "learning_rate": 0.00026343283582089554, "loss": 0.5152, "step": 451 }, { "epoch": 0.24917309812568908, "grad_norm": 0.24417544901371002, "learning_rate": 0.00026334991708126034, "loss": 0.5326, "step": 452 }, { "epoch": 0.24972436604189635, "grad_norm": 0.24150699377059937, "learning_rate": 0.00026326699834162515, "loss": 0.547, "step": 453 }, { "epoch": 0.25027563395810365, "grad_norm": 0.26009777188301086, "learning_rate": 0.00026318407960199, "loss": 0.5315, "step": 454 }, { "epoch": 0.2508269018743109, "grad_norm": 0.2537683844566345, "learning_rate": 0.00026310116086235486, "loss": 0.5304, "step": 455 }, { "epoch": 0.2513781697905182, "grad_norm": 0.2526278495788574, "learning_rate": 0.0002630182421227197, "loss": 0.5194, "step": 456 }, { "epoch": 0.2519294377067255, "grad_norm": 0.24355928599834442, "learning_rate": 0.00026293532338308453, "loss": 0.5096, "step": 457 }, { "epoch": 0.25248070562293273, "grad_norm": 0.243259459733963, "learning_rate": 0.0002628524046434494, "loss": 0.4971, "step": 458 }, { "epoch": 0.25303197353914003, "grad_norm": 0.2597525417804718, "learning_rate": 0.00026276948590381425, "loss": 0.5224, "step": 459 }, { "epoch": 0.2535832414553473, "grad_norm": 0.2498249113559723, "learning_rate": 0.0002626865671641791, "loss": 0.506, "step": 460 }, { "epoch": 0.2541345093715546, "grad_norm": 0.21408714354038239, "learning_rate": 0.0002626036484245439, "loss": 0.5076, "step": 461 }, { "epoch": 0.25468577728776187, "grad_norm": 0.25370824337005615, "learning_rate": 0.00026252072968490877, "loss": 0.5065, "step": 462 }, { "epoch": 0.2552370452039691, "grad_norm": 0.25148823857307434, "learning_rate": 0.0002624378109452736, "loss": 0.4932, "step": 463 }, { "epoch": 0.2557883131201764, "grad_norm": 0.24903985857963562, "learning_rate": 0.00026235489220563843, "loss": 0.5366, "step": 464 }, { "epoch": 0.25633958103638366, "grad_norm": 0.2521916329860687, "learning_rate": 0.0002622719734660033, "loss": 0.5392, "step": 465 }, { "epoch": 0.25689084895259096, "grad_norm": 0.24553993344306946, "learning_rate": 0.00026218905472636815, "loss": 0.5382, "step": 466 }, { "epoch": 0.25744211686879825, "grad_norm": 0.23382090032100677, "learning_rate": 0.00026210613598673296, "loss": 0.523, "step": 467 }, { "epoch": 0.2579933847850055, "grad_norm": 0.25337761640548706, "learning_rate": 0.0002620232172470978, "loss": 0.5147, "step": 468 }, { "epoch": 0.2585446527012128, "grad_norm": 0.25433778762817383, "learning_rate": 0.0002619402985074627, "loss": 0.5012, "step": 469 }, { "epoch": 0.25909592061742004, "grad_norm": 0.2362672984600067, "learning_rate": 0.00026185737976782753, "loss": 0.5328, "step": 470 }, { "epoch": 0.25964718853362734, "grad_norm": 0.241427481174469, "learning_rate": 0.00026177446102819234, "loss": 0.5207, "step": 471 }, { "epoch": 0.26019845644983464, "grad_norm": 0.24943798780441284, "learning_rate": 0.0002616915422885572, "loss": 0.5607, "step": 472 }, { "epoch": 0.2607497243660419, "grad_norm": 0.21813860535621643, "learning_rate": 0.000261608623548922, "loss": 0.5036, "step": 473 }, { "epoch": 0.2613009922822492, "grad_norm": 0.22680509090423584, "learning_rate": 0.00026152570480928686, "loss": 0.4765, "step": 474 }, { "epoch": 0.2618522601984565, "grad_norm": 0.23577630519866943, "learning_rate": 0.0002614427860696517, "loss": 0.5267, "step": 475 }, { "epoch": 0.2624035281146637, "grad_norm": 0.22560511529445648, "learning_rate": 0.0002613598673300166, "loss": 0.5089, "step": 476 }, { "epoch": 0.262954796030871, "grad_norm": 0.2485722452402115, "learning_rate": 0.0002612769485903814, "loss": 0.5231, "step": 477 }, { "epoch": 0.26350606394707826, "grad_norm": 0.2396019846200943, "learning_rate": 0.00026119402985074624, "loss": 0.515, "step": 478 }, { "epoch": 0.26405733186328556, "grad_norm": 0.24977676570415497, "learning_rate": 0.0002611111111111111, "loss": 0.5303, "step": 479 }, { "epoch": 0.26460859977949286, "grad_norm": 0.2788902521133423, "learning_rate": 0.00026102819237147596, "loss": 0.5324, "step": 480 }, { "epoch": 0.2651598676957001, "grad_norm": 0.2515452802181244, "learning_rate": 0.00026094527363184077, "loss": 0.5373, "step": 481 }, { "epoch": 0.2657111356119074, "grad_norm": 0.2408224493265152, "learning_rate": 0.0002608623548922056, "loss": 0.5021, "step": 482 }, { "epoch": 0.26626240352811464, "grad_norm": 0.25597700476646423, "learning_rate": 0.00026077943615257043, "loss": 0.5292, "step": 483 }, { "epoch": 0.26681367144432194, "grad_norm": 0.24885378777980804, "learning_rate": 0.0002606965174129353, "loss": 0.5047, "step": 484 }, { "epoch": 0.26736493936052924, "grad_norm": 0.24355795979499817, "learning_rate": 0.00026061359867330015, "loss": 0.5258, "step": 485 }, { "epoch": 0.2679162072767365, "grad_norm": 0.2580486238002777, "learning_rate": 0.000260530679933665, "loss": 0.5533, "step": 486 }, { "epoch": 0.2684674751929438, "grad_norm": 0.27081531286239624, "learning_rate": 0.0002604477611940298, "loss": 0.525, "step": 487 }, { "epoch": 0.269018743109151, "grad_norm": 0.2559351325035095, "learning_rate": 0.0002603648424543947, "loss": 0.5074, "step": 488 }, { "epoch": 0.2695700110253583, "grad_norm": 0.2617773711681366, "learning_rate": 0.00026028192371475953, "loss": 0.5244, "step": 489 }, { "epoch": 0.2701212789415656, "grad_norm": 0.23218858242034912, "learning_rate": 0.0002601990049751244, "loss": 0.5048, "step": 490 }, { "epoch": 0.27067254685777287, "grad_norm": 0.24924521148204803, "learning_rate": 0.0002601160862354892, "loss": 0.521, "step": 491 }, { "epoch": 0.27122381477398017, "grad_norm": 0.26815906167030334, "learning_rate": 0.00026003316749585406, "loss": 0.5574, "step": 492 }, { "epoch": 0.2717750826901874, "grad_norm": 0.240220308303833, "learning_rate": 0.00025995024875621886, "loss": 0.483, "step": 493 }, { "epoch": 0.2723263506063947, "grad_norm": 0.24979090690612793, "learning_rate": 0.0002598673300165837, "loss": 0.5262, "step": 494 }, { "epoch": 0.272877618522602, "grad_norm": 0.24111522734165192, "learning_rate": 0.0002597844112769486, "loss": 0.5068, "step": 495 }, { "epoch": 0.27342888643880925, "grad_norm": 0.2612921893596649, "learning_rate": 0.0002597014925373134, "loss": 0.519, "step": 496 }, { "epoch": 0.27398015435501655, "grad_norm": 0.24324454367160797, "learning_rate": 0.00025961857379767824, "loss": 0.4826, "step": 497 }, { "epoch": 0.2745314222712238, "grad_norm": 0.2406265288591385, "learning_rate": 0.0002595356550580431, "loss": 0.5223, "step": 498 }, { "epoch": 0.2750826901874311, "grad_norm": 0.2597537934780121, "learning_rate": 0.00025945273631840796, "loss": 0.535, "step": 499 }, { "epoch": 0.2756339581036384, "grad_norm": 0.2446909099817276, "learning_rate": 0.00025936981757877277, "loss": 0.5108, "step": 500 }, { "epoch": 0.2756339581036384, "eval_loss": 0.5157487988471985, "eval_runtime": 312.0533, "eval_samples_per_second": 3.733, "eval_steps_per_second": 0.468, "step": 500 }, { "epoch": 0.27618522601984563, "grad_norm": 0.2623630166053772, "learning_rate": 0.0002592868988391376, "loss": 0.5414, "step": 501 }, { "epoch": 0.27673649393605293, "grad_norm": 0.2578775882720947, "learning_rate": 0.00025920398009950243, "loss": 0.5121, "step": 502 }, { "epoch": 0.2772877618522602, "grad_norm": 0.23712347447872162, "learning_rate": 0.0002591210613598673, "loss": 0.5085, "step": 503 }, { "epoch": 0.27783902976846747, "grad_norm": 0.22108785808086395, "learning_rate": 0.00025903814262023215, "loss": 0.5202, "step": 504 }, { "epoch": 0.27839029768467477, "grad_norm": 0.25034549832344055, "learning_rate": 0.000258955223880597, "loss": 0.5389, "step": 505 }, { "epoch": 0.278941565600882, "grad_norm": 0.21812468767166138, "learning_rate": 0.0002588723051409618, "loss": 0.4994, "step": 506 }, { "epoch": 0.2794928335170893, "grad_norm": 0.22681641578674316, "learning_rate": 0.00025878938640132667, "loss": 0.5219, "step": 507 }, { "epoch": 0.28004410143329656, "grad_norm": 0.25568950176239014, "learning_rate": 0.00025870646766169153, "loss": 0.5188, "step": 508 }, { "epoch": 0.28059536934950385, "grad_norm": 0.24642765522003174, "learning_rate": 0.0002586235489220564, "loss": 0.4978, "step": 509 }, { "epoch": 0.28114663726571115, "grad_norm": 0.22820910811424255, "learning_rate": 0.0002585406301824212, "loss": 0.5168, "step": 510 }, { "epoch": 0.2816979051819184, "grad_norm": 0.23360006511211395, "learning_rate": 0.00025845771144278605, "loss": 0.5059, "step": 511 }, { "epoch": 0.2822491730981257, "grad_norm": 0.24599935114383698, "learning_rate": 0.00025837479270315086, "loss": 0.5293, "step": 512 }, { "epoch": 0.282800441014333, "grad_norm": 0.23006513714790344, "learning_rate": 0.0002582918739635157, "loss": 0.5028, "step": 513 }, { "epoch": 0.28335170893054024, "grad_norm": 0.22950898110866547, "learning_rate": 0.0002582089552238806, "loss": 0.5064, "step": 514 }, { "epoch": 0.28390297684674753, "grad_norm": 0.23649993538856506, "learning_rate": 0.00025812603648424544, "loss": 0.515, "step": 515 }, { "epoch": 0.2844542447629548, "grad_norm": 0.23335647583007812, "learning_rate": 0.00025804311774461024, "loss": 0.4977, "step": 516 }, { "epoch": 0.2850055126791621, "grad_norm": 0.21914584934711456, "learning_rate": 0.0002579601990049751, "loss": 0.5018, "step": 517 }, { "epoch": 0.2855567805953694, "grad_norm": 0.2474760264158249, "learning_rate": 0.00025787728026533996, "loss": 0.542, "step": 518 }, { "epoch": 0.2861080485115766, "grad_norm": 0.24011823534965515, "learning_rate": 0.0002577943615257048, "loss": 0.5243, "step": 519 }, { "epoch": 0.2866593164277839, "grad_norm": 0.2619330883026123, "learning_rate": 0.0002577114427860696, "loss": 0.5657, "step": 520 }, { "epoch": 0.28721058434399116, "grad_norm": 0.2715679407119751, "learning_rate": 0.0002576285240464345, "loss": 0.5506, "step": 521 }, { "epoch": 0.28776185226019846, "grad_norm": 0.26569628715515137, "learning_rate": 0.0002575456053067993, "loss": 0.5525, "step": 522 }, { "epoch": 0.28831312017640576, "grad_norm": 0.23253163695335388, "learning_rate": 0.00025746268656716415, "loss": 0.5184, "step": 523 }, { "epoch": 0.288864388092613, "grad_norm": 0.2698347866535187, "learning_rate": 0.000257379767827529, "loss": 0.5274, "step": 524 }, { "epoch": 0.2894156560088203, "grad_norm": 0.2556426227092743, "learning_rate": 0.00025729684908789386, "loss": 0.5032, "step": 525 }, { "epoch": 0.28996692392502754, "grad_norm": 0.252575546503067, "learning_rate": 0.00025721393034825867, "loss": 0.525, "step": 526 }, { "epoch": 0.29051819184123484, "grad_norm": 0.26160725951194763, "learning_rate": 0.00025713101160862353, "loss": 0.552, "step": 527 }, { "epoch": 0.29106945975744214, "grad_norm": 0.250885546207428, "learning_rate": 0.0002570480928689884, "loss": 0.5159, "step": 528 }, { "epoch": 0.2916207276736494, "grad_norm": 0.24888747930526733, "learning_rate": 0.00025696517412935325, "loss": 0.5104, "step": 529 }, { "epoch": 0.2921719955898567, "grad_norm": 0.2554168105125427, "learning_rate": 0.00025688225538971805, "loss": 0.4867, "step": 530 }, { "epoch": 0.2927232635060639, "grad_norm": 0.24712808430194855, "learning_rate": 0.0002567993366500829, "loss": 0.5087, "step": 531 }, { "epoch": 0.2932745314222712, "grad_norm": 0.26169416308403015, "learning_rate": 0.0002567164179104477, "loss": 0.5094, "step": 532 }, { "epoch": 0.2938257993384785, "grad_norm": 0.25625213980674744, "learning_rate": 0.0002566334991708126, "loss": 0.5264, "step": 533 }, { "epoch": 0.29437706725468576, "grad_norm": 0.22383877635002136, "learning_rate": 0.00025655058043117743, "loss": 0.4719, "step": 534 }, { "epoch": 0.29492833517089306, "grad_norm": 0.2579217851161957, "learning_rate": 0.0002564676616915423, "loss": 0.5254, "step": 535 }, { "epoch": 0.2954796030871003, "grad_norm": 0.25349318981170654, "learning_rate": 0.0002563847429519071, "loss": 0.4932, "step": 536 }, { "epoch": 0.2960308710033076, "grad_norm": 0.25384828448295593, "learning_rate": 0.00025630182421227196, "loss": 0.51, "step": 537 }, { "epoch": 0.2965821389195149, "grad_norm": 0.22186040878295898, "learning_rate": 0.0002562189054726368, "loss": 0.5074, "step": 538 }, { "epoch": 0.29713340683572215, "grad_norm": 0.2735055685043335, "learning_rate": 0.0002561359867330017, "loss": 0.5151, "step": 539 }, { "epoch": 0.29768467475192945, "grad_norm": 0.24992069602012634, "learning_rate": 0.0002560530679933665, "loss": 0.4987, "step": 540 }, { "epoch": 0.2982359426681367, "grad_norm": 0.24067966639995575, "learning_rate": 0.0002559701492537313, "loss": 0.5434, "step": 541 }, { "epoch": 0.298787210584344, "grad_norm": 0.22907654941082, "learning_rate": 0.00025588723051409614, "loss": 0.5091, "step": 542 }, { "epoch": 0.2993384785005513, "grad_norm": 0.21983608603477478, "learning_rate": 0.000255804311774461, "loss": 0.5234, "step": 543 }, { "epoch": 0.29988974641675853, "grad_norm": 0.2439606636762619, "learning_rate": 0.00025572139303482586, "loss": 0.5271, "step": 544 }, { "epoch": 0.30044101433296583, "grad_norm": 0.25168585777282715, "learning_rate": 0.00025563847429519067, "loss": 0.4998, "step": 545 }, { "epoch": 0.30099228224917307, "grad_norm": 0.22324073314666748, "learning_rate": 0.00025555555555555553, "loss": 0.5086, "step": 546 }, { "epoch": 0.30154355016538037, "grad_norm": 0.22652758657932281, "learning_rate": 0.0002554726368159204, "loss": 0.5044, "step": 547 }, { "epoch": 0.30209481808158767, "grad_norm": 0.2422345131635666, "learning_rate": 0.00025538971807628525, "loss": 0.4968, "step": 548 }, { "epoch": 0.3026460859977949, "grad_norm": 0.24840863049030304, "learning_rate": 0.00025530679933665005, "loss": 0.5267, "step": 549 }, { "epoch": 0.3031973539140022, "grad_norm": 0.26198020577430725, "learning_rate": 0.0002552238805970149, "loss": 0.528, "step": 550 }, { "epoch": 0.3037486218302095, "grad_norm": 0.24763406813144684, "learning_rate": 0.0002551409618573797, "loss": 0.5387, "step": 551 }, { "epoch": 0.30429988974641675, "grad_norm": 0.22976034879684448, "learning_rate": 0.0002550580431177446, "loss": 0.5171, "step": 552 }, { "epoch": 0.30485115766262405, "grad_norm": 0.26161912083625793, "learning_rate": 0.00025497512437810943, "loss": 0.4956, "step": 553 }, { "epoch": 0.3054024255788313, "grad_norm": 0.2695063650608063, "learning_rate": 0.0002548922056384743, "loss": 0.5339, "step": 554 }, { "epoch": 0.3059536934950386, "grad_norm": 0.22745662927627563, "learning_rate": 0.0002548092868988391, "loss": 0.4769, "step": 555 }, { "epoch": 0.3065049614112459, "grad_norm": 0.2539026439189911, "learning_rate": 0.00025472636815920396, "loss": 0.5085, "step": 556 }, { "epoch": 0.30705622932745313, "grad_norm": 0.25683802366256714, "learning_rate": 0.0002546434494195688, "loss": 0.4828, "step": 557 }, { "epoch": 0.30760749724366043, "grad_norm": 0.24806293845176697, "learning_rate": 0.0002545605306799337, "loss": 0.534, "step": 558 }, { "epoch": 0.3081587651598677, "grad_norm": 0.24956698715686798, "learning_rate": 0.0002544776119402985, "loss": 0.4988, "step": 559 }, { "epoch": 0.308710033076075, "grad_norm": 0.2466159611940384, "learning_rate": 0.00025439469320066334, "loss": 0.525, "step": 560 }, { "epoch": 0.3092613009922823, "grad_norm": 0.2732326090335846, "learning_rate": 0.00025431177446102814, "loss": 0.5096, "step": 561 }, { "epoch": 0.3098125689084895, "grad_norm": 0.257656067609787, "learning_rate": 0.000254228855721393, "loss": 0.5241, "step": 562 }, { "epoch": 0.3103638368246968, "grad_norm": 0.2280483990907669, "learning_rate": 0.00025414593698175786, "loss": 0.5051, "step": 563 }, { "epoch": 0.31091510474090406, "grad_norm": 0.24017442762851715, "learning_rate": 0.0002540630182421227, "loss": 0.4923, "step": 564 }, { "epoch": 0.31146637265711136, "grad_norm": 0.27770093083381653, "learning_rate": 0.0002539800995024875, "loss": 0.5068, "step": 565 }, { "epoch": 0.31201764057331866, "grad_norm": 0.2428130954504013, "learning_rate": 0.0002538971807628524, "loss": 0.5223, "step": 566 }, { "epoch": 0.3125689084895259, "grad_norm": 0.24798986315727234, "learning_rate": 0.00025381426202321724, "loss": 0.5269, "step": 567 }, { "epoch": 0.3131201764057332, "grad_norm": 0.2388242930173874, "learning_rate": 0.0002537313432835821, "loss": 0.5328, "step": 568 }, { "epoch": 0.31367144432194044, "grad_norm": 0.24993616342544556, "learning_rate": 0.0002536484245439469, "loss": 0.523, "step": 569 }, { "epoch": 0.31422271223814774, "grad_norm": 0.22417233884334564, "learning_rate": 0.00025356550580431177, "loss": 0.5162, "step": 570 }, { "epoch": 0.31477398015435504, "grad_norm": 0.25001853704452515, "learning_rate": 0.00025348258706467657, "loss": 0.5172, "step": 571 }, { "epoch": 0.3153252480705623, "grad_norm": 0.24982157349586487, "learning_rate": 0.00025339966832504143, "loss": 0.516, "step": 572 }, { "epoch": 0.3158765159867696, "grad_norm": 0.23938202857971191, "learning_rate": 0.0002533167495854063, "loss": 0.4984, "step": 573 }, { "epoch": 0.3164277839029768, "grad_norm": 0.23941190540790558, "learning_rate": 0.00025323383084577115, "loss": 0.5285, "step": 574 }, { "epoch": 0.3169790518191841, "grad_norm": 0.26152345538139343, "learning_rate": 0.00025315091210613595, "loss": 0.5354, "step": 575 }, { "epoch": 0.3175303197353914, "grad_norm": 0.2364695519208908, "learning_rate": 0.0002530679933665008, "loss": 0.4926, "step": 576 }, { "epoch": 0.31808158765159866, "grad_norm": 0.2498009353876114, "learning_rate": 0.00025298507462686567, "loss": 0.4879, "step": 577 }, { "epoch": 0.31863285556780596, "grad_norm": 0.2434455007314682, "learning_rate": 0.00025290215588723053, "loss": 0.4941, "step": 578 }, { "epoch": 0.3191841234840132, "grad_norm": 0.2500743269920349, "learning_rate": 0.00025281923714759534, "loss": 0.5224, "step": 579 }, { "epoch": 0.3197353914002205, "grad_norm": 0.24151727557182312, "learning_rate": 0.0002527363184079602, "loss": 0.5056, "step": 580 }, { "epoch": 0.3202866593164278, "grad_norm": 0.23307417333126068, "learning_rate": 0.000252653399668325, "loss": 0.4944, "step": 581 }, { "epoch": 0.32083792723263505, "grad_norm": 0.25184640288352966, "learning_rate": 0.00025257048092868986, "loss": 0.5471, "step": 582 }, { "epoch": 0.32138919514884234, "grad_norm": 0.21968768537044525, "learning_rate": 0.0002524875621890547, "loss": 0.4773, "step": 583 }, { "epoch": 0.3219404630650496, "grad_norm": 0.22851119935512543, "learning_rate": 0.0002524046434494195, "loss": 0.4964, "step": 584 }, { "epoch": 0.3224917309812569, "grad_norm": 0.2595960795879364, "learning_rate": 0.0002523217247097844, "loss": 0.5109, "step": 585 }, { "epoch": 0.3230429988974642, "grad_norm": 0.25090447068214417, "learning_rate": 0.00025223880597014924, "loss": 0.4932, "step": 586 }, { "epoch": 0.3235942668136714, "grad_norm": 0.24583864212036133, "learning_rate": 0.0002521558872305141, "loss": 0.4779, "step": 587 }, { "epoch": 0.3241455347298787, "grad_norm": 0.23779521882534027, "learning_rate": 0.0002520729684908789, "loss": 0.4925, "step": 588 }, { "epoch": 0.324696802646086, "grad_norm": 0.2614596486091614, "learning_rate": 0.00025199004975124377, "loss": 0.5064, "step": 589 }, { "epoch": 0.32524807056229327, "grad_norm": 0.2449434995651245, "learning_rate": 0.00025190713101160857, "loss": 0.4768, "step": 590 }, { "epoch": 0.32579933847850057, "grad_norm": 0.24249720573425293, "learning_rate": 0.00025182421227197343, "loss": 0.5183, "step": 591 }, { "epoch": 0.3263506063947078, "grad_norm": 0.2366262972354889, "learning_rate": 0.0002517412935323383, "loss": 0.5119, "step": 592 }, { "epoch": 0.3269018743109151, "grad_norm": 0.2465352565050125, "learning_rate": 0.00025165837479270315, "loss": 0.5133, "step": 593 }, { "epoch": 0.3274531422271224, "grad_norm": 0.24108771979808807, "learning_rate": 0.00025157545605306795, "loss": 0.5139, "step": 594 }, { "epoch": 0.32800441014332965, "grad_norm": 0.25272470712661743, "learning_rate": 0.0002514925373134328, "loss": 0.5161, "step": 595 }, { "epoch": 0.32855567805953695, "grad_norm": 0.23254331946372986, "learning_rate": 0.00025140961857379767, "loss": 0.5048, "step": 596 }, { "epoch": 0.3291069459757442, "grad_norm": 0.24523723125457764, "learning_rate": 0.00025132669983416253, "loss": 0.5234, "step": 597 }, { "epoch": 0.3296582138919515, "grad_norm": 0.2396179735660553, "learning_rate": 0.00025124378109452733, "loss": 0.4865, "step": 598 }, { "epoch": 0.3302094818081588, "grad_norm": 0.24812306463718414, "learning_rate": 0.0002511608623548922, "loss": 0.5262, "step": 599 }, { "epoch": 0.33076074972436603, "grad_norm": 0.21982058882713318, "learning_rate": 0.000251077943615257, "loss": 0.5067, "step": 600 }, { "epoch": 0.33131201764057333, "grad_norm": 0.23328660428524017, "learning_rate": 0.00025099502487562186, "loss": 0.5166, "step": 601 }, { "epoch": 0.3318632855567806, "grad_norm": 0.23042722046375275, "learning_rate": 0.0002509121061359867, "loss": 0.4754, "step": 602 }, { "epoch": 0.3324145534729879, "grad_norm": 0.2361726462841034, "learning_rate": 0.0002508291873963516, "loss": 0.5048, "step": 603 }, { "epoch": 0.33296582138919517, "grad_norm": 0.22569622099399567, "learning_rate": 0.0002507462686567164, "loss": 0.5272, "step": 604 }, { "epoch": 0.3335170893054024, "grad_norm": 0.28286513686180115, "learning_rate": 0.00025066334991708124, "loss": 0.5316, "step": 605 }, { "epoch": 0.3340683572216097, "grad_norm": 0.2402937114238739, "learning_rate": 0.0002505804311774461, "loss": 0.5213, "step": 606 }, { "epoch": 0.33461962513781696, "grad_norm": 0.23157329857349396, "learning_rate": 0.00025049751243781096, "loss": 0.5259, "step": 607 }, { "epoch": 0.33517089305402425, "grad_norm": 0.24995861947536469, "learning_rate": 0.00025041459369817576, "loss": 0.4986, "step": 608 }, { "epoch": 0.33572216097023155, "grad_norm": 0.2656213939189911, "learning_rate": 0.0002503316749585406, "loss": 0.4951, "step": 609 }, { "epoch": 0.3362734288864388, "grad_norm": 0.2361687421798706, "learning_rate": 0.00025024875621890543, "loss": 0.4897, "step": 610 }, { "epoch": 0.3368246968026461, "grad_norm": 0.23117870092391968, "learning_rate": 0.0002501658374792703, "loss": 0.5115, "step": 611 }, { "epoch": 0.33737596471885334, "grad_norm": 0.2605067491531372, "learning_rate": 0.00025008291873963515, "loss": 0.4969, "step": 612 }, { "epoch": 0.33792723263506064, "grad_norm": 0.2486005276441574, "learning_rate": 0.00025, "loss": 0.4853, "step": 613 }, { "epoch": 0.33847850055126794, "grad_norm": 0.2559118866920471, "learning_rate": 0.0002499170812603648, "loss": 0.5279, "step": 614 }, { "epoch": 0.3390297684674752, "grad_norm": 0.2579089403152466, "learning_rate": 0.00024983416252072967, "loss": 0.4942, "step": 615 }, { "epoch": 0.3395810363836825, "grad_norm": 0.24982236325740814, "learning_rate": 0.0002497512437810945, "loss": 0.5061, "step": 616 }, { "epoch": 0.3401323042998897, "grad_norm": 0.22861437499523163, "learning_rate": 0.0002496683250414594, "loss": 0.4935, "step": 617 }, { "epoch": 0.340683572216097, "grad_norm": 0.26352861523628235, "learning_rate": 0.0002495854063018242, "loss": 0.4989, "step": 618 }, { "epoch": 0.3412348401323043, "grad_norm": 0.26364725828170776, "learning_rate": 0.00024950248756218905, "loss": 0.5178, "step": 619 }, { "epoch": 0.34178610804851156, "grad_norm": 0.2375265508890152, "learning_rate": 0.00024941956882255386, "loss": 0.5081, "step": 620 }, { "epoch": 0.34233737596471886, "grad_norm": 0.24559634923934937, "learning_rate": 0.0002493366500829187, "loss": 0.5231, "step": 621 }, { "epoch": 0.3428886438809261, "grad_norm": 0.25992295145988464, "learning_rate": 0.0002492537313432836, "loss": 0.4919, "step": 622 }, { "epoch": 0.3434399117971334, "grad_norm": 0.2260003536939621, "learning_rate": 0.00024917081260364843, "loss": 0.4798, "step": 623 }, { "epoch": 0.3439911797133407, "grad_norm": 0.24474291503429413, "learning_rate": 0.00024908789386401324, "loss": 0.5063, "step": 624 }, { "epoch": 0.34454244762954794, "grad_norm": 0.27368757128715515, "learning_rate": 0.0002490049751243781, "loss": 0.5138, "step": 625 }, { "epoch": 0.34509371554575524, "grad_norm": 0.23762589693069458, "learning_rate": 0.0002489220563847429, "loss": 0.4739, "step": 626 }, { "epoch": 0.34564498346196254, "grad_norm": 0.26609158515930176, "learning_rate": 0.00024883913764510776, "loss": 0.5017, "step": 627 }, { "epoch": 0.3461962513781698, "grad_norm": 0.26183345913887024, "learning_rate": 0.0002487562189054726, "loss": 0.5278, "step": 628 }, { "epoch": 0.3467475192943771, "grad_norm": 0.254160076379776, "learning_rate": 0.0002486733001658374, "loss": 0.5178, "step": 629 }, { "epoch": 0.3472987872105843, "grad_norm": 0.23745757341384888, "learning_rate": 0.0002485903814262023, "loss": 0.5152, "step": 630 }, { "epoch": 0.3478500551267916, "grad_norm": 0.24215815961360931, "learning_rate": 0.00024850746268656714, "loss": 0.4821, "step": 631 }, { "epoch": 0.3484013230429989, "grad_norm": 0.2696283459663391, "learning_rate": 0.000248424543946932, "loss": 0.4868, "step": 632 }, { "epoch": 0.34895259095920617, "grad_norm": 0.2615061402320862, "learning_rate": 0.0002483416252072968, "loss": 0.5066, "step": 633 }, { "epoch": 0.34950385887541346, "grad_norm": 0.2618487775325775, "learning_rate": 0.00024825870646766167, "loss": 0.5084, "step": 634 }, { "epoch": 0.3500551267916207, "grad_norm": 0.2500843107700348, "learning_rate": 0.00024817578772802647, "loss": 0.5065, "step": 635 }, { "epoch": 0.350606394707828, "grad_norm": 0.2559143304824829, "learning_rate": 0.00024809286898839133, "loss": 0.5058, "step": 636 }, { "epoch": 0.3511576626240353, "grad_norm": 0.2498316466808319, "learning_rate": 0.0002480099502487562, "loss": 0.5033, "step": 637 }, { "epoch": 0.35170893054024255, "grad_norm": 0.2778237760066986, "learning_rate": 0.00024792703150912105, "loss": 0.5319, "step": 638 }, { "epoch": 0.35226019845644985, "grad_norm": 0.22850993275642395, "learning_rate": 0.00024784411276948585, "loss": 0.4852, "step": 639 }, { "epoch": 0.3528114663726571, "grad_norm": 0.22482328116893768, "learning_rate": 0.0002477611940298507, "loss": 0.5044, "step": 640 }, { "epoch": 0.3533627342888644, "grad_norm": 0.2470054179430008, "learning_rate": 0.0002476782752902156, "loss": 0.5119, "step": 641 }, { "epoch": 0.3539140022050717, "grad_norm": 0.26223158836364746, "learning_rate": 0.00024759535655058043, "loss": 0.5276, "step": 642 }, { "epoch": 0.35446527012127893, "grad_norm": 0.25175783038139343, "learning_rate": 0.00024751243781094524, "loss": 0.4963, "step": 643 }, { "epoch": 0.35501653803748623, "grad_norm": 0.26237010955810547, "learning_rate": 0.0002474295190713101, "loss": 0.4989, "step": 644 }, { "epoch": 0.35556780595369347, "grad_norm": 0.23380139470100403, "learning_rate": 0.0002473466003316749, "loss": 0.5143, "step": 645 }, { "epoch": 0.35611907386990077, "grad_norm": 0.23414726555347443, "learning_rate": 0.00024726368159203976, "loss": 0.4837, "step": 646 }, { "epoch": 0.35667034178610807, "grad_norm": 0.2426154464483261, "learning_rate": 0.0002471807628524046, "loss": 0.4953, "step": 647 }, { "epoch": 0.3572216097023153, "grad_norm": 0.25034722685813904, "learning_rate": 0.0002470978441127695, "loss": 0.505, "step": 648 }, { "epoch": 0.3577728776185226, "grad_norm": 0.21789918839931488, "learning_rate": 0.0002470149253731343, "loss": 0.5121, "step": 649 }, { "epoch": 0.35832414553472985, "grad_norm": 0.2339979112148285, "learning_rate": 0.00024693200663349914, "loss": 0.5065, "step": 650 }, { "epoch": 0.35887541345093715, "grad_norm": 0.22365735471248627, "learning_rate": 0.000246849087893864, "loss": 0.4952, "step": 651 }, { "epoch": 0.35942668136714445, "grad_norm": 0.2149263620376587, "learning_rate": 0.00024676616915422886, "loss": 0.4677, "step": 652 }, { "epoch": 0.3599779492833517, "grad_norm": 0.2143101543188095, "learning_rate": 0.00024668325041459367, "loss": 0.4881, "step": 653 }, { "epoch": 0.360529217199559, "grad_norm": 0.23739519715309143, "learning_rate": 0.0002466003316749585, "loss": 0.5006, "step": 654 }, { "epoch": 0.36108048511576624, "grad_norm": 0.24234917759895325, "learning_rate": 0.00024651741293532333, "loss": 0.5206, "step": 655 }, { "epoch": 0.36163175303197354, "grad_norm": 0.2366551011800766, "learning_rate": 0.0002464344941956882, "loss": 0.5075, "step": 656 }, { "epoch": 0.36218302094818083, "grad_norm": 0.2543952465057373, "learning_rate": 0.00024635157545605305, "loss": 0.4985, "step": 657 }, { "epoch": 0.3627342888643881, "grad_norm": 0.24470911920070648, "learning_rate": 0.0002462686567164179, "loss": 0.5128, "step": 658 }, { "epoch": 0.3632855567805954, "grad_norm": 0.22214102745056152, "learning_rate": 0.0002461857379767827, "loss": 0.5125, "step": 659 }, { "epoch": 0.3638368246968026, "grad_norm": 0.24312040209770203, "learning_rate": 0.00024610281923714757, "loss": 0.4936, "step": 660 }, { "epoch": 0.3643880926130099, "grad_norm": 0.25986719131469727, "learning_rate": 0.00024601990049751243, "loss": 0.5347, "step": 661 }, { "epoch": 0.3649393605292172, "grad_norm": 0.22576284408569336, "learning_rate": 0.0002459369817578773, "loss": 0.4747, "step": 662 }, { "epoch": 0.36549062844542446, "grad_norm": 0.257548451423645, "learning_rate": 0.0002458540630182421, "loss": 0.5083, "step": 663 }, { "epoch": 0.36604189636163176, "grad_norm": 0.26048266887664795, "learning_rate": 0.00024577114427860695, "loss": 0.539, "step": 664 }, { "epoch": 0.36659316427783906, "grad_norm": 0.2594940662384033, "learning_rate": 0.00024568822553897176, "loss": 0.5003, "step": 665 }, { "epoch": 0.3671444321940463, "grad_norm": 0.2651066482067108, "learning_rate": 0.0002456053067993366, "loss": 0.4979, "step": 666 }, { "epoch": 0.3676957001102536, "grad_norm": 0.2542423903942108, "learning_rate": 0.0002455223880597015, "loss": 0.5338, "step": 667 }, { "epoch": 0.36824696802646084, "grad_norm": 0.24032056331634521, "learning_rate": 0.00024543946932006634, "loss": 0.5101, "step": 668 }, { "epoch": 0.36879823594266814, "grad_norm": 0.26019784808158875, "learning_rate": 0.00024535655058043114, "loss": 0.5217, "step": 669 }, { "epoch": 0.36934950385887544, "grad_norm": 0.24449752271175385, "learning_rate": 0.000245273631840796, "loss": 0.5318, "step": 670 }, { "epoch": 0.3699007717750827, "grad_norm": 0.22685208916664124, "learning_rate": 0.00024519071310116086, "loss": 0.5186, "step": 671 }, { "epoch": 0.37045203969129, "grad_norm": 0.2340528517961502, "learning_rate": 0.00024510779436152566, "loss": 0.4879, "step": 672 }, { "epoch": 0.3710033076074972, "grad_norm": 0.2637344002723694, "learning_rate": 0.0002450248756218905, "loss": 0.5225, "step": 673 }, { "epoch": 0.3715545755237045, "grad_norm": 0.2515370845794678, "learning_rate": 0.00024494195688225533, "loss": 0.4913, "step": 674 }, { "epoch": 0.3721058434399118, "grad_norm": 0.22438743710517883, "learning_rate": 0.0002448590381426202, "loss": 0.4733, "step": 675 }, { "epoch": 0.37265711135611906, "grad_norm": 0.24447986483573914, "learning_rate": 0.00024477611940298505, "loss": 0.5138, "step": 676 }, { "epoch": 0.37320837927232636, "grad_norm": 0.2652420699596405, "learning_rate": 0.0002446932006633499, "loss": 0.4897, "step": 677 }, { "epoch": 0.3737596471885336, "grad_norm": 0.23273025453090668, "learning_rate": 0.0002446102819237147, "loss": 0.4823, "step": 678 }, { "epoch": 0.3743109151047409, "grad_norm": 0.24014912545681, "learning_rate": 0.00024452736318407957, "loss": 0.4963, "step": 679 }, { "epoch": 0.3748621830209482, "grad_norm": 0.2454654574394226, "learning_rate": 0.00024444444444444443, "loss": 0.5367, "step": 680 }, { "epoch": 0.37541345093715545, "grad_norm": 0.23897579312324524, "learning_rate": 0.0002443615257048093, "loss": 0.5038, "step": 681 }, { "epoch": 0.37596471885336274, "grad_norm": 0.25277066230773926, "learning_rate": 0.0002442786069651741, "loss": 0.506, "step": 682 }, { "epoch": 0.37651598676957, "grad_norm": 0.22470998764038086, "learning_rate": 0.00024419568822553895, "loss": 0.5038, "step": 683 }, { "epoch": 0.3770672546857773, "grad_norm": 0.2490270882844925, "learning_rate": 0.00024411276948590378, "loss": 0.5073, "step": 684 }, { "epoch": 0.3776185226019846, "grad_norm": 0.23964819312095642, "learning_rate": 0.00024402985074626864, "loss": 0.4932, "step": 685 }, { "epoch": 0.37816979051819183, "grad_norm": 0.2595767676830292, "learning_rate": 0.00024394693200663348, "loss": 0.5263, "step": 686 }, { "epoch": 0.3787210584343991, "grad_norm": 0.23740339279174805, "learning_rate": 0.00024386401326699833, "loss": 0.5019, "step": 687 }, { "epoch": 0.37927232635060637, "grad_norm": 0.23046371340751648, "learning_rate": 0.00024378109452736314, "loss": 0.5071, "step": 688 }, { "epoch": 0.37982359426681367, "grad_norm": 0.24483554065227509, "learning_rate": 0.000243698175787728, "loss": 0.4978, "step": 689 }, { "epoch": 0.38037486218302097, "grad_norm": 0.23441949486732483, "learning_rate": 0.00024361525704809283, "loss": 0.5217, "step": 690 }, { "epoch": 0.3809261300992282, "grad_norm": 0.23334890604019165, "learning_rate": 0.0002435323383084577, "loss": 0.4826, "step": 691 }, { "epoch": 0.3814773980154355, "grad_norm": 0.2869088053703308, "learning_rate": 0.00024344941956882252, "loss": 0.5199, "step": 692 }, { "epoch": 0.38202866593164275, "grad_norm": 0.22842839360237122, "learning_rate": 0.00024336650082918738, "loss": 0.4586, "step": 693 }, { "epoch": 0.38257993384785005, "grad_norm": 0.23558756709098816, "learning_rate": 0.0002432835820895522, "loss": 0.4775, "step": 694 }, { "epoch": 0.38313120176405735, "grad_norm": 0.2528475821018219, "learning_rate": 0.00024320066334991707, "loss": 0.5068, "step": 695 }, { "epoch": 0.3836824696802646, "grad_norm": 0.2580317258834839, "learning_rate": 0.0002431177446102819, "loss": 0.52, "step": 696 }, { "epoch": 0.3842337375964719, "grad_norm": 0.23449361324310303, "learning_rate": 0.00024303482587064676, "loss": 0.4776, "step": 697 }, { "epoch": 0.38478500551267913, "grad_norm": 0.2365398108959198, "learning_rate": 0.00024295190713101157, "loss": 0.5063, "step": 698 }, { "epoch": 0.38533627342888643, "grad_norm": 0.24017611145973206, "learning_rate": 0.00024286898839137643, "loss": 0.4989, "step": 699 }, { "epoch": 0.38588754134509373, "grad_norm": 0.237211212515831, "learning_rate": 0.00024278606965174126, "loss": 0.4942, "step": 700 }, { "epoch": 0.386438809261301, "grad_norm": 0.24133196473121643, "learning_rate": 0.00024270315091210612, "loss": 0.4991, "step": 701 }, { "epoch": 0.3869900771775083, "grad_norm": 0.23730522394180298, "learning_rate": 0.00024262023217247095, "loss": 0.4847, "step": 702 }, { "epoch": 0.3875413450937156, "grad_norm": 0.23267106711864471, "learning_rate": 0.0002425373134328358, "loss": 0.5304, "step": 703 }, { "epoch": 0.3880926130099228, "grad_norm": 0.22734446823596954, "learning_rate": 0.00024245439469320064, "loss": 0.4752, "step": 704 }, { "epoch": 0.3886438809261301, "grad_norm": 0.24138008058071136, "learning_rate": 0.0002423714759535655, "loss": 0.4831, "step": 705 }, { "epoch": 0.38919514884233736, "grad_norm": 0.24015116691589355, "learning_rate": 0.00024228855721393033, "loss": 0.506, "step": 706 }, { "epoch": 0.38974641675854466, "grad_norm": 0.23817308247089386, "learning_rate": 0.0002422056384742952, "loss": 0.4868, "step": 707 }, { "epoch": 0.39029768467475195, "grad_norm": 0.21546156704425812, "learning_rate": 0.00024212271973466, "loss": 0.5102, "step": 708 }, { "epoch": 0.3908489525909592, "grad_norm": 0.2489834874868393, "learning_rate": 0.00024203980099502486, "loss": 0.4985, "step": 709 }, { "epoch": 0.3914002205071665, "grad_norm": 0.23067452013492584, "learning_rate": 0.0002419568822553897, "loss": 0.4985, "step": 710 }, { "epoch": 0.39195148842337374, "grad_norm": 0.24763309955596924, "learning_rate": 0.00024187396351575455, "loss": 0.5124, "step": 711 }, { "epoch": 0.39250275633958104, "grad_norm": 0.2439269721508026, "learning_rate": 0.00024179104477611938, "loss": 0.4939, "step": 712 }, { "epoch": 0.39305402425578834, "grad_norm": 0.23163112998008728, "learning_rate": 0.00024170812603648424, "loss": 0.4954, "step": 713 }, { "epoch": 0.3936052921719956, "grad_norm": 0.24170540273189545, "learning_rate": 0.00024162520729684907, "loss": 0.4947, "step": 714 }, { "epoch": 0.3941565600882029, "grad_norm": 0.23549963533878326, "learning_rate": 0.00024154228855721393, "loss": 0.5132, "step": 715 }, { "epoch": 0.3947078280044101, "grad_norm": 0.2394574135541916, "learning_rate": 0.00024145936981757876, "loss": 0.5153, "step": 716 }, { "epoch": 0.3952590959206174, "grad_norm": 0.2615318298339844, "learning_rate": 0.00024137645107794357, "loss": 0.4971, "step": 717 }, { "epoch": 0.3958103638368247, "grad_norm": 0.2353423684835434, "learning_rate": 0.00024129353233830843, "loss": 0.4966, "step": 718 }, { "epoch": 0.39636163175303196, "grad_norm": 0.22130148112773895, "learning_rate": 0.00024121061359867326, "loss": 0.4487, "step": 719 }, { "epoch": 0.39691289966923926, "grad_norm": 0.234688401222229, "learning_rate": 0.00024112769485903812, "loss": 0.499, "step": 720 }, { "epoch": 0.3974641675854465, "grad_norm": 0.23247137665748596, "learning_rate": 0.00024104477611940295, "loss": 0.4944, "step": 721 }, { "epoch": 0.3980154355016538, "grad_norm": 0.2362777143716812, "learning_rate": 0.0002409618573797678, "loss": 0.481, "step": 722 }, { "epoch": 0.3985667034178611, "grad_norm": 0.24181120097637177, "learning_rate": 0.00024087893864013264, "loss": 0.5211, "step": 723 }, { "epoch": 0.39911797133406834, "grad_norm": 0.22298705577850342, "learning_rate": 0.0002407960199004975, "loss": 0.4888, "step": 724 }, { "epoch": 0.39966923925027564, "grad_norm": 0.2304617017507553, "learning_rate": 0.00024071310116086233, "loss": 0.4811, "step": 725 }, { "epoch": 0.4002205071664829, "grad_norm": 0.24691155552864075, "learning_rate": 0.0002406301824212272, "loss": 0.5189, "step": 726 }, { "epoch": 0.4007717750826902, "grad_norm": 0.25604429841041565, "learning_rate": 0.000240547263681592, "loss": 0.4927, "step": 727 }, { "epoch": 0.4013230429988975, "grad_norm": 0.2280474603176117, "learning_rate": 0.00024046434494195685, "loss": 0.4882, "step": 728 }, { "epoch": 0.4018743109151047, "grad_norm": 0.23425596952438354, "learning_rate": 0.0002403814262023217, "loss": 0.4875, "step": 729 }, { "epoch": 0.402425578831312, "grad_norm": 0.26156267523765564, "learning_rate": 0.00024029850746268655, "loss": 0.5087, "step": 730 }, { "epoch": 0.40297684674751927, "grad_norm": 0.23172809183597565, "learning_rate": 0.00024021558872305138, "loss": 0.5024, "step": 731 }, { "epoch": 0.40352811466372657, "grad_norm": 0.23358501493930817, "learning_rate": 0.00024013266998341624, "loss": 0.4972, "step": 732 }, { "epoch": 0.40407938257993387, "grad_norm": 0.23836782574653625, "learning_rate": 0.00024004975124378107, "loss": 0.5061, "step": 733 }, { "epoch": 0.4046306504961411, "grad_norm": 0.23341165482997894, "learning_rate": 0.00023996683250414593, "loss": 0.4927, "step": 734 }, { "epoch": 0.4051819184123484, "grad_norm": 0.2267657369375229, "learning_rate": 0.00023988391376451076, "loss": 0.4884, "step": 735 }, { "epoch": 0.40573318632855565, "grad_norm": 0.23333032429218292, "learning_rate": 0.00023980099502487562, "loss": 0.4764, "step": 736 }, { "epoch": 0.40628445424476295, "grad_norm": 0.24722862243652344, "learning_rate": 0.00023971807628524042, "loss": 0.5168, "step": 737 }, { "epoch": 0.40683572216097025, "grad_norm": 0.24919219315052032, "learning_rate": 0.00023963515754560528, "loss": 0.4953, "step": 738 }, { "epoch": 0.4073869900771775, "grad_norm": 0.22673016786575317, "learning_rate": 0.00023955223880597012, "loss": 0.4883, "step": 739 }, { "epoch": 0.4079382579933848, "grad_norm": 0.22796331346035004, "learning_rate": 0.00023946932006633497, "loss": 0.4683, "step": 740 }, { "epoch": 0.4084895259095921, "grad_norm": 0.23972417414188385, "learning_rate": 0.0002393864013266998, "loss": 0.4919, "step": 741 }, { "epoch": 0.40904079382579933, "grad_norm": 0.23933400213718414, "learning_rate": 0.00023930348258706467, "loss": 0.5053, "step": 742 }, { "epoch": 0.40959206174200663, "grad_norm": 0.24868054687976837, "learning_rate": 0.0002392205638474295, "loss": 0.4854, "step": 743 }, { "epoch": 0.4101433296582139, "grad_norm": 0.23096708953380585, "learning_rate": 0.00023913764510779436, "loss": 0.4739, "step": 744 }, { "epoch": 0.41069459757442117, "grad_norm": 0.2553226947784424, "learning_rate": 0.0002390547263681592, "loss": 0.4679, "step": 745 }, { "epoch": 0.41124586549062847, "grad_norm": 0.24697932600975037, "learning_rate": 0.00023897180762852405, "loss": 0.4858, "step": 746 }, { "epoch": 0.4117971334068357, "grad_norm": 0.2418091893196106, "learning_rate": 0.00023888888888888885, "loss": 0.5172, "step": 747 }, { "epoch": 0.412348401323043, "grad_norm": 0.24144020676612854, "learning_rate": 0.0002388059701492537, "loss": 0.4711, "step": 748 }, { "epoch": 0.41289966923925026, "grad_norm": 0.24137695133686066, "learning_rate": 0.00023872305140961854, "loss": 0.5106, "step": 749 }, { "epoch": 0.41345093715545755, "grad_norm": 0.220285102725029, "learning_rate": 0.0002386401326699834, "loss": 0.4704, "step": 750 }, { "epoch": 0.41400220507166485, "grad_norm": 0.24430547654628754, "learning_rate": 0.00023855721393034824, "loss": 0.5038, "step": 751 }, { "epoch": 0.4145534729878721, "grad_norm": 0.24019300937652588, "learning_rate": 0.0002384742951907131, "loss": 0.4949, "step": 752 }, { "epoch": 0.4151047409040794, "grad_norm": 0.22668643295764923, "learning_rate": 0.00023839137645107793, "loss": 0.4718, "step": 753 }, { "epoch": 0.41565600882028664, "grad_norm": 0.2277330756187439, "learning_rate": 0.00023830845771144279, "loss": 0.514, "step": 754 }, { "epoch": 0.41620727673649394, "grad_norm": 0.2215653359889984, "learning_rate": 0.00023822553897180762, "loss": 0.4873, "step": 755 }, { "epoch": 0.41675854465270123, "grad_norm": 0.22386564314365387, "learning_rate": 0.00023814262023217248, "loss": 0.4824, "step": 756 }, { "epoch": 0.4173098125689085, "grad_norm": 0.2562282681465149, "learning_rate": 0.00023805970149253728, "loss": 0.5177, "step": 757 }, { "epoch": 0.4178610804851158, "grad_norm": 0.25375691056251526, "learning_rate": 0.00023797678275290214, "loss": 0.51, "step": 758 }, { "epoch": 0.418412348401323, "grad_norm": 0.26564472913742065, "learning_rate": 0.00023789386401326697, "loss": 0.5048, "step": 759 }, { "epoch": 0.4189636163175303, "grad_norm": 0.24918165802955627, "learning_rate": 0.00023781094527363183, "loss": 0.4964, "step": 760 }, { "epoch": 0.4195148842337376, "grad_norm": 0.26909199357032776, "learning_rate": 0.00023772802653399666, "loss": 0.4511, "step": 761 }, { "epoch": 0.42006615214994486, "grad_norm": 0.27723434567451477, "learning_rate": 0.0002376451077943615, "loss": 0.4994, "step": 762 }, { "epoch": 0.42061742006615216, "grad_norm": 0.23842424154281616, "learning_rate": 0.00023756218905472636, "loss": 0.5127, "step": 763 }, { "epoch": 0.4211686879823594, "grad_norm": 0.2599777281284332, "learning_rate": 0.0002374792703150912, "loss": 0.5221, "step": 764 }, { "epoch": 0.4217199558985667, "grad_norm": 0.2541678845882416, "learning_rate": 0.00023739635157545605, "loss": 0.5086, "step": 765 }, { "epoch": 0.422271223814774, "grad_norm": 0.24489666521549225, "learning_rate": 0.00023731343283582085, "loss": 0.5052, "step": 766 }, { "epoch": 0.42282249173098124, "grad_norm": 0.23364123702049255, "learning_rate": 0.0002372305140961857, "loss": 0.4815, "step": 767 }, { "epoch": 0.42337375964718854, "grad_norm": 0.24420395493507385, "learning_rate": 0.00023714759535655054, "loss": 0.4799, "step": 768 }, { "epoch": 0.4239250275633958, "grad_norm": 0.2559242844581604, "learning_rate": 0.0002370646766169154, "loss": 0.5218, "step": 769 }, { "epoch": 0.4244762954796031, "grad_norm": 0.24033527076244354, "learning_rate": 0.00023698175787728023, "loss": 0.4951, "step": 770 }, { "epoch": 0.4250275633958104, "grad_norm": 0.2582804262638092, "learning_rate": 0.0002368988391376451, "loss": 0.4925, "step": 771 }, { "epoch": 0.4255788313120176, "grad_norm": 0.21231015026569366, "learning_rate": 0.00023681592039800992, "loss": 0.4975, "step": 772 }, { "epoch": 0.4261300992282249, "grad_norm": 0.23742909729480743, "learning_rate": 0.00023673300165837478, "loss": 0.5115, "step": 773 }, { "epoch": 0.42668136714443217, "grad_norm": 0.23761944472789764, "learning_rate": 0.00023665008291873962, "loss": 0.5117, "step": 774 }, { "epoch": 0.42723263506063947, "grad_norm": 0.25065210461616516, "learning_rate": 0.00023656716417910448, "loss": 0.5305, "step": 775 }, { "epoch": 0.42778390297684676, "grad_norm": 0.23839645087718964, "learning_rate": 0.00023648424543946928, "loss": 0.5245, "step": 776 }, { "epoch": 0.428335170893054, "grad_norm": 0.22241149842739105, "learning_rate": 0.00023640132669983414, "loss": 0.5041, "step": 777 }, { "epoch": 0.4288864388092613, "grad_norm": 0.23228657245635986, "learning_rate": 0.00023631840796019897, "loss": 0.4955, "step": 778 }, { "epoch": 0.4294377067254686, "grad_norm": 0.24807095527648926, "learning_rate": 0.00023623548922056383, "loss": 0.5057, "step": 779 }, { "epoch": 0.42998897464167585, "grad_norm": 0.253288209438324, "learning_rate": 0.00023615257048092866, "loss": 0.5179, "step": 780 }, { "epoch": 0.43054024255788315, "grad_norm": 0.2280365228652954, "learning_rate": 0.00023606965174129352, "loss": 0.5104, "step": 781 }, { "epoch": 0.4310915104740904, "grad_norm": 0.21497339010238647, "learning_rate": 0.00023598673300165835, "loss": 0.479, "step": 782 }, { "epoch": 0.4316427783902977, "grad_norm": 0.25969845056533813, "learning_rate": 0.0002359038142620232, "loss": 0.4952, "step": 783 }, { "epoch": 0.432194046306505, "grad_norm": 0.24241061508655548, "learning_rate": 0.00023582089552238804, "loss": 0.5147, "step": 784 }, { "epoch": 0.43274531422271223, "grad_norm": 0.23297248780727386, "learning_rate": 0.0002357379767827529, "loss": 0.4698, "step": 785 }, { "epoch": 0.43329658213891953, "grad_norm": 0.23766906559467316, "learning_rate": 0.0002356550580431177, "loss": 0.5127, "step": 786 }, { "epoch": 0.43384785005512677, "grad_norm": 0.225977823138237, "learning_rate": 0.00023557213930348257, "loss": 0.4698, "step": 787 }, { "epoch": 0.43439911797133407, "grad_norm": 0.25361236929893494, "learning_rate": 0.0002354892205638474, "loss": 0.4887, "step": 788 }, { "epoch": 0.43495038588754137, "grad_norm": 0.23103906214237213, "learning_rate": 0.00023540630182421226, "loss": 0.4831, "step": 789 }, { "epoch": 0.4355016538037486, "grad_norm": 0.23840244114398956, "learning_rate": 0.0002353233830845771, "loss": 0.501, "step": 790 }, { "epoch": 0.4360529217199559, "grad_norm": 0.2217642217874527, "learning_rate": 0.00023524046434494195, "loss": 0.4792, "step": 791 }, { "epoch": 0.43660418963616315, "grad_norm": 0.23963388800621033, "learning_rate": 0.00023515754560530678, "loss": 0.5043, "step": 792 }, { "epoch": 0.43715545755237045, "grad_norm": 0.2423614263534546, "learning_rate": 0.00023507462686567164, "loss": 0.4923, "step": 793 }, { "epoch": 0.43770672546857775, "grad_norm": 0.23817111551761627, "learning_rate": 0.00023499170812603645, "loss": 0.4836, "step": 794 }, { "epoch": 0.438257993384785, "grad_norm": 0.22162829339504242, "learning_rate": 0.00023490878938640133, "loss": 0.4919, "step": 795 }, { "epoch": 0.4388092613009923, "grad_norm": 0.22646528482437134, "learning_rate": 0.00023482587064676614, "loss": 0.4727, "step": 796 }, { "epoch": 0.43936052921719954, "grad_norm": 0.2530063986778259, "learning_rate": 0.000234742951907131, "loss": 0.4896, "step": 797 }, { "epoch": 0.43991179713340683, "grad_norm": 0.24201619625091553, "learning_rate": 0.00023466003316749583, "loss": 0.4664, "step": 798 }, { "epoch": 0.44046306504961413, "grad_norm": 0.22222551703453064, "learning_rate": 0.0002345771144278607, "loss": 0.4914, "step": 799 }, { "epoch": 0.4410143329658214, "grad_norm": 0.2384173721075058, "learning_rate": 0.00023449419568822552, "loss": 0.5029, "step": 800 }, { "epoch": 0.4415656008820287, "grad_norm": 0.23053288459777832, "learning_rate": 0.00023441127694859038, "loss": 0.5011, "step": 801 }, { "epoch": 0.4421168687982359, "grad_norm": 0.2338135987520218, "learning_rate": 0.0002343283582089552, "loss": 0.5145, "step": 802 }, { "epoch": 0.4426681367144432, "grad_norm": 0.2439098060131073, "learning_rate": 0.00023424543946932007, "loss": 0.5353, "step": 803 }, { "epoch": 0.4432194046306505, "grad_norm": 0.25395849347114563, "learning_rate": 0.00023416252072968488, "loss": 0.5287, "step": 804 }, { "epoch": 0.44377067254685776, "grad_norm": 0.24382875859737396, "learning_rate": 0.0002340796019900497, "loss": 0.4753, "step": 805 }, { "epoch": 0.44432194046306506, "grad_norm": 0.22943390905857086, "learning_rate": 0.00023399668325041457, "loss": 0.4899, "step": 806 }, { "epoch": 0.4448732083792723, "grad_norm": 0.23026274144649506, "learning_rate": 0.0002339137645107794, "loss": 0.4776, "step": 807 }, { "epoch": 0.4454244762954796, "grad_norm": 0.263637512922287, "learning_rate": 0.00023383084577114426, "loss": 0.5036, "step": 808 }, { "epoch": 0.4459757442116869, "grad_norm": 0.2239854782819748, "learning_rate": 0.0002337479270315091, "loss": 0.5074, "step": 809 }, { "epoch": 0.44652701212789414, "grad_norm": 0.24209174513816833, "learning_rate": 0.00023366500829187395, "loss": 0.4962, "step": 810 }, { "epoch": 0.44707828004410144, "grad_norm": 0.2574441730976105, "learning_rate": 0.00023358208955223878, "loss": 0.4833, "step": 811 }, { "epoch": 0.4476295479603087, "grad_norm": 0.24309788644313812, "learning_rate": 0.00023349917081260364, "loss": 0.4971, "step": 812 }, { "epoch": 0.448180815876516, "grad_norm": 0.23553608357906342, "learning_rate": 0.00023341625207296844, "loss": 0.4951, "step": 813 }, { "epoch": 0.4487320837927233, "grad_norm": 0.23820781707763672, "learning_rate": 0.0002333333333333333, "loss": 0.4974, "step": 814 }, { "epoch": 0.4492833517089305, "grad_norm": 0.26907938718795776, "learning_rate": 0.00023325041459369814, "loss": 0.4904, "step": 815 }, { "epoch": 0.4498346196251378, "grad_norm": 0.2529081702232361, "learning_rate": 0.000233167495854063, "loss": 0.5047, "step": 816 }, { "epoch": 0.4503858875413451, "grad_norm": 0.2080521285533905, "learning_rate": 0.00023308457711442783, "loss": 0.4676, "step": 817 }, { "epoch": 0.45093715545755236, "grad_norm": 0.25028982758522034, "learning_rate": 0.00023300165837479269, "loss": 0.5093, "step": 818 }, { "epoch": 0.45148842337375966, "grad_norm": 0.24182821810245514, "learning_rate": 0.00023291873963515752, "loss": 0.5082, "step": 819 }, { "epoch": 0.4520396912899669, "grad_norm": 0.23918956518173218, "learning_rate": 0.00023283582089552238, "loss": 0.4887, "step": 820 }, { "epoch": 0.4525909592061742, "grad_norm": 0.25016239285469055, "learning_rate": 0.0002327529021558872, "loss": 0.4887, "step": 821 }, { "epoch": 0.4531422271223815, "grad_norm": 0.2489538937807083, "learning_rate": 0.00023266998341625207, "loss": 0.5089, "step": 822 }, { "epoch": 0.45369349503858875, "grad_norm": 0.2490735650062561, "learning_rate": 0.00023258706467661687, "loss": 0.4812, "step": 823 }, { "epoch": 0.45424476295479604, "grad_norm": 0.26727011799812317, "learning_rate": 0.00023250414593698173, "loss": 0.4943, "step": 824 }, { "epoch": 0.4547960308710033, "grad_norm": 0.2334149330854416, "learning_rate": 0.00023242122719734656, "loss": 0.4743, "step": 825 }, { "epoch": 0.4553472987872106, "grad_norm": 0.24874447286128998, "learning_rate": 0.00023233830845771142, "loss": 0.5034, "step": 826 }, { "epoch": 0.4558985667034179, "grad_norm": 0.26186123490333557, "learning_rate": 0.00023225538971807626, "loss": 0.4986, "step": 827 }, { "epoch": 0.4564498346196251, "grad_norm": 0.22734478116035461, "learning_rate": 0.00023217247097844111, "loss": 0.479, "step": 828 }, { "epoch": 0.4570011025358324, "grad_norm": 0.24908246099948883, "learning_rate": 0.00023208955223880595, "loss": 0.5176, "step": 829 }, { "epoch": 0.45755237045203967, "grad_norm": 0.2561740279197693, "learning_rate": 0.0002320066334991708, "loss": 0.5181, "step": 830 }, { "epoch": 0.45810363836824697, "grad_norm": 0.24820713698863983, "learning_rate": 0.00023192371475953564, "loss": 0.5168, "step": 831 }, { "epoch": 0.45865490628445427, "grad_norm": 0.22865842282772064, "learning_rate": 0.0002318407960199005, "loss": 0.5034, "step": 832 }, { "epoch": 0.4592061742006615, "grad_norm": 0.2395135760307312, "learning_rate": 0.0002317578772802653, "loss": 0.4956, "step": 833 }, { "epoch": 0.4597574421168688, "grad_norm": 0.2375570386648178, "learning_rate": 0.00023167495854063016, "loss": 0.4939, "step": 834 }, { "epoch": 0.46030871003307605, "grad_norm": 0.24207614362239838, "learning_rate": 0.000231592039800995, "loss": 0.4998, "step": 835 }, { "epoch": 0.46085997794928335, "grad_norm": 0.231749027967453, "learning_rate": 0.00023150912106135985, "loss": 0.5071, "step": 836 }, { "epoch": 0.46141124586549065, "grad_norm": 0.2529800236225128, "learning_rate": 0.00023142620232172468, "loss": 0.5152, "step": 837 }, { "epoch": 0.4619625137816979, "grad_norm": 0.24748285114765167, "learning_rate": 0.00023134328358208954, "loss": 0.4929, "step": 838 }, { "epoch": 0.4625137816979052, "grad_norm": 0.2481345683336258, "learning_rate": 0.00023126036484245438, "loss": 0.5131, "step": 839 }, { "epoch": 0.46306504961411243, "grad_norm": 0.22557318210601807, "learning_rate": 0.00023117744610281923, "loss": 0.5111, "step": 840 }, { "epoch": 0.46361631753031973, "grad_norm": 0.24130286276340485, "learning_rate": 0.00023109452736318407, "loss": 0.486, "step": 841 }, { "epoch": 0.46416758544652703, "grad_norm": 0.2238035351037979, "learning_rate": 0.00023101160862354893, "loss": 0.4836, "step": 842 }, { "epoch": 0.4647188533627343, "grad_norm": 0.23449353873729706, "learning_rate": 0.00023092868988391373, "loss": 0.4714, "step": 843 }, { "epoch": 0.4652701212789416, "grad_norm": 0.2284533679485321, "learning_rate": 0.0002308457711442786, "loss": 0.4739, "step": 844 }, { "epoch": 0.4658213891951488, "grad_norm": 0.2420201152563095, "learning_rate": 0.00023076285240464342, "loss": 0.4797, "step": 845 }, { "epoch": 0.4663726571113561, "grad_norm": 0.2669530212879181, "learning_rate": 0.00023067993366500828, "loss": 0.5017, "step": 846 }, { "epoch": 0.4669239250275634, "grad_norm": 0.2415032982826233, "learning_rate": 0.0002305970149253731, "loss": 0.5023, "step": 847 }, { "epoch": 0.46747519294377066, "grad_norm": 0.2327703833580017, "learning_rate": 0.00023051409618573797, "loss": 0.5089, "step": 848 }, { "epoch": 0.46802646085997796, "grad_norm": 0.24102593958377838, "learning_rate": 0.0002304311774461028, "loss": 0.5092, "step": 849 }, { "epoch": 0.4685777287761852, "grad_norm": 0.22270776331424713, "learning_rate": 0.00023034825870646764, "loss": 0.4677, "step": 850 }, { "epoch": 0.4691289966923925, "grad_norm": 0.23423947393894196, "learning_rate": 0.0002302653399668325, "loss": 0.4909, "step": 851 }, { "epoch": 0.4696802646085998, "grad_norm": 0.24698768556118011, "learning_rate": 0.0002301824212271973, "loss": 0.5, "step": 852 }, { "epoch": 0.47023153252480704, "grad_norm": 0.24313125014305115, "learning_rate": 0.00023009950248756216, "loss": 0.4908, "step": 853 }, { "epoch": 0.47078280044101434, "grad_norm": 0.2673037648200989, "learning_rate": 0.000230016583747927, "loss": 0.4971, "step": 854 }, { "epoch": 0.47133406835722164, "grad_norm": 0.23639419674873352, "learning_rate": 0.00022993366500829185, "loss": 0.486, "step": 855 }, { "epoch": 0.4718853362734289, "grad_norm": 0.2316926270723343, "learning_rate": 0.00022985074626865668, "loss": 0.5045, "step": 856 }, { "epoch": 0.4724366041896362, "grad_norm": 0.23044279217720032, "learning_rate": 0.00022976782752902154, "loss": 0.4752, "step": 857 }, { "epoch": 0.4729878721058434, "grad_norm": 0.2599242329597473, "learning_rate": 0.00022968490878938637, "loss": 0.5058, "step": 858 }, { "epoch": 0.4735391400220507, "grad_norm": 0.2420707494020462, "learning_rate": 0.00022960199004975123, "loss": 0.4689, "step": 859 }, { "epoch": 0.474090407938258, "grad_norm": 0.26549097895622253, "learning_rate": 0.00022951907131011607, "loss": 0.5161, "step": 860 }, { "epoch": 0.47464167585446526, "grad_norm": 0.24539636075496674, "learning_rate": 0.00022943615257048092, "loss": 0.4887, "step": 861 }, { "epoch": 0.47519294377067256, "grad_norm": 0.23257140815258026, "learning_rate": 0.00022935323383084573, "loss": 0.4841, "step": 862 }, { "epoch": 0.4757442116868798, "grad_norm": 0.27551430463790894, "learning_rate": 0.0002292703150912106, "loss": 0.5369, "step": 863 }, { "epoch": 0.4762954796030871, "grad_norm": 0.2414499670267105, "learning_rate": 0.00022918739635157542, "loss": 0.5031, "step": 864 }, { "epoch": 0.4768467475192944, "grad_norm": 0.24039071798324585, "learning_rate": 0.00022910447761194028, "loss": 0.4958, "step": 865 }, { "epoch": 0.47739801543550164, "grad_norm": 0.23044785857200623, "learning_rate": 0.0002290215588723051, "loss": 0.4884, "step": 866 }, { "epoch": 0.47794928335170894, "grad_norm": 0.2677319645881653, "learning_rate": 0.00022893864013266997, "loss": 0.5096, "step": 867 }, { "epoch": 0.4785005512679162, "grad_norm": 0.22575704753398895, "learning_rate": 0.0002288557213930348, "loss": 0.4968, "step": 868 }, { "epoch": 0.4790518191841235, "grad_norm": 0.24338865280151367, "learning_rate": 0.00022877280265339966, "loss": 0.4669, "step": 869 }, { "epoch": 0.4796030871003308, "grad_norm": 0.25083914399147034, "learning_rate": 0.0002286898839137645, "loss": 0.5035, "step": 870 }, { "epoch": 0.480154355016538, "grad_norm": 0.24006043374538422, "learning_rate": 0.00022860696517412935, "loss": 0.459, "step": 871 }, { "epoch": 0.4807056229327453, "grad_norm": 0.2326238453388214, "learning_rate": 0.00022852404643449416, "loss": 0.4599, "step": 872 }, { "epoch": 0.48125689084895257, "grad_norm": 0.24134741723537445, "learning_rate": 0.00022844112769485902, "loss": 0.4755, "step": 873 }, { "epoch": 0.48180815876515987, "grad_norm": 0.2148948460817337, "learning_rate": 0.00022835820895522385, "loss": 0.4759, "step": 874 }, { "epoch": 0.48235942668136716, "grad_norm": 0.2361116260290146, "learning_rate": 0.0002282752902155887, "loss": 0.4771, "step": 875 }, { "epoch": 0.4829106945975744, "grad_norm": 0.24435687065124512, "learning_rate": 0.00022819237147595354, "loss": 0.492, "step": 876 }, { "epoch": 0.4834619625137817, "grad_norm": 0.23266686499118805, "learning_rate": 0.0002281094527363184, "loss": 0.5269, "step": 877 }, { "epoch": 0.48401323042998895, "grad_norm": 0.2184826284646988, "learning_rate": 0.00022802653399668323, "loss": 0.4741, "step": 878 }, { "epoch": 0.48456449834619625, "grad_norm": 0.24351243674755096, "learning_rate": 0.0002279436152570481, "loss": 0.5121, "step": 879 }, { "epoch": 0.48511576626240355, "grad_norm": 0.2366686463356018, "learning_rate": 0.00022786069651741292, "loss": 0.5002, "step": 880 }, { "epoch": 0.4856670341786108, "grad_norm": 0.23044729232788086, "learning_rate": 0.00022777777777777778, "loss": 0.4742, "step": 881 }, { "epoch": 0.4862183020948181, "grad_norm": 0.23718389868736267, "learning_rate": 0.0002276948590381426, "loss": 0.4864, "step": 882 }, { "epoch": 0.48676957001102533, "grad_norm": 0.25451889634132385, "learning_rate": 0.00022761194029850745, "loss": 0.4809, "step": 883 }, { "epoch": 0.48732083792723263, "grad_norm": 0.22073966264724731, "learning_rate": 0.00022752902155887228, "loss": 0.4853, "step": 884 }, { "epoch": 0.48787210584343993, "grad_norm": 0.24639108777046204, "learning_rate": 0.00022744610281923714, "loss": 0.4848, "step": 885 }, { "epoch": 0.4884233737596472, "grad_norm": 0.2543313503265381, "learning_rate": 0.00022736318407960197, "loss": 0.5109, "step": 886 }, { "epoch": 0.48897464167585447, "grad_norm": 0.24580398201942444, "learning_rate": 0.00022728026533996683, "loss": 0.4919, "step": 887 }, { "epoch": 0.4895259095920617, "grad_norm": 0.23678098618984222, "learning_rate": 0.00022719734660033166, "loss": 0.48, "step": 888 }, { "epoch": 0.490077177508269, "grad_norm": 0.2219116985797882, "learning_rate": 0.00022711442786069652, "loss": 0.4647, "step": 889 }, { "epoch": 0.4906284454244763, "grad_norm": 0.2577376067638397, "learning_rate": 0.00022703150912106135, "loss": 0.4729, "step": 890 }, { "epoch": 0.49117971334068355, "grad_norm": 0.2527279853820801, "learning_rate": 0.0002269485903814262, "loss": 0.4899, "step": 891 }, { "epoch": 0.49173098125689085, "grad_norm": 0.2718394100666046, "learning_rate": 0.00022686567164179102, "loss": 0.5247, "step": 892 }, { "epoch": 0.49228224917309815, "grad_norm": 0.23161333799362183, "learning_rate": 0.00022678275290215585, "loss": 0.4786, "step": 893 }, { "epoch": 0.4928335170893054, "grad_norm": 0.22976607084274292, "learning_rate": 0.0002266998341625207, "loss": 0.4963, "step": 894 }, { "epoch": 0.4933847850055127, "grad_norm": 0.26446732878685, "learning_rate": 0.00022661691542288554, "loss": 0.5076, "step": 895 }, { "epoch": 0.49393605292171994, "grad_norm": 0.2513757348060608, "learning_rate": 0.0002265339966832504, "loss": 0.4906, "step": 896 }, { "epoch": 0.49448732083792724, "grad_norm": 0.2355221062898636, "learning_rate": 0.00022645107794361523, "loss": 0.5083, "step": 897 }, { "epoch": 0.49503858875413453, "grad_norm": 0.24008940160274506, "learning_rate": 0.0002263681592039801, "loss": 0.5075, "step": 898 }, { "epoch": 0.4955898566703418, "grad_norm": 0.23088522255420685, "learning_rate": 0.00022628524046434492, "loss": 0.4975, "step": 899 }, { "epoch": 0.4961411245865491, "grad_norm": 0.2754332721233368, "learning_rate": 0.00022620232172470978, "loss": 0.5144, "step": 900 }, { "epoch": 0.4966923925027563, "grad_norm": 0.25219646096229553, "learning_rate": 0.00022611940298507459, "loss": 0.4854, "step": 901 }, { "epoch": 0.4972436604189636, "grad_norm": 0.2489755004644394, "learning_rate": 0.00022603648424543944, "loss": 0.4708, "step": 902 }, { "epoch": 0.4977949283351709, "grad_norm": 0.24141034483909607, "learning_rate": 0.00022595356550580428, "loss": 0.4917, "step": 903 }, { "epoch": 0.49834619625137816, "grad_norm": 0.23453152179718018, "learning_rate": 0.00022587064676616914, "loss": 0.4754, "step": 904 }, { "epoch": 0.49889746416758546, "grad_norm": 0.25601381063461304, "learning_rate": 0.00022578772802653397, "loss": 0.4909, "step": 905 }, { "epoch": 0.4994487320837927, "grad_norm": 0.22102084755897522, "learning_rate": 0.00022570480928689883, "loss": 0.4673, "step": 906 }, { "epoch": 0.5, "grad_norm": 0.2369261085987091, "learning_rate": 0.00022562189054726366, "loss": 0.4544, "step": 907 }, { "epoch": 0.5005512679162073, "grad_norm": 0.25789421796798706, "learning_rate": 0.00022553897180762852, "loss": 0.5032, "step": 908 }, { "epoch": 0.5011025358324146, "grad_norm": 0.2342817783355713, "learning_rate": 0.00022545605306799335, "loss": 0.4649, "step": 909 }, { "epoch": 0.5016538037486218, "grad_norm": 0.25317567586898804, "learning_rate": 0.0002253731343283582, "loss": 0.4974, "step": 910 }, { "epoch": 0.5022050716648291, "grad_norm": 0.23973771929740906, "learning_rate": 0.00022529021558872301, "loss": 0.5093, "step": 911 }, { "epoch": 0.5027563395810364, "grad_norm": 0.24858252704143524, "learning_rate": 0.00022520729684908787, "loss": 0.4781, "step": 912 }, { "epoch": 0.5033076074972437, "grad_norm": 0.25571468472480774, "learning_rate": 0.0002251243781094527, "loss": 0.4992, "step": 913 }, { "epoch": 0.503858875413451, "grad_norm": 0.2476612776517868, "learning_rate": 0.00022504145936981756, "loss": 0.4803, "step": 914 }, { "epoch": 0.5044101433296582, "grad_norm": 0.24917398393154144, "learning_rate": 0.0002249585406301824, "loss": 0.5022, "step": 915 }, { "epoch": 0.5049614112458655, "grad_norm": 0.24204300343990326, "learning_rate": 0.00022487562189054726, "loss": 0.4919, "step": 916 }, { "epoch": 0.5055126791620728, "grad_norm": 0.23442697525024414, "learning_rate": 0.0002247927031509121, "loss": 0.4754, "step": 917 }, { "epoch": 0.5060639470782801, "grad_norm": 0.26630768179893494, "learning_rate": 0.00022470978441127695, "loss": 0.5119, "step": 918 }, { "epoch": 0.5066152149944874, "grad_norm": 0.2312323898077011, "learning_rate": 0.00022462686567164175, "loss": 0.4735, "step": 919 }, { "epoch": 0.5071664829106945, "grad_norm": 0.23444309830665588, "learning_rate": 0.0002245439469320066, "loss": 0.4718, "step": 920 }, { "epoch": 0.5077177508269018, "grad_norm": 0.2260974645614624, "learning_rate": 0.00022446102819237144, "loss": 0.48, "step": 921 }, { "epoch": 0.5082690187431091, "grad_norm": 0.2403731793165207, "learning_rate": 0.0002243781094527363, "loss": 0.5014, "step": 922 }, { "epoch": 0.5088202866593164, "grad_norm": 0.240118607878685, "learning_rate": 0.00022429519071310113, "loss": 0.4669, "step": 923 }, { "epoch": 0.5093715545755237, "grad_norm": 0.2268829345703125, "learning_rate": 0.000224212271973466, "loss": 0.4924, "step": 924 }, { "epoch": 0.5099228224917309, "grad_norm": 0.23937518894672394, "learning_rate": 0.00022412935323383083, "loss": 0.4743, "step": 925 }, { "epoch": 0.5104740904079382, "grad_norm": 0.25224533677101135, "learning_rate": 0.00022404643449419568, "loss": 0.502, "step": 926 }, { "epoch": 0.5110253583241455, "grad_norm": 0.23434899747371674, "learning_rate": 0.00022396351575456052, "loss": 0.4825, "step": 927 }, { "epoch": 0.5115766262403528, "grad_norm": 0.249129980802536, "learning_rate": 0.00022388059701492538, "loss": 0.4689, "step": 928 }, { "epoch": 0.5121278941565601, "grad_norm": 0.2530542314052582, "learning_rate": 0.00022379767827529018, "loss": 0.4726, "step": 929 }, { "epoch": 0.5126791620727673, "grad_norm": 0.2488546073436737, "learning_rate": 0.00022371475953565504, "loss": 0.5024, "step": 930 }, { "epoch": 0.5132304299889746, "grad_norm": 0.23048900067806244, "learning_rate": 0.00022363184079601987, "loss": 0.4633, "step": 931 }, { "epoch": 0.5137816979051819, "grad_norm": 0.2485697716474533, "learning_rate": 0.00022354892205638473, "loss": 0.4955, "step": 932 }, { "epoch": 0.5143329658213892, "grad_norm": 0.23724399507045746, "learning_rate": 0.00022346600331674956, "loss": 0.4859, "step": 933 }, { "epoch": 0.5148842337375965, "grad_norm": 0.2424692064523697, "learning_rate": 0.00022338308457711442, "loss": 0.5115, "step": 934 }, { "epoch": 0.5154355016538037, "grad_norm": 0.24387586116790771, "learning_rate": 0.00022330016583747925, "loss": 0.4969, "step": 935 }, { "epoch": 0.515986769570011, "grad_norm": 0.22749263048171997, "learning_rate": 0.0002232172470978441, "loss": 0.5014, "step": 936 }, { "epoch": 0.5165380374862183, "grad_norm": 0.22205640375614166, "learning_rate": 0.00022313432835820894, "loss": 0.4912, "step": 937 }, { "epoch": 0.5170893054024256, "grad_norm": 0.23504669964313507, "learning_rate": 0.00022305140961857375, "loss": 0.4841, "step": 938 }, { "epoch": 0.5176405733186329, "grad_norm": 0.2282828390598297, "learning_rate": 0.0002229684908789386, "loss": 0.463, "step": 939 }, { "epoch": 0.5181918412348401, "grad_norm": 0.23592360317707062, "learning_rate": 0.00022288557213930344, "loss": 0.48, "step": 940 }, { "epoch": 0.5187431091510474, "grad_norm": 0.2408529818058014, "learning_rate": 0.0002228026533996683, "loss": 0.485, "step": 941 }, { "epoch": 0.5192943770672547, "grad_norm": 0.2507123351097107, "learning_rate": 0.00022271973466003313, "loss": 0.4696, "step": 942 }, { "epoch": 0.519845644983462, "grad_norm": 0.21724364161491394, "learning_rate": 0.000222636815920398, "loss": 0.4883, "step": 943 }, { "epoch": 0.5203969128996693, "grad_norm": 0.22868378460407257, "learning_rate": 0.00022255389718076282, "loss": 0.4852, "step": 944 }, { "epoch": 0.5209481808158766, "grad_norm": 0.23937176167964935, "learning_rate": 0.00022247097844112768, "loss": 0.4966, "step": 945 }, { "epoch": 0.5214994487320838, "grad_norm": 0.24673771858215332, "learning_rate": 0.00022238805970149251, "loss": 0.5089, "step": 946 }, { "epoch": 0.5220507166482911, "grad_norm": 0.23318541049957275, "learning_rate": 0.00022230514096185737, "loss": 0.4847, "step": 947 }, { "epoch": 0.5226019845644984, "grad_norm": 0.2237371951341629, "learning_rate": 0.00022222222222222218, "loss": 0.4745, "step": 948 }, { "epoch": 0.5231532524807057, "grad_norm": 0.22587883472442627, "learning_rate": 0.00022213930348258704, "loss": 0.502, "step": 949 }, { "epoch": 0.523704520396913, "grad_norm": 0.237474262714386, "learning_rate": 0.00022205638474295187, "loss": 0.5003, "step": 950 }, { "epoch": 0.5242557883131201, "grad_norm": 0.2394198328256607, "learning_rate": 0.00022197346600331673, "loss": 0.5032, "step": 951 }, { "epoch": 0.5248070562293274, "grad_norm": 0.22187075018882751, "learning_rate": 0.00022189054726368156, "loss": 0.4543, "step": 952 }, { "epoch": 0.5253583241455347, "grad_norm": 0.23657891154289246, "learning_rate": 0.00022180762852404642, "loss": 0.496, "step": 953 }, { "epoch": 0.525909592061742, "grad_norm": 0.23503652215003967, "learning_rate": 0.00022172470978441125, "loss": 0.4724, "step": 954 }, { "epoch": 0.5264608599779493, "grad_norm": 0.2500884532928467, "learning_rate": 0.0002216417910447761, "loss": 0.4837, "step": 955 }, { "epoch": 0.5270121278941565, "grad_norm": 0.2291148602962494, "learning_rate": 0.00022155887230514094, "loss": 0.4884, "step": 956 }, { "epoch": 0.5275633958103638, "grad_norm": 0.2256416380405426, "learning_rate": 0.0002214759535655058, "loss": 0.4743, "step": 957 }, { "epoch": 0.5281146637265711, "grad_norm": 0.23922450840473175, "learning_rate": 0.0002213930348258706, "loss": 0.4784, "step": 958 }, { "epoch": 0.5286659316427784, "grad_norm": 0.24849876761436462, "learning_rate": 0.00022131011608623547, "loss": 0.498, "step": 959 }, { "epoch": 0.5292171995589857, "grad_norm": 0.2211284190416336, "learning_rate": 0.0002212271973466003, "loss": 0.4711, "step": 960 }, { "epoch": 0.5297684674751929, "grad_norm": 0.2296118289232254, "learning_rate": 0.00022114427860696516, "loss": 0.49, "step": 961 }, { "epoch": 0.5303197353914002, "grad_norm": 0.22921642661094666, "learning_rate": 0.00022106135986733, "loss": 0.4864, "step": 962 }, { "epoch": 0.5308710033076075, "grad_norm": 0.23854584991931915, "learning_rate": 0.00022097844112769485, "loss": 0.4976, "step": 963 }, { "epoch": 0.5314222712238148, "grad_norm": 0.22192314267158508, "learning_rate": 0.00022089552238805968, "loss": 0.4889, "step": 964 }, { "epoch": 0.5319735391400221, "grad_norm": 0.24450358748435974, "learning_rate": 0.00022081260364842454, "loss": 0.4784, "step": 965 }, { "epoch": 0.5325248070562293, "grad_norm": 0.2145015150308609, "learning_rate": 0.00022072968490878937, "loss": 0.4543, "step": 966 }, { "epoch": 0.5330760749724366, "grad_norm": 0.22203224897384644, "learning_rate": 0.00022064676616915423, "loss": 0.4892, "step": 967 }, { "epoch": 0.5336273428886439, "grad_norm": 0.2423708289861679, "learning_rate": 0.00022056384742951904, "loss": 0.4866, "step": 968 }, { "epoch": 0.5341786108048512, "grad_norm": 0.2290901392698288, "learning_rate": 0.0002204809286898839, "loss": 0.4809, "step": 969 }, { "epoch": 0.5347298787210585, "grad_norm": 0.22281813621520996, "learning_rate": 0.00022039800995024873, "loss": 0.5083, "step": 970 }, { "epoch": 0.5352811466372657, "grad_norm": 0.23863239586353302, "learning_rate": 0.0002203150912106136, "loss": 0.4732, "step": 971 }, { "epoch": 0.535832414553473, "grad_norm": 0.2304835319519043, "learning_rate": 0.00022023217247097842, "loss": 0.4898, "step": 972 }, { "epoch": 0.5363836824696803, "grad_norm": 0.23452985286712646, "learning_rate": 0.00022014925373134328, "loss": 0.5177, "step": 973 }, { "epoch": 0.5369349503858876, "grad_norm": 0.252209335565567, "learning_rate": 0.0002200663349917081, "loss": 0.482, "step": 974 }, { "epoch": 0.5374862183020949, "grad_norm": 0.23390796780586243, "learning_rate": 0.00021998341625207297, "loss": 0.4913, "step": 975 }, { "epoch": 0.538037486218302, "grad_norm": 0.24304579198360443, "learning_rate": 0.0002199004975124378, "loss": 0.4963, "step": 976 }, { "epoch": 0.5385887541345094, "grad_norm": 0.22291411459445953, "learning_rate": 0.00021981757877280266, "loss": 0.4835, "step": 977 }, { "epoch": 0.5391400220507166, "grad_norm": 0.23994603753089905, "learning_rate": 0.00021973466003316746, "loss": 0.4596, "step": 978 }, { "epoch": 0.539691289966924, "grad_norm": 0.2375342845916748, "learning_rate": 0.00021965174129353232, "loss": 0.5138, "step": 979 }, { "epoch": 0.5402425578831312, "grad_norm": 0.22774764895439148, "learning_rate": 0.00021956882255389716, "loss": 0.4949, "step": 980 }, { "epoch": 0.5407938257993384, "grad_norm": 0.2277144491672516, "learning_rate": 0.000219485903814262, "loss": 0.4843, "step": 981 }, { "epoch": 0.5413450937155457, "grad_norm": 0.23078951239585876, "learning_rate": 0.00021940298507462685, "loss": 0.5089, "step": 982 }, { "epoch": 0.541896361631753, "grad_norm": 0.23093165457248688, "learning_rate": 0.00021932006633499168, "loss": 0.4913, "step": 983 }, { "epoch": 0.5424476295479603, "grad_norm": 0.22961430251598358, "learning_rate": 0.00021923714759535654, "loss": 0.4957, "step": 984 }, { "epoch": 0.5429988974641676, "grad_norm": 0.2303048074245453, "learning_rate": 0.00021915422885572137, "loss": 0.4991, "step": 985 }, { "epoch": 0.5435501653803748, "grad_norm": 0.2352553904056549, "learning_rate": 0.00021907131011608623, "loss": 0.4838, "step": 986 }, { "epoch": 0.5441014332965821, "grad_norm": 0.2251589596271515, "learning_rate": 0.00021898839137645103, "loss": 0.4928, "step": 987 }, { "epoch": 0.5446527012127894, "grad_norm": 0.2577657103538513, "learning_rate": 0.0002189054726368159, "loss": 0.4897, "step": 988 }, { "epoch": 0.5452039691289967, "grad_norm": 0.23328843712806702, "learning_rate": 0.00021882255389718073, "loss": 0.4949, "step": 989 }, { "epoch": 0.545755237045204, "grad_norm": 0.23206306993961334, "learning_rate": 0.00021873963515754558, "loss": 0.4791, "step": 990 }, { "epoch": 0.5463065049614112, "grad_norm": 0.2417128086090088, "learning_rate": 0.00021865671641791042, "loss": 0.5161, "step": 991 }, { "epoch": 0.5468577728776185, "grad_norm": 0.2541581988334656, "learning_rate": 0.00021857379767827528, "loss": 0.5253, "step": 992 }, { "epoch": 0.5474090407938258, "grad_norm": 0.23152418434619904, "learning_rate": 0.0002184908789386401, "loss": 0.4854, "step": 993 }, { "epoch": 0.5479603087100331, "grad_norm": 0.21505197882652283, "learning_rate": 0.00021840796019900497, "loss": 0.4664, "step": 994 }, { "epoch": 0.5485115766262404, "grad_norm": 0.23766584694385529, "learning_rate": 0.0002183250414593698, "loss": 0.4976, "step": 995 }, { "epoch": 0.5490628445424476, "grad_norm": 0.23223701119422913, "learning_rate": 0.00021824212271973466, "loss": 0.4485, "step": 996 }, { "epoch": 0.5496141124586549, "grad_norm": 0.25161734223365784, "learning_rate": 0.00021815920398009946, "loss": 0.4818, "step": 997 }, { "epoch": 0.5501653803748622, "grad_norm": 0.23082609474658966, "learning_rate": 0.00021807628524046432, "loss": 0.502, "step": 998 }, { "epoch": 0.5507166482910695, "grad_norm": 0.23080939054489136, "learning_rate": 0.00021799336650082915, "loss": 0.5005, "step": 999 }, { "epoch": 0.5512679162072768, "grad_norm": 0.22184456884860992, "learning_rate": 0.00021791044776119401, "loss": 0.4833, "step": 1000 }, { "epoch": 0.5512679162072768, "eval_loss": 0.48357656598091125, "eval_runtime": 311.7364, "eval_samples_per_second": 3.737, "eval_steps_per_second": 0.468, "step": 1000 }, { "epoch": 0.551819184123484, "grad_norm": 0.25572869181632996, "learning_rate": 0.00021782752902155885, "loss": 0.4925, "step": 1001 }, { "epoch": 0.5523704520396913, "grad_norm": 0.2477078139781952, "learning_rate": 0.0002177446102819237, "loss": 0.4847, "step": 1002 }, { "epoch": 0.5529217199558986, "grad_norm": 0.23749567568302155, "learning_rate": 0.00021766169154228854, "loss": 0.4933, "step": 1003 }, { "epoch": 0.5534729878721059, "grad_norm": 0.22248369455337524, "learning_rate": 0.0002175787728026534, "loss": 0.4883, "step": 1004 }, { "epoch": 0.5540242557883132, "grad_norm": 0.23769117891788483, "learning_rate": 0.00021749585406301823, "loss": 0.4977, "step": 1005 }, { "epoch": 0.5545755237045203, "grad_norm": 0.22872841358184814, "learning_rate": 0.0002174129353233831, "loss": 0.4952, "step": 1006 }, { "epoch": 0.5551267916207276, "grad_norm": 0.23627693951129913, "learning_rate": 0.0002173300165837479, "loss": 0.4653, "step": 1007 }, { "epoch": 0.5556780595369349, "grad_norm": 0.24900414049625397, "learning_rate": 0.00021724709784411275, "loss": 0.4833, "step": 1008 }, { "epoch": 0.5562293274531422, "grad_norm": 0.2288302332162857, "learning_rate": 0.00021716417910447758, "loss": 0.4735, "step": 1009 }, { "epoch": 0.5567805953693495, "grad_norm": 0.2251368761062622, "learning_rate": 0.00021708126036484244, "loss": 0.4887, "step": 1010 }, { "epoch": 0.5573318632855567, "grad_norm": 0.2496083676815033, "learning_rate": 0.00021699834162520727, "loss": 0.4959, "step": 1011 }, { "epoch": 0.557883131201764, "grad_norm": 0.23241998255252838, "learning_rate": 0.00021691542288557213, "loss": 0.462, "step": 1012 }, { "epoch": 0.5584343991179713, "grad_norm": 0.239312544465065, "learning_rate": 0.00021683250414593697, "loss": 0.4792, "step": 1013 }, { "epoch": 0.5589856670341786, "grad_norm": 0.22684402763843536, "learning_rate": 0.00021674958540630182, "loss": 0.4825, "step": 1014 }, { "epoch": 0.5595369349503859, "grad_norm": 0.23261615633964539, "learning_rate": 0.00021666666666666666, "loss": 0.4604, "step": 1015 }, { "epoch": 0.5600882028665931, "grad_norm": 0.26163482666015625, "learning_rate": 0.00021658374792703152, "loss": 0.5158, "step": 1016 }, { "epoch": 0.5606394707828004, "grad_norm": 0.2275197058916092, "learning_rate": 0.00021650082918739632, "loss": 0.4733, "step": 1017 }, { "epoch": 0.5611907386990077, "grad_norm": 0.2636192739009857, "learning_rate": 0.00021641791044776118, "loss": 0.5018, "step": 1018 }, { "epoch": 0.561742006615215, "grad_norm": 0.2224932312965393, "learning_rate": 0.000216334991708126, "loss": 0.5064, "step": 1019 }, { "epoch": 0.5622932745314223, "grad_norm": 0.2518375813961029, "learning_rate": 0.00021625207296849087, "loss": 0.4874, "step": 1020 }, { "epoch": 0.5628445424476296, "grad_norm": 0.24104849994182587, "learning_rate": 0.0002161691542288557, "loss": 0.4864, "step": 1021 }, { "epoch": 0.5633958103638368, "grad_norm": 0.25608646869659424, "learning_rate": 0.00021608623548922056, "loss": 0.4752, "step": 1022 }, { "epoch": 0.5639470782800441, "grad_norm": 0.24174031615257263, "learning_rate": 0.0002160033167495854, "loss": 0.4986, "step": 1023 }, { "epoch": 0.5644983461962514, "grad_norm": 0.23120078444480896, "learning_rate": 0.00021592039800995025, "loss": 0.4615, "step": 1024 }, { "epoch": 0.5650496141124587, "grad_norm": 0.2599080204963684, "learning_rate": 0.00021583747927031509, "loss": 0.4994, "step": 1025 }, { "epoch": 0.565600882028666, "grad_norm": 0.23741313815116882, "learning_rate": 0.0002157545605306799, "loss": 0.4745, "step": 1026 }, { "epoch": 0.5661521499448732, "grad_norm": 0.24400565028190613, "learning_rate": 0.00021567164179104475, "loss": 0.4891, "step": 1027 }, { "epoch": 0.5667034178610805, "grad_norm": 0.2503412663936615, "learning_rate": 0.00021558872305140958, "loss": 0.5014, "step": 1028 }, { "epoch": 0.5672546857772878, "grad_norm": 0.23471197485923767, "learning_rate": 0.00021550580431177444, "loss": 0.4958, "step": 1029 }, { "epoch": 0.5678059536934951, "grad_norm": 0.2323479950428009, "learning_rate": 0.00021542288557213927, "loss": 0.4691, "step": 1030 }, { "epoch": 0.5683572216097024, "grad_norm": 0.23778273165225983, "learning_rate": 0.00021533996683250413, "loss": 0.4881, "step": 1031 }, { "epoch": 0.5689084895259096, "grad_norm": 0.21465396881103516, "learning_rate": 0.00021525704809286896, "loss": 0.4689, "step": 1032 }, { "epoch": 0.5694597574421169, "grad_norm": 0.2397712767124176, "learning_rate": 0.00021517412935323382, "loss": 0.4873, "step": 1033 }, { "epoch": 0.5700110253583242, "grad_norm": 0.2142529934644699, "learning_rate": 0.00021509121061359863, "loss": 0.4686, "step": 1034 }, { "epoch": 0.5705622932745315, "grad_norm": 0.24334488809108734, "learning_rate": 0.00021500829187396351, "loss": 0.508, "step": 1035 }, { "epoch": 0.5711135611907387, "grad_norm": 0.2391451597213745, "learning_rate": 0.00021492537313432832, "loss": 0.5049, "step": 1036 }, { "epoch": 0.5716648291069459, "grad_norm": 0.25972914695739746, "learning_rate": 0.00021484245439469318, "loss": 0.5022, "step": 1037 }, { "epoch": 0.5722160970231532, "grad_norm": 0.23072604835033417, "learning_rate": 0.000214759535655058, "loss": 0.4888, "step": 1038 }, { "epoch": 0.5727673649393605, "grad_norm": 0.2415681630373001, "learning_rate": 0.00021467661691542287, "loss": 0.4787, "step": 1039 }, { "epoch": 0.5733186328555678, "grad_norm": 0.24707187712192535, "learning_rate": 0.0002145936981757877, "loss": 0.4877, "step": 1040 }, { "epoch": 0.5738699007717751, "grad_norm": 0.24816669523715973, "learning_rate": 0.00021451077943615256, "loss": 0.4704, "step": 1041 }, { "epoch": 0.5744211686879823, "grad_norm": 0.23687899112701416, "learning_rate": 0.0002144278606965174, "loss": 0.4757, "step": 1042 }, { "epoch": 0.5749724366041896, "grad_norm": 0.25993046164512634, "learning_rate": 0.00021434494195688225, "loss": 0.4919, "step": 1043 }, { "epoch": 0.5755237045203969, "grad_norm": 0.23352675139904022, "learning_rate": 0.00021426202321724706, "loss": 0.4762, "step": 1044 }, { "epoch": 0.5760749724366042, "grad_norm": 0.23056983947753906, "learning_rate": 0.00021417910447761192, "loss": 0.4638, "step": 1045 }, { "epoch": 0.5766262403528115, "grad_norm": 0.22587046027183533, "learning_rate": 0.00021409618573797675, "loss": 0.4777, "step": 1046 }, { "epoch": 0.5771775082690187, "grad_norm": 0.2561855912208557, "learning_rate": 0.0002140132669983416, "loss": 0.5056, "step": 1047 }, { "epoch": 0.577728776185226, "grad_norm": 0.24537737667560577, "learning_rate": 0.00021393034825870644, "loss": 0.497, "step": 1048 }, { "epoch": 0.5782800441014333, "grad_norm": 0.22903874516487122, "learning_rate": 0.0002138474295190713, "loss": 0.4749, "step": 1049 }, { "epoch": 0.5788313120176406, "grad_norm": 0.24069786071777344, "learning_rate": 0.00021376451077943613, "loss": 0.4901, "step": 1050 }, { "epoch": 0.5793825799338479, "grad_norm": 0.2355291098356247, "learning_rate": 0.000213681592039801, "loss": 0.478, "step": 1051 }, { "epoch": 0.5799338478500551, "grad_norm": 0.24105066061019897, "learning_rate": 0.00021359867330016582, "loss": 0.4832, "step": 1052 }, { "epoch": 0.5804851157662624, "grad_norm": 0.22479461133480072, "learning_rate": 0.00021351575456053068, "loss": 0.4657, "step": 1053 }, { "epoch": 0.5810363836824697, "grad_norm": 0.24978676438331604, "learning_rate": 0.00021343283582089549, "loss": 0.4795, "step": 1054 }, { "epoch": 0.581587651598677, "grad_norm": 0.22877342998981476, "learning_rate": 0.00021334991708126034, "loss": 0.476, "step": 1055 }, { "epoch": 0.5821389195148843, "grad_norm": 0.230316624045372, "learning_rate": 0.00021326699834162518, "loss": 0.4854, "step": 1056 }, { "epoch": 0.5826901874310915, "grad_norm": 0.2178526371717453, "learning_rate": 0.00021318407960199004, "loss": 0.4798, "step": 1057 }, { "epoch": 0.5832414553472988, "grad_norm": 0.23913492262363434, "learning_rate": 0.00021310116086235487, "loss": 0.4759, "step": 1058 }, { "epoch": 0.5837927232635061, "grad_norm": 0.23534056544303894, "learning_rate": 0.00021301824212271973, "loss": 0.475, "step": 1059 }, { "epoch": 0.5843439911797134, "grad_norm": 0.23057684302330017, "learning_rate": 0.00021293532338308456, "loss": 0.4835, "step": 1060 }, { "epoch": 0.5848952590959207, "grad_norm": 0.2420724630355835, "learning_rate": 0.00021285240464344942, "loss": 0.4684, "step": 1061 }, { "epoch": 0.5854465270121278, "grad_norm": 0.23270656168460846, "learning_rate": 0.00021276948590381425, "loss": 0.4714, "step": 1062 }, { "epoch": 0.5859977949283351, "grad_norm": 0.22105982899665833, "learning_rate": 0.0002126865671641791, "loss": 0.4739, "step": 1063 }, { "epoch": 0.5865490628445424, "grad_norm": 0.22896204888820648, "learning_rate": 0.00021260364842454391, "loss": 0.4792, "step": 1064 }, { "epoch": 0.5871003307607497, "grad_norm": 0.22883784770965576, "learning_rate": 0.00021252072968490877, "loss": 0.4775, "step": 1065 }, { "epoch": 0.587651598676957, "grad_norm": 0.22493380308151245, "learning_rate": 0.0002124378109452736, "loss": 0.4565, "step": 1066 }, { "epoch": 0.5882028665931642, "grad_norm": 0.20627589523792267, "learning_rate": 0.00021235489220563846, "loss": 0.4421, "step": 1067 }, { "epoch": 0.5887541345093715, "grad_norm": 0.22995707392692566, "learning_rate": 0.0002122719734660033, "loss": 0.5007, "step": 1068 }, { "epoch": 0.5893054024255788, "grad_norm": 0.22702358663082123, "learning_rate": 0.00021218905472636813, "loss": 0.4848, "step": 1069 }, { "epoch": 0.5898566703417861, "grad_norm": 0.2274836003780365, "learning_rate": 0.000212106135986733, "loss": 0.4512, "step": 1070 }, { "epoch": 0.5904079382579934, "grad_norm": 0.25226280093193054, "learning_rate": 0.00021202321724709782, "loss": 0.4739, "step": 1071 }, { "epoch": 0.5909592061742006, "grad_norm": 0.21378135681152344, "learning_rate": 0.00021194029850746268, "loss": 0.4902, "step": 1072 }, { "epoch": 0.5915104740904079, "grad_norm": 0.2266150563955307, "learning_rate": 0.00021185737976782748, "loss": 0.4787, "step": 1073 }, { "epoch": 0.5920617420066152, "grad_norm": 0.24346543848514557, "learning_rate": 0.00021177446102819234, "loss": 0.4758, "step": 1074 }, { "epoch": 0.5926130099228225, "grad_norm": 0.23416201770305634, "learning_rate": 0.00021169154228855718, "loss": 0.4976, "step": 1075 }, { "epoch": 0.5931642778390298, "grad_norm": 0.22314603626728058, "learning_rate": 0.00021160862354892203, "loss": 0.483, "step": 1076 }, { "epoch": 0.593715545755237, "grad_norm": 0.23636144399642944, "learning_rate": 0.00021152570480928687, "loss": 0.4883, "step": 1077 }, { "epoch": 0.5942668136714443, "grad_norm": 0.25075021386146545, "learning_rate": 0.00021144278606965173, "loss": 0.5093, "step": 1078 }, { "epoch": 0.5948180815876516, "grad_norm": 0.25016966462135315, "learning_rate": 0.00021135986733001656, "loss": 0.4901, "step": 1079 }, { "epoch": 0.5953693495038589, "grad_norm": 0.22505664825439453, "learning_rate": 0.00021127694859038142, "loss": 0.4982, "step": 1080 }, { "epoch": 0.5959206174200662, "grad_norm": 0.2462112158536911, "learning_rate": 0.00021119402985074625, "loss": 0.4925, "step": 1081 }, { "epoch": 0.5964718853362734, "grad_norm": 0.24048367142677307, "learning_rate": 0.0002111111111111111, "loss": 0.4711, "step": 1082 }, { "epoch": 0.5970231532524807, "grad_norm": 0.2399929016828537, "learning_rate": 0.0002110281923714759, "loss": 0.4534, "step": 1083 }, { "epoch": 0.597574421168688, "grad_norm": 0.22102728486061096, "learning_rate": 0.00021094527363184077, "loss": 0.475, "step": 1084 }, { "epoch": 0.5981256890848953, "grad_norm": 0.22623874247074127, "learning_rate": 0.0002108623548922056, "loss": 0.4771, "step": 1085 }, { "epoch": 0.5986769570011026, "grad_norm": 0.22739335894584656, "learning_rate": 0.00021077943615257046, "loss": 0.4524, "step": 1086 }, { "epoch": 0.5992282249173098, "grad_norm": 0.22587355971336365, "learning_rate": 0.0002106965174129353, "loss": 0.481, "step": 1087 }, { "epoch": 0.5997794928335171, "grad_norm": 0.238664448261261, "learning_rate": 0.00021061359867330015, "loss": 0.4812, "step": 1088 }, { "epoch": 0.6003307607497244, "grad_norm": 0.2626015245914459, "learning_rate": 0.00021053067993366499, "loss": 0.5396, "step": 1089 }, { "epoch": 0.6008820286659317, "grad_norm": 0.23110847175121307, "learning_rate": 0.00021044776119402985, "loss": 0.4768, "step": 1090 }, { "epoch": 0.601433296582139, "grad_norm": 0.2324095070362091, "learning_rate": 0.00021036484245439468, "loss": 0.4569, "step": 1091 }, { "epoch": 0.6019845644983461, "grad_norm": 0.2298206239938736, "learning_rate": 0.00021028192371475954, "loss": 0.4867, "step": 1092 }, { "epoch": 0.6025358324145534, "grad_norm": 0.23651166260242462, "learning_rate": 0.00021019900497512434, "loss": 0.5119, "step": 1093 }, { "epoch": 0.6030871003307607, "grad_norm": 0.24213020503520966, "learning_rate": 0.0002101160862354892, "loss": 0.4989, "step": 1094 }, { "epoch": 0.603638368246968, "grad_norm": 0.2975553572177887, "learning_rate": 0.00021003316749585403, "loss": 0.4937, "step": 1095 }, { "epoch": 0.6041896361631753, "grad_norm": 0.22954276204109192, "learning_rate": 0.0002099502487562189, "loss": 0.4569, "step": 1096 }, { "epoch": 0.6047409040793826, "grad_norm": 0.23405365645885468, "learning_rate": 0.00020986733001658372, "loss": 0.476, "step": 1097 }, { "epoch": 0.6052921719955898, "grad_norm": 0.22513137757778168, "learning_rate": 0.00020978441127694858, "loss": 0.4561, "step": 1098 }, { "epoch": 0.6058434399117971, "grad_norm": 0.2296430617570877, "learning_rate": 0.00020970149253731341, "loss": 0.4628, "step": 1099 }, { "epoch": 0.6063947078280044, "grad_norm": 0.24347829818725586, "learning_rate": 0.00020961857379767827, "loss": 0.5152, "step": 1100 }, { "epoch": 0.6069459757442117, "grad_norm": 0.2580801546573639, "learning_rate": 0.0002095356550580431, "loss": 0.4751, "step": 1101 }, { "epoch": 0.607497243660419, "grad_norm": 0.22813639044761658, "learning_rate": 0.00020945273631840797, "loss": 0.4807, "step": 1102 }, { "epoch": 0.6080485115766262, "grad_norm": 0.22047673165798187, "learning_rate": 0.00020936981757877277, "loss": 0.4686, "step": 1103 }, { "epoch": 0.6085997794928335, "grad_norm": 0.2241135686635971, "learning_rate": 0.00020928689883913763, "loss": 0.4826, "step": 1104 }, { "epoch": 0.6091510474090408, "grad_norm": 0.24011586606502533, "learning_rate": 0.00020920398009950246, "loss": 0.4559, "step": 1105 }, { "epoch": 0.6097023153252481, "grad_norm": 0.2351463884115219, "learning_rate": 0.00020912106135986732, "loss": 0.4523, "step": 1106 }, { "epoch": 0.6102535832414554, "grad_norm": 0.2268303632736206, "learning_rate": 0.00020903814262023215, "loss": 0.486, "step": 1107 }, { "epoch": 0.6108048511576626, "grad_norm": 0.2280043363571167, "learning_rate": 0.000208955223880597, "loss": 0.4902, "step": 1108 }, { "epoch": 0.6113561190738699, "grad_norm": 0.21859845519065857, "learning_rate": 0.00020887230514096184, "loss": 0.4593, "step": 1109 }, { "epoch": 0.6119073869900772, "grad_norm": 0.23152512311935425, "learning_rate": 0.0002087893864013267, "loss": 0.4762, "step": 1110 }, { "epoch": 0.6124586549062845, "grad_norm": 0.23346808552742004, "learning_rate": 0.00020870646766169153, "loss": 0.4919, "step": 1111 }, { "epoch": 0.6130099228224918, "grad_norm": 0.2313188761472702, "learning_rate": 0.0002086235489220564, "loss": 0.4792, "step": 1112 }, { "epoch": 0.613561190738699, "grad_norm": 0.2261422574520111, "learning_rate": 0.0002085406301824212, "loss": 0.5008, "step": 1113 }, { "epoch": 0.6141124586549063, "grad_norm": 0.24444694817066193, "learning_rate": 0.00020845771144278603, "loss": 0.503, "step": 1114 }, { "epoch": 0.6146637265711136, "grad_norm": 0.23184862732887268, "learning_rate": 0.0002083747927031509, "loss": 0.5024, "step": 1115 }, { "epoch": 0.6152149944873209, "grad_norm": 0.22305606305599213, "learning_rate": 0.00020829187396351572, "loss": 0.4815, "step": 1116 }, { "epoch": 0.6157662624035282, "grad_norm": 0.24641431868076324, "learning_rate": 0.00020820895522388058, "loss": 0.5079, "step": 1117 }, { "epoch": 0.6163175303197354, "grad_norm": 0.24148327112197876, "learning_rate": 0.0002081260364842454, "loss": 0.507, "step": 1118 }, { "epoch": 0.6168687982359427, "grad_norm": 0.23938195407390594, "learning_rate": 0.00020804311774461027, "loss": 0.4668, "step": 1119 }, { "epoch": 0.61742006615215, "grad_norm": 0.2462988644838333, "learning_rate": 0.0002079601990049751, "loss": 0.4941, "step": 1120 }, { "epoch": 0.6179713340683572, "grad_norm": 0.23903852701187134, "learning_rate": 0.00020787728026533996, "loss": 0.4684, "step": 1121 }, { "epoch": 0.6185226019845645, "grad_norm": 0.2402830719947815, "learning_rate": 0.00020779436152570477, "loss": 0.4705, "step": 1122 }, { "epoch": 0.6190738699007717, "grad_norm": 0.24639341235160828, "learning_rate": 0.00020771144278606963, "loss": 0.4874, "step": 1123 }, { "epoch": 0.619625137816979, "grad_norm": 0.22861522436141968, "learning_rate": 0.00020762852404643446, "loss": 0.4696, "step": 1124 }, { "epoch": 0.6201764057331863, "grad_norm": 0.23462949693202972, "learning_rate": 0.00020754560530679932, "loss": 0.509, "step": 1125 }, { "epoch": 0.6207276736493936, "grad_norm": 0.24041415750980377, "learning_rate": 0.00020746268656716415, "loss": 0.4792, "step": 1126 }, { "epoch": 0.6212789415656009, "grad_norm": 0.23339125514030457, "learning_rate": 0.000207379767827529, "loss": 0.4603, "step": 1127 }, { "epoch": 0.6218302094818081, "grad_norm": 0.23568972945213318, "learning_rate": 0.00020729684908789384, "loss": 0.4882, "step": 1128 }, { "epoch": 0.6223814773980154, "grad_norm": 0.24162200093269348, "learning_rate": 0.0002072139303482587, "loss": 0.4835, "step": 1129 }, { "epoch": 0.6229327453142227, "grad_norm": 0.24957728385925293, "learning_rate": 0.00020713101160862353, "loss": 0.4871, "step": 1130 }, { "epoch": 0.62348401323043, "grad_norm": 0.24710482358932495, "learning_rate": 0.0002070480928689884, "loss": 0.4604, "step": 1131 }, { "epoch": 0.6240352811466373, "grad_norm": 0.24623054265975952, "learning_rate": 0.0002069651741293532, "loss": 0.4986, "step": 1132 }, { "epoch": 0.6245865490628445, "grad_norm": 0.24791941046714783, "learning_rate": 0.00020688225538971806, "loss": 0.4665, "step": 1133 }, { "epoch": 0.6251378169790518, "grad_norm": 0.26239630579948425, "learning_rate": 0.0002067993366500829, "loss": 0.5193, "step": 1134 }, { "epoch": 0.6256890848952591, "grad_norm": 0.2580834925174713, "learning_rate": 0.00020671641791044775, "loss": 0.5162, "step": 1135 }, { "epoch": 0.6262403528114664, "grad_norm": 0.21768338978290558, "learning_rate": 0.00020663349917081258, "loss": 0.4626, "step": 1136 }, { "epoch": 0.6267916207276737, "grad_norm": 0.24815984070301056, "learning_rate": 0.00020655058043117744, "loss": 0.4943, "step": 1137 }, { "epoch": 0.6273428886438809, "grad_norm": 0.2349233627319336, "learning_rate": 0.00020646766169154227, "loss": 0.4819, "step": 1138 }, { "epoch": 0.6278941565600882, "grad_norm": 0.23029837012290955, "learning_rate": 0.00020638474295190713, "loss": 0.488, "step": 1139 }, { "epoch": 0.6284454244762955, "grad_norm": 0.23574088513851166, "learning_rate": 0.00020630182421227196, "loss": 0.4791, "step": 1140 }, { "epoch": 0.6289966923925028, "grad_norm": 0.23277179896831512, "learning_rate": 0.00020621890547263682, "loss": 0.5047, "step": 1141 }, { "epoch": 0.6295479603087101, "grad_norm": 0.2530352473258972, "learning_rate": 0.00020613598673300163, "loss": 0.5143, "step": 1142 }, { "epoch": 0.6300992282249173, "grad_norm": 0.2136935591697693, "learning_rate": 0.00020605306799336649, "loss": 0.4768, "step": 1143 }, { "epoch": 0.6306504961411246, "grad_norm": 0.23165372014045715, "learning_rate": 0.00020597014925373132, "loss": 0.4802, "step": 1144 }, { "epoch": 0.6312017640573319, "grad_norm": 0.23744627833366394, "learning_rate": 0.00020588723051409618, "loss": 0.4751, "step": 1145 }, { "epoch": 0.6317530319735392, "grad_norm": 0.2552582323551178, "learning_rate": 0.000205804311774461, "loss": 0.4949, "step": 1146 }, { "epoch": 0.6323042998897465, "grad_norm": 0.22193565964698792, "learning_rate": 0.00020572139303482587, "loss": 0.4629, "step": 1147 }, { "epoch": 0.6328555678059536, "grad_norm": 0.2249847799539566, "learning_rate": 0.0002056384742951907, "loss": 0.46, "step": 1148 }, { "epoch": 0.6334068357221609, "grad_norm": 0.234629824757576, "learning_rate": 0.00020555555555555556, "loss": 0.4792, "step": 1149 }, { "epoch": 0.6339581036383682, "grad_norm": 0.23007982969284058, "learning_rate": 0.0002054726368159204, "loss": 0.4857, "step": 1150 }, { "epoch": 0.6345093715545755, "grad_norm": 0.24549317359924316, "learning_rate": 0.00020538971807628525, "loss": 0.4697, "step": 1151 }, { "epoch": 0.6350606394707828, "grad_norm": 0.26415401697158813, "learning_rate": 0.00020530679933665005, "loss": 0.4858, "step": 1152 }, { "epoch": 0.63561190738699, "grad_norm": 0.20789586007595062, "learning_rate": 0.00020522388059701491, "loss": 0.4312, "step": 1153 }, { "epoch": 0.6361631753031973, "grad_norm": 0.23789043724536896, "learning_rate": 0.00020514096185737975, "loss": 0.4816, "step": 1154 }, { "epoch": 0.6367144432194046, "grad_norm": 0.23785383999347687, "learning_rate": 0.0002050580431177446, "loss": 0.4743, "step": 1155 }, { "epoch": 0.6372657111356119, "grad_norm": 0.26521044969558716, "learning_rate": 0.00020497512437810944, "loss": 0.4904, "step": 1156 }, { "epoch": 0.6378169790518192, "grad_norm": 0.25412556529045105, "learning_rate": 0.0002048922056384743, "loss": 0.5, "step": 1157 }, { "epoch": 0.6383682469680264, "grad_norm": 0.23178859055042267, "learning_rate": 0.00020480928689883913, "loss": 0.4791, "step": 1158 }, { "epoch": 0.6389195148842337, "grad_norm": 0.23838523030281067, "learning_rate": 0.00020472636815920393, "loss": 0.4539, "step": 1159 }, { "epoch": 0.639470782800441, "grad_norm": 0.23378612101078033, "learning_rate": 0.0002046434494195688, "loss": 0.492, "step": 1160 }, { "epoch": 0.6400220507166483, "grad_norm": 0.24227279424667358, "learning_rate": 0.00020456053067993362, "loss": 0.474, "step": 1161 }, { "epoch": 0.6405733186328556, "grad_norm": 0.23166267573833466, "learning_rate": 0.00020447761194029848, "loss": 0.4684, "step": 1162 }, { "epoch": 0.6411245865490628, "grad_norm": 0.23626738786697388, "learning_rate": 0.00020439469320066332, "loss": 0.4744, "step": 1163 }, { "epoch": 0.6416758544652701, "grad_norm": 0.2464771568775177, "learning_rate": 0.00020431177446102817, "loss": 0.47, "step": 1164 }, { "epoch": 0.6422271223814774, "grad_norm": 0.23458126187324524, "learning_rate": 0.000204228855721393, "loss": 0.4442, "step": 1165 }, { "epoch": 0.6427783902976847, "grad_norm": 0.23561522364616394, "learning_rate": 0.00020414593698175787, "loss": 0.4696, "step": 1166 }, { "epoch": 0.643329658213892, "grad_norm": 0.2327614575624466, "learning_rate": 0.0002040630182421227, "loss": 0.486, "step": 1167 }, { "epoch": 0.6438809261300992, "grad_norm": 0.22742946445941925, "learning_rate": 0.00020398009950248756, "loss": 0.4448, "step": 1168 }, { "epoch": 0.6444321940463065, "grad_norm": 0.22767378389835358, "learning_rate": 0.00020389718076285236, "loss": 0.4749, "step": 1169 }, { "epoch": 0.6449834619625138, "grad_norm": 0.21805496513843536, "learning_rate": 0.00020381426202321722, "loss": 0.4976, "step": 1170 }, { "epoch": 0.6455347298787211, "grad_norm": 0.23068863153457642, "learning_rate": 0.00020373134328358205, "loss": 0.4839, "step": 1171 }, { "epoch": 0.6460859977949284, "grad_norm": 0.24028991162776947, "learning_rate": 0.0002036484245439469, "loss": 0.4613, "step": 1172 }, { "epoch": 0.6466372657111357, "grad_norm": 0.2558547854423523, "learning_rate": 0.00020356550580431174, "loss": 0.4795, "step": 1173 }, { "epoch": 0.6471885336273429, "grad_norm": 0.2363976091146469, "learning_rate": 0.0002034825870646766, "loss": 0.4819, "step": 1174 }, { "epoch": 0.6477398015435502, "grad_norm": 0.23440702259540558, "learning_rate": 0.00020339966832504144, "loss": 0.4676, "step": 1175 }, { "epoch": 0.6482910694597575, "grad_norm": 0.23950831592082977, "learning_rate": 0.0002033167495854063, "loss": 0.4775, "step": 1176 }, { "epoch": 0.6488423373759648, "grad_norm": 0.23549869656562805, "learning_rate": 0.00020323383084577113, "loss": 0.471, "step": 1177 }, { "epoch": 0.649393605292172, "grad_norm": 0.2294132113456726, "learning_rate": 0.00020315091210613599, "loss": 0.4584, "step": 1178 }, { "epoch": 0.6499448732083792, "grad_norm": 0.2511732280254364, "learning_rate": 0.0002030679933665008, "loss": 0.4886, "step": 1179 }, { "epoch": 0.6504961411245865, "grad_norm": 0.23680317401885986, "learning_rate": 0.00020298507462686565, "loss": 0.5026, "step": 1180 }, { "epoch": 0.6510474090407938, "grad_norm": 0.24410556256771088, "learning_rate": 0.00020290215588723048, "loss": 0.4862, "step": 1181 }, { "epoch": 0.6515986769570011, "grad_norm": 0.24827975034713745, "learning_rate": 0.00020281923714759534, "loss": 0.4734, "step": 1182 }, { "epoch": 0.6521499448732084, "grad_norm": 0.24595201015472412, "learning_rate": 0.00020273631840796017, "loss": 0.4754, "step": 1183 }, { "epoch": 0.6527012127894156, "grad_norm": 0.24838019907474518, "learning_rate": 0.00020265339966832503, "loss": 0.4923, "step": 1184 }, { "epoch": 0.6532524807056229, "grad_norm": 0.23605762422084808, "learning_rate": 0.00020257048092868986, "loss": 0.477, "step": 1185 }, { "epoch": 0.6538037486218302, "grad_norm": 0.24502962827682495, "learning_rate": 0.00020248756218905472, "loss": 0.482, "step": 1186 }, { "epoch": 0.6543550165380375, "grad_norm": 0.24489161372184753, "learning_rate": 0.00020240464344941956, "loss": 0.4783, "step": 1187 }, { "epoch": 0.6549062844542448, "grad_norm": 0.236792653799057, "learning_rate": 0.00020232172470978441, "loss": 0.4899, "step": 1188 }, { "epoch": 0.655457552370452, "grad_norm": 0.2327335923910141, "learning_rate": 0.00020223880597014922, "loss": 0.4915, "step": 1189 }, { "epoch": 0.6560088202866593, "grad_norm": 0.21822991967201233, "learning_rate": 0.00020215588723051408, "loss": 0.472, "step": 1190 }, { "epoch": 0.6565600882028666, "grad_norm": 0.2524334788322449, "learning_rate": 0.0002020729684908789, "loss": 0.4942, "step": 1191 }, { "epoch": 0.6571113561190739, "grad_norm": 0.23585528135299683, "learning_rate": 0.00020199004975124377, "loss": 0.5011, "step": 1192 }, { "epoch": 0.6576626240352812, "grad_norm": 0.24948836863040924, "learning_rate": 0.0002019071310116086, "loss": 0.4831, "step": 1193 }, { "epoch": 0.6582138919514884, "grad_norm": 0.2369844615459442, "learning_rate": 0.00020182421227197346, "loss": 0.4923, "step": 1194 }, { "epoch": 0.6587651598676957, "grad_norm": 0.22455725073814392, "learning_rate": 0.0002017412935323383, "loss": 0.4699, "step": 1195 }, { "epoch": 0.659316427783903, "grad_norm": 0.22049696743488312, "learning_rate": 0.00020165837479270315, "loss": 0.4569, "step": 1196 }, { "epoch": 0.6598676957001103, "grad_norm": 0.21964412927627563, "learning_rate": 0.00020157545605306798, "loss": 0.4818, "step": 1197 }, { "epoch": 0.6604189636163176, "grad_norm": 0.24084921181201935, "learning_rate": 0.00020149253731343284, "loss": 0.4834, "step": 1198 }, { "epoch": 0.6609702315325248, "grad_norm": 0.2169031798839569, "learning_rate": 0.00020140961857379765, "loss": 0.458, "step": 1199 }, { "epoch": 0.6615214994487321, "grad_norm": 0.2437864989042282, "learning_rate": 0.0002013266998341625, "loss": 0.4998, "step": 1200 }, { "epoch": 0.6620727673649394, "grad_norm": 0.2373666912317276, "learning_rate": 0.00020124378109452734, "loss": 0.4593, "step": 1201 }, { "epoch": 0.6626240352811467, "grad_norm": 0.2300565093755722, "learning_rate": 0.00020116086235489217, "loss": 0.4698, "step": 1202 }, { "epoch": 0.663175303197354, "grad_norm": 0.2500588595867157, "learning_rate": 0.00020107794361525703, "loss": 0.4847, "step": 1203 }, { "epoch": 0.6637265711135611, "grad_norm": 0.24038562178611755, "learning_rate": 0.00020099502487562186, "loss": 0.4746, "step": 1204 }, { "epoch": 0.6642778390297684, "grad_norm": 0.2691898047924042, "learning_rate": 0.00020091210613598672, "loss": 0.4547, "step": 1205 }, { "epoch": 0.6648291069459757, "grad_norm": 0.23530587553977966, "learning_rate": 0.00020082918739635155, "loss": 0.4618, "step": 1206 }, { "epoch": 0.665380374862183, "grad_norm": 0.24838554859161377, "learning_rate": 0.0002007462686567164, "loss": 0.5093, "step": 1207 }, { "epoch": 0.6659316427783903, "grad_norm": 0.24996088445186615, "learning_rate": 0.00020066334991708122, "loss": 0.4703, "step": 1208 }, { "epoch": 0.6664829106945975, "grad_norm": 0.2432130128145218, "learning_rate": 0.00020058043117744608, "loss": 0.4651, "step": 1209 }, { "epoch": 0.6670341786108048, "grad_norm": 0.2394338846206665, "learning_rate": 0.0002004975124378109, "loss": 0.4679, "step": 1210 }, { "epoch": 0.6675854465270121, "grad_norm": 0.23440587520599365, "learning_rate": 0.00020041459369817577, "loss": 0.4552, "step": 1211 }, { "epoch": 0.6681367144432194, "grad_norm": 0.25409042835235596, "learning_rate": 0.0002003316749585406, "loss": 0.4879, "step": 1212 }, { "epoch": 0.6686879823594267, "grad_norm": 0.24675914645195007, "learning_rate": 0.00020024875621890546, "loss": 0.4935, "step": 1213 }, { "epoch": 0.6692392502756339, "grad_norm": 0.2398385852575302, "learning_rate": 0.0002001658374792703, "loss": 0.4588, "step": 1214 }, { "epoch": 0.6697905181918412, "grad_norm": 0.23646225035190582, "learning_rate": 0.00020008291873963515, "loss": 0.486, "step": 1215 }, { "epoch": 0.6703417861080485, "grad_norm": 0.2433752566576004, "learning_rate": 0.00019999999999999998, "loss": 0.5, "step": 1216 }, { "epoch": 0.6708930540242558, "grad_norm": 0.22759981453418732, "learning_rate": 0.00019991708126036484, "loss": 0.482, "step": 1217 }, { "epoch": 0.6714443219404631, "grad_norm": 0.2414034903049469, "learning_rate": 0.00019983416252072965, "loss": 0.4754, "step": 1218 }, { "epoch": 0.6719955898566703, "grad_norm": 0.23548895120620728, "learning_rate": 0.0001997512437810945, "loss": 0.4793, "step": 1219 }, { "epoch": 0.6725468577728776, "grad_norm": 0.22510850429534912, "learning_rate": 0.00019966832504145934, "loss": 0.474, "step": 1220 }, { "epoch": 0.6730981256890849, "grad_norm": 0.21878324449062347, "learning_rate": 0.0001995854063018242, "loss": 0.4349, "step": 1221 }, { "epoch": 0.6736493936052922, "grad_norm": 0.234661266207695, "learning_rate": 0.00019950248756218903, "loss": 0.4602, "step": 1222 }, { "epoch": 0.6742006615214995, "grad_norm": 0.24233828485012054, "learning_rate": 0.0001994195688225539, "loss": 0.4932, "step": 1223 }, { "epoch": 0.6747519294377067, "grad_norm": 0.22866547107696533, "learning_rate": 0.00019933665008291872, "loss": 0.4697, "step": 1224 }, { "epoch": 0.675303197353914, "grad_norm": 0.2325911670923233, "learning_rate": 0.00019925373134328358, "loss": 0.4631, "step": 1225 }, { "epoch": 0.6758544652701213, "grad_norm": 0.22702381014823914, "learning_rate": 0.0001991708126036484, "loss": 0.4631, "step": 1226 }, { "epoch": 0.6764057331863286, "grad_norm": 0.23354612290859222, "learning_rate": 0.00019908789386401327, "loss": 0.4687, "step": 1227 }, { "epoch": 0.6769570011025359, "grad_norm": 0.2386290431022644, "learning_rate": 0.00019900497512437808, "loss": 0.4777, "step": 1228 }, { "epoch": 0.6775082690187431, "grad_norm": 0.24729053676128387, "learning_rate": 0.00019892205638474293, "loss": 0.4785, "step": 1229 }, { "epoch": 0.6780595369349504, "grad_norm": 0.2109660655260086, "learning_rate": 0.00019883913764510777, "loss": 0.464, "step": 1230 }, { "epoch": 0.6786108048511577, "grad_norm": 0.24349510669708252, "learning_rate": 0.00019875621890547263, "loss": 0.4972, "step": 1231 }, { "epoch": 0.679162072767365, "grad_norm": 0.236436665058136, "learning_rate": 0.00019867330016583746, "loss": 0.4655, "step": 1232 }, { "epoch": 0.6797133406835723, "grad_norm": 0.22133763134479523, "learning_rate": 0.00019859038142620232, "loss": 0.4856, "step": 1233 }, { "epoch": 0.6802646085997794, "grad_norm": 0.23461799323558807, "learning_rate": 0.00019850746268656715, "loss": 0.4974, "step": 1234 }, { "epoch": 0.6808158765159867, "grad_norm": 0.23802213370800018, "learning_rate": 0.000198424543946932, "loss": 0.4634, "step": 1235 }, { "epoch": 0.681367144432194, "grad_norm": 0.23866182565689087, "learning_rate": 0.00019834162520729684, "loss": 0.4962, "step": 1236 }, { "epoch": 0.6819184123484013, "grad_norm": 0.20461198687553406, "learning_rate": 0.0001982587064676617, "loss": 0.479, "step": 1237 }, { "epoch": 0.6824696802646086, "grad_norm": 0.2442476749420166, "learning_rate": 0.0001981757877280265, "loss": 0.5007, "step": 1238 }, { "epoch": 0.6830209481808158, "grad_norm": 0.2257671356201172, "learning_rate": 0.00019809286898839136, "loss": 0.4899, "step": 1239 }, { "epoch": 0.6835722160970231, "grad_norm": 0.214102640748024, "learning_rate": 0.0001980099502487562, "loss": 0.4536, "step": 1240 }, { "epoch": 0.6841234840132304, "grad_norm": 0.21543948352336884, "learning_rate": 0.00019792703150912105, "loss": 0.4811, "step": 1241 }, { "epoch": 0.6846747519294377, "grad_norm": 0.25430455803871155, "learning_rate": 0.00019784411276948589, "loss": 0.486, "step": 1242 }, { "epoch": 0.685226019845645, "grad_norm": 0.2656538486480713, "learning_rate": 0.00019776119402985075, "loss": 0.462, "step": 1243 }, { "epoch": 0.6857772877618522, "grad_norm": 0.23967699706554413, "learning_rate": 0.00019767827529021558, "loss": 0.5004, "step": 1244 }, { "epoch": 0.6863285556780595, "grad_norm": 0.22987446188926697, "learning_rate": 0.00019759535655058044, "loss": 0.4804, "step": 1245 }, { "epoch": 0.6868798235942668, "grad_norm": 0.20953255891799927, "learning_rate": 0.00019751243781094527, "loss": 0.4793, "step": 1246 }, { "epoch": 0.6874310915104741, "grad_norm": 0.256028413772583, "learning_rate": 0.00019742951907131007, "loss": 0.4881, "step": 1247 }, { "epoch": 0.6879823594266814, "grad_norm": 0.23885922133922577, "learning_rate": 0.00019734660033167493, "loss": 0.508, "step": 1248 }, { "epoch": 0.6885336273428887, "grad_norm": 0.24736814200878143, "learning_rate": 0.00019726368159203976, "loss": 0.4935, "step": 1249 }, { "epoch": 0.6890848952590959, "grad_norm": 0.23237743973731995, "learning_rate": 0.00019718076285240462, "loss": 0.4775, "step": 1250 }, { "epoch": 0.6896361631753032, "grad_norm": 0.24340516328811646, "learning_rate": 0.00019709784411276946, "loss": 0.4987, "step": 1251 }, { "epoch": 0.6901874310915105, "grad_norm": 0.22015541791915894, "learning_rate": 0.00019701492537313432, "loss": 0.4524, "step": 1252 }, { "epoch": 0.6907386990077178, "grad_norm": 0.25280436873435974, "learning_rate": 0.00019693200663349915, "loss": 0.4953, "step": 1253 }, { "epoch": 0.6912899669239251, "grad_norm": 0.22572125494480133, "learning_rate": 0.000196849087893864, "loss": 0.4692, "step": 1254 }, { "epoch": 0.6918412348401323, "grad_norm": 0.2326386719942093, "learning_rate": 0.00019676616915422884, "loss": 0.475, "step": 1255 }, { "epoch": 0.6923925027563396, "grad_norm": 0.2248145192861557, "learning_rate": 0.0001966832504145937, "loss": 0.4463, "step": 1256 }, { "epoch": 0.6929437706725469, "grad_norm": 0.236514613032341, "learning_rate": 0.0001966003316749585, "loss": 0.4502, "step": 1257 }, { "epoch": 0.6934950385887542, "grad_norm": 0.2295265942811966, "learning_rate": 0.00019651741293532336, "loss": 0.4559, "step": 1258 }, { "epoch": 0.6940463065049615, "grad_norm": 0.24026772379875183, "learning_rate": 0.0001964344941956882, "loss": 0.4642, "step": 1259 }, { "epoch": 0.6945975744211687, "grad_norm": 0.2558375298976898, "learning_rate": 0.00019635157545605305, "loss": 0.4864, "step": 1260 }, { "epoch": 0.695148842337376, "grad_norm": 0.2334502935409546, "learning_rate": 0.00019626865671641788, "loss": 0.47, "step": 1261 }, { "epoch": 0.6957001102535832, "grad_norm": 0.23098182678222656, "learning_rate": 0.00019618573797678274, "loss": 0.4786, "step": 1262 }, { "epoch": 0.6962513781697905, "grad_norm": 0.22288668155670166, "learning_rate": 0.00019610281923714758, "loss": 0.4638, "step": 1263 }, { "epoch": 0.6968026460859978, "grad_norm": 0.23454713821411133, "learning_rate": 0.00019601990049751244, "loss": 0.4661, "step": 1264 }, { "epoch": 0.697353914002205, "grad_norm": 0.22980453073978424, "learning_rate": 0.00019593698175787727, "loss": 0.4681, "step": 1265 }, { "epoch": 0.6979051819184123, "grad_norm": 0.20731012523174286, "learning_rate": 0.00019585406301824213, "loss": 0.4439, "step": 1266 }, { "epoch": 0.6984564498346196, "grad_norm": 0.22292488813400269, "learning_rate": 0.00019577114427860693, "loss": 0.4663, "step": 1267 }, { "epoch": 0.6990077177508269, "grad_norm": 0.22497937083244324, "learning_rate": 0.0001956882255389718, "loss": 0.4751, "step": 1268 }, { "epoch": 0.6995589856670342, "grad_norm": 0.2342757284641266, "learning_rate": 0.00019560530679933662, "loss": 0.4544, "step": 1269 }, { "epoch": 0.7001102535832414, "grad_norm": 0.23075568675994873, "learning_rate": 0.00019552238805970148, "loss": 0.4634, "step": 1270 }, { "epoch": 0.7006615214994487, "grad_norm": 0.2278735637664795, "learning_rate": 0.0001954394693200663, "loss": 0.4895, "step": 1271 }, { "epoch": 0.701212789415656, "grad_norm": 0.25607171654701233, "learning_rate": 0.00019535655058043117, "loss": 0.49, "step": 1272 }, { "epoch": 0.7017640573318633, "grad_norm": 0.2315627932548523, "learning_rate": 0.000195273631840796, "loss": 0.4522, "step": 1273 }, { "epoch": 0.7023153252480706, "grad_norm": 0.2047976851463318, "learning_rate": 0.00019519071310116086, "loss": 0.4356, "step": 1274 }, { "epoch": 0.7028665931642778, "grad_norm": 0.24180057644844055, "learning_rate": 0.00019510779436152567, "loss": 0.4749, "step": 1275 }, { "epoch": 0.7034178610804851, "grad_norm": 0.2599826753139496, "learning_rate": 0.00019502487562189055, "loss": 0.5082, "step": 1276 }, { "epoch": 0.7039691289966924, "grad_norm": 0.23944783210754395, "learning_rate": 0.00019494195688225536, "loss": 0.4828, "step": 1277 }, { "epoch": 0.7045203969128997, "grad_norm": 0.21794094145298004, "learning_rate": 0.00019485903814262022, "loss": 0.4691, "step": 1278 }, { "epoch": 0.705071664829107, "grad_norm": 0.23379597067832947, "learning_rate": 0.00019477611940298505, "loss": 0.486, "step": 1279 }, { "epoch": 0.7056229327453142, "grad_norm": 0.21778427064418793, "learning_rate": 0.0001946932006633499, "loss": 0.4483, "step": 1280 }, { "epoch": 0.7061742006615215, "grad_norm": 0.23941390216350555, "learning_rate": 0.00019461028192371474, "loss": 0.4885, "step": 1281 }, { "epoch": 0.7067254685777288, "grad_norm": 0.23993995785713196, "learning_rate": 0.0001945273631840796, "loss": 0.5098, "step": 1282 }, { "epoch": 0.7072767364939361, "grad_norm": 0.2523173391819, "learning_rate": 0.00019444444444444443, "loss": 0.4752, "step": 1283 }, { "epoch": 0.7078280044101434, "grad_norm": 0.23337773978710175, "learning_rate": 0.0001943615257048093, "loss": 0.5198, "step": 1284 }, { "epoch": 0.7083792723263506, "grad_norm": 0.24418905377388, "learning_rate": 0.0001942786069651741, "loss": 0.519, "step": 1285 }, { "epoch": 0.7089305402425579, "grad_norm": 0.24214893579483032, "learning_rate": 0.00019419568822553896, "loss": 0.4625, "step": 1286 }, { "epoch": 0.7094818081587652, "grad_norm": 0.25616276264190674, "learning_rate": 0.0001941127694859038, "loss": 0.483, "step": 1287 }, { "epoch": 0.7100330760749725, "grad_norm": 0.2434643656015396, "learning_rate": 0.00019402985074626865, "loss": 0.4834, "step": 1288 }, { "epoch": 0.7105843439911798, "grad_norm": 0.23342913389205933, "learning_rate": 0.00019394693200663348, "loss": 0.4577, "step": 1289 }, { "epoch": 0.7111356119073869, "grad_norm": 0.23564305901527405, "learning_rate": 0.0001938640132669983, "loss": 0.4731, "step": 1290 }, { "epoch": 0.7116868798235942, "grad_norm": 0.2814309000968933, "learning_rate": 0.00019378109452736317, "loss": 0.4845, "step": 1291 }, { "epoch": 0.7122381477398015, "grad_norm": 0.2305363267660141, "learning_rate": 0.000193698175787728, "loss": 0.4577, "step": 1292 }, { "epoch": 0.7127894156560088, "grad_norm": 0.2413802593946457, "learning_rate": 0.00019361525704809286, "loss": 0.5005, "step": 1293 }, { "epoch": 0.7133406835722161, "grad_norm": 0.22398939728736877, "learning_rate": 0.00019353233830845767, "loss": 0.4645, "step": 1294 }, { "epoch": 0.7138919514884233, "grad_norm": 0.2510089874267578, "learning_rate": 0.00019344941956882253, "loss": 0.4721, "step": 1295 }, { "epoch": 0.7144432194046306, "grad_norm": 0.23676623404026031, "learning_rate": 0.00019336650082918736, "loss": 0.5126, "step": 1296 }, { "epoch": 0.7149944873208379, "grad_norm": 0.22751228511333466, "learning_rate": 0.00019328358208955222, "loss": 0.4403, "step": 1297 }, { "epoch": 0.7155457552370452, "grad_norm": 0.23468491435050964, "learning_rate": 0.00019320066334991705, "loss": 0.4697, "step": 1298 }, { "epoch": 0.7160970231532525, "grad_norm": 0.2132336050271988, "learning_rate": 0.0001931177446102819, "loss": 0.4468, "step": 1299 }, { "epoch": 0.7166482910694597, "grad_norm": 0.22579894959926605, "learning_rate": 0.00019303482587064674, "loss": 0.458, "step": 1300 }, { "epoch": 0.717199558985667, "grad_norm": 0.22772036492824554, "learning_rate": 0.0001929519071310116, "loss": 0.457, "step": 1301 }, { "epoch": 0.7177508269018743, "grad_norm": 0.2290082722902298, "learning_rate": 0.00019286898839137643, "loss": 0.4771, "step": 1302 }, { "epoch": 0.7183020948180816, "grad_norm": 0.2190980762243271, "learning_rate": 0.0001927860696517413, "loss": 0.4754, "step": 1303 }, { "epoch": 0.7188533627342889, "grad_norm": 0.2228933423757553, "learning_rate": 0.0001927031509121061, "loss": 0.476, "step": 1304 }, { "epoch": 0.7194046306504961, "grad_norm": 0.23896026611328125, "learning_rate": 0.00019262023217247096, "loss": 0.5008, "step": 1305 }, { "epoch": 0.7199558985667034, "grad_norm": 0.222875714302063, "learning_rate": 0.0001925373134328358, "loss": 0.4526, "step": 1306 }, { "epoch": 0.7205071664829107, "grad_norm": 0.22457565367221832, "learning_rate": 0.00019245439469320065, "loss": 0.5019, "step": 1307 }, { "epoch": 0.721058434399118, "grad_norm": 0.24464376270771027, "learning_rate": 0.00019237147595356548, "loss": 0.4896, "step": 1308 }, { "epoch": 0.7216097023153253, "grad_norm": 0.22952450811862946, "learning_rate": 0.00019228855721393034, "loss": 0.4751, "step": 1309 }, { "epoch": 0.7221609702315325, "grad_norm": 0.22557076811790466, "learning_rate": 0.00019220563847429517, "loss": 0.4859, "step": 1310 }, { "epoch": 0.7227122381477398, "grad_norm": 0.2599719762802124, "learning_rate": 0.00019212271973466003, "loss": 0.4871, "step": 1311 }, { "epoch": 0.7232635060639471, "grad_norm": 0.2541002333164215, "learning_rate": 0.00019203980099502486, "loss": 0.5076, "step": 1312 }, { "epoch": 0.7238147739801544, "grad_norm": 0.234733447432518, "learning_rate": 0.00019195688225538972, "loss": 0.471, "step": 1313 }, { "epoch": 0.7243660418963617, "grad_norm": 0.23307423293590546, "learning_rate": 0.00019187396351575452, "loss": 0.4758, "step": 1314 }, { "epoch": 0.7249173098125689, "grad_norm": 0.22905585169792175, "learning_rate": 0.00019179104477611938, "loss": 0.4674, "step": 1315 }, { "epoch": 0.7254685777287762, "grad_norm": 0.24311380088329315, "learning_rate": 0.00019170812603648422, "loss": 0.4838, "step": 1316 }, { "epoch": 0.7260198456449835, "grad_norm": 0.24221283197402954, "learning_rate": 0.00019162520729684907, "loss": 0.4671, "step": 1317 }, { "epoch": 0.7265711135611908, "grad_norm": 0.2364143580198288, "learning_rate": 0.0001915422885572139, "loss": 0.4496, "step": 1318 }, { "epoch": 0.727122381477398, "grad_norm": 0.2382567673921585, "learning_rate": 0.00019145936981757877, "loss": 0.4516, "step": 1319 }, { "epoch": 0.7276736493936052, "grad_norm": 0.281539648771286, "learning_rate": 0.0001913764510779436, "loss": 0.4742, "step": 1320 }, { "epoch": 0.7282249173098125, "grad_norm": 0.2738378345966339, "learning_rate": 0.00019129353233830846, "loss": 0.5158, "step": 1321 }, { "epoch": 0.7287761852260198, "grad_norm": 0.23668839037418365, "learning_rate": 0.0001912106135986733, "loss": 0.4907, "step": 1322 }, { "epoch": 0.7293274531422271, "grad_norm": 0.2443835288286209, "learning_rate": 0.00019112769485903815, "loss": 0.4887, "step": 1323 }, { "epoch": 0.7298787210584344, "grad_norm": 0.2538048028945923, "learning_rate": 0.00019104477611940295, "loss": 0.4413, "step": 1324 }, { "epoch": 0.7304299889746417, "grad_norm": 0.24266113340854645, "learning_rate": 0.0001909618573797678, "loss": 0.4618, "step": 1325 }, { "epoch": 0.7309812568908489, "grad_norm": 0.2522546648979187, "learning_rate": 0.00019087893864013264, "loss": 0.493, "step": 1326 }, { "epoch": 0.7315325248070562, "grad_norm": 0.24361646175384521, "learning_rate": 0.0001907960199004975, "loss": 0.4552, "step": 1327 }, { "epoch": 0.7320837927232635, "grad_norm": 0.24726730585098267, "learning_rate": 0.00019071310116086234, "loss": 0.4899, "step": 1328 }, { "epoch": 0.7326350606394708, "grad_norm": 0.23533383011817932, "learning_rate": 0.0001906301824212272, "loss": 0.4674, "step": 1329 }, { "epoch": 0.7331863285556781, "grad_norm": 0.23652805387973785, "learning_rate": 0.00019054726368159203, "loss": 0.4734, "step": 1330 }, { "epoch": 0.7337375964718853, "grad_norm": 0.24334965646266937, "learning_rate": 0.00019046434494195689, "loss": 0.4897, "step": 1331 }, { "epoch": 0.7342888643880926, "grad_norm": 0.2077738642692566, "learning_rate": 0.00019038142620232172, "loss": 0.4516, "step": 1332 }, { "epoch": 0.7348401323042999, "grad_norm": 0.23306086659431458, "learning_rate": 0.00019029850746268658, "loss": 0.5076, "step": 1333 }, { "epoch": 0.7353914002205072, "grad_norm": 0.2449159324169159, "learning_rate": 0.00019021558872305138, "loss": 0.4618, "step": 1334 }, { "epoch": 0.7359426681367145, "grad_norm": 0.24829532206058502, "learning_rate": 0.00019013266998341621, "loss": 0.4614, "step": 1335 }, { "epoch": 0.7364939360529217, "grad_norm": 0.23648925125598907, "learning_rate": 0.00019004975124378107, "loss": 0.4616, "step": 1336 }, { "epoch": 0.737045203969129, "grad_norm": 0.23551128804683685, "learning_rate": 0.0001899668325041459, "loss": 0.4724, "step": 1337 }, { "epoch": 0.7375964718853363, "grad_norm": 0.23878498375415802, "learning_rate": 0.00018988391376451076, "loss": 0.4639, "step": 1338 }, { "epoch": 0.7381477398015436, "grad_norm": 0.24612358212471008, "learning_rate": 0.0001898009950248756, "loss": 0.4757, "step": 1339 }, { "epoch": 0.7386990077177509, "grad_norm": 0.2288011610507965, "learning_rate": 0.00018971807628524046, "loss": 0.4598, "step": 1340 }, { "epoch": 0.7392502756339581, "grad_norm": 0.2329450398683548, "learning_rate": 0.0001896351575456053, "loss": 0.4884, "step": 1341 }, { "epoch": 0.7398015435501654, "grad_norm": 0.23273812234401703, "learning_rate": 0.00018955223880597015, "loss": 0.4834, "step": 1342 }, { "epoch": 0.7403528114663727, "grad_norm": 0.24095992743968964, "learning_rate": 0.00018946932006633495, "loss": 0.4352, "step": 1343 }, { "epoch": 0.74090407938258, "grad_norm": 0.24149319529533386, "learning_rate": 0.0001893864013266998, "loss": 0.4675, "step": 1344 }, { "epoch": 0.7414553472987873, "grad_norm": 0.24013857543468475, "learning_rate": 0.00018930348258706464, "loss": 0.4879, "step": 1345 }, { "epoch": 0.7420066152149944, "grad_norm": 0.23142081499099731, "learning_rate": 0.0001892205638474295, "loss": 0.4607, "step": 1346 }, { "epoch": 0.7425578831312017, "grad_norm": 0.2619989514350891, "learning_rate": 0.00018913764510779433, "loss": 0.4784, "step": 1347 }, { "epoch": 0.743109151047409, "grad_norm": 0.23706799745559692, "learning_rate": 0.0001890547263681592, "loss": 0.4716, "step": 1348 }, { "epoch": 0.7436604189636163, "grad_norm": 0.25641632080078125, "learning_rate": 0.00018897180762852403, "loss": 0.4951, "step": 1349 }, { "epoch": 0.7442116868798236, "grad_norm": 0.225026935338974, "learning_rate": 0.00018888888888888888, "loss": 0.4742, "step": 1350 }, { "epoch": 0.7447629547960308, "grad_norm": 0.21225763857364655, "learning_rate": 0.00018880597014925372, "loss": 0.4484, "step": 1351 }, { "epoch": 0.7453142227122381, "grad_norm": 0.2503174841403961, "learning_rate": 0.00018872305140961858, "loss": 0.4832, "step": 1352 }, { "epoch": 0.7458654906284454, "grad_norm": 0.25594860315322876, "learning_rate": 0.00018864013266998338, "loss": 0.4952, "step": 1353 }, { "epoch": 0.7464167585446527, "grad_norm": 0.23849812150001526, "learning_rate": 0.00018855721393034824, "loss": 0.4927, "step": 1354 }, { "epoch": 0.74696802646086, "grad_norm": 0.22114640474319458, "learning_rate": 0.00018847429519071307, "loss": 0.4475, "step": 1355 }, { "epoch": 0.7475192943770672, "grad_norm": 0.23791830241680145, "learning_rate": 0.00018839137645107793, "loss": 0.4846, "step": 1356 }, { "epoch": 0.7480705622932745, "grad_norm": 0.2577480673789978, "learning_rate": 0.00018830845771144276, "loss": 0.4541, "step": 1357 }, { "epoch": 0.7486218302094818, "grad_norm": 0.2754758596420288, "learning_rate": 0.00018822553897180762, "loss": 0.4958, "step": 1358 }, { "epoch": 0.7491730981256891, "grad_norm": 0.2309567779302597, "learning_rate": 0.00018814262023217245, "loss": 0.4671, "step": 1359 }, { "epoch": 0.7497243660418964, "grad_norm": 0.24164016544818878, "learning_rate": 0.0001880597014925373, "loss": 0.4712, "step": 1360 }, { "epoch": 0.7502756339581036, "grad_norm": 0.21853327751159668, "learning_rate": 0.00018797678275290215, "loss": 0.503, "step": 1361 }, { "epoch": 0.7508269018743109, "grad_norm": 0.22078783810138702, "learning_rate": 0.000187893864013267, "loss": 0.4654, "step": 1362 }, { "epoch": 0.7513781697905182, "grad_norm": 0.23638005554676056, "learning_rate": 0.0001878109452736318, "loss": 0.4742, "step": 1363 }, { "epoch": 0.7519294377067255, "grad_norm": 0.23174162209033966, "learning_rate": 0.00018772802653399667, "loss": 0.4599, "step": 1364 }, { "epoch": 0.7524807056229328, "grad_norm": 0.23956626653671265, "learning_rate": 0.0001876451077943615, "loss": 0.477, "step": 1365 }, { "epoch": 0.75303197353914, "grad_norm": 0.23747730255126953, "learning_rate": 0.00018756218905472636, "loss": 0.46, "step": 1366 }, { "epoch": 0.7535832414553473, "grad_norm": 0.22467990219593048, "learning_rate": 0.0001874792703150912, "loss": 0.4502, "step": 1367 }, { "epoch": 0.7541345093715546, "grad_norm": 0.230741485953331, "learning_rate": 0.00018739635157545605, "loss": 0.4718, "step": 1368 }, { "epoch": 0.7546857772877619, "grad_norm": 0.24028630554676056, "learning_rate": 0.00018731343283582088, "loss": 0.4619, "step": 1369 }, { "epoch": 0.7552370452039692, "grad_norm": 0.24253641068935394, "learning_rate": 0.00018723051409618574, "loss": 0.4817, "step": 1370 }, { "epoch": 0.7557883131201764, "grad_norm": 0.22565878927707672, "learning_rate": 0.00018714759535655057, "loss": 0.4663, "step": 1371 }, { "epoch": 0.7563395810363837, "grad_norm": 0.23143254220485687, "learning_rate": 0.00018706467661691543, "loss": 0.4536, "step": 1372 }, { "epoch": 0.756890848952591, "grad_norm": 0.23320366442203522, "learning_rate": 0.00018698175787728024, "loss": 0.4304, "step": 1373 }, { "epoch": 0.7574421168687983, "grad_norm": 0.23350325226783752, "learning_rate": 0.0001868988391376451, "loss": 0.4649, "step": 1374 }, { "epoch": 0.7579933847850056, "grad_norm": 0.2501453757286072, "learning_rate": 0.00018681592039800993, "loss": 0.4696, "step": 1375 }, { "epoch": 0.7585446527012127, "grad_norm": 0.22919632494449615, "learning_rate": 0.0001867330016583748, "loss": 0.4751, "step": 1376 }, { "epoch": 0.75909592061742, "grad_norm": 0.2562139332294464, "learning_rate": 0.00018665008291873962, "loss": 0.49, "step": 1377 }, { "epoch": 0.7596471885336273, "grad_norm": 0.2472946047782898, "learning_rate": 0.00018656716417910445, "loss": 0.4873, "step": 1378 }, { "epoch": 0.7601984564498346, "grad_norm": 0.22273144125938416, "learning_rate": 0.0001864842454394693, "loss": 0.4569, "step": 1379 }, { "epoch": 0.7607497243660419, "grad_norm": 0.24337974190711975, "learning_rate": 0.00018640132669983414, "loss": 0.4717, "step": 1380 }, { "epoch": 0.7613009922822491, "grad_norm": 0.23919668793678284, "learning_rate": 0.000186318407960199, "loss": 0.4966, "step": 1381 }, { "epoch": 0.7618522601984564, "grad_norm": 0.25102800130844116, "learning_rate": 0.0001862354892205638, "loss": 0.4551, "step": 1382 }, { "epoch": 0.7624035281146637, "grad_norm": 0.22430755198001862, "learning_rate": 0.00018615257048092867, "loss": 0.4628, "step": 1383 }, { "epoch": 0.762954796030871, "grad_norm": 0.2542060613632202, "learning_rate": 0.0001860696517412935, "loss": 0.474, "step": 1384 }, { "epoch": 0.7635060639470783, "grad_norm": 0.24267995357513428, "learning_rate": 0.00018598673300165836, "loss": 0.4709, "step": 1385 }, { "epoch": 0.7640573318632855, "grad_norm": 0.24730850756168365, "learning_rate": 0.0001859038142620232, "loss": 0.4703, "step": 1386 }, { "epoch": 0.7646085997794928, "grad_norm": 0.22491230070590973, "learning_rate": 0.00018582089552238805, "loss": 0.4572, "step": 1387 }, { "epoch": 0.7651598676957001, "grad_norm": 0.25823476910591125, "learning_rate": 0.00018573797678275288, "loss": 0.4911, "step": 1388 }, { "epoch": 0.7657111356119074, "grad_norm": 0.2442496418952942, "learning_rate": 0.00018565505804311774, "loss": 0.4514, "step": 1389 }, { "epoch": 0.7662624035281147, "grad_norm": 0.22842232882976532, "learning_rate": 0.00018557213930348257, "loss": 0.459, "step": 1390 }, { "epoch": 0.7668136714443219, "grad_norm": 0.24691414833068848, "learning_rate": 0.00018548922056384743, "loss": 0.4958, "step": 1391 }, { "epoch": 0.7673649393605292, "grad_norm": 0.22024598717689514, "learning_rate": 0.00018540630182421224, "loss": 0.4621, "step": 1392 }, { "epoch": 0.7679162072767365, "grad_norm": 0.24100075662136078, "learning_rate": 0.0001853233830845771, "loss": 0.486, "step": 1393 }, { "epoch": 0.7684674751929438, "grad_norm": 0.2123764157295227, "learning_rate": 0.00018524046434494193, "loss": 0.4575, "step": 1394 }, { "epoch": 0.7690187431091511, "grad_norm": 0.239015132188797, "learning_rate": 0.0001851575456053068, "loss": 0.4777, "step": 1395 }, { "epoch": 0.7695700110253583, "grad_norm": 0.22858455777168274, "learning_rate": 0.00018507462686567162, "loss": 0.438, "step": 1396 }, { "epoch": 0.7701212789415656, "grad_norm": 0.23843710124492645, "learning_rate": 0.00018499170812603648, "loss": 0.456, "step": 1397 }, { "epoch": 0.7706725468577729, "grad_norm": 0.23079745471477509, "learning_rate": 0.0001849087893864013, "loss": 0.4648, "step": 1398 }, { "epoch": 0.7712238147739802, "grad_norm": 0.23103727400302887, "learning_rate": 0.00018482587064676617, "loss": 0.4589, "step": 1399 }, { "epoch": 0.7717750826901875, "grad_norm": 0.2261170893907547, "learning_rate": 0.00018474295190713097, "loss": 0.4734, "step": 1400 }, { "epoch": 0.7723263506063948, "grad_norm": 0.2249629944562912, "learning_rate": 0.00018466003316749586, "loss": 0.4542, "step": 1401 }, { "epoch": 0.772877618522602, "grad_norm": 0.2366032898426056, "learning_rate": 0.00018457711442786067, "loss": 0.458, "step": 1402 }, { "epoch": 0.7734288864388092, "grad_norm": 0.2598401606082916, "learning_rate": 0.00018449419568822552, "loss": 0.4557, "step": 1403 }, { "epoch": 0.7739801543550165, "grad_norm": 0.23570790886878967, "learning_rate": 0.00018441127694859036, "loss": 0.4656, "step": 1404 }, { "epoch": 0.7745314222712238, "grad_norm": 0.23591196537017822, "learning_rate": 0.00018432835820895522, "loss": 0.4689, "step": 1405 }, { "epoch": 0.7750826901874311, "grad_norm": 0.2540998160839081, "learning_rate": 0.00018424543946932005, "loss": 0.4977, "step": 1406 }, { "epoch": 0.7756339581036383, "grad_norm": 0.22981034219264984, "learning_rate": 0.0001841625207296849, "loss": 0.4718, "step": 1407 }, { "epoch": 0.7761852260198456, "grad_norm": 0.2221202403306961, "learning_rate": 0.00018407960199004974, "loss": 0.4784, "step": 1408 }, { "epoch": 0.7767364939360529, "grad_norm": 0.2501460909843445, "learning_rate": 0.0001839966832504146, "loss": 0.4806, "step": 1409 }, { "epoch": 0.7772877618522602, "grad_norm": 0.2174586057662964, "learning_rate": 0.0001839137645107794, "loss": 0.4833, "step": 1410 }, { "epoch": 0.7778390297684675, "grad_norm": 0.2424350082874298, "learning_rate": 0.00018383084577114426, "loss": 0.4902, "step": 1411 }, { "epoch": 0.7783902976846747, "grad_norm": 0.25260457396507263, "learning_rate": 0.0001837479270315091, "loss": 0.4843, "step": 1412 }, { "epoch": 0.778941565600882, "grad_norm": 0.27532869577407837, "learning_rate": 0.00018366500829187395, "loss": 0.4914, "step": 1413 }, { "epoch": 0.7794928335170893, "grad_norm": 0.24072158336639404, "learning_rate": 0.00018358208955223879, "loss": 0.4888, "step": 1414 }, { "epoch": 0.7800441014332966, "grad_norm": 0.24182955920696259, "learning_rate": 0.00018349917081260364, "loss": 0.4589, "step": 1415 }, { "epoch": 0.7805953693495039, "grad_norm": 0.25824496150016785, "learning_rate": 0.00018341625207296848, "loss": 0.4868, "step": 1416 }, { "epoch": 0.7811466372657111, "grad_norm": 0.2336832731962204, "learning_rate": 0.00018333333333333334, "loss": 0.472, "step": 1417 }, { "epoch": 0.7816979051819184, "grad_norm": 0.24849727749824524, "learning_rate": 0.00018325041459369817, "loss": 0.4743, "step": 1418 }, { "epoch": 0.7822491730981257, "grad_norm": 0.21890904009342194, "learning_rate": 0.00018316749585406303, "loss": 0.465, "step": 1419 }, { "epoch": 0.782800441014333, "grad_norm": 0.2601034343242645, "learning_rate": 0.00018308457711442783, "loss": 0.4531, "step": 1420 }, { "epoch": 0.7833517089305403, "grad_norm": 0.2441786229610443, "learning_rate": 0.0001830016583747927, "loss": 0.4536, "step": 1421 }, { "epoch": 0.7839029768467475, "grad_norm": 0.2240273654460907, "learning_rate": 0.00018291873963515752, "loss": 0.461, "step": 1422 }, { "epoch": 0.7844542447629548, "grad_norm": 0.2334737479686737, "learning_rate": 0.00018283582089552235, "loss": 0.4779, "step": 1423 }, { "epoch": 0.7850055126791621, "grad_norm": 0.23395971953868866, "learning_rate": 0.00018275290215588721, "loss": 0.4585, "step": 1424 }, { "epoch": 0.7855567805953694, "grad_norm": 0.24163080751895905, "learning_rate": 0.00018266998341625205, "loss": 0.4781, "step": 1425 }, { "epoch": 0.7861080485115767, "grad_norm": 0.23681163787841797, "learning_rate": 0.0001825870646766169, "loss": 0.4518, "step": 1426 }, { "epoch": 0.7866593164277839, "grad_norm": 0.2450489103794098, "learning_rate": 0.00018250414593698174, "loss": 0.4741, "step": 1427 }, { "epoch": 0.7872105843439912, "grad_norm": 0.23335276544094086, "learning_rate": 0.0001824212271973466, "loss": 0.4938, "step": 1428 }, { "epoch": 0.7877618522601985, "grad_norm": 0.22969652712345123, "learning_rate": 0.0001823383084577114, "loss": 0.4577, "step": 1429 }, { "epoch": 0.7883131201764058, "grad_norm": 0.2162095010280609, "learning_rate": 0.00018225538971807626, "loss": 0.4632, "step": 1430 }, { "epoch": 0.7888643880926131, "grad_norm": 0.2445029318332672, "learning_rate": 0.0001821724709784411, "loss": 0.4657, "step": 1431 }, { "epoch": 0.7894156560088202, "grad_norm": 0.21864482760429382, "learning_rate": 0.00018208955223880595, "loss": 0.4759, "step": 1432 }, { "epoch": 0.7899669239250275, "grad_norm": 0.24577899277210236, "learning_rate": 0.00018200663349917078, "loss": 0.4717, "step": 1433 }, { "epoch": 0.7905181918412348, "grad_norm": 0.21177740395069122, "learning_rate": 0.00018192371475953564, "loss": 0.4564, "step": 1434 }, { "epoch": 0.7910694597574421, "grad_norm": 0.2460215985774994, "learning_rate": 0.00018184079601990047, "loss": 0.4921, "step": 1435 }, { "epoch": 0.7916207276736494, "grad_norm": 0.24731247127056122, "learning_rate": 0.00018175787728026533, "loss": 0.4655, "step": 1436 }, { "epoch": 0.7921719955898566, "grad_norm": 0.24188898503780365, "learning_rate": 0.00018167495854063017, "loss": 0.4665, "step": 1437 }, { "epoch": 0.7927232635060639, "grad_norm": 0.2347448617219925, "learning_rate": 0.00018159203980099502, "loss": 0.4563, "step": 1438 }, { "epoch": 0.7932745314222712, "grad_norm": 0.242751806974411, "learning_rate": 0.00018150912106135983, "loss": 0.4622, "step": 1439 }, { "epoch": 0.7938257993384785, "grad_norm": 0.2598075270652771, "learning_rate": 0.0001814262023217247, "loss": 0.4679, "step": 1440 }, { "epoch": 0.7943770672546858, "grad_norm": 0.23368312418460846, "learning_rate": 0.00018134328358208952, "loss": 0.4627, "step": 1441 }, { "epoch": 0.794928335170893, "grad_norm": 0.24804770946502686, "learning_rate": 0.00018126036484245438, "loss": 0.4663, "step": 1442 }, { "epoch": 0.7954796030871003, "grad_norm": 0.22588974237442017, "learning_rate": 0.0001811774461028192, "loss": 0.4514, "step": 1443 }, { "epoch": 0.7960308710033076, "grad_norm": 0.22374935448169708, "learning_rate": 0.00018109452736318407, "loss": 0.4552, "step": 1444 }, { "epoch": 0.7965821389195149, "grad_norm": 0.24665199220180511, "learning_rate": 0.0001810116086235489, "loss": 0.4639, "step": 1445 }, { "epoch": 0.7971334068357222, "grad_norm": 0.25782036781311035, "learning_rate": 0.00018092868988391376, "loss": 0.4592, "step": 1446 }, { "epoch": 0.7976846747519294, "grad_norm": 0.21815195679664612, "learning_rate": 0.0001808457711442786, "loss": 0.4724, "step": 1447 }, { "epoch": 0.7982359426681367, "grad_norm": 0.24236443638801575, "learning_rate": 0.00018076285240464345, "loss": 0.473, "step": 1448 }, { "epoch": 0.798787210584344, "grad_norm": 0.23173320293426514, "learning_rate": 0.00018067993366500826, "loss": 0.4771, "step": 1449 }, { "epoch": 0.7993384785005513, "grad_norm": 0.22303089499473572, "learning_rate": 0.00018059701492537312, "loss": 0.4545, "step": 1450 }, { "epoch": 0.7998897464167586, "grad_norm": 0.23491422832012177, "learning_rate": 0.00018051409618573795, "loss": 0.4807, "step": 1451 }, { "epoch": 0.8004410143329658, "grad_norm": 0.23925326764583588, "learning_rate": 0.0001804311774461028, "loss": 0.4705, "step": 1452 }, { "epoch": 0.8009922822491731, "grad_norm": 0.2446267306804657, "learning_rate": 0.00018034825870646764, "loss": 0.4514, "step": 1453 }, { "epoch": 0.8015435501653804, "grad_norm": 0.2514120936393738, "learning_rate": 0.0001802653399668325, "loss": 0.4823, "step": 1454 }, { "epoch": 0.8020948180815877, "grad_norm": 0.2469882369041443, "learning_rate": 0.00018018242122719733, "loss": 0.45, "step": 1455 }, { "epoch": 0.802646085997795, "grad_norm": 0.23653636872768402, "learning_rate": 0.0001800995024875622, "loss": 0.4649, "step": 1456 }, { "epoch": 0.8031973539140022, "grad_norm": 0.22585710883140564, "learning_rate": 0.00018001658374792702, "loss": 0.4384, "step": 1457 }, { "epoch": 0.8037486218302095, "grad_norm": 0.24817028641700745, "learning_rate": 0.00017993366500829188, "loss": 0.4739, "step": 1458 }, { "epoch": 0.8042998897464168, "grad_norm": 0.25585106015205383, "learning_rate": 0.0001798507462686567, "loss": 0.4958, "step": 1459 }, { "epoch": 0.804851157662624, "grad_norm": 0.25958600640296936, "learning_rate": 0.00017976782752902155, "loss": 0.4673, "step": 1460 }, { "epoch": 0.8054024255788313, "grad_norm": 0.2447502166032791, "learning_rate": 0.00017968490878938638, "loss": 0.484, "step": 1461 }, { "epoch": 0.8059536934950385, "grad_norm": 0.22878794372081757, "learning_rate": 0.00017960199004975124, "loss": 0.4832, "step": 1462 }, { "epoch": 0.8065049614112458, "grad_norm": 0.24230952560901642, "learning_rate": 0.00017951907131011607, "loss": 0.4498, "step": 1463 }, { "epoch": 0.8070562293274531, "grad_norm": 0.2345331311225891, "learning_rate": 0.00017943615257048093, "loss": 0.4529, "step": 1464 }, { "epoch": 0.8076074972436604, "grad_norm": 0.2564900815486908, "learning_rate": 0.00017935323383084576, "loss": 0.4747, "step": 1465 }, { "epoch": 0.8081587651598677, "grad_norm": 0.2226727157831192, "learning_rate": 0.00017927031509121062, "loss": 0.4453, "step": 1466 }, { "epoch": 0.8087100330760749, "grad_norm": 0.26586976647377014, "learning_rate": 0.00017918739635157545, "loss": 0.5032, "step": 1467 }, { "epoch": 0.8092613009922822, "grad_norm": 0.23573876917362213, "learning_rate": 0.00017910447761194026, "loss": 0.4674, "step": 1468 }, { "epoch": 0.8098125689084895, "grad_norm": 0.24506725370883942, "learning_rate": 0.00017902155887230512, "loss": 0.4605, "step": 1469 }, { "epoch": 0.8103638368246968, "grad_norm": 0.2386348396539688, "learning_rate": 0.00017893864013266995, "loss": 0.4618, "step": 1470 }, { "epoch": 0.8109151047409041, "grad_norm": 0.24811455607414246, "learning_rate": 0.0001788557213930348, "loss": 0.4615, "step": 1471 }, { "epoch": 0.8114663726571113, "grad_norm": 0.2334372103214264, "learning_rate": 0.00017877280265339964, "loss": 0.474, "step": 1472 }, { "epoch": 0.8120176405733186, "grad_norm": 0.247808575630188, "learning_rate": 0.0001786898839137645, "loss": 0.4504, "step": 1473 }, { "epoch": 0.8125689084895259, "grad_norm": 0.21028272807598114, "learning_rate": 0.00017860696517412933, "loss": 0.4425, "step": 1474 }, { "epoch": 0.8131201764057332, "grad_norm": 0.22339411079883575, "learning_rate": 0.0001785240464344942, "loss": 0.449, "step": 1475 }, { "epoch": 0.8136714443219405, "grad_norm": 0.23447810113430023, "learning_rate": 0.00017844112769485902, "loss": 0.4593, "step": 1476 }, { "epoch": 0.8142227122381478, "grad_norm": 0.22381900250911713, "learning_rate": 0.00017835820895522388, "loss": 0.4603, "step": 1477 }, { "epoch": 0.814773980154355, "grad_norm": 0.22677209973335266, "learning_rate": 0.00017827529021558869, "loss": 0.4525, "step": 1478 }, { "epoch": 0.8153252480705623, "grad_norm": 0.2385341227054596, "learning_rate": 0.00017819237147595354, "loss": 0.49, "step": 1479 }, { "epoch": 0.8158765159867696, "grad_norm": 0.24088934063911438, "learning_rate": 0.00017810945273631838, "loss": 0.4984, "step": 1480 }, { "epoch": 0.8164277839029769, "grad_norm": 0.20627839863300323, "learning_rate": 0.00017802653399668324, "loss": 0.4597, "step": 1481 }, { "epoch": 0.8169790518191842, "grad_norm": 0.2268056422472, "learning_rate": 0.00017794361525704807, "loss": 0.4581, "step": 1482 }, { "epoch": 0.8175303197353914, "grad_norm": 0.24342721700668335, "learning_rate": 0.00017786069651741293, "loss": 0.4715, "step": 1483 }, { "epoch": 0.8180815876515987, "grad_norm": 0.23494994640350342, "learning_rate": 0.00017777777777777776, "loss": 0.4859, "step": 1484 }, { "epoch": 0.818632855567806, "grad_norm": 0.23297634720802307, "learning_rate": 0.00017769485903814262, "loss": 0.4644, "step": 1485 }, { "epoch": 0.8191841234840133, "grad_norm": 0.24424344301223755, "learning_rate": 0.00017761194029850745, "loss": 0.456, "step": 1486 }, { "epoch": 0.8197353914002206, "grad_norm": 0.2417961210012436, "learning_rate": 0.0001775290215588723, "loss": 0.5005, "step": 1487 }, { "epoch": 0.8202866593164277, "grad_norm": 0.24089650809764862, "learning_rate": 0.00017744610281923711, "loss": 0.4953, "step": 1488 }, { "epoch": 0.820837927232635, "grad_norm": 0.22983671724796295, "learning_rate": 0.00017736318407960197, "loss": 0.4544, "step": 1489 }, { "epoch": 0.8213891951488423, "grad_norm": 0.20966455340385437, "learning_rate": 0.0001772802653399668, "loss": 0.4724, "step": 1490 }, { "epoch": 0.8219404630650496, "grad_norm": 0.24843506515026093, "learning_rate": 0.00017719734660033166, "loss": 0.4799, "step": 1491 }, { "epoch": 0.8224917309812569, "grad_norm": 0.22664618492126465, "learning_rate": 0.0001771144278606965, "loss": 0.4421, "step": 1492 }, { "epoch": 0.8230429988974641, "grad_norm": 0.22813642024993896, "learning_rate": 0.00017703150912106136, "loss": 0.4622, "step": 1493 }, { "epoch": 0.8235942668136714, "grad_norm": 0.2250567078590393, "learning_rate": 0.0001769485903814262, "loss": 0.4526, "step": 1494 }, { "epoch": 0.8241455347298787, "grad_norm": 0.2317907065153122, "learning_rate": 0.00017686567164179105, "loss": 0.4743, "step": 1495 }, { "epoch": 0.824696802646086, "grad_norm": 0.22760067880153656, "learning_rate": 0.00017678275290215588, "loss": 0.4765, "step": 1496 }, { "epoch": 0.8252480705622933, "grad_norm": 0.21815039217472076, "learning_rate": 0.00017669983416252074, "loss": 0.4588, "step": 1497 }, { "epoch": 0.8257993384785005, "grad_norm": 0.25006452202796936, "learning_rate": 0.00017661691542288554, "loss": 0.451, "step": 1498 }, { "epoch": 0.8263506063947078, "grad_norm": 0.22310319542884827, "learning_rate": 0.0001765339966832504, "loss": 0.4754, "step": 1499 }, { "epoch": 0.8269018743109151, "grad_norm": 0.26363706588745117, "learning_rate": 0.00017645107794361523, "loss": 0.4834, "step": 1500 }, { "epoch": 0.8269018743109151, "eval_loss": 0.4649047255516052, "eval_runtime": 312.7946, "eval_samples_per_second": 3.724, "eval_steps_per_second": 0.467, "step": 1500 }, { "epoch": 0.8274531422271224, "grad_norm": 0.22052568197250366, "learning_rate": 0.0001763681592039801, "loss": 0.4931, "step": 1501 }, { "epoch": 0.8280044101433297, "grad_norm": 0.23108328878879547, "learning_rate": 0.00017628524046434493, "loss": 0.4901, "step": 1502 }, { "epoch": 0.8285556780595369, "grad_norm": 0.23075662553310394, "learning_rate": 0.00017620232172470978, "loss": 0.4484, "step": 1503 }, { "epoch": 0.8291069459757442, "grad_norm": 0.24602019786834717, "learning_rate": 0.00017611940298507462, "loss": 0.4427, "step": 1504 }, { "epoch": 0.8296582138919515, "grad_norm": 0.2438734471797943, "learning_rate": 0.00017603648424543948, "loss": 0.4731, "step": 1505 }, { "epoch": 0.8302094818081588, "grad_norm": 0.23441627621650696, "learning_rate": 0.0001759535655058043, "loss": 0.4628, "step": 1506 }, { "epoch": 0.8307607497243661, "grad_norm": 0.23310305178165436, "learning_rate": 0.00017587064676616917, "loss": 0.4929, "step": 1507 }, { "epoch": 0.8313120176405733, "grad_norm": 0.25448939204216003, "learning_rate": 0.00017578772802653397, "loss": 0.4851, "step": 1508 }, { "epoch": 0.8318632855567806, "grad_norm": 0.2438756674528122, "learning_rate": 0.00017570480928689883, "loss": 0.4706, "step": 1509 }, { "epoch": 0.8324145534729879, "grad_norm": 0.25436931848526, "learning_rate": 0.00017562189054726366, "loss": 0.4869, "step": 1510 }, { "epoch": 0.8329658213891952, "grad_norm": 0.22301998734474182, "learning_rate": 0.0001755389718076285, "loss": 0.4593, "step": 1511 }, { "epoch": 0.8335170893054025, "grad_norm": 0.24233976006507874, "learning_rate": 0.00017545605306799335, "loss": 0.5016, "step": 1512 }, { "epoch": 0.8340683572216097, "grad_norm": 0.22516629099845886, "learning_rate": 0.00017537313432835819, "loss": 0.4732, "step": 1513 }, { "epoch": 0.834619625137817, "grad_norm": 0.22612155973911285, "learning_rate": 0.00017529021558872305, "loss": 0.4625, "step": 1514 }, { "epoch": 0.8351708930540243, "grad_norm": 0.23177853226661682, "learning_rate": 0.00017520729684908785, "loss": 0.4776, "step": 1515 }, { "epoch": 0.8357221609702316, "grad_norm": 0.24279583990573883, "learning_rate": 0.00017512437810945274, "loss": 0.4721, "step": 1516 }, { "epoch": 0.8362734288864389, "grad_norm": 0.23456443846225739, "learning_rate": 0.00017504145936981754, "loss": 0.4635, "step": 1517 }, { "epoch": 0.836824696802646, "grad_norm": 0.23287171125411987, "learning_rate": 0.0001749585406301824, "loss": 0.4739, "step": 1518 }, { "epoch": 0.8373759647188533, "grad_norm": 0.22415684163570404, "learning_rate": 0.00017487562189054723, "loss": 0.4769, "step": 1519 }, { "epoch": 0.8379272326350606, "grad_norm": 0.2180211991071701, "learning_rate": 0.0001747927031509121, "loss": 0.4388, "step": 1520 }, { "epoch": 0.8384785005512679, "grad_norm": 0.2260761708021164, "learning_rate": 0.00017470978441127692, "loss": 0.4972, "step": 1521 }, { "epoch": 0.8390297684674752, "grad_norm": 0.22887657582759857, "learning_rate": 0.00017462686567164178, "loss": 0.4554, "step": 1522 }, { "epoch": 0.8395810363836824, "grad_norm": 0.241640105843544, "learning_rate": 0.00017454394693200662, "loss": 0.4732, "step": 1523 }, { "epoch": 0.8401323042998897, "grad_norm": 0.2288465052843094, "learning_rate": 0.00017446102819237147, "loss": 0.4527, "step": 1524 }, { "epoch": 0.840683572216097, "grad_norm": 0.23457041382789612, "learning_rate": 0.00017437810945273628, "loss": 0.4574, "step": 1525 }, { "epoch": 0.8412348401323043, "grad_norm": 0.25197815895080566, "learning_rate": 0.00017429519071310114, "loss": 0.4597, "step": 1526 }, { "epoch": 0.8417861080485116, "grad_norm": 0.2385404258966446, "learning_rate": 0.00017421227197346597, "loss": 0.4649, "step": 1527 }, { "epoch": 0.8423373759647188, "grad_norm": 0.23451651632785797, "learning_rate": 0.00017412935323383083, "loss": 0.4646, "step": 1528 }, { "epoch": 0.8428886438809261, "grad_norm": 0.2421046793460846, "learning_rate": 0.00017404643449419566, "loss": 0.4852, "step": 1529 }, { "epoch": 0.8434399117971334, "grad_norm": 0.25406989455223083, "learning_rate": 0.00017396351575456052, "loss": 0.4804, "step": 1530 }, { "epoch": 0.8439911797133407, "grad_norm": 0.24752497673034668, "learning_rate": 0.00017388059701492535, "loss": 0.4777, "step": 1531 }, { "epoch": 0.844542447629548, "grad_norm": 0.226281076669693, "learning_rate": 0.0001737976782752902, "loss": 0.4747, "step": 1532 }, { "epoch": 0.8450937155457552, "grad_norm": 0.2519485652446747, "learning_rate": 0.00017371475953565504, "loss": 0.4639, "step": 1533 }, { "epoch": 0.8456449834619625, "grad_norm": 0.2347985804080963, "learning_rate": 0.0001736318407960199, "loss": 0.4715, "step": 1534 }, { "epoch": 0.8461962513781698, "grad_norm": 0.24425053596496582, "learning_rate": 0.0001735489220563847, "loss": 0.445, "step": 1535 }, { "epoch": 0.8467475192943771, "grad_norm": 0.2559725046157837, "learning_rate": 0.00017346600331674957, "loss": 0.49, "step": 1536 }, { "epoch": 0.8472987872105844, "grad_norm": 0.23750551044940948, "learning_rate": 0.0001733830845771144, "loss": 0.4663, "step": 1537 }, { "epoch": 0.8478500551267916, "grad_norm": 0.22861897945404053, "learning_rate": 0.00017330016583747926, "loss": 0.45, "step": 1538 }, { "epoch": 0.8484013230429989, "grad_norm": 0.24839669466018677, "learning_rate": 0.0001732172470978441, "loss": 0.4856, "step": 1539 }, { "epoch": 0.8489525909592062, "grad_norm": 0.23960521817207336, "learning_rate": 0.00017313432835820895, "loss": 0.4933, "step": 1540 }, { "epoch": 0.8495038588754135, "grad_norm": 0.23533576726913452, "learning_rate": 0.00017305140961857378, "loss": 0.4698, "step": 1541 }, { "epoch": 0.8500551267916208, "grad_norm": 0.23979732394218445, "learning_rate": 0.00017296849087893864, "loss": 0.4953, "step": 1542 }, { "epoch": 0.850606394707828, "grad_norm": 0.24841150641441345, "learning_rate": 0.00017288557213930347, "loss": 0.4845, "step": 1543 }, { "epoch": 0.8511576626240352, "grad_norm": 0.22132597863674164, "learning_rate": 0.00017280265339966833, "loss": 0.4643, "step": 1544 }, { "epoch": 0.8517089305402425, "grad_norm": 0.22431734204292297, "learning_rate": 0.00017271973466003314, "loss": 0.4547, "step": 1545 }, { "epoch": 0.8522601984564498, "grad_norm": 0.22704413533210754, "learning_rate": 0.000172636815920398, "loss": 0.4665, "step": 1546 }, { "epoch": 0.8528114663726571, "grad_norm": 0.22971755266189575, "learning_rate": 0.00017255389718076283, "loss": 0.4709, "step": 1547 }, { "epoch": 0.8533627342888643, "grad_norm": 0.2435724288225174, "learning_rate": 0.0001724709784411277, "loss": 0.4733, "step": 1548 }, { "epoch": 0.8539140022050716, "grad_norm": 0.24051538109779358, "learning_rate": 0.00017238805970149252, "loss": 0.4695, "step": 1549 }, { "epoch": 0.8544652701212789, "grad_norm": 0.26592954993247986, "learning_rate": 0.00017230514096185738, "loss": 0.4683, "step": 1550 }, { "epoch": 0.8550165380374862, "grad_norm": 0.24452587962150574, "learning_rate": 0.0001722222222222222, "loss": 0.4623, "step": 1551 }, { "epoch": 0.8555678059536935, "grad_norm": 0.23351791501045227, "learning_rate": 0.00017213930348258707, "loss": 0.4559, "step": 1552 }, { "epoch": 0.8561190738699008, "grad_norm": 0.23652702569961548, "learning_rate": 0.0001720563847429519, "loss": 0.4507, "step": 1553 }, { "epoch": 0.856670341786108, "grad_norm": 0.22390702366828918, "learning_rate": 0.00017197346600331676, "loss": 0.4521, "step": 1554 }, { "epoch": 0.8572216097023153, "grad_norm": 0.24590735137462616, "learning_rate": 0.00017189054726368157, "loss": 0.4712, "step": 1555 }, { "epoch": 0.8577728776185226, "grad_norm": 0.21954110264778137, "learning_rate": 0.0001718076285240464, "loss": 0.4447, "step": 1556 }, { "epoch": 0.8583241455347299, "grad_norm": 0.23404909670352936, "learning_rate": 0.00017172470978441126, "loss": 0.4699, "step": 1557 }, { "epoch": 0.8588754134509372, "grad_norm": 0.24352899193763733, "learning_rate": 0.0001716417910447761, "loss": 0.4904, "step": 1558 }, { "epoch": 0.8594266813671444, "grad_norm": 0.30317431688308716, "learning_rate": 0.00017155887230514095, "loss": 0.4606, "step": 1559 }, { "epoch": 0.8599779492833517, "grad_norm": 0.22517681121826172, "learning_rate": 0.00017147595356550578, "loss": 0.4892, "step": 1560 }, { "epoch": 0.860529217199559, "grad_norm": 0.23503634333610535, "learning_rate": 0.00017139303482587064, "loss": 0.4755, "step": 1561 }, { "epoch": 0.8610804851157663, "grad_norm": 0.22381718456745148, "learning_rate": 0.00017131011608623547, "loss": 0.4492, "step": 1562 }, { "epoch": 0.8616317530319736, "grad_norm": 0.24450813233852386, "learning_rate": 0.00017122719734660033, "loss": 0.4764, "step": 1563 }, { "epoch": 0.8621830209481808, "grad_norm": 0.2357473075389862, "learning_rate": 0.00017114427860696513, "loss": 0.4727, "step": 1564 }, { "epoch": 0.8627342888643881, "grad_norm": 0.22676219046115875, "learning_rate": 0.00017106135986733, "loss": 0.454, "step": 1565 }, { "epoch": 0.8632855567805954, "grad_norm": 0.24174387753009796, "learning_rate": 0.00017097844112769483, "loss": 0.4451, "step": 1566 }, { "epoch": 0.8638368246968027, "grad_norm": 0.24716874957084656, "learning_rate": 0.00017089552238805969, "loss": 0.4639, "step": 1567 }, { "epoch": 0.86438809261301, "grad_norm": 0.24672383069992065, "learning_rate": 0.00017081260364842452, "loss": 0.4811, "step": 1568 }, { "epoch": 0.8649393605292172, "grad_norm": 0.2504035234451294, "learning_rate": 0.00017072968490878938, "loss": 0.4715, "step": 1569 }, { "epoch": 0.8654906284454245, "grad_norm": 0.2296275794506073, "learning_rate": 0.0001706467661691542, "loss": 0.4552, "step": 1570 }, { "epoch": 0.8660418963616318, "grad_norm": 0.24308894574642181, "learning_rate": 0.00017056384742951907, "loss": 0.4798, "step": 1571 }, { "epoch": 0.8665931642778391, "grad_norm": 0.25587549805641174, "learning_rate": 0.0001704809286898839, "loss": 0.473, "step": 1572 }, { "epoch": 0.8671444321940464, "grad_norm": 0.22006462514400482, "learning_rate": 0.00017039800995024876, "loss": 0.4512, "step": 1573 }, { "epoch": 0.8676957001102535, "grad_norm": 0.2469773143529892, "learning_rate": 0.00017031509121061356, "loss": 0.4651, "step": 1574 }, { "epoch": 0.8682469680264608, "grad_norm": 0.23426435887813568, "learning_rate": 0.00017023217247097842, "loss": 0.4658, "step": 1575 }, { "epoch": 0.8687982359426681, "grad_norm": 0.2696544826030731, "learning_rate": 0.00017014925373134325, "loss": 0.4555, "step": 1576 }, { "epoch": 0.8693495038588754, "grad_norm": 0.24263867735862732, "learning_rate": 0.00017006633499170811, "loss": 0.4426, "step": 1577 }, { "epoch": 0.8699007717750827, "grad_norm": 0.24693246185779572, "learning_rate": 0.00016998341625207295, "loss": 0.4876, "step": 1578 }, { "epoch": 0.8704520396912899, "grad_norm": 0.24460558593273163, "learning_rate": 0.0001699004975124378, "loss": 0.4704, "step": 1579 }, { "epoch": 0.8710033076074972, "grad_norm": 0.2212182730436325, "learning_rate": 0.00016981757877280264, "loss": 0.4496, "step": 1580 }, { "epoch": 0.8715545755237045, "grad_norm": 0.23751485347747803, "learning_rate": 0.0001697346600331675, "loss": 0.4546, "step": 1581 }, { "epoch": 0.8721058434399118, "grad_norm": 0.2521110475063324, "learning_rate": 0.00016965174129353233, "loss": 0.4706, "step": 1582 }, { "epoch": 0.8726571113561191, "grad_norm": 0.24147383868694305, "learning_rate": 0.0001695688225538972, "loss": 0.4519, "step": 1583 }, { "epoch": 0.8732083792723263, "grad_norm": 0.2279898077249527, "learning_rate": 0.000169485903814262, "loss": 0.4648, "step": 1584 }, { "epoch": 0.8737596471885336, "grad_norm": 0.24053026735782623, "learning_rate": 0.00016940298507462685, "loss": 0.4747, "step": 1585 }, { "epoch": 0.8743109151047409, "grad_norm": 0.24321089684963226, "learning_rate": 0.00016932006633499168, "loss": 0.4562, "step": 1586 }, { "epoch": 0.8748621830209482, "grad_norm": 0.2396124303340912, "learning_rate": 0.00016923714759535654, "loss": 0.4631, "step": 1587 }, { "epoch": 0.8754134509371555, "grad_norm": 0.23284991085529327, "learning_rate": 0.00016915422885572137, "loss": 0.4452, "step": 1588 }, { "epoch": 0.8759647188533627, "grad_norm": 0.2377912849187851, "learning_rate": 0.00016907131011608623, "loss": 0.4471, "step": 1589 }, { "epoch": 0.87651598676957, "grad_norm": 0.23828253149986267, "learning_rate": 0.00016898839137645107, "loss": 0.4463, "step": 1590 }, { "epoch": 0.8770672546857773, "grad_norm": 0.24640867114067078, "learning_rate": 0.00016890547263681593, "loss": 0.4776, "step": 1591 }, { "epoch": 0.8776185226019846, "grad_norm": 0.24699927866458893, "learning_rate": 0.00016882255389718076, "loss": 0.437, "step": 1592 }, { "epoch": 0.8781697905181919, "grad_norm": 0.24521562457084656, "learning_rate": 0.00016873963515754562, "loss": 0.4805, "step": 1593 }, { "epoch": 0.8787210584343991, "grad_norm": 0.2375350147485733, "learning_rate": 0.00016865671641791042, "loss": 0.4835, "step": 1594 }, { "epoch": 0.8792723263506064, "grad_norm": 0.23784852027893066, "learning_rate": 0.00016857379767827528, "loss": 0.49, "step": 1595 }, { "epoch": 0.8798235942668137, "grad_norm": 0.23371200263500214, "learning_rate": 0.0001684908789386401, "loss": 0.4701, "step": 1596 }, { "epoch": 0.880374862183021, "grad_norm": 0.23373621702194214, "learning_rate": 0.00016840796019900497, "loss": 0.4765, "step": 1597 }, { "epoch": 0.8809261300992283, "grad_norm": 0.25964394211769104, "learning_rate": 0.0001683250414593698, "loss": 0.4505, "step": 1598 }, { "epoch": 0.8814773980154355, "grad_norm": 0.2420414835214615, "learning_rate": 0.00016824212271973464, "loss": 0.5, "step": 1599 }, { "epoch": 0.8820286659316428, "grad_norm": 0.24534733593463898, "learning_rate": 0.0001681592039800995, "loss": 0.4625, "step": 1600 }, { "epoch": 0.88257993384785, "grad_norm": 0.22338466346263885, "learning_rate": 0.00016807628524046433, "loss": 0.4383, "step": 1601 }, { "epoch": 0.8831312017640573, "grad_norm": 0.24304436147212982, "learning_rate": 0.00016799336650082919, "loss": 0.4717, "step": 1602 }, { "epoch": 0.8836824696802646, "grad_norm": 0.24378708004951477, "learning_rate": 0.000167910447761194, "loss": 0.4732, "step": 1603 }, { "epoch": 0.8842337375964718, "grad_norm": 0.22068338096141815, "learning_rate": 0.00016782752902155885, "loss": 0.4709, "step": 1604 }, { "epoch": 0.8847850055126791, "grad_norm": 0.25752487778663635, "learning_rate": 0.00016774461028192368, "loss": 0.4571, "step": 1605 }, { "epoch": 0.8853362734288864, "grad_norm": 0.21915499866008759, "learning_rate": 0.00016766169154228854, "loss": 0.4551, "step": 1606 }, { "epoch": 0.8858875413450937, "grad_norm": 0.220630943775177, "learning_rate": 0.00016757877280265337, "loss": 0.4336, "step": 1607 }, { "epoch": 0.886438809261301, "grad_norm": 0.2279721051454544, "learning_rate": 0.00016749585406301823, "loss": 0.4546, "step": 1608 }, { "epoch": 0.8869900771775082, "grad_norm": 0.23162703216075897, "learning_rate": 0.00016741293532338306, "loss": 0.4596, "step": 1609 }, { "epoch": 0.8875413450937155, "grad_norm": 0.22968967258930206, "learning_rate": 0.00016733001658374792, "loss": 0.4457, "step": 1610 }, { "epoch": 0.8880926130099228, "grad_norm": 0.23839277029037476, "learning_rate": 0.00016724709784411276, "loss": 0.444, "step": 1611 }, { "epoch": 0.8886438809261301, "grad_norm": 0.2291092872619629, "learning_rate": 0.00016716417910447761, "loss": 0.4796, "step": 1612 }, { "epoch": 0.8891951488423374, "grad_norm": 0.2277524322271347, "learning_rate": 0.00016708126036484242, "loss": 0.4373, "step": 1613 }, { "epoch": 0.8897464167585446, "grad_norm": 0.24553948640823364, "learning_rate": 0.00016699834162520728, "loss": 0.4948, "step": 1614 }, { "epoch": 0.8902976846747519, "grad_norm": 0.21850357949733734, "learning_rate": 0.0001669154228855721, "loss": 0.4575, "step": 1615 }, { "epoch": 0.8908489525909592, "grad_norm": 0.23171943426132202, "learning_rate": 0.00016683250414593697, "loss": 0.4947, "step": 1616 }, { "epoch": 0.8914002205071665, "grad_norm": 0.22626076638698578, "learning_rate": 0.0001667495854063018, "loss": 0.4619, "step": 1617 }, { "epoch": 0.8919514884233738, "grad_norm": 0.23768572509288788, "learning_rate": 0.00016666666666666666, "loss": 0.4535, "step": 1618 }, { "epoch": 0.892502756339581, "grad_norm": 0.2264167070388794, "learning_rate": 0.0001665837479270315, "loss": 0.467, "step": 1619 }, { "epoch": 0.8930540242557883, "grad_norm": 0.2234300673007965, "learning_rate": 0.00016650082918739635, "loss": 0.4331, "step": 1620 }, { "epoch": 0.8936052921719956, "grad_norm": 0.22206327319145203, "learning_rate": 0.00016641791044776118, "loss": 0.4442, "step": 1621 }, { "epoch": 0.8941565600882029, "grad_norm": 0.22858171164989471, "learning_rate": 0.00016633499170812604, "loss": 0.4611, "step": 1622 }, { "epoch": 0.8947078280044102, "grad_norm": 0.24421337246894836, "learning_rate": 0.00016625207296849085, "loss": 0.4551, "step": 1623 }, { "epoch": 0.8952590959206174, "grad_norm": 0.20711436867713928, "learning_rate": 0.0001661691542288557, "loss": 0.4555, "step": 1624 }, { "epoch": 0.8958103638368247, "grad_norm": 0.22994433343410492, "learning_rate": 0.00016608623548922054, "loss": 0.4745, "step": 1625 }, { "epoch": 0.896361631753032, "grad_norm": 0.22984014451503754, "learning_rate": 0.0001660033167495854, "loss": 0.4613, "step": 1626 }, { "epoch": 0.8969128996692393, "grad_norm": 0.2339726984500885, "learning_rate": 0.00016592039800995023, "loss": 0.469, "step": 1627 }, { "epoch": 0.8974641675854466, "grad_norm": 0.23884552717208862, "learning_rate": 0.0001658374792703151, "loss": 0.4812, "step": 1628 }, { "epoch": 0.8980154355016539, "grad_norm": 0.23677459359169006, "learning_rate": 0.00016575456053067992, "loss": 0.471, "step": 1629 }, { "epoch": 0.898566703417861, "grad_norm": 0.22945214807987213, "learning_rate": 0.00016567164179104478, "loss": 0.4666, "step": 1630 }, { "epoch": 0.8991179713340683, "grad_norm": 0.231664776802063, "learning_rate": 0.0001655887230514096, "loss": 0.4657, "step": 1631 }, { "epoch": 0.8996692392502756, "grad_norm": 0.22424204647541046, "learning_rate": 0.00016550580431177447, "loss": 0.4682, "step": 1632 }, { "epoch": 0.9002205071664829, "grad_norm": 0.23469983041286469, "learning_rate": 0.00016542288557213928, "loss": 0.4761, "step": 1633 }, { "epoch": 0.9007717750826902, "grad_norm": 0.2397875040769577, "learning_rate": 0.00016533996683250414, "loss": 0.4763, "step": 1634 }, { "epoch": 0.9013230429988974, "grad_norm": 0.21035277843475342, "learning_rate": 0.00016525704809286897, "loss": 0.4225, "step": 1635 }, { "epoch": 0.9018743109151047, "grad_norm": 0.24221475422382355, "learning_rate": 0.00016517412935323383, "loss": 0.4666, "step": 1636 }, { "epoch": 0.902425578831312, "grad_norm": 0.22903227806091309, "learning_rate": 0.00016509121061359866, "loss": 0.4699, "step": 1637 }, { "epoch": 0.9029768467475193, "grad_norm": 0.23368406295776367, "learning_rate": 0.00016500829187396352, "loss": 0.4763, "step": 1638 }, { "epoch": 0.9035281146637266, "grad_norm": 0.2397768199443817, "learning_rate": 0.00016492537313432835, "loss": 0.4552, "step": 1639 }, { "epoch": 0.9040793825799338, "grad_norm": 0.24322962760925293, "learning_rate": 0.0001648424543946932, "loss": 0.4441, "step": 1640 }, { "epoch": 0.9046306504961411, "grad_norm": 0.21771124005317688, "learning_rate": 0.00016475953565505801, "loss": 0.4635, "step": 1641 }, { "epoch": 0.9051819184123484, "grad_norm": 0.21717268228530884, "learning_rate": 0.0001646766169154229, "loss": 0.4459, "step": 1642 }, { "epoch": 0.9057331863285557, "grad_norm": 0.23191964626312256, "learning_rate": 0.0001645936981757877, "loss": 0.4605, "step": 1643 }, { "epoch": 0.906284454244763, "grad_norm": 0.24638865888118744, "learning_rate": 0.00016451077943615254, "loss": 0.4477, "step": 1644 }, { "epoch": 0.9068357221609702, "grad_norm": 0.24050134420394897, "learning_rate": 0.0001644278606965174, "loss": 0.4389, "step": 1645 }, { "epoch": 0.9073869900771775, "grad_norm": 0.23574888706207275, "learning_rate": 0.00016434494195688223, "loss": 0.4556, "step": 1646 }, { "epoch": 0.9079382579933848, "grad_norm": 0.23960547149181366, "learning_rate": 0.0001642620232172471, "loss": 0.4599, "step": 1647 }, { "epoch": 0.9084895259095921, "grad_norm": 0.22923794388771057, "learning_rate": 0.00016417910447761192, "loss": 0.4566, "step": 1648 }, { "epoch": 0.9090407938257994, "grad_norm": 0.23294423520565033, "learning_rate": 0.00016409618573797678, "loss": 0.4726, "step": 1649 }, { "epoch": 0.9095920617420066, "grad_norm": 0.24964945018291473, "learning_rate": 0.00016401326699834158, "loss": 0.483, "step": 1650 }, { "epoch": 0.9101433296582139, "grad_norm": 0.22729866206645966, "learning_rate": 0.00016393034825870644, "loss": 0.4708, "step": 1651 }, { "epoch": 0.9106945975744212, "grad_norm": 0.22324109077453613, "learning_rate": 0.00016384742951907128, "loss": 0.4798, "step": 1652 }, { "epoch": 0.9112458654906285, "grad_norm": 0.2301269918680191, "learning_rate": 0.00016376451077943613, "loss": 0.4659, "step": 1653 }, { "epoch": 0.9117971334068358, "grad_norm": 0.26973679661750793, "learning_rate": 0.00016368159203980097, "loss": 0.4743, "step": 1654 }, { "epoch": 0.912348401323043, "grad_norm": 0.2236243188381195, "learning_rate": 0.00016359867330016583, "loss": 0.4464, "step": 1655 }, { "epoch": 0.9128996692392503, "grad_norm": 0.23898382484912872, "learning_rate": 0.00016351575456053066, "loss": 0.4715, "step": 1656 }, { "epoch": 0.9134509371554576, "grad_norm": 0.226115882396698, "learning_rate": 0.00016343283582089552, "loss": 0.452, "step": 1657 }, { "epoch": 0.9140022050716649, "grad_norm": 0.24120070040225983, "learning_rate": 0.00016334991708126035, "loss": 0.4594, "step": 1658 }, { "epoch": 0.9145534729878722, "grad_norm": 0.2507602870464325, "learning_rate": 0.0001632669983416252, "loss": 0.4759, "step": 1659 }, { "epoch": 0.9151047409040793, "grad_norm": 0.26350581645965576, "learning_rate": 0.00016318407960199, "loss": 0.4553, "step": 1660 }, { "epoch": 0.9156560088202866, "grad_norm": 0.23043513298034668, "learning_rate": 0.00016310116086235487, "loss": 0.4754, "step": 1661 }, { "epoch": 0.9162072767364939, "grad_norm": 0.22888733446598053, "learning_rate": 0.0001630182421227197, "loss": 0.4602, "step": 1662 }, { "epoch": 0.9167585446527012, "grad_norm": 0.23566976189613342, "learning_rate": 0.00016293532338308456, "loss": 0.4492, "step": 1663 }, { "epoch": 0.9173098125689085, "grad_norm": 0.2403411716222763, "learning_rate": 0.0001628524046434494, "loss": 0.4529, "step": 1664 }, { "epoch": 0.9178610804851157, "grad_norm": 0.24615786969661713, "learning_rate": 0.00016276948590381425, "loss": 0.4688, "step": 1665 }, { "epoch": 0.918412348401323, "grad_norm": 0.2582218647003174, "learning_rate": 0.0001626865671641791, "loss": 0.4626, "step": 1666 }, { "epoch": 0.9189636163175303, "grad_norm": 0.2405799925327301, "learning_rate": 0.00016260364842454395, "loss": 0.4529, "step": 1667 }, { "epoch": 0.9195148842337376, "grad_norm": 0.2288394719362259, "learning_rate": 0.00016252072968490878, "loss": 0.4513, "step": 1668 }, { "epoch": 0.9200661521499449, "grad_norm": 0.22039665281772614, "learning_rate": 0.00016243781094527364, "loss": 0.4636, "step": 1669 }, { "epoch": 0.9206174200661521, "grad_norm": 0.2359505444765091, "learning_rate": 0.00016235489220563844, "loss": 0.4703, "step": 1670 }, { "epoch": 0.9211686879823594, "grad_norm": 0.25222134590148926, "learning_rate": 0.0001622719734660033, "loss": 0.4729, "step": 1671 }, { "epoch": 0.9217199558985667, "grad_norm": 0.24714909493923187, "learning_rate": 0.00016218905472636813, "loss": 0.4376, "step": 1672 }, { "epoch": 0.922271223814774, "grad_norm": 0.271454781293869, "learning_rate": 0.000162106135986733, "loss": 0.4771, "step": 1673 }, { "epoch": 0.9228224917309813, "grad_norm": 0.2408027946949005, "learning_rate": 0.00016202321724709782, "loss": 0.4581, "step": 1674 }, { "epoch": 0.9233737596471885, "grad_norm": 0.25041836500167847, "learning_rate": 0.00016194029850746268, "loss": 0.4685, "step": 1675 }, { "epoch": 0.9239250275633958, "grad_norm": 0.2697443664073944, "learning_rate": 0.00016185737976782752, "loss": 0.4905, "step": 1676 }, { "epoch": 0.9244762954796031, "grad_norm": 0.261924684047699, "learning_rate": 0.00016177446102819237, "loss": 0.5045, "step": 1677 }, { "epoch": 0.9250275633958104, "grad_norm": 0.23671838641166687, "learning_rate": 0.0001616915422885572, "loss": 0.4477, "step": 1678 }, { "epoch": 0.9255788313120177, "grad_norm": 0.26420533657073975, "learning_rate": 0.00016160862354892207, "loss": 0.4922, "step": 1679 }, { "epoch": 0.9261300992282249, "grad_norm": 0.2353939265012741, "learning_rate": 0.00016152570480928687, "loss": 0.4434, "step": 1680 }, { "epoch": 0.9266813671444322, "grad_norm": 0.23843790590763092, "learning_rate": 0.00016144278606965173, "loss": 0.4567, "step": 1681 }, { "epoch": 0.9272326350606395, "grad_norm": 0.22744010388851166, "learning_rate": 0.00016135986733001656, "loss": 0.4607, "step": 1682 }, { "epoch": 0.9277839029768468, "grad_norm": 0.2599264979362488, "learning_rate": 0.00016127694859038142, "loss": 0.4839, "step": 1683 }, { "epoch": 0.9283351708930541, "grad_norm": 0.2337629646062851, "learning_rate": 0.00016119402985074625, "loss": 0.4697, "step": 1684 }, { "epoch": 0.9288864388092613, "grad_norm": 0.2365848571062088, "learning_rate": 0.0001611111111111111, "loss": 0.4589, "step": 1685 }, { "epoch": 0.9294377067254685, "grad_norm": 0.22954298555850983, "learning_rate": 0.00016102819237147594, "loss": 0.4071, "step": 1686 }, { "epoch": 0.9299889746416758, "grad_norm": 0.22945284843444824, "learning_rate": 0.00016094527363184078, "loss": 0.4432, "step": 1687 }, { "epoch": 0.9305402425578831, "grad_norm": 0.2274722009897232, "learning_rate": 0.00016086235489220564, "loss": 0.4537, "step": 1688 }, { "epoch": 0.9310915104740904, "grad_norm": 0.23572379350662231, "learning_rate": 0.00016077943615257044, "loss": 0.4621, "step": 1689 }, { "epoch": 0.9316427783902976, "grad_norm": 0.2582686245441437, "learning_rate": 0.0001606965174129353, "loss": 0.4845, "step": 1690 }, { "epoch": 0.9321940463065049, "grad_norm": 0.252638578414917, "learning_rate": 0.00016061359867330013, "loss": 0.4583, "step": 1691 }, { "epoch": 0.9327453142227122, "grad_norm": 0.24242907762527466, "learning_rate": 0.000160530679933665, "loss": 0.4659, "step": 1692 }, { "epoch": 0.9332965821389195, "grad_norm": 0.25426262617111206, "learning_rate": 0.00016044776119402982, "loss": 0.4615, "step": 1693 }, { "epoch": 0.9338478500551268, "grad_norm": 0.2503727972507477, "learning_rate": 0.00016036484245439468, "loss": 0.4732, "step": 1694 }, { "epoch": 0.934399117971334, "grad_norm": 0.23591485619544983, "learning_rate": 0.00016028192371475951, "loss": 0.4865, "step": 1695 }, { "epoch": 0.9349503858875413, "grad_norm": 0.2307887077331543, "learning_rate": 0.00016019900497512437, "loss": 0.4694, "step": 1696 }, { "epoch": 0.9355016538037486, "grad_norm": 0.24209177494049072, "learning_rate": 0.0001601160862354892, "loss": 0.4716, "step": 1697 }, { "epoch": 0.9360529217199559, "grad_norm": 0.23071332275867462, "learning_rate": 0.00016003316749585406, "loss": 0.4548, "step": 1698 }, { "epoch": 0.9366041896361632, "grad_norm": 0.2404324859380722, "learning_rate": 0.00015995024875621887, "loss": 0.4614, "step": 1699 }, { "epoch": 0.9371554575523704, "grad_norm": 0.24288049340248108, "learning_rate": 0.00015986733001658373, "loss": 0.477, "step": 1700 }, { "epoch": 0.9377067254685777, "grad_norm": 0.2315543293952942, "learning_rate": 0.00015978441127694856, "loss": 0.4294, "step": 1701 }, { "epoch": 0.938257993384785, "grad_norm": 0.24326400458812714, "learning_rate": 0.00015970149253731342, "loss": 0.4751, "step": 1702 }, { "epoch": 0.9388092613009923, "grad_norm": 0.23202817142009735, "learning_rate": 0.00015961857379767825, "loss": 0.4539, "step": 1703 }, { "epoch": 0.9393605292171996, "grad_norm": 0.24364544451236725, "learning_rate": 0.0001595356550580431, "loss": 0.4742, "step": 1704 }, { "epoch": 0.9399117971334069, "grad_norm": 0.24248524010181427, "learning_rate": 0.00015945273631840794, "loss": 0.4335, "step": 1705 }, { "epoch": 0.9404630650496141, "grad_norm": 0.2423916757106781, "learning_rate": 0.0001593698175787728, "loss": 0.4825, "step": 1706 }, { "epoch": 0.9410143329658214, "grad_norm": 0.22844377160072327, "learning_rate": 0.00015928689883913763, "loss": 0.468, "step": 1707 }, { "epoch": 0.9415656008820287, "grad_norm": 0.23481746017932892, "learning_rate": 0.0001592039800995025, "loss": 0.459, "step": 1708 }, { "epoch": 0.942116868798236, "grad_norm": 0.23676711320877075, "learning_rate": 0.0001591210613598673, "loss": 0.4748, "step": 1709 }, { "epoch": 0.9426681367144433, "grad_norm": 0.23470185697078705, "learning_rate": 0.00015903814262023216, "loss": 0.4538, "step": 1710 }, { "epoch": 0.9432194046306505, "grad_norm": 0.26180773973464966, "learning_rate": 0.000158955223880597, "loss": 0.4737, "step": 1711 }, { "epoch": 0.9437706725468578, "grad_norm": 0.23656126856803894, "learning_rate": 0.00015887230514096185, "loss": 0.4716, "step": 1712 }, { "epoch": 0.9443219404630651, "grad_norm": 0.2338191270828247, "learning_rate": 0.00015878938640132668, "loss": 0.4712, "step": 1713 }, { "epoch": 0.9448732083792724, "grad_norm": 0.2348823845386505, "learning_rate": 0.00015870646766169154, "loss": 0.4645, "step": 1714 }, { "epoch": 0.9454244762954797, "grad_norm": 0.23620596528053284, "learning_rate": 0.00015862354892205637, "loss": 0.4456, "step": 1715 }, { "epoch": 0.9459757442116868, "grad_norm": 0.25021445751190186, "learning_rate": 0.00015854063018242123, "loss": 0.4807, "step": 1716 }, { "epoch": 0.9465270121278941, "grad_norm": 0.23087383806705475, "learning_rate": 0.00015845771144278606, "loss": 0.4648, "step": 1717 }, { "epoch": 0.9470782800441014, "grad_norm": 0.23474477231502533, "learning_rate": 0.00015837479270315092, "loss": 0.4672, "step": 1718 }, { "epoch": 0.9476295479603087, "grad_norm": 0.2543323338031769, "learning_rate": 0.00015829187396351573, "loss": 0.473, "step": 1719 }, { "epoch": 0.948180815876516, "grad_norm": 0.2378506064414978, "learning_rate": 0.00015820895522388059, "loss": 0.4569, "step": 1720 }, { "epoch": 0.9487320837927232, "grad_norm": 0.23003467917442322, "learning_rate": 0.00015812603648424542, "loss": 0.4621, "step": 1721 }, { "epoch": 0.9492833517089305, "grad_norm": 0.24162529408931732, "learning_rate": 0.00015804311774461028, "loss": 0.445, "step": 1722 }, { "epoch": 0.9498346196251378, "grad_norm": 0.23978053033351898, "learning_rate": 0.0001579601990049751, "loss": 0.4753, "step": 1723 }, { "epoch": 0.9503858875413451, "grad_norm": 0.23133328557014465, "learning_rate": 0.00015787728026533997, "loss": 0.4735, "step": 1724 }, { "epoch": 0.9509371554575524, "grad_norm": 0.20942679047584534, "learning_rate": 0.0001577943615257048, "loss": 0.4208, "step": 1725 }, { "epoch": 0.9514884233737596, "grad_norm": 0.23965676128864288, "learning_rate": 0.00015771144278606966, "loss": 0.4758, "step": 1726 }, { "epoch": 0.9520396912899669, "grad_norm": 0.23537394404411316, "learning_rate": 0.0001576285240464345, "loss": 0.4276, "step": 1727 }, { "epoch": 0.9525909592061742, "grad_norm": 0.24360457062721252, "learning_rate": 0.00015754560530679935, "loss": 0.4686, "step": 1728 }, { "epoch": 0.9531422271223815, "grad_norm": 0.22790101170539856, "learning_rate": 0.00015746268656716416, "loss": 0.4501, "step": 1729 }, { "epoch": 0.9536934950385888, "grad_norm": 0.23862150311470032, "learning_rate": 0.00015737976782752901, "loss": 0.4545, "step": 1730 }, { "epoch": 0.954244762954796, "grad_norm": 0.24378471076488495, "learning_rate": 0.00015729684908789385, "loss": 0.4912, "step": 1731 }, { "epoch": 0.9547960308710033, "grad_norm": 0.23474174737930298, "learning_rate": 0.00015721393034825868, "loss": 0.4692, "step": 1732 }, { "epoch": 0.9553472987872106, "grad_norm": 0.24299736320972443, "learning_rate": 0.00015713101160862354, "loss": 0.4582, "step": 1733 }, { "epoch": 0.9558985667034179, "grad_norm": 0.23355722427368164, "learning_rate": 0.00015704809286898837, "loss": 0.4579, "step": 1734 }, { "epoch": 0.9564498346196252, "grad_norm": 0.2307385504245758, "learning_rate": 0.00015696517412935323, "loss": 0.4276, "step": 1735 }, { "epoch": 0.9570011025358324, "grad_norm": 0.25666573643684387, "learning_rate": 0.00015688225538971806, "loss": 0.4488, "step": 1736 }, { "epoch": 0.9575523704520397, "grad_norm": 0.2472536265850067, "learning_rate": 0.00015679933665008292, "loss": 0.4635, "step": 1737 }, { "epoch": 0.958103638368247, "grad_norm": 0.23561540246009827, "learning_rate": 0.00015671641791044772, "loss": 0.456, "step": 1738 }, { "epoch": 0.9586549062844543, "grad_norm": 0.2695865333080292, "learning_rate": 0.00015663349917081258, "loss": 0.4894, "step": 1739 }, { "epoch": 0.9592061742006616, "grad_norm": 0.23878848552703857, "learning_rate": 0.00015655058043117742, "loss": 0.4945, "step": 1740 }, { "epoch": 0.9597574421168688, "grad_norm": 0.2417537271976471, "learning_rate": 0.00015646766169154228, "loss": 0.4456, "step": 1741 }, { "epoch": 0.960308710033076, "grad_norm": 0.258645623922348, "learning_rate": 0.0001563847429519071, "loss": 0.4767, "step": 1742 }, { "epoch": 0.9608599779492834, "grad_norm": 0.23502197861671448, "learning_rate": 0.00015630182421227197, "loss": 0.4636, "step": 1743 }, { "epoch": 0.9614112458654906, "grad_norm": 0.22951334714889526, "learning_rate": 0.0001562189054726368, "loss": 0.4329, "step": 1744 }, { "epoch": 0.961962513781698, "grad_norm": 0.24502499401569366, "learning_rate": 0.00015613598673300166, "loss": 0.4452, "step": 1745 }, { "epoch": 0.9625137816979051, "grad_norm": 0.24659104645252228, "learning_rate": 0.0001560530679933665, "loss": 0.4489, "step": 1746 }, { "epoch": 0.9630650496141124, "grad_norm": 0.2458224892616272, "learning_rate": 0.00015597014925373135, "loss": 0.4903, "step": 1747 }, { "epoch": 0.9636163175303197, "grad_norm": 0.24105043709278107, "learning_rate": 0.00015588723051409615, "loss": 0.4738, "step": 1748 }, { "epoch": 0.964167585446527, "grad_norm": 0.2505391836166382, "learning_rate": 0.000155804311774461, "loss": 0.4643, "step": 1749 }, { "epoch": 0.9647188533627343, "grad_norm": 0.23488488793373108, "learning_rate": 0.00015572139303482584, "loss": 0.4731, "step": 1750 }, { "epoch": 0.9652701212789415, "grad_norm": 0.2317710667848587, "learning_rate": 0.0001556384742951907, "loss": 0.4736, "step": 1751 }, { "epoch": 0.9658213891951488, "grad_norm": 0.23009353876113892, "learning_rate": 0.00015555555555555554, "loss": 0.4512, "step": 1752 }, { "epoch": 0.9663726571113561, "grad_norm": 0.24625705182552338, "learning_rate": 0.0001554726368159204, "loss": 0.455, "step": 1753 }, { "epoch": 0.9669239250275634, "grad_norm": 0.2400812804698944, "learning_rate": 0.00015538971807628523, "loss": 0.4725, "step": 1754 }, { "epoch": 0.9674751929437707, "grad_norm": 0.26011791825294495, "learning_rate": 0.00015530679933665009, "loss": 0.4868, "step": 1755 }, { "epoch": 0.9680264608599779, "grad_norm": 0.2298017144203186, "learning_rate": 0.0001552238805970149, "loss": 0.4559, "step": 1756 }, { "epoch": 0.9685777287761852, "grad_norm": 0.23378150165081024, "learning_rate": 0.00015514096185737978, "loss": 0.4511, "step": 1757 }, { "epoch": 0.9691289966923925, "grad_norm": 0.24460946023464203, "learning_rate": 0.00015505804311774458, "loss": 0.4571, "step": 1758 }, { "epoch": 0.9696802646085998, "grad_norm": 0.241620734333992, "learning_rate": 0.00015497512437810944, "loss": 0.4743, "step": 1759 }, { "epoch": 0.9702315325248071, "grad_norm": 0.23285698890686035, "learning_rate": 0.00015489220563847427, "loss": 0.4619, "step": 1760 }, { "epoch": 0.9707828004410143, "grad_norm": 0.24175579845905304, "learning_rate": 0.00015480928689883913, "loss": 0.4544, "step": 1761 }, { "epoch": 0.9713340683572216, "grad_norm": 0.22799162566661835, "learning_rate": 0.00015472636815920396, "loss": 0.4679, "step": 1762 }, { "epoch": 0.9718853362734289, "grad_norm": 0.23015514016151428, "learning_rate": 0.00015464344941956882, "loss": 0.4867, "step": 1763 }, { "epoch": 0.9724366041896362, "grad_norm": 0.22983665764331818, "learning_rate": 0.00015456053067993366, "loss": 0.4608, "step": 1764 }, { "epoch": 0.9729878721058435, "grad_norm": 0.22515413165092468, "learning_rate": 0.00015447761194029851, "loss": 0.4578, "step": 1765 }, { "epoch": 0.9735391400220507, "grad_norm": 0.23187264800071716, "learning_rate": 0.00015439469320066332, "loss": 0.4253, "step": 1766 }, { "epoch": 0.974090407938258, "grad_norm": 0.23280374705791473, "learning_rate": 0.00015431177446102818, "loss": 0.4473, "step": 1767 }, { "epoch": 0.9746416758544653, "grad_norm": 0.2500572204589844, "learning_rate": 0.000154228855721393, "loss": 0.4519, "step": 1768 }, { "epoch": 0.9751929437706726, "grad_norm": 0.23001956939697266, "learning_rate": 0.00015414593698175787, "loss": 0.4708, "step": 1769 }, { "epoch": 0.9757442116868799, "grad_norm": 0.23875866830348969, "learning_rate": 0.0001540630182421227, "loss": 0.4679, "step": 1770 }, { "epoch": 0.976295479603087, "grad_norm": 0.22990469634532928, "learning_rate": 0.00015398009950248756, "loss": 0.4632, "step": 1771 }, { "epoch": 0.9768467475192943, "grad_norm": 0.24912653863430023, "learning_rate": 0.0001538971807628524, "loss": 0.4569, "step": 1772 }, { "epoch": 0.9773980154355016, "grad_norm": 0.2521923780441284, "learning_rate": 0.00015381426202321725, "loss": 0.4696, "step": 1773 }, { "epoch": 0.9779492833517089, "grad_norm": 0.23184111714363098, "learning_rate": 0.00015373134328358208, "loss": 0.4518, "step": 1774 }, { "epoch": 0.9785005512679162, "grad_norm": 0.22830599546432495, "learning_rate": 0.0001536484245439469, "loss": 0.4511, "step": 1775 }, { "epoch": 0.9790518191841234, "grad_norm": 0.24908460676670074, "learning_rate": 0.00015356550580431175, "loss": 0.4556, "step": 1776 }, { "epoch": 0.9796030871003307, "grad_norm": 0.2542704939842224, "learning_rate": 0.00015348258706467658, "loss": 0.4876, "step": 1777 }, { "epoch": 0.980154355016538, "grad_norm": 0.23091669380664825, "learning_rate": 0.00015339966832504144, "loss": 0.4502, "step": 1778 }, { "epoch": 0.9807056229327453, "grad_norm": 0.24079181253910065, "learning_rate": 0.00015331674958540627, "loss": 0.4549, "step": 1779 }, { "epoch": 0.9812568908489526, "grad_norm": 0.224042147397995, "learning_rate": 0.00015323383084577113, "loss": 0.4568, "step": 1780 }, { "epoch": 0.9818081587651599, "grad_norm": 0.23204737901687622, "learning_rate": 0.00015315091210613596, "loss": 0.4516, "step": 1781 }, { "epoch": 0.9823594266813671, "grad_norm": 0.24899733066558838, "learning_rate": 0.00015306799336650082, "loss": 0.4422, "step": 1782 }, { "epoch": 0.9829106945975744, "grad_norm": 0.2473718822002411, "learning_rate": 0.00015298507462686565, "loss": 0.4698, "step": 1783 }, { "epoch": 0.9834619625137817, "grad_norm": 0.23376363515853882, "learning_rate": 0.0001529021558872305, "loss": 0.4735, "step": 1784 }, { "epoch": 0.984013230429989, "grad_norm": 0.21901825070381165, "learning_rate": 0.00015281923714759532, "loss": 0.4055, "step": 1785 }, { "epoch": 0.9845644983461963, "grad_norm": 0.24539053440093994, "learning_rate": 0.00015273631840796018, "loss": 0.477, "step": 1786 }, { "epoch": 0.9851157662624035, "grad_norm": 0.2802634537220001, "learning_rate": 0.000152653399668325, "loss": 0.4924, "step": 1787 }, { "epoch": 0.9856670341786108, "grad_norm": 0.2387421429157257, "learning_rate": 0.00015257048092868987, "loss": 0.4671, "step": 1788 }, { "epoch": 0.9862183020948181, "grad_norm": 0.22999261319637299, "learning_rate": 0.0001524875621890547, "loss": 0.4682, "step": 1789 }, { "epoch": 0.9867695700110254, "grad_norm": 0.2567140758037567, "learning_rate": 0.00015240464344941956, "loss": 0.4395, "step": 1790 }, { "epoch": 0.9873208379272327, "grad_norm": 0.24533671140670776, "learning_rate": 0.0001523217247097844, "loss": 0.4415, "step": 1791 }, { "epoch": 0.9878721058434399, "grad_norm": 0.24147699773311615, "learning_rate": 0.00015223880597014925, "loss": 0.4731, "step": 1792 }, { "epoch": 0.9884233737596472, "grad_norm": 0.23697462677955627, "learning_rate": 0.00015215588723051408, "loss": 0.451, "step": 1793 }, { "epoch": 0.9889746416758545, "grad_norm": 0.2380775809288025, "learning_rate": 0.00015207296849087894, "loss": 0.452, "step": 1794 }, { "epoch": 0.9895259095920618, "grad_norm": 0.24654051661491394, "learning_rate": 0.00015199004975124375, "loss": 0.4724, "step": 1795 }, { "epoch": 0.9900771775082691, "grad_norm": 0.2548507750034332, "learning_rate": 0.0001519071310116086, "loss": 0.4578, "step": 1796 }, { "epoch": 0.9906284454244763, "grad_norm": 0.23419903218746185, "learning_rate": 0.00015182421227197344, "loss": 0.4627, "step": 1797 }, { "epoch": 0.9911797133406836, "grad_norm": 0.2721438705921173, "learning_rate": 0.0001517412935323383, "loss": 0.4704, "step": 1798 }, { "epoch": 0.9917309812568909, "grad_norm": 0.22823266685009003, "learning_rate": 0.00015165837479270313, "loss": 0.4402, "step": 1799 }, { "epoch": 0.9922822491730982, "grad_norm": 0.3155699372291565, "learning_rate": 0.000151575456053068, "loss": 0.4537, "step": 1800 }, { "epoch": 0.9928335170893055, "grad_norm": 0.24750587344169617, "learning_rate": 0.00015149253731343282, "loss": 0.4674, "step": 1801 }, { "epoch": 0.9933847850055126, "grad_norm": 0.23167037963867188, "learning_rate": 0.00015140961857379768, "loss": 0.4506, "step": 1802 }, { "epoch": 0.9939360529217199, "grad_norm": 0.24583961069583893, "learning_rate": 0.0001513266998341625, "loss": 0.4809, "step": 1803 }, { "epoch": 0.9944873208379272, "grad_norm": 0.23894868791103363, "learning_rate": 0.00015124378109452737, "loss": 0.4729, "step": 1804 }, { "epoch": 0.9950385887541345, "grad_norm": 0.23357604444026947, "learning_rate": 0.00015116086235489218, "loss": 0.4608, "step": 1805 }, { "epoch": 0.9955898566703418, "grad_norm": 0.2364039272069931, "learning_rate": 0.00015107794361525703, "loss": 0.4803, "step": 1806 }, { "epoch": 0.996141124586549, "grad_norm": 0.23034816980361938, "learning_rate": 0.00015099502487562187, "loss": 0.4687, "step": 1807 }, { "epoch": 0.9966923925027563, "grad_norm": 0.23677074909210205, "learning_rate": 0.00015091210613598673, "loss": 0.4591, "step": 1808 }, { "epoch": 0.9972436604189636, "grad_norm": 0.24638359248638153, "learning_rate": 0.00015082918739635156, "loss": 0.462, "step": 1809 }, { "epoch": 0.9977949283351709, "grad_norm": 0.23346304893493652, "learning_rate": 0.00015074626865671642, "loss": 0.4245, "step": 1810 }, { "epoch": 0.9983461962513782, "grad_norm": 0.2604617774486542, "learning_rate": 0.00015066334991708125, "loss": 0.4665, "step": 1811 }, { "epoch": 0.9988974641675854, "grad_norm": 0.22308942675590515, "learning_rate": 0.0001505804311774461, "loss": 0.4671, "step": 1812 }, { "epoch": 0.9994487320837927, "grad_norm": 0.2405402511358261, "learning_rate": 0.00015049751243781094, "loss": 0.4808, "step": 1813 }, { "epoch": 1.0, "grad_norm": 0.2668411433696747, "learning_rate": 0.0001504145936981758, "loss": 0.4683, "step": 1814 }, { "epoch": 1.0005512679162072, "grad_norm": 0.23000217974185944, "learning_rate": 0.0001503316749585406, "loss": 0.3736, "step": 1815 }, { "epoch": 1.0011025358324146, "grad_norm": 0.2307773381471634, "learning_rate": 0.00015024875621890546, "loss": 0.3834, "step": 1816 }, { "epoch": 1.0016538037486218, "grad_norm": 0.23737002909183502, "learning_rate": 0.0001501658374792703, "loss": 0.3863, "step": 1817 }, { "epoch": 1.0022050716648292, "grad_norm": 0.2283601313829422, "learning_rate": 0.00015008291873963515, "loss": 0.3837, "step": 1818 }, { "epoch": 1.0027563395810364, "grad_norm": 0.21821331977844238, "learning_rate": 0.00015, "loss": 0.4085, "step": 1819 }, { "epoch": 1.0033076074972436, "grad_norm": 0.2391849011182785, "learning_rate": 0.00014991708126036482, "loss": 0.4207, "step": 1820 }, { "epoch": 1.003858875413451, "grad_norm": 0.23875446617603302, "learning_rate": 0.00014983416252072968, "loss": 0.4013, "step": 1821 }, { "epoch": 1.0044101433296582, "grad_norm": 0.24305221438407898, "learning_rate": 0.0001497512437810945, "loss": 0.4106, "step": 1822 }, { "epoch": 1.0049614112458656, "grad_norm": 0.21675904095172882, "learning_rate": 0.00014966832504145937, "loss": 0.3744, "step": 1823 }, { "epoch": 1.0055126791620728, "grad_norm": 0.23470553755760193, "learning_rate": 0.0001495854063018242, "loss": 0.3997, "step": 1824 }, { "epoch": 1.00606394707828, "grad_norm": 0.2310658097267151, "learning_rate": 0.00014950248756218903, "loss": 0.411, "step": 1825 }, { "epoch": 1.0066152149944874, "grad_norm": 0.23178675770759583, "learning_rate": 0.0001494195688225539, "loss": 0.3884, "step": 1826 }, { "epoch": 1.0071664829106945, "grad_norm": 0.23985427618026733, "learning_rate": 0.00014933665008291872, "loss": 0.4026, "step": 1827 }, { "epoch": 1.007717750826902, "grad_norm": 0.228210911154747, "learning_rate": 0.00014925373134328358, "loss": 0.3952, "step": 1828 }, { "epoch": 1.0082690187431091, "grad_norm": 0.22802165150642395, "learning_rate": 0.00014917081260364842, "loss": 0.4194, "step": 1829 }, { "epoch": 1.0088202866593163, "grad_norm": 0.2423812299966812, "learning_rate": 0.00014908789386401325, "loss": 0.4282, "step": 1830 }, { "epoch": 1.0093715545755237, "grad_norm": 0.23589813709259033, "learning_rate": 0.0001490049751243781, "loss": 0.3911, "step": 1831 }, { "epoch": 1.009922822491731, "grad_norm": 0.21917280554771423, "learning_rate": 0.00014892205638474294, "loss": 0.3723, "step": 1832 }, { "epoch": 1.0104740904079383, "grad_norm": 0.22650456428527832, "learning_rate": 0.0001488391376451078, "loss": 0.3962, "step": 1833 }, { "epoch": 1.0110253583241455, "grad_norm": 0.23731641471385956, "learning_rate": 0.00014875621890547263, "loss": 0.4235, "step": 1834 }, { "epoch": 1.0115766262403527, "grad_norm": 0.21167220175266266, "learning_rate": 0.00014867330016583746, "loss": 0.3786, "step": 1835 }, { "epoch": 1.0121278941565601, "grad_norm": 0.23506543040275574, "learning_rate": 0.00014859038142620232, "loss": 0.4098, "step": 1836 }, { "epoch": 1.0126791620727673, "grad_norm": 0.25581830739974976, "learning_rate": 0.00014850746268656715, "loss": 0.4052, "step": 1837 }, { "epoch": 1.0132304299889747, "grad_norm": 0.2236202359199524, "learning_rate": 0.000148424543946932, "loss": 0.3975, "step": 1838 }, { "epoch": 1.013781697905182, "grad_norm": 0.21659554541110992, "learning_rate": 0.00014834162520729684, "loss": 0.3843, "step": 1839 }, { "epoch": 1.014332965821389, "grad_norm": 0.22564005851745605, "learning_rate": 0.00014825870646766168, "loss": 0.4013, "step": 1840 }, { "epoch": 1.0148842337375965, "grad_norm": 0.225655660033226, "learning_rate": 0.00014817578772802654, "loss": 0.3976, "step": 1841 }, { "epoch": 1.0154355016538037, "grad_norm": 0.21095581352710724, "learning_rate": 0.00014809286898839137, "loss": 0.3812, "step": 1842 }, { "epoch": 1.015986769570011, "grad_norm": 0.23854820430278778, "learning_rate": 0.0001480099502487562, "loss": 0.4089, "step": 1843 }, { "epoch": 1.0165380374862183, "grad_norm": 0.22585038840770721, "learning_rate": 0.00014792703150912103, "loss": 0.4193, "step": 1844 }, { "epoch": 1.0170893054024255, "grad_norm": 0.2209796905517578, "learning_rate": 0.0001478441127694859, "loss": 0.3989, "step": 1845 }, { "epoch": 1.017640573318633, "grad_norm": 0.2113056629896164, "learning_rate": 0.00014776119402985072, "loss": 0.4089, "step": 1846 }, { "epoch": 1.01819184123484, "grad_norm": 0.22150270640850067, "learning_rate": 0.00014767827529021558, "loss": 0.3946, "step": 1847 }, { "epoch": 1.0187431091510475, "grad_norm": 0.22819051146507263, "learning_rate": 0.00014759535655058041, "loss": 0.3914, "step": 1848 }, { "epoch": 1.0192943770672547, "grad_norm": 0.21912482380867004, "learning_rate": 0.00014751243781094525, "loss": 0.3621, "step": 1849 }, { "epoch": 1.0198456449834619, "grad_norm": 0.22611315548419952, "learning_rate": 0.0001474295190713101, "loss": 0.386, "step": 1850 }, { "epoch": 1.0203969128996693, "grad_norm": 0.225437730550766, "learning_rate": 0.00014734660033167494, "loss": 0.4115, "step": 1851 }, { "epoch": 1.0209481808158765, "grad_norm": 0.22555771470069885, "learning_rate": 0.0001472636815920398, "loss": 0.4121, "step": 1852 }, { "epoch": 1.0214994487320839, "grad_norm": 0.22996987402439117, "learning_rate": 0.00014718076285240463, "loss": 0.3799, "step": 1853 }, { "epoch": 1.022050716648291, "grad_norm": 0.227546826004982, "learning_rate": 0.00014709784411276946, "loss": 0.406, "step": 1854 }, { "epoch": 1.0226019845644982, "grad_norm": 0.21384532749652863, "learning_rate": 0.00014701492537313432, "loss": 0.393, "step": 1855 }, { "epoch": 1.0231532524807057, "grad_norm": 0.21834981441497803, "learning_rate": 0.00014693200663349915, "loss": 0.3737, "step": 1856 }, { "epoch": 1.0237045203969128, "grad_norm": 0.2231069952249527, "learning_rate": 0.000146849087893864, "loss": 0.3755, "step": 1857 }, { "epoch": 1.0242557883131203, "grad_norm": 0.22336961328983307, "learning_rate": 0.00014676616915422884, "loss": 0.3936, "step": 1858 }, { "epoch": 1.0248070562293274, "grad_norm": 0.22250871360301971, "learning_rate": 0.00014668325041459367, "loss": 0.4021, "step": 1859 }, { "epoch": 1.0253583241455346, "grad_norm": 0.21691983938217163, "learning_rate": 0.00014660033167495853, "loss": 0.375, "step": 1860 }, { "epoch": 1.025909592061742, "grad_norm": 0.2267792969942093, "learning_rate": 0.00014651741293532337, "loss": 0.4089, "step": 1861 }, { "epoch": 1.0264608599779492, "grad_norm": 0.22236919403076172, "learning_rate": 0.00014643449419568823, "loss": 0.384, "step": 1862 }, { "epoch": 1.0270121278941566, "grad_norm": 0.2280534952878952, "learning_rate": 0.00014635157545605306, "loss": 0.3982, "step": 1863 }, { "epoch": 1.0275633958103638, "grad_norm": 0.23323461413383484, "learning_rate": 0.0001462686567164179, "loss": 0.3947, "step": 1864 }, { "epoch": 1.028114663726571, "grad_norm": 0.2187027931213379, "learning_rate": 0.00014618573797678275, "loss": 0.3758, "step": 1865 }, { "epoch": 1.0286659316427784, "grad_norm": 0.2233375459909439, "learning_rate": 0.00014610281923714758, "loss": 0.3889, "step": 1866 }, { "epoch": 1.0292171995589856, "grad_norm": 0.23430676758289337, "learning_rate": 0.00014601990049751244, "loss": 0.3919, "step": 1867 }, { "epoch": 1.029768467475193, "grad_norm": 0.22947613894939423, "learning_rate": 0.00014593698175787727, "loss": 0.3886, "step": 1868 }, { "epoch": 1.0303197353914002, "grad_norm": 0.23334287106990814, "learning_rate": 0.0001458540630182421, "loss": 0.413, "step": 1869 }, { "epoch": 1.0308710033076074, "grad_norm": 0.2178686261177063, "learning_rate": 0.00014577114427860696, "loss": 0.393, "step": 1870 }, { "epoch": 1.0314222712238148, "grad_norm": 0.2510049045085907, "learning_rate": 0.0001456882255389718, "loss": 0.413, "step": 1871 }, { "epoch": 1.031973539140022, "grad_norm": 0.23210124671459198, "learning_rate": 0.00014560530679933665, "loss": 0.3817, "step": 1872 }, { "epoch": 1.0325248070562294, "grad_norm": 0.23246748745441437, "learning_rate": 0.00014552238805970149, "loss": 0.4026, "step": 1873 }, { "epoch": 1.0330760749724366, "grad_norm": 0.22752533853054047, "learning_rate": 0.00014543946932006632, "loss": 0.411, "step": 1874 }, { "epoch": 1.0336273428886438, "grad_norm": 0.21562816202640533, "learning_rate": 0.00014535655058043118, "loss": 0.3966, "step": 1875 }, { "epoch": 1.0341786108048512, "grad_norm": 0.227711021900177, "learning_rate": 0.000145273631840796, "loss": 0.4008, "step": 1876 }, { "epoch": 1.0347298787210584, "grad_norm": 0.22064116597175598, "learning_rate": 0.00014519071310116087, "loss": 0.3855, "step": 1877 }, { "epoch": 1.0352811466372658, "grad_norm": 0.22657108306884766, "learning_rate": 0.0001451077943615257, "loss": 0.4147, "step": 1878 }, { "epoch": 1.035832414553473, "grad_norm": 0.220686674118042, "learning_rate": 0.00014502487562189053, "loss": 0.3953, "step": 1879 }, { "epoch": 1.0363836824696802, "grad_norm": 0.21113237738609314, "learning_rate": 0.0001449419568822554, "loss": 0.3908, "step": 1880 }, { "epoch": 1.0369349503858876, "grad_norm": 0.21575047075748444, "learning_rate": 0.00014485903814262022, "loss": 0.3917, "step": 1881 }, { "epoch": 1.0374862183020948, "grad_norm": 0.22273024916648865, "learning_rate": 0.00014477611940298508, "loss": 0.4007, "step": 1882 }, { "epoch": 1.0380374862183022, "grad_norm": 0.22036762535572052, "learning_rate": 0.00014469320066334991, "loss": 0.3797, "step": 1883 }, { "epoch": 1.0385887541345094, "grad_norm": 0.22144779562950134, "learning_rate": 0.00014461028192371475, "loss": 0.3911, "step": 1884 }, { "epoch": 1.0391400220507165, "grad_norm": 0.22937916219234467, "learning_rate": 0.0001445273631840796, "loss": 0.406, "step": 1885 }, { "epoch": 1.039691289966924, "grad_norm": 0.21770672500133514, "learning_rate": 0.0001444444444444444, "loss": 0.389, "step": 1886 }, { "epoch": 1.0402425578831311, "grad_norm": 0.2170240730047226, "learning_rate": 0.00014436152570480927, "loss": 0.4225, "step": 1887 }, { "epoch": 1.0407938257993385, "grad_norm": 0.23694483935832977, "learning_rate": 0.0001442786069651741, "loss": 0.4124, "step": 1888 }, { "epoch": 1.0413450937155457, "grad_norm": 0.2358977198600769, "learning_rate": 0.00014419568822553896, "loss": 0.3932, "step": 1889 }, { "epoch": 1.041896361631753, "grad_norm": 0.2379174828529358, "learning_rate": 0.0001441127694859038, "loss": 0.3921, "step": 1890 }, { "epoch": 1.0424476295479603, "grad_norm": 0.22685475647449493, "learning_rate": 0.00014402985074626863, "loss": 0.398, "step": 1891 }, { "epoch": 1.0429988974641675, "grad_norm": 0.2381109744310379, "learning_rate": 0.00014394693200663348, "loss": 0.4002, "step": 1892 }, { "epoch": 1.043550165380375, "grad_norm": 0.23132000863552094, "learning_rate": 0.00014386401326699832, "loss": 0.3917, "step": 1893 }, { "epoch": 1.0441014332965821, "grad_norm": 0.23595485091209412, "learning_rate": 0.00014378109452736318, "loss": 0.3811, "step": 1894 }, { "epoch": 1.0446527012127893, "grad_norm": 0.23046362400054932, "learning_rate": 0.000143698175787728, "loss": 0.389, "step": 1895 }, { "epoch": 1.0452039691289967, "grad_norm": 0.21979711949825287, "learning_rate": 0.00014361525704809284, "loss": 0.4008, "step": 1896 }, { "epoch": 1.045755237045204, "grad_norm": 0.21169352531433105, "learning_rate": 0.0001435323383084577, "loss": 0.3767, "step": 1897 }, { "epoch": 1.0463065049614113, "grad_norm": 0.2226918339729309, "learning_rate": 0.00014344941956882253, "loss": 0.4059, "step": 1898 }, { "epoch": 1.0468577728776185, "grad_norm": 0.23048485815525055, "learning_rate": 0.0001433665008291874, "loss": 0.4013, "step": 1899 }, { "epoch": 1.0474090407938257, "grad_norm": 0.22347117960453033, "learning_rate": 0.00014328358208955222, "loss": 0.4042, "step": 1900 }, { "epoch": 1.047960308710033, "grad_norm": 0.2321341335773468, "learning_rate": 0.00014320066334991705, "loss": 0.4055, "step": 1901 }, { "epoch": 1.0485115766262403, "grad_norm": 0.22918953001499176, "learning_rate": 0.0001431177446102819, "loss": 0.3845, "step": 1902 }, { "epoch": 1.0490628445424477, "grad_norm": 0.21781106293201447, "learning_rate": 0.00014303482587064675, "loss": 0.4067, "step": 1903 }, { "epoch": 1.0496141124586549, "grad_norm": 0.21180634200572968, "learning_rate": 0.0001429519071310116, "loss": 0.3891, "step": 1904 }, { "epoch": 1.0501653803748623, "grad_norm": 0.2400248795747757, "learning_rate": 0.00014286898839137644, "loss": 0.3878, "step": 1905 }, { "epoch": 1.0507166482910695, "grad_norm": 0.22464604675769806, "learning_rate": 0.00014278606965174127, "loss": 0.3909, "step": 1906 }, { "epoch": 1.0512679162072767, "grad_norm": 0.23820553719997406, "learning_rate": 0.00014270315091210613, "loss": 0.3967, "step": 1907 }, { "epoch": 1.051819184123484, "grad_norm": 0.23168790340423584, "learning_rate": 0.00014262023217247096, "loss": 0.4057, "step": 1908 }, { "epoch": 1.0523704520396913, "grad_norm": 0.2253868579864502, "learning_rate": 0.00014253731343283582, "loss": 0.3844, "step": 1909 }, { "epoch": 1.0529217199558987, "grad_norm": 0.21465058624744415, "learning_rate": 0.00014245439469320065, "loss": 0.3804, "step": 1910 }, { "epoch": 1.0534729878721059, "grad_norm": 0.22617360949516296, "learning_rate": 0.00014237147595356548, "loss": 0.3738, "step": 1911 }, { "epoch": 1.054024255788313, "grad_norm": 0.23942868411540985, "learning_rate": 0.00014228855721393034, "loss": 0.4044, "step": 1912 }, { "epoch": 1.0545755237045205, "grad_norm": 0.23497670888900757, "learning_rate": 0.00014220563847429517, "loss": 0.4138, "step": 1913 }, { "epoch": 1.0551267916207276, "grad_norm": 0.229624941945076, "learning_rate": 0.00014212271973466003, "loss": 0.402, "step": 1914 }, { "epoch": 1.055678059536935, "grad_norm": 0.22944937646389008, "learning_rate": 0.00014203980099502486, "loss": 0.4016, "step": 1915 }, { "epoch": 1.0562293274531422, "grad_norm": 0.2452874332666397, "learning_rate": 0.0001419568822553897, "loss": 0.4149, "step": 1916 }, { "epoch": 1.0567805953693494, "grad_norm": 0.23434410989284515, "learning_rate": 0.00014187396351575456, "loss": 0.3818, "step": 1917 }, { "epoch": 1.0573318632855568, "grad_norm": 0.22487396001815796, "learning_rate": 0.0001417910447761194, "loss": 0.4071, "step": 1918 }, { "epoch": 1.057883131201764, "grad_norm": 0.2129317820072174, "learning_rate": 0.00014170812603648425, "loss": 0.3653, "step": 1919 }, { "epoch": 1.0584343991179714, "grad_norm": 0.21573378145694733, "learning_rate": 0.00014162520729684908, "loss": 0.3924, "step": 1920 }, { "epoch": 1.0589856670341786, "grad_norm": 0.23635123670101166, "learning_rate": 0.0001415422885572139, "loss": 0.3883, "step": 1921 }, { "epoch": 1.0595369349503858, "grad_norm": 0.23705770075321198, "learning_rate": 0.00014145936981757877, "loss": 0.3865, "step": 1922 }, { "epoch": 1.0600882028665932, "grad_norm": 0.22904790937900543, "learning_rate": 0.0001413764510779436, "loss": 0.3851, "step": 1923 }, { "epoch": 1.0606394707828004, "grad_norm": 0.21958112716674805, "learning_rate": 0.00014129353233830846, "loss": 0.3965, "step": 1924 }, { "epoch": 1.0611907386990078, "grad_norm": 0.232145294547081, "learning_rate": 0.0001412106135986733, "loss": 0.4001, "step": 1925 }, { "epoch": 1.061742006615215, "grad_norm": 0.23748160898685455, "learning_rate": 0.00014112769485903813, "loss": 0.3809, "step": 1926 }, { "epoch": 1.0622932745314222, "grad_norm": 0.25450122356414795, "learning_rate": 0.00014104477611940298, "loss": 0.3986, "step": 1927 }, { "epoch": 1.0628445424476296, "grad_norm": 0.23028801381587982, "learning_rate": 0.00014096185737976782, "loss": 0.3905, "step": 1928 }, { "epoch": 1.0633958103638368, "grad_norm": 0.23206226527690887, "learning_rate": 0.00014087893864013268, "loss": 0.3757, "step": 1929 }, { "epoch": 1.0639470782800442, "grad_norm": 0.23685060441493988, "learning_rate": 0.00014079601990049748, "loss": 0.3844, "step": 1930 }, { "epoch": 1.0644983461962514, "grad_norm": 0.22835825383663177, "learning_rate": 0.00014071310116086234, "loss": 0.388, "step": 1931 }, { "epoch": 1.0650496141124586, "grad_norm": 0.2305503487586975, "learning_rate": 0.00014063018242122717, "loss": 0.4015, "step": 1932 }, { "epoch": 1.065600882028666, "grad_norm": 0.23914876580238342, "learning_rate": 0.00014054726368159203, "loss": 0.3826, "step": 1933 }, { "epoch": 1.0661521499448732, "grad_norm": 0.2508886158466339, "learning_rate": 0.00014046434494195686, "loss": 0.3948, "step": 1934 }, { "epoch": 1.0667034178610806, "grad_norm": 0.280200332403183, "learning_rate": 0.0001403814262023217, "loss": 0.4042, "step": 1935 }, { "epoch": 1.0672546857772878, "grad_norm": 0.22536714375019073, "learning_rate": 0.00014029850746268655, "loss": 0.3948, "step": 1936 }, { "epoch": 1.067805953693495, "grad_norm": 0.24053654074668884, "learning_rate": 0.0001402155887230514, "loss": 0.3976, "step": 1937 }, { "epoch": 1.0683572216097024, "grad_norm": 0.2461492270231247, "learning_rate": 0.00014013266998341625, "loss": 0.385, "step": 1938 }, { "epoch": 1.0689084895259096, "grad_norm": 0.24768413603305817, "learning_rate": 0.00014004975124378108, "loss": 0.3734, "step": 1939 }, { "epoch": 1.069459757442117, "grad_norm": 0.2460828721523285, "learning_rate": 0.0001399668325041459, "loss": 0.3924, "step": 1940 }, { "epoch": 1.0700110253583242, "grad_norm": 0.2739814519882202, "learning_rate": 0.00013988391376451077, "loss": 0.3779, "step": 1941 }, { "epoch": 1.0705622932745313, "grad_norm": 0.23434729874134064, "learning_rate": 0.0001398009950248756, "loss": 0.4186, "step": 1942 }, { "epoch": 1.0711135611907387, "grad_norm": 0.23552288115024567, "learning_rate": 0.00013971807628524046, "loss": 0.3951, "step": 1943 }, { "epoch": 1.071664829106946, "grad_norm": 0.2381044626235962, "learning_rate": 0.0001396351575456053, "loss": 0.3938, "step": 1944 }, { "epoch": 1.0722160970231533, "grad_norm": 0.25459203124046326, "learning_rate": 0.00013955223880597012, "loss": 0.3997, "step": 1945 }, { "epoch": 1.0727673649393605, "grad_norm": 0.2563784718513489, "learning_rate": 0.00013946932006633498, "loss": 0.404, "step": 1946 }, { "epoch": 1.0733186328555677, "grad_norm": 0.23130348324775696, "learning_rate": 0.00013938640132669982, "loss": 0.3844, "step": 1947 }, { "epoch": 1.0738699007717751, "grad_norm": 0.24562886357307434, "learning_rate": 0.00013930348258706467, "loss": 0.4131, "step": 1948 }, { "epoch": 1.0744211686879823, "grad_norm": 0.22779060900211334, "learning_rate": 0.0001392205638474295, "loss": 0.4107, "step": 1949 }, { "epoch": 1.0749724366041897, "grad_norm": 0.23528602719306946, "learning_rate": 0.00013913764510779434, "loss": 0.4128, "step": 1950 }, { "epoch": 1.075523704520397, "grad_norm": 0.23987142741680145, "learning_rate": 0.0001390547263681592, "loss": 0.3987, "step": 1951 }, { "epoch": 1.076074972436604, "grad_norm": 0.2401638627052307, "learning_rate": 0.00013897180762852403, "loss": 0.3923, "step": 1952 }, { "epoch": 1.0766262403528115, "grad_norm": 0.24218258261680603, "learning_rate": 0.0001388888888888889, "loss": 0.4001, "step": 1953 }, { "epoch": 1.0771775082690187, "grad_norm": 0.23231711983680725, "learning_rate": 0.00013880597014925372, "loss": 0.3795, "step": 1954 }, { "epoch": 1.0777287761852261, "grad_norm": 0.2225574404001236, "learning_rate": 0.00013872305140961855, "loss": 0.3867, "step": 1955 }, { "epoch": 1.0782800441014333, "grad_norm": 0.22481811046600342, "learning_rate": 0.0001386401326699834, "loss": 0.3946, "step": 1956 }, { "epoch": 1.0788313120176405, "grad_norm": 0.22649556398391724, "learning_rate": 0.00013855721393034824, "loss": 0.3834, "step": 1957 }, { "epoch": 1.079382579933848, "grad_norm": 0.21780644357204437, "learning_rate": 0.0001384742951907131, "loss": 0.3874, "step": 1958 }, { "epoch": 1.079933847850055, "grad_norm": 0.21539410948753357, "learning_rate": 0.00013839137645107794, "loss": 0.3788, "step": 1959 }, { "epoch": 1.0804851157662625, "grad_norm": 0.22845754027366638, "learning_rate": 0.00013830845771144277, "loss": 0.395, "step": 1960 }, { "epoch": 1.0810363836824697, "grad_norm": 0.23722249269485474, "learning_rate": 0.00013822553897180763, "loss": 0.3993, "step": 1961 }, { "epoch": 1.0815876515986769, "grad_norm": 0.2395038902759552, "learning_rate": 0.00013814262023217246, "loss": 0.4204, "step": 1962 }, { "epoch": 1.0821389195148843, "grad_norm": 0.2149537056684494, "learning_rate": 0.00013805970149253732, "loss": 0.381, "step": 1963 }, { "epoch": 1.0826901874310915, "grad_norm": 0.24547190964221954, "learning_rate": 0.00013797678275290215, "loss": 0.404, "step": 1964 }, { "epoch": 1.0832414553472989, "grad_norm": 0.21485422551631927, "learning_rate": 0.00013789386401326698, "loss": 0.3756, "step": 1965 }, { "epoch": 1.083792723263506, "grad_norm": 0.2199661284685135, "learning_rate": 0.00013781094527363184, "loss": 0.39, "step": 1966 }, { "epoch": 1.0843439911797133, "grad_norm": 0.2321014702320099, "learning_rate": 0.00013772802653399667, "loss": 0.3877, "step": 1967 }, { "epoch": 1.0848952590959207, "grad_norm": 0.23033714294433594, "learning_rate": 0.00013764510779436153, "loss": 0.4018, "step": 1968 }, { "epoch": 1.0854465270121278, "grad_norm": 0.2251034677028656, "learning_rate": 0.00013756218905472636, "loss": 0.3911, "step": 1969 }, { "epoch": 1.0859977949283353, "grad_norm": 0.22630800306797028, "learning_rate": 0.0001374792703150912, "loss": 0.397, "step": 1970 }, { "epoch": 1.0865490628445424, "grad_norm": 0.22938160598278046, "learning_rate": 0.00013739635157545606, "loss": 0.401, "step": 1971 }, { "epoch": 1.0871003307607496, "grad_norm": 0.24200983345508575, "learning_rate": 0.0001373134328358209, "loss": 0.3988, "step": 1972 }, { "epoch": 1.087651598676957, "grad_norm": 0.25386059284210205, "learning_rate": 0.00013723051409618575, "loss": 0.4093, "step": 1973 }, { "epoch": 1.0882028665931642, "grad_norm": 0.2258448451757431, "learning_rate": 0.00013714759535655055, "loss": 0.386, "step": 1974 }, { "epoch": 1.0887541345093716, "grad_norm": 0.2277601659297943, "learning_rate": 0.0001370646766169154, "loss": 0.4041, "step": 1975 }, { "epoch": 1.0893054024255788, "grad_norm": 0.20614218711853027, "learning_rate": 0.00013698175787728024, "loss": 0.3784, "step": 1976 }, { "epoch": 1.089856670341786, "grad_norm": 0.22764301300048828, "learning_rate": 0.0001368988391376451, "loss": 0.395, "step": 1977 }, { "epoch": 1.0904079382579934, "grad_norm": 0.23423810303211212, "learning_rate": 0.00013681592039800993, "loss": 0.4114, "step": 1978 }, { "epoch": 1.0909592061742006, "grad_norm": 0.2042825073003769, "learning_rate": 0.00013673300165837477, "loss": 0.3724, "step": 1979 }, { "epoch": 1.091510474090408, "grad_norm": 0.2203364223241806, "learning_rate": 0.00013665008291873962, "loss": 0.4084, "step": 1980 }, { "epoch": 1.0920617420066152, "grad_norm": 0.23350727558135986, "learning_rate": 0.00013656716417910446, "loss": 0.4041, "step": 1981 }, { "epoch": 1.0926130099228224, "grad_norm": 0.23900878429412842, "learning_rate": 0.00013648424543946932, "loss": 0.3976, "step": 1982 }, { "epoch": 1.0931642778390298, "grad_norm": 0.22579023241996765, "learning_rate": 0.00013640132669983415, "loss": 0.4019, "step": 1983 }, { "epoch": 1.093715545755237, "grad_norm": 0.23907893896102905, "learning_rate": 0.00013631840796019898, "loss": 0.4185, "step": 1984 }, { "epoch": 1.0942668136714444, "grad_norm": 0.22953177988529205, "learning_rate": 0.00013623548922056384, "loss": 0.4009, "step": 1985 }, { "epoch": 1.0948180815876516, "grad_norm": 0.22816117107868195, "learning_rate": 0.00013615257048092867, "loss": 0.3773, "step": 1986 }, { "epoch": 1.0953693495038588, "grad_norm": 0.2403888702392578, "learning_rate": 0.00013606965174129353, "loss": 0.3857, "step": 1987 }, { "epoch": 1.0959206174200662, "grad_norm": 0.2400594800710678, "learning_rate": 0.00013598673300165836, "loss": 0.398, "step": 1988 }, { "epoch": 1.0964718853362734, "grad_norm": 0.2451186329126358, "learning_rate": 0.0001359038142620232, "loss": 0.4066, "step": 1989 }, { "epoch": 1.0970231532524808, "grad_norm": 0.2371450811624527, "learning_rate": 0.00013582089552238805, "loss": 0.3855, "step": 1990 }, { "epoch": 1.097574421168688, "grad_norm": 0.2529587745666504, "learning_rate": 0.00013573797678275289, "loss": 0.3851, "step": 1991 }, { "epoch": 1.0981256890848952, "grad_norm": 0.23810137808322906, "learning_rate": 0.00013565505804311774, "loss": 0.3644, "step": 1992 }, { "epoch": 1.0986769570011026, "grad_norm": 0.23532289266586304, "learning_rate": 0.00013557213930348258, "loss": 0.3813, "step": 1993 }, { "epoch": 1.0992282249173098, "grad_norm": 0.2418917566537857, "learning_rate": 0.0001354892205638474, "loss": 0.3775, "step": 1994 }, { "epoch": 1.0997794928335172, "grad_norm": 0.2366194874048233, "learning_rate": 0.00013540630182421227, "loss": 0.4047, "step": 1995 }, { "epoch": 1.1003307607497244, "grad_norm": 0.23951660096645355, "learning_rate": 0.0001353233830845771, "loss": 0.3956, "step": 1996 }, { "epoch": 1.1008820286659315, "grad_norm": 0.260423481464386, "learning_rate": 0.00013524046434494196, "loss": 0.3979, "step": 1997 }, { "epoch": 1.101433296582139, "grad_norm": 0.22453179955482483, "learning_rate": 0.0001351575456053068, "loss": 0.3918, "step": 1998 }, { "epoch": 1.1019845644983461, "grad_norm": 0.2185899168252945, "learning_rate": 0.00013507462686567162, "loss": 0.38, "step": 1999 }, { "epoch": 1.1025358324145536, "grad_norm": 0.2236957997083664, "learning_rate": 0.00013499170812603648, "loss": 0.4007, "step": 2000 }, { "epoch": 1.1025358324145536, "eval_loss": 0.4581758677959442, "eval_runtime": 312.0177, "eval_samples_per_second": 3.734, "eval_steps_per_second": 0.468, "step": 2000 }, { "epoch": 1.1030871003307607, "grad_norm": 0.2543388903141022, "learning_rate": 0.00013490878938640131, "loss": 0.39, "step": 2001 }, { "epoch": 1.103638368246968, "grad_norm": 0.22843103110790253, "learning_rate": 0.00013482587064676615, "loss": 0.3835, "step": 2002 }, { "epoch": 1.1041896361631753, "grad_norm": 0.226676806807518, "learning_rate": 0.000134742951907131, "loss": 0.3907, "step": 2003 }, { "epoch": 1.1047409040793825, "grad_norm": 0.22164440155029297, "learning_rate": 0.00013466003316749584, "loss": 0.3727, "step": 2004 }, { "epoch": 1.10529217199559, "grad_norm": 0.2151675671339035, "learning_rate": 0.0001345771144278607, "loss": 0.3749, "step": 2005 }, { "epoch": 1.1058434399117971, "grad_norm": 0.23192958533763885, "learning_rate": 0.00013449419568822553, "loss": 0.407, "step": 2006 }, { "epoch": 1.1063947078280043, "grad_norm": 0.2130926102399826, "learning_rate": 0.00013441127694859036, "loss": 0.3702, "step": 2007 }, { "epoch": 1.1069459757442117, "grad_norm": 0.22862909734249115, "learning_rate": 0.00013432835820895522, "loss": 0.3784, "step": 2008 }, { "epoch": 1.107497243660419, "grad_norm": 0.22866345942020416, "learning_rate": 0.00013424543946932005, "loss": 0.4035, "step": 2009 }, { "epoch": 1.1080485115766263, "grad_norm": 0.2159378081560135, "learning_rate": 0.0001341625207296849, "loss": 0.3996, "step": 2010 }, { "epoch": 1.1085997794928335, "grad_norm": 0.22037655115127563, "learning_rate": 0.00013407960199004974, "loss": 0.3873, "step": 2011 }, { "epoch": 1.1091510474090407, "grad_norm": 0.24213933944702148, "learning_rate": 0.00013399668325041458, "loss": 0.4144, "step": 2012 }, { "epoch": 1.109702315325248, "grad_norm": 0.2235259711742401, "learning_rate": 0.00013391376451077943, "loss": 0.4028, "step": 2013 }, { "epoch": 1.1102535832414553, "grad_norm": 0.2354377955198288, "learning_rate": 0.00013383084577114427, "loss": 0.4103, "step": 2014 }, { "epoch": 1.1108048511576627, "grad_norm": 0.22363215684890747, "learning_rate": 0.00013374792703150913, "loss": 0.3962, "step": 2015 }, { "epoch": 1.1113561190738699, "grad_norm": 0.22264409065246582, "learning_rate": 0.00013366500829187396, "loss": 0.3818, "step": 2016 }, { "epoch": 1.111907386990077, "grad_norm": 0.22731584310531616, "learning_rate": 0.0001335820895522388, "loss": 0.4013, "step": 2017 }, { "epoch": 1.1124586549062845, "grad_norm": 0.22340711951255798, "learning_rate": 0.00013349917081260362, "loss": 0.3734, "step": 2018 }, { "epoch": 1.1130099228224917, "grad_norm": 0.23701246082782745, "learning_rate": 0.00013341625207296848, "loss": 0.3943, "step": 2019 }, { "epoch": 1.113561190738699, "grad_norm": 0.22929784655570984, "learning_rate": 0.0001333333333333333, "loss": 0.3848, "step": 2020 }, { "epoch": 1.1141124586549063, "grad_norm": 0.24790272116661072, "learning_rate": 0.00013325041459369814, "loss": 0.4047, "step": 2021 }, { "epoch": 1.1146637265711137, "grad_norm": 0.22452253103256226, "learning_rate": 0.000133167495854063, "loss": 0.385, "step": 2022 }, { "epoch": 1.1152149944873209, "grad_norm": 0.23337581753730774, "learning_rate": 0.00013308457711442784, "loss": 0.3791, "step": 2023 }, { "epoch": 1.115766262403528, "grad_norm": 0.23171287775039673, "learning_rate": 0.0001330016583747927, "loss": 0.3885, "step": 2024 }, { "epoch": 1.1163175303197355, "grad_norm": 0.24028973281383514, "learning_rate": 0.00013291873963515753, "loss": 0.4071, "step": 2025 }, { "epoch": 1.1168687982359427, "grad_norm": 0.23416177928447723, "learning_rate": 0.00013283582089552236, "loss": 0.3815, "step": 2026 }, { "epoch": 1.11742006615215, "grad_norm": 0.2444845736026764, "learning_rate": 0.00013275290215588722, "loss": 0.4048, "step": 2027 }, { "epoch": 1.1179713340683572, "grad_norm": 0.23157843947410583, "learning_rate": 0.00013266998341625205, "loss": 0.402, "step": 2028 }, { "epoch": 1.1185226019845644, "grad_norm": 0.24158456921577454, "learning_rate": 0.0001325870646766169, "loss": 0.3821, "step": 2029 }, { "epoch": 1.1190738699007718, "grad_norm": 0.23520436882972717, "learning_rate": 0.00013250414593698174, "loss": 0.3848, "step": 2030 }, { "epoch": 1.119625137816979, "grad_norm": 0.2458154559135437, "learning_rate": 0.00013242122719734657, "loss": 0.3926, "step": 2031 }, { "epoch": 1.1201764057331864, "grad_norm": 0.2308206707239151, "learning_rate": 0.00013233830845771143, "loss": 0.3982, "step": 2032 }, { "epoch": 1.1207276736493936, "grad_norm": 0.23016606271266937, "learning_rate": 0.00013225538971807626, "loss": 0.3936, "step": 2033 }, { "epoch": 1.1212789415656008, "grad_norm": 0.24838510155677795, "learning_rate": 0.00013217247097844112, "loss": 0.4081, "step": 2034 }, { "epoch": 1.1218302094818082, "grad_norm": 0.2287745475769043, "learning_rate": 0.00013208955223880596, "loss": 0.371, "step": 2035 }, { "epoch": 1.1223814773980154, "grad_norm": 0.23816218972206116, "learning_rate": 0.0001320066334991708, "loss": 0.3952, "step": 2036 }, { "epoch": 1.1229327453142228, "grad_norm": 0.2324012964963913, "learning_rate": 0.00013192371475953565, "loss": 0.3861, "step": 2037 }, { "epoch": 1.12348401323043, "grad_norm": 0.23907962441444397, "learning_rate": 0.00013184079601990048, "loss": 0.3927, "step": 2038 }, { "epoch": 1.1240352811466372, "grad_norm": 0.2464779168367386, "learning_rate": 0.00013175787728026534, "loss": 0.4246, "step": 2039 }, { "epoch": 1.1245865490628446, "grad_norm": 0.23501858115196228, "learning_rate": 0.00013167495854063017, "loss": 0.3918, "step": 2040 }, { "epoch": 1.1251378169790518, "grad_norm": 0.2514742314815521, "learning_rate": 0.000131592039800995, "loss": 0.3828, "step": 2041 }, { "epoch": 1.1256890848952592, "grad_norm": 0.25326284766197205, "learning_rate": 0.00013150912106135986, "loss": 0.4042, "step": 2042 }, { "epoch": 1.1262403528114664, "grad_norm": 0.23037280142307281, "learning_rate": 0.0001314262023217247, "loss": 0.3919, "step": 2043 }, { "epoch": 1.1267916207276736, "grad_norm": 0.241755872964859, "learning_rate": 0.00013134328358208955, "loss": 0.3867, "step": 2044 }, { "epoch": 1.127342888643881, "grad_norm": 0.27031564712524414, "learning_rate": 0.00013126036484245438, "loss": 0.3767, "step": 2045 }, { "epoch": 1.1278941565600882, "grad_norm": 0.24623173475265503, "learning_rate": 0.00013117744610281922, "loss": 0.4077, "step": 2046 }, { "epoch": 1.1284454244762956, "grad_norm": 0.24347223341464996, "learning_rate": 0.00013109452736318408, "loss": 0.3846, "step": 2047 }, { "epoch": 1.1289966923925028, "grad_norm": 0.24663501977920532, "learning_rate": 0.0001310116086235489, "loss": 0.3992, "step": 2048 }, { "epoch": 1.12954796030871, "grad_norm": 0.23556159436702728, "learning_rate": 0.00013092868988391377, "loss": 0.3949, "step": 2049 }, { "epoch": 1.1300992282249174, "grad_norm": 0.21868300437927246, "learning_rate": 0.0001308457711442786, "loss": 0.3824, "step": 2050 }, { "epoch": 1.1306504961411246, "grad_norm": 0.23438437283039093, "learning_rate": 0.00013076285240464343, "loss": 0.3801, "step": 2051 }, { "epoch": 1.131201764057332, "grad_norm": 0.22960849106311798, "learning_rate": 0.0001306799336650083, "loss": 0.4088, "step": 2052 }, { "epoch": 1.1317530319735392, "grad_norm": 0.240730881690979, "learning_rate": 0.00013059701492537312, "loss": 0.3644, "step": 2053 }, { "epoch": 1.1323042998897463, "grad_norm": 0.2219470739364624, "learning_rate": 0.00013051409618573798, "loss": 0.3817, "step": 2054 }, { "epoch": 1.1328555678059538, "grad_norm": 0.22481395304203033, "learning_rate": 0.0001304311774461028, "loss": 0.3858, "step": 2055 }, { "epoch": 1.133406835722161, "grad_norm": 0.24147982895374298, "learning_rate": 0.00013034825870646765, "loss": 0.3977, "step": 2056 }, { "epoch": 1.1339581036383684, "grad_norm": 0.2390933483839035, "learning_rate": 0.0001302653399668325, "loss": 0.3985, "step": 2057 }, { "epoch": 1.1345093715545755, "grad_norm": 0.24776338040828705, "learning_rate": 0.00013018242122719734, "loss": 0.4026, "step": 2058 }, { "epoch": 1.1350606394707827, "grad_norm": 0.23255294561386108, "learning_rate": 0.0001300995024875622, "loss": 0.3975, "step": 2059 }, { "epoch": 1.1356119073869901, "grad_norm": 0.2401493936777115, "learning_rate": 0.00013001658374792703, "loss": 0.3924, "step": 2060 }, { "epoch": 1.1361631753031973, "grad_norm": 0.2360658049583435, "learning_rate": 0.00012993366500829186, "loss": 0.3835, "step": 2061 }, { "epoch": 1.1367144432194047, "grad_norm": 0.24272675812244415, "learning_rate": 0.0001298507462686567, "loss": 0.3816, "step": 2062 }, { "epoch": 1.137265711135612, "grad_norm": 0.2370130568742752, "learning_rate": 0.00012976782752902155, "loss": 0.3807, "step": 2063 }, { "epoch": 1.137816979051819, "grad_norm": 0.22449509799480438, "learning_rate": 0.00012968490878938638, "loss": 0.3857, "step": 2064 }, { "epoch": 1.1383682469680265, "grad_norm": 0.2332579791545868, "learning_rate": 0.00012960199004975121, "loss": 0.3882, "step": 2065 }, { "epoch": 1.1389195148842337, "grad_norm": 0.23922313749790192, "learning_rate": 0.00012951907131011607, "loss": 0.3924, "step": 2066 }, { "epoch": 1.1394707828004411, "grad_norm": 0.23937387764453888, "learning_rate": 0.0001294361525704809, "loss": 0.3982, "step": 2067 }, { "epoch": 1.1400220507166483, "grad_norm": 0.23198926448822021, "learning_rate": 0.00012935323383084577, "loss": 0.3971, "step": 2068 }, { "epoch": 1.1405733186328555, "grad_norm": 0.23774142563343048, "learning_rate": 0.0001292703150912106, "loss": 0.419, "step": 2069 }, { "epoch": 1.141124586549063, "grad_norm": 0.23457486927509308, "learning_rate": 0.00012918739635157543, "loss": 0.3947, "step": 2070 }, { "epoch": 1.14167585446527, "grad_norm": 0.23662830889225006, "learning_rate": 0.0001291044776119403, "loss": 0.3989, "step": 2071 }, { "epoch": 1.1422271223814775, "grad_norm": 0.2307705134153366, "learning_rate": 0.00012902155887230512, "loss": 0.3988, "step": 2072 }, { "epoch": 1.1427783902976847, "grad_norm": 0.23430916666984558, "learning_rate": 0.00012893864013266998, "loss": 0.3956, "step": 2073 }, { "epoch": 1.1433296582138919, "grad_norm": 0.24138319492340088, "learning_rate": 0.0001288557213930348, "loss": 0.4103, "step": 2074 }, { "epoch": 1.1438809261300993, "grad_norm": 0.22443422675132751, "learning_rate": 0.00012877280265339964, "loss": 0.3839, "step": 2075 }, { "epoch": 1.1444321940463065, "grad_norm": 0.2313619703054428, "learning_rate": 0.0001286898839137645, "loss": 0.4063, "step": 2076 }, { "epoch": 1.1449834619625139, "grad_norm": 0.22947578132152557, "learning_rate": 0.00012860696517412933, "loss": 0.3852, "step": 2077 }, { "epoch": 1.145534729878721, "grad_norm": 0.2276720404624939, "learning_rate": 0.0001285240464344942, "loss": 0.3968, "step": 2078 }, { "epoch": 1.1460859977949283, "grad_norm": 0.22463871538639069, "learning_rate": 0.00012844112769485903, "loss": 0.3904, "step": 2079 }, { "epoch": 1.1466372657111357, "grad_norm": 0.22553198039531708, "learning_rate": 0.00012835820895522386, "loss": 0.3902, "step": 2080 }, { "epoch": 1.1471885336273429, "grad_norm": 0.23410287499427795, "learning_rate": 0.00012827529021558872, "loss": 0.3952, "step": 2081 }, { "epoch": 1.1477398015435503, "grad_norm": 0.2365550547838211, "learning_rate": 0.00012819237147595355, "loss": 0.3907, "step": 2082 }, { "epoch": 1.1482910694597575, "grad_norm": 0.22853030264377594, "learning_rate": 0.0001281094527363184, "loss": 0.4041, "step": 2083 }, { "epoch": 1.1488423373759646, "grad_norm": 0.23059257864952087, "learning_rate": 0.00012802653399668324, "loss": 0.4047, "step": 2084 }, { "epoch": 1.149393605292172, "grad_norm": 0.23414267599582672, "learning_rate": 0.00012794361525704807, "loss": 0.4077, "step": 2085 }, { "epoch": 1.1499448732083792, "grad_norm": 0.23295001685619354, "learning_rate": 0.00012786069651741293, "loss": 0.3942, "step": 2086 }, { "epoch": 1.1504961411245866, "grad_norm": 0.23734460771083832, "learning_rate": 0.00012777777777777776, "loss": 0.4074, "step": 2087 }, { "epoch": 1.1510474090407938, "grad_norm": 0.21490591764450073, "learning_rate": 0.00012769485903814262, "loss": 0.3747, "step": 2088 }, { "epoch": 1.151598676957001, "grad_norm": 0.22734799981117249, "learning_rate": 0.00012761194029850745, "loss": 0.3836, "step": 2089 }, { "epoch": 1.1521499448732084, "grad_norm": 0.22835008800029755, "learning_rate": 0.0001275290215588723, "loss": 0.3983, "step": 2090 }, { "epoch": 1.1527012127894156, "grad_norm": 0.2260267287492752, "learning_rate": 0.00012744610281923715, "loss": 0.3785, "step": 2091 }, { "epoch": 1.153252480705623, "grad_norm": 0.22667206823825836, "learning_rate": 0.00012736318407960198, "loss": 0.3945, "step": 2092 }, { "epoch": 1.1538037486218302, "grad_norm": 0.23218148946762085, "learning_rate": 0.00012728026533996684, "loss": 0.3967, "step": 2093 }, { "epoch": 1.1543550165380374, "grad_norm": 0.24123932421207428, "learning_rate": 0.00012719734660033167, "loss": 0.3994, "step": 2094 }, { "epoch": 1.1549062844542448, "grad_norm": 0.23074567317962646, "learning_rate": 0.0001271144278606965, "loss": 0.405, "step": 2095 }, { "epoch": 1.155457552370452, "grad_norm": 0.23828662931919098, "learning_rate": 0.00012703150912106136, "loss": 0.3886, "step": 2096 }, { "epoch": 1.1560088202866594, "grad_norm": 0.22315117716789246, "learning_rate": 0.0001269485903814262, "loss": 0.3925, "step": 2097 }, { "epoch": 1.1565600882028666, "grad_norm": 0.22071965038776398, "learning_rate": 0.00012686567164179105, "loss": 0.3997, "step": 2098 }, { "epoch": 1.1571113561190738, "grad_norm": 0.22145338356494904, "learning_rate": 0.00012678275290215588, "loss": 0.3784, "step": 2099 }, { "epoch": 1.1576626240352812, "grad_norm": 0.2308942675590515, "learning_rate": 0.00012669983416252072, "loss": 0.3576, "step": 2100 }, { "epoch": 1.1582138919514884, "grad_norm": 0.2193097174167633, "learning_rate": 0.00012661691542288557, "loss": 0.3806, "step": 2101 }, { "epoch": 1.1587651598676958, "grad_norm": 0.2277258038520813, "learning_rate": 0.0001265339966832504, "loss": 0.389, "step": 2102 }, { "epoch": 1.159316427783903, "grad_norm": 0.22830741107463837, "learning_rate": 0.00012645107794361527, "loss": 0.4132, "step": 2103 }, { "epoch": 1.1598676957001102, "grad_norm": 0.22856192290782928, "learning_rate": 0.0001263681592039801, "loss": 0.3879, "step": 2104 }, { "epoch": 1.1604189636163176, "grad_norm": 0.23155651986598969, "learning_rate": 0.00012628524046434493, "loss": 0.3902, "step": 2105 }, { "epoch": 1.1609702315325248, "grad_norm": 0.22571994364261627, "learning_rate": 0.00012620232172470976, "loss": 0.4017, "step": 2106 }, { "epoch": 1.1615214994487322, "grad_norm": 0.2258533239364624, "learning_rate": 0.00012611940298507462, "loss": 0.4027, "step": 2107 }, { "epoch": 1.1620727673649394, "grad_norm": 0.24114197492599487, "learning_rate": 0.00012603648424543945, "loss": 0.3983, "step": 2108 }, { "epoch": 1.1626240352811466, "grad_norm": 0.22286631166934967, "learning_rate": 0.00012595356550580429, "loss": 0.4026, "step": 2109 }, { "epoch": 1.163175303197354, "grad_norm": 0.2404211014509201, "learning_rate": 0.00012587064676616914, "loss": 0.4082, "step": 2110 }, { "epoch": 1.1637265711135611, "grad_norm": 0.22578535974025726, "learning_rate": 0.00012578772802653398, "loss": 0.3881, "step": 2111 }, { "epoch": 1.1642778390297686, "grad_norm": 0.24066035449504852, "learning_rate": 0.00012570480928689884, "loss": 0.4144, "step": 2112 }, { "epoch": 1.1648291069459757, "grad_norm": 0.22703833878040314, "learning_rate": 0.00012562189054726367, "loss": 0.3942, "step": 2113 }, { "epoch": 1.165380374862183, "grad_norm": 0.2277577817440033, "learning_rate": 0.0001255389718076285, "loss": 0.4116, "step": 2114 }, { "epoch": 1.1659316427783903, "grad_norm": 0.2201533019542694, "learning_rate": 0.00012545605306799336, "loss": 0.3961, "step": 2115 }, { "epoch": 1.1664829106945975, "grad_norm": 0.22969132661819458, "learning_rate": 0.0001253731343283582, "loss": 0.4146, "step": 2116 }, { "epoch": 1.167034178610805, "grad_norm": 0.2208871990442276, "learning_rate": 0.00012529021558872305, "loss": 0.3925, "step": 2117 }, { "epoch": 1.1675854465270121, "grad_norm": 0.24675814807415009, "learning_rate": 0.00012520729684908788, "loss": 0.3923, "step": 2118 }, { "epoch": 1.1681367144432193, "grad_norm": 0.25365886092185974, "learning_rate": 0.00012512437810945271, "loss": 0.4018, "step": 2119 }, { "epoch": 1.1686879823594267, "grad_norm": 0.2352716475725174, "learning_rate": 0.00012504145936981757, "loss": 0.4136, "step": 2120 }, { "epoch": 1.169239250275634, "grad_norm": 0.22656375169754028, "learning_rate": 0.0001249585406301824, "loss": 0.3896, "step": 2121 }, { "epoch": 1.1697905181918413, "grad_norm": 0.22290179133415222, "learning_rate": 0.00012487562189054724, "loss": 0.4059, "step": 2122 }, { "epoch": 1.1703417861080485, "grad_norm": 0.24139589071273804, "learning_rate": 0.0001247927031509121, "loss": 0.3999, "step": 2123 }, { "epoch": 1.1708930540242557, "grad_norm": 0.24391639232635498, "learning_rate": 0.00012470978441127693, "loss": 0.3876, "step": 2124 }, { "epoch": 1.171444321940463, "grad_norm": 0.2283831685781479, "learning_rate": 0.0001246268656716418, "loss": 0.3988, "step": 2125 }, { "epoch": 1.1719955898566703, "grad_norm": 0.24799783527851105, "learning_rate": 0.00012454394693200662, "loss": 0.396, "step": 2126 }, { "epoch": 1.1725468577728777, "grad_norm": 0.22174561023712158, "learning_rate": 0.00012446102819237145, "loss": 0.3809, "step": 2127 }, { "epoch": 1.173098125689085, "grad_norm": 0.22951188683509827, "learning_rate": 0.0001243781094527363, "loss": 0.3882, "step": 2128 }, { "epoch": 1.173649393605292, "grad_norm": 0.21973788738250732, "learning_rate": 0.00012429519071310114, "loss": 0.3872, "step": 2129 }, { "epoch": 1.1742006615214995, "grad_norm": 0.22701437771320343, "learning_rate": 0.000124212271973466, "loss": 0.3876, "step": 2130 }, { "epoch": 1.1747519294377067, "grad_norm": 0.22394593060016632, "learning_rate": 0.00012412935323383083, "loss": 0.3874, "step": 2131 }, { "epoch": 1.175303197353914, "grad_norm": 0.24040114879608154, "learning_rate": 0.00012404643449419567, "loss": 0.3856, "step": 2132 }, { "epoch": 1.1758544652701213, "grad_norm": 0.2295607030391693, "learning_rate": 0.00012396351575456052, "loss": 0.3861, "step": 2133 }, { "epoch": 1.1764057331863285, "grad_norm": 0.229506716132164, "learning_rate": 0.00012388059701492536, "loss": 0.3877, "step": 2134 }, { "epoch": 1.1769570011025359, "grad_norm": 0.24226558208465576, "learning_rate": 0.00012379767827529022, "loss": 0.4051, "step": 2135 }, { "epoch": 1.177508269018743, "grad_norm": 0.23359960317611694, "learning_rate": 0.00012371475953565505, "loss": 0.3911, "step": 2136 }, { "epoch": 1.1780595369349505, "grad_norm": 0.24533167481422424, "learning_rate": 0.00012363184079601988, "loss": 0.4075, "step": 2137 }, { "epoch": 1.1786108048511577, "grad_norm": 0.22445149719715118, "learning_rate": 0.00012354892205638474, "loss": 0.3762, "step": 2138 }, { "epoch": 1.1791620727673648, "grad_norm": 0.2399044781923294, "learning_rate": 0.00012346600331674957, "loss": 0.375, "step": 2139 }, { "epoch": 1.1797133406835723, "grad_norm": 0.2472797930240631, "learning_rate": 0.00012338308457711443, "loss": 0.4036, "step": 2140 }, { "epoch": 1.1802646085997794, "grad_norm": 0.2297624945640564, "learning_rate": 0.00012330016583747926, "loss": 0.4154, "step": 2141 }, { "epoch": 1.1808158765159869, "grad_norm": 0.23524117469787598, "learning_rate": 0.0001232172470978441, "loss": 0.3879, "step": 2142 }, { "epoch": 1.181367144432194, "grad_norm": 0.23935049772262573, "learning_rate": 0.00012313432835820895, "loss": 0.4107, "step": 2143 }, { "epoch": 1.1819184123484012, "grad_norm": 0.21305608749389648, "learning_rate": 0.00012305140961857379, "loss": 0.3964, "step": 2144 }, { "epoch": 1.1824696802646086, "grad_norm": 0.2339240163564682, "learning_rate": 0.00012296849087893864, "loss": 0.4185, "step": 2145 }, { "epoch": 1.1830209481808158, "grad_norm": 0.23344539105892181, "learning_rate": 0.00012288557213930348, "loss": 0.3934, "step": 2146 }, { "epoch": 1.1835722160970232, "grad_norm": 0.2274356484413147, "learning_rate": 0.0001228026533996683, "loss": 0.3854, "step": 2147 }, { "epoch": 1.1841234840132304, "grad_norm": 0.23241972923278809, "learning_rate": 0.00012271973466003317, "loss": 0.4106, "step": 2148 }, { "epoch": 1.1846747519294376, "grad_norm": 0.22595259547233582, "learning_rate": 0.000122636815920398, "loss": 0.401, "step": 2149 }, { "epoch": 1.185226019845645, "grad_norm": 0.22598454356193542, "learning_rate": 0.00012255389718076283, "loss": 0.4041, "step": 2150 }, { "epoch": 1.1857772877618522, "grad_norm": 0.233281672000885, "learning_rate": 0.00012247097844112766, "loss": 0.3763, "step": 2151 }, { "epoch": 1.1863285556780596, "grad_norm": 0.22901344299316406, "learning_rate": 0.00012238805970149252, "loss": 0.3949, "step": 2152 }, { "epoch": 1.1868798235942668, "grad_norm": 0.24648213386535645, "learning_rate": 0.00012230514096185736, "loss": 0.4229, "step": 2153 }, { "epoch": 1.187431091510474, "grad_norm": 0.24580827355384827, "learning_rate": 0.00012222222222222221, "loss": 0.4125, "step": 2154 }, { "epoch": 1.1879823594266814, "grad_norm": 0.23127946257591248, "learning_rate": 0.00012213930348258705, "loss": 0.3727, "step": 2155 }, { "epoch": 1.1885336273428886, "grad_norm": 0.2267657071352005, "learning_rate": 0.00012205638474295189, "loss": 0.3951, "step": 2156 }, { "epoch": 1.189084895259096, "grad_norm": 0.23497919738292694, "learning_rate": 0.00012197346600331674, "loss": 0.3721, "step": 2157 }, { "epoch": 1.1896361631753032, "grad_norm": 0.22601653635501862, "learning_rate": 0.00012189054726368157, "loss": 0.3945, "step": 2158 }, { "epoch": 1.1901874310915104, "grad_norm": 0.21945270895957947, "learning_rate": 0.00012180762852404642, "loss": 0.3574, "step": 2159 }, { "epoch": 1.1907386990077178, "grad_norm": 0.2285127341747284, "learning_rate": 0.00012172470978441126, "loss": 0.3891, "step": 2160 }, { "epoch": 1.191289966923925, "grad_norm": 0.23766474425792694, "learning_rate": 0.0001216417910447761, "loss": 0.3968, "step": 2161 }, { "epoch": 1.1918412348401324, "grad_norm": 0.23863717913627625, "learning_rate": 0.00012155887230514095, "loss": 0.389, "step": 2162 }, { "epoch": 1.1923925027563396, "grad_norm": 0.22550217807292938, "learning_rate": 0.00012147595356550578, "loss": 0.3842, "step": 2163 }, { "epoch": 1.1929437706725468, "grad_norm": 0.22460085153579712, "learning_rate": 0.00012139303482587063, "loss": 0.3874, "step": 2164 }, { "epoch": 1.1934950385887542, "grad_norm": 0.2168971300125122, "learning_rate": 0.00012131011608623548, "loss": 0.3783, "step": 2165 }, { "epoch": 1.1940463065049614, "grad_norm": 0.2768751084804535, "learning_rate": 0.00012122719734660032, "loss": 0.4206, "step": 2166 }, { "epoch": 1.1945975744211688, "grad_norm": 0.2357032299041748, "learning_rate": 0.00012114427860696517, "loss": 0.3943, "step": 2167 }, { "epoch": 1.195148842337376, "grad_norm": 0.24314233660697937, "learning_rate": 0.00012106135986733, "loss": 0.3983, "step": 2168 }, { "epoch": 1.1957001102535831, "grad_norm": 0.2605820596218109, "learning_rate": 0.00012097844112769484, "loss": 0.4036, "step": 2169 }, { "epoch": 1.1962513781697905, "grad_norm": 0.22138415277004242, "learning_rate": 0.00012089552238805969, "loss": 0.3794, "step": 2170 }, { "epoch": 1.1968026460859977, "grad_norm": 0.2328760325908661, "learning_rate": 0.00012081260364842454, "loss": 0.3948, "step": 2171 }, { "epoch": 1.1973539140022051, "grad_norm": 0.22606134414672852, "learning_rate": 0.00012072968490878938, "loss": 0.3958, "step": 2172 }, { "epoch": 1.1979051819184123, "grad_norm": 0.25683924555778503, "learning_rate": 0.00012064676616915421, "loss": 0.3939, "step": 2173 }, { "epoch": 1.1984564498346195, "grad_norm": 0.22325700521469116, "learning_rate": 0.00012056384742951906, "loss": 0.3915, "step": 2174 }, { "epoch": 1.199007717750827, "grad_norm": 0.21337918937206268, "learning_rate": 0.0001204809286898839, "loss": 0.3699, "step": 2175 }, { "epoch": 1.1995589856670341, "grad_norm": 0.2343214452266693, "learning_rate": 0.00012039800995024875, "loss": 0.4029, "step": 2176 }, { "epoch": 1.2001102535832415, "grad_norm": 0.2408185601234436, "learning_rate": 0.0001203150912106136, "loss": 0.3915, "step": 2177 }, { "epoch": 1.2006615214994487, "grad_norm": 0.2592547535896301, "learning_rate": 0.00012023217247097843, "loss": 0.409, "step": 2178 }, { "epoch": 1.201212789415656, "grad_norm": 0.2201685607433319, "learning_rate": 0.00012014925373134327, "loss": 0.381, "step": 2179 }, { "epoch": 1.2017640573318633, "grad_norm": 0.23619139194488525, "learning_rate": 0.00012006633499170812, "loss": 0.3708, "step": 2180 }, { "epoch": 1.2023153252480705, "grad_norm": 0.24719634652137756, "learning_rate": 0.00011998341625207296, "loss": 0.3996, "step": 2181 }, { "epoch": 1.202866593164278, "grad_norm": 0.24691031873226166, "learning_rate": 0.00011990049751243781, "loss": 0.3897, "step": 2182 }, { "epoch": 1.203417861080485, "grad_norm": 0.2518804967403412, "learning_rate": 0.00011981757877280264, "loss": 0.3886, "step": 2183 }, { "epoch": 1.2039691289966923, "grad_norm": 0.2279016375541687, "learning_rate": 0.00011973466003316749, "loss": 0.3791, "step": 2184 }, { "epoch": 1.2045203969128997, "grad_norm": 0.24580788612365723, "learning_rate": 0.00011965174129353233, "loss": 0.4013, "step": 2185 }, { "epoch": 1.2050716648291069, "grad_norm": 0.2422635406255722, "learning_rate": 0.00011956882255389718, "loss": 0.3831, "step": 2186 }, { "epoch": 1.2056229327453143, "grad_norm": 0.24743367731571198, "learning_rate": 0.00011948590381426202, "loss": 0.3939, "step": 2187 }, { "epoch": 1.2061742006615215, "grad_norm": 0.24504512548446655, "learning_rate": 0.00011940298507462686, "loss": 0.3976, "step": 2188 }, { "epoch": 1.2067254685777287, "grad_norm": 0.2121214121580124, "learning_rate": 0.0001193200663349917, "loss": 0.3692, "step": 2189 }, { "epoch": 1.207276736493936, "grad_norm": 0.23639699816703796, "learning_rate": 0.00011923714759535655, "loss": 0.3999, "step": 2190 }, { "epoch": 1.2078280044101433, "grad_norm": 0.2503402531147003, "learning_rate": 0.00011915422885572139, "loss": 0.3807, "step": 2191 }, { "epoch": 1.2083792723263507, "grad_norm": 0.2412857562303543, "learning_rate": 0.00011907131011608624, "loss": 0.397, "step": 2192 }, { "epoch": 1.2089305402425579, "grad_norm": 0.2293364554643631, "learning_rate": 0.00011898839137645107, "loss": 0.3752, "step": 2193 }, { "epoch": 1.209481808158765, "grad_norm": 0.23062635958194733, "learning_rate": 0.00011890547263681592, "loss": 0.3779, "step": 2194 }, { "epoch": 1.2100330760749725, "grad_norm": 0.23140175640583038, "learning_rate": 0.00011882255389718075, "loss": 0.3763, "step": 2195 }, { "epoch": 1.2105843439911796, "grad_norm": 0.23366335034370422, "learning_rate": 0.0001187396351575456, "loss": 0.3959, "step": 2196 }, { "epoch": 1.211135611907387, "grad_norm": 0.2382514774799347, "learning_rate": 0.00011865671641791043, "loss": 0.3876, "step": 2197 }, { "epoch": 1.2116868798235942, "grad_norm": 0.23558002710342407, "learning_rate": 0.00011857379767827527, "loss": 0.4032, "step": 2198 }, { "epoch": 1.2122381477398014, "grad_norm": 0.23793788254261017, "learning_rate": 0.00011849087893864012, "loss": 0.3909, "step": 2199 }, { "epoch": 1.2127894156560088, "grad_norm": 0.2181142121553421, "learning_rate": 0.00011840796019900496, "loss": 0.3923, "step": 2200 }, { "epoch": 1.213340683572216, "grad_norm": 0.21802657842636108, "learning_rate": 0.00011832504145936981, "loss": 0.3795, "step": 2201 }, { "epoch": 1.2138919514884234, "grad_norm": 0.2436913102865219, "learning_rate": 0.00011824212271973464, "loss": 0.3985, "step": 2202 }, { "epoch": 1.2144432194046306, "grad_norm": 0.22913113236427307, "learning_rate": 0.00011815920398009949, "loss": 0.3872, "step": 2203 }, { "epoch": 1.2149944873208378, "grad_norm": 0.2223367691040039, "learning_rate": 0.00011807628524046433, "loss": 0.3905, "step": 2204 }, { "epoch": 1.2155457552370452, "grad_norm": 0.23263731598854065, "learning_rate": 0.00011799336650082918, "loss": 0.4048, "step": 2205 }, { "epoch": 1.2160970231532524, "grad_norm": 0.2505498230457306, "learning_rate": 0.00011791044776119402, "loss": 0.395, "step": 2206 }, { "epoch": 1.2166482910694598, "grad_norm": 0.2553291916847229, "learning_rate": 0.00011782752902155885, "loss": 0.3935, "step": 2207 }, { "epoch": 1.217199558985667, "grad_norm": 0.22239425778388977, "learning_rate": 0.0001177446102819237, "loss": 0.381, "step": 2208 }, { "epoch": 1.2177508269018742, "grad_norm": 0.21807150542736053, "learning_rate": 0.00011766169154228855, "loss": 0.3878, "step": 2209 }, { "epoch": 1.2183020948180816, "grad_norm": 0.23478740453720093, "learning_rate": 0.00011757877280265339, "loss": 0.3815, "step": 2210 }, { "epoch": 1.2188533627342888, "grad_norm": 0.23702913522720337, "learning_rate": 0.00011749585406301822, "loss": 0.4001, "step": 2211 }, { "epoch": 1.2194046306504962, "grad_norm": 0.23261341452598572, "learning_rate": 0.00011741293532338307, "loss": 0.3935, "step": 2212 }, { "epoch": 1.2199558985667034, "grad_norm": 0.22314967215061188, "learning_rate": 0.00011733001658374791, "loss": 0.4048, "step": 2213 }, { "epoch": 1.2205071664829106, "grad_norm": 0.23277883231639862, "learning_rate": 0.00011724709784411276, "loss": 0.3739, "step": 2214 }, { "epoch": 1.221058434399118, "grad_norm": 0.24505817890167236, "learning_rate": 0.0001171641791044776, "loss": 0.3922, "step": 2215 }, { "epoch": 1.2216097023153252, "grad_norm": 0.24386508762836456, "learning_rate": 0.00011708126036484244, "loss": 0.3872, "step": 2216 }, { "epoch": 1.2221609702315326, "grad_norm": 0.2437102198600769, "learning_rate": 0.00011699834162520728, "loss": 0.4048, "step": 2217 }, { "epoch": 1.2227122381477398, "grad_norm": 0.22707347571849823, "learning_rate": 0.00011691542288557213, "loss": 0.3996, "step": 2218 }, { "epoch": 1.223263506063947, "grad_norm": 0.23951935768127441, "learning_rate": 0.00011683250414593697, "loss": 0.399, "step": 2219 }, { "epoch": 1.2238147739801544, "grad_norm": 0.27458345890045166, "learning_rate": 0.00011674958540630182, "loss": 0.4093, "step": 2220 }, { "epoch": 1.2243660418963616, "grad_norm": 0.23940932750701904, "learning_rate": 0.00011666666666666665, "loss": 0.3915, "step": 2221 }, { "epoch": 1.224917309812569, "grad_norm": 0.24100755155086517, "learning_rate": 0.0001165837479270315, "loss": 0.3915, "step": 2222 }, { "epoch": 1.2254685777287762, "grad_norm": 0.2423773556947708, "learning_rate": 0.00011650082918739634, "loss": 0.4061, "step": 2223 }, { "epoch": 1.2260198456449833, "grad_norm": 0.2552812099456787, "learning_rate": 0.00011641791044776119, "loss": 0.3922, "step": 2224 }, { "epoch": 1.2265711135611908, "grad_norm": 0.24121615290641785, "learning_rate": 0.00011633499170812603, "loss": 0.3949, "step": 2225 }, { "epoch": 1.227122381477398, "grad_norm": 0.24254634976387024, "learning_rate": 0.00011625207296849087, "loss": 0.3776, "step": 2226 }, { "epoch": 1.2276736493936053, "grad_norm": 0.2757539451122284, "learning_rate": 0.00011616915422885571, "loss": 0.4181, "step": 2227 }, { "epoch": 1.2282249173098125, "grad_norm": 0.25508221983909607, "learning_rate": 0.00011608623548922056, "loss": 0.4069, "step": 2228 }, { "epoch": 1.2287761852260197, "grad_norm": 0.24166013300418854, "learning_rate": 0.0001160033167495854, "loss": 0.3848, "step": 2229 }, { "epoch": 1.2293274531422271, "grad_norm": 0.23408280313014984, "learning_rate": 0.00011592039800995025, "loss": 0.3867, "step": 2230 }, { "epoch": 1.2298787210584343, "grad_norm": 0.2366735339164734, "learning_rate": 0.00011583747927031508, "loss": 0.407, "step": 2231 }, { "epoch": 1.2304299889746417, "grad_norm": 0.247688889503479, "learning_rate": 0.00011575456053067993, "loss": 0.3898, "step": 2232 }, { "epoch": 1.230981256890849, "grad_norm": 0.23416852951049805, "learning_rate": 0.00011567164179104477, "loss": 0.3871, "step": 2233 }, { "epoch": 1.231532524807056, "grad_norm": 0.243104949593544, "learning_rate": 0.00011558872305140962, "loss": 0.4209, "step": 2234 }, { "epoch": 1.2320837927232635, "grad_norm": 0.23723013699054718, "learning_rate": 0.00011550580431177446, "loss": 0.3867, "step": 2235 }, { "epoch": 1.2326350606394707, "grad_norm": 0.2383720874786377, "learning_rate": 0.0001154228855721393, "loss": 0.3861, "step": 2236 }, { "epoch": 1.2331863285556781, "grad_norm": 0.25127896666526794, "learning_rate": 0.00011533996683250414, "loss": 0.4039, "step": 2237 }, { "epoch": 1.2337375964718853, "grad_norm": 0.23529255390167236, "learning_rate": 0.00011525704809286899, "loss": 0.3838, "step": 2238 }, { "epoch": 1.2342888643880925, "grad_norm": 0.2100450098514557, "learning_rate": 0.00011517412935323382, "loss": 0.3639, "step": 2239 }, { "epoch": 1.2348401323043, "grad_norm": 0.24556870758533478, "learning_rate": 0.00011509121061359865, "loss": 0.3901, "step": 2240 }, { "epoch": 1.235391400220507, "grad_norm": 0.2549160420894623, "learning_rate": 0.0001150082918739635, "loss": 0.3871, "step": 2241 }, { "epoch": 1.2359426681367145, "grad_norm": 0.23175586760044098, "learning_rate": 0.00011492537313432834, "loss": 0.3886, "step": 2242 }, { "epoch": 1.2364939360529217, "grad_norm": 0.2296617478132248, "learning_rate": 0.00011484245439469319, "loss": 0.406, "step": 2243 }, { "epoch": 1.237045203969129, "grad_norm": 0.2378944754600525, "learning_rate": 0.00011475953565505803, "loss": 0.3949, "step": 2244 }, { "epoch": 1.2375964718853363, "grad_norm": 0.23094962537288666, "learning_rate": 0.00011467661691542286, "loss": 0.3875, "step": 2245 }, { "epoch": 1.2381477398015435, "grad_norm": 0.22399038076400757, "learning_rate": 0.00011459369817578771, "loss": 0.4009, "step": 2246 }, { "epoch": 1.2386990077177509, "grad_norm": 0.24871258437633514, "learning_rate": 0.00011451077943615256, "loss": 0.3926, "step": 2247 }, { "epoch": 1.239250275633958, "grad_norm": 0.23597979545593262, "learning_rate": 0.0001144278606965174, "loss": 0.3803, "step": 2248 }, { "epoch": 1.2398015435501655, "grad_norm": 0.23361554741859436, "learning_rate": 0.00011434494195688225, "loss": 0.3994, "step": 2249 }, { "epoch": 1.2403528114663727, "grad_norm": 0.2614096999168396, "learning_rate": 0.00011426202321724708, "loss": 0.3946, "step": 2250 }, { "epoch": 1.2409040793825798, "grad_norm": 0.23481406271457672, "learning_rate": 0.00011417910447761192, "loss": 0.3981, "step": 2251 }, { "epoch": 1.2414553472987873, "grad_norm": 0.21524877846240997, "learning_rate": 0.00011409618573797677, "loss": 0.3725, "step": 2252 }, { "epoch": 1.2420066152149944, "grad_norm": 0.2307668924331665, "learning_rate": 0.00011401326699834162, "loss": 0.3829, "step": 2253 }, { "epoch": 1.2425578831312019, "grad_norm": 0.2581194341182709, "learning_rate": 0.00011393034825870646, "loss": 0.3901, "step": 2254 }, { "epoch": 1.243109151047409, "grad_norm": 0.235372856259346, "learning_rate": 0.0001138474295190713, "loss": 0.3922, "step": 2255 }, { "epoch": 1.2436604189636162, "grad_norm": 0.23432569205760956, "learning_rate": 0.00011376451077943614, "loss": 0.3767, "step": 2256 }, { "epoch": 1.2442116868798236, "grad_norm": 0.2407122552394867, "learning_rate": 0.00011368159203980098, "loss": 0.4207, "step": 2257 }, { "epoch": 1.2447629547960308, "grad_norm": 0.25739043951034546, "learning_rate": 0.00011359867330016583, "loss": 0.3838, "step": 2258 }, { "epoch": 1.2453142227122382, "grad_norm": 0.25240135192871094, "learning_rate": 0.00011351575456053068, "loss": 0.3989, "step": 2259 }, { "epoch": 1.2458654906284454, "grad_norm": 0.22552815079689026, "learning_rate": 0.00011343283582089551, "loss": 0.3848, "step": 2260 }, { "epoch": 1.2464167585446526, "grad_norm": 0.2320718765258789, "learning_rate": 0.00011334991708126035, "loss": 0.382, "step": 2261 }, { "epoch": 1.24696802646086, "grad_norm": 0.23423726856708527, "learning_rate": 0.0001132669983416252, "loss": 0.3817, "step": 2262 }, { "epoch": 1.2475192943770672, "grad_norm": 0.22892701625823975, "learning_rate": 0.00011318407960199004, "loss": 0.3858, "step": 2263 }, { "epoch": 1.2480705622932746, "grad_norm": 0.23635762929916382, "learning_rate": 0.00011310116086235489, "loss": 0.3946, "step": 2264 }, { "epoch": 1.2486218302094818, "grad_norm": 0.23909956216812134, "learning_rate": 0.00011301824212271972, "loss": 0.3826, "step": 2265 }, { "epoch": 1.249173098125689, "grad_norm": 0.23733805119991302, "learning_rate": 0.00011293532338308457, "loss": 0.4215, "step": 2266 }, { "epoch": 1.2497243660418964, "grad_norm": 0.2257446050643921, "learning_rate": 0.00011285240464344941, "loss": 0.3959, "step": 2267 }, { "epoch": 1.2502756339581036, "grad_norm": 0.2394627183675766, "learning_rate": 0.00011276948590381426, "loss": 0.398, "step": 2268 }, { "epoch": 1.2508269018743108, "grad_norm": 0.22113938629627228, "learning_rate": 0.0001126865671641791, "loss": 0.3837, "step": 2269 }, { "epoch": 1.2513781697905182, "grad_norm": 0.22951479256153107, "learning_rate": 0.00011260364842454394, "loss": 0.391, "step": 2270 }, { "epoch": 1.2519294377067256, "grad_norm": 0.22468437254428864, "learning_rate": 0.00011252072968490878, "loss": 0.3788, "step": 2271 }, { "epoch": 1.2524807056229328, "grad_norm": 0.21054887771606445, "learning_rate": 0.00011243781094527363, "loss": 0.3891, "step": 2272 }, { "epoch": 1.25303197353914, "grad_norm": 0.2274617701768875, "learning_rate": 0.00011235489220563847, "loss": 0.3883, "step": 2273 }, { "epoch": 1.2535832414553472, "grad_norm": 0.22995011508464813, "learning_rate": 0.0001122719734660033, "loss": 0.3847, "step": 2274 }, { "epoch": 1.2541345093715546, "grad_norm": 0.22627364099025726, "learning_rate": 0.00011218905472636815, "loss": 0.3924, "step": 2275 }, { "epoch": 1.254685777287762, "grad_norm": 0.23559615015983582, "learning_rate": 0.000112106135986733, "loss": 0.3966, "step": 2276 }, { "epoch": 1.2552370452039692, "grad_norm": 0.21304303407669067, "learning_rate": 0.00011202321724709784, "loss": 0.3624, "step": 2277 }, { "epoch": 1.2557883131201764, "grad_norm": 0.241587296128273, "learning_rate": 0.00011194029850746269, "loss": 0.3719, "step": 2278 }, { "epoch": 1.2563395810363835, "grad_norm": 0.22992491722106934, "learning_rate": 0.00011185737976782752, "loss": 0.4019, "step": 2279 }, { "epoch": 1.256890848952591, "grad_norm": 0.2323186844587326, "learning_rate": 0.00011177446102819237, "loss": 0.3725, "step": 2280 }, { "epoch": 1.2574421168687984, "grad_norm": 0.23510509729385376, "learning_rate": 0.00011169154228855721, "loss": 0.4176, "step": 2281 }, { "epoch": 1.2579933847850056, "grad_norm": 0.23601877689361572, "learning_rate": 0.00011160862354892206, "loss": 0.4036, "step": 2282 }, { "epoch": 1.2585446527012127, "grad_norm": 0.23654739558696747, "learning_rate": 0.00011152570480928687, "loss": 0.403, "step": 2283 }, { "epoch": 1.25909592061742, "grad_norm": 0.2428976446390152, "learning_rate": 0.00011144278606965172, "loss": 0.3703, "step": 2284 }, { "epoch": 1.2596471885336273, "grad_norm": 0.23753516376018524, "learning_rate": 0.00011135986733001657, "loss": 0.3979, "step": 2285 }, { "epoch": 1.2601984564498347, "grad_norm": 0.2367447316646576, "learning_rate": 0.00011127694859038141, "loss": 0.3822, "step": 2286 }, { "epoch": 1.260749724366042, "grad_norm": 0.2365788072347641, "learning_rate": 0.00011119402985074626, "loss": 0.389, "step": 2287 }, { "epoch": 1.2613009922822491, "grad_norm": 0.22868278622627258, "learning_rate": 0.00011111111111111109, "loss": 0.391, "step": 2288 }, { "epoch": 1.2618522601984565, "grad_norm": 0.23099401593208313, "learning_rate": 0.00011102819237147593, "loss": 0.3947, "step": 2289 }, { "epoch": 1.2624035281146637, "grad_norm": 0.24031782150268555, "learning_rate": 0.00011094527363184078, "loss": 0.3839, "step": 2290 }, { "epoch": 1.2629547960308711, "grad_norm": 0.2490132451057434, "learning_rate": 0.00011086235489220563, "loss": 0.3896, "step": 2291 }, { "epoch": 1.2635060639470783, "grad_norm": 0.2366219013929367, "learning_rate": 0.00011077943615257047, "loss": 0.3933, "step": 2292 }, { "epoch": 1.2640573318632855, "grad_norm": 0.22578656673431396, "learning_rate": 0.0001106965174129353, "loss": 0.3723, "step": 2293 }, { "epoch": 1.264608599779493, "grad_norm": 0.23483921587467194, "learning_rate": 0.00011061359867330015, "loss": 0.3895, "step": 2294 }, { "epoch": 1.2651598676957, "grad_norm": 0.2586977481842041, "learning_rate": 0.000110530679933665, "loss": 0.4042, "step": 2295 }, { "epoch": 1.2657111356119075, "grad_norm": 0.23051442205905914, "learning_rate": 0.00011044776119402984, "loss": 0.3862, "step": 2296 }, { "epoch": 1.2662624035281147, "grad_norm": 0.2358439564704895, "learning_rate": 0.00011036484245439469, "loss": 0.3798, "step": 2297 }, { "epoch": 1.2668136714443219, "grad_norm": 0.23679201304912567, "learning_rate": 0.00011028192371475952, "loss": 0.4037, "step": 2298 }, { "epoch": 1.2673649393605293, "grad_norm": 0.23940104246139526, "learning_rate": 0.00011019900497512436, "loss": 0.3898, "step": 2299 }, { "epoch": 1.2679162072767365, "grad_norm": 0.23662586510181427, "learning_rate": 0.00011011608623548921, "loss": 0.4001, "step": 2300 }, { "epoch": 1.268467475192944, "grad_norm": 0.23159541189670563, "learning_rate": 0.00011003316749585405, "loss": 0.3919, "step": 2301 }, { "epoch": 1.269018743109151, "grad_norm": 0.21939191222190857, "learning_rate": 0.0001099502487562189, "loss": 0.3902, "step": 2302 }, { "epoch": 1.2695700110253583, "grad_norm": 0.24052447080612183, "learning_rate": 0.00010986733001658373, "loss": 0.391, "step": 2303 }, { "epoch": 1.2701212789415657, "grad_norm": 0.22359569370746613, "learning_rate": 0.00010978441127694858, "loss": 0.3813, "step": 2304 }, { "epoch": 1.2706725468577729, "grad_norm": 0.22367626428604126, "learning_rate": 0.00010970149253731342, "loss": 0.3873, "step": 2305 }, { "epoch": 1.2712238147739803, "grad_norm": 0.24156810343265533, "learning_rate": 0.00010961857379767827, "loss": 0.3996, "step": 2306 }, { "epoch": 1.2717750826901875, "grad_norm": 0.23700320720672607, "learning_rate": 0.00010953565505804311, "loss": 0.3901, "step": 2307 }, { "epoch": 1.2723263506063947, "grad_norm": 0.2303237020969391, "learning_rate": 0.00010945273631840795, "loss": 0.4031, "step": 2308 }, { "epoch": 1.272877618522602, "grad_norm": 0.2249428927898407, "learning_rate": 0.00010936981757877279, "loss": 0.3942, "step": 2309 }, { "epoch": 1.2734288864388092, "grad_norm": 0.2448328137397766, "learning_rate": 0.00010928689883913764, "loss": 0.3941, "step": 2310 }, { "epoch": 1.2739801543550167, "grad_norm": 0.23278410732746124, "learning_rate": 0.00010920398009950248, "loss": 0.395, "step": 2311 }, { "epoch": 1.2745314222712238, "grad_norm": 0.24542638659477234, "learning_rate": 0.00010912106135986733, "loss": 0.4278, "step": 2312 }, { "epoch": 1.275082690187431, "grad_norm": 0.22305360436439514, "learning_rate": 0.00010903814262023216, "loss": 0.3932, "step": 2313 }, { "epoch": 1.2756339581036384, "grad_norm": 0.24365827441215515, "learning_rate": 0.00010895522388059701, "loss": 0.3963, "step": 2314 }, { "epoch": 1.2761852260198456, "grad_norm": 0.24421466886997223, "learning_rate": 0.00010887230514096185, "loss": 0.3956, "step": 2315 }, { "epoch": 1.276736493936053, "grad_norm": 0.24353346228599548, "learning_rate": 0.0001087893864013267, "loss": 0.3837, "step": 2316 }, { "epoch": 1.2772877618522602, "grad_norm": 0.24044160544872284, "learning_rate": 0.00010870646766169154, "loss": 0.3964, "step": 2317 }, { "epoch": 1.2778390297684674, "grad_norm": 0.2651362717151642, "learning_rate": 0.00010862354892205638, "loss": 0.388, "step": 2318 }, { "epoch": 1.2783902976846748, "grad_norm": 0.23700033128261566, "learning_rate": 0.00010854063018242122, "loss": 0.38, "step": 2319 }, { "epoch": 1.278941565600882, "grad_norm": 0.23535655438899994, "learning_rate": 0.00010845771144278607, "loss": 0.3934, "step": 2320 }, { "epoch": 1.2794928335170894, "grad_norm": 0.26524481177330017, "learning_rate": 0.00010837479270315091, "loss": 0.3875, "step": 2321 }, { "epoch": 1.2800441014332966, "grad_norm": 0.24175146222114563, "learning_rate": 0.00010829187396351576, "loss": 0.3634, "step": 2322 }, { "epoch": 1.2805953693495038, "grad_norm": 0.231819286942482, "learning_rate": 0.00010820895522388059, "loss": 0.388, "step": 2323 }, { "epoch": 1.2811466372657112, "grad_norm": 0.21814289689064026, "learning_rate": 0.00010812603648424544, "loss": 0.3711, "step": 2324 }, { "epoch": 1.2816979051819184, "grad_norm": 0.23096728324890137, "learning_rate": 0.00010804311774461028, "loss": 0.3974, "step": 2325 }, { "epoch": 1.2822491730981258, "grad_norm": 0.24553930759429932, "learning_rate": 0.00010796019900497513, "loss": 0.3897, "step": 2326 }, { "epoch": 1.282800441014333, "grad_norm": 0.23141168057918549, "learning_rate": 0.00010787728026533995, "loss": 0.3898, "step": 2327 }, { "epoch": 1.2833517089305402, "grad_norm": 0.23394468426704407, "learning_rate": 0.00010779436152570479, "loss": 0.4049, "step": 2328 }, { "epoch": 1.2839029768467476, "grad_norm": 0.2231445461511612, "learning_rate": 0.00010771144278606964, "loss": 0.3911, "step": 2329 }, { "epoch": 1.2844542447629548, "grad_norm": 0.2506980299949646, "learning_rate": 0.00010762852404643448, "loss": 0.423, "step": 2330 }, { "epoch": 1.2850055126791622, "grad_norm": 0.23698961734771729, "learning_rate": 0.00010754560530679931, "loss": 0.4046, "step": 2331 }, { "epoch": 1.2855567805953694, "grad_norm": 0.24735629558563232, "learning_rate": 0.00010746268656716416, "loss": 0.4078, "step": 2332 }, { "epoch": 1.2861080485115766, "grad_norm": 0.25394487380981445, "learning_rate": 0.000107379767827529, "loss": 0.4027, "step": 2333 }, { "epoch": 1.286659316427784, "grad_norm": 0.24036946892738342, "learning_rate": 0.00010729684908789385, "loss": 0.4042, "step": 2334 }, { "epoch": 1.2872105843439912, "grad_norm": 0.24319007992744446, "learning_rate": 0.0001072139303482587, "loss": 0.3901, "step": 2335 }, { "epoch": 1.2877618522601986, "grad_norm": 0.23505842685699463, "learning_rate": 0.00010713101160862353, "loss": 0.3914, "step": 2336 }, { "epoch": 1.2883131201764058, "grad_norm": 0.24473319947719574, "learning_rate": 0.00010704809286898837, "loss": 0.4098, "step": 2337 }, { "epoch": 1.288864388092613, "grad_norm": 0.24411208927631378, "learning_rate": 0.00010696517412935322, "loss": 0.4158, "step": 2338 }, { "epoch": 1.2894156560088204, "grad_norm": 0.2365306317806244, "learning_rate": 0.00010688225538971807, "loss": 0.3955, "step": 2339 }, { "epoch": 1.2899669239250275, "grad_norm": 0.23471403121948242, "learning_rate": 0.00010679933665008291, "loss": 0.3796, "step": 2340 }, { "epoch": 1.290518191841235, "grad_norm": 0.22727487981319427, "learning_rate": 0.00010671641791044774, "loss": 0.4044, "step": 2341 }, { "epoch": 1.2910694597574421, "grad_norm": 0.22571586072444916, "learning_rate": 0.00010663349917081259, "loss": 0.3551, "step": 2342 }, { "epoch": 1.2916207276736493, "grad_norm": 0.24545998871326447, "learning_rate": 0.00010655058043117743, "loss": 0.4144, "step": 2343 }, { "epoch": 1.2921719955898567, "grad_norm": 0.2357962727546692, "learning_rate": 0.00010646766169154228, "loss": 0.391, "step": 2344 }, { "epoch": 1.292723263506064, "grad_norm": 0.23277200758457184, "learning_rate": 0.00010638474295190713, "loss": 0.4027, "step": 2345 }, { "epoch": 1.2932745314222713, "grad_norm": 0.2385130524635315, "learning_rate": 0.00010630182421227196, "loss": 0.4039, "step": 2346 }, { "epoch": 1.2938257993384785, "grad_norm": 0.21902373433113098, "learning_rate": 0.0001062189054726368, "loss": 0.3699, "step": 2347 }, { "epoch": 1.2943770672546857, "grad_norm": 0.23025818169116974, "learning_rate": 0.00010613598673300165, "loss": 0.3822, "step": 2348 }, { "epoch": 1.2949283351708931, "grad_norm": 0.2286684513092041, "learning_rate": 0.0001060530679933665, "loss": 0.401, "step": 2349 }, { "epoch": 1.2954796030871003, "grad_norm": 0.23381029069423676, "learning_rate": 0.00010597014925373134, "loss": 0.3991, "step": 2350 }, { "epoch": 1.2960308710033077, "grad_norm": 0.23572219908237457, "learning_rate": 0.00010588723051409617, "loss": 0.3993, "step": 2351 }, { "epoch": 1.296582138919515, "grad_norm": 0.22969138622283936, "learning_rate": 0.00010580431177446102, "loss": 0.3859, "step": 2352 }, { "epoch": 1.297133406835722, "grad_norm": 0.24054940044879913, "learning_rate": 0.00010572139303482586, "loss": 0.4137, "step": 2353 }, { "epoch": 1.2976846747519295, "grad_norm": 0.235767662525177, "learning_rate": 0.00010563847429519071, "loss": 0.377, "step": 2354 }, { "epoch": 1.2982359426681367, "grad_norm": 0.22807767987251282, "learning_rate": 0.00010555555555555555, "loss": 0.3974, "step": 2355 }, { "epoch": 1.298787210584344, "grad_norm": 0.22131551802158356, "learning_rate": 0.00010547263681592039, "loss": 0.4002, "step": 2356 }, { "epoch": 1.2993384785005513, "grad_norm": 0.24462686479091644, "learning_rate": 0.00010538971807628523, "loss": 0.4169, "step": 2357 }, { "epoch": 1.2998897464167585, "grad_norm": 0.24126161634922028, "learning_rate": 0.00010530679933665008, "loss": 0.3846, "step": 2358 }, { "epoch": 1.3004410143329659, "grad_norm": 0.2536928951740265, "learning_rate": 0.00010522388059701492, "loss": 0.3883, "step": 2359 }, { "epoch": 1.300992282249173, "grad_norm": 0.23638053238391876, "learning_rate": 0.00010514096185737977, "loss": 0.3916, "step": 2360 }, { "epoch": 1.3015435501653805, "grad_norm": 0.21713566780090332, "learning_rate": 0.0001050580431177446, "loss": 0.382, "step": 2361 }, { "epoch": 1.3020948180815877, "grad_norm": 0.23291055858135223, "learning_rate": 0.00010497512437810945, "loss": 0.3831, "step": 2362 }, { "epoch": 1.3026460859977949, "grad_norm": 0.2169044464826584, "learning_rate": 0.00010489220563847429, "loss": 0.3705, "step": 2363 }, { "epoch": 1.3031973539140023, "grad_norm": 0.23216962814331055, "learning_rate": 0.00010480928689883914, "loss": 0.3691, "step": 2364 }, { "epoch": 1.3037486218302095, "grad_norm": 0.2367962896823883, "learning_rate": 0.00010472636815920398, "loss": 0.4011, "step": 2365 }, { "epoch": 1.3042998897464169, "grad_norm": 0.22988784313201904, "learning_rate": 0.00010464344941956881, "loss": 0.3904, "step": 2366 }, { "epoch": 1.304851157662624, "grad_norm": 0.21731241047382355, "learning_rate": 0.00010456053067993366, "loss": 0.3815, "step": 2367 }, { "epoch": 1.3054024255788312, "grad_norm": 0.25733426213264465, "learning_rate": 0.0001044776119402985, "loss": 0.4253, "step": 2368 }, { "epoch": 1.3059536934950386, "grad_norm": 0.23438294231891632, "learning_rate": 0.00010439469320066335, "loss": 0.4041, "step": 2369 }, { "epoch": 1.3065049614112458, "grad_norm": 0.22011101245880127, "learning_rate": 0.0001043117744610282, "loss": 0.3948, "step": 2370 }, { "epoch": 1.3070562293274532, "grad_norm": 0.2404097616672516, "learning_rate": 0.00010422885572139302, "loss": 0.3996, "step": 2371 }, { "epoch": 1.3076074972436604, "grad_norm": 0.23479090631008148, "learning_rate": 0.00010414593698175786, "loss": 0.4048, "step": 2372 }, { "epoch": 1.3081587651598676, "grad_norm": 0.22892162203788757, "learning_rate": 0.0001040630182421227, "loss": 0.3751, "step": 2373 }, { "epoch": 1.308710033076075, "grad_norm": 0.22712910175323486, "learning_rate": 0.00010398009950248755, "loss": 0.3777, "step": 2374 }, { "epoch": 1.3092613009922822, "grad_norm": 0.22894370555877686, "learning_rate": 0.00010389718076285238, "loss": 0.3936, "step": 2375 }, { "epoch": 1.3098125689084896, "grad_norm": 0.24097605049610138, "learning_rate": 0.00010381426202321723, "loss": 0.3693, "step": 2376 }, { "epoch": 1.3103638368246968, "grad_norm": 0.23055890202522278, "learning_rate": 0.00010373134328358208, "loss": 0.3777, "step": 2377 }, { "epoch": 1.310915104740904, "grad_norm": 0.23357531428337097, "learning_rate": 0.00010364842454394692, "loss": 0.3945, "step": 2378 }, { "epoch": 1.3114663726571114, "grad_norm": 0.2378157526254654, "learning_rate": 0.00010356550580431177, "loss": 0.4077, "step": 2379 }, { "epoch": 1.3120176405733186, "grad_norm": 0.2348390370607376, "learning_rate": 0.0001034825870646766, "loss": 0.3905, "step": 2380 }, { "epoch": 1.312568908489526, "grad_norm": 0.24251805245876312, "learning_rate": 0.00010339966832504144, "loss": 0.4174, "step": 2381 }, { "epoch": 1.3131201764057332, "grad_norm": 0.23102574050426483, "learning_rate": 0.00010331674958540629, "loss": 0.3856, "step": 2382 }, { "epoch": 1.3136714443219404, "grad_norm": 0.2383720427751541, "learning_rate": 0.00010323383084577114, "loss": 0.3932, "step": 2383 }, { "epoch": 1.3142227122381478, "grad_norm": 0.22161129117012024, "learning_rate": 0.00010315091210613598, "loss": 0.396, "step": 2384 }, { "epoch": 1.314773980154355, "grad_norm": 0.2228018343448639, "learning_rate": 0.00010306799336650081, "loss": 0.3862, "step": 2385 }, { "epoch": 1.3153252480705624, "grad_norm": 0.22873203456401825, "learning_rate": 0.00010298507462686566, "loss": 0.3513, "step": 2386 }, { "epoch": 1.3158765159867696, "grad_norm": 0.23780828714370728, "learning_rate": 0.0001029021558872305, "loss": 0.3888, "step": 2387 }, { "epoch": 1.3164277839029768, "grad_norm": 0.2447124868631363, "learning_rate": 0.00010281923714759535, "loss": 0.4046, "step": 2388 }, { "epoch": 1.3169790518191842, "grad_norm": 0.24726513028144836, "learning_rate": 0.0001027363184079602, "loss": 0.4086, "step": 2389 }, { "epoch": 1.3175303197353914, "grad_norm": 0.2359735518693924, "learning_rate": 0.00010265339966832503, "loss": 0.4015, "step": 2390 }, { "epoch": 1.3180815876515988, "grad_norm": 0.23657964169979095, "learning_rate": 0.00010257048092868987, "loss": 0.3859, "step": 2391 }, { "epoch": 1.318632855567806, "grad_norm": 0.23830877244472504, "learning_rate": 0.00010248756218905472, "loss": 0.3864, "step": 2392 }, { "epoch": 1.3191841234840131, "grad_norm": 0.2303212434053421, "learning_rate": 0.00010240464344941956, "loss": 0.4036, "step": 2393 }, { "epoch": 1.3197353914002206, "grad_norm": 0.2221781462430954, "learning_rate": 0.0001023217247097844, "loss": 0.3712, "step": 2394 }, { "epoch": 1.3202866593164277, "grad_norm": 0.22085942327976227, "learning_rate": 0.00010223880597014924, "loss": 0.3708, "step": 2395 }, { "epoch": 1.3208379272326352, "grad_norm": 0.24135445058345795, "learning_rate": 0.00010215588723051409, "loss": 0.3896, "step": 2396 }, { "epoch": 1.3213891951488423, "grad_norm": 0.24116064608097076, "learning_rate": 0.00010207296849087893, "loss": 0.3866, "step": 2397 }, { "epoch": 1.3219404630650495, "grad_norm": 0.26890698075294495, "learning_rate": 0.00010199004975124378, "loss": 0.3795, "step": 2398 }, { "epoch": 1.322491730981257, "grad_norm": 0.2322501391172409, "learning_rate": 0.00010190713101160861, "loss": 0.3837, "step": 2399 }, { "epoch": 1.3230429988974641, "grad_norm": 0.24631264805793762, "learning_rate": 0.00010182421227197346, "loss": 0.3954, "step": 2400 }, { "epoch": 1.3235942668136715, "grad_norm": 0.2258647084236145, "learning_rate": 0.0001017412935323383, "loss": 0.3705, "step": 2401 }, { "epoch": 1.3241455347298787, "grad_norm": 0.2519420087337494, "learning_rate": 0.00010165837479270315, "loss": 0.3921, "step": 2402 }, { "epoch": 1.324696802646086, "grad_norm": 0.23400020599365234, "learning_rate": 0.00010157545605306799, "loss": 0.3702, "step": 2403 }, { "epoch": 1.3252480705622933, "grad_norm": 0.22752946615219116, "learning_rate": 0.00010149253731343282, "loss": 0.3756, "step": 2404 }, { "epoch": 1.3257993384785005, "grad_norm": 0.24144931137561798, "learning_rate": 0.00010140961857379767, "loss": 0.41, "step": 2405 }, { "epoch": 1.326350606394708, "grad_norm": 0.24649466574192047, "learning_rate": 0.00010132669983416252, "loss": 0.4227, "step": 2406 }, { "epoch": 1.326901874310915, "grad_norm": 0.22007010877132416, "learning_rate": 0.00010124378109452736, "loss": 0.3802, "step": 2407 }, { "epoch": 1.3274531422271223, "grad_norm": 0.2177124321460724, "learning_rate": 0.00010116086235489221, "loss": 0.3733, "step": 2408 }, { "epoch": 1.3280044101433297, "grad_norm": 0.23224158585071564, "learning_rate": 0.00010107794361525704, "loss": 0.3774, "step": 2409 }, { "epoch": 1.328555678059537, "grad_norm": 0.24728813767433167, "learning_rate": 0.00010099502487562188, "loss": 0.3926, "step": 2410 }, { "epoch": 1.3291069459757443, "grad_norm": 0.22190050780773163, "learning_rate": 0.00010091210613598673, "loss": 0.3826, "step": 2411 }, { "epoch": 1.3296582138919515, "grad_norm": 0.23956191539764404, "learning_rate": 0.00010082918739635158, "loss": 0.3982, "step": 2412 }, { "epoch": 1.3302094818081587, "grad_norm": 0.23789376020431519, "learning_rate": 0.00010074626865671642, "loss": 0.4032, "step": 2413 }, { "epoch": 1.330760749724366, "grad_norm": 0.24080632627010345, "learning_rate": 0.00010066334991708125, "loss": 0.3974, "step": 2414 }, { "epoch": 1.3313120176405733, "grad_norm": 0.22118644416332245, "learning_rate": 0.00010058043117744609, "loss": 0.3848, "step": 2415 }, { "epoch": 1.3318632855567807, "grad_norm": 0.24440258741378784, "learning_rate": 0.00010049751243781093, "loss": 0.3801, "step": 2416 }, { "epoch": 1.3324145534729879, "grad_norm": 0.23864087462425232, "learning_rate": 0.00010041459369817578, "loss": 0.4019, "step": 2417 }, { "epoch": 1.332965821389195, "grad_norm": 0.2365901917219162, "learning_rate": 0.00010033167495854061, "loss": 0.3827, "step": 2418 }, { "epoch": 1.3335170893054025, "grad_norm": 0.22480501234531403, "learning_rate": 0.00010024875621890545, "loss": 0.3696, "step": 2419 }, { "epoch": 1.3340683572216097, "grad_norm": 0.23156774044036865, "learning_rate": 0.0001001658374792703, "loss": 0.3803, "step": 2420 }, { "epoch": 1.334619625137817, "grad_norm": 0.22590211033821106, "learning_rate": 0.00010008291873963515, "loss": 0.387, "step": 2421 }, { "epoch": 1.3351708930540243, "grad_norm": 0.2270091325044632, "learning_rate": 9.999999999999999e-05, "loss": 0.381, "step": 2422 }, { "epoch": 1.3357221609702314, "grad_norm": 0.22601434588432312, "learning_rate": 9.991708126036482e-05, "loss": 0.3907, "step": 2423 }, { "epoch": 1.3362734288864389, "grad_norm": 0.2249268740415573, "learning_rate": 9.983416252072967e-05, "loss": 0.3794, "step": 2424 }, { "epoch": 1.336824696802646, "grad_norm": 0.2406623363494873, "learning_rate": 9.975124378109451e-05, "loss": 0.3912, "step": 2425 }, { "epoch": 1.3373759647188534, "grad_norm": 0.24089276790618896, "learning_rate": 9.966832504145936e-05, "loss": 0.3997, "step": 2426 }, { "epoch": 1.3379272326350606, "grad_norm": 0.2207108587026596, "learning_rate": 9.95854063018242e-05, "loss": 0.3804, "step": 2427 }, { "epoch": 1.3384785005512678, "grad_norm": 0.21747317910194397, "learning_rate": 9.950248756218904e-05, "loss": 0.3808, "step": 2428 }, { "epoch": 1.3390297684674752, "grad_norm": 0.2578473687171936, "learning_rate": 9.941956882255388e-05, "loss": 0.4195, "step": 2429 }, { "epoch": 1.3395810363836824, "grad_norm": 0.22663085162639618, "learning_rate": 9.933665008291873e-05, "loss": 0.3877, "step": 2430 }, { "epoch": 1.3401323042998898, "grad_norm": 0.24075528979301453, "learning_rate": 9.925373134328357e-05, "loss": 0.405, "step": 2431 }, { "epoch": 1.340683572216097, "grad_norm": 0.22877177596092224, "learning_rate": 9.917081260364842e-05, "loss": 0.382, "step": 2432 }, { "epoch": 1.3412348401323042, "grad_norm": 0.22892452776432037, "learning_rate": 9.908789386401325e-05, "loss": 0.3812, "step": 2433 }, { "epoch": 1.3417861080485116, "grad_norm": 0.24187688529491425, "learning_rate": 9.90049751243781e-05, "loss": 0.3825, "step": 2434 }, { "epoch": 1.3423373759647188, "grad_norm": 0.22903688251972198, "learning_rate": 9.892205638474294e-05, "loss": 0.3878, "step": 2435 }, { "epoch": 1.3428886438809262, "grad_norm": 0.22924572229385376, "learning_rate": 9.883913764510779e-05, "loss": 0.388, "step": 2436 }, { "epoch": 1.3434399117971334, "grad_norm": 0.24021534621715546, "learning_rate": 9.875621890547263e-05, "loss": 0.4031, "step": 2437 }, { "epoch": 1.3439911797133406, "grad_norm": 0.23757272958755493, "learning_rate": 9.867330016583747e-05, "loss": 0.3934, "step": 2438 }, { "epoch": 1.344542447629548, "grad_norm": 0.2555783987045288, "learning_rate": 9.859038142620231e-05, "loss": 0.3988, "step": 2439 }, { "epoch": 1.3450937155457552, "grad_norm": 0.23108243942260742, "learning_rate": 9.850746268656716e-05, "loss": 0.379, "step": 2440 }, { "epoch": 1.3456449834619626, "grad_norm": 0.24363455176353455, "learning_rate": 9.8424543946932e-05, "loss": 0.3939, "step": 2441 }, { "epoch": 1.3461962513781698, "grad_norm": 0.2295197993516922, "learning_rate": 9.834162520729685e-05, "loss": 0.3799, "step": 2442 }, { "epoch": 1.346747519294377, "grad_norm": 0.23563653230667114, "learning_rate": 9.825870646766168e-05, "loss": 0.3755, "step": 2443 }, { "epoch": 1.3472987872105844, "grad_norm": 0.2241990715265274, "learning_rate": 9.817578772802653e-05, "loss": 0.3794, "step": 2444 }, { "epoch": 1.3478500551267916, "grad_norm": 0.2593122124671936, "learning_rate": 9.809286898839137e-05, "loss": 0.3766, "step": 2445 }, { "epoch": 1.348401323042999, "grad_norm": 0.22955520451068878, "learning_rate": 9.800995024875622e-05, "loss": 0.3787, "step": 2446 }, { "epoch": 1.3489525909592062, "grad_norm": 0.23866330087184906, "learning_rate": 9.792703150912106e-05, "loss": 0.3955, "step": 2447 }, { "epoch": 1.3495038588754134, "grad_norm": 0.24115972220897675, "learning_rate": 9.78441127694859e-05, "loss": 0.3811, "step": 2448 }, { "epoch": 1.3500551267916208, "grad_norm": 0.23597833514213562, "learning_rate": 9.776119402985074e-05, "loss": 0.3831, "step": 2449 }, { "epoch": 1.350606394707828, "grad_norm": 0.2415011078119278, "learning_rate": 9.767827529021559e-05, "loss": 0.3896, "step": 2450 }, { "epoch": 1.3511576626240354, "grad_norm": 0.2416457235813141, "learning_rate": 9.759535655058043e-05, "loss": 0.3888, "step": 2451 }, { "epoch": 1.3517089305402425, "grad_norm": 0.23950545489788055, "learning_rate": 9.751243781094528e-05, "loss": 0.3942, "step": 2452 }, { "epoch": 1.3522601984564497, "grad_norm": 0.24059046804904938, "learning_rate": 9.742951907131011e-05, "loss": 0.4005, "step": 2453 }, { "epoch": 1.3528114663726571, "grad_norm": 0.2414311021566391, "learning_rate": 9.734660033167496e-05, "loss": 0.3795, "step": 2454 }, { "epoch": 1.3533627342888643, "grad_norm": 0.23370300233364105, "learning_rate": 9.72636815920398e-05, "loss": 0.3728, "step": 2455 }, { "epoch": 1.3539140022050717, "grad_norm": 0.23373939096927643, "learning_rate": 9.718076285240465e-05, "loss": 0.3925, "step": 2456 }, { "epoch": 1.354465270121279, "grad_norm": 0.22576579451560974, "learning_rate": 9.709784411276948e-05, "loss": 0.3787, "step": 2457 }, { "epoch": 1.3550165380374861, "grad_norm": 0.22904476523399353, "learning_rate": 9.701492537313432e-05, "loss": 0.3939, "step": 2458 }, { "epoch": 1.3555678059536935, "grad_norm": 0.24833030998706818, "learning_rate": 9.693200663349916e-05, "loss": 0.394, "step": 2459 }, { "epoch": 1.3561190738699007, "grad_norm": 0.22664152085781097, "learning_rate": 9.6849087893864e-05, "loss": 0.363, "step": 2460 }, { "epoch": 1.3566703417861081, "grad_norm": 0.23569191992282867, "learning_rate": 9.676616915422883e-05, "loss": 0.3823, "step": 2461 }, { "epoch": 1.3572216097023153, "grad_norm": 0.23659692704677582, "learning_rate": 9.668325041459368e-05, "loss": 0.3879, "step": 2462 }, { "epoch": 1.3577728776185225, "grad_norm": 0.22711534798145294, "learning_rate": 9.660033167495852e-05, "loss": 0.3761, "step": 2463 }, { "epoch": 1.35832414553473, "grad_norm": 0.23172332346439362, "learning_rate": 9.651741293532337e-05, "loss": 0.3774, "step": 2464 }, { "epoch": 1.358875413450937, "grad_norm": 0.23141370713710785, "learning_rate": 9.643449419568822e-05, "loss": 0.3976, "step": 2465 }, { "epoch": 1.3594266813671445, "grad_norm": 0.24368800222873688, "learning_rate": 9.635157545605305e-05, "loss": 0.3843, "step": 2466 }, { "epoch": 1.3599779492833517, "grad_norm": 0.22588768601417542, "learning_rate": 9.62686567164179e-05, "loss": 0.3798, "step": 2467 }, { "epoch": 1.3605292171995589, "grad_norm": 0.2269313633441925, "learning_rate": 9.618573797678274e-05, "loss": 0.3874, "step": 2468 }, { "epoch": 1.3610804851157663, "grad_norm": 0.23487702012062073, "learning_rate": 9.610281923714758e-05, "loss": 0.3888, "step": 2469 }, { "epoch": 1.3616317530319735, "grad_norm": 0.2513071894645691, "learning_rate": 9.601990049751243e-05, "loss": 0.4122, "step": 2470 }, { "epoch": 1.362183020948181, "grad_norm": 0.21708211302757263, "learning_rate": 9.593698175787726e-05, "loss": 0.3597, "step": 2471 }, { "epoch": 1.362734288864388, "grad_norm": 0.2279457300901413, "learning_rate": 9.585406301824211e-05, "loss": 0.3834, "step": 2472 }, { "epoch": 1.3632855567805953, "grad_norm": 0.22766946256160736, "learning_rate": 9.577114427860695e-05, "loss": 0.3682, "step": 2473 }, { "epoch": 1.3638368246968027, "grad_norm": 0.22673630714416504, "learning_rate": 9.56882255389718e-05, "loss": 0.3823, "step": 2474 }, { "epoch": 1.3643880926130099, "grad_norm": 0.23767007887363434, "learning_rate": 9.560530679933664e-05, "loss": 0.3991, "step": 2475 }, { "epoch": 1.3649393605292173, "grad_norm": 0.2326952964067459, "learning_rate": 9.552238805970148e-05, "loss": 0.39, "step": 2476 }, { "epoch": 1.3654906284454245, "grad_norm": 0.2336025983095169, "learning_rate": 9.543946932006632e-05, "loss": 0.3748, "step": 2477 }, { "epoch": 1.3660418963616316, "grad_norm": 0.23857955634593964, "learning_rate": 9.535655058043117e-05, "loss": 0.4077, "step": 2478 }, { "epoch": 1.366593164277839, "grad_norm": 0.22810246050357819, "learning_rate": 9.527363184079601e-05, "loss": 0.406, "step": 2479 }, { "epoch": 1.3671444321940462, "grad_norm": 0.23381425440311432, "learning_rate": 9.519071310116086e-05, "loss": 0.395, "step": 2480 }, { "epoch": 1.3676957001102537, "grad_norm": 0.21443428099155426, "learning_rate": 9.510779436152569e-05, "loss": 0.3772, "step": 2481 }, { "epoch": 1.3682469680264608, "grad_norm": 0.23185119032859802, "learning_rate": 9.502487562189054e-05, "loss": 0.3892, "step": 2482 }, { "epoch": 1.368798235942668, "grad_norm": 0.2298753708600998, "learning_rate": 9.494195688225538e-05, "loss": 0.3891, "step": 2483 }, { "epoch": 1.3693495038588754, "grad_norm": 0.216232031583786, "learning_rate": 9.485903814262023e-05, "loss": 0.382, "step": 2484 }, { "epoch": 1.3699007717750826, "grad_norm": 0.23376402258872986, "learning_rate": 9.477611940298507e-05, "loss": 0.3992, "step": 2485 }, { "epoch": 1.37045203969129, "grad_norm": 0.2535459101200104, "learning_rate": 9.46932006633499e-05, "loss": 0.3957, "step": 2486 }, { "epoch": 1.3710033076074972, "grad_norm": 0.22214862704277039, "learning_rate": 9.461028192371475e-05, "loss": 0.3713, "step": 2487 }, { "epoch": 1.3715545755237044, "grad_norm": 0.23064962029457092, "learning_rate": 9.45273631840796e-05, "loss": 0.3821, "step": 2488 }, { "epoch": 1.3721058434399118, "grad_norm": 0.249479740858078, "learning_rate": 9.444444444444444e-05, "loss": 0.3837, "step": 2489 }, { "epoch": 1.372657111356119, "grad_norm": 0.22704121470451355, "learning_rate": 9.436152570480929e-05, "loss": 0.3931, "step": 2490 }, { "epoch": 1.3732083792723264, "grad_norm": 0.23015405237674713, "learning_rate": 9.427860696517412e-05, "loss": 0.4049, "step": 2491 }, { "epoch": 1.3737596471885336, "grad_norm": 0.23387496173381805, "learning_rate": 9.419568822553897e-05, "loss": 0.3727, "step": 2492 }, { "epoch": 1.3743109151047408, "grad_norm": 0.21825988590717316, "learning_rate": 9.411276948590381e-05, "loss": 0.382, "step": 2493 }, { "epoch": 1.3748621830209482, "grad_norm": 0.2230725735425949, "learning_rate": 9.402985074626866e-05, "loss": 0.3935, "step": 2494 }, { "epoch": 1.3754134509371554, "grad_norm": 0.22703075408935547, "learning_rate": 9.39469320066335e-05, "loss": 0.3848, "step": 2495 }, { "epoch": 1.3759647188533628, "grad_norm": 0.2219892293214798, "learning_rate": 9.386401326699833e-05, "loss": 0.3898, "step": 2496 }, { "epoch": 1.37651598676957, "grad_norm": 0.23172403872013092, "learning_rate": 9.378109452736318e-05, "loss": 0.3861, "step": 2497 }, { "epoch": 1.3770672546857772, "grad_norm": 0.23237434029579163, "learning_rate": 9.369817578772803e-05, "loss": 0.3705, "step": 2498 }, { "epoch": 1.3776185226019846, "grad_norm": 0.2246798872947693, "learning_rate": 9.361525704809287e-05, "loss": 0.3679, "step": 2499 }, { "epoch": 1.3781697905181918, "grad_norm": 0.2427067756652832, "learning_rate": 9.353233830845772e-05, "loss": 0.4212, "step": 2500 }, { "epoch": 1.3781697905181918, "eval_loss": 0.4513299763202667, "eval_runtime": 311.7925, "eval_samples_per_second": 3.736, "eval_steps_per_second": 0.468, "step": 2500 }, { "epoch": 1.3787210584343992, "grad_norm": 0.2319420874118805, "learning_rate": 9.344941956882255e-05, "loss": 0.3887, "step": 2501 }, { "epoch": 1.3792723263506064, "grad_norm": 0.23304283618927002, "learning_rate": 9.33665008291874e-05, "loss": 0.396, "step": 2502 }, { "epoch": 1.3798235942668136, "grad_norm": 0.2571066617965698, "learning_rate": 9.328358208955223e-05, "loss": 0.3862, "step": 2503 }, { "epoch": 1.380374862183021, "grad_norm": 0.22332634031772614, "learning_rate": 9.320066334991707e-05, "loss": 0.3608, "step": 2504 }, { "epoch": 1.3809261300992282, "grad_norm": 0.2485717236995697, "learning_rate": 9.31177446102819e-05, "loss": 0.4238, "step": 2505 }, { "epoch": 1.3814773980154356, "grad_norm": 0.230104461312294, "learning_rate": 9.303482587064675e-05, "loss": 0.4036, "step": 2506 }, { "epoch": 1.3820286659316428, "grad_norm": 0.2558598816394806, "learning_rate": 9.29519071310116e-05, "loss": 0.3958, "step": 2507 }, { "epoch": 1.38257993384785, "grad_norm": 0.23400071263313293, "learning_rate": 9.286898839137644e-05, "loss": 0.3862, "step": 2508 }, { "epoch": 1.3831312017640573, "grad_norm": 0.23237945139408112, "learning_rate": 9.278606965174129e-05, "loss": 0.3753, "step": 2509 }, { "epoch": 1.3836824696802645, "grad_norm": 0.2357659935951233, "learning_rate": 9.270315091210612e-05, "loss": 0.3826, "step": 2510 }, { "epoch": 1.384233737596472, "grad_norm": 0.2599101960659027, "learning_rate": 9.262023217247096e-05, "loss": 0.4028, "step": 2511 }, { "epoch": 1.3847850055126791, "grad_norm": 0.2372962385416031, "learning_rate": 9.253731343283581e-05, "loss": 0.4181, "step": 2512 }, { "epoch": 1.3853362734288863, "grad_norm": 0.27277928590774536, "learning_rate": 9.245439469320065e-05, "loss": 0.4025, "step": 2513 }, { "epoch": 1.3858875413450937, "grad_norm": 0.22424361109733582, "learning_rate": 9.237147595356549e-05, "loss": 0.3735, "step": 2514 }, { "epoch": 1.386438809261301, "grad_norm": 0.2312849462032318, "learning_rate": 9.228855721393033e-05, "loss": 0.4009, "step": 2515 }, { "epoch": 1.3869900771775083, "grad_norm": 0.24405118823051453, "learning_rate": 9.220563847429518e-05, "loss": 0.4026, "step": 2516 }, { "epoch": 1.3875413450937155, "grad_norm": 0.25049299001693726, "learning_rate": 9.212271973466002e-05, "loss": 0.3878, "step": 2517 }, { "epoch": 1.3880926130099227, "grad_norm": 0.23999334871768951, "learning_rate": 9.203980099502487e-05, "loss": 0.3758, "step": 2518 }, { "epoch": 1.3886438809261301, "grad_norm": 0.23169536888599396, "learning_rate": 9.19568822553897e-05, "loss": 0.3758, "step": 2519 }, { "epoch": 1.3891951488423373, "grad_norm": 0.228010356426239, "learning_rate": 9.187396351575455e-05, "loss": 0.3731, "step": 2520 }, { "epoch": 1.3897464167585447, "grad_norm": 0.2497485876083374, "learning_rate": 9.179104477611939e-05, "loss": 0.3995, "step": 2521 }, { "epoch": 1.390297684674752, "grad_norm": 0.257614403963089, "learning_rate": 9.170812603648424e-05, "loss": 0.3873, "step": 2522 }, { "epoch": 1.390848952590959, "grad_norm": 0.22421546280384064, "learning_rate": 9.162520729684908e-05, "loss": 0.3746, "step": 2523 }, { "epoch": 1.3914002205071665, "grad_norm": 0.22990712523460388, "learning_rate": 9.154228855721392e-05, "loss": 0.3916, "step": 2524 }, { "epoch": 1.3919514884233737, "grad_norm": 0.24670518934726715, "learning_rate": 9.145936981757876e-05, "loss": 0.3983, "step": 2525 }, { "epoch": 1.392502756339581, "grad_norm": 0.23636974394321442, "learning_rate": 9.137645107794361e-05, "loss": 0.3776, "step": 2526 }, { "epoch": 1.3930540242557883, "grad_norm": 0.2319977879524231, "learning_rate": 9.129353233830845e-05, "loss": 0.3809, "step": 2527 }, { "epoch": 1.3936052921719955, "grad_norm": 0.22971488535404205, "learning_rate": 9.12106135986733e-05, "loss": 0.3643, "step": 2528 }, { "epoch": 1.3941565600882029, "grad_norm": 0.24024169147014618, "learning_rate": 9.112769485903813e-05, "loss": 0.3915, "step": 2529 }, { "epoch": 1.39470782800441, "grad_norm": 0.22295120358467102, "learning_rate": 9.104477611940298e-05, "loss": 0.3702, "step": 2530 }, { "epoch": 1.3952590959206175, "grad_norm": 0.23186278343200684, "learning_rate": 9.096185737976782e-05, "loss": 0.3733, "step": 2531 }, { "epoch": 1.3958103638368247, "grad_norm": 0.25662240386009216, "learning_rate": 9.087893864013267e-05, "loss": 0.3843, "step": 2532 }, { "epoch": 1.3963616317530319, "grad_norm": 0.24374930560588837, "learning_rate": 9.079601990049751e-05, "loss": 0.4025, "step": 2533 }, { "epoch": 1.3969128996692393, "grad_norm": 0.22312727570533752, "learning_rate": 9.071310116086234e-05, "loss": 0.3794, "step": 2534 }, { "epoch": 1.3974641675854464, "grad_norm": 0.21616993844509125, "learning_rate": 9.063018242122719e-05, "loss": 0.3771, "step": 2535 }, { "epoch": 1.3980154355016539, "grad_norm": 0.24162566661834717, "learning_rate": 9.054726368159204e-05, "loss": 0.3797, "step": 2536 }, { "epoch": 1.398566703417861, "grad_norm": 0.24157093465328217, "learning_rate": 9.046434494195688e-05, "loss": 0.3815, "step": 2537 }, { "epoch": 1.3991179713340682, "grad_norm": 0.2437802404165268, "learning_rate": 9.038142620232173e-05, "loss": 0.3944, "step": 2538 }, { "epoch": 1.3996692392502756, "grad_norm": 0.24138353765010834, "learning_rate": 9.029850746268656e-05, "loss": 0.392, "step": 2539 }, { "epoch": 1.4002205071664828, "grad_norm": 0.25548362731933594, "learning_rate": 9.02155887230514e-05, "loss": 0.408, "step": 2540 }, { "epoch": 1.4007717750826902, "grad_norm": 0.24517594277858734, "learning_rate": 9.013266998341625e-05, "loss": 0.3979, "step": 2541 }, { "epoch": 1.4013230429988974, "grad_norm": 0.24252092838287354, "learning_rate": 9.00497512437811e-05, "loss": 0.4122, "step": 2542 }, { "epoch": 1.4018743109151046, "grad_norm": 0.23663447797298431, "learning_rate": 8.996683250414594e-05, "loss": 0.3936, "step": 2543 }, { "epoch": 1.402425578831312, "grad_norm": 0.2445666640996933, "learning_rate": 8.988391376451077e-05, "loss": 0.3863, "step": 2544 }, { "epoch": 1.4029768467475192, "grad_norm": 0.24747510254383087, "learning_rate": 8.980099502487562e-05, "loss": 0.4024, "step": 2545 }, { "epoch": 1.4035281146637266, "grad_norm": 0.22010785341262817, "learning_rate": 8.971807628524046e-05, "loss": 0.3765, "step": 2546 }, { "epoch": 1.4040793825799338, "grad_norm": 0.24189656972885132, "learning_rate": 8.963515754560531e-05, "loss": 0.3735, "step": 2547 }, { "epoch": 1.404630650496141, "grad_norm": 0.23379263281822205, "learning_rate": 8.955223880597013e-05, "loss": 0.3886, "step": 2548 }, { "epoch": 1.4051819184123484, "grad_norm": 0.2319820672273636, "learning_rate": 8.946932006633497e-05, "loss": 0.3932, "step": 2549 }, { "epoch": 1.4057331863285556, "grad_norm": 0.2426556944847107, "learning_rate": 8.938640132669982e-05, "loss": 0.3579, "step": 2550 }, { "epoch": 1.406284454244763, "grad_norm": 0.23170387744903564, "learning_rate": 8.930348258706467e-05, "loss": 0.3657, "step": 2551 }, { "epoch": 1.4068357221609702, "grad_norm": 0.24107246100902557, "learning_rate": 8.922056384742951e-05, "loss": 0.4121, "step": 2552 }, { "epoch": 1.4073869900771774, "grad_norm": 0.23268483579158783, "learning_rate": 8.913764510779434e-05, "loss": 0.3964, "step": 2553 }, { "epoch": 1.4079382579933848, "grad_norm": 0.24437369406223297, "learning_rate": 8.905472636815919e-05, "loss": 0.3886, "step": 2554 }, { "epoch": 1.4084895259095922, "grad_norm": 0.2408677190542221, "learning_rate": 8.897180762852403e-05, "loss": 0.4128, "step": 2555 }, { "epoch": 1.4090407938257994, "grad_norm": 0.24828049540519714, "learning_rate": 8.888888888888888e-05, "loss": 0.3968, "step": 2556 }, { "epoch": 1.4095920617420066, "grad_norm": 0.25326454639434814, "learning_rate": 8.880597014925373e-05, "loss": 0.4163, "step": 2557 }, { "epoch": 1.4101433296582138, "grad_norm": 0.2104220986366272, "learning_rate": 8.872305140961856e-05, "loss": 0.3861, "step": 2558 }, { "epoch": 1.4106945975744212, "grad_norm": 0.24456249177455902, "learning_rate": 8.86401326699834e-05, "loss": 0.3969, "step": 2559 }, { "epoch": 1.4112458654906286, "grad_norm": 0.23775126039981842, "learning_rate": 8.855721393034825e-05, "loss": 0.4024, "step": 2560 }, { "epoch": 1.4117971334068358, "grad_norm": 0.2330765575170517, "learning_rate": 8.84742951907131e-05, "loss": 0.3988, "step": 2561 }, { "epoch": 1.412348401323043, "grad_norm": 0.23499152064323425, "learning_rate": 8.839137645107794e-05, "loss": 0.4021, "step": 2562 }, { "epoch": 1.4128996692392501, "grad_norm": 0.23784568905830383, "learning_rate": 8.830845771144277e-05, "loss": 0.4093, "step": 2563 }, { "epoch": 1.4134509371554576, "grad_norm": 0.25330281257629395, "learning_rate": 8.822553897180762e-05, "loss": 0.3896, "step": 2564 }, { "epoch": 1.414002205071665, "grad_norm": 0.2372010052204132, "learning_rate": 8.814262023217246e-05, "loss": 0.3887, "step": 2565 }, { "epoch": 1.4145534729878722, "grad_norm": 0.227810338139534, "learning_rate": 8.805970149253731e-05, "loss": 0.3727, "step": 2566 }, { "epoch": 1.4151047409040793, "grad_norm": 0.23357363045215607, "learning_rate": 8.797678275290215e-05, "loss": 0.3735, "step": 2567 }, { "epoch": 1.4156560088202865, "grad_norm": 0.23767000436782837, "learning_rate": 8.789386401326699e-05, "loss": 0.3906, "step": 2568 }, { "epoch": 1.416207276736494, "grad_norm": 0.22021612524986267, "learning_rate": 8.781094527363183e-05, "loss": 0.3907, "step": 2569 }, { "epoch": 1.4167585446527013, "grad_norm": 0.22677011787891388, "learning_rate": 8.772802653399668e-05, "loss": 0.3568, "step": 2570 }, { "epoch": 1.4173098125689085, "grad_norm": 0.23188649117946625, "learning_rate": 8.764510779436152e-05, "loss": 0.3872, "step": 2571 }, { "epoch": 1.4178610804851157, "grad_norm": 0.24772998690605164, "learning_rate": 8.756218905472637e-05, "loss": 0.4013, "step": 2572 }, { "epoch": 1.418412348401323, "grad_norm": 0.23278258740901947, "learning_rate": 8.74792703150912e-05, "loss": 0.3783, "step": 2573 }, { "epoch": 1.4189636163175303, "grad_norm": 0.24379077553749084, "learning_rate": 8.739635157545605e-05, "loss": 0.3929, "step": 2574 }, { "epoch": 1.4195148842337377, "grad_norm": 0.23344534635543823, "learning_rate": 8.731343283582089e-05, "loss": 0.3709, "step": 2575 }, { "epoch": 1.420066152149945, "grad_norm": 0.23678019642829895, "learning_rate": 8.723051409618574e-05, "loss": 0.3973, "step": 2576 }, { "epoch": 1.420617420066152, "grad_norm": 0.23193979263305664, "learning_rate": 8.714759535655057e-05, "loss": 0.3778, "step": 2577 }, { "epoch": 1.4211686879823593, "grad_norm": 0.24555335938930511, "learning_rate": 8.706467661691541e-05, "loss": 0.4252, "step": 2578 }, { "epoch": 1.4217199558985667, "grad_norm": 0.22985686361789703, "learning_rate": 8.698175787728026e-05, "loss": 0.3896, "step": 2579 }, { "epoch": 1.422271223814774, "grad_norm": 0.24446120858192444, "learning_rate": 8.68988391376451e-05, "loss": 0.3969, "step": 2580 }, { "epoch": 1.4228224917309813, "grad_norm": 0.22781571745872498, "learning_rate": 8.681592039800995e-05, "loss": 0.3836, "step": 2581 }, { "epoch": 1.4233737596471885, "grad_norm": 0.2543814778327942, "learning_rate": 8.673300165837478e-05, "loss": 0.3934, "step": 2582 }, { "epoch": 1.4239250275633957, "grad_norm": 0.2298593968153, "learning_rate": 8.665008291873963e-05, "loss": 0.3894, "step": 2583 }, { "epoch": 1.424476295479603, "grad_norm": 0.24680182337760925, "learning_rate": 8.656716417910447e-05, "loss": 0.3928, "step": 2584 }, { "epoch": 1.4250275633958105, "grad_norm": 0.2492562234401703, "learning_rate": 8.648424543946932e-05, "loss": 0.3793, "step": 2585 }, { "epoch": 1.4255788313120177, "grad_norm": 0.24546745419502258, "learning_rate": 8.640132669983417e-05, "loss": 0.3671, "step": 2586 }, { "epoch": 1.4261300992282249, "grad_norm": 0.24431215226650238, "learning_rate": 8.6318407960199e-05, "loss": 0.3613, "step": 2587 }, { "epoch": 1.426681367144432, "grad_norm": 0.24530234932899475, "learning_rate": 8.623548922056384e-05, "loss": 0.3894, "step": 2588 }, { "epoch": 1.4272326350606395, "grad_norm": 0.2521824240684509, "learning_rate": 8.615257048092869e-05, "loss": 0.3938, "step": 2589 }, { "epoch": 1.4277839029768469, "grad_norm": 0.23589465022087097, "learning_rate": 8.606965174129353e-05, "loss": 0.377, "step": 2590 }, { "epoch": 1.428335170893054, "grad_norm": 0.22879983484745026, "learning_rate": 8.598673300165838e-05, "loss": 0.387, "step": 2591 }, { "epoch": 1.4288864388092613, "grad_norm": 0.2426953762769699, "learning_rate": 8.59038142620232e-05, "loss": 0.3921, "step": 2592 }, { "epoch": 1.4294377067254687, "grad_norm": 0.2464035451412201, "learning_rate": 8.582089552238804e-05, "loss": 0.3842, "step": 2593 }, { "epoch": 1.4299889746416758, "grad_norm": 0.24871256947517395, "learning_rate": 8.573797678275289e-05, "loss": 0.4075, "step": 2594 }, { "epoch": 1.4305402425578833, "grad_norm": 0.22682443261146545, "learning_rate": 8.565505804311774e-05, "loss": 0.3538, "step": 2595 }, { "epoch": 1.4310915104740904, "grad_norm": 0.23264093697071075, "learning_rate": 8.557213930348257e-05, "loss": 0.3802, "step": 2596 }, { "epoch": 1.4316427783902976, "grad_norm": 0.2368372529745102, "learning_rate": 8.548922056384741e-05, "loss": 0.3897, "step": 2597 }, { "epoch": 1.432194046306505, "grad_norm": 0.23906560242176056, "learning_rate": 8.540630182421226e-05, "loss": 0.3691, "step": 2598 }, { "epoch": 1.4327453142227122, "grad_norm": 0.22911648452281952, "learning_rate": 8.53233830845771e-05, "loss": 0.3829, "step": 2599 }, { "epoch": 1.4332965821389196, "grad_norm": 0.23407630622386932, "learning_rate": 8.524046434494195e-05, "loss": 0.3841, "step": 2600 } ], "logging_steps": 1, "max_steps": 3628, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.918731552725244e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }