{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.99338146811071, "eval_steps": 104, "global_step": 1660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024067388688327317, "grad_norm": 1.6241142749786377, "learning_rate": 2e-05, "loss": 1.2341, "step": 1 }, { "epoch": 0.0024067388688327317, "eval_loss": 0.8392955660820007, "eval_runtime": 2134.2947, "eval_samples_per_second": 1.785, "eval_steps_per_second": 0.893, "step": 1 }, { "epoch": 0.0048134777376654635, "grad_norm": 1.805857539176941, "learning_rate": 4e-05, "loss": 1.1419, "step": 2 }, { "epoch": 0.007220216606498195, "grad_norm": 1.6161996126174927, "learning_rate": 6e-05, "loss": 1.2124, "step": 3 }, { "epoch": 0.009626955475330927, "grad_norm": 1.2951513528823853, "learning_rate": 8e-05, "loss": 1.2103, "step": 4 }, { "epoch": 0.012033694344163659, "grad_norm": 1.0456116199493408, "learning_rate": 0.0001, "loss": 1.0705, "step": 5 }, { "epoch": 0.01444043321299639, "grad_norm": 1.0461788177490234, "learning_rate": 0.00012, "loss": 1.1151, "step": 6 }, { "epoch": 0.01684717208182912, "grad_norm": 1.2591978311538696, "learning_rate": 0.00014, "loss": 1.2, "step": 7 }, { "epoch": 0.019253910950661854, "grad_norm": 1.2936573028564453, "learning_rate": 0.00016, "loss": 1.2202, "step": 8 }, { "epoch": 0.021660649819494584, "grad_norm": 1.1709610223770142, "learning_rate": 0.00018, "loss": 1.1382, "step": 9 }, { "epoch": 0.024067388688327317, "grad_norm": 1.0805938243865967, "learning_rate": 0.0002, "loss": 1.0272, "step": 10 }, { "epoch": 0.026474127557160047, "grad_norm": 1.1062763929367065, "learning_rate": 0.00019999981874010248, "loss": 1.1452, "step": 11 }, { "epoch": 0.02888086642599278, "grad_norm": 1.0558199882507324, "learning_rate": 0.00019999927496106707, "loss": 1.1706, "step": 12 }, { "epoch": 0.031287605294825514, "grad_norm": 1.1429730653762817, "learning_rate": 0.00019999836866486503, "loss": 1.2205, "step": 13 }, { "epoch": 0.03369434416365824, "grad_norm": 1.1409598588943481, "learning_rate": 0.00019999709985478188, "loss": 1.1198, "step": 14 }, { "epoch": 0.036101083032490974, "grad_norm": 1.01558256149292, "learning_rate": 0.0001999954685354173, "loss": 1.1469, "step": 15 }, { "epoch": 0.03850782190132371, "grad_norm": 0.9958528280258179, "learning_rate": 0.00019999347471268516, "loss": 1.1071, "step": 16 }, { "epoch": 0.04091456077015644, "grad_norm": 1.1720150709152222, "learning_rate": 0.00019999111839381345, "loss": 1.1311, "step": 17 }, { "epoch": 0.04332129963898917, "grad_norm": 0.9903748631477356, "learning_rate": 0.0001999883995873443, "loss": 1.1231, "step": 18 }, { "epoch": 0.0457280385078219, "grad_norm": 1.0638104677200317, "learning_rate": 0.00019998531830313395, "loss": 1.1023, "step": 19 }, { "epoch": 0.048134777376654635, "grad_norm": 0.9195047616958618, "learning_rate": 0.0001999818745523526, "loss": 1.0447, "step": 20 }, { "epoch": 0.05054151624548736, "grad_norm": 0.9006258249282837, "learning_rate": 0.00019997806834748456, "loss": 1.0267, "step": 21 }, { "epoch": 0.052948255114320095, "grad_norm": 0.9647263884544373, "learning_rate": 0.0001999738997023281, "loss": 1.1211, "step": 22 }, { "epoch": 0.05535499398315283, "grad_norm": 0.950730562210083, "learning_rate": 0.00019996936863199535, "loss": 1.1123, "step": 23 }, { "epoch": 0.05776173285198556, "grad_norm": 0.998926043510437, "learning_rate": 0.00019996447515291233, "loss": 1.1281, "step": 24 }, { "epoch": 0.06016847172081829, "grad_norm": 0.9844056367874146, "learning_rate": 0.00019995921928281894, "loss": 1.0204, "step": 25 }, { "epoch": 0.06257521058965103, "grad_norm": 0.9354597330093384, "learning_rate": 0.00019995360104076867, "loss": 1.0971, "step": 26 }, { "epoch": 0.06498194945848375, "grad_norm": 0.9460365772247314, "learning_rate": 0.0001999476204471288, "loss": 1.0748, "step": 27 }, { "epoch": 0.06738868832731648, "grad_norm": 1.196228265762329, "learning_rate": 0.00019994127752358013, "loss": 1.1588, "step": 28 }, { "epoch": 0.06979542719614922, "grad_norm": 0.9586326479911804, "learning_rate": 0.00019993457229311708, "loss": 1.1092, "step": 29 }, { "epoch": 0.07220216606498195, "grad_norm": 0.9055193662643433, "learning_rate": 0.00019992750478004738, "loss": 1.0624, "step": 30 }, { "epoch": 0.07460890493381468, "grad_norm": 0.9031230807304382, "learning_rate": 0.00019992007500999214, "loss": 1.0479, "step": 31 }, { "epoch": 0.07701564380264742, "grad_norm": 0.8589805364608765, "learning_rate": 0.00019991228300988585, "loss": 1.0468, "step": 32 }, { "epoch": 0.07942238267148015, "grad_norm": 0.909577488899231, "learning_rate": 0.00019990412880797597, "loss": 1.082, "step": 33 }, { "epoch": 0.08182912154031288, "grad_norm": 3.2044453620910645, "learning_rate": 0.00019989561243382312, "loss": 0.9973, "step": 34 }, { "epoch": 0.0842358604091456, "grad_norm": 1.0137778520584106, "learning_rate": 0.0001998867339183008, "loss": 1.0774, "step": 35 }, { "epoch": 0.08664259927797834, "grad_norm": 2.4891610145568848, "learning_rate": 0.00019987749329359548, "loss": 1.0952, "step": 36 }, { "epoch": 0.08904933814681107, "grad_norm": 1.01234769821167, "learning_rate": 0.00019986789059320615, "loss": 1.0991, "step": 37 }, { "epoch": 0.0914560770156438, "grad_norm": 0.882671594619751, "learning_rate": 0.00019985792585194457, "loss": 1.0471, "step": 38 }, { "epoch": 0.09386281588447654, "grad_norm": 0.8638483881950378, "learning_rate": 0.00019984759910593488, "loss": 1.0702, "step": 39 }, { "epoch": 0.09626955475330927, "grad_norm": 0.884689450263977, "learning_rate": 0.00019983691039261357, "loss": 1.1066, "step": 40 }, { "epoch": 0.098676293622142, "grad_norm": 0.8822845220565796, "learning_rate": 0.00019982585975072937, "loss": 1.0255, "step": 41 }, { "epoch": 0.10108303249097472, "grad_norm": 0.9822187423706055, "learning_rate": 0.000199814447220343, "loss": 0.9812, "step": 42 }, { "epoch": 0.10348977135980746, "grad_norm": 0.8197717070579529, "learning_rate": 0.00019980267284282717, "loss": 0.9455, "step": 43 }, { "epoch": 0.10589651022864019, "grad_norm": 0.8288549184799194, "learning_rate": 0.00019979053666086634, "loss": 1.0362, "step": 44 }, { "epoch": 0.10830324909747292, "grad_norm": 0.8918671011924744, "learning_rate": 0.0001997780387184565, "loss": 1.0007, "step": 45 }, { "epoch": 0.11070998796630566, "grad_norm": 0.9474024176597595, "learning_rate": 0.00019976517906090529, "loss": 1.0645, "step": 46 }, { "epoch": 0.11311672683513839, "grad_norm": 0.8152378797531128, "learning_rate": 0.0001997519577348314, "loss": 0.9731, "step": 47 }, { "epoch": 0.11552346570397112, "grad_norm": 0.9036313891410828, "learning_rate": 0.0001997383747881648, "loss": 1.1146, "step": 48 }, { "epoch": 0.11793020457280386, "grad_norm": 0.9020432233810425, "learning_rate": 0.0001997244302701464, "loss": 1.0945, "step": 49 }, { "epoch": 0.12033694344163658, "grad_norm": 0.903535008430481, "learning_rate": 0.00019971012423132775, "loss": 1.1153, "step": 50 }, { "epoch": 0.12274368231046931, "grad_norm": 0.8101151585578918, "learning_rate": 0.00019969545672357116, "loss": 1.0654, "step": 51 }, { "epoch": 0.12515042117930206, "grad_norm": 0.8507478833198547, "learning_rate": 0.00019968042780004917, "loss": 1.0414, "step": 52 }, { "epoch": 0.12755716004813478, "grad_norm": 1.2401365041732788, "learning_rate": 0.00019966503751524465, "loss": 1.0744, "step": 53 }, { "epoch": 0.1299638989169675, "grad_norm": 0.8836433291435242, "learning_rate": 0.00019964928592495045, "loss": 1.1125, "step": 54 }, { "epoch": 0.13237063778580024, "grad_norm": 0.8277642130851746, "learning_rate": 0.00019963317308626914, "loss": 1.0974, "step": 55 }, { "epoch": 0.13477737665463296, "grad_norm": 0.8704286217689514, "learning_rate": 0.00019961669905761302, "loss": 0.964, "step": 56 }, { "epoch": 0.1371841155234657, "grad_norm": 0.8706910014152527, "learning_rate": 0.00019959986389870364, "loss": 1.0249, "step": 57 }, { "epoch": 0.13959085439229843, "grad_norm": 0.8885878920555115, "learning_rate": 0.0001995826676705718, "loss": 0.9353, "step": 58 }, { "epoch": 0.14199759326113118, "grad_norm": 0.8675719499588013, "learning_rate": 0.00019956511043555728, "loss": 1.0741, "step": 59 }, { "epoch": 0.1444043321299639, "grad_norm": 0.8110966086387634, "learning_rate": 0.00019954719225730847, "loss": 1.0503, "step": 60 }, { "epoch": 0.14681107099879662, "grad_norm": 0.8182883262634277, "learning_rate": 0.00019952891320078236, "loss": 0.9776, "step": 61 }, { "epoch": 0.14921780986762936, "grad_norm": 0.8101396560668945, "learning_rate": 0.0001995102733322441, "loss": 1.0523, "step": 62 }, { "epoch": 0.15162454873646208, "grad_norm": 0.7879651784896851, "learning_rate": 0.00019949127271926695, "loss": 1.079, "step": 63 }, { "epoch": 0.15403128760529483, "grad_norm": 0.8286521434783936, "learning_rate": 0.00019947191143073186, "loss": 0.9527, "step": 64 }, { "epoch": 0.15643802647412755, "grad_norm": 0.9895939826965332, "learning_rate": 0.00019945218953682734, "loss": 1.0374, "step": 65 }, { "epoch": 0.1588447653429603, "grad_norm": 0.7657825350761414, "learning_rate": 0.00019943210710904918, "loss": 0.9542, "step": 66 }, { "epoch": 0.16125150421179302, "grad_norm": 0.821657121181488, "learning_rate": 0.00019941166422020014, "loss": 1.0401, "step": 67 }, { "epoch": 0.16365824308062576, "grad_norm": 0.9410754442214966, "learning_rate": 0.00019939086094438975, "loss": 1.0187, "step": 68 }, { "epoch": 0.16606498194945848, "grad_norm": 0.793220579624176, "learning_rate": 0.00019936969735703396, "loss": 0.9305, "step": 69 }, { "epoch": 0.1684717208182912, "grad_norm": 0.8068125247955322, "learning_rate": 0.00019934817353485501, "loss": 0.9708, "step": 70 }, { "epoch": 0.17087845968712395, "grad_norm": 0.8741897344589233, "learning_rate": 0.00019932628955588103, "loss": 1.0476, "step": 71 }, { "epoch": 0.17328519855595667, "grad_norm": 0.879451334476471, "learning_rate": 0.00019930404549944574, "loss": 1.036, "step": 72 }, { "epoch": 0.17569193742478942, "grad_norm": 0.8232237100601196, "learning_rate": 0.00019928144144618824, "loss": 1.0494, "step": 73 }, { "epoch": 0.17809867629362214, "grad_norm": 0.7794287204742432, "learning_rate": 0.00019925847747805274, "loss": 0.9803, "step": 74 }, { "epoch": 0.18050541516245489, "grad_norm": 0.8436810970306396, "learning_rate": 0.0001992351536782881, "loss": 1.003, "step": 75 }, { "epoch": 0.1829121540312876, "grad_norm": 1.5992075204849243, "learning_rate": 0.0001992114701314478, "loss": 1.0202, "step": 76 }, { "epoch": 0.18531889290012032, "grad_norm": 0.8204182982444763, "learning_rate": 0.00019918742692338933, "loss": 1.0107, "step": 77 }, { "epoch": 0.18772563176895307, "grad_norm": 0.7770963311195374, "learning_rate": 0.00019916302414127408, "loss": 0.9618, "step": 78 }, { "epoch": 0.1901323706377858, "grad_norm": 0.7679833769798279, "learning_rate": 0.00019913826187356696, "loss": 0.9485, "step": 79 }, { "epoch": 0.19253910950661854, "grad_norm": 0.8449601531028748, "learning_rate": 0.00019911314021003613, "loss": 1.0106, "step": 80 }, { "epoch": 0.19494584837545126, "grad_norm": 0.8383705019950867, "learning_rate": 0.00019908765924175258, "loss": 1.0358, "step": 81 }, { "epoch": 0.197352587244284, "grad_norm": 0.7661250233650208, "learning_rate": 0.00019906181906108984, "loss": 1.0271, "step": 82 }, { "epoch": 0.19975932611311673, "grad_norm": 0.742223858833313, "learning_rate": 0.00019903561976172368, "loss": 0.9159, "step": 83 }, { "epoch": 0.20216606498194944, "grad_norm": 0.8393533825874329, "learning_rate": 0.0001990090614386318, "loss": 1.0542, "step": 84 }, { "epoch": 0.2045728038507822, "grad_norm": 0.8213642239570618, "learning_rate": 0.0001989821441880933, "loss": 0.972, "step": 85 }, { "epoch": 0.2069795427196149, "grad_norm": 0.7490133047103882, "learning_rate": 0.00019895486810768856, "loss": 0.9551, "step": 86 }, { "epoch": 0.20938628158844766, "grad_norm": 0.8169188499450684, "learning_rate": 0.00019892723329629887, "loss": 1.037, "step": 87 }, { "epoch": 0.21179302045728038, "grad_norm": 0.7848641276359558, "learning_rate": 0.00019889923985410576, "loss": 0.9854, "step": 88 }, { "epoch": 0.21419975932611313, "grad_norm": 0.8486091494560242, "learning_rate": 0.00019887088788259102, "loss": 1.0496, "step": 89 }, { "epoch": 0.21660649819494585, "grad_norm": 0.8359668850898743, "learning_rate": 0.00019884217748453623, "loss": 0.9957, "step": 90 }, { "epoch": 0.2190132370637786, "grad_norm": 0.8299211859703064, "learning_rate": 0.00019881310876402223, "loss": 0.9658, "step": 91 }, { "epoch": 0.2214199759326113, "grad_norm": 0.8090376257896423, "learning_rate": 0.0001987836818264289, "loss": 0.9937, "step": 92 }, { "epoch": 0.22382671480144403, "grad_norm": 0.8346531987190247, "learning_rate": 0.0001987538967784347, "loss": 0.9421, "step": 93 }, { "epoch": 0.22623345367027678, "grad_norm": 0.783414900302887, "learning_rate": 0.0001987237537280163, "loss": 0.9735, "step": 94 }, { "epoch": 0.2286401925391095, "grad_norm": 0.7546629309654236, "learning_rate": 0.00019869325278444824, "loss": 1.0041, "step": 95 }, { "epoch": 0.23104693140794225, "grad_norm": 0.8800073266029358, "learning_rate": 0.00019866239405830248, "loss": 0.9562, "step": 96 }, { "epoch": 0.23345367027677497, "grad_norm": 0.7581021785736084, "learning_rate": 0.00019863117766144806, "loss": 0.9073, "step": 97 }, { "epoch": 0.2358604091456077, "grad_norm": 0.7516024708747864, "learning_rate": 0.0001985996037070505, "loss": 0.999, "step": 98 }, { "epoch": 0.23826714801444043, "grad_norm": 0.8055709600448608, "learning_rate": 0.00019856767230957173, "loss": 0.9662, "step": 99 }, { "epoch": 0.24067388688327315, "grad_norm": 0.8118072748184204, "learning_rate": 0.00019853538358476932, "loss": 1.0511, "step": 100 }, { "epoch": 0.2430806257521059, "grad_norm": 0.7562913298606873, "learning_rate": 0.00019850273764969632, "loss": 0.9906, "step": 101 }, { "epoch": 0.24548736462093862, "grad_norm": 0.7456312775611877, "learning_rate": 0.0001984697346227007, "loss": 0.7996, "step": 102 }, { "epoch": 0.24789410348977137, "grad_norm": 0.748582124710083, "learning_rate": 0.00019843637462342497, "loss": 0.9562, "step": 103 }, { "epoch": 0.2503008423586041, "grad_norm": 0.850549578666687, "learning_rate": 0.0001984026577728057, "loss": 1.017, "step": 104 }, { "epoch": 0.2503008423586041, "eval_loss": 0.5893637537956238, "eval_runtime": 2133.4959, "eval_samples_per_second": 1.786, "eval_steps_per_second": 0.893, "step": 104 }, { "epoch": 0.2527075812274368, "grad_norm": 0.7537848949432373, "learning_rate": 0.00019836858419307324, "loss": 0.9121, "step": 105 }, { "epoch": 0.25511432009626955, "grad_norm": 0.7593055367469788, "learning_rate": 0.00019833415400775093, "loss": 0.9615, "step": 106 }, { "epoch": 0.2575210589651023, "grad_norm": 0.7436023950576782, "learning_rate": 0.0001982993673416551, "loss": 0.9523, "step": 107 }, { "epoch": 0.259927797833935, "grad_norm": 0.8100264668464661, "learning_rate": 0.0001982642243208943, "loss": 0.9877, "step": 108 }, { "epoch": 0.26233453670276774, "grad_norm": 0.7653257846832275, "learning_rate": 0.0001982287250728689, "loss": 0.9504, "step": 109 }, { "epoch": 0.2647412755716005, "grad_norm": 0.7917811274528503, "learning_rate": 0.00019819286972627066, "loss": 0.994, "step": 110 }, { "epoch": 0.26714801444043323, "grad_norm": 0.7219581007957458, "learning_rate": 0.00019815665841108241, "loss": 0.9188, "step": 111 }, { "epoch": 0.2695547533092659, "grad_norm": 0.7833922505378723, "learning_rate": 0.00019812009125857728, "loss": 1.0097, "step": 112 }, { "epoch": 0.2719614921780987, "grad_norm": 0.8043282628059387, "learning_rate": 0.00019808316840131846, "loss": 0.9194, "step": 113 }, { "epoch": 0.2743682310469314, "grad_norm": 0.9073807001113892, "learning_rate": 0.00019804588997315858, "loss": 0.9949, "step": 114 }, { "epoch": 0.2767749699157641, "grad_norm": 0.7887935638427734, "learning_rate": 0.00019800825610923934, "loss": 0.963, "step": 115 }, { "epoch": 0.27918170878459686, "grad_norm": 0.734038233757019, "learning_rate": 0.00019797026694599098, "loss": 1.0006, "step": 116 }, { "epoch": 0.2815884476534296, "grad_norm": 0.7415774464607239, "learning_rate": 0.00019793192262113166, "loss": 0.9535, "step": 117 }, { "epoch": 0.28399518652226236, "grad_norm": 0.8428921103477478, "learning_rate": 0.00019789322327366723, "loss": 1.0364, "step": 118 }, { "epoch": 0.28640192539109505, "grad_norm": 0.7179557085037231, "learning_rate": 0.00019785416904389042, "loss": 0.8641, "step": 119 }, { "epoch": 0.2888086642599278, "grad_norm": 0.7368788719177246, "learning_rate": 0.00019781476007338058, "loss": 0.9645, "step": 120 }, { "epoch": 0.29121540312876054, "grad_norm": 0.7618100643157959, "learning_rate": 0.000197774996505003, "loss": 0.9317, "step": 121 }, { "epoch": 0.29362214199759323, "grad_norm": 0.7682094573974609, "learning_rate": 0.00019773487848290854, "loss": 0.9532, "step": 122 }, { "epoch": 0.296028880866426, "grad_norm": 0.8207808136940002, "learning_rate": 0.00019769440615253293, "loss": 1.064, "step": 123 }, { "epoch": 0.29843561973525873, "grad_norm": 0.804612934589386, "learning_rate": 0.00019765357966059638, "loss": 1.0504, "step": 124 }, { "epoch": 0.3008423586040915, "grad_norm": 0.7408599853515625, "learning_rate": 0.00019761239915510302, "loss": 0.9916, "step": 125 }, { "epoch": 0.30324909747292417, "grad_norm": 0.7420434355735779, "learning_rate": 0.0001975708647853403, "loss": 0.9906, "step": 126 }, { "epoch": 0.3056558363417569, "grad_norm": 0.7062914371490479, "learning_rate": 0.0001975289767018786, "loss": 0.9051, "step": 127 }, { "epoch": 0.30806257521058966, "grad_norm": 0.7812306880950928, "learning_rate": 0.00019748673505657046, "loss": 0.9938, "step": 128 }, { "epoch": 0.3104693140794224, "grad_norm": 0.7287104725837708, "learning_rate": 0.0001974441400025502, "loss": 0.9733, "step": 129 }, { "epoch": 0.3128760529482551, "grad_norm": 1.278515338897705, "learning_rate": 0.00019740119169423337, "loss": 0.9952, "step": 130 }, { "epoch": 0.31528279181708785, "grad_norm": 0.723901629447937, "learning_rate": 0.00019735789028731604, "loss": 0.9529, "step": 131 }, { "epoch": 0.3176895306859206, "grad_norm": 0.6783624887466431, "learning_rate": 0.0001973142359387744, "loss": 0.8826, "step": 132 }, { "epoch": 0.3200962695547533, "grad_norm": 0.736638605594635, "learning_rate": 0.00019727022880686412, "loss": 0.9316, "step": 133 }, { "epoch": 0.32250300842358604, "grad_norm": 0.70645672082901, "learning_rate": 0.00019722586905111976, "loss": 0.9257, "step": 134 }, { "epoch": 0.3249097472924188, "grad_norm": 0.7493844032287598, "learning_rate": 0.00019718115683235417, "loss": 0.9299, "step": 135 }, { "epoch": 0.32731648616125153, "grad_norm": 0.7431819438934326, "learning_rate": 0.00019713609231265805, "loss": 0.9491, "step": 136 }, { "epoch": 0.3297232250300842, "grad_norm": 0.7641167640686035, "learning_rate": 0.0001970906756553992, "loss": 0.9634, "step": 137 }, { "epoch": 0.33212996389891697, "grad_norm": 0.7450122833251953, "learning_rate": 0.00019704490702522197, "loss": 0.9572, "step": 138 }, { "epoch": 0.3345367027677497, "grad_norm": 0.7910891771316528, "learning_rate": 0.00019699878658804672, "loss": 0.9996, "step": 139 }, { "epoch": 0.3369434416365824, "grad_norm": 0.9418014287948608, "learning_rate": 0.00019695231451106912, "loss": 0.9213, "step": 140 }, { "epoch": 0.33935018050541516, "grad_norm": 1.201551914215088, "learning_rate": 0.00019690549096275972, "loss": 0.9648, "step": 141 }, { "epoch": 0.3417569193742479, "grad_norm": 0.7676377892494202, "learning_rate": 0.0001968583161128631, "loss": 0.9709, "step": 142 }, { "epoch": 0.34416365824308065, "grad_norm": 0.7007735967636108, "learning_rate": 0.00019681079013239748, "loss": 0.9203, "step": 143 }, { "epoch": 0.34657039711191334, "grad_norm": 0.7584015130996704, "learning_rate": 0.00019676291319365387, "loss": 0.9722, "step": 144 }, { "epoch": 0.3489771359807461, "grad_norm": 0.7326116561889648, "learning_rate": 0.00019671468547019573, "loss": 0.9131, "step": 145 }, { "epoch": 0.35138387484957884, "grad_norm": 0.7101622819900513, "learning_rate": 0.00019666610713685802, "loss": 0.9041, "step": 146 }, { "epoch": 0.35379061371841153, "grad_norm": 0.7699556946754456, "learning_rate": 0.0001966171783697469, "loss": 0.9169, "step": 147 }, { "epoch": 0.3561973525872443, "grad_norm": 0.7440523505210876, "learning_rate": 0.00019656789934623881, "loss": 0.9679, "step": 148 }, { "epoch": 0.358604091456077, "grad_norm": 0.6797596216201782, "learning_rate": 0.0001965182702449799, "loss": 0.8953, "step": 149 }, { "epoch": 0.36101083032490977, "grad_norm": 0.8142282366752625, "learning_rate": 0.0001964682912458856, "loss": 0.8886, "step": 150 }, { "epoch": 0.36341756919374246, "grad_norm": 0.740627110004425, "learning_rate": 0.00019641796253013958, "loss": 0.9535, "step": 151 }, { "epoch": 0.3658243080625752, "grad_norm": 0.7169809341430664, "learning_rate": 0.0001963672842801934, "loss": 0.9026, "step": 152 }, { "epoch": 0.36823104693140796, "grad_norm": 0.7335914969444275, "learning_rate": 0.00019631625667976583, "loss": 0.9585, "step": 153 }, { "epoch": 0.37063778580024065, "grad_norm": 0.7438033223152161, "learning_rate": 0.00019626487991384196, "loss": 0.9308, "step": 154 }, { "epoch": 0.3730445246690734, "grad_norm": 0.7219699025154114, "learning_rate": 0.00019621315416867274, "loss": 0.9359, "step": 155 }, { "epoch": 0.37545126353790614, "grad_norm": 0.7092848420143127, "learning_rate": 0.00019616107963177425, "loss": 0.8517, "step": 156 }, { "epoch": 0.3778580024067389, "grad_norm": 0.7738973498344421, "learning_rate": 0.00019610865649192697, "loss": 0.9772, "step": 157 }, { "epoch": 0.3802647412755716, "grad_norm": 0.6858243942260742, "learning_rate": 0.00019605588493917518, "loss": 0.8954, "step": 158 }, { "epoch": 0.38267148014440433, "grad_norm": 0.6589340567588806, "learning_rate": 0.00019600276516482622, "loss": 0.8633, "step": 159 }, { "epoch": 0.3850782190132371, "grad_norm": 0.7128363847732544, "learning_rate": 0.00019594929736144976, "loss": 0.9576, "step": 160 }, { "epoch": 0.38748495788206977, "grad_norm": 0.7730962038040161, "learning_rate": 0.00019589548172287719, "loss": 0.9451, "step": 161 }, { "epoch": 0.3898916967509025, "grad_norm": 0.7269492149353027, "learning_rate": 0.00019584131844420082, "loss": 0.8981, "step": 162 }, { "epoch": 0.39229843561973526, "grad_norm": 0.7042537331581116, "learning_rate": 0.00019578680772177327, "loss": 0.9246, "step": 163 }, { "epoch": 0.394705174488568, "grad_norm": 0.8260946869850159, "learning_rate": 0.00019573194975320673, "loss": 0.9975, "step": 164 }, { "epoch": 0.3971119133574007, "grad_norm": 0.7814927697181702, "learning_rate": 0.00019567674473737218, "loss": 0.9803, "step": 165 }, { "epoch": 0.39951865222623345, "grad_norm": 0.8008944392204285, "learning_rate": 0.00019562119287439873, "loss": 0.9761, "step": 166 }, { "epoch": 0.4019253910950662, "grad_norm": 0.758780300617218, "learning_rate": 0.00019556529436567287, "loss": 0.894, "step": 167 }, { "epoch": 0.4043321299638989, "grad_norm": 0.7311479449272156, "learning_rate": 0.00019550904941383773, "loss": 0.944, "step": 168 }, { "epoch": 0.40673886883273164, "grad_norm": 0.7663231492042542, "learning_rate": 0.00019545245822279243, "loss": 0.9571, "step": 169 }, { "epoch": 0.4091456077015644, "grad_norm": 0.701337456703186, "learning_rate": 0.00019539552099769126, "loss": 0.8524, "step": 170 }, { "epoch": 0.41155234657039713, "grad_norm": 0.6994669437408447, "learning_rate": 0.00019533823794494292, "loss": 0.8579, "step": 171 }, { "epoch": 0.4139590854392298, "grad_norm": 0.7405064105987549, "learning_rate": 0.0001952806092722098, "loss": 0.9258, "step": 172 }, { "epoch": 0.41636582430806257, "grad_norm": 0.6685819029808044, "learning_rate": 0.0001952226351884072, "loss": 0.8113, "step": 173 }, { "epoch": 0.4187725631768953, "grad_norm": 0.7590517997741699, "learning_rate": 0.00019516431590370278, "loss": 0.9802, "step": 174 }, { "epoch": 0.42117930204572807, "grad_norm": 0.6981585025787354, "learning_rate": 0.00019510565162951537, "loss": 0.8353, "step": 175 }, { "epoch": 0.42358604091456076, "grad_norm": 0.7846630811691284, "learning_rate": 0.0001950466425785146, "loss": 0.8734, "step": 176 }, { "epoch": 0.4259927797833935, "grad_norm": 0.7799774408340454, "learning_rate": 0.00019498728896462, "loss": 0.964, "step": 177 }, { "epoch": 0.42839951865222625, "grad_norm": 0.8154809474945068, "learning_rate": 0.00019492759100300019, "loss": 0.9105, "step": 178 }, { "epoch": 0.43080625752105894, "grad_norm": 0.6931765079498291, "learning_rate": 0.00019486754891007198, "loss": 0.9286, "step": 179 }, { "epoch": 0.4332129963898917, "grad_norm": 0.7486812472343445, "learning_rate": 0.00019480716290349995, "loss": 0.9415, "step": 180 }, { "epoch": 0.43561973525872444, "grad_norm": 0.7170876860618591, "learning_rate": 0.00019474643320219532, "loss": 0.9379, "step": 181 }, { "epoch": 0.4380264741275572, "grad_norm": 0.7242069244384766, "learning_rate": 0.0001946853600263152, "loss": 0.9377, "step": 182 }, { "epoch": 0.4404332129963899, "grad_norm": 0.6873876452445984, "learning_rate": 0.00019462394359726206, "loss": 0.9207, "step": 183 }, { "epoch": 0.4428399518652226, "grad_norm": 0.6617278456687927, "learning_rate": 0.0001945621841376825, "loss": 0.8636, "step": 184 }, { "epoch": 0.4452466907340554, "grad_norm": 0.7275647521018982, "learning_rate": 0.00019450008187146684, "loss": 0.9743, "step": 185 }, { "epoch": 0.44765342960288806, "grad_norm": 0.7597945928573608, "learning_rate": 0.00019443763702374812, "loss": 0.9661, "step": 186 }, { "epoch": 0.4500601684717208, "grad_norm": 0.6745346784591675, "learning_rate": 0.0001943748498209012, "loss": 0.86, "step": 187 }, { "epoch": 0.45246690734055356, "grad_norm": 0.7283429503440857, "learning_rate": 0.0001943117204905422, "loss": 0.9486, "step": 188 }, { "epoch": 0.4548736462093863, "grad_norm": 0.6715584993362427, "learning_rate": 0.00019424824926152735, "loss": 0.8849, "step": 189 }, { "epoch": 0.457280385078219, "grad_norm": 0.7713329792022705, "learning_rate": 0.00019418443636395248, "loss": 0.9484, "step": 190 }, { "epoch": 0.45968712394705175, "grad_norm": 0.6619556546211243, "learning_rate": 0.00019412028202915198, "loss": 0.8894, "step": 191 }, { "epoch": 0.4620938628158845, "grad_norm": 0.6532876491546631, "learning_rate": 0.00019405578648969796, "loss": 0.8478, "step": 192 }, { "epoch": 0.4645006016847172, "grad_norm": 1.2763874530792236, "learning_rate": 0.00019399094997939957, "loss": 0.9393, "step": 193 }, { "epoch": 0.46690734055354993, "grad_norm": 0.7221686244010925, "learning_rate": 0.00019392577273330197, "loss": 1.0116, "step": 194 }, { "epoch": 0.4693140794223827, "grad_norm": 0.7458716034889221, "learning_rate": 0.00019386025498768558, "loss": 0.895, "step": 195 }, { "epoch": 0.4717208182912154, "grad_norm": 0.6642630100250244, "learning_rate": 0.0001937943969800652, "loss": 0.9187, "step": 196 }, { "epoch": 0.4741275571600481, "grad_norm": 0.719078004360199, "learning_rate": 0.00019372819894918915, "loss": 0.963, "step": 197 }, { "epoch": 0.47653429602888087, "grad_norm": 0.6626113653182983, "learning_rate": 0.0001936616611350384, "loss": 0.9149, "step": 198 }, { "epoch": 0.4789410348977136, "grad_norm": 0.675428569316864, "learning_rate": 0.00019359478377882567, "loss": 0.7919, "step": 199 }, { "epoch": 0.4813477737665463, "grad_norm": 0.8895409107208252, "learning_rate": 0.00019352756712299468, "loss": 0.8779, "step": 200 }, { "epoch": 0.48375451263537905, "grad_norm": 0.7445719242095947, "learning_rate": 0.000193460011411219, "loss": 0.9072, "step": 201 }, { "epoch": 0.4861612515042118, "grad_norm": 0.6794666051864624, "learning_rate": 0.00019339211688840157, "loss": 0.8743, "step": 202 }, { "epoch": 0.48856799037304455, "grad_norm": 0.6897733211517334, "learning_rate": 0.0001933238838006734, "loss": 0.9425, "step": 203 }, { "epoch": 0.49097472924187724, "grad_norm": 0.7478888630867004, "learning_rate": 0.000193255312395393, "loss": 0.9222, "step": 204 }, { "epoch": 0.49338146811071, "grad_norm": 0.7975496649742126, "learning_rate": 0.00019318640292114524, "loss": 0.9317, "step": 205 }, { "epoch": 0.49578820697954273, "grad_norm": 0.627334713935852, "learning_rate": 0.00019311715562774062, "loss": 0.8144, "step": 206 }, { "epoch": 0.4981949458483754, "grad_norm": 0.6750829815864563, "learning_rate": 0.0001930475707662143, "loss": 0.8927, "step": 207 }, { "epoch": 0.5006016847172082, "grad_norm": 0.6967096328735352, "learning_rate": 0.00019297764858882514, "loss": 0.9544, "step": 208 }, { "epoch": 0.5006016847172082, "eval_loss": 0.5531798005104065, "eval_runtime": 2129.3732, "eval_samples_per_second": 1.789, "eval_steps_per_second": 0.895, "step": 208 }, { "epoch": 0.5030084235860409, "grad_norm": 0.6312858462333679, "learning_rate": 0.00019290738934905492, "loss": 0.7756, "step": 209 }, { "epoch": 0.5054151624548736, "grad_norm": 0.7019824981689453, "learning_rate": 0.00019283679330160726, "loss": 0.9664, "step": 210 }, { "epoch": 0.5078219013237064, "grad_norm": 0.7321256995201111, "learning_rate": 0.00019276586070240682, "loss": 0.9106, "step": 211 }, { "epoch": 0.5102286401925391, "grad_norm": 0.7145900726318359, "learning_rate": 0.0001926945918085983, "loss": 0.968, "step": 212 }, { "epoch": 0.5126353790613718, "grad_norm": 0.626711905002594, "learning_rate": 0.0001926229868785456, "loss": 0.8505, "step": 213 }, { "epoch": 0.5150421179302046, "grad_norm": 0.6327537894248962, "learning_rate": 0.0001925510461718307, "loss": 0.8628, "step": 214 }, { "epoch": 0.5174488567990373, "grad_norm": 0.6911125779151917, "learning_rate": 0.00019247876994925292, "loss": 0.982, "step": 215 }, { "epoch": 0.51985559566787, "grad_norm": 0.6829891204833984, "learning_rate": 0.00019240615847282788, "loss": 0.902, "step": 216 }, { "epoch": 0.5222623345367028, "grad_norm": 0.676946759223938, "learning_rate": 0.0001923332120057866, "loss": 0.9305, "step": 217 }, { "epoch": 0.5246690734055355, "grad_norm": 0.6674066781997681, "learning_rate": 0.00019225993081257436, "loss": 0.9217, "step": 218 }, { "epoch": 0.5270758122743683, "grad_norm": 0.7439377307891846, "learning_rate": 0.00019218631515885006, "loss": 0.9311, "step": 219 }, { "epoch": 0.529482551143201, "grad_norm": 0.7182191610336304, "learning_rate": 0.000192112365311485, "loss": 0.9538, "step": 220 }, { "epoch": 0.5318892900120337, "grad_norm": 0.6811573505401611, "learning_rate": 0.00019203808153856207, "loss": 0.9658, "step": 221 }, { "epoch": 0.5342960288808665, "grad_norm": 0.8502155542373657, "learning_rate": 0.00019196346410937457, "loss": 0.8781, "step": 222 }, { "epoch": 0.5367027677496992, "grad_norm": 0.7439753413200378, "learning_rate": 0.00019188851329442547, "loss": 0.9073, "step": 223 }, { "epoch": 0.5391095066185319, "grad_norm": 0.7889792323112488, "learning_rate": 0.00019181322936542635, "loss": 0.9637, "step": 224 }, { "epoch": 0.5415162454873647, "grad_norm": 0.7137686014175415, "learning_rate": 0.00019173761259529633, "loss": 0.9351, "step": 225 }, { "epoch": 0.5439229843561973, "grad_norm": 0.7067937254905701, "learning_rate": 0.00019166166325816118, "loss": 0.9517, "step": 226 }, { "epoch": 0.54632972322503, "grad_norm": 0.712842583656311, "learning_rate": 0.00019158538162935225, "loss": 0.8999, "step": 227 }, { "epoch": 0.5487364620938628, "grad_norm": 0.7125033736228943, "learning_rate": 0.0001915087679854056, "loss": 0.921, "step": 228 }, { "epoch": 0.5511432009626955, "grad_norm": 0.6885653734207153, "learning_rate": 0.0001914318226040608, "loss": 0.91, "step": 229 }, { "epoch": 0.5535499398315282, "grad_norm": 0.6739046573638916, "learning_rate": 0.0001913545457642601, "loss": 0.9068, "step": 230 }, { "epoch": 0.555956678700361, "grad_norm": 0.8396746516227722, "learning_rate": 0.00019127693774614738, "loss": 0.9382, "step": 231 }, { "epoch": 0.5583634175691937, "grad_norm": 0.8137013912200928, "learning_rate": 0.000191198998831067, "loss": 0.9231, "step": 232 }, { "epoch": 0.5607701564380265, "grad_norm": 0.6958538293838501, "learning_rate": 0.00019112072930156302, "loss": 0.9616, "step": 233 }, { "epoch": 0.5631768953068592, "grad_norm": 0.7363241910934448, "learning_rate": 0.00019104212944137796, "loss": 0.9428, "step": 234 }, { "epoch": 0.5655836341756919, "grad_norm": 0.7556502819061279, "learning_rate": 0.00019096319953545185, "loss": 0.9326, "step": 235 }, { "epoch": 0.5679903730445247, "grad_norm": 0.7994537353515625, "learning_rate": 0.00019088393986992124, "loss": 0.977, "step": 236 }, { "epoch": 0.5703971119133574, "grad_norm": 0.6968033313751221, "learning_rate": 0.0001908043507321181, "loss": 0.8939, "step": 237 }, { "epoch": 0.5728038507821901, "grad_norm": 0.7016099095344543, "learning_rate": 0.00019072443241056883, "loss": 0.9075, "step": 238 }, { "epoch": 0.5752105896510229, "grad_norm": 0.7213168740272522, "learning_rate": 0.00019064418519499317, "loss": 0.9448, "step": 239 }, { "epoch": 0.5776173285198556, "grad_norm": 0.7496588826179504, "learning_rate": 0.0001905636093763031, "loss": 0.9513, "step": 240 }, { "epoch": 0.5800240673886883, "grad_norm": 0.6599727869033813, "learning_rate": 0.00019048270524660196, "loss": 0.9126, "step": 241 }, { "epoch": 0.5824308062575211, "grad_norm": 0.9110409021377563, "learning_rate": 0.00019040147309918326, "loss": 0.9368, "step": 242 }, { "epoch": 0.5848375451263538, "grad_norm": 0.691779375076294, "learning_rate": 0.00019031991322852955, "loss": 0.8921, "step": 243 }, { "epoch": 0.5872442839951865, "grad_norm": 0.6789968609809875, "learning_rate": 0.00019023802593031154, "loss": 0.9329, "step": 244 }, { "epoch": 0.5896510228640193, "grad_norm": 0.6759790182113647, "learning_rate": 0.00019015581150138693, "loss": 0.8943, "step": 245 }, { "epoch": 0.592057761732852, "grad_norm": 0.7184381484985352, "learning_rate": 0.00019007327023979923, "loss": 0.8975, "step": 246 }, { "epoch": 0.5944645006016848, "grad_norm": 0.6547626852989197, "learning_rate": 0.0001899904024447769, "loss": 0.8774, "step": 247 }, { "epoch": 0.5968712394705175, "grad_norm": 0.6693066954612732, "learning_rate": 0.00018990720841673207, "loss": 0.9502, "step": 248 }, { "epoch": 0.5992779783393501, "grad_norm": 0.6593300104141235, "learning_rate": 0.0001898236884572596, "loss": 0.9178, "step": 249 }, { "epoch": 0.601684717208183, "grad_norm": 0.7282694578170776, "learning_rate": 0.00018973984286913584, "loss": 0.8041, "step": 250 }, { "epoch": 0.6040914560770156, "grad_norm": 0.665924608707428, "learning_rate": 0.00018965567195631765, "loss": 0.9181, "step": 251 }, { "epoch": 0.6064981949458483, "grad_norm": 0.6083967089653015, "learning_rate": 0.0001895711760239413, "loss": 0.881, "step": 252 }, { "epoch": 0.6089049338146811, "grad_norm": 0.6531050205230713, "learning_rate": 0.0001894863553783212, "loss": 0.9215, "step": 253 }, { "epoch": 0.6113116726835138, "grad_norm": 0.7065880298614502, "learning_rate": 0.00018940121032694898, "loss": 0.9419, "step": 254 }, { "epoch": 0.6137184115523465, "grad_norm": 0.6446614861488342, "learning_rate": 0.0001893157411784924, "loss": 0.9465, "step": 255 }, { "epoch": 0.6161251504211793, "grad_norm": 0.7137323021888733, "learning_rate": 0.00018922994824279395, "loss": 0.9631, "step": 256 }, { "epoch": 0.618531889290012, "grad_norm": 0.6316159963607788, "learning_rate": 0.00018914383183087002, "loss": 0.9317, "step": 257 }, { "epoch": 0.6209386281588448, "grad_norm": 0.6457177400588989, "learning_rate": 0.00018905739225490967, "loss": 0.823, "step": 258 }, { "epoch": 0.6233453670276775, "grad_norm": 0.6792429089546204, "learning_rate": 0.00018897062982827344, "loss": 0.8937, "step": 259 }, { "epoch": 0.6257521058965102, "grad_norm": 0.7672622799873352, "learning_rate": 0.00018888354486549237, "loss": 0.9268, "step": 260 }, { "epoch": 0.628158844765343, "grad_norm": 0.6826128363609314, "learning_rate": 0.0001887961376822666, "loss": 0.9626, "step": 261 }, { "epoch": 0.6305655836341757, "grad_norm": 0.6681162118911743, "learning_rate": 0.00018870840859546456, "loss": 0.9262, "step": 262 }, { "epoch": 0.6329723225030084, "grad_norm": 0.7267283201217651, "learning_rate": 0.00018862035792312147, "loss": 0.9449, "step": 263 }, { "epoch": 0.6353790613718412, "grad_norm": 0.6700749397277832, "learning_rate": 0.00018853198598443852, "loss": 0.9154, "step": 264 }, { "epoch": 0.6377858002406739, "grad_norm": 0.6982499957084656, "learning_rate": 0.00018844329309978145, "loss": 0.9573, "step": 265 }, { "epoch": 0.6401925391095066, "grad_norm": 0.5931220054626465, "learning_rate": 0.0001883542795906795, "loss": 0.8006, "step": 266 }, { "epoch": 0.6425992779783394, "grad_norm": 0.6673518419265747, "learning_rate": 0.00018826494577982433, "loss": 0.8673, "step": 267 }, { "epoch": 0.6450060168471721, "grad_norm": 0.6904028058052063, "learning_rate": 0.0001881752919910686, "loss": 0.8839, "step": 268 }, { "epoch": 0.6474127557160048, "grad_norm": 0.6733436584472656, "learning_rate": 0.0001880853185494251, "loss": 0.9311, "step": 269 }, { "epoch": 0.6498194945848376, "grad_norm": 0.6887320876121521, "learning_rate": 0.00018799502578106534, "loss": 0.8835, "step": 270 }, { "epoch": 0.6522262334536703, "grad_norm": 0.7354311347007751, "learning_rate": 0.00018790441401331847, "loss": 0.9152, "step": 271 }, { "epoch": 0.6546329723225031, "grad_norm": 0.6331139802932739, "learning_rate": 0.00018781348357467013, "loss": 0.9298, "step": 272 }, { "epoch": 0.6570397111913358, "grad_norm": 0.6849923133850098, "learning_rate": 0.00018772223479476114, "loss": 0.8927, "step": 273 }, { "epoch": 0.6594464500601684, "grad_norm": 0.737738847732544, "learning_rate": 0.00018763066800438636, "loss": 0.8868, "step": 274 }, { "epoch": 0.6618531889290012, "grad_norm": 0.6602675318717957, "learning_rate": 0.00018753878353549357, "loss": 0.8876, "step": 275 }, { "epoch": 0.6642599277978339, "grad_norm": 0.6486974358558655, "learning_rate": 0.00018744658172118215, "loss": 0.8379, "step": 276 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6205615401268005, "learning_rate": 0.00018735406289570192, "loss": 0.8807, "step": 277 }, { "epoch": 0.6690734055354994, "grad_norm": 0.703909158706665, "learning_rate": 0.0001872612273944519, "loss": 0.8526, "step": 278 }, { "epoch": 0.6714801444043321, "grad_norm": 0.7326840758323669, "learning_rate": 0.0001871680755539792, "loss": 0.8761, "step": 279 }, { "epoch": 0.6738868832731648, "grad_norm": 0.6408722400665283, "learning_rate": 0.00018707460771197774, "loss": 0.9078, "step": 280 }, { "epoch": 0.6762936221419976, "grad_norm": 0.6457056403160095, "learning_rate": 0.00018698082420728684, "loss": 0.8687, "step": 281 }, { "epoch": 0.6787003610108303, "grad_norm": 0.7293155789375305, "learning_rate": 0.0001868867253798903, "loss": 0.9244, "step": 282 }, { "epoch": 0.681107099879663, "grad_norm": 0.699804961681366, "learning_rate": 0.00018679231157091506, "loss": 0.9379, "step": 283 }, { "epoch": 0.6835138387484958, "grad_norm": 1.239624261856079, "learning_rate": 0.00018669758312262976, "loss": 0.8984, "step": 284 }, { "epoch": 0.6859205776173285, "grad_norm": 0.6780726313591003, "learning_rate": 0.00018660254037844388, "loss": 0.8943, "step": 285 }, { "epoch": 0.6883273164861613, "grad_norm": 0.6162465214729309, "learning_rate": 0.0001865071836829061, "loss": 0.8627, "step": 286 }, { "epoch": 0.690734055354994, "grad_norm": 0.6380164623260498, "learning_rate": 0.0001864115133817034, "loss": 0.8856, "step": 287 }, { "epoch": 0.6931407942238267, "grad_norm": 0.6355769634246826, "learning_rate": 0.00018631552982165944, "loss": 0.8456, "step": 288 }, { "epoch": 0.6955475330926595, "grad_norm": 0.6145781874656677, "learning_rate": 0.00018621923335073376, "loss": 0.8414, "step": 289 }, { "epoch": 0.6979542719614922, "grad_norm": 0.6590957641601562, "learning_rate": 0.00018612262431802007, "loss": 0.8793, "step": 290 }, { "epoch": 0.7003610108303249, "grad_norm": 0.6873210072517395, "learning_rate": 0.0001860257030737452, "loss": 0.8809, "step": 291 }, { "epoch": 0.7027677496991577, "grad_norm": 0.7032055854797363, "learning_rate": 0.00018592846996926793, "loss": 0.9304, "step": 292 }, { "epoch": 0.7051744885679904, "grad_norm": 0.7025747299194336, "learning_rate": 0.0001858309253570774, "loss": 0.8188, "step": 293 }, { "epoch": 0.7075812274368231, "grad_norm": 0.6647669076919556, "learning_rate": 0.0001857330695907922, "loss": 0.8686, "step": 294 }, { "epoch": 0.7099879663056559, "grad_norm": 0.6497501730918884, "learning_rate": 0.0001856349030251589, "loss": 0.8704, "step": 295 }, { "epoch": 0.7123947051744886, "grad_norm": 0.651369571685791, "learning_rate": 0.00018553642601605068, "loss": 0.938, "step": 296 }, { "epoch": 0.7148014440433214, "grad_norm": 0.6719600558280945, "learning_rate": 0.00018543763892046617, "loss": 0.9087, "step": 297 }, { "epoch": 0.717208182912154, "grad_norm": 0.6322699785232544, "learning_rate": 0.00018533854209652818, "loss": 0.8328, "step": 298 }, { "epoch": 0.7196149217809867, "grad_norm": 0.6316370368003845, "learning_rate": 0.0001852391359034823, "loss": 0.8445, "step": 299 }, { "epoch": 0.7220216606498195, "grad_norm": 0.9449910521507263, "learning_rate": 0.0001851394207016957, "loss": 0.8106, "step": 300 }, { "epoch": 0.7244283995186522, "grad_norm": 0.607499361038208, "learning_rate": 0.00018503939685265568, "loss": 0.8829, "step": 301 }, { "epoch": 0.7268351383874849, "grad_norm": 0.6118245720863342, "learning_rate": 0.00018493906471896848, "loss": 0.9138, "step": 302 }, { "epoch": 0.7292418772563177, "grad_norm": 0.6160489320755005, "learning_rate": 0.00018483842466435798, "loss": 0.8339, "step": 303 }, { "epoch": 0.7316486161251504, "grad_norm": 0.6591795682907104, "learning_rate": 0.00018473747705366426, "loss": 0.8693, "step": 304 }, { "epoch": 0.7340553549939831, "grad_norm": 0.6387032270431519, "learning_rate": 0.00018463622225284242, "loss": 0.9353, "step": 305 }, { "epoch": 0.7364620938628159, "grad_norm": 0.6233176589012146, "learning_rate": 0.0001845346606289612, "loss": 0.8642, "step": 306 }, { "epoch": 0.7388688327316486, "grad_norm": 0.6539813280105591, "learning_rate": 0.00018443279255020152, "loss": 0.9163, "step": 307 }, { "epoch": 0.7412755716004813, "grad_norm": 0.6271246671676636, "learning_rate": 0.00018433061838585534, "loss": 0.8518, "step": 308 }, { "epoch": 0.7436823104693141, "grad_norm": 0.6527572274208069, "learning_rate": 0.0001842281385063243, "loss": 0.9372, "step": 309 }, { "epoch": 0.7460890493381468, "grad_norm": 0.6503356099128723, "learning_rate": 0.00018412535328311814, "loss": 0.8629, "step": 310 }, { "epoch": 0.7484957882069796, "grad_norm": 0.6259830594062805, "learning_rate": 0.00018402226308885368, "loss": 0.8974, "step": 311 }, { "epoch": 0.7509025270758123, "grad_norm": 0.6025224924087524, "learning_rate": 0.00018391886829725334, "loss": 0.9395, "step": 312 }, { "epoch": 0.7509025270758123, "eval_loss": 0.5430218577384949, "eval_runtime": 2127.2598, "eval_samples_per_second": 1.791, "eval_steps_per_second": 0.896, "step": 312 }, { "epoch": 0.753309265944645, "grad_norm": 0.6124367117881775, "learning_rate": 0.00018381516928314367, "loss": 0.8906, "step": 313 }, { "epoch": 0.7557160048134778, "grad_norm": 0.6602182984352112, "learning_rate": 0.00018371116642245408, "loss": 0.8489, "step": 314 }, { "epoch": 0.7581227436823105, "grad_norm": 0.6531786918640137, "learning_rate": 0.0001836068600922156, "loss": 0.8568, "step": 315 }, { "epoch": 0.7605294825511432, "grad_norm": 0.6167216897010803, "learning_rate": 0.00018350225067055925, "loss": 0.9312, "step": 316 }, { "epoch": 0.762936221419976, "grad_norm": 0.6144131422042847, "learning_rate": 0.00018339733853671496, "loss": 0.8911, "step": 317 }, { "epoch": 0.7653429602888087, "grad_norm": 0.6474859118461609, "learning_rate": 0.00018329212407100994, "loss": 0.9092, "step": 318 }, { "epoch": 0.7677496991576414, "grad_norm": 0.6905191540718079, "learning_rate": 0.00018318660765486748, "loss": 0.8547, "step": 319 }, { "epoch": 0.7701564380264742, "grad_norm": 0.6689995527267456, "learning_rate": 0.00018308078967080546, "loss": 0.8266, "step": 320 }, { "epoch": 0.7725631768953068, "grad_norm": 0.590982973575592, "learning_rate": 0.00018297467050243501, "loss": 0.7962, "step": 321 }, { "epoch": 0.7749699157641395, "grad_norm": 0.5800091624259949, "learning_rate": 0.00018286825053445918, "loss": 0.8617, "step": 322 }, { "epoch": 0.7773766546329723, "grad_norm": 0.5949111580848694, "learning_rate": 0.00018276153015267134, "loss": 0.8074, "step": 323 }, { "epoch": 0.779783393501805, "grad_norm": 0.681008517742157, "learning_rate": 0.00018265450974395403, "loss": 0.9507, "step": 324 }, { "epoch": 0.7821901323706378, "grad_norm": 0.6455432772636414, "learning_rate": 0.0001825471896962774, "loss": 0.9292, "step": 325 }, { "epoch": 0.7845968712394705, "grad_norm": 0.6346482038497925, "learning_rate": 0.0001824395703986979, "loss": 0.8795, "step": 326 }, { "epoch": 0.7870036101083032, "grad_norm": 0.6404364109039307, "learning_rate": 0.00018233165224135678, "loss": 0.8191, "step": 327 }, { "epoch": 0.789410348977136, "grad_norm": 0.6154891848564148, "learning_rate": 0.00018222343561547874, "loss": 0.8644, "step": 328 }, { "epoch": 0.7918170878459687, "grad_norm": 0.642311692237854, "learning_rate": 0.00018211492091337042, "loss": 0.8182, "step": 329 }, { "epoch": 0.7942238267148014, "grad_norm": 0.583395779132843, "learning_rate": 0.00018200610852841913, "loss": 0.8055, "step": 330 }, { "epoch": 0.7966305655836342, "grad_norm": 0.6394252777099609, "learning_rate": 0.00018189699885509127, "loss": 0.9147, "step": 331 }, { "epoch": 0.7990373044524669, "grad_norm": 0.5927478075027466, "learning_rate": 0.00018178759228893108, "loss": 0.8986, "step": 332 }, { "epoch": 0.8014440433212996, "grad_norm": 0.6793590188026428, "learning_rate": 0.00018167788922655894, "loss": 0.8537, "step": 333 }, { "epoch": 0.8038507821901324, "grad_norm": 0.6416248083114624, "learning_rate": 0.0001815678900656702, "loss": 0.8984, "step": 334 }, { "epoch": 0.8062575210589651, "grad_norm": 0.6718860268592834, "learning_rate": 0.00018145759520503358, "loss": 0.943, "step": 335 }, { "epoch": 0.8086642599277978, "grad_norm": 0.7188106775283813, "learning_rate": 0.0001813470050444898, "loss": 0.8964, "step": 336 }, { "epoch": 0.8110709987966306, "grad_norm": 0.6132128834724426, "learning_rate": 0.00018123611998495007, "loss": 0.8435, "step": 337 }, { "epoch": 0.8134777376654633, "grad_norm": 0.642383873462677, "learning_rate": 0.0001811249404283947, "loss": 0.9301, "step": 338 }, { "epoch": 0.8158844765342961, "grad_norm": 0.652371346950531, "learning_rate": 0.00018101346677787156, "loss": 0.8687, "step": 339 }, { "epoch": 0.8182912154031288, "grad_norm": 0.6033048033714294, "learning_rate": 0.00018090169943749476, "loss": 0.8931, "step": 340 }, { "epoch": 0.8206979542719615, "grad_norm": 0.6060802340507507, "learning_rate": 0.00018078963881244296, "loss": 0.8375, "step": 341 }, { "epoch": 0.8231046931407943, "grad_norm": 0.7405040860176086, "learning_rate": 0.00018067728530895815, "loss": 0.8876, "step": 342 }, { "epoch": 0.825511432009627, "grad_norm": 0.5660969018936157, "learning_rate": 0.00018056463933434398, "loss": 0.8457, "step": 343 }, { "epoch": 0.8279181708784596, "grad_norm": 0.6403225660324097, "learning_rate": 0.0001804517012969644, "loss": 0.8467, "step": 344 }, { "epoch": 0.8303249097472925, "grad_norm": 0.7246482968330383, "learning_rate": 0.00018033847160624225, "loss": 0.8904, "step": 345 }, { "epoch": 0.8327316486161251, "grad_norm": 0.6677387952804565, "learning_rate": 0.00018022495067265753, "loss": 0.8809, "step": 346 }, { "epoch": 0.8351383874849578, "grad_norm": 0.6579807996749878, "learning_rate": 0.00018011113890774603, "loss": 0.8565, "step": 347 }, { "epoch": 0.8375451263537906, "grad_norm": 0.6844046711921692, "learning_rate": 0.000179997036724098, "loss": 0.9092, "step": 348 }, { "epoch": 0.8399518652226233, "grad_norm": 0.5906157493591309, "learning_rate": 0.0001798826445353564, "loss": 0.87, "step": 349 }, { "epoch": 0.8423586040914561, "grad_norm": 0.6796956062316895, "learning_rate": 0.00017976796275621555, "loss": 0.8836, "step": 350 }, { "epoch": 0.8447653429602888, "grad_norm": 0.6199307441711426, "learning_rate": 0.00017965299180241963, "loss": 0.8396, "step": 351 }, { "epoch": 0.8471720818291215, "grad_norm": 0.6326633095741272, "learning_rate": 0.0001795377320907611, "loss": 0.8646, "step": 352 }, { "epoch": 0.8495788206979543, "grad_norm": 0.6216900944709778, "learning_rate": 0.00017942218403907924, "loss": 0.8527, "step": 353 }, { "epoch": 0.851985559566787, "grad_norm": 0.632237434387207, "learning_rate": 0.0001793063480662586, "loss": 0.8461, "step": 354 }, { "epoch": 0.8543922984356197, "grad_norm": 0.5908830165863037, "learning_rate": 0.00017919022459222752, "loss": 0.8378, "step": 355 }, { "epoch": 0.8567990373044525, "grad_norm": 0.6086264252662659, "learning_rate": 0.00017907381403795656, "loss": 0.8673, "step": 356 }, { "epoch": 0.8592057761732852, "grad_norm": 0.6095126271247864, "learning_rate": 0.00017895711682545704, "loss": 0.8139, "step": 357 }, { "epoch": 0.8616125150421179, "grad_norm": 0.6316693425178528, "learning_rate": 0.00017884013337777943, "loss": 0.9291, "step": 358 }, { "epoch": 0.8640192539109507, "grad_norm": 0.5753441452980042, "learning_rate": 0.00017872286411901191, "loss": 0.833, "step": 359 }, { "epoch": 0.8664259927797834, "grad_norm": 0.5970266461372375, "learning_rate": 0.00017860530947427875, "loss": 0.7249, "step": 360 }, { "epoch": 0.8688327316486161, "grad_norm": 0.6204938292503357, "learning_rate": 0.00017848746986973883, "loss": 0.7887, "step": 361 }, { "epoch": 0.8712394705174489, "grad_norm": 0.6160289645195007, "learning_rate": 0.000178369345732584, "loss": 0.9228, "step": 362 }, { "epoch": 0.8736462093862816, "grad_norm": 0.6035860776901245, "learning_rate": 0.00017825093749103765, "loss": 0.8264, "step": 363 }, { "epoch": 0.8760529482551144, "grad_norm": 0.6397987008094788, "learning_rate": 0.00017813224557435312, "loss": 0.8386, "step": 364 }, { "epoch": 0.8784596871239471, "grad_norm": 0.6122485399246216, "learning_rate": 0.00017801327041281207, "loss": 0.8626, "step": 365 }, { "epoch": 0.8808664259927798, "grad_norm": 0.5941351652145386, "learning_rate": 0.00017789401243772305, "loss": 0.821, "step": 366 }, { "epoch": 0.8832731648616126, "grad_norm": 0.5857208371162415, "learning_rate": 0.0001777744720814198, "loss": 0.7837, "step": 367 }, { "epoch": 0.8856799037304453, "grad_norm": 0.6580957770347595, "learning_rate": 0.00017765464977725977, "loss": 0.8768, "step": 368 }, { "epoch": 0.8880866425992779, "grad_norm": 0.6706531047821045, "learning_rate": 0.00017753454595962256, "loss": 0.9334, "step": 369 }, { "epoch": 0.8904933814681107, "grad_norm": 0.5867465138435364, "learning_rate": 0.00017741416106390826, "loss": 0.7783, "step": 370 }, { "epoch": 0.8929001203369434, "grad_norm": 0.6559320688247681, "learning_rate": 0.00017729349552653595, "loss": 0.8861, "step": 371 }, { "epoch": 0.8953068592057761, "grad_norm": 0.6223561763763428, "learning_rate": 0.0001771725497849421, "loss": 0.8798, "step": 372 }, { "epoch": 0.8977135980746089, "grad_norm": 0.604897141456604, "learning_rate": 0.00017705132427757895, "loss": 0.748, "step": 373 }, { "epoch": 0.9001203369434416, "grad_norm": 0.6216434240341187, "learning_rate": 0.00017692981944391294, "loss": 0.8742, "step": 374 }, { "epoch": 0.9025270758122743, "grad_norm": 0.9587754607200623, "learning_rate": 0.00017680803572442318, "loss": 0.8369, "step": 375 }, { "epoch": 0.9049338146811071, "grad_norm": 0.6133635640144348, "learning_rate": 0.00017668597356059978, "loss": 0.859, "step": 376 }, { "epoch": 0.9073405535499398, "grad_norm": 0.616245448589325, "learning_rate": 0.0001765636333949422, "loss": 0.9155, "step": 377 }, { "epoch": 0.9097472924187726, "grad_norm": 0.6076980829238892, "learning_rate": 0.00017644101567095778, "loss": 0.9254, "step": 378 }, { "epoch": 0.9121540312876053, "grad_norm": 0.5926278829574585, "learning_rate": 0.00017631812083316003, "loss": 0.871, "step": 379 }, { "epoch": 0.914560770156438, "grad_norm": 0.6268554329872131, "learning_rate": 0.0001761949493270671, "loss": 0.8876, "step": 380 }, { "epoch": 0.9169675090252708, "grad_norm": 0.5804511308670044, "learning_rate": 0.0001760715015992, "loss": 0.8347, "step": 381 }, { "epoch": 0.9193742478941035, "grad_norm": 0.5575320720672607, "learning_rate": 0.00017594777809708126, "loss": 0.8084, "step": 382 }, { "epoch": 0.9217809867629362, "grad_norm": 0.5840063095092773, "learning_rate": 0.00017582377926923305, "loss": 0.8514, "step": 383 }, { "epoch": 0.924187725631769, "grad_norm": 0.5655162334442139, "learning_rate": 0.00017569950556517566, "loss": 0.8078, "step": 384 }, { "epoch": 0.9265944645006017, "grad_norm": 0.6167016625404358, "learning_rate": 0.00017557495743542585, "loss": 0.8957, "step": 385 }, { "epoch": 0.9290012033694344, "grad_norm": 0.6130027770996094, "learning_rate": 0.00017545013533149523, "loss": 0.8803, "step": 386 }, { "epoch": 0.9314079422382672, "grad_norm": 0.5918142199516296, "learning_rate": 0.0001753250397058887, "loss": 0.8374, "step": 387 }, { "epoch": 0.9338146811070999, "grad_norm": 0.6262494921684265, "learning_rate": 0.0001751996710121026, "loss": 0.8342, "step": 388 }, { "epoch": 0.9362214199759326, "grad_norm": 0.6100596189498901, "learning_rate": 0.0001750740297046233, "loss": 0.8492, "step": 389 }, { "epoch": 0.9386281588447654, "grad_norm": 0.5893505215644836, "learning_rate": 0.0001749481162389254, "loss": 0.8588, "step": 390 }, { "epoch": 0.941034897713598, "grad_norm": 0.6651292443275452, "learning_rate": 0.00017482193107147014, "loss": 0.8109, "step": 391 }, { "epoch": 0.9434416365824309, "grad_norm": 0.5894994139671326, "learning_rate": 0.00017469547465970373, "loss": 0.9184, "step": 392 }, { "epoch": 0.9458483754512635, "grad_norm": 0.6112183928489685, "learning_rate": 0.00017456874746205568, "loss": 0.8971, "step": 393 }, { "epoch": 0.9482551143200962, "grad_norm": 0.6397886872291565, "learning_rate": 0.0001744417499379372, "loss": 0.8857, "step": 394 }, { "epoch": 0.950661853188929, "grad_norm": 0.6054075956344604, "learning_rate": 0.00017431448254773944, "loss": 0.8581, "step": 395 }, { "epoch": 0.9530685920577617, "grad_norm": 0.604507565498352, "learning_rate": 0.00017418694575283185, "loss": 0.8695, "step": 396 }, { "epoch": 0.9554753309265944, "grad_norm": 0.5803477168083191, "learning_rate": 0.0001740591400155606, "loss": 0.7751, "step": 397 }, { "epoch": 0.9578820697954272, "grad_norm": 0.6100960373878479, "learning_rate": 0.00017393106579924677, "loss": 0.8684, "step": 398 }, { "epoch": 0.9602888086642599, "grad_norm": 0.6000844836235046, "learning_rate": 0.00017380272356818473, "loss": 0.9406, "step": 399 }, { "epoch": 0.9626955475330926, "grad_norm": 0.6083583235740662, "learning_rate": 0.0001736741137876405, "loss": 0.8573, "step": 400 }, { "epoch": 0.9651022864019254, "grad_norm": 0.6315954327583313, "learning_rate": 0.00017354523692385, "loss": 0.9364, "step": 401 }, { "epoch": 0.9675090252707581, "grad_norm": 0.5958694219589233, "learning_rate": 0.0001734160934440173, "loss": 0.8561, "step": 402 }, { "epoch": 0.9699157641395909, "grad_norm": 0.6201788187026978, "learning_rate": 0.00017328668381631318, "loss": 0.8845, "step": 403 }, { "epoch": 0.9723225030084236, "grad_norm": 0.6730347275733948, "learning_rate": 0.0001731570085098731, "loss": 0.9207, "step": 404 }, { "epoch": 0.9747292418772563, "grad_norm": 0.5722523331642151, "learning_rate": 0.00017302706799479574, "loss": 0.8931, "step": 405 }, { "epoch": 0.9771359807460891, "grad_norm": 0.6000478267669678, "learning_rate": 0.00017289686274214118, "loss": 0.9138, "step": 406 }, { "epoch": 0.9795427196149218, "grad_norm": 0.5823535919189453, "learning_rate": 0.00017276639322392917, "loss": 0.8783, "step": 407 }, { "epoch": 0.9819494584837545, "grad_norm": 0.5480247139930725, "learning_rate": 0.00017263565991313765, "loss": 0.797, "step": 408 }, { "epoch": 0.9843561973525873, "grad_norm": 0.6205552220344543, "learning_rate": 0.0001725046632837007, "loss": 0.8406, "step": 409 }, { "epoch": 0.98676293622142, "grad_norm": 0.6053999066352844, "learning_rate": 0.00017237340381050703, "loss": 0.8955, "step": 410 }, { "epoch": 0.9891696750902527, "grad_norm": 0.5670915246009827, "learning_rate": 0.00017224188196939818, "loss": 0.7598, "step": 411 }, { "epoch": 0.9915764139590855, "grad_norm": 0.6079475283622742, "learning_rate": 0.00017211009823716694, "loss": 0.9242, "step": 412 }, { "epoch": 0.9939831528279182, "grad_norm": 0.5941441059112549, "learning_rate": 0.00017197805309155536, "loss": 0.8234, "step": 413 }, { "epoch": 0.9963898916967509, "grad_norm": 0.5606356263160706, "learning_rate": 0.00017184574701125326, "loss": 0.8647, "step": 414 }, { "epoch": 0.9987966305655837, "grad_norm": 0.6050673723220825, "learning_rate": 0.00017171318047589637, "loss": 0.8777, "step": 415 }, { "epoch": 1.0006016847172081, "grad_norm": 0.7562869191169739, "learning_rate": 0.00017158035396606458, "loss": 0.9921, "step": 416 }, { "epoch": 1.0006016847172081, "eval_loss": 0.5245047211647034, "eval_runtime": 2126.7419, "eval_samples_per_second": 1.791, "eval_steps_per_second": 0.896, "step": 416 }, { "epoch": 1.0030084235860408, "grad_norm": 0.5807491540908813, "learning_rate": 0.00017144726796328034, "loss": 0.74, "step": 417 }, { "epoch": 1.0054151624548737, "grad_norm": 0.6011835336685181, "learning_rate": 0.00017131392295000674, "loss": 0.7783, "step": 418 }, { "epoch": 1.0078219013237064, "grad_norm": 0.646964967250824, "learning_rate": 0.00017118031940964584, "loss": 0.712, "step": 419 }, { "epoch": 1.010228640192539, "grad_norm": 0.5902695059776306, "learning_rate": 0.0001710464578265369, "loss": 0.7266, "step": 420 }, { "epoch": 1.0126353790613718, "grad_norm": 0.6124182343482971, "learning_rate": 0.00017091233868595467, "loss": 0.7498, "step": 421 }, { "epoch": 1.0150421179302045, "grad_norm": 0.5674487352371216, "learning_rate": 0.0001707779624741076, "loss": 0.752, "step": 422 }, { "epoch": 1.0174488567990374, "grad_norm": 0.5971553325653076, "learning_rate": 0.00017064332967813605, "loss": 0.7733, "step": 423 }, { "epoch": 1.01985559566787, "grad_norm": 0.6159399747848511, "learning_rate": 0.00017050844078611056, "loss": 0.7206, "step": 424 }, { "epoch": 1.0222623345367028, "grad_norm": 0.5770703554153442, "learning_rate": 0.00017037329628703004, "loss": 0.7396, "step": 425 }, { "epoch": 1.0246690734055355, "grad_norm": 0.6062514781951904, "learning_rate": 0.00017023789667082012, "loss": 0.7446, "step": 426 }, { "epoch": 1.0270758122743682, "grad_norm": 0.5871722102165222, "learning_rate": 0.0001701022424283311, "loss": 0.7341, "step": 427 }, { "epoch": 1.0294825511432009, "grad_norm": 0.6125348210334778, "learning_rate": 0.00016996633405133655, "loss": 0.8103, "step": 428 }, { "epoch": 1.0318892900120338, "grad_norm": 0.6002120971679688, "learning_rate": 0.00016983017203253122, "loss": 0.7551, "step": 429 }, { "epoch": 1.0342960288808665, "grad_norm": 0.6098855137825012, "learning_rate": 0.00016969375686552937, "loss": 0.7524, "step": 430 }, { "epoch": 1.0367027677496992, "grad_norm": 0.5839653015136719, "learning_rate": 0.00016955708904486296, "loss": 0.691, "step": 431 }, { "epoch": 1.0391095066185319, "grad_norm": 0.5647653341293335, "learning_rate": 0.00016942016906597995, "loss": 0.6705, "step": 432 }, { "epoch": 1.0415162454873645, "grad_norm": 0.6066052317619324, "learning_rate": 0.00016928299742524234, "loss": 0.736, "step": 433 }, { "epoch": 1.0439229843561975, "grad_norm": 0.6028389930725098, "learning_rate": 0.00016914557461992447, "loss": 0.7473, "step": 434 }, { "epoch": 1.0463297232250302, "grad_norm": 2.736790418624878, "learning_rate": 0.00016900790114821122, "loss": 0.7611, "step": 435 }, { "epoch": 1.0487364620938628, "grad_norm": 0.65738844871521, "learning_rate": 0.00016886997750919619, "loss": 0.6944, "step": 436 }, { "epoch": 1.0511432009626955, "grad_norm": 0.5276406407356262, "learning_rate": 0.0001687318042028798, "loss": 0.6091, "step": 437 }, { "epoch": 1.0535499398315282, "grad_norm": 0.565412700176239, "learning_rate": 0.0001685933817301678, "loss": 0.7011, "step": 438 }, { "epoch": 1.055956678700361, "grad_norm": 0.6402344107627869, "learning_rate": 0.00016845471059286887, "loss": 0.7358, "step": 439 }, { "epoch": 1.0583634175691938, "grad_norm": 0.6545571684837341, "learning_rate": 0.00016831579129369346, "loss": 0.7502, "step": 440 }, { "epoch": 1.0607701564380265, "grad_norm": 0.6190571188926697, "learning_rate": 0.00016817662433625148, "loss": 0.7281, "step": 441 }, { "epoch": 1.0631768953068592, "grad_norm": 0.5711254477500916, "learning_rate": 0.00016803721022505067, "loss": 0.6452, "step": 442 }, { "epoch": 1.065583634175692, "grad_norm": 0.6505111455917358, "learning_rate": 0.00016789754946549485, "loss": 0.7174, "step": 443 }, { "epoch": 1.0679903730445246, "grad_norm": 0.6108816862106323, "learning_rate": 0.00016775764256388186, "loss": 0.8115, "step": 444 }, { "epoch": 1.0703971119133575, "grad_norm": 0.6219894289970398, "learning_rate": 0.00016761749002740193, "loss": 0.7206, "step": 445 }, { "epoch": 1.0728038507821902, "grad_norm": 0.60597825050354, "learning_rate": 0.0001674770923641358, "loss": 0.6746, "step": 446 }, { "epoch": 1.075210589651023, "grad_norm": 0.5559236407279968, "learning_rate": 0.00016733645008305272, "loss": 0.7447, "step": 447 }, { "epoch": 1.0776173285198556, "grad_norm": 0.6250678896903992, "learning_rate": 0.0001671955636940088, "loss": 0.8012, "step": 448 }, { "epoch": 1.0800240673886883, "grad_norm": 0.5929338335990906, "learning_rate": 0.00016705443370774515, "loss": 0.7432, "step": 449 }, { "epoch": 1.082430806257521, "grad_norm": 0.5781651139259338, "learning_rate": 0.00016691306063588583, "loss": 0.7214, "step": 450 }, { "epoch": 1.0848375451263539, "grad_norm": 0.6220395565032959, "learning_rate": 0.00016677144499093626, "loss": 0.8293, "step": 451 }, { "epoch": 1.0872442839951866, "grad_norm": 0.6164456009864807, "learning_rate": 0.0001666295872862812, "loss": 0.7588, "step": 452 }, { "epoch": 1.0896510228640193, "grad_norm": 0.569885790348053, "learning_rate": 0.00016648748803618286, "loss": 0.7121, "step": 453 }, { "epoch": 1.092057761732852, "grad_norm": 0.6496075987815857, "learning_rate": 0.0001663451477557792, "loss": 0.7496, "step": 454 }, { "epoch": 1.0944645006016847, "grad_norm": 0.6388487219810486, "learning_rate": 0.00016620256696108188, "loss": 0.7412, "step": 455 }, { "epoch": 1.0968712394705173, "grad_norm": 0.5865011811256409, "learning_rate": 0.00016605974616897449, "loss": 0.7547, "step": 456 }, { "epoch": 1.0992779783393503, "grad_norm": 0.5791248679161072, "learning_rate": 0.0001659166858972107, "loss": 0.7455, "step": 457 }, { "epoch": 1.101684717208183, "grad_norm": 0.6210140585899353, "learning_rate": 0.00016577338666441232, "loss": 0.6833, "step": 458 }, { "epoch": 1.1040914560770156, "grad_norm": 0.6160838007926941, "learning_rate": 0.00016562984899006744, "loss": 0.7654, "step": 459 }, { "epoch": 1.1064981949458483, "grad_norm": 0.540804922580719, "learning_rate": 0.00016548607339452853, "loss": 0.7427, "step": 460 }, { "epoch": 1.108904933814681, "grad_norm": 0.6406325101852417, "learning_rate": 0.00016534206039901057, "loss": 0.8532, "step": 461 }, { "epoch": 1.111311672683514, "grad_norm": 0.6207169890403748, "learning_rate": 0.00016519781052558917, "loss": 0.7164, "step": 462 }, { "epoch": 1.1137184115523466, "grad_norm": 0.5802072286605835, "learning_rate": 0.0001650533242971987, "loss": 0.7232, "step": 463 }, { "epoch": 1.1161251504211793, "grad_norm": 0.6440045237541199, "learning_rate": 0.00016490860223763036, "loss": 0.7913, "step": 464 }, { "epoch": 1.118531889290012, "grad_norm": 0.6056708097457886, "learning_rate": 0.00016476364487153023, "loss": 0.6958, "step": 465 }, { "epoch": 1.1209386281588447, "grad_norm": 0.5459867715835571, "learning_rate": 0.00016461845272439741, "loss": 0.675, "step": 466 }, { "epoch": 1.1233453670276774, "grad_norm": 0.7036973237991333, "learning_rate": 0.0001644730263225823, "loss": 0.7568, "step": 467 }, { "epoch": 1.1257521058965103, "grad_norm": 0.6424573659896851, "learning_rate": 0.00016432736619328425, "loss": 0.8332, "step": 468 }, { "epoch": 1.128158844765343, "grad_norm": 0.6176357269287109, "learning_rate": 0.0001641814728645502, "loss": 0.7743, "step": 469 }, { "epoch": 1.1305655836341757, "grad_norm": 0.6653046011924744, "learning_rate": 0.00016403534686527225, "loss": 0.8213, "step": 470 }, { "epoch": 1.1329723225030084, "grad_norm": 0.6054510474205017, "learning_rate": 0.0001638889887251861, "loss": 0.7597, "step": 471 }, { "epoch": 1.135379061371841, "grad_norm": 0.5923228859901428, "learning_rate": 0.000163742398974869, "loss": 0.7672, "step": 472 }, { "epoch": 1.1377858002406738, "grad_norm": 0.6395425200462341, "learning_rate": 0.00016359557814573777, "loss": 0.7054, "step": 473 }, { "epoch": 1.1401925391095067, "grad_norm": 0.5609533786773682, "learning_rate": 0.000163448526770047, "loss": 0.7397, "step": 474 }, { "epoch": 1.1425992779783394, "grad_norm": 5.949091911315918, "learning_rate": 0.00016330124538088705, "loss": 0.8663, "step": 475 }, { "epoch": 1.145006016847172, "grad_norm": 0.6687883734703064, "learning_rate": 0.0001631537345121821, "loss": 0.7318, "step": 476 }, { "epoch": 1.1474127557160048, "grad_norm": 0.5940186977386475, "learning_rate": 0.00016300599469868825, "loss": 0.7307, "step": 477 }, { "epoch": 1.1498194945848375, "grad_norm": 0.6269271969795227, "learning_rate": 0.00016285802647599156, "loss": 0.7196, "step": 478 }, { "epoch": 1.1522262334536704, "grad_norm": 0.5856907367706299, "learning_rate": 0.00016270983038050614, "loss": 0.7354, "step": 479 }, { "epoch": 1.154632972322503, "grad_norm": 0.5796502232551575, "learning_rate": 0.00016256140694947217, "loss": 0.6906, "step": 480 }, { "epoch": 1.1570397111913358, "grad_norm": 0.5744217038154602, "learning_rate": 0.00016241275672095395, "loss": 0.7735, "step": 481 }, { "epoch": 1.1594464500601684, "grad_norm": 0.5830369591712952, "learning_rate": 0.000162263880233838, "loss": 0.7388, "step": 482 }, { "epoch": 1.1618531889290011, "grad_norm": 1.1190905570983887, "learning_rate": 0.00016211477802783103, "loss": 0.7925, "step": 483 }, { "epoch": 1.164259927797834, "grad_norm": 0.566752016544342, "learning_rate": 0.00016196545064345812, "loss": 0.7249, "step": 484 }, { "epoch": 1.1666666666666667, "grad_norm": 0.6458240747451782, "learning_rate": 0.00016181589862206052, "loss": 0.7452, "step": 485 }, { "epoch": 1.1690734055354994, "grad_norm": 0.628702700138092, "learning_rate": 0.00016166612250579395, "loss": 0.8069, "step": 486 }, { "epoch": 1.1714801444043321, "grad_norm": 0.813092827796936, "learning_rate": 0.00016151612283762652, "loss": 0.7477, "step": 487 }, { "epoch": 1.1738868832731648, "grad_norm": 0.5661517977714539, "learning_rate": 0.00016136590016133662, "loss": 0.6735, "step": 488 }, { "epoch": 1.1762936221419975, "grad_norm": 1.1136127710342407, "learning_rate": 0.00016121545502151125, "loss": 0.7827, "step": 489 }, { "epoch": 1.1787003610108304, "grad_norm": 0.6130191683769226, "learning_rate": 0.00016106478796354382, "loss": 0.734, "step": 490 }, { "epoch": 1.1811070998796631, "grad_norm": 0.6565666198730469, "learning_rate": 0.00016091389953363226, "loss": 0.7703, "step": 491 }, { "epoch": 1.1835138387484958, "grad_norm": 0.6334533095359802, "learning_rate": 0.0001607627902787769, "loss": 0.7702, "step": 492 }, { "epoch": 1.1859205776173285, "grad_norm": 0.6772276759147644, "learning_rate": 0.00016061146074677885, "loss": 0.7541, "step": 493 }, { "epoch": 1.1883273164861612, "grad_norm": 0.6609050035476685, "learning_rate": 0.0001604599114862375, "loss": 0.8008, "step": 494 }, { "epoch": 1.1907340553549939, "grad_norm": 0.6136945486068726, "learning_rate": 0.00016030814304654895, "loss": 0.7762, "step": 495 }, { "epoch": 1.1931407942238268, "grad_norm": 0.6235164999961853, "learning_rate": 0.00016015615597790388, "loss": 0.7238, "step": 496 }, { "epoch": 1.1955475330926595, "grad_norm": 0.6044726371765137, "learning_rate": 0.00016000395083128543, "loss": 0.7855, "step": 497 }, { "epoch": 1.1979542719614922, "grad_norm": 0.6308525204658508, "learning_rate": 0.00015985152815846745, "loss": 0.7599, "step": 498 }, { "epoch": 1.2003610108303249, "grad_norm": 0.6668804287910461, "learning_rate": 0.00015969888851201226, "loss": 0.7976, "step": 499 }, { "epoch": 1.2027677496991576, "grad_norm": 0.6050263047218323, "learning_rate": 0.0001595460324452688, "loss": 0.6982, "step": 500 }, { "epoch": 1.2051744885679905, "grad_norm": 2.436272621154785, "learning_rate": 0.0001593929605123706, "loss": 0.6578, "step": 501 }, { "epoch": 1.2075812274368232, "grad_norm": 0.570894718170166, "learning_rate": 0.00015923967326823368, "loss": 0.7259, "step": 502 }, { "epoch": 1.2099879663056559, "grad_norm": 0.5668876767158508, "learning_rate": 0.00015908617126855466, "loss": 0.7383, "step": 503 }, { "epoch": 1.2123947051744886, "grad_norm": 0.5849233865737915, "learning_rate": 0.00015893245506980866, "loss": 0.6846, "step": 504 }, { "epoch": 1.2148014440433212, "grad_norm": 0.6635212898254395, "learning_rate": 0.00015877852522924732, "loss": 0.8403, "step": 505 }, { "epoch": 1.217208182912154, "grad_norm": 0.5373646020889282, "learning_rate": 0.0001586243823048968, "loss": 0.715, "step": 506 }, { "epoch": 1.2196149217809868, "grad_norm": 0.7217029333114624, "learning_rate": 0.00015847002685555578, "loss": 0.7523, "step": 507 }, { "epoch": 1.2220216606498195, "grad_norm": 2.0628678798675537, "learning_rate": 0.0001583154594407932, "loss": 0.8253, "step": 508 }, { "epoch": 1.2244283995186522, "grad_norm": 0.5775021314620972, "learning_rate": 0.0001581606806209466, "loss": 0.7047, "step": 509 }, { "epoch": 1.226835138387485, "grad_norm": 0.6098729372024536, "learning_rate": 0.00015800569095711982, "loss": 0.73, "step": 510 }, { "epoch": 1.2292418772563176, "grad_norm": 0.5929751396179199, "learning_rate": 0.0001578504910111811, "loss": 0.7643, "step": 511 }, { "epoch": 1.2316486161251503, "grad_norm": 0.5432533025741577, "learning_rate": 0.00015769508134576095, "loss": 0.7061, "step": 512 }, { "epoch": 1.2340553549939832, "grad_norm": 0.6032195091247559, "learning_rate": 0.00015753946252425013, "loss": 0.7682, "step": 513 }, { "epoch": 1.236462093862816, "grad_norm": 0.5717134475708008, "learning_rate": 0.00015738363511079776, "loss": 0.7404, "step": 514 }, { "epoch": 1.2388688327316486, "grad_norm": 0.6095199584960938, "learning_rate": 0.00015722759967030898, "loss": 0.7649, "step": 515 }, { "epoch": 1.2412755716004813, "grad_norm": 0.5780326128005981, "learning_rate": 0.0001570713567684432, "loss": 0.7609, "step": 516 }, { "epoch": 1.243682310469314, "grad_norm": 0.5788719058036804, "learning_rate": 0.00015691490697161182, "loss": 0.7187, "step": 517 }, { "epoch": 1.246089049338147, "grad_norm": 1.111929178237915, "learning_rate": 0.00015675825084697636, "loss": 0.7952, "step": 518 }, { "epoch": 1.2484957882069796, "grad_norm": 9.908884048461914, "learning_rate": 0.00015660138896244624, "loss": 0.8249, "step": 519 }, { "epoch": 1.2509025270758123, "grad_norm": 0.9300994873046875, "learning_rate": 0.00015644432188667695, "loss": 0.8153, "step": 520 }, { "epoch": 1.2509025270758123, "eval_loss": 0.5291587710380554, "eval_runtime": 2128.4115, "eval_samples_per_second": 1.79, "eval_steps_per_second": 0.895, "step": 520 }, { "epoch": 1.253309265944645, "grad_norm": 0.5882400870323181, "learning_rate": 0.0001562870501890676, "loss": 0.7851, "step": 521 }, { "epoch": 1.2557160048134777, "grad_norm": 0.6073246598243713, "learning_rate": 0.0001561295744397593, "loss": 0.7122, "step": 522 }, { "epoch": 1.2581227436823106, "grad_norm": 0.5966162085533142, "learning_rate": 0.00015597189520963277, "loss": 0.7448, "step": 523 }, { "epoch": 1.2605294825511433, "grad_norm": 0.5820335149765015, "learning_rate": 0.00015581401307030647, "loss": 0.7836, "step": 524 }, { "epoch": 1.262936221419976, "grad_norm": 0.6412204504013062, "learning_rate": 0.0001556559285941344, "loss": 0.7959, "step": 525 }, { "epoch": 1.2653429602888087, "grad_norm": 2.784876823425293, "learning_rate": 0.00015549764235420405, "loss": 0.8096, "step": 526 }, { "epoch": 1.2677496991576414, "grad_norm": 0.6194364428520203, "learning_rate": 0.00015533915492433443, "loss": 0.8003, "step": 527 }, { "epoch": 1.2701564380264743, "grad_norm": 0.5962981581687927, "learning_rate": 0.00015518046687907377, "loss": 0.6985, "step": 528 }, { "epoch": 1.2725631768953067, "grad_norm": 0.5945665240287781, "learning_rate": 0.0001550215787936977, "loss": 0.7912, "step": 529 }, { "epoch": 1.2749699157641396, "grad_norm": 0.5881643295288086, "learning_rate": 0.000154862491244207, "loss": 0.7454, "step": 530 }, { "epoch": 1.2773766546329723, "grad_norm": 0.5845112204551697, "learning_rate": 0.0001547032048073255, "loss": 0.7889, "step": 531 }, { "epoch": 1.279783393501805, "grad_norm": 0.600200355052948, "learning_rate": 0.00015454372006049803, "loss": 0.759, "step": 532 }, { "epoch": 1.2821901323706377, "grad_norm": 0.6359658241271973, "learning_rate": 0.0001543840375818884, "loss": 0.7683, "step": 533 }, { "epoch": 1.2845968712394704, "grad_norm": 0.5839866995811462, "learning_rate": 0.0001542241579503772, "loss": 0.726, "step": 534 }, { "epoch": 1.2870036101083033, "grad_norm": 0.5976535081863403, "learning_rate": 0.00015406408174555976, "loss": 0.7545, "step": 535 }, { "epoch": 1.289410348977136, "grad_norm": 0.6069065928459167, "learning_rate": 0.000153903809547744, "loss": 0.8069, "step": 536 }, { "epoch": 1.2918170878459687, "grad_norm": 0.6518110632896423, "learning_rate": 0.00015374334193794838, "loss": 0.7702, "step": 537 }, { "epoch": 1.2942238267148014, "grad_norm": 0.5915398597717285, "learning_rate": 0.00015358267949789966, "loss": 0.8045, "step": 538 }, { "epoch": 1.296630565583634, "grad_norm": 0.5987196564674377, "learning_rate": 0.00015342182281003112, "loss": 0.7635, "step": 539 }, { "epoch": 1.299037304452467, "grad_norm": 0.6012448072433472, "learning_rate": 0.00015326077245747999, "loss": 0.7191, "step": 540 }, { "epoch": 1.3014440433212997, "grad_norm": 0.614378035068512, "learning_rate": 0.00015309952902408576, "loss": 0.7204, "step": 541 }, { "epoch": 1.3038507821901324, "grad_norm": 0.6271599531173706, "learning_rate": 0.00015293809309438773, "loss": 0.7552, "step": 542 }, { "epoch": 1.306257521058965, "grad_norm": 0.5646169781684875, "learning_rate": 0.0001527764652536231, "loss": 0.7896, "step": 543 }, { "epoch": 1.3086642599277978, "grad_norm": 0.6039982438087463, "learning_rate": 0.00015261464608772488, "loss": 0.7689, "step": 544 }, { "epoch": 1.3110709987966307, "grad_norm": 0.5979913473129272, "learning_rate": 0.00015245263618331945, "loss": 0.804, "step": 545 }, { "epoch": 1.3134777376654632, "grad_norm": 0.602784276008606, "learning_rate": 0.00015229043612772486, "loss": 0.7656, "step": 546 }, { "epoch": 1.315884476534296, "grad_norm": 0.6027132868766785, "learning_rate": 0.0001521280465089484, "loss": 0.7594, "step": 547 }, { "epoch": 1.3182912154031288, "grad_norm": 0.5821241140365601, "learning_rate": 0.0001519654679156846, "loss": 0.7157, "step": 548 }, { "epoch": 1.3206979542719615, "grad_norm": 0.5676792860031128, "learning_rate": 0.00015180270093731303, "loss": 0.6684, "step": 549 }, { "epoch": 1.3231046931407942, "grad_norm": 0.6748465299606323, "learning_rate": 0.0001516397461638962, "loss": 0.875, "step": 550 }, { "epoch": 1.3255114320096268, "grad_norm": 0.6399796009063721, "learning_rate": 0.00015147660418617743, "loss": 0.7824, "step": 551 }, { "epoch": 1.3279181708784598, "grad_norm": 0.8065871000289917, "learning_rate": 0.00015131327559557867, "loss": 0.7135, "step": 552 }, { "epoch": 1.3303249097472925, "grad_norm": 0.6035123467445374, "learning_rate": 0.00015114976098419842, "loss": 0.745, "step": 553 }, { "epoch": 1.3327316486161251, "grad_norm": 0.5744829773902893, "learning_rate": 0.00015098606094480948, "loss": 0.7301, "step": 554 }, { "epoch": 1.3351383874849578, "grad_norm": 0.591141402721405, "learning_rate": 0.00015082217607085692, "loss": 0.7244, "step": 555 }, { "epoch": 1.3375451263537905, "grad_norm": 0.5961554050445557, "learning_rate": 0.00015065810695645584, "loss": 0.7643, "step": 556 }, { "epoch": 1.3399518652226234, "grad_norm": 0.5940012335777283, "learning_rate": 0.00015049385419638926, "loss": 0.781, "step": 557 }, { "epoch": 1.3423586040914561, "grad_norm": 0.8781886100769043, "learning_rate": 0.00015032941838610597, "loss": 0.7596, "step": 558 }, { "epoch": 1.3447653429602888, "grad_norm": 1.6951484680175781, "learning_rate": 0.00015016480012171828, "loss": 0.8364, "step": 559 }, { "epoch": 1.3471720818291215, "grad_norm": 0.6107861995697021, "learning_rate": 0.00015000000000000001, "loss": 0.7916, "step": 560 }, { "epoch": 1.3495788206979542, "grad_norm": 0.5677779912948608, "learning_rate": 0.00014983501861838425, "loss": 0.7217, "step": 561 }, { "epoch": 1.3519855595667871, "grad_norm": 0.5688561201095581, "learning_rate": 0.00014966985657496114, "loss": 0.7163, "step": 562 }, { "epoch": 1.3543922984356198, "grad_norm": 0.6165011525154114, "learning_rate": 0.00014950451446847578, "loss": 0.7692, "step": 563 }, { "epoch": 1.3567990373044525, "grad_norm": 0.6124313473701477, "learning_rate": 0.00014933899289832603, "loss": 0.7072, "step": 564 }, { "epoch": 1.3592057761732852, "grad_norm": 0.6154414415359497, "learning_rate": 0.0001491732924645604, "loss": 0.6805, "step": 565 }, { "epoch": 1.3616125150421179, "grad_norm": 0.5777294635772705, "learning_rate": 0.0001490074137678757, "loss": 0.7506, "step": 566 }, { "epoch": 1.3640192539109506, "grad_norm": 0.554263710975647, "learning_rate": 0.00014884135740961504, "loss": 0.6594, "step": 567 }, { "epoch": 1.3664259927797833, "grad_norm": 0.6005025506019592, "learning_rate": 0.00014867512399176563, "loss": 0.7548, "step": 568 }, { "epoch": 1.3688327316486162, "grad_norm": 0.6279603242874146, "learning_rate": 0.0001485087141169565, "loss": 0.7967, "step": 569 }, { "epoch": 1.3712394705174489, "grad_norm": 0.6061936020851135, "learning_rate": 0.00014834212838845637, "loss": 0.7711, "step": 570 }, { "epoch": 1.3736462093862816, "grad_norm": 0.5648444890975952, "learning_rate": 0.00014817536741017152, "loss": 0.745, "step": 571 }, { "epoch": 1.3760529482551143, "grad_norm": 0.586818277835846, "learning_rate": 0.0001480084317866435, "loss": 0.8276, "step": 572 }, { "epoch": 1.378459687123947, "grad_norm": 0.5470929145812988, "learning_rate": 0.00014784132212304694, "loss": 0.673, "step": 573 }, { "epoch": 1.3808664259927799, "grad_norm": 0.567538857460022, "learning_rate": 0.0001476740390251875, "loss": 0.7394, "step": 574 }, { "epoch": 1.3832731648616126, "grad_norm": 0.5874766111373901, "learning_rate": 0.0001475065830994995, "loss": 0.7675, "step": 575 }, { "epoch": 1.3856799037304453, "grad_norm": 0.5491836667060852, "learning_rate": 0.0001473389549530438, "loss": 0.7216, "step": 576 }, { "epoch": 1.388086642599278, "grad_norm": 0.5456269383430481, "learning_rate": 0.00014717115519350567, "loss": 0.702, "step": 577 }, { "epoch": 1.3904933814681106, "grad_norm": 0.592060387134552, "learning_rate": 0.00014700318442919242, "loss": 0.7458, "step": 578 }, { "epoch": 1.3929001203369435, "grad_norm": 0.5611606240272522, "learning_rate": 0.00014683504326903134, "loss": 0.6829, "step": 579 }, { "epoch": 1.3953068592057762, "grad_norm": 0.5985727906227112, "learning_rate": 0.00014666673232256738, "loss": 0.7154, "step": 580 }, { "epoch": 1.397713598074609, "grad_norm": 0.5680240392684937, "learning_rate": 0.00014649825219996106, "loss": 0.755, "step": 581 }, { "epoch": 1.4001203369434416, "grad_norm": 0.5446626543998718, "learning_rate": 0.00014632960351198618, "loss": 0.6367, "step": 582 }, { "epoch": 1.4025270758122743, "grad_norm": 0.5872788429260254, "learning_rate": 0.0001461607868700276, "loss": 0.7366, "step": 583 }, { "epoch": 1.4049338146811072, "grad_norm": 0.5854357481002808, "learning_rate": 0.0001459918028860791, "loss": 0.7698, "step": 584 }, { "epoch": 1.4073405535499397, "grad_norm": 0.5797271728515625, "learning_rate": 0.00014582265217274104, "loss": 0.7483, "step": 585 }, { "epoch": 1.4097472924187726, "grad_norm": 0.5771715044975281, "learning_rate": 0.00014565333534321826, "loss": 0.7324, "step": 586 }, { "epoch": 1.4121540312876053, "grad_norm": 0.595783531665802, "learning_rate": 0.0001454838530113178, "loss": 0.7375, "step": 587 }, { "epoch": 1.414560770156438, "grad_norm": 1.5128209590911865, "learning_rate": 0.00014531420579144656, "loss": 0.777, "step": 588 }, { "epoch": 1.4169675090252707, "grad_norm": 0.5895392298698425, "learning_rate": 0.00014514439429860943, "loss": 0.7731, "step": 589 }, { "epoch": 1.4193742478941034, "grad_norm": 0.6568642854690552, "learning_rate": 0.0001449744191484066, "loss": 0.7819, "step": 590 }, { "epoch": 1.4217809867629363, "grad_norm": 0.633415937423706, "learning_rate": 0.00014480428095703165, "loss": 0.7659, "step": 591 }, { "epoch": 1.424187725631769, "grad_norm": 0.598420262336731, "learning_rate": 0.0001446339803412692, "loss": 0.7249, "step": 592 }, { "epoch": 1.4265944645006017, "grad_norm": 0.5692817568778992, "learning_rate": 0.00014446351791849276, "loss": 0.6993, "step": 593 }, { "epoch": 1.4290012033694344, "grad_norm": 0.5995094180107117, "learning_rate": 0.00014429289430666227, "loss": 0.7211, "step": 594 }, { "epoch": 1.431407942238267, "grad_norm": 0.611709475517273, "learning_rate": 0.00014412211012432212, "loss": 0.7028, "step": 595 }, { "epoch": 1.4338146811071, "grad_norm": 0.620934009552002, "learning_rate": 0.0001439511659905988, "loss": 0.8071, "step": 596 }, { "epoch": 1.4362214199759327, "grad_norm": 0.6703519225120544, "learning_rate": 0.00014378006252519865, "loss": 0.8192, "step": 597 }, { "epoch": 1.4386281588447654, "grad_norm": 0.5902607440948486, "learning_rate": 0.00014360880034840554, "loss": 0.7817, "step": 598 }, { "epoch": 1.441034897713598, "grad_norm": 0.6152598857879639, "learning_rate": 0.0001434373800810788, "loss": 0.724, "step": 599 }, { "epoch": 1.4434416365824307, "grad_norm": 0.621351420879364, "learning_rate": 0.00014326580234465085, "loss": 0.7642, "step": 600 }, { "epoch": 1.4458483754512637, "grad_norm": 0.604678213596344, "learning_rate": 0.0001430940677611249, "loss": 0.7619, "step": 601 }, { "epoch": 1.4482551143200963, "grad_norm": 0.5845671892166138, "learning_rate": 0.00014292217695307285, "loss": 0.7458, "step": 602 }, { "epoch": 1.450661853188929, "grad_norm": 0.5957922339439392, "learning_rate": 0.00014275013054363287, "loss": 0.744, "step": 603 }, { "epoch": 1.4530685920577617, "grad_norm": 0.6502370834350586, "learning_rate": 0.00014257792915650728, "loss": 0.8039, "step": 604 }, { "epoch": 1.4554753309265944, "grad_norm": 0.5479421019554138, "learning_rate": 0.00014240557341596018, "loss": 0.7329, "step": 605 }, { "epoch": 1.4578820697954271, "grad_norm": 0.5662285685539246, "learning_rate": 0.00014223306394681528, "loss": 0.7869, "step": 606 }, { "epoch": 1.4602888086642598, "grad_norm": 0.5961649417877197, "learning_rate": 0.00014206040137445348, "loss": 0.7596, "step": 607 }, { "epoch": 1.4626955475330927, "grad_norm": 0.6080659627914429, "learning_rate": 0.0001418875863248109, "loss": 0.7844, "step": 608 }, { "epoch": 1.4651022864019254, "grad_norm": 0.5647626519203186, "learning_rate": 0.0001417146194243762, "loss": 0.7329, "step": 609 }, { "epoch": 1.467509025270758, "grad_norm": 0.6011881232261658, "learning_rate": 0.00014154150130018866, "loss": 0.8094, "step": 610 }, { "epoch": 1.4699157641395908, "grad_norm": 0.5778882503509521, "learning_rate": 0.00014136823257983577, "loss": 0.7549, "step": 611 }, { "epoch": 1.4723225030084235, "grad_norm": 0.565356433391571, "learning_rate": 0.0001411948138914509, "loss": 0.7033, "step": 612 }, { "epoch": 1.4747292418772564, "grad_norm": 0.5538718104362488, "learning_rate": 0.0001410212458637112, "loss": 0.7296, "step": 613 }, { "epoch": 1.477135980746089, "grad_norm": 0.5984217524528503, "learning_rate": 0.00014084752912583504, "loss": 0.7727, "step": 614 }, { "epoch": 1.4795427196149218, "grad_norm": 0.6261722445487976, "learning_rate": 0.00014067366430758004, "loss": 0.7921, "step": 615 }, { "epoch": 1.4819494584837545, "grad_norm": 0.5742819309234619, "learning_rate": 0.00014049965203924054, "loss": 0.7003, "step": 616 }, { "epoch": 1.4843561973525872, "grad_norm": 0.6170026063919067, "learning_rate": 0.00014032549295164552, "loss": 0.7288, "step": 617 }, { "epoch": 1.48676293622142, "grad_norm": 0.5423979163169861, "learning_rate": 0.00014015118767615606, "loss": 0.6443, "step": 618 }, { "epoch": 1.4891696750902528, "grad_norm": 0.5620379447937012, "learning_rate": 0.0001399767368446634, "loss": 0.7297, "step": 619 }, { "epoch": 1.4915764139590855, "grad_norm": 0.5677056312561035, "learning_rate": 0.00013980214108958624, "loss": 0.7511, "step": 620 }, { "epoch": 1.4939831528279182, "grad_norm": 0.5550370216369629, "learning_rate": 0.00013962740104386876, "loss": 0.7991, "step": 621 }, { "epoch": 1.4963898916967509, "grad_norm": 0.568217933177948, "learning_rate": 0.00013945251734097828, "loss": 0.7294, "step": 622 }, { "epoch": 1.4987966305655838, "grad_norm": 0.5843414664268494, "learning_rate": 0.0001392774906149028, "loss": 0.7222, "step": 623 }, { "epoch": 1.5012033694344162, "grad_norm": 0.5477676391601562, "learning_rate": 0.00013910232150014885, "loss": 0.708, "step": 624 }, { "epoch": 1.5012033694344162, "eval_loss": 0.5220046043395996, "eval_runtime": 2127.0862, "eval_samples_per_second": 1.791, "eval_steps_per_second": 0.896, "step": 624 }, { "epoch": 1.5036101083032491, "grad_norm": 0.5987933874130249, "learning_rate": 0.00013892701063173918, "loss": 0.7728, "step": 625 }, { "epoch": 1.5060168471720818, "grad_norm": 0.6070023775100708, "learning_rate": 0.0001387515586452103, "loss": 0.7626, "step": 626 }, { "epoch": 1.5084235860409145, "grad_norm": 0.5851220488548279, "learning_rate": 0.00013857596617661047, "loss": 0.7314, "step": 627 }, { "epoch": 1.5108303249097474, "grad_norm": 0.5754553079605103, "learning_rate": 0.00013840023386249713, "loss": 0.6743, "step": 628 }, { "epoch": 1.51323706377858, "grad_norm": 0.5737704634666443, "learning_rate": 0.00013822436233993475, "loss": 0.7632, "step": 629 }, { "epoch": 1.5156438026474128, "grad_norm": 0.5909572243690491, "learning_rate": 0.0001380483522464923, "loss": 0.7589, "step": 630 }, { "epoch": 1.5180505415162455, "grad_norm": 0.6345940232276917, "learning_rate": 0.00013787220422024134, "loss": 0.7456, "step": 631 }, { "epoch": 1.5204572803850782, "grad_norm": 0.5519130229949951, "learning_rate": 0.0001376959188997532, "loss": 0.6693, "step": 632 }, { "epoch": 1.522864019253911, "grad_norm": 0.5597511529922485, "learning_rate": 0.00013751949692409718, "loss": 0.7598, "step": 633 }, { "epoch": 1.5252707581227436, "grad_norm": 0.6272690296173096, "learning_rate": 0.00013734293893283783, "loss": 0.7493, "step": 634 }, { "epoch": 1.5276774969915765, "grad_norm": 0.5946121215820312, "learning_rate": 0.00013716624556603274, "loss": 0.7354, "step": 635 }, { "epoch": 1.530084235860409, "grad_norm": 0.5691307783126831, "learning_rate": 0.00013698941746423046, "loss": 0.6958, "step": 636 }, { "epoch": 1.532490974729242, "grad_norm": 0.6021568775177002, "learning_rate": 0.00013681245526846783, "loss": 0.7525, "step": 637 }, { "epoch": 1.5348977135980746, "grad_norm": 0.5632941126823425, "learning_rate": 0.00013663535962026778, "loss": 0.7736, "step": 638 }, { "epoch": 1.5373044524669073, "grad_norm": 0.54964280128479, "learning_rate": 0.00013645813116163713, "loss": 0.6586, "step": 639 }, { "epoch": 1.5397111913357402, "grad_norm": 0.5408549904823303, "learning_rate": 0.0001362807705350641, "loss": 0.6651, "step": 640 }, { "epoch": 1.5421179302045727, "grad_norm": 0.5978827476501465, "learning_rate": 0.00013610327838351613, "loss": 0.7963, "step": 641 }, { "epoch": 1.5445246690734056, "grad_norm": 0.5659887194633484, "learning_rate": 0.00013592565535043737, "loss": 0.7226, "step": 642 }, { "epoch": 1.5469314079422383, "grad_norm": 0.5809053778648376, "learning_rate": 0.00013574790207974646, "loss": 0.75, "step": 643 }, { "epoch": 1.549338146811071, "grad_norm": 0.5693362951278687, "learning_rate": 0.0001355700192158342, "loss": 0.7166, "step": 644 }, { "epoch": 1.5517448856799039, "grad_norm": 0.5767548084259033, "learning_rate": 0.00013539200740356118, "loss": 0.785, "step": 645 }, { "epoch": 1.5541516245487363, "grad_norm": 0.5969561338424683, "learning_rate": 0.0001352138672882555, "loss": 0.7005, "step": 646 }, { "epoch": 1.5565583634175693, "grad_norm": 0.5951581597328186, "learning_rate": 0.0001350355995157103, "loss": 0.7838, "step": 647 }, { "epoch": 1.558965102286402, "grad_norm": 0.5613061189651489, "learning_rate": 0.00013485720473218154, "loss": 0.7266, "step": 648 }, { "epoch": 1.5613718411552346, "grad_norm": 0.5469250082969666, "learning_rate": 0.00013467868358438563, "loss": 0.6417, "step": 649 }, { "epoch": 1.5637785800240676, "grad_norm": 0.5973423719406128, "learning_rate": 0.00013450003671949706, "loss": 0.7185, "step": 650 }, { "epoch": 1.5661853188929, "grad_norm": 0.5983613133430481, "learning_rate": 0.00013432126478514614, "loss": 0.7353, "step": 651 }, { "epoch": 1.568592057761733, "grad_norm": 0.6064116358757019, "learning_rate": 0.00013414236842941644, "loss": 0.7287, "step": 652 }, { "epoch": 1.5709987966305656, "grad_norm": 0.6146507263183594, "learning_rate": 0.0001339633483008427, "loss": 0.7748, "step": 653 }, { "epoch": 1.5734055354993983, "grad_norm": 0.5666840672492981, "learning_rate": 0.00013378420504840828, "loss": 0.6801, "step": 654 }, { "epoch": 1.575812274368231, "grad_norm": 0.5750359296798706, "learning_rate": 0.00013360493932154302, "loss": 0.8198, "step": 655 }, { "epoch": 1.5782190132370637, "grad_norm": 0.5974833965301514, "learning_rate": 0.0001334255517701206, "loss": 0.7081, "step": 656 }, { "epoch": 1.5806257521058966, "grad_norm": 0.637682318687439, "learning_rate": 0.0001332460430444564, "loss": 0.8031, "step": 657 }, { "epoch": 1.583032490974729, "grad_norm": 1.503692388534546, "learning_rate": 0.00013306641379530514, "loss": 0.7415, "step": 658 }, { "epoch": 1.585439229843562, "grad_norm": 0.5546070337295532, "learning_rate": 0.00013288666467385833, "loss": 0.6477, "step": 659 }, { "epoch": 1.5878459687123947, "grad_norm": 0.6268326640129089, "learning_rate": 0.00013270679633174218, "loss": 0.8012, "step": 660 }, { "epoch": 1.5902527075812274, "grad_norm": 0.6226966381072998, "learning_rate": 0.000132526809421015, "loss": 0.7937, "step": 661 }, { "epoch": 1.5926594464500603, "grad_norm": 0.5584068894386292, "learning_rate": 0.00013234670459416498, "loss": 0.7075, "step": 662 }, { "epoch": 1.5950661853188928, "grad_norm": 0.6198967695236206, "learning_rate": 0.00013216648250410776, "loss": 0.7418, "step": 663 }, { "epoch": 1.5974729241877257, "grad_norm": 0.6915008425712585, "learning_rate": 0.00013198614380418412, "loss": 0.748, "step": 664 }, { "epoch": 1.5998796630565584, "grad_norm": 0.5626258254051208, "learning_rate": 0.00013180568914815752, "loss": 0.7825, "step": 665 }, { "epoch": 1.602286401925391, "grad_norm": 0.5585489869117737, "learning_rate": 0.00013162511919021178, "loss": 0.7421, "step": 666 }, { "epoch": 1.604693140794224, "grad_norm": 0.5733473300933838, "learning_rate": 0.00013144443458494882, "loss": 0.7747, "step": 667 }, { "epoch": 1.6070998796630565, "grad_norm": 0.5311198830604553, "learning_rate": 0.00013126363598738603, "loss": 0.7134, "step": 668 }, { "epoch": 1.6095066185318894, "grad_norm": 0.6149599552154541, "learning_rate": 0.00013108272405295415, "loss": 0.7534, "step": 669 }, { "epoch": 1.611913357400722, "grad_norm": 0.577946662902832, "learning_rate": 0.00013090169943749476, "loss": 0.7694, "step": 670 }, { "epoch": 1.6143200962695547, "grad_norm": 0.5462052822113037, "learning_rate": 0.00013072056279725788, "loss": 0.7735, "step": 671 }, { "epoch": 1.6167268351383874, "grad_norm": 0.7406601905822754, "learning_rate": 0.00013053931478889975, "loss": 0.7516, "step": 672 }, { "epoch": 1.6191335740072201, "grad_norm": 0.5492813587188721, "learning_rate": 0.00013035795606948023, "loss": 0.6862, "step": 673 }, { "epoch": 1.621540312876053, "grad_norm": 0.587531328201294, "learning_rate": 0.0001301764872964606, "loss": 0.7076, "step": 674 }, { "epoch": 1.6239470517448855, "grad_norm": 0.5853806138038635, "learning_rate": 0.00012999490912770107, "loss": 0.7717, "step": 675 }, { "epoch": 1.6263537906137184, "grad_norm": 0.628341555595398, "learning_rate": 0.00012981322222145846, "loss": 0.7578, "step": 676 }, { "epoch": 1.6287605294825511, "grad_norm": 0.544722318649292, "learning_rate": 0.00012963142723638378, "loss": 0.7153, "step": 677 }, { "epoch": 1.6311672683513838, "grad_norm": 0.5789739489555359, "learning_rate": 0.00012944952483151978, "loss": 0.7603, "step": 678 }, { "epoch": 1.6335740072202167, "grad_norm": 0.5968687534332275, "learning_rate": 0.00012926751566629875, "loss": 0.8041, "step": 679 }, { "epoch": 1.6359807460890492, "grad_norm": 0.5878944396972656, "learning_rate": 0.0001290854004005399, "loss": 0.7511, "step": 680 }, { "epoch": 1.6383874849578821, "grad_norm": 0.5231705904006958, "learning_rate": 0.00012890317969444716, "loss": 0.6831, "step": 681 }, { "epoch": 1.6407942238267148, "grad_norm": 0.5970608592033386, "learning_rate": 0.00012872085420860665, "loss": 0.7723, "step": 682 }, { "epoch": 1.6432009626955475, "grad_norm": 0.5778161883354187, "learning_rate": 0.00012853842460398428, "loss": 0.6997, "step": 683 }, { "epoch": 1.6456077015643804, "grad_norm": 0.5468333959579468, "learning_rate": 0.00012835589154192357, "loss": 0.6658, "step": 684 }, { "epoch": 1.6480144404332129, "grad_norm": 0.6297695636749268, "learning_rate": 0.00012817325568414297, "loss": 0.7894, "step": 685 }, { "epoch": 1.6504211793020458, "grad_norm": 0.6080107092857361, "learning_rate": 0.00012799051769273362, "loss": 0.7775, "step": 686 }, { "epoch": 1.6528279181708785, "grad_norm": 0.631283700466156, "learning_rate": 0.0001278076782301569, "loss": 0.7035, "step": 687 }, { "epoch": 1.6552346570397112, "grad_norm": 0.5465618371963501, "learning_rate": 0.00012762473795924204, "loss": 0.6744, "step": 688 }, { "epoch": 1.6576413959085439, "grad_norm": 0.589749813079834, "learning_rate": 0.00012744169754318375, "loss": 0.77, "step": 689 }, { "epoch": 1.6600481347773766, "grad_norm": 0.614506721496582, "learning_rate": 0.0001272585576455398, "loss": 0.8041, "step": 690 }, { "epoch": 1.6624548736462095, "grad_norm": 0.5538762211799622, "learning_rate": 0.00012707531893022854, "loss": 0.7046, "step": 691 }, { "epoch": 1.6648616125150422, "grad_norm": 0.593899667263031, "learning_rate": 0.00012689198206152657, "loss": 0.7638, "step": 692 }, { "epoch": 1.6672683513838749, "grad_norm": 0.5845565795898438, "learning_rate": 0.0001267085477040664, "loss": 0.7343, "step": 693 }, { "epoch": 1.6696750902527075, "grad_norm": 0.6129235029220581, "learning_rate": 0.00012652501652283377, "loss": 0.7712, "step": 694 }, { "epoch": 1.6720818291215402, "grad_norm": 0.5438042879104614, "learning_rate": 0.00012634138918316568, "loss": 0.6948, "step": 695 }, { "epoch": 1.6744885679903732, "grad_norm": 0.6199984550476074, "learning_rate": 0.0001261576663507475, "loss": 0.7399, "step": 696 }, { "epoch": 1.6768953068592056, "grad_norm": 0.5407736897468567, "learning_rate": 0.00012597384869161084, "loss": 0.6513, "step": 697 }, { "epoch": 1.6793020457280385, "grad_norm": 0.598435640335083, "learning_rate": 0.00012578993687213118, "loss": 0.7587, "step": 698 }, { "epoch": 1.6817087845968712, "grad_norm": 0.5624896883964539, "learning_rate": 0.00012560593155902522, "loss": 0.7209, "step": 699 }, { "epoch": 1.684115523465704, "grad_norm": 0.571467936038971, "learning_rate": 0.00012542183341934872, "loss": 0.7259, "step": 700 }, { "epoch": 1.6865222623345368, "grad_norm": 0.5723950266838074, "learning_rate": 0.00012523764312049376, "loss": 0.7309, "step": 701 }, { "epoch": 1.6889290012033693, "grad_norm": 0.5856860876083374, "learning_rate": 0.00012505336133018672, "loss": 0.7143, "step": 702 }, { "epoch": 1.6913357400722022, "grad_norm": 0.5989895462989807, "learning_rate": 0.0001248689887164855, "loss": 0.8027, "step": 703 }, { "epoch": 1.693742478941035, "grad_norm": 0.54822838306427, "learning_rate": 0.00012468452594777737, "loss": 0.7433, "step": 704 }, { "epoch": 1.6961492178098676, "grad_norm": 0.5095949172973633, "learning_rate": 0.0001244999736927764, "loss": 0.7476, "step": 705 }, { "epoch": 1.6985559566787005, "grad_norm": 0.5592285990715027, "learning_rate": 0.00012431533262052098, "loss": 0.7348, "step": 706 }, { "epoch": 1.700962695547533, "grad_norm": 0.6262854933738708, "learning_rate": 0.00012413060340037163, "loss": 0.7588, "step": 707 }, { "epoch": 1.703369434416366, "grad_norm": 0.5886446833610535, "learning_rate": 0.00012394578670200826, "loss": 0.7187, "step": 708 }, { "epoch": 1.7057761732851986, "grad_norm": 0.5735413432121277, "learning_rate": 0.000123760883195428, "loss": 0.7656, "step": 709 }, { "epoch": 1.7081829121540313, "grad_norm": 0.572733461856842, "learning_rate": 0.00012357589355094275, "loss": 0.7673, "step": 710 }, { "epoch": 1.710589651022864, "grad_norm": 0.584449291229248, "learning_rate": 0.00012339081843917645, "loss": 0.794, "step": 711 }, { "epoch": 1.7129963898916967, "grad_norm": 0.5957013368606567, "learning_rate": 0.00012320565853106316, "loss": 0.7713, "step": 712 }, { "epoch": 1.7154031287605296, "grad_norm": 0.5649765729904175, "learning_rate": 0.00012302041449784409, "loss": 0.6923, "step": 713 }, { "epoch": 1.717809867629362, "grad_norm": 0.5303330421447754, "learning_rate": 0.00012283508701106557, "loss": 0.6851, "step": 714 }, { "epoch": 1.720216606498195, "grad_norm": 0.5509385466575623, "learning_rate": 0.00012264967674257646, "loss": 0.7127, "step": 715 }, { "epoch": 1.7226233453670277, "grad_norm": 0.563927412033081, "learning_rate": 0.00012246418436452562, "loss": 0.7133, "step": 716 }, { "epoch": 1.7250300842358604, "grad_norm": 0.5566623210906982, "learning_rate": 0.00012227861054935968, "loss": 0.7248, "step": 717 }, { "epoch": 1.7274368231046933, "grad_norm": 0.5854185819625854, "learning_rate": 0.00012209295596982042, "loss": 0.7259, "step": 718 }, { "epoch": 1.7298435619735257, "grad_norm": 0.5715108513832092, "learning_rate": 0.00012190722129894248, "loss": 0.7351, "step": 719 }, { "epoch": 1.7322503008423586, "grad_norm": 0.5447136759757996, "learning_rate": 0.00012172140721005079, "loss": 0.72, "step": 720 }, { "epoch": 1.7346570397111913, "grad_norm": 0.594906210899353, "learning_rate": 0.00012153551437675821, "loss": 0.7274, "step": 721 }, { "epoch": 1.737063778580024, "grad_norm": 0.5668012499809265, "learning_rate": 0.00012134954347296305, "loss": 0.7827, "step": 722 }, { "epoch": 1.739470517448857, "grad_norm": 0.5607947111129761, "learning_rate": 0.00012116349517284665, "loss": 0.7455, "step": 723 }, { "epoch": 1.7418772563176894, "grad_norm": 0.5687163472175598, "learning_rate": 0.00012097737015087094, "loss": 0.7726, "step": 724 }, { "epoch": 1.7442839951865223, "grad_norm": 0.562584638595581, "learning_rate": 0.00012079116908177593, "loss": 0.7341, "step": 725 }, { "epoch": 1.746690734055355, "grad_norm": 0.5559719800949097, "learning_rate": 0.00012060489264057742, "loss": 0.7493, "step": 726 }, { "epoch": 1.7490974729241877, "grad_norm": 0.5658311247825623, "learning_rate": 0.00012041854150256433, "loss": 0.6888, "step": 727 }, { "epoch": 1.7515042117930204, "grad_norm": 0.6048003435134888, "learning_rate": 0.00012023211634329643, "loss": 0.7295, "step": 728 }, { "epoch": 1.7515042117930204, "eval_loss": 0.5096825957298279, "eval_runtime": 2126.974, "eval_samples_per_second": 1.791, "eval_steps_per_second": 0.896, "step": 728 }, { "epoch": 1.753910950661853, "grad_norm": 0.5162447094917297, "learning_rate": 0.00012004561783860186, "loss": 0.6772, "step": 729 }, { "epoch": 1.756317689530686, "grad_norm": 0.5578247308731079, "learning_rate": 0.00011985904666457455, "loss": 0.7846, "step": 730 }, { "epoch": 1.7587244283995187, "grad_norm": 0.5341318249702454, "learning_rate": 0.00011967240349757203, "loss": 0.7434, "step": 731 }, { "epoch": 1.7611311672683514, "grad_norm": 0.5623995065689087, "learning_rate": 0.00011948568901421263, "loss": 0.7163, "step": 732 }, { "epoch": 1.763537906137184, "grad_norm": 0.5329893231391907, "learning_rate": 0.00011929890389137337, "loss": 0.7146, "step": 733 }, { "epoch": 1.7659446450060168, "grad_norm": 0.5228726863861084, "learning_rate": 0.00011911204880618729, "loss": 0.6295, "step": 734 }, { "epoch": 1.7683513838748497, "grad_norm": 0.5674324035644531, "learning_rate": 0.00011892512443604102, "loss": 0.7107, "step": 735 }, { "epoch": 1.7707581227436822, "grad_norm": 0.5487015843391418, "learning_rate": 0.00011873813145857249, "loss": 0.6899, "step": 736 }, { "epoch": 1.773164861612515, "grad_norm": 0.5783795118331909, "learning_rate": 0.00011855107055166814, "loss": 0.7305, "step": 737 }, { "epoch": 1.7755716004813478, "grad_norm": 0.5087436437606812, "learning_rate": 0.00011836394239346091, "loss": 0.647, "step": 738 }, { "epoch": 1.7779783393501805, "grad_norm": 0.5917911529541016, "learning_rate": 0.00011817674766232734, "loss": 0.7596, "step": 739 }, { "epoch": 1.7803850782190134, "grad_norm": 0.5736182928085327, "learning_rate": 0.00011798948703688539, "loss": 0.7309, "step": 740 }, { "epoch": 1.7827918170878458, "grad_norm": 0.6054282188415527, "learning_rate": 0.00011780216119599192, "loss": 0.7805, "step": 741 }, { "epoch": 1.7851985559566788, "grad_norm": 0.5418753623962402, "learning_rate": 0.00011761477081874015, "loss": 0.6868, "step": 742 }, { "epoch": 1.7876052948255114, "grad_norm": 0.5932491421699524, "learning_rate": 0.00011742731658445738, "loss": 0.7065, "step": 743 }, { "epoch": 1.7900120336943441, "grad_norm": 0.5726205110549927, "learning_rate": 0.00011723979917270218, "loss": 0.7667, "step": 744 }, { "epoch": 1.792418772563177, "grad_norm": 0.5876070857048035, "learning_rate": 0.0001170522192632624, "loss": 0.7441, "step": 745 }, { "epoch": 1.7948255114320095, "grad_norm": 0.5843150615692139, "learning_rate": 0.00011686457753615228, "loss": 0.7348, "step": 746 }, { "epoch": 1.7972322503008424, "grad_norm": 0.5844396352767944, "learning_rate": 0.00011667687467161024, "loss": 0.7859, "step": 747 }, { "epoch": 1.7996389891696751, "grad_norm": 0.6563355326652527, "learning_rate": 0.00011648911135009634, "loss": 0.6938, "step": 748 }, { "epoch": 1.8020457280385078, "grad_norm": 0.5793863534927368, "learning_rate": 0.00011630128825228974, "loss": 0.7151, "step": 749 }, { "epoch": 1.8044524669073405, "grad_norm": 0.5459253787994385, "learning_rate": 0.00011611340605908642, "loss": 0.6803, "step": 750 }, { "epoch": 1.8068592057761732, "grad_norm": 0.5786491632461548, "learning_rate": 0.00011592546545159645, "loss": 0.737, "step": 751 }, { "epoch": 1.8092659446450061, "grad_norm": 0.5864343047142029, "learning_rate": 0.00011573746711114179, "loss": 0.7081, "step": 752 }, { "epoch": 1.8116726835138386, "grad_norm": 0.5932061076164246, "learning_rate": 0.00011554941171925365, "loss": 0.7936, "step": 753 }, { "epoch": 1.8140794223826715, "grad_norm": 0.5331100821495056, "learning_rate": 0.00011536129995766996, "loss": 0.6987, "step": 754 }, { "epoch": 1.8164861612515042, "grad_norm": 0.5045899152755737, "learning_rate": 0.00011517313250833317, "loss": 0.6788, "step": 755 }, { "epoch": 1.8188929001203369, "grad_norm": 0.6275632381439209, "learning_rate": 0.0001149849100533875, "loss": 0.7662, "step": 756 }, { "epoch": 1.8212996389891698, "grad_norm": 0.599075973033905, "learning_rate": 0.00011479663327517667, "loss": 0.8063, "step": 757 }, { "epoch": 1.8237063778580023, "grad_norm": 0.5953880548477173, "learning_rate": 0.00011460830285624118, "loss": 0.7458, "step": 758 }, { "epoch": 1.8261131167268352, "grad_norm": 0.5669587254524231, "learning_rate": 0.00011441991947931612, "loss": 0.6929, "step": 759 }, { "epoch": 1.8285198555956679, "grad_norm": 0.5455871224403381, "learning_rate": 0.00011423148382732853, "loss": 0.7314, "step": 760 }, { "epoch": 1.8309265944645006, "grad_norm": 0.5651932954788208, "learning_rate": 0.00011404299658339493, "loss": 0.6821, "step": 761 }, { "epoch": 1.8333333333333335, "grad_norm": 0.5726320743560791, "learning_rate": 0.00011385445843081892, "loss": 0.7439, "step": 762 }, { "epoch": 1.835740072202166, "grad_norm": 0.566521167755127, "learning_rate": 0.00011366587005308858, "loss": 0.7675, "step": 763 }, { "epoch": 1.8381468110709989, "grad_norm": 0.6666224002838135, "learning_rate": 0.00011347723213387416, "loss": 0.7255, "step": 764 }, { "epoch": 1.8405535499398316, "grad_norm": 0.549959123134613, "learning_rate": 0.00011328854535702543, "loss": 0.7077, "step": 765 }, { "epoch": 1.8429602888086642, "grad_norm": 0.5623028874397278, "learning_rate": 0.0001130998104065693, "loss": 0.7124, "step": 766 }, { "epoch": 1.845367027677497, "grad_norm": 0.5426890254020691, "learning_rate": 0.00011291102796670734, "loss": 0.7083, "step": 767 }, { "epoch": 1.8477737665463296, "grad_norm": 0.5995566844940186, "learning_rate": 0.00011272219872181322, "loss": 0.7629, "step": 768 }, { "epoch": 1.8501805054151625, "grad_norm": 0.5425085425376892, "learning_rate": 0.00011253332335643043, "loss": 0.733, "step": 769 }, { "epoch": 1.8525872442839952, "grad_norm": 0.5052370429039001, "learning_rate": 0.00011234440255526948, "loss": 0.5811, "step": 770 }, { "epoch": 1.854993983152828, "grad_norm": 0.5724800229072571, "learning_rate": 0.0001121554370032057, "loss": 0.7406, "step": 771 }, { "epoch": 1.8574007220216606, "grad_norm": 0.545842707157135, "learning_rate": 0.00011196642738527659, "loss": 0.7094, "step": 772 }, { "epoch": 1.8598074608904933, "grad_norm": 0.5419096350669861, "learning_rate": 0.00011177737438667948, "loss": 0.6724, "step": 773 }, { "epoch": 1.8622141997593262, "grad_norm": 0.5490409135818481, "learning_rate": 0.00011158827869276887, "loss": 0.7153, "step": 774 }, { "epoch": 1.8646209386281587, "grad_norm": 0.5598011016845703, "learning_rate": 0.00011139914098905406, "loss": 0.6969, "step": 775 }, { "epoch": 1.8670276774969916, "grad_norm": 0.6060622334480286, "learning_rate": 0.00011120996196119675, "loss": 0.767, "step": 776 }, { "epoch": 1.8694344163658243, "grad_norm": 0.6176811456680298, "learning_rate": 0.0001110207422950083, "loss": 0.733, "step": 777 }, { "epoch": 1.871841155234657, "grad_norm": 0.5489953756332397, "learning_rate": 0.00011083148267644747, "loss": 0.7313, "step": 778 }, { "epoch": 1.87424789410349, "grad_norm": 0.6024463176727295, "learning_rate": 0.00011064218379161786, "loss": 0.7473, "step": 779 }, { "epoch": 1.8766546329723224, "grad_norm": 0.5825113654136658, "learning_rate": 0.00011045284632676536, "loss": 0.7618, "step": 780 }, { "epoch": 1.8790613718411553, "grad_norm": 0.5687229633331299, "learning_rate": 0.00011026347096827578, "loss": 0.704, "step": 781 }, { "epoch": 1.881468110709988, "grad_norm": 0.5716497898101807, "learning_rate": 0.00011007405840267228, "loss": 0.7865, "step": 782 }, { "epoch": 1.8838748495788207, "grad_norm": 0.5841460227966309, "learning_rate": 0.00010988460931661295, "loss": 0.7429, "step": 783 }, { "epoch": 1.8862815884476536, "grad_norm": 0.5839403867721558, "learning_rate": 0.00010969512439688816, "loss": 0.771, "step": 784 }, { "epoch": 1.888688327316486, "grad_norm": 0.578109085559845, "learning_rate": 0.00010950560433041826, "loss": 0.7438, "step": 785 }, { "epoch": 1.891095066185319, "grad_norm": 0.6067484617233276, "learning_rate": 0.00010931604980425108, "loss": 0.7496, "step": 786 }, { "epoch": 1.8935018050541517, "grad_norm": 0.579124927520752, "learning_rate": 0.00010912646150555919, "loss": 0.7472, "step": 787 }, { "epoch": 1.8959085439229844, "grad_norm": 0.5641288161277771, "learning_rate": 0.00010893684012163779, "loss": 0.7254, "step": 788 }, { "epoch": 1.898315282791817, "grad_norm": 0.5396283268928528, "learning_rate": 0.00010874718633990189, "loss": 0.7074, "step": 789 }, { "epoch": 1.9007220216606497, "grad_norm": 0.5646089911460876, "learning_rate": 0.00010855750084788398, "loss": 0.6699, "step": 790 }, { "epoch": 1.9031287605294827, "grad_norm": 0.529952883720398, "learning_rate": 0.00010836778433323158, "loss": 0.7292, "step": 791 }, { "epoch": 1.9055354993983151, "grad_norm": 0.5230306386947632, "learning_rate": 0.00010817803748370452, "loss": 0.7003, "step": 792 }, { "epoch": 1.907942238267148, "grad_norm": 0.5671941637992859, "learning_rate": 0.00010798826098717276, "loss": 0.7294, "step": 793 }, { "epoch": 1.9103489771359807, "grad_norm": 0.5470700860023499, "learning_rate": 0.00010779845553161362, "loss": 0.685, "step": 794 }, { "epoch": 1.9127557160048134, "grad_norm": 0.5477457642555237, "learning_rate": 0.00010760862180510951, "loss": 0.6632, "step": 795 }, { "epoch": 1.9151624548736463, "grad_norm": 0.5607609748840332, "learning_rate": 0.00010741876049584523, "loss": 0.6872, "step": 796 }, { "epoch": 1.9175691937424788, "grad_norm": 0.5889490246772766, "learning_rate": 0.00010722887229210557, "loss": 0.7256, "step": 797 }, { "epoch": 1.9199759326113117, "grad_norm": 0.546259880065918, "learning_rate": 0.00010703895788227292, "loss": 0.6982, "step": 798 }, { "epoch": 1.9223826714801444, "grad_norm": 0.5511704087257385, "learning_rate": 0.00010684901795482456, "loss": 0.7219, "step": 799 }, { "epoch": 1.924789410348977, "grad_norm": 0.5606042146682739, "learning_rate": 0.00010665905319833041, "loss": 0.676, "step": 800 }, { "epoch": 1.92719614921781, "grad_norm": 0.5675393342971802, "learning_rate": 0.00010646906430145018, "loss": 0.743, "step": 801 }, { "epoch": 1.9296028880866425, "grad_norm": 0.7092172503471375, "learning_rate": 0.00010627905195293135, "loss": 0.6927, "step": 802 }, { "epoch": 1.9320096269554754, "grad_norm": 0.5645425319671631, "learning_rate": 0.00010608901684160624, "loss": 0.6501, "step": 803 }, { "epoch": 1.934416365824308, "grad_norm": 0.5768856406211853, "learning_rate": 0.00010589895965638976, "loss": 0.7101, "step": 804 }, { "epoch": 1.9368231046931408, "grad_norm": 0.5573006868362427, "learning_rate": 0.00010570888108627681, "loss": 0.7372, "step": 805 }, { "epoch": 1.9392298435619735, "grad_norm": 0.5512224435806274, "learning_rate": 0.00010551878182033985, "loss": 0.6656, "step": 806 }, { "epoch": 1.9416365824308062, "grad_norm": 0.5883570909500122, "learning_rate": 0.00010532866254772638, "loss": 0.7214, "step": 807 }, { "epoch": 1.944043321299639, "grad_norm": 0.5947162508964539, "learning_rate": 0.00010513852395765631, "loss": 0.7312, "step": 808 }, { "epoch": 1.9464500601684718, "grad_norm": 0.5537188053131104, "learning_rate": 0.00010494836673941977, "loss": 0.6448, "step": 809 }, { "epoch": 1.9488567990373045, "grad_norm": 0.5741223096847534, "learning_rate": 0.00010475819158237425, "loss": 0.7802, "step": 810 }, { "epoch": 1.9512635379061372, "grad_norm": 0.5413981676101685, "learning_rate": 0.00010456799917594233, "loss": 0.6887, "step": 811 }, { "epoch": 1.9536702767749698, "grad_norm": 0.5528275370597839, "learning_rate": 0.00010437779020960921, "loss": 0.7391, "step": 812 }, { "epoch": 1.9560770156438028, "grad_norm": 0.5960240364074707, "learning_rate": 0.00010418756537291996, "loss": 0.7657, "step": 813 }, { "epoch": 1.9584837545126352, "grad_norm": 0.5392475128173828, "learning_rate": 0.00010399732535547734, "loss": 0.7455, "step": 814 }, { "epoch": 1.9608904933814681, "grad_norm": 0.5753099918365479, "learning_rate": 0.00010380707084693901, "loss": 0.7358, "step": 815 }, { "epoch": 1.9632972322503008, "grad_norm": 0.555989146232605, "learning_rate": 0.00010361680253701527, "loss": 0.7355, "step": 816 }, { "epoch": 1.9657039711191335, "grad_norm": 0.5837514400482178, "learning_rate": 0.00010342652111546635, "loss": 0.749, "step": 817 }, { "epoch": 1.9681107099879664, "grad_norm": 0.6191967725753784, "learning_rate": 0.00010323622727210012, "loss": 0.7916, "step": 818 }, { "epoch": 1.970517448856799, "grad_norm": 0.5738831758499146, "learning_rate": 0.00010304592169676943, "loss": 0.7585, "step": 819 }, { "epoch": 1.9729241877256318, "grad_norm": 0.5551449060440063, "learning_rate": 0.00010285560507936961, "loss": 0.7044, "step": 820 }, { "epoch": 1.9753309265944645, "grad_norm": 0.5258319973945618, "learning_rate": 0.00010266527810983617, "loss": 0.6722, "step": 821 }, { "epoch": 1.9777376654632972, "grad_norm": 0.5929287075996399, "learning_rate": 0.00010247494147814196, "loss": 0.688, "step": 822 }, { "epoch": 1.9801444043321301, "grad_norm": 0.560385525226593, "learning_rate": 0.00010228459587429497, "loss": 0.7461, "step": 823 }, { "epoch": 1.9825511432009626, "grad_norm": 0.5541728734970093, "learning_rate": 0.0001020942419883357, "loss": 0.7014, "step": 824 }, { "epoch": 1.9849578820697955, "grad_norm": 0.5902739763259888, "learning_rate": 0.00010190388051033466, "loss": 0.7798, "step": 825 }, { "epoch": 1.9873646209386282, "grad_norm": 0.5368677377700806, "learning_rate": 0.00010171351213038993, "loss": 0.726, "step": 826 }, { "epoch": 1.989771359807461, "grad_norm": 0.5635613203048706, "learning_rate": 0.0001015231375386245, "loss": 0.746, "step": 827 }, { "epoch": 1.9921780986762936, "grad_norm": 0.5378161668777466, "learning_rate": 0.00010133275742518403, "loss": 0.7031, "step": 828 }, { "epoch": 1.9945848375451263, "grad_norm": 0.5777206420898438, "learning_rate": 0.00010114237248023404, "loss": 0.7263, "step": 829 }, { "epoch": 1.9969915764139592, "grad_norm": 0.562624454498291, "learning_rate": 0.00010095198339395769, "loss": 0.7171, "step": 830 }, { "epoch": 1.9993983152827917, "grad_norm": 0.599170982837677, "learning_rate": 0.00010076159085655308, "loss": 0.7685, "step": 831 }, { "epoch": 2.0012033694344162, "grad_norm": 0.7281579375267029, "learning_rate": 0.00010057119555823085, "loss": 0.7776, "step": 832 }, { "epoch": 2.0012033694344162, "eval_loss": 0.5095618963241577, "eval_runtime": 2128.3624, "eval_samples_per_second": 1.79, "eval_steps_per_second": 0.895, "step": 832 }, { "epoch": 2.003610108303249, "grad_norm": 0.5513686537742615, "learning_rate": 0.00010038079818921166, "loss": 0.6596, "step": 833 }, { "epoch": 2.0060168471720816, "grad_norm": 0.5227069854736328, "learning_rate": 0.00010019039943972366, "loss": 0.6747, "step": 834 }, { "epoch": 2.0084235860409145, "grad_norm": 0.5072826743125916, "learning_rate": 0.0001, "loss": 0.6305, "step": 835 }, { "epoch": 2.0108303249097474, "grad_norm": 0.5399265885353088, "learning_rate": 9.980960056027636e-05, "loss": 0.6042, "step": 836 }, { "epoch": 2.01323706377858, "grad_norm": 0.5491809248924255, "learning_rate": 9.961920181078835e-05, "loss": 0.6224, "step": 837 }, { "epoch": 2.015643802647413, "grad_norm": 0.5515012741088867, "learning_rate": 9.942880444176918e-05, "loss": 0.6798, "step": 838 }, { "epoch": 2.0180505415162453, "grad_norm": 0.5995374917984009, "learning_rate": 9.923840914344695e-05, "loss": 0.6152, "step": 839 }, { "epoch": 2.020457280385078, "grad_norm": 0.5192496180534363, "learning_rate": 9.904801660604234e-05, "loss": 0.5834, "step": 840 }, { "epoch": 2.022864019253911, "grad_norm": 0.5734530091285706, "learning_rate": 9.8857627519766e-05, "loss": 0.6184, "step": 841 }, { "epoch": 2.0252707581227436, "grad_norm": 0.5471757650375366, "learning_rate": 9.8667242574816e-05, "loss": 0.5988, "step": 842 }, { "epoch": 2.0276774969915765, "grad_norm": 0.569009006023407, "learning_rate": 9.847686246137551e-05, "loss": 0.6277, "step": 843 }, { "epoch": 2.030084235860409, "grad_norm": 1.3306018114089966, "learning_rate": 9.828648786961008e-05, "loss": 0.6274, "step": 844 }, { "epoch": 2.032490974729242, "grad_norm": 0.5703548789024353, "learning_rate": 9.809611948966533e-05, "loss": 0.6095, "step": 845 }, { "epoch": 2.034897713598075, "grad_norm": 0.5432790517807007, "learning_rate": 9.790575801166432e-05, "loss": 0.5975, "step": 846 }, { "epoch": 2.0373044524669073, "grad_norm": 0.5251730680465698, "learning_rate": 9.771540412570504e-05, "loss": 0.6388, "step": 847 }, { "epoch": 2.03971119133574, "grad_norm": 0.5519959330558777, "learning_rate": 9.752505852185805e-05, "loss": 0.6307, "step": 848 }, { "epoch": 2.0421179302045727, "grad_norm": 0.5868892073631287, "learning_rate": 9.733472189016383e-05, "loss": 0.669, "step": 849 }, { "epoch": 2.0445246690734056, "grad_norm": 0.5717694759368896, "learning_rate": 9.71443949206304e-05, "loss": 0.6596, "step": 850 }, { "epoch": 2.046931407942238, "grad_norm": 0.5760344862937927, "learning_rate": 9.69540783032306e-05, "loss": 0.6226, "step": 851 }, { "epoch": 2.049338146811071, "grad_norm": 0.5564358234405518, "learning_rate": 9.676377272789992e-05, "loss": 0.5864, "step": 852 }, { "epoch": 2.051744885679904, "grad_norm": 0.5798995494842529, "learning_rate": 9.657347888453367e-05, "loss": 0.6237, "step": 853 }, { "epoch": 2.0541516245487363, "grad_norm": 0.5300430059432983, "learning_rate": 9.638319746298478e-05, "loss": 0.5923, "step": 854 }, { "epoch": 2.0565583634175693, "grad_norm": 0.526861846446991, "learning_rate": 9.619292915306101e-05, "loss": 0.5924, "step": 855 }, { "epoch": 2.0589651022864017, "grad_norm": 0.518176794052124, "learning_rate": 9.60026746445227e-05, "loss": 0.6152, "step": 856 }, { "epoch": 2.0613718411552346, "grad_norm": 0.5753731727600098, "learning_rate": 9.581243462708006e-05, "loss": 0.6267, "step": 857 }, { "epoch": 2.0637785800240676, "grad_norm": 0.5636255145072937, "learning_rate": 9.562220979039082e-05, "loss": 0.6692, "step": 858 }, { "epoch": 2.0661853188929, "grad_norm": 0.5535212159156799, "learning_rate": 9.543200082405768e-05, "loss": 0.6518, "step": 859 }, { "epoch": 2.068592057761733, "grad_norm": 0.5443201065063477, "learning_rate": 9.524180841762577e-05, "loss": 0.6241, "step": 860 }, { "epoch": 2.0709987966305654, "grad_norm": 0.5821400880813599, "learning_rate": 9.505163326058027e-05, "loss": 0.6575, "step": 861 }, { "epoch": 2.0734055354993983, "grad_norm": 0.5670173764228821, "learning_rate": 9.486147604234371e-05, "loss": 0.6167, "step": 862 }, { "epoch": 2.0758122743682312, "grad_norm": 0.5479863882064819, "learning_rate": 9.467133745227366e-05, "loss": 0.6364, "step": 863 }, { "epoch": 2.0782190132370637, "grad_norm": 0.5435499548912048, "learning_rate": 9.448121817966018e-05, "loss": 0.593, "step": 864 }, { "epoch": 2.0806257521058966, "grad_norm": 0.5634912252426147, "learning_rate": 9.42911189137232e-05, "loss": 0.5972, "step": 865 }, { "epoch": 2.083032490974729, "grad_norm": 0.5187667608261108, "learning_rate": 9.410104034361027e-05, "loss": 0.5618, "step": 866 }, { "epoch": 2.085439229843562, "grad_norm": 0.6102979183197021, "learning_rate": 9.39109831583938e-05, "loss": 0.6337, "step": 867 }, { "epoch": 2.087845968712395, "grad_norm": 0.5340936183929443, "learning_rate": 9.372094804706867e-05, "loss": 0.6036, "step": 868 }, { "epoch": 2.0902527075812274, "grad_norm": 0.5434143543243408, "learning_rate": 9.353093569854983e-05, "loss": 0.6085, "step": 869 }, { "epoch": 2.0926594464500603, "grad_norm": 0.5487204194068909, "learning_rate": 9.334094680166962e-05, "loss": 0.6067, "step": 870 }, { "epoch": 2.0950661853188928, "grad_norm": 0.560757040977478, "learning_rate": 9.315098204517543e-05, "loss": 0.6264, "step": 871 }, { "epoch": 2.0974729241877257, "grad_norm": 0.5696537494659424, "learning_rate": 9.296104211772709e-05, "loss": 0.5971, "step": 872 }, { "epoch": 2.099879663056558, "grad_norm": 0.5335119962692261, "learning_rate": 9.277112770789443e-05, "loss": 0.5849, "step": 873 }, { "epoch": 2.102286401925391, "grad_norm": 0.5663617253303528, "learning_rate": 9.258123950415479e-05, "loss": 0.6424, "step": 874 }, { "epoch": 2.104693140794224, "grad_norm": 0.5611468553543091, "learning_rate": 9.239137819489047e-05, "loss": 0.6103, "step": 875 }, { "epoch": 2.1070998796630565, "grad_norm": 0.5442248582839966, "learning_rate": 9.220154446838637e-05, "loss": 0.6191, "step": 876 }, { "epoch": 2.1095066185318894, "grad_norm": 0.5744360685348511, "learning_rate": 9.201173901282724e-05, "loss": 0.608, "step": 877 }, { "epoch": 2.111913357400722, "grad_norm": 0.5854671597480774, "learning_rate": 9.182196251629552e-05, "loss": 0.6729, "step": 878 }, { "epoch": 2.1143200962695547, "grad_norm": 0.5583740472793579, "learning_rate": 9.163221566676847e-05, "loss": 0.6228, "step": 879 }, { "epoch": 2.1167268351383877, "grad_norm": 0.5478801131248474, "learning_rate": 9.144249915211605e-05, "loss": 0.6011, "step": 880 }, { "epoch": 2.11913357400722, "grad_norm": 0.5797863006591797, "learning_rate": 9.125281366009815e-05, "loss": 0.6532, "step": 881 }, { "epoch": 2.121540312876053, "grad_norm": 0.5861325263977051, "learning_rate": 9.106315987836225e-05, "loss": 0.6371, "step": 882 }, { "epoch": 2.1239470517448855, "grad_norm": 0.656015932559967, "learning_rate": 9.087353849444085e-05, "loss": 0.601, "step": 883 }, { "epoch": 2.1263537906137184, "grad_norm": 0.5134221315383911, "learning_rate": 9.068395019574897e-05, "loss": 0.5589, "step": 884 }, { "epoch": 2.1287605294825513, "grad_norm": 0.5585229992866516, "learning_rate": 9.049439566958175e-05, "loss": 0.5958, "step": 885 }, { "epoch": 2.131167268351384, "grad_norm": 0.5942258238792419, "learning_rate": 9.030487560311186e-05, "loss": 0.6322, "step": 886 }, { "epoch": 2.1335740072202167, "grad_norm": 0.5352315902709961, "learning_rate": 9.011539068338708e-05, "loss": 0.6032, "step": 887 }, { "epoch": 2.135980746089049, "grad_norm": 0.5572310090065002, "learning_rate": 8.992594159732774e-05, "loss": 0.619, "step": 888 }, { "epoch": 2.138387484957882, "grad_norm": 0.5454102754592896, "learning_rate": 8.973652903172423e-05, "loss": 0.649, "step": 889 }, { "epoch": 2.140794223826715, "grad_norm": 0.5283105969429016, "learning_rate": 8.954715367323468e-05, "loss": 0.5608, "step": 890 }, { "epoch": 2.1432009626955475, "grad_norm": 0.5921598672866821, "learning_rate": 8.935781620838216e-05, "loss": 0.594, "step": 891 }, { "epoch": 2.1456077015643804, "grad_norm": 0.5260235071182251, "learning_rate": 8.916851732355255e-05, "loss": 0.5931, "step": 892 }, { "epoch": 2.148014440433213, "grad_norm": 0.5041037797927856, "learning_rate": 8.897925770499174e-05, "loss": 0.5503, "step": 893 }, { "epoch": 2.150421179302046, "grad_norm": 0.5520951151847839, "learning_rate": 8.879003803880326e-05, "loss": 0.5806, "step": 894 }, { "epoch": 2.1528279181708783, "grad_norm": 0.6075021624565125, "learning_rate": 8.860085901094595e-05, "loss": 0.6205, "step": 895 }, { "epoch": 2.155234657039711, "grad_norm": 0.5365018248558044, "learning_rate": 8.841172130723115e-05, "loss": 0.6216, "step": 896 }, { "epoch": 2.157641395908544, "grad_norm": 0.5436618328094482, "learning_rate": 8.822262561332056e-05, "loss": 0.623, "step": 897 }, { "epoch": 2.1600481347773766, "grad_norm": 0.8953309059143066, "learning_rate": 8.803357261472343e-05, "loss": 0.62, "step": 898 }, { "epoch": 2.1624548736462095, "grad_norm": 0.5410037040710449, "learning_rate": 8.784456299679432e-05, "loss": 0.6135, "step": 899 }, { "epoch": 2.164861612515042, "grad_norm": 0.5173628330230713, "learning_rate": 8.765559744473053e-05, "loss": 0.5965, "step": 900 }, { "epoch": 2.167268351383875, "grad_norm": 0.5655604004859924, "learning_rate": 8.746667664356956e-05, "loss": 0.604, "step": 901 }, { "epoch": 2.1696750902527078, "grad_norm": 0.556789755821228, "learning_rate": 8.727780127818677e-05, "loss": 0.5997, "step": 902 }, { "epoch": 2.1720818291215402, "grad_norm": 0.540797770023346, "learning_rate": 8.708897203329267e-05, "loss": 0.6151, "step": 903 }, { "epoch": 2.174488567990373, "grad_norm": 0.530121922492981, "learning_rate": 8.690018959343072e-05, "loss": 0.6101, "step": 904 }, { "epoch": 2.1768953068592056, "grad_norm": 0.5557842254638672, "learning_rate": 8.67114546429746e-05, "loss": 0.5788, "step": 905 }, { "epoch": 2.1793020457280385, "grad_norm": 0.5859378576278687, "learning_rate": 8.652276786612584e-05, "loss": 0.645, "step": 906 }, { "epoch": 2.1817087845968715, "grad_norm": 0.5701287388801575, "learning_rate": 8.633412994691144e-05, "loss": 0.6361, "step": 907 }, { "epoch": 2.184115523465704, "grad_norm": 0.5231797695159912, "learning_rate": 8.614554156918113e-05, "loss": 0.5357, "step": 908 }, { "epoch": 2.186522262334537, "grad_norm": 1.1207327842712402, "learning_rate": 8.595700341660511e-05, "loss": 0.6955, "step": 909 }, { "epoch": 2.1889290012033693, "grad_norm": 0.6264053583145142, "learning_rate": 8.57685161726715e-05, "loss": 0.6939, "step": 910 }, { "epoch": 2.191335740072202, "grad_norm": 0.5935765504837036, "learning_rate": 8.558008052068392e-05, "loss": 0.6809, "step": 911 }, { "epoch": 2.1937424789410347, "grad_norm": 0.5455060601234436, "learning_rate": 8.539169714375885e-05, "loss": 0.6399, "step": 912 }, { "epoch": 2.1961492178098676, "grad_norm": 0.5558289885520935, "learning_rate": 8.520336672482338e-05, "loss": 0.5931, "step": 913 }, { "epoch": 2.1985559566787005, "grad_norm": 0.5988699197769165, "learning_rate": 8.501508994661251e-05, "loss": 0.6038, "step": 914 }, { "epoch": 2.200962695547533, "grad_norm": 0.5497554540634155, "learning_rate": 8.482686749166686e-05, "loss": 0.6514, "step": 915 }, { "epoch": 2.203369434416366, "grad_norm": 0.5613542199134827, "learning_rate": 8.463870004233008e-05, "loss": 0.6084, "step": 916 }, { "epoch": 2.2057761732851984, "grad_norm": 0.5549713969230652, "learning_rate": 8.445058828074639e-05, "loss": 0.6094, "step": 917 }, { "epoch": 2.2081829121540313, "grad_norm": 0.5139541625976562, "learning_rate": 8.426253288885822e-05, "loss": 0.5557, "step": 918 }, { "epoch": 2.210589651022864, "grad_norm": 0.5233466029167175, "learning_rate": 8.407453454840357e-05, "loss": 0.5851, "step": 919 }, { "epoch": 2.2129963898916967, "grad_norm": 0.5774407982826233, "learning_rate": 8.38865939409136e-05, "loss": 0.6197, "step": 920 }, { "epoch": 2.2154031287605296, "grad_norm": 0.5754866003990173, "learning_rate": 8.369871174771027e-05, "loss": 0.5867, "step": 921 }, { "epoch": 2.217809867629362, "grad_norm": 0.5905739068984985, "learning_rate": 8.351088864990368e-05, "loss": 0.6607, "step": 922 }, { "epoch": 2.220216606498195, "grad_norm": 0.5966110825538635, "learning_rate": 8.332312532838978e-05, "loss": 0.6325, "step": 923 }, { "epoch": 2.222623345367028, "grad_norm": 0.5526286959648132, "learning_rate": 8.313542246384775e-05, "loss": 0.613, "step": 924 }, { "epoch": 2.2250300842358604, "grad_norm": 0.5557963252067566, "learning_rate": 8.294778073673762e-05, "loss": 0.5794, "step": 925 }, { "epoch": 2.2274368231046933, "grad_norm": 0.5573340654373169, "learning_rate": 8.276020082729783e-05, "loss": 0.6557, "step": 926 }, { "epoch": 2.2298435619735257, "grad_norm": 0.5305102467536926, "learning_rate": 8.257268341554264e-05, "loss": 0.6122, "step": 927 }, { "epoch": 2.2322503008423586, "grad_norm": 0.5541068315505981, "learning_rate": 8.238522918125983e-05, "loss": 0.6495, "step": 928 }, { "epoch": 2.234657039711191, "grad_norm": 0.5354228019714355, "learning_rate": 8.219783880400808e-05, "loss": 0.5993, "step": 929 }, { "epoch": 2.237063778580024, "grad_norm": 0.5587093234062195, "learning_rate": 8.201051296311462e-05, "loss": 0.6313, "step": 930 }, { "epoch": 2.239470517448857, "grad_norm": 0.5479996800422668, "learning_rate": 8.182325233767267e-05, "loss": 0.5669, "step": 931 }, { "epoch": 2.2418772563176894, "grad_norm": 0.5154574513435364, "learning_rate": 8.16360576065391e-05, "loss": 0.5661, "step": 932 }, { "epoch": 2.2442839951865223, "grad_norm": 0.5436407327651978, "learning_rate": 8.144892944833184e-05, "loss": 0.6487, "step": 933 }, { "epoch": 2.246690734055355, "grad_norm": 0.5264855623245239, "learning_rate": 8.126186854142752e-05, "loss": 0.5718, "step": 934 }, { "epoch": 2.2490974729241877, "grad_norm": 0.5434893369674683, "learning_rate": 8.107487556395901e-05, "loss": 0.6424, "step": 935 }, { "epoch": 2.2515042117930206, "grad_norm": 0.5234639644622803, "learning_rate": 8.088795119381276e-05, "loss": 0.6028, "step": 936 }, { "epoch": 2.2515042117930206, "eval_loss": 0.5286158919334412, "eval_runtime": 2125.8225, "eval_samples_per_second": 1.792, "eval_steps_per_second": 0.896, "step": 936 }, { "epoch": 2.253910950661853, "grad_norm": 0.5786437392234802, "learning_rate": 8.070109610862668e-05, "loss": 0.6031, "step": 937 }, { "epoch": 2.256317689530686, "grad_norm": 0.5654465556144714, "learning_rate": 8.051431098578741e-05, "loss": 0.6299, "step": 938 }, { "epoch": 2.2587244283995185, "grad_norm": 0.5593127608299255, "learning_rate": 8.032759650242802e-05, "loss": 0.669, "step": 939 }, { "epoch": 2.2611311672683514, "grad_norm": 0.566678524017334, "learning_rate": 8.014095333542548e-05, "loss": 0.6155, "step": 940 }, { "epoch": 2.2635379061371843, "grad_norm": 0.5609601736068726, "learning_rate": 7.995438216139818e-05, "loss": 0.6605, "step": 941 }, { "epoch": 2.2659446450060168, "grad_norm": 0.5589974522590637, "learning_rate": 7.97678836567036e-05, "loss": 0.6196, "step": 942 }, { "epoch": 2.2683513838748497, "grad_norm": 0.5427587032318115, "learning_rate": 7.958145849743569e-05, "loss": 0.6237, "step": 943 }, { "epoch": 2.270758122743682, "grad_norm": 0.5628014802932739, "learning_rate": 7.939510735942262e-05, "loss": 0.61, "step": 944 }, { "epoch": 2.273164861612515, "grad_norm": 0.5489886999130249, "learning_rate": 7.920883091822408e-05, "loss": 0.6144, "step": 945 }, { "epoch": 2.2755716004813475, "grad_norm": 0.5218595266342163, "learning_rate": 7.902262984912909e-05, "loss": 0.6009, "step": 946 }, { "epoch": 2.2779783393501805, "grad_norm": 0.5673239231109619, "learning_rate": 7.883650482715338e-05, "loss": 0.6251, "step": 947 }, { "epoch": 2.2803850782190134, "grad_norm": 0.5901681780815125, "learning_rate": 7.865045652703697e-05, "loss": 0.6504, "step": 948 }, { "epoch": 2.282791817087846, "grad_norm": 0.5236272215843201, "learning_rate": 7.846448562324183e-05, "loss": 0.5617, "step": 949 }, { "epoch": 2.2851985559566788, "grad_norm": 0.5611291527748108, "learning_rate": 7.827859278994925e-05, "loss": 0.6412, "step": 950 }, { "epoch": 2.2876052948255117, "grad_norm": 0.5313939452171326, "learning_rate": 7.809277870105753e-05, "loss": 0.5881, "step": 951 }, { "epoch": 2.290012033694344, "grad_norm": 0.5616933703422546, "learning_rate": 7.79070440301796e-05, "loss": 0.5722, "step": 952 }, { "epoch": 2.292418772563177, "grad_norm": 0.5475797057151794, "learning_rate": 7.772138945064035e-05, "loss": 0.5781, "step": 953 }, { "epoch": 2.2948255114320095, "grad_norm": 0.5745391249656677, "learning_rate": 7.753581563547441e-05, "loss": 0.62, "step": 954 }, { "epoch": 2.2972322503008424, "grad_norm": 0.5661266446113586, "learning_rate": 7.735032325742355e-05, "loss": 0.6495, "step": 955 }, { "epoch": 2.299638989169675, "grad_norm": 0.5461136698722839, "learning_rate": 7.716491298893442e-05, "loss": 0.6065, "step": 956 }, { "epoch": 2.302045728038508, "grad_norm": 0.5324814915657043, "learning_rate": 7.697958550215592e-05, "loss": 0.5598, "step": 957 }, { "epoch": 2.3044524669073407, "grad_norm": 0.5649887919425964, "learning_rate": 7.679434146893685e-05, "loss": 0.5912, "step": 958 }, { "epoch": 2.306859205776173, "grad_norm": 0.571933388710022, "learning_rate": 7.660918156082354e-05, "loss": 0.6278, "step": 959 }, { "epoch": 2.309265944645006, "grad_norm": 0.5734432935714722, "learning_rate": 7.642410644905726e-05, "loss": 0.6241, "step": 960 }, { "epoch": 2.3116726835138386, "grad_norm": 0.5563123822212219, "learning_rate": 7.623911680457198e-05, "loss": 0.6131, "step": 961 }, { "epoch": 2.3140794223826715, "grad_norm": 0.5277346968650818, "learning_rate": 7.605421329799176e-05, "loss": 0.5998, "step": 962 }, { "epoch": 2.316486161251504, "grad_norm": 0.5425368547439575, "learning_rate": 7.58693965996284e-05, "loss": 0.6172, "step": 963 }, { "epoch": 2.318892900120337, "grad_norm": 0.5608372092247009, "learning_rate": 7.568466737947905e-05, "loss": 0.6227, "step": 964 }, { "epoch": 2.32129963898917, "grad_norm": 0.5675607919692993, "learning_rate": 7.550002630722366e-05, "loss": 0.6357, "step": 965 }, { "epoch": 2.3237063778580023, "grad_norm": 0.5526049733161926, "learning_rate": 7.531547405222268e-05, "loss": 0.6218, "step": 966 }, { "epoch": 2.326113116726835, "grad_norm": 0.5726636648178101, "learning_rate": 7.513101128351454e-05, "loss": 0.6115, "step": 967 }, { "epoch": 2.328519855595668, "grad_norm": 0.5585021376609802, "learning_rate": 7.494663866981335e-05, "loss": 0.6454, "step": 968 }, { "epoch": 2.3309265944645006, "grad_norm": 0.5192469954490662, "learning_rate": 7.476235687950628e-05, "loss": 0.5739, "step": 969 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5307746529579163, "learning_rate": 7.457816658065134e-05, "loss": 0.5815, "step": 970 }, { "epoch": 2.335740072202166, "grad_norm": 0.548683762550354, "learning_rate": 7.439406844097479e-05, "loss": 0.6225, "step": 971 }, { "epoch": 2.338146811070999, "grad_norm": 0.5624722838401794, "learning_rate": 7.421006312786883e-05, "loss": 0.6468, "step": 972 }, { "epoch": 2.3405535499398313, "grad_norm": 0.5551783442497253, "learning_rate": 7.402615130838917e-05, "loss": 0.6055, "step": 973 }, { "epoch": 2.3429602888086642, "grad_norm": 0.5535985231399536, "learning_rate": 7.384233364925253e-05, "loss": 0.6531, "step": 974 }, { "epoch": 2.345367027677497, "grad_norm": 0.5185163617134094, "learning_rate": 7.365861081683433e-05, "loss": 0.5907, "step": 975 }, { "epoch": 2.3477737665463296, "grad_norm": 0.5540043115615845, "learning_rate": 7.347498347716624e-05, "loss": 0.5941, "step": 976 }, { "epoch": 2.3501805054151625, "grad_norm": 0.8615466952323914, "learning_rate": 7.329145229593364e-05, "loss": 0.6108, "step": 977 }, { "epoch": 2.352587244283995, "grad_norm": 0.5509060025215149, "learning_rate": 7.310801793847344e-05, "loss": 0.6394, "step": 978 }, { "epoch": 2.354993983152828, "grad_norm": 0.5411714911460876, "learning_rate": 7.292468106977148e-05, "loss": 0.6253, "step": 979 }, { "epoch": 2.357400722021661, "grad_norm": 0.5261589884757996, "learning_rate": 7.274144235446023e-05, "loss": 0.614, "step": 980 }, { "epoch": 2.3598074608904933, "grad_norm": 0.5445355772972107, "learning_rate": 7.255830245681626e-05, "loss": 0.6264, "step": 981 }, { "epoch": 2.3622141997593262, "grad_norm": 0.5595593452453613, "learning_rate": 7.237526204075797e-05, "loss": 0.6477, "step": 982 }, { "epoch": 2.3646209386281587, "grad_norm": 0.5386821031570435, "learning_rate": 7.219232176984314e-05, "loss": 0.6144, "step": 983 }, { "epoch": 2.3670276774969916, "grad_norm": 0.5352283716201782, "learning_rate": 7.200948230726639e-05, "loss": 0.5703, "step": 984 }, { "epoch": 2.3694344163658245, "grad_norm": 0.5502164959907532, "learning_rate": 7.182674431585704e-05, "loss": 0.604, "step": 985 }, { "epoch": 2.371841155234657, "grad_norm": 0.5633226633071899, "learning_rate": 7.16441084580764e-05, "loss": 0.6541, "step": 986 }, { "epoch": 2.37424789410349, "grad_norm": 0.5691085457801819, "learning_rate": 7.14615753960157e-05, "loss": 0.5956, "step": 987 }, { "epoch": 2.3766546329723224, "grad_norm": 0.5683085918426514, "learning_rate": 7.127914579139338e-05, "loss": 0.6327, "step": 988 }, { "epoch": 2.3790613718411553, "grad_norm": 0.5360345244407654, "learning_rate": 7.109682030555283e-05, "loss": 0.6055, "step": 989 }, { "epoch": 2.3814681107099878, "grad_norm": 0.5412148833274841, "learning_rate": 7.09145995994601e-05, "loss": 0.6185, "step": 990 }, { "epoch": 2.3838748495788207, "grad_norm": 0.5633129477500916, "learning_rate": 7.073248433370124e-05, "loss": 0.6555, "step": 991 }, { "epoch": 2.3862815884476536, "grad_norm": 0.5606641173362732, "learning_rate": 7.055047516848025e-05, "loss": 0.6102, "step": 992 }, { "epoch": 2.388688327316486, "grad_norm": 0.5675422549247742, "learning_rate": 7.036857276361627e-05, "loss": 0.605, "step": 993 }, { "epoch": 2.391095066185319, "grad_norm": 0.5666801333427429, "learning_rate": 7.018677777854157e-05, "loss": 0.6251, "step": 994 }, { "epoch": 2.3935018050541514, "grad_norm": 0.6645628213882446, "learning_rate": 7.000509087229895e-05, "loss": 0.6259, "step": 995 }, { "epoch": 2.3959085439229844, "grad_norm": 0.544975221157074, "learning_rate": 6.982351270353944e-05, "loss": 0.6075, "step": 996 }, { "epoch": 2.3983152827918173, "grad_norm": 0.5587804317474365, "learning_rate": 6.964204393051981e-05, "loss": 0.6284, "step": 997 }, { "epoch": 2.4007220216606497, "grad_norm": 0.5575830340385437, "learning_rate": 6.946068521110028e-05, "loss": 0.6285, "step": 998 }, { "epoch": 2.4031287605294827, "grad_norm": 0.564072847366333, "learning_rate": 6.927943720274215e-05, "loss": 0.6465, "step": 999 }, { "epoch": 2.405535499398315, "grad_norm": 0.5231256484985352, "learning_rate": 6.909830056250527e-05, "loss": 0.5224, "step": 1000 }, { "epoch": 2.407942238267148, "grad_norm": 0.525948703289032, "learning_rate": 6.891727594704587e-05, "loss": 0.5999, "step": 1001 }, { "epoch": 2.410348977135981, "grad_norm": 0.5733210444450378, "learning_rate": 6.873636401261401e-05, "loss": 0.638, "step": 1002 }, { "epoch": 2.4127557160048134, "grad_norm": 0.5540027022361755, "learning_rate": 6.855556541505122e-05, "loss": 0.6342, "step": 1003 }, { "epoch": 2.4151624548736463, "grad_norm": 0.5455323457717896, "learning_rate": 6.837488080978824e-05, "loss": 0.6204, "step": 1004 }, { "epoch": 2.417569193742479, "grad_norm": 0.5960592031478882, "learning_rate": 6.819431085184251e-05, "loss": 0.6524, "step": 1005 }, { "epoch": 2.4199759326113117, "grad_norm": 0.5391654968261719, "learning_rate": 6.801385619581592e-05, "loss": 0.5879, "step": 1006 }, { "epoch": 2.422382671480144, "grad_norm": 0.5236111283302307, "learning_rate": 6.783351749589225e-05, "loss": 0.6225, "step": 1007 }, { "epoch": 2.424789410348977, "grad_norm": 0.567409336566925, "learning_rate": 6.765329540583504e-05, "loss": 0.6269, "step": 1008 }, { "epoch": 2.42719614921781, "grad_norm": 0.5611469745635986, "learning_rate": 6.747319057898503e-05, "loss": 0.5785, "step": 1009 }, { "epoch": 2.4296028880866425, "grad_norm": 0.5473028421401978, "learning_rate": 6.729320366825784e-05, "loss": 0.6387, "step": 1010 }, { "epoch": 2.4320096269554754, "grad_norm": 0.5664867758750916, "learning_rate": 6.711333532614168e-05, "loss": 0.6087, "step": 1011 }, { "epoch": 2.434416365824308, "grad_norm": 0.5588098168373108, "learning_rate": 6.693358620469487e-05, "loss": 0.6278, "step": 1012 }, { "epoch": 2.436823104693141, "grad_norm": 0.5740928053855896, "learning_rate": 6.675395695554359e-05, "loss": 0.6614, "step": 1013 }, { "epoch": 2.4392298435619737, "grad_norm": 0.583690881729126, "learning_rate": 6.657444822987942e-05, "loss": 0.6079, "step": 1014 }, { "epoch": 2.441636582430806, "grad_norm": 0.524733304977417, "learning_rate": 6.639506067845697e-05, "loss": 0.5781, "step": 1015 }, { "epoch": 2.444043321299639, "grad_norm": 0.547199010848999, "learning_rate": 6.621579495159171e-05, "loss": 0.624, "step": 1016 }, { "epoch": 2.4464500601684716, "grad_norm": 0.5518101453781128, "learning_rate": 6.603665169915732e-05, "loss": 0.6681, "step": 1017 }, { "epoch": 2.4488567990373045, "grad_norm": 0.5439870953559875, "learning_rate": 6.585763157058358e-05, "loss": 0.6148, "step": 1018 }, { "epoch": 2.4512635379061374, "grad_norm": 0.5591013431549072, "learning_rate": 6.567873521485389e-05, "loss": 0.5738, "step": 1019 }, { "epoch": 2.45367027677497, "grad_norm": 0.5799447894096375, "learning_rate": 6.549996328050296e-05, "loss": 0.6393, "step": 1020 }, { "epoch": 2.4560770156438028, "grad_norm": 0.5795313119888306, "learning_rate": 6.53213164156144e-05, "loss": 0.674, "step": 1021 }, { "epoch": 2.4584837545126352, "grad_norm": 0.5749733448028564, "learning_rate": 6.51427952678185e-05, "loss": 0.6613, "step": 1022 }, { "epoch": 2.460890493381468, "grad_norm": 0.4792674779891968, "learning_rate": 6.496440048428976e-05, "loss": 0.5172, "step": 1023 }, { "epoch": 2.4632972322503006, "grad_norm": 0.5440679788589478, "learning_rate": 6.478613271174453e-05, "loss": 0.5897, "step": 1024 }, { "epoch": 2.4657039711191335, "grad_norm": 0.5223029851913452, "learning_rate": 6.460799259643884e-05, "loss": 0.5618, "step": 1025 }, { "epoch": 2.4681107099879664, "grad_norm": 0.5517489910125732, "learning_rate": 6.442998078416583e-05, "loss": 0.6631, "step": 1026 }, { "epoch": 2.470517448856799, "grad_norm": 0.5471394062042236, "learning_rate": 6.425209792025358e-05, "loss": 0.5811, "step": 1027 }, { "epoch": 2.472924187725632, "grad_norm": 0.5807005167007446, "learning_rate": 6.407434464956266e-05, "loss": 0.6463, "step": 1028 }, { "epoch": 2.4753309265944647, "grad_norm": 0.598617434501648, "learning_rate": 6.389672161648389e-05, "loss": 0.6125, "step": 1029 }, { "epoch": 2.477737665463297, "grad_norm": 2.0399231910705566, "learning_rate": 6.371922946493591e-05, "loss": 0.6652, "step": 1030 }, { "epoch": 2.48014440433213, "grad_norm": 0.5657243728637695, "learning_rate": 6.35418688383629e-05, "loss": 0.6775, "step": 1031 }, { "epoch": 2.4825511432009626, "grad_norm": 0.5614724159240723, "learning_rate": 6.336464037973226e-05, "loss": 0.6514, "step": 1032 }, { "epoch": 2.4849578820697955, "grad_norm": 0.5802749395370483, "learning_rate": 6.318754473153221e-05, "loss": 0.6688, "step": 1033 }, { "epoch": 2.487364620938628, "grad_norm": 0.5460872054100037, "learning_rate": 6.301058253576955e-05, "loss": 0.6322, "step": 1034 }, { "epoch": 2.489771359807461, "grad_norm": 0.554598867893219, "learning_rate": 6.283375443396726e-05, "loss": 0.627, "step": 1035 }, { "epoch": 2.492178098676294, "grad_norm": 0.5883776545524597, "learning_rate": 6.26570610671622e-05, "loss": 0.656, "step": 1036 }, { "epoch": 2.4945848375451263, "grad_norm": 3.8686351776123047, "learning_rate": 6.248050307590283e-05, "loss": 0.7179, "step": 1037 }, { "epoch": 2.496991576413959, "grad_norm": 0.5231539011001587, "learning_rate": 6.230408110024679e-05, "loss": 0.5909, "step": 1038 }, { "epoch": 2.4993983152827917, "grad_norm": 0.565879762172699, "learning_rate": 6.21277957797587e-05, "loss": 0.5941, "step": 1039 }, { "epoch": 2.5018050541516246, "grad_norm": 0.5527289509773254, "learning_rate": 6.19516477535077e-05, "loss": 0.6434, "step": 1040 }, { "epoch": 2.5018050541516246, "eval_loss": 0.5174195766448975, "eval_runtime": 2127.4659, "eval_samples_per_second": 1.791, "eval_steps_per_second": 0.895, "step": 1040 }, { "epoch": 2.504211793020457, "grad_norm": 0.555989146232605, "learning_rate": 6.177563766006526e-05, "loss": 0.6689, "step": 1041 }, { "epoch": 2.50661853188929, "grad_norm": 0.5723418593406677, "learning_rate": 6.159976613750286e-05, "loss": 0.6116, "step": 1042 }, { "epoch": 2.509025270758123, "grad_norm": 0.5345631837844849, "learning_rate": 6.142403382338951e-05, "loss": 0.6074, "step": 1043 }, { "epoch": 2.5114320096269553, "grad_norm": 0.5345121622085571, "learning_rate": 6.12484413547897e-05, "loss": 0.5917, "step": 1044 }, { "epoch": 2.5138387484957883, "grad_norm": 0.5275918841362, "learning_rate": 6.107298936826086e-05, "loss": 0.5959, "step": 1045 }, { "epoch": 2.516245487364621, "grad_norm": 0.5729963183403015, "learning_rate": 6.089767849985114e-05, "loss": 0.6006, "step": 1046 }, { "epoch": 2.5186522262334536, "grad_norm": 0.536594033241272, "learning_rate": 6.0722509385097205e-05, "loss": 0.6561, "step": 1047 }, { "epoch": 2.5210589651022866, "grad_norm": 0.8976889252662659, "learning_rate": 6.0547482659021706e-05, "loss": 0.6933, "step": 1048 }, { "epoch": 2.523465703971119, "grad_norm": 0.7876880764961243, "learning_rate": 6.0372598956131265e-05, "loss": 0.569, "step": 1049 }, { "epoch": 2.525872442839952, "grad_norm": 0.8972963094711304, "learning_rate": 6.019785891041381e-05, "loss": 0.6569, "step": 1050 }, { "epoch": 2.5282791817087844, "grad_norm": 0.582245409488678, "learning_rate": 6.002326315533665e-05, "loss": 0.6298, "step": 1051 }, { "epoch": 2.5306859205776173, "grad_norm": 0.5356953144073486, "learning_rate": 5.984881232384394e-05, "loss": 0.5431, "step": 1052 }, { "epoch": 2.5330926594464502, "grad_norm": 0.5469051599502563, "learning_rate": 5.967450704835452e-05, "loss": 0.6118, "step": 1053 }, { "epoch": 2.5354993983152827, "grad_norm": 0.5375023484230042, "learning_rate": 5.950034796075947e-05, "loss": 0.6206, "step": 1054 }, { "epoch": 2.5379061371841156, "grad_norm": 0.5081560611724854, "learning_rate": 5.9326335692419995e-05, "loss": 0.5357, "step": 1055 }, { "epoch": 2.5403128760529485, "grad_norm": 0.5181150436401367, "learning_rate": 5.9152470874165e-05, "loss": 0.5882, "step": 1056 }, { "epoch": 2.542719614921781, "grad_norm": 0.5010731816291809, "learning_rate": 5.897875413628884e-05, "loss": 0.5653, "step": 1057 }, { "epoch": 2.5451263537906135, "grad_norm": 0.5337491631507874, "learning_rate": 5.8805186108549114e-05, "loss": 0.6198, "step": 1058 }, { "epoch": 2.5475330926594464, "grad_norm": 0.5276105403900146, "learning_rate": 5.863176742016425e-05, "loss": 0.6009, "step": 1059 }, { "epoch": 2.5499398315282793, "grad_norm": 0.5396655201911926, "learning_rate": 5.845849869981137e-05, "loss": 0.607, "step": 1060 }, { "epoch": 2.5523465703971118, "grad_norm": 0.5457972288131714, "learning_rate": 5.8285380575623826e-05, "loss": 0.5653, "step": 1061 }, { "epoch": 2.5547533092659447, "grad_norm": 0.5833747982978821, "learning_rate": 5.811241367518914e-05, "loss": 0.6514, "step": 1062 }, { "epoch": 2.5571600481347776, "grad_norm": 0.5590879917144775, "learning_rate": 5.793959862554652e-05, "loss": 0.5983, "step": 1063 }, { "epoch": 2.55956678700361, "grad_norm": 0.54843670129776, "learning_rate": 5.776693605318476e-05, "loss": 0.6391, "step": 1064 }, { "epoch": 2.561973525872443, "grad_norm": 0.5572134852409363, "learning_rate": 5.759442658403985e-05, "loss": 0.652, "step": 1065 }, { "epoch": 2.5643802647412755, "grad_norm": 0.664635181427002, "learning_rate": 5.7422070843492734e-05, "loss": 0.6497, "step": 1066 }, { "epoch": 2.5667870036101084, "grad_norm": 0.5374277830123901, "learning_rate": 5.7249869456367146e-05, "loss": 0.5951, "step": 1067 }, { "epoch": 2.569193742478941, "grad_norm": 0.5380240678787231, "learning_rate": 5.707782304692719e-05, "loss": 0.652, "step": 1068 }, { "epoch": 2.5716004813477737, "grad_norm": 0.5415376424789429, "learning_rate": 5.6905932238875123e-05, "loss": 0.6449, "step": 1069 }, { "epoch": 2.5740072202166067, "grad_norm": 0.5317239761352539, "learning_rate": 5.6734197655349156e-05, "loss": 0.5957, "step": 1070 }, { "epoch": 2.576413959085439, "grad_norm": 0.5938167572021484, "learning_rate": 5.65626199189212e-05, "loss": 0.6879, "step": 1071 }, { "epoch": 2.578820697954272, "grad_norm": 0.5575851202011108, "learning_rate": 5.639119965159446e-05, "loss": 0.6445, "step": 1072 }, { "epoch": 2.581227436823105, "grad_norm": 0.5630468130111694, "learning_rate": 5.6219937474801366e-05, "loss": 0.6016, "step": 1073 }, { "epoch": 2.5836341756919374, "grad_norm": 0.6110846400260925, "learning_rate": 5.6048834009401196e-05, "loss": 0.6013, "step": 1074 }, { "epoch": 2.58604091456077, "grad_norm": 0.5540307760238647, "learning_rate": 5.5877889875677845e-05, "loss": 0.6397, "step": 1075 }, { "epoch": 2.588447653429603, "grad_norm": 0.9078941345214844, "learning_rate": 5.570710569333772e-05, "loss": 0.6592, "step": 1076 }, { "epoch": 2.5908543922984357, "grad_norm": 0.5398726463317871, "learning_rate": 5.553648208150728e-05, "loss": 0.6079, "step": 1077 }, { "epoch": 2.593261131167268, "grad_norm": 0.5087079405784607, "learning_rate": 5.5366019658730825e-05, "loss": 0.5574, "step": 1078 }, { "epoch": 2.595667870036101, "grad_norm": 0.5509259104728699, "learning_rate": 5.5195719042968365e-05, "loss": 0.6251, "step": 1079 }, { "epoch": 2.598074608904934, "grad_norm": 0.5321153402328491, "learning_rate": 5.5025580851593436e-05, "loss": 0.5761, "step": 1080 }, { "epoch": 2.6004813477737665, "grad_norm": 0.5523797869682312, "learning_rate": 5.485560570139061e-05, "loss": 0.6083, "step": 1081 }, { "epoch": 2.6028880866425994, "grad_norm": 0.5575142502784729, "learning_rate": 5.4685794208553465e-05, "loss": 0.6139, "step": 1082 }, { "epoch": 2.605294825511432, "grad_norm": 0.5489847660064697, "learning_rate": 5.4516146988682285e-05, "loss": 0.6333, "step": 1083 }, { "epoch": 2.607701564380265, "grad_norm": 0.5419440865516663, "learning_rate": 5.434666465678175e-05, "loss": 0.6411, "step": 1084 }, { "epoch": 2.6101083032490973, "grad_norm": 0.5227987170219421, "learning_rate": 5.417734782725896e-05, "loss": 0.603, "step": 1085 }, { "epoch": 2.61251504211793, "grad_norm": 0.542655885219574, "learning_rate": 5.400819711392091e-05, "loss": 0.584, "step": 1086 }, { "epoch": 2.614921780986763, "grad_norm": 0.5241451859474182, "learning_rate": 5.383921312997242e-05, "loss": 0.5723, "step": 1087 }, { "epoch": 2.6173285198555956, "grad_norm": 0.542667031288147, "learning_rate": 5.3670396488013854e-05, "loss": 0.6004, "step": 1088 }, { "epoch": 2.6197352587244285, "grad_norm": 0.5759837627410889, "learning_rate": 5.3501747800038934e-05, "loss": 0.6457, "step": 1089 }, { "epoch": 2.6221419975932614, "grad_norm": 0.5503437519073486, "learning_rate": 5.333326767743263e-05, "loss": 0.629, "step": 1090 }, { "epoch": 2.624548736462094, "grad_norm": 0.5209627151489258, "learning_rate": 5.316495673096869e-05, "loss": 0.589, "step": 1091 }, { "epoch": 2.6269554753309263, "grad_norm": 0.5268390774726868, "learning_rate": 5.299681557080759e-05, "loss": 0.5513, "step": 1092 }, { "epoch": 2.6293622141997592, "grad_norm": 0.5540508031845093, "learning_rate": 5.282884480649435e-05, "loss": 0.6472, "step": 1093 }, { "epoch": 2.631768953068592, "grad_norm": 0.5173966288566589, "learning_rate": 5.266104504695617e-05, "loss": 0.6047, "step": 1094 }, { "epoch": 2.6341756919374246, "grad_norm": 0.697161078453064, "learning_rate": 5.249341690050051e-05, "loss": 0.5981, "step": 1095 }, { "epoch": 2.6365824308062575, "grad_norm": 0.5417733788490295, "learning_rate": 5.232596097481251e-05, "loss": 0.5968, "step": 1096 }, { "epoch": 2.6389891696750905, "grad_norm": 0.5155860185623169, "learning_rate": 5.2158677876953075e-05, "loss": 0.5475, "step": 1097 }, { "epoch": 2.641395908543923, "grad_norm": 0.5783323645591736, "learning_rate": 5.199156821335653e-05, "loss": 0.6539, "step": 1098 }, { "epoch": 2.643802647412756, "grad_norm": 0.7172238230705261, "learning_rate": 5.182463258982846e-05, "loss": 0.6077, "step": 1099 }, { "epoch": 2.6462093862815883, "grad_norm": 0.5712655782699585, "learning_rate": 5.1657871611543605e-05, "loss": 0.6444, "step": 1100 }, { "epoch": 2.648616125150421, "grad_norm": 0.5796595215797424, "learning_rate": 5.149128588304351e-05, "loss": 0.6362, "step": 1101 }, { "epoch": 2.6510228640192537, "grad_norm": 0.565563440322876, "learning_rate": 5.132487600823438e-05, "loss": 0.6289, "step": 1102 }, { "epoch": 2.6534296028880866, "grad_norm": 0.508616030216217, "learning_rate": 5.115864259038498e-05, "loss": 0.5531, "step": 1103 }, { "epoch": 2.6558363417569195, "grad_norm": 0.5539884567260742, "learning_rate": 5.099258623212431e-05, "loss": 0.5966, "step": 1104 }, { "epoch": 2.658243080625752, "grad_norm": 0.5526125431060791, "learning_rate": 5.082670753543961e-05, "loss": 0.6169, "step": 1105 }, { "epoch": 2.660649819494585, "grad_norm": 0.6125399470329285, "learning_rate": 5.066100710167401e-05, "loss": 0.6107, "step": 1106 }, { "epoch": 2.663056558363418, "grad_norm": 0.5800756216049194, "learning_rate": 5.049548553152428e-05, "loss": 0.6185, "step": 1107 }, { "epoch": 2.6654632972322503, "grad_norm": 0.5141105651855469, "learning_rate": 5.033014342503889e-05, "loss": 0.5439, "step": 1108 }, { "epoch": 2.667870036101083, "grad_norm": 0.5347982048988342, "learning_rate": 5.0164981381615786e-05, "loss": 0.5868, "step": 1109 }, { "epoch": 2.6702767749699157, "grad_norm": 0.537040114402771, "learning_rate": 5.000000000000002e-05, "loss": 0.6078, "step": 1110 }, { "epoch": 2.6726835138387486, "grad_norm": 0.5337335467338562, "learning_rate": 4.9835199878281765e-05, "loss": 0.552, "step": 1111 }, { "epoch": 2.675090252707581, "grad_norm": 0.5230998992919922, "learning_rate": 4.9670581613894094e-05, "loss": 0.6033, "step": 1112 }, { "epoch": 2.677496991576414, "grad_norm": 0.5692113041877747, "learning_rate": 4.950614580361076e-05, "loss": 0.6846, "step": 1113 }, { "epoch": 2.679903730445247, "grad_norm": 0.5505898594856262, "learning_rate": 4.9341893043544185e-05, "loss": 0.6155, "step": 1114 }, { "epoch": 2.6823104693140793, "grad_norm": 0.5415592193603516, "learning_rate": 4.9177823929143106e-05, "loss": 0.6106, "step": 1115 }, { "epoch": 2.6847172081829123, "grad_norm": 0.5600382089614868, "learning_rate": 4.901393905519055e-05, "loss": 0.6202, "step": 1116 }, { "epoch": 2.6871239470517447, "grad_norm": 0.5817874670028687, "learning_rate": 4.8850239015801625e-05, "loss": 0.6178, "step": 1117 }, { "epoch": 2.6895306859205776, "grad_norm": 0.5410627126693726, "learning_rate": 4.868672440442134e-05, "loss": 0.6449, "step": 1118 }, { "epoch": 2.69193742478941, "grad_norm": 0.531937837600708, "learning_rate": 4.852339581382258e-05, "loss": 0.6247, "step": 1119 }, { "epoch": 2.694344163658243, "grad_norm": 0.5456275343894958, "learning_rate": 4.836025383610382e-05, "loss": 0.6258, "step": 1120 }, { "epoch": 2.696750902527076, "grad_norm": 0.5466488599777222, "learning_rate": 4.8197299062686995e-05, "loss": 0.596, "step": 1121 }, { "epoch": 2.6991576413959084, "grad_norm": 0.559273362159729, "learning_rate": 4.80345320843154e-05, "loss": 0.6777, "step": 1122 }, { "epoch": 2.7015643802647413, "grad_norm": 0.9880828261375427, "learning_rate": 4.787195349105159e-05, "loss": 0.6002, "step": 1123 }, { "epoch": 2.7039711191335742, "grad_norm": 0.5400123000144958, "learning_rate": 4.770956387227515e-05, "loss": 0.5942, "step": 1124 }, { "epoch": 2.7063778580024067, "grad_norm": 0.51639723777771, "learning_rate": 4.754736381668057e-05, "loss": 0.5553, "step": 1125 }, { "epoch": 2.7087845968712396, "grad_norm": 0.497186541557312, "learning_rate": 4.7385353912275165e-05, "loss": 0.5294, "step": 1126 }, { "epoch": 2.711191335740072, "grad_norm": 0.5120139122009277, "learning_rate": 4.7223534746376884e-05, "loss": 0.5674, "step": 1127 }, { "epoch": 2.713598074608905, "grad_norm": 0.5414555072784424, "learning_rate": 4.706190690561228e-05, "loss": 0.6599, "step": 1128 }, { "epoch": 2.7160048134777375, "grad_norm": 0.5490611791610718, "learning_rate": 4.690047097591427e-05, "loss": 0.5729, "step": 1129 }, { "epoch": 2.7184115523465704, "grad_norm": 0.5207080841064453, "learning_rate": 4.673922754252002e-05, "loss": 0.601, "step": 1130 }, { "epoch": 2.7208182912154033, "grad_norm": 0.5412114262580872, "learning_rate": 4.657817718996891e-05, "loss": 0.6061, "step": 1131 }, { "epoch": 2.7232250300842358, "grad_norm": 0.5910362601280212, "learning_rate": 4.6417320502100316e-05, "loss": 0.6712, "step": 1132 }, { "epoch": 2.7256317689530687, "grad_norm": 0.5826935172080994, "learning_rate": 4.625665806205164e-05, "loss": 0.6463, "step": 1133 }, { "epoch": 2.728038507821901, "grad_norm": 0.5470381379127502, "learning_rate": 4.609619045225604e-05, "loss": 0.5813, "step": 1134 }, { "epoch": 2.730445246690734, "grad_norm": 0.526949405670166, "learning_rate": 4.593591825444028e-05, "loss": 0.5555, "step": 1135 }, { "epoch": 2.7328519855595665, "grad_norm": 0.5491508841514587, "learning_rate": 4.5775842049622806e-05, "loss": 0.6348, "step": 1136 }, { "epoch": 2.7352587244283995, "grad_norm": 0.6132414937019348, "learning_rate": 4.5615962418111625e-05, "loss": 0.633, "step": 1137 }, { "epoch": 2.7376654632972324, "grad_norm": 0.5601387023925781, "learning_rate": 4.545627993950201e-05, "loss": 0.6512, "step": 1138 }, { "epoch": 2.740072202166065, "grad_norm": 0.5478011965751648, "learning_rate": 4.529679519267456e-05, "loss": 0.612, "step": 1139 }, { "epoch": 2.7424789410348978, "grad_norm": 0.5426222085952759, "learning_rate": 4.513750875579303e-05, "loss": 0.6469, "step": 1140 }, { "epoch": 2.7448856799037307, "grad_norm": 0.5752906799316406, "learning_rate": 4.497842120630229e-05, "loss": 0.6036, "step": 1141 }, { "epoch": 2.747292418772563, "grad_norm": 0.585828959941864, "learning_rate": 4.4819533120926236e-05, "loss": 0.6224, "step": 1142 }, { "epoch": 2.749699157641396, "grad_norm": 0.5365166068077087, "learning_rate": 4.46608450756656e-05, "loss": 0.5838, "step": 1143 }, { "epoch": 2.7521058965102285, "grad_norm": 0.5801700949668884, "learning_rate": 4.4502357645795976e-05, "loss": 0.6063, "step": 1144 }, { "epoch": 2.7521058965102285, "eval_loss": 0.5213890075683594, "eval_runtime": 2125.1462, "eval_samples_per_second": 1.793, "eval_steps_per_second": 0.896, "step": 1144 }, { "epoch": 2.7545126353790614, "grad_norm": 0.5485831499099731, "learning_rate": 4.434407140586565e-05, "loss": 0.6233, "step": 1145 }, { "epoch": 2.756919374247894, "grad_norm": 0.7163209915161133, "learning_rate": 4.4185986929693546e-05, "loss": 0.627, "step": 1146 }, { "epoch": 2.759326113116727, "grad_norm": 0.6097565293312073, "learning_rate": 4.402810479036725e-05, "loss": 0.6262, "step": 1147 }, { "epoch": 2.7617328519855597, "grad_norm": 0.5748540163040161, "learning_rate": 4.387042556024074e-05, "loss": 0.6227, "step": 1148 }, { "epoch": 2.764139590854392, "grad_norm": 0.5551732182502747, "learning_rate": 4.371294981093244e-05, "loss": 0.6109, "step": 1149 }, { "epoch": 2.766546329723225, "grad_norm": 0.5600747466087341, "learning_rate": 4.355567811332311e-05, "loss": 0.6689, "step": 1150 }, { "epoch": 2.768953068592058, "grad_norm": 0.5633524060249329, "learning_rate": 4.339861103755374e-05, "loss": 0.5923, "step": 1151 }, { "epoch": 2.7713598074608905, "grad_norm": 0.5267031192779541, "learning_rate": 4.324174915302366e-05, "loss": 0.5884, "step": 1152 }, { "epoch": 2.773766546329723, "grad_norm": 0.5406028628349304, "learning_rate": 4.30850930283882e-05, "loss": 0.6762, "step": 1153 }, { "epoch": 2.776173285198556, "grad_norm": 0.5297737121582031, "learning_rate": 4.2928643231556844e-05, "loss": 0.5878, "step": 1154 }, { "epoch": 2.778580024067389, "grad_norm": 0.5439836382865906, "learning_rate": 4.277240032969105e-05, "loss": 0.6102, "step": 1155 }, { "epoch": 2.7809867629362213, "grad_norm": 1.7782272100448608, "learning_rate": 4.2616364889202254e-05, "loss": 0.6004, "step": 1156 }, { "epoch": 2.783393501805054, "grad_norm": 0.5534839034080505, "learning_rate": 4.2460537475749864e-05, "loss": 0.6114, "step": 1157 }, { "epoch": 2.785800240673887, "grad_norm": 0.5557342171669006, "learning_rate": 4.230491865423908e-05, "loss": 0.6168, "step": 1158 }, { "epoch": 2.7882069795427196, "grad_norm": 0.5569643974304199, "learning_rate": 4.214950898881892e-05, "loss": 0.6726, "step": 1159 }, { "epoch": 2.7906137184115525, "grad_norm": 0.5202840566635132, "learning_rate": 4.19943090428802e-05, "loss": 0.5638, "step": 1160 }, { "epoch": 2.793020457280385, "grad_norm": 0.5370293259620667, "learning_rate": 4.18393193790534e-05, "loss": 0.6123, "step": 1161 }, { "epoch": 2.795427196149218, "grad_norm": 0.5455282330513, "learning_rate": 4.168454055920681e-05, "loss": 0.6256, "step": 1162 }, { "epoch": 2.7978339350180503, "grad_norm": 0.5539038777351379, "learning_rate": 4.152997314444428e-05, "loss": 0.5864, "step": 1163 }, { "epoch": 2.8002406738868832, "grad_norm": 0.5746882557868958, "learning_rate": 4.137561769510322e-05, "loss": 0.6393, "step": 1164 }, { "epoch": 2.802647412755716, "grad_norm": 0.5234166383743286, "learning_rate": 4.12214747707527e-05, "loss": 0.601, "step": 1165 }, { "epoch": 2.8050541516245486, "grad_norm": 0.5199108719825745, "learning_rate": 4.106754493019138e-05, "loss": 0.5907, "step": 1166 }, { "epoch": 2.8074608904933815, "grad_norm": 0.5570451617240906, "learning_rate": 4.091382873144539e-05, "loss": 0.598, "step": 1167 }, { "epoch": 2.8098676293622145, "grad_norm": 0.5706470608711243, "learning_rate": 4.0760326731766374e-05, "loss": 0.647, "step": 1168 }, { "epoch": 2.812274368231047, "grad_norm": 0.5356807708740234, "learning_rate": 4.060703948762945e-05, "loss": 0.6199, "step": 1169 }, { "epoch": 2.8146811070998794, "grad_norm": 0.5090421438217163, "learning_rate": 4.045396755473121e-05, "loss": 0.551, "step": 1170 }, { "epoch": 2.8170878459687123, "grad_norm": 0.5151572823524475, "learning_rate": 4.030111148798775e-05, "loss": 0.5771, "step": 1171 }, { "epoch": 2.8194945848375452, "grad_norm": 0.5614504814147949, "learning_rate": 4.014847184153258e-05, "loss": 0.6461, "step": 1172 }, { "epoch": 2.8219013237063777, "grad_norm": 0.5628297924995422, "learning_rate": 3.9996049168714586e-05, "loss": 0.683, "step": 1173 }, { "epoch": 2.8243080625752106, "grad_norm": 0.5342167019844055, "learning_rate": 3.9843844022096135e-05, "loss": 0.5959, "step": 1174 }, { "epoch": 2.8267148014440435, "grad_norm": 0.5450764894485474, "learning_rate": 3.969185695345105e-05, "loss": 0.624, "step": 1175 }, { "epoch": 2.829121540312876, "grad_norm": 0.5570500493049622, "learning_rate": 3.954008851376252e-05, "loss": 0.6376, "step": 1176 }, { "epoch": 2.831528279181709, "grad_norm": 0.565272867679596, "learning_rate": 3.938853925322118e-05, "loss": 0.6591, "step": 1177 }, { "epoch": 2.8339350180505414, "grad_norm": 0.5241490602493286, "learning_rate": 3.923720972122311e-05, "loss": 0.5768, "step": 1178 }, { "epoch": 2.8363417569193743, "grad_norm": 0.5536178350448608, "learning_rate": 3.908610046636776e-05, "loss": 0.5905, "step": 1179 }, { "epoch": 2.8387484957882068, "grad_norm": 0.5152692794799805, "learning_rate": 3.893521203645618e-05, "loss": 0.5733, "step": 1180 }, { "epoch": 2.8411552346570397, "grad_norm": 0.5434930324554443, "learning_rate": 3.8784544978488756e-05, "loss": 0.6141, "step": 1181 }, { "epoch": 2.8435619735258726, "grad_norm": 0.49646705389022827, "learning_rate": 3.863409983866341e-05, "loss": 0.5261, "step": 1182 }, { "epoch": 2.845968712394705, "grad_norm": 0.54221111536026, "learning_rate": 3.848387716237353e-05, "loss": 0.6173, "step": 1183 }, { "epoch": 2.848375451263538, "grad_norm": 0.5244923233985901, "learning_rate": 3.833387749420603e-05, "loss": 0.6331, "step": 1184 }, { "epoch": 2.850782190132371, "grad_norm": 0.5556166768074036, "learning_rate": 3.8184101377939476e-05, "loss": 0.6757, "step": 1185 }, { "epoch": 2.8531889290012034, "grad_norm": 0.4855385422706604, "learning_rate": 3.8034549356541894e-05, "loss": 0.4839, "step": 1186 }, { "epoch": 2.855595667870036, "grad_norm": 0.8238577246665955, "learning_rate": 3.788522197216897e-05, "loss": 0.6403, "step": 1187 }, { "epoch": 2.8580024067388687, "grad_norm": 0.5301892161369324, "learning_rate": 3.773611976616203e-05, "loss": 0.5789, "step": 1188 }, { "epoch": 2.8604091456077017, "grad_norm": 0.5225841999053955, "learning_rate": 3.7587243279046056e-05, "loss": 0.584, "step": 1189 }, { "epoch": 2.862815884476534, "grad_norm": 0.5338698029518127, "learning_rate": 3.7438593050527845e-05, "loss": 0.589, "step": 1190 }, { "epoch": 2.865222623345367, "grad_norm": 0.5654701590538025, "learning_rate": 3.729016961949391e-05, "loss": 0.6859, "step": 1191 }, { "epoch": 2.8676293622142, "grad_norm": 0.5275447368621826, "learning_rate": 3.714197352400849e-05, "loss": 0.5972, "step": 1192 }, { "epoch": 2.8700361010830324, "grad_norm": 0.9144940376281738, "learning_rate": 3.6994005301311777e-05, "loss": 0.5421, "step": 1193 }, { "epoch": 2.8724428399518653, "grad_norm": 0.5549452900886536, "learning_rate": 3.684626548781792e-05, "loss": 0.6525, "step": 1194 }, { "epoch": 2.874849578820698, "grad_norm": 0.5169296860694885, "learning_rate": 3.669875461911297e-05, "loss": 0.572, "step": 1195 }, { "epoch": 2.8772563176895307, "grad_norm": 0.6043078303337097, "learning_rate": 3.6551473229953037e-05, "loss": 0.6208, "step": 1196 }, { "epoch": 2.879663056558363, "grad_norm": 0.5294334292411804, "learning_rate": 3.640442185426228e-05, "loss": 0.5969, "step": 1197 }, { "epoch": 2.882069795427196, "grad_norm": 0.5621278285980225, "learning_rate": 3.6257601025131026e-05, "loss": 0.6373, "step": 1198 }, { "epoch": 2.884476534296029, "grad_norm": 0.5122761130332947, "learning_rate": 3.611101127481392e-05, "loss": 0.5804, "step": 1199 }, { "epoch": 2.8868832731648615, "grad_norm": 0.5505006313323975, "learning_rate": 3.5964653134727776e-05, "loss": 0.6243, "step": 1200 }, { "epoch": 2.8892900120336944, "grad_norm": 0.5394690632820129, "learning_rate": 3.581852713544983e-05, "loss": 0.5897, "step": 1201 }, { "epoch": 2.8916967509025273, "grad_norm": 0.5276006460189819, "learning_rate": 3.567263380671576e-05, "loss": 0.5988, "step": 1202 }, { "epoch": 2.89410348977136, "grad_norm": 0.4876402020454407, "learning_rate": 3.552697367741772e-05, "loss": 0.5417, "step": 1203 }, { "epoch": 2.8965102286401927, "grad_norm": 0.5334110260009766, "learning_rate": 3.538154727560259e-05, "loss": 0.6384, "step": 1204 }, { "epoch": 2.898916967509025, "grad_norm": 0.5780425667762756, "learning_rate": 3.523635512846981e-05, "loss": 0.6644, "step": 1205 }, { "epoch": 2.901323706377858, "grad_norm": 0.5308248400688171, "learning_rate": 3.509139776236967e-05, "loss": 0.5558, "step": 1206 }, { "epoch": 2.9037304452466906, "grad_norm": 0.666962742805481, "learning_rate": 3.494667570280132e-05, "loss": 0.6482, "step": 1207 }, { "epoch": 2.9061371841155235, "grad_norm": 0.5265256762504578, "learning_rate": 3.480218947441083e-05, "loss": 0.6167, "step": 1208 }, { "epoch": 2.9085439229843564, "grad_norm": 0.543232798576355, "learning_rate": 3.465793960098945e-05, "loss": 0.6311, "step": 1209 }, { "epoch": 2.910950661853189, "grad_norm": 0.516883134841919, "learning_rate": 3.45139266054715e-05, "loss": 0.5705, "step": 1210 }, { "epoch": 2.9133574007220218, "grad_norm": 0.552045464515686, "learning_rate": 3.4370151009932584e-05, "loss": 0.646, "step": 1211 }, { "epoch": 2.9157641395908542, "grad_norm": 0.5413665175437927, "learning_rate": 3.4226613335587695e-05, "loss": 0.5887, "step": 1212 }, { "epoch": 2.918170878459687, "grad_norm": 0.5470811128616333, "learning_rate": 3.408331410278929e-05, "loss": 0.6445, "step": 1213 }, { "epoch": 2.9205776173285196, "grad_norm": 0.5725565552711487, "learning_rate": 3.394025383102552e-05, "loss": 0.6211, "step": 1214 }, { "epoch": 2.9229843561973525, "grad_norm": 0.5045185089111328, "learning_rate": 3.379743303891815e-05, "loss": 0.5571, "step": 1215 }, { "epoch": 2.9253910950661854, "grad_norm": 0.5587323904037476, "learning_rate": 3.3654852244220826e-05, "loss": 0.6273, "step": 1216 }, { "epoch": 2.927797833935018, "grad_norm": 0.5473192930221558, "learning_rate": 3.351251196381716e-05, "loss": 0.6418, "step": 1217 }, { "epoch": 2.930204572803851, "grad_norm": 0.5763916373252869, "learning_rate": 3.33704127137188e-05, "loss": 0.706, "step": 1218 }, { "epoch": 2.9326113116726837, "grad_norm": 0.5404964089393616, "learning_rate": 3.322855500906373e-05, "loss": 0.6578, "step": 1219 }, { "epoch": 2.935018050541516, "grad_norm": 0.5233117938041687, "learning_rate": 3.308693936411421e-05, "loss": 0.5865, "step": 1220 }, { "epoch": 2.937424789410349, "grad_norm": 0.5532284379005432, "learning_rate": 3.294556629225488e-05, "loss": 0.6864, "step": 1221 }, { "epoch": 2.9398315282791816, "grad_norm": 0.5246838331222534, "learning_rate": 3.2804436305991214e-05, "loss": 0.5949, "step": 1222 }, { "epoch": 2.9422382671480145, "grad_norm": 0.5318245887756348, "learning_rate": 3.266354991694732e-05, "loss": 0.6074, "step": 1223 }, { "epoch": 2.944645006016847, "grad_norm": 0.5444689989089966, "learning_rate": 3.2522907635864244e-05, "loss": 0.5912, "step": 1224 }, { "epoch": 2.94705174488568, "grad_norm": 0.525635302066803, "learning_rate": 3.238250997259808e-05, "loss": 0.5903, "step": 1225 }, { "epoch": 2.949458483754513, "grad_norm": 0.5306883454322815, "learning_rate": 3.224235743611814e-05, "loss": 0.6351, "step": 1226 }, { "epoch": 2.9518652226233453, "grad_norm": 0.5118337273597717, "learning_rate": 3.210245053450517e-05, "loss": 0.5544, "step": 1227 }, { "epoch": 2.954271961492178, "grad_norm": 0.5161775350570679, "learning_rate": 3.196278977494934e-05, "loss": 0.5581, "step": 1228 }, { "epoch": 2.956678700361011, "grad_norm": 0.5330590605735779, "learning_rate": 3.182337566374856e-05, "loss": 0.6247, "step": 1229 }, { "epoch": 2.9590854392298436, "grad_norm": 0.5316399931907654, "learning_rate": 3.1684208706306574e-05, "loss": 0.6196, "step": 1230 }, { "epoch": 2.961492178098676, "grad_norm": 0.51682448387146, "learning_rate": 3.154528940713113e-05, "loss": 0.6089, "step": 1231 }, { "epoch": 2.963898916967509, "grad_norm": 0.5348968505859375, "learning_rate": 3.140661826983223e-05, "loss": 0.6433, "step": 1232 }, { "epoch": 2.966305655836342, "grad_norm": 0.48855385184288025, "learning_rate": 3.1268195797120195e-05, "loss": 0.495, "step": 1233 }, { "epoch": 2.9687123947051743, "grad_norm": 0.5560404658317566, "learning_rate": 3.113002249080386e-05, "loss": 0.6094, "step": 1234 }, { "epoch": 2.9711191335740073, "grad_norm": 0.49993187189102173, "learning_rate": 3.099209885178882e-05, "loss": 0.6169, "step": 1235 }, { "epoch": 2.97352587244284, "grad_norm": 0.4938056468963623, "learning_rate": 3.0854425380075544e-05, "loss": 0.5725, "step": 1236 }, { "epoch": 2.9759326113116726, "grad_norm": 0.5062902569770813, "learning_rate": 3.071700257475768e-05, "loss": 0.5332, "step": 1237 }, { "epoch": 2.9783393501805056, "grad_norm": 0.5624229311943054, "learning_rate": 3.0579830934020057e-05, "loss": 0.6275, "step": 1238 }, { "epoch": 2.980746089049338, "grad_norm": 0.49906042218208313, "learning_rate": 3.044291095513705e-05, "loss": 0.5961, "step": 1239 }, { "epoch": 2.983152827918171, "grad_norm": 0.5100592970848083, "learning_rate": 3.030624313447067e-05, "loss": 0.6014, "step": 1240 }, { "epoch": 2.9855595667870034, "grad_norm": 0.5456565618515015, "learning_rate": 3.016982796746879e-05, "loss": 0.6271, "step": 1241 }, { "epoch": 2.9879663056558363, "grad_norm": 0.48046863079071045, "learning_rate": 3.0033665948663448e-05, "loss": 0.5534, "step": 1242 }, { "epoch": 2.9903730445246692, "grad_norm": 0.6164072751998901, "learning_rate": 2.9897757571668905e-05, "loss": 0.6274, "step": 1243 }, { "epoch": 2.9927797833935017, "grad_norm": 0.5391572713851929, "learning_rate": 2.9762103329179913e-05, "loss": 0.6138, "step": 1244 }, { "epoch": 2.9951865222623346, "grad_norm": 0.5705344080924988, "learning_rate": 2.962670371296996e-05, "loss": 0.5744, "step": 1245 }, { "epoch": 2.9975932611311675, "grad_norm": 0.5595329403877258, "learning_rate": 2.949155921388943e-05, "loss": 0.5954, "step": 1246 }, { "epoch": 3.0, "grad_norm": 0.6322901844978333, "learning_rate": 2.9356670321863942e-05, "loss": 0.6958, "step": 1247 }, { "epoch": 3.0018050541516246, "grad_norm": 0.5690078735351562, "learning_rate": 2.922203752589243e-05, "loss": 0.6265, "step": 1248 }, { "epoch": 3.0018050541516246, "eval_loss": 0.517622709274292, "eval_runtime": 2128.6747, "eval_samples_per_second": 1.79, "eval_steps_per_second": 0.895, "step": 1248 }, { "epoch": 3.0042117930204575, "grad_norm": 0.5014199018478394, "learning_rate": 2.9087661314045366e-05, "loss": 0.51, "step": 1249 }, { "epoch": 3.00661853188929, "grad_norm": 0.5085039138793945, "learning_rate": 2.8953542173463133e-05, "loss": 0.6033, "step": 1250 }, { "epoch": 3.009025270758123, "grad_norm": 0.5191147923469543, "learning_rate": 2.8819680590354202e-05, "loss": 0.5635, "step": 1251 }, { "epoch": 3.0114320096269553, "grad_norm": 0.5107114911079407, "learning_rate": 2.8686077049993287e-05, "loss": 0.5361, "step": 1252 }, { "epoch": 3.0138387484957883, "grad_norm": 0.47617989778518677, "learning_rate": 2.8552732036719687e-05, "loss": 0.4945, "step": 1253 }, { "epoch": 3.0162454873646207, "grad_norm": 0.46349620819091797, "learning_rate": 2.8419646033935444e-05, "loss": 0.4774, "step": 1254 }, { "epoch": 3.0186522262334536, "grad_norm": 0.4879171848297119, "learning_rate": 2.828681952410366e-05, "loss": 0.5102, "step": 1255 }, { "epoch": 3.0210589651022866, "grad_norm": 0.5078036189079285, "learning_rate": 2.8154252988746755e-05, "loss": 0.5438, "step": 1256 }, { "epoch": 3.023465703971119, "grad_norm": 0.5153116583824158, "learning_rate": 2.802194690844465e-05, "loss": 0.5359, "step": 1257 }, { "epoch": 3.025872442839952, "grad_norm": 0.4846814274787903, "learning_rate": 2.7889901762833083e-05, "loss": 0.5175, "step": 1258 }, { "epoch": 3.0282791817087844, "grad_norm": 0.503777801990509, "learning_rate": 2.775811803060183e-05, "loss": 0.5265, "step": 1259 }, { "epoch": 3.0306859205776173, "grad_norm": 0.5214651823043823, "learning_rate": 2.7626596189492983e-05, "loss": 0.5156, "step": 1260 }, { "epoch": 3.0330926594464502, "grad_norm": 0.5059404373168945, "learning_rate": 2.7495336716299313e-05, "loss": 0.5103, "step": 1261 }, { "epoch": 3.0354993983152827, "grad_norm": 0.5167142748832703, "learning_rate": 2.736434008686235e-05, "loss": 0.5562, "step": 1262 }, { "epoch": 3.0379061371841156, "grad_norm": 0.518477201461792, "learning_rate": 2.723360677607083e-05, "loss": 0.543, "step": 1263 }, { "epoch": 3.040312876052948, "grad_norm": 0.5129715204238892, "learning_rate": 2.7103137257858868e-05, "loss": 0.5413, "step": 1264 }, { "epoch": 3.042719614921781, "grad_norm": 0.5404799580574036, "learning_rate": 2.6972932005204267e-05, "loss": 0.5634, "step": 1265 }, { "epoch": 3.045126353790614, "grad_norm": 0.5333388447761536, "learning_rate": 2.68429914901269e-05, "loss": 0.5718, "step": 1266 }, { "epoch": 3.0475330926594464, "grad_norm": 0.5307161808013916, "learning_rate": 2.671331618368682e-05, "loss": 0.5902, "step": 1267 }, { "epoch": 3.0499398315282793, "grad_norm": 0.5273099541664124, "learning_rate": 2.6583906555982697e-05, "loss": 0.5345, "step": 1268 }, { "epoch": 3.0523465703971118, "grad_norm": 0.510177731513977, "learning_rate": 2.6454763076150046e-05, "loss": 0.5206, "step": 1269 }, { "epoch": 3.0547533092659447, "grad_norm": 0.5169177651405334, "learning_rate": 2.6325886212359498e-05, "loss": 0.5226, "step": 1270 }, { "epoch": 3.0571600481347776, "grad_norm": 0.5109610557556152, "learning_rate": 2.6197276431815277e-05, "loss": 0.5471, "step": 1271 }, { "epoch": 3.05956678700361, "grad_norm": 0.5054513812065125, "learning_rate": 2.606893420075325e-05, "loss": 0.5604, "step": 1272 }, { "epoch": 3.061973525872443, "grad_norm": 0.5295777320861816, "learning_rate": 2.5940859984439424e-05, "loss": 0.543, "step": 1273 }, { "epoch": 3.0643802647412755, "grad_norm": 0.5169811248779297, "learning_rate": 2.5813054247168167e-05, "loss": 0.5219, "step": 1274 }, { "epoch": 3.0667870036101084, "grad_norm": 0.5088948607444763, "learning_rate": 2.5685517452260567e-05, "loss": 0.5119, "step": 1275 }, { "epoch": 3.069193742478941, "grad_norm": 0.5349400043487549, "learning_rate": 2.5558250062062828e-05, "loss": 0.5498, "step": 1276 }, { "epoch": 3.0716004813477737, "grad_norm": 0.5120639204978943, "learning_rate": 2.543125253794434e-05, "loss": 0.5008, "step": 1277 }, { "epoch": 3.0740072202166067, "grad_norm": 0.5499658584594727, "learning_rate": 2.530452534029627e-05, "loss": 0.5638, "step": 1278 }, { "epoch": 3.076413959085439, "grad_norm": 0.5711025595664978, "learning_rate": 2.5178068928529864e-05, "loss": 0.5906, "step": 1279 }, { "epoch": 3.078820697954272, "grad_norm": 0.5479505062103271, "learning_rate": 2.5051883761074614e-05, "loss": 0.546, "step": 1280 }, { "epoch": 3.0812274368231045, "grad_norm": 0.5041561722755432, "learning_rate": 2.4925970295376722e-05, "loss": 0.5099, "step": 1281 }, { "epoch": 3.0836341756919374, "grad_norm": 0.5195086598396301, "learning_rate": 2.4800328987897427e-05, "loss": 0.5489, "step": 1282 }, { "epoch": 3.0860409145607703, "grad_norm": 0.5311887264251709, "learning_rate": 2.4674960294111314e-05, "loss": 0.5588, "step": 1283 }, { "epoch": 3.088447653429603, "grad_norm": 0.5344866514205933, "learning_rate": 2.4549864668504774e-05, "loss": 0.5618, "step": 1284 }, { "epoch": 3.0908543922984357, "grad_norm": 0.5275121927261353, "learning_rate": 2.4425042564574184e-05, "loss": 0.5324, "step": 1285 }, { "epoch": 3.093261131167268, "grad_norm": 0.5238671898841858, "learning_rate": 2.4300494434824373e-05, "loss": 0.565, "step": 1286 }, { "epoch": 3.095667870036101, "grad_norm": 0.5070421099662781, "learning_rate": 2.4176220730766974e-05, "loss": 0.4853, "step": 1287 }, { "epoch": 3.098074608904934, "grad_norm": 0.5159258842468262, "learning_rate": 2.4052221902918725e-05, "loss": 0.4916, "step": 1288 }, { "epoch": 3.1004813477737665, "grad_norm": 0.5102206468582153, "learning_rate": 2.3928498400799993e-05, "loss": 0.5324, "step": 1289 }, { "epoch": 3.1028880866425994, "grad_norm": 0.49183228611946106, "learning_rate": 2.3805050672932928e-05, "loss": 0.472, "step": 1290 }, { "epoch": 3.105294825511432, "grad_norm": 0.5284923911094666, "learning_rate": 2.368187916683997e-05, "loss": 0.5608, "step": 1291 }, { "epoch": 3.107701564380265, "grad_norm": 0.4916832149028778, "learning_rate": 2.3558984329042233e-05, "loss": 0.4823, "step": 1292 }, { "epoch": 3.1101083032490973, "grad_norm": 0.5477405190467834, "learning_rate": 2.343636660505779e-05, "loss": 0.5921, "step": 1293 }, { "epoch": 3.11251504211793, "grad_norm": 0.6153537034988403, "learning_rate": 2.3314026439400217e-05, "loss": 0.539, "step": 1294 }, { "epoch": 3.114921780986763, "grad_norm": 0.5392540097236633, "learning_rate": 2.3191964275576805e-05, "loss": 0.5247, "step": 1295 }, { "epoch": 3.1173285198555956, "grad_norm": 0.5372434258460999, "learning_rate": 2.3070180556087074e-05, "loss": 0.5756, "step": 1296 }, { "epoch": 3.1197352587244285, "grad_norm": 0.518665611743927, "learning_rate": 2.2948675722421086e-05, "loss": 0.5249, "step": 1297 }, { "epoch": 3.122141997593261, "grad_norm": 0.5433489084243774, "learning_rate": 2.282745021505791e-05, "loss": 0.5499, "step": 1298 }, { "epoch": 3.124548736462094, "grad_norm": 0.5481027364730835, "learning_rate": 2.2706504473464063e-05, "loss": 0.5792, "step": 1299 }, { "epoch": 3.1269554753309268, "grad_norm": 0.5179407596588135, "learning_rate": 2.2585838936091754e-05, "loss": 0.535, "step": 1300 }, { "epoch": 3.1293622141997592, "grad_norm": 0.5445900559425354, "learning_rate": 2.2465454040377455e-05, "loss": 0.5784, "step": 1301 }, { "epoch": 3.131768953068592, "grad_norm": 0.5221489071846008, "learning_rate": 2.2345350222740247e-05, "loss": 0.5246, "step": 1302 }, { "epoch": 3.1341756919374246, "grad_norm": 0.5492375493049622, "learning_rate": 2.2225527918580204e-05, "loss": 0.5912, "step": 1303 }, { "epoch": 3.1365824308062575, "grad_norm": 0.5303949117660522, "learning_rate": 2.2105987562276953e-05, "loss": 0.5415, "step": 1304 }, { "epoch": 3.1389891696750905, "grad_norm": 0.5176538825035095, "learning_rate": 2.198672958718796e-05, "loss": 0.5481, "step": 1305 }, { "epoch": 3.141395908543923, "grad_norm": 0.5144507884979248, "learning_rate": 2.1867754425646926e-05, "loss": 0.4955, "step": 1306 }, { "epoch": 3.143802647412756, "grad_norm": 0.5130107998847961, "learning_rate": 2.174906250896237e-05, "loss": 0.5164, "step": 1307 }, { "epoch": 3.1462093862815883, "grad_norm": 0.4807518422603607, "learning_rate": 2.163065426741603e-05, "loss": 0.466, "step": 1308 }, { "epoch": 3.148616125150421, "grad_norm": 0.5608721971511841, "learning_rate": 2.151253013026121e-05, "loss": 0.5847, "step": 1309 }, { "epoch": 3.1510228640192537, "grad_norm": 0.5231450200080872, "learning_rate": 2.139469052572127e-05, "loss": 0.4905, "step": 1310 }, { "epoch": 3.1534296028880866, "grad_norm": 0.5513438582420349, "learning_rate": 2.127713588098812e-05, "loss": 0.5504, "step": 1311 }, { "epoch": 3.1558363417569195, "grad_norm": 0.5849348902702332, "learning_rate": 2.115986662222058e-05, "loss": 0.5734, "step": 1312 }, { "epoch": 3.158243080625752, "grad_norm": 0.546474039554596, "learning_rate": 2.1042883174542992e-05, "loss": 0.5304, "step": 1313 }, { "epoch": 3.160649819494585, "grad_norm": 0.4997680187225342, "learning_rate": 2.0926185962043466e-05, "loss": 0.542, "step": 1314 }, { "epoch": 3.1630565583634174, "grad_norm": 0.5557947754859924, "learning_rate": 2.0809775407772503e-05, "loss": 0.5846, "step": 1315 }, { "epoch": 3.1654632972322503, "grad_norm": 0.5380398631095886, "learning_rate": 2.069365193374142e-05, "loss": 0.5448, "step": 1316 }, { "epoch": 3.167870036101083, "grad_norm": 0.5411592721939087, "learning_rate": 2.0577815960920754e-05, "loss": 0.5591, "step": 1317 }, { "epoch": 3.1702767749699157, "grad_norm": 0.5361132025718689, "learning_rate": 2.0462267909238896e-05, "loss": 0.5473, "step": 1318 }, { "epoch": 3.1726835138387486, "grad_norm": 0.5358035564422607, "learning_rate": 2.0347008197580374e-05, "loss": 0.571, "step": 1319 }, { "epoch": 3.175090252707581, "grad_norm": 0.554479718208313, "learning_rate": 2.0232037243784475e-05, "loss": 0.6003, "step": 1320 }, { "epoch": 3.177496991576414, "grad_norm": 0.5590445399284363, "learning_rate": 2.011735546464365e-05, "loss": 0.5673, "step": 1321 }, { "epoch": 3.179903730445247, "grad_norm": 0.535627007484436, "learning_rate": 2.000296327590202e-05, "loss": 0.5016, "step": 1322 }, { "epoch": 3.1823104693140793, "grad_norm": 0.5440089106559753, "learning_rate": 1.9888861092253975e-05, "loss": 0.5294, "step": 1323 }, { "epoch": 3.1847172081829123, "grad_norm": 0.517289400100708, "learning_rate": 1.9775049327342486e-05, "loss": 0.5182, "step": 1324 }, { "epoch": 3.1871239470517447, "grad_norm": 0.5191426277160645, "learning_rate": 1.9661528393757744e-05, "loss": 0.5095, "step": 1325 }, { "epoch": 3.1895306859205776, "grad_norm": 0.5212904810905457, "learning_rate": 1.954829870303555e-05, "loss": 0.5014, "step": 1326 }, { "epoch": 3.19193742478941, "grad_norm": 0.5149190425872803, "learning_rate": 1.943536066565603e-05, "loss": 0.5599, "step": 1327 }, { "epoch": 3.194344163658243, "grad_norm": 0.5006343126296997, "learning_rate": 1.9322714691041878e-05, "loss": 0.506, "step": 1328 }, { "epoch": 3.196750902527076, "grad_norm": 0.5566385984420776, "learning_rate": 1.9210361187557057e-05, "loss": 0.5321, "step": 1329 }, { "epoch": 3.1991576413959084, "grad_norm": 0.5385393500328064, "learning_rate": 1.9098300562505266e-05, "loss": 0.5514, "step": 1330 }, { "epoch": 3.2015643802647413, "grad_norm": 0.49694156646728516, "learning_rate": 1.8986533222128413e-05, "loss": 0.5038, "step": 1331 }, { "epoch": 3.2039711191335742, "grad_norm": 0.5700713992118835, "learning_rate": 1.8875059571605293e-05, "loss": 0.5821, "step": 1332 }, { "epoch": 3.2063778580024067, "grad_norm": 0.5090578198432922, "learning_rate": 1.876388001504995e-05, "loss": 0.5227, "step": 1333 }, { "epoch": 3.2087845968712396, "grad_norm": 0.5069187879562378, "learning_rate": 1.8652994955510227e-05, "loss": 0.5024, "step": 1334 }, { "epoch": 3.211191335740072, "grad_norm": 0.5275895595550537, "learning_rate": 1.854240479496643e-05, "loss": 0.5204, "step": 1335 }, { "epoch": 3.213598074608905, "grad_norm": 0.5007919669151306, "learning_rate": 1.8432109934329834e-05, "loss": 0.4634, "step": 1336 }, { "epoch": 3.2160048134777375, "grad_norm": 0.5285254716873169, "learning_rate": 1.832211077344109e-05, "loss": 0.5278, "step": 1337 }, { "epoch": 3.2184115523465704, "grad_norm": 0.5296740531921387, "learning_rate": 1.8212407711068958e-05, "loss": 0.5664, "step": 1338 }, { "epoch": 3.2208182912154033, "grad_norm": 0.5545907020568848, "learning_rate": 1.810300114490875e-05, "loss": 0.5639, "step": 1339 }, { "epoch": 3.2232250300842358, "grad_norm": 0.5125924944877625, "learning_rate": 1.7993891471580893e-05, "loss": 0.4853, "step": 1340 }, { "epoch": 3.2256317689530687, "grad_norm": 0.5552356839179993, "learning_rate": 1.78850790866296e-05, "loss": 0.552, "step": 1341 }, { "epoch": 3.228038507821901, "grad_norm": 0.501331090927124, "learning_rate": 1.777656438452129e-05, "loss": 0.5068, "step": 1342 }, { "epoch": 3.230445246690734, "grad_norm": 0.5555436015129089, "learning_rate": 1.7668347758643233e-05, "loss": 0.599, "step": 1343 }, { "epoch": 3.232851985559567, "grad_norm": 0.5258322358131409, "learning_rate": 1.756042960130212e-05, "loss": 0.4981, "step": 1344 }, { "epoch": 3.2352587244283995, "grad_norm": 0.5356730818748474, "learning_rate": 1.74528103037226e-05, "loss": 0.5556, "step": 1345 }, { "epoch": 3.2376654632972324, "grad_norm": 0.5419634580612183, "learning_rate": 1.7345490256045993e-05, "loss": 0.5583, "step": 1346 }, { "epoch": 3.240072202166065, "grad_norm": 0.5094894766807556, "learning_rate": 1.7238469847328697e-05, "loss": 0.5455, "step": 1347 }, { "epoch": 3.2424789410348978, "grad_norm": 0.5496340990066528, "learning_rate": 1.713174946554086e-05, "loss": 0.5761, "step": 1348 }, { "epoch": 3.2448856799037307, "grad_norm": 0.5420581698417664, "learning_rate": 1.702532949756499e-05, "loss": 0.5248, "step": 1349 }, { "epoch": 3.247292418772563, "grad_norm": 0.7173995971679688, "learning_rate": 1.6919210329194533e-05, "loss": 0.5724, "step": 1350 }, { "epoch": 3.249699157641396, "grad_norm": 0.5376344919204712, "learning_rate": 1.6813392345132518e-05, "loss": 0.5557, "step": 1351 }, { "epoch": 3.2521058965102285, "grad_norm": 0.5363267660140991, "learning_rate": 1.6707875928990058e-05, "loss": 0.5423, "step": 1352 }, { "epoch": 3.2521058965102285, "eval_loss": 0.5366878509521484, "eval_runtime": 2129.2139, "eval_samples_per_second": 1.789, "eval_steps_per_second": 0.895, "step": 1352 }, { "epoch": 3.2545126353790614, "grad_norm": 0.5646700859069824, "learning_rate": 1.660266146328504e-05, "loss": 0.5616, "step": 1353 }, { "epoch": 3.256919374247894, "grad_norm": 0.547985851764679, "learning_rate": 1.649774932944075e-05, "loss": 0.5512, "step": 1354 }, { "epoch": 3.259326113116727, "grad_norm": 0.5682858824729919, "learning_rate": 1.6393139907784404e-05, "loss": 0.5456, "step": 1355 }, { "epoch": 3.2617328519855597, "grad_norm": 0.5054064989089966, "learning_rate": 1.6288833577545914e-05, "loss": 0.5334, "step": 1356 }, { "epoch": 3.264139590854392, "grad_norm": 0.5089443325996399, "learning_rate": 1.6184830716856347e-05, "loss": 0.5617, "step": 1357 }, { "epoch": 3.266546329723225, "grad_norm": 0.5328949093818665, "learning_rate": 1.6081131702746667e-05, "loss": 0.539, "step": 1358 }, { "epoch": 3.2689530685920576, "grad_norm": 0.5444447994232178, "learning_rate": 1.5977736911146324e-05, "loss": 0.5137, "step": 1359 }, { "epoch": 3.2713598074608905, "grad_norm": 0.5105332732200623, "learning_rate": 1.587464671688187e-05, "loss": 0.5287, "step": 1360 }, { "epoch": 3.2737665463297234, "grad_norm": 0.521909773349762, "learning_rate": 1.5771861493675732e-05, "loss": 0.5273, "step": 1361 }, { "epoch": 3.276173285198556, "grad_norm": 0.5191594958305359, "learning_rate": 1.5669381614144685e-05, "loss": 0.5383, "step": 1362 }, { "epoch": 3.278580024067389, "grad_norm": 0.5011019110679626, "learning_rate": 1.5567207449798515e-05, "loss": 0.4968, "step": 1363 }, { "epoch": 3.2809867629362213, "grad_norm": 0.509896457195282, "learning_rate": 1.546533937103881e-05, "loss": 0.5228, "step": 1364 }, { "epoch": 3.283393501805054, "grad_norm": 0.5439499616622925, "learning_rate": 1.5363777747157572e-05, "loss": 0.5381, "step": 1365 }, { "epoch": 3.285800240673887, "grad_norm": 0.5635488033294678, "learning_rate": 1.5262522946335755e-05, "loss": 0.5761, "step": 1366 }, { "epoch": 3.2882069795427196, "grad_norm": 0.5016472339630127, "learning_rate": 1.5161575335642064e-05, "loss": 0.4863, "step": 1367 }, { "epoch": 3.2906137184115525, "grad_norm": 0.5103170871734619, "learning_rate": 1.5060935281031563e-05, "loss": 0.5196, "step": 1368 }, { "epoch": 3.293020457280385, "grad_norm": 0.550857424736023, "learning_rate": 1.4960603147344343e-05, "loss": 0.5974, "step": 1369 }, { "epoch": 3.295427196149218, "grad_norm": 0.6426451206207275, "learning_rate": 1.4860579298304312e-05, "loss": 0.5279, "step": 1370 }, { "epoch": 3.2978339350180503, "grad_norm": 0.5445566773414612, "learning_rate": 1.4760864096517701e-05, "loss": 0.5283, "step": 1371 }, { "epoch": 3.3002406738868832, "grad_norm": 0.5562369227409363, "learning_rate": 1.466145790347183e-05, "loss": 0.5231, "step": 1372 }, { "epoch": 3.302647412755716, "grad_norm": 0.5637094378471375, "learning_rate": 1.4562361079533849e-05, "loss": 0.5798, "step": 1373 }, { "epoch": 3.3050541516245486, "grad_norm": 0.5442596673965454, "learning_rate": 1.4463573983949341e-05, "loss": 0.5278, "step": 1374 }, { "epoch": 3.3074608904933815, "grad_norm": 0.5335230827331543, "learning_rate": 1.4365096974841108e-05, "loss": 0.5579, "step": 1375 }, { "epoch": 3.309867629362214, "grad_norm": 0.5049663782119751, "learning_rate": 1.4266930409207791e-05, "loss": 0.4992, "step": 1376 }, { "epoch": 3.312274368231047, "grad_norm": 0.5578943490982056, "learning_rate": 1.416907464292262e-05, "loss": 0.5485, "step": 1377 }, { "epoch": 3.31468110709988, "grad_norm": 0.5703057646751404, "learning_rate": 1.4071530030732095e-05, "loss": 0.4996, "step": 1378 }, { "epoch": 3.3170878459687123, "grad_norm": 0.5284083485603333, "learning_rate": 1.3974296926254794e-05, "loss": 0.5158, "step": 1379 }, { "epoch": 3.3194945848375452, "grad_norm": 0.5459629893302917, "learning_rate": 1.3877375681979943e-05, "loss": 0.5801, "step": 1380 }, { "epoch": 3.3219013237063777, "grad_norm": 0.543509304523468, "learning_rate": 1.3780766649266242e-05, "loss": 0.5626, "step": 1381 }, { "epoch": 3.3243080625752106, "grad_norm": 0.4977230131626129, "learning_rate": 1.3684470178340548e-05, "loss": 0.4749, "step": 1382 }, { "epoch": 3.3267148014440435, "grad_norm": 0.5060301423072815, "learning_rate": 1.3588486618296615e-05, "loss": 0.5244, "step": 1383 }, { "epoch": 3.329121540312876, "grad_norm": 0.5263816714286804, "learning_rate": 1.3492816317093893e-05, "loss": 0.5209, "step": 1384 }, { "epoch": 3.331528279181709, "grad_norm": 0.49335339665412903, "learning_rate": 1.339745962155613e-05, "loss": 0.5134, "step": 1385 }, { "epoch": 3.3339350180505414, "grad_norm": 0.4969792366027832, "learning_rate": 1.3302416877370239e-05, "loss": 0.4631, "step": 1386 }, { "epoch": 3.3363417569193743, "grad_norm": 0.5222917795181274, "learning_rate": 1.3207688429084974e-05, "loss": 0.5592, "step": 1387 }, { "epoch": 3.3387484957882068, "grad_norm": 0.5176815986633301, "learning_rate": 1.3113274620109684e-05, "loss": 0.5199, "step": 1388 }, { "epoch": 3.3411552346570397, "grad_norm": 0.48987865447998047, "learning_rate": 1.3019175792713167e-05, "loss": 0.4956, "step": 1389 }, { "epoch": 3.3435619735258726, "grad_norm": 0.51878821849823, "learning_rate": 1.2925392288022298e-05, "loss": 0.5382, "step": 1390 }, { "epoch": 3.345968712394705, "grad_norm": 0.4918244779109955, "learning_rate": 1.2831924446020805e-05, "loss": 0.4887, "step": 1391 }, { "epoch": 3.348375451263538, "grad_norm": 0.5739325881004333, "learning_rate": 1.27387726055481e-05, "loss": 0.5824, "step": 1392 }, { "epoch": 3.350782190132371, "grad_norm": 0.5434862375259399, "learning_rate": 1.2645937104298111e-05, "loss": 0.5407, "step": 1393 }, { "epoch": 3.3531889290012034, "grad_norm": 0.5267785787582397, "learning_rate": 1.2553418278817874e-05, "loss": 0.5572, "step": 1394 }, { "epoch": 3.3555956678700363, "grad_norm": 0.5286926627159119, "learning_rate": 1.2461216464506454e-05, "loss": 0.5797, "step": 1395 }, { "epoch": 3.3580024067388687, "grad_norm": 0.5185725688934326, "learning_rate": 1.2369331995613665e-05, "loss": 0.5196, "step": 1396 }, { "epoch": 3.3604091456077017, "grad_norm": 0.5312631726264954, "learning_rate": 1.2277765205238879e-05, "loss": 0.5252, "step": 1397 }, { "epoch": 3.362815884476534, "grad_norm": 0.5479881763458252, "learning_rate": 1.2186516425329874e-05, "loss": 0.5277, "step": 1398 }, { "epoch": 3.365222623345367, "grad_norm": 0.5451713800430298, "learning_rate": 1.2095585986681535e-05, "loss": 0.5614, "step": 1399 }, { "epoch": 3.3676293622142, "grad_norm": 0.5219354033470154, "learning_rate": 1.2004974218934695e-05, "loss": 0.5227, "step": 1400 }, { "epoch": 3.3700361010830324, "grad_norm": 0.5387021899223328, "learning_rate": 1.1914681450574949e-05, "loss": 0.5561, "step": 1401 }, { "epoch": 3.3724428399518653, "grad_norm": 0.5162135362625122, "learning_rate": 1.1824708008931418e-05, "loss": 0.5189, "step": 1402 }, { "epoch": 3.374849578820698, "grad_norm": 0.533231794834137, "learning_rate": 1.1735054220175711e-05, "loss": 0.5031, "step": 1403 }, { "epoch": 3.3772563176895307, "grad_norm": 0.5345563888549805, "learning_rate": 1.1645720409320504e-05, "loss": 0.5435, "step": 1404 }, { "epoch": 3.379663056558363, "grad_norm": 0.5260090231895447, "learning_rate": 1.1556706900218572e-05, "loss": 0.4999, "step": 1405 }, { "epoch": 3.382069795427196, "grad_norm": 0.5422772765159607, "learning_rate": 1.14680140155615e-05, "loss": 0.5107, "step": 1406 }, { "epoch": 3.384476534296029, "grad_norm": 0.5384384393692017, "learning_rate": 1.1379642076878527e-05, "loss": 0.5323, "step": 1407 }, { "epoch": 3.3868832731648615, "grad_norm": 0.5225452780723572, "learning_rate": 1.1291591404535462e-05, "loss": 0.5201, "step": 1408 }, { "epoch": 3.3892900120336944, "grad_norm": 0.5273723602294922, "learning_rate": 1.1203862317733404e-05, "loss": 0.4788, "step": 1409 }, { "epoch": 3.3916967509025273, "grad_norm": 0.5345770716667175, "learning_rate": 1.1116455134507664e-05, "loss": 0.5221, "step": 1410 }, { "epoch": 3.39410348977136, "grad_norm": 0.5359362363815308, "learning_rate": 1.1029370171726571e-05, "loss": 0.5683, "step": 1411 }, { "epoch": 3.3965102286401927, "grad_norm": 0.510586678981781, "learning_rate": 1.094260774509035e-05, "loss": 0.4954, "step": 1412 }, { "epoch": 3.398916967509025, "grad_norm": 0.5380223989486694, "learning_rate": 1.085616816913e-05, "loss": 0.5481, "step": 1413 }, { "epoch": 3.401323706377858, "grad_norm": 0.5332891941070557, "learning_rate": 1.0770051757206079e-05, "loss": 0.5297, "step": 1414 }, { "epoch": 3.4037304452466906, "grad_norm": 0.5353958010673523, "learning_rate": 1.068425882150762e-05, "loss": 0.5199, "step": 1415 }, { "epoch": 3.4061371841155235, "grad_norm": 0.5359684228897095, "learning_rate": 1.0598789673051014e-05, "loss": 0.566, "step": 1416 }, { "epoch": 3.4085439229843564, "grad_norm": 0.5203873515129089, "learning_rate": 1.051364462167881e-05, "loss": 0.5023, "step": 1417 }, { "epoch": 3.410950661853189, "grad_norm": 0.5531004071235657, "learning_rate": 1.042882397605871e-05, "loss": 0.5676, "step": 1418 }, { "epoch": 3.4133574007220218, "grad_norm": 0.5231027603149414, "learning_rate": 1.034432804368235e-05, "loss": 0.5225, "step": 1419 }, { "epoch": 3.4157641395908542, "grad_norm": 0.5327243804931641, "learning_rate": 1.026015713086418e-05, "loss": 0.5423, "step": 1420 }, { "epoch": 3.418170878459687, "grad_norm": 0.5418352484703064, "learning_rate": 1.0176311542740413e-05, "loss": 0.5645, "step": 1421 }, { "epoch": 3.4205776173285196, "grad_norm": 0.5340038537979126, "learning_rate": 1.0092791583267936e-05, "loss": 0.5584, "step": 1422 }, { "epoch": 3.4229843561973525, "grad_norm": 0.5358564257621765, "learning_rate": 1.0009597555223128e-05, "loss": 0.5557, "step": 1423 }, { "epoch": 3.4253910950661854, "grad_norm": 0.540876030921936, "learning_rate": 9.926729760200803e-06, "loss": 0.5687, "step": 1424 }, { "epoch": 3.427797833935018, "grad_norm": 0.5110324621200562, "learning_rate": 9.844188498613116e-06, "loss": 0.4881, "step": 1425 }, { "epoch": 3.430204572803851, "grad_norm": 0.5295665264129639, "learning_rate": 9.761974069688461e-06, "loss": 0.5369, "step": 1426 }, { "epoch": 3.4326113116726837, "grad_norm": 0.5555456876754761, "learning_rate": 9.680086771470475e-06, "loss": 0.5984, "step": 1427 }, { "epoch": 3.435018050541516, "grad_norm": 0.5261369943618774, "learning_rate": 9.598526900816774e-06, "loss": 0.5336, "step": 1428 }, { "epoch": 3.437424789410349, "grad_norm": 0.5204235911369324, "learning_rate": 9.517294753398064e-06, "loss": 0.4853, "step": 1429 }, { "epoch": 3.4398315282791816, "grad_norm": 0.541969895362854, "learning_rate": 9.436390623696911e-06, "loss": 0.5425, "step": 1430 }, { "epoch": 3.4422382671480145, "grad_norm": 0.5288179516792297, "learning_rate": 9.355814805006858e-06, "loss": 0.5539, "step": 1431 }, { "epoch": 3.444645006016847, "grad_norm": 0.5125930905342102, "learning_rate": 9.275567589431178e-06, "loss": 0.493, "step": 1432 }, { "epoch": 3.44705174488568, "grad_norm": 0.5379872918128967, "learning_rate": 9.195649267881911e-06, "loss": 0.5598, "step": 1433 }, { "epoch": 3.449458483754513, "grad_norm": 0.5161752104759216, "learning_rate": 9.116060130078775e-06, "loss": 0.5484, "step": 1434 }, { "epoch": 3.4518652226233453, "grad_norm": 0.5211566090583801, "learning_rate": 9.036800464548157e-06, "loss": 0.5663, "step": 1435 }, { "epoch": 3.454271961492178, "grad_norm": 0.536629855632782, "learning_rate": 8.957870558622061e-06, "loss": 0.5524, "step": 1436 }, { "epoch": 3.4566787003610107, "grad_norm": 0.5558271408081055, "learning_rate": 8.879270698436993e-06, "loss": 0.5651, "step": 1437 }, { "epoch": 3.4590854392298436, "grad_norm": 0.5530038475990295, "learning_rate": 8.80100116893301e-06, "loss": 0.5723, "step": 1438 }, { "epoch": 3.4614921780986765, "grad_norm": 0.5189397931098938, "learning_rate": 8.723062253852654e-06, "loss": 0.5139, "step": 1439 }, { "epoch": 3.463898916967509, "grad_norm": 0.5141570568084717, "learning_rate": 8.645454235739903e-06, "loss": 0.5172, "step": 1440 }, { "epoch": 3.466305655836342, "grad_norm": 0.5209838151931763, "learning_rate": 8.568177395939215e-06, "loss": 0.4993, "step": 1441 }, { "epoch": 3.4687123947051743, "grad_norm": 0.5598371028900146, "learning_rate": 8.491232014594418e-06, "loss": 0.58, "step": 1442 }, { "epoch": 3.4711191335740073, "grad_norm": 0.5433810949325562, "learning_rate": 8.414618370647753e-06, "loss": 0.5276, "step": 1443 }, { "epoch": 3.47352587244284, "grad_norm": 0.5131919980049133, "learning_rate": 8.338336741838838e-06, "loss": 0.5276, "step": 1444 }, { "epoch": 3.4759326113116726, "grad_norm": 0.5175043940544128, "learning_rate": 8.262387404703653e-06, "loss": 0.4951, "step": 1445 }, { "epoch": 3.4783393501805056, "grad_norm": 0.5281484723091125, "learning_rate": 8.186770634573637e-06, "loss": 0.5151, "step": 1446 }, { "epoch": 3.480746089049338, "grad_norm": 0.5298442244529724, "learning_rate": 8.111486705574534e-06, "loss": 0.5294, "step": 1447 }, { "epoch": 3.483152827918171, "grad_norm": 0.5065947771072388, "learning_rate": 8.036535890625463e-06, "loss": 0.4853, "step": 1448 }, { "epoch": 3.4855595667870034, "grad_norm": 0.5149825811386108, "learning_rate": 7.961918461437946e-06, "loss": 0.5466, "step": 1449 }, { "epoch": 3.4879663056558363, "grad_norm": 0.494171679019928, "learning_rate": 7.887634688515e-06, "loss": 0.477, "step": 1450 }, { "epoch": 3.4903730445246692, "grad_norm": 0.5328231453895569, "learning_rate": 7.81368484114996e-06, "loss": 0.5478, "step": 1451 }, { "epoch": 3.4927797833935017, "grad_norm": 0.5374577045440674, "learning_rate": 7.74006918742567e-06, "loss": 0.4917, "step": 1452 }, { "epoch": 3.4951865222623346, "grad_norm": 0.5328057408332825, "learning_rate": 7.666787994213453e-06, "loss": 0.5068, "step": 1453 }, { "epoch": 3.497593261131167, "grad_norm": 0.47660404443740845, "learning_rate": 7.593841527172118e-06, "loss": 0.4804, "step": 1454 }, { "epoch": 3.5, "grad_norm": 0.48948952555656433, "learning_rate": 7.521230050747086e-06, "loss": 0.4736, "step": 1455 }, { "epoch": 3.5024067388688325, "grad_norm": 0.5580392479896545, "learning_rate": 7.448953828169314e-06, "loss": 0.5805, "step": 1456 }, { "epoch": 3.5024067388688325, "eval_loss": 0.5366643667221069, "eval_runtime": 2124.456, "eval_samples_per_second": 1.793, "eval_steps_per_second": 0.897, "step": 1456 }, { "epoch": 3.5048134777376654, "grad_norm": 0.5454499125480652, "learning_rate": 7.377013121454412e-06, "loss": 0.5656, "step": 1457 }, { "epoch": 3.5072202166064983, "grad_norm": 0.5387991666793823, "learning_rate": 7.305408191401697e-06, "loss": 0.5644, "step": 1458 }, { "epoch": 3.5096269554753308, "grad_norm": 0.5216233730316162, "learning_rate": 7.2341392975931785e-06, "loss": 0.5149, "step": 1459 }, { "epoch": 3.5120336943441637, "grad_norm": 0.48791953921318054, "learning_rate": 7.163206698392744e-06, "loss": 0.5212, "step": 1460 }, { "epoch": 3.5144404332129966, "grad_norm": 0.5340071320533752, "learning_rate": 7.092610650945086e-06, "loss": 0.5851, "step": 1461 }, { "epoch": 3.516847172081829, "grad_norm": 0.5237607955932617, "learning_rate": 7.022351411174866e-06, "loss": 0.52, "step": 1462 }, { "epoch": 3.519253910950662, "grad_norm": 0.5251827836036682, "learning_rate": 6.952429233785729e-06, "loss": 0.5539, "step": 1463 }, { "epoch": 3.5216606498194944, "grad_norm": 0.5037477016448975, "learning_rate": 6.8828443722593784e-06, "loss": 0.5047, "step": 1464 }, { "epoch": 3.5240673886883274, "grad_norm": 0.5386367440223694, "learning_rate": 6.813597078854772e-06, "loss": 0.5464, "step": 1465 }, { "epoch": 3.52647412755716, "grad_norm": 0.552398681640625, "learning_rate": 6.744687604607014e-06, "loss": 0.5576, "step": 1466 }, { "epoch": 3.5288808664259927, "grad_norm": 0.5244596600532532, "learning_rate": 6.6761161993265985e-06, "loss": 0.5083, "step": 1467 }, { "epoch": 3.5312876052948257, "grad_norm": 0.5132273435592651, "learning_rate": 6.607883111598445e-06, "loss": 0.5102, "step": 1468 }, { "epoch": 3.533694344163658, "grad_norm": 0.5266515612602234, "learning_rate": 6.539988588780988e-06, "loss": 0.4794, "step": 1469 }, { "epoch": 3.536101083032491, "grad_norm": 0.5166556239128113, "learning_rate": 6.472432877005341e-06, "loss": 0.5627, "step": 1470 }, { "epoch": 3.538507821901324, "grad_norm": 0.5013148784637451, "learning_rate": 6.405216221174326e-06, "loss": 0.4836, "step": 1471 }, { "epoch": 3.5409145607701564, "grad_norm": 0.585021436214447, "learning_rate": 6.338338864961612e-06, "loss": 0.5436, "step": 1472 }, { "epoch": 3.543321299638989, "grad_norm": 0.5281007885932922, "learning_rate": 6.2718010508108545e-06, "loss": 0.5468, "step": 1473 }, { "epoch": 3.545728038507822, "grad_norm": 0.5172537565231323, "learning_rate": 6.205603019934791e-06, "loss": 0.4843, "step": 1474 }, { "epoch": 3.5481347773766547, "grad_norm": 0.49250200390815735, "learning_rate": 6.139745012314424e-06, "loss": 0.4913, "step": 1475 }, { "epoch": 3.550541516245487, "grad_norm": 0.5450341701507568, "learning_rate": 6.0742272666980625e-06, "loss": 0.5986, "step": 1476 }, { "epoch": 3.55294825511432, "grad_norm": 0.5360328555107117, "learning_rate": 6.009050020600459e-06, "loss": 0.5576, "step": 1477 }, { "epoch": 3.555354993983153, "grad_norm": 0.5166814923286438, "learning_rate": 5.944213510302054e-06, "loss": 0.5203, "step": 1478 }, { "epoch": 3.5577617328519855, "grad_norm": 0.5611687898635864, "learning_rate": 5.879717970848053e-06, "loss": 0.5623, "step": 1479 }, { "epoch": 3.5601684717208184, "grad_norm": 0.5130377411842346, "learning_rate": 5.8155636360475385e-06, "loss": 0.5292, "step": 1480 }, { "epoch": 3.562575210589651, "grad_norm": 0.5012634992599487, "learning_rate": 5.751750738472672e-06, "loss": 0.5145, "step": 1481 }, { "epoch": 3.564981949458484, "grad_norm": 0.5497102737426758, "learning_rate": 5.688279509457828e-06, "loss": 0.5521, "step": 1482 }, { "epoch": 3.5673886883273163, "grad_norm": 0.5982948541641235, "learning_rate": 5.625150179098804e-06, "loss": 0.5226, "step": 1483 }, { "epoch": 3.569795427196149, "grad_norm": 0.5084141492843628, "learning_rate": 5.562362976251901e-06, "loss": 0.502, "step": 1484 }, { "epoch": 3.572202166064982, "grad_norm": 0.5133024454116821, "learning_rate": 5.499918128533155e-06, "loss": 0.5613, "step": 1485 }, { "epoch": 3.5746089049338146, "grad_norm": 0.5170729160308838, "learning_rate": 5.437815862317519e-06, "loss": 0.542, "step": 1486 }, { "epoch": 3.5770156438026475, "grad_norm": 0.5304492712020874, "learning_rate": 5.3760564027379615e-06, "loss": 0.503, "step": 1487 }, { "epoch": 3.5794223826714804, "grad_norm": 0.5612712502479553, "learning_rate": 5.314639973684787e-06, "loss": 0.539, "step": 1488 }, { "epoch": 3.581829121540313, "grad_norm": 0.5053697228431702, "learning_rate": 5.25356679780471e-06, "loss": 0.4805, "step": 1489 }, { "epoch": 3.5842358604091458, "grad_norm": 0.5186309814453125, "learning_rate": 5.192837096500058e-06, "loss": 0.5197, "step": 1490 }, { "epoch": 3.5866425992779782, "grad_norm": 0.5357400178909302, "learning_rate": 5.132451089928026e-06, "loss": 0.5941, "step": 1491 }, { "epoch": 3.589049338146811, "grad_norm": 0.5067396759986877, "learning_rate": 5.072408996999844e-06, "loss": 0.5246, "step": 1492 }, { "epoch": 3.5914560770156436, "grad_norm": 0.5239845514297485, "learning_rate": 5.0127110353799915e-06, "loss": 0.5087, "step": 1493 }, { "epoch": 3.5938628158844765, "grad_norm": 0.5163912177085876, "learning_rate": 4.953357421485394e-06, "loss": 0.5323, "step": 1494 }, { "epoch": 3.5962695547533094, "grad_norm": 0.515769362449646, "learning_rate": 4.8943483704846475e-06, "loss": 0.5198, "step": 1495 }, { "epoch": 3.598676293622142, "grad_norm": 0.5073641538619995, "learning_rate": 4.835684096297244e-06, "loss": 0.4967, "step": 1496 }, { "epoch": 3.601083032490975, "grad_norm": 0.5255662202835083, "learning_rate": 4.777364811592766e-06, "loss": 0.4975, "step": 1497 }, { "epoch": 3.6034897713598073, "grad_norm": 0.5564093589782715, "learning_rate": 4.719390727790218e-06, "loss": 0.586, "step": 1498 }, { "epoch": 3.60589651022864, "grad_norm": 0.5399735569953918, "learning_rate": 4.661762055057084e-06, "loss": 0.5465, "step": 1499 }, { "epoch": 3.6083032490974727, "grad_norm": 0.5164072513580322, "learning_rate": 4.604479002308737e-06, "loss": 0.5599, "step": 1500 }, { "epoch": 3.6107099879663056, "grad_norm": 0.5384510159492493, "learning_rate": 4.547541777207565e-06, "loss": 0.5556, "step": 1501 }, { "epoch": 3.6131167268351385, "grad_norm": 0.5303323864936829, "learning_rate": 4.490950586162279e-06, "loss": 0.5587, "step": 1502 }, { "epoch": 3.615523465703971, "grad_norm": 0.5274252891540527, "learning_rate": 4.434705634327163e-06, "loss": 0.5372, "step": 1503 }, { "epoch": 3.617930204572804, "grad_norm": 0.5262970328330994, "learning_rate": 4.378807125601303e-06, "loss": 0.5447, "step": 1504 }, { "epoch": 3.620336943441637, "grad_norm": 0.5311320424079895, "learning_rate": 4.323255262627846e-06, "loss": 0.5534, "step": 1505 }, { "epoch": 3.6227436823104693, "grad_norm": 0.5176156759262085, "learning_rate": 4.268050246793276e-06, "loss": 0.5104, "step": 1506 }, { "epoch": 3.625150421179302, "grad_norm": 0.5465610027313232, "learning_rate": 4.2131922782267405e-06, "loss": 0.5463, "step": 1507 }, { "epoch": 3.6275571600481347, "grad_norm": 0.5134376287460327, "learning_rate": 4.158681555799204e-06, "loss": 0.4829, "step": 1508 }, { "epoch": 3.6299638989169676, "grad_norm": 0.49965766072273254, "learning_rate": 4.104518277122848e-06, "loss": 0.5089, "step": 1509 }, { "epoch": 3.6323706377858, "grad_norm": 0.5277451872825623, "learning_rate": 4.050702638550275e-06, "loss": 0.5594, "step": 1510 }, { "epoch": 3.634777376654633, "grad_norm": 0.5222126841545105, "learning_rate": 3.997234835173802e-06, "loss": 0.5312, "step": 1511 }, { "epoch": 3.637184115523466, "grad_norm": 0.5584061145782471, "learning_rate": 3.944115060824826e-06, "loss": 0.5922, "step": 1512 }, { "epoch": 3.6395908543922983, "grad_norm": 0.5429688692092896, "learning_rate": 3.891343508073053e-06, "loss": 0.5937, "step": 1513 }, { "epoch": 3.6419975932611313, "grad_norm": 0.4977768361568451, "learning_rate": 3.838920368225784e-06, "loss": 0.4874, "step": 1514 }, { "epoch": 3.644404332129964, "grad_norm": 0.549617350101471, "learning_rate": 3.7868458313272904e-06, "loss": 0.5738, "step": 1515 }, { "epoch": 3.6468110709987966, "grad_norm": 0.5078455805778503, "learning_rate": 3.7351200861580617e-06, "loss": 0.5282, "step": 1516 }, { "epoch": 3.649217809867629, "grad_norm": 0.5522779822349548, "learning_rate": 3.68374332023419e-06, "loss": 0.5508, "step": 1517 }, { "epoch": 3.651624548736462, "grad_norm": 0.5348331332206726, "learning_rate": 3.632715719806601e-06, "loss": 0.5553, "step": 1518 }, { "epoch": 3.654031287605295, "grad_norm": 0.5424596071243286, "learning_rate": 3.5820374698604555e-06, "loss": 0.5135, "step": 1519 }, { "epoch": 3.6564380264741274, "grad_norm": 0.534249484539032, "learning_rate": 3.5317087541144377e-06, "loss": 0.5258, "step": 1520 }, { "epoch": 3.6588447653429603, "grad_norm": 0.4976341724395752, "learning_rate": 3.4817297550200913e-06, "loss": 0.479, "step": 1521 }, { "epoch": 3.6612515042117932, "grad_norm": 0.5301332473754883, "learning_rate": 3.4321006537612165e-06, "loss": 0.5362, "step": 1522 }, { "epoch": 3.6636582430806257, "grad_norm": 0.5435537695884705, "learning_rate": 3.3828216302531078e-06, "loss": 0.5445, "step": 1523 }, { "epoch": 3.6660649819494586, "grad_norm": 0.5274588465690613, "learning_rate": 3.333892863141974e-06, "loss": 0.5553, "step": 1524 }, { "epoch": 3.668471720818291, "grad_norm": 0.5328760743141174, "learning_rate": 3.2853145298042953e-06, "loss": 0.5263, "step": 1525 }, { "epoch": 3.670878459687124, "grad_norm": 0.5072853565216064, "learning_rate": 3.2370868063461236e-06, "loss": 0.5211, "step": 1526 }, { "epoch": 3.6732851985559565, "grad_norm": 0.5845298171043396, "learning_rate": 3.1892098676025274e-06, "loss": 0.5199, "step": 1527 }, { "epoch": 3.6756919374247894, "grad_norm": 0.5024800300598145, "learning_rate": 3.1416838871368924e-06, "loss": 0.5129, "step": 1528 }, { "epoch": 3.6780986762936223, "grad_norm": 0.5574220418930054, "learning_rate": 3.0945090372402785e-06, "loss": 0.5055, "step": 1529 }, { "epoch": 3.6805054151624548, "grad_norm": 0.5461306571960449, "learning_rate": 3.047685488930874e-06, "loss": 0.5292, "step": 1530 }, { "epoch": 3.6829121540312877, "grad_norm": 0.5502173900604248, "learning_rate": 3.0012134119532964e-06, "loss": 0.5532, "step": 1531 }, { "epoch": 3.6853188929001206, "grad_norm": 0.5376430749893188, "learning_rate": 2.955092974778051e-06, "loss": 0.5092, "step": 1532 }, { "epoch": 3.687725631768953, "grad_norm": 0.5565453767776489, "learning_rate": 2.9093243446008166e-06, "loss": 0.5555, "step": 1533 }, { "epoch": 3.6901323706377855, "grad_norm": 0.5353987216949463, "learning_rate": 2.863907687341949e-06, "loss": 0.5159, "step": 1534 }, { "epoch": 3.6925391095066185, "grad_norm": 0.5596614480018616, "learning_rate": 2.818843167645835e-06, "loss": 0.5959, "step": 1535 }, { "epoch": 3.6949458483754514, "grad_norm": 0.5104334354400635, "learning_rate": 2.7741309488802712e-06, "loss": 0.5229, "step": 1536 }, { "epoch": 3.697352587244284, "grad_norm": 0.53873610496521, "learning_rate": 2.7297711931358993e-06, "loss": 0.5576, "step": 1537 }, { "epoch": 3.6997593261131168, "grad_norm": 0.5460606217384338, "learning_rate": 2.685764061225615e-06, "loss": 0.5197, "step": 1538 }, { "epoch": 3.7021660649819497, "grad_norm": 0.5604812502861023, "learning_rate": 2.6421097126839712e-06, "loss": 0.5764, "step": 1539 }, { "epoch": 3.704572803850782, "grad_norm": 0.515990138053894, "learning_rate": 2.5988083057666533e-06, "loss": 0.5386, "step": 1540 }, { "epoch": 3.706979542719615, "grad_norm": 0.5101820826530457, "learning_rate": 2.5558599974498054e-06, "loss": 0.4925, "step": 1541 }, { "epoch": 3.7093862815884475, "grad_norm": 0.5436517596244812, "learning_rate": 2.5132649434295606e-06, "loss": 0.5477, "step": 1542 }, { "epoch": 3.7117930204572804, "grad_norm": 0.5322871804237366, "learning_rate": 2.471023298121422e-06, "loss": 0.57, "step": 1543 }, { "epoch": 3.714199759326113, "grad_norm": 0.5071890354156494, "learning_rate": 2.4291352146596945e-06, "loss": 0.504, "step": 1544 }, { "epoch": 3.716606498194946, "grad_norm": 0.5320134162902832, "learning_rate": 2.3876008448969976e-06, "loss": 0.502, "step": 1545 }, { "epoch": 3.7190132370637787, "grad_norm": 0.5402212142944336, "learning_rate": 2.3464203394036322e-06, "loss": 0.5349, "step": 1546 }, { "epoch": 3.721419975932611, "grad_norm": 0.5399084687232971, "learning_rate": 2.3055938474670915e-06, "loss": 0.5043, "step": 1547 }, { "epoch": 3.723826714801444, "grad_norm": 0.528980016708374, "learning_rate": 2.265121517091473e-06, "loss": 0.5746, "step": 1548 }, { "epoch": 3.726233453670277, "grad_norm": 0.5291767120361328, "learning_rate": 2.2250034949969913e-06, "loss": 0.5246, "step": 1549 }, { "epoch": 3.7286401925391095, "grad_norm": 0.518034040927887, "learning_rate": 2.1852399266194314e-06, "loss": 0.5462, "step": 1550 }, { "epoch": 3.731046931407942, "grad_norm": 0.5207061767578125, "learning_rate": 2.145830956109596e-06, "loss": 0.533, "step": 1551 }, { "epoch": 3.733453670276775, "grad_norm": 0.5242441892623901, "learning_rate": 2.1067767263327933e-06, "loss": 0.566, "step": 1552 }, { "epoch": 3.735860409145608, "grad_norm": 0.5360183715820312, "learning_rate": 2.0680773788683494e-06, "loss": 0.5408, "step": 1553 }, { "epoch": 3.7382671480144403, "grad_norm": 0.517473042011261, "learning_rate": 2.029733054009042e-06, "loss": 0.5123, "step": 1554 }, { "epoch": 3.740673886883273, "grad_norm": 0.518516480922699, "learning_rate": 1.9917438907606556e-06, "loss": 0.5193, "step": 1555 }, { "epoch": 3.743080625752106, "grad_norm": 0.5022774934768677, "learning_rate": 1.954110026841427e-06, "loss": 0.5142, "step": 1556 }, { "epoch": 3.7454873646209386, "grad_norm": 0.5154804587364197, "learning_rate": 1.9168315986815567e-06, "loss": 0.5146, "step": 1557 }, { "epoch": 3.7478941034897715, "grad_norm": 0.5571584105491638, "learning_rate": 1.87990874142272e-06, "loss": 0.5989, "step": 1558 }, { "epoch": 3.750300842358604, "grad_norm": 0.5272416472434998, "learning_rate": 1.8433415889175799e-06, "loss": 0.5519, "step": 1559 }, { "epoch": 3.752707581227437, "grad_norm": 0.5540259480476379, "learning_rate": 1.8071302737293295e-06, "loss": 0.5257, "step": 1560 }, { "epoch": 3.752707581227437, "eval_loss": 0.5370286703109741, "eval_runtime": 2128.1893, "eval_samples_per_second": 1.79, "eval_steps_per_second": 0.895, "step": 1560 }, { "epoch": 3.7551143200962693, "grad_norm": 0.4957631826400757, "learning_rate": 1.771274927131139e-06, "loss": 0.4836, "step": 1561 }, { "epoch": 3.7575210589651022, "grad_norm": 0.526425302028656, "learning_rate": 1.7357756791057334e-06, "loss": 0.5241, "step": 1562 }, { "epoch": 3.759927797833935, "grad_norm": 0.5558169484138489, "learning_rate": 1.7006326583449029e-06, "loss": 0.575, "step": 1563 }, { "epoch": 3.7623345367027676, "grad_norm": 0.537803053855896, "learning_rate": 1.665845992249071e-06, "loss": 0.5504, "step": 1564 }, { "epoch": 3.7647412755716005, "grad_norm": 0.5539751648902893, "learning_rate": 1.6314158069267948e-06, "loss": 0.5827, "step": 1565 }, { "epoch": 3.7671480144404335, "grad_norm": 0.5415998697280884, "learning_rate": 1.5973422271942985e-06, "loss": 0.5596, "step": 1566 }, { "epoch": 3.769554753309266, "grad_norm": 0.5143044590950012, "learning_rate": 1.5636253765750508e-06, "loss": 0.484, "step": 1567 }, { "epoch": 3.771961492178099, "grad_norm": 0.5324181914329529, "learning_rate": 1.5302653772993225e-06, "loss": 0.5325, "step": 1568 }, { "epoch": 3.7743682310469313, "grad_norm": 0.5281087756156921, "learning_rate": 1.4972623503036965e-06, "loss": 0.5446, "step": 1569 }, { "epoch": 3.7767749699157642, "grad_norm": 0.536406934261322, "learning_rate": 1.4646164152307018e-06, "loss": 0.5705, "step": 1570 }, { "epoch": 3.7791817087845967, "grad_norm": 0.50053870677948, "learning_rate": 1.4323276904283034e-06, "loss": 0.5451, "step": 1571 }, { "epoch": 3.7815884476534296, "grad_norm": 0.4841516613960266, "learning_rate": 1.400396292949513e-06, "loss": 0.4777, "step": 1572 }, { "epoch": 3.7839951865222625, "grad_norm": 0.5405700206756592, "learning_rate": 1.3688223385519672e-06, "loss": 0.5363, "step": 1573 }, { "epoch": 3.786401925391095, "grad_norm": 0.5379523038864136, "learning_rate": 1.3376059416975172e-06, "loss": 0.5865, "step": 1574 }, { "epoch": 3.788808664259928, "grad_norm": 0.5427036285400391, "learning_rate": 1.3067472155517735e-06, "loss": 0.5745, "step": 1575 }, { "epoch": 3.7912154031287604, "grad_norm": 0.4855702519416809, "learning_rate": 1.2762462719837275e-06, "loss": 0.4946, "step": 1576 }, { "epoch": 3.7936221419975933, "grad_norm": 0.4786948263645172, "learning_rate": 1.2461032215653311e-06, "loss": 0.4578, "step": 1577 }, { "epoch": 3.7960288808664258, "grad_norm": 0.5227766036987305, "learning_rate": 1.2163181735711072e-06, "loss": 0.5066, "step": 1578 }, { "epoch": 3.7984356197352587, "grad_norm": 0.5232408046722412, "learning_rate": 1.1868912359777607e-06, "loss": 0.5145, "step": 1579 }, { "epoch": 3.8008423586040916, "grad_norm": 0.5634422898292542, "learning_rate": 1.157822515463758e-06, "loss": 0.5347, "step": 1580 }, { "epoch": 3.803249097472924, "grad_norm": 0.48469093441963196, "learning_rate": 1.1291121174089703e-06, "loss": 0.4665, "step": 1581 }, { "epoch": 3.805655836341757, "grad_norm": 0.5792638063430786, "learning_rate": 1.1007601458942752e-06, "loss": 0.5455, "step": 1582 }, { "epoch": 3.80806257521059, "grad_norm": 0.5238813757896423, "learning_rate": 1.0727667037011668e-06, "loss": 0.5396, "step": 1583 }, { "epoch": 3.8104693140794224, "grad_norm": 0.4941558539867401, "learning_rate": 1.045131892311435e-06, "loss": 0.5125, "step": 1584 }, { "epoch": 3.8128760529482553, "grad_norm": 0.5210914611816406, "learning_rate": 1.0178558119067315e-06, "loss": 0.5071, "step": 1585 }, { "epoch": 3.8152827918170877, "grad_norm": 0.6175683736801147, "learning_rate": 9.909385613682375e-07, "loss": 0.6136, "step": 1586 }, { "epoch": 3.8176895306859207, "grad_norm": 0.5337561368942261, "learning_rate": 9.6438023827633e-07, "loss": 0.5139, "step": 1587 }, { "epoch": 3.820096269554753, "grad_norm": 0.5303792357444763, "learning_rate": 9.381809389101825e-07, "loss": 0.5497, "step": 1588 }, { "epoch": 3.822503008423586, "grad_norm": 0.5012595653533936, "learning_rate": 9.123407582474541e-07, "loss": 0.4713, "step": 1589 }, { "epoch": 3.824909747292419, "grad_norm": 0.47677862644195557, "learning_rate": 8.868597899638898e-07, "loss": 0.4671, "step": 1590 }, { "epoch": 3.8273164861612514, "grad_norm": 0.5122169256210327, "learning_rate": 8.617381264330426e-07, "loss": 0.5145, "step": 1591 }, { "epoch": 3.8297232250300843, "grad_norm": 0.5102027058601379, "learning_rate": 8.369758587259413e-07, "loss": 0.4856, "step": 1592 }, { "epoch": 3.832129963898917, "grad_norm": 0.5359063148498535, "learning_rate": 8.125730766107009e-07, "loss": 0.5264, "step": 1593 }, { "epoch": 3.8345367027677497, "grad_norm": 2.2371790409088135, "learning_rate": 7.885298685522235e-07, "loss": 0.5475, "step": 1594 }, { "epoch": 3.836943441636582, "grad_norm": 0.5451308488845825, "learning_rate": 7.648463217118984e-07, "loss": 0.5779, "step": 1595 }, { "epoch": 3.839350180505415, "grad_norm": 0.5331814289093018, "learning_rate": 7.415225219472799e-07, "loss": 0.523, "step": 1596 }, { "epoch": 3.841756919374248, "grad_norm": 0.5021839141845703, "learning_rate": 7.185585538117657e-07, "loss": 0.5119, "step": 1597 }, { "epoch": 3.8441636582430805, "grad_norm": 0.5290696620941162, "learning_rate": 6.959545005542744e-07, "loss": 0.5562, "step": 1598 }, { "epoch": 3.8465703971119134, "grad_norm": 0.5380403399467468, "learning_rate": 6.737104441189801e-07, "loss": 0.5391, "step": 1599 }, { "epoch": 3.8489771359807463, "grad_norm": 0.5525466799736023, "learning_rate": 6.518264651449779e-07, "loss": 0.5477, "step": 1600 }, { "epoch": 3.851383874849579, "grad_norm": 0.5132647752761841, "learning_rate": 6.303026429660408e-07, "loss": 0.5191, "step": 1601 }, { "epoch": 3.8537906137184117, "grad_norm": 0.5168213844299316, "learning_rate": 6.09139055610275e-07, "loss": 0.4596, "step": 1602 }, { "epoch": 3.856197352587244, "grad_norm": 0.5156052112579346, "learning_rate": 5.883357797998757e-07, "loss": 0.5091, "step": 1603 }, { "epoch": 3.858604091456077, "grad_norm": 0.5132429599761963, "learning_rate": 5.678928909508275e-07, "loss": 0.5105, "step": 1604 }, { "epoch": 3.8610108303249095, "grad_norm": 0.5520070195198059, "learning_rate": 5.478104631726711e-07, "loss": 0.6083, "step": 1605 }, { "epoch": 3.8634175691937425, "grad_norm": 0.5423392057418823, "learning_rate": 5.280885692681592e-07, "loss": 0.5574, "step": 1606 }, { "epoch": 3.8658243080625754, "grad_norm": 0.5047164559364319, "learning_rate": 5.087272807330679e-07, "loss": 0.4764, "step": 1607 }, { "epoch": 3.868231046931408, "grad_norm": 0.5135738849639893, "learning_rate": 4.897266677559187e-07, "loss": 0.5396, "step": 1608 }, { "epoch": 3.8706377858002408, "grad_norm": 0.5215672254562378, "learning_rate": 4.710867992176682e-07, "loss": 0.5231, "step": 1609 }, { "epoch": 3.8730445246690737, "grad_norm": 0.49933186173439026, "learning_rate": 4.5280774269154115e-07, "loss": 0.4843, "step": 1610 }, { "epoch": 3.875451263537906, "grad_norm": 0.5038561224937439, "learning_rate": 4.348895644427309e-07, "loss": 0.5222, "step": 1611 }, { "epoch": 3.8778580024067386, "grad_norm": 0.5360286831855774, "learning_rate": 4.173323294281994e-07, "loss": 0.5187, "step": 1612 }, { "epoch": 3.8802647412755715, "grad_norm": 0.5300511121749878, "learning_rate": 4.001361012963778e-07, "loss": 0.5072, "step": 1613 }, { "epoch": 3.8826714801444044, "grad_norm": 0.5378491878509521, "learning_rate": 3.833009423869993e-07, "loss": 0.5525, "step": 1614 }, { "epoch": 3.885078219013237, "grad_norm": 0.5634551644325256, "learning_rate": 3.6682691373086665e-07, "loss": 0.5978, "step": 1615 }, { "epoch": 3.88748495788207, "grad_norm": 0.5250343084335327, "learning_rate": 3.50714075049563e-07, "loss": 0.5003, "step": 1616 }, { "epoch": 3.8898916967509027, "grad_norm": 0.49958157539367676, "learning_rate": 3.349624847553412e-07, "loss": 0.4911, "step": 1617 }, { "epoch": 3.892298435619735, "grad_norm": 0.5446503758430481, "learning_rate": 3.195721999508461e-07, "loss": 0.5702, "step": 1618 }, { "epoch": 3.894705174488568, "grad_norm": 0.5573654770851135, "learning_rate": 3.045432764288703e-07, "loss": 0.5448, "step": 1619 }, { "epoch": 3.8971119133574006, "grad_norm": 0.5020934343338013, "learning_rate": 2.898757686722542e-07, "loss": 0.4744, "step": 1620 }, { "epoch": 3.8995186522262335, "grad_norm": 0.5629104971885681, "learning_rate": 2.7556972985363085e-07, "loss": 0.5579, "step": 1621 }, { "epoch": 3.901925391095066, "grad_norm": 0.544587254524231, "learning_rate": 2.616252118352036e-07, "loss": 0.5475, "step": 1622 }, { "epoch": 3.904332129963899, "grad_norm": 0.5227930545806885, "learning_rate": 2.480422651686132e-07, "loss": 0.5058, "step": 1623 }, { "epoch": 3.906738868832732, "grad_norm": 0.5269834399223328, "learning_rate": 2.3482093909473756e-07, "loss": 0.5095, "step": 1624 }, { "epoch": 3.9091456077015643, "grad_norm": 0.524409830570221, "learning_rate": 2.219612815434924e-07, "loss": 0.4888, "step": 1625 }, { "epoch": 3.911552346570397, "grad_norm": 0.5220035910606384, "learning_rate": 2.0946333913368643e-07, "loss": 0.502, "step": 1626 }, { "epoch": 3.91395908543923, "grad_norm": 0.5198480486869812, "learning_rate": 1.973271571728441e-07, "loss": 0.5043, "step": 1627 }, { "epoch": 3.9163658243080626, "grad_norm": 0.5344683527946472, "learning_rate": 1.8555277965701668e-07, "loss": 0.5406, "step": 1628 }, { "epoch": 3.918772563176895, "grad_norm": 0.5334414839744568, "learning_rate": 1.7414024927064897e-07, "loss": 0.5675, "step": 1629 }, { "epoch": 3.921179302045728, "grad_norm": 0.5547230243682861, "learning_rate": 1.630896073864352e-07, "loss": 0.5693, "step": 1630 }, { "epoch": 3.923586040914561, "grad_norm": 0.49964791536331177, "learning_rate": 1.5240089406513003e-07, "loss": 0.5039, "step": 1631 }, { "epoch": 3.9259927797833933, "grad_norm": 0.508751392364502, "learning_rate": 1.4207414805543774e-07, "loss": 0.4667, "step": 1632 }, { "epoch": 3.9283995186522263, "grad_norm": 0.5567895770072937, "learning_rate": 1.3210940679385664e-07, "loss": 0.6011, "step": 1633 }, { "epoch": 3.930806257521059, "grad_norm": 0.520437479019165, "learning_rate": 1.2250670640454597e-07, "loss": 0.5458, "step": 1634 }, { "epoch": 3.9332129963898916, "grad_norm": 0.5191381573677063, "learning_rate": 1.1326608169920372e-07, "loss": 0.5205, "step": 1635 }, { "epoch": 3.9356197352587245, "grad_norm": 0.5143102407455444, "learning_rate": 1.0438756617691115e-07, "loss": 0.5361, "step": 1636 }, { "epoch": 3.938026474127557, "grad_norm": 0.5390070080757141, "learning_rate": 9.587119202405515e-08, "loss": 0.5317, "step": 1637 }, { "epoch": 3.94043321299639, "grad_norm": 0.5079724788665771, "learning_rate": 8.771699011416168e-08, "loss": 0.5282, "step": 1638 }, { "epoch": 3.9428399518652224, "grad_norm": 0.5208767056465149, "learning_rate": 7.992499000785136e-08, "loss": 0.514, "step": 1639 }, { "epoch": 3.9452466907340553, "grad_norm": 0.5088074207305908, "learning_rate": 7.249521995263964e-08, "loss": 0.4852, "step": 1640 }, { "epoch": 3.9476534296028882, "grad_norm": 0.541882336139679, "learning_rate": 6.542770688293676e-08, "loss": 0.5704, "step": 1641 }, { "epoch": 3.9500601684717207, "grad_norm": 0.5161964893341064, "learning_rate": 5.872247641987016e-08, "loss": 0.5003, "step": 1642 }, { "epoch": 3.9524669073405536, "grad_norm": 0.5363638401031494, "learning_rate": 5.2379552871217875e-08, "loss": 0.5609, "step": 1643 }, { "epoch": 3.9548736462093865, "grad_norm": 0.5613672137260437, "learning_rate": 4.639895923134185e-08, "loss": 0.5576, "step": 1644 }, { "epoch": 3.957280385078219, "grad_norm": 0.5385997891426086, "learning_rate": 4.078071718107701e-08, "loss": 0.5501, "step": 1645 }, { "epoch": 3.9596871239470515, "grad_norm": 0.6680697798728943, "learning_rate": 3.552484708766457e-08, "loss": 0.5343, "step": 1646 }, { "epoch": 3.9620938628158844, "grad_norm": 0.5627241730690002, "learning_rate": 3.0631368004663263e-08, "loss": 0.5917, "step": 1647 }, { "epoch": 3.9645006016847173, "grad_norm": 0.5101611614227295, "learning_rate": 2.610029767191602e-08, "loss": 0.5131, "step": 1648 }, { "epoch": 3.9669073405535498, "grad_norm": 0.5435637831687927, "learning_rate": 2.193165251545004e-08, "loss": 0.5438, "step": 1649 }, { "epoch": 3.9693140794223827, "grad_norm": 0.544251024723053, "learning_rate": 1.81254476474213e-08, "loss": 0.5313, "step": 1650 }, { "epoch": 3.9717208182912156, "grad_norm": 0.5343690514564514, "learning_rate": 1.4681696866081229e-08, "loss": 0.5335, "step": 1651 }, { "epoch": 3.974127557160048, "grad_norm": 0.5317139029502869, "learning_rate": 1.1600412655710102e-08, "loss": 0.5103, "step": 1652 }, { "epoch": 3.976534296028881, "grad_norm": 0.5443331599235535, "learning_rate": 8.881606186561531e-09, "loss": 0.5788, "step": 1653 }, { "epoch": 3.9789410348977134, "grad_norm": 0.5473819375038147, "learning_rate": 6.525287314851358e-09, "loss": 0.5982, "step": 1654 }, { "epoch": 3.9813477737665464, "grad_norm": 0.5293187499046326, "learning_rate": 4.531464582713252e-09, "loss": 0.5848, "step": 1655 }, { "epoch": 3.983754512635379, "grad_norm": 0.5230333805084229, "learning_rate": 2.900145218143191e-09, "loss": 0.5597, "step": 1656 }, { "epoch": 3.9861612515042117, "grad_norm": 0.5230133533477783, "learning_rate": 1.6313351349883655e-09, "loss": 0.5277, "step": 1657 }, { "epoch": 3.9885679903730447, "grad_norm": 0.5411299467086792, "learning_rate": 7.250389329471751e-10, "loss": 0.5187, "step": 1658 }, { "epoch": 3.990974729241877, "grad_norm": 0.5238697528839111, "learning_rate": 1.812598975137192e-10, "loss": 0.5229, "step": 1659 }, { "epoch": 3.99338146811071, "grad_norm": 0.5127192735671997, "learning_rate": 0.0, "loss": 0.5438, "step": 1660 } ], "logging_steps": 1, "max_steps": 1660, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 415, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.648208421253284e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }