{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 3148, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012706480304955528, "grad_norm": 18.464955964990292, "learning_rate": 5.000000000000001e-07, "loss": 2.786, "step": 1 }, { "epoch": 0.0025412960609911056, "grad_norm": 22.399585167061772, "learning_rate": 1.0000000000000002e-06, "loss": 2.756, "step": 2 }, { "epoch": 0.0038119440914866584, "grad_norm": 30.235203421905396, "learning_rate": 1.5e-06, "loss": 2.8511, "step": 3 }, { "epoch": 0.005082592121982211, "grad_norm": 22.642837103761043, "learning_rate": 2.0000000000000003e-06, "loss": 2.7227, "step": 4 }, { "epoch": 0.0063532401524777635, "grad_norm": 20.47380205039988, "learning_rate": 2.5e-06, "loss": 2.7037, "step": 5 }, { "epoch": 0.007623888182973317, "grad_norm": 16.488755316837164, "learning_rate": 3e-06, "loss": 2.9394, "step": 6 }, { "epoch": 0.008894536213468869, "grad_norm": 14.260362898657814, "learning_rate": 3.5e-06, "loss": 2.6189, "step": 7 }, { "epoch": 0.010165184243964422, "grad_norm": 12.280586408164009, "learning_rate": 4.000000000000001e-06, "loss": 2.723, "step": 8 }, { "epoch": 0.011435832274459974, "grad_norm": 9.22070378756126, "learning_rate": 4.5e-06, "loss": 2.3702, "step": 9 }, { "epoch": 0.012706480304955527, "grad_norm": 3.4759485405997577, "learning_rate": 5e-06, "loss": 2.4602, "step": 10 }, { "epoch": 0.01397712833545108, "grad_norm": 6.467286745342885, "learning_rate": 5.500000000000001e-06, "loss": 2.5916, "step": 11 }, { "epoch": 0.015247776365946633, "grad_norm": 4.256176100814695, "learning_rate": 6e-06, "loss": 2.5279, "step": 12 }, { "epoch": 0.016518424396442185, "grad_norm": 4.571095329033233, "learning_rate": 6.5000000000000004e-06, "loss": 2.6184, "step": 13 }, { "epoch": 0.017789072426937738, "grad_norm": 3.1593416383056456, "learning_rate": 7e-06, "loss": 2.3682, "step": 14 }, { "epoch": 0.01905972045743329, "grad_norm": 3.690992422084765, "learning_rate": 7.500000000000001e-06, "loss": 2.6305, "step": 15 }, { "epoch": 0.020330368487928845, "grad_norm": 4.793845160014369, "learning_rate": 8.000000000000001e-06, "loss": 2.7471, "step": 16 }, { "epoch": 0.021601016518424398, "grad_norm": 3.190746262709345, "learning_rate": 8.5e-06, "loss": 2.5544, "step": 17 }, { "epoch": 0.022871664548919948, "grad_norm": 1.8680728770342985, "learning_rate": 9e-06, "loss": 2.3858, "step": 18 }, { "epoch": 0.0241423125794155, "grad_norm": 2.1468024025661063, "learning_rate": 9.5e-06, "loss": 2.2027, "step": 19 }, { "epoch": 0.025412960609911054, "grad_norm": 2.7989100165411993, "learning_rate": 1e-05, "loss": 2.4284, "step": 20 }, { "epoch": 0.026683608640406607, "grad_norm": 2.187238787656559, "learning_rate": 1.0500000000000001e-05, "loss": 2.4529, "step": 21 }, { "epoch": 0.02795425667090216, "grad_norm": 1.8943347226376168, "learning_rate": 1.1000000000000001e-05, "loss": 2.3487, "step": 22 }, { "epoch": 0.029224904701397714, "grad_norm": 2.318723461163645, "learning_rate": 1.15e-05, "loss": 2.3675, "step": 23 }, { "epoch": 0.030495552731893267, "grad_norm": 2.926184983142819, "learning_rate": 1.2e-05, "loss": 2.5222, "step": 24 }, { "epoch": 0.03176620076238882, "grad_norm": 1.7831526528754407, "learning_rate": 1.25e-05, "loss": 2.2888, "step": 25 }, { "epoch": 0.03303684879288437, "grad_norm": 1.7233279523412535, "learning_rate": 1.3000000000000001e-05, "loss": 2.4598, "step": 26 }, { "epoch": 0.03430749682337993, "grad_norm": 3.3608919691001637, "learning_rate": 1.3500000000000001e-05, "loss": 2.5002, "step": 27 }, { "epoch": 0.035578144853875476, "grad_norm": 2.1066564841053332, "learning_rate": 1.4e-05, "loss": 2.2744, "step": 28 }, { "epoch": 0.036848792884371026, "grad_norm": 3.222039338985631, "learning_rate": 1.45e-05, "loss": 2.4058, "step": 29 }, { "epoch": 0.03811944091486658, "grad_norm": 1.524480532049421, "learning_rate": 1.5000000000000002e-05, "loss": 2.2636, "step": 30 }, { "epoch": 0.03939008894536213, "grad_norm": 2.8592397792332145, "learning_rate": 1.55e-05, "loss": 2.2612, "step": 31 }, { "epoch": 0.04066073697585769, "grad_norm": 1.5062409854326497, "learning_rate": 1.6000000000000003e-05, "loss": 2.3015, "step": 32 }, { "epoch": 0.04193138500635324, "grad_norm": 3.5634874512948898, "learning_rate": 1.65e-05, "loss": 2.5092, "step": 33 }, { "epoch": 0.043202033036848796, "grad_norm": 1.6040953286994395, "learning_rate": 1.7e-05, "loss": 2.4476, "step": 34 }, { "epoch": 0.044472681067344345, "grad_norm": 2.649185181368789, "learning_rate": 1.7500000000000002e-05, "loss": 2.4681, "step": 35 }, { "epoch": 0.045743329097839895, "grad_norm": 1.5158813542671679, "learning_rate": 1.8e-05, "loss": 2.292, "step": 36 }, { "epoch": 0.04701397712833545, "grad_norm": 2.8599704552305836, "learning_rate": 1.8500000000000002e-05, "loss": 2.467, "step": 37 }, { "epoch": 0.048284625158831, "grad_norm": 1.955652517362263, "learning_rate": 1.9e-05, "loss": 2.3329, "step": 38 }, { "epoch": 0.04955527318932656, "grad_norm": 2.036186380457819, "learning_rate": 1.95e-05, "loss": 2.4255, "step": 39 }, { "epoch": 0.05082592121982211, "grad_norm": 1.906309806811743, "learning_rate": 2e-05, "loss": 2.3245, "step": 40 }, { "epoch": 0.052096569250317665, "grad_norm": 1.7089031257315084, "learning_rate": 1.9999994891331854e-05, "loss": 2.2173, "step": 41 }, { "epoch": 0.053367217280813214, "grad_norm": 1.7988981102734363, "learning_rate": 1.9999979565332626e-05, "loss": 2.3608, "step": 42 }, { "epoch": 0.054637865311308764, "grad_norm": 1.833920539197692, "learning_rate": 1.9999954022017984e-05, "loss": 2.3494, "step": 43 }, { "epoch": 0.05590851334180432, "grad_norm": 1.8097536920457602, "learning_rate": 1.9999918261414016e-05, "loss": 2.2209, "step": 44 }, { "epoch": 0.05717916137229987, "grad_norm": 1.7411452453996696, "learning_rate": 1.9999872283557267e-05, "loss": 2.3133, "step": 45 }, { "epoch": 0.05844980940279543, "grad_norm": 1.70501831199215, "learning_rate": 1.9999816088494717e-05, "loss": 2.5452, "step": 46 }, { "epoch": 0.05972045743329098, "grad_norm": 2.350235220222271, "learning_rate": 1.9999749676283775e-05, "loss": 2.4249, "step": 47 }, { "epoch": 0.060991105463786534, "grad_norm": 2.6056898788385565, "learning_rate": 1.9999673046992304e-05, "loss": 2.4791, "step": 48 }, { "epoch": 0.062261753494282084, "grad_norm": 2.023938613975313, "learning_rate": 1.9999586200698588e-05, "loss": 2.5564, "step": 49 }, { "epoch": 0.06353240152477764, "grad_norm": 2.202019619137906, "learning_rate": 1.999948913749137e-05, "loss": 2.512, "step": 50 }, { "epoch": 0.06480304955527319, "grad_norm": 1.6889275088093092, "learning_rate": 1.999938185746982e-05, "loss": 2.4475, "step": 51 }, { "epoch": 0.06607369758576874, "grad_norm": 1.4638142699756047, "learning_rate": 1.999926436074355e-05, "loss": 2.5486, "step": 52 }, { "epoch": 0.06734434561626429, "grad_norm": 1.4493503821610307, "learning_rate": 1.999913664743261e-05, "loss": 2.2954, "step": 53 }, { "epoch": 0.06861499364675985, "grad_norm": 1.4930408619252327, "learning_rate": 1.999899871766749e-05, "loss": 2.3253, "step": 54 }, { "epoch": 0.0698856416772554, "grad_norm": 1.4795888492851268, "learning_rate": 1.9998850571589114e-05, "loss": 2.2496, "step": 55 }, { "epoch": 0.07115628970775095, "grad_norm": 4.386537889011212, "learning_rate": 1.9998692209348852e-05, "loss": 2.3639, "step": 56 }, { "epoch": 0.0724269377382465, "grad_norm": 1.9608285617315484, "learning_rate": 1.9998523631108506e-05, "loss": 2.4487, "step": 57 }, { "epoch": 0.07369758576874205, "grad_norm": 2.3825490594197998, "learning_rate": 1.9998344837040318e-05, "loss": 2.2838, "step": 58 }, { "epoch": 0.07496823379923762, "grad_norm": 1.8996986104829006, "learning_rate": 1.999815582732697e-05, "loss": 2.4616, "step": 59 }, { "epoch": 0.07623888182973317, "grad_norm": 1.6631718517357963, "learning_rate": 1.9997956602161577e-05, "loss": 2.3015, "step": 60 }, { "epoch": 0.07750952986022872, "grad_norm": 1.9538892233672316, "learning_rate": 1.9997747161747696e-05, "loss": 2.501, "step": 61 }, { "epoch": 0.07878017789072427, "grad_norm": 1.2795079596218877, "learning_rate": 1.9997527506299318e-05, "loss": 2.3801, "step": 62 }, { "epoch": 0.08005082592121983, "grad_norm": 4.523259680668686, "learning_rate": 1.999729763604087e-05, "loss": 2.6326, "step": 63 }, { "epoch": 0.08132147395171538, "grad_norm": 1.647620072325872, "learning_rate": 1.9997057551207223e-05, "loss": 2.22, "step": 64 }, { "epoch": 0.08259212198221093, "grad_norm": 1.5998559204773664, "learning_rate": 1.9996807252043677e-05, "loss": 2.3037, "step": 65 }, { "epoch": 0.08386277001270648, "grad_norm": 2.1714649315563017, "learning_rate": 1.9996546738805972e-05, "loss": 2.2916, "step": 66 }, { "epoch": 0.08513341804320203, "grad_norm": 2.1204211958942802, "learning_rate": 1.999627601176028e-05, "loss": 2.1388, "step": 67 }, { "epoch": 0.08640406607369759, "grad_norm": 3.1235773393190307, "learning_rate": 1.999599507118322e-05, "loss": 2.3557, "step": 68 }, { "epoch": 0.08767471410419314, "grad_norm": 1.6752389846723672, "learning_rate": 1.999570391736183e-05, "loss": 2.4894, "step": 69 }, { "epoch": 0.08894536213468869, "grad_norm": 2.1252725394449645, "learning_rate": 1.999540255059359e-05, "loss": 2.5256, "step": 70 }, { "epoch": 0.09021601016518424, "grad_norm": 1.4228164737722837, "learning_rate": 1.999509097118643e-05, "loss": 2.4114, "step": 71 }, { "epoch": 0.09148665819567979, "grad_norm": 1.594698364320451, "learning_rate": 1.9994769179458687e-05, "loss": 2.2206, "step": 72 }, { "epoch": 0.09275730622617535, "grad_norm": 1.4260368390306692, "learning_rate": 1.9994437175739154e-05, "loss": 2.344, "step": 73 }, { "epoch": 0.0940279542566709, "grad_norm": 1.6052123309042803, "learning_rate": 1.999409496036705e-05, "loss": 2.324, "step": 74 }, { "epoch": 0.09529860228716645, "grad_norm": 1.1887947295478682, "learning_rate": 1.999374253369202e-05, "loss": 2.1568, "step": 75 }, { "epoch": 0.096569250317662, "grad_norm": 1.7214375838627587, "learning_rate": 1.9993379896074163e-05, "loss": 2.3768, "step": 76 }, { "epoch": 0.09783989834815757, "grad_norm": 1.546493635353293, "learning_rate": 1.9993007047883988e-05, "loss": 2.2587, "step": 77 }, { "epoch": 0.09911054637865312, "grad_norm": 1.408228139736878, "learning_rate": 1.9992623989502448e-05, "loss": 2.3015, "step": 78 }, { "epoch": 0.10038119440914867, "grad_norm": 1.3631710614333141, "learning_rate": 1.9992230721320932e-05, "loss": 2.1168, "step": 79 }, { "epoch": 0.10165184243964422, "grad_norm": 1.4184248992150634, "learning_rate": 1.9991827243741253e-05, "loss": 2.5526, "step": 80 }, { "epoch": 0.10292249047013977, "grad_norm": 1.431143102369531, "learning_rate": 1.9991413557175656e-05, "loss": 2.2523, "step": 81 }, { "epoch": 0.10419313850063533, "grad_norm": 2.00398683475326, "learning_rate": 1.999098966204682e-05, "loss": 2.428, "step": 82 }, { "epoch": 0.10546378653113088, "grad_norm": 1.8800068765705902, "learning_rate": 1.9990555558787847e-05, "loss": 2.6345, "step": 83 }, { "epoch": 0.10673443456162643, "grad_norm": 1.4579776022353235, "learning_rate": 1.9990111247842285e-05, "loss": 2.459, "step": 84 }, { "epoch": 0.10800508259212198, "grad_norm": 1.9205744158321651, "learning_rate": 1.998965672966409e-05, "loss": 2.0765, "step": 85 }, { "epoch": 0.10927573062261753, "grad_norm": 2.68259040488466, "learning_rate": 1.9989192004717672e-05, "loss": 2.58, "step": 86 }, { "epoch": 0.11054637865311309, "grad_norm": 1.7085159849494436, "learning_rate": 1.9988717073477842e-05, "loss": 2.4019, "step": 87 }, { "epoch": 0.11181702668360864, "grad_norm": 1.3008915942158903, "learning_rate": 1.9988231936429866e-05, "loss": 2.1258, "step": 88 }, { "epoch": 0.11308767471410419, "grad_norm": 1.5495282607802583, "learning_rate": 1.9987736594069417e-05, "loss": 2.2544, "step": 89 }, { "epoch": 0.11435832274459974, "grad_norm": 1.5785369727348832, "learning_rate": 1.9987231046902602e-05, "loss": 2.194, "step": 90 }, { "epoch": 0.1156289707750953, "grad_norm": 1.570746098900601, "learning_rate": 1.9986715295445963e-05, "loss": 2.0638, "step": 91 }, { "epoch": 0.11689961880559085, "grad_norm": 1.7320562240650357, "learning_rate": 1.9986189340226455e-05, "loss": 2.3778, "step": 92 }, { "epoch": 0.1181702668360864, "grad_norm": 2.421076291096759, "learning_rate": 1.9985653181781465e-05, "loss": 2.4102, "step": 93 }, { "epoch": 0.11944091486658195, "grad_norm": 2.6125970691744205, "learning_rate": 1.99851068206588e-05, "loss": 2.543, "step": 94 }, { "epoch": 0.1207115628970775, "grad_norm": 2.1586629440192024, "learning_rate": 1.9984550257416706e-05, "loss": 2.4479, "step": 95 }, { "epoch": 0.12198221092757307, "grad_norm": 1.3699724096006964, "learning_rate": 1.9983983492623832e-05, "loss": 2.413, "step": 96 }, { "epoch": 0.12325285895806862, "grad_norm": 1.7543498248329765, "learning_rate": 1.9983406526859266e-05, "loss": 2.2759, "step": 97 }, { "epoch": 0.12452350698856417, "grad_norm": 1.4243383326582497, "learning_rate": 1.9982819360712514e-05, "loss": 1.9626, "step": 98 }, { "epoch": 0.12579415501905972, "grad_norm": 1.708842956682712, "learning_rate": 1.99822219947835e-05, "loss": 2.315, "step": 99 }, { "epoch": 0.12706480304955528, "grad_norm": 1.5848118391318178, "learning_rate": 1.9981614429682576e-05, "loss": 2.5247, "step": 100 }, { "epoch": 0.12833545108005082, "grad_norm": 1.7565244063500633, "learning_rate": 1.9980996666030507e-05, "loss": 2.569, "step": 101 }, { "epoch": 0.12960609911054638, "grad_norm": 1.7188897035182684, "learning_rate": 1.998036870445849e-05, "loss": 2.5498, "step": 102 }, { "epoch": 0.13087674714104194, "grad_norm": 1.321697488805632, "learning_rate": 1.9979730545608128e-05, "loss": 2.0094, "step": 103 }, { "epoch": 0.13214739517153748, "grad_norm": 1.3411797947362514, "learning_rate": 1.997908219013145e-05, "loss": 2.4512, "step": 104 }, { "epoch": 0.13341804320203304, "grad_norm": 1.4764463188652248, "learning_rate": 1.997842363869091e-05, "loss": 2.0464, "step": 105 }, { "epoch": 0.13468869123252858, "grad_norm": 6.0220828263716175, "learning_rate": 1.9977754891959363e-05, "loss": 2.3022, "step": 106 }, { "epoch": 0.13595933926302414, "grad_norm": 1.9760805767200045, "learning_rate": 1.9977075950620093e-05, "loss": 2.3951, "step": 107 }, { "epoch": 0.1372299872935197, "grad_norm": 1.5413897633546927, "learning_rate": 1.9976386815366796e-05, "loss": 2.3606, "step": 108 }, { "epoch": 0.13850063532401524, "grad_norm": 1.8991731806444883, "learning_rate": 1.997568748690359e-05, "loss": 2.2424, "step": 109 }, { "epoch": 0.1397712833545108, "grad_norm": 1.526135893853535, "learning_rate": 1.9974977965945e-05, "loss": 2.2526, "step": 110 }, { "epoch": 0.14104193138500634, "grad_norm": 1.602806512484664, "learning_rate": 1.9974258253215964e-05, "loss": 2.2839, "step": 111 }, { "epoch": 0.1423125794155019, "grad_norm": 1.4325519869083885, "learning_rate": 1.997352834945184e-05, "loss": 2.3297, "step": 112 }, { "epoch": 0.14358322744599747, "grad_norm": 1.5168784833615567, "learning_rate": 1.997278825539839e-05, "loss": 2.3959, "step": 113 }, { "epoch": 0.144853875476493, "grad_norm": 1.9135394229796725, "learning_rate": 1.9972037971811802e-05, "loss": 1.7828, "step": 114 }, { "epoch": 0.14612452350698857, "grad_norm": 1.6792769441260569, "learning_rate": 1.9971277499458663e-05, "loss": 2.1888, "step": 115 }, { "epoch": 0.1473951715374841, "grad_norm": 1.739028731042485, "learning_rate": 1.9970506839115965e-05, "loss": 2.2422, "step": 116 }, { "epoch": 0.14866581956797967, "grad_norm": 1.4765946902455809, "learning_rate": 1.996972599157113e-05, "loss": 2.2422, "step": 117 }, { "epoch": 0.14993646759847523, "grad_norm": 2.0492840257759557, "learning_rate": 1.996893495762197e-05, "loss": 2.399, "step": 118 }, { "epoch": 0.15120711562897077, "grad_norm": 1.4275096196327925, "learning_rate": 1.9968133738076707e-05, "loss": 2.3486, "step": 119 }, { "epoch": 0.15247776365946633, "grad_norm": 1.6302330872080086, "learning_rate": 1.9967322333753978e-05, "loss": 2.2748, "step": 120 }, { "epoch": 0.15374841168996187, "grad_norm": 1.412615834080105, "learning_rate": 1.9966500745482824e-05, "loss": 2.3581, "step": 121 }, { "epoch": 0.15501905972045743, "grad_norm": 1.453502241535778, "learning_rate": 1.996566897410269e-05, "loss": 2.4823, "step": 122 }, { "epoch": 0.156289707750953, "grad_norm": 1.627229751845881, "learning_rate": 1.9964827020463418e-05, "loss": 2.2998, "step": 123 }, { "epoch": 0.15756035578144853, "grad_norm": 1.4185856495685691, "learning_rate": 1.9963974885425267e-05, "loss": 2.0273, "step": 124 }, { "epoch": 0.1588310038119441, "grad_norm": 1.5689622208187057, "learning_rate": 1.996311256985889e-05, "loss": 2.131, "step": 125 }, { "epoch": 0.16010165184243966, "grad_norm": 1.9586155379600372, "learning_rate": 1.9962240074645344e-05, "loss": 2.4964, "step": 126 }, { "epoch": 0.1613722998729352, "grad_norm": 1.5528955040049794, "learning_rate": 1.9961357400676085e-05, "loss": 2.3646, "step": 127 }, { "epoch": 0.16264294790343076, "grad_norm": 8.391044079407488, "learning_rate": 1.996046454885297e-05, "loss": 2.4039, "step": 128 }, { "epoch": 0.1639135959339263, "grad_norm": 1.7371418333998156, "learning_rate": 1.995956152008826e-05, "loss": 2.252, "step": 129 }, { "epoch": 0.16518424396442186, "grad_norm": 1.697773515006207, "learning_rate": 1.9958648315304606e-05, "loss": 2.3198, "step": 130 }, { "epoch": 0.16645489199491742, "grad_norm": 1.28409692971463, "learning_rate": 1.9957724935435065e-05, "loss": 2.1874, "step": 131 }, { "epoch": 0.16772554002541296, "grad_norm": 1.267390055193146, "learning_rate": 1.995679138142308e-05, "loss": 2.1704, "step": 132 }, { "epoch": 0.16899618805590852, "grad_norm": 1.3666767964046194, "learning_rate": 1.9955847654222493e-05, "loss": 2.1776, "step": 133 }, { "epoch": 0.17026683608640406, "grad_norm": 1.2425502438881062, "learning_rate": 1.995489375479755e-05, "loss": 2.2165, "step": 134 }, { "epoch": 0.17153748411689962, "grad_norm": 1.2599811393514255, "learning_rate": 1.9953929684122875e-05, "loss": 2.3532, "step": 135 }, { "epoch": 0.17280813214739518, "grad_norm": 1.6206053415885442, "learning_rate": 1.995295544318349e-05, "loss": 2.3016, "step": 136 }, { "epoch": 0.17407878017789072, "grad_norm": 1.6451053052999345, "learning_rate": 1.995197103297482e-05, "loss": 2.4567, "step": 137 }, { "epoch": 0.17534942820838628, "grad_norm": 1.548272572718846, "learning_rate": 1.995097645450266e-05, "loss": 2.3029, "step": 138 }, { "epoch": 0.17662007623888182, "grad_norm": 1.3212151931749498, "learning_rate": 1.9949971708783212e-05, "loss": 2.0577, "step": 139 }, { "epoch": 0.17789072426937738, "grad_norm": 9.408887399092182, "learning_rate": 1.994895679684305e-05, "loss": 2.3772, "step": 140 }, { "epoch": 0.17916137229987295, "grad_norm": 1.6125401231020167, "learning_rate": 1.9947931719719146e-05, "loss": 2.4229, "step": 141 }, { "epoch": 0.18043202033036848, "grad_norm": 1.6662864267128135, "learning_rate": 1.9946896478458862e-05, "loss": 2.1524, "step": 142 }, { "epoch": 0.18170266836086404, "grad_norm": 1.9673156126026754, "learning_rate": 1.9945851074119934e-05, "loss": 2.4578, "step": 143 }, { "epoch": 0.18297331639135958, "grad_norm": 1.4828274089192293, "learning_rate": 1.9944795507770487e-05, "loss": 2.1372, "step": 144 }, { "epoch": 0.18424396442185514, "grad_norm": 1.79936209005378, "learning_rate": 1.994372978048903e-05, "loss": 2.4361, "step": 145 }, { "epoch": 0.1855146124523507, "grad_norm": 1.7759170388327536, "learning_rate": 1.9942653893364446e-05, "loss": 2.1204, "step": 146 }, { "epoch": 0.18678526048284624, "grad_norm": 1.7281108781420167, "learning_rate": 1.9941567847496012e-05, "loss": 2.1991, "step": 147 }, { "epoch": 0.1880559085133418, "grad_norm": 2.1166769515572086, "learning_rate": 1.994047164399338e-05, "loss": 2.3989, "step": 148 }, { "epoch": 0.18932655654383734, "grad_norm": 1.2950345583922922, "learning_rate": 1.993936528397657e-05, "loss": 2.1799, "step": 149 }, { "epoch": 0.1905972045743329, "grad_norm": 1.5348226477369444, "learning_rate": 1.993824876857599e-05, "loss": 1.9653, "step": 150 }, { "epoch": 0.19186785260482847, "grad_norm": 1.7518894681262407, "learning_rate": 1.9937122098932428e-05, "loss": 2.5341, "step": 151 }, { "epoch": 0.193138500635324, "grad_norm": 1.5845440592839513, "learning_rate": 1.9935985276197033e-05, "loss": 2.2863, "step": 152 }, { "epoch": 0.19440914866581957, "grad_norm": 1.1806097502271196, "learning_rate": 1.9934838301531334e-05, "loss": 2.062, "step": 153 }, { "epoch": 0.19567979669631513, "grad_norm": 1.3147281942523097, "learning_rate": 1.9933681176107237e-05, "loss": 2.2483, "step": 154 }, { "epoch": 0.19695044472681067, "grad_norm": 1.2369606160269775, "learning_rate": 1.9932513901107017e-05, "loss": 2.2221, "step": 155 }, { "epoch": 0.19822109275730623, "grad_norm": 1.2180750717072797, "learning_rate": 1.9931336477723315e-05, "loss": 2.3452, "step": 156 }, { "epoch": 0.19949174078780177, "grad_norm": 1.3792727435327043, "learning_rate": 1.9930148907159146e-05, "loss": 2.2339, "step": 157 }, { "epoch": 0.20076238881829733, "grad_norm": 1.5961667123190166, "learning_rate": 1.992895119062789e-05, "loss": 2.2199, "step": 158 }, { "epoch": 0.2020330368487929, "grad_norm": 1.5749862121075178, "learning_rate": 1.9927743329353295e-05, "loss": 2.3563, "step": 159 }, { "epoch": 0.20330368487928843, "grad_norm": 2.2620408058967474, "learning_rate": 1.992652532456947e-05, "loss": 2.1471, "step": 160 }, { "epoch": 0.204574332909784, "grad_norm": 1.4796151191311295, "learning_rate": 1.9925297177520903e-05, "loss": 2.3141, "step": 161 }, { "epoch": 0.20584498094027953, "grad_norm": 1.4466702526825321, "learning_rate": 1.9924058889462413e-05, "loss": 2.4301, "step": 162 }, { "epoch": 0.2071156289707751, "grad_norm": 1.7194780999677342, "learning_rate": 1.992281046165922e-05, "loss": 2.1867, "step": 163 }, { "epoch": 0.20838627700127066, "grad_norm": 1.4209604569153904, "learning_rate": 1.9921551895386875e-05, "loss": 2.3164, "step": 164 }, { "epoch": 0.2096569250317662, "grad_norm": 1.7307653962070104, "learning_rate": 1.99202831919313e-05, "loss": 2.0334, "step": 165 }, { "epoch": 0.21092757306226176, "grad_norm": 1.5080354301530285, "learning_rate": 1.9919004352588768e-05, "loss": 2.6573, "step": 166 }, { "epoch": 0.2121982210927573, "grad_norm": 1.9916488578875324, "learning_rate": 1.991771537866592e-05, "loss": 2.5658, "step": 167 }, { "epoch": 0.21346886912325286, "grad_norm": 1.4649797775396523, "learning_rate": 1.9916416271479736e-05, "loss": 2.3194, "step": 168 }, { "epoch": 0.21473951715374842, "grad_norm": 1.4247137988825938, "learning_rate": 1.9915107032357564e-05, "loss": 2.5108, "step": 169 }, { "epoch": 0.21601016518424396, "grad_norm": 1.3024065770687974, "learning_rate": 1.9913787662637093e-05, "loss": 2.0994, "step": 170 }, { "epoch": 0.21728081321473952, "grad_norm": 1.2857669615933136, "learning_rate": 1.9912458163666367e-05, "loss": 2.2093, "step": 171 }, { "epoch": 0.21855146124523506, "grad_norm": 1.4238166128200767, "learning_rate": 1.9911118536803785e-05, "loss": 2.2752, "step": 172 }, { "epoch": 0.21982210927573062, "grad_norm": 1.575778430022986, "learning_rate": 1.9909768783418086e-05, "loss": 2.1218, "step": 173 }, { "epoch": 0.22109275730622618, "grad_norm": 1.2962857082842534, "learning_rate": 1.9908408904888356e-05, "loss": 2.3936, "step": 174 }, { "epoch": 0.22236340533672172, "grad_norm": 1.5279108597720172, "learning_rate": 1.9907038902604033e-05, "loss": 2.249, "step": 175 }, { "epoch": 0.22363405336721728, "grad_norm": 1.2704750696590625, "learning_rate": 1.9905658777964888e-05, "loss": 2.3086, "step": 176 }, { "epoch": 0.22490470139771285, "grad_norm": 1.3969610069878315, "learning_rate": 1.990426853238105e-05, "loss": 2.2686, "step": 177 }, { "epoch": 0.22617534942820838, "grad_norm": 1.460679593441549, "learning_rate": 1.990286816727297e-05, "loss": 2.1181, "step": 178 }, { "epoch": 0.22744599745870395, "grad_norm": 1.3366579414273725, "learning_rate": 1.9901457684071453e-05, "loss": 2.2765, "step": 179 }, { "epoch": 0.22871664548919948, "grad_norm": 1.245913482097206, "learning_rate": 1.9900037084217637e-05, "loss": 2.1238, "step": 180 }, { "epoch": 0.22998729351969505, "grad_norm": 1.355073381692421, "learning_rate": 1.9898606369163e-05, "loss": 2.3252, "step": 181 }, { "epoch": 0.2312579415501906, "grad_norm": 1.28382798355187, "learning_rate": 1.989716554036935e-05, "loss": 2.3593, "step": 182 }, { "epoch": 0.23252858958068615, "grad_norm": 1.3272489788914297, "learning_rate": 1.9895714599308822e-05, "loss": 2.2453, "step": 183 }, { "epoch": 0.2337992376111817, "grad_norm": 1.2869596536907777, "learning_rate": 1.9894253547463897e-05, "loss": 2.2947, "step": 184 }, { "epoch": 0.23506988564167725, "grad_norm": 1.2978788321340524, "learning_rate": 1.9892782386327385e-05, "loss": 2.5479, "step": 185 }, { "epoch": 0.2363405336721728, "grad_norm": 1.2918291928610448, "learning_rate": 1.9891301117402415e-05, "loss": 2.5686, "step": 186 }, { "epoch": 0.23761118170266837, "grad_norm": 1.155321594136107, "learning_rate": 1.9889809742202454e-05, "loss": 2.0011, "step": 187 }, { "epoch": 0.2388818297331639, "grad_norm": 1.1459073035417648, "learning_rate": 1.9888308262251286e-05, "loss": 2.0969, "step": 188 }, { "epoch": 0.24015247776365947, "grad_norm": 1.252131993544361, "learning_rate": 1.9886796679083027e-05, "loss": 2.3877, "step": 189 }, { "epoch": 0.241423125794155, "grad_norm": 1.20738751209847, "learning_rate": 1.988527499424211e-05, "loss": 2.4384, "step": 190 }, { "epoch": 0.24269377382465057, "grad_norm": 1.3691140924554905, "learning_rate": 1.9883743209283293e-05, "loss": 2.2901, "step": 191 }, { "epoch": 0.24396442185514614, "grad_norm": 1.638413576701321, "learning_rate": 1.988220132577165e-05, "loss": 2.3583, "step": 192 }, { "epoch": 0.24523506988564167, "grad_norm": 1.2730316144536322, "learning_rate": 1.9880649345282577e-05, "loss": 1.9856, "step": 193 }, { "epoch": 0.24650571791613723, "grad_norm": 1.2358397961128764, "learning_rate": 1.9879087269401782e-05, "loss": 2.23, "step": 194 }, { "epoch": 0.24777636594663277, "grad_norm": 1.547745354676377, "learning_rate": 1.9877515099725294e-05, "loss": 2.4606, "step": 195 }, { "epoch": 0.24904701397712833, "grad_norm": 1.4391127109203996, "learning_rate": 1.987593283785945e-05, "loss": 2.3736, "step": 196 }, { "epoch": 0.2503176620076239, "grad_norm": 1.7585520256425062, "learning_rate": 1.9874340485420904e-05, "loss": 2.1316, "step": 197 }, { "epoch": 0.25158831003811943, "grad_norm": 1.319651195207614, "learning_rate": 1.987273804403661e-05, "loss": 2.1887, "step": 198 }, { "epoch": 0.25285895806861497, "grad_norm": 1.286157362467024, "learning_rate": 1.987112551534384e-05, "loss": 1.9623, "step": 199 }, { "epoch": 0.25412960609911056, "grad_norm": 1.3073822572525833, "learning_rate": 1.9869502900990168e-05, "loss": 2.4822, "step": 200 }, { "epoch": 0.2554002541296061, "grad_norm": 1.446649580245481, "learning_rate": 1.986787020263347e-05, "loss": 2.2048, "step": 201 }, { "epoch": 0.25667090216010163, "grad_norm": 1.3327450989551783, "learning_rate": 1.9866227421941934e-05, "loss": 2.1352, "step": 202 }, { "epoch": 0.2579415501905972, "grad_norm": 1.3718218115332597, "learning_rate": 1.9864574560594043e-05, "loss": 2.1044, "step": 203 }, { "epoch": 0.25921219822109276, "grad_norm": 1.797849151509624, "learning_rate": 1.986291162027858e-05, "loss": 2.199, "step": 204 }, { "epoch": 0.2604828462515883, "grad_norm": 1.3083060220775833, "learning_rate": 1.9861238602694624e-05, "loss": 2.2676, "step": 205 }, { "epoch": 0.2617534942820839, "grad_norm": 1.424052626314714, "learning_rate": 1.9859555509551564e-05, "loss": 2.0826, "step": 206 }, { "epoch": 0.2630241423125794, "grad_norm": 1.5293094955596058, "learning_rate": 1.985786234256906e-05, "loss": 1.9344, "step": 207 }, { "epoch": 0.26429479034307496, "grad_norm": 1.7076258249022205, "learning_rate": 1.9856159103477085e-05, "loss": 2.3119, "step": 208 }, { "epoch": 0.2655654383735705, "grad_norm": 1.2127373243223818, "learning_rate": 1.9854445794015895e-05, "loss": 2.2381, "step": 209 }, { "epoch": 0.2668360864040661, "grad_norm": 1.4196056184224093, "learning_rate": 1.9852722415936034e-05, "loss": 2.216, "step": 210 }, { "epoch": 0.2681067344345616, "grad_norm": 1.4123460554161025, "learning_rate": 1.9850988970998334e-05, "loss": 2.4406, "step": 211 }, { "epoch": 0.26937738246505716, "grad_norm": 1.9097154188260683, "learning_rate": 1.984924546097392e-05, "loss": 2.2335, "step": 212 }, { "epoch": 0.27064803049555275, "grad_norm": 1.570238579935913, "learning_rate": 1.984749188764419e-05, "loss": 2.2708, "step": 213 }, { "epoch": 0.2719186785260483, "grad_norm": 1.2007821744706566, "learning_rate": 1.9845728252800827e-05, "loss": 2.018, "step": 214 }, { "epoch": 0.2731893265565438, "grad_norm": 2.4648382684532737, "learning_rate": 1.98439545582458e-05, "loss": 2.6116, "step": 215 }, { "epoch": 0.2744599745870394, "grad_norm": 1.8858078450769689, "learning_rate": 1.9842170805791356e-05, "loss": 2.7223, "step": 216 }, { "epoch": 0.27573062261753495, "grad_norm": 1.211266603268658, "learning_rate": 1.9840376997260005e-05, "loss": 2.1076, "step": 217 }, { "epoch": 0.2770012706480305, "grad_norm": 1.5327466889696617, "learning_rate": 1.983857313448455e-05, "loss": 2.4202, "step": 218 }, { "epoch": 0.2782719186785261, "grad_norm": 1.1157105533698264, "learning_rate": 1.983675921930805e-05, "loss": 2.0862, "step": 219 }, { "epoch": 0.2795425667090216, "grad_norm": 1.3261147782715659, "learning_rate": 1.983493525358385e-05, "loss": 2.1126, "step": 220 }, { "epoch": 0.28081321473951715, "grad_norm": 1.1567356334830963, "learning_rate": 1.983310123917556e-05, "loss": 2.1587, "step": 221 }, { "epoch": 0.2820838627700127, "grad_norm": 1.08583197973077, "learning_rate": 1.9831257177957045e-05, "loss": 2.418, "step": 222 }, { "epoch": 0.2833545108005083, "grad_norm": 1.1077378609711996, "learning_rate": 1.9829403071812448e-05, "loss": 1.9391, "step": 223 }, { "epoch": 0.2846251588310038, "grad_norm": 1.2147030605199152, "learning_rate": 1.9827538922636174e-05, "loss": 2.2372, "step": 224 }, { "epoch": 0.28589580686149935, "grad_norm": 1.3494759002510996, "learning_rate": 1.9825664732332886e-05, "loss": 2.1855, "step": 225 }, { "epoch": 0.28716645489199494, "grad_norm": 1.1340108074767525, "learning_rate": 1.98237805028175e-05, "loss": 2.1137, "step": 226 }, { "epoch": 0.2884371029224905, "grad_norm": 1.185055421017148, "learning_rate": 1.982188623601521e-05, "loss": 2.2954, "step": 227 }, { "epoch": 0.289707750952986, "grad_norm": 1.3631793970945223, "learning_rate": 1.9819981933861446e-05, "loss": 2.356, "step": 228 }, { "epoch": 0.2909783989834816, "grad_norm": 1.3765107028630217, "learning_rate": 1.9818067598301894e-05, "loss": 2.3977, "step": 229 }, { "epoch": 0.29224904701397714, "grad_norm": 1.3177872840035507, "learning_rate": 1.9816143231292496e-05, "loss": 2.4979, "step": 230 }, { "epoch": 0.2935196950444727, "grad_norm": 1.2597614560659143, "learning_rate": 1.9814208834799446e-05, "loss": 2.3582, "step": 231 }, { "epoch": 0.2947903430749682, "grad_norm": 1.4455310696608084, "learning_rate": 1.981226441079918e-05, "loss": 2.3261, "step": 232 }, { "epoch": 0.2960609911054638, "grad_norm": 1.1129922300811594, "learning_rate": 1.9810309961278383e-05, "loss": 2.0914, "step": 233 }, { "epoch": 0.29733163913595934, "grad_norm": 1.4263394114610715, "learning_rate": 1.980834548823398e-05, "loss": 2.2741, "step": 234 }, { "epoch": 0.29860228716645487, "grad_norm": 1.2498079803290318, "learning_rate": 1.980637099367314e-05, "loss": 2.236, "step": 235 }, { "epoch": 0.29987293519695046, "grad_norm": 1.348249170128615, "learning_rate": 1.9804386479613268e-05, "loss": 2.2804, "step": 236 }, { "epoch": 0.301143583227446, "grad_norm": 1.272181875212829, "learning_rate": 1.9802391948082013e-05, "loss": 2.3993, "step": 237 }, { "epoch": 0.30241423125794153, "grad_norm": 1.4361318089870239, "learning_rate": 1.9800387401117252e-05, "loss": 2.1691, "step": 238 }, { "epoch": 0.3036848792884371, "grad_norm": 1.3958415978926788, "learning_rate": 1.9798372840767096e-05, "loss": 2.481, "step": 239 }, { "epoch": 0.30495552731893266, "grad_norm": 1.2566347823253226, "learning_rate": 1.97963482690899e-05, "loss": 2.2282, "step": 240 }, { "epoch": 0.3062261753494282, "grad_norm": 1.999809093000962, "learning_rate": 1.9794313688154222e-05, "loss": 2.0083, "step": 241 }, { "epoch": 0.30749682337992373, "grad_norm": 1.3790939937384645, "learning_rate": 1.979226910003887e-05, "loss": 2.2375, "step": 242 }, { "epoch": 0.3087674714104193, "grad_norm": 1.7157559828108924, "learning_rate": 1.9790214506832868e-05, "loss": 2.2433, "step": 243 }, { "epoch": 0.31003811944091486, "grad_norm": 1.576223838850493, "learning_rate": 1.978814991063546e-05, "loss": 2.4754, "step": 244 }, { "epoch": 0.3113087674714104, "grad_norm": 1.3533983164059167, "learning_rate": 1.9786075313556115e-05, "loss": 2.2725, "step": 245 }, { "epoch": 0.312579415501906, "grad_norm": 1.912049364623592, "learning_rate": 1.978399071771452e-05, "loss": 2.2438, "step": 246 }, { "epoch": 0.3138500635324015, "grad_norm": 1.3691066714761335, "learning_rate": 1.9781896125240577e-05, "loss": 2.3712, "step": 247 }, { "epoch": 0.31512071156289706, "grad_norm": 1.591417595217935, "learning_rate": 1.9779791538274403e-05, "loss": 2.0608, "step": 248 }, { "epoch": 0.31639135959339265, "grad_norm": 1.3332773327402838, "learning_rate": 1.9777676958966318e-05, "loss": 2.5189, "step": 249 }, { "epoch": 0.3176620076238882, "grad_norm": 1.451125313869464, "learning_rate": 1.9775552389476865e-05, "loss": 2.226, "step": 250 }, { "epoch": 0.3189326556543837, "grad_norm": 1.6449115746390213, "learning_rate": 1.9773417831976783e-05, "loss": 2.4042, "step": 251 }, { "epoch": 0.3202033036848793, "grad_norm": 1.4155289951041365, "learning_rate": 1.977127328864703e-05, "loss": 2.2907, "step": 252 }, { "epoch": 0.32147395171537485, "grad_norm": 1.3395584815709398, "learning_rate": 1.9769118761678748e-05, "loss": 2.1805, "step": 253 }, { "epoch": 0.3227445997458704, "grad_norm": 1.1847706337861326, "learning_rate": 1.9766954253273297e-05, "loss": 1.9415, "step": 254 }, { "epoch": 0.3240152477763659, "grad_norm": 1.4562207170493793, "learning_rate": 1.9764779765642226e-05, "loss": 2.8472, "step": 255 }, { "epoch": 0.3252858958068615, "grad_norm": 1.4537245318712395, "learning_rate": 1.9762595301007282e-05, "loss": 2.5208, "step": 256 }, { "epoch": 0.32655654383735705, "grad_norm": 1.161757040542941, "learning_rate": 1.97604008616004e-05, "loss": 2.1415, "step": 257 }, { "epoch": 0.3278271918678526, "grad_norm": 1.1214870383786562, "learning_rate": 1.9758196449663726e-05, "loss": 2.1826, "step": 258 }, { "epoch": 0.3290978398983482, "grad_norm": 1.2880601651737729, "learning_rate": 1.9755982067449565e-05, "loss": 2.3432, "step": 259 }, { "epoch": 0.3303684879288437, "grad_norm": 1.503028999152933, "learning_rate": 1.975375771722044e-05, "loss": 2.2745, "step": 260 }, { "epoch": 0.33163913595933925, "grad_norm": 1.2953888075265865, "learning_rate": 1.975152340124904e-05, "loss": 2.23, "step": 261 }, { "epoch": 0.33290978398983484, "grad_norm": 1.3411701590648721, "learning_rate": 1.9749279121818235e-05, "loss": 2.2582, "step": 262 }, { "epoch": 0.3341804320203304, "grad_norm": 1.798210025564672, "learning_rate": 1.974702488122109e-05, "loss": 2.4651, "step": 263 }, { "epoch": 0.3354510800508259, "grad_norm": 1.549188730172701, "learning_rate": 1.9744760681760832e-05, "loss": 2.3012, "step": 264 }, { "epoch": 0.33672172808132145, "grad_norm": 1.3376711650759068, "learning_rate": 1.9742486525750875e-05, "loss": 2.3175, "step": 265 }, { "epoch": 0.33799237611181704, "grad_norm": 1.4373993014473352, "learning_rate": 1.9740202415514794e-05, "loss": 2.2695, "step": 266 }, { "epoch": 0.3392630241423126, "grad_norm": 1.1679976611001082, "learning_rate": 1.9737908353386345e-05, "loss": 2.1141, "step": 267 }, { "epoch": 0.3405336721728081, "grad_norm": 1.153202631564055, "learning_rate": 1.9735604341709448e-05, "loss": 2.0185, "step": 268 }, { "epoch": 0.3418043202033037, "grad_norm": 1.3668597801871027, "learning_rate": 1.973329038283819e-05, "loss": 2.5549, "step": 269 }, { "epoch": 0.34307496823379924, "grad_norm": 1.217837506597909, "learning_rate": 1.973096647913682e-05, "loss": 2.3022, "step": 270 }, { "epoch": 0.3443456162642948, "grad_norm": 1.344490125479597, "learning_rate": 1.9728632632979746e-05, "loss": 2.3829, "step": 271 }, { "epoch": 0.34561626429479037, "grad_norm": 1.0773850473756055, "learning_rate": 1.9726288846751544e-05, "loss": 2.0332, "step": 272 }, { "epoch": 0.3468869123252859, "grad_norm": 1.1878431921754486, "learning_rate": 1.972393512284693e-05, "loss": 2.2342, "step": 273 }, { "epoch": 0.34815756035578144, "grad_norm": 4.06183553812225, "learning_rate": 1.9721571463670794e-05, "loss": 2.3137, "step": 274 }, { "epoch": 0.34942820838627703, "grad_norm": 1.3852941436439452, "learning_rate": 1.9719197871638154e-05, "loss": 2.1223, "step": 275 }, { "epoch": 0.35069885641677256, "grad_norm": 1.1642263744963426, "learning_rate": 1.9716814349174193e-05, "loss": 2.3546, "step": 276 }, { "epoch": 0.3519695044472681, "grad_norm": 1.7204368329652955, "learning_rate": 1.9714420898714243e-05, "loss": 2.2857, "step": 277 }, { "epoch": 0.35324015247776364, "grad_norm": 1.3528935674549563, "learning_rate": 1.9712017522703764e-05, "loss": 2.0075, "step": 278 }, { "epoch": 0.3545108005082592, "grad_norm": 1.848608080743895, "learning_rate": 1.970960422359837e-05, "loss": 2.4993, "step": 279 }, { "epoch": 0.35578144853875476, "grad_norm": 1.115912725965341, "learning_rate": 1.970718100386381e-05, "loss": 2.2729, "step": 280 }, { "epoch": 0.3570520965692503, "grad_norm": 1.7126229343091541, "learning_rate": 1.9704747865975968e-05, "loss": 2.0126, "step": 281 }, { "epoch": 0.3583227445997459, "grad_norm": 1.2859654653410564, "learning_rate": 1.9702304812420864e-05, "loss": 2.0828, "step": 282 }, { "epoch": 0.3595933926302414, "grad_norm": 1.2069593223490485, "learning_rate": 1.9699851845694646e-05, "loss": 2.2182, "step": 283 }, { "epoch": 0.36086404066073696, "grad_norm": 1.244963257392342, "learning_rate": 1.9697388968303596e-05, "loss": 2.1136, "step": 284 }, { "epoch": 0.36213468869123255, "grad_norm": 1.6361926372534965, "learning_rate": 1.9694916182764113e-05, "loss": 2.1869, "step": 285 }, { "epoch": 0.3634053367217281, "grad_norm": 1.214743213942469, "learning_rate": 1.9692433491602732e-05, "loss": 1.7988, "step": 286 }, { "epoch": 0.3646759847522236, "grad_norm": 1.2833630632840507, "learning_rate": 1.96899408973561e-05, "loss": 2.3083, "step": 287 }, { "epoch": 0.36594663278271916, "grad_norm": 1.1401604797768607, "learning_rate": 1.9687438402570976e-05, "loss": 2.2576, "step": 288 }, { "epoch": 0.36721728081321475, "grad_norm": 1.2275194123657456, "learning_rate": 1.9684926009804254e-05, "loss": 1.9264, "step": 289 }, { "epoch": 0.3684879288437103, "grad_norm": 1.136589027712856, "learning_rate": 1.9682403721622928e-05, "loss": 2.1373, "step": 290 }, { "epoch": 0.3697585768742058, "grad_norm": 1.6453722784067453, "learning_rate": 1.96798715406041e-05, "loss": 2.2285, "step": 291 }, { "epoch": 0.3710292249047014, "grad_norm": 1.2928052738281843, "learning_rate": 1.967732946933499e-05, "loss": 2.311, "step": 292 }, { "epoch": 0.37229987293519695, "grad_norm": 1.5312617633714525, "learning_rate": 1.9674777510412913e-05, "loss": 2.0747, "step": 293 }, { "epoch": 0.3735705209656925, "grad_norm": 1.1884554795098419, "learning_rate": 1.9672215666445295e-05, "loss": 2.2441, "step": 294 }, { "epoch": 0.3748411689961881, "grad_norm": 1.3471883018180635, "learning_rate": 1.9669643940049657e-05, "loss": 2.2793, "step": 295 }, { "epoch": 0.3761118170266836, "grad_norm": 1.6844014779879697, "learning_rate": 1.9667062333853618e-05, "loss": 2.14, "step": 296 }, { "epoch": 0.37738246505717915, "grad_norm": 1.4151511187076775, "learning_rate": 1.966447085049489e-05, "loss": 2.3913, "step": 297 }, { "epoch": 0.3786531130876747, "grad_norm": 1.7966731737314259, "learning_rate": 1.966186949262128e-05, "loss": 2.5553, "step": 298 }, { "epoch": 0.3799237611181703, "grad_norm": 1.195568621844938, "learning_rate": 1.9659258262890683e-05, "loss": 2.2099, "step": 299 }, { "epoch": 0.3811944091486658, "grad_norm": 1.1219471693644816, "learning_rate": 1.9656637163971083e-05, "loss": 2.0938, "step": 300 }, { "epoch": 0.38246505717916135, "grad_norm": 1.7171705514194913, "learning_rate": 1.9654006198540543e-05, "loss": 2.2291, "step": 301 }, { "epoch": 0.38373570520965694, "grad_norm": 2.1299812627833705, "learning_rate": 1.9651365369287206e-05, "loss": 2.2524, "step": 302 }, { "epoch": 0.3850063532401525, "grad_norm": 1.2578418042723456, "learning_rate": 1.9648714678909296e-05, "loss": 2.1554, "step": 303 }, { "epoch": 0.386277001270648, "grad_norm": 1.639835412483817, "learning_rate": 1.964605413011512e-05, "loss": 2.1907, "step": 304 }, { "epoch": 0.3875476493011436, "grad_norm": 1.4296870500572354, "learning_rate": 1.9643383725623042e-05, "loss": 2.4861, "step": 305 }, { "epoch": 0.38881829733163914, "grad_norm": 1.5950024668148066, "learning_rate": 1.9640703468161508e-05, "loss": 2.3117, "step": 306 }, { "epoch": 0.3900889453621347, "grad_norm": 1.2586676337646305, "learning_rate": 1.9638013360469026e-05, "loss": 2.1, "step": 307 }, { "epoch": 0.39135959339263027, "grad_norm": 1.3155012674354023, "learning_rate": 1.963531340529417e-05, "loss": 2.3952, "step": 308 }, { "epoch": 0.3926302414231258, "grad_norm": 1.6277796635877058, "learning_rate": 1.9632603605395576e-05, "loss": 2.2169, "step": 309 }, { "epoch": 0.39390088945362134, "grad_norm": 1.406180736008549, "learning_rate": 1.9629883963541933e-05, "loss": 2.2009, "step": 310 }, { "epoch": 0.3951715374841169, "grad_norm": 1.3310192257266478, "learning_rate": 1.9627154482511995e-05, "loss": 2.1501, "step": 311 }, { "epoch": 0.39644218551461247, "grad_norm": 1.3757223525941409, "learning_rate": 1.9624415165094567e-05, "loss": 2.1629, "step": 312 }, { "epoch": 0.397712833545108, "grad_norm": 1.572533153332573, "learning_rate": 1.9621666014088495e-05, "loss": 2.314, "step": 313 }, { "epoch": 0.39898348157560354, "grad_norm": 1.3862969039502098, "learning_rate": 1.9618907032302684e-05, "loss": 2.3253, "step": 314 }, { "epoch": 0.40025412960609913, "grad_norm": 1.8202182825860844, "learning_rate": 1.9616138222556075e-05, "loss": 2.339, "step": 315 }, { "epoch": 0.40152477763659467, "grad_norm": 1.3790668607780714, "learning_rate": 1.9613359587677658e-05, "loss": 2.2941, "step": 316 }, { "epoch": 0.4027954256670902, "grad_norm": 2.1227907347388104, "learning_rate": 1.961057113050645e-05, "loss": 2.3847, "step": 317 }, { "epoch": 0.4040660736975858, "grad_norm": 1.6782062198815728, "learning_rate": 1.9607772853891528e-05, "loss": 2.4474, "step": 318 }, { "epoch": 0.40533672172808133, "grad_norm": 1.1299388068648677, "learning_rate": 1.9604964760691966e-05, "loss": 2.2332, "step": 319 }, { "epoch": 0.40660736975857686, "grad_norm": 1.356462757817847, "learning_rate": 1.9602146853776894e-05, "loss": 2.0578, "step": 320 }, { "epoch": 0.4078780177890724, "grad_norm": 1.6129815716777167, "learning_rate": 1.959931913602547e-05, "loss": 2.1875, "step": 321 }, { "epoch": 0.409148665819568, "grad_norm": 1.5767888873450564, "learning_rate": 1.959648161032686e-05, "loss": 2.2612, "step": 322 }, { "epoch": 0.4104193138500635, "grad_norm": 2.185303482734268, "learning_rate": 1.9593634279580258e-05, "loss": 2.3963, "step": 323 }, { "epoch": 0.41168996188055906, "grad_norm": 1.7641635525451629, "learning_rate": 1.9590777146694888e-05, "loss": 1.9981, "step": 324 }, { "epoch": 0.41296060991105465, "grad_norm": 1.4282615623822061, "learning_rate": 1.9587910214589966e-05, "loss": 2.1085, "step": 325 }, { "epoch": 0.4142312579415502, "grad_norm": 1.5919068774694538, "learning_rate": 1.958503348619474e-05, "loss": 2.2317, "step": 326 }, { "epoch": 0.4155019059720457, "grad_norm": 1.550564026935414, "learning_rate": 1.9582146964448457e-05, "loss": 2.247, "step": 327 }, { "epoch": 0.4167725540025413, "grad_norm": 1.2251036177676735, "learning_rate": 1.957925065230038e-05, "loss": 2.0521, "step": 328 }, { "epoch": 0.41804320203303685, "grad_norm": 1.6011207580960447, "learning_rate": 1.9576344552709762e-05, "loss": 2.2269, "step": 329 }, { "epoch": 0.4193138500635324, "grad_norm": 2.2905735549438457, "learning_rate": 1.9573428668645865e-05, "loss": 1.9699, "step": 330 }, { "epoch": 0.420584498094028, "grad_norm": 1.2596706172279368, "learning_rate": 1.9570503003087947e-05, "loss": 1.986, "step": 331 }, { "epoch": 0.4218551461245235, "grad_norm": 1.3420938625327155, "learning_rate": 1.9567567559025257e-05, "loss": 2.1155, "step": 332 }, { "epoch": 0.42312579415501905, "grad_norm": 1.0397203955504706, "learning_rate": 1.956462233945703e-05, "loss": 1.9222, "step": 333 }, { "epoch": 0.4243964421855146, "grad_norm": 1.3942914467388836, "learning_rate": 1.956166734739251e-05, "loss": 2.2585, "step": 334 }, { "epoch": 0.4256670902160102, "grad_norm": 1.636497018081038, "learning_rate": 1.9558702585850902e-05, "loss": 2.2782, "step": 335 }, { "epoch": 0.4269377382465057, "grad_norm": 1.1133171000095392, "learning_rate": 1.955572805786141e-05, "loss": 2.1721, "step": 336 }, { "epoch": 0.42820838627700125, "grad_norm": 1.2097521097251087, "learning_rate": 1.95527437664632e-05, "loss": 2.1854, "step": 337 }, { "epoch": 0.42947903430749684, "grad_norm": 1.2620840723571427, "learning_rate": 1.954974971470543e-05, "loss": 2.2633, "step": 338 }, { "epoch": 0.4307496823379924, "grad_norm": 1.3328233261636864, "learning_rate": 1.954674590564722e-05, "loss": 2.2392, "step": 339 }, { "epoch": 0.4320203303684879, "grad_norm": 1.1554243062365197, "learning_rate": 1.9543732342357664e-05, "loss": 2.1155, "step": 340 }, { "epoch": 0.4332909783989835, "grad_norm": 1.0601647426096334, "learning_rate": 1.954070902791582e-05, "loss": 2.1215, "step": 341 }, { "epoch": 0.43456162642947904, "grad_norm": 1.3082564186519354, "learning_rate": 1.953767596541071e-05, "loss": 2.2465, "step": 342 }, { "epoch": 0.4358322744599746, "grad_norm": 1.1678820791024087, "learning_rate": 1.9534633157941315e-05, "loss": 2.5573, "step": 343 }, { "epoch": 0.4371029224904701, "grad_norm": 1.0375732983429344, "learning_rate": 1.9531580608616578e-05, "loss": 1.8765, "step": 344 }, { "epoch": 0.4383735705209657, "grad_norm": 1.0354698112304326, "learning_rate": 1.952851832055539e-05, "loss": 2.2035, "step": 345 }, { "epoch": 0.43964421855146124, "grad_norm": 1.3397961801605331, "learning_rate": 1.9525446296886593e-05, "loss": 2.3499, "step": 346 }, { "epoch": 0.4409148665819568, "grad_norm": 2.4318786956134764, "learning_rate": 1.952236454074897e-05, "loss": 2.0937, "step": 347 }, { "epoch": 0.44218551461245237, "grad_norm": 1.3384117168999674, "learning_rate": 1.9519273055291266e-05, "loss": 2.0813, "step": 348 }, { "epoch": 0.4434561626429479, "grad_norm": 1.3106293831176954, "learning_rate": 1.9516171843672153e-05, "loss": 2.1467, "step": 349 }, { "epoch": 0.44472681067344344, "grad_norm": 1.5449558564457215, "learning_rate": 1.9513060909060237e-05, "loss": 2.6179, "step": 350 }, { "epoch": 0.44599745870393903, "grad_norm": 4.436046259180675, "learning_rate": 1.950994025463407e-05, "loss": 2.4073, "step": 351 }, { "epoch": 0.44726810673443457, "grad_norm": 1.327079328183306, "learning_rate": 1.9506809883582126e-05, "loss": 2.0597, "step": 352 }, { "epoch": 0.4485387547649301, "grad_norm": 1.6916627624113527, "learning_rate": 1.9503669799102815e-05, "loss": 2.2454, "step": 353 }, { "epoch": 0.4498094027954257, "grad_norm": 1.516462557151638, "learning_rate": 1.9500520004404458e-05, "loss": 2.4039, "step": 354 }, { "epoch": 0.45108005082592123, "grad_norm": 1.595275957141959, "learning_rate": 1.949736050270532e-05, "loss": 1.9971, "step": 355 }, { "epoch": 0.45235069885641677, "grad_norm": 1.5893741248692173, "learning_rate": 1.949419129723356e-05, "loss": 2.3408, "step": 356 }, { "epoch": 0.4536213468869123, "grad_norm": 2.5440159135576557, "learning_rate": 1.9491012391227266e-05, "loss": 2.3986, "step": 357 }, { "epoch": 0.4548919949174079, "grad_norm": 1.934010697927457, "learning_rate": 1.948782378793443e-05, "loss": 2.2534, "step": 358 }, { "epoch": 0.45616264294790343, "grad_norm": 1.5206954937016697, "learning_rate": 1.9484625490612957e-05, "loss": 2.1512, "step": 359 }, { "epoch": 0.45743329097839897, "grad_norm": 1.2772650225420106, "learning_rate": 1.9481417502530654e-05, "loss": 2.0985, "step": 360 }, { "epoch": 0.45870393900889456, "grad_norm": 1.7400209934793354, "learning_rate": 1.9478199826965232e-05, "loss": 2.1081, "step": 361 }, { "epoch": 0.4599745870393901, "grad_norm": 1.3556440359592625, "learning_rate": 1.9474972467204298e-05, "loss": 1.8941, "step": 362 }, { "epoch": 0.46124523506988563, "grad_norm": 1.3453307109148631, "learning_rate": 1.9471735426545356e-05, "loss": 2.1768, "step": 363 }, { "epoch": 0.4625158831003812, "grad_norm": 1.1858948028790681, "learning_rate": 1.9468488708295793e-05, "loss": 2.2965, "step": 364 }, { "epoch": 0.46378653113087676, "grad_norm": 1.4003361314968354, "learning_rate": 1.9465232315772896e-05, "loss": 2.0454, "step": 365 }, { "epoch": 0.4650571791613723, "grad_norm": 1.720112687089305, "learning_rate": 1.9461966252303825e-05, "loss": 2.0443, "step": 366 }, { "epoch": 0.4663278271918678, "grad_norm": 1.3109306255966038, "learning_rate": 1.9458690521225634e-05, "loss": 2.2423, "step": 367 }, { "epoch": 0.4675984752223634, "grad_norm": 1.2095315060799088, "learning_rate": 1.9455405125885244e-05, "loss": 1.9604, "step": 368 }, { "epoch": 0.46886912325285895, "grad_norm": 1.3841358768722092, "learning_rate": 1.945211006963945e-05, "loss": 2.3195, "step": 369 }, { "epoch": 0.4701397712833545, "grad_norm": 1.5744617183370688, "learning_rate": 1.9448805355854932e-05, "loss": 2.2957, "step": 370 }, { "epoch": 0.4714104193138501, "grad_norm": 1.6922764749008226, "learning_rate": 1.944549098790822e-05, "loss": 2.4135, "step": 371 }, { "epoch": 0.4726810673443456, "grad_norm": 4.102432396640123, "learning_rate": 1.9442166969185715e-05, "loss": 2.2834, "step": 372 }, { "epoch": 0.47395171537484115, "grad_norm": 1.4905796877370419, "learning_rate": 1.9438833303083677e-05, "loss": 2.1638, "step": 373 }, { "epoch": 0.47522236340533675, "grad_norm": 1.450353779580371, "learning_rate": 1.943548999300823e-05, "loss": 2.3498, "step": 374 }, { "epoch": 0.4764930114358323, "grad_norm": 3.7843721284254297, "learning_rate": 1.9432137042375345e-05, "loss": 2.0612, "step": 375 }, { "epoch": 0.4777636594663278, "grad_norm": 2.7285211777033096, "learning_rate": 1.9428774454610845e-05, "loss": 2.2545, "step": 376 }, { "epoch": 0.47903430749682335, "grad_norm": 1.4297449124991255, "learning_rate": 1.9425402233150394e-05, "loss": 2.0982, "step": 377 }, { "epoch": 0.48030495552731894, "grad_norm": 1.3657540433108728, "learning_rate": 1.942202038143951e-05, "loss": 1.9351, "step": 378 }, { "epoch": 0.4815756035578145, "grad_norm": 1.8325177673920885, "learning_rate": 1.941862890293354e-05, "loss": 2.245, "step": 379 }, { "epoch": 0.48284625158831, "grad_norm": 1.4657268997197, "learning_rate": 1.9415227801097677e-05, "loss": 2.0714, "step": 380 }, { "epoch": 0.4841168996188056, "grad_norm": 1.427569121813906, "learning_rate": 1.9411817079406936e-05, "loss": 1.9048, "step": 381 }, { "epoch": 0.48538754764930114, "grad_norm": 1.5640321858304782, "learning_rate": 1.9408396741346167e-05, "loss": 2.3833, "step": 382 }, { "epoch": 0.4866581956797967, "grad_norm": 1.2857820843328875, "learning_rate": 1.9404966790410047e-05, "loss": 2.2031, "step": 383 }, { "epoch": 0.48792884371029227, "grad_norm": 1.141659221043011, "learning_rate": 1.940152723010307e-05, "loss": 2.247, "step": 384 }, { "epoch": 0.4891994917407878, "grad_norm": 1.5323549274637351, "learning_rate": 1.9398078063939552e-05, "loss": 2.1866, "step": 385 }, { "epoch": 0.49047013977128334, "grad_norm": 1.3996052498405696, "learning_rate": 1.9394619295443622e-05, "loss": 2.3547, "step": 386 }, { "epoch": 0.49174078780177893, "grad_norm": 1.19341579288055, "learning_rate": 1.9391150928149218e-05, "loss": 2.164, "step": 387 }, { "epoch": 0.49301143583227447, "grad_norm": 1.5852734903111279, "learning_rate": 1.9387672965600088e-05, "loss": 2.1857, "step": 388 }, { "epoch": 0.49428208386277, "grad_norm": 1.3587470658011351, "learning_rate": 1.9384185411349786e-05, "loss": 2.1427, "step": 389 }, { "epoch": 0.49555273189326554, "grad_norm": 1.3204033096459802, "learning_rate": 1.938068826896166e-05, "loss": 2.2107, "step": 390 }, { "epoch": 0.49682337992376113, "grad_norm": 1.3063591306506408, "learning_rate": 1.937718154200886e-05, "loss": 2.3279, "step": 391 }, { "epoch": 0.49809402795425667, "grad_norm": 2.3102205889364438, "learning_rate": 1.9373665234074328e-05, "loss": 2.2933, "step": 392 }, { "epoch": 0.4993646759847522, "grad_norm": 1.2926523346468037, "learning_rate": 1.937013934875079e-05, "loss": 2.4271, "step": 393 }, { "epoch": 0.5006353240152478, "grad_norm": 1.7975874404286474, "learning_rate": 1.9366603889640765e-05, "loss": 2.1837, "step": 394 }, { "epoch": 0.5019059720457433, "grad_norm": 1.4749865365496588, "learning_rate": 1.9363058860356548e-05, "loss": 2.2177, "step": 395 }, { "epoch": 0.5031766200762389, "grad_norm": 1.1259560135112834, "learning_rate": 1.9359504264520218e-05, "loss": 2.0392, "step": 396 }, { "epoch": 0.5044472681067345, "grad_norm": 1.3569865041821507, "learning_rate": 1.9355940105763622e-05, "loss": 2.4177, "step": 397 }, { "epoch": 0.5057179161372299, "grad_norm": 1.2836527221719194, "learning_rate": 1.9352366387728385e-05, "loss": 2.0478, "step": 398 }, { "epoch": 0.5069885641677255, "grad_norm": 1.9600150344140488, "learning_rate": 1.934878311406589e-05, "loss": 2.2771, "step": 399 }, { "epoch": 0.5082592121982211, "grad_norm": 1.3253015736567784, "learning_rate": 1.9345190288437292e-05, "loss": 2.4092, "step": 400 }, { "epoch": 0.5095298602287166, "grad_norm": 1.2347627921797202, "learning_rate": 1.9341587914513496e-05, "loss": 2.1296, "step": 401 }, { "epoch": 0.5108005082592122, "grad_norm": 1.135211318424933, "learning_rate": 1.933797599597518e-05, "loss": 2.2406, "step": 402 }, { "epoch": 0.5120711562897078, "grad_norm": 1.1498013355042722, "learning_rate": 1.9334354536512746e-05, "loss": 2.1021, "step": 403 }, { "epoch": 0.5133418043202033, "grad_norm": 1.2455423054908155, "learning_rate": 1.9330723539826373e-05, "loss": 2.1064, "step": 404 }, { "epoch": 0.5146124523506989, "grad_norm": 1.2931710810726527, "learning_rate": 1.9327083009625974e-05, "loss": 2.107, "step": 405 }, { "epoch": 0.5158831003811944, "grad_norm": 1.282732123781329, "learning_rate": 1.9323432949631195e-05, "loss": 2.0613, "step": 406 }, { "epoch": 0.5171537484116899, "grad_norm": 1.2908466078690843, "learning_rate": 1.9319773363571424e-05, "loss": 2.0705, "step": 407 }, { "epoch": 0.5184243964421855, "grad_norm": 1.3060341367429515, "learning_rate": 1.931610425518579e-05, "loss": 2.0751, "step": 408 }, { "epoch": 0.5196950444726811, "grad_norm": 1.59682319598207, "learning_rate": 1.9312425628223134e-05, "loss": 2.3488, "step": 409 }, { "epoch": 0.5209656925031766, "grad_norm": 1.246031551721595, "learning_rate": 1.9308737486442045e-05, "loss": 2.2637, "step": 410 }, { "epoch": 0.5222363405336722, "grad_norm": 1.0094782030605576, "learning_rate": 1.930503983361081e-05, "loss": 1.876, "step": 411 }, { "epoch": 0.5235069885641678, "grad_norm": 2.221169423253986, "learning_rate": 1.930133267350746e-05, "loss": 2.3233, "step": 412 }, { "epoch": 0.5247776365946633, "grad_norm": 2.190877335011136, "learning_rate": 1.9297616009919708e-05, "loss": 2.2177, "step": 413 }, { "epoch": 0.5260482846251588, "grad_norm": 1.3699043310370755, "learning_rate": 1.9293889846645008e-05, "loss": 2.1351, "step": 414 }, { "epoch": 0.5273189326556544, "grad_norm": 1.4624860775913118, "learning_rate": 1.9290154187490497e-05, "loss": 2.411, "step": 415 }, { "epoch": 0.5285895806861499, "grad_norm": 1.6401976862702454, "learning_rate": 1.9286409036273027e-05, "loss": 2.1643, "step": 416 }, { "epoch": 0.5298602287166455, "grad_norm": 1.5842195123976015, "learning_rate": 1.9282654396819145e-05, "loss": 2.67, "step": 417 }, { "epoch": 0.531130876747141, "grad_norm": 1.6335296017089804, "learning_rate": 1.9278890272965097e-05, "loss": 2.2863, "step": 418 }, { "epoch": 0.5324015247776366, "grad_norm": 1.6531683834499145, "learning_rate": 1.9275116668556805e-05, "loss": 2.1697, "step": 419 }, { "epoch": 0.5336721728081322, "grad_norm": 1.2631900846528858, "learning_rate": 1.9271333587449895e-05, "loss": 2.2538, "step": 420 }, { "epoch": 0.5349428208386277, "grad_norm": 1.6490841748768568, "learning_rate": 1.9267541033509667e-05, "loss": 2.1101, "step": 421 }, { "epoch": 0.5362134688691232, "grad_norm": 2.320810299489566, "learning_rate": 1.92637390106111e-05, "loss": 2.2869, "step": 422 }, { "epoch": 0.5374841168996188, "grad_norm": 1.286261486771375, "learning_rate": 1.925992752263885e-05, "loss": 2.2688, "step": 423 }, { "epoch": 0.5387547649301143, "grad_norm": 1.5581878124813053, "learning_rate": 1.9256106573487238e-05, "loss": 2.2076, "step": 424 }, { "epoch": 0.5400254129606099, "grad_norm": 1.7249893993080114, "learning_rate": 1.925227616706026e-05, "loss": 2.2496, "step": 425 }, { "epoch": 0.5412960609911055, "grad_norm": 1.6256150858689495, "learning_rate": 1.924843630727157e-05, "loss": 2.1455, "step": 426 }, { "epoch": 0.542566709021601, "grad_norm": 1.1965038217009443, "learning_rate": 1.9244586998044485e-05, "loss": 2.0354, "step": 427 }, { "epoch": 0.5438373570520966, "grad_norm": 1.4418630674402975, "learning_rate": 1.924072824331197e-05, "loss": 2.1294, "step": 428 }, { "epoch": 0.5451080050825922, "grad_norm": 1.2574351344869776, "learning_rate": 1.9236860047016647e-05, "loss": 2.1624, "step": 429 }, { "epoch": 0.5463786531130876, "grad_norm": 1.3978680736362221, "learning_rate": 1.923298241311078e-05, "loss": 2.2992, "step": 430 }, { "epoch": 0.5476493011435832, "grad_norm": 1.2755866211804594, "learning_rate": 1.9229095345556278e-05, "loss": 2.2508, "step": 431 }, { "epoch": 0.5489199491740788, "grad_norm": 1.314894688674993, "learning_rate": 1.9225198848324687e-05, "loss": 2.0979, "step": 432 }, { "epoch": 0.5501905972045743, "grad_norm": 1.5314349817624966, "learning_rate": 1.9221292925397196e-05, "loss": 1.8943, "step": 433 }, { "epoch": 0.5514612452350699, "grad_norm": 1.1921961065015845, "learning_rate": 1.921737758076461e-05, "loss": 2.1535, "step": 434 }, { "epoch": 0.5527318932655655, "grad_norm": 1.186044614997674, "learning_rate": 1.9213452818427374e-05, "loss": 2.1741, "step": 435 }, { "epoch": 0.554002541296061, "grad_norm": 1.5267477608246511, "learning_rate": 1.920951864239555e-05, "loss": 2.2164, "step": 436 }, { "epoch": 0.5552731893265566, "grad_norm": 1.3936059207900588, "learning_rate": 1.920557505668881e-05, "loss": 2.1367, "step": 437 }, { "epoch": 0.5565438373570522, "grad_norm": 1.3363717534452426, "learning_rate": 1.9201622065336455e-05, "loss": 2.0526, "step": 438 }, { "epoch": 0.5578144853875476, "grad_norm": 1.373993468638361, "learning_rate": 1.9197659672377388e-05, "loss": 2.1925, "step": 439 }, { "epoch": 0.5590851334180432, "grad_norm": 1.1534456508897548, "learning_rate": 1.919368788186012e-05, "loss": 2.2284, "step": 440 }, { "epoch": 0.5603557814485387, "grad_norm": 1.2512328467790448, "learning_rate": 1.918970669784276e-05, "loss": 2.2891, "step": 441 }, { "epoch": 0.5616264294790343, "grad_norm": 1.3611375165189201, "learning_rate": 1.918571612439302e-05, "loss": 2.2749, "step": 442 }, { "epoch": 0.5628970775095299, "grad_norm": 1.165275113048963, "learning_rate": 1.9181716165588206e-05, "loss": 2.0572, "step": 443 }, { "epoch": 0.5641677255400254, "grad_norm": 1.3439653647129153, "learning_rate": 1.9177706825515204e-05, "loss": 2.0461, "step": 444 }, { "epoch": 0.565438373570521, "grad_norm": 1.156674833687087, "learning_rate": 1.9173688108270495e-05, "loss": 2.1879, "step": 445 }, { "epoch": 0.5667090216010165, "grad_norm": 1.1998871634588988, "learning_rate": 1.9169660017960135e-05, "loss": 2.1873, "step": 446 }, { "epoch": 0.567979669631512, "grad_norm": 1.1882908352021604, "learning_rate": 1.9165622558699763e-05, "loss": 2.3475, "step": 447 }, { "epoch": 0.5692503176620076, "grad_norm": 1.27008784722015, "learning_rate": 1.9161575734614587e-05, "loss": 2.4298, "step": 448 }, { "epoch": 0.5705209656925032, "grad_norm": 1.119449905986438, "learning_rate": 1.915751954983938e-05, "loss": 2.1871, "step": 449 }, { "epoch": 0.5717916137229987, "grad_norm": 1.3424406440424546, "learning_rate": 1.915345400851848e-05, "loss": 2.4036, "step": 450 }, { "epoch": 0.5730622617534943, "grad_norm": 1.4009748452912008, "learning_rate": 1.9149379114805798e-05, "loss": 2.5499, "step": 451 }, { "epoch": 0.5743329097839899, "grad_norm": 1.2584770296153747, "learning_rate": 1.914529487286478e-05, "loss": 2.192, "step": 452 }, { "epoch": 0.5756035578144854, "grad_norm": 1.447563760715695, "learning_rate": 1.9141201286868435e-05, "loss": 2.2728, "step": 453 }, { "epoch": 0.576874205844981, "grad_norm": 1.1271867759241965, "learning_rate": 1.913709836099932e-05, "loss": 2.2126, "step": 454 }, { "epoch": 0.5781448538754765, "grad_norm": 1.2495009606349796, "learning_rate": 1.9132986099449535e-05, "loss": 2.1981, "step": 455 }, { "epoch": 0.579415501905972, "grad_norm": 1.2699559949583559, "learning_rate": 1.912886450642071e-05, "loss": 2.2818, "step": 456 }, { "epoch": 0.5806861499364676, "grad_norm": 1.1079978001775894, "learning_rate": 1.9124733586124015e-05, "loss": 2.0022, "step": 457 }, { "epoch": 0.5819567979669632, "grad_norm": 1.2644461116241026, "learning_rate": 1.9120593342780158e-05, "loss": 2.4457, "step": 458 }, { "epoch": 0.5832274459974587, "grad_norm": 1.148860883716844, "learning_rate": 1.9116443780619357e-05, "loss": 2.0796, "step": 459 }, { "epoch": 0.5844980940279543, "grad_norm": 1.1412514808049359, "learning_rate": 1.911228490388136e-05, "loss": 2.3325, "step": 460 }, { "epoch": 0.5857687420584498, "grad_norm": 1.1387216820020802, "learning_rate": 1.9108116716815433e-05, "loss": 2.3263, "step": 461 }, { "epoch": 0.5870393900889453, "grad_norm": 1.2897407598165023, "learning_rate": 1.9103939223680353e-05, "loss": 2.4233, "step": 462 }, { "epoch": 0.5883100381194409, "grad_norm": 1.2087089474395942, "learning_rate": 1.9099752428744407e-05, "loss": 2.1524, "step": 463 }, { "epoch": 0.5895806861499364, "grad_norm": 1.2164503356307568, "learning_rate": 1.9095556336285382e-05, "loss": 2.0391, "step": 464 }, { "epoch": 0.590851334180432, "grad_norm": 1.196760865478763, "learning_rate": 1.9091350950590563e-05, "loss": 2.2708, "step": 465 }, { "epoch": 0.5921219822109276, "grad_norm": 1.2313846896394445, "learning_rate": 1.9087136275956745e-05, "loss": 2.2378, "step": 466 }, { "epoch": 0.5933926302414231, "grad_norm": 1.2373291069034087, "learning_rate": 1.908291231669019e-05, "loss": 2.3246, "step": 467 }, { "epoch": 0.5946632782719187, "grad_norm": 1.1829656057092561, "learning_rate": 1.9078679077106666e-05, "loss": 2.1872, "step": 468 }, { "epoch": 0.5959339263024143, "grad_norm": 1.4347451249915018, "learning_rate": 1.907443656153142e-05, "loss": 2.6066, "step": 469 }, { "epoch": 0.5972045743329097, "grad_norm": 1.0984143438577232, "learning_rate": 1.9070184774299162e-05, "loss": 2.0474, "step": 470 }, { "epoch": 0.5984752223634053, "grad_norm": 1.1850906238628265, "learning_rate": 1.9065923719754097e-05, "loss": 1.9647, "step": 471 }, { "epoch": 0.5997458703939009, "grad_norm": 1.3926199263292411, "learning_rate": 1.906165340224988e-05, "loss": 2.4496, "step": 472 }, { "epoch": 0.6010165184243964, "grad_norm": 1.2275438489433488, "learning_rate": 1.9057373826149642e-05, "loss": 2.1911, "step": 473 }, { "epoch": 0.602287166454892, "grad_norm": 1.33064305688056, "learning_rate": 1.905308499582597e-05, "loss": 2.3691, "step": 474 }, { "epoch": 0.6035578144853876, "grad_norm": 1.0882172714073037, "learning_rate": 1.9048786915660903e-05, "loss": 2.1702, "step": 475 }, { "epoch": 0.6048284625158831, "grad_norm": 1.1379935541068376, "learning_rate": 1.9044479590045936e-05, "loss": 1.9241, "step": 476 }, { "epoch": 0.6060991105463787, "grad_norm": 1.2631080747027876, "learning_rate": 1.904016302338201e-05, "loss": 2.3022, "step": 477 }, { "epoch": 0.6073697585768743, "grad_norm": 1.3033474316729674, "learning_rate": 1.90358372200795e-05, "loss": 2.1681, "step": 478 }, { "epoch": 0.6086404066073697, "grad_norm": 1.3374719992920634, "learning_rate": 1.9031502184558235e-05, "loss": 2.3015, "step": 479 }, { "epoch": 0.6099110546378653, "grad_norm": 1.25539896962734, "learning_rate": 1.902715792124746e-05, "loss": 2.2975, "step": 480 }, { "epoch": 0.6111817026683609, "grad_norm": 1.1954446631182531, "learning_rate": 1.9022804434585854e-05, "loss": 2.0728, "step": 481 }, { "epoch": 0.6124523506988564, "grad_norm": 1.205447491528233, "learning_rate": 1.9018441729021525e-05, "loss": 2.2925, "step": 482 }, { "epoch": 0.613722998729352, "grad_norm": 1.141922162715847, "learning_rate": 1.901406980901199e-05, "loss": 2.1577, "step": 483 }, { "epoch": 0.6149936467598475, "grad_norm": 1.0941168595388158, "learning_rate": 1.900968867902419e-05, "loss": 2.301, "step": 484 }, { "epoch": 0.6162642947903431, "grad_norm": 1.1515497727617856, "learning_rate": 1.900529834353448e-05, "loss": 2.1877, "step": 485 }, { "epoch": 0.6175349428208387, "grad_norm": 1.080151589093454, "learning_rate": 1.9000898807028602e-05, "loss": 2.128, "step": 486 }, { "epoch": 0.6188055908513341, "grad_norm": 0.9583578019928071, "learning_rate": 1.8996490074001714e-05, "loss": 1.8186, "step": 487 }, { "epoch": 0.6200762388818297, "grad_norm": 1.209860122515118, "learning_rate": 1.8992072148958368e-05, "loss": 2.322, "step": 488 }, { "epoch": 0.6213468869123253, "grad_norm": 1.0923359800130836, "learning_rate": 1.898764503641251e-05, "loss": 2.2531, "step": 489 }, { "epoch": 0.6226175349428208, "grad_norm": 1.31837882949895, "learning_rate": 1.8983208740887464e-05, "loss": 2.3352, "step": 490 }, { "epoch": 0.6238881829733164, "grad_norm": 1.2597603066413212, "learning_rate": 1.8978763266915942e-05, "loss": 2.5245, "step": 491 }, { "epoch": 0.625158831003812, "grad_norm": 1.3715703169530913, "learning_rate": 1.897430861904004e-05, "loss": 2.1769, "step": 492 }, { "epoch": 0.6264294790343075, "grad_norm": 1.5225437610970938, "learning_rate": 1.8969844801811216e-05, "loss": 2.141, "step": 493 }, { "epoch": 0.627700127064803, "grad_norm": 1.3735055288768747, "learning_rate": 1.8965371819790305e-05, "loss": 2.0583, "step": 494 }, { "epoch": 0.6289707750952986, "grad_norm": 1.3329381523109856, "learning_rate": 1.8960889677547506e-05, "loss": 2.0973, "step": 495 }, { "epoch": 0.6302414231257941, "grad_norm": 1.421090762882285, "learning_rate": 1.8956398379662368e-05, "loss": 2.5422, "step": 496 }, { "epoch": 0.6315120711562897, "grad_norm": 1.6552633728958857, "learning_rate": 1.8951897930723806e-05, "loss": 2.4162, "step": 497 }, { "epoch": 0.6327827191867853, "grad_norm": 1.4506798439934194, "learning_rate": 1.8947388335330076e-05, "loss": 2.4185, "step": 498 }, { "epoch": 0.6340533672172808, "grad_norm": 1.4020055076985136, "learning_rate": 1.8942869598088785e-05, "loss": 2.1794, "step": 499 }, { "epoch": 0.6353240152477764, "grad_norm": 1.1768335660084828, "learning_rate": 1.8938341723616883e-05, "loss": 2.2581, "step": 500 }, { "epoch": 0.636594663278272, "grad_norm": 1.2174192479655537, "learning_rate": 1.8933804716540646e-05, "loss": 2.446, "step": 501 }, { "epoch": 0.6378653113087674, "grad_norm": 1.0553638547069482, "learning_rate": 1.8929258581495688e-05, "loss": 2.105, "step": 502 }, { "epoch": 0.639135959339263, "grad_norm": 1.1267989554506508, "learning_rate": 1.892470332312695e-05, "loss": 2.1001, "step": 503 }, { "epoch": 0.6404066073697586, "grad_norm": 1.3249636163461571, "learning_rate": 1.892013894608869e-05, "loss": 2.2481, "step": 504 }, { "epoch": 0.6416772554002541, "grad_norm": 1.294216594755417, "learning_rate": 1.8915565455044483e-05, "loss": 2.2538, "step": 505 }, { "epoch": 0.6429479034307497, "grad_norm": 1.1232090441804183, "learning_rate": 1.8910982854667228e-05, "loss": 2.0829, "step": 506 }, { "epoch": 0.6442185514612452, "grad_norm": 1.95930991197237, "learning_rate": 1.8906391149639115e-05, "loss": 2.5869, "step": 507 }, { "epoch": 0.6454891994917408, "grad_norm": 1.3453985330330351, "learning_rate": 1.8901790344651643e-05, "loss": 1.9773, "step": 508 }, { "epoch": 0.6467598475222364, "grad_norm": 1.2296929125860547, "learning_rate": 1.8897180444405615e-05, "loss": 2.2737, "step": 509 }, { "epoch": 0.6480304955527318, "grad_norm": 1.51817047299658, "learning_rate": 1.8892561453611113e-05, "loss": 2.393, "step": 510 }, { "epoch": 0.6493011435832274, "grad_norm": 1.3781830080412667, "learning_rate": 1.8887933376987524e-05, "loss": 2.3916, "step": 511 }, { "epoch": 0.650571791613723, "grad_norm": 1.6740810365239487, "learning_rate": 1.8883296219263503e-05, "loss": 2.5292, "step": 512 }, { "epoch": 0.6518424396442185, "grad_norm": 1.201702350296437, "learning_rate": 1.887864998517699e-05, "loss": 1.9545, "step": 513 }, { "epoch": 0.6531130876747141, "grad_norm": 1.0800862791397763, "learning_rate": 1.88739946794752e-05, "loss": 2.0514, "step": 514 }, { "epoch": 0.6543837357052097, "grad_norm": 1.1405856469174804, "learning_rate": 1.886933030691462e-05, "loss": 2.1928, "step": 515 }, { "epoch": 0.6556543837357052, "grad_norm": 1.2141463875986063, "learning_rate": 1.8864656872260985e-05, "loss": 2.3328, "step": 516 }, { "epoch": 0.6569250317662008, "grad_norm": 1.2320242281546439, "learning_rate": 1.8859974380289317e-05, "loss": 2.2226, "step": 517 }, { "epoch": 0.6581956797966964, "grad_norm": 1.1049627459213187, "learning_rate": 1.8855282835783858e-05, "loss": 2.192, "step": 518 }, { "epoch": 0.6594663278271918, "grad_norm": 1.242887571869999, "learning_rate": 1.885058224353813e-05, "loss": 2.1095, "step": 519 }, { "epoch": 0.6607369758576874, "grad_norm": 1.0690686043099307, "learning_rate": 1.8845872608354877e-05, "loss": 2.0908, "step": 520 }, { "epoch": 0.662007623888183, "grad_norm": 1.3192664166276828, "learning_rate": 1.8841153935046098e-05, "loss": 1.9455, "step": 521 }, { "epoch": 0.6632782719186785, "grad_norm": 1.5454386592048066, "learning_rate": 1.883642622843302e-05, "loss": 2.4256, "step": 522 }, { "epoch": 0.6645489199491741, "grad_norm": 1.1200275979051053, "learning_rate": 1.8831689493346095e-05, "loss": 1.7918, "step": 523 }, { "epoch": 0.6658195679796697, "grad_norm": 1.2792896107464529, "learning_rate": 1.8826943734625006e-05, "loss": 2.2147, "step": 524 }, { "epoch": 0.6670902160101652, "grad_norm": 1.252552573310735, "learning_rate": 1.8822188957118656e-05, "loss": 2.4019, "step": 525 }, { "epoch": 0.6683608640406608, "grad_norm": 1.1245940180547487, "learning_rate": 1.8817425165685166e-05, "loss": 2.1881, "step": 526 }, { "epoch": 0.6696315120711563, "grad_norm": 1.0708468149170747, "learning_rate": 1.8812652365191854e-05, "loss": 2.0764, "step": 527 }, { "epoch": 0.6709021601016518, "grad_norm": 1.1777540466866772, "learning_rate": 1.880787056051525e-05, "loss": 2.2589, "step": 528 }, { "epoch": 0.6721728081321474, "grad_norm": 1.1832499421529257, "learning_rate": 1.880307975654109e-05, "loss": 2.4334, "step": 529 }, { "epoch": 0.6734434561626429, "grad_norm": 1.3102714120785524, "learning_rate": 1.8798279958164295e-05, "loss": 2.0436, "step": 530 }, { "epoch": 0.6747141041931385, "grad_norm": 1.222035984008157, "learning_rate": 1.8793471170288984e-05, "loss": 2.1924, "step": 531 }, { "epoch": 0.6759847522236341, "grad_norm": 1.1732666538562266, "learning_rate": 1.8788653397828458e-05, "loss": 2.4146, "step": 532 }, { "epoch": 0.6772554002541296, "grad_norm": 1.1409816836583633, "learning_rate": 1.8783826645705195e-05, "loss": 2.2124, "step": 533 }, { "epoch": 0.6785260482846251, "grad_norm": 1.3231706367571725, "learning_rate": 1.8778990918850852e-05, "loss": 2.0627, "step": 534 }, { "epoch": 0.6797966963151207, "grad_norm": 1.2526020473537403, "learning_rate": 1.877414622220625e-05, "loss": 2.2245, "step": 535 }, { "epoch": 0.6810673443456162, "grad_norm": 1.1845130281014962, "learning_rate": 1.876929256072138e-05, "loss": 2.2451, "step": 536 }, { "epoch": 0.6823379923761118, "grad_norm": 1.3625735833969421, "learning_rate": 1.8764429939355394e-05, "loss": 2.4728, "step": 537 }, { "epoch": 0.6836086404066074, "grad_norm": 1.1553595122128892, "learning_rate": 1.8759558363076588e-05, "loss": 2.1704, "step": 538 }, { "epoch": 0.6848792884371029, "grad_norm": 1.2851221382973055, "learning_rate": 1.875467783686243e-05, "loss": 2.431, "step": 539 }, { "epoch": 0.6861499364675985, "grad_norm": 1.2639041760774359, "learning_rate": 1.87497883656995e-05, "loss": 2.2614, "step": 540 }, { "epoch": 0.6874205844980941, "grad_norm": 1.1565308038920128, "learning_rate": 1.8744889954583544e-05, "loss": 2.0451, "step": 541 }, { "epoch": 0.6886912325285895, "grad_norm": 1.1481601695101114, "learning_rate": 1.8739982608519438e-05, "loss": 2.0516, "step": 542 }, { "epoch": 0.6899618805590851, "grad_norm": 1.168831243257151, "learning_rate": 1.8735066332521174e-05, "loss": 2.3054, "step": 543 }, { "epoch": 0.6912325285895807, "grad_norm": 1.0979119040449332, "learning_rate": 1.8730141131611882e-05, "loss": 2.1107, "step": 544 }, { "epoch": 0.6925031766200762, "grad_norm": 1.474991738773486, "learning_rate": 1.8725207010823804e-05, "loss": 2.2627, "step": 545 }, { "epoch": 0.6937738246505718, "grad_norm": 1.139596979539453, "learning_rate": 1.8720263975198295e-05, "loss": 1.9434, "step": 546 }, { "epoch": 0.6950444726810674, "grad_norm": 1.1481823274967125, "learning_rate": 1.8715312029785825e-05, "loss": 2.1494, "step": 547 }, { "epoch": 0.6963151207115629, "grad_norm": 1.1406321517448839, "learning_rate": 1.871035117964596e-05, "loss": 2.1246, "step": 548 }, { "epoch": 0.6975857687420585, "grad_norm": 1.256664969056528, "learning_rate": 1.8705381429847364e-05, "loss": 2.1875, "step": 549 }, { "epoch": 0.6988564167725541, "grad_norm": 1.146997870707755, "learning_rate": 1.8700402785467804e-05, "loss": 2.2522, "step": 550 }, { "epoch": 0.7001270648030495, "grad_norm": 1.7921394223729616, "learning_rate": 1.8695415251594123e-05, "loss": 2.421, "step": 551 }, { "epoch": 0.7013977128335451, "grad_norm": 1.127645304716579, "learning_rate": 1.869041883332226e-05, "loss": 2.0452, "step": 552 }, { "epoch": 0.7026683608640406, "grad_norm": 1.2712912170354675, "learning_rate": 1.8685413535757217e-05, "loss": 2.3269, "step": 553 }, { "epoch": 0.7039390088945362, "grad_norm": 1.1225547807371499, "learning_rate": 1.8680399364013075e-05, "loss": 1.995, "step": 554 }, { "epoch": 0.7052096569250318, "grad_norm": 1.1182511171971892, "learning_rate": 1.8675376323212985e-05, "loss": 2.313, "step": 555 }, { "epoch": 0.7064803049555273, "grad_norm": 1.3258764971873436, "learning_rate": 1.8670344418489154e-05, "loss": 2.0656, "step": 556 }, { "epoch": 0.7077509529860229, "grad_norm": 1.0748166146008136, "learning_rate": 1.866530365498285e-05, "loss": 1.9555, "step": 557 }, { "epoch": 0.7090216010165185, "grad_norm": 1.3514593399056483, "learning_rate": 1.866025403784439e-05, "loss": 2.1068, "step": 558 }, { "epoch": 0.7102922490470139, "grad_norm": 1.079548325327528, "learning_rate": 1.8655195572233135e-05, "loss": 1.9282, "step": 559 }, { "epoch": 0.7115628970775095, "grad_norm": 1.1487053535243719, "learning_rate": 1.8650128263317494e-05, "loss": 1.879, "step": 560 }, { "epoch": 0.7128335451080051, "grad_norm": 1.3507734064247627, "learning_rate": 1.8645052116274904e-05, "loss": 2.2114, "step": 561 }, { "epoch": 0.7141041931385006, "grad_norm": 1.122492897069288, "learning_rate": 1.8639967136291837e-05, "loss": 2.3406, "step": 562 }, { "epoch": 0.7153748411689962, "grad_norm": 1.929970759366875, "learning_rate": 1.863487332856378e-05, "loss": 1.9846, "step": 563 }, { "epoch": 0.7166454891994918, "grad_norm": 1.0536788314464651, "learning_rate": 1.8629770698295267e-05, "loss": 1.899, "step": 564 }, { "epoch": 0.7179161372299873, "grad_norm": 1.1944128852816356, "learning_rate": 1.8624659250699807e-05, "loss": 2.1493, "step": 565 }, { "epoch": 0.7191867852604829, "grad_norm": 1.1971873826364454, "learning_rate": 1.8619538990999947e-05, "loss": 2.1729, "step": 566 }, { "epoch": 0.7204574332909784, "grad_norm": 1.206679262476945, "learning_rate": 1.861440992442723e-05, "loss": 2.38, "step": 567 }, { "epoch": 0.7217280813214739, "grad_norm": 1.185571910306438, "learning_rate": 1.8609272056222186e-05, "loss": 2.2179, "step": 568 }, { "epoch": 0.7229987293519695, "grad_norm": 1.6058101563071772, "learning_rate": 1.860412539163436e-05, "loss": 2.3207, "step": 569 }, { "epoch": 0.7242693773824651, "grad_norm": 1.136888523638023, "learning_rate": 1.8598969935922263e-05, "loss": 1.989, "step": 570 }, { "epoch": 0.7255400254129606, "grad_norm": 1.0341755689373624, "learning_rate": 1.8593805694353407e-05, "loss": 1.9567, "step": 571 }, { "epoch": 0.7268106734434562, "grad_norm": 1.0685534450721437, "learning_rate": 1.8588632672204264e-05, "loss": 2.1098, "step": 572 }, { "epoch": 0.7280813214739518, "grad_norm": 1.1375771659832385, "learning_rate": 1.8583450874760282e-05, "loss": 2.1579, "step": 573 }, { "epoch": 0.7293519695044473, "grad_norm": 1.1501768702752024, "learning_rate": 1.8578260307315888e-05, "loss": 2.1482, "step": 574 }, { "epoch": 0.7306226175349428, "grad_norm": 1.2847569142855215, "learning_rate": 1.8573060975174447e-05, "loss": 1.9585, "step": 575 }, { "epoch": 0.7318932655654383, "grad_norm": 1.2972019979542169, "learning_rate": 1.8567852883648302e-05, "loss": 2.2788, "step": 576 }, { "epoch": 0.7331639135959339, "grad_norm": 1.1878480969568581, "learning_rate": 1.856263603805873e-05, "loss": 2.1769, "step": 577 }, { "epoch": 0.7344345616264295, "grad_norm": 1.0494532518122905, "learning_rate": 1.855741044373596e-05, "loss": 2.0674, "step": 578 }, { "epoch": 0.735705209656925, "grad_norm": 1.4204377502276189, "learning_rate": 1.8552176106019156e-05, "loss": 2.1398, "step": 579 }, { "epoch": 0.7369758576874206, "grad_norm": 1.1297999222036323, "learning_rate": 1.8546933030256417e-05, "loss": 2.1572, "step": 580 }, { "epoch": 0.7382465057179162, "grad_norm": 0.9938535341282585, "learning_rate": 1.854168122180477e-05, "loss": 2.0417, "step": 581 }, { "epoch": 0.7395171537484116, "grad_norm": 1.0939285871388238, "learning_rate": 1.853642068603016e-05, "loss": 2.193, "step": 582 }, { "epoch": 0.7407878017789072, "grad_norm": 1.3213960210210096, "learning_rate": 1.8531151428307464e-05, "loss": 2.2197, "step": 583 }, { "epoch": 0.7420584498094028, "grad_norm": 1.1372240170454513, "learning_rate": 1.8525873454020452e-05, "loss": 2.238, "step": 584 }, { "epoch": 0.7433290978398983, "grad_norm": 1.3759947059281574, "learning_rate": 1.8520586768561804e-05, "loss": 2.0981, "step": 585 }, { "epoch": 0.7445997458703939, "grad_norm": 1.3255967943604101, "learning_rate": 1.8515291377333114e-05, "loss": 2.3322, "step": 586 }, { "epoch": 0.7458703939008895, "grad_norm": 1.0698356148683228, "learning_rate": 1.8509987285744856e-05, "loss": 2.1271, "step": 587 }, { "epoch": 0.747141041931385, "grad_norm": 1.1523769456097608, "learning_rate": 1.85046744992164e-05, "loss": 2.2146, "step": 588 }, { "epoch": 0.7484116899618806, "grad_norm": 1.3299045617102017, "learning_rate": 1.8499353023176e-05, "loss": 2.4415, "step": 589 }, { "epoch": 0.7496823379923762, "grad_norm": 1.307200173595717, "learning_rate": 1.8494022863060782e-05, "loss": 2.2369, "step": 590 }, { "epoch": 0.7509529860228716, "grad_norm": 1.1629936680632729, "learning_rate": 1.848868402431675e-05, "loss": 2.2199, "step": 591 }, { "epoch": 0.7522236340533672, "grad_norm": 1.0365243593426372, "learning_rate": 1.8483336512398783e-05, "loss": 1.9964, "step": 592 }, { "epoch": 0.7534942820838628, "grad_norm": 1.1183367890272324, "learning_rate": 1.847798033277061e-05, "loss": 2.2742, "step": 593 }, { "epoch": 0.7547649301143583, "grad_norm": 1.5349286915016844, "learning_rate": 1.8472615490904817e-05, "loss": 2.2677, "step": 594 }, { "epoch": 0.7560355781448539, "grad_norm": 1.2144970266650321, "learning_rate": 1.8467241992282842e-05, "loss": 2.2199, "step": 595 }, { "epoch": 0.7573062261753494, "grad_norm": 1.1183355199544283, "learning_rate": 1.8461859842394976e-05, "loss": 2.1626, "step": 596 }, { "epoch": 0.758576874205845, "grad_norm": 1.4314400266582858, "learning_rate": 1.845646904674034e-05, "loss": 2.2842, "step": 597 }, { "epoch": 0.7598475222363406, "grad_norm": 1.1831971425721604, "learning_rate": 1.8451069610826885e-05, "loss": 2.0274, "step": 598 }, { "epoch": 0.761118170266836, "grad_norm": 1.2516564815200508, "learning_rate": 1.8445661540171408e-05, "loss": 2.151, "step": 599 }, { "epoch": 0.7623888182973316, "grad_norm": 1.0199374404736399, "learning_rate": 1.8440244840299507e-05, "loss": 1.7867, "step": 600 }, { "epoch": 0.7636594663278272, "grad_norm": 1.2355296914792018, "learning_rate": 1.843481951674561e-05, "loss": 2.2535, "step": 601 }, { "epoch": 0.7649301143583227, "grad_norm": 1.1498066544729326, "learning_rate": 1.8429385575052947e-05, "loss": 2.226, "step": 602 }, { "epoch": 0.7662007623888183, "grad_norm": 1.1212282930105197, "learning_rate": 1.842394302077357e-05, "loss": 2.0142, "step": 603 }, { "epoch": 0.7674714104193139, "grad_norm": 1.0961030851113989, "learning_rate": 1.841849185946831e-05, "loss": 2.0159, "step": 604 }, { "epoch": 0.7687420584498094, "grad_norm": 1.175556484686974, "learning_rate": 1.8413032096706808e-05, "loss": 2.2503, "step": 605 }, { "epoch": 0.770012706480305, "grad_norm": 1.095037415345117, "learning_rate": 1.8407563738067483e-05, "loss": 1.797, "step": 606 }, { "epoch": 0.7712833545108005, "grad_norm": 1.123170821307597, "learning_rate": 1.8402086789137547e-05, "loss": 2.2509, "step": 607 }, { "epoch": 0.772554002541296, "grad_norm": 1.1069902916432202, "learning_rate": 1.8396601255512973e-05, "loss": 2.1634, "step": 608 }, { "epoch": 0.7738246505717916, "grad_norm": 1.207271414909803, "learning_rate": 1.8391107142798523e-05, "loss": 2.3651, "step": 609 }, { "epoch": 0.7750952986022872, "grad_norm": 1.1822823085451364, "learning_rate": 1.8385604456607716e-05, "loss": 2.1507, "step": 610 }, { "epoch": 0.7763659466327827, "grad_norm": 1.2195635504431297, "learning_rate": 1.8380093202562828e-05, "loss": 2.2428, "step": 611 }, { "epoch": 0.7776365946632783, "grad_norm": 1.1167931838706007, "learning_rate": 1.8374573386294896e-05, "loss": 2.0545, "step": 612 }, { "epoch": 0.7789072426937739, "grad_norm": 1.0629250710234373, "learning_rate": 1.8369045013443697e-05, "loss": 2.235, "step": 613 }, { "epoch": 0.7801778907242694, "grad_norm": 1.1080069750410502, "learning_rate": 1.8363508089657763e-05, "loss": 2.1209, "step": 614 }, { "epoch": 0.7814485387547649, "grad_norm": 1.0830334587830448, "learning_rate": 1.835796262059435e-05, "loss": 2.2635, "step": 615 }, { "epoch": 0.7827191867852605, "grad_norm": 1.1121154546549203, "learning_rate": 1.8352408611919453e-05, "loss": 2.1994, "step": 616 }, { "epoch": 0.783989834815756, "grad_norm": 1.0460510814774757, "learning_rate": 1.8346846069307784e-05, "loss": 2.0689, "step": 617 }, { "epoch": 0.7852604828462516, "grad_norm": 1.2289832098681894, "learning_rate": 1.8341274998442786e-05, "loss": 2.0958, "step": 618 }, { "epoch": 0.7865311308767471, "grad_norm": 1.1016802793946234, "learning_rate": 1.8335695405016608e-05, "loss": 2.4416, "step": 619 }, { "epoch": 0.7878017789072427, "grad_norm": 1.0581802964347833, "learning_rate": 1.833010729473011e-05, "loss": 2.3486, "step": 620 }, { "epoch": 0.7890724269377383, "grad_norm": 1.102357635535828, "learning_rate": 1.8324510673292844e-05, "loss": 2.2709, "step": 621 }, { "epoch": 0.7903430749682337, "grad_norm": 1.0332914444862387, "learning_rate": 1.8318905546423074e-05, "loss": 2.0209, "step": 622 }, { "epoch": 0.7916137229987293, "grad_norm": 2.0154352316730932, "learning_rate": 1.8313291919847743e-05, "loss": 2.2009, "step": 623 }, { "epoch": 0.7928843710292249, "grad_norm": 1.1693281047264406, "learning_rate": 1.8307669799302488e-05, "loss": 1.9567, "step": 624 }, { "epoch": 0.7941550190597204, "grad_norm": 1.0641878819552213, "learning_rate": 1.830203919053161e-05, "loss": 1.9002, "step": 625 }, { "epoch": 0.795425667090216, "grad_norm": 1.1394831585966236, "learning_rate": 1.8296400099288097e-05, "loss": 2.1372, "step": 626 }, { "epoch": 0.7966963151207116, "grad_norm": 1.373148541319477, "learning_rate": 1.82907525313336e-05, "loss": 2.171, "step": 627 }, { "epoch": 0.7979669631512071, "grad_norm": 1.2881927858172024, "learning_rate": 1.8285096492438424e-05, "loss": 2.2768, "step": 628 }, { "epoch": 0.7992376111817027, "grad_norm": 1.314788727096053, "learning_rate": 1.8279431988381534e-05, "loss": 2.2702, "step": 629 }, { "epoch": 0.8005082592121983, "grad_norm": 1.1722514211564103, "learning_rate": 1.8273759024950547e-05, "loss": 1.9007, "step": 630 }, { "epoch": 0.8017789072426937, "grad_norm": 1.331031931387042, "learning_rate": 1.8268077607941722e-05, "loss": 2.2078, "step": 631 }, { "epoch": 0.8030495552731893, "grad_norm": 1.278464976980853, "learning_rate": 1.826238774315995e-05, "loss": 2.1024, "step": 632 }, { "epoch": 0.8043202033036849, "grad_norm": 1.2183423143125827, "learning_rate": 1.8256689436418758e-05, "loss": 1.7729, "step": 633 }, { "epoch": 0.8055908513341804, "grad_norm": 1.0565672193398135, "learning_rate": 1.82509826935403e-05, "loss": 2.1151, "step": 634 }, { "epoch": 0.806861499364676, "grad_norm": 1.1943999681129567, "learning_rate": 1.8245267520355348e-05, "loss": 2.1064, "step": 635 }, { "epoch": 0.8081321473951716, "grad_norm": 1.2429531929474038, "learning_rate": 1.823954392270328e-05, "loss": 2.2624, "step": 636 }, { "epoch": 0.8094027954256671, "grad_norm": 1.0129041059124908, "learning_rate": 1.8233811906432097e-05, "loss": 2.1492, "step": 637 }, { "epoch": 0.8106734434561627, "grad_norm": 1.4593002297051365, "learning_rate": 1.8228071477398384e-05, "loss": 2.045, "step": 638 }, { "epoch": 0.8119440914866582, "grad_norm": 1.1260803678002682, "learning_rate": 1.8222322641467335e-05, "loss": 2.1809, "step": 639 }, { "epoch": 0.8132147395171537, "grad_norm": 1.264065884036665, "learning_rate": 1.8216565404512732e-05, "loss": 2.2959, "step": 640 }, { "epoch": 0.8144853875476493, "grad_norm": 1.0952598714718136, "learning_rate": 1.8210799772416933e-05, "loss": 1.9048, "step": 641 }, { "epoch": 0.8157560355781448, "grad_norm": 1.270951012467556, "learning_rate": 1.8205025751070878e-05, "loss": 2.3176, "step": 642 }, { "epoch": 0.8170266836086404, "grad_norm": 1.1980202479132505, "learning_rate": 1.819924334637408e-05, "loss": 1.9996, "step": 643 }, { "epoch": 0.818297331639136, "grad_norm": 1.1630902255370354, "learning_rate": 1.8193452564234616e-05, "loss": 1.9429, "step": 644 }, { "epoch": 0.8195679796696315, "grad_norm": 1.198283450616259, "learning_rate": 1.8187653410569125e-05, "loss": 2.115, "step": 645 }, { "epoch": 0.820838627700127, "grad_norm": 1.2144890181469097, "learning_rate": 1.8181845891302798e-05, "loss": 2.2417, "step": 646 }, { "epoch": 0.8221092757306226, "grad_norm": 1.4017077851347404, "learning_rate": 1.8176030012369367e-05, "loss": 2.5008, "step": 647 }, { "epoch": 0.8233799237611181, "grad_norm": 1.1659526738815076, "learning_rate": 1.817020577971112e-05, "loss": 2.1242, "step": 648 }, { "epoch": 0.8246505717916137, "grad_norm": 1.278036339848494, "learning_rate": 1.8164373199278858e-05, "loss": 2.2445, "step": 649 }, { "epoch": 0.8259212198221093, "grad_norm": 1.2190826973272975, "learning_rate": 1.8158532277031937e-05, "loss": 2.2701, "step": 650 }, { "epoch": 0.8271918678526048, "grad_norm": 1.2816423646347477, "learning_rate": 1.815268301893822e-05, "loss": 2.1431, "step": 651 }, { "epoch": 0.8284625158831004, "grad_norm": 1.1717979527697433, "learning_rate": 1.814682543097409e-05, "loss": 2.2215, "step": 652 }, { "epoch": 0.829733163913596, "grad_norm": 1.3144642425327488, "learning_rate": 1.8140959519124436e-05, "loss": 2.3288, "step": 653 }, { "epoch": 0.8310038119440915, "grad_norm": 1.3305027804563203, "learning_rate": 1.813508528938267e-05, "loss": 2.2093, "step": 654 }, { "epoch": 0.832274459974587, "grad_norm": 1.1980383998904782, "learning_rate": 1.8129202747750682e-05, "loss": 2.1226, "step": 655 }, { "epoch": 0.8335451080050826, "grad_norm": 1.1783895710806225, "learning_rate": 1.812331190023886e-05, "loss": 2.2583, "step": 656 }, { "epoch": 0.8348157560355781, "grad_norm": 1.4976785081599993, "learning_rate": 1.811741275286609e-05, "loss": 2.091, "step": 657 }, { "epoch": 0.8360864040660737, "grad_norm": 1.1357480648220457, "learning_rate": 1.811150531165972e-05, "loss": 1.9162, "step": 658 }, { "epoch": 0.8373570520965693, "grad_norm": 1.2253645225834875, "learning_rate": 1.8105589582655585e-05, "loss": 2.1036, "step": 659 }, { "epoch": 0.8386277001270648, "grad_norm": 1.08960201806807, "learning_rate": 1.8099665571897987e-05, "loss": 2.2, "step": 660 }, { "epoch": 0.8398983481575604, "grad_norm": 1.2544735038806298, "learning_rate": 1.809373328543968e-05, "loss": 2.4893, "step": 661 }, { "epoch": 0.841168996188056, "grad_norm": 1.0809294146988426, "learning_rate": 1.808779272934189e-05, "loss": 2.153, "step": 662 }, { "epoch": 0.8424396442185514, "grad_norm": 1.1511962715890744, "learning_rate": 1.8081843909674277e-05, "loss": 1.9579, "step": 663 }, { "epoch": 0.843710292249047, "grad_norm": 1.2452670658097118, "learning_rate": 1.807588683251495e-05, "loss": 2.2061, "step": 664 }, { "epoch": 0.8449809402795425, "grad_norm": 1.248947570313016, "learning_rate": 1.8069921503950457e-05, "loss": 2.1359, "step": 665 }, { "epoch": 0.8462515883100381, "grad_norm": 1.2289662739094933, "learning_rate": 1.8063947930075776e-05, "loss": 2.2559, "step": 666 }, { "epoch": 0.8475222363405337, "grad_norm": 1.1868517146260327, "learning_rate": 1.8057966116994304e-05, "loss": 2.2514, "step": 667 }, { "epoch": 0.8487928843710292, "grad_norm": 1.0039146134359633, "learning_rate": 1.8051976070817864e-05, "loss": 1.6718, "step": 668 }, { "epoch": 0.8500635324015248, "grad_norm": 1.2292256786802576, "learning_rate": 1.8045977797666685e-05, "loss": 2.0274, "step": 669 }, { "epoch": 0.8513341804320204, "grad_norm": 1.1408696071690931, "learning_rate": 1.8039971303669407e-05, "loss": 2.2972, "step": 670 }, { "epoch": 0.8526048284625158, "grad_norm": 1.136087224803189, "learning_rate": 1.8033956594963067e-05, "loss": 1.9694, "step": 671 }, { "epoch": 0.8538754764930114, "grad_norm": 1.13467315760421, "learning_rate": 1.802793367769309e-05, "loss": 2.1446, "step": 672 }, { "epoch": 0.855146124523507, "grad_norm": 1.2426557197669015, "learning_rate": 1.8021902558013305e-05, "loss": 2.2789, "step": 673 }, { "epoch": 0.8564167725540025, "grad_norm": 1.0850125836271671, "learning_rate": 1.8015863242085893e-05, "loss": 2.023, "step": 674 }, { "epoch": 0.8576874205844981, "grad_norm": 1.1350570729505167, "learning_rate": 1.8009815736081442e-05, "loss": 1.8265, "step": 675 }, { "epoch": 0.8589580686149937, "grad_norm": 1.3055283343990955, "learning_rate": 1.8003760046178884e-05, "loss": 2.2501, "step": 676 }, { "epoch": 0.8602287166454892, "grad_norm": 1.1220882930352254, "learning_rate": 1.799769617856552e-05, "loss": 2.1533, "step": 677 }, { "epoch": 0.8614993646759848, "grad_norm": 1.3728913831386764, "learning_rate": 1.7991624139437013e-05, "loss": 2.1099, "step": 678 }, { "epoch": 0.8627700127064803, "grad_norm": 1.2170881551817552, "learning_rate": 1.7985543934997363e-05, "loss": 2.0939, "step": 679 }, { "epoch": 0.8640406607369758, "grad_norm": 1.1162608077349283, "learning_rate": 1.7979455571458926e-05, "loss": 2.1503, "step": 680 }, { "epoch": 0.8653113087674714, "grad_norm": 1.591964124880649, "learning_rate": 1.7973359055042384e-05, "loss": 2.3787, "step": 681 }, { "epoch": 0.866581956797967, "grad_norm": 1.0934479331373792, "learning_rate": 1.7967254391976752e-05, "loss": 2.1388, "step": 682 }, { "epoch": 0.8678526048284625, "grad_norm": 1.1745188572200094, "learning_rate": 1.796114158849937e-05, "loss": 2.1515, "step": 683 }, { "epoch": 0.8691232528589581, "grad_norm": 1.34721793744304, "learning_rate": 1.79550206508559e-05, "loss": 2.3695, "step": 684 }, { "epoch": 0.8703939008894537, "grad_norm": 1.40311554976704, "learning_rate": 1.7948891585300304e-05, "loss": 2.0496, "step": 685 }, { "epoch": 0.8716645489199492, "grad_norm": 1.4412821283719928, "learning_rate": 1.7942754398094858e-05, "loss": 2.3265, "step": 686 }, { "epoch": 0.8729351969504447, "grad_norm": 1.3094208316843936, "learning_rate": 1.793660909551013e-05, "loss": 2.1029, "step": 687 }, { "epoch": 0.8742058449809402, "grad_norm": 1.3235639078241626, "learning_rate": 1.793045568382498e-05, "loss": 2.2132, "step": 688 }, { "epoch": 0.8754764930114358, "grad_norm": 1.1612146363790252, "learning_rate": 1.792429416932656e-05, "loss": 2.2556, "step": 689 }, { "epoch": 0.8767471410419314, "grad_norm": 1.3204673619230718, "learning_rate": 1.7918124558310298e-05, "loss": 2.3886, "step": 690 }, { "epoch": 0.8780177890724269, "grad_norm": 1.389359391349538, "learning_rate": 1.7911946857079886e-05, "loss": 2.0822, "step": 691 }, { "epoch": 0.8792884371029225, "grad_norm": 1.302496075608027, "learning_rate": 1.7905761071947298e-05, "loss": 2.2212, "step": 692 }, { "epoch": 0.8805590851334181, "grad_norm": 1.2177378809010373, "learning_rate": 1.7899567209232747e-05, "loss": 2.1448, "step": 693 }, { "epoch": 0.8818297331639136, "grad_norm": 1.0887690351215271, "learning_rate": 1.7893365275264723e-05, "loss": 2.1947, "step": 694 }, { "epoch": 0.8831003811944091, "grad_norm": 1.1611483527631685, "learning_rate": 1.7887155276379946e-05, "loss": 2.2277, "step": 695 }, { "epoch": 0.8843710292249047, "grad_norm": 1.3321497305815846, "learning_rate": 1.788093721892338e-05, "loss": 2.1868, "step": 696 }, { "epoch": 0.8856416772554002, "grad_norm": 1.1677109697934798, "learning_rate": 1.7874711109248223e-05, "loss": 2.0527, "step": 697 }, { "epoch": 0.8869123252858958, "grad_norm": 1.2935614226803498, "learning_rate": 1.78684769537159e-05, "loss": 2.1411, "step": 698 }, { "epoch": 0.8881829733163914, "grad_norm": 1.2070887595655209, "learning_rate": 1.7862234758696064e-05, "loss": 2.2359, "step": 699 }, { "epoch": 0.8894536213468869, "grad_norm": 1.1156624127053323, "learning_rate": 1.7855984530566564e-05, "loss": 2.186, "step": 700 }, { "epoch": 0.8907242693773825, "grad_norm": 1.2030018813648606, "learning_rate": 1.7849726275713477e-05, "loss": 2.2642, "step": 701 }, { "epoch": 0.8919949174078781, "grad_norm": 1.157278199265185, "learning_rate": 1.7843460000531066e-05, "loss": 2.0533, "step": 702 }, { "epoch": 0.8932655654383735, "grad_norm": 1.1334035384047099, "learning_rate": 1.78371857114218e-05, "loss": 2.1672, "step": 703 }, { "epoch": 0.8945362134688691, "grad_norm": 1.1698257316198195, "learning_rate": 1.7830903414796338e-05, "loss": 2.1527, "step": 704 }, { "epoch": 0.8958068614993647, "grad_norm": 1.0649348969607102, "learning_rate": 1.78246131170735e-05, "loss": 2.001, "step": 705 }, { "epoch": 0.8970775095298602, "grad_norm": 16.103293834646067, "learning_rate": 1.78183148246803e-05, "loss": 2.0628, "step": 706 }, { "epoch": 0.8983481575603558, "grad_norm": 1.4097073445715416, "learning_rate": 1.781200854405192e-05, "loss": 2.187, "step": 707 }, { "epoch": 0.8996188055908514, "grad_norm": 1.2758116339569952, "learning_rate": 1.7805694281631687e-05, "loss": 2.3274, "step": 708 }, { "epoch": 0.9008894536213469, "grad_norm": 1.1765512106097247, "learning_rate": 1.7799372043871107e-05, "loss": 2.2118, "step": 709 }, { "epoch": 0.9021601016518425, "grad_norm": 1.4124203146752978, "learning_rate": 1.779304183722982e-05, "loss": 2.0081, "step": 710 }, { "epoch": 0.9034307496823379, "grad_norm": 1.0319073252257085, "learning_rate": 1.778670366817561e-05, "loss": 1.8963, "step": 711 }, { "epoch": 0.9047013977128335, "grad_norm": 1.3123039682019952, "learning_rate": 1.7780357543184396e-05, "loss": 2.0833, "step": 712 }, { "epoch": 0.9059720457433291, "grad_norm": 1.5121323183764703, "learning_rate": 1.777400346874023e-05, "loss": 2.2133, "step": 713 }, { "epoch": 0.9072426937738246, "grad_norm": 1.240278972114728, "learning_rate": 1.776764145133528e-05, "loss": 2.0854, "step": 714 }, { "epoch": 0.9085133418043202, "grad_norm": 1.2638134939297732, "learning_rate": 1.776127149746984e-05, "loss": 2.0003, "step": 715 }, { "epoch": 0.9097839898348158, "grad_norm": 1.8027487698425262, "learning_rate": 1.7754893613652296e-05, "loss": 2.2157, "step": 716 }, { "epoch": 0.9110546378653113, "grad_norm": 1.0790178813589926, "learning_rate": 1.7748507806399158e-05, "loss": 2.1216, "step": 717 }, { "epoch": 0.9123252858958069, "grad_norm": 1.1894613911325647, "learning_rate": 1.774211408223501e-05, "loss": 2.1762, "step": 718 }, { "epoch": 0.9135959339263025, "grad_norm": 6.847859792225901, "learning_rate": 1.773571244769254e-05, "loss": 1.9941, "step": 719 }, { "epoch": 0.9148665819567979, "grad_norm": 1.089714241501095, "learning_rate": 1.772930290931251e-05, "loss": 1.8715, "step": 720 }, { "epoch": 0.9161372299872935, "grad_norm": 1.049498859144584, "learning_rate": 1.7722885473643767e-05, "loss": 2.1452, "step": 721 }, { "epoch": 0.9174078780177891, "grad_norm": 1.3170625509291867, "learning_rate": 1.7716460147243216e-05, "loss": 2.3355, "step": 722 }, { "epoch": 0.9186785260482846, "grad_norm": 1.1145107193428012, "learning_rate": 1.771002693667583e-05, "loss": 2.2057, "step": 723 }, { "epoch": 0.9199491740787802, "grad_norm": 1.0733911923183925, "learning_rate": 1.770358584851463e-05, "loss": 2.1601, "step": 724 }, { "epoch": 0.9212198221092758, "grad_norm": 1.1931077636376628, "learning_rate": 1.7697136889340707e-05, "loss": 2.3301, "step": 725 }, { "epoch": 0.9224904701397713, "grad_norm": 1.0214572577114507, "learning_rate": 1.769068006574317e-05, "loss": 2.1586, "step": 726 }, { "epoch": 0.9237611181702668, "grad_norm": 1.0430710658686952, "learning_rate": 1.7684215384319174e-05, "loss": 2.2141, "step": 727 }, { "epoch": 0.9250317662007624, "grad_norm": 1.1588749834203884, "learning_rate": 1.7677742851673902e-05, "loss": 1.9551, "step": 728 }, { "epoch": 0.9263024142312579, "grad_norm": 1.140144952770098, "learning_rate": 1.7671262474420556e-05, "loss": 2.0731, "step": 729 }, { "epoch": 0.9275730622617535, "grad_norm": 1.2093997211822138, "learning_rate": 1.766477425918036e-05, "loss": 2.2245, "step": 730 }, { "epoch": 0.928843710292249, "grad_norm": 1.1070779104664465, "learning_rate": 1.7658278212582535e-05, "loss": 2.2871, "step": 731 }, { "epoch": 0.9301143583227446, "grad_norm": 1.1627076492057788, "learning_rate": 1.7651774341264318e-05, "loss": 2.2414, "step": 732 }, { "epoch": 0.9313850063532402, "grad_norm": 1.2056060527074959, "learning_rate": 1.7645262651870926e-05, "loss": 2.2713, "step": 733 }, { "epoch": 0.9326556543837357, "grad_norm": 1.0225727328183227, "learning_rate": 1.763874315105558e-05, "loss": 1.9503, "step": 734 }, { "epoch": 0.9339263024142312, "grad_norm": 1.118218927788584, "learning_rate": 1.7632215845479463e-05, "loss": 2.1835, "step": 735 }, { "epoch": 0.9351969504447268, "grad_norm": 1.1167093296828845, "learning_rate": 1.7625680741811745e-05, "loss": 2.2381, "step": 736 }, { "epoch": 0.9364675984752223, "grad_norm": 1.12942128452186, "learning_rate": 1.7619137846729567e-05, "loss": 2.0646, "step": 737 }, { "epoch": 0.9377382465057179, "grad_norm": 1.238502072438946, "learning_rate": 1.7612587166918023e-05, "loss": 2.2532, "step": 738 }, { "epoch": 0.9390088945362135, "grad_norm": 1.0984145820665707, "learning_rate": 1.760602870907016e-05, "loss": 2.1203, "step": 739 }, { "epoch": 0.940279542566709, "grad_norm": 1.5818781364794277, "learning_rate": 1.7599462479886976e-05, "loss": 2.375, "step": 740 }, { "epoch": 0.9415501905972046, "grad_norm": 1.1523217743831182, "learning_rate": 1.759288848607741e-05, "loss": 2.086, "step": 741 }, { "epoch": 0.9428208386277002, "grad_norm": 1.8822451973588672, "learning_rate": 1.758630673435833e-05, "loss": 2.1809, "step": 742 }, { "epoch": 0.9440914866581956, "grad_norm": 1.2253188338937813, "learning_rate": 1.757971723145453e-05, "loss": 2.1441, "step": 743 }, { "epoch": 0.9453621346886912, "grad_norm": 1.1325770019028607, "learning_rate": 1.7573119984098736e-05, "loss": 2.1922, "step": 744 }, { "epoch": 0.9466327827191868, "grad_norm": 1.4051086333055203, "learning_rate": 1.756651499903157e-05, "loss": 2.1348, "step": 745 }, { "epoch": 0.9479034307496823, "grad_norm": 1.4161450474571087, "learning_rate": 1.7559902283001568e-05, "loss": 2.4787, "step": 746 }, { "epoch": 0.9491740787801779, "grad_norm": 1.3285177253807692, "learning_rate": 1.755328184276517e-05, "loss": 2.3242, "step": 747 }, { "epoch": 0.9504447268106735, "grad_norm": 1.109819420322136, "learning_rate": 1.7546653685086696e-05, "loss": 1.939, "step": 748 }, { "epoch": 0.951715374841169, "grad_norm": 1.026234939259307, "learning_rate": 1.7540017816738358e-05, "loss": 1.8793, "step": 749 }, { "epoch": 0.9529860228716646, "grad_norm": 1.0441675637509298, "learning_rate": 1.753337424450025e-05, "loss": 2.0132, "step": 750 }, { "epoch": 0.9542566709021602, "grad_norm": 1.1594745381636706, "learning_rate": 1.7526722975160334e-05, "loss": 2.2498, "step": 751 }, { "epoch": 0.9555273189326556, "grad_norm": 1.3775100569123298, "learning_rate": 1.7520064015514425e-05, "loss": 2.3491, "step": 752 }, { "epoch": 0.9567979669631512, "grad_norm": 1.208713834802541, "learning_rate": 1.751339737236622e-05, "loss": 2.1424, "step": 753 }, { "epoch": 0.9580686149936467, "grad_norm": 1.3760413828742442, "learning_rate": 1.7506723052527243e-05, "loss": 1.9454, "step": 754 }, { "epoch": 0.9593392630241423, "grad_norm": 1.2670339177037218, "learning_rate": 1.7500041062816875e-05, "loss": 2.1874, "step": 755 }, { "epoch": 0.9606099110546379, "grad_norm": 1.1343893644179321, "learning_rate": 1.749335141006233e-05, "loss": 2.1512, "step": 756 }, { "epoch": 0.9618805590851334, "grad_norm": 1.2181137851036752, "learning_rate": 1.748665410109865e-05, "loss": 2.1203, "step": 757 }, { "epoch": 0.963151207115629, "grad_norm": 1.1951718410904983, "learning_rate": 1.7479949142768703e-05, "loss": 2.0477, "step": 758 }, { "epoch": 0.9644218551461246, "grad_norm": 1.1707207007043994, "learning_rate": 1.747323654192316e-05, "loss": 2.2146, "step": 759 }, { "epoch": 0.96569250317662, "grad_norm": 1.1453945786997743, "learning_rate": 1.7466516305420524e-05, "loss": 2.2439, "step": 760 }, { "epoch": 0.9669631512071156, "grad_norm": 1.0994338227829146, "learning_rate": 1.7459788440127083e-05, "loss": 2.1315, "step": 761 }, { "epoch": 0.9682337992376112, "grad_norm": 1.0612891908924782, "learning_rate": 1.7453052952916924e-05, "loss": 2.2191, "step": 762 }, { "epoch": 0.9695044472681067, "grad_norm": 1.367502676104824, "learning_rate": 1.7446309850671913e-05, "loss": 2.4506, "step": 763 }, { "epoch": 0.9707750952986023, "grad_norm": 1.1449655488905122, "learning_rate": 1.7439559140281713e-05, "loss": 1.9004, "step": 764 }, { "epoch": 0.9720457433290979, "grad_norm": 1.1512864595180432, "learning_rate": 1.7432800828643747e-05, "loss": 2.2074, "step": 765 }, { "epoch": 0.9733163913595934, "grad_norm": 1.5315760269981107, "learning_rate": 1.7426034922663217e-05, "loss": 2.4534, "step": 766 }, { "epoch": 0.974587039390089, "grad_norm": 1.2894382443177794, "learning_rate": 1.7419261429253063e-05, "loss": 2.054, "step": 767 }, { "epoch": 0.9758576874205845, "grad_norm": 1.074928556598427, "learning_rate": 1.7412480355334006e-05, "loss": 1.9893, "step": 768 }, { "epoch": 0.97712833545108, "grad_norm": 1.1272788651886883, "learning_rate": 1.7405691707834484e-05, "loss": 1.8961, "step": 769 }, { "epoch": 0.9783989834815756, "grad_norm": 1.0740886049498373, "learning_rate": 1.7398895493690694e-05, "loss": 2.2882, "step": 770 }, { "epoch": 0.9796696315120712, "grad_norm": 1.2385420633979962, "learning_rate": 1.7392091719846557e-05, "loss": 2.2462, "step": 771 }, { "epoch": 0.9809402795425667, "grad_norm": 1.1447185949365566, "learning_rate": 1.7385280393253717e-05, "loss": 2.4673, "step": 772 }, { "epoch": 0.9822109275730623, "grad_norm": 1.095160743555286, "learning_rate": 1.7378461520871533e-05, "loss": 2.1134, "step": 773 }, { "epoch": 0.9834815756035579, "grad_norm": 1.407532638593707, "learning_rate": 1.7371635109667077e-05, "loss": 2.4098, "step": 774 }, { "epoch": 0.9847522236340533, "grad_norm": 1.274115078111504, "learning_rate": 1.7364801166615124e-05, "loss": 2.1994, "step": 775 }, { "epoch": 0.9860228716645489, "grad_norm": 1.1102796517887026, "learning_rate": 1.7357959698698142e-05, "loss": 2.162, "step": 776 }, { "epoch": 0.9872935196950444, "grad_norm": 1.1391943620076197, "learning_rate": 1.735111071290629e-05, "loss": 2.2424, "step": 777 }, { "epoch": 0.98856416772554, "grad_norm": 1.3106849259724793, "learning_rate": 1.7344254216237405e-05, "loss": 2.3158, "step": 778 }, { "epoch": 0.9898348157560356, "grad_norm": 1.2261422208301112, "learning_rate": 1.7337390215697005e-05, "loss": 2.4132, "step": 779 }, { "epoch": 0.9911054637865311, "grad_norm": 1.320383705607811, "learning_rate": 1.7330518718298263e-05, "loss": 2.137, "step": 780 }, { "epoch": 0.9923761118170267, "grad_norm": 1.052165293466006, "learning_rate": 1.732363973106203e-05, "loss": 2.1737, "step": 781 }, { "epoch": 0.9936467598475223, "grad_norm": 1.1620672915501906, "learning_rate": 1.7316753261016782e-05, "loss": 2.1117, "step": 782 }, { "epoch": 0.9949174078780177, "grad_norm": 1.2006436789964456, "learning_rate": 1.7309859315198676e-05, "loss": 2.0257, "step": 783 }, { "epoch": 0.9961880559085133, "grad_norm": 1.1215243404250974, "learning_rate": 1.7302957900651477e-05, "loss": 2.0687, "step": 784 }, { "epoch": 0.9974587039390089, "grad_norm": 1.1093169148694415, "learning_rate": 1.729604902442659e-05, "loss": 2.3016, "step": 785 }, { "epoch": 0.9987293519695044, "grad_norm": 1.1811840102612634, "learning_rate": 1.7289132693583054e-05, "loss": 2.2341, "step": 786 }, { "epoch": 1.0, "grad_norm": 1.1099305877051853, "learning_rate": 1.7282208915187512e-05, "loss": 2.1384, "step": 787 }, { "epoch": 1.0012706480304956, "grad_norm": 1.6865753377023602, "learning_rate": 1.727527769631422e-05, "loss": 2.0813, "step": 788 }, { "epoch": 1.0025412960609912, "grad_norm": 1.6166021118485312, "learning_rate": 1.7268339044045044e-05, "loss": 2.0591, "step": 789 }, { "epoch": 1.0038119440914866, "grad_norm": 1.1822516961256184, "learning_rate": 1.7261392965469436e-05, "loss": 1.9321, "step": 790 }, { "epoch": 1.0050825921219821, "grad_norm": 1.366285683480337, "learning_rate": 1.7254439467684433e-05, "loss": 1.8407, "step": 791 }, { "epoch": 1.0063532401524777, "grad_norm": 2.1905662751665536, "learning_rate": 1.7247478557794662e-05, "loss": 1.8602, "step": 792 }, { "epoch": 1.0076238881829733, "grad_norm": 1.6623048912149558, "learning_rate": 1.7240510242912315e-05, "loss": 2.1063, "step": 793 }, { "epoch": 1.008894536213469, "grad_norm": 1.3109263123151378, "learning_rate": 1.7233534530157163e-05, "loss": 1.9212, "step": 794 }, { "epoch": 1.0101651842439645, "grad_norm": 1.2236823988689063, "learning_rate": 1.7226551426656514e-05, "loss": 1.8459, "step": 795 }, { "epoch": 1.0114358322744599, "grad_norm": 1.418709456304477, "learning_rate": 1.7219560939545246e-05, "loss": 2.0208, "step": 796 }, { "epoch": 1.0127064803049555, "grad_norm": 1.2535964475433163, "learning_rate": 1.7212563075965774e-05, "loss": 1.9942, "step": 797 }, { "epoch": 1.013977128335451, "grad_norm": 1.0444045046192412, "learning_rate": 1.7205557843068053e-05, "loss": 1.8317, "step": 798 }, { "epoch": 1.0152477763659467, "grad_norm": 1.2623627126003907, "learning_rate": 1.719854524800956e-05, "loss": 1.8213, "step": 799 }, { "epoch": 1.0165184243964422, "grad_norm": 1.325424500121208, "learning_rate": 1.7191525297955306e-05, "loss": 1.9402, "step": 800 }, { "epoch": 1.0177890724269378, "grad_norm": 1.2087028271623546, "learning_rate": 1.7184498000077804e-05, "loss": 1.8926, "step": 801 }, { "epoch": 1.0190597204574332, "grad_norm": 1.4038085355277468, "learning_rate": 1.7177463361557082e-05, "loss": 2.0807, "step": 802 }, { "epoch": 1.0203303684879288, "grad_norm": 1.21958929015843, "learning_rate": 1.7170421389580666e-05, "loss": 1.7006, "step": 803 }, { "epoch": 1.0216010165184244, "grad_norm": 1.1964204779992977, "learning_rate": 1.7163372091343578e-05, "loss": 1.6272, "step": 804 }, { "epoch": 1.02287166454892, "grad_norm": 1.1449886242015361, "learning_rate": 1.7156315474048323e-05, "loss": 1.8457, "step": 805 }, { "epoch": 1.0241423125794156, "grad_norm": 1.2278404715003532, "learning_rate": 1.7149251544904882e-05, "loss": 1.6541, "step": 806 }, { "epoch": 1.0254129606099112, "grad_norm": 1.1610162199882954, "learning_rate": 1.714218031113071e-05, "loss": 1.8906, "step": 807 }, { "epoch": 1.0266836086404065, "grad_norm": 1.1675901686694194, "learning_rate": 1.7135101779950724e-05, "loss": 1.7901, "step": 808 }, { "epoch": 1.0279542566709021, "grad_norm": 1.2124269726496553, "learning_rate": 1.71280159585973e-05, "loss": 1.8684, "step": 809 }, { "epoch": 1.0292249047013977, "grad_norm": 1.2294992092903338, "learning_rate": 1.712092285431026e-05, "loss": 1.7482, "step": 810 }, { "epoch": 1.0304955527318933, "grad_norm": 1.1287183716961813, "learning_rate": 1.7113822474336857e-05, "loss": 1.7951, "step": 811 }, { "epoch": 1.031766200762389, "grad_norm": 1.4134078470159432, "learning_rate": 1.7106714825931803e-05, "loss": 2.0693, "step": 812 }, { "epoch": 1.0330368487928843, "grad_norm": 1.1782788890686648, "learning_rate": 1.709959991635721e-05, "loss": 1.8525, "step": 813 }, { "epoch": 1.0343074968233799, "grad_norm": 1.2043372856739532, "learning_rate": 1.7092477752882626e-05, "loss": 1.9692, "step": 814 }, { "epoch": 1.0355781448538754, "grad_norm": 1.224457390433863, "learning_rate": 1.7085348342785003e-05, "loss": 1.886, "step": 815 }, { "epoch": 1.036848792884371, "grad_norm": 1.2410254417348827, "learning_rate": 1.7078211693348704e-05, "loss": 1.9182, "step": 816 }, { "epoch": 1.0381194409148666, "grad_norm": 1.182732969839237, "learning_rate": 1.7071067811865477e-05, "loss": 1.9967, "step": 817 }, { "epoch": 1.0393900889453622, "grad_norm": 1.0655611049771787, "learning_rate": 1.706391670563447e-05, "loss": 1.8914, "step": 818 }, { "epoch": 1.0406607369758576, "grad_norm": 1.2324879012238552, "learning_rate": 1.7056758381962204e-05, "loss": 1.9612, "step": 819 }, { "epoch": 1.0419313850063532, "grad_norm": 1.2666348326958925, "learning_rate": 1.7049592848162583e-05, "loss": 1.9061, "step": 820 }, { "epoch": 1.0432020330368488, "grad_norm": 1.4256014873272047, "learning_rate": 1.7042420111556874e-05, "loss": 2.2497, "step": 821 }, { "epoch": 1.0444726810673444, "grad_norm": 1.200697855013483, "learning_rate": 1.7035240179473703e-05, "loss": 1.7399, "step": 822 }, { "epoch": 1.04574332909784, "grad_norm": 1.124806936679507, "learning_rate": 1.7028053059249045e-05, "loss": 1.7169, "step": 823 }, { "epoch": 1.0470139771283355, "grad_norm": 1.3764541156500518, "learning_rate": 1.702085875822623e-05, "loss": 1.9372, "step": 824 }, { "epoch": 1.048284625158831, "grad_norm": 1.2206817503147485, "learning_rate": 1.7013657283755904e-05, "loss": 1.9339, "step": 825 }, { "epoch": 1.0495552731893265, "grad_norm": 1.5533366987820596, "learning_rate": 1.700644864319607e-05, "loss": 1.8839, "step": 826 }, { "epoch": 1.050825921219822, "grad_norm": 1.3504868408426431, "learning_rate": 1.699923284391203e-05, "loss": 2.0348, "step": 827 }, { "epoch": 1.0520965692503177, "grad_norm": 1.542565833973917, "learning_rate": 1.699200989327641e-05, "loss": 1.8666, "step": 828 }, { "epoch": 1.0533672172808133, "grad_norm": 1.2848638016187683, "learning_rate": 1.6984779798669144e-05, "loss": 2.013, "step": 829 }, { "epoch": 1.0546378653113089, "grad_norm": 1.2129845471109177, "learning_rate": 1.6977542567477464e-05, "loss": 2.0537, "step": 830 }, { "epoch": 1.0559085133418042, "grad_norm": 1.3442257128941968, "learning_rate": 1.6970298207095887e-05, "loss": 1.8845, "step": 831 }, { "epoch": 1.0571791613722998, "grad_norm": 1.257055494734371, "learning_rate": 1.6963046724926222e-05, "loss": 1.9277, "step": 832 }, { "epoch": 1.0584498094027954, "grad_norm": 1.281494268857617, "learning_rate": 1.6955788128377552e-05, "loss": 1.8302, "step": 833 }, { "epoch": 1.059720457433291, "grad_norm": 1.2904533027843197, "learning_rate": 1.6948522424866233e-05, "loss": 1.9057, "step": 834 }, { "epoch": 1.0609911054637866, "grad_norm": 1.7221630896204667, "learning_rate": 1.6941249621815872e-05, "loss": 2.0127, "step": 835 }, { "epoch": 1.062261753494282, "grad_norm": 5.24316375130168, "learning_rate": 1.6933969726657344e-05, "loss": 1.7981, "step": 836 }, { "epoch": 1.0635324015247776, "grad_norm": 1.165789449691196, "learning_rate": 1.6926682746828756e-05, "loss": 1.9228, "step": 837 }, { "epoch": 1.0648030495552732, "grad_norm": 1.2885203823264162, "learning_rate": 1.6919388689775463e-05, "loss": 1.9446, "step": 838 }, { "epoch": 1.0660736975857688, "grad_norm": 1.1429621958809697, "learning_rate": 1.691208756295005e-05, "loss": 1.8116, "step": 839 }, { "epoch": 1.0673443456162643, "grad_norm": 1.1263030732817316, "learning_rate": 1.690477937381232e-05, "loss": 1.6862, "step": 840 }, { "epoch": 1.06861499364676, "grad_norm": 1.2805477063039459, "learning_rate": 1.68974641298293e-05, "loss": 2.0322, "step": 841 }, { "epoch": 1.0698856416772553, "grad_norm": 1.2665800972478127, "learning_rate": 1.689014183847522e-05, "loss": 1.7609, "step": 842 }, { "epoch": 1.071156289707751, "grad_norm": 2.2885846073426794, "learning_rate": 1.6882812507231508e-05, "loss": 2.0702, "step": 843 }, { "epoch": 1.0724269377382465, "grad_norm": 1.3510261901534273, "learning_rate": 1.6875476143586788e-05, "loss": 1.9481, "step": 844 }, { "epoch": 1.073697585768742, "grad_norm": 1.1744973051438936, "learning_rate": 1.6868132755036875e-05, "loss": 1.7137, "step": 845 }, { "epoch": 1.0749682337992377, "grad_norm": 1.3373246241234256, "learning_rate": 1.686078234908475e-05, "loss": 1.9151, "step": 846 }, { "epoch": 1.0762388818297333, "grad_norm": 1.1717664629864653, "learning_rate": 1.6853424933240575e-05, "loss": 1.9916, "step": 847 }, { "epoch": 1.0775095298602286, "grad_norm": 1.38230628449887, "learning_rate": 1.6846060515021665e-05, "loss": 1.7794, "step": 848 }, { "epoch": 1.0787801778907242, "grad_norm": 1.1800364634451153, "learning_rate": 1.68386891019525e-05, "loss": 1.9383, "step": 849 }, { "epoch": 1.0800508259212198, "grad_norm": 1.1714365329438117, "learning_rate": 1.683131070156469e-05, "loss": 1.7648, "step": 850 }, { "epoch": 1.0813214739517154, "grad_norm": 1.4271044105935007, "learning_rate": 1.6823925321397004e-05, "loss": 2.0348, "step": 851 }, { "epoch": 1.082592121982211, "grad_norm": 1.2096672277237541, "learning_rate": 1.681653296899533e-05, "loss": 1.8899, "step": 852 }, { "epoch": 1.0838627700127064, "grad_norm": 1.2638593194626848, "learning_rate": 1.6809133651912682e-05, "loss": 1.8186, "step": 853 }, { "epoch": 1.085133418043202, "grad_norm": 1.3009262132337112, "learning_rate": 1.6801727377709195e-05, "loss": 1.6465, "step": 854 }, { "epoch": 1.0864040660736975, "grad_norm": 1.3550953845397025, "learning_rate": 1.6794314153952105e-05, "loss": 1.7697, "step": 855 }, { "epoch": 1.0876747141041931, "grad_norm": 1.43920839620835, "learning_rate": 1.6786893988215753e-05, "loss": 1.7269, "step": 856 }, { "epoch": 1.0889453621346887, "grad_norm": 1.4408291141229703, "learning_rate": 1.677946688808157e-05, "loss": 1.8983, "step": 857 }, { "epoch": 1.0902160101651843, "grad_norm": 1.2230880717822934, "learning_rate": 1.6772032861138078e-05, "loss": 1.885, "step": 858 }, { "epoch": 1.0914866581956797, "grad_norm": 1.3449063439142093, "learning_rate": 1.676459191498087e-05, "loss": 2.0056, "step": 859 }, { "epoch": 1.0927573062261753, "grad_norm": 1.3984387684398114, "learning_rate": 1.675714405721261e-05, "loss": 1.9134, "step": 860 }, { "epoch": 1.0940279542566709, "grad_norm": 2.3409719411538252, "learning_rate": 1.674968929544303e-05, "loss": 1.5216, "step": 861 }, { "epoch": 1.0952986022871665, "grad_norm": 1.702044326540242, "learning_rate": 1.6742227637288898e-05, "loss": 2.0485, "step": 862 }, { "epoch": 1.096569250317662, "grad_norm": 1.264330261539595, "learning_rate": 1.6734759090374057e-05, "loss": 1.7836, "step": 863 }, { "epoch": 1.0978398983481577, "grad_norm": 1.1184031235776326, "learning_rate": 1.6727283662329365e-05, "loss": 1.8595, "step": 864 }, { "epoch": 1.099110546378653, "grad_norm": 1.467822990156111, "learning_rate": 1.6719801360792713e-05, "loss": 1.8487, "step": 865 }, { "epoch": 1.1003811944091486, "grad_norm": 1.4373221055972418, "learning_rate": 1.6712312193409032e-05, "loss": 2.1972, "step": 866 }, { "epoch": 1.1016518424396442, "grad_norm": 1.4625603630755457, "learning_rate": 1.6704816167830244e-05, "loss": 2.3222, "step": 867 }, { "epoch": 1.1029224904701398, "grad_norm": 1.3277403647399606, "learning_rate": 1.6697313291715297e-05, "loss": 2.0532, "step": 868 }, { "epoch": 1.1041931385006354, "grad_norm": 1.2948019609048766, "learning_rate": 1.6689803572730135e-05, "loss": 1.7394, "step": 869 }, { "epoch": 1.105463786531131, "grad_norm": 1.3749134131691831, "learning_rate": 1.6682287018547683e-05, "loss": 1.8651, "step": 870 }, { "epoch": 1.1067344345616263, "grad_norm": 1.2882783772029283, "learning_rate": 1.667476363684786e-05, "loss": 2.0374, "step": 871 }, { "epoch": 1.108005082592122, "grad_norm": 1.2171841086218058, "learning_rate": 1.6667233435317563e-05, "loss": 2.0968, "step": 872 }, { "epoch": 1.1092757306226175, "grad_norm": 1.3344992215686204, "learning_rate": 1.6659696421650645e-05, "loss": 1.8383, "step": 873 }, { "epoch": 1.1105463786531131, "grad_norm": 1.2961439792458473, "learning_rate": 1.6652152603547928e-05, "loss": 1.9911, "step": 874 }, { "epoch": 1.1118170266836087, "grad_norm": 1.3617320682602514, "learning_rate": 1.6644601988717188e-05, "loss": 1.9132, "step": 875 }, { "epoch": 1.1130876747141043, "grad_norm": 1.6872293277125439, "learning_rate": 1.6637044584873137e-05, "loss": 1.8031, "step": 876 }, { "epoch": 1.1143583227445997, "grad_norm": 1.2658405906055896, "learning_rate": 1.6629480399737432e-05, "loss": 1.7167, "step": 877 }, { "epoch": 1.1156289707750953, "grad_norm": 1.3956453079025612, "learning_rate": 1.6621909441038657e-05, "loss": 2.0243, "step": 878 }, { "epoch": 1.1168996188055909, "grad_norm": 1.4283319075492786, "learning_rate": 1.661433171651231e-05, "loss": 2.0217, "step": 879 }, { "epoch": 1.1181702668360864, "grad_norm": 1.3955373688541373, "learning_rate": 1.6606747233900816e-05, "loss": 1.8204, "step": 880 }, { "epoch": 1.119440914866582, "grad_norm": 1.1624948055016757, "learning_rate": 1.6599156000953486e-05, "loss": 1.6933, "step": 881 }, { "epoch": 1.1207115628970774, "grad_norm": 1.658026570394342, "learning_rate": 1.6591558025426544e-05, "loss": 1.8716, "step": 882 }, { "epoch": 1.121982210927573, "grad_norm": 1.0879821817763893, "learning_rate": 1.658395331508309e-05, "loss": 1.7472, "step": 883 }, { "epoch": 1.1232528589580686, "grad_norm": 1.3531044307917879, "learning_rate": 1.6576341877693126e-05, "loss": 2.0277, "step": 884 }, { "epoch": 1.1245235069885642, "grad_norm": 1.2802821111389169, "learning_rate": 1.65687237210335e-05, "loss": 1.6896, "step": 885 }, { "epoch": 1.1257941550190598, "grad_norm": 1.6340628162781528, "learning_rate": 1.656109885288794e-05, "loss": 2.2014, "step": 886 }, { "epoch": 1.1270648030495554, "grad_norm": 1.4061256442942187, "learning_rate": 1.655346728104704e-05, "loss": 1.7754, "step": 887 }, { "epoch": 1.1283354510800507, "grad_norm": 1.5504183318124294, "learning_rate": 1.6545829013308225e-05, "loss": 2.0022, "step": 888 }, { "epoch": 1.1296060991105463, "grad_norm": 1.455095611292746, "learning_rate": 1.653818405747577e-05, "loss": 2.0624, "step": 889 }, { "epoch": 1.130876747141042, "grad_norm": 1.3253465178231698, "learning_rate": 1.653053242136079e-05, "loss": 1.9104, "step": 890 }, { "epoch": 1.1321473951715375, "grad_norm": 1.182950414429057, "learning_rate": 1.6522874112781213e-05, "loss": 1.8832, "step": 891 }, { "epoch": 1.133418043202033, "grad_norm": 1.3356155853185752, "learning_rate": 1.6515209139561796e-05, "loss": 1.9008, "step": 892 }, { "epoch": 1.1346886912325287, "grad_norm": 1.4560487097685317, "learning_rate": 1.6507537509534094e-05, "loss": 1.9428, "step": 893 }, { "epoch": 1.135959339263024, "grad_norm": 1.2827728857110126, "learning_rate": 1.6499859230536468e-05, "loss": 1.6692, "step": 894 }, { "epoch": 1.1372299872935197, "grad_norm": 1.4362212085193078, "learning_rate": 1.6492174310414082e-05, "loss": 1.7576, "step": 895 }, { "epoch": 1.1385006353240152, "grad_norm": 1.482776094668404, "learning_rate": 1.6484482757018873e-05, "loss": 1.7185, "step": 896 }, { "epoch": 1.1397712833545108, "grad_norm": 1.6901901653673548, "learning_rate": 1.6476784578209556e-05, "loss": 2.017, "step": 897 }, { "epoch": 1.1410419313850064, "grad_norm": 1.3282297106480905, "learning_rate": 1.6469079781851625e-05, "loss": 1.5978, "step": 898 }, { "epoch": 1.1423125794155018, "grad_norm": 1.5504185271608384, "learning_rate": 1.6461368375817328e-05, "loss": 1.8685, "step": 899 }, { "epoch": 1.1435832274459974, "grad_norm": 1.279342293663459, "learning_rate": 1.6453650367985666e-05, "loss": 1.9452, "step": 900 }, { "epoch": 1.144853875476493, "grad_norm": 1.5724299314720493, "learning_rate": 1.6445925766242392e-05, "loss": 1.8533, "step": 901 }, { "epoch": 1.1461245235069886, "grad_norm": 1.4037091216044704, "learning_rate": 1.6438194578479987e-05, "loss": 2.0887, "step": 902 }, { "epoch": 1.1473951715374842, "grad_norm": 2.3844841954482137, "learning_rate": 1.6430456812597664e-05, "loss": 1.8733, "step": 903 }, { "epoch": 1.1486658195679798, "grad_norm": 1.5873692114727178, "learning_rate": 1.642271247650136e-05, "loss": 1.9257, "step": 904 }, { "epoch": 1.1499364675984753, "grad_norm": 1.2912057122897365, "learning_rate": 1.6414961578103728e-05, "loss": 2.0363, "step": 905 }, { "epoch": 1.1512071156289707, "grad_norm": 1.2729098473928389, "learning_rate": 1.6407204125324117e-05, "loss": 1.9805, "step": 906 }, { "epoch": 1.1524777636594663, "grad_norm": 1.319722169062551, "learning_rate": 1.639944012608858e-05, "loss": 2.0761, "step": 907 }, { "epoch": 1.153748411689962, "grad_norm": 1.646575352037399, "learning_rate": 1.639166958832985e-05, "loss": 1.9992, "step": 908 }, { "epoch": 1.1550190597204575, "grad_norm": 1.2594306934624784, "learning_rate": 1.6383892519987355e-05, "loss": 2.0578, "step": 909 }, { "epoch": 1.156289707750953, "grad_norm": 1.4116327121078327, "learning_rate": 1.6376108929007182e-05, "loss": 1.9224, "step": 910 }, { "epoch": 1.1575603557814484, "grad_norm": 1.3234022103520608, "learning_rate": 1.6368318823342093e-05, "loss": 2.0213, "step": 911 }, { "epoch": 1.158831003811944, "grad_norm": 1.212138268792923, "learning_rate": 1.6360522210951493e-05, "loss": 1.8848, "step": 912 }, { "epoch": 1.1601016518424396, "grad_norm": 1.3304063220446036, "learning_rate": 1.635271909980145e-05, "loss": 1.8339, "step": 913 }, { "epoch": 1.1613722998729352, "grad_norm": 1.1642529244284137, "learning_rate": 1.6344909497864663e-05, "loss": 1.6725, "step": 914 }, { "epoch": 1.1626429479034308, "grad_norm": 1.4292035396863283, "learning_rate": 1.6337093413120463e-05, "loss": 1.8768, "step": 915 }, { "epoch": 1.1639135959339262, "grad_norm": 1.3692914071794549, "learning_rate": 1.6329270853554807e-05, "loss": 2.0424, "step": 916 }, { "epoch": 1.1651842439644218, "grad_norm": 1.253500022515538, "learning_rate": 1.632144182716027e-05, "loss": 1.8572, "step": 917 }, { "epoch": 1.1664548919949174, "grad_norm": 1.3906010379457432, "learning_rate": 1.631360634193603e-05, "loss": 1.9153, "step": 918 }, { "epoch": 1.167725540025413, "grad_norm": 1.2968509614505375, "learning_rate": 1.6305764405887865e-05, "loss": 1.8864, "step": 919 }, { "epoch": 1.1689961880559085, "grad_norm": 1.2017682531071847, "learning_rate": 1.6297916027028146e-05, "loss": 1.6371, "step": 920 }, { "epoch": 1.1702668360864041, "grad_norm": 1.1043180912826305, "learning_rate": 1.6290061213375824e-05, "loss": 1.7654, "step": 921 }, { "epoch": 1.1715374841168997, "grad_norm": 1.2251539346147005, "learning_rate": 1.6282199972956425e-05, "loss": 1.8949, "step": 922 }, { "epoch": 1.172808132147395, "grad_norm": 1.1938270609692188, "learning_rate": 1.6274332313802046e-05, "loss": 1.7624, "step": 923 }, { "epoch": 1.1740787801778907, "grad_norm": 1.296471490009607, "learning_rate": 1.626645824395134e-05, "loss": 2.0527, "step": 924 }, { "epoch": 1.1753494282083863, "grad_norm": 1.1555457048005713, "learning_rate": 1.6258577771449505e-05, "loss": 1.7623, "step": 925 }, { "epoch": 1.1766200762388819, "grad_norm": 1.2672242046698075, "learning_rate": 1.6250690904348288e-05, "loss": 1.6398, "step": 926 }, { "epoch": 1.1778907242693775, "grad_norm": 1.1599189505741077, "learning_rate": 1.6242797650705965e-05, "loss": 1.8078, "step": 927 }, { "epoch": 1.1791613722998728, "grad_norm": 1.201908050838568, "learning_rate": 1.6234898018587336e-05, "loss": 1.8192, "step": 928 }, { "epoch": 1.1804320203303684, "grad_norm": 1.4051652511781014, "learning_rate": 1.6226992016063726e-05, "loss": 1.7557, "step": 929 }, { "epoch": 1.181702668360864, "grad_norm": 1.2414377174588196, "learning_rate": 1.621907965121296e-05, "loss": 1.8567, "step": 930 }, { "epoch": 1.1829733163913596, "grad_norm": 1.2205450231326151, "learning_rate": 1.621116093211937e-05, "loss": 1.789, "step": 931 }, { "epoch": 1.1842439644218552, "grad_norm": 1.3239114373783731, "learning_rate": 1.6203235866873776e-05, "loss": 1.8891, "step": 932 }, { "epoch": 1.1855146124523508, "grad_norm": 1.143125803189706, "learning_rate": 1.6195304463573483e-05, "loss": 1.9097, "step": 933 }, { "epoch": 1.1867852604828462, "grad_norm": 1.3543160684882207, "learning_rate": 1.618736673032227e-05, "loss": 1.9701, "step": 934 }, { "epoch": 1.1880559085133418, "grad_norm": 1.1892390739019483, "learning_rate": 1.6179422675230393e-05, "loss": 1.8929, "step": 935 }, { "epoch": 1.1893265565438373, "grad_norm": 1.2473887928107703, "learning_rate": 1.6171472306414554e-05, "loss": 1.9877, "step": 936 }, { "epoch": 1.190597204574333, "grad_norm": 1.2343561498635625, "learning_rate": 1.6163515631997916e-05, "loss": 1.8281, "step": 937 }, { "epoch": 1.1918678526048285, "grad_norm": 1.6334791738239713, "learning_rate": 1.6155552660110076e-05, "loss": 2.063, "step": 938 }, { "epoch": 1.1931385006353241, "grad_norm": 1.552460365301119, "learning_rate": 1.6147583398887078e-05, "loss": 2.2069, "step": 939 }, { "epoch": 1.1944091486658195, "grad_norm": 1.5037802443003923, "learning_rate": 1.6139607856471377e-05, "loss": 2.082, "step": 940 }, { "epoch": 1.195679796696315, "grad_norm": 1.3073618678744163, "learning_rate": 1.613162604101186e-05, "loss": 1.8659, "step": 941 }, { "epoch": 1.1969504447268107, "grad_norm": 1.1930066131323456, "learning_rate": 1.6123637960663807e-05, "loss": 1.7654, "step": 942 }, { "epoch": 1.1982210927573063, "grad_norm": 1.4632481383057956, "learning_rate": 1.6115643623588915e-05, "loss": 1.8771, "step": 943 }, { "epoch": 1.1994917407878019, "grad_norm": 1.3551013868985258, "learning_rate": 1.6107643037955268e-05, "loss": 1.8159, "step": 944 }, { "epoch": 1.2007623888182972, "grad_norm": 1.3709358096052835, "learning_rate": 1.6099636211937326e-05, "loss": 1.923, "step": 945 }, { "epoch": 1.2020330368487928, "grad_norm": 1.2385525220234572, "learning_rate": 1.6091623153715937e-05, "loss": 1.7866, "step": 946 }, { "epoch": 1.2033036848792884, "grad_norm": 1.380934926954665, "learning_rate": 1.6083603871478316e-05, "loss": 1.9634, "step": 947 }, { "epoch": 1.204574332909784, "grad_norm": 1.504835347255383, "learning_rate": 1.6075578373418028e-05, "loss": 1.8734, "step": 948 }, { "epoch": 1.2058449809402796, "grad_norm": 1.2570433169210586, "learning_rate": 1.6067546667734996e-05, "loss": 1.7331, "step": 949 }, { "epoch": 1.2071156289707752, "grad_norm": 1.470968126969614, "learning_rate": 1.6059508762635482e-05, "loss": 1.8766, "step": 950 }, { "epoch": 1.2083862770012708, "grad_norm": 2.9329383359308383, "learning_rate": 1.6051464666332087e-05, "loss": 1.8247, "step": 951 }, { "epoch": 1.2096569250317661, "grad_norm": 1.3856521724757025, "learning_rate": 1.604341438704373e-05, "loss": 1.7699, "step": 952 }, { "epoch": 1.2109275730622617, "grad_norm": 1.2420809627636966, "learning_rate": 1.603535793299566e-05, "loss": 1.7875, "step": 953 }, { "epoch": 1.2121982210927573, "grad_norm": 1.1896169838405297, "learning_rate": 1.6027295312419423e-05, "loss": 1.6536, "step": 954 }, { "epoch": 1.213468869123253, "grad_norm": 1.3059711657788073, "learning_rate": 1.6019226533552865e-05, "loss": 1.7259, "step": 955 }, { "epoch": 1.2147395171537485, "grad_norm": 1.294022377822766, "learning_rate": 1.6011151604640137e-05, "loss": 1.9751, "step": 956 }, { "epoch": 1.2160101651842439, "grad_norm": 1.3747145360760893, "learning_rate": 1.6003070533931657e-05, "loss": 1.9108, "step": 957 }, { "epoch": 1.2172808132147395, "grad_norm": 1.3085675703201363, "learning_rate": 1.5994983329684134e-05, "loss": 1.773, "step": 958 }, { "epoch": 1.218551461245235, "grad_norm": 1.3082155598851708, "learning_rate": 1.598689000016053e-05, "loss": 2.0224, "step": 959 }, { "epoch": 1.2198221092757306, "grad_norm": 1.2699164051757423, "learning_rate": 1.597879055363008e-05, "loss": 1.7492, "step": 960 }, { "epoch": 1.2210927573062262, "grad_norm": 1.494781254547359, "learning_rate": 1.597068499836825e-05, "loss": 1.9116, "step": 961 }, { "epoch": 1.2223634053367216, "grad_norm": 1.2803737503935362, "learning_rate": 1.5962573342656765e-05, "loss": 2.0579, "step": 962 }, { "epoch": 1.2236340533672172, "grad_norm": 1.424002755059869, "learning_rate": 1.5954455594783583e-05, "loss": 1.8386, "step": 963 }, { "epoch": 1.2249047013977128, "grad_norm": 1.3124210633130282, "learning_rate": 1.594633176304287e-05, "loss": 1.8892, "step": 964 }, { "epoch": 1.2261753494282084, "grad_norm": 1.1789841678181265, "learning_rate": 1.5938201855735017e-05, "loss": 1.9728, "step": 965 }, { "epoch": 1.227445997458704, "grad_norm": 1.5207365066909595, "learning_rate": 1.5930065881166633e-05, "loss": 1.8877, "step": 966 }, { "epoch": 1.2287166454891996, "grad_norm": 1.4247222633395513, "learning_rate": 1.592192384765051e-05, "loss": 1.9222, "step": 967 }, { "epoch": 1.2299872935196952, "grad_norm": 1.3398663036299368, "learning_rate": 1.5913775763505637e-05, "loss": 1.9223, "step": 968 }, { "epoch": 1.2312579415501905, "grad_norm": 1.4400845015618418, "learning_rate": 1.590562163705719e-05, "loss": 1.721, "step": 969 }, { "epoch": 1.2325285895806861, "grad_norm": 1.4564048181197873, "learning_rate": 1.589746147663651e-05, "loss": 1.8639, "step": 970 }, { "epoch": 1.2337992376111817, "grad_norm": 1.305706120438892, "learning_rate": 1.588929529058111e-05, "loss": 1.6415, "step": 971 }, { "epoch": 1.2350698856416773, "grad_norm": 1.438336680808511, "learning_rate": 1.588112308723466e-05, "loss": 1.7595, "step": 972 }, { "epoch": 1.236340533672173, "grad_norm": 1.3131659046574775, "learning_rate": 1.5872944874946964e-05, "loss": 1.8001, "step": 973 }, { "epoch": 1.2376111817026683, "grad_norm": 1.5464720716272369, "learning_rate": 1.5864760662073987e-05, "loss": 2.1457, "step": 974 }, { "epoch": 1.2388818297331639, "grad_norm": 1.4378603489197082, "learning_rate": 1.5856570456977813e-05, "loss": 1.5821, "step": 975 }, { "epoch": 1.2401524777636594, "grad_norm": 1.2980626938677702, "learning_rate": 1.5848374268026647e-05, "loss": 1.5928, "step": 976 }, { "epoch": 1.241423125794155, "grad_norm": 1.3960821340170957, "learning_rate": 1.5840172103594814e-05, "loss": 1.8233, "step": 977 }, { "epoch": 1.2426937738246506, "grad_norm": 1.4911685659071707, "learning_rate": 1.5831963972062734e-05, "loss": 1.9979, "step": 978 }, { "epoch": 1.2439644218551462, "grad_norm": 1.6023316647723762, "learning_rate": 1.582374988181694e-05, "loss": 1.9972, "step": 979 }, { "epoch": 1.2452350698856416, "grad_norm": 1.3300409718579196, "learning_rate": 1.581552984125004e-05, "loss": 1.9057, "step": 980 }, { "epoch": 1.2465057179161372, "grad_norm": 1.3795536330193834, "learning_rate": 1.5807303858760727e-05, "loss": 1.7926, "step": 981 }, { "epoch": 1.2477763659466328, "grad_norm": 1.4409025468690164, "learning_rate": 1.5799071942753762e-05, "loss": 1.9829, "step": 982 }, { "epoch": 1.2490470139771284, "grad_norm": 1.3140965455913203, "learning_rate": 1.5790834101639974e-05, "loss": 1.8821, "step": 983 }, { "epoch": 1.250317662007624, "grad_norm": 1.3114225109547808, "learning_rate": 1.578259034383624e-05, "loss": 1.9938, "step": 984 }, { "epoch": 1.2515883100381195, "grad_norm": 1.1795051238190635, "learning_rate": 1.5774340677765483e-05, "loss": 1.715, "step": 985 }, { "epoch": 1.252858958068615, "grad_norm": 1.592645153504347, "learning_rate": 1.5766085111856668e-05, "loss": 1.9455, "step": 986 }, { "epoch": 1.2541296060991105, "grad_norm": 1.2715453295489205, "learning_rate": 1.575782365454478e-05, "loss": 1.8798, "step": 987 }, { "epoch": 1.255400254129606, "grad_norm": 1.28007175664308, "learning_rate": 1.574955631427083e-05, "loss": 1.6589, "step": 988 }, { "epoch": 1.2566709021601017, "grad_norm": 1.343344815115729, "learning_rate": 1.5741283099481842e-05, "loss": 1.7512, "step": 989 }, { "epoch": 1.2579415501905973, "grad_norm": 1.2250258478063394, "learning_rate": 1.5733004018630826e-05, "loss": 1.8518, "step": 990 }, { "epoch": 1.2592121982210926, "grad_norm": 1.3278996410852044, "learning_rate": 1.572471908017681e-05, "loss": 1.9421, "step": 991 }, { "epoch": 1.2604828462515882, "grad_norm": 1.277235069888488, "learning_rate": 1.5716428292584788e-05, "loss": 2.0307, "step": 992 }, { "epoch": 1.2617534942820838, "grad_norm": 1.2367617891006584, "learning_rate": 1.570813166432574e-05, "loss": 1.9464, "step": 993 }, { "epoch": 1.2630241423125794, "grad_norm": 1.3106685413405286, "learning_rate": 1.5699829203876603e-05, "loss": 2.085, "step": 994 }, { "epoch": 1.264294790343075, "grad_norm": 1.3322085260452259, "learning_rate": 1.5691520919720285e-05, "loss": 1.8839, "step": 995 }, { "epoch": 1.2655654383735704, "grad_norm": 1.3242145801398495, "learning_rate": 1.568320682034564e-05, "loss": 1.65, "step": 996 }, { "epoch": 1.2668360864040662, "grad_norm": 1.419805813423947, "learning_rate": 1.5674886914247464e-05, "loss": 2.007, "step": 997 }, { "epoch": 1.2681067344345616, "grad_norm": 1.4366987929669561, "learning_rate": 1.5666561209926484e-05, "loss": 2.0678, "step": 998 }, { "epoch": 1.2693773824650572, "grad_norm": 1.4758663789147342, "learning_rate": 1.5658229715889345e-05, "loss": 1.9956, "step": 999 }, { "epoch": 1.2706480304955527, "grad_norm": 1.3717768991812873, "learning_rate": 1.5649892440648625e-05, "loss": 1.9548, "step": 1000 }, { "epoch": 1.2719186785260483, "grad_norm": 1.3419808182638524, "learning_rate": 1.5641549392722794e-05, "loss": 2.0226, "step": 1001 }, { "epoch": 1.273189326556544, "grad_norm": 1.3204598751627847, "learning_rate": 1.563320058063622e-05, "loss": 1.9622, "step": 1002 }, { "epoch": 1.2744599745870393, "grad_norm": 1.3348555688151162, "learning_rate": 1.5624846012919176e-05, "loss": 1.863, "step": 1003 }, { "epoch": 1.275730622617535, "grad_norm": 1.2319709711845521, "learning_rate": 1.5616485698107795e-05, "loss": 1.9321, "step": 1004 }, { "epoch": 1.2770012706480305, "grad_norm": 1.304500718931566, "learning_rate": 1.5608119644744094e-05, "loss": 1.8073, "step": 1005 }, { "epoch": 1.278271918678526, "grad_norm": 1.4587969118807587, "learning_rate": 1.5599747861375957e-05, "loss": 1.8367, "step": 1006 }, { "epoch": 1.2795425667090217, "grad_norm": 2.670844257464096, "learning_rate": 1.559137035655711e-05, "loss": 1.9432, "step": 1007 }, { "epoch": 1.280813214739517, "grad_norm": 1.391990361869969, "learning_rate": 1.558298713884713e-05, "loss": 1.8169, "step": 1008 }, { "epoch": 1.2820838627700126, "grad_norm": 1.4730317876459385, "learning_rate": 1.557459821681144e-05, "loss": 1.9452, "step": 1009 }, { "epoch": 1.2833545108005082, "grad_norm": 1.2810624692796522, "learning_rate": 1.5566203599021275e-05, "loss": 1.9344, "step": 1010 }, { "epoch": 1.2846251588310038, "grad_norm": 1.3035625667895672, "learning_rate": 1.5557803294053705e-05, "loss": 1.8236, "step": 1011 }, { "epoch": 1.2858958068614994, "grad_norm": 1.3803171198408561, "learning_rate": 1.5549397310491605e-05, "loss": 1.8784, "step": 1012 }, { "epoch": 1.287166454891995, "grad_norm": 1.5491986721199595, "learning_rate": 1.5540985656923648e-05, "loss": 1.9153, "step": 1013 }, { "epoch": 1.2884371029224906, "grad_norm": 1.4653474934180506, "learning_rate": 1.55325683419443e-05, "loss": 1.7483, "step": 1014 }, { "epoch": 1.289707750952986, "grad_norm": 1.2889672514437487, "learning_rate": 1.5524145374153822e-05, "loss": 1.6976, "step": 1015 }, { "epoch": 1.2909783989834815, "grad_norm": 1.2494387273702594, "learning_rate": 1.5515716762158235e-05, "loss": 1.9428, "step": 1016 }, { "epoch": 1.2922490470139771, "grad_norm": 1.2519364350061895, "learning_rate": 1.5507282514569345e-05, "loss": 1.8044, "step": 1017 }, { "epoch": 1.2935196950444727, "grad_norm": 1.1327543441072128, "learning_rate": 1.5498842640004698e-05, "loss": 1.8495, "step": 1018 }, { "epoch": 1.2947903430749683, "grad_norm": 1.3215774506253692, "learning_rate": 1.54903971470876e-05, "loss": 1.7701, "step": 1019 }, { "epoch": 1.2960609911054637, "grad_norm": 1.2963665937170432, "learning_rate": 1.54819460444471e-05, "loss": 1.8535, "step": 1020 }, { "epoch": 1.2973316391359593, "grad_norm": 1.2794249882878206, "learning_rate": 1.547348934071797e-05, "loss": 1.8859, "step": 1021 }, { "epoch": 1.2986022871664549, "grad_norm": 1.297699172544122, "learning_rate": 1.5465027044540705e-05, "loss": 1.9074, "step": 1022 }, { "epoch": 1.2998729351969505, "grad_norm": 1.3122435056880466, "learning_rate": 1.5456559164561522e-05, "loss": 1.813, "step": 1023 }, { "epoch": 1.301143583227446, "grad_norm": 1.3336015416831544, "learning_rate": 1.5448085709432338e-05, "loss": 1.8336, "step": 1024 }, { "epoch": 1.3024142312579414, "grad_norm": 1.3902805693497766, "learning_rate": 1.5439606687810767e-05, "loss": 1.7061, "step": 1025 }, { "epoch": 1.3036848792884372, "grad_norm": 1.3184469533981749, "learning_rate": 1.5431122108360114e-05, "loss": 1.8433, "step": 1026 }, { "epoch": 1.3049555273189326, "grad_norm": 1.288658892407689, "learning_rate": 1.5422631979749354e-05, "loss": 1.9737, "step": 1027 }, { "epoch": 1.3062261753494282, "grad_norm": 1.2678604780793599, "learning_rate": 1.5414136310653135e-05, "loss": 1.7707, "step": 1028 }, { "epoch": 1.3074968233799238, "grad_norm": 1.2796312807674892, "learning_rate": 1.5405635109751776e-05, "loss": 1.8034, "step": 1029 }, { "epoch": 1.3087674714104194, "grad_norm": 1.3113897898932672, "learning_rate": 1.5397128385731234e-05, "loss": 1.9589, "step": 1030 }, { "epoch": 1.310038119440915, "grad_norm": 1.3528783655661367, "learning_rate": 1.5388616147283116e-05, "loss": 1.9354, "step": 1031 }, { "epoch": 1.3113087674714103, "grad_norm": 1.2717937721137405, "learning_rate": 1.538009840310466e-05, "loss": 1.6102, "step": 1032 }, { "epoch": 1.312579415501906, "grad_norm": 1.4365578486864976, "learning_rate": 1.537157516189874e-05, "loss": 1.6837, "step": 1033 }, { "epoch": 1.3138500635324015, "grad_norm": 1.210272849766323, "learning_rate": 1.5363046432373824e-05, "loss": 1.6498, "step": 1034 }, { "epoch": 1.3151207115628971, "grad_norm": 1.22942762191297, "learning_rate": 1.5354512223244017e-05, "loss": 1.9632, "step": 1035 }, { "epoch": 1.3163913595933927, "grad_norm": 1.4829589605126763, "learning_rate": 1.5345972543229e-05, "loss": 1.7339, "step": 1036 }, { "epoch": 1.317662007623888, "grad_norm": 1.1986964294583307, "learning_rate": 1.533742740105405e-05, "loss": 1.7729, "step": 1037 }, { "epoch": 1.3189326556543837, "grad_norm": 1.3595188028232712, "learning_rate": 1.532887680545003e-05, "loss": 1.7879, "step": 1038 }, { "epoch": 1.3202033036848793, "grad_norm": 1.3754660533164054, "learning_rate": 1.5320320765153367e-05, "loss": 1.8581, "step": 1039 }, { "epoch": 1.3214739517153749, "grad_norm": 1.3128095568502105, "learning_rate": 1.5311759288906058e-05, "loss": 1.9147, "step": 1040 }, { "epoch": 1.3227445997458704, "grad_norm": 1.2909869827817089, "learning_rate": 1.5303192385455652e-05, "loss": 1.9877, "step": 1041 }, { "epoch": 1.3240152477763658, "grad_norm": 1.276703618063512, "learning_rate": 1.529462006355524e-05, "loss": 1.8402, "step": 1042 }, { "epoch": 1.3252858958068616, "grad_norm": 1.3230638250924585, "learning_rate": 1.528604233196345e-05, "loss": 1.9794, "step": 1043 }, { "epoch": 1.326556543837357, "grad_norm": 1.4336921448073396, "learning_rate": 1.5277459199444443e-05, "loss": 1.9894, "step": 1044 }, { "epoch": 1.3278271918678526, "grad_norm": 1.7737486792928339, "learning_rate": 1.5268870674767896e-05, "loss": 1.8742, "step": 1045 }, { "epoch": 1.3290978398983482, "grad_norm": 1.4078166067251785, "learning_rate": 1.5260276766708984e-05, "loss": 2.0651, "step": 1046 }, { "epoch": 1.3303684879288438, "grad_norm": 1.3665398516482419, "learning_rate": 1.52516774840484e-05, "loss": 2.0082, "step": 1047 }, { "epoch": 1.3316391359593394, "grad_norm": 1.2660456306800727, "learning_rate": 1.5243072835572319e-05, "loss": 2.0152, "step": 1048 }, { "epoch": 1.3329097839898347, "grad_norm": 1.4400191027082891, "learning_rate": 1.5234462830072399e-05, "loss": 1.8645, "step": 1049 }, { "epoch": 1.3341804320203303, "grad_norm": 1.2665509048717487, "learning_rate": 1.522584747634577e-05, "loss": 1.8708, "step": 1050 }, { "epoch": 1.335451080050826, "grad_norm": 1.260635175489598, "learning_rate": 1.5217226783195029e-05, "loss": 1.8302, "step": 1051 }, { "epoch": 1.3367217280813215, "grad_norm": 1.2921682665430776, "learning_rate": 1.5208600759428233e-05, "loss": 1.8964, "step": 1052 }, { "epoch": 1.337992376111817, "grad_norm": 1.1808646501636042, "learning_rate": 1.5199969413858877e-05, "loss": 1.8027, "step": 1053 }, { "epoch": 1.3392630241423125, "grad_norm": 1.2375925259174092, "learning_rate": 1.5191332755305897e-05, "loss": 1.9994, "step": 1054 }, { "epoch": 1.340533672172808, "grad_norm": 1.3342416016936127, "learning_rate": 1.5182690792593659e-05, "loss": 1.8661, "step": 1055 }, { "epoch": 1.3418043202033036, "grad_norm": 1.1515642847973244, "learning_rate": 1.517404353455194e-05, "loss": 1.7801, "step": 1056 }, { "epoch": 1.3430749682337992, "grad_norm": 1.1481938432677656, "learning_rate": 1.5165390990015947e-05, "loss": 1.708, "step": 1057 }, { "epoch": 1.3443456162642948, "grad_norm": 1.2206968911997678, "learning_rate": 1.5156733167826265e-05, "loss": 1.8242, "step": 1058 }, { "epoch": 1.3456162642947904, "grad_norm": 1.3879378412492531, "learning_rate": 1.5148070076828885e-05, "loss": 2.0539, "step": 1059 }, { "epoch": 1.346886912325286, "grad_norm": 1.278359228270404, "learning_rate": 1.513940172587518e-05, "loss": 1.7368, "step": 1060 }, { "epoch": 1.3481575603557814, "grad_norm": 1.2881471856222333, "learning_rate": 1.5130728123821898e-05, "loss": 1.8111, "step": 1061 }, { "epoch": 1.349428208386277, "grad_norm": 1.3135644893277756, "learning_rate": 1.5122049279531143e-05, "loss": 1.8608, "step": 1062 }, { "epoch": 1.3506988564167726, "grad_norm": 1.2408858052927654, "learning_rate": 1.5113365201870388e-05, "loss": 1.9712, "step": 1063 }, { "epoch": 1.3519695044472682, "grad_norm": 1.4792483560642713, "learning_rate": 1.5104675899712447e-05, "loss": 1.8097, "step": 1064 }, { "epoch": 1.3532401524777637, "grad_norm": 1.7074901287628397, "learning_rate": 1.5095981381935468e-05, "loss": 2.0828, "step": 1065 }, { "epoch": 1.3545108005082591, "grad_norm": 1.4288502131269385, "learning_rate": 1.5087281657422935e-05, "loss": 1.8427, "step": 1066 }, { "epoch": 1.3557814485387547, "grad_norm": 1.4385655658495924, "learning_rate": 1.5078576735063646e-05, "loss": 1.5808, "step": 1067 }, { "epoch": 1.3570520965692503, "grad_norm": 1.285975158006785, "learning_rate": 1.5069866623751718e-05, "loss": 1.8827, "step": 1068 }, { "epoch": 1.358322744599746, "grad_norm": 1.3164134269798404, "learning_rate": 1.5061151332386565e-05, "loss": 1.9938, "step": 1069 }, { "epoch": 1.3595933926302415, "grad_norm": 1.4052341123659684, "learning_rate": 1.5052430869872888e-05, "loss": 1.9635, "step": 1070 }, { "epoch": 1.3608640406607369, "grad_norm": 1.4081093932218003, "learning_rate": 1.504370524512068e-05, "loss": 1.8709, "step": 1071 }, { "epoch": 1.3621346886912327, "grad_norm": 1.2520645518775244, "learning_rate": 1.50349744670452e-05, "loss": 1.7629, "step": 1072 }, { "epoch": 1.363405336721728, "grad_norm": 1.4543344242633094, "learning_rate": 1.5026238544566986e-05, "loss": 2.0301, "step": 1073 }, { "epoch": 1.3646759847522236, "grad_norm": 1.3253506262354477, "learning_rate": 1.501749748661182e-05, "loss": 1.8146, "step": 1074 }, { "epoch": 1.3659466327827192, "grad_norm": 1.5603060746011057, "learning_rate": 1.5008751302110738e-05, "loss": 1.8916, "step": 1075 }, { "epoch": 1.3672172808132148, "grad_norm": 1.2963408122032392, "learning_rate": 1.5000000000000002e-05, "loss": 1.6603, "step": 1076 }, { "epoch": 1.3684879288437104, "grad_norm": 29.216318101427753, "learning_rate": 1.4991243589221118e-05, "loss": 1.8385, "step": 1077 }, { "epoch": 1.3697585768742058, "grad_norm": 1.281414727075171, "learning_rate": 1.4982482078720808e-05, "loss": 1.605, "step": 1078 }, { "epoch": 1.3710292249047014, "grad_norm": 1.4359392726360947, "learning_rate": 1.4973715477450996e-05, "loss": 1.8942, "step": 1079 }, { "epoch": 1.372299872935197, "grad_norm": 1.3770086129527501, "learning_rate": 1.4964943794368815e-05, "loss": 1.8196, "step": 1080 }, { "epoch": 1.3735705209656925, "grad_norm": 1.1848449267963983, "learning_rate": 1.4956167038436594e-05, "loss": 1.8403, "step": 1081 }, { "epoch": 1.3748411689961881, "grad_norm": 1.271521238883279, "learning_rate": 1.4947385218621832e-05, "loss": 1.8443, "step": 1082 }, { "epoch": 1.3761118170266835, "grad_norm": 1.1738548733510952, "learning_rate": 1.4938598343897215e-05, "loss": 1.7652, "step": 1083 }, { "epoch": 1.377382465057179, "grad_norm": 1.4836826510060177, "learning_rate": 1.4929806423240582e-05, "loss": 1.8507, "step": 1084 }, { "epoch": 1.3786531130876747, "grad_norm": 1.358512949766454, "learning_rate": 1.4921009465634941e-05, "loss": 2.0604, "step": 1085 }, { "epoch": 1.3799237611181703, "grad_norm": 1.2784656221631014, "learning_rate": 1.4912207480068437e-05, "loss": 2.0367, "step": 1086 }, { "epoch": 1.3811944091486659, "grad_norm": 1.4821834722243332, "learning_rate": 1.4903400475534355e-05, "loss": 1.8966, "step": 1087 }, { "epoch": 1.3824650571791612, "grad_norm": 1.3037716958740244, "learning_rate": 1.4894588461031107e-05, "loss": 1.9964, "step": 1088 }, { "epoch": 1.383735705209657, "grad_norm": 1.2493808686091994, "learning_rate": 1.4885771445562225e-05, "loss": 1.9861, "step": 1089 }, { "epoch": 1.3850063532401524, "grad_norm": 1.3612017402957912, "learning_rate": 1.4876949438136348e-05, "loss": 2.021, "step": 1090 }, { "epoch": 1.386277001270648, "grad_norm": 1.1468990675471178, "learning_rate": 1.486812244776722e-05, "loss": 1.6981, "step": 1091 }, { "epoch": 1.3875476493011436, "grad_norm": 1.2227722381131463, "learning_rate": 1.4859290483473671e-05, "loss": 1.7964, "step": 1092 }, { "epoch": 1.3888182973316392, "grad_norm": 1.1223438081355355, "learning_rate": 1.4850453554279622e-05, "loss": 1.835, "step": 1093 }, { "epoch": 1.3900889453621348, "grad_norm": 1.2681547197998326, "learning_rate": 1.4841611669214056e-05, "loss": 2.0526, "step": 1094 }, { "epoch": 1.3913595933926302, "grad_norm": 1.2400725181931955, "learning_rate": 1.4832764837311026e-05, "loss": 1.9443, "step": 1095 }, { "epoch": 1.3926302414231257, "grad_norm": 1.2268627779535064, "learning_rate": 1.4823913067609639e-05, "loss": 1.9066, "step": 1096 }, { "epoch": 1.3939008894536213, "grad_norm": 1.3479500429031042, "learning_rate": 1.4815056369154039e-05, "loss": 1.7896, "step": 1097 }, { "epoch": 1.395171537484117, "grad_norm": 1.1534860117987928, "learning_rate": 1.4806194750993422e-05, "loss": 1.8786, "step": 1098 }, { "epoch": 1.3964421855146125, "grad_norm": 1.1962912803044135, "learning_rate": 1.4797328222181995e-05, "loss": 1.9501, "step": 1099 }, { "epoch": 1.397712833545108, "grad_norm": 1.3054107727293938, "learning_rate": 1.478845679177899e-05, "loss": 2.0076, "step": 1100 }, { "epoch": 1.3989834815756035, "grad_norm": 1.3366491474516522, "learning_rate": 1.4779580468848647e-05, "loss": 1.8628, "step": 1101 }, { "epoch": 1.400254129606099, "grad_norm": 1.1245658180174034, "learning_rate": 1.4770699262460204e-05, "loss": 1.6988, "step": 1102 }, { "epoch": 1.4015247776365947, "grad_norm": 1.2951297055619024, "learning_rate": 1.4761813181687885e-05, "loss": 1.7491, "step": 1103 }, { "epoch": 1.4027954256670903, "grad_norm": 1.3722443895167817, "learning_rate": 1.47529222356109e-05, "loss": 2.1425, "step": 1104 }, { "epoch": 1.4040660736975858, "grad_norm": 1.4854506056127954, "learning_rate": 1.474402643331343e-05, "loss": 1.8417, "step": 1105 }, { "epoch": 1.4053367217280814, "grad_norm": 1.48203680056898, "learning_rate": 1.4735125783884609e-05, "loss": 1.8387, "step": 1106 }, { "epoch": 1.4066073697585768, "grad_norm": 1.1523601582917917, "learning_rate": 1.4726220296418536e-05, "loss": 1.8076, "step": 1107 }, { "epoch": 1.4078780177890724, "grad_norm": 1.220791922803522, "learning_rate": 1.4717309980014245e-05, "loss": 1.8264, "step": 1108 }, { "epoch": 1.409148665819568, "grad_norm": 1.4844350959579224, "learning_rate": 1.4708394843775704e-05, "loss": 1.8089, "step": 1109 }, { "epoch": 1.4104193138500636, "grad_norm": 1.3676963543110698, "learning_rate": 1.4699474896811809e-05, "loss": 1.8093, "step": 1110 }, { "epoch": 1.4116899618805592, "grad_norm": 1.390402133401193, "learning_rate": 1.4690550148236371e-05, "loss": 1.6331, "step": 1111 }, { "epoch": 1.4129606099110545, "grad_norm": 1.213969912031899, "learning_rate": 1.4681620607168104e-05, "loss": 1.7546, "step": 1112 }, { "epoch": 1.4142312579415501, "grad_norm": 1.2301054744197815, "learning_rate": 1.4672686282730622e-05, "loss": 1.8846, "step": 1113 }, { "epoch": 1.4155019059720457, "grad_norm": 1.322173497693607, "learning_rate": 1.4663747184052425e-05, "loss": 2.0186, "step": 1114 }, { "epoch": 1.4167725540025413, "grad_norm": 1.372166178996649, "learning_rate": 1.4654803320266883e-05, "loss": 1.8133, "step": 1115 }, { "epoch": 1.418043202033037, "grad_norm": 1.355472281521608, "learning_rate": 1.4645854700512254e-05, "loss": 1.7767, "step": 1116 }, { "epoch": 1.4193138500635323, "grad_norm": 1.1852757912386718, "learning_rate": 1.463690133393164e-05, "loss": 1.9112, "step": 1117 }, { "epoch": 1.420584498094028, "grad_norm": 1.4821403273379334, "learning_rate": 1.4627943229672992e-05, "loss": 1.8262, "step": 1118 }, { "epoch": 1.4218551461245235, "grad_norm": 1.2697543939806661, "learning_rate": 1.461898039688911e-05, "loss": 2.0448, "step": 1119 }, { "epoch": 1.423125794155019, "grad_norm": 1.274206462270112, "learning_rate": 1.4610012844737622e-05, "loss": 1.7997, "step": 1120 }, { "epoch": 1.4243964421855146, "grad_norm": 1.3575689355994345, "learning_rate": 1.4601040582380976e-05, "loss": 2.0597, "step": 1121 }, { "epoch": 1.4256670902160102, "grad_norm": 2.138323377849098, "learning_rate": 1.4592063618986439e-05, "loss": 1.7497, "step": 1122 }, { "epoch": 1.4269377382465058, "grad_norm": 1.4614623364414636, "learning_rate": 1.4583081963726068e-05, "loss": 1.7695, "step": 1123 }, { "epoch": 1.4282083862770012, "grad_norm": 1.3726036998687277, "learning_rate": 1.457409562577673e-05, "loss": 1.8282, "step": 1124 }, { "epoch": 1.4294790343074968, "grad_norm": 1.297862988769582, "learning_rate": 1.4565104614320065e-05, "loss": 1.7631, "step": 1125 }, { "epoch": 1.4307496823379924, "grad_norm": 1.2157552056387835, "learning_rate": 1.455610893854249e-05, "loss": 1.8712, "step": 1126 }, { "epoch": 1.432020330368488, "grad_norm": 1.225192364749645, "learning_rate": 1.4547108607635194e-05, "loss": 2.0084, "step": 1127 }, { "epoch": 1.4332909783989836, "grad_norm": 1.4469029193769432, "learning_rate": 1.4538103630794117e-05, "loss": 1.8373, "step": 1128 }, { "epoch": 1.434561626429479, "grad_norm": 1.6695257574356568, "learning_rate": 1.452909401721994e-05, "loss": 2.0349, "step": 1129 }, { "epoch": 1.4358322744599745, "grad_norm": 1.3172507478374025, "learning_rate": 1.45200797761181e-05, "loss": 1.9316, "step": 1130 }, { "epoch": 1.4371029224904701, "grad_norm": 1.2623048899621796, "learning_rate": 1.4511060916698739e-05, "loss": 1.4695, "step": 1131 }, { "epoch": 1.4383735705209657, "grad_norm": 1.7724378270582546, "learning_rate": 1.4502037448176734e-05, "loss": 2.028, "step": 1132 }, { "epoch": 1.4396442185514613, "grad_norm": 1.2047296714294544, "learning_rate": 1.4493009379771667e-05, "loss": 1.8363, "step": 1133 }, { "epoch": 1.4409148665819567, "grad_norm": 1.3014621811645635, "learning_rate": 1.4483976720707817e-05, "loss": 1.7993, "step": 1134 }, { "epoch": 1.4421855146124525, "grad_norm": 1.312615920278082, "learning_rate": 1.4474939480214156e-05, "loss": 1.8531, "step": 1135 }, { "epoch": 1.4434561626429478, "grad_norm": 1.3840101493642059, "learning_rate": 1.446589766752434e-05, "loss": 2.0521, "step": 1136 }, { "epoch": 1.4447268106734434, "grad_norm": 1.3006679273166455, "learning_rate": 1.4456851291876688e-05, "loss": 1.882, "step": 1137 }, { "epoch": 1.445997458703939, "grad_norm": 1.376007968190896, "learning_rate": 1.4447800362514188e-05, "loss": 2.1259, "step": 1138 }, { "epoch": 1.4472681067344346, "grad_norm": 1.5744017178587613, "learning_rate": 1.4438744888684481e-05, "loss": 2.036, "step": 1139 }, { "epoch": 1.4485387547649302, "grad_norm": 1.0898107837156374, "learning_rate": 1.4429684879639848e-05, "loss": 2.0197, "step": 1140 }, { "epoch": 1.4498094027954256, "grad_norm": 1.4478727647608343, "learning_rate": 1.44206203446372e-05, "loss": 1.8264, "step": 1141 }, { "epoch": 1.4510800508259212, "grad_norm": 1.2597561559788928, "learning_rate": 1.4411551292938087e-05, "loss": 1.5697, "step": 1142 }, { "epoch": 1.4523506988564168, "grad_norm": 1.2427984789425064, "learning_rate": 1.4402477733808656e-05, "loss": 2.0338, "step": 1143 }, { "epoch": 1.4536213468869124, "grad_norm": 1.2574876773229697, "learning_rate": 1.4393399676519668e-05, "loss": 1.616, "step": 1144 }, { "epoch": 1.454891994917408, "grad_norm": 1.3019702996517974, "learning_rate": 1.4384317130346484e-05, "loss": 2.0132, "step": 1145 }, { "epoch": 1.4561626429479033, "grad_norm": 1.2706570967585542, "learning_rate": 1.4375230104569044e-05, "loss": 1.9072, "step": 1146 }, { "epoch": 1.457433290978399, "grad_norm": 1.369054262158506, "learning_rate": 1.436613860847187e-05, "loss": 1.8305, "step": 1147 }, { "epoch": 1.4587039390088945, "grad_norm": 1.2389341173958066, "learning_rate": 1.4357042651344047e-05, "loss": 1.8479, "step": 1148 }, { "epoch": 1.45997458703939, "grad_norm": 1.2176996548775592, "learning_rate": 1.4347942242479217e-05, "loss": 1.9907, "step": 1149 }, { "epoch": 1.4612452350698857, "grad_norm": 1.3531336555921543, "learning_rate": 1.4338837391175582e-05, "loss": 1.9116, "step": 1150 }, { "epoch": 1.4625158831003813, "grad_norm": 1.2600570697403215, "learning_rate": 1.432972810673587e-05, "loss": 1.8925, "step": 1151 }, { "epoch": 1.4637865311308769, "grad_norm": 1.2661028151416078, "learning_rate": 1.4320614398467342e-05, "loss": 1.8575, "step": 1152 }, { "epoch": 1.4650571791613722, "grad_norm": 1.3397535457153715, "learning_rate": 1.4311496275681785e-05, "loss": 2.005, "step": 1153 }, { "epoch": 1.4663278271918678, "grad_norm": 1.2929579386821513, "learning_rate": 1.4302373747695488e-05, "loss": 2.0029, "step": 1154 }, { "epoch": 1.4675984752223634, "grad_norm": 1.246952940681954, "learning_rate": 1.4293246823829242e-05, "loss": 1.8672, "step": 1155 }, { "epoch": 1.468869123252859, "grad_norm": 1.23098474049904, "learning_rate": 1.4284115513408337e-05, "loss": 1.8379, "step": 1156 }, { "epoch": 1.4701397712833546, "grad_norm": 1.1801976280115345, "learning_rate": 1.4274979825762541e-05, "loss": 1.7649, "step": 1157 }, { "epoch": 1.47141041931385, "grad_norm": 1.5075282662210534, "learning_rate": 1.4265839770226087e-05, "loss": 1.794, "step": 1158 }, { "epoch": 1.4726810673443456, "grad_norm": 1.2562664740051244, "learning_rate": 1.4256695356137683e-05, "loss": 1.6773, "step": 1159 }, { "epoch": 1.4739517153748412, "grad_norm": 1.3319817163409906, "learning_rate": 1.424754659284048e-05, "loss": 1.8983, "step": 1160 }, { "epoch": 1.4752223634053367, "grad_norm": 1.3009487967372897, "learning_rate": 1.4238393489682078e-05, "loss": 1.4876, "step": 1161 }, { "epoch": 1.4764930114358323, "grad_norm": 1.4381420793797635, "learning_rate": 1.4229236056014517e-05, "loss": 2.1053, "step": 1162 }, { "epoch": 1.4777636594663277, "grad_norm": 1.4382466575045552, "learning_rate": 1.4220074301194244e-05, "loss": 1.6265, "step": 1163 }, { "epoch": 1.4790343074968233, "grad_norm": 1.3299729844702393, "learning_rate": 1.4210908234582141e-05, "loss": 1.8122, "step": 1164 }, { "epoch": 1.4803049555273189, "grad_norm": 1.498254150158053, "learning_rate": 1.4201737865543481e-05, "loss": 2.1078, "step": 1165 }, { "epoch": 1.4815756035578145, "grad_norm": 1.2037816133533588, "learning_rate": 1.4192563203447941e-05, "loss": 1.975, "step": 1166 }, { "epoch": 1.48284625158831, "grad_norm": 1.0785324145030633, "learning_rate": 1.418338425766958e-05, "loss": 1.4234, "step": 1167 }, { "epoch": 1.4841168996188057, "grad_norm": 1.1533361400572302, "learning_rate": 1.4174201037586841e-05, "loss": 1.85, "step": 1168 }, { "epoch": 1.4853875476493013, "grad_norm": 1.188900432669104, "learning_rate": 1.416501355258252e-05, "loss": 1.8593, "step": 1169 }, { "epoch": 1.4866581956797966, "grad_norm": 1.3221956875383332, "learning_rate": 1.4155821812043787e-05, "loss": 2.0455, "step": 1170 }, { "epoch": 1.4879288437102922, "grad_norm": 1.2783010187174453, "learning_rate": 1.4146625825362147e-05, "loss": 1.8903, "step": 1171 }, { "epoch": 1.4891994917407878, "grad_norm": 1.203190732721331, "learning_rate": 1.4137425601933457e-05, "loss": 2.0258, "step": 1172 }, { "epoch": 1.4904701397712834, "grad_norm": 1.464676328426507, "learning_rate": 1.4128221151157882e-05, "loss": 2.0596, "step": 1173 }, { "epoch": 1.491740787801779, "grad_norm": 10.1925542723116, "learning_rate": 1.4119012482439929e-05, "loss": 1.6295, "step": 1174 }, { "epoch": 1.4930114358322744, "grad_norm": 1.3805107030458463, "learning_rate": 1.41097996051884e-05, "loss": 1.9646, "step": 1175 }, { "epoch": 1.49428208386277, "grad_norm": 1.5218594454266174, "learning_rate": 1.4100582528816404e-05, "loss": 1.7224, "step": 1176 }, { "epoch": 1.4955527318932655, "grad_norm": 1.386172494199785, "learning_rate": 1.4091361262741337e-05, "loss": 1.9983, "step": 1177 }, { "epoch": 1.4968233799237611, "grad_norm": 1.2242855036178195, "learning_rate": 1.4082135816384877e-05, "loss": 2.078, "step": 1178 }, { "epoch": 1.4980940279542567, "grad_norm": 1.2570037053181151, "learning_rate": 1.4072906199172969e-05, "loss": 1.7809, "step": 1179 }, { "epoch": 1.499364675984752, "grad_norm": 1.4758796109715635, "learning_rate": 1.406367242053583e-05, "loss": 1.8389, "step": 1180 }, { "epoch": 1.500635324015248, "grad_norm": 1.395166479758458, "learning_rate": 1.4054434489907916e-05, "loss": 1.7727, "step": 1181 }, { "epoch": 1.5019059720457433, "grad_norm": 1.4118358945476996, "learning_rate": 1.4045192416727937e-05, "loss": 1.7145, "step": 1182 }, { "epoch": 1.5031766200762389, "grad_norm": 1.2784623059267264, "learning_rate": 1.4035946210438827e-05, "loss": 2.0701, "step": 1183 }, { "epoch": 1.5044472681067345, "grad_norm": 1.4109413545265437, "learning_rate": 1.4026695880487744e-05, "loss": 1.984, "step": 1184 }, { "epoch": 1.5057179161372298, "grad_norm": 1.2849466440320692, "learning_rate": 1.4017441436326063e-05, "loss": 1.8814, "step": 1185 }, { "epoch": 1.5069885641677256, "grad_norm": 1.4766606508276032, "learning_rate": 1.4008182887409363e-05, "loss": 2.0233, "step": 1186 }, { "epoch": 1.508259212198221, "grad_norm": 1.463036495890669, "learning_rate": 1.3998920243197408e-05, "loss": 2.0958, "step": 1187 }, { "epoch": 1.5095298602287166, "grad_norm": 1.384808363410981, "learning_rate": 1.3989653513154165e-05, "loss": 1.7643, "step": 1188 }, { "epoch": 1.5108005082592122, "grad_norm": 1.4283450082814748, "learning_rate": 1.3980382706747752e-05, "loss": 2.1955, "step": 1189 }, { "epoch": 1.5120711562897078, "grad_norm": 1.2690160617888337, "learning_rate": 1.397110783345047e-05, "loss": 1.7904, "step": 1190 }, { "epoch": 1.5133418043202034, "grad_norm": 1.285380785106247, "learning_rate": 1.3961828902738768e-05, "loss": 1.8982, "step": 1191 }, { "epoch": 1.5146124523506987, "grad_norm": 1.4046160448580636, "learning_rate": 1.3952545924093239e-05, "loss": 1.9344, "step": 1192 }, { "epoch": 1.5158831003811946, "grad_norm": 1.217728050346088, "learning_rate": 1.3943258906998615e-05, "loss": 1.9526, "step": 1193 }, { "epoch": 1.51715374841169, "grad_norm": 1.3155374681124927, "learning_rate": 1.393396786094376e-05, "loss": 1.9693, "step": 1194 }, { "epoch": 1.5184243964421855, "grad_norm": 1.2375353252029027, "learning_rate": 1.3924672795421638e-05, "loss": 1.9235, "step": 1195 }, { "epoch": 1.5196950444726811, "grad_norm": 1.196859858514443, "learning_rate": 1.391537371992934e-05, "loss": 1.9201, "step": 1196 }, { "epoch": 1.5209656925031765, "grad_norm": 1.5229479940500787, "learning_rate": 1.3906070643968035e-05, "loss": 1.8756, "step": 1197 }, { "epoch": 1.5222363405336723, "grad_norm": 1.2629206956366628, "learning_rate": 1.3896763577042995e-05, "loss": 1.9048, "step": 1198 }, { "epoch": 1.5235069885641677, "grad_norm": 1.7258718732318938, "learning_rate": 1.3887452528663558e-05, "loss": 2.0766, "step": 1199 }, { "epoch": 1.5247776365946633, "grad_norm": 1.4627023384109283, "learning_rate": 1.3878137508343143e-05, "loss": 1.9649, "step": 1200 }, { "epoch": 1.5260482846251588, "grad_norm": 1.222588419713132, "learning_rate": 1.3868818525599215e-05, "loss": 2.0395, "step": 1201 }, { "epoch": 1.5273189326556544, "grad_norm": 1.349113525510833, "learning_rate": 1.3859495589953289e-05, "loss": 1.8303, "step": 1202 }, { "epoch": 1.52858958068615, "grad_norm": 2.981849494497899, "learning_rate": 1.3850168710930927e-05, "loss": 1.9033, "step": 1203 }, { "epoch": 1.5298602287166454, "grad_norm": 1.3647918261639207, "learning_rate": 1.3840837898061711e-05, "loss": 1.6911, "step": 1204 }, { "epoch": 1.531130876747141, "grad_norm": 1.2208582908937131, "learning_rate": 1.3831503160879249e-05, "loss": 1.6361, "step": 1205 }, { "epoch": 1.5324015247776366, "grad_norm": 1.2583667151308502, "learning_rate": 1.3822164508921157e-05, "loss": 1.7389, "step": 1206 }, { "epoch": 1.5336721728081322, "grad_norm": 1.4805538938186564, "learning_rate": 1.3812821951729044e-05, "loss": 1.7825, "step": 1207 }, { "epoch": 1.5349428208386278, "grad_norm": 1.3870334698484725, "learning_rate": 1.3803475498848522e-05, "loss": 1.8398, "step": 1208 }, { "epoch": 1.5362134688691231, "grad_norm": 1.3706103550947548, "learning_rate": 1.3794125159829173e-05, "loss": 1.93, "step": 1209 }, { "epoch": 1.537484116899619, "grad_norm": 1.5719797919516998, "learning_rate": 1.378477094422455e-05, "loss": 1.8066, "step": 1210 }, { "epoch": 1.5387547649301143, "grad_norm": 1.4245514066888003, "learning_rate": 1.3775412861592175e-05, "loss": 1.91, "step": 1211 }, { "epoch": 1.54002541296061, "grad_norm": 1.5978689392490961, "learning_rate": 1.3766050921493513e-05, "loss": 1.9467, "step": 1212 }, { "epoch": 1.5412960609911055, "grad_norm": 1.147772260129036, "learning_rate": 1.375668513349397e-05, "loss": 2.0117, "step": 1213 }, { "epoch": 1.5425667090216009, "grad_norm": 1.2286312283945457, "learning_rate": 1.3747315507162892e-05, "loss": 1.929, "step": 1214 }, { "epoch": 1.5438373570520967, "grad_norm": 1.179000137924479, "learning_rate": 1.373794205207354e-05, "loss": 1.8902, "step": 1215 }, { "epoch": 1.545108005082592, "grad_norm": 1.248366462875927, "learning_rate": 1.3728564777803089e-05, "loss": 1.8247, "step": 1216 }, { "epoch": 1.5463786531130876, "grad_norm": 1.4648780876266096, "learning_rate": 1.371918369393261e-05, "loss": 1.8984, "step": 1217 }, { "epoch": 1.5476493011435832, "grad_norm": 1.6506309397732746, "learning_rate": 1.3709798810047079e-05, "loss": 1.872, "step": 1218 }, { "epoch": 1.5489199491740788, "grad_norm": 1.2165902320177087, "learning_rate": 1.370041013573534e-05, "loss": 1.7121, "step": 1219 }, { "epoch": 1.5501905972045744, "grad_norm": 1.397563042124808, "learning_rate": 1.3691017680590126e-05, "loss": 2.0396, "step": 1220 }, { "epoch": 1.5514612452350698, "grad_norm": 1.2637963882307408, "learning_rate": 1.3681621454208017e-05, "loss": 1.7806, "step": 1221 }, { "epoch": 1.5527318932655656, "grad_norm": 1.2003710827084932, "learning_rate": 1.3672221466189457e-05, "loss": 1.7892, "step": 1222 }, { "epoch": 1.554002541296061, "grad_norm": 1.1875218869022044, "learning_rate": 1.3662817726138729e-05, "loss": 1.7981, "step": 1223 }, { "epoch": 1.5552731893265566, "grad_norm": 1.2566514736815448, "learning_rate": 1.3653410243663953e-05, "loss": 2.0253, "step": 1224 }, { "epoch": 1.5565438373570522, "grad_norm": 1.384730420879102, "learning_rate": 1.3643999028377065e-05, "loss": 1.9198, "step": 1225 }, { "epoch": 1.5578144853875475, "grad_norm": 1.2313997374157073, "learning_rate": 1.3634584089893826e-05, "loss": 1.9868, "step": 1226 }, { "epoch": 1.5590851334180433, "grad_norm": 1.1639822831560722, "learning_rate": 1.3625165437833787e-05, "loss": 1.8653, "step": 1227 }, { "epoch": 1.5603557814485387, "grad_norm": 1.3401076066880915, "learning_rate": 1.361574308182031e-05, "loss": 1.9842, "step": 1228 }, { "epoch": 1.5616264294790343, "grad_norm": 1.272446557869709, "learning_rate": 1.3606317031480529e-05, "loss": 1.787, "step": 1229 }, { "epoch": 1.5628970775095299, "grad_norm": 1.1357843978743838, "learning_rate": 1.359688729644536e-05, "loss": 1.4677, "step": 1230 }, { "epoch": 1.5641677255400253, "grad_norm": 1.4180732475647855, "learning_rate": 1.3587453886349474e-05, "loss": 1.7196, "step": 1231 }, { "epoch": 1.565438373570521, "grad_norm": 1.3772896602327505, "learning_rate": 1.3578016810831311e-05, "loss": 1.9846, "step": 1232 }, { "epoch": 1.5667090216010164, "grad_norm": 1.2727299017038909, "learning_rate": 1.3568576079533043e-05, "loss": 1.7325, "step": 1233 }, { "epoch": 1.567979669631512, "grad_norm": 1.284756489838671, "learning_rate": 1.3559131702100584e-05, "loss": 2.0046, "step": 1234 }, { "epoch": 1.5692503176620076, "grad_norm": 1.121721245316067, "learning_rate": 1.354968368818357e-05, "loss": 1.8843, "step": 1235 }, { "epoch": 1.5705209656925032, "grad_norm": 1.2667770466964212, "learning_rate": 1.354023204743536e-05, "loss": 1.7568, "step": 1236 }, { "epoch": 1.5717916137229988, "grad_norm": 1.539311259281202, "learning_rate": 1.3530776789513009e-05, "loss": 1.8506, "step": 1237 }, { "epoch": 1.5730622617534942, "grad_norm": 1.6379449129945398, "learning_rate": 1.3521317924077275e-05, "loss": 1.6479, "step": 1238 }, { "epoch": 1.57433290978399, "grad_norm": 1.2920503074243581, "learning_rate": 1.3511855460792593e-05, "loss": 1.9804, "step": 1239 }, { "epoch": 1.5756035578144854, "grad_norm": 1.2389856953911573, "learning_rate": 1.3502389409327087e-05, "loss": 1.9727, "step": 1240 }, { "epoch": 1.576874205844981, "grad_norm": 1.3950063040745804, "learning_rate": 1.3492919779352536e-05, "loss": 2.054, "step": 1241 }, { "epoch": 1.5781448538754765, "grad_norm": 1.310297987074961, "learning_rate": 1.348344658054438e-05, "loss": 2.0237, "step": 1242 }, { "epoch": 1.579415501905972, "grad_norm": 1.2925087100054253, "learning_rate": 1.3473969822581707e-05, "loss": 2.0117, "step": 1243 }, { "epoch": 1.5806861499364677, "grad_norm": 1.3105575963221772, "learning_rate": 1.3464489515147239e-05, "loss": 1.9496, "step": 1244 }, { "epoch": 1.581956797966963, "grad_norm": 1.2237031081859495, "learning_rate": 1.3455005667927318e-05, "loss": 2.0285, "step": 1245 }, { "epoch": 1.5832274459974587, "grad_norm": 1.3435645122073598, "learning_rate": 1.3445518290611918e-05, "loss": 1.7926, "step": 1246 }, { "epoch": 1.5844980940279543, "grad_norm": 1.2346080750718063, "learning_rate": 1.343602739289461e-05, "loss": 1.7282, "step": 1247 }, { "epoch": 1.5857687420584496, "grad_norm": 1.2454640590173194, "learning_rate": 1.3426532984472561e-05, "loss": 1.901, "step": 1248 }, { "epoch": 1.5870393900889455, "grad_norm": 1.275365347669394, "learning_rate": 1.3417035075046527e-05, "loss": 1.8574, "step": 1249 }, { "epoch": 1.5883100381194408, "grad_norm": 1.1338251344047352, "learning_rate": 1.3407533674320848e-05, "loss": 1.8699, "step": 1250 }, { "epoch": 1.5895806861499364, "grad_norm": 1.2431631957755682, "learning_rate": 1.3398028792003413e-05, "loss": 1.6381, "step": 1251 }, { "epoch": 1.590851334180432, "grad_norm": 1.4132487408887402, "learning_rate": 1.338852043780569e-05, "loss": 1.7747, "step": 1252 }, { "epoch": 1.5921219822109276, "grad_norm": 1.6491888618473156, "learning_rate": 1.337900862144268e-05, "loss": 1.9426, "step": 1253 }, { "epoch": 1.5933926302414232, "grad_norm": 1.3622479954441118, "learning_rate": 1.3369493352632925e-05, "loss": 2.0793, "step": 1254 }, { "epoch": 1.5946632782719186, "grad_norm": 1.192420588196507, "learning_rate": 1.3359974641098497e-05, "loss": 1.8084, "step": 1255 }, { "epoch": 1.5959339263024144, "grad_norm": 1.2797006019475736, "learning_rate": 1.3350452496564985e-05, "loss": 1.9092, "step": 1256 }, { "epoch": 1.5972045743329097, "grad_norm": 1.2755999292541016, "learning_rate": 1.3340926928761477e-05, "loss": 1.8576, "step": 1257 }, { "epoch": 1.5984752223634053, "grad_norm": 1.2395072504037918, "learning_rate": 1.3331397947420578e-05, "loss": 1.7841, "step": 1258 }, { "epoch": 1.599745870393901, "grad_norm": 1.296854658829394, "learning_rate": 1.332186556227836e-05, "loss": 1.8771, "step": 1259 }, { "epoch": 1.6010165184243963, "grad_norm": 1.1164654884285337, "learning_rate": 1.3312329783074383e-05, "loss": 2.0762, "step": 1260 }, { "epoch": 1.602287166454892, "grad_norm": 1.2093316079814684, "learning_rate": 1.3302790619551673e-05, "loss": 1.8883, "step": 1261 }, { "epoch": 1.6035578144853875, "grad_norm": 1.3153570486567234, "learning_rate": 1.3293248081456717e-05, "loss": 1.9228, "step": 1262 }, { "epoch": 1.604828462515883, "grad_norm": 1.7801035961064582, "learning_rate": 1.3283702178539441e-05, "loss": 1.6673, "step": 1263 }, { "epoch": 1.6060991105463787, "grad_norm": 1.2602038214603772, "learning_rate": 1.3274152920553225e-05, "loss": 1.9318, "step": 1264 }, { "epoch": 1.6073697585768743, "grad_norm": 1.3031972530092082, "learning_rate": 1.3264600317254854e-05, "loss": 1.6836, "step": 1265 }, { "epoch": 1.6086404066073698, "grad_norm": 1.4977716352782071, "learning_rate": 1.3255044378404557e-05, "loss": 2.0444, "step": 1266 }, { "epoch": 1.6099110546378652, "grad_norm": 1.33666349801072, "learning_rate": 1.3245485113765952e-05, "loss": 1.8835, "step": 1267 }, { "epoch": 1.611181702668361, "grad_norm": 55.6034937167438, "learning_rate": 1.323592253310606e-05, "loss": 2.0328, "step": 1268 }, { "epoch": 1.6124523506988564, "grad_norm": 1.3049852476469035, "learning_rate": 1.3226356646195293e-05, "loss": 2.0586, "step": 1269 }, { "epoch": 1.613722998729352, "grad_norm": 1.2839308854301152, "learning_rate": 1.3216787462807442e-05, "loss": 2.0095, "step": 1270 }, { "epoch": 1.6149936467598476, "grad_norm": 1.2078234396212837, "learning_rate": 1.3207214992719654e-05, "loss": 1.7023, "step": 1271 }, { "epoch": 1.616264294790343, "grad_norm": 1.0942474151382433, "learning_rate": 1.3197639245712454e-05, "loss": 2.0584, "step": 1272 }, { "epoch": 1.6175349428208388, "grad_norm": 1.640743926625236, "learning_rate": 1.3188060231569701e-05, "loss": 2.0192, "step": 1273 }, { "epoch": 1.6188055908513341, "grad_norm": 1.445559822354054, "learning_rate": 1.3178477960078594e-05, "loss": 1.8092, "step": 1274 }, { "epoch": 1.6200762388818297, "grad_norm": 1.2699367663057821, "learning_rate": 1.3168892441029666e-05, "loss": 1.6643, "step": 1275 }, { "epoch": 1.6213468869123253, "grad_norm": 1.2961056322093205, "learning_rate": 1.3159303684216761e-05, "loss": 2.0516, "step": 1276 }, { "epoch": 1.6226175349428207, "grad_norm": 1.328661586517937, "learning_rate": 1.3149711699437035e-05, "loss": 1.9553, "step": 1277 }, { "epoch": 1.6238881829733165, "grad_norm": 1.5570126645670899, "learning_rate": 1.3140116496490944e-05, "loss": 1.8117, "step": 1278 }, { "epoch": 1.6251588310038119, "grad_norm": 1.232675245720794, "learning_rate": 1.3130518085182224e-05, "loss": 1.8473, "step": 1279 }, { "epoch": 1.6264294790343075, "grad_norm": 1.639625976170505, "learning_rate": 1.31209164753179e-05, "loss": 1.7237, "step": 1280 }, { "epoch": 1.627700127064803, "grad_norm": 1.3365143272845428, "learning_rate": 1.3111311676708256e-05, "loss": 1.7455, "step": 1281 }, { "epoch": 1.6289707750952986, "grad_norm": 1.4841371892966886, "learning_rate": 1.3101703699166843e-05, "loss": 1.8929, "step": 1282 }, { "epoch": 1.6302414231257942, "grad_norm": 1.194723119658828, "learning_rate": 1.3092092552510445e-05, "loss": 1.7472, "step": 1283 }, { "epoch": 1.6315120711562896, "grad_norm": 1.325696772260916, "learning_rate": 1.3082478246559104e-05, "loss": 1.7873, "step": 1284 }, { "epoch": 1.6327827191867854, "grad_norm": 1.2056417302548919, "learning_rate": 1.3072860791136075e-05, "loss": 1.8021, "step": 1285 }, { "epoch": 1.6340533672172808, "grad_norm": 1.2675138648758477, "learning_rate": 1.3063240196067837e-05, "loss": 1.8093, "step": 1286 }, { "epoch": 1.6353240152477764, "grad_norm": 1.3495984616195633, "learning_rate": 1.3053616471184071e-05, "loss": 1.8553, "step": 1287 }, { "epoch": 1.636594663278272, "grad_norm": 1.9427242076457587, "learning_rate": 1.3043989626317668e-05, "loss": 1.8769, "step": 1288 }, { "epoch": 1.6378653113087673, "grad_norm": 1.5938140030309664, "learning_rate": 1.3034359671304693e-05, "loss": 2.0313, "step": 1289 }, { "epoch": 1.6391359593392631, "grad_norm": 1.2722119373656835, "learning_rate": 1.30247266159844e-05, "loss": 1.946, "step": 1290 }, { "epoch": 1.6404066073697585, "grad_norm": 1.1733976456758377, "learning_rate": 1.3015090470199201e-05, "loss": 2.0665, "step": 1291 }, { "epoch": 1.641677255400254, "grad_norm": 1.3386678697498235, "learning_rate": 1.3005451243794672e-05, "loss": 1.8375, "step": 1292 }, { "epoch": 1.6429479034307497, "grad_norm": 1.3635571022521007, "learning_rate": 1.2995808946619533e-05, "loss": 1.7194, "step": 1293 }, { "epoch": 1.644218551461245, "grad_norm": 1.4234061688686828, "learning_rate": 1.2986163588525646e-05, "loss": 1.851, "step": 1294 }, { "epoch": 1.6454891994917409, "grad_norm": 1.3951660972295525, "learning_rate": 1.2976515179367996e-05, "loss": 2.1171, "step": 1295 }, { "epoch": 1.6467598475222363, "grad_norm": 1.2092866983227297, "learning_rate": 1.2966863729004691e-05, "loss": 1.6927, "step": 1296 }, { "epoch": 1.6480304955527318, "grad_norm": 1.0797174046168765, "learning_rate": 1.2957209247296935e-05, "loss": 2.0332, "step": 1297 }, { "epoch": 1.6493011435832274, "grad_norm": 1.1392147959819172, "learning_rate": 1.2947551744109044e-05, "loss": 1.7436, "step": 1298 }, { "epoch": 1.650571791613723, "grad_norm": 1.1836701182877878, "learning_rate": 1.293789122930841e-05, "loss": 1.9638, "step": 1299 }, { "epoch": 1.6518424396442186, "grad_norm": 1.2257252313691758, "learning_rate": 1.2928227712765504e-05, "loss": 1.9834, "step": 1300 }, { "epoch": 1.653113087674714, "grad_norm": 1.1761208630174838, "learning_rate": 1.2918561204353871e-05, "loss": 1.7215, "step": 1301 }, { "epoch": 1.6543837357052098, "grad_norm": 1.3891650801080222, "learning_rate": 1.2908891713950107e-05, "loss": 1.7344, "step": 1302 }, { "epoch": 1.6556543837357052, "grad_norm": 1.3855957023677499, "learning_rate": 1.2899219251433848e-05, "loss": 1.8464, "step": 1303 }, { "epoch": 1.6569250317662008, "grad_norm": 1.3598192540799567, "learning_rate": 1.2889543826687785e-05, "loss": 1.8313, "step": 1304 }, { "epoch": 1.6581956797966964, "grad_norm": 1.5002056868237887, "learning_rate": 1.2879865449597617e-05, "loss": 1.9405, "step": 1305 }, { "epoch": 1.6594663278271917, "grad_norm": 1.3216736210005633, "learning_rate": 1.287018413005207e-05, "loss": 1.7279, "step": 1306 }, { "epoch": 1.6607369758576875, "grad_norm": 1.3819372523696916, "learning_rate": 1.2860499877942876e-05, "loss": 2.0484, "step": 1307 }, { "epoch": 1.662007623888183, "grad_norm": 1.170834326805149, "learning_rate": 1.2850812703164754e-05, "loss": 1.7203, "step": 1308 }, { "epoch": 1.6632782719186785, "grad_norm": 1.4256686717965028, "learning_rate": 1.2841122615615426e-05, "loss": 1.7459, "step": 1309 }, { "epoch": 1.664548919949174, "grad_norm": 1.6382014723072733, "learning_rate": 1.2831429625195576e-05, "loss": 1.5154, "step": 1310 }, { "epoch": 1.6658195679796697, "grad_norm": 1.2509445085153295, "learning_rate": 1.2821733741808855e-05, "loss": 1.7097, "step": 1311 }, { "epoch": 1.6670902160101653, "grad_norm": 1.3131910081504445, "learning_rate": 1.2812034975361876e-05, "loss": 1.8529, "step": 1312 }, { "epoch": 1.6683608640406606, "grad_norm": 1.2647384119627838, "learning_rate": 1.2802333335764194e-05, "loss": 1.7561, "step": 1313 }, { "epoch": 1.6696315120711565, "grad_norm": 1.52124313252029, "learning_rate": 1.2792628832928302e-05, "loss": 2.0929, "step": 1314 }, { "epoch": 1.6709021601016518, "grad_norm": 1.2844058844721489, "learning_rate": 1.2782921476769616e-05, "loss": 1.9279, "step": 1315 }, { "epoch": 1.6721728081321474, "grad_norm": 1.1525042552706943, "learning_rate": 1.277321127720647e-05, "loss": 1.9666, "step": 1316 }, { "epoch": 1.673443456162643, "grad_norm": 1.5850927229591063, "learning_rate": 1.2763498244160097e-05, "loss": 1.5506, "step": 1317 }, { "epoch": 1.6747141041931384, "grad_norm": 1.478772099784771, "learning_rate": 1.2753782387554633e-05, "loss": 2.1687, "step": 1318 }, { "epoch": 1.6759847522236342, "grad_norm": 1.2500162015385214, "learning_rate": 1.2744063717317094e-05, "loss": 1.5892, "step": 1319 }, { "epoch": 1.6772554002541296, "grad_norm": 1.4520444580272065, "learning_rate": 1.2734342243377376e-05, "loss": 2.0015, "step": 1320 }, { "epoch": 1.6785260482846251, "grad_norm": 1.5010211903038795, "learning_rate": 1.2724617975668229e-05, "loss": 2.1132, "step": 1321 }, { "epoch": 1.6797966963151207, "grad_norm": 1.2386575390441286, "learning_rate": 1.271489092412527e-05, "loss": 1.8858, "step": 1322 }, { "epoch": 1.681067344345616, "grad_norm": 1.2150549039148064, "learning_rate": 1.2705161098686953e-05, "loss": 1.8268, "step": 1323 }, { "epoch": 1.682337992376112, "grad_norm": 1.4354657637645867, "learning_rate": 1.2695428509294567e-05, "loss": 1.7492, "step": 1324 }, { "epoch": 1.6836086404066073, "grad_norm": 1.405427966456244, "learning_rate": 1.2685693165892228e-05, "loss": 1.898, "step": 1325 }, { "epoch": 1.6848792884371029, "grad_norm": 1.311553544198372, "learning_rate": 1.267595507842686e-05, "loss": 1.9387, "step": 1326 }, { "epoch": 1.6861499364675985, "grad_norm": 1.2420356703845903, "learning_rate": 1.2666214256848197e-05, "loss": 1.8164, "step": 1327 }, { "epoch": 1.687420584498094, "grad_norm": 1.3373903023080569, "learning_rate": 1.2656470711108763e-05, "loss": 1.9783, "step": 1328 }, { "epoch": 1.6886912325285897, "grad_norm": 1.4874301493310746, "learning_rate": 1.264672445116387e-05, "loss": 2.0626, "step": 1329 }, { "epoch": 1.689961880559085, "grad_norm": 1.273505177911983, "learning_rate": 1.2636975486971594e-05, "loss": 1.8729, "step": 1330 }, { "epoch": 1.6912325285895808, "grad_norm": 1.7074068671901932, "learning_rate": 1.2627223828492785e-05, "loss": 2.0717, "step": 1331 }, { "epoch": 1.6925031766200762, "grad_norm": 1.4110321983565615, "learning_rate": 1.2617469485691034e-05, "loss": 1.9849, "step": 1332 }, { "epoch": 1.6937738246505718, "grad_norm": 1.143137082547412, "learning_rate": 1.2607712468532688e-05, "loss": 1.9109, "step": 1333 }, { "epoch": 1.6950444726810674, "grad_norm": 1.4170049840764738, "learning_rate": 1.2597952786986813e-05, "loss": 2.086, "step": 1334 }, { "epoch": 1.6963151207115628, "grad_norm": 1.3493025409661936, "learning_rate": 1.2588190451025209e-05, "loss": 1.7331, "step": 1335 }, { "epoch": 1.6975857687420586, "grad_norm": 1.2325439608220607, "learning_rate": 1.257842547062238e-05, "loss": 1.8408, "step": 1336 }, { "epoch": 1.698856416772554, "grad_norm": 1.2903745252099554, "learning_rate": 1.256865785575554e-05, "loss": 1.6788, "step": 1337 }, { "epoch": 1.7001270648030495, "grad_norm": 1.6126834280166753, "learning_rate": 1.255888761640458e-05, "loss": 1.8393, "step": 1338 }, { "epoch": 1.7013977128335451, "grad_norm": 12.132017979711518, "learning_rate": 1.254911476255209e-05, "loss": 1.7632, "step": 1339 }, { "epoch": 1.7026683608640405, "grad_norm": 1.4264111838446671, "learning_rate": 1.253933930418332e-05, "loss": 1.9023, "step": 1340 }, { "epoch": 1.7039390088945363, "grad_norm": 1.3396460965865984, "learning_rate": 1.2529561251286184e-05, "loss": 1.8552, "step": 1341 }, { "epoch": 1.7052096569250317, "grad_norm": 1.8974603880479586, "learning_rate": 1.2519780613851254e-05, "loss": 1.9448, "step": 1342 }, { "epoch": 1.7064803049555273, "grad_norm": 1.2414447550947338, "learning_rate": 1.250999740187173e-05, "loss": 1.8183, "step": 1343 }, { "epoch": 1.7077509529860229, "grad_norm": 1.754129000217617, "learning_rate": 1.2500211625343448e-05, "loss": 1.9146, "step": 1344 }, { "epoch": 1.7090216010165185, "grad_norm": 1.144544650820467, "learning_rate": 1.2490423294264866e-05, "loss": 1.8175, "step": 1345 }, { "epoch": 1.710292249047014, "grad_norm": 1.1845927231500297, "learning_rate": 1.2480632418637054e-05, "loss": 1.8947, "step": 1346 }, { "epoch": 1.7115628970775094, "grad_norm": 1.716408411076265, "learning_rate": 1.2470839008463676e-05, "loss": 2.0708, "step": 1347 }, { "epoch": 1.7128335451080052, "grad_norm": 1.333654976939953, "learning_rate": 1.2461043073750988e-05, "loss": 1.9351, "step": 1348 }, { "epoch": 1.7141041931385006, "grad_norm": 1.7726917448928459, "learning_rate": 1.2451244624507831e-05, "loss": 1.9672, "step": 1349 }, { "epoch": 1.7153748411689962, "grad_norm": 1.4690410407624557, "learning_rate": 1.2441443670745606e-05, "loss": 1.8296, "step": 1350 }, { "epoch": 1.7166454891994918, "grad_norm": 1.41542125842506, "learning_rate": 1.2431640222478275e-05, "loss": 1.7263, "step": 1351 }, { "epoch": 1.7179161372299872, "grad_norm": 1.2083108369431936, "learning_rate": 1.2421834289722354e-05, "loss": 1.8448, "step": 1352 }, { "epoch": 1.719186785260483, "grad_norm": 1.4383951762557672, "learning_rate": 1.2412025882496895e-05, "loss": 1.7514, "step": 1353 }, { "epoch": 1.7204574332909783, "grad_norm": 1.3817843859209873, "learning_rate": 1.2402215010823472e-05, "loss": 1.8088, "step": 1354 }, { "epoch": 1.721728081321474, "grad_norm": 1.3865663226106641, "learning_rate": 1.239240168472619e-05, "loss": 1.7988, "step": 1355 }, { "epoch": 1.7229987293519695, "grad_norm": 1.4096566413292009, "learning_rate": 1.238258591423165e-05, "loss": 1.8322, "step": 1356 }, { "epoch": 1.724269377382465, "grad_norm": 1.4853260723970567, "learning_rate": 1.2372767709368957e-05, "loss": 1.6997, "step": 1357 }, { "epoch": 1.7255400254129607, "grad_norm": 1.3826381702954678, "learning_rate": 1.23629470801697e-05, "loss": 1.9372, "step": 1358 }, { "epoch": 1.726810673443456, "grad_norm": 1.2355594794312026, "learning_rate": 1.2353124036667946e-05, "loss": 1.6416, "step": 1359 }, { "epoch": 1.7280813214739519, "grad_norm": 1.3577984488669015, "learning_rate": 1.2343298588900226e-05, "loss": 1.9556, "step": 1360 }, { "epoch": 1.7293519695044473, "grad_norm": 1.1389770751646968, "learning_rate": 1.2333470746905534e-05, "loss": 1.7399, "step": 1361 }, { "epoch": 1.7306226175349428, "grad_norm": 1.2795159617708947, "learning_rate": 1.2323640520725306e-05, "loss": 1.8617, "step": 1362 }, { "epoch": 1.7318932655654384, "grad_norm": 1.4461540608966716, "learning_rate": 1.2313807920403419e-05, "loss": 1.958, "step": 1363 }, { "epoch": 1.7331639135959338, "grad_norm": 1.397233423826948, "learning_rate": 1.2303972955986161e-05, "loss": 1.7538, "step": 1364 }, { "epoch": 1.7344345616264296, "grad_norm": 1.2889823172447274, "learning_rate": 1.2294135637522254e-05, "loss": 1.8574, "step": 1365 }, { "epoch": 1.735705209656925, "grad_norm": 1.26840599009331, "learning_rate": 1.2284295975062814e-05, "loss": 1.8328, "step": 1366 }, { "epoch": 1.7369758576874206, "grad_norm": 1.3660457927942886, "learning_rate": 1.2274453978661356e-05, "loss": 1.8039, "step": 1367 }, { "epoch": 1.7382465057179162, "grad_norm": 1.0947022560711457, "learning_rate": 1.226460965837378e-05, "loss": 1.896, "step": 1368 }, { "epoch": 1.7395171537484115, "grad_norm": 1.3136869299218366, "learning_rate": 1.225476302425836e-05, "loss": 1.8964, "step": 1369 }, { "epoch": 1.7407878017789074, "grad_norm": 1.404627845671186, "learning_rate": 1.2244914086375726e-05, "loss": 1.7482, "step": 1370 }, { "epoch": 1.7420584498094027, "grad_norm": 1.2443528146690166, "learning_rate": 1.223506285478888e-05, "loss": 1.9536, "step": 1371 }, { "epoch": 1.7433290978398983, "grad_norm": 1.2042931826355405, "learning_rate": 1.2225209339563144e-05, "loss": 1.7529, "step": 1372 }, { "epoch": 1.744599745870394, "grad_norm": 1.263280657296549, "learning_rate": 1.2215353550766197e-05, "loss": 1.9768, "step": 1373 }, { "epoch": 1.7458703939008895, "grad_norm": 1.1766118912224472, "learning_rate": 1.2205495498468025e-05, "loss": 2.0298, "step": 1374 }, { "epoch": 1.747141041931385, "grad_norm": 1.3899210224600584, "learning_rate": 1.219563519274093e-05, "loss": 1.8235, "step": 1375 }, { "epoch": 1.7484116899618805, "grad_norm": 1.2008533872792837, "learning_rate": 1.2185772643659521e-05, "loss": 1.9107, "step": 1376 }, { "epoch": 1.7496823379923763, "grad_norm": 1.0941137156711913, "learning_rate": 1.2175907861300698e-05, "loss": 1.9886, "step": 1377 }, { "epoch": 1.7509529860228716, "grad_norm": 1.2944173746818755, "learning_rate": 1.2166040855743635e-05, "loss": 1.8527, "step": 1378 }, { "epoch": 1.7522236340533672, "grad_norm": 1.2642170229105665, "learning_rate": 1.2156171637069785e-05, "loss": 1.8462, "step": 1379 }, { "epoch": 1.7534942820838628, "grad_norm": 1.2430017289787458, "learning_rate": 1.2146300215362863e-05, "loss": 1.8839, "step": 1380 }, { "epoch": 1.7547649301143582, "grad_norm": 1.2199528498427983, "learning_rate": 1.2136426600708833e-05, "loss": 1.8889, "step": 1381 }, { "epoch": 1.756035578144854, "grad_norm": 1.3984242744552862, "learning_rate": 1.2126550803195895e-05, "loss": 1.7563, "step": 1382 }, { "epoch": 1.7573062261753494, "grad_norm": 1.305331265532622, "learning_rate": 1.2116672832914489e-05, "loss": 2.0087, "step": 1383 }, { "epoch": 1.758576874205845, "grad_norm": 1.2898722634068678, "learning_rate": 1.2106792699957264e-05, "loss": 1.6728, "step": 1384 }, { "epoch": 1.7598475222363406, "grad_norm": 1.5640598557270948, "learning_rate": 1.2096910414419087e-05, "loss": 2.004, "step": 1385 }, { "epoch": 1.761118170266836, "grad_norm": 1.554862105286235, "learning_rate": 1.2087025986397023e-05, "loss": 1.9123, "step": 1386 }, { "epoch": 1.7623888182973317, "grad_norm": 1.2168486180432576, "learning_rate": 1.2077139425990321e-05, "loss": 1.8045, "step": 1387 }, { "epoch": 1.763659466327827, "grad_norm": 1.2331850367414905, "learning_rate": 1.2067250743300414e-05, "loss": 1.8561, "step": 1388 }, { "epoch": 1.7649301143583227, "grad_norm": 1.2716647592623593, "learning_rate": 1.2057359948430903e-05, "loss": 1.7265, "step": 1389 }, { "epoch": 1.7662007623888183, "grad_norm": 1.7712185520839197, "learning_rate": 1.204746705148754e-05, "loss": 1.7379, "step": 1390 }, { "epoch": 1.7674714104193139, "grad_norm": 2.654250532277399, "learning_rate": 1.2037572062578238e-05, "loss": 1.9402, "step": 1391 }, { "epoch": 1.7687420584498095, "grad_norm": 1.2759556317976997, "learning_rate": 1.2027674991813037e-05, "loss": 1.9721, "step": 1392 }, { "epoch": 1.7700127064803048, "grad_norm": 1.1357265349476322, "learning_rate": 1.2017775849304105e-05, "loss": 1.7183, "step": 1393 }, { "epoch": 1.7712833545108007, "grad_norm": 1.362592119463336, "learning_rate": 1.200787464516573e-05, "loss": 1.8925, "step": 1394 }, { "epoch": 1.772554002541296, "grad_norm": 1.3969098719536086, "learning_rate": 1.199797138951431e-05, "loss": 1.8477, "step": 1395 }, { "epoch": 1.7738246505717916, "grad_norm": 1.2586495073757091, "learning_rate": 1.1988066092468325e-05, "loss": 1.6721, "step": 1396 }, { "epoch": 1.7750952986022872, "grad_norm": 1.3661649219281586, "learning_rate": 1.1978158764148358e-05, "loss": 1.824, "step": 1397 }, { "epoch": 1.7763659466327826, "grad_norm": 1.3366780041839457, "learning_rate": 1.1968249414677055e-05, "loss": 2.0305, "step": 1398 }, { "epoch": 1.7776365946632784, "grad_norm": 1.2910755256326734, "learning_rate": 1.1958338054179135e-05, "loss": 1.5409, "step": 1399 }, { "epoch": 1.7789072426937738, "grad_norm": 1.2662512836713595, "learning_rate": 1.1948424692781364e-05, "loss": 1.8672, "step": 1400 }, { "epoch": 1.7801778907242694, "grad_norm": 1.4770845576189542, "learning_rate": 1.1938509340612565e-05, "loss": 2.0235, "step": 1401 }, { "epoch": 1.781448538754765, "grad_norm": 1.3208511318376104, "learning_rate": 1.1928592007803575e-05, "loss": 1.8901, "step": 1402 }, { "epoch": 1.7827191867852605, "grad_norm": 1.504667429298727, "learning_rate": 1.1918672704487275e-05, "loss": 1.7732, "step": 1403 }, { "epoch": 1.7839898348157561, "grad_norm": 1.404356174075473, "learning_rate": 1.1908751440798549e-05, "loss": 1.9319, "step": 1404 }, { "epoch": 1.7852604828462515, "grad_norm": 1.3037826155994945, "learning_rate": 1.1898828226874284e-05, "loss": 1.9241, "step": 1405 }, { "epoch": 1.786531130876747, "grad_norm": 1.5776711349588757, "learning_rate": 1.1888903072853364e-05, "loss": 2.0882, "step": 1406 }, { "epoch": 1.7878017789072427, "grad_norm": 1.3374617894396248, "learning_rate": 1.1878975988876648e-05, "loss": 2.001, "step": 1407 }, { "epoch": 1.7890724269377383, "grad_norm": 1.5095423208868328, "learning_rate": 1.1869046985086978e-05, "loss": 1.8458, "step": 1408 }, { "epoch": 1.7903430749682339, "grad_norm": 1.3296549033376626, "learning_rate": 1.1859116071629148e-05, "loss": 1.6368, "step": 1409 }, { "epoch": 1.7916137229987292, "grad_norm": 1.593559857193169, "learning_rate": 1.1849183258649903e-05, "loss": 1.8177, "step": 1410 }, { "epoch": 1.792884371029225, "grad_norm": 1.342662280127331, "learning_rate": 1.1839248556297938e-05, "loss": 2.0107, "step": 1411 }, { "epoch": 1.7941550190597204, "grad_norm": 1.421403096440148, "learning_rate": 1.1829311974723868e-05, "loss": 2.0703, "step": 1412 }, { "epoch": 1.795425667090216, "grad_norm": 1.4724131785793708, "learning_rate": 1.1819373524080233e-05, "loss": 1.7461, "step": 1413 }, { "epoch": 1.7966963151207116, "grad_norm": 1.257857762799035, "learning_rate": 1.1809433214521486e-05, "loss": 1.8401, "step": 1414 }, { "epoch": 1.797966963151207, "grad_norm": 1.5059440643051154, "learning_rate": 1.1799491056203973e-05, "loss": 1.9471, "step": 1415 }, { "epoch": 1.7992376111817028, "grad_norm": 1.475426427921491, "learning_rate": 1.1789547059285928e-05, "loss": 1.7771, "step": 1416 }, { "epoch": 1.8005082592121981, "grad_norm": 1.2583682337559847, "learning_rate": 1.1779601233927475e-05, "loss": 1.9209, "step": 1417 }, { "epoch": 1.8017789072426937, "grad_norm": 1.4923177597664052, "learning_rate": 1.1769653590290591e-05, "loss": 2.2439, "step": 1418 }, { "epoch": 1.8030495552731893, "grad_norm": 1.3124287124126588, "learning_rate": 1.1759704138539121e-05, "loss": 1.6536, "step": 1419 }, { "epoch": 1.804320203303685, "grad_norm": 1.3458331741364764, "learning_rate": 1.1749752888838754e-05, "loss": 1.8087, "step": 1420 }, { "epoch": 1.8055908513341805, "grad_norm": 1.4012331966186844, "learning_rate": 1.1739799851357021e-05, "loss": 1.9317, "step": 1421 }, { "epoch": 1.8068614993646759, "grad_norm": 1.2257761558605946, "learning_rate": 1.1729845036263263e-05, "loss": 1.8353, "step": 1422 }, { "epoch": 1.8081321473951717, "grad_norm": 1.320466289986687, "learning_rate": 1.1719888453728665e-05, "loss": 1.8284, "step": 1423 }, { "epoch": 1.809402795425667, "grad_norm": 1.304744652776889, "learning_rate": 1.170993011392619e-05, "loss": 1.8663, "step": 1424 }, { "epoch": 1.8106734434561627, "grad_norm": 1.3777909750839532, "learning_rate": 1.1699970027030613e-05, "loss": 2.008, "step": 1425 }, { "epoch": 1.8119440914866582, "grad_norm": 1.1053815499377322, "learning_rate": 1.1690008203218493e-05, "loss": 1.8928, "step": 1426 }, { "epoch": 1.8132147395171536, "grad_norm": 1.2615493941222704, "learning_rate": 1.1680044652668156e-05, "loss": 1.8193, "step": 1427 }, { "epoch": 1.8144853875476494, "grad_norm": 1.5095707151442657, "learning_rate": 1.1670079385559693e-05, "loss": 1.7967, "step": 1428 }, { "epoch": 1.8157560355781448, "grad_norm": 1.6773738017413227, "learning_rate": 1.1660112412074964e-05, "loss": 1.7579, "step": 1429 }, { "epoch": 1.8170266836086404, "grad_norm": 1.2175924278708228, "learning_rate": 1.1650143742397553e-05, "loss": 1.8115, "step": 1430 }, { "epoch": 1.818297331639136, "grad_norm": 1.2587370859429523, "learning_rate": 1.1640173386712786e-05, "loss": 2.0036, "step": 1431 }, { "epoch": 1.8195679796696314, "grad_norm": 1.2906712298735914, "learning_rate": 1.1630201355207709e-05, "loss": 1.83, "step": 1432 }, { "epoch": 1.8208386277001272, "grad_norm": 1.247476772493617, "learning_rate": 1.1620227658071088e-05, "loss": 2.0614, "step": 1433 }, { "epoch": 1.8221092757306225, "grad_norm": 1.8256172773347648, "learning_rate": 1.1610252305493374e-05, "loss": 1.9096, "step": 1434 }, { "epoch": 1.8233799237611181, "grad_norm": 1.1724464368042593, "learning_rate": 1.1600275307666735e-05, "loss": 1.7725, "step": 1435 }, { "epoch": 1.8246505717916137, "grad_norm": 1.3964553497467094, "learning_rate": 1.1590296674784991e-05, "loss": 1.8193, "step": 1436 }, { "epoch": 1.8259212198221093, "grad_norm": 1.1998179600419243, "learning_rate": 1.158031641704366e-05, "loss": 1.8011, "step": 1437 }, { "epoch": 1.827191867852605, "grad_norm": 1.5231632235669916, "learning_rate": 1.1570334544639896e-05, "loss": 1.713, "step": 1438 }, { "epoch": 1.8284625158831003, "grad_norm": 1.2046516674581838, "learning_rate": 1.1560351067772517e-05, "loss": 1.8704, "step": 1439 }, { "epoch": 1.829733163913596, "grad_norm": 1.3178265369589643, "learning_rate": 1.155036599664198e-05, "loss": 2.0145, "step": 1440 }, { "epoch": 1.8310038119440915, "grad_norm": 1.7226811184489739, "learning_rate": 1.1540379341450365e-05, "loss": 2.0435, "step": 1441 }, { "epoch": 1.832274459974587, "grad_norm": 1.281356140125465, "learning_rate": 1.1530391112401373e-05, "loss": 1.7186, "step": 1442 }, { "epoch": 1.8335451080050826, "grad_norm": 1.4583740366931355, "learning_rate": 1.1520401319700318e-05, "loss": 1.9342, "step": 1443 }, { "epoch": 1.834815756035578, "grad_norm": 1.484281275441423, "learning_rate": 1.15104099735541e-05, "loss": 1.8925, "step": 1444 }, { "epoch": 1.8360864040660738, "grad_norm": 1.412304238699942, "learning_rate": 1.150041708417122e-05, "loss": 1.5939, "step": 1445 }, { "epoch": 1.8373570520965692, "grad_norm": 1.3438716295100606, "learning_rate": 1.1490422661761744e-05, "loss": 1.8617, "step": 1446 }, { "epoch": 1.8386277001270648, "grad_norm": 1.9605821009953353, "learning_rate": 1.1480426716537316e-05, "loss": 1.8308, "step": 1447 }, { "epoch": 1.8398983481575604, "grad_norm": 1.3991191334668605, "learning_rate": 1.1470429258711122e-05, "loss": 1.9475, "step": 1448 }, { "epoch": 1.841168996188056, "grad_norm": 1.2670213328810134, "learning_rate": 1.1460430298497907e-05, "loss": 1.7133, "step": 1449 }, { "epoch": 1.8424396442185516, "grad_norm": 1.3114626960479783, "learning_rate": 1.145042984611394e-05, "loss": 1.6649, "step": 1450 }, { "epoch": 1.843710292249047, "grad_norm": 1.2816095446222748, "learning_rate": 1.144042791177702e-05, "loss": 1.9007, "step": 1451 }, { "epoch": 1.8449809402795425, "grad_norm": 1.563591169561673, "learning_rate": 1.1430424505706466e-05, "loss": 1.8746, "step": 1452 }, { "epoch": 1.846251588310038, "grad_norm": 1.5350916206044278, "learning_rate": 1.1420419638123088e-05, "loss": 1.9019, "step": 1453 }, { "epoch": 1.8475222363405337, "grad_norm": 1.6647715831066532, "learning_rate": 1.1410413319249193e-05, "loss": 1.8594, "step": 1454 }, { "epoch": 1.8487928843710293, "grad_norm": 1.2738941811409745, "learning_rate": 1.1400405559308583e-05, "loss": 1.6744, "step": 1455 }, { "epoch": 1.8500635324015247, "grad_norm": 1.637080701428525, "learning_rate": 1.1390396368526518e-05, "loss": 2.043, "step": 1456 }, { "epoch": 1.8513341804320205, "grad_norm": 1.3667523210206463, "learning_rate": 1.1380385757129722e-05, "loss": 2.0122, "step": 1457 }, { "epoch": 1.8526048284625158, "grad_norm": 1.9195347645962353, "learning_rate": 1.1370373735346376e-05, "loss": 1.8422, "step": 1458 }, { "epoch": 1.8538754764930114, "grad_norm": 1.3121875093141986, "learning_rate": 1.1360360313406103e-05, "loss": 1.8128, "step": 1459 }, { "epoch": 1.855146124523507, "grad_norm": 1.2148920185298064, "learning_rate": 1.1350345501539941e-05, "loss": 1.8175, "step": 1460 }, { "epoch": 1.8564167725540024, "grad_norm": 1.3891527875307301, "learning_rate": 1.1340329309980379e-05, "loss": 1.9379, "step": 1461 }, { "epoch": 1.8576874205844982, "grad_norm": 1.6586252458635242, "learning_rate": 1.1330311748961278e-05, "loss": 1.5624, "step": 1462 }, { "epoch": 1.8589580686149936, "grad_norm": 1.4942382644648793, "learning_rate": 1.1320292828717927e-05, "loss": 1.7223, "step": 1463 }, { "epoch": 1.8602287166454892, "grad_norm": 1.2896948455619404, "learning_rate": 1.1310272559486992e-05, "loss": 1.9926, "step": 1464 }, { "epoch": 1.8614993646759848, "grad_norm": 1.3366120995045905, "learning_rate": 1.130025095150652e-05, "loss": 1.956, "step": 1465 }, { "epoch": 1.8627700127064803, "grad_norm": 1.399795765221416, "learning_rate": 1.1290228015015923e-05, "loss": 1.9303, "step": 1466 }, { "epoch": 1.864040660736976, "grad_norm": 1.4568850504637758, "learning_rate": 1.1280203760255974e-05, "loss": 1.7904, "step": 1467 }, { "epoch": 1.8653113087674713, "grad_norm": 1.3276010632182842, "learning_rate": 1.1270178197468788e-05, "loss": 1.995, "step": 1468 }, { "epoch": 1.8665819567979671, "grad_norm": 1.1450291243004833, "learning_rate": 1.1260151336897824e-05, "loss": 1.7994, "step": 1469 }, { "epoch": 1.8678526048284625, "grad_norm": 1.372707712609518, "learning_rate": 1.125012318878786e-05, "loss": 1.7564, "step": 1470 }, { "epoch": 1.869123252858958, "grad_norm": 1.2881075904795836, "learning_rate": 1.1240093763384991e-05, "loss": 1.8716, "step": 1471 }, { "epoch": 1.8703939008894537, "grad_norm": 1.478739460363151, "learning_rate": 1.1230063070936624e-05, "loss": 1.848, "step": 1472 }, { "epoch": 1.871664548919949, "grad_norm": 1.229621468255002, "learning_rate": 1.1220031121691449e-05, "loss": 1.6655, "step": 1473 }, { "epoch": 1.8729351969504449, "grad_norm": 1.2141725032976105, "learning_rate": 1.1209997925899442e-05, "loss": 1.933, "step": 1474 }, { "epoch": 1.8742058449809402, "grad_norm": 1.2617137046426306, "learning_rate": 1.119996349381187e-05, "loss": 1.9028, "step": 1475 }, { "epoch": 1.8754764930114358, "grad_norm": 1.4541254544342954, "learning_rate": 1.118992783568124e-05, "loss": 1.6844, "step": 1476 }, { "epoch": 1.8767471410419314, "grad_norm": 1.4416164883757325, "learning_rate": 1.1179890961761321e-05, "loss": 1.7029, "step": 1477 }, { "epoch": 1.8780177890724268, "grad_norm": 1.6711166429056248, "learning_rate": 1.1169852882307128e-05, "loss": 1.9285, "step": 1478 }, { "epoch": 1.8792884371029226, "grad_norm": 1.4073114075531266, "learning_rate": 1.1159813607574905e-05, "loss": 1.844, "step": 1479 }, { "epoch": 1.880559085133418, "grad_norm": 1.3495544307796417, "learning_rate": 1.1149773147822112e-05, "loss": 1.8957, "step": 1480 }, { "epoch": 1.8818297331639136, "grad_norm": 1.2613021446156025, "learning_rate": 1.113973151330743e-05, "loss": 1.9607, "step": 1481 }, { "epoch": 1.8831003811944091, "grad_norm": 1.289020288898543, "learning_rate": 1.112968871429073e-05, "loss": 1.7818, "step": 1482 }, { "epoch": 1.8843710292249047, "grad_norm": 1.2187657323684478, "learning_rate": 1.1119644761033079e-05, "loss": 1.7153, "step": 1483 }, { "epoch": 1.8856416772554003, "grad_norm": 1.2728351809042304, "learning_rate": 1.1109599663796724e-05, "loss": 1.9181, "step": 1484 }, { "epoch": 1.8869123252858957, "grad_norm": 1.2457636252898838, "learning_rate": 1.1099553432845079e-05, "loss": 2.0376, "step": 1485 }, { "epoch": 1.8881829733163915, "grad_norm": 1.2556364575716603, "learning_rate": 1.1089506078442709e-05, "loss": 1.8307, "step": 1486 }, { "epoch": 1.8894536213468869, "grad_norm": 1.2422773081745468, "learning_rate": 1.1079457610855342e-05, "loss": 1.5723, "step": 1487 }, { "epoch": 1.8907242693773825, "grad_norm": 1.2166331727886142, "learning_rate": 1.1069408040349832e-05, "loss": 1.8537, "step": 1488 }, { "epoch": 1.891994917407878, "grad_norm": 1.8899361122781229, "learning_rate": 1.1059357377194161e-05, "loss": 2.1507, "step": 1489 }, { "epoch": 1.8932655654383734, "grad_norm": 1.4418770421153397, "learning_rate": 1.1049305631657434e-05, "loss": 1.9585, "step": 1490 }, { "epoch": 1.8945362134688692, "grad_norm": 1.2903505581245351, "learning_rate": 1.1039252814009858e-05, "loss": 1.6915, "step": 1491 }, { "epoch": 1.8958068614993646, "grad_norm": 1.2237462796711218, "learning_rate": 1.1029198934522725e-05, "loss": 1.7736, "step": 1492 }, { "epoch": 1.8970775095298602, "grad_norm": 1.3825824140430243, "learning_rate": 1.1019144003468434e-05, "loss": 1.8529, "step": 1493 }, { "epoch": 1.8983481575603558, "grad_norm": 1.5019641930632714, "learning_rate": 1.100908803112044e-05, "loss": 1.9011, "step": 1494 }, { "epoch": 1.8996188055908514, "grad_norm": 1.1970623904098372, "learning_rate": 1.0999031027753269e-05, "loss": 1.8513, "step": 1495 }, { "epoch": 1.900889453621347, "grad_norm": 3.9496021065781814, "learning_rate": 1.09889730036425e-05, "loss": 1.7886, "step": 1496 }, { "epoch": 1.9021601016518423, "grad_norm": 1.1909345439294168, "learning_rate": 1.0978913969064753e-05, "loss": 2.0297, "step": 1497 }, { "epoch": 1.903430749682338, "grad_norm": 1.1853023179130728, "learning_rate": 1.0968853934297686e-05, "loss": 1.8918, "step": 1498 }, { "epoch": 1.9047013977128335, "grad_norm": 1.1951448764156936, "learning_rate": 1.095879290961997e-05, "loss": 1.7644, "step": 1499 }, { "epoch": 1.9059720457433291, "grad_norm": 1.3805797328593437, "learning_rate": 1.0948730905311294e-05, "loss": 1.9272, "step": 1500 }, { "epoch": 1.9072426937738247, "grad_norm": 1.5137753880951426, "learning_rate": 1.0938667931652347e-05, "loss": 1.7265, "step": 1501 }, { "epoch": 1.90851334180432, "grad_norm": 1.7052797441607885, "learning_rate": 1.0928603998924807e-05, "loss": 2.0261, "step": 1502 }, { "epoch": 1.909783989834816, "grad_norm": 1.3312756229631164, "learning_rate": 1.0918539117411334e-05, "loss": 1.9571, "step": 1503 }, { "epoch": 1.9110546378653113, "grad_norm": 1.2612079574489132, "learning_rate": 1.0908473297395552e-05, "loss": 1.7756, "step": 1504 }, { "epoch": 1.9123252858958069, "grad_norm": 1.28009759085661, "learning_rate": 1.0898406549162053e-05, "loss": 1.8911, "step": 1505 }, { "epoch": 1.9135959339263025, "grad_norm": 1.315883162812706, "learning_rate": 1.0888338882996365e-05, "loss": 1.9947, "step": 1506 }, { "epoch": 1.9148665819567978, "grad_norm": 1.4388442688673047, "learning_rate": 1.0878270309184973e-05, "loss": 1.9072, "step": 1507 }, { "epoch": 1.9161372299872936, "grad_norm": 1.4997070928756402, "learning_rate": 1.0868200838015265e-05, "loss": 1.8826, "step": 1508 }, { "epoch": 1.917407878017789, "grad_norm": 1.2407765148265102, "learning_rate": 1.0858130479775564e-05, "loss": 1.7608, "step": 1509 }, { "epoch": 1.9186785260482846, "grad_norm": 1.2162156867582095, "learning_rate": 1.0848059244755093e-05, "loss": 1.8828, "step": 1510 }, { "epoch": 1.9199491740787802, "grad_norm": 1.3753812289304952, "learning_rate": 1.0837987143243972e-05, "loss": 1.8763, "step": 1511 }, { "epoch": 1.9212198221092758, "grad_norm": 1.317136281554182, "learning_rate": 1.0827914185533206e-05, "loss": 2.0366, "step": 1512 }, { "epoch": 1.9224904701397714, "grad_norm": 1.312553059882304, "learning_rate": 1.0817840381914675e-05, "loss": 2.0488, "step": 1513 }, { "epoch": 1.9237611181702667, "grad_norm": 1.3785690601450935, "learning_rate": 1.080776574268112e-05, "loss": 1.869, "step": 1514 }, { "epoch": 1.9250317662007626, "grad_norm": 1.4184047290196182, "learning_rate": 1.079769027812614e-05, "loss": 1.993, "step": 1515 }, { "epoch": 1.926302414231258, "grad_norm": 1.1348801262998818, "learning_rate": 1.0787613998544179e-05, "loss": 1.8294, "step": 1516 }, { "epoch": 1.9275730622617535, "grad_norm": 1.220243637792347, "learning_rate": 1.0777536914230509e-05, "loss": 1.833, "step": 1517 }, { "epoch": 1.928843710292249, "grad_norm": 1.202059656455235, "learning_rate": 1.0767459035481222e-05, "loss": 1.7776, "step": 1518 }, { "epoch": 1.9301143583227445, "grad_norm": 1.2616653504897855, "learning_rate": 1.0757380372593234e-05, "loss": 2.092, "step": 1519 }, { "epoch": 1.9313850063532403, "grad_norm": 1.3040846469184033, "learning_rate": 1.0747300935864245e-05, "loss": 1.7734, "step": 1520 }, { "epoch": 1.9326556543837357, "grad_norm": 1.3425908648306686, "learning_rate": 1.0737220735592759e-05, "loss": 1.9033, "step": 1521 }, { "epoch": 1.9339263024142312, "grad_norm": 1.4151275474867275, "learning_rate": 1.0727139782078054e-05, "loss": 1.574, "step": 1522 }, { "epoch": 1.9351969504447268, "grad_norm": 1.3925123627658118, "learning_rate": 1.071705808562018e-05, "loss": 1.7745, "step": 1523 }, { "epoch": 1.9364675984752222, "grad_norm": 1.313720131215306, "learning_rate": 1.0706975656519946e-05, "loss": 1.905, "step": 1524 }, { "epoch": 1.937738246505718, "grad_norm": 1.240030432064742, "learning_rate": 1.0696892505078913e-05, "loss": 1.8227, "step": 1525 }, { "epoch": 1.9390088945362134, "grad_norm": 1.734954931533088, "learning_rate": 1.0686808641599364e-05, "loss": 1.4944, "step": 1526 }, { "epoch": 1.940279542566709, "grad_norm": 1.4328846041574559, "learning_rate": 1.0676724076384333e-05, "loss": 1.8688, "step": 1527 }, { "epoch": 1.9415501905972046, "grad_norm": 1.19033566630027, "learning_rate": 1.0666638819737554e-05, "loss": 1.8812, "step": 1528 }, { "epoch": 1.9428208386277002, "grad_norm": 1.3933868256455422, "learning_rate": 1.0656552881963474e-05, "loss": 1.8662, "step": 1529 }, { "epoch": 1.9440914866581958, "grad_norm": 1.5607668837933577, "learning_rate": 1.0646466273367235e-05, "loss": 1.8623, "step": 1530 }, { "epoch": 1.9453621346886911, "grad_norm": 1.3214091567545014, "learning_rate": 1.0636379004254665e-05, "loss": 1.81, "step": 1531 }, { "epoch": 1.946632782719187, "grad_norm": 1.4549576254701408, "learning_rate": 1.062629108493226e-05, "loss": 1.8601, "step": 1532 }, { "epoch": 1.9479034307496823, "grad_norm": 1.2124857082535867, "learning_rate": 1.06162025257072e-05, "loss": 1.8104, "step": 1533 }, { "epoch": 1.949174078780178, "grad_norm": 1.2162357925040301, "learning_rate": 1.060611333688729e-05, "loss": 2.0063, "step": 1534 }, { "epoch": 1.9504447268106735, "grad_norm": 1.4642054743330333, "learning_rate": 1.0596023528781003e-05, "loss": 1.8917, "step": 1535 }, { "epoch": 1.9517153748411689, "grad_norm": 1.2812106018682774, "learning_rate": 1.058593311169743e-05, "loss": 1.7892, "step": 1536 }, { "epoch": 1.9529860228716647, "grad_norm": 1.3705385042814477, "learning_rate": 1.0575842095946298e-05, "loss": 2.1955, "step": 1537 }, { "epoch": 1.95425667090216, "grad_norm": 1.203576822595647, "learning_rate": 1.0565750491837925e-05, "loss": 1.824, "step": 1538 }, { "epoch": 1.9555273189326556, "grad_norm": 1.3119912838272654, "learning_rate": 1.0555658309683251e-05, "loss": 1.8335, "step": 1539 }, { "epoch": 1.9567979669631512, "grad_norm": 1.2837165208756307, "learning_rate": 1.0545565559793796e-05, "loss": 1.5003, "step": 1540 }, { "epoch": 1.9580686149936466, "grad_norm": 1.285094358215575, "learning_rate": 1.053547225248166e-05, "loss": 2.0034, "step": 1541 }, { "epoch": 1.9593392630241424, "grad_norm": 1.6897389171605062, "learning_rate": 1.0525378398059516e-05, "loss": 1.8101, "step": 1542 }, { "epoch": 1.9606099110546378, "grad_norm": 1.5569612139284217, "learning_rate": 1.0515284006840596e-05, "loss": 1.7041, "step": 1543 }, { "epoch": 1.9618805590851334, "grad_norm": 1.4460708928459691, "learning_rate": 1.0505189089138672e-05, "loss": 1.8843, "step": 1544 }, { "epoch": 1.963151207115629, "grad_norm": 1.3246627898468204, "learning_rate": 1.049509365526807e-05, "loss": 1.719, "step": 1545 }, { "epoch": 1.9644218551461246, "grad_norm": 1.4195235677913507, "learning_rate": 1.0484997715543632e-05, "loss": 2.12, "step": 1546 }, { "epoch": 1.9656925031766201, "grad_norm": 1.3449356124037468, "learning_rate": 1.0474901280280717e-05, "loss": 2.1002, "step": 1547 }, { "epoch": 1.9669631512071155, "grad_norm": 1.411689091489764, "learning_rate": 1.046480435979519e-05, "loss": 1.922, "step": 1548 }, { "epoch": 1.9682337992376113, "grad_norm": 1.3889025429782529, "learning_rate": 1.0454706964403421e-05, "loss": 1.8338, "step": 1549 }, { "epoch": 1.9695044472681067, "grad_norm": 1.3058427582138856, "learning_rate": 1.0444609104422253e-05, "loss": 1.8817, "step": 1550 }, { "epoch": 1.9707750952986023, "grad_norm": 1.737171300498622, "learning_rate": 1.0434510790169014e-05, "loss": 1.8244, "step": 1551 }, { "epoch": 1.9720457433290979, "grad_norm": 1.3377667180365431, "learning_rate": 1.0424412031961485e-05, "loss": 1.6864, "step": 1552 }, { "epoch": 1.9733163913595932, "grad_norm": 1.5352021161297504, "learning_rate": 1.041431284011791e-05, "loss": 2.0236, "step": 1553 }, { "epoch": 1.974587039390089, "grad_norm": 1.4409359496788379, "learning_rate": 1.0404213224956974e-05, "loss": 1.8878, "step": 1554 }, { "epoch": 1.9758576874205844, "grad_norm": 1.2230201495547812, "learning_rate": 1.0394113196797793e-05, "loss": 1.7995, "step": 1555 }, { "epoch": 1.97712833545108, "grad_norm": 1.3318113041240423, "learning_rate": 1.0384012765959904e-05, "loss": 1.9446, "step": 1556 }, { "epoch": 1.9783989834815756, "grad_norm": 1.4399026405166524, "learning_rate": 1.037391194276326e-05, "loss": 1.7571, "step": 1557 }, { "epoch": 1.9796696315120712, "grad_norm": 1.2649007523271352, "learning_rate": 1.0363810737528204e-05, "loss": 1.6082, "step": 1558 }, { "epoch": 1.9809402795425668, "grad_norm": 1.3362245297305004, "learning_rate": 1.0353709160575488e-05, "loss": 1.5646, "step": 1559 }, { "epoch": 1.9822109275730622, "grad_norm": 1.3186964992067625, "learning_rate": 1.0343607222226227e-05, "loss": 1.5753, "step": 1560 }, { "epoch": 1.983481575603558, "grad_norm": 1.2582780186793407, "learning_rate": 1.0333504932801907e-05, "loss": 2.0277, "step": 1561 }, { "epoch": 1.9847522236340533, "grad_norm": 4.105112342397828, "learning_rate": 1.0323402302624386e-05, "loss": 2.1881, "step": 1562 }, { "epoch": 1.986022871664549, "grad_norm": 1.316337918857903, "learning_rate": 1.0313299342015855e-05, "loss": 1.9083, "step": 1563 }, { "epoch": 1.9872935196950445, "grad_norm": 1.4610849359414284, "learning_rate": 1.030319606129885e-05, "loss": 1.9046, "step": 1564 }, { "epoch": 1.98856416772554, "grad_norm": 1.2497272565358144, "learning_rate": 1.0293092470796236e-05, "loss": 1.8425, "step": 1565 }, { "epoch": 1.9898348157560357, "grad_norm": 1.4980665158805504, "learning_rate": 1.0282988580831183e-05, "loss": 1.9863, "step": 1566 }, { "epoch": 1.991105463786531, "grad_norm": 1.6926076544107527, "learning_rate": 1.027288440172718e-05, "loss": 1.8363, "step": 1567 }, { "epoch": 1.9923761118170267, "grad_norm": 1.4868892082114677, "learning_rate": 1.026277994380801e-05, "loss": 2.0619, "step": 1568 }, { "epoch": 1.9936467598475223, "grad_norm": 1.3481010200559678, "learning_rate": 1.0252675217397734e-05, "loss": 1.7867, "step": 1569 }, { "epoch": 1.9949174078780176, "grad_norm": 1.3063690989319638, "learning_rate": 1.0242570232820687e-05, "loss": 1.7639, "step": 1570 }, { "epoch": 1.9961880559085134, "grad_norm": 1.269868093923743, "learning_rate": 1.0232465000401482e-05, "loss": 1.695, "step": 1571 }, { "epoch": 1.9974587039390088, "grad_norm": 1.4284273129199416, "learning_rate": 1.0222359530464964e-05, "loss": 1.791, "step": 1572 }, { "epoch": 1.9987293519695044, "grad_norm": 1.4282702884606915, "learning_rate": 1.0212253833336237e-05, "loss": 1.7568, "step": 1573 }, { "epoch": 2.0, "grad_norm": 1.4072827771859442, "learning_rate": 1.020214791934063e-05, "loss": 1.7505, "step": 1574 }, { "epoch": 2.0012706480304954, "grad_norm": 2.1922567521585887, "learning_rate": 1.0192041798803696e-05, "loss": 1.5317, "step": 1575 }, { "epoch": 2.002541296060991, "grad_norm": 1.8162527031778972, "learning_rate": 1.0181935482051198e-05, "loss": 1.4413, "step": 1576 }, { "epoch": 2.0038119440914866, "grad_norm": 1.750499161090013, "learning_rate": 1.0171828979409099e-05, "loss": 1.636, "step": 1577 }, { "epoch": 2.0050825921219824, "grad_norm": 1.7380731045199291, "learning_rate": 1.0161722301203554e-05, "loss": 1.6421, "step": 1578 }, { "epoch": 2.0063532401524777, "grad_norm": 2.1595843457290678, "learning_rate": 1.0151615457760895e-05, "loss": 1.4908, "step": 1579 }, { "epoch": 2.007623888182973, "grad_norm": 3.1077355080184828, "learning_rate": 1.0141508459407622e-05, "loss": 1.557, "step": 1580 }, { "epoch": 2.008894536213469, "grad_norm": 1.883335019434399, "learning_rate": 1.01314013164704e-05, "loss": 1.4266, "step": 1581 }, { "epoch": 2.0101651842439643, "grad_norm": 2.0929579922525448, "learning_rate": 1.0121294039276031e-05, "loss": 1.7675, "step": 1582 }, { "epoch": 2.01143583227446, "grad_norm": 1.6067207672407877, "learning_rate": 1.0111186638151464e-05, "loss": 1.5462, "step": 1583 }, { "epoch": 2.0127064803049555, "grad_norm": 2.0002465791789734, "learning_rate": 1.0101079123423771e-05, "loss": 1.3479, "step": 1584 }, { "epoch": 2.0139771283354513, "grad_norm": 1.521193085600093, "learning_rate": 1.009097150542014e-05, "loss": 1.4775, "step": 1585 }, { "epoch": 2.0152477763659467, "grad_norm": 1.602107005268662, "learning_rate": 1.0080863794467859e-05, "loss": 1.475, "step": 1586 }, { "epoch": 2.016518424396442, "grad_norm": 1.9436436215116601, "learning_rate": 1.0070756000894321e-05, "loss": 1.5147, "step": 1587 }, { "epoch": 2.017789072426938, "grad_norm": 1.809577975993538, "learning_rate": 1.0060648135026999e-05, "loss": 1.4581, "step": 1588 }, { "epoch": 2.019059720457433, "grad_norm": 1.720543171952214, "learning_rate": 1.0050540207193433e-05, "loss": 1.5072, "step": 1589 }, { "epoch": 2.020330368487929, "grad_norm": 1.7047275216163906, "learning_rate": 1.0040432227721242e-05, "loss": 1.6827, "step": 1590 }, { "epoch": 2.0216010165184244, "grad_norm": 1.5472544495690708, "learning_rate": 1.0030324206938084e-05, "loss": 1.5279, "step": 1591 }, { "epoch": 2.0228716645489198, "grad_norm": 1.5430046286635146, "learning_rate": 1.0020216155171662e-05, "loss": 1.5495, "step": 1592 }, { "epoch": 2.0241423125794156, "grad_norm": 1.3820694129150732, "learning_rate": 1.0010108082749716e-05, "loss": 1.8249, "step": 1593 }, { "epoch": 2.025412960609911, "grad_norm": 1.6196746033148492, "learning_rate": 1e-05, "loss": 1.6198, "step": 1594 }, { "epoch": 2.0266836086404068, "grad_norm": 1.4633038434106642, "learning_rate": 9.989891917250286e-06, "loss": 1.4854, "step": 1595 }, { "epoch": 2.027954256670902, "grad_norm": 1.5375930378258063, "learning_rate": 9.979783844828343e-06, "loss": 1.5181, "step": 1596 }, { "epoch": 2.0292249047013975, "grad_norm": 1.7262813627601707, "learning_rate": 9.969675793061917e-06, "loss": 1.3203, "step": 1597 }, { "epoch": 2.0304955527318933, "grad_norm": 1.5531837084914053, "learning_rate": 9.95956777227876e-06, "loss": 1.5943, "step": 1598 }, { "epoch": 2.0317662007623887, "grad_norm": 2.314212281134008, "learning_rate": 9.949459792806569e-06, "loss": 1.3614, "step": 1599 }, { "epoch": 2.0330368487928845, "grad_norm": 1.56264513568958, "learning_rate": 9.939351864973006e-06, "loss": 1.623, "step": 1600 }, { "epoch": 2.03430749682338, "grad_norm": 1.5310040399778346, "learning_rate": 9.929243999105682e-06, "loss": 1.3823, "step": 1601 }, { "epoch": 2.0355781448538757, "grad_norm": 1.5122091613542985, "learning_rate": 9.919136205532146e-06, "loss": 1.5671, "step": 1602 }, { "epoch": 2.036848792884371, "grad_norm": 1.602275195991115, "learning_rate": 9.909028494579862e-06, "loss": 1.4149, "step": 1603 }, { "epoch": 2.0381194409148664, "grad_norm": 1.555548981748731, "learning_rate": 9.89892087657623e-06, "loss": 1.5887, "step": 1604 }, { "epoch": 2.0393900889453622, "grad_norm": 1.789096954693768, "learning_rate": 9.888813361848538e-06, "loss": 1.6299, "step": 1605 }, { "epoch": 2.0406607369758576, "grad_norm": 1.4371116279202802, "learning_rate": 9.87870596072397e-06, "loss": 1.5942, "step": 1606 }, { "epoch": 2.0419313850063534, "grad_norm": 1.5448683516858457, "learning_rate": 9.868598683529603e-06, "loss": 1.41, "step": 1607 }, { "epoch": 2.0432020330368488, "grad_norm": 1.5005003454194215, "learning_rate": 9.858491540592383e-06, "loss": 1.5644, "step": 1608 }, { "epoch": 2.044472681067344, "grad_norm": 1.4227489543845717, "learning_rate": 9.848384542239109e-06, "loss": 1.3633, "step": 1609 }, { "epoch": 2.04574332909784, "grad_norm": 1.566848584554528, "learning_rate": 9.83827769879645e-06, "loss": 1.4524, "step": 1610 }, { "epoch": 2.0470139771283353, "grad_norm": 1.5994679206945885, "learning_rate": 9.828171020590903e-06, "loss": 1.3457, "step": 1611 }, { "epoch": 2.048284625158831, "grad_norm": 1.5669877597200432, "learning_rate": 9.818064517948806e-06, "loss": 1.6205, "step": 1612 }, { "epoch": 2.0495552731893265, "grad_norm": 1.8089768376002517, "learning_rate": 9.807958201196307e-06, "loss": 1.6253, "step": 1613 }, { "epoch": 2.0508259212198223, "grad_norm": 1.5784294416030926, "learning_rate": 9.797852080659375e-06, "loss": 1.5439, "step": 1614 }, { "epoch": 2.0520965692503177, "grad_norm": 1.6938171380318698, "learning_rate": 9.787746166663765e-06, "loss": 1.5235, "step": 1615 }, { "epoch": 2.053367217280813, "grad_norm": 1.5592263744572192, "learning_rate": 9.777640469535037e-06, "loss": 1.3738, "step": 1616 }, { "epoch": 2.054637865311309, "grad_norm": 1.6890696349677892, "learning_rate": 9.76753499959852e-06, "loss": 1.4987, "step": 1617 }, { "epoch": 2.0559085133418042, "grad_norm": 1.7276203896324043, "learning_rate": 9.757429767179314e-06, "loss": 1.5838, "step": 1618 }, { "epoch": 2.0571791613723, "grad_norm": 1.4661617737870516, "learning_rate": 9.74732478260227e-06, "loss": 1.503, "step": 1619 }, { "epoch": 2.0584498094027954, "grad_norm": 1.3693147692588428, "learning_rate": 9.737220056191995e-06, "loss": 1.5142, "step": 1620 }, { "epoch": 2.059720457433291, "grad_norm": 1.517200118021235, "learning_rate": 9.727115598272821e-06, "loss": 1.518, "step": 1621 }, { "epoch": 2.0609911054637866, "grad_norm": 1.3717809560979908, "learning_rate": 9.71701141916882e-06, "loss": 1.6858, "step": 1622 }, { "epoch": 2.062261753494282, "grad_norm": 1.5954676865844675, "learning_rate": 9.706907529203769e-06, "loss": 1.6237, "step": 1623 }, { "epoch": 2.063532401524778, "grad_norm": 28.333500518838918, "learning_rate": 9.696803938701153e-06, "loss": 2.1156, "step": 1624 }, { "epoch": 2.064803049555273, "grad_norm": 1.495655727010147, "learning_rate": 9.686700657984148e-06, "loss": 1.4315, "step": 1625 }, { "epoch": 2.0660736975857685, "grad_norm": 1.46669444898845, "learning_rate": 9.676597697375615e-06, "loss": 1.5926, "step": 1626 }, { "epoch": 2.0673443456162643, "grad_norm": 1.4893524309502175, "learning_rate": 9.666495067198094e-06, "loss": 1.6936, "step": 1627 }, { "epoch": 2.0686149936467597, "grad_norm": 1.4657156784120036, "learning_rate": 9.656392777773778e-06, "loss": 1.7206, "step": 1628 }, { "epoch": 2.0698856416772555, "grad_norm": 1.528267204530225, "learning_rate": 9.646290839424515e-06, "loss": 1.5584, "step": 1629 }, { "epoch": 2.071156289707751, "grad_norm": 1.626357254375774, "learning_rate": 9.6361892624718e-06, "loss": 1.6127, "step": 1630 }, { "epoch": 2.0724269377382467, "grad_norm": 1.4399509284642533, "learning_rate": 9.626088057236745e-06, "loss": 1.5282, "step": 1631 }, { "epoch": 2.073697585768742, "grad_norm": 1.4694728842146232, "learning_rate": 9.615987234040098e-06, "loss": 1.6358, "step": 1632 }, { "epoch": 2.0749682337992374, "grad_norm": 1.3107896183282848, "learning_rate": 9.60588680320221e-06, "loss": 1.4943, "step": 1633 }, { "epoch": 2.0762388818297333, "grad_norm": 1.3258155601804371, "learning_rate": 9.595786775043028e-06, "loss": 1.5086, "step": 1634 }, { "epoch": 2.0775095298602286, "grad_norm": 1.6867655906427885, "learning_rate": 9.585687159882092e-06, "loss": 1.6189, "step": 1635 }, { "epoch": 2.0787801778907244, "grad_norm": 1.5458307869086416, "learning_rate": 9.57558796803852e-06, "loss": 1.7701, "step": 1636 }, { "epoch": 2.08005082592122, "grad_norm": 1.556087014644666, "learning_rate": 9.565489209830991e-06, "loss": 1.5319, "step": 1637 }, { "epoch": 2.081321473951715, "grad_norm": 1.5341891110262698, "learning_rate": 9.555390895577748e-06, "loss": 1.5834, "step": 1638 }, { "epoch": 2.082592121982211, "grad_norm": 1.6591782020014392, "learning_rate": 9.54529303559658e-06, "loss": 1.3932, "step": 1639 }, { "epoch": 2.0838627700127064, "grad_norm": 1.6103571396160286, "learning_rate": 9.535195640204811e-06, "loss": 1.5176, "step": 1640 }, { "epoch": 2.085133418043202, "grad_norm": 1.648255448151265, "learning_rate": 9.525098719719285e-06, "loss": 1.6029, "step": 1641 }, { "epoch": 2.0864040660736975, "grad_norm": 1.576871559589654, "learning_rate": 9.515002284456373e-06, "loss": 1.4553, "step": 1642 }, { "epoch": 2.0876747141041934, "grad_norm": 1.573271991288505, "learning_rate": 9.504906344731933e-06, "loss": 1.5202, "step": 1643 }, { "epoch": 2.0889453621346887, "grad_norm": 1.5214587878309893, "learning_rate": 9.494810910861328e-06, "loss": 1.5757, "step": 1644 }, { "epoch": 2.090216010165184, "grad_norm": 1.6462726696144563, "learning_rate": 9.484715993159407e-06, "loss": 1.5592, "step": 1645 }, { "epoch": 2.09148665819568, "grad_norm": 1.4765489474527553, "learning_rate": 9.474621601940488e-06, "loss": 1.5569, "step": 1646 }, { "epoch": 2.0927573062261753, "grad_norm": 1.3643850828357655, "learning_rate": 9.464527747518344e-06, "loss": 1.4732, "step": 1647 }, { "epoch": 2.094027954256671, "grad_norm": 1.5624165272696018, "learning_rate": 9.454434440206211e-06, "loss": 1.5263, "step": 1648 }, { "epoch": 2.0952986022871665, "grad_norm": 1.5291621427497337, "learning_rate": 9.444341690316754e-06, "loss": 1.3224, "step": 1649 }, { "epoch": 2.096569250317662, "grad_norm": 1.6367742909471383, "learning_rate": 9.434249508162076e-06, "loss": 1.472, "step": 1650 }, { "epoch": 2.0978398983481577, "grad_norm": 1.5909269637605061, "learning_rate": 9.424157904053705e-06, "loss": 1.3923, "step": 1651 }, { "epoch": 2.099110546378653, "grad_norm": 1.5625260293281584, "learning_rate": 9.414066888302572e-06, "loss": 1.4278, "step": 1652 }, { "epoch": 2.100381194409149, "grad_norm": 1.6300100018177057, "learning_rate": 9.403976471219e-06, "loss": 1.584, "step": 1653 }, { "epoch": 2.101651842439644, "grad_norm": 1.6298229936291917, "learning_rate": 9.393886663112714e-06, "loss": 1.7778, "step": 1654 }, { "epoch": 2.1029224904701396, "grad_norm": 1.8296972349127292, "learning_rate": 9.383797474292804e-06, "loss": 1.5345, "step": 1655 }, { "epoch": 2.1041931385006354, "grad_norm": 1.566619499783697, "learning_rate": 9.373708915067738e-06, "loss": 1.3907, "step": 1656 }, { "epoch": 2.1054637865311308, "grad_norm": 1.927986374560233, "learning_rate": 9.363620995745337e-06, "loss": 1.7173, "step": 1657 }, { "epoch": 2.1067344345616266, "grad_norm": 1.707036241486098, "learning_rate": 9.353533726632768e-06, "loss": 1.5264, "step": 1658 }, { "epoch": 2.108005082592122, "grad_norm": 1.6716763707563085, "learning_rate": 9.343447118036528e-06, "loss": 1.6883, "step": 1659 }, { "epoch": 2.1092757306226178, "grad_norm": 1.4522206766291692, "learning_rate": 9.33336118026245e-06, "loss": 1.5433, "step": 1660 }, { "epoch": 2.110546378653113, "grad_norm": 1.5721077886542751, "learning_rate": 9.323275923615669e-06, "loss": 1.6695, "step": 1661 }, { "epoch": 2.1118170266836085, "grad_norm": 1.5761380639840767, "learning_rate": 9.313191358400638e-06, "loss": 1.2518, "step": 1662 }, { "epoch": 2.1130876747141043, "grad_norm": 1.672209113109455, "learning_rate": 9.30310749492109e-06, "loss": 1.6208, "step": 1663 }, { "epoch": 2.1143583227445997, "grad_norm": 1.656915671786083, "learning_rate": 9.293024343480056e-06, "loss": 1.4643, "step": 1664 }, { "epoch": 2.1156289707750955, "grad_norm": 1.483378364811783, "learning_rate": 9.282941914379821e-06, "loss": 1.5757, "step": 1665 }, { "epoch": 2.116899618805591, "grad_norm": 1.4992838950739726, "learning_rate": 9.272860217921951e-06, "loss": 1.5028, "step": 1666 }, { "epoch": 2.1181702668360862, "grad_norm": 1.6155109191754602, "learning_rate": 9.262779264407245e-06, "loss": 1.6201, "step": 1667 }, { "epoch": 2.119440914866582, "grad_norm": 1.4819695282440337, "learning_rate": 9.252699064135759e-06, "loss": 1.4906, "step": 1668 }, { "epoch": 2.1207115628970774, "grad_norm": 1.4590395485050054, "learning_rate": 9.24261962740677e-06, "loss": 1.3716, "step": 1669 }, { "epoch": 2.121982210927573, "grad_norm": 1.7291314303929641, "learning_rate": 9.23254096451878e-06, "loss": 1.8731, "step": 1670 }, { "epoch": 2.1232528589580686, "grad_norm": 1.5878570493809223, "learning_rate": 9.222463085769495e-06, "loss": 1.565, "step": 1671 }, { "epoch": 2.124523506988564, "grad_norm": 1.6488457808778345, "learning_rate": 9.212386001455826e-06, "loss": 1.8541, "step": 1672 }, { "epoch": 2.1257941550190598, "grad_norm": 1.5243218279056652, "learning_rate": 9.202309721873861e-06, "loss": 1.5937, "step": 1673 }, { "epoch": 2.127064803049555, "grad_norm": 1.7912749204453913, "learning_rate": 9.192234257318883e-06, "loss": 1.5461, "step": 1674 }, { "epoch": 2.128335451080051, "grad_norm": 1.440834640424777, "learning_rate": 9.182159618085328e-06, "loss": 1.5988, "step": 1675 }, { "epoch": 2.1296060991105463, "grad_norm": 1.429961079837902, "learning_rate": 9.172085814466798e-06, "loss": 1.5742, "step": 1676 }, { "epoch": 2.130876747141042, "grad_norm": 1.7624163980029814, "learning_rate": 9.162012856756031e-06, "loss": 1.5154, "step": 1677 }, { "epoch": 2.1321473951715375, "grad_norm": 1.430025091724334, "learning_rate": 9.151940755244912e-06, "loss": 1.7124, "step": 1678 }, { "epoch": 2.133418043202033, "grad_norm": 1.4365126628311997, "learning_rate": 9.141869520224438e-06, "loss": 1.5253, "step": 1679 }, { "epoch": 2.1346886912325287, "grad_norm": 1.6293871964817268, "learning_rate": 9.131799161984738e-06, "loss": 1.585, "step": 1680 }, { "epoch": 2.135959339263024, "grad_norm": 1.6621412081818165, "learning_rate": 9.12172969081503e-06, "loss": 1.6056, "step": 1681 }, { "epoch": 2.13722998729352, "grad_norm": 1.6866346755461035, "learning_rate": 9.111661117003637e-06, "loss": 1.4979, "step": 1682 }, { "epoch": 2.1385006353240152, "grad_norm": 1.5713172138081546, "learning_rate": 9.101593450837952e-06, "loss": 1.4382, "step": 1683 }, { "epoch": 2.1397712833545106, "grad_norm": 1.5893242031849881, "learning_rate": 9.091526702604448e-06, "loss": 1.4081, "step": 1684 }, { "epoch": 2.1410419313850064, "grad_norm": 1.6075705545790635, "learning_rate": 9.081460882588668e-06, "loss": 1.7753, "step": 1685 }, { "epoch": 2.142312579415502, "grad_norm": 1.5260956701483068, "learning_rate": 9.071396001075195e-06, "loss": 1.5592, "step": 1686 }, { "epoch": 2.1435832274459976, "grad_norm": 1.5492888125581123, "learning_rate": 9.061332068347654e-06, "loss": 1.727, "step": 1687 }, { "epoch": 2.144853875476493, "grad_norm": 1.3887209879997233, "learning_rate": 9.05126909468871e-06, "loss": 1.3388, "step": 1688 }, { "epoch": 2.1461245235069883, "grad_norm": 1.5828322115240465, "learning_rate": 9.041207090380035e-06, "loss": 1.8861, "step": 1689 }, { "epoch": 2.147395171537484, "grad_norm": 1.5801839038857823, "learning_rate": 9.031146065702316e-06, "loss": 1.6062, "step": 1690 }, { "epoch": 2.1486658195679795, "grad_norm": 1.6933001742313516, "learning_rate": 9.021086030935248e-06, "loss": 1.6028, "step": 1691 }, { "epoch": 2.1499364675984753, "grad_norm": 1.4970349556990488, "learning_rate": 9.011026996357504e-06, "loss": 1.5388, "step": 1692 }, { "epoch": 2.1512071156289707, "grad_norm": 1.5235751770699257, "learning_rate": 9.000968972246734e-06, "loss": 1.6252, "step": 1693 }, { "epoch": 2.1524777636594665, "grad_norm": 1.5804044095803875, "learning_rate": 8.990911968879566e-06, "loss": 1.6521, "step": 1694 }, { "epoch": 2.153748411689962, "grad_norm": 1.7761792114379973, "learning_rate": 8.98085599653157e-06, "loss": 1.5486, "step": 1695 }, { "epoch": 2.1550190597204573, "grad_norm": 1.734623727959135, "learning_rate": 8.970801065477276e-06, "loss": 1.6489, "step": 1696 }, { "epoch": 2.156289707750953, "grad_norm": 1.4507938151329542, "learning_rate": 8.960747185990147e-06, "loss": 1.2433, "step": 1697 }, { "epoch": 2.1575603557814484, "grad_norm": 1.4898993070280564, "learning_rate": 8.950694368342568e-06, "loss": 1.4389, "step": 1698 }, { "epoch": 2.1588310038119443, "grad_norm": 1.570539349783722, "learning_rate": 8.94064262280584e-06, "loss": 1.5924, "step": 1699 }, { "epoch": 2.1601016518424396, "grad_norm": 1.6282082966926994, "learning_rate": 8.930591959650173e-06, "loss": 1.6758, "step": 1700 }, { "epoch": 2.161372299872935, "grad_norm": 1.4261021613348666, "learning_rate": 8.920542389144663e-06, "loss": 1.5793, "step": 1701 }, { "epoch": 2.162642947903431, "grad_norm": 1.5578874586391542, "learning_rate": 8.910493921557293e-06, "loss": 1.6233, "step": 1702 }, { "epoch": 2.163913595933926, "grad_norm": 1.7188727597552311, "learning_rate": 8.900446567154924e-06, "loss": 1.4443, "step": 1703 }, { "epoch": 2.165184243964422, "grad_norm": 1.4826880086010712, "learning_rate": 8.89040033620328e-06, "loss": 1.5363, "step": 1704 }, { "epoch": 2.1664548919949174, "grad_norm": 1.5783201593369776, "learning_rate": 8.880355238966923e-06, "loss": 1.626, "step": 1705 }, { "epoch": 2.1677255400254127, "grad_norm": 1.6043102679888508, "learning_rate": 8.870311285709274e-06, "loss": 1.1562, "step": 1706 }, { "epoch": 2.1689961880559085, "grad_norm": 1.6574392377919835, "learning_rate": 8.860268486692575e-06, "loss": 1.8377, "step": 1707 }, { "epoch": 2.170266836086404, "grad_norm": 1.4871357831315282, "learning_rate": 8.85022685217789e-06, "loss": 1.457, "step": 1708 }, { "epoch": 2.1715374841168997, "grad_norm": 1.5616476648026785, "learning_rate": 8.840186392425098e-06, "loss": 1.6417, "step": 1709 }, { "epoch": 2.172808132147395, "grad_norm": 1.7895366394288845, "learning_rate": 8.830147117692876e-06, "loss": 1.6573, "step": 1710 }, { "epoch": 2.174078780177891, "grad_norm": 1.7137363094879385, "learning_rate": 8.820109038238682e-06, "loss": 1.6524, "step": 1711 }, { "epoch": 2.1753494282083863, "grad_norm": 1.845553280517191, "learning_rate": 8.810072164318766e-06, "loss": 1.6691, "step": 1712 }, { "epoch": 2.1766200762388817, "grad_norm": 1.6081167744680605, "learning_rate": 8.80003650618813e-06, "loss": 1.47, "step": 1713 }, { "epoch": 2.1778907242693775, "grad_norm": 1.5525755260553589, "learning_rate": 8.790002074100556e-06, "loss": 1.3088, "step": 1714 }, { "epoch": 2.179161372299873, "grad_norm": 10.268603491141855, "learning_rate": 8.779968878308554e-06, "loss": 1.6433, "step": 1715 }, { "epoch": 2.1804320203303686, "grad_norm": 1.4235078712294176, "learning_rate": 8.769936929063381e-06, "loss": 1.5117, "step": 1716 }, { "epoch": 2.181702668360864, "grad_norm": 1.7351705689202577, "learning_rate": 8.75990623661501e-06, "loss": 1.6901, "step": 1717 }, { "epoch": 2.1829733163913594, "grad_norm": 1.4694761500743818, "learning_rate": 8.749876811212144e-06, "loss": 1.6197, "step": 1718 }, { "epoch": 2.184243964421855, "grad_norm": 1.516154165521079, "learning_rate": 8.739848663102176e-06, "loss": 1.5381, "step": 1719 }, { "epoch": 2.1855146124523506, "grad_norm": 1.6996412072679006, "learning_rate": 8.729821802531213e-06, "loss": 1.5123, "step": 1720 }, { "epoch": 2.1867852604828464, "grad_norm": 1.3431724285108444, "learning_rate": 8.719796239744029e-06, "loss": 1.5364, "step": 1721 }, { "epoch": 2.1880559085133418, "grad_norm": 1.5892564675817367, "learning_rate": 8.70977198498408e-06, "loss": 1.7507, "step": 1722 }, { "epoch": 2.189326556543837, "grad_norm": 1.5438579499894396, "learning_rate": 8.699749048493483e-06, "loss": 1.4018, "step": 1723 }, { "epoch": 2.190597204574333, "grad_norm": 1.4519940531190099, "learning_rate": 8.689727440513013e-06, "loss": 1.672, "step": 1724 }, { "epoch": 2.1918678526048283, "grad_norm": 1.703684111028615, "learning_rate": 8.679707171282073e-06, "loss": 1.2744, "step": 1725 }, { "epoch": 2.193138500635324, "grad_norm": 1.4577231337063985, "learning_rate": 8.669688251038726e-06, "loss": 1.6224, "step": 1726 }, { "epoch": 2.1944091486658195, "grad_norm": 1.4475200873626741, "learning_rate": 8.659670690019626e-06, "loss": 1.508, "step": 1727 }, { "epoch": 2.1956797966963153, "grad_norm": 1.5782944530270577, "learning_rate": 8.64965449846006e-06, "loss": 1.4206, "step": 1728 }, { "epoch": 2.1969504447268107, "grad_norm": 1.4495894755794485, "learning_rate": 8.639639686593904e-06, "loss": 1.5102, "step": 1729 }, { "epoch": 2.198221092757306, "grad_norm": 1.7498846987619256, "learning_rate": 8.62962626465363e-06, "loss": 1.5369, "step": 1730 }, { "epoch": 2.199491740787802, "grad_norm": 1.5255113933892452, "learning_rate": 8.61961424287028e-06, "loss": 1.3851, "step": 1731 }, { "epoch": 2.200762388818297, "grad_norm": 1.6512648505337741, "learning_rate": 8.609603631473487e-06, "loss": 1.53, "step": 1732 }, { "epoch": 2.202033036848793, "grad_norm": 1.5399236926216269, "learning_rate": 8.599594440691419e-06, "loss": 1.4815, "step": 1733 }, { "epoch": 2.2033036848792884, "grad_norm": 1.689193523447338, "learning_rate": 8.58958668075081e-06, "loss": 1.5956, "step": 1734 }, { "epoch": 2.204574332909784, "grad_norm": 1.5459365286861693, "learning_rate": 8.579580361876917e-06, "loss": 1.5584, "step": 1735 }, { "epoch": 2.2058449809402796, "grad_norm": 1.6568848921883528, "learning_rate": 8.56957549429354e-06, "loss": 1.5814, "step": 1736 }, { "epoch": 2.207115628970775, "grad_norm": 1.61258329416859, "learning_rate": 8.55957208822298e-06, "loss": 1.4887, "step": 1737 }, { "epoch": 2.2083862770012708, "grad_norm": 1.753927164526745, "learning_rate": 8.549570153886062e-06, "loss": 1.5408, "step": 1738 }, { "epoch": 2.209656925031766, "grad_norm": 1.4274659907296536, "learning_rate": 8.539569701502096e-06, "loss": 1.5375, "step": 1739 }, { "epoch": 2.210927573062262, "grad_norm": 1.620886641379908, "learning_rate": 8.529570741288882e-06, "loss": 1.6127, "step": 1740 }, { "epoch": 2.2121982210927573, "grad_norm": 1.4694381578113502, "learning_rate": 8.519573283462688e-06, "loss": 1.6902, "step": 1741 }, { "epoch": 2.2134688691232527, "grad_norm": 1.4208135400626172, "learning_rate": 8.509577338238255e-06, "loss": 1.759, "step": 1742 }, { "epoch": 2.2147395171537485, "grad_norm": 1.6453102941428899, "learning_rate": 8.499582915828782e-06, "loss": 1.5061, "step": 1743 }, { "epoch": 2.216010165184244, "grad_norm": 1.5204855654557505, "learning_rate": 8.489590026445902e-06, "loss": 1.5374, "step": 1744 }, { "epoch": 2.2172808132147397, "grad_norm": 1.9856026444957109, "learning_rate": 8.479598680299686e-06, "loss": 1.3209, "step": 1745 }, { "epoch": 2.218551461245235, "grad_norm": 1.675785199137644, "learning_rate": 8.46960888759863e-06, "loss": 1.6129, "step": 1746 }, { "epoch": 2.2198221092757304, "grad_norm": 1.4868040261292785, "learning_rate": 8.459620658549638e-06, "loss": 1.5243, "step": 1747 }, { "epoch": 2.2210927573062262, "grad_norm": 1.6585116655037806, "learning_rate": 8.449634003358022e-06, "loss": 1.4083, "step": 1748 }, { "epoch": 2.2223634053367216, "grad_norm": 1.6873083046784862, "learning_rate": 8.439648932227483e-06, "loss": 1.3082, "step": 1749 }, { "epoch": 2.2236340533672174, "grad_norm": 1.6185024009440863, "learning_rate": 8.429665455360107e-06, "loss": 1.6422, "step": 1750 }, { "epoch": 2.224904701397713, "grad_norm": 1.424835120406413, "learning_rate": 8.419683582956343e-06, "loss": 1.4796, "step": 1751 }, { "epoch": 2.2261753494282086, "grad_norm": 1.6087218356524635, "learning_rate": 8.40970332521501e-06, "loss": 1.3767, "step": 1752 }, { "epoch": 2.227445997458704, "grad_norm": 1.513737894452076, "learning_rate": 8.39972469233327e-06, "loss": 1.512, "step": 1753 }, { "epoch": 2.2287166454891993, "grad_norm": 1.370802180357388, "learning_rate": 8.389747694506626e-06, "loss": 1.3081, "step": 1754 }, { "epoch": 2.229987293519695, "grad_norm": 1.4791124657047963, "learning_rate": 8.379772341928916e-06, "loss": 1.7237, "step": 1755 }, { "epoch": 2.2312579415501905, "grad_norm": 1.5737279887950195, "learning_rate": 8.369798644792295e-06, "loss": 1.5387, "step": 1756 }, { "epoch": 2.2325285895806863, "grad_norm": 1.5662459212887674, "learning_rate": 8.359826613287218e-06, "loss": 1.2879, "step": 1757 }, { "epoch": 2.2337992376111817, "grad_norm": 1.799364522743647, "learning_rate": 8.349856257602453e-06, "loss": 1.5041, "step": 1758 }, { "epoch": 2.235069885641677, "grad_norm": 1.5973977396672083, "learning_rate": 8.33988758792504e-06, "loss": 1.4523, "step": 1759 }, { "epoch": 2.236340533672173, "grad_norm": 2.074809025149893, "learning_rate": 8.329920614440306e-06, "loss": 1.5942, "step": 1760 }, { "epoch": 2.2376111817026683, "grad_norm": 1.4469566898299258, "learning_rate": 8.319955347331847e-06, "loss": 1.5738, "step": 1761 }, { "epoch": 2.238881829733164, "grad_norm": 1.667672361818746, "learning_rate": 8.309991796781512e-06, "loss": 1.591, "step": 1762 }, { "epoch": 2.2401524777636594, "grad_norm": 1.5784668955515517, "learning_rate": 8.300029972969389e-06, "loss": 1.8242, "step": 1763 }, { "epoch": 2.241423125794155, "grad_norm": 1.5327500460115688, "learning_rate": 8.290069886073815e-06, "loss": 1.4986, "step": 1764 }, { "epoch": 2.2426937738246506, "grad_norm": 1.713595354060149, "learning_rate": 8.280111546271342e-06, "loss": 1.6527, "step": 1765 }, { "epoch": 2.243964421855146, "grad_norm": 1.5073458704886182, "learning_rate": 8.270154963736737e-06, "loss": 1.6408, "step": 1766 }, { "epoch": 2.245235069885642, "grad_norm": 1.7682102522502663, "learning_rate": 8.260200148642982e-06, "loss": 1.4513, "step": 1767 }, { "epoch": 2.246505717916137, "grad_norm": 1.6063360903705282, "learning_rate": 8.250247111161248e-06, "loss": 1.5636, "step": 1768 }, { "epoch": 2.247776365946633, "grad_norm": 1.8512710323136712, "learning_rate": 8.24029586146088e-06, "loss": 1.5229, "step": 1769 }, { "epoch": 2.2490470139771284, "grad_norm": 1.506792015467357, "learning_rate": 8.230346409709414e-06, "loss": 1.6667, "step": 1770 }, { "epoch": 2.2503176620076237, "grad_norm": 1.490949180957695, "learning_rate": 8.220398766072526e-06, "loss": 1.5765, "step": 1771 }, { "epoch": 2.2515883100381195, "grad_norm": 1.5367252773147595, "learning_rate": 8.210452940714072e-06, "loss": 1.588, "step": 1772 }, { "epoch": 2.252858958068615, "grad_norm": 1.5451088380268685, "learning_rate": 8.20050894379603e-06, "loss": 1.5953, "step": 1773 }, { "epoch": 2.2541296060991107, "grad_norm": 1.3478757161343975, "learning_rate": 8.190566785478517e-06, "loss": 1.5497, "step": 1774 }, { "epoch": 2.255400254129606, "grad_norm": 1.5285510060778615, "learning_rate": 8.180626475919768e-06, "loss": 1.6153, "step": 1775 }, { "epoch": 2.2566709021601015, "grad_norm": 1.7740152739154276, "learning_rate": 8.170688025276134e-06, "loss": 1.6365, "step": 1776 }, { "epoch": 2.2579415501905973, "grad_norm": 1.7135233218143595, "learning_rate": 8.160751443702062e-06, "loss": 1.5003, "step": 1777 }, { "epoch": 2.2592121982210926, "grad_norm": 1.408636468122467, "learning_rate": 8.150816741350099e-06, "loss": 1.6208, "step": 1778 }, { "epoch": 2.2604828462515885, "grad_norm": 1.841401544872553, "learning_rate": 8.140883928370855e-06, "loss": 1.6257, "step": 1779 }, { "epoch": 2.261753494282084, "grad_norm": 1.8089992703143447, "learning_rate": 8.130953014913025e-06, "loss": 1.5855, "step": 1780 }, { "epoch": 2.263024142312579, "grad_norm": 1.382352287583554, "learning_rate": 8.121024011123353e-06, "loss": 1.4472, "step": 1781 }, { "epoch": 2.264294790343075, "grad_norm": 1.9623033136293564, "learning_rate": 8.11109692714664e-06, "loss": 1.5006, "step": 1782 }, { "epoch": 2.2655654383735704, "grad_norm": 1.4466012909581296, "learning_rate": 8.101171773125716e-06, "loss": 1.4422, "step": 1783 }, { "epoch": 2.266836086404066, "grad_norm": 1.4731390191249392, "learning_rate": 8.091248559201453e-06, "loss": 1.5608, "step": 1784 }, { "epoch": 2.2681067344345616, "grad_norm": 1.9585030783409343, "learning_rate": 8.081327295512726e-06, "loss": 1.5666, "step": 1785 }, { "epoch": 2.2693773824650574, "grad_norm": 1.8934712477147808, "learning_rate": 8.071407992196428e-06, "loss": 1.6951, "step": 1786 }, { "epoch": 2.2706480304955527, "grad_norm": 1.4255685952424664, "learning_rate": 8.061490659387441e-06, "loss": 1.5243, "step": 1787 }, { "epoch": 2.271918678526048, "grad_norm": 1.6125269843284684, "learning_rate": 8.051575307218637e-06, "loss": 1.6832, "step": 1788 }, { "epoch": 2.273189326556544, "grad_norm": 1.5699039685068195, "learning_rate": 8.041661945820866e-06, "loss": 1.5794, "step": 1789 }, { "epoch": 2.2744599745870393, "grad_norm": 1.4677802591183078, "learning_rate": 8.031750585322948e-06, "loss": 1.7092, "step": 1790 }, { "epoch": 2.275730622617535, "grad_norm": 1.6717478208537573, "learning_rate": 8.021841235851646e-06, "loss": 1.6947, "step": 1791 }, { "epoch": 2.2770012706480305, "grad_norm": 1.4459052193819664, "learning_rate": 8.01193390753168e-06, "loss": 1.5133, "step": 1792 }, { "epoch": 2.2782719186785263, "grad_norm": 1.3865773438986266, "learning_rate": 8.002028610485695e-06, "loss": 1.4625, "step": 1793 }, { "epoch": 2.2795425667090217, "grad_norm": 1.488213716387411, "learning_rate": 7.992125354834273e-06, "loss": 1.7199, "step": 1794 }, { "epoch": 2.280813214739517, "grad_norm": 1.388050280743275, "learning_rate": 7.982224150695896e-06, "loss": 1.5348, "step": 1795 }, { "epoch": 2.282083862770013, "grad_norm": 1.3073310825896807, "learning_rate": 7.972325008186966e-06, "loss": 1.4043, "step": 1796 }, { "epoch": 2.283354510800508, "grad_norm": 1.8266034226850034, "learning_rate": 7.962427937421763e-06, "loss": 1.6173, "step": 1797 }, { "epoch": 2.2846251588310036, "grad_norm": 1.580567884125329, "learning_rate": 7.952532948512464e-06, "loss": 1.4702, "step": 1798 }, { "epoch": 2.2858958068614994, "grad_norm": 1.729096492111216, "learning_rate": 7.942640051569102e-06, "loss": 1.831, "step": 1799 }, { "epoch": 2.2871664548919948, "grad_norm": 1.6894573030661406, "learning_rate": 7.932749256699588e-06, "loss": 1.4857, "step": 1800 }, { "epoch": 2.2884371029224906, "grad_norm": 1.6467272171651914, "learning_rate": 7.92286057400968e-06, "loss": 1.6362, "step": 1801 }, { "epoch": 2.289707750952986, "grad_norm": 1.6330545870487407, "learning_rate": 7.91297401360298e-06, "loss": 1.6352, "step": 1802 }, { "epoch": 2.2909783989834818, "grad_norm": 1.4708231561749436, "learning_rate": 7.903089585580914e-06, "loss": 1.5156, "step": 1803 }, { "epoch": 2.292249047013977, "grad_norm": 1.7160301684303687, "learning_rate": 7.89320730004274e-06, "loss": 1.4859, "step": 1804 }, { "epoch": 2.2935196950444725, "grad_norm": 1.6527094214901459, "learning_rate": 7.883327167085514e-06, "loss": 1.4175, "step": 1805 }, { "epoch": 2.2947903430749683, "grad_norm": 1.6225243402916338, "learning_rate": 7.873449196804106e-06, "loss": 1.3302, "step": 1806 }, { "epoch": 2.2960609911054637, "grad_norm": 1.6122001794500487, "learning_rate": 7.863573399291169e-06, "loss": 1.5241, "step": 1807 }, { "epoch": 2.2973316391359595, "grad_norm": 1.69386117216537, "learning_rate": 7.853699784637139e-06, "loss": 1.4738, "step": 1808 }, { "epoch": 2.298602287166455, "grad_norm": 1.6021965442081367, "learning_rate": 7.843828362930217e-06, "loss": 1.5937, "step": 1809 }, { "epoch": 2.2998729351969507, "grad_norm": 1.4471567250070998, "learning_rate": 7.83395914425637e-06, "loss": 1.5809, "step": 1810 }, { "epoch": 2.301143583227446, "grad_norm": 1.5467670582026032, "learning_rate": 7.824092138699307e-06, "loss": 1.4007, "step": 1811 }, { "epoch": 2.3024142312579414, "grad_norm": 1.5768834499096565, "learning_rate": 7.81422735634048e-06, "loss": 1.3662, "step": 1812 }, { "epoch": 2.3036848792884372, "grad_norm": 1.3139335059190138, "learning_rate": 7.804364807259071e-06, "loss": 1.5668, "step": 1813 }, { "epoch": 2.3049555273189326, "grad_norm": 1.4085970135760213, "learning_rate": 7.794504501531978e-06, "loss": 1.5215, "step": 1814 }, { "epoch": 2.306226175349428, "grad_norm": 1.4314080687544268, "learning_rate": 7.784646449233806e-06, "loss": 1.6986, "step": 1815 }, { "epoch": 2.307496823379924, "grad_norm": 1.5284739850338658, "learning_rate": 7.774790660436857e-06, "loss": 1.5482, "step": 1816 }, { "epoch": 2.308767471410419, "grad_norm": 1.4524893798000362, "learning_rate": 7.764937145211126e-06, "loss": 1.4476, "step": 1817 }, { "epoch": 2.310038119440915, "grad_norm": 1.6073140233069667, "learning_rate": 7.755085913624274e-06, "loss": 1.5359, "step": 1818 }, { "epoch": 2.3113087674714103, "grad_norm": 1.3996353796614822, "learning_rate": 7.745236975741643e-06, "loss": 1.554, "step": 1819 }, { "epoch": 2.312579415501906, "grad_norm": 1.9511188694547437, "learning_rate": 7.735390341626223e-06, "loss": 1.7084, "step": 1820 }, { "epoch": 2.3138500635324015, "grad_norm": 1.752974608444825, "learning_rate": 7.725546021338645e-06, "loss": 1.4784, "step": 1821 }, { "epoch": 2.315120711562897, "grad_norm": 1.574142271538089, "learning_rate": 7.715704024937188e-06, "loss": 1.4726, "step": 1822 }, { "epoch": 2.3163913595933927, "grad_norm": 1.439019695120883, "learning_rate": 7.705864362477751e-06, "loss": 1.4401, "step": 1823 }, { "epoch": 2.317662007623888, "grad_norm": 1.644489663644214, "learning_rate": 7.696027044013842e-06, "loss": 1.5001, "step": 1824 }, { "epoch": 2.318932655654384, "grad_norm": 1.72913661111972, "learning_rate": 7.686192079596586e-06, "loss": 1.5118, "step": 1825 }, { "epoch": 2.3202033036848793, "grad_norm": 1.5907616296090283, "learning_rate": 7.676359479274697e-06, "loss": 1.4179, "step": 1826 }, { "epoch": 2.321473951715375, "grad_norm": 1.5720060242626939, "learning_rate": 7.666529253094469e-06, "loss": 1.5574, "step": 1827 }, { "epoch": 2.3227445997458704, "grad_norm": 1.4179955050324362, "learning_rate": 7.656701411099777e-06, "loss": 1.3249, "step": 1828 }, { "epoch": 2.324015247776366, "grad_norm": 1.666878305887207, "learning_rate": 7.646875963332056e-06, "loss": 1.5944, "step": 1829 }, { "epoch": 2.3252858958068616, "grad_norm": 1.3842116235724446, "learning_rate": 7.637052919830303e-06, "loss": 1.609, "step": 1830 }, { "epoch": 2.326556543837357, "grad_norm": 1.5360748481078, "learning_rate": 7.627232290631045e-06, "loss": 1.6189, "step": 1831 }, { "epoch": 2.3278271918678524, "grad_norm": 1.400367893827521, "learning_rate": 7.617414085768352e-06, "loss": 1.3703, "step": 1832 }, { "epoch": 2.329097839898348, "grad_norm": 1.5844149233811378, "learning_rate": 7.607598315273812e-06, "loss": 1.6094, "step": 1833 }, { "epoch": 2.3303684879288435, "grad_norm": 1.3974980324420434, "learning_rate": 7.59778498917653e-06, "loss": 1.5484, "step": 1834 }, { "epoch": 2.3316391359593394, "grad_norm": 1.4213588910308288, "learning_rate": 7.587974117503107e-06, "loss": 1.4974, "step": 1835 }, { "epoch": 2.3329097839898347, "grad_norm": 1.4614803253524362, "learning_rate": 7.578165710277648e-06, "loss": 1.5113, "step": 1836 }, { "epoch": 2.3341804320203305, "grad_norm": 1.8778425505561283, "learning_rate": 7.568359777521728e-06, "loss": 1.4722, "step": 1837 }, { "epoch": 2.335451080050826, "grad_norm": 1.5484667982235163, "learning_rate": 7.558556329254397e-06, "loss": 1.3134, "step": 1838 }, { "epoch": 2.3367217280813213, "grad_norm": 1.566989203339903, "learning_rate": 7.548755375492173e-06, "loss": 1.5581, "step": 1839 }, { "epoch": 2.337992376111817, "grad_norm": 1.4639957631528235, "learning_rate": 7.538956926249013e-06, "loss": 1.2382, "step": 1840 }, { "epoch": 2.3392630241423125, "grad_norm": 1.4873332167896953, "learning_rate": 7.5291609915363255e-06, "loss": 1.5546, "step": 1841 }, { "epoch": 2.3405336721728083, "grad_norm": 1.5861369473390936, "learning_rate": 7.519367581362949e-06, "loss": 1.7043, "step": 1842 }, { "epoch": 2.3418043202033036, "grad_norm": 1.5022068690318784, "learning_rate": 7.509576705735136e-06, "loss": 1.7078, "step": 1843 }, { "epoch": 2.3430749682337995, "grad_norm": 1.3789401080202535, "learning_rate": 7.499788374656556e-06, "loss": 1.693, "step": 1844 }, { "epoch": 2.344345616264295, "grad_norm": 1.295675618650908, "learning_rate": 7.490002598128276e-06, "loss": 1.5228, "step": 1845 }, { "epoch": 2.34561626429479, "grad_norm": 1.584834337879101, "learning_rate": 7.480219386148751e-06, "loss": 1.4103, "step": 1846 }, { "epoch": 2.346886912325286, "grad_norm": 1.8601162293779845, "learning_rate": 7.470438748713815e-06, "loss": 1.5678, "step": 1847 }, { "epoch": 2.3481575603557814, "grad_norm": 1.7949723193377425, "learning_rate": 7.4606606958166836e-06, "loss": 1.6394, "step": 1848 }, { "epoch": 2.349428208386277, "grad_norm": 1.5699342047653413, "learning_rate": 7.450885237447913e-06, "loss": 1.5852, "step": 1849 }, { "epoch": 2.3506988564167726, "grad_norm": 1.736272312862605, "learning_rate": 7.441112383595424e-06, "loss": 1.48, "step": 1850 }, { "epoch": 2.351969504447268, "grad_norm": 1.3995849038786334, "learning_rate": 7.431342144244466e-06, "loss": 1.6927, "step": 1851 }, { "epoch": 2.3532401524777637, "grad_norm": 1.5149788541962488, "learning_rate": 7.421574529377623e-06, "loss": 1.5541, "step": 1852 }, { "epoch": 2.354510800508259, "grad_norm": 1.6383395325740433, "learning_rate": 7.411809548974792e-06, "loss": 1.7801, "step": 1853 }, { "epoch": 2.355781448538755, "grad_norm": 1.6062109121903854, "learning_rate": 7.4020472130131905e-06, "loss": 1.3986, "step": 1854 }, { "epoch": 2.3570520965692503, "grad_norm": 1.5758152979946658, "learning_rate": 7.392287531467316e-06, "loss": 1.6625, "step": 1855 }, { "epoch": 2.3583227445997457, "grad_norm": 1.4684572170126242, "learning_rate": 7.3825305143089675e-06, "loss": 1.4875, "step": 1856 }, { "epoch": 2.3595933926302415, "grad_norm": 1.4137996834272322, "learning_rate": 7.372776171507221e-06, "loss": 1.6975, "step": 1857 }, { "epoch": 2.360864040660737, "grad_norm": 1.377852716087896, "learning_rate": 7.363024513028407e-06, "loss": 1.669, "step": 1858 }, { "epoch": 2.3621346886912327, "grad_norm": 1.5830834118017767, "learning_rate": 7.353275548836132e-06, "loss": 1.3004, "step": 1859 }, { "epoch": 2.363405336721728, "grad_norm": 1.4210056302021212, "learning_rate": 7.343529288891239e-06, "loss": 1.3457, "step": 1860 }, { "epoch": 2.364675984752224, "grad_norm": 1.7089106539251813, "learning_rate": 7.333785743151806e-06, "loss": 1.6682, "step": 1861 }, { "epoch": 2.365946632782719, "grad_norm": 1.4693395253273343, "learning_rate": 7.3240449215731435e-06, "loss": 1.5347, "step": 1862 }, { "epoch": 2.3672172808132146, "grad_norm": 1.607552134774339, "learning_rate": 7.314306834107779e-06, "loss": 1.437, "step": 1863 }, { "epoch": 2.3684879288437104, "grad_norm": 1.5446117794334342, "learning_rate": 7.3045714907054345e-06, "loss": 1.5402, "step": 1864 }, { "epoch": 2.3697585768742058, "grad_norm": 1.5876854232853284, "learning_rate": 7.2948389013130486e-06, "loss": 1.3766, "step": 1865 }, { "epoch": 2.3710292249047016, "grad_norm": 1.9756101593829416, "learning_rate": 7.2851090758747325e-06, "loss": 1.6314, "step": 1866 }, { "epoch": 2.372299872935197, "grad_norm": 1.4918518931463847, "learning_rate": 7.275382024331773e-06, "loss": 1.4346, "step": 1867 }, { "epoch": 2.3735705209656923, "grad_norm": 1.4528251214023307, "learning_rate": 7.265657756622628e-06, "loss": 1.3892, "step": 1868 }, { "epoch": 2.374841168996188, "grad_norm": 1.6147228175947876, "learning_rate": 7.25593628268291e-06, "loss": 1.5138, "step": 1869 }, { "epoch": 2.3761118170266835, "grad_norm": 1.5053062534440564, "learning_rate": 7.246217612445368e-06, "loss": 1.6059, "step": 1870 }, { "epoch": 2.3773824650571793, "grad_norm": 1.9562779959252934, "learning_rate": 7.236501755839904e-06, "loss": 1.2351, "step": 1871 }, { "epoch": 2.3786531130876747, "grad_norm": 1.345287123205452, "learning_rate": 7.226788722793533e-06, "loss": 1.5513, "step": 1872 }, { "epoch": 2.37992376111817, "grad_norm": 1.6129590200155135, "learning_rate": 7.217078523230388e-06, "loss": 1.5787, "step": 1873 }, { "epoch": 2.381194409148666, "grad_norm": 1.4423268084620797, "learning_rate": 7.2073711670717e-06, "loss": 1.3152, "step": 1874 }, { "epoch": 2.3824650571791612, "grad_norm": 1.53578538307741, "learning_rate": 7.1976666642358105e-06, "loss": 1.6515, "step": 1875 }, { "epoch": 2.383735705209657, "grad_norm": 1.6359029671002592, "learning_rate": 7.187965024638127e-06, "loss": 1.5311, "step": 1876 }, { "epoch": 2.3850063532401524, "grad_norm": 1.4939051880974468, "learning_rate": 7.178266258191149e-06, "loss": 1.4407, "step": 1877 }, { "epoch": 2.3862770012706482, "grad_norm": 1.5770847918080684, "learning_rate": 7.168570374804428e-06, "loss": 1.4534, "step": 1878 }, { "epoch": 2.3875476493011436, "grad_norm": 1.5861571880340288, "learning_rate": 7.158877384384577e-06, "loss": 1.5207, "step": 1879 }, { "epoch": 2.388818297331639, "grad_norm": 1.6339465749340256, "learning_rate": 7.149187296835247e-06, "loss": 1.6262, "step": 1880 }, { "epoch": 2.390088945362135, "grad_norm": 1.3860018025537892, "learning_rate": 7.13950012205713e-06, "loss": 1.4204, "step": 1881 }, { "epoch": 2.39135959339263, "grad_norm": 1.5066011151313867, "learning_rate": 7.129815869947931e-06, "loss": 1.4559, "step": 1882 }, { "epoch": 2.392630241423126, "grad_norm": 1.6645327284475002, "learning_rate": 7.1201345504023855e-06, "loss": 1.4492, "step": 1883 }, { "epoch": 2.3939008894536213, "grad_norm": 1.5789075326437572, "learning_rate": 7.110456173312218e-06, "loss": 1.5254, "step": 1884 }, { "epoch": 2.3951715374841167, "grad_norm": 1.6463321606552945, "learning_rate": 7.100780748566154e-06, "loss": 1.643, "step": 1885 }, { "epoch": 2.3964421855146125, "grad_norm": 1.763048767867577, "learning_rate": 7.091108286049898e-06, "loss": 1.444, "step": 1886 }, { "epoch": 2.397712833545108, "grad_norm": 1.5902629901340428, "learning_rate": 7.081438795646129e-06, "loss": 1.6437, "step": 1887 }, { "epoch": 2.3989834815756037, "grad_norm": 1.416496098097569, "learning_rate": 7.071772287234497e-06, "loss": 1.3835, "step": 1888 }, { "epoch": 2.400254129606099, "grad_norm": 1.7152467379497507, "learning_rate": 7.062108770691594e-06, "loss": 1.4119, "step": 1889 }, { "epoch": 2.4015247776365944, "grad_norm": 1.4841722169436224, "learning_rate": 7.052448255890958e-06, "loss": 1.4004, "step": 1890 }, { "epoch": 2.4027954256670903, "grad_norm": 1.451227129551864, "learning_rate": 7.042790752703068e-06, "loss": 1.5291, "step": 1891 }, { "epoch": 2.4040660736975856, "grad_norm": 1.7429828031780958, "learning_rate": 7.033136270995313e-06, "loss": 1.217, "step": 1892 }, { "epoch": 2.4053367217280814, "grad_norm": 1.648381923053194, "learning_rate": 7.023484820632005e-06, "loss": 1.5879, "step": 1893 }, { "epoch": 2.406607369758577, "grad_norm": 1.646583274623715, "learning_rate": 7.013836411474358e-06, "loss": 1.2618, "step": 1894 }, { "epoch": 2.4078780177890726, "grad_norm": 1.5842634112121, "learning_rate": 7.004191053380469e-06, "loss": 1.4015, "step": 1895 }, { "epoch": 2.409148665819568, "grad_norm": 1.5471584771822826, "learning_rate": 6.994548756205332e-06, "loss": 1.5965, "step": 1896 }, { "epoch": 2.4104193138500634, "grad_norm": 1.472937778908629, "learning_rate": 6.984909529800804e-06, "loss": 1.5414, "step": 1897 }, { "epoch": 2.411689961880559, "grad_norm": 1.725692441507782, "learning_rate": 6.975273384015604e-06, "loss": 1.4092, "step": 1898 }, { "epoch": 2.4129606099110545, "grad_norm": 1.6991877267409594, "learning_rate": 6.965640328695307e-06, "loss": 1.4633, "step": 1899 }, { "epoch": 2.4142312579415504, "grad_norm": 1.3888509550747612, "learning_rate": 6.956010373682334e-06, "loss": 1.4386, "step": 1900 }, { "epoch": 2.4155019059720457, "grad_norm": 1.6374270895416039, "learning_rate": 6.9463835288159295e-06, "loss": 1.457, "step": 1901 }, { "epoch": 2.4167725540025415, "grad_norm": 1.797814081810981, "learning_rate": 6.936759803932167e-06, "loss": 1.3279, "step": 1902 }, { "epoch": 2.418043202033037, "grad_norm": 1.5405246741346568, "learning_rate": 6.927139208863929e-06, "loss": 1.6427, "step": 1903 }, { "epoch": 2.4193138500635323, "grad_norm": 1.6513898886347445, "learning_rate": 6.917521753440899e-06, "loss": 1.5625, "step": 1904 }, { "epoch": 2.420584498094028, "grad_norm": 1.4660904718823222, "learning_rate": 6.9079074474895545e-06, "loss": 1.4887, "step": 1905 }, { "epoch": 2.4218551461245235, "grad_norm": 1.617362131355844, "learning_rate": 6.8982963008331605e-06, "loss": 1.4095, "step": 1906 }, { "epoch": 2.423125794155019, "grad_norm": 1.657786559949924, "learning_rate": 6.888688323291746e-06, "loss": 1.474, "step": 1907 }, { "epoch": 2.4243964421855146, "grad_norm": 1.541370968546513, "learning_rate": 6.879083524682102e-06, "loss": 1.3467, "step": 1908 }, { "epoch": 2.42566709021601, "grad_norm": 2.2101482935442056, "learning_rate": 6.869481914817779e-06, "loss": 1.5522, "step": 1909 }, { "epoch": 2.426937738246506, "grad_norm": 1.703555663572547, "learning_rate": 6.859883503509062e-06, "loss": 1.2727, "step": 1910 }, { "epoch": 2.428208386277001, "grad_norm": 1.603473275026458, "learning_rate": 6.850288300562966e-06, "loss": 1.5417, "step": 1911 }, { "epoch": 2.429479034307497, "grad_norm": 1.5091116759413492, "learning_rate": 6.840696315783239e-06, "loss": 1.4483, "step": 1912 }, { "epoch": 2.4307496823379924, "grad_norm": 2.0729938037695423, "learning_rate": 6.831107558970337e-06, "loss": 1.9117, "step": 1913 }, { "epoch": 2.4320203303684877, "grad_norm": 1.5659133357027375, "learning_rate": 6.821522039921407e-06, "loss": 1.4462, "step": 1914 }, { "epoch": 2.4332909783989836, "grad_norm": 2.0163592713853715, "learning_rate": 6.811939768430303e-06, "loss": 1.4736, "step": 1915 }, { "epoch": 2.434561626429479, "grad_norm": 1.6466677451967848, "learning_rate": 6.802360754287548e-06, "loss": 1.5789, "step": 1916 }, { "epoch": 2.4358322744599747, "grad_norm": 1.793437180849285, "learning_rate": 6.792785007280347e-06, "loss": 1.4847, "step": 1917 }, { "epoch": 2.43710292249047, "grad_norm": 1.497933365260444, "learning_rate": 6.7832125371925625e-06, "loss": 1.2717, "step": 1918 }, { "epoch": 2.438373570520966, "grad_norm": 1.3552963469287735, "learning_rate": 6.773643353804711e-06, "loss": 1.4794, "step": 1919 }, { "epoch": 2.4396442185514613, "grad_norm": 1.5896549816261436, "learning_rate": 6.764077466893944e-06, "loss": 1.5418, "step": 1920 }, { "epoch": 2.4409148665819567, "grad_norm": 1.4623712262992057, "learning_rate": 6.754514886234054e-06, "loss": 1.5112, "step": 1921 }, { "epoch": 2.4421855146124525, "grad_norm": 1.3949476808159789, "learning_rate": 6.7449556215954435e-06, "loss": 1.5416, "step": 1922 }, { "epoch": 2.443456162642948, "grad_norm": 1.8033252619694837, "learning_rate": 6.735399682745145e-06, "loss": 1.4908, "step": 1923 }, { "epoch": 2.444726810673443, "grad_norm": 1.5860225872596243, "learning_rate": 6.725847079446779e-06, "loss": 1.6275, "step": 1924 }, { "epoch": 2.445997458703939, "grad_norm": 1.6053507982449213, "learning_rate": 6.7162978214605615e-06, "loss": 1.2901, "step": 1925 }, { "epoch": 2.4472681067344344, "grad_norm": 1.6751208486108087, "learning_rate": 6.706751918543288e-06, "loss": 1.8313, "step": 1926 }, { "epoch": 2.44853875476493, "grad_norm": 1.7560303434584272, "learning_rate": 6.697209380448333e-06, "loss": 1.6988, "step": 1927 }, { "epoch": 2.4498094027954256, "grad_norm": 1.6864532391677458, "learning_rate": 6.687670216925621e-06, "loss": 1.6406, "step": 1928 }, { "epoch": 2.4510800508259214, "grad_norm": 1.5156868555239327, "learning_rate": 6.678134437721644e-06, "loss": 1.3891, "step": 1929 }, { "epoch": 2.4523506988564168, "grad_norm": 1.4058841595948535, "learning_rate": 6.668602052579425e-06, "loss": 1.6672, "step": 1930 }, { "epoch": 2.453621346886912, "grad_norm": 1.49639471800268, "learning_rate": 6.659073071238524e-06, "loss": 1.4048, "step": 1931 }, { "epoch": 2.454891994917408, "grad_norm": 1.4973993571816406, "learning_rate": 6.649547503435021e-06, "loss": 1.6426, "step": 1932 }, { "epoch": 2.4561626429479033, "grad_norm": 1.4078171554908248, "learning_rate": 6.640025358901509e-06, "loss": 1.511, "step": 1933 }, { "epoch": 2.457433290978399, "grad_norm": 1.580293969920001, "learning_rate": 6.6305066473670765e-06, "loss": 1.5076, "step": 1934 }, { "epoch": 2.4587039390088945, "grad_norm": 1.6589775719575122, "learning_rate": 6.6209913785573245e-06, "loss": 1.3575, "step": 1935 }, { "epoch": 2.4599745870393903, "grad_norm": 1.6922684808031945, "learning_rate": 6.611479562194314e-06, "loss": 1.406, "step": 1936 }, { "epoch": 2.4612452350698857, "grad_norm": 1.421608647626518, "learning_rate": 6.601971207996592e-06, "loss": 1.6037, "step": 1937 }, { "epoch": 2.462515883100381, "grad_norm": 1.6218271395345765, "learning_rate": 6.592466325679159e-06, "loss": 1.4237, "step": 1938 }, { "epoch": 2.463786531130877, "grad_norm": 1.6665643671210966, "learning_rate": 6.582964924953477e-06, "loss": 1.6236, "step": 1939 }, { "epoch": 2.4650571791613722, "grad_norm": 1.4575551061156442, "learning_rate": 6.573467015527439e-06, "loss": 1.6012, "step": 1940 }, { "epoch": 2.4663278271918676, "grad_norm": 1.629582440111225, "learning_rate": 6.563972607105393e-06, "loss": 1.4583, "step": 1941 }, { "epoch": 2.4675984752223634, "grad_norm": 1.5051220386961484, "learning_rate": 6.554481709388083e-06, "loss": 1.7184, "step": 1942 }, { "epoch": 2.468869123252859, "grad_norm": 1.6787388884773073, "learning_rate": 6.544994332072685e-06, "loss": 1.4597, "step": 1943 }, { "epoch": 2.4701397712833546, "grad_norm": 1.559077650508331, "learning_rate": 6.535510484852767e-06, "loss": 1.5626, "step": 1944 }, { "epoch": 2.47141041931385, "grad_norm": 1.8559133633737679, "learning_rate": 6.526030177418294e-06, "loss": 1.783, "step": 1945 }, { "epoch": 2.472681067344346, "grad_norm": 1.410279022037204, "learning_rate": 6.51655341945562e-06, "loss": 1.3541, "step": 1946 }, { "epoch": 2.473951715374841, "grad_norm": 1.5190331881625136, "learning_rate": 6.507080220647466e-06, "loss": 1.4551, "step": 1947 }, { "epoch": 2.4752223634053365, "grad_norm": 1.5160178031623226, "learning_rate": 6.497610590672916e-06, "loss": 1.4626, "step": 1948 }, { "epoch": 2.4764930114358323, "grad_norm": 1.4461522772273703, "learning_rate": 6.488144539207411e-06, "loss": 1.5452, "step": 1949 }, { "epoch": 2.4777636594663277, "grad_norm": 1.7233086915731444, "learning_rate": 6.478682075922731e-06, "loss": 1.4749, "step": 1950 }, { "epoch": 2.4790343074968235, "grad_norm": 1.316593875218675, "learning_rate": 6.469223210486992e-06, "loss": 1.7607, "step": 1951 }, { "epoch": 2.480304955527319, "grad_norm": 1.7001060873689287, "learning_rate": 6.459767952564642e-06, "loss": 1.6113, "step": 1952 }, { "epoch": 2.4815756035578147, "grad_norm": 1.4831999469527257, "learning_rate": 6.450316311816432e-06, "loss": 1.462, "step": 1953 }, { "epoch": 2.48284625158831, "grad_norm": 1.6656890074314092, "learning_rate": 6.4408682978994195e-06, "loss": 1.3112, "step": 1954 }, { "epoch": 2.4841168996188054, "grad_norm": 1.5589781912381324, "learning_rate": 6.431423920466963e-06, "loss": 1.369, "step": 1955 }, { "epoch": 2.4853875476493013, "grad_norm": 1.7347879717445733, "learning_rate": 6.421983189168695e-06, "loss": 1.544, "step": 1956 }, { "epoch": 2.4866581956797966, "grad_norm": 1.7130648976384044, "learning_rate": 6.412546113650526e-06, "loss": 1.528, "step": 1957 }, { "epoch": 2.4879288437102924, "grad_norm": 1.4912029502003834, "learning_rate": 6.403112703554643e-06, "loss": 1.4823, "step": 1958 }, { "epoch": 2.489199491740788, "grad_norm": 1.5878108147444938, "learning_rate": 6.393682968519474e-06, "loss": 1.3936, "step": 1959 }, { "epoch": 2.490470139771283, "grad_norm": 1.862810522369295, "learning_rate": 6.384256918179692e-06, "loss": 1.4847, "step": 1960 }, { "epoch": 2.491740787801779, "grad_norm": 1.5408358555456292, "learning_rate": 6.374834562166217e-06, "loss": 1.6876, "step": 1961 }, { "epoch": 2.4930114358322744, "grad_norm": 1.5269426517183589, "learning_rate": 6.365415910106181e-06, "loss": 1.6543, "step": 1962 }, { "epoch": 2.49428208386277, "grad_norm": 1.8042970266257117, "learning_rate": 6.356000971622938e-06, "loss": 1.3149, "step": 1963 }, { "epoch": 2.4955527318932655, "grad_norm": 1.5245698459517367, "learning_rate": 6.34658975633605e-06, "loss": 1.6631, "step": 1964 }, { "epoch": 2.496823379923761, "grad_norm": 1.8477085437262835, "learning_rate": 6.337182273861273e-06, "loss": 1.6602, "step": 1965 }, { "epoch": 2.4980940279542567, "grad_norm": 1.4583773739409853, "learning_rate": 6.327778533810545e-06, "loss": 1.7056, "step": 1966 }, { "epoch": 2.499364675984752, "grad_norm": 1.7380518267388891, "learning_rate": 6.318378545791988e-06, "loss": 1.6442, "step": 1967 }, { "epoch": 2.500635324015248, "grad_norm": 1.5569361056404658, "learning_rate": 6.308982319409878e-06, "loss": 1.3564, "step": 1968 }, { "epoch": 2.5019059720457433, "grad_norm": 1.4894114236528235, "learning_rate": 6.299589864264662e-06, "loss": 1.6204, "step": 1969 }, { "epoch": 2.503176620076239, "grad_norm": 1.5744140289250657, "learning_rate": 6.290201189952925e-06, "loss": 1.5788, "step": 1970 }, { "epoch": 2.5044472681067345, "grad_norm": 1.4729487530295562, "learning_rate": 6.280816306067393e-06, "loss": 1.577, "step": 1971 }, { "epoch": 2.50571791613723, "grad_norm": 1.6059178983559093, "learning_rate": 6.2714352221969155e-06, "loss": 1.6455, "step": 1972 }, { "epoch": 2.5069885641677256, "grad_norm": 1.4392169542776443, "learning_rate": 6.262057947926463e-06, "loss": 1.318, "step": 1973 }, { "epoch": 2.508259212198221, "grad_norm": 1.6892962025361826, "learning_rate": 6.252684492837107e-06, "loss": 1.411, "step": 1974 }, { "epoch": 2.5095298602287164, "grad_norm": 1.7367334870989624, "learning_rate": 6.2433148665060305e-06, "loss": 1.4848, "step": 1975 }, { "epoch": 2.510800508259212, "grad_norm": 1.6643612132262988, "learning_rate": 6.233949078506489e-06, "loss": 1.6053, "step": 1976 }, { "epoch": 2.512071156289708, "grad_norm": 1.4933363345894404, "learning_rate": 6.22458713840783e-06, "loss": 1.7322, "step": 1977 }, { "epoch": 2.5133418043202034, "grad_norm": 1.622785162327148, "learning_rate": 6.215229055775454e-06, "loss": 1.4641, "step": 1978 }, { "epoch": 2.5146124523506987, "grad_norm": 1.5606802690693409, "learning_rate": 6.205874840170833e-06, "loss": 1.6456, "step": 1979 }, { "epoch": 2.5158831003811946, "grad_norm": 1.7392960071598602, "learning_rate": 6.196524501151479e-06, "loss": 1.5845, "step": 1980 }, { "epoch": 2.51715374841169, "grad_norm": 1.5907237596599346, "learning_rate": 6.187178048270956e-06, "loss": 1.2511, "step": 1981 }, { "epoch": 2.5184243964421853, "grad_norm": 1.515123504297544, "learning_rate": 6.1778354910788465e-06, "loss": 1.593, "step": 1982 }, { "epoch": 2.519695044472681, "grad_norm": 1.6709740601289755, "learning_rate": 6.168496839120754e-06, "loss": 1.455, "step": 1983 }, { "epoch": 2.5209656925031765, "grad_norm": 1.6767746763503226, "learning_rate": 6.159162101938292e-06, "loss": 1.5791, "step": 1984 }, { "epoch": 2.5222363405336723, "grad_norm": 1.3286317631433775, "learning_rate": 6.149831289069079e-06, "loss": 1.5604, "step": 1985 }, { "epoch": 2.5235069885641677, "grad_norm": 1.3069539059345867, "learning_rate": 6.140504410046712e-06, "loss": 1.5462, "step": 1986 }, { "epoch": 2.5247776365946635, "grad_norm": 1.319442531061846, "learning_rate": 6.131181474400789e-06, "loss": 1.3234, "step": 1987 }, { "epoch": 2.526048284625159, "grad_norm": 1.6566533985853409, "learning_rate": 6.121862491656858e-06, "loss": 1.5241, "step": 1988 }, { "epoch": 2.527318932655654, "grad_norm": 1.7007255613732077, "learning_rate": 6.112547471336443e-06, "loss": 1.6529, "step": 1989 }, { "epoch": 2.52858958068615, "grad_norm": 1.6346339798837812, "learning_rate": 6.103236422957009e-06, "loss": 1.6411, "step": 1990 }, { "epoch": 2.5298602287166454, "grad_norm": 1.3974924068573895, "learning_rate": 6.09392935603197e-06, "loss": 1.4445, "step": 1991 }, { "epoch": 2.5311308767471408, "grad_norm": 1.675829876273148, "learning_rate": 6.084626280070663e-06, "loss": 1.4593, "step": 1992 }, { "epoch": 2.5324015247776366, "grad_norm": 1.5825081593692385, "learning_rate": 6.075327204578363e-06, "loss": 1.5878, "step": 1993 }, { "epoch": 2.5336721728081324, "grad_norm": 1.3884029886128855, "learning_rate": 6.066032139056244e-06, "loss": 1.426, "step": 1994 }, { "epoch": 2.5349428208386278, "grad_norm": 1.6846001017280459, "learning_rate": 6.056741093001387e-06, "loss": 1.5619, "step": 1995 }, { "epoch": 2.536213468869123, "grad_norm": 1.594688745927412, "learning_rate": 6.0474540759067645e-06, "loss": 1.5126, "step": 1996 }, { "epoch": 2.537484116899619, "grad_norm": 1.568543889486274, "learning_rate": 6.038171097261234e-06, "loss": 1.6029, "step": 1997 }, { "epoch": 2.5387547649301143, "grad_norm": 1.8935050808469194, "learning_rate": 6.02889216654953e-06, "loss": 1.5576, "step": 1998 }, { "epoch": 2.5400254129606097, "grad_norm": 1.674665638907306, "learning_rate": 6.019617293252249e-06, "loss": 1.4246, "step": 1999 }, { "epoch": 2.5412960609911055, "grad_norm": 1.3876686424816804, "learning_rate": 6.010346486845837e-06, "loss": 1.5987, "step": 2000 }, { "epoch": 2.542566709021601, "grad_norm": 1.612133543103817, "learning_rate": 6.001079756802592e-06, "loss": 1.4918, "step": 2001 }, { "epoch": 2.5438373570520967, "grad_norm": 1.6228281232000563, "learning_rate": 5.991817112590641e-06, "loss": 1.4632, "step": 2002 }, { "epoch": 2.545108005082592, "grad_norm": 1.4937812398891073, "learning_rate": 5.982558563673938e-06, "loss": 1.5252, "step": 2003 }, { "epoch": 2.546378653113088, "grad_norm": 1.534737369108716, "learning_rate": 5.973304119512258e-06, "loss": 1.3607, "step": 2004 }, { "epoch": 2.5476493011435832, "grad_norm": 1.7211354755831836, "learning_rate": 5.964053789561177e-06, "loss": 1.4835, "step": 2005 }, { "epoch": 2.5489199491740786, "grad_norm": 1.7018457215922282, "learning_rate": 5.9548075832720655e-06, "loss": 1.2578, "step": 2006 }, { "epoch": 2.5501905972045744, "grad_norm": 1.575855457005483, "learning_rate": 5.945565510092086e-06, "loss": 1.4262, "step": 2007 }, { "epoch": 2.55146124523507, "grad_norm": 1.5276479971645556, "learning_rate": 5.936327579464174e-06, "loss": 1.5924, "step": 2008 }, { "epoch": 2.5527318932655656, "grad_norm": 1.5564125880265227, "learning_rate": 5.927093800827032e-06, "loss": 1.6219, "step": 2009 }, { "epoch": 2.554002541296061, "grad_norm": 1.498257762944526, "learning_rate": 5.917864183615125e-06, "loss": 1.4304, "step": 2010 }, { "epoch": 2.555273189326557, "grad_norm": 1.51373375392949, "learning_rate": 5.908638737258666e-06, "loss": 1.5876, "step": 2011 }, { "epoch": 2.556543837357052, "grad_norm": 1.6551224771522475, "learning_rate": 5.8994174711836e-06, "loss": 1.307, "step": 2012 }, { "epoch": 2.5578144853875475, "grad_norm": 1.8144453783675563, "learning_rate": 5.890200394811605e-06, "loss": 1.5938, "step": 2013 }, { "epoch": 2.5590851334180433, "grad_norm": 1.6966638492890553, "learning_rate": 5.880987517560075e-06, "loss": 1.4143, "step": 2014 }, { "epoch": 2.5603557814485387, "grad_norm": 1.6047126629279407, "learning_rate": 5.87177884884212e-06, "loss": 1.5542, "step": 2015 }, { "epoch": 2.561626429479034, "grad_norm": 1.502741448048564, "learning_rate": 5.862574398066547e-06, "loss": 1.5601, "step": 2016 }, { "epoch": 2.56289707750953, "grad_norm": 1.6164079656020516, "learning_rate": 5.853374174637855e-06, "loss": 1.6705, "step": 2017 }, { "epoch": 2.5641677255400253, "grad_norm": 1.571527752327933, "learning_rate": 5.844178187956215e-06, "loss": 1.4099, "step": 2018 }, { "epoch": 2.565438373570521, "grad_norm": 1.826938166341695, "learning_rate": 5.834986447417481e-06, "loss": 1.6639, "step": 2019 }, { "epoch": 2.5667090216010164, "grad_norm": 1.5305172337767854, "learning_rate": 5.825798962413164e-06, "loss": 1.3573, "step": 2020 }, { "epoch": 2.5679796696315123, "grad_norm": 1.4847243067385838, "learning_rate": 5.81661574233042e-06, "loss": 1.6699, "step": 2021 }, { "epoch": 2.5692503176620076, "grad_norm": 1.5910837917800331, "learning_rate": 5.807436796552062e-06, "loss": 1.4542, "step": 2022 }, { "epoch": 2.570520965692503, "grad_norm": 1.5691937901257724, "learning_rate": 5.79826213445652e-06, "loss": 1.5276, "step": 2023 }, { "epoch": 2.571791613722999, "grad_norm": 1.7161084283422166, "learning_rate": 5.789091765417862e-06, "loss": 1.5906, "step": 2024 }, { "epoch": 2.573062261753494, "grad_norm": 1.403230062689204, "learning_rate": 5.77992569880576e-06, "loss": 1.4501, "step": 2025 }, { "epoch": 2.57433290978399, "grad_norm": 1.4964283684395638, "learning_rate": 5.7707639439854865e-06, "loss": 1.5517, "step": 2026 }, { "epoch": 2.5756035578144854, "grad_norm": 1.3622802648369978, "learning_rate": 5.761606510317921e-06, "loss": 1.307, "step": 2027 }, { "epoch": 2.576874205844981, "grad_norm": 1.326365025189336, "learning_rate": 5.752453407159521e-06, "loss": 1.7017, "step": 2028 }, { "epoch": 2.5781448538754765, "grad_norm": 1.5433055701471474, "learning_rate": 5.743304643862322e-06, "loss": 1.4591, "step": 2029 }, { "epoch": 2.579415501905972, "grad_norm": 2.016978937351612, "learning_rate": 5.7341602297739185e-06, "loss": 1.6965, "step": 2030 }, { "epoch": 2.5806861499364677, "grad_norm": 1.4870993206979286, "learning_rate": 5.725020174237463e-06, "loss": 1.4431, "step": 2031 }, { "epoch": 2.581956797966963, "grad_norm": 1.622926065692516, "learning_rate": 5.715884486591663e-06, "loss": 1.5469, "step": 2032 }, { "epoch": 2.5832274459974585, "grad_norm": 1.4456629337597853, "learning_rate": 5.706753176170761e-06, "loss": 1.3962, "step": 2033 }, { "epoch": 2.5844980940279543, "grad_norm": 1.6027971002444077, "learning_rate": 5.697626252304518e-06, "loss": 1.3743, "step": 2034 }, { "epoch": 2.5857687420584496, "grad_norm": 1.5331799529340098, "learning_rate": 5.688503724318217e-06, "loss": 1.3231, "step": 2035 }, { "epoch": 2.5870393900889455, "grad_norm": 1.490824381640179, "learning_rate": 5.67938560153266e-06, "loss": 1.5394, "step": 2036 }, { "epoch": 2.588310038119441, "grad_norm": 1.4492078377518673, "learning_rate": 5.670271893264135e-06, "loss": 1.3416, "step": 2037 }, { "epoch": 2.5895806861499366, "grad_norm": 1.579716437800065, "learning_rate": 5.66116260882442e-06, "loss": 1.6252, "step": 2038 }, { "epoch": 2.590851334180432, "grad_norm": 1.5764787878452378, "learning_rate": 5.652057757520782e-06, "loss": 1.5564, "step": 2039 }, { "epoch": 2.5921219822109274, "grad_norm": 1.6354725147911908, "learning_rate": 5.642957348655957e-06, "loss": 1.3986, "step": 2040 }, { "epoch": 2.593392630241423, "grad_norm": 1.4289914223659326, "learning_rate": 5.633861391528135e-06, "loss": 1.2618, "step": 2041 }, { "epoch": 2.5946632782719186, "grad_norm": 1.523265210937983, "learning_rate": 5.6247698954309616e-06, "loss": 1.5941, "step": 2042 }, { "epoch": 2.5959339263024144, "grad_norm": 1.4714236027113614, "learning_rate": 5.615682869653518e-06, "loss": 1.5671, "step": 2043 }, { "epoch": 2.5972045743329097, "grad_norm": 1.4876259379201178, "learning_rate": 5.606600323480332e-06, "loss": 1.4436, "step": 2044 }, { "epoch": 2.5984752223634056, "grad_norm": 1.4783074748512717, "learning_rate": 5.597522266191348e-06, "loss": 1.4667, "step": 2045 }, { "epoch": 2.599745870393901, "grad_norm": 1.90561408566863, "learning_rate": 5.5884487070619184e-06, "loss": 1.6617, "step": 2046 }, { "epoch": 2.6010165184243963, "grad_norm": 1.5778631845722175, "learning_rate": 5.579379655362801e-06, "loss": 1.5352, "step": 2047 }, { "epoch": 2.602287166454892, "grad_norm": 1.4646496408634837, "learning_rate": 5.570315120360157e-06, "loss": 1.6054, "step": 2048 }, { "epoch": 2.6035578144853875, "grad_norm": 1.9450738926737605, "learning_rate": 5.561255111315525e-06, "loss": 1.819, "step": 2049 }, { "epoch": 2.604828462515883, "grad_norm": 1.5400473317263956, "learning_rate": 5.5521996374858134e-06, "loss": 1.3851, "step": 2050 }, { "epoch": 2.6060991105463787, "grad_norm": 1.8228604096633896, "learning_rate": 5.5431487081233115e-06, "loss": 1.582, "step": 2051 }, { "epoch": 2.6073697585768745, "grad_norm": 1.551585038967548, "learning_rate": 5.534102332475661e-06, "loss": 1.2821, "step": 2052 }, { "epoch": 2.60864040660737, "grad_norm": 1.9066203734674334, "learning_rate": 5.525060519785845e-06, "loss": 1.4285, "step": 2053 }, { "epoch": 2.609911054637865, "grad_norm": 1.7219081400274896, "learning_rate": 5.5160232792921845e-06, "loss": 1.6345, "step": 2054 }, { "epoch": 2.611181702668361, "grad_norm": 1.7065215233402147, "learning_rate": 5.5069906202283315e-06, "loss": 1.6086, "step": 2055 }, { "epoch": 2.6124523506988564, "grad_norm": 1.7118658292689697, "learning_rate": 5.497962551823266e-06, "loss": 1.5247, "step": 2056 }, { "epoch": 2.6137229987293518, "grad_norm": 1.5000744419560181, "learning_rate": 5.488939083301264e-06, "loss": 1.4179, "step": 2057 }, { "epoch": 2.6149936467598476, "grad_norm": 1.5433374554439336, "learning_rate": 5.479920223881906e-06, "loss": 1.6244, "step": 2058 }, { "epoch": 2.616264294790343, "grad_norm": 1.6079090761483708, "learning_rate": 5.47090598278006e-06, "loss": 1.6962, "step": 2059 }, { "epoch": 2.6175349428208388, "grad_norm": 1.6881653881113183, "learning_rate": 5.461896369205888e-06, "loss": 1.2013, "step": 2060 }, { "epoch": 2.618805590851334, "grad_norm": 1.599027772485402, "learning_rate": 5.452891392364808e-06, "loss": 1.5009, "step": 2061 }, { "epoch": 2.62007623888183, "grad_norm": 1.8886355634647696, "learning_rate": 5.4438910614575115e-06, "loss": 1.5144, "step": 2062 }, { "epoch": 2.6213468869123253, "grad_norm": 1.5412645355637977, "learning_rate": 5.434895385679937e-06, "loss": 1.3677, "step": 2063 }, { "epoch": 2.6226175349428207, "grad_norm": 1.593887186726804, "learning_rate": 5.425904374223272e-06, "loss": 1.4759, "step": 2064 }, { "epoch": 2.6238881829733165, "grad_norm": 1.4494031826945593, "learning_rate": 5.416918036273935e-06, "loss": 1.3884, "step": 2065 }, { "epoch": 2.625158831003812, "grad_norm": 1.7941454557018781, "learning_rate": 5.407936381013564e-06, "loss": 1.6359, "step": 2066 }, { "epoch": 2.6264294790343072, "grad_norm": 1.6654720648068262, "learning_rate": 5.398959417619022e-06, "loss": 1.6703, "step": 2067 }, { "epoch": 2.627700127064803, "grad_norm": 1.5915824639539826, "learning_rate": 5.38998715526238e-06, "loss": 1.5062, "step": 2068 }, { "epoch": 2.628970775095299, "grad_norm": 1.4993321994594955, "learning_rate": 5.381019603110893e-06, "loss": 1.364, "step": 2069 }, { "epoch": 2.6302414231257942, "grad_norm": 1.5252693073346493, "learning_rate": 5.3720567703270135e-06, "loss": 1.5045, "step": 2070 }, { "epoch": 2.6315120711562896, "grad_norm": 1.5144681548120225, "learning_rate": 5.3630986660683644e-06, "loss": 1.5413, "step": 2071 }, { "epoch": 2.6327827191867854, "grad_norm": 1.475701365613179, "learning_rate": 5.35414529948775e-06, "loss": 1.1327, "step": 2072 }, { "epoch": 2.634053367217281, "grad_norm": 1.4656305801975826, "learning_rate": 5.345196679733118e-06, "loss": 1.5157, "step": 2073 }, { "epoch": 2.635324015247776, "grad_norm": 1.9554243405684435, "learning_rate": 5.336252815947581e-06, "loss": 1.6661, "step": 2074 }, { "epoch": 2.636594663278272, "grad_norm": 1.4324007946095954, "learning_rate": 5.32731371726938e-06, "loss": 1.3692, "step": 2075 }, { "epoch": 2.6378653113087673, "grad_norm": 1.395704553067545, "learning_rate": 5.3183793928318986e-06, "loss": 1.465, "step": 2076 }, { "epoch": 2.639135959339263, "grad_norm": 1.4527379036498123, "learning_rate": 5.3094498517636324e-06, "loss": 1.5268, "step": 2077 }, { "epoch": 2.6404066073697585, "grad_norm": 1.401333052050349, "learning_rate": 5.3005251031881925e-06, "loss": 1.5663, "step": 2078 }, { "epoch": 2.6416772554002543, "grad_norm": 1.6150571455447142, "learning_rate": 5.291605156224295e-06, "loss": 1.5084, "step": 2079 }, { "epoch": 2.6429479034307497, "grad_norm": 1.4759980850740504, "learning_rate": 5.282690019985756e-06, "loss": 1.3241, "step": 2080 }, { "epoch": 2.644218551461245, "grad_norm": 1.8072175837094544, "learning_rate": 5.273779703581468e-06, "loss": 1.4983, "step": 2081 }, { "epoch": 2.645489199491741, "grad_norm": 1.645313057282961, "learning_rate": 5.264874216115391e-06, "loss": 1.3519, "step": 2082 }, { "epoch": 2.6467598475222363, "grad_norm": 4.221302465317087, "learning_rate": 5.255973566686574e-06, "loss": 1.5022, "step": 2083 }, { "epoch": 2.6480304955527316, "grad_norm": 1.4510208188771505, "learning_rate": 5.247077764389099e-06, "loss": 1.4536, "step": 2084 }, { "epoch": 2.6493011435832274, "grad_norm": 1.5493648471181438, "learning_rate": 5.238186818312117e-06, "loss": 1.4785, "step": 2085 }, { "epoch": 2.6505717916137232, "grad_norm": 1.4302419242555682, "learning_rate": 5.229300737539801e-06, "loss": 1.5484, "step": 2086 }, { "epoch": 2.6518424396442186, "grad_norm": 1.849299532453937, "learning_rate": 5.220419531151355e-06, "loss": 1.6416, "step": 2087 }, { "epoch": 2.653113087674714, "grad_norm": 1.7434161293677466, "learning_rate": 5.211543208221013e-06, "loss": 1.6274, "step": 2088 }, { "epoch": 2.65438373570521, "grad_norm": 2.0005460426933634, "learning_rate": 5.20267177781801e-06, "loss": 1.6919, "step": 2089 }, { "epoch": 2.655654383735705, "grad_norm": 1.5018561095353853, "learning_rate": 5.193805249006581e-06, "loss": 1.5313, "step": 2090 }, { "epoch": 2.6569250317662005, "grad_norm": 1.491362650772649, "learning_rate": 5.18494363084596e-06, "loss": 1.7367, "step": 2091 }, { "epoch": 2.6581956797966964, "grad_norm": 1.5255178081426906, "learning_rate": 5.176086932390365e-06, "loss": 1.4245, "step": 2092 }, { "epoch": 2.6594663278271917, "grad_norm": 1.9086179984912208, "learning_rate": 5.167235162688977e-06, "loss": 1.3983, "step": 2093 }, { "epoch": 2.6607369758576875, "grad_norm": 1.4844669866832318, "learning_rate": 5.158388330785944e-06, "loss": 1.2229, "step": 2094 }, { "epoch": 2.662007623888183, "grad_norm": 1.8342374109409971, "learning_rate": 5.149546445720381e-06, "loss": 1.6674, "step": 2095 }, { "epoch": 2.6632782719186787, "grad_norm": 1.442105359076775, "learning_rate": 5.140709516526328e-06, "loss": 1.7274, "step": 2096 }, { "epoch": 2.664548919949174, "grad_norm": 1.3759796092109167, "learning_rate": 5.131877552232783e-06, "loss": 1.4564, "step": 2097 }, { "epoch": 2.6658195679796695, "grad_norm": 1.4110779592420157, "learning_rate": 5.1230505618636575e-06, "loss": 1.4833, "step": 2098 }, { "epoch": 2.6670902160101653, "grad_norm": 1.5201835870073264, "learning_rate": 5.114228554437779e-06, "loss": 1.3928, "step": 2099 }, { "epoch": 2.6683608640406606, "grad_norm": 1.5241126316189535, "learning_rate": 5.105411538968898e-06, "loss": 1.3865, "step": 2100 }, { "epoch": 2.6696315120711565, "grad_norm": 1.6070488288142075, "learning_rate": 5.0965995244656504e-06, "loss": 1.5122, "step": 2101 }, { "epoch": 2.670902160101652, "grad_norm": 1.6530350204573605, "learning_rate": 5.087792519931565e-06, "loss": 1.4907, "step": 2102 }, { "epoch": 2.6721728081321476, "grad_norm": 1.7113670629919329, "learning_rate": 5.078990534365058e-06, "loss": 1.5365, "step": 2103 }, { "epoch": 2.673443456162643, "grad_norm": 1.5090773096609014, "learning_rate": 5.070193576759419e-06, "loss": 1.4808, "step": 2104 }, { "epoch": 2.6747141041931384, "grad_norm": 1.5895263186914708, "learning_rate": 5.061401656102791e-06, "loss": 1.4958, "step": 2105 }, { "epoch": 2.675984752223634, "grad_norm": 1.3676785553851607, "learning_rate": 5.05261478137817e-06, "loss": 1.6263, "step": 2106 }, { "epoch": 2.6772554002541296, "grad_norm": 1.5053464148984217, "learning_rate": 5.043832961563411e-06, "loss": 1.4466, "step": 2107 }, { "epoch": 2.678526048284625, "grad_norm": 1.5858334316382405, "learning_rate": 5.035056205631183e-06, "loss": 1.3851, "step": 2108 }, { "epoch": 2.6797966963151207, "grad_norm": 1.6235052285091025, "learning_rate": 5.026284522549006e-06, "loss": 1.4365, "step": 2109 }, { "epoch": 2.681067344345616, "grad_norm": 1.5799018059764751, "learning_rate": 5.017517921279198e-06, "loss": 1.4248, "step": 2110 }, { "epoch": 2.682337992376112, "grad_norm": 1.4343381070499153, "learning_rate": 5.0087564107788835e-06, "loss": 1.2388, "step": 2111 }, { "epoch": 2.6836086404066073, "grad_norm": 1.718308551073008, "learning_rate": 5.000000000000003e-06, "loss": 1.4583, "step": 2112 }, { "epoch": 2.684879288437103, "grad_norm": 1.616801102187747, "learning_rate": 4.991248697889266e-06, "loss": 1.522, "step": 2113 }, { "epoch": 2.6861499364675985, "grad_norm": 1.674093541127283, "learning_rate": 4.982502513388182e-06, "loss": 1.4097, "step": 2114 }, { "epoch": 2.687420584498094, "grad_norm": 1.5210203143542569, "learning_rate": 4.973761455433014e-06, "loss": 1.4347, "step": 2115 }, { "epoch": 2.6886912325285897, "grad_norm": 1.390482685222237, "learning_rate": 4.9650255329548016e-06, "loss": 1.484, "step": 2116 }, { "epoch": 2.689961880559085, "grad_norm": 1.5907098780154691, "learning_rate": 4.9562947548793275e-06, "loss": 1.4206, "step": 2117 }, { "epoch": 2.691232528589581, "grad_norm": 1.4255923958634236, "learning_rate": 4.947569130127115e-06, "loss": 1.5406, "step": 2118 }, { "epoch": 2.692503176620076, "grad_norm": 1.5360499494919408, "learning_rate": 4.938848667613436e-06, "loss": 1.3503, "step": 2119 }, { "epoch": 2.693773824650572, "grad_norm": 1.415753496961855, "learning_rate": 4.930133376248282e-06, "loss": 1.4805, "step": 2120 }, { "epoch": 2.6950444726810674, "grad_norm": 1.7055062463214483, "learning_rate": 4.921423264936356e-06, "loss": 1.4416, "step": 2121 }, { "epoch": 2.6963151207115628, "grad_norm": 1.6167589168048309, "learning_rate": 4.912718342577068e-06, "loss": 1.7105, "step": 2122 }, { "epoch": 2.6975857687420586, "grad_norm": 1.668769249731688, "learning_rate": 4.904018618064536e-06, "loss": 1.4102, "step": 2123 }, { "epoch": 2.698856416772554, "grad_norm": 1.7919989171042408, "learning_rate": 4.8953241002875585e-06, "loss": 1.5343, "step": 2124 }, { "epoch": 2.7001270648030493, "grad_norm": 1.907222422302549, "learning_rate": 4.886634798129612e-06, "loss": 1.457, "step": 2125 }, { "epoch": 2.701397712833545, "grad_norm": 1.3685673054290053, "learning_rate": 4.8779507204688595e-06, "loss": 1.6121, "step": 2126 }, { "epoch": 2.7026683608640405, "grad_norm": 1.5385795370037307, "learning_rate": 4.869271876178103e-06, "loss": 1.5327, "step": 2127 }, { "epoch": 2.7039390088945363, "grad_norm": 1.6371369682481829, "learning_rate": 4.860598274124821e-06, "loss": 1.4613, "step": 2128 }, { "epoch": 2.7052096569250317, "grad_norm": 1.5363684219769342, "learning_rate": 4.851929923171118e-06, "loss": 1.6277, "step": 2129 }, { "epoch": 2.7064803049555275, "grad_norm": 1.5724072159200964, "learning_rate": 4.843266832173737e-06, "loss": 1.5082, "step": 2130 }, { "epoch": 2.707750952986023, "grad_norm": 1.5739904757793726, "learning_rate": 4.834609009984055e-06, "loss": 1.6271, "step": 2131 }, { "epoch": 2.7090216010165182, "grad_norm": 1.6771818386475754, "learning_rate": 4.825956465448061e-06, "loss": 1.5462, "step": 2132 }, { "epoch": 2.710292249047014, "grad_norm": 1.704072020358441, "learning_rate": 4.817309207406347e-06, "loss": 1.179, "step": 2133 }, { "epoch": 2.7115628970775094, "grad_norm": 1.513942054124072, "learning_rate": 4.808667244694105e-06, "loss": 1.7137, "step": 2134 }, { "epoch": 2.7128335451080052, "grad_norm": 1.558758018604813, "learning_rate": 4.800030586141125e-06, "loss": 1.3797, "step": 2135 }, { "epoch": 2.7141041931385006, "grad_norm": 1.9040432234103206, "learning_rate": 4.791399240571771e-06, "loss": 1.4426, "step": 2136 }, { "epoch": 2.7153748411689964, "grad_norm": 1.647871591808805, "learning_rate": 4.782773216804971e-06, "loss": 1.361, "step": 2137 }, { "epoch": 2.716645489199492, "grad_norm": 1.2647652100439177, "learning_rate": 4.774152523654235e-06, "loss": 1.5768, "step": 2138 }, { "epoch": 2.717916137229987, "grad_norm": 1.7706175430015751, "learning_rate": 4.765537169927604e-06, "loss": 1.4845, "step": 2139 }, { "epoch": 2.719186785260483, "grad_norm": 1.6110442013395836, "learning_rate": 4.756927164427685e-06, "loss": 1.7129, "step": 2140 }, { "epoch": 2.7204574332909783, "grad_norm": 1.5657124909629976, "learning_rate": 4.748322515951605e-06, "loss": 1.371, "step": 2141 }, { "epoch": 2.7217280813214737, "grad_norm": 1.6632605767728326, "learning_rate": 4.739723233291019e-06, "loss": 1.4385, "step": 2142 }, { "epoch": 2.7229987293519695, "grad_norm": 1.4014563049575284, "learning_rate": 4.731129325232106e-06, "loss": 1.4947, "step": 2143 }, { "epoch": 2.7242693773824653, "grad_norm": 1.3985696072178182, "learning_rate": 4.722540800555559e-06, "loss": 1.4192, "step": 2144 }, { "epoch": 2.7255400254129607, "grad_norm": 1.3735185623637343, "learning_rate": 4.713957668036553e-06, "loss": 1.4882, "step": 2145 }, { "epoch": 2.726810673443456, "grad_norm": 1.586671747488094, "learning_rate": 4.7053799364447625e-06, "loss": 1.3832, "step": 2146 }, { "epoch": 2.728081321473952, "grad_norm": 1.6349004373412965, "learning_rate": 4.696807614544352e-06, "loss": 1.8207, "step": 2147 }, { "epoch": 2.7293519695044473, "grad_norm": 2.29076717553526, "learning_rate": 4.688240711093942e-06, "loss": 1.5078, "step": 2148 }, { "epoch": 2.7306226175349426, "grad_norm": 1.8004150981237756, "learning_rate": 4.679679234846636e-06, "loss": 1.7272, "step": 2149 }, { "epoch": 2.7318932655654384, "grad_norm": 1.5136935349046672, "learning_rate": 4.671123194549971e-06, "loss": 1.6561, "step": 2150 }, { "epoch": 2.733163913595934, "grad_norm": 1.4483326917445587, "learning_rate": 4.662572598945951e-06, "loss": 1.8069, "step": 2151 }, { "epoch": 2.7344345616264296, "grad_norm": 1.6464150263965145, "learning_rate": 4.6540274567710044e-06, "loss": 1.4983, "step": 2152 }, { "epoch": 2.735705209656925, "grad_norm": 1.6737452035371205, "learning_rate": 4.645487776755988e-06, "loss": 1.4596, "step": 2153 }, { "epoch": 2.736975857687421, "grad_norm": 1.5188060122231457, "learning_rate": 4.636953567626176e-06, "loss": 1.4163, "step": 2154 }, { "epoch": 2.738246505717916, "grad_norm": 1.724069268162742, "learning_rate": 4.628424838101263e-06, "loss": 1.4403, "step": 2155 }, { "epoch": 2.7395171537484115, "grad_norm": 1.541246935958105, "learning_rate": 4.619901596895342e-06, "loss": 1.4836, "step": 2156 }, { "epoch": 2.7407878017789074, "grad_norm": 1.6058072617047359, "learning_rate": 4.61138385271689e-06, "loss": 1.5705, "step": 2157 }, { "epoch": 2.7420584498094027, "grad_norm": 1.5665710983434642, "learning_rate": 4.602871614268769e-06, "loss": 1.3489, "step": 2158 }, { "epoch": 2.743329097839898, "grad_norm": 1.4042165480312023, "learning_rate": 4.594364890248229e-06, "loss": 1.7615, "step": 2159 }, { "epoch": 2.744599745870394, "grad_norm": 1.7081967156520237, "learning_rate": 4.585863689346865e-06, "loss": 2.019, "step": 2160 }, { "epoch": 2.7458703939008897, "grad_norm": 1.470732998371279, "learning_rate": 4.57736802025065e-06, "loss": 1.5371, "step": 2161 }, { "epoch": 2.747141041931385, "grad_norm": 1.7047816374648823, "learning_rate": 4.568877891639887e-06, "loss": 1.4689, "step": 2162 }, { "epoch": 2.7484116899618805, "grad_norm": 3.6959727917658265, "learning_rate": 4.560393312189233e-06, "loss": 1.6444, "step": 2163 }, { "epoch": 2.7496823379923763, "grad_norm": 1.5603922210765961, "learning_rate": 4.551914290567665e-06, "loss": 1.4311, "step": 2164 }, { "epoch": 2.7509529860228716, "grad_norm": 1.6051704189943057, "learning_rate": 4.543440835438483e-06, "loss": 1.5238, "step": 2165 }, { "epoch": 2.752223634053367, "grad_norm": 1.4293857594876154, "learning_rate": 4.534972955459299e-06, "loss": 1.5342, "step": 2166 }, { "epoch": 2.753494282083863, "grad_norm": 1.4783762134150356, "learning_rate": 4.5265106592820344e-06, "loss": 1.4105, "step": 2167 }, { "epoch": 2.754764930114358, "grad_norm": 1.446137187771283, "learning_rate": 4.518053955552903e-06, "loss": 1.5654, "step": 2168 }, { "epoch": 2.756035578144854, "grad_norm": 1.6473357899274805, "learning_rate": 4.509602852912403e-06, "loss": 1.5307, "step": 2169 }, { "epoch": 2.7573062261753494, "grad_norm": 1.450012021625794, "learning_rate": 4.5011573599953054e-06, "loss": 1.5723, "step": 2170 }, { "epoch": 2.758576874205845, "grad_norm": 1.3467797974730438, "learning_rate": 4.492717485430657e-06, "loss": 1.5967, "step": 2171 }, { "epoch": 2.7598475222363406, "grad_norm": 1.5359734040408937, "learning_rate": 4.484283237841766e-06, "loss": 1.4576, "step": 2172 }, { "epoch": 2.761118170266836, "grad_norm": 1.338077406445025, "learning_rate": 4.475854625846183e-06, "loss": 1.6186, "step": 2173 }, { "epoch": 2.7623888182973317, "grad_norm": 1.7898288413802088, "learning_rate": 4.467431658055701e-06, "loss": 1.2723, "step": 2174 }, { "epoch": 2.763659466327827, "grad_norm": 1.7675244465580209, "learning_rate": 4.459014343076356e-06, "loss": 1.4509, "step": 2175 }, { "epoch": 2.7649301143583225, "grad_norm": 1.6014064081289283, "learning_rate": 4.450602689508399e-06, "loss": 1.6356, "step": 2176 }, { "epoch": 2.7662007623888183, "grad_norm": 1.4284741772133758, "learning_rate": 4.442196705946295e-06, "loss": 1.4993, "step": 2177 }, { "epoch": 2.767471410419314, "grad_norm": 1.458309275467031, "learning_rate": 4.433796400978722e-06, "loss": 1.6167, "step": 2178 }, { "epoch": 2.7687420584498095, "grad_norm": 1.8266623098630126, "learning_rate": 4.425401783188563e-06, "loss": 1.5206, "step": 2179 }, { "epoch": 2.770012706480305, "grad_norm": 1.4553210631084015, "learning_rate": 4.417012861152873e-06, "loss": 1.6961, "step": 2180 }, { "epoch": 2.7712833545108007, "grad_norm": 1.3513808318342604, "learning_rate": 4.408629643442896e-06, "loss": 1.6516, "step": 2181 }, { "epoch": 2.772554002541296, "grad_norm": 1.5580064345986402, "learning_rate": 4.400252138624047e-06, "loss": 1.2505, "step": 2182 }, { "epoch": 2.7738246505717914, "grad_norm": 1.7947293920430765, "learning_rate": 4.391880355255905e-06, "loss": 1.4121, "step": 2183 }, { "epoch": 2.775095298602287, "grad_norm": 1.6920046249691607, "learning_rate": 4.3835143018922075e-06, "loss": 1.8273, "step": 2184 }, { "epoch": 2.7763659466327826, "grad_norm": 1.5498008118708304, "learning_rate": 4.375153987080829e-06, "loss": 1.5396, "step": 2185 }, { "epoch": 2.7776365946632784, "grad_norm": 2.17633003006867, "learning_rate": 4.3667994193637794e-06, "loss": 1.6837, "step": 2186 }, { "epoch": 2.7789072426937738, "grad_norm": 1.4609801522718546, "learning_rate": 4.35845060727721e-06, "loss": 1.4504, "step": 2187 }, { "epoch": 2.7801778907242696, "grad_norm": 1.582605698584194, "learning_rate": 4.35010755935138e-06, "loss": 1.5495, "step": 2188 }, { "epoch": 2.781448538754765, "grad_norm": 1.5466728476915508, "learning_rate": 4.341770284110655e-06, "loss": 1.3189, "step": 2189 }, { "epoch": 2.7827191867852603, "grad_norm": 1.4794817703134617, "learning_rate": 4.333438790073518e-06, "loss": 1.4851, "step": 2190 }, { "epoch": 2.783989834815756, "grad_norm": 1.543011701944719, "learning_rate": 4.325113085752537e-06, "loss": 1.6088, "step": 2191 }, { "epoch": 2.7852604828462515, "grad_norm": 1.5245810456131066, "learning_rate": 4.316793179654362e-06, "loss": 1.1476, "step": 2192 }, { "epoch": 2.786531130876747, "grad_norm": 1.7529414814172941, "learning_rate": 4.308479080279718e-06, "loss": 1.8261, "step": 2193 }, { "epoch": 2.7878017789072427, "grad_norm": 1.4884060379109785, "learning_rate": 4.3001707961233994e-06, "loss": 1.5434, "step": 2194 }, { "epoch": 2.7890724269377385, "grad_norm": 1.5961743329595908, "learning_rate": 4.291868335674263e-06, "loss": 1.3902, "step": 2195 }, { "epoch": 2.790343074968234, "grad_norm": 1.4509705784168312, "learning_rate": 4.283571707415214e-06, "loss": 1.533, "step": 2196 }, { "epoch": 2.7916137229987292, "grad_norm": 1.6271203889491292, "learning_rate": 4.275280919823193e-06, "loss": 1.407, "step": 2197 }, { "epoch": 2.792884371029225, "grad_norm": 2.4592303081294, "learning_rate": 4.266995981369174e-06, "loss": 1.3711, "step": 2198 }, { "epoch": 2.7941550190597204, "grad_norm": 1.6611590728869305, "learning_rate": 4.258716900518164e-06, "loss": 1.5451, "step": 2199 }, { "epoch": 2.795425667090216, "grad_norm": 1.3924565008903842, "learning_rate": 4.25044368572917e-06, "loss": 1.6155, "step": 2200 }, { "epoch": 2.7966963151207116, "grad_norm": 1.7698735241912924, "learning_rate": 4.2421763454552225e-06, "loss": 1.5948, "step": 2201 }, { "epoch": 2.797966963151207, "grad_norm": 1.6408950608222295, "learning_rate": 4.233914888143333e-06, "loss": 1.5465, "step": 2202 }, { "epoch": 2.799237611181703, "grad_norm": 1.373912692213604, "learning_rate": 4.2256593222345185e-06, "loss": 1.6543, "step": 2203 }, { "epoch": 2.800508259212198, "grad_norm": 1.451843605699671, "learning_rate": 4.2174096561637644e-06, "loss": 1.488, "step": 2204 }, { "epoch": 2.801778907242694, "grad_norm": 1.6196668958041813, "learning_rate": 4.2091658983600305e-06, "loss": 1.3349, "step": 2205 }, { "epoch": 2.8030495552731893, "grad_norm": 1.407428500062136, "learning_rate": 4.20092805724624e-06, "loss": 1.5528, "step": 2206 }, { "epoch": 2.8043202033036847, "grad_norm": 1.6616455879553866, "learning_rate": 4.192696141239273e-06, "loss": 1.4103, "step": 2207 }, { "epoch": 2.8055908513341805, "grad_norm": 1.476847437744362, "learning_rate": 4.184470158749961e-06, "loss": 1.6905, "step": 2208 }, { "epoch": 2.806861499364676, "grad_norm": 1.72921436484817, "learning_rate": 4.176250118183063e-06, "loss": 1.4946, "step": 2209 }, { "epoch": 2.8081321473951717, "grad_norm": 1.571841716038649, "learning_rate": 4.168036027937267e-06, "loss": 1.4062, "step": 2210 }, { "epoch": 2.809402795425667, "grad_norm": 1.287244228350747, "learning_rate": 4.159827896405192e-06, "loss": 1.5741, "step": 2211 }, { "epoch": 2.810673443456163, "grad_norm": 1.6389873500505736, "learning_rate": 4.151625731973354e-06, "loss": 1.5757, "step": 2212 }, { "epoch": 2.8119440914866582, "grad_norm": 1.4866099503879866, "learning_rate": 4.143429543022191e-06, "loss": 1.5559, "step": 2213 }, { "epoch": 2.8132147395171536, "grad_norm": 1.704282712354898, "learning_rate": 4.1352393379260125e-06, "loss": 1.4492, "step": 2214 }, { "epoch": 2.8144853875476494, "grad_norm": 1.536418241417774, "learning_rate": 4.127055125053037e-06, "loss": 1.4443, "step": 2215 }, { "epoch": 2.815756035578145, "grad_norm": 1.5113596778312552, "learning_rate": 4.118876912765347e-06, "loss": 1.6588, "step": 2216 }, { "epoch": 2.81702668360864, "grad_norm": 1.6039842349514237, "learning_rate": 4.1107047094188946e-06, "loss": 1.3748, "step": 2217 }, { "epoch": 2.818297331639136, "grad_norm": 1.4682553751308622, "learning_rate": 4.10253852336349e-06, "loss": 1.4886, "step": 2218 }, { "epoch": 2.8195679796696314, "grad_norm": 1.5395939610698777, "learning_rate": 4.094378362942812e-06, "loss": 1.5196, "step": 2219 }, { "epoch": 2.820838627700127, "grad_norm": 1.9269379366055792, "learning_rate": 4.086224236494366e-06, "loss": 1.4785, "step": 2220 }, { "epoch": 2.8221092757306225, "grad_norm": 1.4274328734813035, "learning_rate": 4.078076152349496e-06, "loss": 1.4438, "step": 2221 }, { "epoch": 2.8233799237611183, "grad_norm": 1.595498803332665, "learning_rate": 4.06993411883337e-06, "loss": 1.4873, "step": 2222 }, { "epoch": 2.8246505717916137, "grad_norm": 1.3971580437463702, "learning_rate": 4.061798144264986e-06, "loss": 1.5435, "step": 2223 }, { "epoch": 2.825921219822109, "grad_norm": 1.4532248424265208, "learning_rate": 4.053668236957135e-06, "loss": 1.5382, "step": 2224 }, { "epoch": 2.827191867852605, "grad_norm": 1.519642472222294, "learning_rate": 4.045544405216422e-06, "loss": 1.53, "step": 2225 }, { "epoch": 2.8284625158831003, "grad_norm": 1.3947109208414181, "learning_rate": 4.037426657343233e-06, "loss": 1.2854, "step": 2226 }, { "epoch": 2.829733163913596, "grad_norm": 1.6532271964354837, "learning_rate": 4.029315001631753e-06, "loss": 1.6441, "step": 2227 }, { "epoch": 2.8310038119440915, "grad_norm": 1.656923087401254, "learning_rate": 4.021209446369927e-06, "loss": 1.5589, "step": 2228 }, { "epoch": 2.8322744599745873, "grad_norm": 1.3989622503720005, "learning_rate": 4.013109999839472e-06, "loss": 1.5257, "step": 2229 }, { "epoch": 2.8335451080050826, "grad_norm": 1.5786592082395432, "learning_rate": 4.005016670315867e-06, "loss": 1.6313, "step": 2230 }, { "epoch": 2.834815756035578, "grad_norm": 1.8123966305128363, "learning_rate": 3.996929466068344e-06, "loss": 1.2574, "step": 2231 }, { "epoch": 2.836086404066074, "grad_norm": 1.5203938631127307, "learning_rate": 3.988848395359866e-06, "loss": 1.6327, "step": 2232 }, { "epoch": 2.837357052096569, "grad_norm": 1.6702076939009192, "learning_rate": 3.980773466447138e-06, "loss": 1.7363, "step": 2233 }, { "epoch": 2.8386277001270646, "grad_norm": 1.4094045845321321, "learning_rate": 3.97270468758058e-06, "loss": 1.5077, "step": 2234 }, { "epoch": 2.8398983481575604, "grad_norm": 1.3924696277906912, "learning_rate": 3.964642067004338e-06, "loss": 1.6309, "step": 2235 }, { "epoch": 2.841168996188056, "grad_norm": 1.6232709109768664, "learning_rate": 3.956585612956268e-06, "loss": 1.3125, "step": 2236 }, { "epoch": 2.8424396442185516, "grad_norm": 1.7944603999037159, "learning_rate": 3.948535333667916e-06, "loss": 1.6526, "step": 2237 }, { "epoch": 2.843710292249047, "grad_norm": 1.7124321584478142, "learning_rate": 3.940491237364519e-06, "loss": 1.7881, "step": 2238 }, { "epoch": 2.8449809402795427, "grad_norm": 1.6072517443508763, "learning_rate": 3.9324533322650075e-06, "loss": 1.5408, "step": 2239 }, { "epoch": 2.846251588310038, "grad_norm": 2.32896329624759, "learning_rate": 3.9244216265819755e-06, "loss": 1.5828, "step": 2240 }, { "epoch": 2.8475222363405335, "grad_norm": 1.4163103231202079, "learning_rate": 3.916396128521686e-06, "loss": 1.4773, "step": 2241 }, { "epoch": 2.8487928843710293, "grad_norm": 1.7274268711443028, "learning_rate": 3.908376846284061e-06, "loss": 1.5606, "step": 2242 }, { "epoch": 2.8500635324015247, "grad_norm": 1.7404585688099852, "learning_rate": 3.9003637880626765e-06, "loss": 1.691, "step": 2243 }, { "epoch": 2.8513341804320205, "grad_norm": 1.709195129169358, "learning_rate": 3.8923569620447375e-06, "loss": 1.5409, "step": 2244 }, { "epoch": 2.852604828462516, "grad_norm": 1.6705919031761796, "learning_rate": 3.884356376411089e-06, "loss": 1.4196, "step": 2245 }, { "epoch": 2.8538754764930117, "grad_norm": 1.5192501448692692, "learning_rate": 3.876362039336196e-06, "loss": 1.4834, "step": 2246 }, { "epoch": 2.855146124523507, "grad_norm": 1.490906994462505, "learning_rate": 3.868373958988142e-06, "loss": 1.1566, "step": 2247 }, { "epoch": 2.8564167725540024, "grad_norm": 1.4843564294014155, "learning_rate": 3.860392143528624e-06, "loss": 1.4525, "step": 2248 }, { "epoch": 2.857687420584498, "grad_norm": 1.5348131899543818, "learning_rate": 3.852416601112925e-06, "loss": 1.5744, "step": 2249 }, { "epoch": 2.8589580686149936, "grad_norm": 1.43630845243318, "learning_rate": 3.844447339889924e-06, "loss": 1.5896, "step": 2250 }, { "epoch": 2.860228716645489, "grad_norm": 1.5318404589559234, "learning_rate": 3.836484368002088e-06, "loss": 1.4683, "step": 2251 }, { "epoch": 2.8614993646759848, "grad_norm": 1.4485601453260954, "learning_rate": 3.828527693585451e-06, "loss": 1.2319, "step": 2252 }, { "epoch": 2.8627700127064806, "grad_norm": 1.3452807831122484, "learning_rate": 3.8205773247696105e-06, "loss": 1.469, "step": 2253 }, { "epoch": 2.864040660736976, "grad_norm": 1.7164858031983954, "learning_rate": 3.81263326967773e-06, "loss": 1.8427, "step": 2254 }, { "epoch": 2.8653113087674713, "grad_norm": 1.306710039223864, "learning_rate": 3.8046955364265214e-06, "loss": 1.5889, "step": 2255 }, { "epoch": 2.866581956797967, "grad_norm": 1.707073776699419, "learning_rate": 3.7967641331262295e-06, "loss": 1.4975, "step": 2256 }, { "epoch": 2.8678526048284625, "grad_norm": 1.4091419057103962, "learning_rate": 3.788839067880635e-06, "loss": 1.5389, "step": 2257 }, { "epoch": 2.869123252858958, "grad_norm": 1.9789535173445154, "learning_rate": 3.7809203487870395e-06, "loss": 1.7107, "step": 2258 }, { "epoch": 2.8703939008894537, "grad_norm": 1.6677355476402764, "learning_rate": 3.7730079839362755e-06, "loss": 1.5299, "step": 2259 }, { "epoch": 2.871664548919949, "grad_norm": 1.5611925343525848, "learning_rate": 3.7651019814126656e-06, "loss": 1.3673, "step": 2260 }, { "epoch": 2.872935196950445, "grad_norm": 1.694686562421888, "learning_rate": 3.75720234929404e-06, "loss": 1.3858, "step": 2261 }, { "epoch": 2.8742058449809402, "grad_norm": 1.439181040260762, "learning_rate": 3.7493090956517142e-06, "loss": 1.4208, "step": 2262 }, { "epoch": 2.875476493011436, "grad_norm": 1.3895366097637492, "learning_rate": 3.7414222285504986e-06, "loss": 1.5845, "step": 2263 }, { "epoch": 2.8767471410419314, "grad_norm": 1.5428002728859518, "learning_rate": 3.733541756048662e-06, "loss": 1.3557, "step": 2264 }, { "epoch": 2.878017789072427, "grad_norm": 1.650158881869085, "learning_rate": 3.725667686197956e-06, "loss": 1.5372, "step": 2265 }, { "epoch": 2.8792884371029226, "grad_norm": 1.5303760428628421, "learning_rate": 3.7178000270435765e-06, "loss": 1.6347, "step": 2266 }, { "epoch": 2.880559085133418, "grad_norm": 1.4439928286384305, "learning_rate": 3.709938786624181e-06, "loss": 1.4305, "step": 2267 }, { "epoch": 2.8818297331639133, "grad_norm": 1.334703851173668, "learning_rate": 3.7020839729718606e-06, "loss": 1.6817, "step": 2268 }, { "epoch": 2.883100381194409, "grad_norm": 1.5994007060692943, "learning_rate": 3.6942355941121424e-06, "loss": 1.4062, "step": 2269 }, { "epoch": 2.884371029224905, "grad_norm": 1.3134585277590238, "learning_rate": 3.6863936580639714e-06, "loss": 1.4929, "step": 2270 }, { "epoch": 2.8856416772554003, "grad_norm": 1.5620862407921547, "learning_rate": 3.6785581728397312e-06, "loss": 1.5845, "step": 2271 }, { "epoch": 2.8869123252858957, "grad_norm": 1.8218826820601304, "learning_rate": 3.6707291464451953e-06, "loss": 1.6253, "step": 2272 }, { "epoch": 2.8881829733163915, "grad_norm": 1.4859377686939863, "learning_rate": 3.662906586879542e-06, "loss": 1.417, "step": 2273 }, { "epoch": 2.889453621346887, "grad_norm": 1.6305922022358126, "learning_rate": 3.65509050213534e-06, "loss": 1.276, "step": 2274 }, { "epoch": 2.8907242693773822, "grad_norm": 1.4732452648881487, "learning_rate": 3.6472809001985534e-06, "loss": 1.4943, "step": 2275 }, { "epoch": 2.891994917407878, "grad_norm": 1.5109274893828362, "learning_rate": 3.6394777890485077e-06, "loss": 1.5711, "step": 2276 }, { "epoch": 2.8932655654383734, "grad_norm": 1.6857878009893548, "learning_rate": 3.6316811766579106e-06, "loss": 1.5752, "step": 2277 }, { "epoch": 2.8945362134688692, "grad_norm": 1.6377320186466984, "learning_rate": 3.6238910709928176e-06, "loss": 1.5244, "step": 2278 }, { "epoch": 2.8958068614993646, "grad_norm": 1.5226900332449655, "learning_rate": 3.616107480012647e-06, "loss": 1.4875, "step": 2279 }, { "epoch": 2.8970775095298604, "grad_norm": 1.6081651224187385, "learning_rate": 3.6083304116701535e-06, "loss": 1.4027, "step": 2280 }, { "epoch": 2.898348157560356, "grad_norm": 1.706130168269608, "learning_rate": 3.6005598739114243e-06, "loss": 1.5992, "step": 2281 }, { "epoch": 2.899618805590851, "grad_norm": 1.5643040540066309, "learning_rate": 3.592795874675884e-06, "loss": 1.5868, "step": 2282 }, { "epoch": 2.900889453621347, "grad_norm": 1.6816599483507202, "learning_rate": 3.5850384218962743e-06, "loss": 1.298, "step": 2283 }, { "epoch": 2.9021601016518423, "grad_norm": 1.8470396107981721, "learning_rate": 3.5772875234986413e-06, "loss": 1.5678, "step": 2284 }, { "epoch": 2.9034307496823377, "grad_norm": 1.6046584287550623, "learning_rate": 3.569543187402341e-06, "loss": 1.6325, "step": 2285 }, { "epoch": 2.9047013977128335, "grad_norm": 1.523995355325757, "learning_rate": 3.5618054215200173e-06, "loss": 1.5939, "step": 2286 }, { "epoch": 2.9059720457433293, "grad_norm": 1.5270449092754086, "learning_rate": 3.5540742337576083e-06, "loss": 1.4063, "step": 2287 }, { "epoch": 2.9072426937738247, "grad_norm": 2.10017297192366, "learning_rate": 3.546349632014334e-06, "loss": 1.5012, "step": 2288 }, { "epoch": 2.90851334180432, "grad_norm": 1.6155665835478203, "learning_rate": 3.5386316241826748e-06, "loss": 1.5012, "step": 2289 }, { "epoch": 2.909783989834816, "grad_norm": 1.7787223778412131, "learning_rate": 3.530920218148376e-06, "loss": 1.3199, "step": 2290 }, { "epoch": 2.9110546378653113, "grad_norm": 1.560549594236504, "learning_rate": 3.523215421790447e-06, "loss": 1.2565, "step": 2291 }, { "epoch": 2.9123252858958066, "grad_norm": 1.394812239009204, "learning_rate": 3.5155172429811336e-06, "loss": 1.4569, "step": 2292 }, { "epoch": 2.9135959339263025, "grad_norm": 1.6038138492524496, "learning_rate": 3.5078256895859207e-06, "loss": 1.5995, "step": 2293 }, { "epoch": 2.914866581956798, "grad_norm": 1.6728720063218776, "learning_rate": 3.5001407694635326e-06, "loss": 1.5695, "step": 2294 }, { "epoch": 2.9161372299872936, "grad_norm": 2.7465688150554537, "learning_rate": 3.492462490465911e-06, "loss": 1.5363, "step": 2295 }, { "epoch": 2.917407878017789, "grad_norm": 1.4065911779100326, "learning_rate": 3.4847908604382095e-06, "loss": 1.4767, "step": 2296 }, { "epoch": 2.918678526048285, "grad_norm": 1.5795356577772306, "learning_rate": 3.4771258872187917e-06, "loss": 1.5937, "step": 2297 }, { "epoch": 2.91994917407878, "grad_norm": 1.5741018037150643, "learning_rate": 3.469467578639214e-06, "loss": 1.3136, "step": 2298 }, { "epoch": 2.9212198221092756, "grad_norm": 1.4731933338365752, "learning_rate": 3.4618159425242304e-06, "loss": 1.5331, "step": 2299 }, { "epoch": 2.9224904701397714, "grad_norm": 1.703185023187937, "learning_rate": 3.4541709866917793e-06, "loss": 1.5962, "step": 2300 }, { "epoch": 2.9237611181702667, "grad_norm": 1.2521835252137483, "learning_rate": 3.446532718952966e-06, "loss": 1.4288, "step": 2301 }, { "epoch": 2.9250317662007626, "grad_norm": 1.6440713932270161, "learning_rate": 3.4389011471120614e-06, "loss": 1.3655, "step": 2302 }, { "epoch": 2.926302414231258, "grad_norm": 1.4889131386748893, "learning_rate": 3.4312762789665067e-06, "loss": 1.6922, "step": 2303 }, { "epoch": 2.9275730622617537, "grad_norm": 1.6650208766938137, "learning_rate": 3.423658122306882e-06, "loss": 1.5555, "step": 2304 }, { "epoch": 2.928843710292249, "grad_norm": 1.2522929715604783, "learning_rate": 3.4160466849169106e-06, "loss": 1.507, "step": 2305 }, { "epoch": 2.9301143583227445, "grad_norm": 1.7803511980376172, "learning_rate": 3.4084419745734577e-06, "loss": 1.3705, "step": 2306 }, { "epoch": 2.9313850063532403, "grad_norm": 1.7597512395948038, "learning_rate": 3.400843999046516e-06, "loss": 1.4553, "step": 2307 }, { "epoch": 2.9326556543837357, "grad_norm": 1.5418716965422024, "learning_rate": 3.3932527660991877e-06, "loss": 1.4955, "step": 2308 }, { "epoch": 2.933926302414231, "grad_norm": 1.8698683202288493, "learning_rate": 3.3856682834876884e-06, "loss": 1.4275, "step": 2309 }, { "epoch": 2.935196950444727, "grad_norm": 1.4531375655005387, "learning_rate": 3.3780905589613457e-06, "loss": 1.4324, "step": 2310 }, { "epoch": 2.936467598475222, "grad_norm": 1.8266056817430822, "learning_rate": 3.370519600262567e-06, "loss": 1.7186, "step": 2311 }, { "epoch": 2.937738246505718, "grad_norm": 1.3272915282363376, "learning_rate": 3.362955415126865e-06, "loss": 1.3731, "step": 2312 }, { "epoch": 2.9390088945362134, "grad_norm": 1.601240214729954, "learning_rate": 3.3553980112828177e-06, "loss": 1.4686, "step": 2313 }, { "epoch": 2.940279542566709, "grad_norm": 1.798449523100172, "learning_rate": 3.3478473964520754e-06, "loss": 1.6216, "step": 2314 }, { "epoch": 2.9415501905972046, "grad_norm": 1.3685831897094658, "learning_rate": 3.340303578349361e-06, "loss": 1.3927, "step": 2315 }, { "epoch": 2.9428208386277, "grad_norm": 1.5147383181377905, "learning_rate": 3.3327665646824404e-06, "loss": 1.5493, "step": 2316 }, { "epoch": 2.9440914866581958, "grad_norm": 1.548599616748368, "learning_rate": 3.325236363152142e-06, "loss": 1.5739, "step": 2317 }, { "epoch": 2.945362134688691, "grad_norm": 1.3556053708238454, "learning_rate": 3.317712981452319e-06, "loss": 1.4329, "step": 2318 }, { "epoch": 2.946632782719187, "grad_norm": 1.4473213850301048, "learning_rate": 3.3101964272698693e-06, "loss": 1.479, "step": 2319 }, { "epoch": 2.9479034307496823, "grad_norm": 1.4337258339715722, "learning_rate": 3.3026867082847058e-06, "loss": 1.4843, "step": 2320 }, { "epoch": 2.949174078780178, "grad_norm": 1.6875137284988417, "learning_rate": 3.295183832169758e-06, "loss": 1.5555, "step": 2321 }, { "epoch": 2.9504447268106735, "grad_norm": 1.469035977971227, "learning_rate": 3.2876878065909714e-06, "loss": 1.5137, "step": 2322 }, { "epoch": 2.951715374841169, "grad_norm": 1.4600563300664042, "learning_rate": 3.2801986392072882e-06, "loss": 1.3999, "step": 2323 }, { "epoch": 2.9529860228716647, "grad_norm": 1.2802500447086944, "learning_rate": 3.2727163376706408e-06, "loss": 1.6125, "step": 2324 }, { "epoch": 2.95425667090216, "grad_norm": 1.5732563628912453, "learning_rate": 3.2652409096259473e-06, "loss": 1.3998, "step": 2325 }, { "epoch": 2.9555273189326554, "grad_norm": 2.0170281525799414, "learning_rate": 3.2577723627111022e-06, "loss": 1.6881, "step": 2326 }, { "epoch": 2.9567979669631512, "grad_norm": 1.4425748850274933, "learning_rate": 3.250310704556976e-06, "loss": 1.4648, "step": 2327 }, { "epoch": 2.9580686149936466, "grad_norm": 1.4309238275691571, "learning_rate": 3.2428559427873908e-06, "loss": 1.4489, "step": 2328 }, { "epoch": 2.9593392630241424, "grad_norm": 1.5982818277408608, "learning_rate": 3.2354080850191328e-06, "loss": 1.5934, "step": 2329 }, { "epoch": 2.9606099110546378, "grad_norm": 1.621257458394931, "learning_rate": 3.227967138861923e-06, "loss": 1.4109, "step": 2330 }, { "epoch": 2.9618805590851336, "grad_norm": 1.571220115256424, "learning_rate": 3.2205331119184313e-06, "loss": 1.3235, "step": 2331 }, { "epoch": 2.963151207115629, "grad_norm": 1.5107281267332602, "learning_rate": 3.2131060117842505e-06, "loss": 1.8047, "step": 2332 }, { "epoch": 2.9644218551461243, "grad_norm": 1.6060147078324802, "learning_rate": 3.205685846047897e-06, "loss": 1.1387, "step": 2333 }, { "epoch": 2.96569250317662, "grad_norm": 1.662098799358042, "learning_rate": 3.1982726222908046e-06, "loss": 1.5366, "step": 2334 }, { "epoch": 2.9669631512071155, "grad_norm": 1.5249414874021523, "learning_rate": 3.1908663480873182e-06, "loss": 1.4588, "step": 2335 }, { "epoch": 2.9682337992376113, "grad_norm": 1.5489987249881296, "learning_rate": 3.1834670310046735e-06, "loss": 1.558, "step": 2336 }, { "epoch": 2.9695044472681067, "grad_norm": 1.440444947964428, "learning_rate": 3.1760746786030004e-06, "loss": 1.4947, "step": 2337 }, { "epoch": 2.9707750952986025, "grad_norm": 1.4026839845489567, "learning_rate": 3.1686892984353124e-06, "loss": 1.3553, "step": 2338 }, { "epoch": 2.972045743329098, "grad_norm": 1.4476234611526049, "learning_rate": 3.161310898047507e-06, "loss": 1.4604, "step": 2339 }, { "epoch": 2.9733163913595932, "grad_norm": 1.5098981969866436, "learning_rate": 3.1539394849783367e-06, "loss": 1.4829, "step": 2340 }, { "epoch": 2.974587039390089, "grad_norm": 1.759099977767992, "learning_rate": 3.1465750667594286e-06, "loss": 1.3885, "step": 2341 }, { "epoch": 2.9758576874205844, "grad_norm": 1.3084296325082836, "learning_rate": 3.1392176509152507e-06, "loss": 1.5668, "step": 2342 }, { "epoch": 2.97712833545108, "grad_norm": 1.5872405076149434, "learning_rate": 3.1318672449631283e-06, "loss": 1.6528, "step": 2343 }, { "epoch": 2.9783989834815756, "grad_norm": 1.783486852855159, "learning_rate": 3.124523856413216e-06, "loss": 1.563, "step": 2344 }, { "epoch": 2.9796696315120714, "grad_norm": 1.6482688878777818, "learning_rate": 3.1171874927684964e-06, "loss": 1.6659, "step": 2345 }, { "epoch": 2.980940279542567, "grad_norm": 1.555191564256173, "learning_rate": 3.1098581615247825e-06, "loss": 1.4207, "step": 2346 }, { "epoch": 2.982210927573062, "grad_norm": 1.6353378112947163, "learning_rate": 3.102535870170702e-06, "loss": 1.5507, "step": 2347 }, { "epoch": 2.983481575603558, "grad_norm": 1.578849275240601, "learning_rate": 3.0952206261876827e-06, "loss": 1.4754, "step": 2348 }, { "epoch": 2.9847522236340533, "grad_norm": 1.546255150623972, "learning_rate": 3.0879124370499515e-06, "loss": 1.5183, "step": 2349 }, { "epoch": 2.9860228716645487, "grad_norm": 1.5875157884351954, "learning_rate": 3.0806113102245395e-06, "loss": 1.6667, "step": 2350 }, { "epoch": 2.9872935196950445, "grad_norm": 1.398826145213781, "learning_rate": 3.073317253171245e-06, "loss": 1.4393, "step": 2351 }, { "epoch": 2.98856416772554, "grad_norm": 2.692603145819837, "learning_rate": 3.0660302733426595e-06, "loss": 1.4171, "step": 2352 }, { "epoch": 2.9898348157560357, "grad_norm": 1.5460129343554816, "learning_rate": 3.0587503781841298e-06, "loss": 1.5645, "step": 2353 }, { "epoch": 2.991105463786531, "grad_norm": 4.535547058502799, "learning_rate": 3.05147757513377e-06, "loss": 1.6246, "step": 2354 }, { "epoch": 2.992376111817027, "grad_norm": 1.6514865378676962, "learning_rate": 3.04421187162245e-06, "loss": 1.5007, "step": 2355 }, { "epoch": 2.9936467598475223, "grad_norm": 1.6672665946304461, "learning_rate": 3.036953275073783e-06, "loss": 1.688, "step": 2356 }, { "epoch": 2.9949174078780176, "grad_norm": 1.5455780021092613, "learning_rate": 3.029701792904117e-06, "loss": 1.5103, "step": 2357 }, { "epoch": 2.9961880559085134, "grad_norm": 1.5389372324362445, "learning_rate": 3.0224574325225385e-06, "loss": 1.6554, "step": 2358 }, { "epoch": 2.997458703939009, "grad_norm": 1.8161574897561656, "learning_rate": 3.0152202013308573e-06, "loss": 1.5419, "step": 2359 }, { "epoch": 2.998729351969504, "grad_norm": 1.5867170148575802, "learning_rate": 3.007990106723593e-06, "loss": 1.4492, "step": 2360 }, { "epoch": 3.0, "grad_norm": 1.4436629920446682, "learning_rate": 3.0007671560879724e-06, "loss": 1.5805, "step": 2361 }, { "epoch": 3.0012706480304954, "grad_norm": 2.09892976706466, "learning_rate": 2.993551356803933e-06, "loss": 1.3329, "step": 2362 }, { "epoch": 3.002541296060991, "grad_norm": 2.573425871573313, "learning_rate": 2.9863427162440963e-06, "loss": 1.3915, "step": 2363 }, { "epoch": 3.0038119440914866, "grad_norm": 1.9744349108834152, "learning_rate": 2.979141241773775e-06, "loss": 1.1761, "step": 2364 }, { "epoch": 3.0050825921219824, "grad_norm": 2.8085612456415947, "learning_rate": 2.971946940750958e-06, "loss": 1.4812, "step": 2365 }, { "epoch": 3.0063532401524777, "grad_norm": 1.8526064065366492, "learning_rate": 2.9647598205262996e-06, "loss": 1.3335, "step": 2366 }, { "epoch": 3.007623888182973, "grad_norm": 1.7332062309204277, "learning_rate": 2.9575798884431297e-06, "loss": 1.4045, "step": 2367 }, { "epoch": 3.008894536213469, "grad_norm": 1.918921092336155, "learning_rate": 2.950407151837421e-06, "loss": 1.0056, "step": 2368 }, { "epoch": 3.0101651842439643, "grad_norm": 2.2556239875071147, "learning_rate": 2.9432416180377997e-06, "loss": 1.2765, "step": 2369 }, { "epoch": 3.01143583227446, "grad_norm": 2.7301888570855835, "learning_rate": 2.9360832943655327e-06, "loss": 1.2056, "step": 2370 }, { "epoch": 3.0127064803049555, "grad_norm": 1.9099132389678797, "learning_rate": 2.9289321881345257e-06, "loss": 1.2137, "step": 2371 }, { "epoch": 3.0139771283354513, "grad_norm": 2.0717116847331103, "learning_rate": 2.9217883066512996e-06, "loss": 0.9831, "step": 2372 }, { "epoch": 3.0152477763659467, "grad_norm": 2.076544317014107, "learning_rate": 2.914651657214996e-06, "loss": 1.2405, "step": 2373 }, { "epoch": 3.016518424396442, "grad_norm": 1.8240041683884223, "learning_rate": 2.9075222471173725e-06, "loss": 1.3826, "step": 2374 }, { "epoch": 3.017789072426938, "grad_norm": 2.133027343181042, "learning_rate": 2.9004000836427915e-06, "loss": 1.1397, "step": 2375 }, { "epoch": 3.019059720457433, "grad_norm": 1.7455345298398421, "learning_rate": 2.893285174068201e-06, "loss": 1.2348, "step": 2376 }, { "epoch": 3.020330368487929, "grad_norm": 1.9656471710779257, "learning_rate": 2.886177525663143e-06, "loss": 1.1276, "step": 2377 }, { "epoch": 3.0216010165184244, "grad_norm": 2.1534610852280327, "learning_rate": 2.879077145689746e-06, "loss": 1.3181, "step": 2378 }, { "epoch": 3.0228716645489198, "grad_norm": 1.6287196021115788, "learning_rate": 2.8719840414027047e-06, "loss": 1.1469, "step": 2379 }, { "epoch": 3.0241423125794156, "grad_norm": 1.6221238932125104, "learning_rate": 2.864898220049277e-06, "loss": 1.3919, "step": 2380 }, { "epoch": 3.025412960609911, "grad_norm": 1.7903952814121018, "learning_rate": 2.8578196888692932e-06, "loss": 1.2985, "step": 2381 }, { "epoch": 3.0266836086404068, "grad_norm": 1.9414238550600529, "learning_rate": 2.8507484550951203e-06, "loss": 1.3897, "step": 2382 }, { "epoch": 3.027954256670902, "grad_norm": 1.6706825834310917, "learning_rate": 2.843684525951681e-06, "loss": 1.2016, "step": 2383 }, { "epoch": 3.0292249047013975, "grad_norm": 1.7792669153317013, "learning_rate": 2.8366279086564265e-06, "loss": 1.3073, "step": 2384 }, { "epoch": 3.0304955527318933, "grad_norm": 2.0092840867613146, "learning_rate": 2.829578610419337e-06, "loss": 1.1512, "step": 2385 }, { "epoch": 3.0317662007623887, "grad_norm": 2.0069330063950392, "learning_rate": 2.8225366384429197e-06, "loss": 1.3232, "step": 2386 }, { "epoch": 3.0330368487928845, "grad_norm": 1.7756567922278317, "learning_rate": 2.8155019999221988e-06, "loss": 1.1533, "step": 2387 }, { "epoch": 3.03430749682338, "grad_norm": 1.90732193983837, "learning_rate": 2.8084747020446977e-06, "loss": 1.4712, "step": 2388 }, { "epoch": 3.0355781448538757, "grad_norm": 1.7243416047414686, "learning_rate": 2.80145475199044e-06, "loss": 1.0818, "step": 2389 }, { "epoch": 3.036848792884371, "grad_norm": 1.6847557181638975, "learning_rate": 2.794442156931949e-06, "loss": 1.2525, "step": 2390 }, { "epoch": 3.0381194409148664, "grad_norm": 1.6368875045530906, "learning_rate": 2.787436924034228e-06, "loss": 1.2932, "step": 2391 }, { "epoch": 3.0393900889453622, "grad_norm": 1.5948357268555189, "learning_rate": 2.780439060454756e-06, "loss": 1.3573, "step": 2392 }, { "epoch": 3.0406607369758576, "grad_norm": 1.8124373478736528, "learning_rate": 2.77344857334349e-06, "loss": 1.4497, "step": 2393 }, { "epoch": 3.0419313850063534, "grad_norm": 1.6889098700427108, "learning_rate": 2.7664654698428407e-06, "loss": 1.4171, "step": 2394 }, { "epoch": 3.0432020330368488, "grad_norm": 1.8567968600834586, "learning_rate": 2.7594897570876866e-06, "loss": 1.411, "step": 2395 }, { "epoch": 3.044472681067344, "grad_norm": 1.6434300105223425, "learning_rate": 2.7525214422053424e-06, "loss": 1.1523, "step": 2396 }, { "epoch": 3.04574332909784, "grad_norm": 1.7170595548391778, "learning_rate": 2.7455605323155697e-06, "loss": 1.2543, "step": 2397 }, { "epoch": 3.0470139771283353, "grad_norm": 1.7839808070985894, "learning_rate": 2.738607034530566e-06, "loss": 1.306, "step": 2398 }, { "epoch": 3.048284625158831, "grad_norm": 1.9548446588583341, "learning_rate": 2.7316609559549568e-06, "loss": 1.4932, "step": 2399 }, { "epoch": 3.0495552731893265, "grad_norm": 1.7197357275194696, "learning_rate": 2.724722303685781e-06, "loss": 1.2957, "step": 2400 }, { "epoch": 3.0508259212198223, "grad_norm": 1.8084821385370988, "learning_rate": 2.71779108481249e-06, "loss": 1.1916, "step": 2401 }, { "epoch": 3.0520965692503177, "grad_norm": 1.8019568439707627, "learning_rate": 2.71086730641695e-06, "loss": 1.2896, "step": 2402 }, { "epoch": 3.053367217280813, "grad_norm": 1.6232294705360168, "learning_rate": 2.7039509755734117e-06, "loss": 1.3035, "step": 2403 }, { "epoch": 3.054637865311309, "grad_norm": 1.732272049812764, "learning_rate": 2.697042099348528e-06, "loss": 1.3815, "step": 2404 }, { "epoch": 3.0559085133418042, "grad_norm": 1.522102653553474, "learning_rate": 2.6901406848013254e-06, "loss": 1.2552, "step": 2405 }, { "epoch": 3.0571791613723, "grad_norm": 1.9470133300553394, "learning_rate": 2.683246738983217e-06, "loss": 1.2267, "step": 2406 }, { "epoch": 3.0584498094027954, "grad_norm": 1.4692551227434485, "learning_rate": 2.6763602689379753e-06, "loss": 1.139, "step": 2407 }, { "epoch": 3.059720457433291, "grad_norm": 1.766405834849575, "learning_rate": 2.669481281701739e-06, "loss": 1.2124, "step": 2408 }, { "epoch": 3.0609911054637866, "grad_norm": 1.6423190989642726, "learning_rate": 2.6626097843029986e-06, "loss": 1.3618, "step": 2409 }, { "epoch": 3.062261753494282, "grad_norm": 1.7563432167722974, "learning_rate": 2.6557457837625956e-06, "loss": 1.1546, "step": 2410 }, { "epoch": 3.063532401524778, "grad_norm": 1.7587679513921985, "learning_rate": 2.648889287093713e-06, "loss": 1.3976, "step": 2411 }, { "epoch": 3.064803049555273, "grad_norm": 1.663781245316886, "learning_rate": 2.642040301301861e-06, "loss": 1.2392, "step": 2412 }, { "epoch": 3.0660736975857685, "grad_norm": 1.5989960700089227, "learning_rate": 2.6351988333848787e-06, "loss": 1.186, "step": 2413 }, { "epoch": 3.0673443456162643, "grad_norm": 1.7586610135204503, "learning_rate": 2.6283648903329263e-06, "loss": 1.1437, "step": 2414 }, { "epoch": 3.0686149936467597, "grad_norm": 1.9317190502603545, "learning_rate": 2.621538479128468e-06, "loss": 1.2426, "step": 2415 }, { "epoch": 3.0698856416772555, "grad_norm": 1.9112048957538366, "learning_rate": 2.6147196067462855e-06, "loss": 1.2112, "step": 2416 }, { "epoch": 3.071156289707751, "grad_norm": 1.9348624538847758, "learning_rate": 2.6079082801534417e-06, "loss": 1.0744, "step": 2417 }, { "epoch": 3.0724269377382467, "grad_norm": 1.83420768568301, "learning_rate": 2.6011045063093064e-06, "loss": 1.3859, "step": 2418 }, { "epoch": 3.073697585768742, "grad_norm": 1.5765598823395752, "learning_rate": 2.5943082921655194e-06, "loss": 1.4534, "step": 2419 }, { "epoch": 3.0749682337992374, "grad_norm": 1.6995207801918275, "learning_rate": 2.587519644666001e-06, "loss": 1.2786, "step": 2420 }, { "epoch": 3.0762388818297333, "grad_norm": 1.7860290662716032, "learning_rate": 2.580738570746939e-06, "loss": 1.2452, "step": 2421 }, { "epoch": 3.0775095298602286, "grad_norm": 1.5989460694010966, "learning_rate": 2.5739650773367876e-06, "loss": 1.4941, "step": 2422 }, { "epoch": 3.0787801778907244, "grad_norm": 1.7672357089554391, "learning_rate": 2.567199171356255e-06, "loss": 1.298, "step": 2423 }, { "epoch": 3.08005082592122, "grad_norm": 1.6720009691768105, "learning_rate": 2.5604408597182917e-06, "loss": 1.1471, "step": 2424 }, { "epoch": 3.081321473951715, "grad_norm": 1.5102495836791583, "learning_rate": 2.5536901493280897e-06, "loss": 1.3379, "step": 2425 }, { "epoch": 3.082592121982211, "grad_norm": 1.8372717401271044, "learning_rate": 2.5469470470830827e-06, "loss": 1.2962, "step": 2426 }, { "epoch": 3.0838627700127064, "grad_norm": 1.880212000099131, "learning_rate": 2.5402115598729182e-06, "loss": 1.2711, "step": 2427 }, { "epoch": 3.085133418043202, "grad_norm": 1.581484833594291, "learning_rate": 2.533483694579477e-06, "loss": 1.2823, "step": 2428 }, { "epoch": 3.0864040660736975, "grad_norm": 1.6436382107472818, "learning_rate": 2.5267634580768398e-06, "loss": 1.4279, "step": 2429 }, { "epoch": 3.0876747141041934, "grad_norm": 1.7784636953279571, "learning_rate": 2.520050857231302e-06, "loss": 1.2795, "step": 2430 }, { "epoch": 3.0889453621346887, "grad_norm": 1.6686373614869814, "learning_rate": 2.5133458989013536e-06, "loss": 1.1758, "step": 2431 }, { "epoch": 3.090216010165184, "grad_norm": 2.0466360249592146, "learning_rate": 2.5066485899376704e-06, "loss": 1.1162, "step": 2432 }, { "epoch": 3.09148665819568, "grad_norm": 1.8507598532475589, "learning_rate": 2.4999589371831258e-06, "loss": 1.2016, "step": 2433 }, { "epoch": 3.0927573062261753, "grad_norm": 1.5824017681378673, "learning_rate": 2.493276947472756e-06, "loss": 1.2221, "step": 2434 }, { "epoch": 3.094027954256671, "grad_norm": 1.633379375594195, "learning_rate": 2.4866026276337818e-06, "loss": 1.2296, "step": 2435 }, { "epoch": 3.0952986022871665, "grad_norm": 1.754909121517295, "learning_rate": 2.4799359844855763e-06, "loss": 1.1888, "step": 2436 }, { "epoch": 3.096569250317662, "grad_norm": 1.5519156929438713, "learning_rate": 2.47327702483967e-06, "loss": 1.2978, "step": 2437 }, { "epoch": 3.0978398983481577, "grad_norm": 1.7766717505881073, "learning_rate": 2.4666257554997496e-06, "loss": 1.407, "step": 2438 }, { "epoch": 3.099110546378653, "grad_norm": 1.7497637609072278, "learning_rate": 2.459982183261642e-06, "loss": 1.2113, "step": 2439 }, { "epoch": 3.100381194409149, "grad_norm": 1.9264978326369875, "learning_rate": 2.4533463149133073e-06, "loss": 1.3552, "step": 2440 }, { "epoch": 3.101651842439644, "grad_norm": 1.7947498856520572, "learning_rate": 2.446718157234832e-06, "loss": 1.1678, "step": 2441 }, { "epoch": 3.1029224904701396, "grad_norm": 2.4641410743895142, "learning_rate": 2.440097716998433e-06, "loss": 1.3406, "step": 2442 }, { "epoch": 3.1041931385006354, "grad_norm": 1.8883725023205793, "learning_rate": 2.4334850009684342e-06, "loss": 1.173, "step": 2443 }, { "epoch": 3.1054637865311308, "grad_norm": 1.6354338430570137, "learning_rate": 2.4268800159012664e-06, "loss": 1.1719, "step": 2444 }, { "epoch": 3.1067344345616266, "grad_norm": 1.6103664625611638, "learning_rate": 2.420282768545469e-06, "loss": 1.2, "step": 2445 }, { "epoch": 3.108005082592122, "grad_norm": 1.4172615712487338, "learning_rate": 2.4136932656416735e-06, "loss": 1.3682, "step": 2446 }, { "epoch": 3.1092757306226178, "grad_norm": 1.6721589174587272, "learning_rate": 2.407111513922594e-06, "loss": 1.16, "step": 2447 }, { "epoch": 3.110546378653113, "grad_norm": 1.6061198847753624, "learning_rate": 2.4005375201130275e-06, "loss": 1.2102, "step": 2448 }, { "epoch": 3.1118170266836085, "grad_norm": 2.0932204461880897, "learning_rate": 2.393971290929842e-06, "loss": 1.4676, "step": 2449 }, { "epoch": 3.1130876747141043, "grad_norm": 1.8592189977974782, "learning_rate": 2.3874128330819768e-06, "loss": 1.277, "step": 2450 }, { "epoch": 3.1143583227445997, "grad_norm": 1.851433962265915, "learning_rate": 2.3808621532704324e-06, "loss": 1.34, "step": 2451 }, { "epoch": 3.1156289707750955, "grad_norm": 1.7810350194668882, "learning_rate": 2.3743192581882557e-06, "loss": 1.3526, "step": 2452 }, { "epoch": 3.116899618805591, "grad_norm": 1.6744866612908063, "learning_rate": 2.3677841545205394e-06, "loss": 1.1099, "step": 2453 }, { "epoch": 3.1181702668360862, "grad_norm": 1.7583645293576202, "learning_rate": 2.3612568489444255e-06, "loss": 1.0625, "step": 2454 }, { "epoch": 3.119440914866582, "grad_norm": 1.756564024894285, "learning_rate": 2.354737348129077e-06, "loss": 1.2936, "step": 2455 }, { "epoch": 3.1207115628970774, "grad_norm": 1.586413743190357, "learning_rate": 2.3482256587356857e-06, "loss": 1.2569, "step": 2456 }, { "epoch": 3.121982210927573, "grad_norm": 1.7556994065125986, "learning_rate": 2.341721787417466e-06, "loss": 1.35, "step": 2457 }, { "epoch": 3.1232528589580686, "grad_norm": 2.0852506827055515, "learning_rate": 2.3352257408196444e-06, "loss": 1.1543, "step": 2458 }, { "epoch": 3.124523506988564, "grad_norm": 1.7712498030499715, "learning_rate": 2.3287375255794488e-06, "loss": 1.2117, "step": 2459 }, { "epoch": 3.1257941550190598, "grad_norm": 1.6886622101501163, "learning_rate": 2.322257148326105e-06, "loss": 1.3069, "step": 2460 }, { "epoch": 3.127064803049555, "grad_norm": 1.9420365086561293, "learning_rate": 2.3157846156808304e-06, "loss": 1.3194, "step": 2461 }, { "epoch": 3.128335451080051, "grad_norm": 1.5276811681070064, "learning_rate": 2.3093199342568316e-06, "loss": 1.2025, "step": 2462 }, { "epoch": 3.1296060991105463, "grad_norm": 1.8115947546522564, "learning_rate": 2.3028631106592947e-06, "loss": 1.3846, "step": 2463 }, { "epoch": 3.130876747141042, "grad_norm": 1.6659681066420193, "learning_rate": 2.296414151485371e-06, "loss": 1.2816, "step": 2464 }, { "epoch": 3.1321473951715375, "grad_norm": 1.8727411129268166, "learning_rate": 2.2899730633241747e-06, "loss": 1.3635, "step": 2465 }, { "epoch": 3.133418043202033, "grad_norm": 1.846374256482884, "learning_rate": 2.2835398527567888e-06, "loss": 1.3548, "step": 2466 }, { "epoch": 3.1346886912325287, "grad_norm": 1.6356275346464342, "learning_rate": 2.2771145263562355e-06, "loss": 1.452, "step": 2467 }, { "epoch": 3.135959339263024, "grad_norm": 1.9577792022247684, "learning_rate": 2.2706970906874913e-06, "loss": 1.5602, "step": 2468 }, { "epoch": 3.13722998729352, "grad_norm": 1.8068650757372393, "learning_rate": 2.2642875523074613e-06, "loss": 1.1777, "step": 2469 }, { "epoch": 3.1385006353240152, "grad_norm": 1.7538676709057037, "learning_rate": 2.2578859177649924e-06, "loss": 1.4586, "step": 2470 }, { "epoch": 3.1397712833545106, "grad_norm": 1.7879657250480516, "learning_rate": 2.251492193600846e-06, "loss": 1.3761, "step": 2471 }, { "epoch": 3.1410419313850064, "grad_norm": 2.3249828103843457, "learning_rate": 2.245106386347706e-06, "loss": 1.5108, "step": 2472 }, { "epoch": 3.142312579415502, "grad_norm": 1.7462573196518558, "learning_rate": 2.238728502530161e-06, "loss": 1.2632, "step": 2473 }, { "epoch": 3.1435832274459976, "grad_norm": 1.736296512648159, "learning_rate": 2.2323585486647193e-06, "loss": 1.2525, "step": 2474 }, { "epoch": 3.144853875476493, "grad_norm": 1.9470504131851696, "learning_rate": 2.225996531259772e-06, "loss": 1.3773, "step": 2475 }, { "epoch": 3.1461245235069883, "grad_norm": 1.9076770489781227, "learning_rate": 2.2196424568156073e-06, "loss": 1.0692, "step": 2476 }, { "epoch": 3.147395171537484, "grad_norm": 1.7248979518869032, "learning_rate": 2.2132963318243917e-06, "loss": 1.486, "step": 2477 }, { "epoch": 3.1486658195679795, "grad_norm": 2.035045665415418, "learning_rate": 2.206958162770183e-06, "loss": 1.3372, "step": 2478 }, { "epoch": 3.1499364675984753, "grad_norm": 1.571699409302641, "learning_rate": 2.2006279561288934e-06, "loss": 1.2629, "step": 2479 }, { "epoch": 3.1512071156289707, "grad_norm": 1.5587477711625815, "learning_rate": 2.1943057183683146e-06, "loss": 1.1787, "step": 2480 }, { "epoch": 3.1524777636594665, "grad_norm": 1.8196764892645745, "learning_rate": 2.1879914559480853e-06, "loss": 1.2803, "step": 2481 }, { "epoch": 3.153748411689962, "grad_norm": 1.7798333128896995, "learning_rate": 2.1816851753197023e-06, "loss": 1.1567, "step": 2482 }, { "epoch": 3.1550190597204573, "grad_norm": 1.6955424352749122, "learning_rate": 2.1753868829265046e-06, "loss": 1.2298, "step": 2483 }, { "epoch": 3.156289707750953, "grad_norm": 1.7276703176456651, "learning_rate": 2.169096585203668e-06, "loss": 1.1972, "step": 2484 }, { "epoch": 3.1575603557814484, "grad_norm": 1.7683359583678933, "learning_rate": 2.1628142885781966e-06, "loss": 1.2756, "step": 2485 }, { "epoch": 3.1588310038119443, "grad_norm": 1.775103180200822, "learning_rate": 2.156539999468934e-06, "loss": 1.1889, "step": 2486 }, { "epoch": 3.1601016518424396, "grad_norm": 1.4900632070960849, "learning_rate": 2.1502737242865266e-06, "loss": 1.3028, "step": 2487 }, { "epoch": 3.161372299872935, "grad_norm": 1.7758388346832947, "learning_rate": 2.14401546943344e-06, "loss": 1.2355, "step": 2488 }, { "epoch": 3.162642947903431, "grad_norm": 1.576094652086718, "learning_rate": 2.1377652413039405e-06, "loss": 1.2363, "step": 2489 }, { "epoch": 3.163913595933926, "grad_norm": 1.7874303237007443, "learning_rate": 2.1315230462840985e-06, "loss": 1.3443, "step": 2490 }, { "epoch": 3.165184243964422, "grad_norm": 1.6787912401680196, "learning_rate": 2.125288890751779e-06, "loss": 1.2906, "step": 2491 }, { "epoch": 3.1664548919949174, "grad_norm": 2.0180482623233202, "learning_rate": 2.1190627810766228e-06, "loss": 1.4137, "step": 2492 }, { "epoch": 3.1677255400254127, "grad_norm": 1.564533811757316, "learning_rate": 2.1128447236200544e-06, "loss": 1.1543, "step": 2493 }, { "epoch": 3.1689961880559085, "grad_norm": 1.7774083993726737, "learning_rate": 2.106634724735278e-06, "loss": 1.259, "step": 2494 }, { "epoch": 3.170266836086404, "grad_norm": 1.5448424912442245, "learning_rate": 2.100432790767254e-06, "loss": 1.2078, "step": 2495 }, { "epoch": 3.1715374841168997, "grad_norm": 2.142201872129878, "learning_rate": 2.0942389280527066e-06, "loss": 1.2173, "step": 2496 }, { "epoch": 3.172808132147395, "grad_norm": 1.6587617050189192, "learning_rate": 2.0880531429201146e-06, "loss": 1.3604, "step": 2497 }, { "epoch": 3.174078780177891, "grad_norm": 1.7834074653396286, "learning_rate": 2.081875441689706e-06, "loss": 1.1448, "step": 2498 }, { "epoch": 3.1753494282083863, "grad_norm": 1.7572575087822744, "learning_rate": 2.0757058306734433e-06, "loss": 1.2094, "step": 2499 }, { "epoch": 3.1766200762388817, "grad_norm": 2.083569077535876, "learning_rate": 2.069544316175025e-06, "loss": 1.3869, "step": 2500 }, { "epoch": 3.1778907242693775, "grad_norm": 1.823152571267711, "learning_rate": 2.0633909044898748e-06, "loss": 1.2244, "step": 2501 }, { "epoch": 3.179161372299873, "grad_norm": 1.773039069704479, "learning_rate": 2.0572456019051446e-06, "loss": 1.3954, "step": 2502 }, { "epoch": 3.1804320203303686, "grad_norm": 1.8971742409013088, "learning_rate": 2.0511084146996975e-06, "loss": 1.3243, "step": 2503 }, { "epoch": 3.181702668360864, "grad_norm": 1.7944894466951278, "learning_rate": 2.0449793491441026e-06, "loss": 1.4261, "step": 2504 }, { "epoch": 3.1829733163913594, "grad_norm": 1.5140189328195208, "learning_rate": 2.038858411500629e-06, "loss": 1.1631, "step": 2505 }, { "epoch": 3.184243964421855, "grad_norm": 1.7061152885765603, "learning_rate": 2.03274560802325e-06, "loss": 1.3066, "step": 2506 }, { "epoch": 3.1855146124523506, "grad_norm": 1.4811927119334365, "learning_rate": 2.026640944957621e-06, "loss": 1.292, "step": 2507 }, { "epoch": 3.1867852604828464, "grad_norm": 1.5809354619829985, "learning_rate": 2.020544428541077e-06, "loss": 1.3237, "step": 2508 }, { "epoch": 3.1880559085133418, "grad_norm": 1.5956049131349987, "learning_rate": 2.014456065002637e-06, "loss": 1.1337, "step": 2509 }, { "epoch": 3.189326556543837, "grad_norm": 1.5734953655845838, "learning_rate": 2.00837586056299e-06, "loss": 1.2489, "step": 2510 }, { "epoch": 3.190597204574333, "grad_norm": 1.5257536631287314, "learning_rate": 2.0023038214344827e-06, "loss": 1.3132, "step": 2511 }, { "epoch": 3.1918678526048283, "grad_norm": 1.875502687536241, "learning_rate": 1.996239953821121e-06, "loss": 1.444, "step": 2512 }, { "epoch": 3.193138500635324, "grad_norm": 1.8223412917715889, "learning_rate": 1.990184263918561e-06, "loss": 1.4234, "step": 2513 }, { "epoch": 3.1944091486658195, "grad_norm": 1.9417963652654793, "learning_rate": 1.9841367579141057e-06, "loss": 1.4581, "step": 2514 }, { "epoch": 3.1956797966963153, "grad_norm": 1.6085196469552832, "learning_rate": 1.9780974419866995e-06, "loss": 1.1893, "step": 2515 }, { "epoch": 3.1969504447268107, "grad_norm": 1.6969997688746226, "learning_rate": 1.9720663223069115e-06, "loss": 1.2482, "step": 2516 }, { "epoch": 3.198221092757306, "grad_norm": 1.749402884036117, "learning_rate": 1.966043405036936e-06, "loss": 1.1891, "step": 2517 }, { "epoch": 3.199491740787802, "grad_norm": 1.8730967483194572, "learning_rate": 1.960028696330596e-06, "loss": 1.2247, "step": 2518 }, { "epoch": 3.200762388818297, "grad_norm": 1.7762581172294063, "learning_rate": 1.9540222023333165e-06, "loss": 1.3077, "step": 2519 }, { "epoch": 3.202033036848793, "grad_norm": 1.4899743911392564, "learning_rate": 1.94802392918214e-06, "loss": 1.2546, "step": 2520 }, { "epoch": 3.2033036848792884, "grad_norm": 11.200908479200699, "learning_rate": 1.9420338830056984e-06, "loss": 1.3995, "step": 2521 }, { "epoch": 3.204574332909784, "grad_norm": 1.834755307368342, "learning_rate": 1.936052069924228e-06, "loss": 1.1952, "step": 2522 }, { "epoch": 3.2058449809402796, "grad_norm": 1.9324793062038865, "learning_rate": 1.9300784960495454e-06, "loss": 1.4198, "step": 2523 }, { "epoch": 3.207115628970775, "grad_norm": 1.8000970358216641, "learning_rate": 1.924113167485054e-06, "loss": 1.2533, "step": 2524 }, { "epoch": 3.2083862770012708, "grad_norm": 1.5880216587333218, "learning_rate": 1.9181560903257234e-06, "loss": 1.3579, "step": 2525 }, { "epoch": 3.209656925031766, "grad_norm": 1.8480812181506967, "learning_rate": 1.9122072706581107e-06, "loss": 1.3448, "step": 2526 }, { "epoch": 3.210927573062262, "grad_norm": 2.011537400480146, "learning_rate": 1.9062667145603208e-06, "loss": 1.4513, "step": 2527 }, { "epoch": 3.2121982210927573, "grad_norm": 1.9724271618153089, "learning_rate": 1.9003344281020185e-06, "loss": 1.3168, "step": 2528 }, { "epoch": 3.2134688691232527, "grad_norm": 1.7502666918999221, "learning_rate": 1.8944104173444178e-06, "loss": 1.3477, "step": 2529 }, { "epoch": 3.2147395171537485, "grad_norm": 1.7319409313243204, "learning_rate": 1.8884946883402845e-06, "loss": 1.3608, "step": 2530 }, { "epoch": 3.216010165184244, "grad_norm": 1.783038046631286, "learning_rate": 1.8825872471339146e-06, "loss": 1.2214, "step": 2531 }, { "epoch": 3.2172808132147397, "grad_norm": 1.720134512976163, "learning_rate": 1.8766880997611424e-06, "loss": 1.3674, "step": 2532 }, { "epoch": 3.218551461245235, "grad_norm": 1.7045142043861203, "learning_rate": 1.8707972522493211e-06, "loss": 1.0517, "step": 2533 }, { "epoch": 3.2198221092757304, "grad_norm": 1.6352671308189537, "learning_rate": 1.8649147106173326e-06, "loss": 1.2439, "step": 2534 }, { "epoch": 3.2210927573062262, "grad_norm": 1.4847088232236991, "learning_rate": 1.8590404808755646e-06, "loss": 1.2547, "step": 2535 }, { "epoch": 3.2223634053367216, "grad_norm": 3.125888668868235, "learning_rate": 1.853174569025914e-06, "loss": 1.2783, "step": 2536 }, { "epoch": 3.2236340533672174, "grad_norm": 1.988625654965949, "learning_rate": 1.847316981061782e-06, "loss": 1.2393, "step": 2537 }, { "epoch": 3.224904701397713, "grad_norm": 11.26007842938525, "learning_rate": 1.8414677229680645e-06, "loss": 1.4299, "step": 2538 }, { "epoch": 3.2261753494282086, "grad_norm": 1.8211500733017565, "learning_rate": 1.8356268007211442e-06, "loss": 1.1572, "step": 2539 }, { "epoch": 3.227445997458704, "grad_norm": 1.528557119278922, "learning_rate": 1.8297942202888874e-06, "loss": 1.3529, "step": 2540 }, { "epoch": 3.2287166454891993, "grad_norm": 1.6232166805721628, "learning_rate": 1.823969987630635e-06, "loss": 1.3527, "step": 2541 }, { "epoch": 3.229987293519695, "grad_norm": 1.5340998572237294, "learning_rate": 1.8181541086972066e-06, "loss": 1.0331, "step": 2542 }, { "epoch": 3.2312579415501905, "grad_norm": 1.6425772528850264, "learning_rate": 1.8123465894308756e-06, "loss": 1.3634, "step": 2543 }, { "epoch": 3.2325285895806863, "grad_norm": 1.7235750348622498, "learning_rate": 1.8065474357653855e-06, "loss": 1.1661, "step": 2544 }, { "epoch": 3.2337992376111817, "grad_norm": 1.7389286929216596, "learning_rate": 1.8007566536259224e-06, "loss": 1.3072, "step": 2545 }, { "epoch": 3.235069885641677, "grad_norm": 1.7967188959884117, "learning_rate": 1.7949742489291256e-06, "loss": 1.4596, "step": 2546 }, { "epoch": 3.236340533672173, "grad_norm": 1.6685604006798536, "learning_rate": 1.7892002275830723e-06, "loss": 1.4423, "step": 2547 }, { "epoch": 3.2376111817026683, "grad_norm": 1.9010633740483633, "learning_rate": 1.7834345954872711e-06, "loss": 1.2548, "step": 2548 }, { "epoch": 3.238881829733164, "grad_norm": 1.8558798945459534, "learning_rate": 1.7776773585326645e-06, "loss": 1.3086, "step": 2549 }, { "epoch": 3.2401524777636594, "grad_norm": 1.5729842199934867, "learning_rate": 1.7719285226016181e-06, "loss": 1.3923, "step": 2550 }, { "epoch": 3.241423125794155, "grad_norm": 1.7812302004801492, "learning_rate": 1.7661880935679077e-06, "loss": 1.1471, "step": 2551 }, { "epoch": 3.2426937738246506, "grad_norm": 1.7290420005529408, "learning_rate": 1.7604560772967228e-06, "loss": 1.3251, "step": 2552 }, { "epoch": 3.243964421855146, "grad_norm": 1.9556366082379057, "learning_rate": 1.7547324796446553e-06, "loss": 1.1404, "step": 2553 }, { "epoch": 3.245235069885642, "grad_norm": 1.6160249986866686, "learning_rate": 1.7490173064596994e-06, "loss": 1.4717, "step": 2554 }, { "epoch": 3.246505717916137, "grad_norm": 1.7583921649936753, "learning_rate": 1.743310563581242e-06, "loss": 1.3199, "step": 2555 }, { "epoch": 3.247776365946633, "grad_norm": 2.032519224190622, "learning_rate": 1.7376122568400533e-06, "loss": 1.2158, "step": 2556 }, { "epoch": 3.2490470139771284, "grad_norm": 1.9452217164271726, "learning_rate": 1.7319223920582795e-06, "loss": 1.3222, "step": 2557 }, { "epoch": 3.2503176620076237, "grad_norm": 1.670607279727031, "learning_rate": 1.7262409750494546e-06, "loss": 1.2129, "step": 2558 }, { "epoch": 3.2515883100381195, "grad_norm": 1.8375210610863488, "learning_rate": 1.7205680116184698e-06, "loss": 1.0889, "step": 2559 }, { "epoch": 3.252858958068615, "grad_norm": 1.8173242314446705, "learning_rate": 1.7149035075615795e-06, "loss": 1.1841, "step": 2560 }, { "epoch": 3.2541296060991107, "grad_norm": 1.5470934273402792, "learning_rate": 1.7092474686664018e-06, "loss": 1.38, "step": 2561 }, { "epoch": 3.255400254129606, "grad_norm": 1.7698700793992441, "learning_rate": 1.703599900711903e-06, "loss": 1.0056, "step": 2562 }, { "epoch": 3.2566709021601015, "grad_norm": 1.9637213866070415, "learning_rate": 1.697960809468392e-06, "loss": 1.5702, "step": 2563 }, { "epoch": 3.2579415501905973, "grad_norm": 1.568118703304819, "learning_rate": 1.6923302006975174e-06, "loss": 1.1889, "step": 2564 }, { "epoch": 3.2592121982210926, "grad_norm": 2.1019949451684052, "learning_rate": 1.6867080801522584e-06, "loss": 1.317, "step": 2565 }, { "epoch": 3.2604828462515885, "grad_norm": 1.7030736642023787, "learning_rate": 1.681094453576928e-06, "loss": 1.3978, "step": 2566 }, { "epoch": 3.261753494282084, "grad_norm": 1.6902151932446823, "learning_rate": 1.6754893267071593e-06, "loss": 1.0607, "step": 2567 }, { "epoch": 3.263024142312579, "grad_norm": 1.564217748349614, "learning_rate": 1.6698927052698965e-06, "loss": 1.2916, "step": 2568 }, { "epoch": 3.264294790343075, "grad_norm": 1.7903482631113141, "learning_rate": 1.6643045949833936e-06, "loss": 1.2744, "step": 2569 }, { "epoch": 3.2655654383735704, "grad_norm": 1.5940467435876144, "learning_rate": 1.6587250015572164e-06, "loss": 1.0312, "step": 2570 }, { "epoch": 3.266836086404066, "grad_norm": 1.7225652116973902, "learning_rate": 1.6531539306922195e-06, "loss": 1.2726, "step": 2571 }, { "epoch": 3.2681067344345616, "grad_norm": 1.6006370824120357, "learning_rate": 1.6475913880805516e-06, "loss": 1.1295, "step": 2572 }, { "epoch": 3.2693773824650574, "grad_norm": 1.5880416946242164, "learning_rate": 1.642037379405651e-06, "loss": 1.4118, "step": 2573 }, { "epoch": 3.2706480304955527, "grad_norm": 1.740882673940497, "learning_rate": 1.6364919103422394e-06, "loss": 1.2281, "step": 2574 }, { "epoch": 3.271918678526048, "grad_norm": 1.8539367045431134, "learning_rate": 1.6309549865563047e-06, "loss": 1.213, "step": 2575 }, { "epoch": 3.273189326556544, "grad_norm": 2.008826510862433, "learning_rate": 1.6254266137051077e-06, "loss": 1.3727, "step": 2576 }, { "epoch": 3.2744599745870393, "grad_norm": 1.7726019489981462, "learning_rate": 1.619906797437173e-06, "loss": 1.3896, "step": 2577 }, { "epoch": 3.275730622617535, "grad_norm": 1.4784689844202854, "learning_rate": 1.6143955433922864e-06, "loss": 1.2795, "step": 2578 }, { "epoch": 3.2770012706480305, "grad_norm": 1.586961648155315, "learning_rate": 1.6088928572014795e-06, "loss": 1.265, "step": 2579 }, { "epoch": 3.2782719186785263, "grad_norm": 1.493365217093996, "learning_rate": 1.6033987444870303e-06, "loss": 1.2889, "step": 2580 }, { "epoch": 3.2795425667090217, "grad_norm": 1.6544268804041813, "learning_rate": 1.5979132108624572e-06, "loss": 1.3982, "step": 2581 }, { "epoch": 3.280813214739517, "grad_norm": 1.7778419407714103, "learning_rate": 1.5924362619325184e-06, "loss": 1.4281, "step": 2582 }, { "epoch": 3.282083862770013, "grad_norm": 1.5289639491789007, "learning_rate": 1.586967903293194e-06, "loss": 1.2833, "step": 2583 }, { "epoch": 3.283354510800508, "grad_norm": 1.85855118389914, "learning_rate": 1.5815081405316912e-06, "loss": 1.1843, "step": 2584 }, { "epoch": 3.2846251588310036, "grad_norm": 1.9327977737083843, "learning_rate": 1.5760569792264324e-06, "loss": 1.4376, "step": 2585 }, { "epoch": 3.2858958068614994, "grad_norm": 2.0395641152232797, "learning_rate": 1.5706144249470545e-06, "loss": 1.2756, "step": 2586 }, { "epoch": 3.2871664548919948, "grad_norm": 1.8734734560385995, "learning_rate": 1.565180483254396e-06, "loss": 1.2866, "step": 2587 }, { "epoch": 3.2884371029224906, "grad_norm": 1.9185335011211122, "learning_rate": 1.5597551597004968e-06, "loss": 1.5165, "step": 2588 }, { "epoch": 3.289707750952986, "grad_norm": 1.7423865067684943, "learning_rate": 1.5543384598285938e-06, "loss": 1.1624, "step": 2589 }, { "epoch": 3.2909783989834818, "grad_norm": 1.9425305357709244, "learning_rate": 1.5489303891731144e-06, "loss": 1.4843, "step": 2590 }, { "epoch": 3.292249047013977, "grad_norm": 2.0576161228612673, "learning_rate": 1.5435309532596644e-06, "loss": 1.1159, "step": 2591 }, { "epoch": 3.2935196950444725, "grad_norm": 1.799508736226043, "learning_rate": 1.538140157605027e-06, "loss": 1.4685, "step": 2592 }, { "epoch": 3.2947903430749683, "grad_norm": 1.653492445202349, "learning_rate": 1.5327580077171589e-06, "loss": 1.3122, "step": 2593 }, { "epoch": 3.2960609911054637, "grad_norm": 1.8240709075790245, "learning_rate": 1.5273845090951877e-06, "loss": 1.2987, "step": 2594 }, { "epoch": 3.2973316391359595, "grad_norm": 1.6468537576431075, "learning_rate": 1.522019667229393e-06, "loss": 1.266, "step": 2595 }, { "epoch": 3.298602287166455, "grad_norm": 1.7727011306314875, "learning_rate": 1.5166634876012187e-06, "loss": 1.3907, "step": 2596 }, { "epoch": 3.2998729351969507, "grad_norm": 1.7476788965426242, "learning_rate": 1.5113159756832497e-06, "loss": 1.2959, "step": 2597 }, { "epoch": 3.301143583227446, "grad_norm": 1.6468660618165196, "learning_rate": 1.5059771369392229e-06, "loss": 1.0908, "step": 2598 }, { "epoch": 3.3024142312579414, "grad_norm": 1.5128065024394974, "learning_rate": 1.5006469768240062e-06, "loss": 1.2446, "step": 2599 }, { "epoch": 3.3036848792884372, "grad_norm": 1.8642886178980311, "learning_rate": 1.4953255007836021e-06, "loss": 1.2274, "step": 2600 }, { "epoch": 3.3049555273189326, "grad_norm": 1.6539737668218981, "learning_rate": 1.4900127142551446e-06, "loss": 1.3047, "step": 2601 }, { "epoch": 3.306226175349428, "grad_norm": 1.5516833997523842, "learning_rate": 1.4847086226668871e-06, "loss": 1.3523, "step": 2602 }, { "epoch": 3.307496823379924, "grad_norm": 2.088689276710946, "learning_rate": 1.479413231438197e-06, "loss": 1.5014, "step": 2603 }, { "epoch": 3.308767471410419, "grad_norm": 1.9576724887018369, "learning_rate": 1.4741265459795517e-06, "loss": 1.2595, "step": 2604 }, { "epoch": 3.310038119440915, "grad_norm": 1.838294166050524, "learning_rate": 1.4688485716925394e-06, "loss": 1.3377, "step": 2605 }, { "epoch": 3.3113087674714103, "grad_norm": 1.8197922760443668, "learning_rate": 1.4635793139698384e-06, "loss": 1.4283, "step": 2606 }, { "epoch": 3.312579415501906, "grad_norm": 1.7019671459826233, "learning_rate": 1.4583187781952335e-06, "loss": 1.2218, "step": 2607 }, { "epoch": 3.3138500635324015, "grad_norm": 1.8758449204713026, "learning_rate": 1.4530669697435861e-06, "loss": 1.2324, "step": 2608 }, { "epoch": 3.315120711562897, "grad_norm": 1.8105019506396125, "learning_rate": 1.4478238939808454e-06, "loss": 1.2777, "step": 2609 }, { "epoch": 3.3163913595933927, "grad_norm": 1.87633395598687, "learning_rate": 1.4425895562640424e-06, "loss": 1.3799, "step": 2610 }, { "epoch": 3.317662007623888, "grad_norm": 1.7606582062257212, "learning_rate": 1.4373639619412715e-06, "loss": 1.3026, "step": 2611 }, { "epoch": 3.318932655654384, "grad_norm": 1.6010525932324566, "learning_rate": 1.4321471163516998e-06, "loss": 1.168, "step": 2612 }, { "epoch": 3.3202033036848793, "grad_norm": 1.6255357989695598, "learning_rate": 1.4269390248255521e-06, "loss": 1.1377, "step": 2613 }, { "epoch": 3.321473951715375, "grad_norm": 1.6876307022543744, "learning_rate": 1.4217396926841153e-06, "loss": 1.2671, "step": 2614 }, { "epoch": 3.3227445997458704, "grad_norm": 1.699645779116517, "learning_rate": 1.4165491252397202e-06, "loss": 1.1274, "step": 2615 }, { "epoch": 3.324015247776366, "grad_norm": 1.6818213641961743, "learning_rate": 1.4113673277957395e-06, "loss": 1.4025, "step": 2616 }, { "epoch": 3.3252858958068616, "grad_norm": 3.2032076510413536, "learning_rate": 1.4061943056465965e-06, "loss": 1.3181, "step": 2617 }, { "epoch": 3.326556543837357, "grad_norm": 1.8105928565910216, "learning_rate": 1.4010300640777352e-06, "loss": 1.6368, "step": 2618 }, { "epoch": 3.3278271918678524, "grad_norm": 1.542688925757691, "learning_rate": 1.3958746083656428e-06, "loss": 1.2069, "step": 2619 }, { "epoch": 3.329097839898348, "grad_norm": 1.71104874448952, "learning_rate": 1.3907279437778154e-06, "loss": 1.098, "step": 2620 }, { "epoch": 3.3303684879288435, "grad_norm": 1.8722048302574334, "learning_rate": 1.3855900755727747e-06, "loss": 1.1335, "step": 2621 }, { "epoch": 3.3316391359593394, "grad_norm": 1.9022709916857448, "learning_rate": 1.3804610090000558e-06, "loss": 1.4643, "step": 2622 }, { "epoch": 3.3329097839898347, "grad_norm": 1.923857938211866, "learning_rate": 1.3753407493001968e-06, "loss": 1.2675, "step": 2623 }, { "epoch": 3.3341804320203305, "grad_norm": 2.065059666018281, "learning_rate": 1.3702293017047375e-06, "loss": 1.2717, "step": 2624 }, { "epoch": 3.335451080050826, "grad_norm": 1.7471415191321704, "learning_rate": 1.3651266714362166e-06, "loss": 1.2484, "step": 2625 }, { "epoch": 3.3367217280813213, "grad_norm": 1.7389887302213156, "learning_rate": 1.3600328637081672e-06, "loss": 1.3138, "step": 2626 }, { "epoch": 3.337992376111817, "grad_norm": 1.594221698681159, "learning_rate": 1.3549478837250995e-06, "loss": 1.2117, "step": 2627 }, { "epoch": 3.3392630241423125, "grad_norm": 2.0026558850987137, "learning_rate": 1.3498717366825086e-06, "loss": 1.3661, "step": 2628 }, { "epoch": 3.3405336721728083, "grad_norm": 1.5019394801634307, "learning_rate": 1.3448044277668682e-06, "loss": 1.2723, "step": 2629 }, { "epoch": 3.3418043202033036, "grad_norm": 1.92187489823837, "learning_rate": 1.339745962155613e-06, "loss": 1.2405, "step": 2630 }, { "epoch": 3.3430749682337995, "grad_norm": 1.834804762069544, "learning_rate": 1.3346963450171536e-06, "loss": 1.1661, "step": 2631 }, { "epoch": 3.344345616264295, "grad_norm": 1.7181637503941978, "learning_rate": 1.329655581510847e-06, "loss": 1.2759, "step": 2632 }, { "epoch": 3.34561626429479, "grad_norm": 1.9299844130635269, "learning_rate": 1.324623676787017e-06, "loss": 1.2278, "step": 2633 }, { "epoch": 3.346886912325286, "grad_norm": 1.9112199664175715, "learning_rate": 1.3196006359869273e-06, "loss": 1.2686, "step": 2634 }, { "epoch": 3.3481575603557814, "grad_norm": 1.7199969917035751, "learning_rate": 1.3145864642427841e-06, "loss": 1.4043, "step": 2635 }, { "epoch": 3.349428208386277, "grad_norm": 1.8152094794925473, "learning_rate": 1.3095811666777413e-06, "loss": 1.2477, "step": 2636 }, { "epoch": 3.3506988564167726, "grad_norm": 1.7537576703915967, "learning_rate": 1.3045847484058748e-06, "loss": 1.1339, "step": 2637 }, { "epoch": 3.351969504447268, "grad_norm": 1.7522948208376807, "learning_rate": 1.2995972145321979e-06, "loss": 1.1855, "step": 2638 }, { "epoch": 3.3532401524777637, "grad_norm": 1.9192958082012628, "learning_rate": 1.2946185701526392e-06, "loss": 1.1036, "step": 2639 }, { "epoch": 3.354510800508259, "grad_norm": 1.9867290806679554, "learning_rate": 1.2896488203540447e-06, "loss": 1.4721, "step": 2640 }, { "epoch": 3.355781448538755, "grad_norm": 1.9360686626231494, "learning_rate": 1.2846879702141769e-06, "loss": 1.4588, "step": 2641 }, { "epoch": 3.3570520965692503, "grad_norm": 1.9907414210884793, "learning_rate": 1.2797360248017055e-06, "loss": 1.3047, "step": 2642 }, { "epoch": 3.3583227445997457, "grad_norm": 1.7923504196629938, "learning_rate": 1.2747929891761978e-06, "loss": 1.355, "step": 2643 }, { "epoch": 3.3595933926302415, "grad_norm": 1.7949718703785045, "learning_rate": 1.2698588683881185e-06, "loss": 1.1143, "step": 2644 }, { "epoch": 3.360864040660737, "grad_norm": 1.7318736966786377, "learning_rate": 1.264933667478827e-06, "loss": 1.3079, "step": 2645 }, { "epoch": 3.3621346886912327, "grad_norm": 1.6551174733979463, "learning_rate": 1.2600173914805647e-06, "loss": 1.1932, "step": 2646 }, { "epoch": 3.363405336721728, "grad_norm": 1.5903014941151614, "learning_rate": 1.2551100454164556e-06, "loss": 1.0314, "step": 2647 }, { "epoch": 3.364675984752224, "grad_norm": 1.6469867484327598, "learning_rate": 1.2502116343005033e-06, "loss": 1.2687, "step": 2648 }, { "epoch": 3.365946632782719, "grad_norm": 1.7078274190427007, "learning_rate": 1.2453221631375755e-06, "loss": 1.4505, "step": 2649 }, { "epoch": 3.3672172808132146, "grad_norm": 1.7782134771535814, "learning_rate": 1.240441636923413e-06, "loss": 1.2899, "step": 2650 }, { "epoch": 3.3684879288437104, "grad_norm": 1.844153101820231, "learning_rate": 1.2355700606446119e-06, "loss": 1.1901, "step": 2651 }, { "epoch": 3.3697585768742058, "grad_norm": 1.9132378925639788, "learning_rate": 1.2307074392786233e-06, "loss": 1.267, "step": 2652 }, { "epoch": 3.3710292249047016, "grad_norm": 2.0661967196773086, "learning_rate": 1.2258537777937517e-06, "loss": 1.2886, "step": 2653 }, { "epoch": 3.372299872935197, "grad_norm": 1.7694658087370272, "learning_rate": 1.2210090811491515e-06, "loss": 1.1099, "step": 2654 }, { "epoch": 3.3735705209656923, "grad_norm": 1.7817581559328577, "learning_rate": 1.2161733542948073e-06, "loss": 1.38, "step": 2655 }, { "epoch": 3.374841168996188, "grad_norm": 1.7130467222950003, "learning_rate": 1.2113466021715426e-06, "loss": 1.4198, "step": 2656 }, { "epoch": 3.3761118170266835, "grad_norm": 1.6031405696122085, "learning_rate": 1.2065288297110167e-06, "loss": 1.2814, "step": 2657 }, { "epoch": 3.3773824650571793, "grad_norm": 1.8655197755258583, "learning_rate": 1.2017200418357077e-06, "loss": 1.3108, "step": 2658 }, { "epoch": 3.3786531130876747, "grad_norm": 1.7812268166685585, "learning_rate": 1.1969202434589133e-06, "loss": 1.2573, "step": 2659 }, { "epoch": 3.37992376111817, "grad_norm": 1.6440103317494115, "learning_rate": 1.1921294394847537e-06, "loss": 1.2683, "step": 2660 }, { "epoch": 3.381194409148666, "grad_norm": 1.7585958796842118, "learning_rate": 1.1873476348081514e-06, "loss": 1.2807, "step": 2661 }, { "epoch": 3.3824650571791612, "grad_norm": 2.5852035542332876, "learning_rate": 1.182574834314838e-06, "loss": 1.1824, "step": 2662 }, { "epoch": 3.383735705209657, "grad_norm": 1.5086924890786593, "learning_rate": 1.177811042881345e-06, "loss": 1.2598, "step": 2663 }, { "epoch": 3.3850063532401524, "grad_norm": 1.882253458637874, "learning_rate": 1.1730562653749956e-06, "loss": 1.1506, "step": 2664 }, { "epoch": 3.3862770012706482, "grad_norm": 1.743303518267941, "learning_rate": 1.1683105066539068e-06, "loss": 1.361, "step": 2665 }, { "epoch": 3.3875476493011436, "grad_norm": 1.47191660579717, "learning_rate": 1.1635737715669827e-06, "loss": 1.2394, "step": 2666 }, { "epoch": 3.388818297331639, "grad_norm": 1.9385081502225063, "learning_rate": 1.1588460649539036e-06, "loss": 1.3206, "step": 2667 }, { "epoch": 3.390088945362135, "grad_norm": 1.85582224027926, "learning_rate": 1.1541273916451234e-06, "loss": 1.3906, "step": 2668 }, { "epoch": 3.39135959339263, "grad_norm": 1.8045536579743726, "learning_rate": 1.1494177564618724e-06, "loss": 1.1999, "step": 2669 }, { "epoch": 3.392630241423126, "grad_norm": 1.9045572029921076, "learning_rate": 1.1447171642161415e-06, "loss": 1.3235, "step": 2670 }, { "epoch": 3.3939008894536213, "grad_norm": 1.9426040842359356, "learning_rate": 1.1400256197106873e-06, "loss": 1.247, "step": 2671 }, { "epoch": 3.3951715374841167, "grad_norm": 1.9443946239187213, "learning_rate": 1.1353431277390125e-06, "loss": 0.9854, "step": 2672 }, { "epoch": 3.3964421855146125, "grad_norm": 1.9467877942662004, "learning_rate": 1.1306696930853834e-06, "loss": 1.1952, "step": 2673 }, { "epoch": 3.397712833545108, "grad_norm": 1.8498200669109355, "learning_rate": 1.1260053205248023e-06, "loss": 0.94, "step": 2674 }, { "epoch": 3.3989834815756037, "grad_norm": 2.180463686085276, "learning_rate": 1.121350014823014e-06, "loss": 1.2531, "step": 2675 }, { "epoch": 3.400254129606099, "grad_norm": 1.7466906155479762, "learning_rate": 1.116703780736501e-06, "loss": 1.2283, "step": 2676 }, { "epoch": 3.4015247776365944, "grad_norm": 1.6930497827444948, "learning_rate": 1.1120666230124777e-06, "loss": 1.1114, "step": 2677 }, { "epoch": 3.4027954256670903, "grad_norm": 1.725500785012662, "learning_rate": 1.107438546388887e-06, "loss": 1.2197, "step": 2678 }, { "epoch": 3.4040660736975856, "grad_norm": 1.6125906508638936, "learning_rate": 1.1028195555943877e-06, "loss": 1.2707, "step": 2679 }, { "epoch": 3.4053367217280814, "grad_norm": 1.7063770961027755, "learning_rate": 1.0982096553483568e-06, "loss": 1.3152, "step": 2680 }, { "epoch": 3.406607369758577, "grad_norm": 1.8909990165065969, "learning_rate": 1.0936088503608876e-06, "loss": 1.3825, "step": 2681 }, { "epoch": 3.4078780177890726, "grad_norm": 1.8107735180118376, "learning_rate": 1.0890171453327735e-06, "loss": 1.1619, "step": 2682 }, { "epoch": 3.409148665819568, "grad_norm": 1.6224194884354048, "learning_rate": 1.0844345449555172e-06, "loss": 1.2711, "step": 2683 }, { "epoch": 3.4104193138500634, "grad_norm": 1.8196149695370645, "learning_rate": 1.079861053911313e-06, "loss": 1.2594, "step": 2684 }, { "epoch": 3.411689961880559, "grad_norm": 2.034877917783471, "learning_rate": 1.0752966768730543e-06, "loss": 1.3763, "step": 2685 }, { "epoch": 3.4129606099110545, "grad_norm": 2.020320327891875, "learning_rate": 1.0707414185043163e-06, "loss": 1.1893, "step": 2686 }, { "epoch": 3.4142312579415504, "grad_norm": 1.6828139208453592, "learning_rate": 1.066195283459359e-06, "loss": 1.4082, "step": 2687 }, { "epoch": 3.4155019059720457, "grad_norm": 2.078181012396198, "learning_rate": 1.0616582763831206e-06, "loss": 1.429, "step": 2688 }, { "epoch": 3.4167725540025415, "grad_norm": 1.9832043832040667, "learning_rate": 1.057130401911215e-06, "loss": 1.3872, "step": 2689 }, { "epoch": 3.418043202033037, "grad_norm": 1.6905291246322078, "learning_rate": 1.0526116646699269e-06, "loss": 1.4047, "step": 2690 }, { "epoch": 3.4193138500635323, "grad_norm": 1.9283552129583106, "learning_rate": 1.048102069276199e-06, "loss": 1.2151, "step": 2691 }, { "epoch": 3.420584498094028, "grad_norm": 2.065009343914376, "learning_rate": 1.0436016203376343e-06, "loss": 1.4083, "step": 2692 }, { "epoch": 3.4218551461245235, "grad_norm": 1.5669351407838714, "learning_rate": 1.0391103224524957e-06, "loss": 1.2232, "step": 2693 }, { "epoch": 3.423125794155019, "grad_norm": 1.5827806159536164, "learning_rate": 1.0346281802096946e-06, "loss": 1.1738, "step": 2694 }, { "epoch": 3.4243964421855146, "grad_norm": 2.0284739060049457, "learning_rate": 1.0301551981887848e-06, "loss": 1.3213, "step": 2695 }, { "epoch": 3.42566709021601, "grad_norm": 1.687141604822629, "learning_rate": 1.0256913809599611e-06, "loss": 1.2716, "step": 2696 }, { "epoch": 3.426937738246506, "grad_norm": 1.765244292834096, "learning_rate": 1.021236733084059e-06, "loss": 1.3456, "step": 2697 }, { "epoch": 3.428208386277001, "grad_norm": 1.4938395155118565, "learning_rate": 1.0167912591125407e-06, "loss": 1.2981, "step": 2698 }, { "epoch": 3.429479034307497, "grad_norm": 1.704721002072044, "learning_rate": 1.012354963587493e-06, "loss": 1.2263, "step": 2699 }, { "epoch": 3.4307496823379924, "grad_norm": 1.7142357107118333, "learning_rate": 1.0079278510416313e-06, "loss": 1.2548, "step": 2700 }, { "epoch": 3.4320203303684877, "grad_norm": 1.9947239089373425, "learning_rate": 1.0035099259982873e-06, "loss": 1.2137, "step": 2701 }, { "epoch": 3.4332909783989836, "grad_norm": 1.7276553341679752, "learning_rate": 9.99101192971401e-07, "loss": 1.1533, "step": 2702 }, { "epoch": 3.434561626429479, "grad_norm": 1.7686756288098728, "learning_rate": 9.947016564655243e-07, "loss": 1.3311, "step": 2703 }, { "epoch": 3.4358322744599747, "grad_norm": 1.820270383580354, "learning_rate": 9.903113209758098e-07, "loss": 1.1685, "step": 2704 }, { "epoch": 3.43710292249047, "grad_norm": 2.156402860822522, "learning_rate": 9.859301909880103e-07, "loss": 1.3577, "step": 2705 }, { "epoch": 3.438373570520966, "grad_norm": 1.8012062861999263, "learning_rate": 9.815582709784788e-07, "loss": 1.4139, "step": 2706 }, { "epoch": 3.4396442185514613, "grad_norm": 1.797709871472713, "learning_rate": 9.771955654141496e-07, "loss": 1.3349, "step": 2707 }, { "epoch": 3.4409148665819567, "grad_norm": 1.7472924648225128, "learning_rate": 9.728420787525428e-07, "loss": 1.2374, "step": 2708 }, { "epoch": 3.4421855146124525, "grad_norm": 1.4860934647077977, "learning_rate": 9.684978154417678e-07, "loss": 1.194, "step": 2709 }, { "epoch": 3.443456162642948, "grad_norm": 1.531343096586481, "learning_rate": 9.641627799205012e-07, "loss": 1.2311, "step": 2710 }, { "epoch": 3.444726810673443, "grad_norm": 1.6723020451318749, "learning_rate": 9.598369766179937e-07, "loss": 1.3318, "step": 2711 }, { "epoch": 3.445997458703939, "grad_norm": 1.8285098197851093, "learning_rate": 9.55520409954066e-07, "loss": 1.255, "step": 2712 }, { "epoch": 3.4472681067344344, "grad_norm": 1.593358400223005, "learning_rate": 9.512130843390998e-07, "loss": 1.3607, "step": 2713 }, { "epoch": 3.44853875476493, "grad_norm": 1.793270101846004, "learning_rate": 9.469150041740338e-07, "loss": 1.2904, "step": 2714 }, { "epoch": 3.4498094027954256, "grad_norm": 1.8526523892890903, "learning_rate": 9.426261738503617e-07, "loss": 1.3375, "step": 2715 }, { "epoch": 3.4510800508259214, "grad_norm": 2.1340765529575054, "learning_rate": 9.383465977501227e-07, "loss": 1.0028, "step": 2716 }, { "epoch": 3.4523506988564168, "grad_norm": 1.7981696709178416, "learning_rate": 9.340762802459047e-07, "loss": 1.1427, "step": 2717 }, { "epoch": 3.453621346886912, "grad_norm": 1.785072921646824, "learning_rate": 9.298152257008386e-07, "loss": 1.2483, "step": 2718 }, { "epoch": 3.454891994917408, "grad_norm": 1.6301498593465438, "learning_rate": 9.255634384685841e-07, "loss": 1.3141, "step": 2719 }, { "epoch": 3.4561626429479033, "grad_norm": 1.62370888948084, "learning_rate": 9.213209228933339e-07, "loss": 1.2085, "step": 2720 }, { "epoch": 3.457433290978399, "grad_norm": 1.7534655841893003, "learning_rate": 9.170876833098119e-07, "loss": 1.3301, "step": 2721 }, { "epoch": 3.4587039390088945, "grad_norm": 1.806965898077104, "learning_rate": 9.128637240432581e-07, "loss": 1.4144, "step": 2722 }, { "epoch": 3.4599745870393903, "grad_norm": 1.731938997833424, "learning_rate": 9.086490494094369e-07, "loss": 1.2284, "step": 2723 }, { "epoch": 3.4612452350698857, "grad_norm": 1.737150913677021, "learning_rate": 9.044436637146204e-07, "loss": 1.071, "step": 2724 }, { "epoch": 3.462515883100381, "grad_norm": 1.6466063414818866, "learning_rate": 9.002475712555959e-07, "loss": 0.9581, "step": 2725 }, { "epoch": 3.463786531130877, "grad_norm": 1.5537041351043512, "learning_rate": 8.960607763196494e-07, "loss": 1.301, "step": 2726 }, { "epoch": 3.4650571791613722, "grad_norm": 1.776184933266563, "learning_rate": 8.918832831845714e-07, "loss": 1.4461, "step": 2727 }, { "epoch": 3.4663278271918676, "grad_norm": 1.7993064628118447, "learning_rate": 8.87715096118642e-07, "loss": 1.1932, "step": 2728 }, { "epoch": 3.4675984752223634, "grad_norm": 1.7030496492394833, "learning_rate": 8.835562193806469e-07, "loss": 1.2039, "step": 2729 }, { "epoch": 3.468869123252859, "grad_norm": 1.7805576415021542, "learning_rate": 8.794066572198456e-07, "loss": 1.3519, "step": 2730 }, { "epoch": 3.4701397712833546, "grad_norm": 2.3091277332278177, "learning_rate": 8.752664138759858e-07, "loss": 1.1834, "step": 2731 }, { "epoch": 3.47141041931385, "grad_norm": 1.9229797089060061, "learning_rate": 8.711354935792926e-07, "loss": 1.0087, "step": 2732 }, { "epoch": 3.472681067344346, "grad_norm": 1.942133714071417, "learning_rate": 8.670139005504674e-07, "loss": 1.5387, "step": 2733 }, { "epoch": 3.473951715374841, "grad_norm": 1.6671088931646612, "learning_rate": 8.629016390006783e-07, "loss": 1.2378, "step": 2734 }, { "epoch": 3.4752223634053365, "grad_norm": 1.5640065614416736, "learning_rate": 8.587987131315656e-07, "loss": 1.3365, "step": 2735 }, { "epoch": 3.4764930114358323, "grad_norm": 1.8177900821827773, "learning_rate": 8.547051271352213e-07, "loss": 1.0777, "step": 2736 }, { "epoch": 3.4777636594663277, "grad_norm": 1.7065655482669073, "learning_rate": 8.506208851942043e-07, "loss": 1.2597, "step": 2737 }, { "epoch": 3.4790343074968235, "grad_norm": 1.6404825855437422, "learning_rate": 8.4654599148152e-07, "loss": 1.2792, "step": 2738 }, { "epoch": 3.480304955527319, "grad_norm": 1.6952150831350192, "learning_rate": 8.424804501606254e-07, "loss": 1.3297, "step": 2739 }, { "epoch": 3.4815756035578147, "grad_norm": 1.744695544224006, "learning_rate": 8.384242653854146e-07, "loss": 1.2434, "step": 2740 }, { "epoch": 3.48284625158831, "grad_norm": 1.6822982224698178, "learning_rate": 8.343774413002382e-07, "loss": 1.1478, "step": 2741 }, { "epoch": 3.4841168996188054, "grad_norm": 1.596300241861607, "learning_rate": 8.303399820398672e-07, "loss": 1.3028, "step": 2742 }, { "epoch": 3.4853875476493013, "grad_norm": 1.8058411192139043, "learning_rate": 8.263118917295088e-07, "loss": 1.4362, "step": 2743 }, { "epoch": 3.4866581956797966, "grad_norm": 1.7936411191700221, "learning_rate": 8.222931744847984e-07, "loss": 1.2268, "step": 2744 }, { "epoch": 3.4879288437102924, "grad_norm": 1.956650496418258, "learning_rate": 8.182838344117971e-07, "loss": 1.3653, "step": 2745 }, { "epoch": 3.489199491740788, "grad_norm": 1.9750594248748314, "learning_rate": 8.142838756069793e-07, "loss": 1.1586, "step": 2746 }, { "epoch": 3.490470139771283, "grad_norm": 1.569757205227885, "learning_rate": 8.102933021572412e-07, "loss": 1.3598, "step": 2747 }, { "epoch": 3.491740787801779, "grad_norm": 1.8717609483597424, "learning_rate": 8.063121181398814e-07, "loss": 1.3104, "step": 2748 }, { "epoch": 3.4930114358322744, "grad_norm": 1.6794716570302477, "learning_rate": 8.023403276226127e-07, "loss": 1.0687, "step": 2749 }, { "epoch": 3.49428208386277, "grad_norm": 2.152611708136472, "learning_rate": 7.983779346635479e-07, "loss": 1.3144, "step": 2750 }, { "epoch": 3.4955527318932655, "grad_norm": 1.6025361349231622, "learning_rate": 7.944249433111917e-07, "loss": 1.0832, "step": 2751 }, { "epoch": 3.496823379923761, "grad_norm": 1.9966220200266624, "learning_rate": 7.904813576044534e-07, "loss": 1.5379, "step": 2752 }, { "epoch": 3.4980940279542567, "grad_norm": 1.9867104129705102, "learning_rate": 7.865471815726266e-07, "loss": 1.3133, "step": 2753 }, { "epoch": 3.499364675984752, "grad_norm": 1.7758698233245882, "learning_rate": 7.826224192353916e-07, "loss": 1.2638, "step": 2754 }, { "epoch": 3.500635324015248, "grad_norm": 1.7856567193948802, "learning_rate": 7.78707074602808e-07, "loss": 1.2643, "step": 2755 }, { "epoch": 3.5019059720457433, "grad_norm": 1.6819281034220812, "learning_rate": 7.74801151675314e-07, "loss": 0.9395, "step": 2756 }, { "epoch": 3.503176620076239, "grad_norm": 1.8774041278593898, "learning_rate": 7.709046544437238e-07, "loss": 1.3941, "step": 2757 }, { "epoch": 3.5044472681067345, "grad_norm": 1.7228875613432229, "learning_rate": 7.670175868892227e-07, "loss": 1.1982, "step": 2758 }, { "epoch": 3.50571791613723, "grad_norm": 1.9901388879104207, "learning_rate": 7.63139952983356e-07, "loss": 1.2779, "step": 2759 }, { "epoch": 3.5069885641677256, "grad_norm": 1.856781653989378, "learning_rate": 7.592717566880304e-07, "loss": 1.4188, "step": 2760 }, { "epoch": 3.508259212198221, "grad_norm": 1.7613715805684167, "learning_rate": 7.554130019555161e-07, "loss": 1.1357, "step": 2761 }, { "epoch": 3.5095298602287164, "grad_norm": 1.7158166759805475, "learning_rate": 7.515636927284309e-07, "loss": 1.208, "step": 2762 }, { "epoch": 3.510800508259212, "grad_norm": 1.591258186122319, "learning_rate": 7.477238329397419e-07, "loss": 0.8926, "step": 2763 }, { "epoch": 3.512071156289708, "grad_norm": 1.7451266194100759, "learning_rate": 7.43893426512764e-07, "loss": 1.3221, "step": 2764 }, { "epoch": 3.5133418043202034, "grad_norm": 1.7447778840735144, "learning_rate": 7.400724773611545e-07, "loss": 1.3805, "step": 2765 }, { "epoch": 3.5146124523506987, "grad_norm": 1.6199393605082222, "learning_rate": 7.362609893889028e-07, "loss": 1.219, "step": 2766 }, { "epoch": 3.5158831003811946, "grad_norm": 1.6402867048923917, "learning_rate": 7.324589664903359e-07, "loss": 1.3827, "step": 2767 }, { "epoch": 3.51715374841169, "grad_norm": 1.5917563468080689, "learning_rate": 7.286664125501064e-07, "loss": 1.3233, "step": 2768 }, { "epoch": 3.5184243964421853, "grad_norm": 1.6203147189496971, "learning_rate": 7.248833314431958e-07, "loss": 1.2932, "step": 2769 }, { "epoch": 3.519695044472681, "grad_norm": 1.868463453498782, "learning_rate": 7.211097270349065e-07, "loss": 1.3553, "step": 2770 }, { "epoch": 3.5209656925031765, "grad_norm": 1.5099116369166523, "learning_rate": 7.173456031808568e-07, "loss": 1.3424, "step": 2771 }, { "epoch": 3.5222363405336723, "grad_norm": 1.8810330792980126, "learning_rate": 7.135909637269745e-07, "loss": 1.3142, "step": 2772 }, { "epoch": 3.5235069885641677, "grad_norm": 1.7933284606688886, "learning_rate": 7.098458125095064e-07, "loss": 1.3502, "step": 2773 }, { "epoch": 3.5247776365946635, "grad_norm": 1.4080364805760883, "learning_rate": 7.061101533549952e-07, "loss": 1.3269, "step": 2774 }, { "epoch": 3.526048284625159, "grad_norm": 1.722577290621046, "learning_rate": 7.023839900802931e-07, "loss": 1.2447, "step": 2775 }, { "epoch": 3.527318932655654, "grad_norm": 1.6793120859124784, "learning_rate": 6.986673264925437e-07, "loss": 1.2638, "step": 2776 }, { "epoch": 3.52858958068615, "grad_norm": 1.5963888079729813, "learning_rate": 6.949601663891891e-07, "loss": 1.1815, "step": 2777 }, { "epoch": 3.5298602287166454, "grad_norm": 1.5495246513860983, "learning_rate": 6.912625135579587e-07, "loss": 1.1055, "step": 2778 }, { "epoch": 3.5311308767471408, "grad_norm": 1.99558693420346, "learning_rate": 6.875743717768679e-07, "loss": 1.2659, "step": 2779 }, { "epoch": 3.5324015247776366, "grad_norm": 1.786030201932944, "learning_rate": 6.838957448142136e-07, "loss": 1.2315, "step": 2780 }, { "epoch": 3.5336721728081324, "grad_norm": 1.5668698975548767, "learning_rate": 6.802266364285782e-07, "loss": 1.0094, "step": 2781 }, { "epoch": 3.5349428208386278, "grad_norm": 1.6261749245699866, "learning_rate": 6.765670503688093e-07, "loss": 1.1143, "step": 2782 }, { "epoch": 3.536213468869123, "grad_norm": 1.5394960346250208, "learning_rate": 6.729169903740296e-07, "loss": 1.3297, "step": 2783 }, { "epoch": 3.537484116899619, "grad_norm": 1.8368062621220478, "learning_rate": 6.692764601736268e-07, "loss": 1.4588, "step": 2784 }, { "epoch": 3.5387547649301143, "grad_norm": 1.6342571665890158, "learning_rate": 6.656454634872556e-07, "loss": 1.3219, "step": 2785 }, { "epoch": 3.5400254129606097, "grad_norm": 1.6687677275629142, "learning_rate": 6.62024004024825e-07, "loss": 1.5602, "step": 2786 }, { "epoch": 3.5412960609911055, "grad_norm": 1.7375449094773614, "learning_rate": 6.58412085486505e-07, "loss": 1.2297, "step": 2787 }, { "epoch": 3.542566709021601, "grad_norm": 1.552416702318075, "learning_rate": 6.548097115627106e-07, "loss": 1.2901, "step": 2788 }, { "epoch": 3.5438373570520967, "grad_norm": 1.9130146549727756, "learning_rate": 6.512168859341117e-07, "loss": 1.3471, "step": 2789 }, { "epoch": 3.545108005082592, "grad_norm": 1.7147759293088716, "learning_rate": 6.476336122716175e-07, "loss": 1.205, "step": 2790 }, { "epoch": 3.546378653113088, "grad_norm": 2.0546460990652347, "learning_rate": 6.440598942363796e-07, "loss": 1.3571, "step": 2791 }, { "epoch": 3.5476493011435832, "grad_norm": 1.503927361267473, "learning_rate": 6.404957354797825e-07, "loss": 1.3311, "step": 2792 }, { "epoch": 3.5489199491740786, "grad_norm": 1.737486376743467, "learning_rate": 6.369411396434522e-07, "loss": 1.0611, "step": 2793 }, { "epoch": 3.5501905972045744, "grad_norm": 1.645776561560631, "learning_rate": 6.333961103592379e-07, "loss": 1.0911, "step": 2794 }, { "epoch": 3.55146124523507, "grad_norm": 2.112524956117361, "learning_rate": 6.298606512492134e-07, "loss": 1.5111, "step": 2795 }, { "epoch": 3.5527318932655656, "grad_norm": 1.4997925507175143, "learning_rate": 6.263347659256758e-07, "loss": 1.2825, "step": 2796 }, { "epoch": 3.554002541296061, "grad_norm": 1.6710261427064792, "learning_rate": 6.228184579911423e-07, "loss": 1.1149, "step": 2797 }, { "epoch": 3.555273189326557, "grad_norm": 1.621016823761684, "learning_rate": 6.193117310383412e-07, "loss": 1.296, "step": 2798 }, { "epoch": 3.556543837357052, "grad_norm": 1.7308866298487022, "learning_rate": 6.158145886502165e-07, "loss": 1.1074, "step": 2799 }, { "epoch": 3.5578144853875475, "grad_norm": 1.7267963162549822, "learning_rate": 6.123270343999132e-07, "loss": 1.2309, "step": 2800 }, { "epoch": 3.5590851334180433, "grad_norm": 1.6143755437223648, "learning_rate": 6.088490718507845e-07, "loss": 1.2571, "step": 2801 }, { "epoch": 3.5603557814485387, "grad_norm": 1.484465575202803, "learning_rate": 6.053807045563808e-07, "loss": 1.2447, "step": 2802 }, { "epoch": 3.561626429479034, "grad_norm": 1.9146266955465872, "learning_rate": 6.019219360604489e-07, "loss": 1.3864, "step": 2803 }, { "epoch": 3.56289707750953, "grad_norm": 1.7988196675952783, "learning_rate": 5.984727698969306e-07, "loss": 1.2575, "step": 2804 }, { "epoch": 3.5641677255400253, "grad_norm": 1.7598875918220322, "learning_rate": 5.950332095899547e-07, "loss": 1.2743, "step": 2805 }, { "epoch": 3.565438373570521, "grad_norm": 1.8552392774785547, "learning_rate": 5.916032586538345e-07, "loss": 1.2884, "step": 2806 }, { "epoch": 3.5667090216010164, "grad_norm": 1.826409137549182, "learning_rate": 5.881829205930678e-07, "loss": 1.3555, "step": 2807 }, { "epoch": 3.5679796696315123, "grad_norm": 1.5524145194881622, "learning_rate": 5.847721989023258e-07, "loss": 1.3055, "step": 2808 }, { "epoch": 3.5692503176620076, "grad_norm": 1.7064428420759787, "learning_rate": 5.81371097066461e-07, "loss": 1.2404, "step": 2809 }, { "epoch": 3.570520965692503, "grad_norm": 1.6450930111039546, "learning_rate": 5.779796185604925e-07, "loss": 1.3306, "step": 2810 }, { "epoch": 3.571791613722999, "grad_norm": 1.762384800746856, "learning_rate": 5.745977668496084e-07, "loss": 1.4835, "step": 2811 }, { "epoch": 3.573062261753494, "grad_norm": 2.0210484040370686, "learning_rate": 5.71225545389158e-07, "loss": 1.4861, "step": 2812 }, { "epoch": 3.57433290978399, "grad_norm": 1.8646019100822542, "learning_rate": 5.678629576246575e-07, "loss": 1.359, "step": 2813 }, { "epoch": 3.5756035578144854, "grad_norm": 1.9664729844151032, "learning_rate": 5.64510006991772e-07, "loss": 1.1857, "step": 2814 }, { "epoch": 3.576874205844981, "grad_norm": 1.6227577074984558, "learning_rate": 5.611666969163243e-07, "loss": 1.1995, "step": 2815 }, { "epoch": 3.5781448538754765, "grad_norm": 1.778946623070315, "learning_rate": 5.578330308142887e-07, "loss": 1.0837, "step": 2816 }, { "epoch": 3.579415501905972, "grad_norm": 1.7167523857038194, "learning_rate": 5.54509012091784e-07, "loss": 1.2531, "step": 2817 }, { "epoch": 3.5806861499364677, "grad_norm": 1.7830165224929857, "learning_rate": 5.511946441450711e-07, "loss": 1.3503, "step": 2818 }, { "epoch": 3.581956797966963, "grad_norm": 1.566038817550546, "learning_rate": 5.478899303605512e-07, "loss": 1.1498, "step": 2819 }, { "epoch": 3.5832274459974585, "grad_norm": 1.6522474906143725, "learning_rate": 5.445948741147589e-07, "loss": 1.3361, "step": 2820 }, { "epoch": 3.5844980940279543, "grad_norm": 1.7629684303673683, "learning_rate": 5.413094787743678e-07, "loss": 0.9675, "step": 2821 }, { "epoch": 3.5857687420584496, "grad_norm": 1.4674919742802814, "learning_rate": 5.380337476961762e-07, "loss": 1.417, "step": 2822 }, { "epoch": 3.5870393900889455, "grad_norm": 1.8295725890516696, "learning_rate": 5.347676842271088e-07, "loss": 1.1349, "step": 2823 }, { "epoch": 3.588310038119441, "grad_norm": 2.1103444012235903, "learning_rate": 5.315112917042097e-07, "loss": 1.3127, "step": 2824 }, { "epoch": 3.5895806861499366, "grad_norm": 1.7780483291050404, "learning_rate": 5.282645734546477e-07, "loss": 1.3138, "step": 2825 }, { "epoch": 3.590851334180432, "grad_norm": 1.8841007440618305, "learning_rate": 5.250275327957033e-07, "loss": 1.3983, "step": 2826 }, { "epoch": 3.5921219822109274, "grad_norm": 1.646128930724859, "learning_rate": 5.218001730347688e-07, "loss": 1.0253, "step": 2827 }, { "epoch": 3.593392630241423, "grad_norm": 2.135120725044294, "learning_rate": 5.185824974693454e-07, "loss": 1.4821, "step": 2828 }, { "epoch": 3.5946632782719186, "grad_norm": 1.5744547156520436, "learning_rate": 5.153745093870443e-07, "loss": 1.3668, "step": 2829 }, { "epoch": 3.5959339263024144, "grad_norm": 1.581711380685895, "learning_rate": 5.121762120655727e-07, "loss": 1.2427, "step": 2830 }, { "epoch": 3.5972045743329097, "grad_norm": 1.5580465083953998, "learning_rate": 5.089876087727364e-07, "loss": 1.3276, "step": 2831 }, { "epoch": 3.5984752223634056, "grad_norm": 1.5977517162237127, "learning_rate": 5.058087027664404e-07, "loss": 1.1969, "step": 2832 }, { "epoch": 3.599745870393901, "grad_norm": 1.5332829739649898, "learning_rate": 5.026394972946813e-07, "loss": 1.2062, "step": 2833 }, { "epoch": 3.6010165184243963, "grad_norm": 1.910884717881685, "learning_rate": 4.994799955955409e-07, "loss": 1.4812, "step": 2834 }, { "epoch": 3.602287166454892, "grad_norm": 2.103982668673676, "learning_rate": 4.963302008971904e-07, "loss": 1.145, "step": 2835 }, { "epoch": 3.6035578144853875, "grad_norm": 1.6329999265083088, "learning_rate": 4.931901164178765e-07, "loss": 1.4175, "step": 2836 }, { "epoch": 3.604828462515883, "grad_norm": 1.994566922245568, "learning_rate": 4.90059745365935e-07, "loss": 1.4077, "step": 2837 }, { "epoch": 3.6060991105463787, "grad_norm": 1.9225933837025715, "learning_rate": 4.869390909397664e-07, "loss": 1.3837, "step": 2838 }, { "epoch": 3.6073697585768745, "grad_norm": 1.5585153656236626, "learning_rate": 4.838281563278513e-07, "loss": 1.1987, "step": 2839 }, { "epoch": 3.60864040660737, "grad_norm": 1.8921769764839649, "learning_rate": 4.807269447087348e-07, "loss": 1.4019, "step": 2840 }, { "epoch": 3.609911054637865, "grad_norm": 1.7808532225030733, "learning_rate": 4.776354592510302e-07, "loss": 1.3058, "step": 2841 }, { "epoch": 3.611181702668361, "grad_norm": 2.787504736154616, "learning_rate": 4.7455370311341174e-07, "loss": 1.2682, "step": 2842 }, { "epoch": 3.6124523506988564, "grad_norm": 1.5353303670722411, "learning_rate": 4.71481679444612e-07, "loss": 1.4039, "step": 2843 }, { "epoch": 3.6137229987293518, "grad_norm": 1.8489732003472905, "learning_rate": 4.684193913834212e-07, "loss": 1.2498, "step": 2844 }, { "epoch": 3.6149936467598476, "grad_norm": 1.846562588561736, "learning_rate": 4.653668420586843e-07, "loss": 1.3524, "step": 2845 }, { "epoch": 3.616264294790343, "grad_norm": 1.7596402046071762, "learning_rate": 4.623240345892932e-07, "loss": 1.2591, "step": 2846 }, { "epoch": 3.6175349428208388, "grad_norm": 1.6877790123769085, "learning_rate": 4.592909720841843e-07, "loss": 1.3654, "step": 2847 }, { "epoch": 3.618805590851334, "grad_norm": 1.8048006323504517, "learning_rate": 4.562676576423397e-07, "loss": 1.3649, "step": 2848 }, { "epoch": 3.62007623888183, "grad_norm": 1.7126379547511625, "learning_rate": 4.53254094352783e-07, "loss": 1.4378, "step": 2849 }, { "epoch": 3.6213468869123253, "grad_norm": 1.856720805247342, "learning_rate": 4.5025028529457225e-07, "loss": 1.1204, "step": 2850 }, { "epoch": 3.6226175349428207, "grad_norm": 1.637590427858388, "learning_rate": 4.4725623353680246e-07, "loss": 1.2142, "step": 2851 }, { "epoch": 3.6238881829733165, "grad_norm": 1.6166541136566293, "learning_rate": 4.4427194213859216e-07, "loss": 1.2543, "step": 2852 }, { "epoch": 3.625158831003812, "grad_norm": 1.8068453256972625, "learning_rate": 4.4129741414909776e-07, "loss": 1.4532, "step": 2853 }, { "epoch": 3.6264294790343072, "grad_norm": 1.8418139517980867, "learning_rate": 4.3833265260749157e-07, "loss": 1.2231, "step": 2854 }, { "epoch": 3.627700127064803, "grad_norm": 1.5087395562608672, "learning_rate": 4.3537766054296935e-07, "loss": 1.2651, "step": 2855 }, { "epoch": 3.628970775095299, "grad_norm": 1.7131033116168544, "learning_rate": 4.324324409747471e-07, "loss": 1.2731, "step": 2856 }, { "epoch": 3.6302414231257942, "grad_norm": 1.5717863519639734, "learning_rate": 4.2949699691205547e-07, "loss": 1.2464, "step": 2857 }, { "epoch": 3.6315120711562896, "grad_norm": 1.7552470672585625, "learning_rate": 4.2657133135413643e-07, "loss": 1.3178, "step": 2858 }, { "epoch": 3.6327827191867854, "grad_norm": 1.654582843775076, "learning_rate": 4.2365544729023766e-07, "loss": 1.155, "step": 2859 }, { "epoch": 3.634053367217281, "grad_norm": 1.9057224665000165, "learning_rate": 4.207493476996205e-07, "loss": 1.3287, "step": 2860 }, { "epoch": 3.635324015247776, "grad_norm": 1.8357969271756474, "learning_rate": 4.178530355515409e-07, "loss": 1.2494, "step": 2861 }, { "epoch": 3.636594663278272, "grad_norm": 1.8333493950977857, "learning_rate": 4.1496651380526164e-07, "loss": 1.3367, "step": 2862 }, { "epoch": 3.6378653113087673, "grad_norm": 1.6409649153529624, "learning_rate": 4.1208978541003694e-07, "loss": 1.3905, "step": 2863 }, { "epoch": 3.639135959339263, "grad_norm": 1.6182993052070216, "learning_rate": 4.092228533051157e-07, "loss": 1.3968, "step": 2864 }, { "epoch": 3.6404066073697585, "grad_norm": 2.0970039390292037, "learning_rate": 4.063657204197424e-07, "loss": 1.2981, "step": 2865 }, { "epoch": 3.6416772554002543, "grad_norm": 1.778171798732873, "learning_rate": 4.0351838967314427e-07, "loss": 1.1876, "step": 2866 }, { "epoch": 3.6429479034307497, "grad_norm": 2.0284644661241895, "learning_rate": 4.0068086397453297e-07, "loss": 1.5399, "step": 2867 }, { "epoch": 3.644218551461245, "grad_norm": 1.8146083791558065, "learning_rate": 3.97853146223105e-07, "loss": 1.3671, "step": 2868 }, { "epoch": 3.645489199491741, "grad_norm": 1.5427820417969758, "learning_rate": 3.95035239308037e-07, "loss": 1.1703, "step": 2869 }, { "epoch": 3.6467598475222363, "grad_norm": 1.8578951301502669, "learning_rate": 3.92227146108477e-07, "loss": 1.3245, "step": 2870 }, { "epoch": 3.6480304955527316, "grad_norm": 2.1604438011867773, "learning_rate": 3.8942886949354777e-07, "loss": 1.255, "step": 2871 }, { "epoch": 3.6493011435832274, "grad_norm": 1.6022324609710354, "learning_rate": 3.866404123223444e-07, "loss": 1.2646, "step": 2872 }, { "epoch": 3.6505717916137232, "grad_norm": 1.6355491971211404, "learning_rate": 3.838617774439257e-07, "loss": 1.3028, "step": 2873 }, { "epoch": 3.6518424396442186, "grad_norm": 1.9864901590036022, "learning_rate": 3.810929676973185e-07, "loss": 1.2847, "step": 2874 }, { "epoch": 3.653113087674714, "grad_norm": 1.7128003678677792, "learning_rate": 3.783339859115065e-07, "loss": 1.0734, "step": 2875 }, { "epoch": 3.65438373570521, "grad_norm": 2.042760288386399, "learning_rate": 3.7558483490543475e-07, "loss": 1.3907, "step": 2876 }, { "epoch": 3.655654383735705, "grad_norm": 1.64236759707824, "learning_rate": 3.728455174880052e-07, "loss": 1.2805, "step": 2877 }, { "epoch": 3.6569250317662005, "grad_norm": 1.982255555801729, "learning_rate": 3.7011603645806917e-07, "loss": 1.4128, "step": 2878 }, { "epoch": 3.6581956797966964, "grad_norm": 1.8216182340964904, "learning_rate": 3.673963946044268e-07, "loss": 1.4314, "step": 2879 }, { "epoch": 3.6594663278271917, "grad_norm": 1.8666880653898337, "learning_rate": 3.646865947058309e-07, "loss": 1.4772, "step": 2880 }, { "epoch": 3.6607369758576875, "grad_norm": 2.020424953693691, "learning_rate": 3.619866395309757e-07, "loss": 1.2956, "step": 2881 }, { "epoch": 3.662007623888183, "grad_norm": 1.9641852338990382, "learning_rate": 3.5929653183849444e-07, "loss": 1.2681, "step": 2882 }, { "epoch": 3.6632782719186787, "grad_norm": 1.959468794096944, "learning_rate": 3.566162743769597e-07, "loss": 1.2469, "step": 2883 }, { "epoch": 3.664548919949174, "grad_norm": 1.8099735782469673, "learning_rate": 3.53945869884883e-07, "loss": 1.2767, "step": 2884 }, { "epoch": 3.6658195679796695, "grad_norm": 1.5849198909026307, "learning_rate": 3.51285321090703e-07, "loss": 1.244, "step": 2885 }, { "epoch": 3.6670902160101653, "grad_norm": 1.6530831894074411, "learning_rate": 3.4863463071279636e-07, "loss": 1.5761, "step": 2886 }, { "epoch": 3.6683608640406606, "grad_norm": 1.7370571229861118, "learning_rate": 3.45993801459461e-07, "loss": 1.2814, "step": 2887 }, { "epoch": 3.6696315120711565, "grad_norm": 1.8130340023816736, "learning_rate": 3.4336283602891875e-07, "loss": 1.2127, "step": 2888 }, { "epoch": 3.670902160101652, "grad_norm": 1.851199820168119, "learning_rate": 3.4074173710931804e-07, "loss": 1.4793, "step": 2889 }, { "epoch": 3.6721728081321476, "grad_norm": 1.5016518796267735, "learning_rate": 3.381305073787211e-07, "loss": 1.0988, "step": 2890 }, { "epoch": 3.673443456162643, "grad_norm": 1.758242592174552, "learning_rate": 3.355291495051127e-07, "loss": 1.2777, "step": 2891 }, { "epoch": 3.6747141041931384, "grad_norm": 2.016754925103769, "learning_rate": 3.3293766614638457e-07, "loss": 1.5474, "step": 2892 }, { "epoch": 3.675984752223634, "grad_norm": 1.8421964175588048, "learning_rate": 3.3035605995034524e-07, "loss": 1.3418, "step": 2893 }, { "epoch": 3.6772554002541296, "grad_norm": 1.9715525690894853, "learning_rate": 3.277843335547071e-07, "loss": 1.4832, "step": 2894 }, { "epoch": 3.678526048284625, "grad_norm": 1.9264416532591508, "learning_rate": 3.2522248958708814e-07, "loss": 1.431, "step": 2895 }, { "epoch": 3.6797966963151207, "grad_norm": 1.3478815832993138, "learning_rate": 3.226705306650113e-07, "loss": 1.1502, "step": 2896 }, { "epoch": 3.681067344345616, "grad_norm": 1.6254549393829039, "learning_rate": 3.201284593959009e-07, "loss": 1.4309, "step": 2897 }, { "epoch": 3.682337992376112, "grad_norm": 1.7209625236120987, "learning_rate": 3.1759627837707475e-07, "loss": 1.2548, "step": 2898 }, { "epoch": 3.6836086404066073, "grad_norm": 1.719823940394226, "learning_rate": 3.150739901957467e-07, "loss": 1.1791, "step": 2899 }, { "epoch": 3.684879288437103, "grad_norm": 1.7501731746528646, "learning_rate": 3.1256159742902527e-07, "loss": 1.112, "step": 2900 }, { "epoch": 3.6861499364675985, "grad_norm": 1.818068130323241, "learning_rate": 3.100591026439059e-07, "loss": 1.2971, "step": 2901 }, { "epoch": 3.687420584498094, "grad_norm": 1.704715465340639, "learning_rate": 3.075665083972701e-07, "loss": 1.1321, "step": 2902 }, { "epoch": 3.6886912325285897, "grad_norm": 1.6476840934716483, "learning_rate": 3.050838172358883e-07, "loss": 1.1951, "step": 2903 }, { "epoch": 3.689961880559085, "grad_norm": 1.7312533780583264, "learning_rate": 3.0261103169640594e-07, "loss": 1.2046, "step": 2904 }, { "epoch": 3.691232528589581, "grad_norm": 1.5300711820989914, "learning_rate": 3.0014815430535524e-07, "loss": 1.3225, "step": 2905 }, { "epoch": 3.692503176620076, "grad_norm": 1.7017019203015675, "learning_rate": 2.9769518757913785e-07, "loss": 1.4056, "step": 2906 }, { "epoch": 3.693773824650572, "grad_norm": 2.021153403583211, "learning_rate": 2.952521340240333e-07, "loss": 1.1538, "step": 2907 }, { "epoch": 3.6950444726810674, "grad_norm": 1.7738241525920886, "learning_rate": 2.9281899613619047e-07, "loss": 1.1375, "step": 2908 }, { "epoch": 3.6963151207115628, "grad_norm": 1.7016217948023906, "learning_rate": 2.9039577640163077e-07, "loss": 1.2858, "step": 2909 }, { "epoch": 3.6975857687420586, "grad_norm": 1.8183261497067524, "learning_rate": 2.879824772962381e-07, "loss": 1.3034, "step": 2910 }, { "epoch": 3.698856416772554, "grad_norm": 1.6033159308319713, "learning_rate": 2.8557910128575897e-07, "loss": 1.3922, "step": 2911 }, { "epoch": 3.7001270648030493, "grad_norm": 2.005423047013152, "learning_rate": 2.8318565082580686e-07, "loss": 1.3093, "step": 2912 }, { "epoch": 3.701397712833545, "grad_norm": 1.6877843318389423, "learning_rate": 2.8080212836185006e-07, "loss": 1.2727, "step": 2913 }, { "epoch": 3.7026683608640405, "grad_norm": 1.7753712002547122, "learning_rate": 2.784285363292105e-07, "loss": 1.1072, "step": 2914 }, { "epoch": 3.7039390088945363, "grad_norm": 1.8499481759690708, "learning_rate": 2.760648771530705e-07, "loss": 1.4622, "step": 2915 }, { "epoch": 3.7052096569250317, "grad_norm": 1.6641759613967153, "learning_rate": 2.737111532484582e-07, "loss": 1.359, "step": 2916 }, { "epoch": 3.7064803049555275, "grad_norm": 1.8321493388148502, "learning_rate": 2.7136736702025436e-07, "loss": 0.8954, "step": 2917 }, { "epoch": 3.707750952986023, "grad_norm": 1.785858099671127, "learning_rate": 2.6903352086318336e-07, "loss": 1.2825, "step": 2918 }, { "epoch": 3.7090216010165182, "grad_norm": 1.82639553313416, "learning_rate": 2.667096171618122e-07, "loss": 1.2128, "step": 2919 }, { "epoch": 3.710292249047014, "grad_norm": 1.5961260281125593, "learning_rate": 2.6439565829055267e-07, "loss": 1.0634, "step": 2920 }, { "epoch": 3.7115628970775094, "grad_norm": 1.7764458700386587, "learning_rate": 2.620916466136569e-07, "loss": 1.1089, "step": 2921 }, { "epoch": 3.7128335451080052, "grad_norm": 1.9957371319377588, "learning_rate": 2.5979758448520854e-07, "loss": 1.2149, "step": 2922 }, { "epoch": 3.7141041931385006, "grad_norm": 1.7950216758943818, "learning_rate": 2.57513474249127e-07, "loss": 1.5193, "step": 2923 }, { "epoch": 3.7153748411689964, "grad_norm": 1.9839075878351453, "learning_rate": 2.552393182391677e-07, "loss": 1.1559, "step": 2924 }, { "epoch": 3.716645489199492, "grad_norm": 1.5837936980441778, "learning_rate": 2.529751187789098e-07, "loss": 1.2536, "step": 2925 }, { "epoch": 3.717916137229987, "grad_norm": 1.7725334600507785, "learning_rate": 2.507208781817638e-07, "loss": 1.21, "step": 2926 }, { "epoch": 3.719186785260483, "grad_norm": 1.8919214319444393, "learning_rate": 2.4847659875096184e-07, "loss": 1.1894, "step": 2927 }, { "epoch": 3.7204574332909783, "grad_norm": 1.6578469740116388, "learning_rate": 2.4624228277956077e-07, "loss": 1.3188, "step": 2928 }, { "epoch": 3.7217280813214737, "grad_norm": 1.7026605237047239, "learning_rate": 2.4401793255043436e-07, "loss": 1.3334, "step": 2929 }, { "epoch": 3.7229987293519695, "grad_norm": 1.8933344589067407, "learning_rate": 2.4180355033627925e-07, "loss": 1.2346, "step": 2930 }, { "epoch": 3.7242693773824653, "grad_norm": 1.6804946093757482, "learning_rate": 2.395991383995999e-07, "loss": 1.2925, "step": 2931 }, { "epoch": 3.7255400254129607, "grad_norm": 1.7666045402959094, "learning_rate": 2.3740469899272144e-07, "loss": 1.0185, "step": 2932 }, { "epoch": 3.726810673443456, "grad_norm": 2.3268439238309475, "learning_rate": 2.3522023435777585e-07, "loss": 1.5004, "step": 2933 }, { "epoch": 3.728081321473952, "grad_norm": 1.8370176464784342, "learning_rate": 2.3304574672670444e-07, "loss": 1.3942, "step": 2934 }, { "epoch": 3.7293519695044473, "grad_norm": 1.9368162512567109, "learning_rate": 2.308812383212522e-07, "loss": 1.4361, "step": 2935 }, { "epoch": 3.7306226175349426, "grad_norm": 1.8444929993476247, "learning_rate": 2.2872671135297342e-07, "loss": 1.3625, "step": 2936 }, { "epoch": 3.7318932655654384, "grad_norm": 1.8800705753592861, "learning_rate": 2.265821680232172e-07, "loss": 1.2644, "step": 2937 }, { "epoch": 3.733163913595934, "grad_norm": 1.7966966962401014, "learning_rate": 2.2444761052313857e-07, "loss": 1.2963, "step": 2938 }, { "epoch": 3.7344345616264296, "grad_norm": 1.5288572533419622, "learning_rate": 2.2232304103368408e-07, "loss": 1.4129, "step": 2939 }, { "epoch": 3.735705209656925, "grad_norm": 1.778890367102727, "learning_rate": 2.2020846172560062e-07, "loss": 1.3096, "step": 2940 }, { "epoch": 3.736975857687421, "grad_norm": 1.9359706302321475, "learning_rate": 2.181038747594244e-07, "loss": 1.3953, "step": 2941 }, { "epoch": 3.738246505717916, "grad_norm": 1.7357617348857184, "learning_rate": 2.160092822854809e-07, "loss": 1.2162, "step": 2942 }, { "epoch": 3.7395171537484115, "grad_norm": 1.64367042406681, "learning_rate": 2.1392468644388598e-07, "loss": 1.4227, "step": 2943 }, { "epoch": 3.7407878017789074, "grad_norm": 1.4922126743992972, "learning_rate": 2.1185008936454253e-07, "loss": 1.2542, "step": 2944 }, { "epoch": 3.7420584498094027, "grad_norm": 1.827408648948783, "learning_rate": 2.0978549316713615e-07, "loss": 1.3295, "step": 2945 }, { "epoch": 3.743329097839898, "grad_norm": 1.6536944906486557, "learning_rate": 2.0773089996113382e-07, "loss": 1.1333, "step": 2946 }, { "epoch": 3.744599745870394, "grad_norm": 1.6467853780164043, "learning_rate": 2.0568631184578082e-07, "loss": 1.2169, "step": 2947 }, { "epoch": 3.7458703939008897, "grad_norm": 1.6666714776463187, "learning_rate": 2.0365173091010382e-07, "loss": 1.2184, "step": 2948 }, { "epoch": 3.747141041931385, "grad_norm": 1.7900600111005363, "learning_rate": 2.0162715923290333e-07, "loss": 1.5177, "step": 2949 }, { "epoch": 3.7484116899618805, "grad_norm": 1.8741298704629243, "learning_rate": 1.996125988827502e-07, "loss": 1.4674, "step": 2950 }, { "epoch": 3.7496823379923763, "grad_norm": 1.6800530248687902, "learning_rate": 1.9760805191798903e-07, "loss": 1.2756, "step": 2951 }, { "epoch": 3.7509529860228716, "grad_norm": 1.7106727901557028, "learning_rate": 1.9561352038673264e-07, "loss": 1.2821, "step": 2952 }, { "epoch": 3.752223634053367, "grad_norm": 1.6267132760386054, "learning_rate": 1.936290063268631e-07, "loss": 1.2844, "step": 2953 }, { "epoch": 3.753494282083863, "grad_norm": 1.631756064192363, "learning_rate": 1.916545117660218e-07, "loss": 1.3697, "step": 2954 }, { "epoch": 3.754764930114358, "grad_norm": 1.9361139579297768, "learning_rate": 1.8969003872161718e-07, "loss": 1.2899, "step": 2955 }, { "epoch": 3.756035578144854, "grad_norm": 1.8273051054382337, "learning_rate": 1.8773558920082037e-07, "loss": 1.3229, "step": 2956 }, { "epoch": 3.7573062261753494, "grad_norm": 1.954200663567673, "learning_rate": 1.8579116520055508e-07, "loss": 1.4069, "step": 2957 }, { "epoch": 3.758576874205845, "grad_norm": 1.6921839191320842, "learning_rate": 1.8385676870750545e-07, "loss": 1.3057, "step": 2958 }, { "epoch": 3.7598475222363406, "grad_norm": 1.780772890095342, "learning_rate": 1.8193240169810943e-07, "loss": 1.0153, "step": 2959 }, { "epoch": 3.761118170266836, "grad_norm": 1.9333807192242591, "learning_rate": 1.8001806613855642e-07, "loss": 1.3781, "step": 2960 }, { "epoch": 3.7623888182973317, "grad_norm": 2.022433044933993, "learning_rate": 1.7811376398479075e-07, "loss": 1.3182, "step": 2961 }, { "epoch": 3.763659466327827, "grad_norm": 1.612073428189375, "learning_rate": 1.762194971824993e-07, "loss": 1.1196, "step": 2962 }, { "epoch": 3.7649301143583225, "grad_norm": 1.9915334072940494, "learning_rate": 1.7433526766711727e-07, "loss": 1.3878, "step": 2963 }, { "epoch": 3.7662007623888183, "grad_norm": 1.7732026497255406, "learning_rate": 1.72461077363828e-07, "loss": 1.3563, "step": 2964 }, { "epoch": 3.767471410419314, "grad_norm": 1.7834823656515153, "learning_rate": 1.7059692818755414e-07, "loss": 1.1275, "step": 2965 }, { "epoch": 3.7687420584498095, "grad_norm": 1.897016759777693, "learning_rate": 1.6874282204295765e-07, "loss": 1.0414, "step": 2966 }, { "epoch": 3.770012706480305, "grad_norm": 2.0080223575450806, "learning_rate": 1.6689876082444323e-07, "loss": 1.286, "step": 2967 }, { "epoch": 3.7712833545108007, "grad_norm": 1.482766063008824, "learning_rate": 1.6506474641614923e-07, "loss": 1.2941, "step": 2968 }, { "epoch": 3.772554002541296, "grad_norm": 1.7159381033338508, "learning_rate": 1.6324078069195005e-07, "loss": 1.1676, "step": 2969 }, { "epoch": 3.7738246505717914, "grad_norm": 2.0441924176493522, "learning_rate": 1.6142686551545385e-07, "loss": 1.3063, "step": 2970 }, { "epoch": 3.775095298602287, "grad_norm": 1.7507259640438184, "learning_rate": 1.5962300273999586e-07, "loss": 1.3503, "step": 2971 }, { "epoch": 3.7763659466327826, "grad_norm": 1.6456676972136066, "learning_rate": 1.5782919420864628e-07, "loss": 1.1359, "step": 2972 }, { "epoch": 3.7776365946632784, "grad_norm": 1.6442053485039345, "learning_rate": 1.5604544175419901e-07, "loss": 1.2885, "step": 2973 }, { "epoch": 3.7789072426937738, "grad_norm": 1.5890118382331657, "learning_rate": 1.542717471991728e-07, "loss": 1.3009, "step": 2974 }, { "epoch": 3.7801778907242696, "grad_norm": 1.7493121867933188, "learning_rate": 1.5250811235581142e-07, "loss": 1.1525, "step": 2975 }, { "epoch": 3.781448538754765, "grad_norm": 1.65820116434337, "learning_rate": 1.5075453902608117e-07, "loss": 1.0238, "step": 2976 }, { "epoch": 3.7827191867852603, "grad_norm": 1.8538014168797075, "learning_rate": 1.4901102900166554e-07, "loss": 1.1603, "step": 2977 }, { "epoch": 3.783989834815756, "grad_norm": 1.7397589588999856, "learning_rate": 1.472775840639673e-07, "loss": 1.211, "step": 2978 }, { "epoch": 3.7852604828462515, "grad_norm": 1.7225410662035296, "learning_rate": 1.4555420598410642e-07, "loss": 1.1997, "step": 2979 }, { "epoch": 3.786531130876747, "grad_norm": 1.810570089026905, "learning_rate": 1.4384089652291544e-07, "loss": 1.3315, "step": 2980 }, { "epoch": 3.7878017789072427, "grad_norm": 1.8889965513400706, "learning_rate": 1.4213765743094077e-07, "loss": 1.3262, "step": 2981 }, { "epoch": 3.7890724269377385, "grad_norm": 1.5660552596803883, "learning_rate": 1.4044449044843921e-07, "loss": 1.2192, "step": 2982 }, { "epoch": 3.790343074968234, "grad_norm": 1.746991509973848, "learning_rate": 1.3876139730537475e-07, "loss": 1.2917, "step": 2983 }, { "epoch": 3.7916137229987292, "grad_norm": 1.8285601882985356, "learning_rate": 1.3708837972142176e-07, "loss": 1.2735, "step": 2984 }, { "epoch": 3.792884371029225, "grad_norm": 1.5513317560129432, "learning_rate": 1.3542543940595953e-07, "loss": 1.3793, "step": 2985 }, { "epoch": 3.7941550190597204, "grad_norm": 1.447340929091766, "learning_rate": 1.3377257805806786e-07, "loss": 1.076, "step": 2986 }, { "epoch": 3.795425667090216, "grad_norm": 1.6517394517306694, "learning_rate": 1.3212979736653142e-07, "loss": 1.2815, "step": 2987 }, { "epoch": 3.7966963151207116, "grad_norm": 1.5754085331155292, "learning_rate": 1.3049709900983643e-07, "loss": 1.3616, "step": 2988 }, { "epoch": 3.797966963151207, "grad_norm": 1.8740750673950757, "learning_rate": 1.2887448465616292e-07, "loss": 1.2242, "step": 2989 }, { "epoch": 3.799237611181703, "grad_norm": 1.8367249089502404, "learning_rate": 1.272619559633914e-07, "loss": 1.1385, "step": 2990 }, { "epoch": 3.800508259212198, "grad_norm": 1.8082582017934608, "learning_rate": 1.256595145790973e-07, "loss": 1.4317, "step": 2991 }, { "epoch": 3.801778907242694, "grad_norm": 1.9061150195764778, "learning_rate": 1.2406716214054982e-07, "loss": 1.2792, "step": 2992 }, { "epoch": 3.8030495552731893, "grad_norm": 1.8110974893118241, "learning_rate": 1.2248490027470748e-07, "loss": 1.0067, "step": 2993 }, { "epoch": 3.8043202033036847, "grad_norm": 1.7372482940152012, "learning_rate": 1.209127305982205e-07, "loss": 1.3623, "step": 2994 }, { "epoch": 3.8055908513341805, "grad_norm": 1.696128262457204, "learning_rate": 1.1935065471742612e-07, "loss": 1.0419, "step": 2995 }, { "epoch": 3.806861499364676, "grad_norm": 1.7845542976410695, "learning_rate": 1.1779867422835323e-07, "loss": 1.3506, "step": 2996 }, { "epoch": 3.8081321473951717, "grad_norm": 1.7022176451932132, "learning_rate": 1.1625679071671114e-07, "loss": 1.3518, "step": 2997 }, { "epoch": 3.809402795425667, "grad_norm": 1.6165430147381685, "learning_rate": 1.1472500575789302e-07, "loss": 1.2496, "step": 2998 }, { "epoch": 3.810673443456163, "grad_norm": 1.7143957115232276, "learning_rate": 1.1320332091697473e-07, "loss": 1.3813, "step": 2999 }, { "epoch": 3.8119440914866582, "grad_norm": 1.7730705702795937, "learning_rate": 1.1169173774871478e-07, "loss": 1.374, "step": 3000 }, { "epoch": 3.8132147395171536, "grad_norm": 1.6553721276053135, "learning_rate": 1.1019025779754666e-07, "loss": 1.1461, "step": 3001 }, { "epoch": 3.8144853875476494, "grad_norm": 1.6235163337614682, "learning_rate": 1.0869888259758543e-07, "loss": 1.2913, "step": 3002 }, { "epoch": 3.815756035578145, "grad_norm": 1.7194122264096625, "learning_rate": 1.0721761367261662e-07, "loss": 1.3193, "step": 3003 }, { "epoch": 3.81702668360864, "grad_norm": 1.8003799827594853, "learning_rate": 1.0574645253610405e-07, "loss": 1.32, "step": 3004 }, { "epoch": 3.818297331639136, "grad_norm": 2.029674106133309, "learning_rate": 1.0428540069118199e-07, "loss": 1.1933, "step": 3005 }, { "epoch": 3.8195679796696314, "grad_norm": 1.8422067171601757, "learning_rate": 1.028344596306552e-07, "loss": 1.1397, "step": 3006 }, { "epoch": 3.820838627700127, "grad_norm": 1.6832468467920436, "learning_rate": 1.0139363083700116e-07, "loss": 1.3426, "step": 3007 }, { "epoch": 3.8221092757306225, "grad_norm": 1.956043652707399, "learning_rate": 9.996291578236228e-08, "loss": 1.348, "step": 3008 }, { "epoch": 3.8233799237611183, "grad_norm": 1.6432019123161303, "learning_rate": 9.854231592854702e-08, "loss": 1.3335, "step": 3009 }, { "epoch": 3.8246505717916137, "grad_norm": 1.8269339963735867, "learning_rate": 9.713183272703208e-08, "loss": 1.1764, "step": 3010 }, { "epoch": 3.825921219822109, "grad_norm": 1.6720143501255003, "learning_rate": 9.573146761895358e-08, "loss": 1.2134, "step": 3011 }, { "epoch": 3.827191867852605, "grad_norm": 1.5742217245088348, "learning_rate": 9.434122203511253e-08, "loss": 1.1075, "step": 3012 }, { "epoch": 3.8284625158831003, "grad_norm": 1.9518318186239076, "learning_rate": 9.296109739597047e-08, "loss": 1.2332, "step": 3013 }, { "epoch": 3.829733163913596, "grad_norm": 1.6623111274270994, "learning_rate": 9.15910951116461e-08, "loss": 1.0146, "step": 3014 }, { "epoch": 3.8310038119440915, "grad_norm": 1.84924252006562, "learning_rate": 9.023121658191636e-08, "loss": 1.3765, "step": 3015 }, { "epoch": 3.8322744599745873, "grad_norm": 1.6623671762455492, "learning_rate": 8.888146319621538e-08, "loss": 1.234, "step": 3016 }, { "epoch": 3.8335451080050826, "grad_norm": 1.6942240744570565, "learning_rate": 8.754183633363334e-08, "loss": 1.2557, "step": 3017 }, { "epoch": 3.834815756035578, "grad_norm": 1.7066974151328211, "learning_rate": 8.621233736290868e-08, "loss": 1.2491, "step": 3018 }, { "epoch": 3.836086404066074, "grad_norm": 1.6533387949571765, "learning_rate": 8.489296764243704e-08, "loss": 1.235, "step": 3019 }, { "epoch": 3.837357052096569, "grad_norm": 1.7665935151869387, "learning_rate": 8.358372852026342e-08, "loss": 1.4332, "step": 3020 }, { "epoch": 3.8386277001270646, "grad_norm": 1.6998377266516242, "learning_rate": 8.228462133408111e-08, "loss": 1.3522, "step": 3021 }, { "epoch": 3.8398983481575604, "grad_norm": 1.4689041575981403, "learning_rate": 8.099564741123167e-08, "loss": 1.4345, "step": 3022 }, { "epoch": 3.841168996188056, "grad_norm": 1.593025880943212, "learning_rate": 7.971680806870163e-08, "loss": 1.3203, "step": 3023 }, { "epoch": 3.8424396442185516, "grad_norm": 1.704653882953453, "learning_rate": 7.84481046131258e-08, "loss": 1.2147, "step": 3024 }, { "epoch": 3.843710292249047, "grad_norm": 2.071019884817444, "learning_rate": 7.718953834078058e-08, "loss": 1.4055, "step": 3025 }, { "epoch": 3.8449809402795427, "grad_norm": 1.8372347907743876, "learning_rate": 7.594111053758624e-08, "loss": 1.5519, "step": 3026 }, { "epoch": 3.846251588310038, "grad_norm": 1.682413532760154, "learning_rate": 7.470282247910132e-08, "loss": 1.3645, "step": 3027 }, { "epoch": 3.8475222363405335, "grad_norm": 1.7578519280129183, "learning_rate": 7.347467543052932e-08, "loss": 1.5367, "step": 3028 }, { "epoch": 3.8487928843710293, "grad_norm": 1.6770289020194398, "learning_rate": 7.225667064670761e-08, "loss": 1.2428, "step": 3029 }, { "epoch": 3.8500635324015247, "grad_norm": 1.7272876491470486, "learning_rate": 7.104880937211178e-08, "loss": 1.2414, "step": 3030 }, { "epoch": 3.8513341804320205, "grad_norm": 1.5814762997311047, "learning_rate": 6.985109284085578e-08, "loss": 1.2969, "step": 3031 }, { "epoch": 3.852604828462516, "grad_norm": 1.5723974149649933, "learning_rate": 6.866352227668626e-08, "loss": 1.2302, "step": 3032 }, { "epoch": 3.8538754764930117, "grad_norm": 1.968228242890551, "learning_rate": 6.748609889298596e-08, "loss": 1.2859, "step": 3033 }, { "epoch": 3.855146124523507, "grad_norm": 1.5146300885257318, "learning_rate": 6.631882389276478e-08, "loss": 1.1577, "step": 3034 }, { "epoch": 3.8564167725540024, "grad_norm": 2.0136855015637236, "learning_rate": 6.51616984686676e-08, "loss": 1.2817, "step": 3035 }, { "epoch": 3.857687420584498, "grad_norm": 1.6774381577901225, "learning_rate": 6.401472380297091e-08, "loss": 1.2733, "step": 3036 }, { "epoch": 3.8589580686149936, "grad_norm": 1.7053505051775177, "learning_rate": 6.287790106757396e-08, "loss": 1.2441, "step": 3037 }, { "epoch": 3.860228716645489, "grad_norm": 1.7911463890045214, "learning_rate": 6.175123142400986e-08, "loss": 1.385, "step": 3038 }, { "epoch": 3.8614993646759848, "grad_norm": 1.7537350290360554, "learning_rate": 6.063471602343219e-08, "loss": 1.2987, "step": 3039 }, { "epoch": 3.8627700127064806, "grad_norm": 1.643927395956265, "learning_rate": 5.952835600662288e-08, "loss": 1.2382, "step": 3040 }, { "epoch": 3.864040660736976, "grad_norm": 2.123215412392022, "learning_rate": 5.843215250398882e-08, "loss": 1.3108, "step": 3041 }, { "epoch": 3.8653113087674713, "grad_norm": 1.8836046079215185, "learning_rate": 5.7346106635556286e-08, "loss": 1.2853, "step": 3042 }, { "epoch": 3.866581956797967, "grad_norm": 1.7277767829316242, "learning_rate": 5.6270219510975445e-08, "loss": 1.1018, "step": 3043 }, { "epoch": 3.8678526048284625, "grad_norm": 1.787729576761809, "learning_rate": 5.5204492229515846e-08, "loss": 1.2945, "step": 3044 }, { "epoch": 3.869123252858958, "grad_norm": 1.8200958656037172, "learning_rate": 5.4148925880068705e-08, "loss": 1.3888, "step": 3045 }, { "epoch": 3.8703939008894537, "grad_norm": 1.7378636533919039, "learning_rate": 5.310352154113907e-08, "loss": 1.1941, "step": 3046 }, { "epoch": 3.871664548919949, "grad_norm": 1.8610609441033346, "learning_rate": 5.206828028085364e-08, "loss": 1.2257, "step": 3047 }, { "epoch": 3.872935196950445, "grad_norm": 1.8081119313521299, "learning_rate": 5.104320315695188e-08, "loss": 1.3963, "step": 3048 }, { "epoch": 3.8742058449809402, "grad_norm": 1.59550034274709, "learning_rate": 5.002829121679154e-08, "loss": 1.2953, "step": 3049 }, { "epoch": 3.875476493011436, "grad_norm": 1.6764129655232993, "learning_rate": 4.902354549733979e-08, "loss": 1.2611, "step": 3050 }, { "epoch": 3.8767471410419314, "grad_norm": 1.632876323628899, "learning_rate": 4.8028967025181005e-08, "loss": 1.4017, "step": 3051 }, { "epoch": 3.878017789072427, "grad_norm": 1.794162893632187, "learning_rate": 4.704455681650788e-08, "loss": 1.2708, "step": 3052 }, { "epoch": 3.8792884371029226, "grad_norm": 1.8126195614094525, "learning_rate": 4.607031587712696e-08, "loss": 1.2623, "step": 3053 }, { "epoch": 3.880559085133418, "grad_norm": 1.8230091802966468, "learning_rate": 4.5106245202453106e-08, "loss": 1.3042, "step": 3054 }, { "epoch": 3.8818297331639133, "grad_norm": 1.682422086282267, "learning_rate": 4.4152345777507263e-08, "loss": 1.3326, "step": 3055 }, { "epoch": 3.883100381194409, "grad_norm": 1.6824514684501524, "learning_rate": 4.320861857692316e-08, "loss": 1.1603, "step": 3056 }, { "epoch": 3.884371029224905, "grad_norm": 1.6110806549573322, "learning_rate": 4.227506456493835e-08, "loss": 1.2895, "step": 3057 }, { "epoch": 3.8856416772554003, "grad_norm": 1.7676659860940012, "learning_rate": 4.13516846953943e-08, "loss": 1.3506, "step": 3058 }, { "epoch": 3.8869123252858957, "grad_norm": 1.767369964062659, "learning_rate": 4.043847991174188e-08, "loss": 1.2727, "step": 3059 }, { "epoch": 3.8881829733163915, "grad_norm": 1.5053172893324513, "learning_rate": 3.953545114703139e-08, "loss": 1.3379, "step": 3060 }, { "epoch": 3.889453621346887, "grad_norm": 1.9212747322334562, "learning_rate": 3.864259932391923e-08, "loss": 1.4492, "step": 3061 }, { "epoch": 3.8907242693773822, "grad_norm": 1.6797433050046398, "learning_rate": 3.775992535466011e-08, "loss": 1.1713, "step": 3062 }, { "epoch": 3.891994917407878, "grad_norm": 1.5690469601995618, "learning_rate": 3.688743014111262e-08, "loss": 1.254, "step": 3063 }, { "epoch": 3.8932655654383734, "grad_norm": 1.672822569238482, "learning_rate": 3.602511457473479e-08, "loss": 1.0805, "step": 3064 }, { "epoch": 3.8945362134688692, "grad_norm": 1.8613409795594205, "learning_rate": 3.517297953658405e-08, "loss": 1.2872, "step": 3065 }, { "epoch": 3.8958068614993646, "grad_norm": 1.8887233528686516, "learning_rate": 3.4331025897313964e-08, "loss": 1.351, "step": 3066 }, { "epoch": 3.8970775095298604, "grad_norm": 1.7636899325155904, "learning_rate": 3.34992545171775e-08, "loss": 0.9495, "step": 3067 }, { "epoch": 3.898348157560356, "grad_norm": 1.638791874209901, "learning_rate": 3.267766624602375e-08, "loss": 0.9402, "step": 3068 }, { "epoch": 3.899618805590851, "grad_norm": 1.9138239371342511, "learning_rate": 3.186626192329678e-08, "loss": 1.4469, "step": 3069 }, { "epoch": 3.900889453621347, "grad_norm": 1.6216518038849317, "learning_rate": 3.106504237803454e-08, "loss": 1.1445, "step": 3070 }, { "epoch": 3.9021601016518423, "grad_norm": 1.6910513589662268, "learning_rate": 3.027400842887218e-08, "loss": 1.3907, "step": 3071 }, { "epoch": 3.9034307496823377, "grad_norm": 1.6053862098839753, "learning_rate": 2.9493160884035422e-08, "loss": 1.2571, "step": 3072 }, { "epoch": 3.9047013977128335, "grad_norm": 1.6766860443136364, "learning_rate": 2.8722500541340515e-08, "loss": 1.2078, "step": 3073 }, { "epoch": 3.9059720457433293, "grad_norm": 1.700314508038029, "learning_rate": 2.796202818819871e-08, "loss": 1.2652, "step": 3074 }, { "epoch": 3.9072426937738247, "grad_norm": 1.7644853472221245, "learning_rate": 2.721174460160958e-08, "loss": 1.5189, "step": 3075 }, { "epoch": 3.90851334180432, "grad_norm": 1.587777721341335, "learning_rate": 2.6471650548163253e-08, "loss": 1.2959, "step": 3076 }, { "epoch": 3.909783989834816, "grad_norm": 1.6559481427920208, "learning_rate": 2.574174678403818e-08, "loss": 1.2242, "step": 3077 }, { "epoch": 3.9110546378653113, "grad_norm": 1.650624585828068, "learning_rate": 2.5022034055003363e-08, "loss": 1.1872, "step": 3078 }, { "epoch": 3.9123252858958066, "grad_norm": 1.6626045471929531, "learning_rate": 2.4312513096410585e-08, "loss": 1.1773, "step": 3079 }, { "epoch": 3.9135959339263025, "grad_norm": 1.6733515413223996, "learning_rate": 2.361318463320439e-08, "loss": 1.102, "step": 3080 }, { "epoch": 3.914866581956798, "grad_norm": 2.1408182193354253, "learning_rate": 2.2924049379909884e-08, "loss": 1.4914, "step": 3081 }, { "epoch": 3.9161372299872936, "grad_norm": 1.8681977538957117, "learning_rate": 2.2245108040640505e-08, "loss": 1.2554, "step": 3082 }, { "epoch": 3.917407878017789, "grad_norm": 1.9529352454215083, "learning_rate": 2.1576361309093575e-08, "loss": 1.4007, "step": 3083 }, { "epoch": 3.918678526048285, "grad_norm": 1.631295188914433, "learning_rate": 2.09178098685503e-08, "loss": 1.3517, "step": 3084 }, { "epoch": 3.91994917407878, "grad_norm": 1.709811335555246, "learning_rate": 2.0269454391874665e-08, "loss": 1.3244, "step": 3085 }, { "epoch": 3.9212198221092756, "grad_norm": 1.7447655051383881, "learning_rate": 1.963129554151344e-08, "loss": 1.3004, "step": 3086 }, { "epoch": 3.9224904701397714, "grad_norm": 1.7599315058217841, "learning_rate": 1.9003333969493942e-08, "loss": 1.4566, "step": 3087 }, { "epoch": 3.9237611181702667, "grad_norm": 1.743197743728091, "learning_rate": 1.8385570317427382e-08, "loss": 1.2948, "step": 3088 }, { "epoch": 3.9250317662007626, "grad_norm": 1.7696025370080466, "learning_rate": 1.777800521650219e-08, "loss": 1.5456, "step": 3089 }, { "epoch": 3.926302414231258, "grad_norm": 1.7468091292083294, "learning_rate": 1.7180639287488476e-08, "loss": 1.532, "step": 3090 }, { "epoch": 3.9275730622617537, "grad_norm": 1.6879458253212902, "learning_rate": 1.6593473140734673e-08, "loss": 1.346, "step": 3091 }, { "epoch": 3.928843710292249, "grad_norm": 1.855691619109339, "learning_rate": 1.6016507376169776e-08, "loss": 1.1771, "step": 3092 }, { "epoch": 3.9301143583227445, "grad_norm": 1.7090759637140573, "learning_rate": 1.544974258329668e-08, "loss": 1.2063, "step": 3093 }, { "epoch": 3.9313850063532403, "grad_norm": 1.6552561885273969, "learning_rate": 1.4893179341199936e-08, "loss": 1.2813, "step": 3094 }, { "epoch": 3.9326556543837357, "grad_norm": 1.4363593756042043, "learning_rate": 1.4346818218539115e-08, "loss": 1.145, "step": 3095 }, { "epoch": 3.933926302414231, "grad_norm": 1.9013228076996314, "learning_rate": 1.3810659773547675e-08, "loss": 1.329, "step": 3096 }, { "epoch": 3.935196950444727, "grad_norm": 1.8939443423681859, "learning_rate": 1.328470455403963e-08, "loss": 1.0988, "step": 3097 }, { "epoch": 3.936467598475222, "grad_norm": 1.6738124704075703, "learning_rate": 1.276895309739845e-08, "loss": 1.512, "step": 3098 }, { "epoch": 3.937738246505718, "grad_norm": 1.6652367861274973, "learning_rate": 1.2263405930585947e-08, "loss": 1.2814, "step": 3099 }, { "epoch": 3.9390088945362134, "grad_norm": 1.6795036473859142, "learning_rate": 1.1768063570136712e-08, "loss": 1.3776, "step": 3100 }, { "epoch": 3.940279542566709, "grad_norm": 1.7541917603843578, "learning_rate": 1.1282926522158121e-08, "loss": 1.3044, "step": 3101 }, { "epoch": 3.9415501905972046, "grad_norm": 1.771550776590971, "learning_rate": 1.0807995282332562e-08, "loss": 1.3544, "step": 3102 }, { "epoch": 3.9428208386277, "grad_norm": 1.7367784516596882, "learning_rate": 1.034327033591076e-08, "loss": 1.4199, "step": 3103 }, { "epoch": 3.9440914866581958, "grad_norm": 1.9025994701869688, "learning_rate": 9.888752157719562e-09, "loss": 1.2801, "step": 3104 }, { "epoch": 3.945362134688691, "grad_norm": 1.7790550525516902, "learning_rate": 9.444441212155264e-09, "loss": 1.3545, "step": 3105 }, { "epoch": 3.946632782719187, "grad_norm": 1.6489191377760344, "learning_rate": 9.010337953185843e-09, "loss": 1.3388, "step": 3106 }, { "epoch": 3.9479034307496823, "grad_norm": 1.6857591547682904, "learning_rate": 8.586442824347618e-09, "loss": 1.2036, "step": 3107 }, { "epoch": 3.949174078780178, "grad_norm": 1.7276376614276525, "learning_rate": 8.172756258748582e-09, "loss": 1.2178, "step": 3108 }, { "epoch": 3.9504447268106735, "grad_norm": 1.8403311129365838, "learning_rate": 7.769278679068404e-09, "loss": 1.2601, "step": 3109 }, { "epoch": 3.951715374841169, "grad_norm": 1.8610845240663354, "learning_rate": 7.3760104975517665e-09, "loss": 1.1905, "step": 3110 }, { "epoch": 3.9529860228716647, "grad_norm": 1.7683248465418167, "learning_rate": 6.992952116013918e-09, "loss": 1.1733, "step": 3111 }, { "epoch": 3.95425667090216, "grad_norm": 1.978842698024534, "learning_rate": 6.620103925840671e-09, "loss": 1.2842, "step": 3112 }, { "epoch": 3.9555273189326554, "grad_norm": 1.8693913246007345, "learning_rate": 6.257466307980631e-09, "loss": 1.2822, "step": 3113 }, { "epoch": 3.9567979669631512, "grad_norm": 1.8968847407535872, "learning_rate": 5.905039632954079e-09, "loss": 1.231, "step": 3114 }, { "epoch": 3.9580686149936466, "grad_norm": 1.8537024086868568, "learning_rate": 5.562824260848532e-09, "loss": 1.3544, "step": 3115 }, { "epoch": 3.9593392630241424, "grad_norm": 1.4458633275440091, "learning_rate": 5.230820541314296e-09, "loss": 1.3089, "step": 3116 }, { "epoch": 3.9606099110546378, "grad_norm": 1.9149880517081121, "learning_rate": 4.909028813573358e-09, "loss": 1.3764, "step": 3117 }, { "epoch": 3.9618805590851336, "grad_norm": 2.0025213342886645, "learning_rate": 4.597449406409382e-09, "loss": 1.2419, "step": 3118 }, { "epoch": 3.963151207115629, "grad_norm": 1.7262784322679252, "learning_rate": 4.296082638173271e-09, "loss": 1.4185, "step": 3119 }, { "epoch": 3.9644218551461243, "grad_norm": 2.0961436951711696, "learning_rate": 4.00492881678427e-09, "loss": 1.4669, "step": 3120 }, { "epoch": 3.96569250317662, "grad_norm": 1.705911916933581, "learning_rate": 3.723988239721088e-09, "loss": 1.3224, "step": 3121 }, { "epoch": 3.9669631512071155, "grad_norm": 1.839944201594869, "learning_rate": 3.453261194030777e-09, "loss": 1.3765, "step": 3122 }, { "epoch": 3.9682337992376113, "grad_norm": 1.7966456195324074, "learning_rate": 3.1927479563254037e-09, "loss": 1.2228, "step": 3123 }, { "epoch": 3.9695044472681067, "grad_norm": 1.7698139775117516, "learning_rate": 2.942448792778718e-09, "loss": 1.1008, "step": 3124 }, { "epoch": 3.9707750952986025, "grad_norm": 1.9453271358852795, "learning_rate": 2.702363959131704e-09, "loss": 1.3687, "step": 3125 }, { "epoch": 3.972045743329098, "grad_norm": 1.6560152016888756, "learning_rate": 2.4724937006848083e-09, "loss": 1.3055, "step": 3126 }, { "epoch": 3.9733163913595932, "grad_norm": 1.688241672688269, "learning_rate": 2.2528382523057115e-09, "loss": 1.3405, "step": 3127 }, { "epoch": 3.974587039390089, "grad_norm": 1.689276490503405, "learning_rate": 2.0433978384237772e-09, "loss": 1.1854, "step": 3128 }, { "epoch": 3.9758576874205844, "grad_norm": 1.6608383150125265, "learning_rate": 1.8441726730300535e-09, "loss": 1.2289, "step": 3129 }, { "epoch": 3.97712833545108, "grad_norm": 1.8290640252525683, "learning_rate": 1.6551629596817109e-09, "loss": 1.2416, "step": 3130 }, { "epoch": 3.9783989834815756, "grad_norm": 1.619845081875068, "learning_rate": 1.4763688914942732e-09, "loss": 1.3471, "step": 3131 }, { "epoch": 3.9796696315120714, "grad_norm": 1.793203277408509, "learning_rate": 1.3077906511482773e-09, "loss": 1.1703, "step": 3132 }, { "epoch": 3.980940279542567, "grad_norm": 1.7349015618821286, "learning_rate": 1.1494284108859443e-09, "loss": 1.2498, "step": 3133 }, { "epoch": 3.982210927573062, "grad_norm": 1.9995653740531916, "learning_rate": 1.0012823325111776e-09, "loss": 1.4463, "step": 3134 }, { "epoch": 3.983481575603558, "grad_norm": 1.9850571096043719, "learning_rate": 8.63352567390674e-10, "loss": 1.3383, "step": 3135 }, { "epoch": 3.9847522236340533, "grad_norm": 1.732098526894926, "learning_rate": 7.356392564505932e-10, "loss": 1.2589, "step": 3136 }, { "epoch": 3.9860228716645487, "grad_norm": 1.5875173750980283, "learning_rate": 6.181425301809985e-10, "loss": 1.3793, "step": 3137 }, { "epoch": 3.9872935196950445, "grad_norm": 2.0291698208290225, "learning_rate": 5.108625086314157e-10, "loss": 1.1664, "step": 3138 }, { "epoch": 3.98856416772554, "grad_norm": 1.7860671094093037, "learning_rate": 4.137993014130537e-10, "loss": 1.396, "step": 3139 }, { "epoch": 3.9898348157560357, "grad_norm": 1.7815082903328416, "learning_rate": 3.2695300769991503e-10, "loss": 1.3193, "step": 3140 }, { "epoch": 3.991105463786531, "grad_norm": 1.6699156756254965, "learning_rate": 2.503237162254646e-10, "loss": 1.3357, "step": 3141 }, { "epoch": 3.992376111817027, "grad_norm": 1.8035271386920084, "learning_rate": 1.8391150528485058e-10, "loss": 1.4147, "step": 3142 }, { "epoch": 3.9936467598475223, "grad_norm": 1.9501122932797061, "learning_rate": 1.277164427326838e-10, "loss": 1.4584, "step": 3143 }, { "epoch": 3.9949174078780176, "grad_norm": 1.7768540849474679, "learning_rate": 8.173858598525819e-11, "loss": 1.2683, "step": 3144 }, { "epoch": 3.9961880559085134, "grad_norm": 1.5464287026588919, "learning_rate": 4.597798201944059e-11, "loss": 1.105, "step": 3145 }, { "epoch": 3.997458703939009, "grad_norm": 1.7342586128482869, "learning_rate": 2.043466737489119e-11, "loss": 1.1672, "step": 3146 }, { "epoch": 3.998729351969504, "grad_norm": 1.7855077119093086, "learning_rate": 5.108668148512408e-12, "loss": 1.349, "step": 3147 }, { "epoch": 4.0, "grad_norm": 1.9978620033593717, "learning_rate": 0.0, "loss": 1.4619, "step": 3148 } ], "logging_steps": 1, "max_steps": 3148, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 394, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 988690734120960.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }