diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22069 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 3148, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012706480304955528, + "grad_norm": 18.464955964990292, + "learning_rate": 5.000000000000001e-07, + "loss": 2.786, + "step": 1 + }, + { + "epoch": 0.0025412960609911056, + "grad_norm": 22.399585167061772, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.756, + "step": 2 + }, + { + "epoch": 0.0038119440914866584, + "grad_norm": 30.235203421905396, + "learning_rate": 1.5e-06, + "loss": 2.8511, + "step": 3 + }, + { + "epoch": 0.005082592121982211, + "grad_norm": 22.642837103761043, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.7227, + "step": 4 + }, + { + "epoch": 0.0063532401524777635, + "grad_norm": 20.47380205039988, + "learning_rate": 2.5e-06, + "loss": 2.7037, + "step": 5 + }, + { + "epoch": 0.007623888182973317, + "grad_norm": 16.488755316837164, + "learning_rate": 3e-06, + "loss": 2.9394, + "step": 6 + }, + { + "epoch": 0.008894536213468869, + "grad_norm": 14.260362898657814, + "learning_rate": 3.5e-06, + "loss": 2.6189, + "step": 7 + }, + { + "epoch": 0.010165184243964422, + "grad_norm": 12.280586408164009, + "learning_rate": 4.000000000000001e-06, + "loss": 2.723, + "step": 8 + }, + { + "epoch": 0.011435832274459974, + "grad_norm": 9.22070378756126, + "learning_rate": 4.5e-06, + "loss": 2.3702, + "step": 9 + }, + { + "epoch": 0.012706480304955527, + "grad_norm": 3.4759485405997577, + "learning_rate": 5e-06, + "loss": 2.4602, + "step": 10 + }, + { + "epoch": 0.01397712833545108, + "grad_norm": 6.467286745342885, + "learning_rate": 5.500000000000001e-06, + "loss": 2.5916, + "step": 11 + }, + { + "epoch": 0.015247776365946633, + "grad_norm": 4.256176100814695, + "learning_rate": 6e-06, + "loss": 2.5279, + "step": 12 + }, + { + "epoch": 0.016518424396442185, + "grad_norm": 4.571095329033233, + "learning_rate": 6.5000000000000004e-06, + "loss": 2.6184, + "step": 13 + }, + { + "epoch": 0.017789072426937738, + "grad_norm": 3.1593416383056456, + "learning_rate": 7e-06, + "loss": 2.3682, + "step": 14 + }, + { + "epoch": 0.01905972045743329, + "grad_norm": 3.690992422084765, + "learning_rate": 7.500000000000001e-06, + "loss": 2.6305, + "step": 15 + }, + { + "epoch": 0.020330368487928845, + "grad_norm": 4.793845160014369, + "learning_rate": 8.000000000000001e-06, + "loss": 2.7471, + "step": 16 + }, + { + "epoch": 0.021601016518424398, + "grad_norm": 3.190746262709345, + "learning_rate": 8.5e-06, + "loss": 2.5544, + "step": 17 + }, + { + "epoch": 0.022871664548919948, + "grad_norm": 1.8680728770342985, + "learning_rate": 9e-06, + "loss": 2.3858, + "step": 18 + }, + { + "epoch": 0.0241423125794155, + "grad_norm": 2.1468024025661063, + "learning_rate": 9.5e-06, + "loss": 2.2027, + "step": 19 + }, + { + "epoch": 0.025412960609911054, + "grad_norm": 2.7989100165411993, + "learning_rate": 1e-05, + "loss": 2.4284, + "step": 20 + }, + { + "epoch": 0.026683608640406607, + "grad_norm": 2.187238787656559, + "learning_rate": 1.0500000000000001e-05, + "loss": 2.4529, + "step": 21 + }, + { + "epoch": 0.02795425667090216, + "grad_norm": 1.8943347226376168, + "learning_rate": 1.1000000000000001e-05, + "loss": 2.3487, + "step": 22 + }, + { + "epoch": 0.029224904701397714, + "grad_norm": 2.318723461163645, + "learning_rate": 1.15e-05, + "loss": 2.3675, + "step": 23 + }, + { + "epoch": 0.030495552731893267, + "grad_norm": 2.926184983142819, + "learning_rate": 1.2e-05, + "loss": 2.5222, + "step": 24 + }, + { + "epoch": 0.03176620076238882, + "grad_norm": 1.7831526528754407, + "learning_rate": 1.25e-05, + "loss": 2.2888, + "step": 25 + }, + { + "epoch": 0.03303684879288437, + "grad_norm": 1.7233279523412535, + "learning_rate": 1.3000000000000001e-05, + "loss": 2.4598, + "step": 26 + }, + { + "epoch": 0.03430749682337993, + "grad_norm": 3.3608919691001637, + "learning_rate": 1.3500000000000001e-05, + "loss": 2.5002, + "step": 27 + }, + { + "epoch": 0.035578144853875476, + "grad_norm": 2.1066564841053332, + "learning_rate": 1.4e-05, + "loss": 2.2744, + "step": 28 + }, + { + "epoch": 0.036848792884371026, + "grad_norm": 3.222039338985631, + "learning_rate": 1.45e-05, + "loss": 2.4058, + "step": 29 + }, + { + "epoch": 0.03811944091486658, + "grad_norm": 1.524480532049421, + "learning_rate": 1.5000000000000002e-05, + "loss": 2.2636, + "step": 30 + }, + { + "epoch": 0.03939008894536213, + "grad_norm": 2.8592397792332145, + "learning_rate": 1.55e-05, + "loss": 2.2612, + "step": 31 + }, + { + "epoch": 0.04066073697585769, + "grad_norm": 1.5062409854326497, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.3015, + "step": 32 + }, + { + "epoch": 0.04193138500635324, + "grad_norm": 3.5634874512948898, + "learning_rate": 1.65e-05, + "loss": 2.5092, + "step": 33 + }, + { + "epoch": 0.043202033036848796, + "grad_norm": 1.6040953286994395, + "learning_rate": 1.7e-05, + "loss": 2.4476, + "step": 34 + }, + { + "epoch": 0.044472681067344345, + "grad_norm": 2.649185181368789, + "learning_rate": 1.7500000000000002e-05, + "loss": 2.4681, + "step": 35 + }, + { + "epoch": 0.045743329097839895, + "grad_norm": 1.5158813542671679, + "learning_rate": 1.8e-05, + "loss": 2.292, + "step": 36 + }, + { + "epoch": 0.04701397712833545, + "grad_norm": 2.8599704552305836, + "learning_rate": 1.8500000000000002e-05, + "loss": 2.467, + "step": 37 + }, + { + "epoch": 0.048284625158831, + "grad_norm": 1.955652517362263, + "learning_rate": 1.9e-05, + "loss": 2.3329, + "step": 38 + }, + { + "epoch": 0.04955527318932656, + "grad_norm": 2.036186380457819, + "learning_rate": 1.95e-05, + "loss": 2.4255, + "step": 39 + }, + { + "epoch": 0.05082592121982211, + "grad_norm": 1.906309806811743, + "learning_rate": 2e-05, + "loss": 2.3245, + "step": 40 + }, + { + "epoch": 0.052096569250317665, + "grad_norm": 1.7089031257315084, + "learning_rate": 1.9999994891331854e-05, + "loss": 2.2173, + "step": 41 + }, + { + "epoch": 0.053367217280813214, + "grad_norm": 1.7988981102734363, + "learning_rate": 1.9999979565332626e-05, + "loss": 2.3608, + "step": 42 + }, + { + "epoch": 0.054637865311308764, + "grad_norm": 1.833920539197692, + "learning_rate": 1.9999954022017984e-05, + "loss": 2.3494, + "step": 43 + }, + { + "epoch": 0.05590851334180432, + "grad_norm": 1.8097536920457602, + "learning_rate": 1.9999918261414016e-05, + "loss": 2.2209, + "step": 44 + }, + { + "epoch": 0.05717916137229987, + "grad_norm": 1.7411452453996696, + "learning_rate": 1.9999872283557267e-05, + "loss": 2.3133, + "step": 45 + }, + { + "epoch": 0.05844980940279543, + "grad_norm": 1.70501831199215, + "learning_rate": 1.9999816088494717e-05, + "loss": 2.5452, + "step": 46 + }, + { + "epoch": 0.05972045743329098, + "grad_norm": 2.350235220222271, + "learning_rate": 1.9999749676283775e-05, + "loss": 2.4249, + "step": 47 + }, + { + "epoch": 0.060991105463786534, + "grad_norm": 2.6056898788385565, + "learning_rate": 1.9999673046992304e-05, + "loss": 2.4791, + "step": 48 + }, + { + "epoch": 0.062261753494282084, + "grad_norm": 2.023938613975313, + "learning_rate": 1.9999586200698588e-05, + "loss": 2.5564, + "step": 49 + }, + { + "epoch": 0.06353240152477764, + "grad_norm": 2.202019619137906, + "learning_rate": 1.999948913749137e-05, + "loss": 2.512, + "step": 50 + }, + { + "epoch": 0.06480304955527319, + "grad_norm": 1.6889275088093092, + "learning_rate": 1.999938185746982e-05, + "loss": 2.4475, + "step": 51 + }, + { + "epoch": 0.06607369758576874, + "grad_norm": 1.4638142699756047, + "learning_rate": 1.999926436074355e-05, + "loss": 2.5486, + "step": 52 + }, + { + "epoch": 0.06734434561626429, + "grad_norm": 1.4493503821610307, + "learning_rate": 1.999913664743261e-05, + "loss": 2.2954, + "step": 53 + }, + { + "epoch": 0.06861499364675985, + "grad_norm": 1.4930408619252327, + "learning_rate": 1.999899871766749e-05, + "loss": 2.3253, + "step": 54 + }, + { + "epoch": 0.0698856416772554, + "grad_norm": 1.4795888492851268, + "learning_rate": 1.9998850571589114e-05, + "loss": 2.2496, + "step": 55 + }, + { + "epoch": 0.07115628970775095, + "grad_norm": 4.386537889011212, + "learning_rate": 1.9998692209348852e-05, + "loss": 2.3639, + "step": 56 + }, + { + "epoch": 0.0724269377382465, + "grad_norm": 1.9608285617315484, + "learning_rate": 1.9998523631108506e-05, + "loss": 2.4487, + "step": 57 + }, + { + "epoch": 0.07369758576874205, + "grad_norm": 2.3825490594197998, + "learning_rate": 1.9998344837040318e-05, + "loss": 2.2838, + "step": 58 + }, + { + "epoch": 0.07496823379923762, + "grad_norm": 1.8996986104829006, + "learning_rate": 1.999815582732697e-05, + "loss": 2.4616, + "step": 59 + }, + { + "epoch": 0.07623888182973317, + "grad_norm": 1.6631718517357963, + "learning_rate": 1.9997956602161577e-05, + "loss": 2.3015, + "step": 60 + }, + { + "epoch": 0.07750952986022872, + "grad_norm": 1.9538892233672316, + "learning_rate": 1.9997747161747696e-05, + "loss": 2.501, + "step": 61 + }, + { + "epoch": 0.07878017789072427, + "grad_norm": 1.2795079596218877, + "learning_rate": 1.9997527506299318e-05, + "loss": 2.3801, + "step": 62 + }, + { + "epoch": 0.08005082592121983, + "grad_norm": 4.523259680668686, + "learning_rate": 1.999729763604087e-05, + "loss": 2.6326, + "step": 63 + }, + { + "epoch": 0.08132147395171538, + "grad_norm": 1.647620072325872, + "learning_rate": 1.9997057551207223e-05, + "loss": 2.22, + "step": 64 + }, + { + "epoch": 0.08259212198221093, + "grad_norm": 1.5998559204773664, + "learning_rate": 1.9996807252043677e-05, + "loss": 2.3037, + "step": 65 + }, + { + "epoch": 0.08386277001270648, + "grad_norm": 2.1714649315563017, + "learning_rate": 1.9996546738805972e-05, + "loss": 2.2916, + "step": 66 + }, + { + "epoch": 0.08513341804320203, + "grad_norm": 2.1204211958942802, + "learning_rate": 1.999627601176028e-05, + "loss": 2.1388, + "step": 67 + }, + { + "epoch": 0.08640406607369759, + "grad_norm": 3.1235773393190307, + "learning_rate": 1.999599507118322e-05, + "loss": 2.3557, + "step": 68 + }, + { + "epoch": 0.08767471410419314, + "grad_norm": 1.6752389846723672, + "learning_rate": 1.999570391736183e-05, + "loss": 2.4894, + "step": 69 + }, + { + "epoch": 0.08894536213468869, + "grad_norm": 2.1252725394449645, + "learning_rate": 1.999540255059359e-05, + "loss": 2.5256, + "step": 70 + }, + { + "epoch": 0.09021601016518424, + "grad_norm": 1.4228164737722837, + "learning_rate": 1.999509097118643e-05, + "loss": 2.4114, + "step": 71 + }, + { + "epoch": 0.09148665819567979, + "grad_norm": 1.594698364320451, + "learning_rate": 1.9994769179458687e-05, + "loss": 2.2206, + "step": 72 + }, + { + "epoch": 0.09275730622617535, + "grad_norm": 1.4260368390306692, + "learning_rate": 1.9994437175739154e-05, + "loss": 2.344, + "step": 73 + }, + { + "epoch": 0.0940279542566709, + "grad_norm": 1.6052123309042803, + "learning_rate": 1.999409496036705e-05, + "loss": 2.324, + "step": 74 + }, + { + "epoch": 0.09529860228716645, + "grad_norm": 1.1887947295478682, + "learning_rate": 1.999374253369202e-05, + "loss": 2.1568, + "step": 75 + }, + { + "epoch": 0.096569250317662, + "grad_norm": 1.7214375838627587, + "learning_rate": 1.9993379896074163e-05, + "loss": 2.3768, + "step": 76 + }, + { + "epoch": 0.09783989834815757, + "grad_norm": 1.546493635353293, + "learning_rate": 1.9993007047883988e-05, + "loss": 2.2587, + "step": 77 + }, + { + "epoch": 0.09911054637865312, + "grad_norm": 1.408228139736878, + "learning_rate": 1.9992623989502448e-05, + "loss": 2.3015, + "step": 78 + }, + { + "epoch": 0.10038119440914867, + "grad_norm": 1.3631710614333141, + "learning_rate": 1.9992230721320932e-05, + "loss": 2.1168, + "step": 79 + }, + { + "epoch": 0.10165184243964422, + "grad_norm": 1.4184248992150634, + "learning_rate": 1.9991827243741253e-05, + "loss": 2.5526, + "step": 80 + }, + { + "epoch": 0.10292249047013977, + "grad_norm": 1.431143102369531, + "learning_rate": 1.9991413557175656e-05, + "loss": 2.2523, + "step": 81 + }, + { + "epoch": 0.10419313850063533, + "grad_norm": 2.00398683475326, + "learning_rate": 1.999098966204682e-05, + "loss": 2.428, + "step": 82 + }, + { + "epoch": 0.10546378653113088, + "grad_norm": 1.8800068765705902, + "learning_rate": 1.9990555558787847e-05, + "loss": 2.6345, + "step": 83 + }, + { + "epoch": 0.10673443456162643, + "grad_norm": 1.4579776022353235, + "learning_rate": 1.9990111247842285e-05, + "loss": 2.459, + "step": 84 + }, + { + "epoch": 0.10800508259212198, + "grad_norm": 1.9205744158321651, + "learning_rate": 1.998965672966409e-05, + "loss": 2.0765, + "step": 85 + }, + { + "epoch": 0.10927573062261753, + "grad_norm": 2.68259040488466, + "learning_rate": 1.9989192004717672e-05, + "loss": 2.58, + "step": 86 + }, + { + "epoch": 0.11054637865311309, + "grad_norm": 1.7085159849494436, + "learning_rate": 1.9988717073477842e-05, + "loss": 2.4019, + "step": 87 + }, + { + "epoch": 0.11181702668360864, + "grad_norm": 1.3008915942158903, + "learning_rate": 1.9988231936429866e-05, + "loss": 2.1258, + "step": 88 + }, + { + "epoch": 0.11308767471410419, + "grad_norm": 1.5495282607802583, + "learning_rate": 1.9987736594069417e-05, + "loss": 2.2544, + "step": 89 + }, + { + "epoch": 0.11435832274459974, + "grad_norm": 1.5785369727348832, + "learning_rate": 1.9987231046902602e-05, + "loss": 2.194, + "step": 90 + }, + { + "epoch": 0.1156289707750953, + "grad_norm": 1.570746098900601, + "learning_rate": 1.9986715295445963e-05, + "loss": 2.0638, + "step": 91 + }, + { + "epoch": 0.11689961880559085, + "grad_norm": 1.7320562240650357, + "learning_rate": 1.9986189340226455e-05, + "loss": 2.3778, + "step": 92 + }, + { + "epoch": 0.1181702668360864, + "grad_norm": 2.421076291096759, + "learning_rate": 1.9985653181781465e-05, + "loss": 2.4102, + "step": 93 + }, + { + "epoch": 0.11944091486658195, + "grad_norm": 2.6125970691744205, + "learning_rate": 1.99851068206588e-05, + "loss": 2.543, + "step": 94 + }, + { + "epoch": 0.1207115628970775, + "grad_norm": 2.1586629440192024, + "learning_rate": 1.9984550257416706e-05, + "loss": 2.4479, + "step": 95 + }, + { + "epoch": 0.12198221092757307, + "grad_norm": 1.3699724096006964, + "learning_rate": 1.9983983492623832e-05, + "loss": 2.413, + "step": 96 + }, + { + "epoch": 0.12325285895806862, + "grad_norm": 1.7543498248329765, + "learning_rate": 1.9983406526859266e-05, + "loss": 2.2759, + "step": 97 + }, + { + "epoch": 0.12452350698856417, + "grad_norm": 1.4243383326582497, + "learning_rate": 1.9982819360712514e-05, + "loss": 1.9626, + "step": 98 + }, + { + "epoch": 0.12579415501905972, + "grad_norm": 1.708842956682712, + "learning_rate": 1.99822219947835e-05, + "loss": 2.315, + "step": 99 + }, + { + "epoch": 0.12706480304955528, + "grad_norm": 1.5848118391318178, + "learning_rate": 1.9981614429682576e-05, + "loss": 2.5247, + "step": 100 + }, + { + "epoch": 0.12833545108005082, + "grad_norm": 1.7565244063500633, + "learning_rate": 1.9980996666030507e-05, + "loss": 2.569, + "step": 101 + }, + { + "epoch": 0.12960609911054638, + "grad_norm": 1.7188897035182684, + "learning_rate": 1.998036870445849e-05, + "loss": 2.5498, + "step": 102 + }, + { + "epoch": 0.13087674714104194, + "grad_norm": 1.321697488805632, + "learning_rate": 1.9979730545608128e-05, + "loss": 2.0094, + "step": 103 + }, + { + "epoch": 0.13214739517153748, + "grad_norm": 1.3411797947362514, + "learning_rate": 1.997908219013145e-05, + "loss": 2.4512, + "step": 104 + }, + { + "epoch": 0.13341804320203304, + "grad_norm": 1.4764463188652248, + "learning_rate": 1.997842363869091e-05, + "loss": 2.0464, + "step": 105 + }, + { + "epoch": 0.13468869123252858, + "grad_norm": 6.0220828263716175, + "learning_rate": 1.9977754891959363e-05, + "loss": 2.3022, + "step": 106 + }, + { + "epoch": 0.13595933926302414, + "grad_norm": 1.9760805767200045, + "learning_rate": 1.9977075950620093e-05, + "loss": 2.3951, + "step": 107 + }, + { + "epoch": 0.1372299872935197, + "grad_norm": 1.5413897633546927, + "learning_rate": 1.9976386815366796e-05, + "loss": 2.3606, + "step": 108 + }, + { + "epoch": 0.13850063532401524, + "grad_norm": 1.8991731806444883, + "learning_rate": 1.997568748690359e-05, + "loss": 2.2424, + "step": 109 + }, + { + "epoch": 0.1397712833545108, + "grad_norm": 1.526135893853535, + "learning_rate": 1.9974977965945e-05, + "loss": 2.2526, + "step": 110 + }, + { + "epoch": 0.14104193138500634, + "grad_norm": 1.602806512484664, + "learning_rate": 1.9974258253215964e-05, + "loss": 2.2839, + "step": 111 + }, + { + "epoch": 0.1423125794155019, + "grad_norm": 1.4325519869083885, + "learning_rate": 1.997352834945184e-05, + "loss": 2.3297, + "step": 112 + }, + { + "epoch": 0.14358322744599747, + "grad_norm": 1.5168784833615567, + "learning_rate": 1.997278825539839e-05, + "loss": 2.3959, + "step": 113 + }, + { + "epoch": 0.144853875476493, + "grad_norm": 1.9135394229796725, + "learning_rate": 1.9972037971811802e-05, + "loss": 1.7828, + "step": 114 + }, + { + "epoch": 0.14612452350698857, + "grad_norm": 1.6792769441260569, + "learning_rate": 1.9971277499458663e-05, + "loss": 2.1888, + "step": 115 + }, + { + "epoch": 0.1473951715374841, + "grad_norm": 1.739028731042485, + "learning_rate": 1.9970506839115965e-05, + "loss": 2.2422, + "step": 116 + }, + { + "epoch": 0.14866581956797967, + "grad_norm": 1.4765946902455809, + "learning_rate": 1.996972599157113e-05, + "loss": 2.2422, + "step": 117 + }, + { + "epoch": 0.14993646759847523, + "grad_norm": 2.0492840257759557, + "learning_rate": 1.996893495762197e-05, + "loss": 2.399, + "step": 118 + }, + { + "epoch": 0.15120711562897077, + "grad_norm": 1.4275096196327925, + "learning_rate": 1.9968133738076707e-05, + "loss": 2.3486, + "step": 119 + }, + { + "epoch": 0.15247776365946633, + "grad_norm": 1.6302330872080086, + "learning_rate": 1.9967322333753978e-05, + "loss": 2.2748, + "step": 120 + }, + { + "epoch": 0.15374841168996187, + "grad_norm": 1.412615834080105, + "learning_rate": 1.9966500745482824e-05, + "loss": 2.3581, + "step": 121 + }, + { + "epoch": 0.15501905972045743, + "grad_norm": 1.453502241535778, + "learning_rate": 1.996566897410269e-05, + "loss": 2.4823, + "step": 122 + }, + { + "epoch": 0.156289707750953, + "grad_norm": 1.627229751845881, + "learning_rate": 1.9964827020463418e-05, + "loss": 2.2998, + "step": 123 + }, + { + "epoch": 0.15756035578144853, + "grad_norm": 1.4185856495685691, + "learning_rate": 1.9963974885425267e-05, + "loss": 2.0273, + "step": 124 + }, + { + "epoch": 0.1588310038119441, + "grad_norm": 1.5689622208187057, + "learning_rate": 1.996311256985889e-05, + "loss": 2.131, + "step": 125 + }, + { + "epoch": 0.16010165184243966, + "grad_norm": 1.9586155379600372, + "learning_rate": 1.9962240074645344e-05, + "loss": 2.4964, + "step": 126 + }, + { + "epoch": 0.1613722998729352, + "grad_norm": 1.5528955040049794, + "learning_rate": 1.9961357400676085e-05, + "loss": 2.3646, + "step": 127 + }, + { + "epoch": 0.16264294790343076, + "grad_norm": 8.391044079407488, + "learning_rate": 1.996046454885297e-05, + "loss": 2.4039, + "step": 128 + }, + { + "epoch": 0.1639135959339263, + "grad_norm": 1.7371418333998156, + "learning_rate": 1.995956152008826e-05, + "loss": 2.252, + "step": 129 + }, + { + "epoch": 0.16518424396442186, + "grad_norm": 1.697773515006207, + "learning_rate": 1.9958648315304606e-05, + "loss": 2.3198, + "step": 130 + }, + { + "epoch": 0.16645489199491742, + "grad_norm": 1.28409692971463, + "learning_rate": 1.9957724935435065e-05, + "loss": 2.1874, + "step": 131 + }, + { + "epoch": 0.16772554002541296, + "grad_norm": 1.267390055193146, + "learning_rate": 1.995679138142308e-05, + "loss": 2.1704, + "step": 132 + }, + { + "epoch": 0.16899618805590852, + "grad_norm": 1.3666767964046194, + "learning_rate": 1.9955847654222493e-05, + "loss": 2.1776, + "step": 133 + }, + { + "epoch": 0.17026683608640406, + "grad_norm": 1.2425502438881062, + "learning_rate": 1.995489375479755e-05, + "loss": 2.2165, + "step": 134 + }, + { + "epoch": 0.17153748411689962, + "grad_norm": 1.2599811393514255, + "learning_rate": 1.9953929684122875e-05, + "loss": 2.3532, + "step": 135 + }, + { + "epoch": 0.17280813214739518, + "grad_norm": 1.6206053415885442, + "learning_rate": 1.995295544318349e-05, + "loss": 2.3016, + "step": 136 + }, + { + "epoch": 0.17407878017789072, + "grad_norm": 1.6451053052999345, + "learning_rate": 1.995197103297482e-05, + "loss": 2.4567, + "step": 137 + }, + { + "epoch": 0.17534942820838628, + "grad_norm": 1.548272572718846, + "learning_rate": 1.995097645450266e-05, + "loss": 2.3029, + "step": 138 + }, + { + "epoch": 0.17662007623888182, + "grad_norm": 1.3212151931749498, + "learning_rate": 1.9949971708783212e-05, + "loss": 2.0577, + "step": 139 + }, + { + "epoch": 0.17789072426937738, + "grad_norm": 9.408887399092182, + "learning_rate": 1.994895679684305e-05, + "loss": 2.3772, + "step": 140 + }, + { + "epoch": 0.17916137229987295, + "grad_norm": 1.6125401231020167, + "learning_rate": 1.9947931719719146e-05, + "loss": 2.4229, + "step": 141 + }, + { + "epoch": 0.18043202033036848, + "grad_norm": 1.6662864267128135, + "learning_rate": 1.9946896478458862e-05, + "loss": 2.1524, + "step": 142 + }, + { + "epoch": 0.18170266836086404, + "grad_norm": 1.9673156126026754, + "learning_rate": 1.9945851074119934e-05, + "loss": 2.4578, + "step": 143 + }, + { + "epoch": 0.18297331639135958, + "grad_norm": 1.4828274089192293, + "learning_rate": 1.9944795507770487e-05, + "loss": 2.1372, + "step": 144 + }, + { + "epoch": 0.18424396442185514, + "grad_norm": 1.79936209005378, + "learning_rate": 1.994372978048903e-05, + "loss": 2.4361, + "step": 145 + }, + { + "epoch": 0.1855146124523507, + "grad_norm": 1.7759170388327536, + "learning_rate": 1.9942653893364446e-05, + "loss": 2.1204, + "step": 146 + }, + { + "epoch": 0.18678526048284624, + "grad_norm": 1.7281108781420167, + "learning_rate": 1.9941567847496012e-05, + "loss": 2.1991, + "step": 147 + }, + { + "epoch": 0.1880559085133418, + "grad_norm": 2.1166769515572086, + "learning_rate": 1.994047164399338e-05, + "loss": 2.3989, + "step": 148 + }, + { + "epoch": 0.18932655654383734, + "grad_norm": 1.2950345583922922, + "learning_rate": 1.993936528397657e-05, + "loss": 2.1799, + "step": 149 + }, + { + "epoch": 0.1905972045743329, + "grad_norm": 1.5348226477369444, + "learning_rate": 1.993824876857599e-05, + "loss": 1.9653, + "step": 150 + }, + { + "epoch": 0.19186785260482847, + "grad_norm": 1.7518894681262407, + "learning_rate": 1.9937122098932428e-05, + "loss": 2.5341, + "step": 151 + }, + { + "epoch": 0.193138500635324, + "grad_norm": 1.5845440592839513, + "learning_rate": 1.9935985276197033e-05, + "loss": 2.2863, + "step": 152 + }, + { + "epoch": 0.19440914866581957, + "grad_norm": 1.1806097502271196, + "learning_rate": 1.9934838301531334e-05, + "loss": 2.062, + "step": 153 + }, + { + "epoch": 0.19567979669631513, + "grad_norm": 1.3147281942523097, + "learning_rate": 1.9933681176107237e-05, + "loss": 2.2483, + "step": 154 + }, + { + "epoch": 0.19695044472681067, + "grad_norm": 1.2369606160269775, + "learning_rate": 1.9932513901107017e-05, + "loss": 2.2221, + "step": 155 + }, + { + "epoch": 0.19822109275730623, + "grad_norm": 1.2180750717072797, + "learning_rate": 1.9931336477723315e-05, + "loss": 2.3452, + "step": 156 + }, + { + "epoch": 0.19949174078780177, + "grad_norm": 1.3792727435327043, + "learning_rate": 1.9930148907159146e-05, + "loss": 2.2339, + "step": 157 + }, + { + "epoch": 0.20076238881829733, + "grad_norm": 1.5961667123190166, + "learning_rate": 1.992895119062789e-05, + "loss": 2.2199, + "step": 158 + }, + { + "epoch": 0.2020330368487929, + "grad_norm": 1.5749862121075178, + "learning_rate": 1.9927743329353295e-05, + "loss": 2.3563, + "step": 159 + }, + { + "epoch": 0.20330368487928843, + "grad_norm": 2.2620408058967474, + "learning_rate": 1.992652532456947e-05, + "loss": 2.1471, + "step": 160 + }, + { + "epoch": 0.204574332909784, + "grad_norm": 1.4796151191311295, + "learning_rate": 1.9925297177520903e-05, + "loss": 2.3141, + "step": 161 + }, + { + "epoch": 0.20584498094027953, + "grad_norm": 1.4466702526825321, + "learning_rate": 1.9924058889462413e-05, + "loss": 2.4301, + "step": 162 + }, + { + "epoch": 0.2071156289707751, + "grad_norm": 1.7194780999677342, + "learning_rate": 1.992281046165922e-05, + "loss": 2.1867, + "step": 163 + }, + { + "epoch": 0.20838627700127066, + "grad_norm": 1.4209604569153904, + "learning_rate": 1.9921551895386875e-05, + "loss": 2.3164, + "step": 164 + }, + { + "epoch": 0.2096569250317662, + "grad_norm": 1.7307653962070104, + "learning_rate": 1.99202831919313e-05, + "loss": 2.0334, + "step": 165 + }, + { + "epoch": 0.21092757306226176, + "grad_norm": 1.5080354301530285, + "learning_rate": 1.9919004352588768e-05, + "loss": 2.6573, + "step": 166 + }, + { + "epoch": 0.2121982210927573, + "grad_norm": 1.9916488578875324, + "learning_rate": 1.991771537866592e-05, + "loss": 2.5658, + "step": 167 + }, + { + "epoch": 0.21346886912325286, + "grad_norm": 1.4649797775396523, + "learning_rate": 1.9916416271479736e-05, + "loss": 2.3194, + "step": 168 + }, + { + "epoch": 0.21473951715374842, + "grad_norm": 1.4247137988825938, + "learning_rate": 1.9915107032357564e-05, + "loss": 2.5108, + "step": 169 + }, + { + "epoch": 0.21601016518424396, + "grad_norm": 1.3024065770687974, + "learning_rate": 1.9913787662637093e-05, + "loss": 2.0994, + "step": 170 + }, + { + "epoch": 0.21728081321473952, + "grad_norm": 1.2857669615933136, + "learning_rate": 1.9912458163666367e-05, + "loss": 2.2093, + "step": 171 + }, + { + "epoch": 0.21855146124523506, + "grad_norm": 1.4238166128200767, + "learning_rate": 1.9911118536803785e-05, + "loss": 2.2752, + "step": 172 + }, + { + "epoch": 0.21982210927573062, + "grad_norm": 1.575778430022986, + "learning_rate": 1.9909768783418086e-05, + "loss": 2.1218, + "step": 173 + }, + { + "epoch": 0.22109275730622618, + "grad_norm": 1.2962857082842534, + "learning_rate": 1.9908408904888356e-05, + "loss": 2.3936, + "step": 174 + }, + { + "epoch": 0.22236340533672172, + "grad_norm": 1.5279108597720172, + "learning_rate": 1.9907038902604033e-05, + "loss": 2.249, + "step": 175 + }, + { + "epoch": 0.22363405336721728, + "grad_norm": 1.2704750696590625, + "learning_rate": 1.9905658777964888e-05, + "loss": 2.3086, + "step": 176 + }, + { + "epoch": 0.22490470139771285, + "grad_norm": 1.3969610069878315, + "learning_rate": 1.990426853238105e-05, + "loss": 2.2686, + "step": 177 + }, + { + "epoch": 0.22617534942820838, + "grad_norm": 1.460679593441549, + "learning_rate": 1.990286816727297e-05, + "loss": 2.1181, + "step": 178 + }, + { + "epoch": 0.22744599745870395, + "grad_norm": 1.3366579414273725, + "learning_rate": 1.9901457684071453e-05, + "loss": 2.2765, + "step": 179 + }, + { + "epoch": 0.22871664548919948, + "grad_norm": 1.245913482097206, + "learning_rate": 1.9900037084217637e-05, + "loss": 2.1238, + "step": 180 + }, + { + "epoch": 0.22998729351969505, + "grad_norm": 1.355073381692421, + "learning_rate": 1.9898606369163e-05, + "loss": 2.3252, + "step": 181 + }, + { + "epoch": 0.2312579415501906, + "grad_norm": 1.28382798355187, + "learning_rate": 1.989716554036935e-05, + "loss": 2.3593, + "step": 182 + }, + { + "epoch": 0.23252858958068615, + "grad_norm": 1.3272489788914297, + "learning_rate": 1.9895714599308822e-05, + "loss": 2.2453, + "step": 183 + }, + { + "epoch": 0.2337992376111817, + "grad_norm": 1.2869596536907777, + "learning_rate": 1.9894253547463897e-05, + "loss": 2.2947, + "step": 184 + }, + { + "epoch": 0.23506988564167725, + "grad_norm": 1.2978788321340524, + "learning_rate": 1.9892782386327385e-05, + "loss": 2.5479, + "step": 185 + }, + { + "epoch": 0.2363405336721728, + "grad_norm": 1.2918291928610448, + "learning_rate": 1.9891301117402415e-05, + "loss": 2.5686, + "step": 186 + }, + { + "epoch": 0.23761118170266837, + "grad_norm": 1.155321594136107, + "learning_rate": 1.9889809742202454e-05, + "loss": 2.0011, + "step": 187 + }, + { + "epoch": 0.2388818297331639, + "grad_norm": 1.1459073035417648, + "learning_rate": 1.9888308262251286e-05, + "loss": 2.0969, + "step": 188 + }, + { + "epoch": 0.24015247776365947, + "grad_norm": 1.252131993544361, + "learning_rate": 1.9886796679083027e-05, + "loss": 2.3877, + "step": 189 + }, + { + "epoch": 0.241423125794155, + "grad_norm": 1.20738751209847, + "learning_rate": 1.988527499424211e-05, + "loss": 2.4384, + "step": 190 + }, + { + "epoch": 0.24269377382465057, + "grad_norm": 1.3691140924554905, + "learning_rate": 1.9883743209283293e-05, + "loss": 2.2901, + "step": 191 + }, + { + "epoch": 0.24396442185514614, + "grad_norm": 1.638413576701321, + "learning_rate": 1.988220132577165e-05, + "loss": 2.3583, + "step": 192 + }, + { + "epoch": 0.24523506988564167, + "grad_norm": 1.2730316144536322, + "learning_rate": 1.9880649345282577e-05, + "loss": 1.9856, + "step": 193 + }, + { + "epoch": 0.24650571791613723, + "grad_norm": 1.2358397961128764, + "learning_rate": 1.9879087269401782e-05, + "loss": 2.23, + "step": 194 + }, + { + "epoch": 0.24777636594663277, + "grad_norm": 1.547745354676377, + "learning_rate": 1.9877515099725294e-05, + "loss": 2.4606, + "step": 195 + }, + { + "epoch": 0.24904701397712833, + "grad_norm": 1.4391127109203996, + "learning_rate": 1.987593283785945e-05, + "loss": 2.3736, + "step": 196 + }, + { + "epoch": 0.2503176620076239, + "grad_norm": 1.7585520256425062, + "learning_rate": 1.9874340485420904e-05, + "loss": 2.1316, + "step": 197 + }, + { + "epoch": 0.25158831003811943, + "grad_norm": 1.319651195207614, + "learning_rate": 1.987273804403661e-05, + "loss": 2.1887, + "step": 198 + }, + { + "epoch": 0.25285895806861497, + "grad_norm": 1.286157362467024, + "learning_rate": 1.987112551534384e-05, + "loss": 1.9623, + "step": 199 + }, + { + "epoch": 0.25412960609911056, + "grad_norm": 1.3073822572525833, + "learning_rate": 1.9869502900990168e-05, + "loss": 2.4822, + "step": 200 + }, + { + "epoch": 0.2554002541296061, + "grad_norm": 1.446649580245481, + "learning_rate": 1.986787020263347e-05, + "loss": 2.2048, + "step": 201 + }, + { + "epoch": 0.25667090216010163, + "grad_norm": 1.3327450989551783, + "learning_rate": 1.9866227421941934e-05, + "loss": 2.1352, + "step": 202 + }, + { + "epoch": 0.2579415501905972, + "grad_norm": 1.3718218115332597, + "learning_rate": 1.9864574560594043e-05, + "loss": 2.1044, + "step": 203 + }, + { + "epoch": 0.25921219822109276, + "grad_norm": 1.797849151509624, + "learning_rate": 1.986291162027858e-05, + "loss": 2.199, + "step": 204 + }, + { + "epoch": 0.2604828462515883, + "grad_norm": 1.3083060220775833, + "learning_rate": 1.9861238602694624e-05, + "loss": 2.2676, + "step": 205 + }, + { + "epoch": 0.2617534942820839, + "grad_norm": 1.424052626314714, + "learning_rate": 1.9859555509551564e-05, + "loss": 2.0826, + "step": 206 + }, + { + "epoch": 0.2630241423125794, + "grad_norm": 1.5293094955596058, + "learning_rate": 1.985786234256906e-05, + "loss": 1.9344, + "step": 207 + }, + { + "epoch": 0.26429479034307496, + "grad_norm": 1.7076258249022205, + "learning_rate": 1.9856159103477085e-05, + "loss": 2.3119, + "step": 208 + }, + { + "epoch": 0.2655654383735705, + "grad_norm": 1.2127373243223818, + "learning_rate": 1.9854445794015895e-05, + "loss": 2.2381, + "step": 209 + }, + { + "epoch": 0.2668360864040661, + "grad_norm": 1.4196056184224093, + "learning_rate": 1.9852722415936034e-05, + "loss": 2.216, + "step": 210 + }, + { + "epoch": 0.2681067344345616, + "grad_norm": 1.4123460554161025, + "learning_rate": 1.9850988970998334e-05, + "loss": 2.4406, + "step": 211 + }, + { + "epoch": 0.26937738246505716, + "grad_norm": 1.9097154188260683, + "learning_rate": 1.984924546097392e-05, + "loss": 2.2335, + "step": 212 + }, + { + "epoch": 0.27064803049555275, + "grad_norm": 1.570238579935913, + "learning_rate": 1.984749188764419e-05, + "loss": 2.2708, + "step": 213 + }, + { + "epoch": 0.2719186785260483, + "grad_norm": 1.2007821744706566, + "learning_rate": 1.9845728252800827e-05, + "loss": 2.018, + "step": 214 + }, + { + "epoch": 0.2731893265565438, + "grad_norm": 2.4648382684532737, + "learning_rate": 1.98439545582458e-05, + "loss": 2.6116, + "step": 215 + }, + { + "epoch": 0.2744599745870394, + "grad_norm": 1.8858078450769689, + "learning_rate": 1.9842170805791356e-05, + "loss": 2.7223, + "step": 216 + }, + { + "epoch": 0.27573062261753495, + "grad_norm": 1.211266603268658, + "learning_rate": 1.9840376997260005e-05, + "loss": 2.1076, + "step": 217 + }, + { + "epoch": 0.2770012706480305, + "grad_norm": 1.5327466889696617, + "learning_rate": 1.983857313448455e-05, + "loss": 2.4202, + "step": 218 + }, + { + "epoch": 0.2782719186785261, + "grad_norm": 1.1157105533698264, + "learning_rate": 1.983675921930805e-05, + "loss": 2.0862, + "step": 219 + }, + { + "epoch": 0.2795425667090216, + "grad_norm": 1.3261147782715659, + "learning_rate": 1.983493525358385e-05, + "loss": 2.1126, + "step": 220 + }, + { + "epoch": 0.28081321473951715, + "grad_norm": 1.1567356334830963, + "learning_rate": 1.983310123917556e-05, + "loss": 2.1587, + "step": 221 + }, + { + "epoch": 0.2820838627700127, + "grad_norm": 1.08583197973077, + "learning_rate": 1.9831257177957045e-05, + "loss": 2.418, + "step": 222 + }, + { + "epoch": 0.2833545108005083, + "grad_norm": 1.1077378609711996, + "learning_rate": 1.9829403071812448e-05, + "loss": 1.9391, + "step": 223 + }, + { + "epoch": 0.2846251588310038, + "grad_norm": 1.2147030605199152, + "learning_rate": 1.9827538922636174e-05, + "loss": 2.2372, + "step": 224 + }, + { + "epoch": 0.28589580686149935, + "grad_norm": 1.3494759002510996, + "learning_rate": 1.9825664732332886e-05, + "loss": 2.1855, + "step": 225 + }, + { + "epoch": 0.28716645489199494, + "grad_norm": 1.1340108074767525, + "learning_rate": 1.98237805028175e-05, + "loss": 2.1137, + "step": 226 + }, + { + "epoch": 0.2884371029224905, + "grad_norm": 1.185055421017148, + "learning_rate": 1.982188623601521e-05, + "loss": 2.2954, + "step": 227 + }, + { + "epoch": 0.289707750952986, + "grad_norm": 1.3631793970945223, + "learning_rate": 1.9819981933861446e-05, + "loss": 2.356, + "step": 228 + }, + { + "epoch": 0.2909783989834816, + "grad_norm": 1.3765107028630217, + "learning_rate": 1.9818067598301894e-05, + "loss": 2.3977, + "step": 229 + }, + { + "epoch": 0.29224904701397714, + "grad_norm": 1.3177872840035507, + "learning_rate": 1.9816143231292496e-05, + "loss": 2.4979, + "step": 230 + }, + { + "epoch": 0.2935196950444727, + "grad_norm": 1.2597614560659143, + "learning_rate": 1.9814208834799446e-05, + "loss": 2.3582, + "step": 231 + }, + { + "epoch": 0.2947903430749682, + "grad_norm": 1.4455310696608084, + "learning_rate": 1.981226441079918e-05, + "loss": 2.3261, + "step": 232 + }, + { + "epoch": 0.2960609911054638, + "grad_norm": 1.1129922300811594, + "learning_rate": 1.9810309961278383e-05, + "loss": 2.0914, + "step": 233 + }, + { + "epoch": 0.29733163913595934, + "grad_norm": 1.4263394114610715, + "learning_rate": 1.980834548823398e-05, + "loss": 2.2741, + "step": 234 + }, + { + "epoch": 0.29860228716645487, + "grad_norm": 1.2498079803290318, + "learning_rate": 1.980637099367314e-05, + "loss": 2.236, + "step": 235 + }, + { + "epoch": 0.29987293519695046, + "grad_norm": 1.348249170128615, + "learning_rate": 1.9804386479613268e-05, + "loss": 2.2804, + "step": 236 + }, + { + "epoch": 0.301143583227446, + "grad_norm": 1.272181875212829, + "learning_rate": 1.9802391948082013e-05, + "loss": 2.3993, + "step": 237 + }, + { + "epoch": 0.30241423125794153, + "grad_norm": 1.4361318089870239, + "learning_rate": 1.9800387401117252e-05, + "loss": 2.1691, + "step": 238 + }, + { + "epoch": 0.3036848792884371, + "grad_norm": 1.3958415978926788, + "learning_rate": 1.9798372840767096e-05, + "loss": 2.481, + "step": 239 + }, + { + "epoch": 0.30495552731893266, + "grad_norm": 1.2566347823253226, + "learning_rate": 1.97963482690899e-05, + "loss": 2.2282, + "step": 240 + }, + { + "epoch": 0.3062261753494282, + "grad_norm": 1.999809093000962, + "learning_rate": 1.9794313688154222e-05, + "loss": 2.0083, + "step": 241 + }, + { + "epoch": 0.30749682337992373, + "grad_norm": 1.3790939937384645, + "learning_rate": 1.979226910003887e-05, + "loss": 2.2375, + "step": 242 + }, + { + "epoch": 0.3087674714104193, + "grad_norm": 1.7157559828108924, + "learning_rate": 1.9790214506832868e-05, + "loss": 2.2433, + "step": 243 + }, + { + "epoch": 0.31003811944091486, + "grad_norm": 1.576223838850493, + "learning_rate": 1.978814991063546e-05, + "loss": 2.4754, + "step": 244 + }, + { + "epoch": 0.3113087674714104, + "grad_norm": 1.3533983164059167, + "learning_rate": 1.9786075313556115e-05, + "loss": 2.2725, + "step": 245 + }, + { + "epoch": 0.312579415501906, + "grad_norm": 1.912049364623592, + "learning_rate": 1.978399071771452e-05, + "loss": 2.2438, + "step": 246 + }, + { + "epoch": 0.3138500635324015, + "grad_norm": 1.3691066714761335, + "learning_rate": 1.9781896125240577e-05, + "loss": 2.3712, + "step": 247 + }, + { + "epoch": 0.31512071156289706, + "grad_norm": 1.591417595217935, + "learning_rate": 1.9779791538274403e-05, + "loss": 2.0608, + "step": 248 + }, + { + "epoch": 0.31639135959339265, + "grad_norm": 1.3332773327402838, + "learning_rate": 1.9777676958966318e-05, + "loss": 2.5189, + "step": 249 + }, + { + "epoch": 0.3176620076238882, + "grad_norm": 1.451125313869464, + "learning_rate": 1.9775552389476865e-05, + "loss": 2.226, + "step": 250 + }, + { + "epoch": 0.3189326556543837, + "grad_norm": 1.6449115746390213, + "learning_rate": 1.9773417831976783e-05, + "loss": 2.4042, + "step": 251 + }, + { + "epoch": 0.3202033036848793, + "grad_norm": 1.4155289951041365, + "learning_rate": 1.977127328864703e-05, + "loss": 2.2907, + "step": 252 + }, + { + "epoch": 0.32147395171537485, + "grad_norm": 1.3395584815709398, + "learning_rate": 1.9769118761678748e-05, + "loss": 2.1805, + "step": 253 + }, + { + "epoch": 0.3227445997458704, + "grad_norm": 1.1847706337861326, + "learning_rate": 1.9766954253273297e-05, + "loss": 1.9415, + "step": 254 + }, + { + "epoch": 0.3240152477763659, + "grad_norm": 1.4562207170493793, + "learning_rate": 1.9764779765642226e-05, + "loss": 2.8472, + "step": 255 + }, + { + "epoch": 0.3252858958068615, + "grad_norm": 1.4537245318712395, + "learning_rate": 1.9762595301007282e-05, + "loss": 2.5208, + "step": 256 + }, + { + "epoch": 0.32655654383735705, + "grad_norm": 1.161757040542941, + "learning_rate": 1.97604008616004e-05, + "loss": 2.1415, + "step": 257 + }, + { + "epoch": 0.3278271918678526, + "grad_norm": 1.1214870383786562, + "learning_rate": 1.9758196449663726e-05, + "loss": 2.1826, + "step": 258 + }, + { + "epoch": 0.3290978398983482, + "grad_norm": 1.2880601651737729, + "learning_rate": 1.9755982067449565e-05, + "loss": 2.3432, + "step": 259 + }, + { + "epoch": 0.3303684879288437, + "grad_norm": 1.503028999152933, + "learning_rate": 1.975375771722044e-05, + "loss": 2.2745, + "step": 260 + }, + { + "epoch": 0.33163913595933925, + "grad_norm": 1.2953888075265865, + "learning_rate": 1.975152340124904e-05, + "loss": 2.23, + "step": 261 + }, + { + "epoch": 0.33290978398983484, + "grad_norm": 1.3411701590648721, + "learning_rate": 1.9749279121818235e-05, + "loss": 2.2582, + "step": 262 + }, + { + "epoch": 0.3341804320203304, + "grad_norm": 1.798210025564672, + "learning_rate": 1.974702488122109e-05, + "loss": 2.4651, + "step": 263 + }, + { + "epoch": 0.3354510800508259, + "grad_norm": 1.549188730172701, + "learning_rate": 1.9744760681760832e-05, + "loss": 2.3012, + "step": 264 + }, + { + "epoch": 0.33672172808132145, + "grad_norm": 1.3376711650759068, + "learning_rate": 1.9742486525750875e-05, + "loss": 2.3175, + "step": 265 + }, + { + "epoch": 0.33799237611181704, + "grad_norm": 1.4373993014473352, + "learning_rate": 1.9740202415514794e-05, + "loss": 2.2695, + "step": 266 + }, + { + "epoch": 0.3392630241423126, + "grad_norm": 1.1679976611001082, + "learning_rate": 1.9737908353386345e-05, + "loss": 2.1141, + "step": 267 + }, + { + "epoch": 0.3405336721728081, + "grad_norm": 1.153202631564055, + "learning_rate": 1.9735604341709448e-05, + "loss": 2.0185, + "step": 268 + }, + { + "epoch": 0.3418043202033037, + "grad_norm": 1.3668597801871027, + "learning_rate": 1.973329038283819e-05, + "loss": 2.5549, + "step": 269 + }, + { + "epoch": 0.34307496823379924, + "grad_norm": 1.217837506597909, + "learning_rate": 1.973096647913682e-05, + "loss": 2.3022, + "step": 270 + }, + { + "epoch": 0.3443456162642948, + "grad_norm": 1.344490125479597, + "learning_rate": 1.9728632632979746e-05, + "loss": 2.3829, + "step": 271 + }, + { + "epoch": 0.34561626429479037, + "grad_norm": 1.0773850473756055, + "learning_rate": 1.9726288846751544e-05, + "loss": 2.0332, + "step": 272 + }, + { + "epoch": 0.3468869123252859, + "grad_norm": 1.1878431921754486, + "learning_rate": 1.972393512284693e-05, + "loss": 2.2342, + "step": 273 + }, + { + "epoch": 0.34815756035578144, + "grad_norm": 4.06183553812225, + "learning_rate": 1.9721571463670794e-05, + "loss": 2.3137, + "step": 274 + }, + { + "epoch": 0.34942820838627703, + "grad_norm": 1.3852941436439452, + "learning_rate": 1.9719197871638154e-05, + "loss": 2.1223, + "step": 275 + }, + { + "epoch": 0.35069885641677256, + "grad_norm": 1.1642263744963426, + "learning_rate": 1.9716814349174193e-05, + "loss": 2.3546, + "step": 276 + }, + { + "epoch": 0.3519695044472681, + "grad_norm": 1.7204368329652955, + "learning_rate": 1.9714420898714243e-05, + "loss": 2.2857, + "step": 277 + }, + { + "epoch": 0.35324015247776364, + "grad_norm": 1.3528935674549563, + "learning_rate": 1.9712017522703764e-05, + "loss": 2.0075, + "step": 278 + }, + { + "epoch": 0.3545108005082592, + "grad_norm": 1.848608080743895, + "learning_rate": 1.970960422359837e-05, + "loss": 2.4993, + "step": 279 + }, + { + "epoch": 0.35578144853875476, + "grad_norm": 1.115912725965341, + "learning_rate": 1.970718100386381e-05, + "loss": 2.2729, + "step": 280 + }, + { + "epoch": 0.3570520965692503, + "grad_norm": 1.7126229343091541, + "learning_rate": 1.9704747865975968e-05, + "loss": 2.0126, + "step": 281 + }, + { + "epoch": 0.3583227445997459, + "grad_norm": 1.2859654653410564, + "learning_rate": 1.9702304812420864e-05, + "loss": 2.0828, + "step": 282 + }, + { + "epoch": 0.3595933926302414, + "grad_norm": 1.2069593223490485, + "learning_rate": 1.9699851845694646e-05, + "loss": 2.2182, + "step": 283 + }, + { + "epoch": 0.36086404066073696, + "grad_norm": 1.244963257392342, + "learning_rate": 1.9697388968303596e-05, + "loss": 2.1136, + "step": 284 + }, + { + "epoch": 0.36213468869123255, + "grad_norm": 1.6361926372534965, + "learning_rate": 1.9694916182764113e-05, + "loss": 2.1869, + "step": 285 + }, + { + "epoch": 0.3634053367217281, + "grad_norm": 1.214743213942469, + "learning_rate": 1.9692433491602732e-05, + "loss": 1.7988, + "step": 286 + }, + { + "epoch": 0.3646759847522236, + "grad_norm": 1.2833630632840507, + "learning_rate": 1.96899408973561e-05, + "loss": 2.3083, + "step": 287 + }, + { + "epoch": 0.36594663278271916, + "grad_norm": 1.1401604797768607, + "learning_rate": 1.9687438402570976e-05, + "loss": 2.2576, + "step": 288 + }, + { + "epoch": 0.36721728081321475, + "grad_norm": 1.2275194123657456, + "learning_rate": 1.9684926009804254e-05, + "loss": 1.9264, + "step": 289 + }, + { + "epoch": 0.3684879288437103, + "grad_norm": 1.136589027712856, + "learning_rate": 1.9682403721622928e-05, + "loss": 2.1373, + "step": 290 + }, + { + "epoch": 0.3697585768742058, + "grad_norm": 1.6453722784067453, + "learning_rate": 1.96798715406041e-05, + "loss": 2.2285, + "step": 291 + }, + { + "epoch": 0.3710292249047014, + "grad_norm": 1.2928052738281843, + "learning_rate": 1.967732946933499e-05, + "loss": 2.311, + "step": 292 + }, + { + "epoch": 0.37229987293519695, + "grad_norm": 1.5312617633714525, + "learning_rate": 1.9674777510412913e-05, + "loss": 2.0747, + "step": 293 + }, + { + "epoch": 0.3735705209656925, + "grad_norm": 1.1884554795098419, + "learning_rate": 1.9672215666445295e-05, + "loss": 2.2441, + "step": 294 + }, + { + "epoch": 0.3748411689961881, + "grad_norm": 1.3471883018180635, + "learning_rate": 1.9669643940049657e-05, + "loss": 2.2793, + "step": 295 + }, + { + "epoch": 0.3761118170266836, + "grad_norm": 1.6844014779879697, + "learning_rate": 1.9667062333853618e-05, + "loss": 2.14, + "step": 296 + }, + { + "epoch": 0.37738246505717915, + "grad_norm": 1.4151511187076775, + "learning_rate": 1.966447085049489e-05, + "loss": 2.3913, + "step": 297 + }, + { + "epoch": 0.3786531130876747, + "grad_norm": 1.7966731737314259, + "learning_rate": 1.966186949262128e-05, + "loss": 2.5553, + "step": 298 + }, + { + "epoch": 0.3799237611181703, + "grad_norm": 1.195568621844938, + "learning_rate": 1.9659258262890683e-05, + "loss": 2.2099, + "step": 299 + }, + { + "epoch": 0.3811944091486658, + "grad_norm": 1.1219471693644816, + "learning_rate": 1.9656637163971083e-05, + "loss": 2.0938, + "step": 300 + }, + { + "epoch": 0.38246505717916135, + "grad_norm": 1.7171705514194913, + "learning_rate": 1.9654006198540543e-05, + "loss": 2.2291, + "step": 301 + }, + { + "epoch": 0.38373570520965694, + "grad_norm": 2.1299812627833705, + "learning_rate": 1.9651365369287206e-05, + "loss": 2.2524, + "step": 302 + }, + { + "epoch": 0.3850063532401525, + "grad_norm": 1.2578418042723456, + "learning_rate": 1.9648714678909296e-05, + "loss": 2.1554, + "step": 303 + }, + { + "epoch": 0.386277001270648, + "grad_norm": 1.639835412483817, + "learning_rate": 1.964605413011512e-05, + "loss": 2.1907, + "step": 304 + }, + { + "epoch": 0.3875476493011436, + "grad_norm": 1.4296870500572354, + "learning_rate": 1.9643383725623042e-05, + "loss": 2.4861, + "step": 305 + }, + { + "epoch": 0.38881829733163914, + "grad_norm": 1.5950024668148066, + "learning_rate": 1.9640703468161508e-05, + "loss": 2.3117, + "step": 306 + }, + { + "epoch": 0.3900889453621347, + "grad_norm": 1.2586676337646305, + "learning_rate": 1.9638013360469026e-05, + "loss": 2.1, + "step": 307 + }, + { + "epoch": 0.39135959339263027, + "grad_norm": 1.3155012674354023, + "learning_rate": 1.963531340529417e-05, + "loss": 2.3952, + "step": 308 + }, + { + "epoch": 0.3926302414231258, + "grad_norm": 1.6277796635877058, + "learning_rate": 1.9632603605395576e-05, + "loss": 2.2169, + "step": 309 + }, + { + "epoch": 0.39390088945362134, + "grad_norm": 1.406180736008549, + "learning_rate": 1.9629883963541933e-05, + "loss": 2.2009, + "step": 310 + }, + { + "epoch": 0.3951715374841169, + "grad_norm": 1.3310192257266478, + "learning_rate": 1.9627154482511995e-05, + "loss": 2.1501, + "step": 311 + }, + { + "epoch": 0.39644218551461247, + "grad_norm": 1.3757223525941409, + "learning_rate": 1.9624415165094567e-05, + "loss": 2.1629, + "step": 312 + }, + { + "epoch": 0.397712833545108, + "grad_norm": 1.572533153332573, + "learning_rate": 1.9621666014088495e-05, + "loss": 2.314, + "step": 313 + }, + { + "epoch": 0.39898348157560354, + "grad_norm": 1.3862969039502098, + "learning_rate": 1.9618907032302684e-05, + "loss": 2.3253, + "step": 314 + }, + { + "epoch": 0.40025412960609913, + "grad_norm": 1.8202182825860844, + "learning_rate": 1.9616138222556075e-05, + "loss": 2.339, + "step": 315 + }, + { + "epoch": 0.40152477763659467, + "grad_norm": 1.3790668607780714, + "learning_rate": 1.9613359587677658e-05, + "loss": 2.2941, + "step": 316 + }, + { + "epoch": 0.4027954256670902, + "grad_norm": 2.1227907347388104, + "learning_rate": 1.961057113050645e-05, + "loss": 2.3847, + "step": 317 + }, + { + "epoch": 0.4040660736975858, + "grad_norm": 1.6782062198815728, + "learning_rate": 1.9607772853891528e-05, + "loss": 2.4474, + "step": 318 + }, + { + "epoch": 0.40533672172808133, + "grad_norm": 1.1299388068648677, + "learning_rate": 1.9604964760691966e-05, + "loss": 2.2332, + "step": 319 + }, + { + "epoch": 0.40660736975857686, + "grad_norm": 1.356462757817847, + "learning_rate": 1.9602146853776894e-05, + "loss": 2.0578, + "step": 320 + }, + { + "epoch": 0.4078780177890724, + "grad_norm": 1.6129815716777167, + "learning_rate": 1.959931913602547e-05, + "loss": 2.1875, + "step": 321 + }, + { + "epoch": 0.409148665819568, + "grad_norm": 1.5767888873450564, + "learning_rate": 1.959648161032686e-05, + "loss": 2.2612, + "step": 322 + }, + { + "epoch": 0.4104193138500635, + "grad_norm": 2.185303482734268, + "learning_rate": 1.9593634279580258e-05, + "loss": 2.3963, + "step": 323 + }, + { + "epoch": 0.41168996188055906, + "grad_norm": 1.7641635525451629, + "learning_rate": 1.9590777146694888e-05, + "loss": 1.9981, + "step": 324 + }, + { + "epoch": 0.41296060991105465, + "grad_norm": 1.4282615623822061, + "learning_rate": 1.9587910214589966e-05, + "loss": 2.1085, + "step": 325 + }, + { + "epoch": 0.4142312579415502, + "grad_norm": 1.5919068774694538, + "learning_rate": 1.958503348619474e-05, + "loss": 2.2317, + "step": 326 + }, + { + "epoch": 0.4155019059720457, + "grad_norm": 1.550564026935414, + "learning_rate": 1.9582146964448457e-05, + "loss": 2.247, + "step": 327 + }, + { + "epoch": 0.4167725540025413, + "grad_norm": 1.2251036177676735, + "learning_rate": 1.957925065230038e-05, + "loss": 2.0521, + "step": 328 + }, + { + "epoch": 0.41804320203303685, + "grad_norm": 1.6011207580960447, + "learning_rate": 1.9576344552709762e-05, + "loss": 2.2269, + "step": 329 + }, + { + "epoch": 0.4193138500635324, + "grad_norm": 2.2905735549438457, + "learning_rate": 1.9573428668645865e-05, + "loss": 1.9699, + "step": 330 + }, + { + "epoch": 0.420584498094028, + "grad_norm": 1.2596706172279368, + "learning_rate": 1.9570503003087947e-05, + "loss": 1.986, + "step": 331 + }, + { + "epoch": 0.4218551461245235, + "grad_norm": 1.3420938625327155, + "learning_rate": 1.9567567559025257e-05, + "loss": 2.1155, + "step": 332 + }, + { + "epoch": 0.42312579415501905, + "grad_norm": 1.0397203955504706, + "learning_rate": 1.956462233945703e-05, + "loss": 1.9222, + "step": 333 + }, + { + "epoch": 0.4243964421855146, + "grad_norm": 1.3942914467388836, + "learning_rate": 1.956166734739251e-05, + "loss": 2.2585, + "step": 334 + }, + { + "epoch": 0.4256670902160102, + "grad_norm": 1.636497018081038, + "learning_rate": 1.9558702585850902e-05, + "loss": 2.2782, + "step": 335 + }, + { + "epoch": 0.4269377382465057, + "grad_norm": 1.1133171000095392, + "learning_rate": 1.955572805786141e-05, + "loss": 2.1721, + "step": 336 + }, + { + "epoch": 0.42820838627700125, + "grad_norm": 1.2097521097251087, + "learning_rate": 1.95527437664632e-05, + "loss": 2.1854, + "step": 337 + }, + { + "epoch": 0.42947903430749684, + "grad_norm": 1.2620840723571427, + "learning_rate": 1.954974971470543e-05, + "loss": 2.2633, + "step": 338 + }, + { + "epoch": 0.4307496823379924, + "grad_norm": 1.3328233261636864, + "learning_rate": 1.954674590564722e-05, + "loss": 2.2392, + "step": 339 + }, + { + "epoch": 0.4320203303684879, + "grad_norm": 1.1554243062365197, + "learning_rate": 1.9543732342357664e-05, + "loss": 2.1155, + "step": 340 + }, + { + "epoch": 0.4332909783989835, + "grad_norm": 1.0601647426096334, + "learning_rate": 1.954070902791582e-05, + "loss": 2.1215, + "step": 341 + }, + { + "epoch": 0.43456162642947904, + "grad_norm": 1.3082564186519354, + "learning_rate": 1.953767596541071e-05, + "loss": 2.2465, + "step": 342 + }, + { + "epoch": 0.4358322744599746, + "grad_norm": 1.1678820791024087, + "learning_rate": 1.9534633157941315e-05, + "loss": 2.5573, + "step": 343 + }, + { + "epoch": 0.4371029224904701, + "grad_norm": 1.0375732983429344, + "learning_rate": 1.9531580608616578e-05, + "loss": 1.8765, + "step": 344 + }, + { + "epoch": 0.4383735705209657, + "grad_norm": 1.0354698112304326, + "learning_rate": 1.952851832055539e-05, + "loss": 2.2035, + "step": 345 + }, + { + "epoch": 0.43964421855146124, + "grad_norm": 1.3397961801605331, + "learning_rate": 1.9525446296886593e-05, + "loss": 2.3499, + "step": 346 + }, + { + "epoch": 0.4409148665819568, + "grad_norm": 2.4318786956134764, + "learning_rate": 1.952236454074897e-05, + "loss": 2.0937, + "step": 347 + }, + { + "epoch": 0.44218551461245237, + "grad_norm": 1.3384117168999674, + "learning_rate": 1.9519273055291266e-05, + "loss": 2.0813, + "step": 348 + }, + { + "epoch": 0.4434561626429479, + "grad_norm": 1.3106293831176954, + "learning_rate": 1.9516171843672153e-05, + "loss": 2.1467, + "step": 349 + }, + { + "epoch": 0.44472681067344344, + "grad_norm": 1.5449558564457215, + "learning_rate": 1.9513060909060237e-05, + "loss": 2.6179, + "step": 350 + }, + { + "epoch": 0.44599745870393903, + "grad_norm": 4.436046259180675, + "learning_rate": 1.950994025463407e-05, + "loss": 2.4073, + "step": 351 + }, + { + "epoch": 0.44726810673443457, + "grad_norm": 1.327079328183306, + "learning_rate": 1.9506809883582126e-05, + "loss": 2.0597, + "step": 352 + }, + { + "epoch": 0.4485387547649301, + "grad_norm": 1.6916627624113527, + "learning_rate": 1.9503669799102815e-05, + "loss": 2.2454, + "step": 353 + }, + { + "epoch": 0.4498094027954257, + "grad_norm": 1.516462557151638, + "learning_rate": 1.9500520004404458e-05, + "loss": 2.4039, + "step": 354 + }, + { + "epoch": 0.45108005082592123, + "grad_norm": 1.595275957141959, + "learning_rate": 1.949736050270532e-05, + "loss": 1.9971, + "step": 355 + }, + { + "epoch": 0.45235069885641677, + "grad_norm": 1.5893741248692173, + "learning_rate": 1.949419129723356e-05, + "loss": 2.3408, + "step": 356 + }, + { + "epoch": 0.4536213468869123, + "grad_norm": 2.5440159135576557, + "learning_rate": 1.9491012391227266e-05, + "loss": 2.3986, + "step": 357 + }, + { + "epoch": 0.4548919949174079, + "grad_norm": 1.934010697927457, + "learning_rate": 1.948782378793443e-05, + "loss": 2.2534, + "step": 358 + }, + { + "epoch": 0.45616264294790343, + "grad_norm": 1.5206954937016697, + "learning_rate": 1.9484625490612957e-05, + "loss": 2.1512, + "step": 359 + }, + { + "epoch": 0.45743329097839897, + "grad_norm": 1.2772650225420106, + "learning_rate": 1.9481417502530654e-05, + "loss": 2.0985, + "step": 360 + }, + { + "epoch": 0.45870393900889456, + "grad_norm": 1.7400209934793354, + "learning_rate": 1.9478199826965232e-05, + "loss": 2.1081, + "step": 361 + }, + { + "epoch": 0.4599745870393901, + "grad_norm": 1.3556440359592625, + "learning_rate": 1.9474972467204298e-05, + "loss": 1.8941, + "step": 362 + }, + { + "epoch": 0.46124523506988563, + "grad_norm": 1.3453307109148631, + "learning_rate": 1.9471735426545356e-05, + "loss": 2.1768, + "step": 363 + }, + { + "epoch": 0.4625158831003812, + "grad_norm": 1.1858948028790681, + "learning_rate": 1.9468488708295793e-05, + "loss": 2.2965, + "step": 364 + }, + { + "epoch": 0.46378653113087676, + "grad_norm": 1.4003361314968354, + "learning_rate": 1.9465232315772896e-05, + "loss": 2.0454, + "step": 365 + }, + { + "epoch": 0.4650571791613723, + "grad_norm": 1.720112687089305, + "learning_rate": 1.9461966252303825e-05, + "loss": 2.0443, + "step": 366 + }, + { + "epoch": 0.4663278271918678, + "grad_norm": 1.3109306255966038, + "learning_rate": 1.9458690521225634e-05, + "loss": 2.2423, + "step": 367 + }, + { + "epoch": 0.4675984752223634, + "grad_norm": 1.2095315060799088, + "learning_rate": 1.9455405125885244e-05, + "loss": 1.9604, + "step": 368 + }, + { + "epoch": 0.46886912325285895, + "grad_norm": 1.3841358768722092, + "learning_rate": 1.945211006963945e-05, + "loss": 2.3195, + "step": 369 + }, + { + "epoch": 0.4701397712833545, + "grad_norm": 1.5744617183370688, + "learning_rate": 1.9448805355854932e-05, + "loss": 2.2957, + "step": 370 + }, + { + "epoch": 0.4714104193138501, + "grad_norm": 1.6922764749008226, + "learning_rate": 1.944549098790822e-05, + "loss": 2.4135, + "step": 371 + }, + { + "epoch": 0.4726810673443456, + "grad_norm": 4.102432396640123, + "learning_rate": 1.9442166969185715e-05, + "loss": 2.2834, + "step": 372 + }, + { + "epoch": 0.47395171537484115, + "grad_norm": 1.4905796877370419, + "learning_rate": 1.9438833303083677e-05, + "loss": 2.1638, + "step": 373 + }, + { + "epoch": 0.47522236340533675, + "grad_norm": 1.450353779580371, + "learning_rate": 1.943548999300823e-05, + "loss": 2.3498, + "step": 374 + }, + { + "epoch": 0.4764930114358323, + "grad_norm": 3.7843721284254297, + "learning_rate": 1.9432137042375345e-05, + "loss": 2.0612, + "step": 375 + }, + { + "epoch": 0.4777636594663278, + "grad_norm": 2.7285211777033096, + "learning_rate": 1.9428774454610845e-05, + "loss": 2.2545, + "step": 376 + }, + { + "epoch": 0.47903430749682335, + "grad_norm": 1.4297449124991255, + "learning_rate": 1.9425402233150394e-05, + "loss": 2.0982, + "step": 377 + }, + { + "epoch": 0.48030495552731894, + "grad_norm": 1.3657540433108728, + "learning_rate": 1.942202038143951e-05, + "loss": 1.9351, + "step": 378 + }, + { + "epoch": 0.4815756035578145, + "grad_norm": 1.8325177673920885, + "learning_rate": 1.941862890293354e-05, + "loss": 2.245, + "step": 379 + }, + { + "epoch": 0.48284625158831, + "grad_norm": 1.4657268997197, + "learning_rate": 1.9415227801097677e-05, + "loss": 2.0714, + "step": 380 + }, + { + "epoch": 0.4841168996188056, + "grad_norm": 1.427569121813906, + "learning_rate": 1.9411817079406936e-05, + "loss": 1.9048, + "step": 381 + }, + { + "epoch": 0.48538754764930114, + "grad_norm": 1.5640321858304782, + "learning_rate": 1.9408396741346167e-05, + "loss": 2.3833, + "step": 382 + }, + { + "epoch": 0.4866581956797967, + "grad_norm": 1.2857820843328875, + "learning_rate": 1.9404966790410047e-05, + "loss": 2.2031, + "step": 383 + }, + { + "epoch": 0.48792884371029227, + "grad_norm": 1.141659221043011, + "learning_rate": 1.940152723010307e-05, + "loss": 2.247, + "step": 384 + }, + { + "epoch": 0.4891994917407878, + "grad_norm": 1.5323549274637351, + "learning_rate": 1.9398078063939552e-05, + "loss": 2.1866, + "step": 385 + }, + { + "epoch": 0.49047013977128334, + "grad_norm": 1.3996052498405696, + "learning_rate": 1.9394619295443622e-05, + "loss": 2.3547, + "step": 386 + }, + { + "epoch": 0.49174078780177893, + "grad_norm": 1.19341579288055, + "learning_rate": 1.9391150928149218e-05, + "loss": 2.164, + "step": 387 + }, + { + "epoch": 0.49301143583227447, + "grad_norm": 1.5852734903111279, + "learning_rate": 1.9387672965600088e-05, + "loss": 2.1857, + "step": 388 + }, + { + "epoch": 0.49428208386277, + "grad_norm": 1.3587470658011351, + "learning_rate": 1.9384185411349786e-05, + "loss": 2.1427, + "step": 389 + }, + { + "epoch": 0.49555273189326554, + "grad_norm": 1.3204033096459802, + "learning_rate": 1.938068826896166e-05, + "loss": 2.2107, + "step": 390 + }, + { + "epoch": 0.49682337992376113, + "grad_norm": 1.3063591306506408, + "learning_rate": 1.937718154200886e-05, + "loss": 2.3279, + "step": 391 + }, + { + "epoch": 0.49809402795425667, + "grad_norm": 2.3102205889364438, + "learning_rate": 1.9373665234074328e-05, + "loss": 2.2933, + "step": 392 + }, + { + "epoch": 0.4993646759847522, + "grad_norm": 1.2926523346468037, + "learning_rate": 1.937013934875079e-05, + "loss": 2.4271, + "step": 393 + }, + { + "epoch": 0.5006353240152478, + "grad_norm": 1.7975874404286474, + "learning_rate": 1.9366603889640765e-05, + "loss": 2.1837, + "step": 394 + }, + { + "epoch": 0.5019059720457433, + "grad_norm": 1.4749865365496588, + "learning_rate": 1.9363058860356548e-05, + "loss": 2.2177, + "step": 395 + }, + { + "epoch": 0.5031766200762389, + "grad_norm": 1.1259560135112834, + "learning_rate": 1.9359504264520218e-05, + "loss": 2.0392, + "step": 396 + }, + { + "epoch": 0.5044472681067345, + "grad_norm": 1.3569865041821507, + "learning_rate": 1.9355940105763622e-05, + "loss": 2.4177, + "step": 397 + }, + { + "epoch": 0.5057179161372299, + "grad_norm": 1.2836527221719194, + "learning_rate": 1.9352366387728385e-05, + "loss": 2.0478, + "step": 398 + }, + { + "epoch": 0.5069885641677255, + "grad_norm": 1.9600150344140488, + "learning_rate": 1.934878311406589e-05, + "loss": 2.2771, + "step": 399 + }, + { + "epoch": 0.5082592121982211, + "grad_norm": 1.3253015736567784, + "learning_rate": 1.9345190288437292e-05, + "loss": 2.4092, + "step": 400 + }, + { + "epoch": 0.5095298602287166, + "grad_norm": 1.2347627921797202, + "learning_rate": 1.9341587914513496e-05, + "loss": 2.1296, + "step": 401 + }, + { + "epoch": 0.5108005082592122, + "grad_norm": 1.135211318424933, + "learning_rate": 1.933797599597518e-05, + "loss": 2.2406, + "step": 402 + }, + { + "epoch": 0.5120711562897078, + "grad_norm": 1.1498013355042722, + "learning_rate": 1.9334354536512746e-05, + "loss": 2.1021, + "step": 403 + }, + { + "epoch": 0.5133418043202033, + "grad_norm": 1.2455423054908155, + "learning_rate": 1.9330723539826373e-05, + "loss": 2.1064, + "step": 404 + }, + { + "epoch": 0.5146124523506989, + "grad_norm": 1.2931710810726527, + "learning_rate": 1.9327083009625974e-05, + "loss": 2.107, + "step": 405 + }, + { + "epoch": 0.5158831003811944, + "grad_norm": 1.282732123781329, + "learning_rate": 1.9323432949631195e-05, + "loss": 2.0613, + "step": 406 + }, + { + "epoch": 0.5171537484116899, + "grad_norm": 1.2908466078690843, + "learning_rate": 1.9319773363571424e-05, + "loss": 2.0705, + "step": 407 + }, + { + "epoch": 0.5184243964421855, + "grad_norm": 1.3060341367429515, + "learning_rate": 1.931610425518579e-05, + "loss": 2.0751, + "step": 408 + }, + { + "epoch": 0.5196950444726811, + "grad_norm": 1.59682319598207, + "learning_rate": 1.9312425628223134e-05, + "loss": 2.3488, + "step": 409 + }, + { + "epoch": 0.5209656925031766, + "grad_norm": 1.246031551721595, + "learning_rate": 1.9308737486442045e-05, + "loss": 2.2637, + "step": 410 + }, + { + "epoch": 0.5222363405336722, + "grad_norm": 1.0094782030605576, + "learning_rate": 1.930503983361081e-05, + "loss": 1.876, + "step": 411 + }, + { + "epoch": 0.5235069885641678, + "grad_norm": 2.221169423253986, + "learning_rate": 1.930133267350746e-05, + "loss": 2.3233, + "step": 412 + }, + { + "epoch": 0.5247776365946633, + "grad_norm": 2.190877335011136, + "learning_rate": 1.9297616009919708e-05, + "loss": 2.2177, + "step": 413 + }, + { + "epoch": 0.5260482846251588, + "grad_norm": 1.3699043310370755, + "learning_rate": 1.9293889846645008e-05, + "loss": 2.1351, + "step": 414 + }, + { + "epoch": 0.5273189326556544, + "grad_norm": 1.4624860775913118, + "learning_rate": 1.9290154187490497e-05, + "loss": 2.411, + "step": 415 + }, + { + "epoch": 0.5285895806861499, + "grad_norm": 1.6401976862702454, + "learning_rate": 1.9286409036273027e-05, + "loss": 2.1643, + "step": 416 + }, + { + "epoch": 0.5298602287166455, + "grad_norm": 1.5842195123976015, + "learning_rate": 1.9282654396819145e-05, + "loss": 2.67, + "step": 417 + }, + { + "epoch": 0.531130876747141, + "grad_norm": 1.6335296017089804, + "learning_rate": 1.9278890272965097e-05, + "loss": 2.2863, + "step": 418 + }, + { + "epoch": 0.5324015247776366, + "grad_norm": 1.6531683834499145, + "learning_rate": 1.9275116668556805e-05, + "loss": 2.1697, + "step": 419 + }, + { + "epoch": 0.5336721728081322, + "grad_norm": 1.2631900846528858, + "learning_rate": 1.9271333587449895e-05, + "loss": 2.2538, + "step": 420 + }, + { + "epoch": 0.5349428208386277, + "grad_norm": 1.6490841748768568, + "learning_rate": 1.9267541033509667e-05, + "loss": 2.1101, + "step": 421 + }, + { + "epoch": 0.5362134688691232, + "grad_norm": 2.320810299489566, + "learning_rate": 1.92637390106111e-05, + "loss": 2.2869, + "step": 422 + }, + { + "epoch": 0.5374841168996188, + "grad_norm": 1.286261486771375, + "learning_rate": 1.925992752263885e-05, + "loss": 2.2688, + "step": 423 + }, + { + "epoch": 0.5387547649301143, + "grad_norm": 1.5581878124813053, + "learning_rate": 1.9256106573487238e-05, + "loss": 2.2076, + "step": 424 + }, + { + "epoch": 0.5400254129606099, + "grad_norm": 1.7249893993080114, + "learning_rate": 1.925227616706026e-05, + "loss": 2.2496, + "step": 425 + }, + { + "epoch": 0.5412960609911055, + "grad_norm": 1.6256150858689495, + "learning_rate": 1.924843630727157e-05, + "loss": 2.1455, + "step": 426 + }, + { + "epoch": 0.542566709021601, + "grad_norm": 1.1965038217009443, + "learning_rate": 1.9244586998044485e-05, + "loss": 2.0354, + "step": 427 + }, + { + "epoch": 0.5438373570520966, + "grad_norm": 1.4418630674402975, + "learning_rate": 1.924072824331197e-05, + "loss": 2.1294, + "step": 428 + }, + { + "epoch": 0.5451080050825922, + "grad_norm": 1.2574351344869776, + "learning_rate": 1.9236860047016647e-05, + "loss": 2.1624, + "step": 429 + }, + { + "epoch": 0.5463786531130876, + "grad_norm": 1.3978680736362221, + "learning_rate": 1.923298241311078e-05, + "loss": 2.2992, + "step": 430 + }, + { + "epoch": 0.5476493011435832, + "grad_norm": 1.2755866211804594, + "learning_rate": 1.9229095345556278e-05, + "loss": 2.2508, + "step": 431 + }, + { + "epoch": 0.5489199491740788, + "grad_norm": 1.314894688674993, + "learning_rate": 1.9225198848324687e-05, + "loss": 2.0979, + "step": 432 + }, + { + "epoch": 0.5501905972045743, + "grad_norm": 1.5314349817624966, + "learning_rate": 1.9221292925397196e-05, + "loss": 1.8943, + "step": 433 + }, + { + "epoch": 0.5514612452350699, + "grad_norm": 1.1921961065015845, + "learning_rate": 1.921737758076461e-05, + "loss": 2.1535, + "step": 434 + }, + { + "epoch": 0.5527318932655655, + "grad_norm": 1.186044614997674, + "learning_rate": 1.9213452818427374e-05, + "loss": 2.1741, + "step": 435 + }, + { + "epoch": 0.554002541296061, + "grad_norm": 1.5267477608246511, + "learning_rate": 1.920951864239555e-05, + "loss": 2.2164, + "step": 436 + }, + { + "epoch": 0.5552731893265566, + "grad_norm": 1.3936059207900588, + "learning_rate": 1.920557505668881e-05, + "loss": 2.1367, + "step": 437 + }, + { + "epoch": 0.5565438373570522, + "grad_norm": 1.3363717534452426, + "learning_rate": 1.9201622065336455e-05, + "loss": 2.0526, + "step": 438 + }, + { + "epoch": 0.5578144853875476, + "grad_norm": 1.373993468638361, + "learning_rate": 1.9197659672377388e-05, + "loss": 2.1925, + "step": 439 + }, + { + "epoch": 0.5590851334180432, + "grad_norm": 1.1534456508897548, + "learning_rate": 1.919368788186012e-05, + "loss": 2.2284, + "step": 440 + }, + { + "epoch": 0.5603557814485387, + "grad_norm": 1.2512328467790448, + "learning_rate": 1.918970669784276e-05, + "loss": 2.2891, + "step": 441 + }, + { + "epoch": 0.5616264294790343, + "grad_norm": 1.3611375165189201, + "learning_rate": 1.918571612439302e-05, + "loss": 2.2749, + "step": 442 + }, + { + "epoch": 0.5628970775095299, + "grad_norm": 1.165275113048963, + "learning_rate": 1.9181716165588206e-05, + "loss": 2.0572, + "step": 443 + }, + { + "epoch": 0.5641677255400254, + "grad_norm": 1.3439653647129153, + "learning_rate": 1.9177706825515204e-05, + "loss": 2.0461, + "step": 444 + }, + { + "epoch": 0.565438373570521, + "grad_norm": 1.156674833687087, + "learning_rate": 1.9173688108270495e-05, + "loss": 2.1879, + "step": 445 + }, + { + "epoch": 0.5667090216010165, + "grad_norm": 1.1998871634588988, + "learning_rate": 1.9169660017960135e-05, + "loss": 2.1873, + "step": 446 + }, + { + "epoch": 0.567979669631512, + "grad_norm": 1.1882908352021604, + "learning_rate": 1.9165622558699763e-05, + "loss": 2.3475, + "step": 447 + }, + { + "epoch": 0.5692503176620076, + "grad_norm": 1.27008784722015, + "learning_rate": 1.9161575734614587e-05, + "loss": 2.4298, + "step": 448 + }, + { + "epoch": 0.5705209656925032, + "grad_norm": 1.119449905986438, + "learning_rate": 1.915751954983938e-05, + "loss": 2.1871, + "step": 449 + }, + { + "epoch": 0.5717916137229987, + "grad_norm": 1.3424406440424546, + "learning_rate": 1.915345400851848e-05, + "loss": 2.4036, + "step": 450 + }, + { + "epoch": 0.5730622617534943, + "grad_norm": 1.4009748452912008, + "learning_rate": 1.9149379114805798e-05, + "loss": 2.5499, + "step": 451 + }, + { + "epoch": 0.5743329097839899, + "grad_norm": 1.2584770296153747, + "learning_rate": 1.914529487286478e-05, + "loss": 2.192, + "step": 452 + }, + { + "epoch": 0.5756035578144854, + "grad_norm": 1.447563760715695, + "learning_rate": 1.9141201286868435e-05, + "loss": 2.2728, + "step": 453 + }, + { + "epoch": 0.576874205844981, + "grad_norm": 1.1271867759241965, + "learning_rate": 1.913709836099932e-05, + "loss": 2.2126, + "step": 454 + }, + { + "epoch": 0.5781448538754765, + "grad_norm": 1.2495009606349796, + "learning_rate": 1.9132986099449535e-05, + "loss": 2.1981, + "step": 455 + }, + { + "epoch": 0.579415501905972, + "grad_norm": 1.2699559949583559, + "learning_rate": 1.912886450642071e-05, + "loss": 2.2818, + "step": 456 + }, + { + "epoch": 0.5806861499364676, + "grad_norm": 1.1079978001775894, + "learning_rate": 1.9124733586124015e-05, + "loss": 2.0022, + "step": 457 + }, + { + "epoch": 0.5819567979669632, + "grad_norm": 1.2644461116241026, + "learning_rate": 1.9120593342780158e-05, + "loss": 2.4457, + "step": 458 + }, + { + "epoch": 0.5832274459974587, + "grad_norm": 1.148860883716844, + "learning_rate": 1.9116443780619357e-05, + "loss": 2.0796, + "step": 459 + }, + { + "epoch": 0.5844980940279543, + "grad_norm": 1.1412514808049359, + "learning_rate": 1.911228490388136e-05, + "loss": 2.3325, + "step": 460 + }, + { + "epoch": 0.5857687420584498, + "grad_norm": 1.1387216820020802, + "learning_rate": 1.9108116716815433e-05, + "loss": 2.3263, + "step": 461 + }, + { + "epoch": 0.5870393900889453, + "grad_norm": 1.2897407598165023, + "learning_rate": 1.9103939223680353e-05, + "loss": 2.4233, + "step": 462 + }, + { + "epoch": 0.5883100381194409, + "grad_norm": 1.2087089474395942, + "learning_rate": 1.9099752428744407e-05, + "loss": 2.1524, + "step": 463 + }, + { + "epoch": 0.5895806861499364, + "grad_norm": 1.2164503356307568, + "learning_rate": 1.9095556336285382e-05, + "loss": 2.0391, + "step": 464 + }, + { + "epoch": 0.590851334180432, + "grad_norm": 1.196760865478763, + "learning_rate": 1.9091350950590563e-05, + "loss": 2.2708, + "step": 465 + }, + { + "epoch": 0.5921219822109276, + "grad_norm": 1.2313846896394445, + "learning_rate": 1.9087136275956745e-05, + "loss": 2.2378, + "step": 466 + }, + { + "epoch": 0.5933926302414231, + "grad_norm": 1.2373291069034087, + "learning_rate": 1.908291231669019e-05, + "loss": 2.3246, + "step": 467 + }, + { + "epoch": 0.5946632782719187, + "grad_norm": 1.1829656057092561, + "learning_rate": 1.9078679077106666e-05, + "loss": 2.1872, + "step": 468 + }, + { + "epoch": 0.5959339263024143, + "grad_norm": 1.4347451249915018, + "learning_rate": 1.907443656153142e-05, + "loss": 2.6066, + "step": 469 + }, + { + "epoch": 0.5972045743329097, + "grad_norm": 1.0984143438577232, + "learning_rate": 1.9070184774299162e-05, + "loss": 2.0474, + "step": 470 + }, + { + "epoch": 0.5984752223634053, + "grad_norm": 1.1850906238628265, + "learning_rate": 1.9065923719754097e-05, + "loss": 1.9647, + "step": 471 + }, + { + "epoch": 0.5997458703939009, + "grad_norm": 1.3926199263292411, + "learning_rate": 1.906165340224988e-05, + "loss": 2.4496, + "step": 472 + }, + { + "epoch": 0.6010165184243964, + "grad_norm": 1.2275438489433488, + "learning_rate": 1.9057373826149642e-05, + "loss": 2.1911, + "step": 473 + }, + { + "epoch": 0.602287166454892, + "grad_norm": 1.33064305688056, + "learning_rate": 1.905308499582597e-05, + "loss": 2.3691, + "step": 474 + }, + { + "epoch": 0.6035578144853876, + "grad_norm": 1.0882172714073037, + "learning_rate": 1.9048786915660903e-05, + "loss": 2.1702, + "step": 475 + }, + { + "epoch": 0.6048284625158831, + "grad_norm": 1.1379935541068376, + "learning_rate": 1.9044479590045936e-05, + "loss": 1.9241, + "step": 476 + }, + { + "epoch": 0.6060991105463787, + "grad_norm": 1.2631080747027876, + "learning_rate": 1.904016302338201e-05, + "loss": 2.3022, + "step": 477 + }, + { + "epoch": 0.6073697585768743, + "grad_norm": 1.3033474316729674, + "learning_rate": 1.90358372200795e-05, + "loss": 2.1681, + "step": 478 + }, + { + "epoch": 0.6086404066073697, + "grad_norm": 1.3374719992920634, + "learning_rate": 1.9031502184558235e-05, + "loss": 2.3015, + "step": 479 + }, + { + "epoch": 0.6099110546378653, + "grad_norm": 1.25539896962734, + "learning_rate": 1.902715792124746e-05, + "loss": 2.2975, + "step": 480 + }, + { + "epoch": 0.6111817026683609, + "grad_norm": 1.1954446631182531, + "learning_rate": 1.9022804434585854e-05, + "loss": 2.0728, + "step": 481 + }, + { + "epoch": 0.6124523506988564, + "grad_norm": 1.205447491528233, + "learning_rate": 1.9018441729021525e-05, + "loss": 2.2925, + "step": 482 + }, + { + "epoch": 0.613722998729352, + "grad_norm": 1.141922162715847, + "learning_rate": 1.901406980901199e-05, + "loss": 2.1577, + "step": 483 + }, + { + "epoch": 0.6149936467598475, + "grad_norm": 1.0941168595388158, + "learning_rate": 1.900968867902419e-05, + "loss": 2.301, + "step": 484 + }, + { + "epoch": 0.6162642947903431, + "grad_norm": 1.1515497727617856, + "learning_rate": 1.900529834353448e-05, + "loss": 2.1877, + "step": 485 + }, + { + "epoch": 0.6175349428208387, + "grad_norm": 1.080151589093454, + "learning_rate": 1.9000898807028602e-05, + "loss": 2.128, + "step": 486 + }, + { + "epoch": 0.6188055908513341, + "grad_norm": 0.9583578019928071, + "learning_rate": 1.8996490074001714e-05, + "loss": 1.8186, + "step": 487 + }, + { + "epoch": 0.6200762388818297, + "grad_norm": 1.209860122515118, + "learning_rate": 1.8992072148958368e-05, + "loss": 2.322, + "step": 488 + }, + { + "epoch": 0.6213468869123253, + "grad_norm": 1.0923359800130836, + "learning_rate": 1.898764503641251e-05, + "loss": 2.2531, + "step": 489 + }, + { + "epoch": 0.6226175349428208, + "grad_norm": 1.31837882949895, + "learning_rate": 1.8983208740887464e-05, + "loss": 2.3352, + "step": 490 + }, + { + "epoch": 0.6238881829733164, + "grad_norm": 1.2597603066413212, + "learning_rate": 1.8978763266915942e-05, + "loss": 2.5245, + "step": 491 + }, + { + "epoch": 0.625158831003812, + "grad_norm": 1.3715703169530913, + "learning_rate": 1.897430861904004e-05, + "loss": 2.1769, + "step": 492 + }, + { + "epoch": 0.6264294790343075, + "grad_norm": 1.5225437610970938, + "learning_rate": 1.8969844801811216e-05, + "loss": 2.141, + "step": 493 + }, + { + "epoch": 0.627700127064803, + "grad_norm": 1.3735055288768747, + "learning_rate": 1.8965371819790305e-05, + "loss": 2.0583, + "step": 494 + }, + { + "epoch": 0.6289707750952986, + "grad_norm": 1.3329381523109856, + "learning_rate": 1.8960889677547506e-05, + "loss": 2.0973, + "step": 495 + }, + { + "epoch": 0.6302414231257941, + "grad_norm": 1.421090762882285, + "learning_rate": 1.8956398379662368e-05, + "loss": 2.5422, + "step": 496 + }, + { + "epoch": 0.6315120711562897, + "grad_norm": 1.6552633728958857, + "learning_rate": 1.8951897930723806e-05, + "loss": 2.4162, + "step": 497 + }, + { + "epoch": 0.6327827191867853, + "grad_norm": 1.4506798439934194, + "learning_rate": 1.8947388335330076e-05, + "loss": 2.4185, + "step": 498 + }, + { + "epoch": 0.6340533672172808, + "grad_norm": 1.4020055076985136, + "learning_rate": 1.8942869598088785e-05, + "loss": 2.1794, + "step": 499 + }, + { + "epoch": 0.6353240152477764, + "grad_norm": 1.1768335660084828, + "learning_rate": 1.8938341723616883e-05, + "loss": 2.2581, + "step": 500 + }, + { + "epoch": 0.636594663278272, + "grad_norm": 1.2174192479655537, + "learning_rate": 1.8933804716540646e-05, + "loss": 2.446, + "step": 501 + }, + { + "epoch": 0.6378653113087674, + "grad_norm": 1.0553638547069482, + "learning_rate": 1.8929258581495688e-05, + "loss": 2.105, + "step": 502 + }, + { + "epoch": 0.639135959339263, + "grad_norm": 1.1267989554506508, + "learning_rate": 1.892470332312695e-05, + "loss": 2.1001, + "step": 503 + }, + { + "epoch": 0.6404066073697586, + "grad_norm": 1.3249636163461571, + "learning_rate": 1.892013894608869e-05, + "loss": 2.2481, + "step": 504 + }, + { + "epoch": 0.6416772554002541, + "grad_norm": 1.294216594755417, + "learning_rate": 1.8915565455044483e-05, + "loss": 2.2538, + "step": 505 + }, + { + "epoch": 0.6429479034307497, + "grad_norm": 1.1232090441804183, + "learning_rate": 1.8910982854667228e-05, + "loss": 2.0829, + "step": 506 + }, + { + "epoch": 0.6442185514612452, + "grad_norm": 1.95930991197237, + "learning_rate": 1.8906391149639115e-05, + "loss": 2.5869, + "step": 507 + }, + { + "epoch": 0.6454891994917408, + "grad_norm": 1.3453985330330351, + "learning_rate": 1.8901790344651643e-05, + "loss": 1.9773, + "step": 508 + }, + { + "epoch": 0.6467598475222364, + "grad_norm": 1.2296929125860547, + "learning_rate": 1.8897180444405615e-05, + "loss": 2.2737, + "step": 509 + }, + { + "epoch": 0.6480304955527318, + "grad_norm": 1.51817047299658, + "learning_rate": 1.8892561453611113e-05, + "loss": 2.393, + "step": 510 + }, + { + "epoch": 0.6493011435832274, + "grad_norm": 1.3781830080412667, + "learning_rate": 1.8887933376987524e-05, + "loss": 2.3916, + "step": 511 + }, + { + "epoch": 0.650571791613723, + "grad_norm": 1.6740810365239487, + "learning_rate": 1.8883296219263503e-05, + "loss": 2.5292, + "step": 512 + }, + { + "epoch": 0.6518424396442185, + "grad_norm": 1.201702350296437, + "learning_rate": 1.887864998517699e-05, + "loss": 1.9545, + "step": 513 + }, + { + "epoch": 0.6531130876747141, + "grad_norm": 1.0800862791397763, + "learning_rate": 1.88739946794752e-05, + "loss": 2.0514, + "step": 514 + }, + { + "epoch": 0.6543837357052097, + "grad_norm": 1.1405856469174804, + "learning_rate": 1.886933030691462e-05, + "loss": 2.1928, + "step": 515 + }, + { + "epoch": 0.6556543837357052, + "grad_norm": 1.2141463875986063, + "learning_rate": 1.8864656872260985e-05, + "loss": 2.3328, + "step": 516 + }, + { + "epoch": 0.6569250317662008, + "grad_norm": 1.2320242281546439, + "learning_rate": 1.8859974380289317e-05, + "loss": 2.2226, + "step": 517 + }, + { + "epoch": 0.6581956797966964, + "grad_norm": 1.1049627459213187, + "learning_rate": 1.8855282835783858e-05, + "loss": 2.192, + "step": 518 + }, + { + "epoch": 0.6594663278271918, + "grad_norm": 1.242887571869999, + "learning_rate": 1.885058224353813e-05, + "loss": 2.1095, + "step": 519 + }, + { + "epoch": 0.6607369758576874, + "grad_norm": 1.0690686043099307, + "learning_rate": 1.8845872608354877e-05, + "loss": 2.0908, + "step": 520 + }, + { + "epoch": 0.662007623888183, + "grad_norm": 1.3192664166276828, + "learning_rate": 1.8841153935046098e-05, + "loss": 1.9455, + "step": 521 + }, + { + "epoch": 0.6632782719186785, + "grad_norm": 1.5454386592048066, + "learning_rate": 1.883642622843302e-05, + "loss": 2.4256, + "step": 522 + }, + { + "epoch": 0.6645489199491741, + "grad_norm": 1.1200275979051053, + "learning_rate": 1.8831689493346095e-05, + "loss": 1.7918, + "step": 523 + }, + { + "epoch": 0.6658195679796697, + "grad_norm": 1.2792896107464529, + "learning_rate": 1.8826943734625006e-05, + "loss": 2.2147, + "step": 524 + }, + { + "epoch": 0.6670902160101652, + "grad_norm": 1.252552573310735, + "learning_rate": 1.8822188957118656e-05, + "loss": 2.4019, + "step": 525 + }, + { + "epoch": 0.6683608640406608, + "grad_norm": 1.1245940180547487, + "learning_rate": 1.8817425165685166e-05, + "loss": 2.1881, + "step": 526 + }, + { + "epoch": 0.6696315120711563, + "grad_norm": 1.0708468149170747, + "learning_rate": 1.8812652365191854e-05, + "loss": 2.0764, + "step": 527 + }, + { + "epoch": 0.6709021601016518, + "grad_norm": 1.1777540466866772, + "learning_rate": 1.880787056051525e-05, + "loss": 2.2589, + "step": 528 + }, + { + "epoch": 0.6721728081321474, + "grad_norm": 1.1832499421529257, + "learning_rate": 1.880307975654109e-05, + "loss": 2.4334, + "step": 529 + }, + { + "epoch": 0.6734434561626429, + "grad_norm": 1.3102714120785524, + "learning_rate": 1.8798279958164295e-05, + "loss": 2.0436, + "step": 530 + }, + { + "epoch": 0.6747141041931385, + "grad_norm": 1.222035984008157, + "learning_rate": 1.8793471170288984e-05, + "loss": 2.1924, + "step": 531 + }, + { + "epoch": 0.6759847522236341, + "grad_norm": 1.1732666538562266, + "learning_rate": 1.8788653397828458e-05, + "loss": 2.4146, + "step": 532 + }, + { + "epoch": 0.6772554002541296, + "grad_norm": 1.1409816836583633, + "learning_rate": 1.8783826645705195e-05, + "loss": 2.2124, + "step": 533 + }, + { + "epoch": 0.6785260482846251, + "grad_norm": 1.3231706367571725, + "learning_rate": 1.8778990918850852e-05, + "loss": 2.0627, + "step": 534 + }, + { + "epoch": 0.6797966963151207, + "grad_norm": 1.2526020473537403, + "learning_rate": 1.877414622220625e-05, + "loss": 2.2245, + "step": 535 + }, + { + "epoch": 0.6810673443456162, + "grad_norm": 1.1845130281014962, + "learning_rate": 1.876929256072138e-05, + "loss": 2.2451, + "step": 536 + }, + { + "epoch": 0.6823379923761118, + "grad_norm": 1.3625735833969421, + "learning_rate": 1.8764429939355394e-05, + "loss": 2.4728, + "step": 537 + }, + { + "epoch": 0.6836086404066074, + "grad_norm": 1.1553595122128892, + "learning_rate": 1.8759558363076588e-05, + "loss": 2.1704, + "step": 538 + }, + { + "epoch": 0.6848792884371029, + "grad_norm": 1.2851221382973055, + "learning_rate": 1.875467783686243e-05, + "loss": 2.431, + "step": 539 + }, + { + "epoch": 0.6861499364675985, + "grad_norm": 1.2639041760774359, + "learning_rate": 1.87497883656995e-05, + "loss": 2.2614, + "step": 540 + }, + { + "epoch": 0.6874205844980941, + "grad_norm": 1.1565308038920128, + "learning_rate": 1.8744889954583544e-05, + "loss": 2.0451, + "step": 541 + }, + { + "epoch": 0.6886912325285895, + "grad_norm": 1.1481601695101114, + "learning_rate": 1.8739982608519438e-05, + "loss": 2.0516, + "step": 542 + }, + { + "epoch": 0.6899618805590851, + "grad_norm": 1.168831243257151, + "learning_rate": 1.8735066332521174e-05, + "loss": 2.3054, + "step": 543 + }, + { + "epoch": 0.6912325285895807, + "grad_norm": 1.0979119040449332, + "learning_rate": 1.8730141131611882e-05, + "loss": 2.1107, + "step": 544 + }, + { + "epoch": 0.6925031766200762, + "grad_norm": 1.474991738773486, + "learning_rate": 1.8725207010823804e-05, + "loss": 2.2627, + "step": 545 + }, + { + "epoch": 0.6937738246505718, + "grad_norm": 1.139596979539453, + "learning_rate": 1.8720263975198295e-05, + "loss": 1.9434, + "step": 546 + }, + { + "epoch": 0.6950444726810674, + "grad_norm": 1.1481823274967125, + "learning_rate": 1.8715312029785825e-05, + "loss": 2.1494, + "step": 547 + }, + { + "epoch": 0.6963151207115629, + "grad_norm": 1.1406321517448839, + "learning_rate": 1.871035117964596e-05, + "loss": 2.1246, + "step": 548 + }, + { + "epoch": 0.6975857687420585, + "grad_norm": 1.256664969056528, + "learning_rate": 1.8705381429847364e-05, + "loss": 2.1875, + "step": 549 + }, + { + "epoch": 0.6988564167725541, + "grad_norm": 1.146997870707755, + "learning_rate": 1.8700402785467804e-05, + "loss": 2.2522, + "step": 550 + }, + { + "epoch": 0.7001270648030495, + "grad_norm": 1.7921394223729616, + "learning_rate": 1.8695415251594123e-05, + "loss": 2.421, + "step": 551 + }, + { + "epoch": 0.7013977128335451, + "grad_norm": 1.127645304716579, + "learning_rate": 1.869041883332226e-05, + "loss": 2.0452, + "step": 552 + }, + { + "epoch": 0.7026683608640406, + "grad_norm": 1.2712912170354675, + "learning_rate": 1.8685413535757217e-05, + "loss": 2.3269, + "step": 553 + }, + { + "epoch": 0.7039390088945362, + "grad_norm": 1.1225547807371499, + "learning_rate": 1.8680399364013075e-05, + "loss": 1.995, + "step": 554 + }, + { + "epoch": 0.7052096569250318, + "grad_norm": 1.1182511171971892, + "learning_rate": 1.8675376323212985e-05, + "loss": 2.313, + "step": 555 + }, + { + "epoch": 0.7064803049555273, + "grad_norm": 1.3258764971873436, + "learning_rate": 1.8670344418489154e-05, + "loss": 2.0656, + "step": 556 + }, + { + "epoch": 0.7077509529860229, + "grad_norm": 1.0748166146008136, + "learning_rate": 1.866530365498285e-05, + "loss": 1.9555, + "step": 557 + }, + { + "epoch": 0.7090216010165185, + "grad_norm": 1.3514593399056483, + "learning_rate": 1.866025403784439e-05, + "loss": 2.1068, + "step": 558 + }, + { + "epoch": 0.7102922490470139, + "grad_norm": 1.079548325327528, + "learning_rate": 1.8655195572233135e-05, + "loss": 1.9282, + "step": 559 + }, + { + "epoch": 0.7115628970775095, + "grad_norm": 1.1487053535243719, + "learning_rate": 1.8650128263317494e-05, + "loss": 1.879, + "step": 560 + }, + { + "epoch": 0.7128335451080051, + "grad_norm": 1.3507734064247627, + "learning_rate": 1.8645052116274904e-05, + "loss": 2.2114, + "step": 561 + }, + { + "epoch": 0.7141041931385006, + "grad_norm": 1.122492897069288, + "learning_rate": 1.8639967136291837e-05, + "loss": 2.3406, + "step": 562 + }, + { + "epoch": 0.7153748411689962, + "grad_norm": 1.929970759366875, + "learning_rate": 1.863487332856378e-05, + "loss": 1.9846, + "step": 563 + }, + { + "epoch": 0.7166454891994918, + "grad_norm": 1.0536788314464651, + "learning_rate": 1.8629770698295267e-05, + "loss": 1.899, + "step": 564 + }, + { + "epoch": 0.7179161372299873, + "grad_norm": 1.1944128852816356, + "learning_rate": 1.8624659250699807e-05, + "loss": 2.1493, + "step": 565 + }, + { + "epoch": 0.7191867852604829, + "grad_norm": 1.1971873826364454, + "learning_rate": 1.8619538990999947e-05, + "loss": 2.1729, + "step": 566 + }, + { + "epoch": 0.7204574332909784, + "grad_norm": 1.206679262476945, + "learning_rate": 1.861440992442723e-05, + "loss": 2.38, + "step": 567 + }, + { + "epoch": 0.7217280813214739, + "grad_norm": 1.185571910306438, + "learning_rate": 1.8609272056222186e-05, + "loss": 2.2179, + "step": 568 + }, + { + "epoch": 0.7229987293519695, + "grad_norm": 1.6058101563071772, + "learning_rate": 1.860412539163436e-05, + "loss": 2.3207, + "step": 569 + }, + { + "epoch": 0.7242693773824651, + "grad_norm": 1.136888523638023, + "learning_rate": 1.8598969935922263e-05, + "loss": 1.989, + "step": 570 + }, + { + "epoch": 0.7255400254129606, + "grad_norm": 1.0341755689373624, + "learning_rate": 1.8593805694353407e-05, + "loss": 1.9567, + "step": 571 + }, + { + "epoch": 0.7268106734434562, + "grad_norm": 1.0685534450721437, + "learning_rate": 1.8588632672204264e-05, + "loss": 2.1098, + "step": 572 + }, + { + "epoch": 0.7280813214739518, + "grad_norm": 1.1375771659832385, + "learning_rate": 1.8583450874760282e-05, + "loss": 2.1579, + "step": 573 + }, + { + "epoch": 0.7293519695044473, + "grad_norm": 1.1501768702752024, + "learning_rate": 1.8578260307315888e-05, + "loss": 2.1482, + "step": 574 + }, + { + "epoch": 0.7306226175349428, + "grad_norm": 1.2847569142855215, + "learning_rate": 1.8573060975174447e-05, + "loss": 1.9585, + "step": 575 + }, + { + "epoch": 0.7318932655654383, + "grad_norm": 1.2972019979542169, + "learning_rate": 1.8567852883648302e-05, + "loss": 2.2788, + "step": 576 + }, + { + "epoch": 0.7331639135959339, + "grad_norm": 1.1878480969568581, + "learning_rate": 1.856263603805873e-05, + "loss": 2.1769, + "step": 577 + }, + { + "epoch": 0.7344345616264295, + "grad_norm": 1.0494532518122905, + "learning_rate": 1.855741044373596e-05, + "loss": 2.0674, + "step": 578 + }, + { + "epoch": 0.735705209656925, + "grad_norm": 1.4204377502276189, + "learning_rate": 1.8552176106019156e-05, + "loss": 2.1398, + "step": 579 + }, + { + "epoch": 0.7369758576874206, + "grad_norm": 1.1297999222036323, + "learning_rate": 1.8546933030256417e-05, + "loss": 2.1572, + "step": 580 + }, + { + "epoch": 0.7382465057179162, + "grad_norm": 0.9938535341282585, + "learning_rate": 1.854168122180477e-05, + "loss": 2.0417, + "step": 581 + }, + { + "epoch": 0.7395171537484116, + "grad_norm": 1.0939285871388238, + "learning_rate": 1.853642068603016e-05, + "loss": 2.193, + "step": 582 + }, + { + "epoch": 0.7407878017789072, + "grad_norm": 1.3213960210210096, + "learning_rate": 1.8531151428307464e-05, + "loss": 2.2197, + "step": 583 + }, + { + "epoch": 0.7420584498094028, + "grad_norm": 1.1372240170454513, + "learning_rate": 1.8525873454020452e-05, + "loss": 2.238, + "step": 584 + }, + { + "epoch": 0.7433290978398983, + "grad_norm": 1.3759947059281574, + "learning_rate": 1.8520586768561804e-05, + "loss": 2.0981, + "step": 585 + }, + { + "epoch": 0.7445997458703939, + "grad_norm": 1.3255967943604101, + "learning_rate": 1.8515291377333114e-05, + "loss": 2.3322, + "step": 586 + }, + { + "epoch": 0.7458703939008895, + "grad_norm": 1.0698356148683228, + "learning_rate": 1.8509987285744856e-05, + "loss": 2.1271, + "step": 587 + }, + { + "epoch": 0.747141041931385, + "grad_norm": 1.1523769456097608, + "learning_rate": 1.85046744992164e-05, + "loss": 2.2146, + "step": 588 + }, + { + "epoch": 0.7484116899618806, + "grad_norm": 1.3299045617102017, + "learning_rate": 1.8499353023176e-05, + "loss": 2.4415, + "step": 589 + }, + { + "epoch": 0.7496823379923762, + "grad_norm": 1.307200173595717, + "learning_rate": 1.8494022863060782e-05, + "loss": 2.2369, + "step": 590 + }, + { + "epoch": 0.7509529860228716, + "grad_norm": 1.1629936680632729, + "learning_rate": 1.848868402431675e-05, + "loss": 2.2199, + "step": 591 + }, + { + "epoch": 0.7522236340533672, + "grad_norm": 1.0365243593426372, + "learning_rate": 1.8483336512398783e-05, + "loss": 1.9964, + "step": 592 + }, + { + "epoch": 0.7534942820838628, + "grad_norm": 1.1183367890272324, + "learning_rate": 1.847798033277061e-05, + "loss": 2.2742, + "step": 593 + }, + { + "epoch": 0.7547649301143583, + "grad_norm": 1.5349286915016844, + "learning_rate": 1.8472615490904817e-05, + "loss": 2.2677, + "step": 594 + }, + { + "epoch": 0.7560355781448539, + "grad_norm": 1.2144970266650321, + "learning_rate": 1.8467241992282842e-05, + "loss": 2.2199, + "step": 595 + }, + { + "epoch": 0.7573062261753494, + "grad_norm": 1.1183355199544283, + "learning_rate": 1.8461859842394976e-05, + "loss": 2.1626, + "step": 596 + }, + { + "epoch": 0.758576874205845, + "grad_norm": 1.4314400266582858, + "learning_rate": 1.845646904674034e-05, + "loss": 2.2842, + "step": 597 + }, + { + "epoch": 0.7598475222363406, + "grad_norm": 1.1831971425721604, + "learning_rate": 1.8451069610826885e-05, + "loss": 2.0274, + "step": 598 + }, + { + "epoch": 0.761118170266836, + "grad_norm": 1.2516564815200508, + "learning_rate": 1.8445661540171408e-05, + "loss": 2.151, + "step": 599 + }, + { + "epoch": 0.7623888182973316, + "grad_norm": 1.0199374404736399, + "learning_rate": 1.8440244840299507e-05, + "loss": 1.7867, + "step": 600 + }, + { + "epoch": 0.7636594663278272, + "grad_norm": 1.2355296914792018, + "learning_rate": 1.843481951674561e-05, + "loss": 2.2535, + "step": 601 + }, + { + "epoch": 0.7649301143583227, + "grad_norm": 1.1498066544729326, + "learning_rate": 1.8429385575052947e-05, + "loss": 2.226, + "step": 602 + }, + { + "epoch": 0.7662007623888183, + "grad_norm": 1.1212282930105197, + "learning_rate": 1.842394302077357e-05, + "loss": 2.0142, + "step": 603 + }, + { + "epoch": 0.7674714104193139, + "grad_norm": 1.0961030851113989, + "learning_rate": 1.841849185946831e-05, + "loss": 2.0159, + "step": 604 + }, + { + "epoch": 0.7687420584498094, + "grad_norm": 1.175556484686974, + "learning_rate": 1.8413032096706808e-05, + "loss": 2.2503, + "step": 605 + }, + { + "epoch": 0.770012706480305, + "grad_norm": 1.095037415345117, + "learning_rate": 1.8407563738067483e-05, + "loss": 1.797, + "step": 606 + }, + { + "epoch": 0.7712833545108005, + "grad_norm": 1.123170821307597, + "learning_rate": 1.8402086789137547e-05, + "loss": 2.2509, + "step": 607 + }, + { + "epoch": 0.772554002541296, + "grad_norm": 1.1069902916432202, + "learning_rate": 1.8396601255512973e-05, + "loss": 2.1634, + "step": 608 + }, + { + "epoch": 0.7738246505717916, + "grad_norm": 1.207271414909803, + "learning_rate": 1.8391107142798523e-05, + "loss": 2.3651, + "step": 609 + }, + { + "epoch": 0.7750952986022872, + "grad_norm": 1.1822823085451364, + "learning_rate": 1.8385604456607716e-05, + "loss": 2.1507, + "step": 610 + }, + { + "epoch": 0.7763659466327827, + "grad_norm": 1.2195635504431297, + "learning_rate": 1.8380093202562828e-05, + "loss": 2.2428, + "step": 611 + }, + { + "epoch": 0.7776365946632783, + "grad_norm": 1.1167931838706007, + "learning_rate": 1.8374573386294896e-05, + "loss": 2.0545, + "step": 612 + }, + { + "epoch": 0.7789072426937739, + "grad_norm": 1.0629250710234373, + "learning_rate": 1.8369045013443697e-05, + "loss": 2.235, + "step": 613 + }, + { + "epoch": 0.7801778907242694, + "grad_norm": 1.1080069750410502, + "learning_rate": 1.8363508089657763e-05, + "loss": 2.1209, + "step": 614 + }, + { + "epoch": 0.7814485387547649, + "grad_norm": 1.0830334587830448, + "learning_rate": 1.835796262059435e-05, + "loss": 2.2635, + "step": 615 + }, + { + "epoch": 0.7827191867852605, + "grad_norm": 1.1121154546549203, + "learning_rate": 1.8352408611919453e-05, + "loss": 2.1994, + "step": 616 + }, + { + "epoch": 0.783989834815756, + "grad_norm": 1.0460510814774757, + "learning_rate": 1.8346846069307784e-05, + "loss": 2.0689, + "step": 617 + }, + { + "epoch": 0.7852604828462516, + "grad_norm": 1.2289832098681894, + "learning_rate": 1.8341274998442786e-05, + "loss": 2.0958, + "step": 618 + }, + { + "epoch": 0.7865311308767471, + "grad_norm": 1.1016802793946234, + "learning_rate": 1.8335695405016608e-05, + "loss": 2.4416, + "step": 619 + }, + { + "epoch": 0.7878017789072427, + "grad_norm": 1.0581802964347833, + "learning_rate": 1.833010729473011e-05, + "loss": 2.3486, + "step": 620 + }, + { + "epoch": 0.7890724269377383, + "grad_norm": 1.102357635535828, + "learning_rate": 1.8324510673292844e-05, + "loss": 2.2709, + "step": 621 + }, + { + "epoch": 0.7903430749682337, + "grad_norm": 1.0332914444862387, + "learning_rate": 1.8318905546423074e-05, + "loss": 2.0209, + "step": 622 + }, + { + "epoch": 0.7916137229987293, + "grad_norm": 2.0154352316730932, + "learning_rate": 1.8313291919847743e-05, + "loss": 2.2009, + "step": 623 + }, + { + "epoch": 0.7928843710292249, + "grad_norm": 1.1693281047264406, + "learning_rate": 1.8307669799302488e-05, + "loss": 1.9567, + "step": 624 + }, + { + "epoch": 0.7941550190597204, + "grad_norm": 1.0641878819552213, + "learning_rate": 1.830203919053161e-05, + "loss": 1.9002, + "step": 625 + }, + { + "epoch": 0.795425667090216, + "grad_norm": 1.1394831585966236, + "learning_rate": 1.8296400099288097e-05, + "loss": 2.1372, + "step": 626 + }, + { + "epoch": 0.7966963151207116, + "grad_norm": 1.373148541319477, + "learning_rate": 1.82907525313336e-05, + "loss": 2.171, + "step": 627 + }, + { + "epoch": 0.7979669631512071, + "grad_norm": 1.2881927858172024, + "learning_rate": 1.8285096492438424e-05, + "loss": 2.2768, + "step": 628 + }, + { + "epoch": 0.7992376111817027, + "grad_norm": 1.314788727096053, + "learning_rate": 1.8279431988381534e-05, + "loss": 2.2702, + "step": 629 + }, + { + "epoch": 0.8005082592121983, + "grad_norm": 1.1722514211564103, + "learning_rate": 1.8273759024950547e-05, + "loss": 1.9007, + "step": 630 + }, + { + "epoch": 0.8017789072426937, + "grad_norm": 1.331031931387042, + "learning_rate": 1.8268077607941722e-05, + "loss": 2.2078, + "step": 631 + }, + { + "epoch": 0.8030495552731893, + "grad_norm": 1.278464976980853, + "learning_rate": 1.826238774315995e-05, + "loss": 2.1024, + "step": 632 + }, + { + "epoch": 0.8043202033036849, + "grad_norm": 1.2183423143125827, + "learning_rate": 1.8256689436418758e-05, + "loss": 1.7729, + "step": 633 + }, + { + "epoch": 0.8055908513341804, + "grad_norm": 1.0565672193398135, + "learning_rate": 1.82509826935403e-05, + "loss": 2.1151, + "step": 634 + }, + { + "epoch": 0.806861499364676, + "grad_norm": 1.1943999681129567, + "learning_rate": 1.8245267520355348e-05, + "loss": 2.1064, + "step": 635 + }, + { + "epoch": 0.8081321473951716, + "grad_norm": 1.2429531929474038, + "learning_rate": 1.823954392270328e-05, + "loss": 2.2624, + "step": 636 + }, + { + "epoch": 0.8094027954256671, + "grad_norm": 1.0129041059124908, + "learning_rate": 1.8233811906432097e-05, + "loss": 2.1492, + "step": 637 + }, + { + "epoch": 0.8106734434561627, + "grad_norm": 1.4593002297051365, + "learning_rate": 1.8228071477398384e-05, + "loss": 2.045, + "step": 638 + }, + { + "epoch": 0.8119440914866582, + "grad_norm": 1.1260803678002682, + "learning_rate": 1.8222322641467335e-05, + "loss": 2.1809, + "step": 639 + }, + { + "epoch": 0.8132147395171537, + "grad_norm": 1.264065884036665, + "learning_rate": 1.8216565404512732e-05, + "loss": 2.2959, + "step": 640 + }, + { + "epoch": 0.8144853875476493, + "grad_norm": 1.0952598714718136, + "learning_rate": 1.8210799772416933e-05, + "loss": 1.9048, + "step": 641 + }, + { + "epoch": 0.8157560355781448, + "grad_norm": 1.270951012467556, + "learning_rate": 1.8205025751070878e-05, + "loss": 2.3176, + "step": 642 + }, + { + "epoch": 0.8170266836086404, + "grad_norm": 1.1980202479132505, + "learning_rate": 1.819924334637408e-05, + "loss": 1.9996, + "step": 643 + }, + { + "epoch": 0.818297331639136, + "grad_norm": 1.1630902255370354, + "learning_rate": 1.8193452564234616e-05, + "loss": 1.9429, + "step": 644 + }, + { + "epoch": 0.8195679796696315, + "grad_norm": 1.198283450616259, + "learning_rate": 1.8187653410569125e-05, + "loss": 2.115, + "step": 645 + }, + { + "epoch": 0.820838627700127, + "grad_norm": 1.2144890181469097, + "learning_rate": 1.8181845891302798e-05, + "loss": 2.2417, + "step": 646 + }, + { + "epoch": 0.8221092757306226, + "grad_norm": 1.4017077851347404, + "learning_rate": 1.8176030012369367e-05, + "loss": 2.5008, + "step": 647 + }, + { + "epoch": 0.8233799237611181, + "grad_norm": 1.1659526738815076, + "learning_rate": 1.817020577971112e-05, + "loss": 2.1242, + "step": 648 + }, + { + "epoch": 0.8246505717916137, + "grad_norm": 1.278036339848494, + "learning_rate": 1.8164373199278858e-05, + "loss": 2.2445, + "step": 649 + }, + { + "epoch": 0.8259212198221093, + "grad_norm": 1.2190826973272975, + "learning_rate": 1.8158532277031937e-05, + "loss": 2.2701, + "step": 650 + }, + { + "epoch": 0.8271918678526048, + "grad_norm": 1.2816423646347477, + "learning_rate": 1.815268301893822e-05, + "loss": 2.1431, + "step": 651 + }, + { + "epoch": 0.8284625158831004, + "grad_norm": 1.1717979527697433, + "learning_rate": 1.814682543097409e-05, + "loss": 2.2215, + "step": 652 + }, + { + "epoch": 0.829733163913596, + "grad_norm": 1.3144642425327488, + "learning_rate": 1.8140959519124436e-05, + "loss": 2.3288, + "step": 653 + }, + { + "epoch": 0.8310038119440915, + "grad_norm": 1.3305027804563203, + "learning_rate": 1.813508528938267e-05, + "loss": 2.2093, + "step": 654 + }, + { + "epoch": 0.832274459974587, + "grad_norm": 1.1980383998904782, + "learning_rate": 1.8129202747750682e-05, + "loss": 2.1226, + "step": 655 + }, + { + "epoch": 0.8335451080050826, + "grad_norm": 1.1783895710806225, + "learning_rate": 1.812331190023886e-05, + "loss": 2.2583, + "step": 656 + }, + { + "epoch": 0.8348157560355781, + "grad_norm": 1.4976785081599993, + "learning_rate": 1.811741275286609e-05, + "loss": 2.091, + "step": 657 + }, + { + "epoch": 0.8360864040660737, + "grad_norm": 1.1357480648220457, + "learning_rate": 1.811150531165972e-05, + "loss": 1.9162, + "step": 658 + }, + { + "epoch": 0.8373570520965693, + "grad_norm": 1.2253645225834875, + "learning_rate": 1.8105589582655585e-05, + "loss": 2.1036, + "step": 659 + }, + { + "epoch": 0.8386277001270648, + "grad_norm": 1.08960201806807, + "learning_rate": 1.8099665571897987e-05, + "loss": 2.2, + "step": 660 + }, + { + "epoch": 0.8398983481575604, + "grad_norm": 1.2544735038806298, + "learning_rate": 1.809373328543968e-05, + "loss": 2.4893, + "step": 661 + }, + { + "epoch": 0.841168996188056, + "grad_norm": 1.0809294146988426, + "learning_rate": 1.808779272934189e-05, + "loss": 2.153, + "step": 662 + }, + { + "epoch": 0.8424396442185514, + "grad_norm": 1.1511962715890744, + "learning_rate": 1.8081843909674277e-05, + "loss": 1.9579, + "step": 663 + }, + { + "epoch": 0.843710292249047, + "grad_norm": 1.2452670658097118, + "learning_rate": 1.807588683251495e-05, + "loss": 2.2061, + "step": 664 + }, + { + "epoch": 0.8449809402795425, + "grad_norm": 1.248947570313016, + "learning_rate": 1.8069921503950457e-05, + "loss": 2.1359, + "step": 665 + }, + { + "epoch": 0.8462515883100381, + "grad_norm": 1.2289662739094933, + "learning_rate": 1.8063947930075776e-05, + "loss": 2.2559, + "step": 666 + }, + { + "epoch": 0.8475222363405337, + "grad_norm": 1.1868517146260327, + "learning_rate": 1.8057966116994304e-05, + "loss": 2.2514, + "step": 667 + }, + { + "epoch": 0.8487928843710292, + "grad_norm": 1.0039146134359633, + "learning_rate": 1.8051976070817864e-05, + "loss": 1.6718, + "step": 668 + }, + { + "epoch": 0.8500635324015248, + "grad_norm": 1.2292256786802576, + "learning_rate": 1.8045977797666685e-05, + "loss": 2.0274, + "step": 669 + }, + { + "epoch": 0.8513341804320204, + "grad_norm": 1.1408696071690931, + "learning_rate": 1.8039971303669407e-05, + "loss": 2.2972, + "step": 670 + }, + { + "epoch": 0.8526048284625158, + "grad_norm": 1.136087224803189, + "learning_rate": 1.8033956594963067e-05, + "loss": 1.9694, + "step": 671 + }, + { + "epoch": 0.8538754764930114, + "grad_norm": 1.13467315760421, + "learning_rate": 1.802793367769309e-05, + "loss": 2.1446, + "step": 672 + }, + { + "epoch": 0.855146124523507, + "grad_norm": 1.2426557197669015, + "learning_rate": 1.8021902558013305e-05, + "loss": 2.2789, + "step": 673 + }, + { + "epoch": 0.8564167725540025, + "grad_norm": 1.0850125836271671, + "learning_rate": 1.8015863242085893e-05, + "loss": 2.023, + "step": 674 + }, + { + "epoch": 0.8576874205844981, + "grad_norm": 1.1350570729505167, + "learning_rate": 1.8009815736081442e-05, + "loss": 1.8265, + "step": 675 + }, + { + "epoch": 0.8589580686149937, + "grad_norm": 1.3055283343990955, + "learning_rate": 1.8003760046178884e-05, + "loss": 2.2501, + "step": 676 + }, + { + "epoch": 0.8602287166454892, + "grad_norm": 1.1220882930352254, + "learning_rate": 1.799769617856552e-05, + "loss": 2.1533, + "step": 677 + }, + { + "epoch": 0.8614993646759848, + "grad_norm": 1.3728913831386764, + "learning_rate": 1.7991624139437013e-05, + "loss": 2.1099, + "step": 678 + }, + { + "epoch": 0.8627700127064803, + "grad_norm": 1.2170881551817552, + "learning_rate": 1.7985543934997363e-05, + "loss": 2.0939, + "step": 679 + }, + { + "epoch": 0.8640406607369758, + "grad_norm": 1.1162608077349283, + "learning_rate": 1.7979455571458926e-05, + "loss": 2.1503, + "step": 680 + }, + { + "epoch": 0.8653113087674714, + "grad_norm": 1.591964124880649, + "learning_rate": 1.7973359055042384e-05, + "loss": 2.3787, + "step": 681 + }, + { + "epoch": 0.866581956797967, + "grad_norm": 1.0934479331373792, + "learning_rate": 1.7967254391976752e-05, + "loss": 2.1388, + "step": 682 + }, + { + "epoch": 0.8678526048284625, + "grad_norm": 1.1745188572200094, + "learning_rate": 1.796114158849937e-05, + "loss": 2.1515, + "step": 683 + }, + { + "epoch": 0.8691232528589581, + "grad_norm": 1.34721793744304, + "learning_rate": 1.79550206508559e-05, + "loss": 2.3695, + "step": 684 + }, + { + "epoch": 0.8703939008894537, + "grad_norm": 1.40311554976704, + "learning_rate": 1.7948891585300304e-05, + "loss": 2.0496, + "step": 685 + }, + { + "epoch": 0.8716645489199492, + "grad_norm": 1.4412821283719928, + "learning_rate": 1.7942754398094858e-05, + "loss": 2.3265, + "step": 686 + }, + { + "epoch": 0.8729351969504447, + "grad_norm": 1.3094208316843936, + "learning_rate": 1.793660909551013e-05, + "loss": 2.1029, + "step": 687 + }, + { + "epoch": 0.8742058449809402, + "grad_norm": 1.3235639078241626, + "learning_rate": 1.793045568382498e-05, + "loss": 2.2132, + "step": 688 + }, + { + "epoch": 0.8754764930114358, + "grad_norm": 1.1612146363790252, + "learning_rate": 1.792429416932656e-05, + "loss": 2.2556, + "step": 689 + }, + { + "epoch": 0.8767471410419314, + "grad_norm": 1.3204673619230718, + "learning_rate": 1.7918124558310298e-05, + "loss": 2.3886, + "step": 690 + }, + { + "epoch": 0.8780177890724269, + "grad_norm": 1.389359391349538, + "learning_rate": 1.7911946857079886e-05, + "loss": 2.0822, + "step": 691 + }, + { + "epoch": 0.8792884371029225, + "grad_norm": 1.302496075608027, + "learning_rate": 1.7905761071947298e-05, + "loss": 2.2212, + "step": 692 + }, + { + "epoch": 0.8805590851334181, + "grad_norm": 1.2177378809010373, + "learning_rate": 1.7899567209232747e-05, + "loss": 2.1448, + "step": 693 + }, + { + "epoch": 0.8818297331639136, + "grad_norm": 1.0887690351215271, + "learning_rate": 1.7893365275264723e-05, + "loss": 2.1947, + "step": 694 + }, + { + "epoch": 0.8831003811944091, + "grad_norm": 1.1611483527631685, + "learning_rate": 1.7887155276379946e-05, + "loss": 2.2277, + "step": 695 + }, + { + "epoch": 0.8843710292249047, + "grad_norm": 1.3321497305815846, + "learning_rate": 1.788093721892338e-05, + "loss": 2.1868, + "step": 696 + }, + { + "epoch": 0.8856416772554002, + "grad_norm": 1.1677109697934798, + "learning_rate": 1.7874711109248223e-05, + "loss": 2.0527, + "step": 697 + }, + { + "epoch": 0.8869123252858958, + "grad_norm": 1.2935614226803498, + "learning_rate": 1.78684769537159e-05, + "loss": 2.1411, + "step": 698 + }, + { + "epoch": 0.8881829733163914, + "grad_norm": 1.2070887595655209, + "learning_rate": 1.7862234758696064e-05, + "loss": 2.2359, + "step": 699 + }, + { + "epoch": 0.8894536213468869, + "grad_norm": 1.1156624127053323, + "learning_rate": 1.7855984530566564e-05, + "loss": 2.186, + "step": 700 + }, + { + "epoch": 0.8907242693773825, + "grad_norm": 1.2030018813648606, + "learning_rate": 1.7849726275713477e-05, + "loss": 2.2642, + "step": 701 + }, + { + "epoch": 0.8919949174078781, + "grad_norm": 1.157278199265185, + "learning_rate": 1.7843460000531066e-05, + "loss": 2.0533, + "step": 702 + }, + { + "epoch": 0.8932655654383735, + "grad_norm": 1.1334035384047099, + "learning_rate": 1.78371857114218e-05, + "loss": 2.1672, + "step": 703 + }, + { + "epoch": 0.8945362134688691, + "grad_norm": 1.1698257316198195, + "learning_rate": 1.7830903414796338e-05, + "loss": 2.1527, + "step": 704 + }, + { + "epoch": 0.8958068614993647, + "grad_norm": 1.0649348969607102, + "learning_rate": 1.78246131170735e-05, + "loss": 2.001, + "step": 705 + }, + { + "epoch": 0.8970775095298602, + "grad_norm": 16.103293834646067, + "learning_rate": 1.78183148246803e-05, + "loss": 2.0628, + "step": 706 + }, + { + "epoch": 0.8983481575603558, + "grad_norm": 1.4097073445715416, + "learning_rate": 1.781200854405192e-05, + "loss": 2.187, + "step": 707 + }, + { + "epoch": 0.8996188055908514, + "grad_norm": 1.2758116339569952, + "learning_rate": 1.7805694281631687e-05, + "loss": 2.3274, + "step": 708 + }, + { + "epoch": 0.9008894536213469, + "grad_norm": 1.1765512106097247, + "learning_rate": 1.7799372043871107e-05, + "loss": 2.2118, + "step": 709 + }, + { + "epoch": 0.9021601016518425, + "grad_norm": 1.4124203146752978, + "learning_rate": 1.779304183722982e-05, + "loss": 2.0081, + "step": 710 + }, + { + "epoch": 0.9034307496823379, + "grad_norm": 1.0319073252257085, + "learning_rate": 1.778670366817561e-05, + "loss": 1.8963, + "step": 711 + }, + { + "epoch": 0.9047013977128335, + "grad_norm": 1.3123039682019952, + "learning_rate": 1.7780357543184396e-05, + "loss": 2.0833, + "step": 712 + }, + { + "epoch": 0.9059720457433291, + "grad_norm": 1.5121323183764703, + "learning_rate": 1.777400346874023e-05, + "loss": 2.2133, + "step": 713 + }, + { + "epoch": 0.9072426937738246, + "grad_norm": 1.240278972114728, + "learning_rate": 1.776764145133528e-05, + "loss": 2.0854, + "step": 714 + }, + { + "epoch": 0.9085133418043202, + "grad_norm": 1.2638134939297732, + "learning_rate": 1.776127149746984e-05, + "loss": 2.0003, + "step": 715 + }, + { + "epoch": 0.9097839898348158, + "grad_norm": 1.8027487698425262, + "learning_rate": 1.7754893613652296e-05, + "loss": 2.2157, + "step": 716 + }, + { + "epoch": 0.9110546378653113, + "grad_norm": 1.0790178813589926, + "learning_rate": 1.7748507806399158e-05, + "loss": 2.1216, + "step": 717 + }, + { + "epoch": 0.9123252858958069, + "grad_norm": 1.1894613911325647, + "learning_rate": 1.774211408223501e-05, + "loss": 2.1762, + "step": 718 + }, + { + "epoch": 0.9135959339263025, + "grad_norm": 6.847859792225901, + "learning_rate": 1.773571244769254e-05, + "loss": 1.9941, + "step": 719 + }, + { + "epoch": 0.9148665819567979, + "grad_norm": 1.089714241501095, + "learning_rate": 1.772930290931251e-05, + "loss": 1.8715, + "step": 720 + }, + { + "epoch": 0.9161372299872935, + "grad_norm": 1.049498859144584, + "learning_rate": 1.7722885473643767e-05, + "loss": 2.1452, + "step": 721 + }, + { + "epoch": 0.9174078780177891, + "grad_norm": 1.3170625509291867, + "learning_rate": 1.7716460147243216e-05, + "loss": 2.3355, + "step": 722 + }, + { + "epoch": 0.9186785260482846, + "grad_norm": 1.1145107193428012, + "learning_rate": 1.771002693667583e-05, + "loss": 2.2057, + "step": 723 + }, + { + "epoch": 0.9199491740787802, + "grad_norm": 1.0733911923183925, + "learning_rate": 1.770358584851463e-05, + "loss": 2.1601, + "step": 724 + }, + { + "epoch": 0.9212198221092758, + "grad_norm": 1.1931077636376628, + "learning_rate": 1.7697136889340707e-05, + "loss": 2.3301, + "step": 725 + }, + { + "epoch": 0.9224904701397713, + "grad_norm": 1.0214572577114507, + "learning_rate": 1.769068006574317e-05, + "loss": 2.1586, + "step": 726 + }, + { + "epoch": 0.9237611181702668, + "grad_norm": 1.0430710658686952, + "learning_rate": 1.7684215384319174e-05, + "loss": 2.2141, + "step": 727 + }, + { + "epoch": 0.9250317662007624, + "grad_norm": 1.1588749834203884, + "learning_rate": 1.7677742851673902e-05, + "loss": 1.9551, + "step": 728 + }, + { + "epoch": 0.9263024142312579, + "grad_norm": 1.140144952770098, + "learning_rate": 1.7671262474420556e-05, + "loss": 2.0731, + "step": 729 + }, + { + "epoch": 0.9275730622617535, + "grad_norm": 1.2093997211822138, + "learning_rate": 1.766477425918036e-05, + "loss": 2.2245, + "step": 730 + }, + { + "epoch": 0.928843710292249, + "grad_norm": 1.1070779104664465, + "learning_rate": 1.7658278212582535e-05, + "loss": 2.2871, + "step": 731 + }, + { + "epoch": 0.9301143583227446, + "grad_norm": 1.1627076492057788, + "learning_rate": 1.7651774341264318e-05, + "loss": 2.2414, + "step": 732 + }, + { + "epoch": 0.9313850063532402, + "grad_norm": 1.2056060527074959, + "learning_rate": 1.7645262651870926e-05, + "loss": 2.2713, + "step": 733 + }, + { + "epoch": 0.9326556543837357, + "grad_norm": 1.0225727328183227, + "learning_rate": 1.763874315105558e-05, + "loss": 1.9503, + "step": 734 + }, + { + "epoch": 0.9339263024142312, + "grad_norm": 1.118218927788584, + "learning_rate": 1.7632215845479463e-05, + "loss": 2.1835, + "step": 735 + }, + { + "epoch": 0.9351969504447268, + "grad_norm": 1.1167093296828845, + "learning_rate": 1.7625680741811745e-05, + "loss": 2.2381, + "step": 736 + }, + { + "epoch": 0.9364675984752223, + "grad_norm": 1.12942128452186, + "learning_rate": 1.7619137846729567e-05, + "loss": 2.0646, + "step": 737 + }, + { + "epoch": 0.9377382465057179, + "grad_norm": 1.238502072438946, + "learning_rate": 1.7612587166918023e-05, + "loss": 2.2532, + "step": 738 + }, + { + "epoch": 0.9390088945362135, + "grad_norm": 1.0984145820665707, + "learning_rate": 1.760602870907016e-05, + "loss": 2.1203, + "step": 739 + }, + { + "epoch": 0.940279542566709, + "grad_norm": 1.5818781364794277, + "learning_rate": 1.7599462479886976e-05, + "loss": 2.375, + "step": 740 + }, + { + "epoch": 0.9415501905972046, + "grad_norm": 1.1523217743831182, + "learning_rate": 1.759288848607741e-05, + "loss": 2.086, + "step": 741 + }, + { + "epoch": 0.9428208386277002, + "grad_norm": 1.8822451973588672, + "learning_rate": 1.758630673435833e-05, + "loss": 2.1809, + "step": 742 + }, + { + "epoch": 0.9440914866581956, + "grad_norm": 1.2253188338937813, + "learning_rate": 1.757971723145453e-05, + "loss": 2.1441, + "step": 743 + }, + { + "epoch": 0.9453621346886912, + "grad_norm": 1.1325770019028607, + "learning_rate": 1.7573119984098736e-05, + "loss": 2.1922, + "step": 744 + }, + { + "epoch": 0.9466327827191868, + "grad_norm": 1.4051086333055203, + "learning_rate": 1.756651499903157e-05, + "loss": 2.1348, + "step": 745 + }, + { + "epoch": 0.9479034307496823, + "grad_norm": 1.4161450474571087, + "learning_rate": 1.7559902283001568e-05, + "loss": 2.4787, + "step": 746 + }, + { + "epoch": 0.9491740787801779, + "grad_norm": 1.3285177253807692, + "learning_rate": 1.755328184276517e-05, + "loss": 2.3242, + "step": 747 + }, + { + "epoch": 0.9504447268106735, + "grad_norm": 1.109819420322136, + "learning_rate": 1.7546653685086696e-05, + "loss": 1.939, + "step": 748 + }, + { + "epoch": 0.951715374841169, + "grad_norm": 1.026234939259307, + "learning_rate": 1.7540017816738358e-05, + "loss": 1.8793, + "step": 749 + }, + { + "epoch": 0.9529860228716646, + "grad_norm": 1.0441675637509298, + "learning_rate": 1.753337424450025e-05, + "loss": 2.0132, + "step": 750 + }, + { + "epoch": 0.9542566709021602, + "grad_norm": 1.1594745381636706, + "learning_rate": 1.7526722975160334e-05, + "loss": 2.2498, + "step": 751 + }, + { + "epoch": 0.9555273189326556, + "grad_norm": 1.3775100569123298, + "learning_rate": 1.7520064015514425e-05, + "loss": 2.3491, + "step": 752 + }, + { + "epoch": 0.9567979669631512, + "grad_norm": 1.208713834802541, + "learning_rate": 1.751339737236622e-05, + "loss": 2.1424, + "step": 753 + }, + { + "epoch": 0.9580686149936467, + "grad_norm": 1.3760413828742442, + "learning_rate": 1.7506723052527243e-05, + "loss": 1.9454, + "step": 754 + }, + { + "epoch": 0.9593392630241423, + "grad_norm": 1.2670339177037218, + "learning_rate": 1.7500041062816875e-05, + "loss": 2.1874, + "step": 755 + }, + { + "epoch": 0.9606099110546379, + "grad_norm": 1.1343893644179321, + "learning_rate": 1.749335141006233e-05, + "loss": 2.1512, + "step": 756 + }, + { + "epoch": 0.9618805590851334, + "grad_norm": 1.2181137851036752, + "learning_rate": 1.748665410109865e-05, + "loss": 2.1203, + "step": 757 + }, + { + "epoch": 0.963151207115629, + "grad_norm": 1.1951718410904983, + "learning_rate": 1.7479949142768703e-05, + "loss": 2.0477, + "step": 758 + }, + { + "epoch": 0.9644218551461246, + "grad_norm": 1.1707207007043994, + "learning_rate": 1.747323654192316e-05, + "loss": 2.2146, + "step": 759 + }, + { + "epoch": 0.96569250317662, + "grad_norm": 1.1453945786997743, + "learning_rate": 1.7466516305420524e-05, + "loss": 2.2439, + "step": 760 + }, + { + "epoch": 0.9669631512071156, + "grad_norm": 1.0994338227829146, + "learning_rate": 1.7459788440127083e-05, + "loss": 2.1315, + "step": 761 + }, + { + "epoch": 0.9682337992376112, + "grad_norm": 1.0612891908924782, + "learning_rate": 1.7453052952916924e-05, + "loss": 2.2191, + "step": 762 + }, + { + "epoch": 0.9695044472681067, + "grad_norm": 1.367502676104824, + "learning_rate": 1.7446309850671913e-05, + "loss": 2.4506, + "step": 763 + }, + { + "epoch": 0.9707750952986023, + "grad_norm": 1.1449655488905122, + "learning_rate": 1.7439559140281713e-05, + "loss": 1.9004, + "step": 764 + }, + { + "epoch": 0.9720457433290979, + "grad_norm": 1.1512864595180432, + "learning_rate": 1.7432800828643747e-05, + "loss": 2.2074, + "step": 765 + }, + { + "epoch": 0.9733163913595934, + "grad_norm": 1.5315760269981107, + "learning_rate": 1.7426034922663217e-05, + "loss": 2.4534, + "step": 766 + }, + { + "epoch": 0.974587039390089, + "grad_norm": 1.2894382443177794, + "learning_rate": 1.7419261429253063e-05, + "loss": 2.054, + "step": 767 + }, + { + "epoch": 0.9758576874205845, + "grad_norm": 1.074928556598427, + "learning_rate": 1.7412480355334006e-05, + "loss": 1.9893, + "step": 768 + }, + { + "epoch": 0.97712833545108, + "grad_norm": 1.1272788651886883, + "learning_rate": 1.7405691707834484e-05, + "loss": 1.8961, + "step": 769 + }, + { + "epoch": 0.9783989834815756, + "grad_norm": 1.0740886049498373, + "learning_rate": 1.7398895493690694e-05, + "loss": 2.2882, + "step": 770 + }, + { + "epoch": 0.9796696315120712, + "grad_norm": 1.2385420633979962, + "learning_rate": 1.7392091719846557e-05, + "loss": 2.2462, + "step": 771 + }, + { + "epoch": 0.9809402795425667, + "grad_norm": 1.1447185949365566, + "learning_rate": 1.7385280393253717e-05, + "loss": 2.4673, + "step": 772 + }, + { + "epoch": 0.9822109275730623, + "grad_norm": 1.095160743555286, + "learning_rate": 1.7378461520871533e-05, + "loss": 2.1134, + "step": 773 + }, + { + "epoch": 0.9834815756035579, + "grad_norm": 1.407532638593707, + "learning_rate": 1.7371635109667077e-05, + "loss": 2.4098, + "step": 774 + }, + { + "epoch": 0.9847522236340533, + "grad_norm": 1.274115078111504, + "learning_rate": 1.7364801166615124e-05, + "loss": 2.1994, + "step": 775 + }, + { + "epoch": 0.9860228716645489, + "grad_norm": 1.1102796517887026, + "learning_rate": 1.7357959698698142e-05, + "loss": 2.162, + "step": 776 + }, + { + "epoch": 0.9872935196950444, + "grad_norm": 1.1391943620076197, + "learning_rate": 1.735111071290629e-05, + "loss": 2.2424, + "step": 777 + }, + { + "epoch": 0.98856416772554, + "grad_norm": 1.3106849259724793, + "learning_rate": 1.7344254216237405e-05, + "loss": 2.3158, + "step": 778 + }, + { + "epoch": 0.9898348157560356, + "grad_norm": 1.2261422208301112, + "learning_rate": 1.7337390215697005e-05, + "loss": 2.4132, + "step": 779 + }, + { + "epoch": 0.9911054637865311, + "grad_norm": 1.320383705607811, + "learning_rate": 1.7330518718298263e-05, + "loss": 2.137, + "step": 780 + }, + { + "epoch": 0.9923761118170267, + "grad_norm": 1.052165293466006, + "learning_rate": 1.732363973106203e-05, + "loss": 2.1737, + "step": 781 + }, + { + "epoch": 0.9936467598475223, + "grad_norm": 1.1620672915501906, + "learning_rate": 1.7316753261016782e-05, + "loss": 2.1117, + "step": 782 + }, + { + "epoch": 0.9949174078780177, + "grad_norm": 1.2006436789964456, + "learning_rate": 1.7309859315198676e-05, + "loss": 2.0257, + "step": 783 + }, + { + "epoch": 0.9961880559085133, + "grad_norm": 1.1215243404250974, + "learning_rate": 1.7302957900651477e-05, + "loss": 2.0687, + "step": 784 + }, + { + "epoch": 0.9974587039390089, + "grad_norm": 1.1093169148694415, + "learning_rate": 1.729604902442659e-05, + "loss": 2.3016, + "step": 785 + }, + { + "epoch": 0.9987293519695044, + "grad_norm": 1.1811840102612634, + "learning_rate": 1.7289132693583054e-05, + "loss": 2.2341, + "step": 786 + }, + { + "epoch": 1.0, + "grad_norm": 1.1099305877051853, + "learning_rate": 1.7282208915187512e-05, + "loss": 2.1384, + "step": 787 + }, + { + "epoch": 1.0012706480304956, + "grad_norm": 1.6865753377023602, + "learning_rate": 1.727527769631422e-05, + "loss": 2.0813, + "step": 788 + }, + { + "epoch": 1.0025412960609912, + "grad_norm": 1.6166021118485312, + "learning_rate": 1.7268339044045044e-05, + "loss": 2.0591, + "step": 789 + }, + { + "epoch": 1.0038119440914866, + "grad_norm": 1.1822516961256184, + "learning_rate": 1.7261392965469436e-05, + "loss": 1.9321, + "step": 790 + }, + { + "epoch": 1.0050825921219821, + "grad_norm": 1.366285683480337, + "learning_rate": 1.7254439467684433e-05, + "loss": 1.8407, + "step": 791 + }, + { + "epoch": 1.0063532401524777, + "grad_norm": 2.1905662751665536, + "learning_rate": 1.7247478557794662e-05, + "loss": 1.8602, + "step": 792 + }, + { + "epoch": 1.0076238881829733, + "grad_norm": 1.6623048912149558, + "learning_rate": 1.7240510242912315e-05, + "loss": 2.1063, + "step": 793 + }, + { + "epoch": 1.008894536213469, + "grad_norm": 1.3109263123151378, + "learning_rate": 1.7233534530157163e-05, + "loss": 1.9212, + "step": 794 + }, + { + "epoch": 1.0101651842439645, + "grad_norm": 1.2236823988689063, + "learning_rate": 1.7226551426656514e-05, + "loss": 1.8459, + "step": 795 + }, + { + "epoch": 1.0114358322744599, + "grad_norm": 1.418709456304477, + "learning_rate": 1.7219560939545246e-05, + "loss": 2.0208, + "step": 796 + }, + { + "epoch": 1.0127064803049555, + "grad_norm": 1.2535964475433163, + "learning_rate": 1.7212563075965774e-05, + "loss": 1.9942, + "step": 797 + }, + { + "epoch": 1.013977128335451, + "grad_norm": 1.0444045046192412, + "learning_rate": 1.7205557843068053e-05, + "loss": 1.8317, + "step": 798 + }, + { + "epoch": 1.0152477763659467, + "grad_norm": 1.2623627126003907, + "learning_rate": 1.719854524800956e-05, + "loss": 1.8213, + "step": 799 + }, + { + "epoch": 1.0165184243964422, + "grad_norm": 1.325424500121208, + "learning_rate": 1.7191525297955306e-05, + "loss": 1.9402, + "step": 800 + }, + { + "epoch": 1.0177890724269378, + "grad_norm": 1.2087028271623546, + "learning_rate": 1.7184498000077804e-05, + "loss": 1.8926, + "step": 801 + }, + { + "epoch": 1.0190597204574332, + "grad_norm": 1.4038085355277468, + "learning_rate": 1.7177463361557082e-05, + "loss": 2.0807, + "step": 802 + }, + { + "epoch": 1.0203303684879288, + "grad_norm": 1.21958929015843, + "learning_rate": 1.7170421389580666e-05, + "loss": 1.7006, + "step": 803 + }, + { + "epoch": 1.0216010165184244, + "grad_norm": 1.1964204779992977, + "learning_rate": 1.7163372091343578e-05, + "loss": 1.6272, + "step": 804 + }, + { + "epoch": 1.02287166454892, + "grad_norm": 1.1449886242015361, + "learning_rate": 1.7156315474048323e-05, + "loss": 1.8457, + "step": 805 + }, + { + "epoch": 1.0241423125794156, + "grad_norm": 1.2278404715003532, + "learning_rate": 1.7149251544904882e-05, + "loss": 1.6541, + "step": 806 + }, + { + "epoch": 1.0254129606099112, + "grad_norm": 1.1610162199882954, + "learning_rate": 1.714218031113071e-05, + "loss": 1.8906, + "step": 807 + }, + { + "epoch": 1.0266836086404065, + "grad_norm": 1.1675901686694194, + "learning_rate": 1.7135101779950724e-05, + "loss": 1.7901, + "step": 808 + }, + { + "epoch": 1.0279542566709021, + "grad_norm": 1.2124269726496553, + "learning_rate": 1.71280159585973e-05, + "loss": 1.8684, + "step": 809 + }, + { + "epoch": 1.0292249047013977, + "grad_norm": 1.2294992092903338, + "learning_rate": 1.712092285431026e-05, + "loss": 1.7482, + "step": 810 + }, + { + "epoch": 1.0304955527318933, + "grad_norm": 1.1287183716961813, + "learning_rate": 1.7113822474336857e-05, + "loss": 1.7951, + "step": 811 + }, + { + "epoch": 1.031766200762389, + "grad_norm": 1.4134078470159432, + "learning_rate": 1.7106714825931803e-05, + "loss": 2.0693, + "step": 812 + }, + { + "epoch": 1.0330368487928843, + "grad_norm": 1.1782788890686648, + "learning_rate": 1.709959991635721e-05, + "loss": 1.8525, + "step": 813 + }, + { + "epoch": 1.0343074968233799, + "grad_norm": 1.2043372856739532, + "learning_rate": 1.7092477752882626e-05, + "loss": 1.9692, + "step": 814 + }, + { + "epoch": 1.0355781448538754, + "grad_norm": 1.224457390433863, + "learning_rate": 1.7085348342785003e-05, + "loss": 1.886, + "step": 815 + }, + { + "epoch": 1.036848792884371, + "grad_norm": 1.2410254417348827, + "learning_rate": 1.7078211693348704e-05, + "loss": 1.9182, + "step": 816 + }, + { + "epoch": 1.0381194409148666, + "grad_norm": 1.182732969839237, + "learning_rate": 1.7071067811865477e-05, + "loss": 1.9967, + "step": 817 + }, + { + "epoch": 1.0393900889453622, + "grad_norm": 1.0655611049771787, + "learning_rate": 1.706391670563447e-05, + "loss": 1.8914, + "step": 818 + }, + { + "epoch": 1.0406607369758576, + "grad_norm": 1.2324879012238552, + "learning_rate": 1.7056758381962204e-05, + "loss": 1.9612, + "step": 819 + }, + { + "epoch": 1.0419313850063532, + "grad_norm": 1.2666348326958925, + "learning_rate": 1.7049592848162583e-05, + "loss": 1.9061, + "step": 820 + }, + { + "epoch": 1.0432020330368488, + "grad_norm": 1.4256014873272047, + "learning_rate": 1.7042420111556874e-05, + "loss": 2.2497, + "step": 821 + }, + { + "epoch": 1.0444726810673444, + "grad_norm": 1.200697855013483, + "learning_rate": 1.7035240179473703e-05, + "loss": 1.7399, + "step": 822 + }, + { + "epoch": 1.04574332909784, + "grad_norm": 1.124806936679507, + "learning_rate": 1.7028053059249045e-05, + "loss": 1.7169, + "step": 823 + }, + { + "epoch": 1.0470139771283355, + "grad_norm": 1.3764541156500518, + "learning_rate": 1.702085875822623e-05, + "loss": 1.9372, + "step": 824 + }, + { + "epoch": 1.048284625158831, + "grad_norm": 1.2206817503147485, + "learning_rate": 1.7013657283755904e-05, + "loss": 1.9339, + "step": 825 + }, + { + "epoch": 1.0495552731893265, + "grad_norm": 1.5533366987820596, + "learning_rate": 1.700644864319607e-05, + "loss": 1.8839, + "step": 826 + }, + { + "epoch": 1.050825921219822, + "grad_norm": 1.3504868408426431, + "learning_rate": 1.699923284391203e-05, + "loss": 2.0348, + "step": 827 + }, + { + "epoch": 1.0520965692503177, + "grad_norm": 1.542565833973917, + "learning_rate": 1.699200989327641e-05, + "loss": 1.8666, + "step": 828 + }, + { + "epoch": 1.0533672172808133, + "grad_norm": 1.2848638016187683, + "learning_rate": 1.6984779798669144e-05, + "loss": 2.013, + "step": 829 + }, + { + "epoch": 1.0546378653113089, + "grad_norm": 1.2129845471109177, + "learning_rate": 1.6977542567477464e-05, + "loss": 2.0537, + "step": 830 + }, + { + "epoch": 1.0559085133418042, + "grad_norm": 1.3442257128941968, + "learning_rate": 1.6970298207095887e-05, + "loss": 1.8845, + "step": 831 + }, + { + "epoch": 1.0571791613722998, + "grad_norm": 1.257055494734371, + "learning_rate": 1.6963046724926222e-05, + "loss": 1.9277, + "step": 832 + }, + { + "epoch": 1.0584498094027954, + "grad_norm": 1.281494268857617, + "learning_rate": 1.6955788128377552e-05, + "loss": 1.8302, + "step": 833 + }, + { + "epoch": 1.059720457433291, + "grad_norm": 1.2904533027843197, + "learning_rate": 1.6948522424866233e-05, + "loss": 1.9057, + "step": 834 + }, + { + "epoch": 1.0609911054637866, + "grad_norm": 1.7221630896204667, + "learning_rate": 1.6941249621815872e-05, + "loss": 2.0127, + "step": 835 + }, + { + "epoch": 1.062261753494282, + "grad_norm": 5.24316375130168, + "learning_rate": 1.6933969726657344e-05, + "loss": 1.7981, + "step": 836 + }, + { + "epoch": 1.0635324015247776, + "grad_norm": 1.165789449691196, + "learning_rate": 1.6926682746828756e-05, + "loss": 1.9228, + "step": 837 + }, + { + "epoch": 1.0648030495552732, + "grad_norm": 1.2885203823264162, + "learning_rate": 1.6919388689775463e-05, + "loss": 1.9446, + "step": 838 + }, + { + "epoch": 1.0660736975857688, + "grad_norm": 1.1429621958809697, + "learning_rate": 1.691208756295005e-05, + "loss": 1.8116, + "step": 839 + }, + { + "epoch": 1.0673443456162643, + "grad_norm": 1.1263030732817316, + "learning_rate": 1.690477937381232e-05, + "loss": 1.6862, + "step": 840 + }, + { + "epoch": 1.06861499364676, + "grad_norm": 1.2805477063039459, + "learning_rate": 1.68974641298293e-05, + "loss": 2.0322, + "step": 841 + }, + { + "epoch": 1.0698856416772553, + "grad_norm": 1.2665800972478127, + "learning_rate": 1.689014183847522e-05, + "loss": 1.7609, + "step": 842 + }, + { + "epoch": 1.071156289707751, + "grad_norm": 2.2885846073426794, + "learning_rate": 1.6882812507231508e-05, + "loss": 2.0702, + "step": 843 + }, + { + "epoch": 1.0724269377382465, + "grad_norm": 1.3510261901534273, + "learning_rate": 1.6875476143586788e-05, + "loss": 1.9481, + "step": 844 + }, + { + "epoch": 1.073697585768742, + "grad_norm": 1.1744973051438936, + "learning_rate": 1.6868132755036875e-05, + "loss": 1.7137, + "step": 845 + }, + { + "epoch": 1.0749682337992377, + "grad_norm": 1.3373246241234256, + "learning_rate": 1.686078234908475e-05, + "loss": 1.9151, + "step": 846 + }, + { + "epoch": 1.0762388818297333, + "grad_norm": 1.1717664629864653, + "learning_rate": 1.6853424933240575e-05, + "loss": 1.9916, + "step": 847 + }, + { + "epoch": 1.0775095298602286, + "grad_norm": 1.38230628449887, + "learning_rate": 1.6846060515021665e-05, + "loss": 1.7794, + "step": 848 + }, + { + "epoch": 1.0787801778907242, + "grad_norm": 1.1800364634451153, + "learning_rate": 1.68386891019525e-05, + "loss": 1.9383, + "step": 849 + }, + { + "epoch": 1.0800508259212198, + "grad_norm": 1.1714365329438117, + "learning_rate": 1.683131070156469e-05, + "loss": 1.7648, + "step": 850 + }, + { + "epoch": 1.0813214739517154, + "grad_norm": 1.4271044105935007, + "learning_rate": 1.6823925321397004e-05, + "loss": 2.0348, + "step": 851 + }, + { + "epoch": 1.082592121982211, + "grad_norm": 1.2096672277237541, + "learning_rate": 1.681653296899533e-05, + "loss": 1.8899, + "step": 852 + }, + { + "epoch": 1.0838627700127064, + "grad_norm": 1.2638593194626848, + "learning_rate": 1.6809133651912682e-05, + "loss": 1.8186, + "step": 853 + }, + { + "epoch": 1.085133418043202, + "grad_norm": 1.3009262132337112, + "learning_rate": 1.6801727377709195e-05, + "loss": 1.6465, + "step": 854 + }, + { + "epoch": 1.0864040660736975, + "grad_norm": 1.3550953845397025, + "learning_rate": 1.6794314153952105e-05, + "loss": 1.7697, + "step": 855 + }, + { + "epoch": 1.0876747141041931, + "grad_norm": 1.43920839620835, + "learning_rate": 1.6786893988215753e-05, + "loss": 1.7269, + "step": 856 + }, + { + "epoch": 1.0889453621346887, + "grad_norm": 1.4408291141229703, + "learning_rate": 1.677946688808157e-05, + "loss": 1.8983, + "step": 857 + }, + { + "epoch": 1.0902160101651843, + "grad_norm": 1.2230880717822934, + "learning_rate": 1.6772032861138078e-05, + "loss": 1.885, + "step": 858 + }, + { + "epoch": 1.0914866581956797, + "grad_norm": 1.3449063439142093, + "learning_rate": 1.676459191498087e-05, + "loss": 2.0056, + "step": 859 + }, + { + "epoch": 1.0927573062261753, + "grad_norm": 1.3984387684398114, + "learning_rate": 1.675714405721261e-05, + "loss": 1.9134, + "step": 860 + }, + { + "epoch": 1.0940279542566709, + "grad_norm": 2.3409719411538252, + "learning_rate": 1.674968929544303e-05, + "loss": 1.5216, + "step": 861 + }, + { + "epoch": 1.0952986022871665, + "grad_norm": 1.702044326540242, + "learning_rate": 1.6742227637288898e-05, + "loss": 2.0485, + "step": 862 + }, + { + "epoch": 1.096569250317662, + "grad_norm": 1.264330261539595, + "learning_rate": 1.6734759090374057e-05, + "loss": 1.7836, + "step": 863 + }, + { + "epoch": 1.0978398983481577, + "grad_norm": 1.1184031235776326, + "learning_rate": 1.6727283662329365e-05, + "loss": 1.8595, + "step": 864 + }, + { + "epoch": 1.099110546378653, + "grad_norm": 1.467822990156111, + "learning_rate": 1.6719801360792713e-05, + "loss": 1.8487, + "step": 865 + }, + { + "epoch": 1.1003811944091486, + "grad_norm": 1.4373221055972418, + "learning_rate": 1.6712312193409032e-05, + "loss": 2.1972, + "step": 866 + }, + { + "epoch": 1.1016518424396442, + "grad_norm": 1.4625603630755457, + "learning_rate": 1.6704816167830244e-05, + "loss": 2.3222, + "step": 867 + }, + { + "epoch": 1.1029224904701398, + "grad_norm": 1.3277403647399606, + "learning_rate": 1.6697313291715297e-05, + "loss": 2.0532, + "step": 868 + }, + { + "epoch": 1.1041931385006354, + "grad_norm": 1.2948019609048766, + "learning_rate": 1.6689803572730135e-05, + "loss": 1.7394, + "step": 869 + }, + { + "epoch": 1.105463786531131, + "grad_norm": 1.3749134131691831, + "learning_rate": 1.6682287018547683e-05, + "loss": 1.8651, + "step": 870 + }, + { + "epoch": 1.1067344345616263, + "grad_norm": 1.2882783772029283, + "learning_rate": 1.667476363684786e-05, + "loss": 2.0374, + "step": 871 + }, + { + "epoch": 1.108005082592122, + "grad_norm": 1.2171841086218058, + "learning_rate": 1.6667233435317563e-05, + "loss": 2.0968, + "step": 872 + }, + { + "epoch": 1.1092757306226175, + "grad_norm": 1.3344992215686204, + "learning_rate": 1.6659696421650645e-05, + "loss": 1.8383, + "step": 873 + }, + { + "epoch": 1.1105463786531131, + "grad_norm": 1.2961439792458473, + "learning_rate": 1.6652152603547928e-05, + "loss": 1.9911, + "step": 874 + }, + { + "epoch": 1.1118170266836087, + "grad_norm": 1.3617320682602514, + "learning_rate": 1.6644601988717188e-05, + "loss": 1.9132, + "step": 875 + }, + { + "epoch": 1.1130876747141043, + "grad_norm": 1.6872293277125439, + "learning_rate": 1.6637044584873137e-05, + "loss": 1.8031, + "step": 876 + }, + { + "epoch": 1.1143583227445997, + "grad_norm": 1.2658405906055896, + "learning_rate": 1.6629480399737432e-05, + "loss": 1.7167, + "step": 877 + }, + { + "epoch": 1.1156289707750953, + "grad_norm": 1.3956453079025612, + "learning_rate": 1.6621909441038657e-05, + "loss": 2.0243, + "step": 878 + }, + { + "epoch": 1.1168996188055909, + "grad_norm": 1.4283319075492786, + "learning_rate": 1.661433171651231e-05, + "loss": 2.0217, + "step": 879 + }, + { + "epoch": 1.1181702668360864, + "grad_norm": 1.3955373688541373, + "learning_rate": 1.6606747233900816e-05, + "loss": 1.8204, + "step": 880 + }, + { + "epoch": 1.119440914866582, + "grad_norm": 1.1624948055016757, + "learning_rate": 1.6599156000953486e-05, + "loss": 1.6933, + "step": 881 + }, + { + "epoch": 1.1207115628970774, + "grad_norm": 1.658026570394342, + "learning_rate": 1.6591558025426544e-05, + "loss": 1.8716, + "step": 882 + }, + { + "epoch": 1.121982210927573, + "grad_norm": 1.0879821817763893, + "learning_rate": 1.658395331508309e-05, + "loss": 1.7472, + "step": 883 + }, + { + "epoch": 1.1232528589580686, + "grad_norm": 1.3531044307917879, + "learning_rate": 1.6576341877693126e-05, + "loss": 2.0277, + "step": 884 + }, + { + "epoch": 1.1245235069885642, + "grad_norm": 1.2802821111389169, + "learning_rate": 1.65687237210335e-05, + "loss": 1.6896, + "step": 885 + }, + { + "epoch": 1.1257941550190598, + "grad_norm": 1.6340628162781528, + "learning_rate": 1.656109885288794e-05, + "loss": 2.2014, + "step": 886 + }, + { + "epoch": 1.1270648030495554, + "grad_norm": 1.4061256442942187, + "learning_rate": 1.655346728104704e-05, + "loss": 1.7754, + "step": 887 + }, + { + "epoch": 1.1283354510800507, + "grad_norm": 1.5504183318124294, + "learning_rate": 1.6545829013308225e-05, + "loss": 2.0022, + "step": 888 + }, + { + "epoch": 1.1296060991105463, + "grad_norm": 1.455095611292746, + "learning_rate": 1.653818405747577e-05, + "loss": 2.0624, + "step": 889 + }, + { + "epoch": 1.130876747141042, + "grad_norm": 1.3253465178231698, + "learning_rate": 1.653053242136079e-05, + "loss": 1.9104, + "step": 890 + }, + { + "epoch": 1.1321473951715375, + "grad_norm": 1.182950414429057, + "learning_rate": 1.6522874112781213e-05, + "loss": 1.8832, + "step": 891 + }, + { + "epoch": 1.133418043202033, + "grad_norm": 1.3356155853185752, + "learning_rate": 1.6515209139561796e-05, + "loss": 1.9008, + "step": 892 + }, + { + "epoch": 1.1346886912325287, + "grad_norm": 1.4560487097685317, + "learning_rate": 1.6507537509534094e-05, + "loss": 1.9428, + "step": 893 + }, + { + "epoch": 1.135959339263024, + "grad_norm": 1.2827728857110126, + "learning_rate": 1.6499859230536468e-05, + "loss": 1.6692, + "step": 894 + }, + { + "epoch": 1.1372299872935197, + "grad_norm": 1.4362212085193078, + "learning_rate": 1.6492174310414082e-05, + "loss": 1.7576, + "step": 895 + }, + { + "epoch": 1.1385006353240152, + "grad_norm": 1.482776094668404, + "learning_rate": 1.6484482757018873e-05, + "loss": 1.7185, + "step": 896 + }, + { + "epoch": 1.1397712833545108, + "grad_norm": 1.6901901653673548, + "learning_rate": 1.6476784578209556e-05, + "loss": 2.017, + "step": 897 + }, + { + "epoch": 1.1410419313850064, + "grad_norm": 1.3282297106480905, + "learning_rate": 1.6469079781851625e-05, + "loss": 1.5978, + "step": 898 + }, + { + "epoch": 1.1423125794155018, + "grad_norm": 1.5504185271608384, + "learning_rate": 1.6461368375817328e-05, + "loss": 1.8685, + "step": 899 + }, + { + "epoch": 1.1435832274459974, + "grad_norm": 1.279342293663459, + "learning_rate": 1.6453650367985666e-05, + "loss": 1.9452, + "step": 900 + }, + { + "epoch": 1.144853875476493, + "grad_norm": 1.5724299314720493, + "learning_rate": 1.6445925766242392e-05, + "loss": 1.8533, + "step": 901 + }, + { + "epoch": 1.1461245235069886, + "grad_norm": 1.4037091216044704, + "learning_rate": 1.6438194578479987e-05, + "loss": 2.0887, + "step": 902 + }, + { + "epoch": 1.1473951715374842, + "grad_norm": 2.3844841954482137, + "learning_rate": 1.6430456812597664e-05, + "loss": 1.8733, + "step": 903 + }, + { + "epoch": 1.1486658195679798, + "grad_norm": 1.5873692114727178, + "learning_rate": 1.642271247650136e-05, + "loss": 1.9257, + "step": 904 + }, + { + "epoch": 1.1499364675984753, + "grad_norm": 1.2912057122897365, + "learning_rate": 1.6414961578103728e-05, + "loss": 2.0363, + "step": 905 + }, + { + "epoch": 1.1512071156289707, + "grad_norm": 1.2729098473928389, + "learning_rate": 1.6407204125324117e-05, + "loss": 1.9805, + "step": 906 + }, + { + "epoch": 1.1524777636594663, + "grad_norm": 1.319722169062551, + "learning_rate": 1.639944012608858e-05, + "loss": 2.0761, + "step": 907 + }, + { + "epoch": 1.153748411689962, + "grad_norm": 1.646575352037399, + "learning_rate": 1.639166958832985e-05, + "loss": 1.9992, + "step": 908 + }, + { + "epoch": 1.1550190597204575, + "grad_norm": 1.2594306934624784, + "learning_rate": 1.6383892519987355e-05, + "loss": 2.0578, + "step": 909 + }, + { + "epoch": 1.156289707750953, + "grad_norm": 1.4116327121078327, + "learning_rate": 1.6376108929007182e-05, + "loss": 1.9224, + "step": 910 + }, + { + "epoch": 1.1575603557814484, + "grad_norm": 1.3234022103520608, + "learning_rate": 1.6368318823342093e-05, + "loss": 2.0213, + "step": 911 + }, + { + "epoch": 1.158831003811944, + "grad_norm": 1.212138268792923, + "learning_rate": 1.6360522210951493e-05, + "loss": 1.8848, + "step": 912 + }, + { + "epoch": 1.1601016518424396, + "grad_norm": 1.3304063220446036, + "learning_rate": 1.635271909980145e-05, + "loss": 1.8339, + "step": 913 + }, + { + "epoch": 1.1613722998729352, + "grad_norm": 1.1642529244284137, + "learning_rate": 1.6344909497864663e-05, + "loss": 1.6725, + "step": 914 + }, + { + "epoch": 1.1626429479034308, + "grad_norm": 1.4292035396863283, + "learning_rate": 1.6337093413120463e-05, + "loss": 1.8768, + "step": 915 + }, + { + "epoch": 1.1639135959339262, + "grad_norm": 1.3692914071794549, + "learning_rate": 1.6329270853554807e-05, + "loss": 2.0424, + "step": 916 + }, + { + "epoch": 1.1651842439644218, + "grad_norm": 1.253500022515538, + "learning_rate": 1.632144182716027e-05, + "loss": 1.8572, + "step": 917 + }, + { + "epoch": 1.1664548919949174, + "grad_norm": 1.3906010379457432, + "learning_rate": 1.631360634193603e-05, + "loss": 1.9153, + "step": 918 + }, + { + "epoch": 1.167725540025413, + "grad_norm": 1.2968509614505375, + "learning_rate": 1.6305764405887865e-05, + "loss": 1.8864, + "step": 919 + }, + { + "epoch": 1.1689961880559085, + "grad_norm": 1.2017682531071847, + "learning_rate": 1.6297916027028146e-05, + "loss": 1.6371, + "step": 920 + }, + { + "epoch": 1.1702668360864041, + "grad_norm": 1.1043180912826305, + "learning_rate": 1.6290061213375824e-05, + "loss": 1.7654, + "step": 921 + }, + { + "epoch": 1.1715374841168997, + "grad_norm": 1.2251539346147005, + "learning_rate": 1.6282199972956425e-05, + "loss": 1.8949, + "step": 922 + }, + { + "epoch": 1.172808132147395, + "grad_norm": 1.1938270609692188, + "learning_rate": 1.6274332313802046e-05, + "loss": 1.7624, + "step": 923 + }, + { + "epoch": 1.1740787801778907, + "grad_norm": 1.296471490009607, + "learning_rate": 1.626645824395134e-05, + "loss": 2.0527, + "step": 924 + }, + { + "epoch": 1.1753494282083863, + "grad_norm": 1.1555457048005713, + "learning_rate": 1.6258577771449505e-05, + "loss": 1.7623, + "step": 925 + }, + { + "epoch": 1.1766200762388819, + "grad_norm": 1.2672242046698075, + "learning_rate": 1.6250690904348288e-05, + "loss": 1.6398, + "step": 926 + }, + { + "epoch": 1.1778907242693775, + "grad_norm": 1.1599189505741077, + "learning_rate": 1.6242797650705965e-05, + "loss": 1.8078, + "step": 927 + }, + { + "epoch": 1.1791613722998728, + "grad_norm": 1.201908050838568, + "learning_rate": 1.6234898018587336e-05, + "loss": 1.8192, + "step": 928 + }, + { + "epoch": 1.1804320203303684, + "grad_norm": 1.4051652511781014, + "learning_rate": 1.6226992016063726e-05, + "loss": 1.7557, + "step": 929 + }, + { + "epoch": 1.181702668360864, + "grad_norm": 1.2414377174588196, + "learning_rate": 1.621907965121296e-05, + "loss": 1.8567, + "step": 930 + }, + { + "epoch": 1.1829733163913596, + "grad_norm": 1.2205450231326151, + "learning_rate": 1.621116093211937e-05, + "loss": 1.789, + "step": 931 + }, + { + "epoch": 1.1842439644218552, + "grad_norm": 1.3239114373783731, + "learning_rate": 1.6203235866873776e-05, + "loss": 1.8891, + "step": 932 + }, + { + "epoch": 1.1855146124523508, + "grad_norm": 1.143125803189706, + "learning_rate": 1.6195304463573483e-05, + "loss": 1.9097, + "step": 933 + }, + { + "epoch": 1.1867852604828462, + "grad_norm": 1.3543160684882207, + "learning_rate": 1.618736673032227e-05, + "loss": 1.9701, + "step": 934 + }, + { + "epoch": 1.1880559085133418, + "grad_norm": 1.1892390739019483, + "learning_rate": 1.6179422675230393e-05, + "loss": 1.8929, + "step": 935 + }, + { + "epoch": 1.1893265565438373, + "grad_norm": 1.2473887928107703, + "learning_rate": 1.6171472306414554e-05, + "loss": 1.9877, + "step": 936 + }, + { + "epoch": 1.190597204574333, + "grad_norm": 1.2343561498635625, + "learning_rate": 1.6163515631997916e-05, + "loss": 1.8281, + "step": 937 + }, + { + "epoch": 1.1918678526048285, + "grad_norm": 1.6334791738239713, + "learning_rate": 1.6155552660110076e-05, + "loss": 2.063, + "step": 938 + }, + { + "epoch": 1.1931385006353241, + "grad_norm": 1.552460365301119, + "learning_rate": 1.6147583398887078e-05, + "loss": 2.2069, + "step": 939 + }, + { + "epoch": 1.1944091486658195, + "grad_norm": 1.5037802443003923, + "learning_rate": 1.6139607856471377e-05, + "loss": 2.082, + "step": 940 + }, + { + "epoch": 1.195679796696315, + "grad_norm": 1.3073618678744163, + "learning_rate": 1.613162604101186e-05, + "loss": 1.8659, + "step": 941 + }, + { + "epoch": 1.1969504447268107, + "grad_norm": 1.1930066131323456, + "learning_rate": 1.6123637960663807e-05, + "loss": 1.7654, + "step": 942 + }, + { + "epoch": 1.1982210927573063, + "grad_norm": 1.4632481383057956, + "learning_rate": 1.6115643623588915e-05, + "loss": 1.8771, + "step": 943 + }, + { + "epoch": 1.1994917407878019, + "grad_norm": 1.3551013868985258, + "learning_rate": 1.6107643037955268e-05, + "loss": 1.8159, + "step": 944 + }, + { + "epoch": 1.2007623888182972, + "grad_norm": 1.3709358096052835, + "learning_rate": 1.6099636211937326e-05, + "loss": 1.923, + "step": 945 + }, + { + "epoch": 1.2020330368487928, + "grad_norm": 1.2385525220234572, + "learning_rate": 1.6091623153715937e-05, + "loss": 1.7866, + "step": 946 + }, + { + "epoch": 1.2033036848792884, + "grad_norm": 1.380934926954665, + "learning_rate": 1.6083603871478316e-05, + "loss": 1.9634, + "step": 947 + }, + { + "epoch": 1.204574332909784, + "grad_norm": 1.504835347255383, + "learning_rate": 1.6075578373418028e-05, + "loss": 1.8734, + "step": 948 + }, + { + "epoch": 1.2058449809402796, + "grad_norm": 1.2570433169210586, + "learning_rate": 1.6067546667734996e-05, + "loss": 1.7331, + "step": 949 + }, + { + "epoch": 1.2071156289707752, + "grad_norm": 1.470968126969614, + "learning_rate": 1.6059508762635482e-05, + "loss": 1.8766, + "step": 950 + }, + { + "epoch": 1.2083862770012708, + "grad_norm": 2.9329383359308383, + "learning_rate": 1.6051464666332087e-05, + "loss": 1.8247, + "step": 951 + }, + { + "epoch": 1.2096569250317661, + "grad_norm": 1.3856521724757025, + "learning_rate": 1.604341438704373e-05, + "loss": 1.7699, + "step": 952 + }, + { + "epoch": 1.2109275730622617, + "grad_norm": 1.2420809627636966, + "learning_rate": 1.603535793299566e-05, + "loss": 1.7875, + "step": 953 + }, + { + "epoch": 1.2121982210927573, + "grad_norm": 1.1896169838405297, + "learning_rate": 1.6027295312419423e-05, + "loss": 1.6536, + "step": 954 + }, + { + "epoch": 1.213468869123253, + "grad_norm": 1.3059711657788073, + "learning_rate": 1.6019226533552865e-05, + "loss": 1.7259, + "step": 955 + }, + { + "epoch": 1.2147395171537485, + "grad_norm": 1.294022377822766, + "learning_rate": 1.6011151604640137e-05, + "loss": 1.9751, + "step": 956 + }, + { + "epoch": 1.2160101651842439, + "grad_norm": 1.3747145360760893, + "learning_rate": 1.6003070533931657e-05, + "loss": 1.9108, + "step": 957 + }, + { + "epoch": 1.2172808132147395, + "grad_norm": 1.3085675703201363, + "learning_rate": 1.5994983329684134e-05, + "loss": 1.773, + "step": 958 + }, + { + "epoch": 1.218551461245235, + "grad_norm": 1.3082155598851708, + "learning_rate": 1.598689000016053e-05, + "loss": 2.0224, + "step": 959 + }, + { + "epoch": 1.2198221092757306, + "grad_norm": 1.2699164051757423, + "learning_rate": 1.597879055363008e-05, + "loss": 1.7492, + "step": 960 + }, + { + "epoch": 1.2210927573062262, + "grad_norm": 1.494781254547359, + "learning_rate": 1.597068499836825e-05, + "loss": 1.9116, + "step": 961 + }, + { + "epoch": 1.2223634053367216, + "grad_norm": 1.2803737503935362, + "learning_rate": 1.5962573342656765e-05, + "loss": 2.0579, + "step": 962 + }, + { + "epoch": 1.2236340533672172, + "grad_norm": 1.424002755059869, + "learning_rate": 1.5954455594783583e-05, + "loss": 1.8386, + "step": 963 + }, + { + "epoch": 1.2249047013977128, + "grad_norm": 1.3124210633130282, + "learning_rate": 1.594633176304287e-05, + "loss": 1.8892, + "step": 964 + }, + { + "epoch": 1.2261753494282084, + "grad_norm": 1.1789841678181265, + "learning_rate": 1.5938201855735017e-05, + "loss": 1.9728, + "step": 965 + }, + { + "epoch": 1.227445997458704, + "grad_norm": 1.5207365066909595, + "learning_rate": 1.5930065881166633e-05, + "loss": 1.8877, + "step": 966 + }, + { + "epoch": 1.2287166454891996, + "grad_norm": 1.4247222633395513, + "learning_rate": 1.592192384765051e-05, + "loss": 1.9222, + "step": 967 + }, + { + "epoch": 1.2299872935196952, + "grad_norm": 1.3398663036299368, + "learning_rate": 1.5913775763505637e-05, + "loss": 1.9223, + "step": 968 + }, + { + "epoch": 1.2312579415501905, + "grad_norm": 1.4400845015618418, + "learning_rate": 1.590562163705719e-05, + "loss": 1.721, + "step": 969 + }, + { + "epoch": 1.2325285895806861, + "grad_norm": 1.4564048181197873, + "learning_rate": 1.589746147663651e-05, + "loss": 1.8639, + "step": 970 + }, + { + "epoch": 1.2337992376111817, + "grad_norm": 1.305706120438892, + "learning_rate": 1.588929529058111e-05, + "loss": 1.6415, + "step": 971 + }, + { + "epoch": 1.2350698856416773, + "grad_norm": 1.438336680808511, + "learning_rate": 1.588112308723466e-05, + "loss": 1.7595, + "step": 972 + }, + { + "epoch": 1.236340533672173, + "grad_norm": 1.3131659046574775, + "learning_rate": 1.5872944874946964e-05, + "loss": 1.8001, + "step": 973 + }, + { + "epoch": 1.2376111817026683, + "grad_norm": 1.5464720716272369, + "learning_rate": 1.5864760662073987e-05, + "loss": 2.1457, + "step": 974 + }, + { + "epoch": 1.2388818297331639, + "grad_norm": 1.4378603489197082, + "learning_rate": 1.5856570456977813e-05, + "loss": 1.5821, + "step": 975 + }, + { + "epoch": 1.2401524777636594, + "grad_norm": 1.2980626938677702, + "learning_rate": 1.5848374268026647e-05, + "loss": 1.5928, + "step": 976 + }, + { + "epoch": 1.241423125794155, + "grad_norm": 1.3960821340170957, + "learning_rate": 1.5840172103594814e-05, + "loss": 1.8233, + "step": 977 + }, + { + "epoch": 1.2426937738246506, + "grad_norm": 1.4911685659071707, + "learning_rate": 1.5831963972062734e-05, + "loss": 1.9979, + "step": 978 + }, + { + "epoch": 1.2439644218551462, + "grad_norm": 1.6023316647723762, + "learning_rate": 1.582374988181694e-05, + "loss": 1.9972, + "step": 979 + }, + { + "epoch": 1.2452350698856416, + "grad_norm": 1.3300409718579196, + "learning_rate": 1.581552984125004e-05, + "loss": 1.9057, + "step": 980 + }, + { + "epoch": 1.2465057179161372, + "grad_norm": 1.3795536330193834, + "learning_rate": 1.5807303858760727e-05, + "loss": 1.7926, + "step": 981 + }, + { + "epoch": 1.2477763659466328, + "grad_norm": 1.4409025468690164, + "learning_rate": 1.5799071942753762e-05, + "loss": 1.9829, + "step": 982 + }, + { + "epoch": 1.2490470139771284, + "grad_norm": 1.3140965455913203, + "learning_rate": 1.5790834101639974e-05, + "loss": 1.8821, + "step": 983 + }, + { + "epoch": 1.250317662007624, + "grad_norm": 1.3114225109547808, + "learning_rate": 1.578259034383624e-05, + "loss": 1.9938, + "step": 984 + }, + { + "epoch": 1.2515883100381195, + "grad_norm": 1.1795051238190635, + "learning_rate": 1.5774340677765483e-05, + "loss": 1.715, + "step": 985 + }, + { + "epoch": 1.252858958068615, + "grad_norm": 1.592645153504347, + "learning_rate": 1.5766085111856668e-05, + "loss": 1.9455, + "step": 986 + }, + { + "epoch": 1.2541296060991105, + "grad_norm": 1.2715453295489205, + "learning_rate": 1.575782365454478e-05, + "loss": 1.8798, + "step": 987 + }, + { + "epoch": 1.255400254129606, + "grad_norm": 1.28007175664308, + "learning_rate": 1.574955631427083e-05, + "loss": 1.6589, + "step": 988 + }, + { + "epoch": 1.2566709021601017, + "grad_norm": 1.343344815115729, + "learning_rate": 1.5741283099481842e-05, + "loss": 1.7512, + "step": 989 + }, + { + "epoch": 1.2579415501905973, + "grad_norm": 1.2250258478063394, + "learning_rate": 1.5733004018630826e-05, + "loss": 1.8518, + "step": 990 + }, + { + "epoch": 1.2592121982210926, + "grad_norm": 1.3278996410852044, + "learning_rate": 1.572471908017681e-05, + "loss": 1.9421, + "step": 991 + }, + { + "epoch": 1.2604828462515882, + "grad_norm": 1.277235069888488, + "learning_rate": 1.5716428292584788e-05, + "loss": 2.0307, + "step": 992 + }, + { + "epoch": 1.2617534942820838, + "grad_norm": 1.2367617891006584, + "learning_rate": 1.570813166432574e-05, + "loss": 1.9464, + "step": 993 + }, + { + "epoch": 1.2630241423125794, + "grad_norm": 1.3106685413405286, + "learning_rate": 1.5699829203876603e-05, + "loss": 2.085, + "step": 994 + }, + { + "epoch": 1.264294790343075, + "grad_norm": 1.3322085260452259, + "learning_rate": 1.5691520919720285e-05, + "loss": 1.8839, + "step": 995 + }, + { + "epoch": 1.2655654383735704, + "grad_norm": 1.3242145801398495, + "learning_rate": 1.568320682034564e-05, + "loss": 1.65, + "step": 996 + }, + { + "epoch": 1.2668360864040662, + "grad_norm": 1.419805813423947, + "learning_rate": 1.5674886914247464e-05, + "loss": 2.007, + "step": 997 + }, + { + "epoch": 1.2681067344345616, + "grad_norm": 1.4366987929669561, + "learning_rate": 1.5666561209926484e-05, + "loss": 2.0678, + "step": 998 + }, + { + "epoch": 1.2693773824650572, + "grad_norm": 1.4758663789147342, + "learning_rate": 1.5658229715889345e-05, + "loss": 1.9956, + "step": 999 + }, + { + "epoch": 1.2706480304955527, + "grad_norm": 1.3717768991812873, + "learning_rate": 1.5649892440648625e-05, + "loss": 1.9548, + "step": 1000 + }, + { + "epoch": 1.2719186785260483, + "grad_norm": 1.3419808182638524, + "learning_rate": 1.5641549392722794e-05, + "loss": 2.0226, + "step": 1001 + }, + { + "epoch": 1.273189326556544, + "grad_norm": 1.3204598751627847, + "learning_rate": 1.563320058063622e-05, + "loss": 1.9622, + "step": 1002 + }, + { + "epoch": 1.2744599745870393, + "grad_norm": 1.3348555688151162, + "learning_rate": 1.5624846012919176e-05, + "loss": 1.863, + "step": 1003 + }, + { + "epoch": 1.275730622617535, + "grad_norm": 1.2319709711845521, + "learning_rate": 1.5616485698107795e-05, + "loss": 1.9321, + "step": 1004 + }, + { + "epoch": 1.2770012706480305, + "grad_norm": 1.304500718931566, + "learning_rate": 1.5608119644744094e-05, + "loss": 1.8073, + "step": 1005 + }, + { + "epoch": 1.278271918678526, + "grad_norm": 1.4587969118807587, + "learning_rate": 1.5599747861375957e-05, + "loss": 1.8367, + "step": 1006 + }, + { + "epoch": 1.2795425667090217, + "grad_norm": 2.670844257464096, + "learning_rate": 1.559137035655711e-05, + "loss": 1.9432, + "step": 1007 + }, + { + "epoch": 1.280813214739517, + "grad_norm": 1.391990361869969, + "learning_rate": 1.558298713884713e-05, + "loss": 1.8169, + "step": 1008 + }, + { + "epoch": 1.2820838627700126, + "grad_norm": 1.4730317876459385, + "learning_rate": 1.557459821681144e-05, + "loss": 1.9452, + "step": 1009 + }, + { + "epoch": 1.2833545108005082, + "grad_norm": 1.2810624692796522, + "learning_rate": 1.5566203599021275e-05, + "loss": 1.9344, + "step": 1010 + }, + { + "epoch": 1.2846251588310038, + "grad_norm": 1.3035625667895672, + "learning_rate": 1.5557803294053705e-05, + "loss": 1.8236, + "step": 1011 + }, + { + "epoch": 1.2858958068614994, + "grad_norm": 1.3803171198408561, + "learning_rate": 1.5549397310491605e-05, + "loss": 1.8784, + "step": 1012 + }, + { + "epoch": 1.287166454891995, + "grad_norm": 1.5491986721199595, + "learning_rate": 1.5540985656923648e-05, + "loss": 1.9153, + "step": 1013 + }, + { + "epoch": 1.2884371029224906, + "grad_norm": 1.4653474934180506, + "learning_rate": 1.55325683419443e-05, + "loss": 1.7483, + "step": 1014 + }, + { + "epoch": 1.289707750952986, + "grad_norm": 1.2889672514437487, + "learning_rate": 1.5524145374153822e-05, + "loss": 1.6976, + "step": 1015 + }, + { + "epoch": 1.2909783989834815, + "grad_norm": 1.2494387273702594, + "learning_rate": 1.5515716762158235e-05, + "loss": 1.9428, + "step": 1016 + }, + { + "epoch": 1.2922490470139771, + "grad_norm": 1.2519364350061895, + "learning_rate": 1.5507282514569345e-05, + "loss": 1.8044, + "step": 1017 + }, + { + "epoch": 1.2935196950444727, + "grad_norm": 1.1327543441072128, + "learning_rate": 1.5498842640004698e-05, + "loss": 1.8495, + "step": 1018 + }, + { + "epoch": 1.2947903430749683, + "grad_norm": 1.3215774506253692, + "learning_rate": 1.54903971470876e-05, + "loss": 1.7701, + "step": 1019 + }, + { + "epoch": 1.2960609911054637, + "grad_norm": 1.2963665937170432, + "learning_rate": 1.54819460444471e-05, + "loss": 1.8535, + "step": 1020 + }, + { + "epoch": 1.2973316391359593, + "grad_norm": 1.2794249882878206, + "learning_rate": 1.547348934071797e-05, + "loss": 1.8859, + "step": 1021 + }, + { + "epoch": 1.2986022871664549, + "grad_norm": 1.297699172544122, + "learning_rate": 1.5465027044540705e-05, + "loss": 1.9074, + "step": 1022 + }, + { + "epoch": 1.2998729351969505, + "grad_norm": 1.3122435056880466, + "learning_rate": 1.5456559164561522e-05, + "loss": 1.813, + "step": 1023 + }, + { + "epoch": 1.301143583227446, + "grad_norm": 1.3336015416831544, + "learning_rate": 1.5448085709432338e-05, + "loss": 1.8336, + "step": 1024 + }, + { + "epoch": 1.3024142312579414, + "grad_norm": 1.3902805693497766, + "learning_rate": 1.5439606687810767e-05, + "loss": 1.7061, + "step": 1025 + }, + { + "epoch": 1.3036848792884372, + "grad_norm": 1.3184469533981749, + "learning_rate": 1.5431122108360114e-05, + "loss": 1.8433, + "step": 1026 + }, + { + "epoch": 1.3049555273189326, + "grad_norm": 1.288658892407689, + "learning_rate": 1.5422631979749354e-05, + "loss": 1.9737, + "step": 1027 + }, + { + "epoch": 1.3062261753494282, + "grad_norm": 1.2678604780793599, + "learning_rate": 1.5414136310653135e-05, + "loss": 1.7707, + "step": 1028 + }, + { + "epoch": 1.3074968233799238, + "grad_norm": 1.2796312807674892, + "learning_rate": 1.5405635109751776e-05, + "loss": 1.8034, + "step": 1029 + }, + { + "epoch": 1.3087674714104194, + "grad_norm": 1.3113897898932672, + "learning_rate": 1.5397128385731234e-05, + "loss": 1.9589, + "step": 1030 + }, + { + "epoch": 1.310038119440915, + "grad_norm": 1.3528783655661367, + "learning_rate": 1.5388616147283116e-05, + "loss": 1.9354, + "step": 1031 + }, + { + "epoch": 1.3113087674714103, + "grad_norm": 1.2717937721137405, + "learning_rate": 1.538009840310466e-05, + "loss": 1.6102, + "step": 1032 + }, + { + "epoch": 1.312579415501906, + "grad_norm": 1.4365578486864976, + "learning_rate": 1.537157516189874e-05, + "loss": 1.6837, + "step": 1033 + }, + { + "epoch": 1.3138500635324015, + "grad_norm": 1.210272849766323, + "learning_rate": 1.5363046432373824e-05, + "loss": 1.6498, + "step": 1034 + }, + { + "epoch": 1.3151207115628971, + "grad_norm": 1.22942762191297, + "learning_rate": 1.5354512223244017e-05, + "loss": 1.9632, + "step": 1035 + }, + { + "epoch": 1.3163913595933927, + "grad_norm": 1.4829589605126763, + "learning_rate": 1.5345972543229e-05, + "loss": 1.7339, + "step": 1036 + }, + { + "epoch": 1.317662007623888, + "grad_norm": 1.1986964294583307, + "learning_rate": 1.533742740105405e-05, + "loss": 1.7729, + "step": 1037 + }, + { + "epoch": 1.3189326556543837, + "grad_norm": 1.3595188028232712, + "learning_rate": 1.532887680545003e-05, + "loss": 1.7879, + "step": 1038 + }, + { + "epoch": 1.3202033036848793, + "grad_norm": 1.3754660533164054, + "learning_rate": 1.5320320765153367e-05, + "loss": 1.8581, + "step": 1039 + }, + { + "epoch": 1.3214739517153749, + "grad_norm": 1.3128095568502105, + "learning_rate": 1.5311759288906058e-05, + "loss": 1.9147, + "step": 1040 + }, + { + "epoch": 1.3227445997458704, + "grad_norm": 1.2909869827817089, + "learning_rate": 1.5303192385455652e-05, + "loss": 1.9877, + "step": 1041 + }, + { + "epoch": 1.3240152477763658, + "grad_norm": 1.276703618063512, + "learning_rate": 1.529462006355524e-05, + "loss": 1.8402, + "step": 1042 + }, + { + "epoch": 1.3252858958068616, + "grad_norm": 1.3230638250924585, + "learning_rate": 1.528604233196345e-05, + "loss": 1.9794, + "step": 1043 + }, + { + "epoch": 1.326556543837357, + "grad_norm": 1.4336921448073396, + "learning_rate": 1.5277459199444443e-05, + "loss": 1.9894, + "step": 1044 + }, + { + "epoch": 1.3278271918678526, + "grad_norm": 1.7737486792928339, + "learning_rate": 1.5268870674767896e-05, + "loss": 1.8742, + "step": 1045 + }, + { + "epoch": 1.3290978398983482, + "grad_norm": 1.4078166067251785, + "learning_rate": 1.5260276766708984e-05, + "loss": 2.0651, + "step": 1046 + }, + { + "epoch": 1.3303684879288438, + "grad_norm": 1.3665398516482419, + "learning_rate": 1.52516774840484e-05, + "loss": 2.0082, + "step": 1047 + }, + { + "epoch": 1.3316391359593394, + "grad_norm": 1.2660456306800727, + "learning_rate": 1.5243072835572319e-05, + "loss": 2.0152, + "step": 1048 + }, + { + "epoch": 1.3329097839898347, + "grad_norm": 1.4400191027082891, + "learning_rate": 1.5234462830072399e-05, + "loss": 1.8645, + "step": 1049 + }, + { + "epoch": 1.3341804320203303, + "grad_norm": 1.2665509048717487, + "learning_rate": 1.522584747634577e-05, + "loss": 1.8708, + "step": 1050 + }, + { + "epoch": 1.335451080050826, + "grad_norm": 1.260635175489598, + "learning_rate": 1.5217226783195029e-05, + "loss": 1.8302, + "step": 1051 + }, + { + "epoch": 1.3367217280813215, + "grad_norm": 1.2921682665430776, + "learning_rate": 1.5208600759428233e-05, + "loss": 1.8964, + "step": 1052 + }, + { + "epoch": 1.337992376111817, + "grad_norm": 1.1808646501636042, + "learning_rate": 1.5199969413858877e-05, + "loss": 1.8027, + "step": 1053 + }, + { + "epoch": 1.3392630241423125, + "grad_norm": 1.2375925259174092, + "learning_rate": 1.5191332755305897e-05, + "loss": 1.9994, + "step": 1054 + }, + { + "epoch": 1.340533672172808, + "grad_norm": 1.3342416016936127, + "learning_rate": 1.5182690792593659e-05, + "loss": 1.8661, + "step": 1055 + }, + { + "epoch": 1.3418043202033036, + "grad_norm": 1.1515642847973244, + "learning_rate": 1.517404353455194e-05, + "loss": 1.7801, + "step": 1056 + }, + { + "epoch": 1.3430749682337992, + "grad_norm": 1.1481938432677656, + "learning_rate": 1.5165390990015947e-05, + "loss": 1.708, + "step": 1057 + }, + { + "epoch": 1.3443456162642948, + "grad_norm": 1.2206968911997678, + "learning_rate": 1.5156733167826265e-05, + "loss": 1.8242, + "step": 1058 + }, + { + "epoch": 1.3456162642947904, + "grad_norm": 1.3879378412492531, + "learning_rate": 1.5148070076828885e-05, + "loss": 2.0539, + "step": 1059 + }, + { + "epoch": 1.346886912325286, + "grad_norm": 1.278359228270404, + "learning_rate": 1.513940172587518e-05, + "loss": 1.7368, + "step": 1060 + }, + { + "epoch": 1.3481575603557814, + "grad_norm": 1.2881471856222333, + "learning_rate": 1.5130728123821898e-05, + "loss": 1.8111, + "step": 1061 + }, + { + "epoch": 1.349428208386277, + "grad_norm": 1.3135644893277756, + "learning_rate": 1.5122049279531143e-05, + "loss": 1.8608, + "step": 1062 + }, + { + "epoch": 1.3506988564167726, + "grad_norm": 1.2408858052927654, + "learning_rate": 1.5113365201870388e-05, + "loss": 1.9712, + "step": 1063 + }, + { + "epoch": 1.3519695044472682, + "grad_norm": 1.4792483560642713, + "learning_rate": 1.5104675899712447e-05, + "loss": 1.8097, + "step": 1064 + }, + { + "epoch": 1.3532401524777637, + "grad_norm": 1.7074901287628397, + "learning_rate": 1.5095981381935468e-05, + "loss": 2.0828, + "step": 1065 + }, + { + "epoch": 1.3545108005082591, + "grad_norm": 1.4288502131269385, + "learning_rate": 1.5087281657422935e-05, + "loss": 1.8427, + "step": 1066 + }, + { + "epoch": 1.3557814485387547, + "grad_norm": 1.4385655658495924, + "learning_rate": 1.5078576735063646e-05, + "loss": 1.5808, + "step": 1067 + }, + { + "epoch": 1.3570520965692503, + "grad_norm": 1.285975158006785, + "learning_rate": 1.5069866623751718e-05, + "loss": 1.8827, + "step": 1068 + }, + { + "epoch": 1.358322744599746, + "grad_norm": 1.3164134269798404, + "learning_rate": 1.5061151332386565e-05, + "loss": 1.9938, + "step": 1069 + }, + { + "epoch": 1.3595933926302415, + "grad_norm": 1.4052341123659684, + "learning_rate": 1.5052430869872888e-05, + "loss": 1.9635, + "step": 1070 + }, + { + "epoch": 1.3608640406607369, + "grad_norm": 1.4081093932218003, + "learning_rate": 1.504370524512068e-05, + "loss": 1.8709, + "step": 1071 + }, + { + "epoch": 1.3621346886912327, + "grad_norm": 1.2520645518775244, + "learning_rate": 1.50349744670452e-05, + "loss": 1.7629, + "step": 1072 + }, + { + "epoch": 1.363405336721728, + "grad_norm": 1.4543344242633094, + "learning_rate": 1.5026238544566986e-05, + "loss": 2.0301, + "step": 1073 + }, + { + "epoch": 1.3646759847522236, + "grad_norm": 1.3253506262354477, + "learning_rate": 1.501749748661182e-05, + "loss": 1.8146, + "step": 1074 + }, + { + "epoch": 1.3659466327827192, + "grad_norm": 1.5603060746011057, + "learning_rate": 1.5008751302110738e-05, + "loss": 1.8916, + "step": 1075 + }, + { + "epoch": 1.3672172808132148, + "grad_norm": 1.2963408122032392, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.6603, + "step": 1076 + }, + { + "epoch": 1.3684879288437104, + "grad_norm": 29.216318101427753, + "learning_rate": 1.4991243589221118e-05, + "loss": 1.8385, + "step": 1077 + }, + { + "epoch": 1.3697585768742058, + "grad_norm": 1.281414727075171, + "learning_rate": 1.4982482078720808e-05, + "loss": 1.605, + "step": 1078 + }, + { + "epoch": 1.3710292249047014, + "grad_norm": 1.4359392726360947, + "learning_rate": 1.4973715477450996e-05, + "loss": 1.8942, + "step": 1079 + }, + { + "epoch": 1.372299872935197, + "grad_norm": 1.3770086129527501, + "learning_rate": 1.4964943794368815e-05, + "loss": 1.8196, + "step": 1080 + }, + { + "epoch": 1.3735705209656925, + "grad_norm": 1.1848449267963983, + "learning_rate": 1.4956167038436594e-05, + "loss": 1.8403, + "step": 1081 + }, + { + "epoch": 1.3748411689961881, + "grad_norm": 1.271521238883279, + "learning_rate": 1.4947385218621832e-05, + "loss": 1.8443, + "step": 1082 + }, + { + "epoch": 1.3761118170266835, + "grad_norm": 1.1738548733510952, + "learning_rate": 1.4938598343897215e-05, + "loss": 1.7652, + "step": 1083 + }, + { + "epoch": 1.377382465057179, + "grad_norm": 1.4836826510060177, + "learning_rate": 1.4929806423240582e-05, + "loss": 1.8507, + "step": 1084 + }, + { + "epoch": 1.3786531130876747, + "grad_norm": 1.358512949766454, + "learning_rate": 1.4921009465634941e-05, + "loss": 2.0604, + "step": 1085 + }, + { + "epoch": 1.3799237611181703, + "grad_norm": 1.2784656221631014, + "learning_rate": 1.4912207480068437e-05, + "loss": 2.0367, + "step": 1086 + }, + { + "epoch": 1.3811944091486659, + "grad_norm": 1.4821834722243332, + "learning_rate": 1.4903400475534355e-05, + "loss": 1.8966, + "step": 1087 + }, + { + "epoch": 1.3824650571791612, + "grad_norm": 1.3037716958740244, + "learning_rate": 1.4894588461031107e-05, + "loss": 1.9964, + "step": 1088 + }, + { + "epoch": 1.383735705209657, + "grad_norm": 1.2493808686091994, + "learning_rate": 1.4885771445562225e-05, + "loss": 1.9861, + "step": 1089 + }, + { + "epoch": 1.3850063532401524, + "grad_norm": 1.3612017402957912, + "learning_rate": 1.4876949438136348e-05, + "loss": 2.021, + "step": 1090 + }, + { + "epoch": 1.386277001270648, + "grad_norm": 1.1468990675471178, + "learning_rate": 1.486812244776722e-05, + "loss": 1.6981, + "step": 1091 + }, + { + "epoch": 1.3875476493011436, + "grad_norm": 1.2227722381131463, + "learning_rate": 1.4859290483473671e-05, + "loss": 1.7964, + "step": 1092 + }, + { + "epoch": 1.3888182973316392, + "grad_norm": 1.1223438081355355, + "learning_rate": 1.4850453554279622e-05, + "loss": 1.835, + "step": 1093 + }, + { + "epoch": 1.3900889453621348, + "grad_norm": 1.2681547197998326, + "learning_rate": 1.4841611669214056e-05, + "loss": 2.0526, + "step": 1094 + }, + { + "epoch": 1.3913595933926302, + "grad_norm": 1.2400725181931955, + "learning_rate": 1.4832764837311026e-05, + "loss": 1.9443, + "step": 1095 + }, + { + "epoch": 1.3926302414231257, + "grad_norm": 1.2268627779535064, + "learning_rate": 1.4823913067609639e-05, + "loss": 1.9066, + "step": 1096 + }, + { + "epoch": 1.3939008894536213, + "grad_norm": 1.3479500429031042, + "learning_rate": 1.4815056369154039e-05, + "loss": 1.7896, + "step": 1097 + }, + { + "epoch": 1.395171537484117, + "grad_norm": 1.1534860117987928, + "learning_rate": 1.4806194750993422e-05, + "loss": 1.8786, + "step": 1098 + }, + { + "epoch": 1.3964421855146125, + "grad_norm": 1.1962912803044135, + "learning_rate": 1.4797328222181995e-05, + "loss": 1.9501, + "step": 1099 + }, + { + "epoch": 1.397712833545108, + "grad_norm": 1.3054107727293938, + "learning_rate": 1.478845679177899e-05, + "loss": 2.0076, + "step": 1100 + }, + { + "epoch": 1.3989834815756035, + "grad_norm": 1.3366491474516522, + "learning_rate": 1.4779580468848647e-05, + "loss": 1.8628, + "step": 1101 + }, + { + "epoch": 1.400254129606099, + "grad_norm": 1.1245658180174034, + "learning_rate": 1.4770699262460204e-05, + "loss": 1.6988, + "step": 1102 + }, + { + "epoch": 1.4015247776365947, + "grad_norm": 1.2951297055619024, + "learning_rate": 1.4761813181687885e-05, + "loss": 1.7491, + "step": 1103 + }, + { + "epoch": 1.4027954256670903, + "grad_norm": 1.3722443895167817, + "learning_rate": 1.47529222356109e-05, + "loss": 2.1425, + "step": 1104 + }, + { + "epoch": 1.4040660736975858, + "grad_norm": 1.4854506056127954, + "learning_rate": 1.474402643331343e-05, + "loss": 1.8417, + "step": 1105 + }, + { + "epoch": 1.4053367217280814, + "grad_norm": 1.48203680056898, + "learning_rate": 1.4735125783884609e-05, + "loss": 1.8387, + "step": 1106 + }, + { + "epoch": 1.4066073697585768, + "grad_norm": 1.1523601582917917, + "learning_rate": 1.4726220296418536e-05, + "loss": 1.8076, + "step": 1107 + }, + { + "epoch": 1.4078780177890724, + "grad_norm": 1.220791922803522, + "learning_rate": 1.4717309980014245e-05, + "loss": 1.8264, + "step": 1108 + }, + { + "epoch": 1.409148665819568, + "grad_norm": 1.4844350959579224, + "learning_rate": 1.4708394843775704e-05, + "loss": 1.8089, + "step": 1109 + }, + { + "epoch": 1.4104193138500636, + "grad_norm": 1.3676963543110698, + "learning_rate": 1.4699474896811809e-05, + "loss": 1.8093, + "step": 1110 + }, + { + "epoch": 1.4116899618805592, + "grad_norm": 1.390402133401193, + "learning_rate": 1.4690550148236371e-05, + "loss": 1.6331, + "step": 1111 + }, + { + "epoch": 1.4129606099110545, + "grad_norm": 1.213969912031899, + "learning_rate": 1.4681620607168104e-05, + "loss": 1.7546, + "step": 1112 + }, + { + "epoch": 1.4142312579415501, + "grad_norm": 1.2301054744197815, + "learning_rate": 1.4672686282730622e-05, + "loss": 1.8846, + "step": 1113 + }, + { + "epoch": 1.4155019059720457, + "grad_norm": 1.322173497693607, + "learning_rate": 1.4663747184052425e-05, + "loss": 2.0186, + "step": 1114 + }, + { + "epoch": 1.4167725540025413, + "grad_norm": 1.372166178996649, + "learning_rate": 1.4654803320266883e-05, + "loss": 1.8133, + "step": 1115 + }, + { + "epoch": 1.418043202033037, + "grad_norm": 1.355472281521608, + "learning_rate": 1.4645854700512254e-05, + "loss": 1.7767, + "step": 1116 + }, + { + "epoch": 1.4193138500635323, + "grad_norm": 1.1852757912386718, + "learning_rate": 1.463690133393164e-05, + "loss": 1.9112, + "step": 1117 + }, + { + "epoch": 1.420584498094028, + "grad_norm": 1.4821403273379334, + "learning_rate": 1.4627943229672992e-05, + "loss": 1.8262, + "step": 1118 + }, + { + "epoch": 1.4218551461245235, + "grad_norm": 1.2697543939806661, + "learning_rate": 1.461898039688911e-05, + "loss": 2.0448, + "step": 1119 + }, + { + "epoch": 1.423125794155019, + "grad_norm": 1.274206462270112, + "learning_rate": 1.4610012844737622e-05, + "loss": 1.7997, + "step": 1120 + }, + { + "epoch": 1.4243964421855146, + "grad_norm": 1.3575689355994345, + "learning_rate": 1.4601040582380976e-05, + "loss": 2.0597, + "step": 1121 + }, + { + "epoch": 1.4256670902160102, + "grad_norm": 2.138323377849098, + "learning_rate": 1.4592063618986439e-05, + "loss": 1.7497, + "step": 1122 + }, + { + "epoch": 1.4269377382465058, + "grad_norm": 1.4614623364414636, + "learning_rate": 1.4583081963726068e-05, + "loss": 1.7695, + "step": 1123 + }, + { + "epoch": 1.4282083862770012, + "grad_norm": 1.3726036998687277, + "learning_rate": 1.457409562577673e-05, + "loss": 1.8282, + "step": 1124 + }, + { + "epoch": 1.4294790343074968, + "grad_norm": 1.297862988769582, + "learning_rate": 1.4565104614320065e-05, + "loss": 1.7631, + "step": 1125 + }, + { + "epoch": 1.4307496823379924, + "grad_norm": 1.2157552056387835, + "learning_rate": 1.455610893854249e-05, + "loss": 1.8712, + "step": 1126 + }, + { + "epoch": 1.432020330368488, + "grad_norm": 1.225192364749645, + "learning_rate": 1.4547108607635194e-05, + "loss": 2.0084, + "step": 1127 + }, + { + "epoch": 1.4332909783989836, + "grad_norm": 1.4469029193769432, + "learning_rate": 1.4538103630794117e-05, + "loss": 1.8373, + "step": 1128 + }, + { + "epoch": 1.434561626429479, + "grad_norm": 1.6695257574356568, + "learning_rate": 1.452909401721994e-05, + "loss": 2.0349, + "step": 1129 + }, + { + "epoch": 1.4358322744599745, + "grad_norm": 1.3172507478374025, + "learning_rate": 1.45200797761181e-05, + "loss": 1.9316, + "step": 1130 + }, + { + "epoch": 1.4371029224904701, + "grad_norm": 1.2623048899621796, + "learning_rate": 1.4511060916698739e-05, + "loss": 1.4695, + "step": 1131 + }, + { + "epoch": 1.4383735705209657, + "grad_norm": 1.7724378270582546, + "learning_rate": 1.4502037448176734e-05, + "loss": 2.028, + "step": 1132 + }, + { + "epoch": 1.4396442185514613, + "grad_norm": 1.2047296714294544, + "learning_rate": 1.4493009379771667e-05, + "loss": 1.8363, + "step": 1133 + }, + { + "epoch": 1.4409148665819567, + "grad_norm": 1.3014621811645635, + "learning_rate": 1.4483976720707817e-05, + "loss": 1.7993, + "step": 1134 + }, + { + "epoch": 1.4421855146124525, + "grad_norm": 1.312615920278082, + "learning_rate": 1.4474939480214156e-05, + "loss": 1.8531, + "step": 1135 + }, + { + "epoch": 1.4434561626429478, + "grad_norm": 1.3840101493642059, + "learning_rate": 1.446589766752434e-05, + "loss": 2.0521, + "step": 1136 + }, + { + "epoch": 1.4447268106734434, + "grad_norm": 1.3006679273166455, + "learning_rate": 1.4456851291876688e-05, + "loss": 1.882, + "step": 1137 + }, + { + "epoch": 1.445997458703939, + "grad_norm": 1.376007968190896, + "learning_rate": 1.4447800362514188e-05, + "loss": 2.1259, + "step": 1138 + }, + { + "epoch": 1.4472681067344346, + "grad_norm": 1.5744017178587613, + "learning_rate": 1.4438744888684481e-05, + "loss": 2.036, + "step": 1139 + }, + { + "epoch": 1.4485387547649302, + "grad_norm": 1.0898107837156374, + "learning_rate": 1.4429684879639848e-05, + "loss": 2.0197, + "step": 1140 + }, + { + "epoch": 1.4498094027954256, + "grad_norm": 1.4478727647608343, + "learning_rate": 1.44206203446372e-05, + "loss": 1.8264, + "step": 1141 + }, + { + "epoch": 1.4510800508259212, + "grad_norm": 1.2597561559788928, + "learning_rate": 1.4411551292938087e-05, + "loss": 1.5697, + "step": 1142 + }, + { + "epoch": 1.4523506988564168, + "grad_norm": 1.2427984789425064, + "learning_rate": 1.4402477733808656e-05, + "loss": 2.0338, + "step": 1143 + }, + { + "epoch": 1.4536213468869124, + "grad_norm": 1.2574876773229697, + "learning_rate": 1.4393399676519668e-05, + "loss": 1.616, + "step": 1144 + }, + { + "epoch": 1.454891994917408, + "grad_norm": 1.3019702996517974, + "learning_rate": 1.4384317130346484e-05, + "loss": 2.0132, + "step": 1145 + }, + { + "epoch": 1.4561626429479033, + "grad_norm": 1.2706570967585542, + "learning_rate": 1.4375230104569044e-05, + "loss": 1.9072, + "step": 1146 + }, + { + "epoch": 1.457433290978399, + "grad_norm": 1.369054262158506, + "learning_rate": 1.436613860847187e-05, + "loss": 1.8305, + "step": 1147 + }, + { + "epoch": 1.4587039390088945, + "grad_norm": 1.2389341173958066, + "learning_rate": 1.4357042651344047e-05, + "loss": 1.8479, + "step": 1148 + }, + { + "epoch": 1.45997458703939, + "grad_norm": 1.2176996548775592, + "learning_rate": 1.4347942242479217e-05, + "loss": 1.9907, + "step": 1149 + }, + { + "epoch": 1.4612452350698857, + "grad_norm": 1.3531336555921543, + "learning_rate": 1.4338837391175582e-05, + "loss": 1.9116, + "step": 1150 + }, + { + "epoch": 1.4625158831003813, + "grad_norm": 1.2600570697403215, + "learning_rate": 1.432972810673587e-05, + "loss": 1.8925, + "step": 1151 + }, + { + "epoch": 1.4637865311308769, + "grad_norm": 1.2661028151416078, + "learning_rate": 1.4320614398467342e-05, + "loss": 1.8575, + "step": 1152 + }, + { + "epoch": 1.4650571791613722, + "grad_norm": 1.3397535457153715, + "learning_rate": 1.4311496275681785e-05, + "loss": 2.005, + "step": 1153 + }, + { + "epoch": 1.4663278271918678, + "grad_norm": 1.2929579386821513, + "learning_rate": 1.4302373747695488e-05, + "loss": 2.0029, + "step": 1154 + }, + { + "epoch": 1.4675984752223634, + "grad_norm": 1.246952940681954, + "learning_rate": 1.4293246823829242e-05, + "loss": 1.8672, + "step": 1155 + }, + { + "epoch": 1.468869123252859, + "grad_norm": 1.23098474049904, + "learning_rate": 1.4284115513408337e-05, + "loss": 1.8379, + "step": 1156 + }, + { + "epoch": 1.4701397712833546, + "grad_norm": 1.1801976280115345, + "learning_rate": 1.4274979825762541e-05, + "loss": 1.7649, + "step": 1157 + }, + { + "epoch": 1.47141041931385, + "grad_norm": 1.5075282662210534, + "learning_rate": 1.4265839770226087e-05, + "loss": 1.794, + "step": 1158 + }, + { + "epoch": 1.4726810673443456, + "grad_norm": 1.2562664740051244, + "learning_rate": 1.4256695356137683e-05, + "loss": 1.6773, + "step": 1159 + }, + { + "epoch": 1.4739517153748412, + "grad_norm": 1.3319817163409906, + "learning_rate": 1.424754659284048e-05, + "loss": 1.8983, + "step": 1160 + }, + { + "epoch": 1.4752223634053367, + "grad_norm": 1.3009487967372897, + "learning_rate": 1.4238393489682078e-05, + "loss": 1.4876, + "step": 1161 + }, + { + "epoch": 1.4764930114358323, + "grad_norm": 1.4381420793797635, + "learning_rate": 1.4229236056014517e-05, + "loss": 2.1053, + "step": 1162 + }, + { + "epoch": 1.4777636594663277, + "grad_norm": 1.4382466575045552, + "learning_rate": 1.4220074301194244e-05, + "loss": 1.6265, + "step": 1163 + }, + { + "epoch": 1.4790343074968233, + "grad_norm": 1.3299729844702393, + "learning_rate": 1.4210908234582141e-05, + "loss": 1.8122, + "step": 1164 + }, + { + "epoch": 1.4803049555273189, + "grad_norm": 1.498254150158053, + "learning_rate": 1.4201737865543481e-05, + "loss": 2.1078, + "step": 1165 + }, + { + "epoch": 1.4815756035578145, + "grad_norm": 1.2037816133533588, + "learning_rate": 1.4192563203447941e-05, + "loss": 1.975, + "step": 1166 + }, + { + "epoch": 1.48284625158831, + "grad_norm": 1.0785324145030633, + "learning_rate": 1.418338425766958e-05, + "loss": 1.4234, + "step": 1167 + }, + { + "epoch": 1.4841168996188057, + "grad_norm": 1.1533361400572302, + "learning_rate": 1.4174201037586841e-05, + "loss": 1.85, + "step": 1168 + }, + { + "epoch": 1.4853875476493013, + "grad_norm": 1.188900432669104, + "learning_rate": 1.416501355258252e-05, + "loss": 1.8593, + "step": 1169 + }, + { + "epoch": 1.4866581956797966, + "grad_norm": 1.3221956875383332, + "learning_rate": 1.4155821812043787e-05, + "loss": 2.0455, + "step": 1170 + }, + { + "epoch": 1.4879288437102922, + "grad_norm": 1.2783010187174453, + "learning_rate": 1.4146625825362147e-05, + "loss": 1.8903, + "step": 1171 + }, + { + "epoch": 1.4891994917407878, + "grad_norm": 1.203190732721331, + "learning_rate": 1.4137425601933457e-05, + "loss": 2.0258, + "step": 1172 + }, + { + "epoch": 1.4904701397712834, + "grad_norm": 1.464676328426507, + "learning_rate": 1.4128221151157882e-05, + "loss": 2.0596, + "step": 1173 + }, + { + "epoch": 1.491740787801779, + "grad_norm": 10.1925542723116, + "learning_rate": 1.4119012482439929e-05, + "loss": 1.6295, + "step": 1174 + }, + { + "epoch": 1.4930114358322744, + "grad_norm": 1.3805107030458463, + "learning_rate": 1.41097996051884e-05, + "loss": 1.9646, + "step": 1175 + }, + { + "epoch": 1.49428208386277, + "grad_norm": 1.5218594454266174, + "learning_rate": 1.4100582528816404e-05, + "loss": 1.7224, + "step": 1176 + }, + { + "epoch": 1.4955527318932655, + "grad_norm": 1.386172494199785, + "learning_rate": 1.4091361262741337e-05, + "loss": 1.9983, + "step": 1177 + }, + { + "epoch": 1.4968233799237611, + "grad_norm": 1.2242855036178195, + "learning_rate": 1.4082135816384877e-05, + "loss": 2.078, + "step": 1178 + }, + { + "epoch": 1.4980940279542567, + "grad_norm": 1.2570037053181151, + "learning_rate": 1.4072906199172969e-05, + "loss": 1.7809, + "step": 1179 + }, + { + "epoch": 1.499364675984752, + "grad_norm": 1.4758796109715635, + "learning_rate": 1.406367242053583e-05, + "loss": 1.8389, + "step": 1180 + }, + { + "epoch": 1.500635324015248, + "grad_norm": 1.395166479758458, + "learning_rate": 1.4054434489907916e-05, + "loss": 1.7727, + "step": 1181 + }, + { + "epoch": 1.5019059720457433, + "grad_norm": 1.4118358945476996, + "learning_rate": 1.4045192416727937e-05, + "loss": 1.7145, + "step": 1182 + }, + { + "epoch": 1.5031766200762389, + "grad_norm": 1.2784623059267264, + "learning_rate": 1.4035946210438827e-05, + "loss": 2.0701, + "step": 1183 + }, + { + "epoch": 1.5044472681067345, + "grad_norm": 1.4109413545265437, + "learning_rate": 1.4026695880487744e-05, + "loss": 1.984, + "step": 1184 + }, + { + "epoch": 1.5057179161372298, + "grad_norm": 1.2849466440320692, + "learning_rate": 1.4017441436326063e-05, + "loss": 1.8814, + "step": 1185 + }, + { + "epoch": 1.5069885641677256, + "grad_norm": 1.4766606508276032, + "learning_rate": 1.4008182887409363e-05, + "loss": 2.0233, + "step": 1186 + }, + { + "epoch": 1.508259212198221, + "grad_norm": 1.463036495890669, + "learning_rate": 1.3998920243197408e-05, + "loss": 2.0958, + "step": 1187 + }, + { + "epoch": 1.5095298602287166, + "grad_norm": 1.384808363410981, + "learning_rate": 1.3989653513154165e-05, + "loss": 1.7643, + "step": 1188 + }, + { + "epoch": 1.5108005082592122, + "grad_norm": 1.4283450082814748, + "learning_rate": 1.3980382706747752e-05, + "loss": 2.1955, + "step": 1189 + }, + { + "epoch": 1.5120711562897078, + "grad_norm": 1.2690160617888337, + "learning_rate": 1.397110783345047e-05, + "loss": 1.7904, + "step": 1190 + }, + { + "epoch": 1.5133418043202034, + "grad_norm": 1.285380785106247, + "learning_rate": 1.3961828902738768e-05, + "loss": 1.8982, + "step": 1191 + }, + { + "epoch": 1.5146124523506987, + "grad_norm": 1.4046160448580636, + "learning_rate": 1.3952545924093239e-05, + "loss": 1.9344, + "step": 1192 + }, + { + "epoch": 1.5158831003811946, + "grad_norm": 1.217728050346088, + "learning_rate": 1.3943258906998615e-05, + "loss": 1.9526, + "step": 1193 + }, + { + "epoch": 1.51715374841169, + "grad_norm": 1.3155374681124927, + "learning_rate": 1.393396786094376e-05, + "loss": 1.9693, + "step": 1194 + }, + { + "epoch": 1.5184243964421855, + "grad_norm": 1.2375353252029027, + "learning_rate": 1.3924672795421638e-05, + "loss": 1.9235, + "step": 1195 + }, + { + "epoch": 1.5196950444726811, + "grad_norm": 1.196859858514443, + "learning_rate": 1.391537371992934e-05, + "loss": 1.9201, + "step": 1196 + }, + { + "epoch": 1.5209656925031765, + "grad_norm": 1.5229479940500787, + "learning_rate": 1.3906070643968035e-05, + "loss": 1.8756, + "step": 1197 + }, + { + "epoch": 1.5222363405336723, + "grad_norm": 1.2629206956366628, + "learning_rate": 1.3896763577042995e-05, + "loss": 1.9048, + "step": 1198 + }, + { + "epoch": 1.5235069885641677, + "grad_norm": 1.7258718732318938, + "learning_rate": 1.3887452528663558e-05, + "loss": 2.0766, + "step": 1199 + }, + { + "epoch": 1.5247776365946633, + "grad_norm": 1.4627023384109283, + "learning_rate": 1.3878137508343143e-05, + "loss": 1.9649, + "step": 1200 + }, + { + "epoch": 1.5260482846251588, + "grad_norm": 1.222588419713132, + "learning_rate": 1.3868818525599215e-05, + "loss": 2.0395, + "step": 1201 + }, + { + "epoch": 1.5273189326556544, + "grad_norm": 1.349113525510833, + "learning_rate": 1.3859495589953289e-05, + "loss": 1.8303, + "step": 1202 + }, + { + "epoch": 1.52858958068615, + "grad_norm": 2.981849494497899, + "learning_rate": 1.3850168710930927e-05, + "loss": 1.9033, + "step": 1203 + }, + { + "epoch": 1.5298602287166454, + "grad_norm": 1.3647918261639207, + "learning_rate": 1.3840837898061711e-05, + "loss": 1.6911, + "step": 1204 + }, + { + "epoch": 1.531130876747141, + "grad_norm": 1.2208582908937131, + "learning_rate": 1.3831503160879249e-05, + "loss": 1.6361, + "step": 1205 + }, + { + "epoch": 1.5324015247776366, + "grad_norm": 1.2583667151308502, + "learning_rate": 1.3822164508921157e-05, + "loss": 1.7389, + "step": 1206 + }, + { + "epoch": 1.5336721728081322, + "grad_norm": 1.4805538938186564, + "learning_rate": 1.3812821951729044e-05, + "loss": 1.7825, + "step": 1207 + }, + { + "epoch": 1.5349428208386278, + "grad_norm": 1.3870334698484725, + "learning_rate": 1.3803475498848522e-05, + "loss": 1.8398, + "step": 1208 + }, + { + "epoch": 1.5362134688691231, + "grad_norm": 1.3706103550947548, + "learning_rate": 1.3794125159829173e-05, + "loss": 1.93, + "step": 1209 + }, + { + "epoch": 1.537484116899619, + "grad_norm": 1.5719797919516998, + "learning_rate": 1.378477094422455e-05, + "loss": 1.8066, + "step": 1210 + }, + { + "epoch": 1.5387547649301143, + "grad_norm": 1.4245514066888003, + "learning_rate": 1.3775412861592175e-05, + "loss": 1.91, + "step": 1211 + }, + { + "epoch": 1.54002541296061, + "grad_norm": 1.5978689392490961, + "learning_rate": 1.3766050921493513e-05, + "loss": 1.9467, + "step": 1212 + }, + { + "epoch": 1.5412960609911055, + "grad_norm": 1.147772260129036, + "learning_rate": 1.375668513349397e-05, + "loss": 2.0117, + "step": 1213 + }, + { + "epoch": 1.5425667090216009, + "grad_norm": 1.2286312283945457, + "learning_rate": 1.3747315507162892e-05, + "loss": 1.929, + "step": 1214 + }, + { + "epoch": 1.5438373570520967, + "grad_norm": 1.179000137924479, + "learning_rate": 1.373794205207354e-05, + "loss": 1.8902, + "step": 1215 + }, + { + "epoch": 1.545108005082592, + "grad_norm": 1.248366462875927, + "learning_rate": 1.3728564777803089e-05, + "loss": 1.8247, + "step": 1216 + }, + { + "epoch": 1.5463786531130876, + "grad_norm": 1.4648780876266096, + "learning_rate": 1.371918369393261e-05, + "loss": 1.8984, + "step": 1217 + }, + { + "epoch": 1.5476493011435832, + "grad_norm": 1.6506309397732746, + "learning_rate": 1.3709798810047079e-05, + "loss": 1.872, + "step": 1218 + }, + { + "epoch": 1.5489199491740788, + "grad_norm": 1.2165902320177087, + "learning_rate": 1.370041013573534e-05, + "loss": 1.7121, + "step": 1219 + }, + { + "epoch": 1.5501905972045744, + "grad_norm": 1.397563042124808, + "learning_rate": 1.3691017680590126e-05, + "loss": 2.0396, + "step": 1220 + }, + { + "epoch": 1.5514612452350698, + "grad_norm": 1.2637963882307408, + "learning_rate": 1.3681621454208017e-05, + "loss": 1.7806, + "step": 1221 + }, + { + "epoch": 1.5527318932655656, + "grad_norm": 1.2003710827084932, + "learning_rate": 1.3672221466189457e-05, + "loss": 1.7892, + "step": 1222 + }, + { + "epoch": 1.554002541296061, + "grad_norm": 1.1875218869022044, + "learning_rate": 1.3662817726138729e-05, + "loss": 1.7981, + "step": 1223 + }, + { + "epoch": 1.5552731893265566, + "grad_norm": 1.2566514736815448, + "learning_rate": 1.3653410243663953e-05, + "loss": 2.0253, + "step": 1224 + }, + { + "epoch": 1.5565438373570522, + "grad_norm": 1.384730420879102, + "learning_rate": 1.3643999028377065e-05, + "loss": 1.9198, + "step": 1225 + }, + { + "epoch": 1.5578144853875475, + "grad_norm": 1.2313997374157073, + "learning_rate": 1.3634584089893826e-05, + "loss": 1.9868, + "step": 1226 + }, + { + "epoch": 1.5590851334180433, + "grad_norm": 1.1639822831560722, + "learning_rate": 1.3625165437833787e-05, + "loss": 1.8653, + "step": 1227 + }, + { + "epoch": 1.5603557814485387, + "grad_norm": 1.3401076066880915, + "learning_rate": 1.361574308182031e-05, + "loss": 1.9842, + "step": 1228 + }, + { + "epoch": 1.5616264294790343, + "grad_norm": 1.272446557869709, + "learning_rate": 1.3606317031480529e-05, + "loss": 1.787, + "step": 1229 + }, + { + "epoch": 1.5628970775095299, + "grad_norm": 1.1357843978743838, + "learning_rate": 1.359688729644536e-05, + "loss": 1.4677, + "step": 1230 + }, + { + "epoch": 1.5641677255400253, + "grad_norm": 1.4180732475647855, + "learning_rate": 1.3587453886349474e-05, + "loss": 1.7196, + "step": 1231 + }, + { + "epoch": 1.565438373570521, + "grad_norm": 1.3772896602327505, + "learning_rate": 1.3578016810831311e-05, + "loss": 1.9846, + "step": 1232 + }, + { + "epoch": 1.5667090216010164, + "grad_norm": 1.2727299017038909, + "learning_rate": 1.3568576079533043e-05, + "loss": 1.7325, + "step": 1233 + }, + { + "epoch": 1.567979669631512, + "grad_norm": 1.284756489838671, + "learning_rate": 1.3559131702100584e-05, + "loss": 2.0046, + "step": 1234 + }, + { + "epoch": 1.5692503176620076, + "grad_norm": 1.121721245316067, + "learning_rate": 1.354968368818357e-05, + "loss": 1.8843, + "step": 1235 + }, + { + "epoch": 1.5705209656925032, + "grad_norm": 1.2667770466964212, + "learning_rate": 1.354023204743536e-05, + "loss": 1.7568, + "step": 1236 + }, + { + "epoch": 1.5717916137229988, + "grad_norm": 1.539311259281202, + "learning_rate": 1.3530776789513009e-05, + "loss": 1.8506, + "step": 1237 + }, + { + "epoch": 1.5730622617534942, + "grad_norm": 1.6379449129945398, + "learning_rate": 1.3521317924077275e-05, + "loss": 1.6479, + "step": 1238 + }, + { + "epoch": 1.57433290978399, + "grad_norm": 1.2920503074243581, + "learning_rate": 1.3511855460792593e-05, + "loss": 1.9804, + "step": 1239 + }, + { + "epoch": 1.5756035578144854, + "grad_norm": 1.2389856953911573, + "learning_rate": 1.3502389409327087e-05, + "loss": 1.9727, + "step": 1240 + }, + { + "epoch": 1.576874205844981, + "grad_norm": 1.3950063040745804, + "learning_rate": 1.3492919779352536e-05, + "loss": 2.054, + "step": 1241 + }, + { + "epoch": 1.5781448538754765, + "grad_norm": 1.310297987074961, + "learning_rate": 1.348344658054438e-05, + "loss": 2.0237, + "step": 1242 + }, + { + "epoch": 1.579415501905972, + "grad_norm": 1.2925087100054253, + "learning_rate": 1.3473969822581707e-05, + "loss": 2.0117, + "step": 1243 + }, + { + "epoch": 1.5806861499364677, + "grad_norm": 1.3105575963221772, + "learning_rate": 1.3464489515147239e-05, + "loss": 1.9496, + "step": 1244 + }, + { + "epoch": 1.581956797966963, + "grad_norm": 1.2237031081859495, + "learning_rate": 1.3455005667927318e-05, + "loss": 2.0285, + "step": 1245 + }, + { + "epoch": 1.5832274459974587, + "grad_norm": 1.3435645122073598, + "learning_rate": 1.3445518290611918e-05, + "loss": 1.7926, + "step": 1246 + }, + { + "epoch": 1.5844980940279543, + "grad_norm": 1.2346080750718063, + "learning_rate": 1.343602739289461e-05, + "loss": 1.7282, + "step": 1247 + }, + { + "epoch": 1.5857687420584496, + "grad_norm": 1.2454640590173194, + "learning_rate": 1.3426532984472561e-05, + "loss": 1.901, + "step": 1248 + }, + { + "epoch": 1.5870393900889455, + "grad_norm": 1.275365347669394, + "learning_rate": 1.3417035075046527e-05, + "loss": 1.8574, + "step": 1249 + }, + { + "epoch": 1.5883100381194408, + "grad_norm": 1.1338251344047352, + "learning_rate": 1.3407533674320848e-05, + "loss": 1.8699, + "step": 1250 + }, + { + "epoch": 1.5895806861499364, + "grad_norm": 1.2431631957755682, + "learning_rate": 1.3398028792003413e-05, + "loss": 1.6381, + "step": 1251 + }, + { + "epoch": 1.590851334180432, + "grad_norm": 1.4132487408887402, + "learning_rate": 1.338852043780569e-05, + "loss": 1.7747, + "step": 1252 + }, + { + "epoch": 1.5921219822109276, + "grad_norm": 1.6491888618473156, + "learning_rate": 1.337900862144268e-05, + "loss": 1.9426, + "step": 1253 + }, + { + "epoch": 1.5933926302414232, + "grad_norm": 1.3622479954441118, + "learning_rate": 1.3369493352632925e-05, + "loss": 2.0793, + "step": 1254 + }, + { + "epoch": 1.5946632782719186, + "grad_norm": 1.192420588196507, + "learning_rate": 1.3359974641098497e-05, + "loss": 1.8084, + "step": 1255 + }, + { + "epoch": 1.5959339263024144, + "grad_norm": 1.2797006019475736, + "learning_rate": 1.3350452496564985e-05, + "loss": 1.9092, + "step": 1256 + }, + { + "epoch": 1.5972045743329097, + "grad_norm": 1.2755999292541016, + "learning_rate": 1.3340926928761477e-05, + "loss": 1.8576, + "step": 1257 + }, + { + "epoch": 1.5984752223634053, + "grad_norm": 1.2395072504037918, + "learning_rate": 1.3331397947420578e-05, + "loss": 1.7841, + "step": 1258 + }, + { + "epoch": 1.599745870393901, + "grad_norm": 1.296854658829394, + "learning_rate": 1.332186556227836e-05, + "loss": 1.8771, + "step": 1259 + }, + { + "epoch": 1.6010165184243963, + "grad_norm": 1.1164654884285337, + "learning_rate": 1.3312329783074383e-05, + "loss": 2.0762, + "step": 1260 + }, + { + "epoch": 1.602287166454892, + "grad_norm": 1.2093316079814684, + "learning_rate": 1.3302790619551673e-05, + "loss": 1.8883, + "step": 1261 + }, + { + "epoch": 1.6035578144853875, + "grad_norm": 1.3153570486567234, + "learning_rate": 1.3293248081456717e-05, + "loss": 1.9228, + "step": 1262 + }, + { + "epoch": 1.604828462515883, + "grad_norm": 1.7801035961064582, + "learning_rate": 1.3283702178539441e-05, + "loss": 1.6673, + "step": 1263 + }, + { + "epoch": 1.6060991105463787, + "grad_norm": 1.2602038214603772, + "learning_rate": 1.3274152920553225e-05, + "loss": 1.9318, + "step": 1264 + }, + { + "epoch": 1.6073697585768743, + "grad_norm": 1.3031972530092082, + "learning_rate": 1.3264600317254854e-05, + "loss": 1.6836, + "step": 1265 + }, + { + "epoch": 1.6086404066073698, + "grad_norm": 1.4977716352782071, + "learning_rate": 1.3255044378404557e-05, + "loss": 2.0444, + "step": 1266 + }, + { + "epoch": 1.6099110546378652, + "grad_norm": 1.33666349801072, + "learning_rate": 1.3245485113765952e-05, + "loss": 1.8835, + "step": 1267 + }, + { + "epoch": 1.611181702668361, + "grad_norm": 55.6034937167438, + "learning_rate": 1.323592253310606e-05, + "loss": 2.0328, + "step": 1268 + }, + { + "epoch": 1.6124523506988564, + "grad_norm": 1.3049852476469035, + "learning_rate": 1.3226356646195293e-05, + "loss": 2.0586, + "step": 1269 + }, + { + "epoch": 1.613722998729352, + "grad_norm": 1.2839308854301152, + "learning_rate": 1.3216787462807442e-05, + "loss": 2.0095, + "step": 1270 + }, + { + "epoch": 1.6149936467598476, + "grad_norm": 1.2078234396212837, + "learning_rate": 1.3207214992719654e-05, + "loss": 1.7023, + "step": 1271 + }, + { + "epoch": 1.616264294790343, + "grad_norm": 1.0942474151382433, + "learning_rate": 1.3197639245712454e-05, + "loss": 2.0584, + "step": 1272 + }, + { + "epoch": 1.6175349428208388, + "grad_norm": 1.640743926625236, + "learning_rate": 1.3188060231569701e-05, + "loss": 2.0192, + "step": 1273 + }, + { + "epoch": 1.6188055908513341, + "grad_norm": 1.445559822354054, + "learning_rate": 1.3178477960078594e-05, + "loss": 1.8092, + "step": 1274 + }, + { + "epoch": 1.6200762388818297, + "grad_norm": 1.2699367663057821, + "learning_rate": 1.3168892441029666e-05, + "loss": 1.6643, + "step": 1275 + }, + { + "epoch": 1.6213468869123253, + "grad_norm": 1.2961056322093205, + "learning_rate": 1.3159303684216761e-05, + "loss": 2.0516, + "step": 1276 + }, + { + "epoch": 1.6226175349428207, + "grad_norm": 1.328661586517937, + "learning_rate": 1.3149711699437035e-05, + "loss": 1.9553, + "step": 1277 + }, + { + "epoch": 1.6238881829733165, + "grad_norm": 1.5570126645670899, + "learning_rate": 1.3140116496490944e-05, + "loss": 1.8117, + "step": 1278 + }, + { + "epoch": 1.6251588310038119, + "grad_norm": 1.232675245720794, + "learning_rate": 1.3130518085182224e-05, + "loss": 1.8473, + "step": 1279 + }, + { + "epoch": 1.6264294790343075, + "grad_norm": 1.639625976170505, + "learning_rate": 1.31209164753179e-05, + "loss": 1.7237, + "step": 1280 + }, + { + "epoch": 1.627700127064803, + "grad_norm": 1.3365143272845428, + "learning_rate": 1.3111311676708256e-05, + "loss": 1.7455, + "step": 1281 + }, + { + "epoch": 1.6289707750952986, + "grad_norm": 1.4841371892966886, + "learning_rate": 1.3101703699166843e-05, + "loss": 1.8929, + "step": 1282 + }, + { + "epoch": 1.6302414231257942, + "grad_norm": 1.194723119658828, + "learning_rate": 1.3092092552510445e-05, + "loss": 1.7472, + "step": 1283 + }, + { + "epoch": 1.6315120711562896, + "grad_norm": 1.325696772260916, + "learning_rate": 1.3082478246559104e-05, + "loss": 1.7873, + "step": 1284 + }, + { + "epoch": 1.6327827191867854, + "grad_norm": 1.2056417302548919, + "learning_rate": 1.3072860791136075e-05, + "loss": 1.8021, + "step": 1285 + }, + { + "epoch": 1.6340533672172808, + "grad_norm": 1.2675138648758477, + "learning_rate": 1.3063240196067837e-05, + "loss": 1.8093, + "step": 1286 + }, + { + "epoch": 1.6353240152477764, + "grad_norm": 1.3495984616195633, + "learning_rate": 1.3053616471184071e-05, + "loss": 1.8553, + "step": 1287 + }, + { + "epoch": 1.636594663278272, + "grad_norm": 1.9427242076457587, + "learning_rate": 1.3043989626317668e-05, + "loss": 1.8769, + "step": 1288 + }, + { + "epoch": 1.6378653113087673, + "grad_norm": 1.5938140030309664, + "learning_rate": 1.3034359671304693e-05, + "loss": 2.0313, + "step": 1289 + }, + { + "epoch": 1.6391359593392631, + "grad_norm": 1.2722119373656835, + "learning_rate": 1.30247266159844e-05, + "loss": 1.946, + "step": 1290 + }, + { + "epoch": 1.6404066073697585, + "grad_norm": 1.1733976456758377, + "learning_rate": 1.3015090470199201e-05, + "loss": 2.0665, + "step": 1291 + }, + { + "epoch": 1.641677255400254, + "grad_norm": 1.3386678697498235, + "learning_rate": 1.3005451243794672e-05, + "loss": 1.8375, + "step": 1292 + }, + { + "epoch": 1.6429479034307497, + "grad_norm": 1.3635571022521007, + "learning_rate": 1.2995808946619533e-05, + "loss": 1.7194, + "step": 1293 + }, + { + "epoch": 1.644218551461245, + "grad_norm": 1.4234061688686828, + "learning_rate": 1.2986163588525646e-05, + "loss": 1.851, + "step": 1294 + }, + { + "epoch": 1.6454891994917409, + "grad_norm": 1.3951660972295525, + "learning_rate": 1.2976515179367996e-05, + "loss": 2.1171, + "step": 1295 + }, + { + "epoch": 1.6467598475222363, + "grad_norm": 1.2092866983227297, + "learning_rate": 1.2966863729004691e-05, + "loss": 1.6927, + "step": 1296 + }, + { + "epoch": 1.6480304955527318, + "grad_norm": 1.0797174046168765, + "learning_rate": 1.2957209247296935e-05, + "loss": 2.0332, + "step": 1297 + }, + { + "epoch": 1.6493011435832274, + "grad_norm": 1.1392147959819172, + "learning_rate": 1.2947551744109044e-05, + "loss": 1.7436, + "step": 1298 + }, + { + "epoch": 1.650571791613723, + "grad_norm": 1.1836701182877878, + "learning_rate": 1.293789122930841e-05, + "loss": 1.9638, + "step": 1299 + }, + { + "epoch": 1.6518424396442186, + "grad_norm": 1.2257252313691758, + "learning_rate": 1.2928227712765504e-05, + "loss": 1.9834, + "step": 1300 + }, + { + "epoch": 1.653113087674714, + "grad_norm": 1.1761208630174838, + "learning_rate": 1.2918561204353871e-05, + "loss": 1.7215, + "step": 1301 + }, + { + "epoch": 1.6543837357052098, + "grad_norm": 1.3891650801080222, + "learning_rate": 1.2908891713950107e-05, + "loss": 1.7344, + "step": 1302 + }, + { + "epoch": 1.6556543837357052, + "grad_norm": 1.3855957023677499, + "learning_rate": 1.2899219251433848e-05, + "loss": 1.8464, + "step": 1303 + }, + { + "epoch": 1.6569250317662008, + "grad_norm": 1.3598192540799567, + "learning_rate": 1.2889543826687785e-05, + "loss": 1.8313, + "step": 1304 + }, + { + "epoch": 1.6581956797966964, + "grad_norm": 1.5002056868237887, + "learning_rate": 1.2879865449597617e-05, + "loss": 1.9405, + "step": 1305 + }, + { + "epoch": 1.6594663278271917, + "grad_norm": 1.3216736210005633, + "learning_rate": 1.287018413005207e-05, + "loss": 1.7279, + "step": 1306 + }, + { + "epoch": 1.6607369758576875, + "grad_norm": 1.3819372523696916, + "learning_rate": 1.2860499877942876e-05, + "loss": 2.0484, + "step": 1307 + }, + { + "epoch": 1.662007623888183, + "grad_norm": 1.170834326805149, + "learning_rate": 1.2850812703164754e-05, + "loss": 1.7203, + "step": 1308 + }, + { + "epoch": 1.6632782719186785, + "grad_norm": 1.4256686717965028, + "learning_rate": 1.2841122615615426e-05, + "loss": 1.7459, + "step": 1309 + }, + { + "epoch": 1.664548919949174, + "grad_norm": 1.6382014723072733, + "learning_rate": 1.2831429625195576e-05, + "loss": 1.5154, + "step": 1310 + }, + { + "epoch": 1.6658195679796697, + "grad_norm": 1.2509445085153295, + "learning_rate": 1.2821733741808855e-05, + "loss": 1.7097, + "step": 1311 + }, + { + "epoch": 1.6670902160101653, + "grad_norm": 1.3131910081504445, + "learning_rate": 1.2812034975361876e-05, + "loss": 1.8529, + "step": 1312 + }, + { + "epoch": 1.6683608640406606, + "grad_norm": 1.2647384119627838, + "learning_rate": 1.2802333335764194e-05, + "loss": 1.7561, + "step": 1313 + }, + { + "epoch": 1.6696315120711565, + "grad_norm": 1.52124313252029, + "learning_rate": 1.2792628832928302e-05, + "loss": 2.0929, + "step": 1314 + }, + { + "epoch": 1.6709021601016518, + "grad_norm": 1.2844058844721489, + "learning_rate": 1.2782921476769616e-05, + "loss": 1.9279, + "step": 1315 + }, + { + "epoch": 1.6721728081321474, + "grad_norm": 1.1525042552706943, + "learning_rate": 1.277321127720647e-05, + "loss": 1.9666, + "step": 1316 + }, + { + "epoch": 1.673443456162643, + "grad_norm": 1.5850927229591063, + "learning_rate": 1.2763498244160097e-05, + "loss": 1.5506, + "step": 1317 + }, + { + "epoch": 1.6747141041931384, + "grad_norm": 1.478772099784771, + "learning_rate": 1.2753782387554633e-05, + "loss": 2.1687, + "step": 1318 + }, + { + "epoch": 1.6759847522236342, + "grad_norm": 1.2500162015385214, + "learning_rate": 1.2744063717317094e-05, + "loss": 1.5892, + "step": 1319 + }, + { + "epoch": 1.6772554002541296, + "grad_norm": 1.4520444580272065, + "learning_rate": 1.2734342243377376e-05, + "loss": 2.0015, + "step": 1320 + }, + { + "epoch": 1.6785260482846251, + "grad_norm": 1.5010211903038795, + "learning_rate": 1.2724617975668229e-05, + "loss": 2.1132, + "step": 1321 + }, + { + "epoch": 1.6797966963151207, + "grad_norm": 1.2386575390441286, + "learning_rate": 1.271489092412527e-05, + "loss": 1.8858, + "step": 1322 + }, + { + "epoch": 1.681067344345616, + "grad_norm": 1.2150549039148064, + "learning_rate": 1.2705161098686953e-05, + "loss": 1.8268, + "step": 1323 + }, + { + "epoch": 1.682337992376112, + "grad_norm": 1.4354657637645867, + "learning_rate": 1.2695428509294567e-05, + "loss": 1.7492, + "step": 1324 + }, + { + "epoch": 1.6836086404066073, + "grad_norm": 1.405427966456244, + "learning_rate": 1.2685693165892228e-05, + "loss": 1.898, + "step": 1325 + }, + { + "epoch": 1.6848792884371029, + "grad_norm": 1.311553544198372, + "learning_rate": 1.267595507842686e-05, + "loss": 1.9387, + "step": 1326 + }, + { + "epoch": 1.6861499364675985, + "grad_norm": 1.2420356703845903, + "learning_rate": 1.2666214256848197e-05, + "loss": 1.8164, + "step": 1327 + }, + { + "epoch": 1.687420584498094, + "grad_norm": 1.3373903023080569, + "learning_rate": 1.2656470711108763e-05, + "loss": 1.9783, + "step": 1328 + }, + { + "epoch": 1.6886912325285897, + "grad_norm": 1.4874301493310746, + "learning_rate": 1.264672445116387e-05, + "loss": 2.0626, + "step": 1329 + }, + { + "epoch": 1.689961880559085, + "grad_norm": 1.273505177911983, + "learning_rate": 1.2636975486971594e-05, + "loss": 1.8729, + "step": 1330 + }, + { + "epoch": 1.6912325285895808, + "grad_norm": 1.7074068671901932, + "learning_rate": 1.2627223828492785e-05, + "loss": 2.0717, + "step": 1331 + }, + { + "epoch": 1.6925031766200762, + "grad_norm": 1.4110321983565615, + "learning_rate": 1.2617469485691034e-05, + "loss": 1.9849, + "step": 1332 + }, + { + "epoch": 1.6937738246505718, + "grad_norm": 1.143137082547412, + "learning_rate": 1.2607712468532688e-05, + "loss": 1.9109, + "step": 1333 + }, + { + "epoch": 1.6950444726810674, + "grad_norm": 1.4170049840764738, + "learning_rate": 1.2597952786986813e-05, + "loss": 2.086, + "step": 1334 + }, + { + "epoch": 1.6963151207115628, + "grad_norm": 1.3493025409661936, + "learning_rate": 1.2588190451025209e-05, + "loss": 1.7331, + "step": 1335 + }, + { + "epoch": 1.6975857687420586, + "grad_norm": 1.2325439608220607, + "learning_rate": 1.257842547062238e-05, + "loss": 1.8408, + "step": 1336 + }, + { + "epoch": 1.698856416772554, + "grad_norm": 1.2903745252099554, + "learning_rate": 1.256865785575554e-05, + "loss": 1.6788, + "step": 1337 + }, + { + "epoch": 1.7001270648030495, + "grad_norm": 1.6126834280166753, + "learning_rate": 1.255888761640458e-05, + "loss": 1.8393, + "step": 1338 + }, + { + "epoch": 1.7013977128335451, + "grad_norm": 12.132017979711518, + "learning_rate": 1.254911476255209e-05, + "loss": 1.7632, + "step": 1339 + }, + { + "epoch": 1.7026683608640405, + "grad_norm": 1.4264111838446671, + "learning_rate": 1.253933930418332e-05, + "loss": 1.9023, + "step": 1340 + }, + { + "epoch": 1.7039390088945363, + "grad_norm": 1.3396460965865984, + "learning_rate": 1.2529561251286184e-05, + "loss": 1.8552, + "step": 1341 + }, + { + "epoch": 1.7052096569250317, + "grad_norm": 1.8974603880479586, + "learning_rate": 1.2519780613851254e-05, + "loss": 1.9448, + "step": 1342 + }, + { + "epoch": 1.7064803049555273, + "grad_norm": 1.2414447550947338, + "learning_rate": 1.250999740187173e-05, + "loss": 1.8183, + "step": 1343 + }, + { + "epoch": 1.7077509529860229, + "grad_norm": 1.754129000217617, + "learning_rate": 1.2500211625343448e-05, + "loss": 1.9146, + "step": 1344 + }, + { + "epoch": 1.7090216010165185, + "grad_norm": 1.144544650820467, + "learning_rate": 1.2490423294264866e-05, + "loss": 1.8175, + "step": 1345 + }, + { + "epoch": 1.710292249047014, + "grad_norm": 1.1845927231500297, + "learning_rate": 1.2480632418637054e-05, + "loss": 1.8947, + "step": 1346 + }, + { + "epoch": 1.7115628970775094, + "grad_norm": 1.716408411076265, + "learning_rate": 1.2470839008463676e-05, + "loss": 2.0708, + "step": 1347 + }, + { + "epoch": 1.7128335451080052, + "grad_norm": 1.333654976939953, + "learning_rate": 1.2461043073750988e-05, + "loss": 1.9351, + "step": 1348 + }, + { + "epoch": 1.7141041931385006, + "grad_norm": 1.7726917448928459, + "learning_rate": 1.2451244624507831e-05, + "loss": 1.9672, + "step": 1349 + }, + { + "epoch": 1.7153748411689962, + "grad_norm": 1.4690410407624557, + "learning_rate": 1.2441443670745606e-05, + "loss": 1.8296, + "step": 1350 + }, + { + "epoch": 1.7166454891994918, + "grad_norm": 1.41542125842506, + "learning_rate": 1.2431640222478275e-05, + "loss": 1.7263, + "step": 1351 + }, + { + "epoch": 1.7179161372299872, + "grad_norm": 1.2083108369431936, + "learning_rate": 1.2421834289722354e-05, + "loss": 1.8448, + "step": 1352 + }, + { + "epoch": 1.719186785260483, + "grad_norm": 1.4383951762557672, + "learning_rate": 1.2412025882496895e-05, + "loss": 1.7514, + "step": 1353 + }, + { + "epoch": 1.7204574332909783, + "grad_norm": 1.3817843859209873, + "learning_rate": 1.2402215010823472e-05, + "loss": 1.8088, + "step": 1354 + }, + { + "epoch": 1.721728081321474, + "grad_norm": 1.3865663226106641, + "learning_rate": 1.239240168472619e-05, + "loss": 1.7988, + "step": 1355 + }, + { + "epoch": 1.7229987293519695, + "grad_norm": 1.4096566413292009, + "learning_rate": 1.238258591423165e-05, + "loss": 1.8322, + "step": 1356 + }, + { + "epoch": 1.724269377382465, + "grad_norm": 1.4853260723970567, + "learning_rate": 1.2372767709368957e-05, + "loss": 1.6997, + "step": 1357 + }, + { + "epoch": 1.7255400254129607, + "grad_norm": 1.3826381702954678, + "learning_rate": 1.23629470801697e-05, + "loss": 1.9372, + "step": 1358 + }, + { + "epoch": 1.726810673443456, + "grad_norm": 1.2355594794312026, + "learning_rate": 1.2353124036667946e-05, + "loss": 1.6416, + "step": 1359 + }, + { + "epoch": 1.7280813214739519, + "grad_norm": 1.3577984488669015, + "learning_rate": 1.2343298588900226e-05, + "loss": 1.9556, + "step": 1360 + }, + { + "epoch": 1.7293519695044473, + "grad_norm": 1.1389770751646968, + "learning_rate": 1.2333470746905534e-05, + "loss": 1.7399, + "step": 1361 + }, + { + "epoch": 1.7306226175349428, + "grad_norm": 1.2795159617708947, + "learning_rate": 1.2323640520725306e-05, + "loss": 1.8617, + "step": 1362 + }, + { + "epoch": 1.7318932655654384, + "grad_norm": 1.4461540608966716, + "learning_rate": 1.2313807920403419e-05, + "loss": 1.958, + "step": 1363 + }, + { + "epoch": 1.7331639135959338, + "grad_norm": 1.397233423826948, + "learning_rate": 1.2303972955986161e-05, + "loss": 1.7538, + "step": 1364 + }, + { + "epoch": 1.7344345616264296, + "grad_norm": 1.2889823172447274, + "learning_rate": 1.2294135637522254e-05, + "loss": 1.8574, + "step": 1365 + }, + { + "epoch": 1.735705209656925, + "grad_norm": 1.26840599009331, + "learning_rate": 1.2284295975062814e-05, + "loss": 1.8328, + "step": 1366 + }, + { + "epoch": 1.7369758576874206, + "grad_norm": 1.3660457927942886, + "learning_rate": 1.2274453978661356e-05, + "loss": 1.8039, + "step": 1367 + }, + { + "epoch": 1.7382465057179162, + "grad_norm": 1.0947022560711457, + "learning_rate": 1.226460965837378e-05, + "loss": 1.896, + "step": 1368 + }, + { + "epoch": 1.7395171537484115, + "grad_norm": 1.3136869299218366, + "learning_rate": 1.225476302425836e-05, + "loss": 1.8964, + "step": 1369 + }, + { + "epoch": 1.7407878017789074, + "grad_norm": 1.404627845671186, + "learning_rate": 1.2244914086375726e-05, + "loss": 1.7482, + "step": 1370 + }, + { + "epoch": 1.7420584498094027, + "grad_norm": 1.2443528146690166, + "learning_rate": 1.223506285478888e-05, + "loss": 1.9536, + "step": 1371 + }, + { + "epoch": 1.7433290978398983, + "grad_norm": 1.2042931826355405, + "learning_rate": 1.2225209339563144e-05, + "loss": 1.7529, + "step": 1372 + }, + { + "epoch": 1.744599745870394, + "grad_norm": 1.263280657296549, + "learning_rate": 1.2215353550766197e-05, + "loss": 1.9768, + "step": 1373 + }, + { + "epoch": 1.7458703939008895, + "grad_norm": 1.1766118912224472, + "learning_rate": 1.2205495498468025e-05, + "loss": 2.0298, + "step": 1374 + }, + { + "epoch": 1.747141041931385, + "grad_norm": 1.3899210224600584, + "learning_rate": 1.219563519274093e-05, + "loss": 1.8235, + "step": 1375 + }, + { + "epoch": 1.7484116899618805, + "grad_norm": 1.2008533872792837, + "learning_rate": 1.2185772643659521e-05, + "loss": 1.9107, + "step": 1376 + }, + { + "epoch": 1.7496823379923763, + "grad_norm": 1.0941137156711913, + "learning_rate": 1.2175907861300698e-05, + "loss": 1.9886, + "step": 1377 + }, + { + "epoch": 1.7509529860228716, + "grad_norm": 1.2944173746818755, + "learning_rate": 1.2166040855743635e-05, + "loss": 1.8527, + "step": 1378 + }, + { + "epoch": 1.7522236340533672, + "grad_norm": 1.2642170229105665, + "learning_rate": 1.2156171637069785e-05, + "loss": 1.8462, + "step": 1379 + }, + { + "epoch": 1.7534942820838628, + "grad_norm": 1.2430017289787458, + "learning_rate": 1.2146300215362863e-05, + "loss": 1.8839, + "step": 1380 + }, + { + "epoch": 1.7547649301143582, + "grad_norm": 1.2199528498427983, + "learning_rate": 1.2136426600708833e-05, + "loss": 1.8889, + "step": 1381 + }, + { + "epoch": 1.756035578144854, + "grad_norm": 1.3984242744552862, + "learning_rate": 1.2126550803195895e-05, + "loss": 1.7563, + "step": 1382 + }, + { + "epoch": 1.7573062261753494, + "grad_norm": 1.305331265532622, + "learning_rate": 1.2116672832914489e-05, + "loss": 2.0087, + "step": 1383 + }, + { + "epoch": 1.758576874205845, + "grad_norm": 1.2898722634068678, + "learning_rate": 1.2106792699957264e-05, + "loss": 1.6728, + "step": 1384 + }, + { + "epoch": 1.7598475222363406, + "grad_norm": 1.5640598557270948, + "learning_rate": 1.2096910414419087e-05, + "loss": 2.004, + "step": 1385 + }, + { + "epoch": 1.761118170266836, + "grad_norm": 1.554862105286235, + "learning_rate": 1.2087025986397023e-05, + "loss": 1.9123, + "step": 1386 + }, + { + "epoch": 1.7623888182973317, + "grad_norm": 1.2168486180432576, + "learning_rate": 1.2077139425990321e-05, + "loss": 1.8045, + "step": 1387 + }, + { + "epoch": 1.763659466327827, + "grad_norm": 1.2331850367414905, + "learning_rate": 1.2067250743300414e-05, + "loss": 1.8561, + "step": 1388 + }, + { + "epoch": 1.7649301143583227, + "grad_norm": 1.2716647592623593, + "learning_rate": 1.2057359948430903e-05, + "loss": 1.7265, + "step": 1389 + }, + { + "epoch": 1.7662007623888183, + "grad_norm": 1.7712185520839197, + "learning_rate": 1.204746705148754e-05, + "loss": 1.7379, + "step": 1390 + }, + { + "epoch": 1.7674714104193139, + "grad_norm": 2.654250532277399, + "learning_rate": 1.2037572062578238e-05, + "loss": 1.9402, + "step": 1391 + }, + { + "epoch": 1.7687420584498095, + "grad_norm": 1.2759556317976997, + "learning_rate": 1.2027674991813037e-05, + "loss": 1.9721, + "step": 1392 + }, + { + "epoch": 1.7700127064803048, + "grad_norm": 1.1357265349476322, + "learning_rate": 1.2017775849304105e-05, + "loss": 1.7183, + "step": 1393 + }, + { + "epoch": 1.7712833545108007, + "grad_norm": 1.362592119463336, + "learning_rate": 1.200787464516573e-05, + "loss": 1.8925, + "step": 1394 + }, + { + "epoch": 1.772554002541296, + "grad_norm": 1.3969098719536086, + "learning_rate": 1.199797138951431e-05, + "loss": 1.8477, + "step": 1395 + }, + { + "epoch": 1.7738246505717916, + "grad_norm": 1.2586495073757091, + "learning_rate": 1.1988066092468325e-05, + "loss": 1.6721, + "step": 1396 + }, + { + "epoch": 1.7750952986022872, + "grad_norm": 1.3661649219281586, + "learning_rate": 1.1978158764148358e-05, + "loss": 1.824, + "step": 1397 + }, + { + "epoch": 1.7763659466327826, + "grad_norm": 1.3366780041839457, + "learning_rate": 1.1968249414677055e-05, + "loss": 2.0305, + "step": 1398 + }, + { + "epoch": 1.7776365946632784, + "grad_norm": 1.2910755256326734, + "learning_rate": 1.1958338054179135e-05, + "loss": 1.5409, + "step": 1399 + }, + { + "epoch": 1.7789072426937738, + "grad_norm": 1.2662512836713595, + "learning_rate": 1.1948424692781364e-05, + "loss": 1.8672, + "step": 1400 + }, + { + "epoch": 1.7801778907242694, + "grad_norm": 1.4770845576189542, + "learning_rate": 1.1938509340612565e-05, + "loss": 2.0235, + "step": 1401 + }, + { + "epoch": 1.781448538754765, + "grad_norm": 1.3208511318376104, + "learning_rate": 1.1928592007803575e-05, + "loss": 1.8901, + "step": 1402 + }, + { + "epoch": 1.7827191867852605, + "grad_norm": 1.504667429298727, + "learning_rate": 1.1918672704487275e-05, + "loss": 1.7732, + "step": 1403 + }, + { + "epoch": 1.7839898348157561, + "grad_norm": 1.404356174075473, + "learning_rate": 1.1908751440798549e-05, + "loss": 1.9319, + "step": 1404 + }, + { + "epoch": 1.7852604828462515, + "grad_norm": 1.3037826155994945, + "learning_rate": 1.1898828226874284e-05, + "loss": 1.9241, + "step": 1405 + }, + { + "epoch": 1.786531130876747, + "grad_norm": 1.5776711349588757, + "learning_rate": 1.1888903072853364e-05, + "loss": 2.0882, + "step": 1406 + }, + { + "epoch": 1.7878017789072427, + "grad_norm": 1.3374617894396248, + "learning_rate": 1.1878975988876648e-05, + "loss": 2.001, + "step": 1407 + }, + { + "epoch": 1.7890724269377383, + "grad_norm": 1.5095423208868328, + "learning_rate": 1.1869046985086978e-05, + "loss": 1.8458, + "step": 1408 + }, + { + "epoch": 1.7903430749682339, + "grad_norm": 1.3296549033376626, + "learning_rate": 1.1859116071629148e-05, + "loss": 1.6368, + "step": 1409 + }, + { + "epoch": 1.7916137229987292, + "grad_norm": 1.593559857193169, + "learning_rate": 1.1849183258649903e-05, + "loss": 1.8177, + "step": 1410 + }, + { + "epoch": 1.792884371029225, + "grad_norm": 1.342662280127331, + "learning_rate": 1.1839248556297938e-05, + "loss": 2.0107, + "step": 1411 + }, + { + "epoch": 1.7941550190597204, + "grad_norm": 1.421403096440148, + "learning_rate": 1.1829311974723868e-05, + "loss": 2.0703, + "step": 1412 + }, + { + "epoch": 1.795425667090216, + "grad_norm": 1.4724131785793708, + "learning_rate": 1.1819373524080233e-05, + "loss": 1.7461, + "step": 1413 + }, + { + "epoch": 1.7966963151207116, + "grad_norm": 1.257857762799035, + "learning_rate": 1.1809433214521486e-05, + "loss": 1.8401, + "step": 1414 + }, + { + "epoch": 1.797966963151207, + "grad_norm": 1.5059440643051154, + "learning_rate": 1.1799491056203973e-05, + "loss": 1.9471, + "step": 1415 + }, + { + "epoch": 1.7992376111817028, + "grad_norm": 1.475426427921491, + "learning_rate": 1.1789547059285928e-05, + "loss": 1.7771, + "step": 1416 + }, + { + "epoch": 1.8005082592121981, + "grad_norm": 1.2583682337559847, + "learning_rate": 1.1779601233927475e-05, + "loss": 1.9209, + "step": 1417 + }, + { + "epoch": 1.8017789072426937, + "grad_norm": 1.4923177597664052, + "learning_rate": 1.1769653590290591e-05, + "loss": 2.2439, + "step": 1418 + }, + { + "epoch": 1.8030495552731893, + "grad_norm": 1.3124287124126588, + "learning_rate": 1.1759704138539121e-05, + "loss": 1.6536, + "step": 1419 + }, + { + "epoch": 1.804320203303685, + "grad_norm": 1.3458331741364764, + "learning_rate": 1.1749752888838754e-05, + "loss": 1.8087, + "step": 1420 + }, + { + "epoch": 1.8055908513341805, + "grad_norm": 1.4012331966186844, + "learning_rate": 1.1739799851357021e-05, + "loss": 1.9317, + "step": 1421 + }, + { + "epoch": 1.8068614993646759, + "grad_norm": 1.2257761558605946, + "learning_rate": 1.1729845036263263e-05, + "loss": 1.8353, + "step": 1422 + }, + { + "epoch": 1.8081321473951717, + "grad_norm": 1.320466289986687, + "learning_rate": 1.1719888453728665e-05, + "loss": 1.8284, + "step": 1423 + }, + { + "epoch": 1.809402795425667, + "grad_norm": 1.304744652776889, + "learning_rate": 1.170993011392619e-05, + "loss": 1.8663, + "step": 1424 + }, + { + "epoch": 1.8106734434561627, + "grad_norm": 1.3777909750839532, + "learning_rate": 1.1699970027030613e-05, + "loss": 2.008, + "step": 1425 + }, + { + "epoch": 1.8119440914866582, + "grad_norm": 1.1053815499377322, + "learning_rate": 1.1690008203218493e-05, + "loss": 1.8928, + "step": 1426 + }, + { + "epoch": 1.8132147395171536, + "grad_norm": 1.2615493941222704, + "learning_rate": 1.1680044652668156e-05, + "loss": 1.8193, + "step": 1427 + }, + { + "epoch": 1.8144853875476494, + "grad_norm": 1.5095707151442657, + "learning_rate": 1.1670079385559693e-05, + "loss": 1.7967, + "step": 1428 + }, + { + "epoch": 1.8157560355781448, + "grad_norm": 1.6773738017413227, + "learning_rate": 1.1660112412074964e-05, + "loss": 1.7579, + "step": 1429 + }, + { + "epoch": 1.8170266836086404, + "grad_norm": 1.2175924278708228, + "learning_rate": 1.1650143742397553e-05, + "loss": 1.8115, + "step": 1430 + }, + { + "epoch": 1.818297331639136, + "grad_norm": 1.2587370859429523, + "learning_rate": 1.1640173386712786e-05, + "loss": 2.0036, + "step": 1431 + }, + { + "epoch": 1.8195679796696314, + "grad_norm": 1.2906712298735914, + "learning_rate": 1.1630201355207709e-05, + "loss": 1.83, + "step": 1432 + }, + { + "epoch": 1.8208386277001272, + "grad_norm": 1.247476772493617, + "learning_rate": 1.1620227658071088e-05, + "loss": 2.0614, + "step": 1433 + }, + { + "epoch": 1.8221092757306225, + "grad_norm": 1.8256172773347648, + "learning_rate": 1.1610252305493374e-05, + "loss": 1.9096, + "step": 1434 + }, + { + "epoch": 1.8233799237611181, + "grad_norm": 1.1724464368042593, + "learning_rate": 1.1600275307666735e-05, + "loss": 1.7725, + "step": 1435 + }, + { + "epoch": 1.8246505717916137, + "grad_norm": 1.3964553497467094, + "learning_rate": 1.1590296674784991e-05, + "loss": 1.8193, + "step": 1436 + }, + { + "epoch": 1.8259212198221093, + "grad_norm": 1.1998179600419243, + "learning_rate": 1.158031641704366e-05, + "loss": 1.8011, + "step": 1437 + }, + { + "epoch": 1.827191867852605, + "grad_norm": 1.5231632235669916, + "learning_rate": 1.1570334544639896e-05, + "loss": 1.713, + "step": 1438 + }, + { + "epoch": 1.8284625158831003, + "grad_norm": 1.2046516674581838, + "learning_rate": 1.1560351067772517e-05, + "loss": 1.8704, + "step": 1439 + }, + { + "epoch": 1.829733163913596, + "grad_norm": 1.3178265369589643, + "learning_rate": 1.155036599664198e-05, + "loss": 2.0145, + "step": 1440 + }, + { + "epoch": 1.8310038119440915, + "grad_norm": 1.7226811184489739, + "learning_rate": 1.1540379341450365e-05, + "loss": 2.0435, + "step": 1441 + }, + { + "epoch": 1.832274459974587, + "grad_norm": 1.281356140125465, + "learning_rate": 1.1530391112401373e-05, + "loss": 1.7186, + "step": 1442 + }, + { + "epoch": 1.8335451080050826, + "grad_norm": 1.4583740366931355, + "learning_rate": 1.1520401319700318e-05, + "loss": 1.9342, + "step": 1443 + }, + { + "epoch": 1.834815756035578, + "grad_norm": 1.484281275441423, + "learning_rate": 1.15104099735541e-05, + "loss": 1.8925, + "step": 1444 + }, + { + "epoch": 1.8360864040660738, + "grad_norm": 1.412304238699942, + "learning_rate": 1.150041708417122e-05, + "loss": 1.5939, + "step": 1445 + }, + { + "epoch": 1.8373570520965692, + "grad_norm": 1.3438716295100606, + "learning_rate": 1.1490422661761744e-05, + "loss": 1.8617, + "step": 1446 + }, + { + "epoch": 1.8386277001270648, + "grad_norm": 1.9605821009953353, + "learning_rate": 1.1480426716537316e-05, + "loss": 1.8308, + "step": 1447 + }, + { + "epoch": 1.8398983481575604, + "grad_norm": 1.3991191334668605, + "learning_rate": 1.1470429258711122e-05, + "loss": 1.9475, + "step": 1448 + }, + { + "epoch": 1.841168996188056, + "grad_norm": 1.2670213328810134, + "learning_rate": 1.1460430298497907e-05, + "loss": 1.7133, + "step": 1449 + }, + { + "epoch": 1.8424396442185516, + "grad_norm": 1.3114626960479783, + "learning_rate": 1.145042984611394e-05, + "loss": 1.6649, + "step": 1450 + }, + { + "epoch": 1.843710292249047, + "grad_norm": 1.2816095446222748, + "learning_rate": 1.144042791177702e-05, + "loss": 1.9007, + "step": 1451 + }, + { + "epoch": 1.8449809402795425, + "grad_norm": 1.563591169561673, + "learning_rate": 1.1430424505706466e-05, + "loss": 1.8746, + "step": 1452 + }, + { + "epoch": 1.846251588310038, + "grad_norm": 1.5350916206044278, + "learning_rate": 1.1420419638123088e-05, + "loss": 1.9019, + "step": 1453 + }, + { + "epoch": 1.8475222363405337, + "grad_norm": 1.6647715831066532, + "learning_rate": 1.1410413319249193e-05, + "loss": 1.8594, + "step": 1454 + }, + { + "epoch": 1.8487928843710293, + "grad_norm": 1.2738941811409745, + "learning_rate": 1.1400405559308583e-05, + "loss": 1.6744, + "step": 1455 + }, + { + "epoch": 1.8500635324015247, + "grad_norm": 1.637080701428525, + "learning_rate": 1.1390396368526518e-05, + "loss": 2.043, + "step": 1456 + }, + { + "epoch": 1.8513341804320205, + "grad_norm": 1.3667523210206463, + "learning_rate": 1.1380385757129722e-05, + "loss": 2.0122, + "step": 1457 + }, + { + "epoch": 1.8526048284625158, + "grad_norm": 1.9195347645962353, + "learning_rate": 1.1370373735346376e-05, + "loss": 1.8422, + "step": 1458 + }, + { + "epoch": 1.8538754764930114, + "grad_norm": 1.3121875093141986, + "learning_rate": 1.1360360313406103e-05, + "loss": 1.8128, + "step": 1459 + }, + { + "epoch": 1.855146124523507, + "grad_norm": 1.2148920185298064, + "learning_rate": 1.1350345501539941e-05, + "loss": 1.8175, + "step": 1460 + }, + { + "epoch": 1.8564167725540024, + "grad_norm": 1.3891527875307301, + "learning_rate": 1.1340329309980379e-05, + "loss": 1.9379, + "step": 1461 + }, + { + "epoch": 1.8576874205844982, + "grad_norm": 1.6586252458635242, + "learning_rate": 1.1330311748961278e-05, + "loss": 1.5624, + "step": 1462 + }, + { + "epoch": 1.8589580686149936, + "grad_norm": 1.4942382644648793, + "learning_rate": 1.1320292828717927e-05, + "loss": 1.7223, + "step": 1463 + }, + { + "epoch": 1.8602287166454892, + "grad_norm": 1.2896948455619404, + "learning_rate": 1.1310272559486992e-05, + "loss": 1.9926, + "step": 1464 + }, + { + "epoch": 1.8614993646759848, + "grad_norm": 1.3366120995045905, + "learning_rate": 1.130025095150652e-05, + "loss": 1.956, + "step": 1465 + }, + { + "epoch": 1.8627700127064803, + "grad_norm": 1.399795765221416, + "learning_rate": 1.1290228015015923e-05, + "loss": 1.9303, + "step": 1466 + }, + { + "epoch": 1.864040660736976, + "grad_norm": 1.4568850504637758, + "learning_rate": 1.1280203760255974e-05, + "loss": 1.7904, + "step": 1467 + }, + { + "epoch": 1.8653113087674713, + "grad_norm": 1.3276010632182842, + "learning_rate": 1.1270178197468788e-05, + "loss": 1.995, + "step": 1468 + }, + { + "epoch": 1.8665819567979671, + "grad_norm": 1.1450291243004833, + "learning_rate": 1.1260151336897824e-05, + "loss": 1.7994, + "step": 1469 + }, + { + "epoch": 1.8678526048284625, + "grad_norm": 1.372707712609518, + "learning_rate": 1.125012318878786e-05, + "loss": 1.7564, + "step": 1470 + }, + { + "epoch": 1.869123252858958, + "grad_norm": 1.2881075904795836, + "learning_rate": 1.1240093763384991e-05, + "loss": 1.8716, + "step": 1471 + }, + { + "epoch": 1.8703939008894537, + "grad_norm": 1.478739460363151, + "learning_rate": 1.1230063070936624e-05, + "loss": 1.848, + "step": 1472 + }, + { + "epoch": 1.871664548919949, + "grad_norm": 1.229621468255002, + "learning_rate": 1.1220031121691449e-05, + "loss": 1.6655, + "step": 1473 + }, + { + "epoch": 1.8729351969504449, + "grad_norm": 1.2141725032976105, + "learning_rate": 1.1209997925899442e-05, + "loss": 1.933, + "step": 1474 + }, + { + "epoch": 1.8742058449809402, + "grad_norm": 1.2617137046426306, + "learning_rate": 1.119996349381187e-05, + "loss": 1.9028, + "step": 1475 + }, + { + "epoch": 1.8754764930114358, + "grad_norm": 1.4541254544342954, + "learning_rate": 1.118992783568124e-05, + "loss": 1.6844, + "step": 1476 + }, + { + "epoch": 1.8767471410419314, + "grad_norm": 1.4416164883757325, + "learning_rate": 1.1179890961761321e-05, + "loss": 1.7029, + "step": 1477 + }, + { + "epoch": 1.8780177890724268, + "grad_norm": 1.6711166429056248, + "learning_rate": 1.1169852882307128e-05, + "loss": 1.9285, + "step": 1478 + }, + { + "epoch": 1.8792884371029226, + "grad_norm": 1.4073114075531266, + "learning_rate": 1.1159813607574905e-05, + "loss": 1.844, + "step": 1479 + }, + { + "epoch": 1.880559085133418, + "grad_norm": 1.3495544307796417, + "learning_rate": 1.1149773147822112e-05, + "loss": 1.8957, + "step": 1480 + }, + { + "epoch": 1.8818297331639136, + "grad_norm": 1.2613021446156025, + "learning_rate": 1.113973151330743e-05, + "loss": 1.9607, + "step": 1481 + }, + { + "epoch": 1.8831003811944091, + "grad_norm": 1.289020288898543, + "learning_rate": 1.112968871429073e-05, + "loss": 1.7818, + "step": 1482 + }, + { + "epoch": 1.8843710292249047, + "grad_norm": 1.2187657323684478, + "learning_rate": 1.1119644761033079e-05, + "loss": 1.7153, + "step": 1483 + }, + { + "epoch": 1.8856416772554003, + "grad_norm": 1.2728351809042304, + "learning_rate": 1.1109599663796724e-05, + "loss": 1.9181, + "step": 1484 + }, + { + "epoch": 1.8869123252858957, + "grad_norm": 1.2457636252898838, + "learning_rate": 1.1099553432845079e-05, + "loss": 2.0376, + "step": 1485 + }, + { + "epoch": 1.8881829733163915, + "grad_norm": 1.2556364575716603, + "learning_rate": 1.1089506078442709e-05, + "loss": 1.8307, + "step": 1486 + }, + { + "epoch": 1.8894536213468869, + "grad_norm": 1.2422773081745468, + "learning_rate": 1.1079457610855342e-05, + "loss": 1.5723, + "step": 1487 + }, + { + "epoch": 1.8907242693773825, + "grad_norm": 1.2166331727886142, + "learning_rate": 1.1069408040349832e-05, + "loss": 1.8537, + "step": 1488 + }, + { + "epoch": 1.891994917407878, + "grad_norm": 1.8899361122781229, + "learning_rate": 1.1059357377194161e-05, + "loss": 2.1507, + "step": 1489 + }, + { + "epoch": 1.8932655654383734, + "grad_norm": 1.4418770421153397, + "learning_rate": 1.1049305631657434e-05, + "loss": 1.9585, + "step": 1490 + }, + { + "epoch": 1.8945362134688692, + "grad_norm": 1.2903505581245351, + "learning_rate": 1.1039252814009858e-05, + "loss": 1.6915, + "step": 1491 + }, + { + "epoch": 1.8958068614993646, + "grad_norm": 1.2237462796711218, + "learning_rate": 1.1029198934522725e-05, + "loss": 1.7736, + "step": 1492 + }, + { + "epoch": 1.8970775095298602, + "grad_norm": 1.3825824140430243, + "learning_rate": 1.1019144003468434e-05, + "loss": 1.8529, + "step": 1493 + }, + { + "epoch": 1.8983481575603558, + "grad_norm": 1.5019641930632714, + "learning_rate": 1.100908803112044e-05, + "loss": 1.9011, + "step": 1494 + }, + { + "epoch": 1.8996188055908514, + "grad_norm": 1.1970623904098372, + "learning_rate": 1.0999031027753269e-05, + "loss": 1.8513, + "step": 1495 + }, + { + "epoch": 1.900889453621347, + "grad_norm": 3.9496021065781814, + "learning_rate": 1.09889730036425e-05, + "loss": 1.7886, + "step": 1496 + }, + { + "epoch": 1.9021601016518423, + "grad_norm": 1.1909345439294168, + "learning_rate": 1.0978913969064753e-05, + "loss": 2.0297, + "step": 1497 + }, + { + "epoch": 1.903430749682338, + "grad_norm": 1.1853023179130728, + "learning_rate": 1.0968853934297686e-05, + "loss": 1.8918, + "step": 1498 + }, + { + "epoch": 1.9047013977128335, + "grad_norm": 1.1951448764156936, + "learning_rate": 1.095879290961997e-05, + "loss": 1.7644, + "step": 1499 + }, + { + "epoch": 1.9059720457433291, + "grad_norm": 1.3805797328593437, + "learning_rate": 1.0948730905311294e-05, + "loss": 1.9272, + "step": 1500 + }, + { + "epoch": 1.9072426937738247, + "grad_norm": 1.5137753880951426, + "learning_rate": 1.0938667931652347e-05, + "loss": 1.7265, + "step": 1501 + }, + { + "epoch": 1.90851334180432, + "grad_norm": 1.7052797441607885, + "learning_rate": 1.0928603998924807e-05, + "loss": 2.0261, + "step": 1502 + }, + { + "epoch": 1.909783989834816, + "grad_norm": 1.3312756229631164, + "learning_rate": 1.0918539117411334e-05, + "loss": 1.9571, + "step": 1503 + }, + { + "epoch": 1.9110546378653113, + "grad_norm": 1.2612079574489132, + "learning_rate": 1.0908473297395552e-05, + "loss": 1.7756, + "step": 1504 + }, + { + "epoch": 1.9123252858958069, + "grad_norm": 1.28009759085661, + "learning_rate": 1.0898406549162053e-05, + "loss": 1.8911, + "step": 1505 + }, + { + "epoch": 1.9135959339263025, + "grad_norm": 1.315883162812706, + "learning_rate": 1.0888338882996365e-05, + "loss": 1.9947, + "step": 1506 + }, + { + "epoch": 1.9148665819567978, + "grad_norm": 1.4388442688673047, + "learning_rate": 1.0878270309184973e-05, + "loss": 1.9072, + "step": 1507 + }, + { + "epoch": 1.9161372299872936, + "grad_norm": 1.4997070928756402, + "learning_rate": 1.0868200838015265e-05, + "loss": 1.8826, + "step": 1508 + }, + { + "epoch": 1.917407878017789, + "grad_norm": 1.2407765148265102, + "learning_rate": 1.0858130479775564e-05, + "loss": 1.7608, + "step": 1509 + }, + { + "epoch": 1.9186785260482846, + "grad_norm": 1.2162156867582095, + "learning_rate": 1.0848059244755093e-05, + "loss": 1.8828, + "step": 1510 + }, + { + "epoch": 1.9199491740787802, + "grad_norm": 1.3753812289304952, + "learning_rate": 1.0837987143243972e-05, + "loss": 1.8763, + "step": 1511 + }, + { + "epoch": 1.9212198221092758, + "grad_norm": 1.317136281554182, + "learning_rate": 1.0827914185533206e-05, + "loss": 2.0366, + "step": 1512 + }, + { + "epoch": 1.9224904701397714, + "grad_norm": 1.312553059882304, + "learning_rate": 1.0817840381914675e-05, + "loss": 2.0488, + "step": 1513 + }, + { + "epoch": 1.9237611181702667, + "grad_norm": 1.3785690601450935, + "learning_rate": 1.080776574268112e-05, + "loss": 1.869, + "step": 1514 + }, + { + "epoch": 1.9250317662007626, + "grad_norm": 1.4184047290196182, + "learning_rate": 1.079769027812614e-05, + "loss": 1.993, + "step": 1515 + }, + { + "epoch": 1.926302414231258, + "grad_norm": 1.1348801262998818, + "learning_rate": 1.0787613998544179e-05, + "loss": 1.8294, + "step": 1516 + }, + { + "epoch": 1.9275730622617535, + "grad_norm": 1.220243637792347, + "learning_rate": 1.0777536914230509e-05, + "loss": 1.833, + "step": 1517 + }, + { + "epoch": 1.928843710292249, + "grad_norm": 1.202059656455235, + "learning_rate": 1.0767459035481222e-05, + "loss": 1.7776, + "step": 1518 + }, + { + "epoch": 1.9301143583227445, + "grad_norm": 1.2616653504897855, + "learning_rate": 1.0757380372593234e-05, + "loss": 2.092, + "step": 1519 + }, + { + "epoch": 1.9313850063532403, + "grad_norm": 1.3040846469184033, + "learning_rate": 1.0747300935864245e-05, + "loss": 1.7734, + "step": 1520 + }, + { + "epoch": 1.9326556543837357, + "grad_norm": 1.3425908648306686, + "learning_rate": 1.0737220735592759e-05, + "loss": 1.9033, + "step": 1521 + }, + { + "epoch": 1.9339263024142312, + "grad_norm": 1.4151275474867275, + "learning_rate": 1.0727139782078054e-05, + "loss": 1.574, + "step": 1522 + }, + { + "epoch": 1.9351969504447268, + "grad_norm": 1.3925123627658118, + "learning_rate": 1.071705808562018e-05, + "loss": 1.7745, + "step": 1523 + }, + { + "epoch": 1.9364675984752222, + "grad_norm": 1.313720131215306, + "learning_rate": 1.0706975656519946e-05, + "loss": 1.905, + "step": 1524 + }, + { + "epoch": 1.937738246505718, + "grad_norm": 1.240030432064742, + "learning_rate": 1.0696892505078913e-05, + "loss": 1.8227, + "step": 1525 + }, + { + "epoch": 1.9390088945362134, + "grad_norm": 1.734954931533088, + "learning_rate": 1.0686808641599364e-05, + "loss": 1.4944, + "step": 1526 + }, + { + "epoch": 1.940279542566709, + "grad_norm": 1.4328846041574559, + "learning_rate": 1.0676724076384333e-05, + "loss": 1.8688, + "step": 1527 + }, + { + "epoch": 1.9415501905972046, + "grad_norm": 1.19033566630027, + "learning_rate": 1.0666638819737554e-05, + "loss": 1.8812, + "step": 1528 + }, + { + "epoch": 1.9428208386277002, + "grad_norm": 1.3933868256455422, + "learning_rate": 1.0656552881963474e-05, + "loss": 1.8662, + "step": 1529 + }, + { + "epoch": 1.9440914866581958, + "grad_norm": 1.5607668837933577, + "learning_rate": 1.0646466273367235e-05, + "loss": 1.8623, + "step": 1530 + }, + { + "epoch": 1.9453621346886911, + "grad_norm": 1.3214091567545014, + "learning_rate": 1.0636379004254665e-05, + "loss": 1.81, + "step": 1531 + }, + { + "epoch": 1.946632782719187, + "grad_norm": 1.4549576254701408, + "learning_rate": 1.062629108493226e-05, + "loss": 1.8601, + "step": 1532 + }, + { + "epoch": 1.9479034307496823, + "grad_norm": 1.2124857082535867, + "learning_rate": 1.06162025257072e-05, + "loss": 1.8104, + "step": 1533 + }, + { + "epoch": 1.949174078780178, + "grad_norm": 1.2162357925040301, + "learning_rate": 1.060611333688729e-05, + "loss": 2.0063, + "step": 1534 + }, + { + "epoch": 1.9504447268106735, + "grad_norm": 1.4642054743330333, + "learning_rate": 1.0596023528781003e-05, + "loss": 1.8917, + "step": 1535 + }, + { + "epoch": 1.9517153748411689, + "grad_norm": 1.2812106018682774, + "learning_rate": 1.058593311169743e-05, + "loss": 1.7892, + "step": 1536 + }, + { + "epoch": 1.9529860228716647, + "grad_norm": 1.3705385042814477, + "learning_rate": 1.0575842095946298e-05, + "loss": 2.1955, + "step": 1537 + }, + { + "epoch": 1.95425667090216, + "grad_norm": 1.203576822595647, + "learning_rate": 1.0565750491837925e-05, + "loss": 1.824, + "step": 1538 + }, + { + "epoch": 1.9555273189326556, + "grad_norm": 1.3119912838272654, + "learning_rate": 1.0555658309683251e-05, + "loss": 1.8335, + "step": 1539 + }, + { + "epoch": 1.9567979669631512, + "grad_norm": 1.2837165208756307, + "learning_rate": 1.0545565559793796e-05, + "loss": 1.5003, + "step": 1540 + }, + { + "epoch": 1.9580686149936466, + "grad_norm": 1.285094358215575, + "learning_rate": 1.053547225248166e-05, + "loss": 2.0034, + "step": 1541 + }, + { + "epoch": 1.9593392630241424, + "grad_norm": 1.6897389171605062, + "learning_rate": 1.0525378398059516e-05, + "loss": 1.8101, + "step": 1542 + }, + { + "epoch": 1.9606099110546378, + "grad_norm": 1.5569612139284217, + "learning_rate": 1.0515284006840596e-05, + "loss": 1.7041, + "step": 1543 + }, + { + "epoch": 1.9618805590851334, + "grad_norm": 1.4460708928459691, + "learning_rate": 1.0505189089138672e-05, + "loss": 1.8843, + "step": 1544 + }, + { + "epoch": 1.963151207115629, + "grad_norm": 1.3246627898468204, + "learning_rate": 1.049509365526807e-05, + "loss": 1.719, + "step": 1545 + }, + { + "epoch": 1.9644218551461246, + "grad_norm": 1.4195235677913507, + "learning_rate": 1.0484997715543632e-05, + "loss": 2.12, + "step": 1546 + }, + { + "epoch": 1.9656925031766201, + "grad_norm": 1.3449356124037468, + "learning_rate": 1.0474901280280717e-05, + "loss": 2.1002, + "step": 1547 + }, + { + "epoch": 1.9669631512071155, + "grad_norm": 1.411689091489764, + "learning_rate": 1.046480435979519e-05, + "loss": 1.922, + "step": 1548 + }, + { + "epoch": 1.9682337992376113, + "grad_norm": 1.3889025429782529, + "learning_rate": 1.0454706964403421e-05, + "loss": 1.8338, + "step": 1549 + }, + { + "epoch": 1.9695044472681067, + "grad_norm": 1.3058427582138856, + "learning_rate": 1.0444609104422253e-05, + "loss": 1.8817, + "step": 1550 + }, + { + "epoch": 1.9707750952986023, + "grad_norm": 1.737171300498622, + "learning_rate": 1.0434510790169014e-05, + "loss": 1.8244, + "step": 1551 + }, + { + "epoch": 1.9720457433290979, + "grad_norm": 1.3377667180365431, + "learning_rate": 1.0424412031961485e-05, + "loss": 1.6864, + "step": 1552 + }, + { + "epoch": 1.9733163913595932, + "grad_norm": 1.5352021161297504, + "learning_rate": 1.041431284011791e-05, + "loss": 2.0236, + "step": 1553 + }, + { + "epoch": 1.974587039390089, + "grad_norm": 1.4409359496788379, + "learning_rate": 1.0404213224956974e-05, + "loss": 1.8878, + "step": 1554 + }, + { + "epoch": 1.9758576874205844, + "grad_norm": 1.2230201495547812, + "learning_rate": 1.0394113196797793e-05, + "loss": 1.7995, + "step": 1555 + }, + { + "epoch": 1.97712833545108, + "grad_norm": 1.3318113041240423, + "learning_rate": 1.0384012765959904e-05, + "loss": 1.9446, + "step": 1556 + }, + { + "epoch": 1.9783989834815756, + "grad_norm": 1.4399026405166524, + "learning_rate": 1.037391194276326e-05, + "loss": 1.7571, + "step": 1557 + }, + { + "epoch": 1.9796696315120712, + "grad_norm": 1.2649007523271352, + "learning_rate": 1.0363810737528204e-05, + "loss": 1.6082, + "step": 1558 + }, + { + "epoch": 1.9809402795425668, + "grad_norm": 1.3362245297305004, + "learning_rate": 1.0353709160575488e-05, + "loss": 1.5646, + "step": 1559 + }, + { + "epoch": 1.9822109275730622, + "grad_norm": 1.3186964992067625, + "learning_rate": 1.0343607222226227e-05, + "loss": 1.5753, + "step": 1560 + }, + { + "epoch": 1.983481575603558, + "grad_norm": 1.2582780186793407, + "learning_rate": 1.0333504932801907e-05, + "loss": 2.0277, + "step": 1561 + }, + { + "epoch": 1.9847522236340533, + "grad_norm": 4.105112342397828, + "learning_rate": 1.0323402302624386e-05, + "loss": 2.1881, + "step": 1562 + }, + { + "epoch": 1.986022871664549, + "grad_norm": 1.316337918857903, + "learning_rate": 1.0313299342015855e-05, + "loss": 1.9083, + "step": 1563 + }, + { + "epoch": 1.9872935196950445, + "grad_norm": 1.4610849359414284, + "learning_rate": 1.030319606129885e-05, + "loss": 1.9046, + "step": 1564 + }, + { + "epoch": 1.98856416772554, + "grad_norm": 1.2497272565358144, + "learning_rate": 1.0293092470796236e-05, + "loss": 1.8425, + "step": 1565 + }, + { + "epoch": 1.9898348157560357, + "grad_norm": 1.4980665158805504, + "learning_rate": 1.0282988580831183e-05, + "loss": 1.9863, + "step": 1566 + }, + { + "epoch": 1.991105463786531, + "grad_norm": 1.6926076544107527, + "learning_rate": 1.027288440172718e-05, + "loss": 1.8363, + "step": 1567 + }, + { + "epoch": 1.9923761118170267, + "grad_norm": 1.4868892082114677, + "learning_rate": 1.026277994380801e-05, + "loss": 2.0619, + "step": 1568 + }, + { + "epoch": 1.9936467598475223, + "grad_norm": 1.3481010200559678, + "learning_rate": 1.0252675217397734e-05, + "loss": 1.7867, + "step": 1569 + }, + { + "epoch": 1.9949174078780176, + "grad_norm": 1.3063690989319638, + "learning_rate": 1.0242570232820687e-05, + "loss": 1.7639, + "step": 1570 + }, + { + "epoch": 1.9961880559085134, + "grad_norm": 1.269868093923743, + "learning_rate": 1.0232465000401482e-05, + "loss": 1.695, + "step": 1571 + }, + { + "epoch": 1.9974587039390088, + "grad_norm": 1.4284273129199416, + "learning_rate": 1.0222359530464964e-05, + "loss": 1.791, + "step": 1572 + }, + { + "epoch": 1.9987293519695044, + "grad_norm": 1.4282702884606915, + "learning_rate": 1.0212253833336237e-05, + "loss": 1.7568, + "step": 1573 + }, + { + "epoch": 2.0, + "grad_norm": 1.4072827771859442, + "learning_rate": 1.020214791934063e-05, + "loss": 1.7505, + "step": 1574 + }, + { + "epoch": 2.0012706480304954, + "grad_norm": 2.1922567521585887, + "learning_rate": 1.0192041798803696e-05, + "loss": 1.5317, + "step": 1575 + }, + { + "epoch": 2.002541296060991, + "grad_norm": 1.8162527031778972, + "learning_rate": 1.0181935482051198e-05, + "loss": 1.4413, + "step": 1576 + }, + { + "epoch": 2.0038119440914866, + "grad_norm": 1.750499161090013, + "learning_rate": 1.0171828979409099e-05, + "loss": 1.636, + "step": 1577 + }, + { + "epoch": 2.0050825921219824, + "grad_norm": 1.7380731045199291, + "learning_rate": 1.0161722301203554e-05, + "loss": 1.6421, + "step": 1578 + }, + { + "epoch": 2.0063532401524777, + "grad_norm": 2.1595843457290678, + "learning_rate": 1.0151615457760895e-05, + "loss": 1.4908, + "step": 1579 + }, + { + "epoch": 2.007623888182973, + "grad_norm": 3.1077355080184828, + "learning_rate": 1.0141508459407622e-05, + "loss": 1.557, + "step": 1580 + }, + { + "epoch": 2.008894536213469, + "grad_norm": 1.883335019434399, + "learning_rate": 1.01314013164704e-05, + "loss": 1.4266, + "step": 1581 + }, + { + "epoch": 2.0101651842439643, + "grad_norm": 2.0929579922525448, + "learning_rate": 1.0121294039276031e-05, + "loss": 1.7675, + "step": 1582 + }, + { + "epoch": 2.01143583227446, + "grad_norm": 1.6067207672407877, + "learning_rate": 1.0111186638151464e-05, + "loss": 1.5462, + "step": 1583 + }, + { + "epoch": 2.0127064803049555, + "grad_norm": 2.0002465791789734, + "learning_rate": 1.0101079123423771e-05, + "loss": 1.3479, + "step": 1584 + }, + { + "epoch": 2.0139771283354513, + "grad_norm": 1.521193085600093, + "learning_rate": 1.009097150542014e-05, + "loss": 1.4775, + "step": 1585 + }, + { + "epoch": 2.0152477763659467, + "grad_norm": 1.602107005268662, + "learning_rate": 1.0080863794467859e-05, + "loss": 1.475, + "step": 1586 + }, + { + "epoch": 2.016518424396442, + "grad_norm": 1.9436436215116601, + "learning_rate": 1.0070756000894321e-05, + "loss": 1.5147, + "step": 1587 + }, + { + "epoch": 2.017789072426938, + "grad_norm": 1.809577975993538, + "learning_rate": 1.0060648135026999e-05, + "loss": 1.4581, + "step": 1588 + }, + { + "epoch": 2.019059720457433, + "grad_norm": 1.720543171952214, + "learning_rate": 1.0050540207193433e-05, + "loss": 1.5072, + "step": 1589 + }, + { + "epoch": 2.020330368487929, + "grad_norm": 1.7047275216163906, + "learning_rate": 1.0040432227721242e-05, + "loss": 1.6827, + "step": 1590 + }, + { + "epoch": 2.0216010165184244, + "grad_norm": 1.5472544495690708, + "learning_rate": 1.0030324206938084e-05, + "loss": 1.5279, + "step": 1591 + }, + { + "epoch": 2.0228716645489198, + "grad_norm": 1.5430046286635146, + "learning_rate": 1.0020216155171662e-05, + "loss": 1.5495, + "step": 1592 + }, + { + "epoch": 2.0241423125794156, + "grad_norm": 1.3820694129150732, + "learning_rate": 1.0010108082749716e-05, + "loss": 1.8249, + "step": 1593 + }, + { + "epoch": 2.025412960609911, + "grad_norm": 1.6196746033148492, + "learning_rate": 1e-05, + "loss": 1.6198, + "step": 1594 + }, + { + "epoch": 2.0266836086404068, + "grad_norm": 1.4633038434106642, + "learning_rate": 9.989891917250286e-06, + "loss": 1.4854, + "step": 1595 + }, + { + "epoch": 2.027954256670902, + "grad_norm": 1.5375930378258063, + "learning_rate": 9.979783844828343e-06, + "loss": 1.5181, + "step": 1596 + }, + { + "epoch": 2.0292249047013975, + "grad_norm": 1.7262813627601707, + "learning_rate": 9.969675793061917e-06, + "loss": 1.3203, + "step": 1597 + }, + { + "epoch": 2.0304955527318933, + "grad_norm": 1.5531837084914053, + "learning_rate": 9.95956777227876e-06, + "loss": 1.5943, + "step": 1598 + }, + { + "epoch": 2.0317662007623887, + "grad_norm": 2.314212281134008, + "learning_rate": 9.949459792806569e-06, + "loss": 1.3614, + "step": 1599 + }, + { + "epoch": 2.0330368487928845, + "grad_norm": 1.56264513568958, + "learning_rate": 9.939351864973006e-06, + "loss": 1.623, + "step": 1600 + }, + { + "epoch": 2.03430749682338, + "grad_norm": 1.5310040399778346, + "learning_rate": 9.929243999105682e-06, + "loss": 1.3823, + "step": 1601 + }, + { + "epoch": 2.0355781448538757, + "grad_norm": 1.5122091613542985, + "learning_rate": 9.919136205532146e-06, + "loss": 1.5671, + "step": 1602 + }, + { + "epoch": 2.036848792884371, + "grad_norm": 1.602275195991115, + "learning_rate": 9.909028494579862e-06, + "loss": 1.4149, + "step": 1603 + }, + { + "epoch": 2.0381194409148664, + "grad_norm": 1.555548981748731, + "learning_rate": 9.89892087657623e-06, + "loss": 1.5887, + "step": 1604 + }, + { + "epoch": 2.0393900889453622, + "grad_norm": 1.789096954693768, + "learning_rate": 9.888813361848538e-06, + "loss": 1.6299, + "step": 1605 + }, + { + "epoch": 2.0406607369758576, + "grad_norm": 1.4371116279202802, + "learning_rate": 9.87870596072397e-06, + "loss": 1.5942, + "step": 1606 + }, + { + "epoch": 2.0419313850063534, + "grad_norm": 1.5448683516858457, + "learning_rate": 9.868598683529603e-06, + "loss": 1.41, + "step": 1607 + }, + { + "epoch": 2.0432020330368488, + "grad_norm": 1.5005003454194215, + "learning_rate": 9.858491540592383e-06, + "loss": 1.5644, + "step": 1608 + }, + { + "epoch": 2.044472681067344, + "grad_norm": 1.4227489543845717, + "learning_rate": 9.848384542239109e-06, + "loss": 1.3633, + "step": 1609 + }, + { + "epoch": 2.04574332909784, + "grad_norm": 1.566848584554528, + "learning_rate": 9.83827769879645e-06, + "loss": 1.4524, + "step": 1610 + }, + { + "epoch": 2.0470139771283353, + "grad_norm": 1.5994679206945885, + "learning_rate": 9.828171020590903e-06, + "loss": 1.3457, + "step": 1611 + }, + { + "epoch": 2.048284625158831, + "grad_norm": 1.5669877597200432, + "learning_rate": 9.818064517948806e-06, + "loss": 1.6205, + "step": 1612 + }, + { + "epoch": 2.0495552731893265, + "grad_norm": 1.8089768376002517, + "learning_rate": 9.807958201196307e-06, + "loss": 1.6253, + "step": 1613 + }, + { + "epoch": 2.0508259212198223, + "grad_norm": 1.5784294416030926, + "learning_rate": 9.797852080659375e-06, + "loss": 1.5439, + "step": 1614 + }, + { + "epoch": 2.0520965692503177, + "grad_norm": 1.6938171380318698, + "learning_rate": 9.787746166663765e-06, + "loss": 1.5235, + "step": 1615 + }, + { + "epoch": 2.053367217280813, + "grad_norm": 1.5592263744572192, + "learning_rate": 9.777640469535037e-06, + "loss": 1.3738, + "step": 1616 + }, + { + "epoch": 2.054637865311309, + "grad_norm": 1.6890696349677892, + "learning_rate": 9.76753499959852e-06, + "loss": 1.4987, + "step": 1617 + }, + { + "epoch": 2.0559085133418042, + "grad_norm": 1.7276203896324043, + "learning_rate": 9.757429767179314e-06, + "loss": 1.5838, + "step": 1618 + }, + { + "epoch": 2.0571791613723, + "grad_norm": 1.4661617737870516, + "learning_rate": 9.74732478260227e-06, + "loss": 1.503, + "step": 1619 + }, + { + "epoch": 2.0584498094027954, + "grad_norm": 1.3693147692588428, + "learning_rate": 9.737220056191995e-06, + "loss": 1.5142, + "step": 1620 + }, + { + "epoch": 2.059720457433291, + "grad_norm": 1.517200118021235, + "learning_rate": 9.727115598272821e-06, + "loss": 1.518, + "step": 1621 + }, + { + "epoch": 2.0609911054637866, + "grad_norm": 1.3717809560979908, + "learning_rate": 9.71701141916882e-06, + "loss": 1.6858, + "step": 1622 + }, + { + "epoch": 2.062261753494282, + "grad_norm": 1.5954676865844675, + "learning_rate": 9.706907529203769e-06, + "loss": 1.6237, + "step": 1623 + }, + { + "epoch": 2.063532401524778, + "grad_norm": 28.333500518838918, + "learning_rate": 9.696803938701153e-06, + "loss": 2.1156, + "step": 1624 + }, + { + "epoch": 2.064803049555273, + "grad_norm": 1.495655727010147, + "learning_rate": 9.686700657984148e-06, + "loss": 1.4315, + "step": 1625 + }, + { + "epoch": 2.0660736975857685, + "grad_norm": 1.46669444898845, + "learning_rate": 9.676597697375615e-06, + "loss": 1.5926, + "step": 1626 + }, + { + "epoch": 2.0673443456162643, + "grad_norm": 1.4893524309502175, + "learning_rate": 9.666495067198094e-06, + "loss": 1.6936, + "step": 1627 + }, + { + "epoch": 2.0686149936467597, + "grad_norm": 1.4657156784120036, + "learning_rate": 9.656392777773778e-06, + "loss": 1.7206, + "step": 1628 + }, + { + "epoch": 2.0698856416772555, + "grad_norm": 1.528267204530225, + "learning_rate": 9.646290839424515e-06, + "loss": 1.5584, + "step": 1629 + }, + { + "epoch": 2.071156289707751, + "grad_norm": 1.626357254375774, + "learning_rate": 9.6361892624718e-06, + "loss": 1.6127, + "step": 1630 + }, + { + "epoch": 2.0724269377382467, + "grad_norm": 1.4399509284642533, + "learning_rate": 9.626088057236745e-06, + "loss": 1.5282, + "step": 1631 + }, + { + "epoch": 2.073697585768742, + "grad_norm": 1.4694728842146232, + "learning_rate": 9.615987234040098e-06, + "loss": 1.6358, + "step": 1632 + }, + { + "epoch": 2.0749682337992374, + "grad_norm": 1.3107896183282848, + "learning_rate": 9.60588680320221e-06, + "loss": 1.4943, + "step": 1633 + }, + { + "epoch": 2.0762388818297333, + "grad_norm": 1.3258155601804371, + "learning_rate": 9.595786775043028e-06, + "loss": 1.5086, + "step": 1634 + }, + { + "epoch": 2.0775095298602286, + "grad_norm": 1.6867655906427885, + "learning_rate": 9.585687159882092e-06, + "loss": 1.6189, + "step": 1635 + }, + { + "epoch": 2.0787801778907244, + "grad_norm": 1.5458307869086416, + "learning_rate": 9.57558796803852e-06, + "loss": 1.7701, + "step": 1636 + }, + { + "epoch": 2.08005082592122, + "grad_norm": 1.556087014644666, + "learning_rate": 9.565489209830991e-06, + "loss": 1.5319, + "step": 1637 + }, + { + "epoch": 2.081321473951715, + "grad_norm": 1.5341891110262698, + "learning_rate": 9.555390895577748e-06, + "loss": 1.5834, + "step": 1638 + }, + { + "epoch": 2.082592121982211, + "grad_norm": 1.6591782020014392, + "learning_rate": 9.54529303559658e-06, + "loss": 1.3932, + "step": 1639 + }, + { + "epoch": 2.0838627700127064, + "grad_norm": 1.6103571396160286, + "learning_rate": 9.535195640204811e-06, + "loss": 1.5176, + "step": 1640 + }, + { + "epoch": 2.085133418043202, + "grad_norm": 1.648255448151265, + "learning_rate": 9.525098719719285e-06, + "loss": 1.6029, + "step": 1641 + }, + { + "epoch": 2.0864040660736975, + "grad_norm": 1.576871559589654, + "learning_rate": 9.515002284456373e-06, + "loss": 1.4553, + "step": 1642 + }, + { + "epoch": 2.0876747141041934, + "grad_norm": 1.573271991288505, + "learning_rate": 9.504906344731933e-06, + "loss": 1.5202, + "step": 1643 + }, + { + "epoch": 2.0889453621346887, + "grad_norm": 1.5214587878309893, + "learning_rate": 9.494810910861328e-06, + "loss": 1.5757, + "step": 1644 + }, + { + "epoch": 2.090216010165184, + "grad_norm": 1.6462726696144563, + "learning_rate": 9.484715993159407e-06, + "loss": 1.5592, + "step": 1645 + }, + { + "epoch": 2.09148665819568, + "grad_norm": 1.4765489474527553, + "learning_rate": 9.474621601940488e-06, + "loss": 1.5569, + "step": 1646 + }, + { + "epoch": 2.0927573062261753, + "grad_norm": 1.3643850828357655, + "learning_rate": 9.464527747518344e-06, + "loss": 1.4732, + "step": 1647 + }, + { + "epoch": 2.094027954256671, + "grad_norm": 1.5624165272696018, + "learning_rate": 9.454434440206211e-06, + "loss": 1.5263, + "step": 1648 + }, + { + "epoch": 2.0952986022871665, + "grad_norm": 1.5291621427497337, + "learning_rate": 9.444341690316754e-06, + "loss": 1.3224, + "step": 1649 + }, + { + "epoch": 2.096569250317662, + "grad_norm": 1.6367742909471383, + "learning_rate": 9.434249508162076e-06, + "loss": 1.472, + "step": 1650 + }, + { + "epoch": 2.0978398983481577, + "grad_norm": 1.5909269637605061, + "learning_rate": 9.424157904053705e-06, + "loss": 1.3923, + "step": 1651 + }, + { + "epoch": 2.099110546378653, + "grad_norm": 1.5625260293281584, + "learning_rate": 9.414066888302572e-06, + "loss": 1.4278, + "step": 1652 + }, + { + "epoch": 2.100381194409149, + "grad_norm": 1.6300100018177057, + "learning_rate": 9.403976471219e-06, + "loss": 1.584, + "step": 1653 + }, + { + "epoch": 2.101651842439644, + "grad_norm": 1.6298229936291917, + "learning_rate": 9.393886663112714e-06, + "loss": 1.7778, + "step": 1654 + }, + { + "epoch": 2.1029224904701396, + "grad_norm": 1.8296972349127292, + "learning_rate": 9.383797474292804e-06, + "loss": 1.5345, + "step": 1655 + }, + { + "epoch": 2.1041931385006354, + "grad_norm": 1.566619499783697, + "learning_rate": 9.373708915067738e-06, + "loss": 1.3907, + "step": 1656 + }, + { + "epoch": 2.1054637865311308, + "grad_norm": 1.927986374560233, + "learning_rate": 9.363620995745337e-06, + "loss": 1.7173, + "step": 1657 + }, + { + "epoch": 2.1067344345616266, + "grad_norm": 1.707036241486098, + "learning_rate": 9.353533726632768e-06, + "loss": 1.5264, + "step": 1658 + }, + { + "epoch": 2.108005082592122, + "grad_norm": 1.6716763707563085, + "learning_rate": 9.343447118036528e-06, + "loss": 1.6883, + "step": 1659 + }, + { + "epoch": 2.1092757306226178, + "grad_norm": 1.4522206766291692, + "learning_rate": 9.33336118026245e-06, + "loss": 1.5433, + "step": 1660 + }, + { + "epoch": 2.110546378653113, + "grad_norm": 1.5721077886542751, + "learning_rate": 9.323275923615669e-06, + "loss": 1.6695, + "step": 1661 + }, + { + "epoch": 2.1118170266836085, + "grad_norm": 1.5761380639840767, + "learning_rate": 9.313191358400638e-06, + "loss": 1.2518, + "step": 1662 + }, + { + "epoch": 2.1130876747141043, + "grad_norm": 1.672209113109455, + "learning_rate": 9.30310749492109e-06, + "loss": 1.6208, + "step": 1663 + }, + { + "epoch": 2.1143583227445997, + "grad_norm": 1.656915671786083, + "learning_rate": 9.293024343480056e-06, + "loss": 1.4643, + "step": 1664 + }, + { + "epoch": 2.1156289707750955, + "grad_norm": 1.483378364811783, + "learning_rate": 9.282941914379821e-06, + "loss": 1.5757, + "step": 1665 + }, + { + "epoch": 2.116899618805591, + "grad_norm": 1.4992838950739726, + "learning_rate": 9.272860217921951e-06, + "loss": 1.5028, + "step": 1666 + }, + { + "epoch": 2.1181702668360862, + "grad_norm": 1.6155109191754602, + "learning_rate": 9.262779264407245e-06, + "loss": 1.6201, + "step": 1667 + }, + { + "epoch": 2.119440914866582, + "grad_norm": 1.4819695282440337, + "learning_rate": 9.252699064135759e-06, + "loss": 1.4906, + "step": 1668 + }, + { + "epoch": 2.1207115628970774, + "grad_norm": 1.4590395485050054, + "learning_rate": 9.24261962740677e-06, + "loss": 1.3716, + "step": 1669 + }, + { + "epoch": 2.121982210927573, + "grad_norm": 1.7291314303929641, + "learning_rate": 9.23254096451878e-06, + "loss": 1.8731, + "step": 1670 + }, + { + "epoch": 2.1232528589580686, + "grad_norm": 1.5878570493809223, + "learning_rate": 9.222463085769495e-06, + "loss": 1.565, + "step": 1671 + }, + { + "epoch": 2.124523506988564, + "grad_norm": 1.6488457808778345, + "learning_rate": 9.212386001455826e-06, + "loss": 1.8541, + "step": 1672 + }, + { + "epoch": 2.1257941550190598, + "grad_norm": 1.5243218279056652, + "learning_rate": 9.202309721873861e-06, + "loss": 1.5937, + "step": 1673 + }, + { + "epoch": 2.127064803049555, + "grad_norm": 1.7912749204453913, + "learning_rate": 9.192234257318883e-06, + "loss": 1.5461, + "step": 1674 + }, + { + "epoch": 2.128335451080051, + "grad_norm": 1.440834640424777, + "learning_rate": 9.182159618085328e-06, + "loss": 1.5988, + "step": 1675 + }, + { + "epoch": 2.1296060991105463, + "grad_norm": 1.429961079837902, + "learning_rate": 9.172085814466798e-06, + "loss": 1.5742, + "step": 1676 + }, + { + "epoch": 2.130876747141042, + "grad_norm": 1.7624163980029814, + "learning_rate": 9.162012856756031e-06, + "loss": 1.5154, + "step": 1677 + }, + { + "epoch": 2.1321473951715375, + "grad_norm": 1.430025091724334, + "learning_rate": 9.151940755244912e-06, + "loss": 1.7124, + "step": 1678 + }, + { + "epoch": 2.133418043202033, + "grad_norm": 1.4365126628311997, + "learning_rate": 9.141869520224438e-06, + "loss": 1.5253, + "step": 1679 + }, + { + "epoch": 2.1346886912325287, + "grad_norm": 1.6293871964817268, + "learning_rate": 9.131799161984738e-06, + "loss": 1.585, + "step": 1680 + }, + { + "epoch": 2.135959339263024, + "grad_norm": 1.6621412081818165, + "learning_rate": 9.12172969081503e-06, + "loss": 1.6056, + "step": 1681 + }, + { + "epoch": 2.13722998729352, + "grad_norm": 1.6866346755461035, + "learning_rate": 9.111661117003637e-06, + "loss": 1.4979, + "step": 1682 + }, + { + "epoch": 2.1385006353240152, + "grad_norm": 1.5713172138081546, + "learning_rate": 9.101593450837952e-06, + "loss": 1.4382, + "step": 1683 + }, + { + "epoch": 2.1397712833545106, + "grad_norm": 1.5893242031849881, + "learning_rate": 9.091526702604448e-06, + "loss": 1.4081, + "step": 1684 + }, + { + "epoch": 2.1410419313850064, + "grad_norm": 1.6075705545790635, + "learning_rate": 9.081460882588668e-06, + "loss": 1.7753, + "step": 1685 + }, + { + "epoch": 2.142312579415502, + "grad_norm": 1.5260956701483068, + "learning_rate": 9.071396001075195e-06, + "loss": 1.5592, + "step": 1686 + }, + { + "epoch": 2.1435832274459976, + "grad_norm": 1.5492888125581123, + "learning_rate": 9.061332068347654e-06, + "loss": 1.727, + "step": 1687 + }, + { + "epoch": 2.144853875476493, + "grad_norm": 1.3887209879997233, + "learning_rate": 9.05126909468871e-06, + "loss": 1.3388, + "step": 1688 + }, + { + "epoch": 2.1461245235069883, + "grad_norm": 1.5828322115240465, + "learning_rate": 9.041207090380035e-06, + "loss": 1.8861, + "step": 1689 + }, + { + "epoch": 2.147395171537484, + "grad_norm": 1.5801839038857823, + "learning_rate": 9.031146065702316e-06, + "loss": 1.6062, + "step": 1690 + }, + { + "epoch": 2.1486658195679795, + "grad_norm": 1.6933001742313516, + "learning_rate": 9.021086030935248e-06, + "loss": 1.6028, + "step": 1691 + }, + { + "epoch": 2.1499364675984753, + "grad_norm": 1.4970349556990488, + "learning_rate": 9.011026996357504e-06, + "loss": 1.5388, + "step": 1692 + }, + { + "epoch": 2.1512071156289707, + "grad_norm": 1.5235751770699257, + "learning_rate": 9.000968972246734e-06, + "loss": 1.6252, + "step": 1693 + }, + { + "epoch": 2.1524777636594665, + "grad_norm": 1.5804044095803875, + "learning_rate": 8.990911968879566e-06, + "loss": 1.6521, + "step": 1694 + }, + { + "epoch": 2.153748411689962, + "grad_norm": 1.7761792114379973, + "learning_rate": 8.98085599653157e-06, + "loss": 1.5486, + "step": 1695 + }, + { + "epoch": 2.1550190597204573, + "grad_norm": 1.734623727959135, + "learning_rate": 8.970801065477276e-06, + "loss": 1.6489, + "step": 1696 + }, + { + "epoch": 2.156289707750953, + "grad_norm": 1.4507938151329542, + "learning_rate": 8.960747185990147e-06, + "loss": 1.2433, + "step": 1697 + }, + { + "epoch": 2.1575603557814484, + "grad_norm": 1.4898993070280564, + "learning_rate": 8.950694368342568e-06, + "loss": 1.4389, + "step": 1698 + }, + { + "epoch": 2.1588310038119443, + "grad_norm": 1.570539349783722, + "learning_rate": 8.94064262280584e-06, + "loss": 1.5924, + "step": 1699 + }, + { + "epoch": 2.1601016518424396, + "grad_norm": 1.6282082966926994, + "learning_rate": 8.930591959650173e-06, + "loss": 1.6758, + "step": 1700 + }, + { + "epoch": 2.161372299872935, + "grad_norm": 1.4261021613348666, + "learning_rate": 8.920542389144663e-06, + "loss": 1.5793, + "step": 1701 + }, + { + "epoch": 2.162642947903431, + "grad_norm": 1.5578874586391542, + "learning_rate": 8.910493921557293e-06, + "loss": 1.6233, + "step": 1702 + }, + { + "epoch": 2.163913595933926, + "grad_norm": 1.7188727597552311, + "learning_rate": 8.900446567154924e-06, + "loss": 1.4443, + "step": 1703 + }, + { + "epoch": 2.165184243964422, + "grad_norm": 1.4826880086010712, + "learning_rate": 8.89040033620328e-06, + "loss": 1.5363, + "step": 1704 + }, + { + "epoch": 2.1664548919949174, + "grad_norm": 1.5783201593369776, + "learning_rate": 8.880355238966923e-06, + "loss": 1.626, + "step": 1705 + }, + { + "epoch": 2.1677255400254127, + "grad_norm": 1.6043102679888508, + "learning_rate": 8.870311285709274e-06, + "loss": 1.1562, + "step": 1706 + }, + { + "epoch": 2.1689961880559085, + "grad_norm": 1.6574392377919835, + "learning_rate": 8.860268486692575e-06, + "loss": 1.8377, + "step": 1707 + }, + { + "epoch": 2.170266836086404, + "grad_norm": 1.4871357831315282, + "learning_rate": 8.85022685217789e-06, + "loss": 1.457, + "step": 1708 + }, + { + "epoch": 2.1715374841168997, + "grad_norm": 1.5616476648026785, + "learning_rate": 8.840186392425098e-06, + "loss": 1.6417, + "step": 1709 + }, + { + "epoch": 2.172808132147395, + "grad_norm": 1.7895366394288845, + "learning_rate": 8.830147117692876e-06, + "loss": 1.6573, + "step": 1710 + }, + { + "epoch": 2.174078780177891, + "grad_norm": 1.7137363094879385, + "learning_rate": 8.820109038238682e-06, + "loss": 1.6524, + "step": 1711 + }, + { + "epoch": 2.1753494282083863, + "grad_norm": 1.845553280517191, + "learning_rate": 8.810072164318766e-06, + "loss": 1.6691, + "step": 1712 + }, + { + "epoch": 2.1766200762388817, + "grad_norm": 1.6081167744680605, + "learning_rate": 8.80003650618813e-06, + "loss": 1.47, + "step": 1713 + }, + { + "epoch": 2.1778907242693775, + "grad_norm": 1.5525755260553589, + "learning_rate": 8.790002074100556e-06, + "loss": 1.3088, + "step": 1714 + }, + { + "epoch": 2.179161372299873, + "grad_norm": 10.268603491141855, + "learning_rate": 8.779968878308554e-06, + "loss": 1.6433, + "step": 1715 + }, + { + "epoch": 2.1804320203303686, + "grad_norm": 1.4235078712294176, + "learning_rate": 8.769936929063381e-06, + "loss": 1.5117, + "step": 1716 + }, + { + "epoch": 2.181702668360864, + "grad_norm": 1.7351705689202577, + "learning_rate": 8.75990623661501e-06, + "loss": 1.6901, + "step": 1717 + }, + { + "epoch": 2.1829733163913594, + "grad_norm": 1.4694761500743818, + "learning_rate": 8.749876811212144e-06, + "loss": 1.6197, + "step": 1718 + }, + { + "epoch": 2.184243964421855, + "grad_norm": 1.516154165521079, + "learning_rate": 8.739848663102176e-06, + "loss": 1.5381, + "step": 1719 + }, + { + "epoch": 2.1855146124523506, + "grad_norm": 1.6996412072679006, + "learning_rate": 8.729821802531213e-06, + "loss": 1.5123, + "step": 1720 + }, + { + "epoch": 2.1867852604828464, + "grad_norm": 1.3431724285108444, + "learning_rate": 8.719796239744029e-06, + "loss": 1.5364, + "step": 1721 + }, + { + "epoch": 2.1880559085133418, + "grad_norm": 1.5892564675817367, + "learning_rate": 8.70977198498408e-06, + "loss": 1.7507, + "step": 1722 + }, + { + "epoch": 2.189326556543837, + "grad_norm": 1.5438579499894396, + "learning_rate": 8.699749048493483e-06, + "loss": 1.4018, + "step": 1723 + }, + { + "epoch": 2.190597204574333, + "grad_norm": 1.4519940531190099, + "learning_rate": 8.689727440513013e-06, + "loss": 1.672, + "step": 1724 + }, + { + "epoch": 2.1918678526048283, + "grad_norm": 1.703684111028615, + "learning_rate": 8.679707171282073e-06, + "loss": 1.2744, + "step": 1725 + }, + { + "epoch": 2.193138500635324, + "grad_norm": 1.4577231337063985, + "learning_rate": 8.669688251038726e-06, + "loss": 1.6224, + "step": 1726 + }, + { + "epoch": 2.1944091486658195, + "grad_norm": 1.4475200873626741, + "learning_rate": 8.659670690019626e-06, + "loss": 1.508, + "step": 1727 + }, + { + "epoch": 2.1956797966963153, + "grad_norm": 1.5782944530270577, + "learning_rate": 8.64965449846006e-06, + "loss": 1.4206, + "step": 1728 + }, + { + "epoch": 2.1969504447268107, + "grad_norm": 1.4495894755794485, + "learning_rate": 8.639639686593904e-06, + "loss": 1.5102, + "step": 1729 + }, + { + "epoch": 2.198221092757306, + "grad_norm": 1.7498846987619256, + "learning_rate": 8.62962626465363e-06, + "loss": 1.5369, + "step": 1730 + }, + { + "epoch": 2.199491740787802, + "grad_norm": 1.5255113933892452, + "learning_rate": 8.61961424287028e-06, + "loss": 1.3851, + "step": 1731 + }, + { + "epoch": 2.200762388818297, + "grad_norm": 1.6512648505337741, + "learning_rate": 8.609603631473487e-06, + "loss": 1.53, + "step": 1732 + }, + { + "epoch": 2.202033036848793, + "grad_norm": 1.5399236926216269, + "learning_rate": 8.599594440691419e-06, + "loss": 1.4815, + "step": 1733 + }, + { + "epoch": 2.2033036848792884, + "grad_norm": 1.689193523447338, + "learning_rate": 8.58958668075081e-06, + "loss": 1.5956, + "step": 1734 + }, + { + "epoch": 2.204574332909784, + "grad_norm": 1.5459365286861693, + "learning_rate": 8.579580361876917e-06, + "loss": 1.5584, + "step": 1735 + }, + { + "epoch": 2.2058449809402796, + "grad_norm": 1.6568848921883528, + "learning_rate": 8.56957549429354e-06, + "loss": 1.5814, + "step": 1736 + }, + { + "epoch": 2.207115628970775, + "grad_norm": 1.61258329416859, + "learning_rate": 8.55957208822298e-06, + "loss": 1.4887, + "step": 1737 + }, + { + "epoch": 2.2083862770012708, + "grad_norm": 1.753927164526745, + "learning_rate": 8.549570153886062e-06, + "loss": 1.5408, + "step": 1738 + }, + { + "epoch": 2.209656925031766, + "grad_norm": 1.4274659907296536, + "learning_rate": 8.539569701502096e-06, + "loss": 1.5375, + "step": 1739 + }, + { + "epoch": 2.210927573062262, + "grad_norm": 1.620886641379908, + "learning_rate": 8.529570741288882e-06, + "loss": 1.6127, + "step": 1740 + }, + { + "epoch": 2.2121982210927573, + "grad_norm": 1.4694381578113502, + "learning_rate": 8.519573283462688e-06, + "loss": 1.6902, + "step": 1741 + }, + { + "epoch": 2.2134688691232527, + "grad_norm": 1.4208135400626172, + "learning_rate": 8.509577338238255e-06, + "loss": 1.759, + "step": 1742 + }, + { + "epoch": 2.2147395171537485, + "grad_norm": 1.6453102941428899, + "learning_rate": 8.499582915828782e-06, + "loss": 1.5061, + "step": 1743 + }, + { + "epoch": 2.216010165184244, + "grad_norm": 1.5204855654557505, + "learning_rate": 8.489590026445902e-06, + "loss": 1.5374, + "step": 1744 + }, + { + "epoch": 2.2172808132147397, + "grad_norm": 1.9856026444957109, + "learning_rate": 8.479598680299686e-06, + "loss": 1.3209, + "step": 1745 + }, + { + "epoch": 2.218551461245235, + "grad_norm": 1.675785199137644, + "learning_rate": 8.46960888759863e-06, + "loss": 1.6129, + "step": 1746 + }, + { + "epoch": 2.2198221092757304, + "grad_norm": 1.4868040261292785, + "learning_rate": 8.459620658549638e-06, + "loss": 1.5243, + "step": 1747 + }, + { + "epoch": 2.2210927573062262, + "grad_norm": 1.6585116655037806, + "learning_rate": 8.449634003358022e-06, + "loss": 1.4083, + "step": 1748 + }, + { + "epoch": 2.2223634053367216, + "grad_norm": 1.6873083046784862, + "learning_rate": 8.439648932227483e-06, + "loss": 1.3082, + "step": 1749 + }, + { + "epoch": 2.2236340533672174, + "grad_norm": 1.6185024009440863, + "learning_rate": 8.429665455360107e-06, + "loss": 1.6422, + "step": 1750 + }, + { + "epoch": 2.224904701397713, + "grad_norm": 1.424835120406413, + "learning_rate": 8.419683582956343e-06, + "loss": 1.4796, + "step": 1751 + }, + { + "epoch": 2.2261753494282086, + "grad_norm": 1.6087218356524635, + "learning_rate": 8.40970332521501e-06, + "loss": 1.3767, + "step": 1752 + }, + { + "epoch": 2.227445997458704, + "grad_norm": 1.513737894452076, + "learning_rate": 8.39972469233327e-06, + "loss": 1.512, + "step": 1753 + }, + { + "epoch": 2.2287166454891993, + "grad_norm": 1.370802180357388, + "learning_rate": 8.389747694506626e-06, + "loss": 1.3081, + "step": 1754 + }, + { + "epoch": 2.229987293519695, + "grad_norm": 1.4791124657047963, + "learning_rate": 8.379772341928916e-06, + "loss": 1.7237, + "step": 1755 + }, + { + "epoch": 2.2312579415501905, + "grad_norm": 1.5737279887950195, + "learning_rate": 8.369798644792295e-06, + "loss": 1.5387, + "step": 1756 + }, + { + "epoch": 2.2325285895806863, + "grad_norm": 1.5662459212887674, + "learning_rate": 8.359826613287218e-06, + "loss": 1.2879, + "step": 1757 + }, + { + "epoch": 2.2337992376111817, + "grad_norm": 1.799364522743647, + "learning_rate": 8.349856257602453e-06, + "loss": 1.5041, + "step": 1758 + }, + { + "epoch": 2.235069885641677, + "grad_norm": 1.5973977396672083, + "learning_rate": 8.33988758792504e-06, + "loss": 1.4523, + "step": 1759 + }, + { + "epoch": 2.236340533672173, + "grad_norm": 2.074809025149893, + "learning_rate": 8.329920614440306e-06, + "loss": 1.5942, + "step": 1760 + }, + { + "epoch": 2.2376111817026683, + "grad_norm": 1.4469566898299258, + "learning_rate": 8.319955347331847e-06, + "loss": 1.5738, + "step": 1761 + }, + { + "epoch": 2.238881829733164, + "grad_norm": 1.667672361818746, + "learning_rate": 8.309991796781512e-06, + "loss": 1.591, + "step": 1762 + }, + { + "epoch": 2.2401524777636594, + "grad_norm": 1.5784668955515517, + "learning_rate": 8.300029972969389e-06, + "loss": 1.8242, + "step": 1763 + }, + { + "epoch": 2.241423125794155, + "grad_norm": 1.5327500460115688, + "learning_rate": 8.290069886073815e-06, + "loss": 1.4986, + "step": 1764 + }, + { + "epoch": 2.2426937738246506, + "grad_norm": 1.713595354060149, + "learning_rate": 8.280111546271342e-06, + "loss": 1.6527, + "step": 1765 + }, + { + "epoch": 2.243964421855146, + "grad_norm": 1.5073458704886182, + "learning_rate": 8.270154963736737e-06, + "loss": 1.6408, + "step": 1766 + }, + { + "epoch": 2.245235069885642, + "grad_norm": 1.7682102522502663, + "learning_rate": 8.260200148642982e-06, + "loss": 1.4513, + "step": 1767 + }, + { + "epoch": 2.246505717916137, + "grad_norm": 1.6063360903705282, + "learning_rate": 8.250247111161248e-06, + "loss": 1.5636, + "step": 1768 + }, + { + "epoch": 2.247776365946633, + "grad_norm": 1.8512710323136712, + "learning_rate": 8.24029586146088e-06, + "loss": 1.5229, + "step": 1769 + }, + { + "epoch": 2.2490470139771284, + "grad_norm": 1.506792015467357, + "learning_rate": 8.230346409709414e-06, + "loss": 1.6667, + "step": 1770 + }, + { + "epoch": 2.2503176620076237, + "grad_norm": 1.490949180957695, + "learning_rate": 8.220398766072526e-06, + "loss": 1.5765, + "step": 1771 + }, + { + "epoch": 2.2515883100381195, + "grad_norm": 1.5367252773147595, + "learning_rate": 8.210452940714072e-06, + "loss": 1.588, + "step": 1772 + }, + { + "epoch": 2.252858958068615, + "grad_norm": 1.5451088380268685, + "learning_rate": 8.20050894379603e-06, + "loss": 1.5953, + "step": 1773 + }, + { + "epoch": 2.2541296060991107, + "grad_norm": 1.3478757161343975, + "learning_rate": 8.190566785478517e-06, + "loss": 1.5497, + "step": 1774 + }, + { + "epoch": 2.255400254129606, + "grad_norm": 1.5285510060778615, + "learning_rate": 8.180626475919768e-06, + "loss": 1.6153, + "step": 1775 + }, + { + "epoch": 2.2566709021601015, + "grad_norm": 1.7740152739154276, + "learning_rate": 8.170688025276134e-06, + "loss": 1.6365, + "step": 1776 + }, + { + "epoch": 2.2579415501905973, + "grad_norm": 1.7135233218143595, + "learning_rate": 8.160751443702062e-06, + "loss": 1.5003, + "step": 1777 + }, + { + "epoch": 2.2592121982210926, + "grad_norm": 1.408636468122467, + "learning_rate": 8.150816741350099e-06, + "loss": 1.6208, + "step": 1778 + }, + { + "epoch": 2.2604828462515885, + "grad_norm": 1.841401544872553, + "learning_rate": 8.140883928370855e-06, + "loss": 1.6257, + "step": 1779 + }, + { + "epoch": 2.261753494282084, + "grad_norm": 1.8089992703143447, + "learning_rate": 8.130953014913025e-06, + "loss": 1.5855, + "step": 1780 + }, + { + "epoch": 2.263024142312579, + "grad_norm": 1.382352287583554, + "learning_rate": 8.121024011123353e-06, + "loss": 1.4472, + "step": 1781 + }, + { + "epoch": 2.264294790343075, + "grad_norm": 1.9623033136293564, + "learning_rate": 8.11109692714664e-06, + "loss": 1.5006, + "step": 1782 + }, + { + "epoch": 2.2655654383735704, + "grad_norm": 1.4466012909581296, + "learning_rate": 8.101171773125716e-06, + "loss": 1.4422, + "step": 1783 + }, + { + "epoch": 2.266836086404066, + "grad_norm": 1.4731390191249392, + "learning_rate": 8.091248559201453e-06, + "loss": 1.5608, + "step": 1784 + }, + { + "epoch": 2.2681067344345616, + "grad_norm": 1.9585030783409343, + "learning_rate": 8.081327295512726e-06, + "loss": 1.5666, + "step": 1785 + }, + { + "epoch": 2.2693773824650574, + "grad_norm": 1.8934712477147808, + "learning_rate": 8.071407992196428e-06, + "loss": 1.6951, + "step": 1786 + }, + { + "epoch": 2.2706480304955527, + "grad_norm": 1.4255685952424664, + "learning_rate": 8.061490659387441e-06, + "loss": 1.5243, + "step": 1787 + }, + { + "epoch": 2.271918678526048, + "grad_norm": 1.6125269843284684, + "learning_rate": 8.051575307218637e-06, + "loss": 1.6832, + "step": 1788 + }, + { + "epoch": 2.273189326556544, + "grad_norm": 1.5699039685068195, + "learning_rate": 8.041661945820866e-06, + "loss": 1.5794, + "step": 1789 + }, + { + "epoch": 2.2744599745870393, + "grad_norm": 1.4677802591183078, + "learning_rate": 8.031750585322948e-06, + "loss": 1.7092, + "step": 1790 + }, + { + "epoch": 2.275730622617535, + "grad_norm": 1.6717478208537573, + "learning_rate": 8.021841235851646e-06, + "loss": 1.6947, + "step": 1791 + }, + { + "epoch": 2.2770012706480305, + "grad_norm": 1.4459052193819664, + "learning_rate": 8.01193390753168e-06, + "loss": 1.5133, + "step": 1792 + }, + { + "epoch": 2.2782719186785263, + "grad_norm": 1.3865773438986266, + "learning_rate": 8.002028610485695e-06, + "loss": 1.4625, + "step": 1793 + }, + { + "epoch": 2.2795425667090217, + "grad_norm": 1.488213716387411, + "learning_rate": 7.992125354834273e-06, + "loss": 1.7199, + "step": 1794 + }, + { + "epoch": 2.280813214739517, + "grad_norm": 1.388050280743275, + "learning_rate": 7.982224150695896e-06, + "loss": 1.5348, + "step": 1795 + }, + { + "epoch": 2.282083862770013, + "grad_norm": 1.3073310825896807, + "learning_rate": 7.972325008186966e-06, + "loss": 1.4043, + "step": 1796 + }, + { + "epoch": 2.283354510800508, + "grad_norm": 1.8266034226850034, + "learning_rate": 7.962427937421763e-06, + "loss": 1.6173, + "step": 1797 + }, + { + "epoch": 2.2846251588310036, + "grad_norm": 1.580567884125329, + "learning_rate": 7.952532948512464e-06, + "loss": 1.4702, + "step": 1798 + }, + { + "epoch": 2.2858958068614994, + "grad_norm": 1.729096492111216, + "learning_rate": 7.942640051569102e-06, + "loss": 1.831, + "step": 1799 + }, + { + "epoch": 2.2871664548919948, + "grad_norm": 1.6894573030661406, + "learning_rate": 7.932749256699588e-06, + "loss": 1.4857, + "step": 1800 + }, + { + "epoch": 2.2884371029224906, + "grad_norm": 1.6467272171651914, + "learning_rate": 7.92286057400968e-06, + "loss": 1.6362, + "step": 1801 + }, + { + "epoch": 2.289707750952986, + "grad_norm": 1.6330545870487407, + "learning_rate": 7.91297401360298e-06, + "loss": 1.6352, + "step": 1802 + }, + { + "epoch": 2.2909783989834818, + "grad_norm": 1.4708231561749436, + "learning_rate": 7.903089585580914e-06, + "loss": 1.5156, + "step": 1803 + }, + { + "epoch": 2.292249047013977, + "grad_norm": 1.7160301684303687, + "learning_rate": 7.89320730004274e-06, + "loss": 1.4859, + "step": 1804 + }, + { + "epoch": 2.2935196950444725, + "grad_norm": 1.6527094214901459, + "learning_rate": 7.883327167085514e-06, + "loss": 1.4175, + "step": 1805 + }, + { + "epoch": 2.2947903430749683, + "grad_norm": 1.6225243402916338, + "learning_rate": 7.873449196804106e-06, + "loss": 1.3302, + "step": 1806 + }, + { + "epoch": 2.2960609911054637, + "grad_norm": 1.6122001794500487, + "learning_rate": 7.863573399291169e-06, + "loss": 1.5241, + "step": 1807 + }, + { + "epoch": 2.2973316391359595, + "grad_norm": 1.69386117216537, + "learning_rate": 7.853699784637139e-06, + "loss": 1.4738, + "step": 1808 + }, + { + "epoch": 2.298602287166455, + "grad_norm": 1.6021965442081367, + "learning_rate": 7.843828362930217e-06, + "loss": 1.5937, + "step": 1809 + }, + { + "epoch": 2.2998729351969507, + "grad_norm": 1.4471567250070998, + "learning_rate": 7.83395914425637e-06, + "loss": 1.5809, + "step": 1810 + }, + { + "epoch": 2.301143583227446, + "grad_norm": 1.5467670582026032, + "learning_rate": 7.824092138699307e-06, + "loss": 1.4007, + "step": 1811 + }, + { + "epoch": 2.3024142312579414, + "grad_norm": 1.5768834499096565, + "learning_rate": 7.81422735634048e-06, + "loss": 1.3662, + "step": 1812 + }, + { + "epoch": 2.3036848792884372, + "grad_norm": 1.3139335059190138, + "learning_rate": 7.804364807259071e-06, + "loss": 1.5668, + "step": 1813 + }, + { + "epoch": 2.3049555273189326, + "grad_norm": 1.4085970135760213, + "learning_rate": 7.794504501531978e-06, + "loss": 1.5215, + "step": 1814 + }, + { + "epoch": 2.306226175349428, + "grad_norm": 1.4314080687544268, + "learning_rate": 7.784646449233806e-06, + "loss": 1.6986, + "step": 1815 + }, + { + "epoch": 2.307496823379924, + "grad_norm": 1.5284739850338658, + "learning_rate": 7.774790660436857e-06, + "loss": 1.5482, + "step": 1816 + }, + { + "epoch": 2.308767471410419, + "grad_norm": 1.4524893798000362, + "learning_rate": 7.764937145211126e-06, + "loss": 1.4476, + "step": 1817 + }, + { + "epoch": 2.310038119440915, + "grad_norm": 1.6073140233069667, + "learning_rate": 7.755085913624274e-06, + "loss": 1.5359, + "step": 1818 + }, + { + "epoch": 2.3113087674714103, + "grad_norm": 1.3996353796614822, + "learning_rate": 7.745236975741643e-06, + "loss": 1.554, + "step": 1819 + }, + { + "epoch": 2.312579415501906, + "grad_norm": 1.9511188694547437, + "learning_rate": 7.735390341626223e-06, + "loss": 1.7084, + "step": 1820 + }, + { + "epoch": 2.3138500635324015, + "grad_norm": 1.752974608444825, + "learning_rate": 7.725546021338645e-06, + "loss": 1.4784, + "step": 1821 + }, + { + "epoch": 2.315120711562897, + "grad_norm": 1.574142271538089, + "learning_rate": 7.715704024937188e-06, + "loss": 1.4726, + "step": 1822 + }, + { + "epoch": 2.3163913595933927, + "grad_norm": 1.439019695120883, + "learning_rate": 7.705864362477751e-06, + "loss": 1.4401, + "step": 1823 + }, + { + "epoch": 2.317662007623888, + "grad_norm": 1.644489663644214, + "learning_rate": 7.696027044013842e-06, + "loss": 1.5001, + "step": 1824 + }, + { + "epoch": 2.318932655654384, + "grad_norm": 1.72913661111972, + "learning_rate": 7.686192079596586e-06, + "loss": 1.5118, + "step": 1825 + }, + { + "epoch": 2.3202033036848793, + "grad_norm": 1.5907616296090283, + "learning_rate": 7.676359479274697e-06, + "loss": 1.4179, + "step": 1826 + }, + { + "epoch": 2.321473951715375, + "grad_norm": 1.5720060242626939, + "learning_rate": 7.666529253094469e-06, + "loss": 1.5574, + "step": 1827 + }, + { + "epoch": 2.3227445997458704, + "grad_norm": 1.4179955050324362, + "learning_rate": 7.656701411099777e-06, + "loss": 1.3249, + "step": 1828 + }, + { + "epoch": 2.324015247776366, + "grad_norm": 1.666878305887207, + "learning_rate": 7.646875963332056e-06, + "loss": 1.5944, + "step": 1829 + }, + { + "epoch": 2.3252858958068616, + "grad_norm": 1.3842116235724446, + "learning_rate": 7.637052919830303e-06, + "loss": 1.609, + "step": 1830 + }, + { + "epoch": 2.326556543837357, + "grad_norm": 1.5360748481078, + "learning_rate": 7.627232290631045e-06, + "loss": 1.6189, + "step": 1831 + }, + { + "epoch": 2.3278271918678524, + "grad_norm": 1.400367893827521, + "learning_rate": 7.617414085768352e-06, + "loss": 1.3703, + "step": 1832 + }, + { + "epoch": 2.329097839898348, + "grad_norm": 1.5844149233811378, + "learning_rate": 7.607598315273812e-06, + "loss": 1.6094, + "step": 1833 + }, + { + "epoch": 2.3303684879288435, + "grad_norm": 1.3974980324420434, + "learning_rate": 7.59778498917653e-06, + "loss": 1.5484, + "step": 1834 + }, + { + "epoch": 2.3316391359593394, + "grad_norm": 1.4213588910308288, + "learning_rate": 7.587974117503107e-06, + "loss": 1.4974, + "step": 1835 + }, + { + "epoch": 2.3329097839898347, + "grad_norm": 1.4614803253524362, + "learning_rate": 7.578165710277648e-06, + "loss": 1.5113, + "step": 1836 + }, + { + "epoch": 2.3341804320203305, + "grad_norm": 1.8778425505561283, + "learning_rate": 7.568359777521728e-06, + "loss": 1.4722, + "step": 1837 + }, + { + "epoch": 2.335451080050826, + "grad_norm": 1.5484667982235163, + "learning_rate": 7.558556329254397e-06, + "loss": 1.3134, + "step": 1838 + }, + { + "epoch": 2.3367217280813213, + "grad_norm": 1.566989203339903, + "learning_rate": 7.548755375492173e-06, + "loss": 1.5581, + "step": 1839 + }, + { + "epoch": 2.337992376111817, + "grad_norm": 1.4639957631528235, + "learning_rate": 7.538956926249013e-06, + "loss": 1.2382, + "step": 1840 + }, + { + "epoch": 2.3392630241423125, + "grad_norm": 1.4873332167896953, + "learning_rate": 7.5291609915363255e-06, + "loss": 1.5546, + "step": 1841 + }, + { + "epoch": 2.3405336721728083, + "grad_norm": 1.5861369473390936, + "learning_rate": 7.519367581362949e-06, + "loss": 1.7043, + "step": 1842 + }, + { + "epoch": 2.3418043202033036, + "grad_norm": 1.5022068690318784, + "learning_rate": 7.509576705735136e-06, + "loss": 1.7078, + "step": 1843 + }, + { + "epoch": 2.3430749682337995, + "grad_norm": 1.3789401080202535, + "learning_rate": 7.499788374656556e-06, + "loss": 1.693, + "step": 1844 + }, + { + "epoch": 2.344345616264295, + "grad_norm": 1.295675618650908, + "learning_rate": 7.490002598128276e-06, + "loss": 1.5228, + "step": 1845 + }, + { + "epoch": 2.34561626429479, + "grad_norm": 1.584834337879101, + "learning_rate": 7.480219386148751e-06, + "loss": 1.4103, + "step": 1846 + }, + { + "epoch": 2.346886912325286, + "grad_norm": 1.8601162293779845, + "learning_rate": 7.470438748713815e-06, + "loss": 1.5678, + "step": 1847 + }, + { + "epoch": 2.3481575603557814, + "grad_norm": 1.7949723193377425, + "learning_rate": 7.4606606958166836e-06, + "loss": 1.6394, + "step": 1848 + }, + { + "epoch": 2.349428208386277, + "grad_norm": 1.5699342047653413, + "learning_rate": 7.450885237447913e-06, + "loss": 1.5852, + "step": 1849 + }, + { + "epoch": 2.3506988564167726, + "grad_norm": 1.736272312862605, + "learning_rate": 7.441112383595424e-06, + "loss": 1.48, + "step": 1850 + }, + { + "epoch": 2.351969504447268, + "grad_norm": 1.3995849038786334, + "learning_rate": 7.431342144244466e-06, + "loss": 1.6927, + "step": 1851 + }, + { + "epoch": 2.3532401524777637, + "grad_norm": 1.5149788541962488, + "learning_rate": 7.421574529377623e-06, + "loss": 1.5541, + "step": 1852 + }, + { + "epoch": 2.354510800508259, + "grad_norm": 1.6383395325740433, + "learning_rate": 7.411809548974792e-06, + "loss": 1.7801, + "step": 1853 + }, + { + "epoch": 2.355781448538755, + "grad_norm": 1.6062109121903854, + "learning_rate": 7.4020472130131905e-06, + "loss": 1.3986, + "step": 1854 + }, + { + "epoch": 2.3570520965692503, + "grad_norm": 1.5758152979946658, + "learning_rate": 7.392287531467316e-06, + "loss": 1.6625, + "step": 1855 + }, + { + "epoch": 2.3583227445997457, + "grad_norm": 1.4684572170126242, + "learning_rate": 7.3825305143089675e-06, + "loss": 1.4875, + "step": 1856 + }, + { + "epoch": 2.3595933926302415, + "grad_norm": 1.4137996834272322, + "learning_rate": 7.372776171507221e-06, + "loss": 1.6975, + "step": 1857 + }, + { + "epoch": 2.360864040660737, + "grad_norm": 1.377852716087896, + "learning_rate": 7.363024513028407e-06, + "loss": 1.669, + "step": 1858 + }, + { + "epoch": 2.3621346886912327, + "grad_norm": 1.5830834118017767, + "learning_rate": 7.353275548836132e-06, + "loss": 1.3004, + "step": 1859 + }, + { + "epoch": 2.363405336721728, + "grad_norm": 1.4210056302021212, + "learning_rate": 7.343529288891239e-06, + "loss": 1.3457, + "step": 1860 + }, + { + "epoch": 2.364675984752224, + "grad_norm": 1.7089106539251813, + "learning_rate": 7.333785743151806e-06, + "loss": 1.6682, + "step": 1861 + }, + { + "epoch": 2.365946632782719, + "grad_norm": 1.4693395253273343, + "learning_rate": 7.3240449215731435e-06, + "loss": 1.5347, + "step": 1862 + }, + { + "epoch": 2.3672172808132146, + "grad_norm": 1.607552134774339, + "learning_rate": 7.314306834107779e-06, + "loss": 1.437, + "step": 1863 + }, + { + "epoch": 2.3684879288437104, + "grad_norm": 1.5446117794334342, + "learning_rate": 7.3045714907054345e-06, + "loss": 1.5402, + "step": 1864 + }, + { + "epoch": 2.3697585768742058, + "grad_norm": 1.5876854232853284, + "learning_rate": 7.2948389013130486e-06, + "loss": 1.3766, + "step": 1865 + }, + { + "epoch": 2.3710292249047016, + "grad_norm": 1.9756101593829416, + "learning_rate": 7.2851090758747325e-06, + "loss": 1.6314, + "step": 1866 + }, + { + "epoch": 2.372299872935197, + "grad_norm": 1.4918518931463847, + "learning_rate": 7.275382024331773e-06, + "loss": 1.4346, + "step": 1867 + }, + { + "epoch": 2.3735705209656923, + "grad_norm": 1.4528251214023307, + "learning_rate": 7.265657756622628e-06, + "loss": 1.3892, + "step": 1868 + }, + { + "epoch": 2.374841168996188, + "grad_norm": 1.6147228175947876, + "learning_rate": 7.25593628268291e-06, + "loss": 1.5138, + "step": 1869 + }, + { + "epoch": 2.3761118170266835, + "grad_norm": 1.5053062534440564, + "learning_rate": 7.246217612445368e-06, + "loss": 1.6059, + "step": 1870 + }, + { + "epoch": 2.3773824650571793, + "grad_norm": 1.9562779959252934, + "learning_rate": 7.236501755839904e-06, + "loss": 1.2351, + "step": 1871 + }, + { + "epoch": 2.3786531130876747, + "grad_norm": 1.345287123205452, + "learning_rate": 7.226788722793533e-06, + "loss": 1.5513, + "step": 1872 + }, + { + "epoch": 2.37992376111817, + "grad_norm": 1.6129590200155135, + "learning_rate": 7.217078523230388e-06, + "loss": 1.5787, + "step": 1873 + }, + { + "epoch": 2.381194409148666, + "grad_norm": 1.4423268084620797, + "learning_rate": 7.2073711670717e-06, + "loss": 1.3152, + "step": 1874 + }, + { + "epoch": 2.3824650571791612, + "grad_norm": 1.53578538307741, + "learning_rate": 7.1976666642358105e-06, + "loss": 1.6515, + "step": 1875 + }, + { + "epoch": 2.383735705209657, + "grad_norm": 1.6359029671002592, + "learning_rate": 7.187965024638127e-06, + "loss": 1.5311, + "step": 1876 + }, + { + "epoch": 2.3850063532401524, + "grad_norm": 1.4939051880974468, + "learning_rate": 7.178266258191149e-06, + "loss": 1.4407, + "step": 1877 + }, + { + "epoch": 2.3862770012706482, + "grad_norm": 1.5770847918080684, + "learning_rate": 7.168570374804428e-06, + "loss": 1.4534, + "step": 1878 + }, + { + "epoch": 2.3875476493011436, + "grad_norm": 1.5861571880340288, + "learning_rate": 7.158877384384577e-06, + "loss": 1.5207, + "step": 1879 + }, + { + "epoch": 2.388818297331639, + "grad_norm": 1.6339465749340256, + "learning_rate": 7.149187296835247e-06, + "loss": 1.6262, + "step": 1880 + }, + { + "epoch": 2.390088945362135, + "grad_norm": 1.3860018025537892, + "learning_rate": 7.13950012205713e-06, + "loss": 1.4204, + "step": 1881 + }, + { + "epoch": 2.39135959339263, + "grad_norm": 1.5066011151313867, + "learning_rate": 7.129815869947931e-06, + "loss": 1.4559, + "step": 1882 + }, + { + "epoch": 2.392630241423126, + "grad_norm": 1.6645327284475002, + "learning_rate": 7.1201345504023855e-06, + "loss": 1.4492, + "step": 1883 + }, + { + "epoch": 2.3939008894536213, + "grad_norm": 1.5789075326437572, + "learning_rate": 7.110456173312218e-06, + "loss": 1.5254, + "step": 1884 + }, + { + "epoch": 2.3951715374841167, + "grad_norm": 1.6463321606552945, + "learning_rate": 7.100780748566154e-06, + "loss": 1.643, + "step": 1885 + }, + { + "epoch": 2.3964421855146125, + "grad_norm": 1.763048767867577, + "learning_rate": 7.091108286049898e-06, + "loss": 1.444, + "step": 1886 + }, + { + "epoch": 2.397712833545108, + "grad_norm": 1.5902629901340428, + "learning_rate": 7.081438795646129e-06, + "loss": 1.6437, + "step": 1887 + }, + { + "epoch": 2.3989834815756037, + "grad_norm": 1.416496098097569, + "learning_rate": 7.071772287234497e-06, + "loss": 1.3835, + "step": 1888 + }, + { + "epoch": 2.400254129606099, + "grad_norm": 1.7152467379497507, + "learning_rate": 7.062108770691594e-06, + "loss": 1.4119, + "step": 1889 + }, + { + "epoch": 2.4015247776365944, + "grad_norm": 1.4841722169436224, + "learning_rate": 7.052448255890958e-06, + "loss": 1.4004, + "step": 1890 + }, + { + "epoch": 2.4027954256670903, + "grad_norm": 1.451227129551864, + "learning_rate": 7.042790752703068e-06, + "loss": 1.5291, + "step": 1891 + }, + { + "epoch": 2.4040660736975856, + "grad_norm": 1.7429828031780958, + "learning_rate": 7.033136270995313e-06, + "loss": 1.217, + "step": 1892 + }, + { + "epoch": 2.4053367217280814, + "grad_norm": 1.648381923053194, + "learning_rate": 7.023484820632005e-06, + "loss": 1.5879, + "step": 1893 + }, + { + "epoch": 2.406607369758577, + "grad_norm": 1.646583274623715, + "learning_rate": 7.013836411474358e-06, + "loss": 1.2618, + "step": 1894 + }, + { + "epoch": 2.4078780177890726, + "grad_norm": 1.5842634112121, + "learning_rate": 7.004191053380469e-06, + "loss": 1.4015, + "step": 1895 + }, + { + "epoch": 2.409148665819568, + "grad_norm": 1.5471584771822826, + "learning_rate": 6.994548756205332e-06, + "loss": 1.5965, + "step": 1896 + }, + { + "epoch": 2.4104193138500634, + "grad_norm": 1.472937778908629, + "learning_rate": 6.984909529800804e-06, + "loss": 1.5414, + "step": 1897 + }, + { + "epoch": 2.411689961880559, + "grad_norm": 1.725692441507782, + "learning_rate": 6.975273384015604e-06, + "loss": 1.4092, + "step": 1898 + }, + { + "epoch": 2.4129606099110545, + "grad_norm": 1.6991877267409594, + "learning_rate": 6.965640328695307e-06, + "loss": 1.4633, + "step": 1899 + }, + { + "epoch": 2.4142312579415504, + "grad_norm": 1.3888509550747612, + "learning_rate": 6.956010373682334e-06, + "loss": 1.4386, + "step": 1900 + }, + { + "epoch": 2.4155019059720457, + "grad_norm": 1.6374270895416039, + "learning_rate": 6.9463835288159295e-06, + "loss": 1.457, + "step": 1901 + }, + { + "epoch": 2.4167725540025415, + "grad_norm": 1.797814081810981, + "learning_rate": 6.936759803932167e-06, + "loss": 1.3279, + "step": 1902 + }, + { + "epoch": 2.418043202033037, + "grad_norm": 1.5405246741346568, + "learning_rate": 6.927139208863929e-06, + "loss": 1.6427, + "step": 1903 + }, + { + "epoch": 2.4193138500635323, + "grad_norm": 1.6513898886347445, + "learning_rate": 6.917521753440899e-06, + "loss": 1.5625, + "step": 1904 + }, + { + "epoch": 2.420584498094028, + "grad_norm": 1.4660904718823222, + "learning_rate": 6.9079074474895545e-06, + "loss": 1.4887, + "step": 1905 + }, + { + "epoch": 2.4218551461245235, + "grad_norm": 1.617362131355844, + "learning_rate": 6.8982963008331605e-06, + "loss": 1.4095, + "step": 1906 + }, + { + "epoch": 2.423125794155019, + "grad_norm": 1.657786559949924, + "learning_rate": 6.888688323291746e-06, + "loss": 1.474, + "step": 1907 + }, + { + "epoch": 2.4243964421855146, + "grad_norm": 1.541370968546513, + "learning_rate": 6.879083524682102e-06, + "loss": 1.3467, + "step": 1908 + }, + { + "epoch": 2.42566709021601, + "grad_norm": 2.2101482935442056, + "learning_rate": 6.869481914817779e-06, + "loss": 1.5522, + "step": 1909 + }, + { + "epoch": 2.426937738246506, + "grad_norm": 1.703555663572547, + "learning_rate": 6.859883503509062e-06, + "loss": 1.2727, + "step": 1910 + }, + { + "epoch": 2.428208386277001, + "grad_norm": 1.603473275026458, + "learning_rate": 6.850288300562966e-06, + "loss": 1.5417, + "step": 1911 + }, + { + "epoch": 2.429479034307497, + "grad_norm": 1.5091116759413492, + "learning_rate": 6.840696315783239e-06, + "loss": 1.4483, + "step": 1912 + }, + { + "epoch": 2.4307496823379924, + "grad_norm": 2.0729938037695423, + "learning_rate": 6.831107558970337e-06, + "loss": 1.9117, + "step": 1913 + }, + { + "epoch": 2.4320203303684877, + "grad_norm": 1.5659133357027375, + "learning_rate": 6.821522039921407e-06, + "loss": 1.4462, + "step": 1914 + }, + { + "epoch": 2.4332909783989836, + "grad_norm": 2.0163592713853715, + "learning_rate": 6.811939768430303e-06, + "loss": 1.4736, + "step": 1915 + }, + { + "epoch": 2.434561626429479, + "grad_norm": 1.6466677451967848, + "learning_rate": 6.802360754287548e-06, + "loss": 1.5789, + "step": 1916 + }, + { + "epoch": 2.4358322744599747, + "grad_norm": 1.793437180849285, + "learning_rate": 6.792785007280347e-06, + "loss": 1.4847, + "step": 1917 + }, + { + "epoch": 2.43710292249047, + "grad_norm": 1.497933365260444, + "learning_rate": 6.7832125371925625e-06, + "loss": 1.2717, + "step": 1918 + }, + { + "epoch": 2.438373570520966, + "grad_norm": 1.3552963469287735, + "learning_rate": 6.773643353804711e-06, + "loss": 1.4794, + "step": 1919 + }, + { + "epoch": 2.4396442185514613, + "grad_norm": 1.5896549816261436, + "learning_rate": 6.764077466893944e-06, + "loss": 1.5418, + "step": 1920 + }, + { + "epoch": 2.4409148665819567, + "grad_norm": 1.4623712262992057, + "learning_rate": 6.754514886234054e-06, + "loss": 1.5112, + "step": 1921 + }, + { + "epoch": 2.4421855146124525, + "grad_norm": 1.3949476808159789, + "learning_rate": 6.7449556215954435e-06, + "loss": 1.5416, + "step": 1922 + }, + { + "epoch": 2.443456162642948, + "grad_norm": 1.8033252619694837, + "learning_rate": 6.735399682745145e-06, + "loss": 1.4908, + "step": 1923 + }, + { + "epoch": 2.444726810673443, + "grad_norm": 1.5860225872596243, + "learning_rate": 6.725847079446779e-06, + "loss": 1.6275, + "step": 1924 + }, + { + "epoch": 2.445997458703939, + "grad_norm": 1.6053507982449213, + "learning_rate": 6.7162978214605615e-06, + "loss": 1.2901, + "step": 1925 + }, + { + "epoch": 2.4472681067344344, + "grad_norm": 1.6751208486108087, + "learning_rate": 6.706751918543288e-06, + "loss": 1.8313, + "step": 1926 + }, + { + "epoch": 2.44853875476493, + "grad_norm": 1.7560303434584272, + "learning_rate": 6.697209380448333e-06, + "loss": 1.6988, + "step": 1927 + }, + { + "epoch": 2.4498094027954256, + "grad_norm": 1.6864532391677458, + "learning_rate": 6.687670216925621e-06, + "loss": 1.6406, + "step": 1928 + }, + { + "epoch": 2.4510800508259214, + "grad_norm": 1.5156868555239327, + "learning_rate": 6.678134437721644e-06, + "loss": 1.3891, + "step": 1929 + }, + { + "epoch": 2.4523506988564168, + "grad_norm": 1.4058841595948535, + "learning_rate": 6.668602052579425e-06, + "loss": 1.6672, + "step": 1930 + }, + { + "epoch": 2.453621346886912, + "grad_norm": 1.49639471800268, + "learning_rate": 6.659073071238524e-06, + "loss": 1.4048, + "step": 1931 + }, + { + "epoch": 2.454891994917408, + "grad_norm": 1.4973993571816406, + "learning_rate": 6.649547503435021e-06, + "loss": 1.6426, + "step": 1932 + }, + { + "epoch": 2.4561626429479033, + "grad_norm": 1.4078171554908248, + "learning_rate": 6.640025358901509e-06, + "loss": 1.511, + "step": 1933 + }, + { + "epoch": 2.457433290978399, + "grad_norm": 1.580293969920001, + "learning_rate": 6.6305066473670765e-06, + "loss": 1.5076, + "step": 1934 + }, + { + "epoch": 2.4587039390088945, + "grad_norm": 1.6589775719575122, + "learning_rate": 6.6209913785573245e-06, + "loss": 1.3575, + "step": 1935 + }, + { + "epoch": 2.4599745870393903, + "grad_norm": 1.6922684808031945, + "learning_rate": 6.611479562194314e-06, + "loss": 1.406, + "step": 1936 + }, + { + "epoch": 2.4612452350698857, + "grad_norm": 1.421608647626518, + "learning_rate": 6.601971207996592e-06, + "loss": 1.6037, + "step": 1937 + }, + { + "epoch": 2.462515883100381, + "grad_norm": 1.6218271395345765, + "learning_rate": 6.592466325679159e-06, + "loss": 1.4237, + "step": 1938 + }, + { + "epoch": 2.463786531130877, + "grad_norm": 1.6665643671210966, + "learning_rate": 6.582964924953477e-06, + "loss": 1.6236, + "step": 1939 + }, + { + "epoch": 2.4650571791613722, + "grad_norm": 1.4575551061156442, + "learning_rate": 6.573467015527439e-06, + "loss": 1.6012, + "step": 1940 + }, + { + "epoch": 2.4663278271918676, + "grad_norm": 1.629582440111225, + "learning_rate": 6.563972607105393e-06, + "loss": 1.4583, + "step": 1941 + }, + { + "epoch": 2.4675984752223634, + "grad_norm": 1.5051220386961484, + "learning_rate": 6.554481709388083e-06, + "loss": 1.7184, + "step": 1942 + }, + { + "epoch": 2.468869123252859, + "grad_norm": 1.6787388884773073, + "learning_rate": 6.544994332072685e-06, + "loss": 1.4597, + "step": 1943 + }, + { + "epoch": 2.4701397712833546, + "grad_norm": 1.559077650508331, + "learning_rate": 6.535510484852767e-06, + "loss": 1.5626, + "step": 1944 + }, + { + "epoch": 2.47141041931385, + "grad_norm": 1.8559133633737679, + "learning_rate": 6.526030177418294e-06, + "loss": 1.783, + "step": 1945 + }, + { + "epoch": 2.472681067344346, + "grad_norm": 1.410279022037204, + "learning_rate": 6.51655341945562e-06, + "loss": 1.3541, + "step": 1946 + }, + { + "epoch": 2.473951715374841, + "grad_norm": 1.5190331881625136, + "learning_rate": 6.507080220647466e-06, + "loss": 1.4551, + "step": 1947 + }, + { + "epoch": 2.4752223634053365, + "grad_norm": 1.5160178031623226, + "learning_rate": 6.497610590672916e-06, + "loss": 1.4626, + "step": 1948 + }, + { + "epoch": 2.4764930114358323, + "grad_norm": 1.4461522772273703, + "learning_rate": 6.488144539207411e-06, + "loss": 1.5452, + "step": 1949 + }, + { + "epoch": 2.4777636594663277, + "grad_norm": 1.7233086915731444, + "learning_rate": 6.478682075922731e-06, + "loss": 1.4749, + "step": 1950 + }, + { + "epoch": 2.4790343074968235, + "grad_norm": 1.316593875218675, + "learning_rate": 6.469223210486992e-06, + "loss": 1.7607, + "step": 1951 + }, + { + "epoch": 2.480304955527319, + "grad_norm": 1.7001060873689287, + "learning_rate": 6.459767952564642e-06, + "loss": 1.6113, + "step": 1952 + }, + { + "epoch": 2.4815756035578147, + "grad_norm": 1.4831999469527257, + "learning_rate": 6.450316311816432e-06, + "loss": 1.462, + "step": 1953 + }, + { + "epoch": 2.48284625158831, + "grad_norm": 1.6656890074314092, + "learning_rate": 6.4408682978994195e-06, + "loss": 1.3112, + "step": 1954 + }, + { + "epoch": 2.4841168996188054, + "grad_norm": 1.5589781912381324, + "learning_rate": 6.431423920466963e-06, + "loss": 1.369, + "step": 1955 + }, + { + "epoch": 2.4853875476493013, + "grad_norm": 1.7347879717445733, + "learning_rate": 6.421983189168695e-06, + "loss": 1.544, + "step": 1956 + }, + { + "epoch": 2.4866581956797966, + "grad_norm": 1.7130648976384044, + "learning_rate": 6.412546113650526e-06, + "loss": 1.528, + "step": 1957 + }, + { + "epoch": 2.4879288437102924, + "grad_norm": 1.4912029502003834, + "learning_rate": 6.403112703554643e-06, + "loss": 1.4823, + "step": 1958 + }, + { + "epoch": 2.489199491740788, + "grad_norm": 1.5878108147444938, + "learning_rate": 6.393682968519474e-06, + "loss": 1.3936, + "step": 1959 + }, + { + "epoch": 2.490470139771283, + "grad_norm": 1.862810522369295, + "learning_rate": 6.384256918179692e-06, + "loss": 1.4847, + "step": 1960 + }, + { + "epoch": 2.491740787801779, + "grad_norm": 1.5408358555456292, + "learning_rate": 6.374834562166217e-06, + "loss": 1.6876, + "step": 1961 + }, + { + "epoch": 2.4930114358322744, + "grad_norm": 1.5269426517183589, + "learning_rate": 6.365415910106181e-06, + "loss": 1.6543, + "step": 1962 + }, + { + "epoch": 2.49428208386277, + "grad_norm": 1.8042970266257117, + "learning_rate": 6.356000971622938e-06, + "loss": 1.3149, + "step": 1963 + }, + { + "epoch": 2.4955527318932655, + "grad_norm": 1.5245698459517367, + "learning_rate": 6.34658975633605e-06, + "loss": 1.6631, + "step": 1964 + }, + { + "epoch": 2.496823379923761, + "grad_norm": 1.8477085437262835, + "learning_rate": 6.337182273861273e-06, + "loss": 1.6602, + "step": 1965 + }, + { + "epoch": 2.4980940279542567, + "grad_norm": 1.4583773739409853, + "learning_rate": 6.327778533810545e-06, + "loss": 1.7056, + "step": 1966 + }, + { + "epoch": 2.499364675984752, + "grad_norm": 1.7380518267388891, + "learning_rate": 6.318378545791988e-06, + "loss": 1.6442, + "step": 1967 + }, + { + "epoch": 2.500635324015248, + "grad_norm": 1.5569361056404658, + "learning_rate": 6.308982319409878e-06, + "loss": 1.3564, + "step": 1968 + }, + { + "epoch": 2.5019059720457433, + "grad_norm": 1.4894114236528235, + "learning_rate": 6.299589864264662e-06, + "loss": 1.6204, + "step": 1969 + }, + { + "epoch": 2.503176620076239, + "grad_norm": 1.5744140289250657, + "learning_rate": 6.290201189952925e-06, + "loss": 1.5788, + "step": 1970 + }, + { + "epoch": 2.5044472681067345, + "grad_norm": 1.4729487530295562, + "learning_rate": 6.280816306067393e-06, + "loss": 1.577, + "step": 1971 + }, + { + "epoch": 2.50571791613723, + "grad_norm": 1.6059178983559093, + "learning_rate": 6.2714352221969155e-06, + "loss": 1.6455, + "step": 1972 + }, + { + "epoch": 2.5069885641677256, + "grad_norm": 1.4392169542776443, + "learning_rate": 6.262057947926463e-06, + "loss": 1.318, + "step": 1973 + }, + { + "epoch": 2.508259212198221, + "grad_norm": 1.6892962025361826, + "learning_rate": 6.252684492837107e-06, + "loss": 1.411, + "step": 1974 + }, + { + "epoch": 2.5095298602287164, + "grad_norm": 1.7367334870989624, + "learning_rate": 6.2433148665060305e-06, + "loss": 1.4848, + "step": 1975 + }, + { + "epoch": 2.510800508259212, + "grad_norm": 1.6643612132262988, + "learning_rate": 6.233949078506489e-06, + "loss": 1.6053, + "step": 1976 + }, + { + "epoch": 2.512071156289708, + "grad_norm": 1.4933363345894404, + "learning_rate": 6.22458713840783e-06, + "loss": 1.7322, + "step": 1977 + }, + { + "epoch": 2.5133418043202034, + "grad_norm": 1.622785162327148, + "learning_rate": 6.215229055775454e-06, + "loss": 1.4641, + "step": 1978 + }, + { + "epoch": 2.5146124523506987, + "grad_norm": 1.5606802690693409, + "learning_rate": 6.205874840170833e-06, + "loss": 1.6456, + "step": 1979 + }, + { + "epoch": 2.5158831003811946, + "grad_norm": 1.7392960071598602, + "learning_rate": 6.196524501151479e-06, + "loss": 1.5845, + "step": 1980 + }, + { + "epoch": 2.51715374841169, + "grad_norm": 1.5907237596599346, + "learning_rate": 6.187178048270956e-06, + "loss": 1.2511, + "step": 1981 + }, + { + "epoch": 2.5184243964421853, + "grad_norm": 1.515123504297544, + "learning_rate": 6.1778354910788465e-06, + "loss": 1.593, + "step": 1982 + }, + { + "epoch": 2.519695044472681, + "grad_norm": 1.6709740601289755, + "learning_rate": 6.168496839120754e-06, + "loss": 1.455, + "step": 1983 + }, + { + "epoch": 2.5209656925031765, + "grad_norm": 1.6767746763503226, + "learning_rate": 6.159162101938292e-06, + "loss": 1.5791, + "step": 1984 + }, + { + "epoch": 2.5222363405336723, + "grad_norm": 1.3286317631433775, + "learning_rate": 6.149831289069079e-06, + "loss": 1.5604, + "step": 1985 + }, + { + "epoch": 2.5235069885641677, + "grad_norm": 1.3069539059345867, + "learning_rate": 6.140504410046712e-06, + "loss": 1.5462, + "step": 1986 + }, + { + "epoch": 2.5247776365946635, + "grad_norm": 1.319442531061846, + "learning_rate": 6.131181474400789e-06, + "loss": 1.3234, + "step": 1987 + }, + { + "epoch": 2.526048284625159, + "grad_norm": 1.6566533985853409, + "learning_rate": 6.121862491656858e-06, + "loss": 1.5241, + "step": 1988 + }, + { + "epoch": 2.527318932655654, + "grad_norm": 1.7007255613732077, + "learning_rate": 6.112547471336443e-06, + "loss": 1.6529, + "step": 1989 + }, + { + "epoch": 2.52858958068615, + "grad_norm": 1.6346339798837812, + "learning_rate": 6.103236422957009e-06, + "loss": 1.6411, + "step": 1990 + }, + { + "epoch": 2.5298602287166454, + "grad_norm": 1.3974924068573895, + "learning_rate": 6.09392935603197e-06, + "loss": 1.4445, + "step": 1991 + }, + { + "epoch": 2.5311308767471408, + "grad_norm": 1.675829876273148, + "learning_rate": 6.084626280070663e-06, + "loss": 1.4593, + "step": 1992 + }, + { + "epoch": 2.5324015247776366, + "grad_norm": 1.5825081593692385, + "learning_rate": 6.075327204578363e-06, + "loss": 1.5878, + "step": 1993 + }, + { + "epoch": 2.5336721728081324, + "grad_norm": 1.3884029886128855, + "learning_rate": 6.066032139056244e-06, + "loss": 1.426, + "step": 1994 + }, + { + "epoch": 2.5349428208386278, + "grad_norm": 1.6846001017280459, + "learning_rate": 6.056741093001387e-06, + "loss": 1.5619, + "step": 1995 + }, + { + "epoch": 2.536213468869123, + "grad_norm": 1.594688745927412, + "learning_rate": 6.0474540759067645e-06, + "loss": 1.5126, + "step": 1996 + }, + { + "epoch": 2.537484116899619, + "grad_norm": 1.568543889486274, + "learning_rate": 6.038171097261234e-06, + "loss": 1.6029, + "step": 1997 + }, + { + "epoch": 2.5387547649301143, + "grad_norm": 1.8935050808469194, + "learning_rate": 6.02889216654953e-06, + "loss": 1.5576, + "step": 1998 + }, + { + "epoch": 2.5400254129606097, + "grad_norm": 1.674665638907306, + "learning_rate": 6.019617293252249e-06, + "loss": 1.4246, + "step": 1999 + }, + { + "epoch": 2.5412960609911055, + "grad_norm": 1.3876686424816804, + "learning_rate": 6.010346486845837e-06, + "loss": 1.5987, + "step": 2000 + }, + { + "epoch": 2.542566709021601, + "grad_norm": 1.612133543103817, + "learning_rate": 6.001079756802592e-06, + "loss": 1.4918, + "step": 2001 + }, + { + "epoch": 2.5438373570520967, + "grad_norm": 1.6228281232000563, + "learning_rate": 5.991817112590641e-06, + "loss": 1.4632, + "step": 2002 + }, + { + "epoch": 2.545108005082592, + "grad_norm": 1.4937812398891073, + "learning_rate": 5.982558563673938e-06, + "loss": 1.5252, + "step": 2003 + }, + { + "epoch": 2.546378653113088, + "grad_norm": 1.534737369108716, + "learning_rate": 5.973304119512258e-06, + "loss": 1.3607, + "step": 2004 + }, + { + "epoch": 2.5476493011435832, + "grad_norm": 1.7211354755831836, + "learning_rate": 5.964053789561177e-06, + "loss": 1.4835, + "step": 2005 + }, + { + "epoch": 2.5489199491740786, + "grad_norm": 1.7018457215922282, + "learning_rate": 5.9548075832720655e-06, + "loss": 1.2578, + "step": 2006 + }, + { + "epoch": 2.5501905972045744, + "grad_norm": 1.575855457005483, + "learning_rate": 5.945565510092086e-06, + "loss": 1.4262, + "step": 2007 + }, + { + "epoch": 2.55146124523507, + "grad_norm": 1.5276479971645556, + "learning_rate": 5.936327579464174e-06, + "loss": 1.5924, + "step": 2008 + }, + { + "epoch": 2.5527318932655656, + "grad_norm": 1.5564125880265227, + "learning_rate": 5.927093800827032e-06, + "loss": 1.6219, + "step": 2009 + }, + { + "epoch": 2.554002541296061, + "grad_norm": 1.498257762944526, + "learning_rate": 5.917864183615125e-06, + "loss": 1.4304, + "step": 2010 + }, + { + "epoch": 2.555273189326557, + "grad_norm": 1.51373375392949, + "learning_rate": 5.908638737258666e-06, + "loss": 1.5876, + "step": 2011 + }, + { + "epoch": 2.556543837357052, + "grad_norm": 1.6551224771522475, + "learning_rate": 5.8994174711836e-06, + "loss": 1.307, + "step": 2012 + }, + { + "epoch": 2.5578144853875475, + "grad_norm": 1.8144453783675563, + "learning_rate": 5.890200394811605e-06, + "loss": 1.5938, + "step": 2013 + }, + { + "epoch": 2.5590851334180433, + "grad_norm": 1.6966638492890553, + "learning_rate": 5.880987517560075e-06, + "loss": 1.4143, + "step": 2014 + }, + { + "epoch": 2.5603557814485387, + "grad_norm": 1.6047126629279407, + "learning_rate": 5.87177884884212e-06, + "loss": 1.5542, + "step": 2015 + }, + { + "epoch": 2.561626429479034, + "grad_norm": 1.502741448048564, + "learning_rate": 5.862574398066547e-06, + "loss": 1.5601, + "step": 2016 + }, + { + "epoch": 2.56289707750953, + "grad_norm": 1.6164079656020516, + "learning_rate": 5.853374174637855e-06, + "loss": 1.6705, + "step": 2017 + }, + { + "epoch": 2.5641677255400253, + "grad_norm": 1.571527752327933, + "learning_rate": 5.844178187956215e-06, + "loss": 1.4099, + "step": 2018 + }, + { + "epoch": 2.565438373570521, + "grad_norm": 1.826938166341695, + "learning_rate": 5.834986447417481e-06, + "loss": 1.6639, + "step": 2019 + }, + { + "epoch": 2.5667090216010164, + "grad_norm": 1.5305172337767854, + "learning_rate": 5.825798962413164e-06, + "loss": 1.3573, + "step": 2020 + }, + { + "epoch": 2.5679796696315123, + "grad_norm": 1.4847243067385838, + "learning_rate": 5.81661574233042e-06, + "loss": 1.6699, + "step": 2021 + }, + { + "epoch": 2.5692503176620076, + "grad_norm": 1.5910837917800331, + "learning_rate": 5.807436796552062e-06, + "loss": 1.4542, + "step": 2022 + }, + { + "epoch": 2.570520965692503, + "grad_norm": 1.5691937901257724, + "learning_rate": 5.79826213445652e-06, + "loss": 1.5276, + "step": 2023 + }, + { + "epoch": 2.571791613722999, + "grad_norm": 1.7161084283422166, + "learning_rate": 5.789091765417862e-06, + "loss": 1.5906, + "step": 2024 + }, + { + "epoch": 2.573062261753494, + "grad_norm": 1.403230062689204, + "learning_rate": 5.77992569880576e-06, + "loss": 1.4501, + "step": 2025 + }, + { + "epoch": 2.57433290978399, + "grad_norm": 1.4964283684395638, + "learning_rate": 5.7707639439854865e-06, + "loss": 1.5517, + "step": 2026 + }, + { + "epoch": 2.5756035578144854, + "grad_norm": 1.3622802648369978, + "learning_rate": 5.761606510317921e-06, + "loss": 1.307, + "step": 2027 + }, + { + "epoch": 2.576874205844981, + "grad_norm": 1.326365025189336, + "learning_rate": 5.752453407159521e-06, + "loss": 1.7017, + "step": 2028 + }, + { + "epoch": 2.5781448538754765, + "grad_norm": 1.5433055701471474, + "learning_rate": 5.743304643862322e-06, + "loss": 1.4591, + "step": 2029 + }, + { + "epoch": 2.579415501905972, + "grad_norm": 2.016978937351612, + "learning_rate": 5.7341602297739185e-06, + "loss": 1.6965, + "step": 2030 + }, + { + "epoch": 2.5806861499364677, + "grad_norm": 1.4870993206979286, + "learning_rate": 5.725020174237463e-06, + "loss": 1.4431, + "step": 2031 + }, + { + "epoch": 2.581956797966963, + "grad_norm": 1.622926065692516, + "learning_rate": 5.715884486591663e-06, + "loss": 1.5469, + "step": 2032 + }, + { + "epoch": 2.5832274459974585, + "grad_norm": 1.4456629337597853, + "learning_rate": 5.706753176170761e-06, + "loss": 1.3962, + "step": 2033 + }, + { + "epoch": 2.5844980940279543, + "grad_norm": 1.6027971002444077, + "learning_rate": 5.697626252304518e-06, + "loss": 1.3743, + "step": 2034 + }, + { + "epoch": 2.5857687420584496, + "grad_norm": 1.5331799529340098, + "learning_rate": 5.688503724318217e-06, + "loss": 1.3231, + "step": 2035 + }, + { + "epoch": 2.5870393900889455, + "grad_norm": 1.490824381640179, + "learning_rate": 5.67938560153266e-06, + "loss": 1.5394, + "step": 2036 + }, + { + "epoch": 2.588310038119441, + "grad_norm": 1.4492078377518673, + "learning_rate": 5.670271893264135e-06, + "loss": 1.3416, + "step": 2037 + }, + { + "epoch": 2.5895806861499366, + "grad_norm": 1.579716437800065, + "learning_rate": 5.66116260882442e-06, + "loss": 1.6252, + "step": 2038 + }, + { + "epoch": 2.590851334180432, + "grad_norm": 1.5764787878452378, + "learning_rate": 5.652057757520782e-06, + "loss": 1.5564, + "step": 2039 + }, + { + "epoch": 2.5921219822109274, + "grad_norm": 1.6354725147911908, + "learning_rate": 5.642957348655957e-06, + "loss": 1.3986, + "step": 2040 + }, + { + "epoch": 2.593392630241423, + "grad_norm": 1.4289914223659326, + "learning_rate": 5.633861391528135e-06, + "loss": 1.2618, + "step": 2041 + }, + { + "epoch": 2.5946632782719186, + "grad_norm": 1.523265210937983, + "learning_rate": 5.6247698954309616e-06, + "loss": 1.5941, + "step": 2042 + }, + { + "epoch": 2.5959339263024144, + "grad_norm": 1.4714236027113614, + "learning_rate": 5.615682869653518e-06, + "loss": 1.5671, + "step": 2043 + }, + { + "epoch": 2.5972045743329097, + "grad_norm": 1.4876259379201178, + "learning_rate": 5.606600323480332e-06, + "loss": 1.4436, + "step": 2044 + }, + { + "epoch": 2.5984752223634056, + "grad_norm": 1.4783074748512717, + "learning_rate": 5.597522266191348e-06, + "loss": 1.4667, + "step": 2045 + }, + { + "epoch": 2.599745870393901, + "grad_norm": 1.90561408566863, + "learning_rate": 5.5884487070619184e-06, + "loss": 1.6617, + "step": 2046 + }, + { + "epoch": 2.6010165184243963, + "grad_norm": 1.5778631845722175, + "learning_rate": 5.579379655362801e-06, + "loss": 1.5352, + "step": 2047 + }, + { + "epoch": 2.602287166454892, + "grad_norm": 1.4646496408634837, + "learning_rate": 5.570315120360157e-06, + "loss": 1.6054, + "step": 2048 + }, + { + "epoch": 2.6035578144853875, + "grad_norm": 1.9450738926737605, + "learning_rate": 5.561255111315525e-06, + "loss": 1.819, + "step": 2049 + }, + { + "epoch": 2.604828462515883, + "grad_norm": 1.5400473317263956, + "learning_rate": 5.5521996374858134e-06, + "loss": 1.3851, + "step": 2050 + }, + { + "epoch": 2.6060991105463787, + "grad_norm": 1.8228604096633896, + "learning_rate": 5.5431487081233115e-06, + "loss": 1.582, + "step": 2051 + }, + { + "epoch": 2.6073697585768745, + "grad_norm": 1.551585038967548, + "learning_rate": 5.534102332475661e-06, + "loss": 1.2821, + "step": 2052 + }, + { + "epoch": 2.60864040660737, + "grad_norm": 1.9066203734674334, + "learning_rate": 5.525060519785845e-06, + "loss": 1.4285, + "step": 2053 + }, + { + "epoch": 2.609911054637865, + "grad_norm": 1.7219081400274896, + "learning_rate": 5.5160232792921845e-06, + "loss": 1.6345, + "step": 2054 + }, + { + "epoch": 2.611181702668361, + "grad_norm": 1.7065215233402147, + "learning_rate": 5.5069906202283315e-06, + "loss": 1.6086, + "step": 2055 + }, + { + "epoch": 2.6124523506988564, + "grad_norm": 1.7118658292689697, + "learning_rate": 5.497962551823266e-06, + "loss": 1.5247, + "step": 2056 + }, + { + "epoch": 2.6137229987293518, + "grad_norm": 1.5000744419560181, + "learning_rate": 5.488939083301264e-06, + "loss": 1.4179, + "step": 2057 + }, + { + "epoch": 2.6149936467598476, + "grad_norm": 1.5433374554439336, + "learning_rate": 5.479920223881906e-06, + "loss": 1.6244, + "step": 2058 + }, + { + "epoch": 2.616264294790343, + "grad_norm": 1.6079090761483708, + "learning_rate": 5.47090598278006e-06, + "loss": 1.6962, + "step": 2059 + }, + { + "epoch": 2.6175349428208388, + "grad_norm": 1.6881653881113183, + "learning_rate": 5.461896369205888e-06, + "loss": 1.2013, + "step": 2060 + }, + { + "epoch": 2.618805590851334, + "grad_norm": 1.599027772485402, + "learning_rate": 5.452891392364808e-06, + "loss": 1.5009, + "step": 2061 + }, + { + "epoch": 2.62007623888183, + "grad_norm": 1.8886355634647696, + "learning_rate": 5.4438910614575115e-06, + "loss": 1.5144, + "step": 2062 + }, + { + "epoch": 2.6213468869123253, + "grad_norm": 1.5412645355637977, + "learning_rate": 5.434895385679937e-06, + "loss": 1.3677, + "step": 2063 + }, + { + "epoch": 2.6226175349428207, + "grad_norm": 1.593887186726804, + "learning_rate": 5.425904374223272e-06, + "loss": 1.4759, + "step": 2064 + }, + { + "epoch": 2.6238881829733165, + "grad_norm": 1.4494031826945593, + "learning_rate": 5.416918036273935e-06, + "loss": 1.3884, + "step": 2065 + }, + { + "epoch": 2.625158831003812, + "grad_norm": 1.7941454557018781, + "learning_rate": 5.407936381013564e-06, + "loss": 1.6359, + "step": 2066 + }, + { + "epoch": 2.6264294790343072, + "grad_norm": 1.6654720648068262, + "learning_rate": 5.398959417619022e-06, + "loss": 1.6703, + "step": 2067 + }, + { + "epoch": 2.627700127064803, + "grad_norm": 1.5915824639539826, + "learning_rate": 5.38998715526238e-06, + "loss": 1.5062, + "step": 2068 + }, + { + "epoch": 2.628970775095299, + "grad_norm": 1.4993321994594955, + "learning_rate": 5.381019603110893e-06, + "loss": 1.364, + "step": 2069 + }, + { + "epoch": 2.6302414231257942, + "grad_norm": 1.5252693073346493, + "learning_rate": 5.3720567703270135e-06, + "loss": 1.5045, + "step": 2070 + }, + { + "epoch": 2.6315120711562896, + "grad_norm": 1.5144681548120225, + "learning_rate": 5.3630986660683644e-06, + "loss": 1.5413, + "step": 2071 + }, + { + "epoch": 2.6327827191867854, + "grad_norm": 1.475701365613179, + "learning_rate": 5.35414529948775e-06, + "loss": 1.1327, + "step": 2072 + }, + { + "epoch": 2.634053367217281, + "grad_norm": 1.4656305801975826, + "learning_rate": 5.345196679733118e-06, + "loss": 1.5157, + "step": 2073 + }, + { + "epoch": 2.635324015247776, + "grad_norm": 1.9554243405684435, + "learning_rate": 5.336252815947581e-06, + "loss": 1.6661, + "step": 2074 + }, + { + "epoch": 2.636594663278272, + "grad_norm": 1.4324007946095954, + "learning_rate": 5.32731371726938e-06, + "loss": 1.3692, + "step": 2075 + }, + { + "epoch": 2.6378653113087673, + "grad_norm": 1.395704553067545, + "learning_rate": 5.3183793928318986e-06, + "loss": 1.465, + "step": 2076 + }, + { + "epoch": 2.639135959339263, + "grad_norm": 1.4527379036498123, + "learning_rate": 5.3094498517636324e-06, + "loss": 1.5268, + "step": 2077 + }, + { + "epoch": 2.6404066073697585, + "grad_norm": 1.401333052050349, + "learning_rate": 5.3005251031881925e-06, + "loss": 1.5663, + "step": 2078 + }, + { + "epoch": 2.6416772554002543, + "grad_norm": 1.6150571455447142, + "learning_rate": 5.291605156224295e-06, + "loss": 1.5084, + "step": 2079 + }, + { + "epoch": 2.6429479034307497, + "grad_norm": 1.4759980850740504, + "learning_rate": 5.282690019985756e-06, + "loss": 1.3241, + "step": 2080 + }, + { + "epoch": 2.644218551461245, + "grad_norm": 1.8072175837094544, + "learning_rate": 5.273779703581468e-06, + "loss": 1.4983, + "step": 2081 + }, + { + "epoch": 2.645489199491741, + "grad_norm": 1.645313057282961, + "learning_rate": 5.264874216115391e-06, + "loss": 1.3519, + "step": 2082 + }, + { + "epoch": 2.6467598475222363, + "grad_norm": 4.221302465317087, + "learning_rate": 5.255973566686574e-06, + "loss": 1.5022, + "step": 2083 + }, + { + "epoch": 2.6480304955527316, + "grad_norm": 1.4510208188771505, + "learning_rate": 5.247077764389099e-06, + "loss": 1.4536, + "step": 2084 + }, + { + "epoch": 2.6493011435832274, + "grad_norm": 1.5493648471181438, + "learning_rate": 5.238186818312117e-06, + "loss": 1.4785, + "step": 2085 + }, + { + "epoch": 2.6505717916137232, + "grad_norm": 1.4302419242555682, + "learning_rate": 5.229300737539801e-06, + "loss": 1.5484, + "step": 2086 + }, + { + "epoch": 2.6518424396442186, + "grad_norm": 1.849299532453937, + "learning_rate": 5.220419531151355e-06, + "loss": 1.6416, + "step": 2087 + }, + { + "epoch": 2.653113087674714, + "grad_norm": 1.7434161293677466, + "learning_rate": 5.211543208221013e-06, + "loss": 1.6274, + "step": 2088 + }, + { + "epoch": 2.65438373570521, + "grad_norm": 2.0005460426933634, + "learning_rate": 5.20267177781801e-06, + "loss": 1.6919, + "step": 2089 + }, + { + "epoch": 2.655654383735705, + "grad_norm": 1.5018561095353853, + "learning_rate": 5.193805249006581e-06, + "loss": 1.5313, + "step": 2090 + }, + { + "epoch": 2.6569250317662005, + "grad_norm": 1.491362650772649, + "learning_rate": 5.18494363084596e-06, + "loss": 1.7367, + "step": 2091 + }, + { + "epoch": 2.6581956797966964, + "grad_norm": 1.5255178081426906, + "learning_rate": 5.176086932390365e-06, + "loss": 1.4245, + "step": 2092 + }, + { + "epoch": 2.6594663278271917, + "grad_norm": 1.9086179984912208, + "learning_rate": 5.167235162688977e-06, + "loss": 1.3983, + "step": 2093 + }, + { + "epoch": 2.6607369758576875, + "grad_norm": 1.4844669866832318, + "learning_rate": 5.158388330785944e-06, + "loss": 1.2229, + "step": 2094 + }, + { + "epoch": 2.662007623888183, + "grad_norm": 1.8342374109409971, + "learning_rate": 5.149546445720381e-06, + "loss": 1.6674, + "step": 2095 + }, + { + "epoch": 2.6632782719186787, + "grad_norm": 1.442105359076775, + "learning_rate": 5.140709516526328e-06, + "loss": 1.7274, + "step": 2096 + }, + { + "epoch": 2.664548919949174, + "grad_norm": 1.3759796092109167, + "learning_rate": 5.131877552232783e-06, + "loss": 1.4564, + "step": 2097 + }, + { + "epoch": 2.6658195679796695, + "grad_norm": 1.4110779592420157, + "learning_rate": 5.1230505618636575e-06, + "loss": 1.4833, + "step": 2098 + }, + { + "epoch": 2.6670902160101653, + "grad_norm": 1.5201835870073264, + "learning_rate": 5.114228554437779e-06, + "loss": 1.3928, + "step": 2099 + }, + { + "epoch": 2.6683608640406606, + "grad_norm": 1.5241126316189535, + "learning_rate": 5.105411538968898e-06, + "loss": 1.3865, + "step": 2100 + }, + { + "epoch": 2.6696315120711565, + "grad_norm": 1.6070488288142075, + "learning_rate": 5.0965995244656504e-06, + "loss": 1.5122, + "step": 2101 + }, + { + "epoch": 2.670902160101652, + "grad_norm": 1.6530350204573605, + "learning_rate": 5.087792519931565e-06, + "loss": 1.4907, + "step": 2102 + }, + { + "epoch": 2.6721728081321476, + "grad_norm": 1.7113670629919329, + "learning_rate": 5.078990534365058e-06, + "loss": 1.5365, + "step": 2103 + }, + { + "epoch": 2.673443456162643, + "grad_norm": 1.5090773096609014, + "learning_rate": 5.070193576759419e-06, + "loss": 1.4808, + "step": 2104 + }, + { + "epoch": 2.6747141041931384, + "grad_norm": 1.5895263186914708, + "learning_rate": 5.061401656102791e-06, + "loss": 1.4958, + "step": 2105 + }, + { + "epoch": 2.675984752223634, + "grad_norm": 1.3676785553851607, + "learning_rate": 5.05261478137817e-06, + "loss": 1.6263, + "step": 2106 + }, + { + "epoch": 2.6772554002541296, + "grad_norm": 1.5053464148984217, + "learning_rate": 5.043832961563411e-06, + "loss": 1.4466, + "step": 2107 + }, + { + "epoch": 2.678526048284625, + "grad_norm": 1.5858334316382405, + "learning_rate": 5.035056205631183e-06, + "loss": 1.3851, + "step": 2108 + }, + { + "epoch": 2.6797966963151207, + "grad_norm": 1.6235052285091025, + "learning_rate": 5.026284522549006e-06, + "loss": 1.4365, + "step": 2109 + }, + { + "epoch": 2.681067344345616, + "grad_norm": 1.5799018059764751, + "learning_rate": 5.017517921279198e-06, + "loss": 1.4248, + "step": 2110 + }, + { + "epoch": 2.682337992376112, + "grad_norm": 1.4343381070499153, + "learning_rate": 5.0087564107788835e-06, + "loss": 1.2388, + "step": 2111 + }, + { + "epoch": 2.6836086404066073, + "grad_norm": 1.718308551073008, + "learning_rate": 5.000000000000003e-06, + "loss": 1.4583, + "step": 2112 + }, + { + "epoch": 2.684879288437103, + "grad_norm": 1.616801102187747, + "learning_rate": 4.991248697889266e-06, + "loss": 1.522, + "step": 2113 + }, + { + "epoch": 2.6861499364675985, + "grad_norm": 1.674093541127283, + "learning_rate": 4.982502513388182e-06, + "loss": 1.4097, + "step": 2114 + }, + { + "epoch": 2.687420584498094, + "grad_norm": 1.5210203143542569, + "learning_rate": 4.973761455433014e-06, + "loss": 1.4347, + "step": 2115 + }, + { + "epoch": 2.6886912325285897, + "grad_norm": 1.390482685222237, + "learning_rate": 4.9650255329548016e-06, + "loss": 1.484, + "step": 2116 + }, + { + "epoch": 2.689961880559085, + "grad_norm": 1.5907098780154691, + "learning_rate": 4.9562947548793275e-06, + "loss": 1.4206, + "step": 2117 + }, + { + "epoch": 2.691232528589581, + "grad_norm": 1.4255923958634236, + "learning_rate": 4.947569130127115e-06, + "loss": 1.5406, + "step": 2118 + }, + { + "epoch": 2.692503176620076, + "grad_norm": 1.5360499494919408, + "learning_rate": 4.938848667613436e-06, + "loss": 1.3503, + "step": 2119 + }, + { + "epoch": 2.693773824650572, + "grad_norm": 1.415753496961855, + "learning_rate": 4.930133376248282e-06, + "loss": 1.4805, + "step": 2120 + }, + { + "epoch": 2.6950444726810674, + "grad_norm": 1.7055062463214483, + "learning_rate": 4.921423264936356e-06, + "loss": 1.4416, + "step": 2121 + }, + { + "epoch": 2.6963151207115628, + "grad_norm": 1.6167589168048309, + "learning_rate": 4.912718342577068e-06, + "loss": 1.7105, + "step": 2122 + }, + { + "epoch": 2.6975857687420586, + "grad_norm": 1.668769249731688, + "learning_rate": 4.904018618064536e-06, + "loss": 1.4102, + "step": 2123 + }, + { + "epoch": 2.698856416772554, + "grad_norm": 1.7919989171042408, + "learning_rate": 4.8953241002875585e-06, + "loss": 1.5343, + "step": 2124 + }, + { + "epoch": 2.7001270648030493, + "grad_norm": 1.907222422302549, + "learning_rate": 4.886634798129612e-06, + "loss": 1.457, + "step": 2125 + }, + { + "epoch": 2.701397712833545, + "grad_norm": 1.3685673054290053, + "learning_rate": 4.8779507204688595e-06, + "loss": 1.6121, + "step": 2126 + }, + { + "epoch": 2.7026683608640405, + "grad_norm": 1.5385795370037307, + "learning_rate": 4.869271876178103e-06, + "loss": 1.5327, + "step": 2127 + }, + { + "epoch": 2.7039390088945363, + "grad_norm": 1.6371369682481829, + "learning_rate": 4.860598274124821e-06, + "loss": 1.4613, + "step": 2128 + }, + { + "epoch": 2.7052096569250317, + "grad_norm": 1.5363684219769342, + "learning_rate": 4.851929923171118e-06, + "loss": 1.6277, + "step": 2129 + }, + { + "epoch": 2.7064803049555275, + "grad_norm": 1.5724072159200964, + "learning_rate": 4.843266832173737e-06, + "loss": 1.5082, + "step": 2130 + }, + { + "epoch": 2.707750952986023, + "grad_norm": 1.5739904757793726, + "learning_rate": 4.834609009984055e-06, + "loss": 1.6271, + "step": 2131 + }, + { + "epoch": 2.7090216010165182, + "grad_norm": 1.6771818386475754, + "learning_rate": 4.825956465448061e-06, + "loss": 1.5462, + "step": 2132 + }, + { + "epoch": 2.710292249047014, + "grad_norm": 1.704072020358441, + "learning_rate": 4.817309207406347e-06, + "loss": 1.179, + "step": 2133 + }, + { + "epoch": 2.7115628970775094, + "grad_norm": 1.513942054124072, + "learning_rate": 4.808667244694105e-06, + "loss": 1.7137, + "step": 2134 + }, + { + "epoch": 2.7128335451080052, + "grad_norm": 1.558758018604813, + "learning_rate": 4.800030586141125e-06, + "loss": 1.3797, + "step": 2135 + }, + { + "epoch": 2.7141041931385006, + "grad_norm": 1.9040432234103206, + "learning_rate": 4.791399240571771e-06, + "loss": 1.4426, + "step": 2136 + }, + { + "epoch": 2.7153748411689964, + "grad_norm": 1.647871591808805, + "learning_rate": 4.782773216804971e-06, + "loss": 1.361, + "step": 2137 + }, + { + "epoch": 2.716645489199492, + "grad_norm": 1.2647652100439177, + "learning_rate": 4.774152523654235e-06, + "loss": 1.5768, + "step": 2138 + }, + { + "epoch": 2.717916137229987, + "grad_norm": 1.7706175430015751, + "learning_rate": 4.765537169927604e-06, + "loss": 1.4845, + "step": 2139 + }, + { + "epoch": 2.719186785260483, + "grad_norm": 1.6110442013395836, + "learning_rate": 4.756927164427685e-06, + "loss": 1.7129, + "step": 2140 + }, + { + "epoch": 2.7204574332909783, + "grad_norm": 1.5657124909629976, + "learning_rate": 4.748322515951605e-06, + "loss": 1.371, + "step": 2141 + }, + { + "epoch": 2.7217280813214737, + "grad_norm": 1.6632605767728326, + "learning_rate": 4.739723233291019e-06, + "loss": 1.4385, + "step": 2142 + }, + { + "epoch": 2.7229987293519695, + "grad_norm": 1.4014563049575284, + "learning_rate": 4.731129325232106e-06, + "loss": 1.4947, + "step": 2143 + }, + { + "epoch": 2.7242693773824653, + "grad_norm": 1.3985696072178182, + "learning_rate": 4.722540800555559e-06, + "loss": 1.4192, + "step": 2144 + }, + { + "epoch": 2.7255400254129607, + "grad_norm": 1.3735185623637343, + "learning_rate": 4.713957668036553e-06, + "loss": 1.4882, + "step": 2145 + }, + { + "epoch": 2.726810673443456, + "grad_norm": 1.586671747488094, + "learning_rate": 4.7053799364447625e-06, + "loss": 1.3832, + "step": 2146 + }, + { + "epoch": 2.728081321473952, + "grad_norm": 1.6349004373412965, + "learning_rate": 4.696807614544352e-06, + "loss": 1.8207, + "step": 2147 + }, + { + "epoch": 2.7293519695044473, + "grad_norm": 2.29076717553526, + "learning_rate": 4.688240711093942e-06, + "loss": 1.5078, + "step": 2148 + }, + { + "epoch": 2.7306226175349426, + "grad_norm": 1.8004150981237756, + "learning_rate": 4.679679234846636e-06, + "loss": 1.7272, + "step": 2149 + }, + { + "epoch": 2.7318932655654384, + "grad_norm": 1.5136935349046672, + "learning_rate": 4.671123194549971e-06, + "loss": 1.6561, + "step": 2150 + }, + { + "epoch": 2.733163913595934, + "grad_norm": 1.4483326917445587, + "learning_rate": 4.662572598945951e-06, + "loss": 1.8069, + "step": 2151 + }, + { + "epoch": 2.7344345616264296, + "grad_norm": 1.6464150263965145, + "learning_rate": 4.6540274567710044e-06, + "loss": 1.4983, + "step": 2152 + }, + { + "epoch": 2.735705209656925, + "grad_norm": 1.6737452035371205, + "learning_rate": 4.645487776755988e-06, + "loss": 1.4596, + "step": 2153 + }, + { + "epoch": 2.736975857687421, + "grad_norm": 1.5188060122231457, + "learning_rate": 4.636953567626176e-06, + "loss": 1.4163, + "step": 2154 + }, + { + "epoch": 2.738246505717916, + "grad_norm": 1.724069268162742, + "learning_rate": 4.628424838101263e-06, + "loss": 1.4403, + "step": 2155 + }, + { + "epoch": 2.7395171537484115, + "grad_norm": 1.541246935958105, + "learning_rate": 4.619901596895342e-06, + "loss": 1.4836, + "step": 2156 + }, + { + "epoch": 2.7407878017789074, + "grad_norm": 1.6058072617047359, + "learning_rate": 4.61138385271689e-06, + "loss": 1.5705, + "step": 2157 + }, + { + "epoch": 2.7420584498094027, + "grad_norm": 1.5665710983434642, + "learning_rate": 4.602871614268769e-06, + "loss": 1.3489, + "step": 2158 + }, + { + "epoch": 2.743329097839898, + "grad_norm": 1.4042165480312023, + "learning_rate": 4.594364890248229e-06, + "loss": 1.7615, + "step": 2159 + }, + { + "epoch": 2.744599745870394, + "grad_norm": 1.7081967156520237, + "learning_rate": 4.585863689346865e-06, + "loss": 2.019, + "step": 2160 + }, + { + "epoch": 2.7458703939008897, + "grad_norm": 1.470732998371279, + "learning_rate": 4.57736802025065e-06, + "loss": 1.5371, + "step": 2161 + }, + { + "epoch": 2.747141041931385, + "grad_norm": 1.7047816374648823, + "learning_rate": 4.568877891639887e-06, + "loss": 1.4689, + "step": 2162 + }, + { + "epoch": 2.7484116899618805, + "grad_norm": 3.6959727917658265, + "learning_rate": 4.560393312189233e-06, + "loss": 1.6444, + "step": 2163 + }, + { + "epoch": 2.7496823379923763, + "grad_norm": 1.5603922210765961, + "learning_rate": 4.551914290567665e-06, + "loss": 1.4311, + "step": 2164 + }, + { + "epoch": 2.7509529860228716, + "grad_norm": 1.6051704189943057, + "learning_rate": 4.543440835438483e-06, + "loss": 1.5238, + "step": 2165 + }, + { + "epoch": 2.752223634053367, + "grad_norm": 1.4293857594876154, + "learning_rate": 4.534972955459299e-06, + "loss": 1.5342, + "step": 2166 + }, + { + "epoch": 2.753494282083863, + "grad_norm": 1.4783762134150356, + "learning_rate": 4.5265106592820344e-06, + "loss": 1.4105, + "step": 2167 + }, + { + "epoch": 2.754764930114358, + "grad_norm": 1.446137187771283, + "learning_rate": 4.518053955552903e-06, + "loss": 1.5654, + "step": 2168 + }, + { + "epoch": 2.756035578144854, + "grad_norm": 1.6473357899274805, + "learning_rate": 4.509602852912403e-06, + "loss": 1.5307, + "step": 2169 + }, + { + "epoch": 2.7573062261753494, + "grad_norm": 1.450012021625794, + "learning_rate": 4.5011573599953054e-06, + "loss": 1.5723, + "step": 2170 + }, + { + "epoch": 2.758576874205845, + "grad_norm": 1.3467797974730438, + "learning_rate": 4.492717485430657e-06, + "loss": 1.5967, + "step": 2171 + }, + { + "epoch": 2.7598475222363406, + "grad_norm": 1.5359734040408937, + "learning_rate": 4.484283237841766e-06, + "loss": 1.4576, + "step": 2172 + }, + { + "epoch": 2.761118170266836, + "grad_norm": 1.338077406445025, + "learning_rate": 4.475854625846183e-06, + "loss": 1.6186, + "step": 2173 + }, + { + "epoch": 2.7623888182973317, + "grad_norm": 1.7898288413802088, + "learning_rate": 4.467431658055701e-06, + "loss": 1.2723, + "step": 2174 + }, + { + "epoch": 2.763659466327827, + "grad_norm": 1.7675244465580209, + "learning_rate": 4.459014343076356e-06, + "loss": 1.4509, + "step": 2175 + }, + { + "epoch": 2.7649301143583225, + "grad_norm": 1.6014064081289283, + "learning_rate": 4.450602689508399e-06, + "loss": 1.6356, + "step": 2176 + }, + { + "epoch": 2.7662007623888183, + "grad_norm": 1.4284741772133758, + "learning_rate": 4.442196705946295e-06, + "loss": 1.4993, + "step": 2177 + }, + { + "epoch": 2.767471410419314, + "grad_norm": 1.458309275467031, + "learning_rate": 4.433796400978722e-06, + "loss": 1.6167, + "step": 2178 + }, + { + "epoch": 2.7687420584498095, + "grad_norm": 1.8266623098630126, + "learning_rate": 4.425401783188563e-06, + "loss": 1.5206, + "step": 2179 + }, + { + "epoch": 2.770012706480305, + "grad_norm": 1.4553210631084015, + "learning_rate": 4.417012861152873e-06, + "loss": 1.6961, + "step": 2180 + }, + { + "epoch": 2.7712833545108007, + "grad_norm": 1.3513808318342604, + "learning_rate": 4.408629643442896e-06, + "loss": 1.6516, + "step": 2181 + }, + { + "epoch": 2.772554002541296, + "grad_norm": 1.5580064345986402, + "learning_rate": 4.400252138624047e-06, + "loss": 1.2505, + "step": 2182 + }, + { + "epoch": 2.7738246505717914, + "grad_norm": 1.7947293920430765, + "learning_rate": 4.391880355255905e-06, + "loss": 1.4121, + "step": 2183 + }, + { + "epoch": 2.775095298602287, + "grad_norm": 1.6920046249691607, + "learning_rate": 4.3835143018922075e-06, + "loss": 1.8273, + "step": 2184 + }, + { + "epoch": 2.7763659466327826, + "grad_norm": 1.5498008118708304, + "learning_rate": 4.375153987080829e-06, + "loss": 1.5396, + "step": 2185 + }, + { + "epoch": 2.7776365946632784, + "grad_norm": 2.17633003006867, + "learning_rate": 4.3667994193637794e-06, + "loss": 1.6837, + "step": 2186 + }, + { + "epoch": 2.7789072426937738, + "grad_norm": 1.4609801522718546, + "learning_rate": 4.35845060727721e-06, + "loss": 1.4504, + "step": 2187 + }, + { + "epoch": 2.7801778907242696, + "grad_norm": 1.582605698584194, + "learning_rate": 4.35010755935138e-06, + "loss": 1.5495, + "step": 2188 + }, + { + "epoch": 2.781448538754765, + "grad_norm": 1.5466728476915508, + "learning_rate": 4.341770284110655e-06, + "loss": 1.3189, + "step": 2189 + }, + { + "epoch": 2.7827191867852603, + "grad_norm": 1.4794817703134617, + "learning_rate": 4.333438790073518e-06, + "loss": 1.4851, + "step": 2190 + }, + { + "epoch": 2.783989834815756, + "grad_norm": 1.543011701944719, + "learning_rate": 4.325113085752537e-06, + "loss": 1.6088, + "step": 2191 + }, + { + "epoch": 2.7852604828462515, + "grad_norm": 1.5245810456131066, + "learning_rate": 4.316793179654362e-06, + "loss": 1.1476, + "step": 2192 + }, + { + "epoch": 2.786531130876747, + "grad_norm": 1.7529414814172941, + "learning_rate": 4.308479080279718e-06, + "loss": 1.8261, + "step": 2193 + }, + { + "epoch": 2.7878017789072427, + "grad_norm": 1.4884060379109785, + "learning_rate": 4.3001707961233994e-06, + "loss": 1.5434, + "step": 2194 + }, + { + "epoch": 2.7890724269377385, + "grad_norm": 1.5961743329595908, + "learning_rate": 4.291868335674263e-06, + "loss": 1.3902, + "step": 2195 + }, + { + "epoch": 2.790343074968234, + "grad_norm": 1.4509705784168312, + "learning_rate": 4.283571707415214e-06, + "loss": 1.533, + "step": 2196 + }, + { + "epoch": 2.7916137229987292, + "grad_norm": 1.6271203889491292, + "learning_rate": 4.275280919823193e-06, + "loss": 1.407, + "step": 2197 + }, + { + "epoch": 2.792884371029225, + "grad_norm": 2.4592303081294, + "learning_rate": 4.266995981369174e-06, + "loss": 1.3711, + "step": 2198 + }, + { + "epoch": 2.7941550190597204, + "grad_norm": 1.6611590728869305, + "learning_rate": 4.258716900518164e-06, + "loss": 1.5451, + "step": 2199 + }, + { + "epoch": 2.795425667090216, + "grad_norm": 1.3924565008903842, + "learning_rate": 4.25044368572917e-06, + "loss": 1.6155, + "step": 2200 + }, + { + "epoch": 2.7966963151207116, + "grad_norm": 1.7698735241912924, + "learning_rate": 4.2421763454552225e-06, + "loss": 1.5948, + "step": 2201 + }, + { + "epoch": 2.797966963151207, + "grad_norm": 1.6408950608222295, + "learning_rate": 4.233914888143333e-06, + "loss": 1.5465, + "step": 2202 + }, + { + "epoch": 2.799237611181703, + "grad_norm": 1.373912692213604, + "learning_rate": 4.2256593222345185e-06, + "loss": 1.6543, + "step": 2203 + }, + { + "epoch": 2.800508259212198, + "grad_norm": 1.451843605699671, + "learning_rate": 4.2174096561637644e-06, + "loss": 1.488, + "step": 2204 + }, + { + "epoch": 2.801778907242694, + "grad_norm": 1.6196668958041813, + "learning_rate": 4.2091658983600305e-06, + "loss": 1.3349, + "step": 2205 + }, + { + "epoch": 2.8030495552731893, + "grad_norm": 1.407428500062136, + "learning_rate": 4.20092805724624e-06, + "loss": 1.5528, + "step": 2206 + }, + { + "epoch": 2.8043202033036847, + "grad_norm": 1.6616455879553866, + "learning_rate": 4.192696141239273e-06, + "loss": 1.4103, + "step": 2207 + }, + { + "epoch": 2.8055908513341805, + "grad_norm": 1.476847437744362, + "learning_rate": 4.184470158749961e-06, + "loss": 1.6905, + "step": 2208 + }, + { + "epoch": 2.806861499364676, + "grad_norm": 1.72921436484817, + "learning_rate": 4.176250118183063e-06, + "loss": 1.4946, + "step": 2209 + }, + { + "epoch": 2.8081321473951717, + "grad_norm": 1.571841716038649, + "learning_rate": 4.168036027937267e-06, + "loss": 1.4062, + "step": 2210 + }, + { + "epoch": 2.809402795425667, + "grad_norm": 1.287244228350747, + "learning_rate": 4.159827896405192e-06, + "loss": 1.5741, + "step": 2211 + }, + { + "epoch": 2.810673443456163, + "grad_norm": 1.6389873500505736, + "learning_rate": 4.151625731973354e-06, + "loss": 1.5757, + "step": 2212 + }, + { + "epoch": 2.8119440914866582, + "grad_norm": 1.4866099503879866, + "learning_rate": 4.143429543022191e-06, + "loss": 1.5559, + "step": 2213 + }, + { + "epoch": 2.8132147395171536, + "grad_norm": 1.704282712354898, + "learning_rate": 4.1352393379260125e-06, + "loss": 1.4492, + "step": 2214 + }, + { + "epoch": 2.8144853875476494, + "grad_norm": 1.536418241417774, + "learning_rate": 4.127055125053037e-06, + "loss": 1.4443, + "step": 2215 + }, + { + "epoch": 2.815756035578145, + "grad_norm": 1.5113596778312552, + "learning_rate": 4.118876912765347e-06, + "loss": 1.6588, + "step": 2216 + }, + { + "epoch": 2.81702668360864, + "grad_norm": 1.6039842349514237, + "learning_rate": 4.1107047094188946e-06, + "loss": 1.3748, + "step": 2217 + }, + { + "epoch": 2.818297331639136, + "grad_norm": 1.4682553751308622, + "learning_rate": 4.10253852336349e-06, + "loss": 1.4886, + "step": 2218 + }, + { + "epoch": 2.8195679796696314, + "grad_norm": 1.5395939610698777, + "learning_rate": 4.094378362942812e-06, + "loss": 1.5196, + "step": 2219 + }, + { + "epoch": 2.820838627700127, + "grad_norm": 1.9269379366055792, + "learning_rate": 4.086224236494366e-06, + "loss": 1.4785, + "step": 2220 + }, + { + "epoch": 2.8221092757306225, + "grad_norm": 1.4274328734813035, + "learning_rate": 4.078076152349496e-06, + "loss": 1.4438, + "step": 2221 + }, + { + "epoch": 2.8233799237611183, + "grad_norm": 1.595498803332665, + "learning_rate": 4.06993411883337e-06, + "loss": 1.4873, + "step": 2222 + }, + { + "epoch": 2.8246505717916137, + "grad_norm": 1.3971580437463702, + "learning_rate": 4.061798144264986e-06, + "loss": 1.5435, + "step": 2223 + }, + { + "epoch": 2.825921219822109, + "grad_norm": 1.4532248424265208, + "learning_rate": 4.053668236957135e-06, + "loss": 1.5382, + "step": 2224 + }, + { + "epoch": 2.827191867852605, + "grad_norm": 1.519642472222294, + "learning_rate": 4.045544405216422e-06, + "loss": 1.53, + "step": 2225 + }, + { + "epoch": 2.8284625158831003, + "grad_norm": 1.3947109208414181, + "learning_rate": 4.037426657343233e-06, + "loss": 1.2854, + "step": 2226 + }, + { + "epoch": 2.829733163913596, + "grad_norm": 1.6532271964354837, + "learning_rate": 4.029315001631753e-06, + "loss": 1.6441, + "step": 2227 + }, + { + "epoch": 2.8310038119440915, + "grad_norm": 1.656923087401254, + "learning_rate": 4.021209446369927e-06, + "loss": 1.5589, + "step": 2228 + }, + { + "epoch": 2.8322744599745873, + "grad_norm": 1.3989622503720005, + "learning_rate": 4.013109999839472e-06, + "loss": 1.5257, + "step": 2229 + }, + { + "epoch": 2.8335451080050826, + "grad_norm": 1.5786592082395432, + "learning_rate": 4.005016670315867e-06, + "loss": 1.6313, + "step": 2230 + }, + { + "epoch": 2.834815756035578, + "grad_norm": 1.8123966305128363, + "learning_rate": 3.996929466068344e-06, + "loss": 1.2574, + "step": 2231 + }, + { + "epoch": 2.836086404066074, + "grad_norm": 1.5203938631127307, + "learning_rate": 3.988848395359866e-06, + "loss": 1.6327, + "step": 2232 + }, + { + "epoch": 2.837357052096569, + "grad_norm": 1.6702076939009192, + "learning_rate": 3.980773466447138e-06, + "loss": 1.7363, + "step": 2233 + }, + { + "epoch": 2.8386277001270646, + "grad_norm": 1.4094045845321321, + "learning_rate": 3.97270468758058e-06, + "loss": 1.5077, + "step": 2234 + }, + { + "epoch": 2.8398983481575604, + "grad_norm": 1.3924696277906912, + "learning_rate": 3.964642067004338e-06, + "loss": 1.6309, + "step": 2235 + }, + { + "epoch": 2.841168996188056, + "grad_norm": 1.6232709109768664, + "learning_rate": 3.956585612956268e-06, + "loss": 1.3125, + "step": 2236 + }, + { + "epoch": 2.8424396442185516, + "grad_norm": 1.7944603999037159, + "learning_rate": 3.948535333667916e-06, + "loss": 1.6526, + "step": 2237 + }, + { + "epoch": 2.843710292249047, + "grad_norm": 1.7124321584478142, + "learning_rate": 3.940491237364519e-06, + "loss": 1.7881, + "step": 2238 + }, + { + "epoch": 2.8449809402795427, + "grad_norm": 1.6072517443508763, + "learning_rate": 3.9324533322650075e-06, + "loss": 1.5408, + "step": 2239 + }, + { + "epoch": 2.846251588310038, + "grad_norm": 2.32896329624759, + "learning_rate": 3.9244216265819755e-06, + "loss": 1.5828, + "step": 2240 + }, + { + "epoch": 2.8475222363405335, + "grad_norm": 1.4163103231202079, + "learning_rate": 3.916396128521686e-06, + "loss": 1.4773, + "step": 2241 + }, + { + "epoch": 2.8487928843710293, + "grad_norm": 1.7274268711443028, + "learning_rate": 3.908376846284061e-06, + "loss": 1.5606, + "step": 2242 + }, + { + "epoch": 2.8500635324015247, + "grad_norm": 1.7404585688099852, + "learning_rate": 3.9003637880626765e-06, + "loss": 1.691, + "step": 2243 + }, + { + "epoch": 2.8513341804320205, + "grad_norm": 1.709195129169358, + "learning_rate": 3.8923569620447375e-06, + "loss": 1.5409, + "step": 2244 + }, + { + "epoch": 2.852604828462516, + "grad_norm": 1.6705919031761796, + "learning_rate": 3.884356376411089e-06, + "loss": 1.4196, + "step": 2245 + }, + { + "epoch": 2.8538754764930117, + "grad_norm": 1.5192501448692692, + "learning_rate": 3.876362039336196e-06, + "loss": 1.4834, + "step": 2246 + }, + { + "epoch": 2.855146124523507, + "grad_norm": 1.490906994462505, + "learning_rate": 3.868373958988142e-06, + "loss": 1.1566, + "step": 2247 + }, + { + "epoch": 2.8564167725540024, + "grad_norm": 1.4843564294014155, + "learning_rate": 3.860392143528624e-06, + "loss": 1.4525, + "step": 2248 + }, + { + "epoch": 2.857687420584498, + "grad_norm": 1.5348131899543818, + "learning_rate": 3.852416601112925e-06, + "loss": 1.5744, + "step": 2249 + }, + { + "epoch": 2.8589580686149936, + "grad_norm": 1.43630845243318, + "learning_rate": 3.844447339889924e-06, + "loss": 1.5896, + "step": 2250 + }, + { + "epoch": 2.860228716645489, + "grad_norm": 1.5318404589559234, + "learning_rate": 3.836484368002088e-06, + "loss": 1.4683, + "step": 2251 + }, + { + "epoch": 2.8614993646759848, + "grad_norm": 1.4485601453260954, + "learning_rate": 3.828527693585451e-06, + "loss": 1.2319, + "step": 2252 + }, + { + "epoch": 2.8627700127064806, + "grad_norm": 1.3452807831122484, + "learning_rate": 3.8205773247696105e-06, + "loss": 1.469, + "step": 2253 + }, + { + "epoch": 2.864040660736976, + "grad_norm": 1.7164858031983954, + "learning_rate": 3.81263326967773e-06, + "loss": 1.8427, + "step": 2254 + }, + { + "epoch": 2.8653113087674713, + "grad_norm": 1.306710039223864, + "learning_rate": 3.8046955364265214e-06, + "loss": 1.5889, + "step": 2255 + }, + { + "epoch": 2.866581956797967, + "grad_norm": 1.707073776699419, + "learning_rate": 3.7967641331262295e-06, + "loss": 1.4975, + "step": 2256 + }, + { + "epoch": 2.8678526048284625, + "grad_norm": 1.4091419057103962, + "learning_rate": 3.788839067880635e-06, + "loss": 1.5389, + "step": 2257 + }, + { + "epoch": 2.869123252858958, + "grad_norm": 1.9789535173445154, + "learning_rate": 3.7809203487870395e-06, + "loss": 1.7107, + "step": 2258 + }, + { + "epoch": 2.8703939008894537, + "grad_norm": 1.6677355476402764, + "learning_rate": 3.7730079839362755e-06, + "loss": 1.5299, + "step": 2259 + }, + { + "epoch": 2.871664548919949, + "grad_norm": 1.5611925343525848, + "learning_rate": 3.7651019814126656e-06, + "loss": 1.3673, + "step": 2260 + }, + { + "epoch": 2.872935196950445, + "grad_norm": 1.694686562421888, + "learning_rate": 3.75720234929404e-06, + "loss": 1.3858, + "step": 2261 + }, + { + "epoch": 2.8742058449809402, + "grad_norm": 1.439181040260762, + "learning_rate": 3.7493090956517142e-06, + "loss": 1.4208, + "step": 2262 + }, + { + "epoch": 2.875476493011436, + "grad_norm": 1.3895366097637492, + "learning_rate": 3.7414222285504986e-06, + "loss": 1.5845, + "step": 2263 + }, + { + "epoch": 2.8767471410419314, + "grad_norm": 1.5428002728859518, + "learning_rate": 3.733541756048662e-06, + "loss": 1.3557, + "step": 2264 + }, + { + "epoch": 2.878017789072427, + "grad_norm": 1.650158881869085, + "learning_rate": 3.725667686197956e-06, + "loss": 1.5372, + "step": 2265 + }, + { + "epoch": 2.8792884371029226, + "grad_norm": 1.5303760428628421, + "learning_rate": 3.7178000270435765e-06, + "loss": 1.6347, + "step": 2266 + }, + { + "epoch": 2.880559085133418, + "grad_norm": 1.4439928286384305, + "learning_rate": 3.709938786624181e-06, + "loss": 1.4305, + "step": 2267 + }, + { + "epoch": 2.8818297331639133, + "grad_norm": 1.334703851173668, + "learning_rate": 3.7020839729718606e-06, + "loss": 1.6817, + "step": 2268 + }, + { + "epoch": 2.883100381194409, + "grad_norm": 1.5994007060692943, + "learning_rate": 3.6942355941121424e-06, + "loss": 1.4062, + "step": 2269 + }, + { + "epoch": 2.884371029224905, + "grad_norm": 1.3134585277590238, + "learning_rate": 3.6863936580639714e-06, + "loss": 1.4929, + "step": 2270 + }, + { + "epoch": 2.8856416772554003, + "grad_norm": 1.5620862407921547, + "learning_rate": 3.6785581728397312e-06, + "loss": 1.5845, + "step": 2271 + }, + { + "epoch": 2.8869123252858957, + "grad_norm": 1.8218826820601304, + "learning_rate": 3.6707291464451953e-06, + "loss": 1.6253, + "step": 2272 + }, + { + "epoch": 2.8881829733163915, + "grad_norm": 1.4859377686939863, + "learning_rate": 3.662906586879542e-06, + "loss": 1.417, + "step": 2273 + }, + { + "epoch": 2.889453621346887, + "grad_norm": 1.6305922022358126, + "learning_rate": 3.65509050213534e-06, + "loss": 1.276, + "step": 2274 + }, + { + "epoch": 2.8907242693773822, + "grad_norm": 1.4732452648881487, + "learning_rate": 3.6472809001985534e-06, + "loss": 1.4943, + "step": 2275 + }, + { + "epoch": 2.891994917407878, + "grad_norm": 1.5109274893828362, + "learning_rate": 3.6394777890485077e-06, + "loss": 1.5711, + "step": 2276 + }, + { + "epoch": 2.8932655654383734, + "grad_norm": 1.6857878009893548, + "learning_rate": 3.6316811766579106e-06, + "loss": 1.5752, + "step": 2277 + }, + { + "epoch": 2.8945362134688692, + "grad_norm": 1.6377320186466984, + "learning_rate": 3.6238910709928176e-06, + "loss": 1.5244, + "step": 2278 + }, + { + "epoch": 2.8958068614993646, + "grad_norm": 1.5226900332449655, + "learning_rate": 3.616107480012647e-06, + "loss": 1.4875, + "step": 2279 + }, + { + "epoch": 2.8970775095298604, + "grad_norm": 1.6081651224187385, + "learning_rate": 3.6083304116701535e-06, + "loss": 1.4027, + "step": 2280 + }, + { + "epoch": 2.898348157560356, + "grad_norm": 1.706130168269608, + "learning_rate": 3.6005598739114243e-06, + "loss": 1.5992, + "step": 2281 + }, + { + "epoch": 2.899618805590851, + "grad_norm": 1.5643040540066309, + "learning_rate": 3.592795874675884e-06, + "loss": 1.5868, + "step": 2282 + }, + { + "epoch": 2.900889453621347, + "grad_norm": 1.6816599483507202, + "learning_rate": 3.5850384218962743e-06, + "loss": 1.298, + "step": 2283 + }, + { + "epoch": 2.9021601016518423, + "grad_norm": 1.8470396107981721, + "learning_rate": 3.5772875234986413e-06, + "loss": 1.5678, + "step": 2284 + }, + { + "epoch": 2.9034307496823377, + "grad_norm": 1.6046584287550623, + "learning_rate": 3.569543187402341e-06, + "loss": 1.6325, + "step": 2285 + }, + { + "epoch": 2.9047013977128335, + "grad_norm": 1.523995355325757, + "learning_rate": 3.5618054215200173e-06, + "loss": 1.5939, + "step": 2286 + }, + { + "epoch": 2.9059720457433293, + "grad_norm": 1.5270449092754086, + "learning_rate": 3.5540742337576083e-06, + "loss": 1.4063, + "step": 2287 + }, + { + "epoch": 2.9072426937738247, + "grad_norm": 2.10017297192366, + "learning_rate": 3.546349632014334e-06, + "loss": 1.5012, + "step": 2288 + }, + { + "epoch": 2.90851334180432, + "grad_norm": 1.6155665835478203, + "learning_rate": 3.5386316241826748e-06, + "loss": 1.5012, + "step": 2289 + }, + { + "epoch": 2.909783989834816, + "grad_norm": 1.7787223778412131, + "learning_rate": 3.530920218148376e-06, + "loss": 1.3199, + "step": 2290 + }, + { + "epoch": 2.9110546378653113, + "grad_norm": 1.560549594236504, + "learning_rate": 3.523215421790447e-06, + "loss": 1.2565, + "step": 2291 + }, + { + "epoch": 2.9123252858958066, + "grad_norm": 1.394812239009204, + "learning_rate": 3.5155172429811336e-06, + "loss": 1.4569, + "step": 2292 + }, + { + "epoch": 2.9135959339263025, + "grad_norm": 1.6038138492524496, + "learning_rate": 3.5078256895859207e-06, + "loss": 1.5995, + "step": 2293 + }, + { + "epoch": 2.914866581956798, + "grad_norm": 1.6728720063218776, + "learning_rate": 3.5001407694635326e-06, + "loss": 1.5695, + "step": 2294 + }, + { + "epoch": 2.9161372299872936, + "grad_norm": 2.7465688150554537, + "learning_rate": 3.492462490465911e-06, + "loss": 1.5363, + "step": 2295 + }, + { + "epoch": 2.917407878017789, + "grad_norm": 1.4065911779100326, + "learning_rate": 3.4847908604382095e-06, + "loss": 1.4767, + "step": 2296 + }, + { + "epoch": 2.918678526048285, + "grad_norm": 1.5795356577772306, + "learning_rate": 3.4771258872187917e-06, + "loss": 1.5937, + "step": 2297 + }, + { + "epoch": 2.91994917407878, + "grad_norm": 1.5741018037150643, + "learning_rate": 3.469467578639214e-06, + "loss": 1.3136, + "step": 2298 + }, + { + "epoch": 2.9212198221092756, + "grad_norm": 1.4731933338365752, + "learning_rate": 3.4618159425242304e-06, + "loss": 1.5331, + "step": 2299 + }, + { + "epoch": 2.9224904701397714, + "grad_norm": 1.703185023187937, + "learning_rate": 3.4541709866917793e-06, + "loss": 1.5962, + "step": 2300 + }, + { + "epoch": 2.9237611181702667, + "grad_norm": 1.2521835252137483, + "learning_rate": 3.446532718952966e-06, + "loss": 1.4288, + "step": 2301 + }, + { + "epoch": 2.9250317662007626, + "grad_norm": 1.6440713932270161, + "learning_rate": 3.4389011471120614e-06, + "loss": 1.3655, + "step": 2302 + }, + { + "epoch": 2.926302414231258, + "grad_norm": 1.4889131386748893, + "learning_rate": 3.4312762789665067e-06, + "loss": 1.6922, + "step": 2303 + }, + { + "epoch": 2.9275730622617537, + "grad_norm": 1.6650208766938137, + "learning_rate": 3.423658122306882e-06, + "loss": 1.5555, + "step": 2304 + }, + { + "epoch": 2.928843710292249, + "grad_norm": 1.2522929715604783, + "learning_rate": 3.4160466849169106e-06, + "loss": 1.507, + "step": 2305 + }, + { + "epoch": 2.9301143583227445, + "grad_norm": 1.7803511980376172, + "learning_rate": 3.4084419745734577e-06, + "loss": 1.3705, + "step": 2306 + }, + { + "epoch": 2.9313850063532403, + "grad_norm": 1.7597512395948038, + "learning_rate": 3.400843999046516e-06, + "loss": 1.4553, + "step": 2307 + }, + { + "epoch": 2.9326556543837357, + "grad_norm": 1.5418716965422024, + "learning_rate": 3.3932527660991877e-06, + "loss": 1.4955, + "step": 2308 + }, + { + "epoch": 2.933926302414231, + "grad_norm": 1.8698683202288493, + "learning_rate": 3.3856682834876884e-06, + "loss": 1.4275, + "step": 2309 + }, + { + "epoch": 2.935196950444727, + "grad_norm": 1.4531375655005387, + "learning_rate": 3.3780905589613457e-06, + "loss": 1.4324, + "step": 2310 + }, + { + "epoch": 2.936467598475222, + "grad_norm": 1.8266056817430822, + "learning_rate": 3.370519600262567e-06, + "loss": 1.7186, + "step": 2311 + }, + { + "epoch": 2.937738246505718, + "grad_norm": 1.3272915282363376, + "learning_rate": 3.362955415126865e-06, + "loss": 1.3731, + "step": 2312 + }, + { + "epoch": 2.9390088945362134, + "grad_norm": 1.601240214729954, + "learning_rate": 3.3553980112828177e-06, + "loss": 1.4686, + "step": 2313 + }, + { + "epoch": 2.940279542566709, + "grad_norm": 1.798449523100172, + "learning_rate": 3.3478473964520754e-06, + "loss": 1.6216, + "step": 2314 + }, + { + "epoch": 2.9415501905972046, + "grad_norm": 1.3685831897094658, + "learning_rate": 3.340303578349361e-06, + "loss": 1.3927, + "step": 2315 + }, + { + "epoch": 2.9428208386277, + "grad_norm": 1.5147383181377905, + "learning_rate": 3.3327665646824404e-06, + "loss": 1.5493, + "step": 2316 + }, + { + "epoch": 2.9440914866581958, + "grad_norm": 1.548599616748368, + "learning_rate": 3.325236363152142e-06, + "loss": 1.5739, + "step": 2317 + }, + { + "epoch": 2.945362134688691, + "grad_norm": 1.3556053708238454, + "learning_rate": 3.317712981452319e-06, + "loss": 1.4329, + "step": 2318 + }, + { + "epoch": 2.946632782719187, + "grad_norm": 1.4473213850301048, + "learning_rate": 3.3101964272698693e-06, + "loss": 1.479, + "step": 2319 + }, + { + "epoch": 2.9479034307496823, + "grad_norm": 1.4337258339715722, + "learning_rate": 3.3026867082847058e-06, + "loss": 1.4843, + "step": 2320 + }, + { + "epoch": 2.949174078780178, + "grad_norm": 1.6875137284988417, + "learning_rate": 3.295183832169758e-06, + "loss": 1.5555, + "step": 2321 + }, + { + "epoch": 2.9504447268106735, + "grad_norm": 1.469035977971227, + "learning_rate": 3.2876878065909714e-06, + "loss": 1.5137, + "step": 2322 + }, + { + "epoch": 2.951715374841169, + "grad_norm": 1.4600563300664042, + "learning_rate": 3.2801986392072882e-06, + "loss": 1.3999, + "step": 2323 + }, + { + "epoch": 2.9529860228716647, + "grad_norm": 1.2802500447086944, + "learning_rate": 3.2727163376706408e-06, + "loss": 1.6125, + "step": 2324 + }, + { + "epoch": 2.95425667090216, + "grad_norm": 1.5732563628912453, + "learning_rate": 3.2652409096259473e-06, + "loss": 1.3998, + "step": 2325 + }, + { + "epoch": 2.9555273189326554, + "grad_norm": 2.0170281525799414, + "learning_rate": 3.2577723627111022e-06, + "loss": 1.6881, + "step": 2326 + }, + { + "epoch": 2.9567979669631512, + "grad_norm": 1.4425748850274933, + "learning_rate": 3.250310704556976e-06, + "loss": 1.4648, + "step": 2327 + }, + { + "epoch": 2.9580686149936466, + "grad_norm": 1.4309238275691571, + "learning_rate": 3.2428559427873908e-06, + "loss": 1.4489, + "step": 2328 + }, + { + "epoch": 2.9593392630241424, + "grad_norm": 1.5982818277408608, + "learning_rate": 3.2354080850191328e-06, + "loss": 1.5934, + "step": 2329 + }, + { + "epoch": 2.9606099110546378, + "grad_norm": 1.621257458394931, + "learning_rate": 3.227967138861923e-06, + "loss": 1.4109, + "step": 2330 + }, + { + "epoch": 2.9618805590851336, + "grad_norm": 1.571220115256424, + "learning_rate": 3.2205331119184313e-06, + "loss": 1.3235, + "step": 2331 + }, + { + "epoch": 2.963151207115629, + "grad_norm": 1.5107281267332602, + "learning_rate": 3.2131060117842505e-06, + "loss": 1.8047, + "step": 2332 + }, + { + "epoch": 2.9644218551461243, + "grad_norm": 1.6060147078324802, + "learning_rate": 3.205685846047897e-06, + "loss": 1.1387, + "step": 2333 + }, + { + "epoch": 2.96569250317662, + "grad_norm": 1.662098799358042, + "learning_rate": 3.1982726222908046e-06, + "loss": 1.5366, + "step": 2334 + }, + { + "epoch": 2.9669631512071155, + "grad_norm": 1.5249414874021523, + "learning_rate": 3.1908663480873182e-06, + "loss": 1.4588, + "step": 2335 + }, + { + "epoch": 2.9682337992376113, + "grad_norm": 1.5489987249881296, + "learning_rate": 3.1834670310046735e-06, + "loss": 1.558, + "step": 2336 + }, + { + "epoch": 2.9695044472681067, + "grad_norm": 1.440444947964428, + "learning_rate": 3.1760746786030004e-06, + "loss": 1.4947, + "step": 2337 + }, + { + "epoch": 2.9707750952986025, + "grad_norm": 1.4026839845489567, + "learning_rate": 3.1686892984353124e-06, + "loss": 1.3553, + "step": 2338 + }, + { + "epoch": 2.972045743329098, + "grad_norm": 1.4476234611526049, + "learning_rate": 3.161310898047507e-06, + "loss": 1.4604, + "step": 2339 + }, + { + "epoch": 2.9733163913595932, + "grad_norm": 1.5098981969866436, + "learning_rate": 3.1539394849783367e-06, + "loss": 1.4829, + "step": 2340 + }, + { + "epoch": 2.974587039390089, + "grad_norm": 1.759099977767992, + "learning_rate": 3.1465750667594286e-06, + "loss": 1.3885, + "step": 2341 + }, + { + "epoch": 2.9758576874205844, + "grad_norm": 1.3084296325082836, + "learning_rate": 3.1392176509152507e-06, + "loss": 1.5668, + "step": 2342 + }, + { + "epoch": 2.97712833545108, + "grad_norm": 1.5872405076149434, + "learning_rate": 3.1318672449631283e-06, + "loss": 1.6528, + "step": 2343 + }, + { + "epoch": 2.9783989834815756, + "grad_norm": 1.783486852855159, + "learning_rate": 3.124523856413216e-06, + "loss": 1.563, + "step": 2344 + }, + { + "epoch": 2.9796696315120714, + "grad_norm": 1.6482688878777818, + "learning_rate": 3.1171874927684964e-06, + "loss": 1.6659, + "step": 2345 + }, + { + "epoch": 2.980940279542567, + "grad_norm": 1.555191564256173, + "learning_rate": 3.1098581615247825e-06, + "loss": 1.4207, + "step": 2346 + }, + { + "epoch": 2.982210927573062, + "grad_norm": 1.6353378112947163, + "learning_rate": 3.102535870170702e-06, + "loss": 1.5507, + "step": 2347 + }, + { + "epoch": 2.983481575603558, + "grad_norm": 1.578849275240601, + "learning_rate": 3.0952206261876827e-06, + "loss": 1.4754, + "step": 2348 + }, + { + "epoch": 2.9847522236340533, + "grad_norm": 1.546255150623972, + "learning_rate": 3.0879124370499515e-06, + "loss": 1.5183, + "step": 2349 + }, + { + "epoch": 2.9860228716645487, + "grad_norm": 1.5875157884351954, + "learning_rate": 3.0806113102245395e-06, + "loss": 1.6667, + "step": 2350 + }, + { + "epoch": 2.9872935196950445, + "grad_norm": 1.398826145213781, + "learning_rate": 3.073317253171245e-06, + "loss": 1.4393, + "step": 2351 + }, + { + "epoch": 2.98856416772554, + "grad_norm": 2.692603145819837, + "learning_rate": 3.0660302733426595e-06, + "loss": 1.4171, + "step": 2352 + }, + { + "epoch": 2.9898348157560357, + "grad_norm": 1.5460129343554816, + "learning_rate": 3.0587503781841298e-06, + "loss": 1.5645, + "step": 2353 + }, + { + "epoch": 2.991105463786531, + "grad_norm": 4.535547058502799, + "learning_rate": 3.05147757513377e-06, + "loss": 1.6246, + "step": 2354 + }, + { + "epoch": 2.992376111817027, + "grad_norm": 1.6514865378676962, + "learning_rate": 3.04421187162245e-06, + "loss": 1.5007, + "step": 2355 + }, + { + "epoch": 2.9936467598475223, + "grad_norm": 1.6672665946304461, + "learning_rate": 3.036953275073783e-06, + "loss": 1.688, + "step": 2356 + }, + { + "epoch": 2.9949174078780176, + "grad_norm": 1.5455780021092613, + "learning_rate": 3.029701792904117e-06, + "loss": 1.5103, + "step": 2357 + }, + { + "epoch": 2.9961880559085134, + "grad_norm": 1.5389372324362445, + "learning_rate": 3.0224574325225385e-06, + "loss": 1.6554, + "step": 2358 + }, + { + "epoch": 2.997458703939009, + "grad_norm": 1.8161574897561656, + "learning_rate": 3.0152202013308573e-06, + "loss": 1.5419, + "step": 2359 + }, + { + "epoch": 2.998729351969504, + "grad_norm": 1.5867170148575802, + "learning_rate": 3.007990106723593e-06, + "loss": 1.4492, + "step": 2360 + }, + { + "epoch": 3.0, + "grad_norm": 1.4436629920446682, + "learning_rate": 3.0007671560879724e-06, + "loss": 1.5805, + "step": 2361 + }, + { + "epoch": 3.0012706480304954, + "grad_norm": 2.09892976706466, + "learning_rate": 2.993551356803933e-06, + "loss": 1.3329, + "step": 2362 + }, + { + "epoch": 3.002541296060991, + "grad_norm": 2.573425871573313, + "learning_rate": 2.9863427162440963e-06, + "loss": 1.3915, + "step": 2363 + }, + { + "epoch": 3.0038119440914866, + "grad_norm": 1.9744349108834152, + "learning_rate": 2.979141241773775e-06, + "loss": 1.1761, + "step": 2364 + }, + { + "epoch": 3.0050825921219824, + "grad_norm": 2.8085612456415947, + "learning_rate": 2.971946940750958e-06, + "loss": 1.4812, + "step": 2365 + }, + { + "epoch": 3.0063532401524777, + "grad_norm": 1.8526064065366492, + "learning_rate": 2.9647598205262996e-06, + "loss": 1.3335, + "step": 2366 + }, + { + "epoch": 3.007623888182973, + "grad_norm": 1.7332062309204277, + "learning_rate": 2.9575798884431297e-06, + "loss": 1.4045, + "step": 2367 + }, + { + "epoch": 3.008894536213469, + "grad_norm": 1.918921092336155, + "learning_rate": 2.950407151837421e-06, + "loss": 1.0056, + "step": 2368 + }, + { + "epoch": 3.0101651842439643, + "grad_norm": 2.2556239875071147, + "learning_rate": 2.9432416180377997e-06, + "loss": 1.2765, + "step": 2369 + }, + { + "epoch": 3.01143583227446, + "grad_norm": 2.7301888570855835, + "learning_rate": 2.9360832943655327e-06, + "loss": 1.2056, + "step": 2370 + }, + { + "epoch": 3.0127064803049555, + "grad_norm": 1.9099132389678797, + "learning_rate": 2.9289321881345257e-06, + "loss": 1.2137, + "step": 2371 + }, + { + "epoch": 3.0139771283354513, + "grad_norm": 2.0717116847331103, + "learning_rate": 2.9217883066512996e-06, + "loss": 0.9831, + "step": 2372 + }, + { + "epoch": 3.0152477763659467, + "grad_norm": 2.076544317014107, + "learning_rate": 2.914651657214996e-06, + "loss": 1.2405, + "step": 2373 + }, + { + "epoch": 3.016518424396442, + "grad_norm": 1.8240041683884223, + "learning_rate": 2.9075222471173725e-06, + "loss": 1.3826, + "step": 2374 + }, + { + "epoch": 3.017789072426938, + "grad_norm": 2.133027343181042, + "learning_rate": 2.9004000836427915e-06, + "loss": 1.1397, + "step": 2375 + }, + { + "epoch": 3.019059720457433, + "grad_norm": 1.7455345298398421, + "learning_rate": 2.893285174068201e-06, + "loss": 1.2348, + "step": 2376 + }, + { + "epoch": 3.020330368487929, + "grad_norm": 1.9656471710779257, + "learning_rate": 2.886177525663143e-06, + "loss": 1.1276, + "step": 2377 + }, + { + "epoch": 3.0216010165184244, + "grad_norm": 2.1534610852280327, + "learning_rate": 2.879077145689746e-06, + "loss": 1.3181, + "step": 2378 + }, + { + "epoch": 3.0228716645489198, + "grad_norm": 1.6287196021115788, + "learning_rate": 2.8719840414027047e-06, + "loss": 1.1469, + "step": 2379 + }, + { + "epoch": 3.0241423125794156, + "grad_norm": 1.6221238932125104, + "learning_rate": 2.864898220049277e-06, + "loss": 1.3919, + "step": 2380 + }, + { + "epoch": 3.025412960609911, + "grad_norm": 1.7903952814121018, + "learning_rate": 2.8578196888692932e-06, + "loss": 1.2985, + "step": 2381 + }, + { + "epoch": 3.0266836086404068, + "grad_norm": 1.9414238550600529, + "learning_rate": 2.8507484550951203e-06, + "loss": 1.3897, + "step": 2382 + }, + { + "epoch": 3.027954256670902, + "grad_norm": 1.6706825834310917, + "learning_rate": 2.843684525951681e-06, + "loss": 1.2016, + "step": 2383 + }, + { + "epoch": 3.0292249047013975, + "grad_norm": 1.7792669153317013, + "learning_rate": 2.8366279086564265e-06, + "loss": 1.3073, + "step": 2384 + }, + { + "epoch": 3.0304955527318933, + "grad_norm": 2.0092840867613146, + "learning_rate": 2.829578610419337e-06, + "loss": 1.1512, + "step": 2385 + }, + { + "epoch": 3.0317662007623887, + "grad_norm": 2.0069330063950392, + "learning_rate": 2.8225366384429197e-06, + "loss": 1.3232, + "step": 2386 + }, + { + "epoch": 3.0330368487928845, + "grad_norm": 1.7756567922278317, + "learning_rate": 2.8155019999221988e-06, + "loss": 1.1533, + "step": 2387 + }, + { + "epoch": 3.03430749682338, + "grad_norm": 1.90732193983837, + "learning_rate": 2.8084747020446977e-06, + "loss": 1.4712, + "step": 2388 + }, + { + "epoch": 3.0355781448538757, + "grad_norm": 1.7243416047414686, + "learning_rate": 2.80145475199044e-06, + "loss": 1.0818, + "step": 2389 + }, + { + "epoch": 3.036848792884371, + "grad_norm": 1.6847557181638975, + "learning_rate": 2.794442156931949e-06, + "loss": 1.2525, + "step": 2390 + }, + { + "epoch": 3.0381194409148664, + "grad_norm": 1.6368875045530906, + "learning_rate": 2.787436924034228e-06, + "loss": 1.2932, + "step": 2391 + }, + { + "epoch": 3.0393900889453622, + "grad_norm": 1.5948357268555189, + "learning_rate": 2.780439060454756e-06, + "loss": 1.3573, + "step": 2392 + }, + { + "epoch": 3.0406607369758576, + "grad_norm": 1.8124373478736528, + "learning_rate": 2.77344857334349e-06, + "loss": 1.4497, + "step": 2393 + }, + { + "epoch": 3.0419313850063534, + "grad_norm": 1.6889098700427108, + "learning_rate": 2.7664654698428407e-06, + "loss": 1.4171, + "step": 2394 + }, + { + "epoch": 3.0432020330368488, + "grad_norm": 1.8567968600834586, + "learning_rate": 2.7594897570876866e-06, + "loss": 1.411, + "step": 2395 + }, + { + "epoch": 3.044472681067344, + "grad_norm": 1.6434300105223425, + "learning_rate": 2.7525214422053424e-06, + "loss": 1.1523, + "step": 2396 + }, + { + "epoch": 3.04574332909784, + "grad_norm": 1.7170595548391778, + "learning_rate": 2.7455605323155697e-06, + "loss": 1.2543, + "step": 2397 + }, + { + "epoch": 3.0470139771283353, + "grad_norm": 1.7839808070985894, + "learning_rate": 2.738607034530566e-06, + "loss": 1.306, + "step": 2398 + }, + { + "epoch": 3.048284625158831, + "grad_norm": 1.9548446588583341, + "learning_rate": 2.7316609559549568e-06, + "loss": 1.4932, + "step": 2399 + }, + { + "epoch": 3.0495552731893265, + "grad_norm": 1.7197357275194696, + "learning_rate": 2.724722303685781e-06, + "loss": 1.2957, + "step": 2400 + }, + { + "epoch": 3.0508259212198223, + "grad_norm": 1.8084821385370988, + "learning_rate": 2.71779108481249e-06, + "loss": 1.1916, + "step": 2401 + }, + { + "epoch": 3.0520965692503177, + "grad_norm": 1.8019568439707627, + "learning_rate": 2.71086730641695e-06, + "loss": 1.2896, + "step": 2402 + }, + { + "epoch": 3.053367217280813, + "grad_norm": 1.6232294705360168, + "learning_rate": 2.7039509755734117e-06, + "loss": 1.3035, + "step": 2403 + }, + { + "epoch": 3.054637865311309, + "grad_norm": 1.732272049812764, + "learning_rate": 2.697042099348528e-06, + "loss": 1.3815, + "step": 2404 + }, + { + "epoch": 3.0559085133418042, + "grad_norm": 1.522102653553474, + "learning_rate": 2.6901406848013254e-06, + "loss": 1.2552, + "step": 2405 + }, + { + "epoch": 3.0571791613723, + "grad_norm": 1.9470133300553394, + "learning_rate": 2.683246738983217e-06, + "loss": 1.2267, + "step": 2406 + }, + { + "epoch": 3.0584498094027954, + "grad_norm": 1.4692551227434485, + "learning_rate": 2.6763602689379753e-06, + "loss": 1.139, + "step": 2407 + }, + { + "epoch": 3.059720457433291, + "grad_norm": 1.766405834849575, + "learning_rate": 2.669481281701739e-06, + "loss": 1.2124, + "step": 2408 + }, + { + "epoch": 3.0609911054637866, + "grad_norm": 1.6423190989642726, + "learning_rate": 2.6626097843029986e-06, + "loss": 1.3618, + "step": 2409 + }, + { + "epoch": 3.062261753494282, + "grad_norm": 1.7563432167722974, + "learning_rate": 2.6557457837625956e-06, + "loss": 1.1546, + "step": 2410 + }, + { + "epoch": 3.063532401524778, + "grad_norm": 1.7587679513921985, + "learning_rate": 2.648889287093713e-06, + "loss": 1.3976, + "step": 2411 + }, + { + "epoch": 3.064803049555273, + "grad_norm": 1.663781245316886, + "learning_rate": 2.642040301301861e-06, + "loss": 1.2392, + "step": 2412 + }, + { + "epoch": 3.0660736975857685, + "grad_norm": 1.5989960700089227, + "learning_rate": 2.6351988333848787e-06, + "loss": 1.186, + "step": 2413 + }, + { + "epoch": 3.0673443456162643, + "grad_norm": 1.7586610135204503, + "learning_rate": 2.6283648903329263e-06, + "loss": 1.1437, + "step": 2414 + }, + { + "epoch": 3.0686149936467597, + "grad_norm": 1.9317190502603545, + "learning_rate": 2.621538479128468e-06, + "loss": 1.2426, + "step": 2415 + }, + { + "epoch": 3.0698856416772555, + "grad_norm": 1.9112048957538366, + "learning_rate": 2.6147196067462855e-06, + "loss": 1.2112, + "step": 2416 + }, + { + "epoch": 3.071156289707751, + "grad_norm": 1.9348624538847758, + "learning_rate": 2.6079082801534417e-06, + "loss": 1.0744, + "step": 2417 + }, + { + "epoch": 3.0724269377382467, + "grad_norm": 1.83420768568301, + "learning_rate": 2.6011045063093064e-06, + "loss": 1.3859, + "step": 2418 + }, + { + "epoch": 3.073697585768742, + "grad_norm": 1.5765598823395752, + "learning_rate": 2.5943082921655194e-06, + "loss": 1.4534, + "step": 2419 + }, + { + "epoch": 3.0749682337992374, + "grad_norm": 1.6995207801918275, + "learning_rate": 2.587519644666001e-06, + "loss": 1.2786, + "step": 2420 + }, + { + "epoch": 3.0762388818297333, + "grad_norm": 1.7860290662716032, + "learning_rate": 2.580738570746939e-06, + "loss": 1.2452, + "step": 2421 + }, + { + "epoch": 3.0775095298602286, + "grad_norm": 1.5989460694010966, + "learning_rate": 2.5739650773367876e-06, + "loss": 1.4941, + "step": 2422 + }, + { + "epoch": 3.0787801778907244, + "grad_norm": 1.7672357089554391, + "learning_rate": 2.567199171356255e-06, + "loss": 1.298, + "step": 2423 + }, + { + "epoch": 3.08005082592122, + "grad_norm": 1.6720009691768105, + "learning_rate": 2.5604408597182917e-06, + "loss": 1.1471, + "step": 2424 + }, + { + "epoch": 3.081321473951715, + "grad_norm": 1.5102495836791583, + "learning_rate": 2.5536901493280897e-06, + "loss": 1.3379, + "step": 2425 + }, + { + "epoch": 3.082592121982211, + "grad_norm": 1.8372717401271044, + "learning_rate": 2.5469470470830827e-06, + "loss": 1.2962, + "step": 2426 + }, + { + "epoch": 3.0838627700127064, + "grad_norm": 1.880212000099131, + "learning_rate": 2.5402115598729182e-06, + "loss": 1.2711, + "step": 2427 + }, + { + "epoch": 3.085133418043202, + "grad_norm": 1.581484833594291, + "learning_rate": 2.533483694579477e-06, + "loss": 1.2823, + "step": 2428 + }, + { + "epoch": 3.0864040660736975, + "grad_norm": 1.6436382107472818, + "learning_rate": 2.5267634580768398e-06, + "loss": 1.4279, + "step": 2429 + }, + { + "epoch": 3.0876747141041934, + "grad_norm": 1.7784636953279571, + "learning_rate": 2.520050857231302e-06, + "loss": 1.2795, + "step": 2430 + }, + { + "epoch": 3.0889453621346887, + "grad_norm": 1.6686373614869814, + "learning_rate": 2.5133458989013536e-06, + "loss": 1.1758, + "step": 2431 + }, + { + "epoch": 3.090216010165184, + "grad_norm": 2.0466360249592146, + "learning_rate": 2.5066485899376704e-06, + "loss": 1.1162, + "step": 2432 + }, + { + "epoch": 3.09148665819568, + "grad_norm": 1.8507598532475589, + "learning_rate": 2.4999589371831258e-06, + "loss": 1.2016, + "step": 2433 + }, + { + "epoch": 3.0927573062261753, + "grad_norm": 1.5824017681378673, + "learning_rate": 2.493276947472756e-06, + "loss": 1.2221, + "step": 2434 + }, + { + "epoch": 3.094027954256671, + "grad_norm": 1.633379375594195, + "learning_rate": 2.4866026276337818e-06, + "loss": 1.2296, + "step": 2435 + }, + { + "epoch": 3.0952986022871665, + "grad_norm": 1.754909121517295, + "learning_rate": 2.4799359844855763e-06, + "loss": 1.1888, + "step": 2436 + }, + { + "epoch": 3.096569250317662, + "grad_norm": 1.5519156929438713, + "learning_rate": 2.47327702483967e-06, + "loss": 1.2978, + "step": 2437 + }, + { + "epoch": 3.0978398983481577, + "grad_norm": 1.7766717505881073, + "learning_rate": 2.4666257554997496e-06, + "loss": 1.407, + "step": 2438 + }, + { + "epoch": 3.099110546378653, + "grad_norm": 1.7497637609072278, + "learning_rate": 2.459982183261642e-06, + "loss": 1.2113, + "step": 2439 + }, + { + "epoch": 3.100381194409149, + "grad_norm": 1.9264978326369875, + "learning_rate": 2.4533463149133073e-06, + "loss": 1.3552, + "step": 2440 + }, + { + "epoch": 3.101651842439644, + "grad_norm": 1.7947498856520572, + "learning_rate": 2.446718157234832e-06, + "loss": 1.1678, + "step": 2441 + }, + { + "epoch": 3.1029224904701396, + "grad_norm": 2.4641410743895142, + "learning_rate": 2.440097716998433e-06, + "loss": 1.3406, + "step": 2442 + }, + { + "epoch": 3.1041931385006354, + "grad_norm": 1.8883725023205793, + "learning_rate": 2.4334850009684342e-06, + "loss": 1.173, + "step": 2443 + }, + { + "epoch": 3.1054637865311308, + "grad_norm": 1.6354338430570137, + "learning_rate": 2.4268800159012664e-06, + "loss": 1.1719, + "step": 2444 + }, + { + "epoch": 3.1067344345616266, + "grad_norm": 1.6103664625611638, + "learning_rate": 2.420282768545469e-06, + "loss": 1.2, + "step": 2445 + }, + { + "epoch": 3.108005082592122, + "grad_norm": 1.4172615712487338, + "learning_rate": 2.4136932656416735e-06, + "loss": 1.3682, + "step": 2446 + }, + { + "epoch": 3.1092757306226178, + "grad_norm": 1.6721589174587272, + "learning_rate": 2.407111513922594e-06, + "loss": 1.16, + "step": 2447 + }, + { + "epoch": 3.110546378653113, + "grad_norm": 1.6061198847753624, + "learning_rate": 2.4005375201130275e-06, + "loss": 1.2102, + "step": 2448 + }, + { + "epoch": 3.1118170266836085, + "grad_norm": 2.0932204461880897, + "learning_rate": 2.393971290929842e-06, + "loss": 1.4676, + "step": 2449 + }, + { + "epoch": 3.1130876747141043, + "grad_norm": 1.8592189977974782, + "learning_rate": 2.3874128330819768e-06, + "loss": 1.277, + "step": 2450 + }, + { + "epoch": 3.1143583227445997, + "grad_norm": 1.851433962265915, + "learning_rate": 2.3808621532704324e-06, + "loss": 1.34, + "step": 2451 + }, + { + "epoch": 3.1156289707750955, + "grad_norm": 1.7810350194668882, + "learning_rate": 2.3743192581882557e-06, + "loss": 1.3526, + "step": 2452 + }, + { + "epoch": 3.116899618805591, + "grad_norm": 1.6744866612908063, + "learning_rate": 2.3677841545205394e-06, + "loss": 1.1099, + "step": 2453 + }, + { + "epoch": 3.1181702668360862, + "grad_norm": 1.7583645293576202, + "learning_rate": 2.3612568489444255e-06, + "loss": 1.0625, + "step": 2454 + }, + { + "epoch": 3.119440914866582, + "grad_norm": 1.756564024894285, + "learning_rate": 2.354737348129077e-06, + "loss": 1.2936, + "step": 2455 + }, + { + "epoch": 3.1207115628970774, + "grad_norm": 1.586413743190357, + "learning_rate": 2.3482256587356857e-06, + "loss": 1.2569, + "step": 2456 + }, + { + "epoch": 3.121982210927573, + "grad_norm": 1.7556994065125986, + "learning_rate": 2.341721787417466e-06, + "loss": 1.35, + "step": 2457 + }, + { + "epoch": 3.1232528589580686, + "grad_norm": 2.0852506827055515, + "learning_rate": 2.3352257408196444e-06, + "loss": 1.1543, + "step": 2458 + }, + { + "epoch": 3.124523506988564, + "grad_norm": 1.7712498030499715, + "learning_rate": 2.3287375255794488e-06, + "loss": 1.2117, + "step": 2459 + }, + { + "epoch": 3.1257941550190598, + "grad_norm": 1.6886622101501163, + "learning_rate": 2.322257148326105e-06, + "loss": 1.3069, + "step": 2460 + }, + { + "epoch": 3.127064803049555, + "grad_norm": 1.9420365086561293, + "learning_rate": 2.3157846156808304e-06, + "loss": 1.3194, + "step": 2461 + }, + { + "epoch": 3.128335451080051, + "grad_norm": 1.5276811681070064, + "learning_rate": 2.3093199342568316e-06, + "loss": 1.2025, + "step": 2462 + }, + { + "epoch": 3.1296060991105463, + "grad_norm": 1.8115947546522564, + "learning_rate": 2.3028631106592947e-06, + "loss": 1.3846, + "step": 2463 + }, + { + "epoch": 3.130876747141042, + "grad_norm": 1.6659681066420193, + "learning_rate": 2.296414151485371e-06, + "loss": 1.2816, + "step": 2464 + }, + { + "epoch": 3.1321473951715375, + "grad_norm": 1.8727411129268166, + "learning_rate": 2.2899730633241747e-06, + "loss": 1.3635, + "step": 2465 + }, + { + "epoch": 3.133418043202033, + "grad_norm": 1.846374256482884, + "learning_rate": 2.2835398527567888e-06, + "loss": 1.3548, + "step": 2466 + }, + { + "epoch": 3.1346886912325287, + "grad_norm": 1.6356275346464342, + "learning_rate": 2.2771145263562355e-06, + "loss": 1.452, + "step": 2467 + }, + { + "epoch": 3.135959339263024, + "grad_norm": 1.9577792022247684, + "learning_rate": 2.2706970906874913e-06, + "loss": 1.5602, + "step": 2468 + }, + { + "epoch": 3.13722998729352, + "grad_norm": 1.8068650757372393, + "learning_rate": 2.2642875523074613e-06, + "loss": 1.1777, + "step": 2469 + }, + { + "epoch": 3.1385006353240152, + "grad_norm": 1.7538676709057037, + "learning_rate": 2.2578859177649924e-06, + "loss": 1.4586, + "step": 2470 + }, + { + "epoch": 3.1397712833545106, + "grad_norm": 1.7879657250480516, + "learning_rate": 2.251492193600846e-06, + "loss": 1.3761, + "step": 2471 + }, + { + "epoch": 3.1410419313850064, + "grad_norm": 2.3249828103843457, + "learning_rate": 2.245106386347706e-06, + "loss": 1.5108, + "step": 2472 + }, + { + "epoch": 3.142312579415502, + "grad_norm": 1.7462573196518558, + "learning_rate": 2.238728502530161e-06, + "loss": 1.2632, + "step": 2473 + }, + { + "epoch": 3.1435832274459976, + "grad_norm": 1.736296512648159, + "learning_rate": 2.2323585486647193e-06, + "loss": 1.2525, + "step": 2474 + }, + { + "epoch": 3.144853875476493, + "grad_norm": 1.9470504131851696, + "learning_rate": 2.225996531259772e-06, + "loss": 1.3773, + "step": 2475 + }, + { + "epoch": 3.1461245235069883, + "grad_norm": 1.9076770489781227, + "learning_rate": 2.2196424568156073e-06, + "loss": 1.0692, + "step": 2476 + }, + { + "epoch": 3.147395171537484, + "grad_norm": 1.7248979518869032, + "learning_rate": 2.2132963318243917e-06, + "loss": 1.486, + "step": 2477 + }, + { + "epoch": 3.1486658195679795, + "grad_norm": 2.035045665415418, + "learning_rate": 2.206958162770183e-06, + "loss": 1.3372, + "step": 2478 + }, + { + "epoch": 3.1499364675984753, + "grad_norm": 1.571699409302641, + "learning_rate": 2.2006279561288934e-06, + "loss": 1.2629, + "step": 2479 + }, + { + "epoch": 3.1512071156289707, + "grad_norm": 1.5587477711625815, + "learning_rate": 2.1943057183683146e-06, + "loss": 1.1787, + "step": 2480 + }, + { + "epoch": 3.1524777636594665, + "grad_norm": 1.8196764892645745, + "learning_rate": 2.1879914559480853e-06, + "loss": 1.2803, + "step": 2481 + }, + { + "epoch": 3.153748411689962, + "grad_norm": 1.7798333128896995, + "learning_rate": 2.1816851753197023e-06, + "loss": 1.1567, + "step": 2482 + }, + { + "epoch": 3.1550190597204573, + "grad_norm": 1.6955424352749122, + "learning_rate": 2.1753868829265046e-06, + "loss": 1.2298, + "step": 2483 + }, + { + "epoch": 3.156289707750953, + "grad_norm": 1.7276703176456651, + "learning_rate": 2.169096585203668e-06, + "loss": 1.1972, + "step": 2484 + }, + { + "epoch": 3.1575603557814484, + "grad_norm": 1.7683359583678933, + "learning_rate": 2.1628142885781966e-06, + "loss": 1.2756, + "step": 2485 + }, + { + "epoch": 3.1588310038119443, + "grad_norm": 1.775103180200822, + "learning_rate": 2.156539999468934e-06, + "loss": 1.1889, + "step": 2486 + }, + { + "epoch": 3.1601016518424396, + "grad_norm": 1.4900632070960849, + "learning_rate": 2.1502737242865266e-06, + "loss": 1.3028, + "step": 2487 + }, + { + "epoch": 3.161372299872935, + "grad_norm": 1.7758388346832947, + "learning_rate": 2.14401546943344e-06, + "loss": 1.2355, + "step": 2488 + }, + { + "epoch": 3.162642947903431, + "grad_norm": 1.576094652086718, + "learning_rate": 2.1377652413039405e-06, + "loss": 1.2363, + "step": 2489 + }, + { + "epoch": 3.163913595933926, + "grad_norm": 1.7874303237007443, + "learning_rate": 2.1315230462840985e-06, + "loss": 1.3443, + "step": 2490 + }, + { + "epoch": 3.165184243964422, + "grad_norm": 1.6787912401680196, + "learning_rate": 2.125288890751779e-06, + "loss": 1.2906, + "step": 2491 + }, + { + "epoch": 3.1664548919949174, + "grad_norm": 2.0180482623233202, + "learning_rate": 2.1190627810766228e-06, + "loss": 1.4137, + "step": 2492 + }, + { + "epoch": 3.1677255400254127, + "grad_norm": 1.564533811757316, + "learning_rate": 2.1128447236200544e-06, + "loss": 1.1543, + "step": 2493 + }, + { + "epoch": 3.1689961880559085, + "grad_norm": 1.7774083993726737, + "learning_rate": 2.106634724735278e-06, + "loss": 1.259, + "step": 2494 + }, + { + "epoch": 3.170266836086404, + "grad_norm": 1.5448424912442245, + "learning_rate": 2.100432790767254e-06, + "loss": 1.2078, + "step": 2495 + }, + { + "epoch": 3.1715374841168997, + "grad_norm": 2.142201872129878, + "learning_rate": 2.0942389280527066e-06, + "loss": 1.2173, + "step": 2496 + }, + { + "epoch": 3.172808132147395, + "grad_norm": 1.6587617050189192, + "learning_rate": 2.0880531429201146e-06, + "loss": 1.3604, + "step": 2497 + }, + { + "epoch": 3.174078780177891, + "grad_norm": 1.7834074653396286, + "learning_rate": 2.081875441689706e-06, + "loss": 1.1448, + "step": 2498 + }, + { + "epoch": 3.1753494282083863, + "grad_norm": 1.7572575087822744, + "learning_rate": 2.0757058306734433e-06, + "loss": 1.2094, + "step": 2499 + }, + { + "epoch": 3.1766200762388817, + "grad_norm": 2.083569077535876, + "learning_rate": 2.069544316175025e-06, + "loss": 1.3869, + "step": 2500 + }, + { + "epoch": 3.1778907242693775, + "grad_norm": 1.823152571267711, + "learning_rate": 2.0633909044898748e-06, + "loss": 1.2244, + "step": 2501 + }, + { + "epoch": 3.179161372299873, + "grad_norm": 1.773039069704479, + "learning_rate": 2.0572456019051446e-06, + "loss": 1.3954, + "step": 2502 + }, + { + "epoch": 3.1804320203303686, + "grad_norm": 1.8971742409013088, + "learning_rate": 2.0511084146996975e-06, + "loss": 1.3243, + "step": 2503 + }, + { + "epoch": 3.181702668360864, + "grad_norm": 1.7944894466951278, + "learning_rate": 2.0449793491441026e-06, + "loss": 1.4261, + "step": 2504 + }, + { + "epoch": 3.1829733163913594, + "grad_norm": 1.5140189328195208, + "learning_rate": 2.038858411500629e-06, + "loss": 1.1631, + "step": 2505 + }, + { + "epoch": 3.184243964421855, + "grad_norm": 1.7061152885765603, + "learning_rate": 2.03274560802325e-06, + "loss": 1.3066, + "step": 2506 + }, + { + "epoch": 3.1855146124523506, + "grad_norm": 1.4811927119334365, + "learning_rate": 2.026640944957621e-06, + "loss": 1.292, + "step": 2507 + }, + { + "epoch": 3.1867852604828464, + "grad_norm": 1.5809354619829985, + "learning_rate": 2.020544428541077e-06, + "loss": 1.3237, + "step": 2508 + }, + { + "epoch": 3.1880559085133418, + "grad_norm": 1.5956049131349987, + "learning_rate": 2.014456065002637e-06, + "loss": 1.1337, + "step": 2509 + }, + { + "epoch": 3.189326556543837, + "grad_norm": 1.5734953655845838, + "learning_rate": 2.00837586056299e-06, + "loss": 1.2489, + "step": 2510 + }, + { + "epoch": 3.190597204574333, + "grad_norm": 1.5257536631287314, + "learning_rate": 2.0023038214344827e-06, + "loss": 1.3132, + "step": 2511 + }, + { + "epoch": 3.1918678526048283, + "grad_norm": 1.875502687536241, + "learning_rate": 1.996239953821121e-06, + "loss": 1.444, + "step": 2512 + }, + { + "epoch": 3.193138500635324, + "grad_norm": 1.8223412917715889, + "learning_rate": 1.990184263918561e-06, + "loss": 1.4234, + "step": 2513 + }, + { + "epoch": 3.1944091486658195, + "grad_norm": 1.9417963652654793, + "learning_rate": 1.9841367579141057e-06, + "loss": 1.4581, + "step": 2514 + }, + { + "epoch": 3.1956797966963153, + "grad_norm": 1.6085196469552832, + "learning_rate": 1.9780974419866995e-06, + "loss": 1.1893, + "step": 2515 + }, + { + "epoch": 3.1969504447268107, + "grad_norm": 1.6969997688746226, + "learning_rate": 1.9720663223069115e-06, + "loss": 1.2482, + "step": 2516 + }, + { + "epoch": 3.198221092757306, + "grad_norm": 1.749402884036117, + "learning_rate": 1.966043405036936e-06, + "loss": 1.1891, + "step": 2517 + }, + { + "epoch": 3.199491740787802, + "grad_norm": 1.8730967483194572, + "learning_rate": 1.960028696330596e-06, + "loss": 1.2247, + "step": 2518 + }, + { + "epoch": 3.200762388818297, + "grad_norm": 1.7762581172294063, + "learning_rate": 1.9540222023333165e-06, + "loss": 1.3077, + "step": 2519 + }, + { + "epoch": 3.202033036848793, + "grad_norm": 1.4899743911392564, + "learning_rate": 1.94802392918214e-06, + "loss": 1.2546, + "step": 2520 + }, + { + "epoch": 3.2033036848792884, + "grad_norm": 11.200908479200699, + "learning_rate": 1.9420338830056984e-06, + "loss": 1.3995, + "step": 2521 + }, + { + "epoch": 3.204574332909784, + "grad_norm": 1.834755307368342, + "learning_rate": 1.936052069924228e-06, + "loss": 1.1952, + "step": 2522 + }, + { + "epoch": 3.2058449809402796, + "grad_norm": 1.9324793062038865, + "learning_rate": 1.9300784960495454e-06, + "loss": 1.4198, + "step": 2523 + }, + { + "epoch": 3.207115628970775, + "grad_norm": 1.8000970358216641, + "learning_rate": 1.924113167485054e-06, + "loss": 1.2533, + "step": 2524 + }, + { + "epoch": 3.2083862770012708, + "grad_norm": 1.5880216587333218, + "learning_rate": 1.9181560903257234e-06, + "loss": 1.3579, + "step": 2525 + }, + { + "epoch": 3.209656925031766, + "grad_norm": 1.8480812181506967, + "learning_rate": 1.9122072706581107e-06, + "loss": 1.3448, + "step": 2526 + }, + { + "epoch": 3.210927573062262, + "grad_norm": 2.011537400480146, + "learning_rate": 1.9062667145603208e-06, + "loss": 1.4513, + "step": 2527 + }, + { + "epoch": 3.2121982210927573, + "grad_norm": 1.9724271618153089, + "learning_rate": 1.9003344281020185e-06, + "loss": 1.3168, + "step": 2528 + }, + { + "epoch": 3.2134688691232527, + "grad_norm": 1.7502666918999221, + "learning_rate": 1.8944104173444178e-06, + "loss": 1.3477, + "step": 2529 + }, + { + "epoch": 3.2147395171537485, + "grad_norm": 1.7319409313243204, + "learning_rate": 1.8884946883402845e-06, + "loss": 1.3608, + "step": 2530 + }, + { + "epoch": 3.216010165184244, + "grad_norm": 1.783038046631286, + "learning_rate": 1.8825872471339146e-06, + "loss": 1.2214, + "step": 2531 + }, + { + "epoch": 3.2172808132147397, + "grad_norm": 1.720134512976163, + "learning_rate": 1.8766880997611424e-06, + "loss": 1.3674, + "step": 2532 + }, + { + "epoch": 3.218551461245235, + "grad_norm": 1.7045142043861203, + "learning_rate": 1.8707972522493211e-06, + "loss": 1.0517, + "step": 2533 + }, + { + "epoch": 3.2198221092757304, + "grad_norm": 1.6352671308189537, + "learning_rate": 1.8649147106173326e-06, + "loss": 1.2439, + "step": 2534 + }, + { + "epoch": 3.2210927573062262, + "grad_norm": 1.4847088232236991, + "learning_rate": 1.8590404808755646e-06, + "loss": 1.2547, + "step": 2535 + }, + { + "epoch": 3.2223634053367216, + "grad_norm": 3.125888668868235, + "learning_rate": 1.853174569025914e-06, + "loss": 1.2783, + "step": 2536 + }, + { + "epoch": 3.2236340533672174, + "grad_norm": 1.988625654965949, + "learning_rate": 1.847316981061782e-06, + "loss": 1.2393, + "step": 2537 + }, + { + "epoch": 3.224904701397713, + "grad_norm": 11.26007842938525, + "learning_rate": 1.8414677229680645e-06, + "loss": 1.4299, + "step": 2538 + }, + { + "epoch": 3.2261753494282086, + "grad_norm": 1.8211500733017565, + "learning_rate": 1.8356268007211442e-06, + "loss": 1.1572, + "step": 2539 + }, + { + "epoch": 3.227445997458704, + "grad_norm": 1.528557119278922, + "learning_rate": 1.8297942202888874e-06, + "loss": 1.3529, + "step": 2540 + }, + { + "epoch": 3.2287166454891993, + "grad_norm": 1.6232166805721628, + "learning_rate": 1.823969987630635e-06, + "loss": 1.3527, + "step": 2541 + }, + { + "epoch": 3.229987293519695, + "grad_norm": 1.5340998572237294, + "learning_rate": 1.8181541086972066e-06, + "loss": 1.0331, + "step": 2542 + }, + { + "epoch": 3.2312579415501905, + "grad_norm": 1.6425772528850264, + "learning_rate": 1.8123465894308756e-06, + "loss": 1.3634, + "step": 2543 + }, + { + "epoch": 3.2325285895806863, + "grad_norm": 1.7235750348622498, + "learning_rate": 1.8065474357653855e-06, + "loss": 1.1661, + "step": 2544 + }, + { + "epoch": 3.2337992376111817, + "grad_norm": 1.7389286929216596, + "learning_rate": 1.8007566536259224e-06, + "loss": 1.3072, + "step": 2545 + }, + { + "epoch": 3.235069885641677, + "grad_norm": 1.7967188959884117, + "learning_rate": 1.7949742489291256e-06, + "loss": 1.4596, + "step": 2546 + }, + { + "epoch": 3.236340533672173, + "grad_norm": 1.6685604006798536, + "learning_rate": 1.7892002275830723e-06, + "loss": 1.4423, + "step": 2547 + }, + { + "epoch": 3.2376111817026683, + "grad_norm": 1.9010633740483633, + "learning_rate": 1.7834345954872711e-06, + "loss": 1.2548, + "step": 2548 + }, + { + "epoch": 3.238881829733164, + "grad_norm": 1.8558798945459534, + "learning_rate": 1.7776773585326645e-06, + "loss": 1.3086, + "step": 2549 + }, + { + "epoch": 3.2401524777636594, + "grad_norm": 1.5729842199934867, + "learning_rate": 1.7719285226016181e-06, + "loss": 1.3923, + "step": 2550 + }, + { + "epoch": 3.241423125794155, + "grad_norm": 1.7812302004801492, + "learning_rate": 1.7661880935679077e-06, + "loss": 1.1471, + "step": 2551 + }, + { + "epoch": 3.2426937738246506, + "grad_norm": 1.7290420005529408, + "learning_rate": 1.7604560772967228e-06, + "loss": 1.3251, + "step": 2552 + }, + { + "epoch": 3.243964421855146, + "grad_norm": 1.9556366082379057, + "learning_rate": 1.7547324796446553e-06, + "loss": 1.1404, + "step": 2553 + }, + { + "epoch": 3.245235069885642, + "grad_norm": 1.6160249986866686, + "learning_rate": 1.7490173064596994e-06, + "loss": 1.4717, + "step": 2554 + }, + { + "epoch": 3.246505717916137, + "grad_norm": 1.7583921649936753, + "learning_rate": 1.743310563581242e-06, + "loss": 1.3199, + "step": 2555 + }, + { + "epoch": 3.247776365946633, + "grad_norm": 2.032519224190622, + "learning_rate": 1.7376122568400533e-06, + "loss": 1.2158, + "step": 2556 + }, + { + "epoch": 3.2490470139771284, + "grad_norm": 1.9452217164271726, + "learning_rate": 1.7319223920582795e-06, + "loss": 1.3222, + "step": 2557 + }, + { + "epoch": 3.2503176620076237, + "grad_norm": 1.670607279727031, + "learning_rate": 1.7262409750494546e-06, + "loss": 1.2129, + "step": 2558 + }, + { + "epoch": 3.2515883100381195, + "grad_norm": 1.8375210610863488, + "learning_rate": 1.7205680116184698e-06, + "loss": 1.0889, + "step": 2559 + }, + { + "epoch": 3.252858958068615, + "grad_norm": 1.8173242314446705, + "learning_rate": 1.7149035075615795e-06, + "loss": 1.1841, + "step": 2560 + }, + { + "epoch": 3.2541296060991107, + "grad_norm": 1.5470934273402792, + "learning_rate": 1.7092474686664018e-06, + "loss": 1.38, + "step": 2561 + }, + { + "epoch": 3.255400254129606, + "grad_norm": 1.7698700793992441, + "learning_rate": 1.703599900711903e-06, + "loss": 1.0056, + "step": 2562 + }, + { + "epoch": 3.2566709021601015, + "grad_norm": 1.9637213866070415, + "learning_rate": 1.697960809468392e-06, + "loss": 1.5702, + "step": 2563 + }, + { + "epoch": 3.2579415501905973, + "grad_norm": 1.568118703304819, + "learning_rate": 1.6923302006975174e-06, + "loss": 1.1889, + "step": 2564 + }, + { + "epoch": 3.2592121982210926, + "grad_norm": 2.1019949451684052, + "learning_rate": 1.6867080801522584e-06, + "loss": 1.317, + "step": 2565 + }, + { + "epoch": 3.2604828462515885, + "grad_norm": 1.7030736642023787, + "learning_rate": 1.681094453576928e-06, + "loss": 1.3978, + "step": 2566 + }, + { + "epoch": 3.261753494282084, + "grad_norm": 1.6902151932446823, + "learning_rate": 1.6754893267071593e-06, + "loss": 1.0607, + "step": 2567 + }, + { + "epoch": 3.263024142312579, + "grad_norm": 1.564217748349614, + "learning_rate": 1.6698927052698965e-06, + "loss": 1.2916, + "step": 2568 + }, + { + "epoch": 3.264294790343075, + "grad_norm": 1.7903482631113141, + "learning_rate": 1.6643045949833936e-06, + "loss": 1.2744, + "step": 2569 + }, + { + "epoch": 3.2655654383735704, + "grad_norm": 1.5940467435876144, + "learning_rate": 1.6587250015572164e-06, + "loss": 1.0312, + "step": 2570 + }, + { + "epoch": 3.266836086404066, + "grad_norm": 1.7225652116973902, + "learning_rate": 1.6531539306922195e-06, + "loss": 1.2726, + "step": 2571 + }, + { + "epoch": 3.2681067344345616, + "grad_norm": 1.6006370824120357, + "learning_rate": 1.6475913880805516e-06, + "loss": 1.1295, + "step": 2572 + }, + { + "epoch": 3.2693773824650574, + "grad_norm": 1.5880416946242164, + "learning_rate": 1.642037379405651e-06, + "loss": 1.4118, + "step": 2573 + }, + { + "epoch": 3.2706480304955527, + "grad_norm": 1.740882673940497, + "learning_rate": 1.6364919103422394e-06, + "loss": 1.2281, + "step": 2574 + }, + { + "epoch": 3.271918678526048, + "grad_norm": 1.8539367045431134, + "learning_rate": 1.6309549865563047e-06, + "loss": 1.213, + "step": 2575 + }, + { + "epoch": 3.273189326556544, + "grad_norm": 2.008826510862433, + "learning_rate": 1.6254266137051077e-06, + "loss": 1.3727, + "step": 2576 + }, + { + "epoch": 3.2744599745870393, + "grad_norm": 1.7726019489981462, + "learning_rate": 1.619906797437173e-06, + "loss": 1.3896, + "step": 2577 + }, + { + "epoch": 3.275730622617535, + "grad_norm": 1.4784689844202854, + "learning_rate": 1.6143955433922864e-06, + "loss": 1.2795, + "step": 2578 + }, + { + "epoch": 3.2770012706480305, + "grad_norm": 1.586961648155315, + "learning_rate": 1.6088928572014795e-06, + "loss": 1.265, + "step": 2579 + }, + { + "epoch": 3.2782719186785263, + "grad_norm": 1.493365217093996, + "learning_rate": 1.6033987444870303e-06, + "loss": 1.2889, + "step": 2580 + }, + { + "epoch": 3.2795425667090217, + "grad_norm": 1.6544268804041813, + "learning_rate": 1.5979132108624572e-06, + "loss": 1.3982, + "step": 2581 + }, + { + "epoch": 3.280813214739517, + "grad_norm": 1.7778419407714103, + "learning_rate": 1.5924362619325184e-06, + "loss": 1.4281, + "step": 2582 + }, + { + "epoch": 3.282083862770013, + "grad_norm": 1.5289639491789007, + "learning_rate": 1.586967903293194e-06, + "loss": 1.2833, + "step": 2583 + }, + { + "epoch": 3.283354510800508, + "grad_norm": 1.85855118389914, + "learning_rate": 1.5815081405316912e-06, + "loss": 1.1843, + "step": 2584 + }, + { + "epoch": 3.2846251588310036, + "grad_norm": 1.9327977737083843, + "learning_rate": 1.5760569792264324e-06, + "loss": 1.4376, + "step": 2585 + }, + { + "epoch": 3.2858958068614994, + "grad_norm": 2.0395641152232797, + "learning_rate": 1.5706144249470545e-06, + "loss": 1.2756, + "step": 2586 + }, + { + "epoch": 3.2871664548919948, + "grad_norm": 1.8734734560385995, + "learning_rate": 1.565180483254396e-06, + "loss": 1.2866, + "step": 2587 + }, + { + "epoch": 3.2884371029224906, + "grad_norm": 1.9185335011211122, + "learning_rate": 1.5597551597004968e-06, + "loss": 1.5165, + "step": 2588 + }, + { + "epoch": 3.289707750952986, + "grad_norm": 1.7423865067684943, + "learning_rate": 1.5543384598285938e-06, + "loss": 1.1624, + "step": 2589 + }, + { + "epoch": 3.2909783989834818, + "grad_norm": 1.9425305357709244, + "learning_rate": 1.5489303891731144e-06, + "loss": 1.4843, + "step": 2590 + }, + { + "epoch": 3.292249047013977, + "grad_norm": 2.0576161228612673, + "learning_rate": 1.5435309532596644e-06, + "loss": 1.1159, + "step": 2591 + }, + { + "epoch": 3.2935196950444725, + "grad_norm": 1.799508736226043, + "learning_rate": 1.538140157605027e-06, + "loss": 1.4685, + "step": 2592 + }, + { + "epoch": 3.2947903430749683, + "grad_norm": 1.653492445202349, + "learning_rate": 1.5327580077171589e-06, + "loss": 1.3122, + "step": 2593 + }, + { + "epoch": 3.2960609911054637, + "grad_norm": 1.8240709075790245, + "learning_rate": 1.5273845090951877e-06, + "loss": 1.2987, + "step": 2594 + }, + { + "epoch": 3.2973316391359595, + "grad_norm": 1.6468537576431075, + "learning_rate": 1.522019667229393e-06, + "loss": 1.266, + "step": 2595 + }, + { + "epoch": 3.298602287166455, + "grad_norm": 1.7727011306314875, + "learning_rate": 1.5166634876012187e-06, + "loss": 1.3907, + "step": 2596 + }, + { + "epoch": 3.2998729351969507, + "grad_norm": 1.7476788965426242, + "learning_rate": 1.5113159756832497e-06, + "loss": 1.2959, + "step": 2597 + }, + { + "epoch": 3.301143583227446, + "grad_norm": 1.6468660618165196, + "learning_rate": 1.5059771369392229e-06, + "loss": 1.0908, + "step": 2598 + }, + { + "epoch": 3.3024142312579414, + "grad_norm": 1.5128065024394974, + "learning_rate": 1.5006469768240062e-06, + "loss": 1.2446, + "step": 2599 + }, + { + "epoch": 3.3036848792884372, + "grad_norm": 1.8642886178980311, + "learning_rate": 1.4953255007836021e-06, + "loss": 1.2274, + "step": 2600 + }, + { + "epoch": 3.3049555273189326, + "grad_norm": 1.6539737668218981, + "learning_rate": 1.4900127142551446e-06, + "loss": 1.3047, + "step": 2601 + }, + { + "epoch": 3.306226175349428, + "grad_norm": 1.5516833997523842, + "learning_rate": 1.4847086226668871e-06, + "loss": 1.3523, + "step": 2602 + }, + { + "epoch": 3.307496823379924, + "grad_norm": 2.088689276710946, + "learning_rate": 1.479413231438197e-06, + "loss": 1.5014, + "step": 2603 + }, + { + "epoch": 3.308767471410419, + "grad_norm": 1.9576724887018369, + "learning_rate": 1.4741265459795517e-06, + "loss": 1.2595, + "step": 2604 + }, + { + "epoch": 3.310038119440915, + "grad_norm": 1.838294166050524, + "learning_rate": 1.4688485716925394e-06, + "loss": 1.3377, + "step": 2605 + }, + { + "epoch": 3.3113087674714103, + "grad_norm": 1.8197922760443668, + "learning_rate": 1.4635793139698384e-06, + "loss": 1.4283, + "step": 2606 + }, + { + "epoch": 3.312579415501906, + "grad_norm": 1.7019671459826233, + "learning_rate": 1.4583187781952335e-06, + "loss": 1.2218, + "step": 2607 + }, + { + "epoch": 3.3138500635324015, + "grad_norm": 1.8758449204713026, + "learning_rate": 1.4530669697435861e-06, + "loss": 1.2324, + "step": 2608 + }, + { + "epoch": 3.315120711562897, + "grad_norm": 1.8105019506396125, + "learning_rate": 1.4478238939808454e-06, + "loss": 1.2777, + "step": 2609 + }, + { + "epoch": 3.3163913595933927, + "grad_norm": 1.87633395598687, + "learning_rate": 1.4425895562640424e-06, + "loss": 1.3799, + "step": 2610 + }, + { + "epoch": 3.317662007623888, + "grad_norm": 1.7606582062257212, + "learning_rate": 1.4373639619412715e-06, + "loss": 1.3026, + "step": 2611 + }, + { + "epoch": 3.318932655654384, + "grad_norm": 1.6010525932324566, + "learning_rate": 1.4321471163516998e-06, + "loss": 1.168, + "step": 2612 + }, + { + "epoch": 3.3202033036848793, + "grad_norm": 1.6255357989695598, + "learning_rate": 1.4269390248255521e-06, + "loss": 1.1377, + "step": 2613 + }, + { + "epoch": 3.321473951715375, + "grad_norm": 1.6876307022543744, + "learning_rate": 1.4217396926841153e-06, + "loss": 1.2671, + "step": 2614 + }, + { + "epoch": 3.3227445997458704, + "grad_norm": 1.699645779116517, + "learning_rate": 1.4165491252397202e-06, + "loss": 1.1274, + "step": 2615 + }, + { + "epoch": 3.324015247776366, + "grad_norm": 1.6818213641961743, + "learning_rate": 1.4113673277957395e-06, + "loss": 1.4025, + "step": 2616 + }, + { + "epoch": 3.3252858958068616, + "grad_norm": 3.2032076510413536, + "learning_rate": 1.4061943056465965e-06, + "loss": 1.3181, + "step": 2617 + }, + { + "epoch": 3.326556543837357, + "grad_norm": 1.8105928565910216, + "learning_rate": 1.4010300640777352e-06, + "loss": 1.6368, + "step": 2618 + }, + { + "epoch": 3.3278271918678524, + "grad_norm": 1.542688925757691, + "learning_rate": 1.3958746083656428e-06, + "loss": 1.2069, + "step": 2619 + }, + { + "epoch": 3.329097839898348, + "grad_norm": 1.71104874448952, + "learning_rate": 1.3907279437778154e-06, + "loss": 1.098, + "step": 2620 + }, + { + "epoch": 3.3303684879288435, + "grad_norm": 1.8722048302574334, + "learning_rate": 1.3855900755727747e-06, + "loss": 1.1335, + "step": 2621 + }, + { + "epoch": 3.3316391359593394, + "grad_norm": 1.9022709916857448, + "learning_rate": 1.3804610090000558e-06, + "loss": 1.4643, + "step": 2622 + }, + { + "epoch": 3.3329097839898347, + "grad_norm": 1.923857938211866, + "learning_rate": 1.3753407493001968e-06, + "loss": 1.2675, + "step": 2623 + }, + { + "epoch": 3.3341804320203305, + "grad_norm": 2.065059666018281, + "learning_rate": 1.3702293017047375e-06, + "loss": 1.2717, + "step": 2624 + }, + { + "epoch": 3.335451080050826, + "grad_norm": 1.7471415191321704, + "learning_rate": 1.3651266714362166e-06, + "loss": 1.2484, + "step": 2625 + }, + { + "epoch": 3.3367217280813213, + "grad_norm": 1.7389887302213156, + "learning_rate": 1.3600328637081672e-06, + "loss": 1.3138, + "step": 2626 + }, + { + "epoch": 3.337992376111817, + "grad_norm": 1.594221698681159, + "learning_rate": 1.3549478837250995e-06, + "loss": 1.2117, + "step": 2627 + }, + { + "epoch": 3.3392630241423125, + "grad_norm": 2.0026558850987137, + "learning_rate": 1.3498717366825086e-06, + "loss": 1.3661, + "step": 2628 + }, + { + "epoch": 3.3405336721728083, + "grad_norm": 1.5019394801634307, + "learning_rate": 1.3448044277668682e-06, + "loss": 1.2723, + "step": 2629 + }, + { + "epoch": 3.3418043202033036, + "grad_norm": 1.92187489823837, + "learning_rate": 1.339745962155613e-06, + "loss": 1.2405, + "step": 2630 + }, + { + "epoch": 3.3430749682337995, + "grad_norm": 1.834804762069544, + "learning_rate": 1.3346963450171536e-06, + "loss": 1.1661, + "step": 2631 + }, + { + "epoch": 3.344345616264295, + "grad_norm": 1.7181637503941978, + "learning_rate": 1.329655581510847e-06, + "loss": 1.2759, + "step": 2632 + }, + { + "epoch": 3.34561626429479, + "grad_norm": 1.9299844130635269, + "learning_rate": 1.324623676787017e-06, + "loss": 1.2278, + "step": 2633 + }, + { + "epoch": 3.346886912325286, + "grad_norm": 1.9112199664175715, + "learning_rate": 1.3196006359869273e-06, + "loss": 1.2686, + "step": 2634 + }, + { + "epoch": 3.3481575603557814, + "grad_norm": 1.7199969917035751, + "learning_rate": 1.3145864642427841e-06, + "loss": 1.4043, + "step": 2635 + }, + { + "epoch": 3.349428208386277, + "grad_norm": 1.8152094794925473, + "learning_rate": 1.3095811666777413e-06, + "loss": 1.2477, + "step": 2636 + }, + { + "epoch": 3.3506988564167726, + "grad_norm": 1.7537576703915967, + "learning_rate": 1.3045847484058748e-06, + "loss": 1.1339, + "step": 2637 + }, + { + "epoch": 3.351969504447268, + "grad_norm": 1.7522948208376807, + "learning_rate": 1.2995972145321979e-06, + "loss": 1.1855, + "step": 2638 + }, + { + "epoch": 3.3532401524777637, + "grad_norm": 1.9192958082012628, + "learning_rate": 1.2946185701526392e-06, + "loss": 1.1036, + "step": 2639 + }, + { + "epoch": 3.354510800508259, + "grad_norm": 1.9867290806679554, + "learning_rate": 1.2896488203540447e-06, + "loss": 1.4721, + "step": 2640 + }, + { + "epoch": 3.355781448538755, + "grad_norm": 1.9360686626231494, + "learning_rate": 1.2846879702141769e-06, + "loss": 1.4588, + "step": 2641 + }, + { + "epoch": 3.3570520965692503, + "grad_norm": 1.9907414210884793, + "learning_rate": 1.2797360248017055e-06, + "loss": 1.3047, + "step": 2642 + }, + { + "epoch": 3.3583227445997457, + "grad_norm": 1.7923504196629938, + "learning_rate": 1.2747929891761978e-06, + "loss": 1.355, + "step": 2643 + }, + { + "epoch": 3.3595933926302415, + "grad_norm": 1.7949718703785045, + "learning_rate": 1.2698588683881185e-06, + "loss": 1.1143, + "step": 2644 + }, + { + "epoch": 3.360864040660737, + "grad_norm": 1.7318736966786377, + "learning_rate": 1.264933667478827e-06, + "loss": 1.3079, + "step": 2645 + }, + { + "epoch": 3.3621346886912327, + "grad_norm": 1.6551174733979463, + "learning_rate": 1.2600173914805647e-06, + "loss": 1.1932, + "step": 2646 + }, + { + "epoch": 3.363405336721728, + "grad_norm": 1.5903014941151614, + "learning_rate": 1.2551100454164556e-06, + "loss": 1.0314, + "step": 2647 + }, + { + "epoch": 3.364675984752224, + "grad_norm": 1.6469867484327598, + "learning_rate": 1.2502116343005033e-06, + "loss": 1.2687, + "step": 2648 + }, + { + "epoch": 3.365946632782719, + "grad_norm": 1.7078274190427007, + "learning_rate": 1.2453221631375755e-06, + "loss": 1.4505, + "step": 2649 + }, + { + "epoch": 3.3672172808132146, + "grad_norm": 1.7782134771535814, + "learning_rate": 1.240441636923413e-06, + "loss": 1.2899, + "step": 2650 + }, + { + "epoch": 3.3684879288437104, + "grad_norm": 1.844153101820231, + "learning_rate": 1.2355700606446119e-06, + "loss": 1.1901, + "step": 2651 + }, + { + "epoch": 3.3697585768742058, + "grad_norm": 1.9132378925639788, + "learning_rate": 1.2307074392786233e-06, + "loss": 1.267, + "step": 2652 + }, + { + "epoch": 3.3710292249047016, + "grad_norm": 2.0661967196773086, + "learning_rate": 1.2258537777937517e-06, + "loss": 1.2886, + "step": 2653 + }, + { + "epoch": 3.372299872935197, + "grad_norm": 1.7694658087370272, + "learning_rate": 1.2210090811491515e-06, + "loss": 1.1099, + "step": 2654 + }, + { + "epoch": 3.3735705209656923, + "grad_norm": 1.7817581559328577, + "learning_rate": 1.2161733542948073e-06, + "loss": 1.38, + "step": 2655 + }, + { + "epoch": 3.374841168996188, + "grad_norm": 1.7130467222950003, + "learning_rate": 1.2113466021715426e-06, + "loss": 1.4198, + "step": 2656 + }, + { + "epoch": 3.3761118170266835, + "grad_norm": 1.6031405696122085, + "learning_rate": 1.2065288297110167e-06, + "loss": 1.2814, + "step": 2657 + }, + { + "epoch": 3.3773824650571793, + "grad_norm": 1.8655197755258583, + "learning_rate": 1.2017200418357077e-06, + "loss": 1.3108, + "step": 2658 + }, + { + "epoch": 3.3786531130876747, + "grad_norm": 1.7812268166685585, + "learning_rate": 1.1969202434589133e-06, + "loss": 1.2573, + "step": 2659 + }, + { + "epoch": 3.37992376111817, + "grad_norm": 1.6440103317494115, + "learning_rate": 1.1921294394847537e-06, + "loss": 1.2683, + "step": 2660 + }, + { + "epoch": 3.381194409148666, + "grad_norm": 1.7585958796842118, + "learning_rate": 1.1873476348081514e-06, + "loss": 1.2807, + "step": 2661 + }, + { + "epoch": 3.3824650571791612, + "grad_norm": 2.5852035542332876, + "learning_rate": 1.182574834314838e-06, + "loss": 1.1824, + "step": 2662 + }, + { + "epoch": 3.383735705209657, + "grad_norm": 1.5086924890786593, + "learning_rate": 1.177811042881345e-06, + "loss": 1.2598, + "step": 2663 + }, + { + "epoch": 3.3850063532401524, + "grad_norm": 1.882253458637874, + "learning_rate": 1.1730562653749956e-06, + "loss": 1.1506, + "step": 2664 + }, + { + "epoch": 3.3862770012706482, + "grad_norm": 1.743303518267941, + "learning_rate": 1.1683105066539068e-06, + "loss": 1.361, + "step": 2665 + }, + { + "epoch": 3.3875476493011436, + "grad_norm": 1.47191660579717, + "learning_rate": 1.1635737715669827e-06, + "loss": 1.2394, + "step": 2666 + }, + { + "epoch": 3.388818297331639, + "grad_norm": 1.9385081502225063, + "learning_rate": 1.1588460649539036e-06, + "loss": 1.3206, + "step": 2667 + }, + { + "epoch": 3.390088945362135, + "grad_norm": 1.85582224027926, + "learning_rate": 1.1541273916451234e-06, + "loss": 1.3906, + "step": 2668 + }, + { + "epoch": 3.39135959339263, + "grad_norm": 1.8045536579743726, + "learning_rate": 1.1494177564618724e-06, + "loss": 1.1999, + "step": 2669 + }, + { + "epoch": 3.392630241423126, + "grad_norm": 1.9045572029921076, + "learning_rate": 1.1447171642161415e-06, + "loss": 1.3235, + "step": 2670 + }, + { + "epoch": 3.3939008894536213, + "grad_norm": 1.9426040842359356, + "learning_rate": 1.1400256197106873e-06, + "loss": 1.247, + "step": 2671 + }, + { + "epoch": 3.3951715374841167, + "grad_norm": 1.9443946239187213, + "learning_rate": 1.1353431277390125e-06, + "loss": 0.9854, + "step": 2672 + }, + { + "epoch": 3.3964421855146125, + "grad_norm": 1.9467877942662004, + "learning_rate": 1.1306696930853834e-06, + "loss": 1.1952, + "step": 2673 + }, + { + "epoch": 3.397712833545108, + "grad_norm": 1.8498200669109355, + "learning_rate": 1.1260053205248023e-06, + "loss": 0.94, + "step": 2674 + }, + { + "epoch": 3.3989834815756037, + "grad_norm": 2.180463686085276, + "learning_rate": 1.121350014823014e-06, + "loss": 1.2531, + "step": 2675 + }, + { + "epoch": 3.400254129606099, + "grad_norm": 1.7466906155479762, + "learning_rate": 1.116703780736501e-06, + "loss": 1.2283, + "step": 2676 + }, + { + "epoch": 3.4015247776365944, + "grad_norm": 1.6930497827444948, + "learning_rate": 1.1120666230124777e-06, + "loss": 1.1114, + "step": 2677 + }, + { + "epoch": 3.4027954256670903, + "grad_norm": 1.725500785012662, + "learning_rate": 1.107438546388887e-06, + "loss": 1.2197, + "step": 2678 + }, + { + "epoch": 3.4040660736975856, + "grad_norm": 1.6125906508638936, + "learning_rate": 1.1028195555943877e-06, + "loss": 1.2707, + "step": 2679 + }, + { + "epoch": 3.4053367217280814, + "grad_norm": 1.7063770961027755, + "learning_rate": 1.0982096553483568e-06, + "loss": 1.3152, + "step": 2680 + }, + { + "epoch": 3.406607369758577, + "grad_norm": 1.8909990165065969, + "learning_rate": 1.0936088503608876e-06, + "loss": 1.3825, + "step": 2681 + }, + { + "epoch": 3.4078780177890726, + "grad_norm": 1.8107735180118376, + "learning_rate": 1.0890171453327735e-06, + "loss": 1.1619, + "step": 2682 + }, + { + "epoch": 3.409148665819568, + "grad_norm": 1.6224194884354048, + "learning_rate": 1.0844345449555172e-06, + "loss": 1.2711, + "step": 2683 + }, + { + "epoch": 3.4104193138500634, + "grad_norm": 1.8196149695370645, + "learning_rate": 1.079861053911313e-06, + "loss": 1.2594, + "step": 2684 + }, + { + "epoch": 3.411689961880559, + "grad_norm": 2.034877917783471, + "learning_rate": 1.0752966768730543e-06, + "loss": 1.3763, + "step": 2685 + }, + { + "epoch": 3.4129606099110545, + "grad_norm": 2.020320327891875, + "learning_rate": 1.0707414185043163e-06, + "loss": 1.1893, + "step": 2686 + }, + { + "epoch": 3.4142312579415504, + "grad_norm": 1.6828139208453592, + "learning_rate": 1.066195283459359e-06, + "loss": 1.4082, + "step": 2687 + }, + { + "epoch": 3.4155019059720457, + "grad_norm": 2.078181012396198, + "learning_rate": 1.0616582763831206e-06, + "loss": 1.429, + "step": 2688 + }, + { + "epoch": 3.4167725540025415, + "grad_norm": 1.9832043832040667, + "learning_rate": 1.057130401911215e-06, + "loss": 1.3872, + "step": 2689 + }, + { + "epoch": 3.418043202033037, + "grad_norm": 1.6905291246322078, + "learning_rate": 1.0526116646699269e-06, + "loss": 1.4047, + "step": 2690 + }, + { + "epoch": 3.4193138500635323, + "grad_norm": 1.9283552129583106, + "learning_rate": 1.048102069276199e-06, + "loss": 1.2151, + "step": 2691 + }, + { + "epoch": 3.420584498094028, + "grad_norm": 2.065009343914376, + "learning_rate": 1.0436016203376343e-06, + "loss": 1.4083, + "step": 2692 + }, + { + "epoch": 3.4218551461245235, + "grad_norm": 1.5669351407838714, + "learning_rate": 1.0391103224524957e-06, + "loss": 1.2232, + "step": 2693 + }, + { + "epoch": 3.423125794155019, + "grad_norm": 1.5827806159536164, + "learning_rate": 1.0346281802096946e-06, + "loss": 1.1738, + "step": 2694 + }, + { + "epoch": 3.4243964421855146, + "grad_norm": 2.0284739060049457, + "learning_rate": 1.0301551981887848e-06, + "loss": 1.3213, + "step": 2695 + }, + { + "epoch": 3.42566709021601, + "grad_norm": 1.687141604822629, + "learning_rate": 1.0256913809599611e-06, + "loss": 1.2716, + "step": 2696 + }, + { + "epoch": 3.426937738246506, + "grad_norm": 1.765244292834096, + "learning_rate": 1.021236733084059e-06, + "loss": 1.3456, + "step": 2697 + }, + { + "epoch": 3.428208386277001, + "grad_norm": 1.4938395155118565, + "learning_rate": 1.0167912591125407e-06, + "loss": 1.2981, + "step": 2698 + }, + { + "epoch": 3.429479034307497, + "grad_norm": 1.704721002072044, + "learning_rate": 1.012354963587493e-06, + "loss": 1.2263, + "step": 2699 + }, + { + "epoch": 3.4307496823379924, + "grad_norm": 1.7142357107118333, + "learning_rate": 1.0079278510416313e-06, + "loss": 1.2548, + "step": 2700 + }, + { + "epoch": 3.4320203303684877, + "grad_norm": 1.9947239089373425, + "learning_rate": 1.0035099259982873e-06, + "loss": 1.2137, + "step": 2701 + }, + { + "epoch": 3.4332909783989836, + "grad_norm": 1.7276553341679752, + "learning_rate": 9.99101192971401e-07, + "loss": 1.1533, + "step": 2702 + }, + { + "epoch": 3.434561626429479, + "grad_norm": 1.7686756288098728, + "learning_rate": 9.947016564655243e-07, + "loss": 1.3311, + "step": 2703 + }, + { + "epoch": 3.4358322744599747, + "grad_norm": 1.820270383580354, + "learning_rate": 9.903113209758098e-07, + "loss": 1.1685, + "step": 2704 + }, + { + "epoch": 3.43710292249047, + "grad_norm": 2.156402860822522, + "learning_rate": 9.859301909880103e-07, + "loss": 1.3577, + "step": 2705 + }, + { + "epoch": 3.438373570520966, + "grad_norm": 1.8012062861999263, + "learning_rate": 9.815582709784788e-07, + "loss": 1.4139, + "step": 2706 + }, + { + "epoch": 3.4396442185514613, + "grad_norm": 1.797709871472713, + "learning_rate": 9.771955654141496e-07, + "loss": 1.3349, + "step": 2707 + }, + { + "epoch": 3.4409148665819567, + "grad_norm": 1.7472924648225128, + "learning_rate": 9.728420787525428e-07, + "loss": 1.2374, + "step": 2708 + }, + { + "epoch": 3.4421855146124525, + "grad_norm": 1.4860934647077977, + "learning_rate": 9.684978154417678e-07, + "loss": 1.194, + "step": 2709 + }, + { + "epoch": 3.443456162642948, + "grad_norm": 1.531343096586481, + "learning_rate": 9.641627799205012e-07, + "loss": 1.2311, + "step": 2710 + }, + { + "epoch": 3.444726810673443, + "grad_norm": 1.6723020451318749, + "learning_rate": 9.598369766179937e-07, + "loss": 1.3318, + "step": 2711 + }, + { + "epoch": 3.445997458703939, + "grad_norm": 1.8285098197851093, + "learning_rate": 9.55520409954066e-07, + "loss": 1.255, + "step": 2712 + }, + { + "epoch": 3.4472681067344344, + "grad_norm": 1.593358400223005, + "learning_rate": 9.512130843390998e-07, + "loss": 1.3607, + "step": 2713 + }, + { + "epoch": 3.44853875476493, + "grad_norm": 1.793270101846004, + "learning_rate": 9.469150041740338e-07, + "loss": 1.2904, + "step": 2714 + }, + { + "epoch": 3.4498094027954256, + "grad_norm": 1.8526523892890903, + "learning_rate": 9.426261738503617e-07, + "loss": 1.3375, + "step": 2715 + }, + { + "epoch": 3.4510800508259214, + "grad_norm": 2.1340765529575054, + "learning_rate": 9.383465977501227e-07, + "loss": 1.0028, + "step": 2716 + }, + { + "epoch": 3.4523506988564168, + "grad_norm": 1.7981696709178416, + "learning_rate": 9.340762802459047e-07, + "loss": 1.1427, + "step": 2717 + }, + { + "epoch": 3.453621346886912, + "grad_norm": 1.785072921646824, + "learning_rate": 9.298152257008386e-07, + "loss": 1.2483, + "step": 2718 + }, + { + "epoch": 3.454891994917408, + "grad_norm": 1.6301498593465438, + "learning_rate": 9.255634384685841e-07, + "loss": 1.3141, + "step": 2719 + }, + { + "epoch": 3.4561626429479033, + "grad_norm": 1.62370888948084, + "learning_rate": 9.213209228933339e-07, + "loss": 1.2085, + "step": 2720 + }, + { + "epoch": 3.457433290978399, + "grad_norm": 1.7534655841893003, + "learning_rate": 9.170876833098119e-07, + "loss": 1.3301, + "step": 2721 + }, + { + "epoch": 3.4587039390088945, + "grad_norm": 1.806965898077104, + "learning_rate": 9.128637240432581e-07, + "loss": 1.4144, + "step": 2722 + }, + { + "epoch": 3.4599745870393903, + "grad_norm": 1.731938997833424, + "learning_rate": 9.086490494094369e-07, + "loss": 1.2284, + "step": 2723 + }, + { + "epoch": 3.4612452350698857, + "grad_norm": 1.737150913677021, + "learning_rate": 9.044436637146204e-07, + "loss": 1.071, + "step": 2724 + }, + { + "epoch": 3.462515883100381, + "grad_norm": 1.6466063414818866, + "learning_rate": 9.002475712555959e-07, + "loss": 0.9581, + "step": 2725 + }, + { + "epoch": 3.463786531130877, + "grad_norm": 1.5537041351043512, + "learning_rate": 8.960607763196494e-07, + "loss": 1.301, + "step": 2726 + }, + { + "epoch": 3.4650571791613722, + "grad_norm": 1.776184933266563, + "learning_rate": 8.918832831845714e-07, + "loss": 1.4461, + "step": 2727 + }, + { + "epoch": 3.4663278271918676, + "grad_norm": 1.7993064628118447, + "learning_rate": 8.87715096118642e-07, + "loss": 1.1932, + "step": 2728 + }, + { + "epoch": 3.4675984752223634, + "grad_norm": 1.7030496492394833, + "learning_rate": 8.835562193806469e-07, + "loss": 1.2039, + "step": 2729 + }, + { + "epoch": 3.468869123252859, + "grad_norm": 1.7805576415021542, + "learning_rate": 8.794066572198456e-07, + "loss": 1.3519, + "step": 2730 + }, + { + "epoch": 3.4701397712833546, + "grad_norm": 2.3091277332278177, + "learning_rate": 8.752664138759858e-07, + "loss": 1.1834, + "step": 2731 + }, + { + "epoch": 3.47141041931385, + "grad_norm": 1.9229797089060061, + "learning_rate": 8.711354935792926e-07, + "loss": 1.0087, + "step": 2732 + }, + { + "epoch": 3.472681067344346, + "grad_norm": 1.942133714071417, + "learning_rate": 8.670139005504674e-07, + "loss": 1.5387, + "step": 2733 + }, + { + "epoch": 3.473951715374841, + "grad_norm": 1.6671088931646612, + "learning_rate": 8.629016390006783e-07, + "loss": 1.2378, + "step": 2734 + }, + { + "epoch": 3.4752223634053365, + "grad_norm": 1.5640065614416736, + "learning_rate": 8.587987131315656e-07, + "loss": 1.3365, + "step": 2735 + }, + { + "epoch": 3.4764930114358323, + "grad_norm": 1.8177900821827773, + "learning_rate": 8.547051271352213e-07, + "loss": 1.0777, + "step": 2736 + }, + { + "epoch": 3.4777636594663277, + "grad_norm": 1.7065655482669073, + "learning_rate": 8.506208851942043e-07, + "loss": 1.2597, + "step": 2737 + }, + { + "epoch": 3.4790343074968235, + "grad_norm": 1.6404825855437422, + "learning_rate": 8.4654599148152e-07, + "loss": 1.2792, + "step": 2738 + }, + { + "epoch": 3.480304955527319, + "grad_norm": 1.6952150831350192, + "learning_rate": 8.424804501606254e-07, + "loss": 1.3297, + "step": 2739 + }, + { + "epoch": 3.4815756035578147, + "grad_norm": 1.744695544224006, + "learning_rate": 8.384242653854146e-07, + "loss": 1.2434, + "step": 2740 + }, + { + "epoch": 3.48284625158831, + "grad_norm": 1.6822982224698178, + "learning_rate": 8.343774413002382e-07, + "loss": 1.1478, + "step": 2741 + }, + { + "epoch": 3.4841168996188054, + "grad_norm": 1.596300241861607, + "learning_rate": 8.303399820398672e-07, + "loss": 1.3028, + "step": 2742 + }, + { + "epoch": 3.4853875476493013, + "grad_norm": 1.8058411192139043, + "learning_rate": 8.263118917295088e-07, + "loss": 1.4362, + "step": 2743 + }, + { + "epoch": 3.4866581956797966, + "grad_norm": 1.7936411191700221, + "learning_rate": 8.222931744847984e-07, + "loss": 1.2268, + "step": 2744 + }, + { + "epoch": 3.4879288437102924, + "grad_norm": 1.956650496418258, + "learning_rate": 8.182838344117971e-07, + "loss": 1.3653, + "step": 2745 + }, + { + "epoch": 3.489199491740788, + "grad_norm": 1.9750594248748314, + "learning_rate": 8.142838756069793e-07, + "loss": 1.1586, + "step": 2746 + }, + { + "epoch": 3.490470139771283, + "grad_norm": 1.569757205227885, + "learning_rate": 8.102933021572412e-07, + "loss": 1.3598, + "step": 2747 + }, + { + "epoch": 3.491740787801779, + "grad_norm": 1.8717609483597424, + "learning_rate": 8.063121181398814e-07, + "loss": 1.3104, + "step": 2748 + }, + { + "epoch": 3.4930114358322744, + "grad_norm": 1.6794716570302477, + "learning_rate": 8.023403276226127e-07, + "loss": 1.0687, + "step": 2749 + }, + { + "epoch": 3.49428208386277, + "grad_norm": 2.152611708136472, + "learning_rate": 7.983779346635479e-07, + "loss": 1.3144, + "step": 2750 + }, + { + "epoch": 3.4955527318932655, + "grad_norm": 1.6025361349231622, + "learning_rate": 7.944249433111917e-07, + "loss": 1.0832, + "step": 2751 + }, + { + "epoch": 3.496823379923761, + "grad_norm": 1.9966220200266624, + "learning_rate": 7.904813576044534e-07, + "loss": 1.5379, + "step": 2752 + }, + { + "epoch": 3.4980940279542567, + "grad_norm": 1.9867104129705102, + "learning_rate": 7.865471815726266e-07, + "loss": 1.3133, + "step": 2753 + }, + { + "epoch": 3.499364675984752, + "grad_norm": 1.7758698233245882, + "learning_rate": 7.826224192353916e-07, + "loss": 1.2638, + "step": 2754 + }, + { + "epoch": 3.500635324015248, + "grad_norm": 1.7856567193948802, + "learning_rate": 7.78707074602808e-07, + "loss": 1.2643, + "step": 2755 + }, + { + "epoch": 3.5019059720457433, + "grad_norm": 1.6819281034220812, + "learning_rate": 7.74801151675314e-07, + "loss": 0.9395, + "step": 2756 + }, + { + "epoch": 3.503176620076239, + "grad_norm": 1.8774041278593898, + "learning_rate": 7.709046544437238e-07, + "loss": 1.3941, + "step": 2757 + }, + { + "epoch": 3.5044472681067345, + "grad_norm": 1.7228875613432229, + "learning_rate": 7.670175868892227e-07, + "loss": 1.1982, + "step": 2758 + }, + { + "epoch": 3.50571791613723, + "grad_norm": 1.9901388879104207, + "learning_rate": 7.63139952983356e-07, + "loss": 1.2779, + "step": 2759 + }, + { + "epoch": 3.5069885641677256, + "grad_norm": 1.856781653989378, + "learning_rate": 7.592717566880304e-07, + "loss": 1.4188, + "step": 2760 + }, + { + "epoch": 3.508259212198221, + "grad_norm": 1.7613715805684167, + "learning_rate": 7.554130019555161e-07, + "loss": 1.1357, + "step": 2761 + }, + { + "epoch": 3.5095298602287164, + "grad_norm": 1.7158166759805475, + "learning_rate": 7.515636927284309e-07, + "loss": 1.208, + "step": 2762 + }, + { + "epoch": 3.510800508259212, + "grad_norm": 1.591258186122319, + "learning_rate": 7.477238329397419e-07, + "loss": 0.8926, + "step": 2763 + }, + { + "epoch": 3.512071156289708, + "grad_norm": 1.7451266194100759, + "learning_rate": 7.43893426512764e-07, + "loss": 1.3221, + "step": 2764 + }, + { + "epoch": 3.5133418043202034, + "grad_norm": 1.7447778840735144, + "learning_rate": 7.400724773611545e-07, + "loss": 1.3805, + "step": 2765 + }, + { + "epoch": 3.5146124523506987, + "grad_norm": 1.6199393605082222, + "learning_rate": 7.362609893889028e-07, + "loss": 1.219, + "step": 2766 + }, + { + "epoch": 3.5158831003811946, + "grad_norm": 1.6402867048923917, + "learning_rate": 7.324589664903359e-07, + "loss": 1.3827, + "step": 2767 + }, + { + "epoch": 3.51715374841169, + "grad_norm": 1.5917563468080689, + "learning_rate": 7.286664125501064e-07, + "loss": 1.3233, + "step": 2768 + }, + { + "epoch": 3.5184243964421853, + "grad_norm": 1.6203147189496971, + "learning_rate": 7.248833314431958e-07, + "loss": 1.2932, + "step": 2769 + }, + { + "epoch": 3.519695044472681, + "grad_norm": 1.868463453498782, + "learning_rate": 7.211097270349065e-07, + "loss": 1.3553, + "step": 2770 + }, + { + "epoch": 3.5209656925031765, + "grad_norm": 1.5099116369166523, + "learning_rate": 7.173456031808568e-07, + "loss": 1.3424, + "step": 2771 + }, + { + "epoch": 3.5222363405336723, + "grad_norm": 1.8810330792980126, + "learning_rate": 7.135909637269745e-07, + "loss": 1.3142, + "step": 2772 + }, + { + "epoch": 3.5235069885641677, + "grad_norm": 1.7933284606688886, + "learning_rate": 7.098458125095064e-07, + "loss": 1.3502, + "step": 2773 + }, + { + "epoch": 3.5247776365946635, + "grad_norm": 1.4080364805760883, + "learning_rate": 7.061101533549952e-07, + "loss": 1.3269, + "step": 2774 + }, + { + "epoch": 3.526048284625159, + "grad_norm": 1.722577290621046, + "learning_rate": 7.023839900802931e-07, + "loss": 1.2447, + "step": 2775 + }, + { + "epoch": 3.527318932655654, + "grad_norm": 1.6793120859124784, + "learning_rate": 6.986673264925437e-07, + "loss": 1.2638, + "step": 2776 + }, + { + "epoch": 3.52858958068615, + "grad_norm": 1.5963888079729813, + "learning_rate": 6.949601663891891e-07, + "loss": 1.1815, + "step": 2777 + }, + { + "epoch": 3.5298602287166454, + "grad_norm": 1.5495246513860983, + "learning_rate": 6.912625135579587e-07, + "loss": 1.1055, + "step": 2778 + }, + { + "epoch": 3.5311308767471408, + "grad_norm": 1.99558693420346, + "learning_rate": 6.875743717768679e-07, + "loss": 1.2659, + "step": 2779 + }, + { + "epoch": 3.5324015247776366, + "grad_norm": 1.786030201932944, + "learning_rate": 6.838957448142136e-07, + "loss": 1.2315, + "step": 2780 + }, + { + "epoch": 3.5336721728081324, + "grad_norm": 1.5668698975548767, + "learning_rate": 6.802266364285782e-07, + "loss": 1.0094, + "step": 2781 + }, + { + "epoch": 3.5349428208386278, + "grad_norm": 1.6261749245699866, + "learning_rate": 6.765670503688093e-07, + "loss": 1.1143, + "step": 2782 + }, + { + "epoch": 3.536213468869123, + "grad_norm": 1.5394960346250208, + "learning_rate": 6.729169903740296e-07, + "loss": 1.3297, + "step": 2783 + }, + { + "epoch": 3.537484116899619, + "grad_norm": 1.8368062621220478, + "learning_rate": 6.692764601736268e-07, + "loss": 1.4588, + "step": 2784 + }, + { + "epoch": 3.5387547649301143, + "grad_norm": 1.6342571665890158, + "learning_rate": 6.656454634872556e-07, + "loss": 1.3219, + "step": 2785 + }, + { + "epoch": 3.5400254129606097, + "grad_norm": 1.6687677275629142, + "learning_rate": 6.62024004024825e-07, + "loss": 1.5602, + "step": 2786 + }, + { + "epoch": 3.5412960609911055, + "grad_norm": 1.7375449094773614, + "learning_rate": 6.58412085486505e-07, + "loss": 1.2297, + "step": 2787 + }, + { + "epoch": 3.542566709021601, + "grad_norm": 1.552416702318075, + "learning_rate": 6.548097115627106e-07, + "loss": 1.2901, + "step": 2788 + }, + { + "epoch": 3.5438373570520967, + "grad_norm": 1.9130146549727756, + "learning_rate": 6.512168859341117e-07, + "loss": 1.3471, + "step": 2789 + }, + { + "epoch": 3.545108005082592, + "grad_norm": 1.7147759293088716, + "learning_rate": 6.476336122716175e-07, + "loss": 1.205, + "step": 2790 + }, + { + "epoch": 3.546378653113088, + "grad_norm": 2.0546460990652347, + "learning_rate": 6.440598942363796e-07, + "loss": 1.3571, + "step": 2791 + }, + { + "epoch": 3.5476493011435832, + "grad_norm": 1.503927361267473, + "learning_rate": 6.404957354797825e-07, + "loss": 1.3311, + "step": 2792 + }, + { + "epoch": 3.5489199491740786, + "grad_norm": 1.737486376743467, + "learning_rate": 6.369411396434522e-07, + "loss": 1.0611, + "step": 2793 + }, + { + "epoch": 3.5501905972045744, + "grad_norm": 1.645776561560631, + "learning_rate": 6.333961103592379e-07, + "loss": 1.0911, + "step": 2794 + }, + { + "epoch": 3.55146124523507, + "grad_norm": 2.112524956117361, + "learning_rate": 6.298606512492134e-07, + "loss": 1.5111, + "step": 2795 + }, + { + "epoch": 3.5527318932655656, + "grad_norm": 1.4997925507175143, + "learning_rate": 6.263347659256758e-07, + "loss": 1.2825, + "step": 2796 + }, + { + "epoch": 3.554002541296061, + "grad_norm": 1.6710261427064792, + "learning_rate": 6.228184579911423e-07, + "loss": 1.1149, + "step": 2797 + }, + { + "epoch": 3.555273189326557, + "grad_norm": 1.621016823761684, + "learning_rate": 6.193117310383412e-07, + "loss": 1.296, + "step": 2798 + }, + { + "epoch": 3.556543837357052, + "grad_norm": 1.7308866298487022, + "learning_rate": 6.158145886502165e-07, + "loss": 1.1074, + "step": 2799 + }, + { + "epoch": 3.5578144853875475, + "grad_norm": 1.7267963162549822, + "learning_rate": 6.123270343999132e-07, + "loss": 1.2309, + "step": 2800 + }, + { + "epoch": 3.5590851334180433, + "grad_norm": 1.6143755437223648, + "learning_rate": 6.088490718507845e-07, + "loss": 1.2571, + "step": 2801 + }, + { + "epoch": 3.5603557814485387, + "grad_norm": 1.484465575202803, + "learning_rate": 6.053807045563808e-07, + "loss": 1.2447, + "step": 2802 + }, + { + "epoch": 3.561626429479034, + "grad_norm": 1.9146266955465872, + "learning_rate": 6.019219360604489e-07, + "loss": 1.3864, + "step": 2803 + }, + { + "epoch": 3.56289707750953, + "grad_norm": 1.7988196675952783, + "learning_rate": 5.984727698969306e-07, + "loss": 1.2575, + "step": 2804 + }, + { + "epoch": 3.5641677255400253, + "grad_norm": 1.7598875918220322, + "learning_rate": 5.950332095899547e-07, + "loss": 1.2743, + "step": 2805 + }, + { + "epoch": 3.565438373570521, + "grad_norm": 1.8552392774785547, + "learning_rate": 5.916032586538345e-07, + "loss": 1.2884, + "step": 2806 + }, + { + "epoch": 3.5667090216010164, + "grad_norm": 1.826409137549182, + "learning_rate": 5.881829205930678e-07, + "loss": 1.3555, + "step": 2807 + }, + { + "epoch": 3.5679796696315123, + "grad_norm": 1.5524145194881622, + "learning_rate": 5.847721989023258e-07, + "loss": 1.3055, + "step": 2808 + }, + { + "epoch": 3.5692503176620076, + "grad_norm": 1.7064428420759787, + "learning_rate": 5.81371097066461e-07, + "loss": 1.2404, + "step": 2809 + }, + { + "epoch": 3.570520965692503, + "grad_norm": 1.6450930111039546, + "learning_rate": 5.779796185604925e-07, + "loss": 1.3306, + "step": 2810 + }, + { + "epoch": 3.571791613722999, + "grad_norm": 1.762384800746856, + "learning_rate": 5.745977668496084e-07, + "loss": 1.4835, + "step": 2811 + }, + { + "epoch": 3.573062261753494, + "grad_norm": 2.0210484040370686, + "learning_rate": 5.71225545389158e-07, + "loss": 1.4861, + "step": 2812 + }, + { + "epoch": 3.57433290978399, + "grad_norm": 1.8646019100822542, + "learning_rate": 5.678629576246575e-07, + "loss": 1.359, + "step": 2813 + }, + { + "epoch": 3.5756035578144854, + "grad_norm": 1.9664729844151032, + "learning_rate": 5.64510006991772e-07, + "loss": 1.1857, + "step": 2814 + }, + { + "epoch": 3.576874205844981, + "grad_norm": 1.6227577074984558, + "learning_rate": 5.611666969163243e-07, + "loss": 1.1995, + "step": 2815 + }, + { + "epoch": 3.5781448538754765, + "grad_norm": 1.778946623070315, + "learning_rate": 5.578330308142887e-07, + "loss": 1.0837, + "step": 2816 + }, + { + "epoch": 3.579415501905972, + "grad_norm": 1.7167523857038194, + "learning_rate": 5.54509012091784e-07, + "loss": 1.2531, + "step": 2817 + }, + { + "epoch": 3.5806861499364677, + "grad_norm": 1.7830165224929857, + "learning_rate": 5.511946441450711e-07, + "loss": 1.3503, + "step": 2818 + }, + { + "epoch": 3.581956797966963, + "grad_norm": 1.566038817550546, + "learning_rate": 5.478899303605512e-07, + "loss": 1.1498, + "step": 2819 + }, + { + "epoch": 3.5832274459974585, + "grad_norm": 1.6522474906143725, + "learning_rate": 5.445948741147589e-07, + "loss": 1.3361, + "step": 2820 + }, + { + "epoch": 3.5844980940279543, + "grad_norm": 1.7629684303673683, + "learning_rate": 5.413094787743678e-07, + "loss": 0.9675, + "step": 2821 + }, + { + "epoch": 3.5857687420584496, + "grad_norm": 1.4674919742802814, + "learning_rate": 5.380337476961762e-07, + "loss": 1.417, + "step": 2822 + }, + { + "epoch": 3.5870393900889455, + "grad_norm": 1.8295725890516696, + "learning_rate": 5.347676842271088e-07, + "loss": 1.1349, + "step": 2823 + }, + { + "epoch": 3.588310038119441, + "grad_norm": 2.1103444012235903, + "learning_rate": 5.315112917042097e-07, + "loss": 1.3127, + "step": 2824 + }, + { + "epoch": 3.5895806861499366, + "grad_norm": 1.7780483291050404, + "learning_rate": 5.282645734546477e-07, + "loss": 1.3138, + "step": 2825 + }, + { + "epoch": 3.590851334180432, + "grad_norm": 1.8841007440618305, + "learning_rate": 5.250275327957033e-07, + "loss": 1.3983, + "step": 2826 + }, + { + "epoch": 3.5921219822109274, + "grad_norm": 1.646128930724859, + "learning_rate": 5.218001730347688e-07, + "loss": 1.0253, + "step": 2827 + }, + { + "epoch": 3.593392630241423, + "grad_norm": 2.135120725044294, + "learning_rate": 5.185824974693454e-07, + "loss": 1.4821, + "step": 2828 + }, + { + "epoch": 3.5946632782719186, + "grad_norm": 1.5744547156520436, + "learning_rate": 5.153745093870443e-07, + "loss": 1.3668, + "step": 2829 + }, + { + "epoch": 3.5959339263024144, + "grad_norm": 1.581711380685895, + "learning_rate": 5.121762120655727e-07, + "loss": 1.2427, + "step": 2830 + }, + { + "epoch": 3.5972045743329097, + "grad_norm": 1.5580465083953998, + "learning_rate": 5.089876087727364e-07, + "loss": 1.3276, + "step": 2831 + }, + { + "epoch": 3.5984752223634056, + "grad_norm": 1.5977517162237127, + "learning_rate": 5.058087027664404e-07, + "loss": 1.1969, + "step": 2832 + }, + { + "epoch": 3.599745870393901, + "grad_norm": 1.5332829739649898, + "learning_rate": 5.026394972946813e-07, + "loss": 1.2062, + "step": 2833 + }, + { + "epoch": 3.6010165184243963, + "grad_norm": 1.910884717881685, + "learning_rate": 4.994799955955409e-07, + "loss": 1.4812, + "step": 2834 + }, + { + "epoch": 3.602287166454892, + "grad_norm": 2.103982668673676, + "learning_rate": 4.963302008971904e-07, + "loss": 1.145, + "step": 2835 + }, + { + "epoch": 3.6035578144853875, + "grad_norm": 1.6329999265083088, + "learning_rate": 4.931901164178765e-07, + "loss": 1.4175, + "step": 2836 + }, + { + "epoch": 3.604828462515883, + "grad_norm": 1.994566922245568, + "learning_rate": 4.90059745365935e-07, + "loss": 1.4077, + "step": 2837 + }, + { + "epoch": 3.6060991105463787, + "grad_norm": 1.9225933837025715, + "learning_rate": 4.869390909397664e-07, + "loss": 1.3837, + "step": 2838 + }, + { + "epoch": 3.6073697585768745, + "grad_norm": 1.5585153656236626, + "learning_rate": 4.838281563278513e-07, + "loss": 1.1987, + "step": 2839 + }, + { + "epoch": 3.60864040660737, + "grad_norm": 1.8921769764839649, + "learning_rate": 4.807269447087348e-07, + "loss": 1.4019, + "step": 2840 + }, + { + "epoch": 3.609911054637865, + "grad_norm": 1.7808532225030733, + "learning_rate": 4.776354592510302e-07, + "loss": 1.3058, + "step": 2841 + }, + { + "epoch": 3.611181702668361, + "grad_norm": 2.787504736154616, + "learning_rate": 4.7455370311341174e-07, + "loss": 1.2682, + "step": 2842 + }, + { + "epoch": 3.6124523506988564, + "grad_norm": 1.5353303670722411, + "learning_rate": 4.71481679444612e-07, + "loss": 1.4039, + "step": 2843 + }, + { + "epoch": 3.6137229987293518, + "grad_norm": 1.8489732003472905, + "learning_rate": 4.684193913834212e-07, + "loss": 1.2498, + "step": 2844 + }, + { + "epoch": 3.6149936467598476, + "grad_norm": 1.846562588561736, + "learning_rate": 4.653668420586843e-07, + "loss": 1.3524, + "step": 2845 + }, + { + "epoch": 3.616264294790343, + "grad_norm": 1.7596402046071762, + "learning_rate": 4.623240345892932e-07, + "loss": 1.2591, + "step": 2846 + }, + { + "epoch": 3.6175349428208388, + "grad_norm": 1.6877790123769085, + "learning_rate": 4.592909720841843e-07, + "loss": 1.3654, + "step": 2847 + }, + { + "epoch": 3.618805590851334, + "grad_norm": 1.8048006323504517, + "learning_rate": 4.562676576423397e-07, + "loss": 1.3649, + "step": 2848 + }, + { + "epoch": 3.62007623888183, + "grad_norm": 1.7126379547511625, + "learning_rate": 4.53254094352783e-07, + "loss": 1.4378, + "step": 2849 + }, + { + "epoch": 3.6213468869123253, + "grad_norm": 1.856720805247342, + "learning_rate": 4.5025028529457225e-07, + "loss": 1.1204, + "step": 2850 + }, + { + "epoch": 3.6226175349428207, + "grad_norm": 1.637590427858388, + "learning_rate": 4.4725623353680246e-07, + "loss": 1.2142, + "step": 2851 + }, + { + "epoch": 3.6238881829733165, + "grad_norm": 1.6166541136566293, + "learning_rate": 4.4427194213859216e-07, + "loss": 1.2543, + "step": 2852 + }, + { + "epoch": 3.625158831003812, + "grad_norm": 1.8068453256972625, + "learning_rate": 4.4129741414909776e-07, + "loss": 1.4532, + "step": 2853 + }, + { + "epoch": 3.6264294790343072, + "grad_norm": 1.8418139517980867, + "learning_rate": 4.3833265260749157e-07, + "loss": 1.2231, + "step": 2854 + }, + { + "epoch": 3.627700127064803, + "grad_norm": 1.5087395562608672, + "learning_rate": 4.3537766054296935e-07, + "loss": 1.2651, + "step": 2855 + }, + { + "epoch": 3.628970775095299, + "grad_norm": 1.7131033116168544, + "learning_rate": 4.324324409747471e-07, + "loss": 1.2731, + "step": 2856 + }, + { + "epoch": 3.6302414231257942, + "grad_norm": 1.5717863519639734, + "learning_rate": 4.2949699691205547e-07, + "loss": 1.2464, + "step": 2857 + }, + { + "epoch": 3.6315120711562896, + "grad_norm": 1.7552470672585625, + "learning_rate": 4.2657133135413643e-07, + "loss": 1.3178, + "step": 2858 + }, + { + "epoch": 3.6327827191867854, + "grad_norm": 1.654582843775076, + "learning_rate": 4.2365544729023766e-07, + "loss": 1.155, + "step": 2859 + }, + { + "epoch": 3.634053367217281, + "grad_norm": 1.9057224665000165, + "learning_rate": 4.207493476996205e-07, + "loss": 1.3287, + "step": 2860 + }, + { + "epoch": 3.635324015247776, + "grad_norm": 1.8357969271756474, + "learning_rate": 4.178530355515409e-07, + "loss": 1.2494, + "step": 2861 + }, + { + "epoch": 3.636594663278272, + "grad_norm": 1.8333493950977857, + "learning_rate": 4.1496651380526164e-07, + "loss": 1.3367, + "step": 2862 + }, + { + "epoch": 3.6378653113087673, + "grad_norm": 1.6409649153529624, + "learning_rate": 4.1208978541003694e-07, + "loss": 1.3905, + "step": 2863 + }, + { + "epoch": 3.639135959339263, + "grad_norm": 1.6182993052070216, + "learning_rate": 4.092228533051157e-07, + "loss": 1.3968, + "step": 2864 + }, + { + "epoch": 3.6404066073697585, + "grad_norm": 2.0970039390292037, + "learning_rate": 4.063657204197424e-07, + "loss": 1.2981, + "step": 2865 + }, + { + "epoch": 3.6416772554002543, + "grad_norm": 1.778171798732873, + "learning_rate": 4.0351838967314427e-07, + "loss": 1.1876, + "step": 2866 + }, + { + "epoch": 3.6429479034307497, + "grad_norm": 2.0284644661241895, + "learning_rate": 4.0068086397453297e-07, + "loss": 1.5399, + "step": 2867 + }, + { + "epoch": 3.644218551461245, + "grad_norm": 1.8146083791558065, + "learning_rate": 3.97853146223105e-07, + "loss": 1.3671, + "step": 2868 + }, + { + "epoch": 3.645489199491741, + "grad_norm": 1.5427820417969758, + "learning_rate": 3.95035239308037e-07, + "loss": 1.1703, + "step": 2869 + }, + { + "epoch": 3.6467598475222363, + "grad_norm": 1.8578951301502669, + "learning_rate": 3.92227146108477e-07, + "loss": 1.3245, + "step": 2870 + }, + { + "epoch": 3.6480304955527316, + "grad_norm": 2.1604438011867773, + "learning_rate": 3.8942886949354777e-07, + "loss": 1.255, + "step": 2871 + }, + { + "epoch": 3.6493011435832274, + "grad_norm": 1.6022324609710354, + "learning_rate": 3.866404123223444e-07, + "loss": 1.2646, + "step": 2872 + }, + { + "epoch": 3.6505717916137232, + "grad_norm": 1.6355491971211404, + "learning_rate": 3.838617774439257e-07, + "loss": 1.3028, + "step": 2873 + }, + { + "epoch": 3.6518424396442186, + "grad_norm": 1.9864901590036022, + "learning_rate": 3.810929676973185e-07, + "loss": 1.2847, + "step": 2874 + }, + { + "epoch": 3.653113087674714, + "grad_norm": 1.7128003678677792, + "learning_rate": 3.783339859115065e-07, + "loss": 1.0734, + "step": 2875 + }, + { + "epoch": 3.65438373570521, + "grad_norm": 2.042760288386399, + "learning_rate": 3.7558483490543475e-07, + "loss": 1.3907, + "step": 2876 + }, + { + "epoch": 3.655654383735705, + "grad_norm": 1.64236759707824, + "learning_rate": 3.728455174880052e-07, + "loss": 1.2805, + "step": 2877 + }, + { + "epoch": 3.6569250317662005, + "grad_norm": 1.982255555801729, + "learning_rate": 3.7011603645806917e-07, + "loss": 1.4128, + "step": 2878 + }, + { + "epoch": 3.6581956797966964, + "grad_norm": 1.8216182340964904, + "learning_rate": 3.673963946044268e-07, + "loss": 1.4314, + "step": 2879 + }, + { + "epoch": 3.6594663278271917, + "grad_norm": 1.8666880653898337, + "learning_rate": 3.646865947058309e-07, + "loss": 1.4772, + "step": 2880 + }, + { + "epoch": 3.6607369758576875, + "grad_norm": 2.020424953693691, + "learning_rate": 3.619866395309757e-07, + "loss": 1.2956, + "step": 2881 + }, + { + "epoch": 3.662007623888183, + "grad_norm": 1.9641852338990382, + "learning_rate": 3.5929653183849444e-07, + "loss": 1.2681, + "step": 2882 + }, + { + "epoch": 3.6632782719186787, + "grad_norm": 1.959468794096944, + "learning_rate": 3.566162743769597e-07, + "loss": 1.2469, + "step": 2883 + }, + { + "epoch": 3.664548919949174, + "grad_norm": 1.8099735782469673, + "learning_rate": 3.53945869884883e-07, + "loss": 1.2767, + "step": 2884 + }, + { + "epoch": 3.6658195679796695, + "grad_norm": 1.5849198909026307, + "learning_rate": 3.51285321090703e-07, + "loss": 1.244, + "step": 2885 + }, + { + "epoch": 3.6670902160101653, + "grad_norm": 1.6530831894074411, + "learning_rate": 3.4863463071279636e-07, + "loss": 1.5761, + "step": 2886 + }, + { + "epoch": 3.6683608640406606, + "grad_norm": 1.7370571229861118, + "learning_rate": 3.45993801459461e-07, + "loss": 1.2814, + "step": 2887 + }, + { + "epoch": 3.6696315120711565, + "grad_norm": 1.8130340023816736, + "learning_rate": 3.4336283602891875e-07, + "loss": 1.2127, + "step": 2888 + }, + { + "epoch": 3.670902160101652, + "grad_norm": 1.851199820168119, + "learning_rate": 3.4074173710931804e-07, + "loss": 1.4793, + "step": 2889 + }, + { + "epoch": 3.6721728081321476, + "grad_norm": 1.5016518796267735, + "learning_rate": 3.381305073787211e-07, + "loss": 1.0988, + "step": 2890 + }, + { + "epoch": 3.673443456162643, + "grad_norm": 1.758242592174552, + "learning_rate": 3.355291495051127e-07, + "loss": 1.2777, + "step": 2891 + }, + { + "epoch": 3.6747141041931384, + "grad_norm": 2.016754925103769, + "learning_rate": 3.3293766614638457e-07, + "loss": 1.5474, + "step": 2892 + }, + { + "epoch": 3.675984752223634, + "grad_norm": 1.8421964175588048, + "learning_rate": 3.3035605995034524e-07, + "loss": 1.3418, + "step": 2893 + }, + { + "epoch": 3.6772554002541296, + "grad_norm": 1.9715525690894853, + "learning_rate": 3.277843335547071e-07, + "loss": 1.4832, + "step": 2894 + }, + { + "epoch": 3.678526048284625, + "grad_norm": 1.9264416532591508, + "learning_rate": 3.2522248958708814e-07, + "loss": 1.431, + "step": 2895 + }, + { + "epoch": 3.6797966963151207, + "grad_norm": 1.3478815832993138, + "learning_rate": 3.226705306650113e-07, + "loss": 1.1502, + "step": 2896 + }, + { + "epoch": 3.681067344345616, + "grad_norm": 1.6254549393829039, + "learning_rate": 3.201284593959009e-07, + "loss": 1.4309, + "step": 2897 + }, + { + "epoch": 3.682337992376112, + "grad_norm": 1.7209625236120987, + "learning_rate": 3.1759627837707475e-07, + "loss": 1.2548, + "step": 2898 + }, + { + "epoch": 3.6836086404066073, + "grad_norm": 1.719823940394226, + "learning_rate": 3.150739901957467e-07, + "loss": 1.1791, + "step": 2899 + }, + { + "epoch": 3.684879288437103, + "grad_norm": 1.7501731746528646, + "learning_rate": 3.1256159742902527e-07, + "loss": 1.112, + "step": 2900 + }, + { + "epoch": 3.6861499364675985, + "grad_norm": 1.818068130323241, + "learning_rate": 3.100591026439059e-07, + "loss": 1.2971, + "step": 2901 + }, + { + "epoch": 3.687420584498094, + "grad_norm": 1.704715465340639, + "learning_rate": 3.075665083972701e-07, + "loss": 1.1321, + "step": 2902 + }, + { + "epoch": 3.6886912325285897, + "grad_norm": 1.6476840934716483, + "learning_rate": 3.050838172358883e-07, + "loss": 1.1951, + "step": 2903 + }, + { + "epoch": 3.689961880559085, + "grad_norm": 1.7312533780583264, + "learning_rate": 3.0261103169640594e-07, + "loss": 1.2046, + "step": 2904 + }, + { + "epoch": 3.691232528589581, + "grad_norm": 1.5300711820989914, + "learning_rate": 3.0014815430535524e-07, + "loss": 1.3225, + "step": 2905 + }, + { + "epoch": 3.692503176620076, + "grad_norm": 1.7017019203015675, + "learning_rate": 2.9769518757913785e-07, + "loss": 1.4056, + "step": 2906 + }, + { + "epoch": 3.693773824650572, + "grad_norm": 2.021153403583211, + "learning_rate": 2.952521340240333e-07, + "loss": 1.1538, + "step": 2907 + }, + { + "epoch": 3.6950444726810674, + "grad_norm": 1.7738241525920886, + "learning_rate": 2.9281899613619047e-07, + "loss": 1.1375, + "step": 2908 + }, + { + "epoch": 3.6963151207115628, + "grad_norm": 1.7016217948023906, + "learning_rate": 2.9039577640163077e-07, + "loss": 1.2858, + "step": 2909 + }, + { + "epoch": 3.6975857687420586, + "grad_norm": 1.8183261497067524, + "learning_rate": 2.879824772962381e-07, + "loss": 1.3034, + "step": 2910 + }, + { + "epoch": 3.698856416772554, + "grad_norm": 1.6033159308319713, + "learning_rate": 2.8557910128575897e-07, + "loss": 1.3922, + "step": 2911 + }, + { + "epoch": 3.7001270648030493, + "grad_norm": 2.005423047013152, + "learning_rate": 2.8318565082580686e-07, + "loss": 1.3093, + "step": 2912 + }, + { + "epoch": 3.701397712833545, + "grad_norm": 1.6877843318389423, + "learning_rate": 2.8080212836185006e-07, + "loss": 1.2727, + "step": 2913 + }, + { + "epoch": 3.7026683608640405, + "grad_norm": 1.7753712002547122, + "learning_rate": 2.784285363292105e-07, + "loss": 1.1072, + "step": 2914 + }, + { + "epoch": 3.7039390088945363, + "grad_norm": 1.8499481759690708, + "learning_rate": 2.760648771530705e-07, + "loss": 1.4622, + "step": 2915 + }, + { + "epoch": 3.7052096569250317, + "grad_norm": 1.6641759613967153, + "learning_rate": 2.737111532484582e-07, + "loss": 1.359, + "step": 2916 + }, + { + "epoch": 3.7064803049555275, + "grad_norm": 1.8321493388148502, + "learning_rate": 2.7136736702025436e-07, + "loss": 0.8954, + "step": 2917 + }, + { + "epoch": 3.707750952986023, + "grad_norm": 1.785858099671127, + "learning_rate": 2.6903352086318336e-07, + "loss": 1.2825, + "step": 2918 + }, + { + "epoch": 3.7090216010165182, + "grad_norm": 1.82639553313416, + "learning_rate": 2.667096171618122e-07, + "loss": 1.2128, + "step": 2919 + }, + { + "epoch": 3.710292249047014, + "grad_norm": 1.5961260281125593, + "learning_rate": 2.6439565829055267e-07, + "loss": 1.0634, + "step": 2920 + }, + { + "epoch": 3.7115628970775094, + "grad_norm": 1.7764458700386587, + "learning_rate": 2.620916466136569e-07, + "loss": 1.1089, + "step": 2921 + }, + { + "epoch": 3.7128335451080052, + "grad_norm": 1.9957371319377588, + "learning_rate": 2.5979758448520854e-07, + "loss": 1.2149, + "step": 2922 + }, + { + "epoch": 3.7141041931385006, + "grad_norm": 1.7950216758943818, + "learning_rate": 2.57513474249127e-07, + "loss": 1.5193, + "step": 2923 + }, + { + "epoch": 3.7153748411689964, + "grad_norm": 1.9839075878351453, + "learning_rate": 2.552393182391677e-07, + "loss": 1.1559, + "step": 2924 + }, + { + "epoch": 3.716645489199492, + "grad_norm": 1.5837936980441778, + "learning_rate": 2.529751187789098e-07, + "loss": 1.2536, + "step": 2925 + }, + { + "epoch": 3.717916137229987, + "grad_norm": 1.7725334600507785, + "learning_rate": 2.507208781817638e-07, + "loss": 1.21, + "step": 2926 + }, + { + "epoch": 3.719186785260483, + "grad_norm": 1.8919214319444393, + "learning_rate": 2.4847659875096184e-07, + "loss": 1.1894, + "step": 2927 + }, + { + "epoch": 3.7204574332909783, + "grad_norm": 1.6578469740116388, + "learning_rate": 2.4624228277956077e-07, + "loss": 1.3188, + "step": 2928 + }, + { + "epoch": 3.7217280813214737, + "grad_norm": 1.7026605237047239, + "learning_rate": 2.4401793255043436e-07, + "loss": 1.3334, + "step": 2929 + }, + { + "epoch": 3.7229987293519695, + "grad_norm": 1.8933344589067407, + "learning_rate": 2.4180355033627925e-07, + "loss": 1.2346, + "step": 2930 + }, + { + "epoch": 3.7242693773824653, + "grad_norm": 1.6804946093757482, + "learning_rate": 2.395991383995999e-07, + "loss": 1.2925, + "step": 2931 + }, + { + "epoch": 3.7255400254129607, + "grad_norm": 1.7666045402959094, + "learning_rate": 2.3740469899272144e-07, + "loss": 1.0185, + "step": 2932 + }, + { + "epoch": 3.726810673443456, + "grad_norm": 2.3268439238309475, + "learning_rate": 2.3522023435777585e-07, + "loss": 1.5004, + "step": 2933 + }, + { + "epoch": 3.728081321473952, + "grad_norm": 1.8370176464784342, + "learning_rate": 2.3304574672670444e-07, + "loss": 1.3942, + "step": 2934 + }, + { + "epoch": 3.7293519695044473, + "grad_norm": 1.9368162512567109, + "learning_rate": 2.308812383212522e-07, + "loss": 1.4361, + "step": 2935 + }, + { + "epoch": 3.7306226175349426, + "grad_norm": 1.8444929993476247, + "learning_rate": 2.2872671135297342e-07, + "loss": 1.3625, + "step": 2936 + }, + { + "epoch": 3.7318932655654384, + "grad_norm": 1.8800705753592861, + "learning_rate": 2.265821680232172e-07, + "loss": 1.2644, + "step": 2937 + }, + { + "epoch": 3.733163913595934, + "grad_norm": 1.7966966962401014, + "learning_rate": 2.2444761052313857e-07, + "loss": 1.2963, + "step": 2938 + }, + { + "epoch": 3.7344345616264296, + "grad_norm": 1.5288572533419622, + "learning_rate": 2.2232304103368408e-07, + "loss": 1.4129, + "step": 2939 + }, + { + "epoch": 3.735705209656925, + "grad_norm": 1.778890367102727, + "learning_rate": 2.2020846172560062e-07, + "loss": 1.3096, + "step": 2940 + }, + { + "epoch": 3.736975857687421, + "grad_norm": 1.9359706302321475, + "learning_rate": 2.181038747594244e-07, + "loss": 1.3953, + "step": 2941 + }, + { + "epoch": 3.738246505717916, + "grad_norm": 1.7357617348857184, + "learning_rate": 2.160092822854809e-07, + "loss": 1.2162, + "step": 2942 + }, + { + "epoch": 3.7395171537484115, + "grad_norm": 1.64367042406681, + "learning_rate": 2.1392468644388598e-07, + "loss": 1.4227, + "step": 2943 + }, + { + "epoch": 3.7407878017789074, + "grad_norm": 1.4922126743992972, + "learning_rate": 2.1185008936454253e-07, + "loss": 1.2542, + "step": 2944 + }, + { + "epoch": 3.7420584498094027, + "grad_norm": 1.827408648948783, + "learning_rate": 2.0978549316713615e-07, + "loss": 1.3295, + "step": 2945 + }, + { + "epoch": 3.743329097839898, + "grad_norm": 1.6536944906486557, + "learning_rate": 2.0773089996113382e-07, + "loss": 1.1333, + "step": 2946 + }, + { + "epoch": 3.744599745870394, + "grad_norm": 1.6467853780164043, + "learning_rate": 2.0568631184578082e-07, + "loss": 1.2169, + "step": 2947 + }, + { + "epoch": 3.7458703939008897, + "grad_norm": 1.6666714776463187, + "learning_rate": 2.0365173091010382e-07, + "loss": 1.2184, + "step": 2948 + }, + { + "epoch": 3.747141041931385, + "grad_norm": 1.7900600111005363, + "learning_rate": 2.0162715923290333e-07, + "loss": 1.5177, + "step": 2949 + }, + { + "epoch": 3.7484116899618805, + "grad_norm": 1.8741298704629243, + "learning_rate": 1.996125988827502e-07, + "loss": 1.4674, + "step": 2950 + }, + { + "epoch": 3.7496823379923763, + "grad_norm": 1.6800530248687902, + "learning_rate": 1.9760805191798903e-07, + "loss": 1.2756, + "step": 2951 + }, + { + "epoch": 3.7509529860228716, + "grad_norm": 1.7106727901557028, + "learning_rate": 1.9561352038673264e-07, + "loss": 1.2821, + "step": 2952 + }, + { + "epoch": 3.752223634053367, + "grad_norm": 1.6267132760386054, + "learning_rate": 1.936290063268631e-07, + "loss": 1.2844, + "step": 2953 + }, + { + "epoch": 3.753494282083863, + "grad_norm": 1.631756064192363, + "learning_rate": 1.916545117660218e-07, + "loss": 1.3697, + "step": 2954 + }, + { + "epoch": 3.754764930114358, + "grad_norm": 1.9361139579297768, + "learning_rate": 1.8969003872161718e-07, + "loss": 1.2899, + "step": 2955 + }, + { + "epoch": 3.756035578144854, + "grad_norm": 1.8273051054382337, + "learning_rate": 1.8773558920082037e-07, + "loss": 1.3229, + "step": 2956 + }, + { + "epoch": 3.7573062261753494, + "grad_norm": 1.954200663567673, + "learning_rate": 1.8579116520055508e-07, + "loss": 1.4069, + "step": 2957 + }, + { + "epoch": 3.758576874205845, + "grad_norm": 1.6921839191320842, + "learning_rate": 1.8385676870750545e-07, + "loss": 1.3057, + "step": 2958 + }, + { + "epoch": 3.7598475222363406, + "grad_norm": 1.780772890095342, + "learning_rate": 1.8193240169810943e-07, + "loss": 1.0153, + "step": 2959 + }, + { + "epoch": 3.761118170266836, + "grad_norm": 1.9333807192242591, + "learning_rate": 1.8001806613855642e-07, + "loss": 1.3781, + "step": 2960 + }, + { + "epoch": 3.7623888182973317, + "grad_norm": 2.022433044933993, + "learning_rate": 1.7811376398479075e-07, + "loss": 1.3182, + "step": 2961 + }, + { + "epoch": 3.763659466327827, + "grad_norm": 1.612073428189375, + "learning_rate": 1.762194971824993e-07, + "loss": 1.1196, + "step": 2962 + }, + { + "epoch": 3.7649301143583225, + "grad_norm": 1.9915334072940494, + "learning_rate": 1.7433526766711727e-07, + "loss": 1.3878, + "step": 2963 + }, + { + "epoch": 3.7662007623888183, + "grad_norm": 1.7732026497255406, + "learning_rate": 1.72461077363828e-07, + "loss": 1.3563, + "step": 2964 + }, + { + "epoch": 3.767471410419314, + "grad_norm": 1.7834823656515153, + "learning_rate": 1.7059692818755414e-07, + "loss": 1.1275, + "step": 2965 + }, + { + "epoch": 3.7687420584498095, + "grad_norm": 1.897016759777693, + "learning_rate": 1.6874282204295765e-07, + "loss": 1.0414, + "step": 2966 + }, + { + "epoch": 3.770012706480305, + "grad_norm": 2.0080223575450806, + "learning_rate": 1.6689876082444323e-07, + "loss": 1.286, + "step": 2967 + }, + { + "epoch": 3.7712833545108007, + "grad_norm": 1.482766063008824, + "learning_rate": 1.6506474641614923e-07, + "loss": 1.2941, + "step": 2968 + }, + { + "epoch": 3.772554002541296, + "grad_norm": 1.7159381033338508, + "learning_rate": 1.6324078069195005e-07, + "loss": 1.1676, + "step": 2969 + }, + { + "epoch": 3.7738246505717914, + "grad_norm": 2.0441924176493522, + "learning_rate": 1.6142686551545385e-07, + "loss": 1.3063, + "step": 2970 + }, + { + "epoch": 3.775095298602287, + "grad_norm": 1.7507259640438184, + "learning_rate": 1.5962300273999586e-07, + "loss": 1.3503, + "step": 2971 + }, + { + "epoch": 3.7763659466327826, + "grad_norm": 1.6456676972136066, + "learning_rate": 1.5782919420864628e-07, + "loss": 1.1359, + "step": 2972 + }, + { + "epoch": 3.7776365946632784, + "grad_norm": 1.6442053485039345, + "learning_rate": 1.5604544175419901e-07, + "loss": 1.2885, + "step": 2973 + }, + { + "epoch": 3.7789072426937738, + "grad_norm": 1.5890118382331657, + "learning_rate": 1.542717471991728e-07, + "loss": 1.3009, + "step": 2974 + }, + { + "epoch": 3.7801778907242696, + "grad_norm": 1.7493121867933188, + "learning_rate": 1.5250811235581142e-07, + "loss": 1.1525, + "step": 2975 + }, + { + "epoch": 3.781448538754765, + "grad_norm": 1.65820116434337, + "learning_rate": 1.5075453902608117e-07, + "loss": 1.0238, + "step": 2976 + }, + { + "epoch": 3.7827191867852603, + "grad_norm": 1.8538014168797075, + "learning_rate": 1.4901102900166554e-07, + "loss": 1.1603, + "step": 2977 + }, + { + "epoch": 3.783989834815756, + "grad_norm": 1.7397589588999856, + "learning_rate": 1.472775840639673e-07, + "loss": 1.211, + "step": 2978 + }, + { + "epoch": 3.7852604828462515, + "grad_norm": 1.7225410662035296, + "learning_rate": 1.4555420598410642e-07, + "loss": 1.1997, + "step": 2979 + }, + { + "epoch": 3.786531130876747, + "grad_norm": 1.810570089026905, + "learning_rate": 1.4384089652291544e-07, + "loss": 1.3315, + "step": 2980 + }, + { + "epoch": 3.7878017789072427, + "grad_norm": 1.8889965513400706, + "learning_rate": 1.4213765743094077e-07, + "loss": 1.3262, + "step": 2981 + }, + { + "epoch": 3.7890724269377385, + "grad_norm": 1.5660552596803883, + "learning_rate": 1.4044449044843921e-07, + "loss": 1.2192, + "step": 2982 + }, + { + "epoch": 3.790343074968234, + "grad_norm": 1.746991509973848, + "learning_rate": 1.3876139730537475e-07, + "loss": 1.2917, + "step": 2983 + }, + { + "epoch": 3.7916137229987292, + "grad_norm": 1.8285601882985356, + "learning_rate": 1.3708837972142176e-07, + "loss": 1.2735, + "step": 2984 + }, + { + "epoch": 3.792884371029225, + "grad_norm": 1.5513317560129432, + "learning_rate": 1.3542543940595953e-07, + "loss": 1.3793, + "step": 2985 + }, + { + "epoch": 3.7941550190597204, + "grad_norm": 1.447340929091766, + "learning_rate": 1.3377257805806786e-07, + "loss": 1.076, + "step": 2986 + }, + { + "epoch": 3.795425667090216, + "grad_norm": 1.6517394517306694, + "learning_rate": 1.3212979736653142e-07, + "loss": 1.2815, + "step": 2987 + }, + { + "epoch": 3.7966963151207116, + "grad_norm": 1.5754085331155292, + "learning_rate": 1.3049709900983643e-07, + "loss": 1.3616, + "step": 2988 + }, + { + "epoch": 3.797966963151207, + "grad_norm": 1.8740750673950757, + "learning_rate": 1.2887448465616292e-07, + "loss": 1.2242, + "step": 2989 + }, + { + "epoch": 3.799237611181703, + "grad_norm": 1.8367249089502404, + "learning_rate": 1.272619559633914e-07, + "loss": 1.1385, + "step": 2990 + }, + { + "epoch": 3.800508259212198, + "grad_norm": 1.8082582017934608, + "learning_rate": 1.256595145790973e-07, + "loss": 1.4317, + "step": 2991 + }, + { + "epoch": 3.801778907242694, + "grad_norm": 1.9061150195764778, + "learning_rate": 1.2406716214054982e-07, + "loss": 1.2792, + "step": 2992 + }, + { + "epoch": 3.8030495552731893, + "grad_norm": 1.8110974893118241, + "learning_rate": 1.2248490027470748e-07, + "loss": 1.0067, + "step": 2993 + }, + { + "epoch": 3.8043202033036847, + "grad_norm": 1.7372482940152012, + "learning_rate": 1.209127305982205e-07, + "loss": 1.3623, + "step": 2994 + }, + { + "epoch": 3.8055908513341805, + "grad_norm": 1.696128262457204, + "learning_rate": 1.1935065471742612e-07, + "loss": 1.0419, + "step": 2995 + }, + { + "epoch": 3.806861499364676, + "grad_norm": 1.7845542976410695, + "learning_rate": 1.1779867422835323e-07, + "loss": 1.3506, + "step": 2996 + }, + { + "epoch": 3.8081321473951717, + "grad_norm": 1.7022176451932132, + "learning_rate": 1.1625679071671114e-07, + "loss": 1.3518, + "step": 2997 + }, + { + "epoch": 3.809402795425667, + "grad_norm": 1.6165430147381685, + "learning_rate": 1.1472500575789302e-07, + "loss": 1.2496, + "step": 2998 + }, + { + "epoch": 3.810673443456163, + "grad_norm": 1.7143957115232276, + "learning_rate": 1.1320332091697473e-07, + "loss": 1.3813, + "step": 2999 + }, + { + "epoch": 3.8119440914866582, + "grad_norm": 1.7730705702795937, + "learning_rate": 1.1169173774871478e-07, + "loss": 1.374, + "step": 3000 + }, + { + "epoch": 3.8132147395171536, + "grad_norm": 1.6553721276053135, + "learning_rate": 1.1019025779754666e-07, + "loss": 1.1461, + "step": 3001 + }, + { + "epoch": 3.8144853875476494, + "grad_norm": 1.6235163337614682, + "learning_rate": 1.0869888259758543e-07, + "loss": 1.2913, + "step": 3002 + }, + { + "epoch": 3.815756035578145, + "grad_norm": 1.7194122264096625, + "learning_rate": 1.0721761367261662e-07, + "loss": 1.3193, + "step": 3003 + }, + { + "epoch": 3.81702668360864, + "grad_norm": 1.8003799827594853, + "learning_rate": 1.0574645253610405e-07, + "loss": 1.32, + "step": 3004 + }, + { + "epoch": 3.818297331639136, + "grad_norm": 2.029674106133309, + "learning_rate": 1.0428540069118199e-07, + "loss": 1.1933, + "step": 3005 + }, + { + "epoch": 3.8195679796696314, + "grad_norm": 1.8422067171601757, + "learning_rate": 1.028344596306552e-07, + "loss": 1.1397, + "step": 3006 + }, + { + "epoch": 3.820838627700127, + "grad_norm": 1.6832468467920436, + "learning_rate": 1.0139363083700116e-07, + "loss": 1.3426, + "step": 3007 + }, + { + "epoch": 3.8221092757306225, + "grad_norm": 1.956043652707399, + "learning_rate": 9.996291578236228e-08, + "loss": 1.348, + "step": 3008 + }, + { + "epoch": 3.8233799237611183, + "grad_norm": 1.6432019123161303, + "learning_rate": 9.854231592854702e-08, + "loss": 1.3335, + "step": 3009 + }, + { + "epoch": 3.8246505717916137, + "grad_norm": 1.8269339963735867, + "learning_rate": 9.713183272703208e-08, + "loss": 1.1764, + "step": 3010 + }, + { + "epoch": 3.825921219822109, + "grad_norm": 1.6720143501255003, + "learning_rate": 9.573146761895358e-08, + "loss": 1.2134, + "step": 3011 + }, + { + "epoch": 3.827191867852605, + "grad_norm": 1.5742217245088348, + "learning_rate": 9.434122203511253e-08, + "loss": 1.1075, + "step": 3012 + }, + { + "epoch": 3.8284625158831003, + "grad_norm": 1.9518318186239076, + "learning_rate": 9.296109739597047e-08, + "loss": 1.2332, + "step": 3013 + }, + { + "epoch": 3.829733163913596, + "grad_norm": 1.6623111274270994, + "learning_rate": 9.15910951116461e-08, + "loss": 1.0146, + "step": 3014 + }, + { + "epoch": 3.8310038119440915, + "grad_norm": 1.84924252006562, + "learning_rate": 9.023121658191636e-08, + "loss": 1.3765, + "step": 3015 + }, + { + "epoch": 3.8322744599745873, + "grad_norm": 1.6623671762455492, + "learning_rate": 8.888146319621538e-08, + "loss": 1.234, + "step": 3016 + }, + { + "epoch": 3.8335451080050826, + "grad_norm": 1.6942240744570565, + "learning_rate": 8.754183633363334e-08, + "loss": 1.2557, + "step": 3017 + }, + { + "epoch": 3.834815756035578, + "grad_norm": 1.7066974151328211, + "learning_rate": 8.621233736290868e-08, + "loss": 1.2491, + "step": 3018 + }, + { + "epoch": 3.836086404066074, + "grad_norm": 1.6533387949571765, + "learning_rate": 8.489296764243704e-08, + "loss": 1.235, + "step": 3019 + }, + { + "epoch": 3.837357052096569, + "grad_norm": 1.7665935151869387, + "learning_rate": 8.358372852026342e-08, + "loss": 1.4332, + "step": 3020 + }, + { + "epoch": 3.8386277001270646, + "grad_norm": 1.6998377266516242, + "learning_rate": 8.228462133408111e-08, + "loss": 1.3522, + "step": 3021 + }, + { + "epoch": 3.8398983481575604, + "grad_norm": 1.4689041575981403, + "learning_rate": 8.099564741123167e-08, + "loss": 1.4345, + "step": 3022 + }, + { + "epoch": 3.841168996188056, + "grad_norm": 1.593025880943212, + "learning_rate": 7.971680806870163e-08, + "loss": 1.3203, + "step": 3023 + }, + { + "epoch": 3.8424396442185516, + "grad_norm": 1.704653882953453, + "learning_rate": 7.84481046131258e-08, + "loss": 1.2147, + "step": 3024 + }, + { + "epoch": 3.843710292249047, + "grad_norm": 2.071019884817444, + "learning_rate": 7.718953834078058e-08, + "loss": 1.4055, + "step": 3025 + }, + { + "epoch": 3.8449809402795427, + "grad_norm": 1.8372347907743876, + "learning_rate": 7.594111053758624e-08, + "loss": 1.5519, + "step": 3026 + }, + { + "epoch": 3.846251588310038, + "grad_norm": 1.682413532760154, + "learning_rate": 7.470282247910132e-08, + "loss": 1.3645, + "step": 3027 + }, + { + "epoch": 3.8475222363405335, + "grad_norm": 1.7578519280129183, + "learning_rate": 7.347467543052932e-08, + "loss": 1.5367, + "step": 3028 + }, + { + "epoch": 3.8487928843710293, + "grad_norm": 1.6770289020194398, + "learning_rate": 7.225667064670761e-08, + "loss": 1.2428, + "step": 3029 + }, + { + "epoch": 3.8500635324015247, + "grad_norm": 1.7272876491470486, + "learning_rate": 7.104880937211178e-08, + "loss": 1.2414, + "step": 3030 + }, + { + "epoch": 3.8513341804320205, + "grad_norm": 1.5814762997311047, + "learning_rate": 6.985109284085578e-08, + "loss": 1.2969, + "step": 3031 + }, + { + "epoch": 3.852604828462516, + "grad_norm": 1.5723974149649933, + "learning_rate": 6.866352227668626e-08, + "loss": 1.2302, + "step": 3032 + }, + { + "epoch": 3.8538754764930117, + "grad_norm": 1.968228242890551, + "learning_rate": 6.748609889298596e-08, + "loss": 1.2859, + "step": 3033 + }, + { + "epoch": 3.855146124523507, + "grad_norm": 1.5146300885257318, + "learning_rate": 6.631882389276478e-08, + "loss": 1.1577, + "step": 3034 + }, + { + "epoch": 3.8564167725540024, + "grad_norm": 2.0136855015637236, + "learning_rate": 6.51616984686676e-08, + "loss": 1.2817, + "step": 3035 + }, + { + "epoch": 3.857687420584498, + "grad_norm": 1.6774381577901225, + "learning_rate": 6.401472380297091e-08, + "loss": 1.2733, + "step": 3036 + }, + { + "epoch": 3.8589580686149936, + "grad_norm": 1.7053505051775177, + "learning_rate": 6.287790106757396e-08, + "loss": 1.2441, + "step": 3037 + }, + { + "epoch": 3.860228716645489, + "grad_norm": 1.7911463890045214, + "learning_rate": 6.175123142400986e-08, + "loss": 1.385, + "step": 3038 + }, + { + "epoch": 3.8614993646759848, + "grad_norm": 1.7537350290360554, + "learning_rate": 6.063471602343219e-08, + "loss": 1.2987, + "step": 3039 + }, + { + "epoch": 3.8627700127064806, + "grad_norm": 1.643927395956265, + "learning_rate": 5.952835600662288e-08, + "loss": 1.2382, + "step": 3040 + }, + { + "epoch": 3.864040660736976, + "grad_norm": 2.123215412392022, + "learning_rate": 5.843215250398882e-08, + "loss": 1.3108, + "step": 3041 + }, + { + "epoch": 3.8653113087674713, + "grad_norm": 1.8836046079215185, + "learning_rate": 5.7346106635556286e-08, + "loss": 1.2853, + "step": 3042 + }, + { + "epoch": 3.866581956797967, + "grad_norm": 1.7277767829316242, + "learning_rate": 5.6270219510975445e-08, + "loss": 1.1018, + "step": 3043 + }, + { + "epoch": 3.8678526048284625, + "grad_norm": 1.787729576761809, + "learning_rate": 5.5204492229515846e-08, + "loss": 1.2945, + "step": 3044 + }, + { + "epoch": 3.869123252858958, + "grad_norm": 1.8200958656037172, + "learning_rate": 5.4148925880068705e-08, + "loss": 1.3888, + "step": 3045 + }, + { + "epoch": 3.8703939008894537, + "grad_norm": 1.7378636533919039, + "learning_rate": 5.310352154113907e-08, + "loss": 1.1941, + "step": 3046 + }, + { + "epoch": 3.871664548919949, + "grad_norm": 1.8610609441033346, + "learning_rate": 5.206828028085364e-08, + "loss": 1.2257, + "step": 3047 + }, + { + "epoch": 3.872935196950445, + "grad_norm": 1.8081119313521299, + "learning_rate": 5.104320315695188e-08, + "loss": 1.3963, + "step": 3048 + }, + { + "epoch": 3.8742058449809402, + "grad_norm": 1.59550034274709, + "learning_rate": 5.002829121679154e-08, + "loss": 1.2953, + "step": 3049 + }, + { + "epoch": 3.875476493011436, + "grad_norm": 1.6764129655232993, + "learning_rate": 4.902354549733979e-08, + "loss": 1.2611, + "step": 3050 + }, + { + "epoch": 3.8767471410419314, + "grad_norm": 1.632876323628899, + "learning_rate": 4.8028967025181005e-08, + "loss": 1.4017, + "step": 3051 + }, + { + "epoch": 3.878017789072427, + "grad_norm": 1.794162893632187, + "learning_rate": 4.704455681650788e-08, + "loss": 1.2708, + "step": 3052 + }, + { + "epoch": 3.8792884371029226, + "grad_norm": 1.8126195614094525, + "learning_rate": 4.607031587712696e-08, + "loss": 1.2623, + "step": 3053 + }, + { + "epoch": 3.880559085133418, + "grad_norm": 1.8230091802966468, + "learning_rate": 4.5106245202453106e-08, + "loss": 1.3042, + "step": 3054 + }, + { + "epoch": 3.8818297331639133, + "grad_norm": 1.682422086282267, + "learning_rate": 4.4152345777507263e-08, + "loss": 1.3326, + "step": 3055 + }, + { + "epoch": 3.883100381194409, + "grad_norm": 1.6824514684501524, + "learning_rate": 4.320861857692316e-08, + "loss": 1.1603, + "step": 3056 + }, + { + "epoch": 3.884371029224905, + "grad_norm": 1.6110806549573322, + "learning_rate": 4.227506456493835e-08, + "loss": 1.2895, + "step": 3057 + }, + { + "epoch": 3.8856416772554003, + "grad_norm": 1.7676659860940012, + "learning_rate": 4.13516846953943e-08, + "loss": 1.3506, + "step": 3058 + }, + { + "epoch": 3.8869123252858957, + "grad_norm": 1.767369964062659, + "learning_rate": 4.043847991174188e-08, + "loss": 1.2727, + "step": 3059 + }, + { + "epoch": 3.8881829733163915, + "grad_norm": 1.5053172893324513, + "learning_rate": 3.953545114703139e-08, + "loss": 1.3379, + "step": 3060 + }, + { + "epoch": 3.889453621346887, + "grad_norm": 1.9212747322334562, + "learning_rate": 3.864259932391923e-08, + "loss": 1.4492, + "step": 3061 + }, + { + "epoch": 3.8907242693773822, + "grad_norm": 1.6797433050046398, + "learning_rate": 3.775992535466011e-08, + "loss": 1.1713, + "step": 3062 + }, + { + "epoch": 3.891994917407878, + "grad_norm": 1.5690469601995618, + "learning_rate": 3.688743014111262e-08, + "loss": 1.254, + "step": 3063 + }, + { + "epoch": 3.8932655654383734, + "grad_norm": 1.672822569238482, + "learning_rate": 3.602511457473479e-08, + "loss": 1.0805, + "step": 3064 + }, + { + "epoch": 3.8945362134688692, + "grad_norm": 1.8613409795594205, + "learning_rate": 3.517297953658405e-08, + "loss": 1.2872, + "step": 3065 + }, + { + "epoch": 3.8958068614993646, + "grad_norm": 1.8887233528686516, + "learning_rate": 3.4331025897313964e-08, + "loss": 1.351, + "step": 3066 + }, + { + "epoch": 3.8970775095298604, + "grad_norm": 1.7636899325155904, + "learning_rate": 3.34992545171775e-08, + "loss": 0.9495, + "step": 3067 + }, + { + "epoch": 3.898348157560356, + "grad_norm": 1.638791874209901, + "learning_rate": 3.267766624602375e-08, + "loss": 0.9402, + "step": 3068 + }, + { + "epoch": 3.899618805590851, + "grad_norm": 1.9138239371342511, + "learning_rate": 3.186626192329678e-08, + "loss": 1.4469, + "step": 3069 + }, + { + "epoch": 3.900889453621347, + "grad_norm": 1.6216518038849317, + "learning_rate": 3.106504237803454e-08, + "loss": 1.1445, + "step": 3070 + }, + { + "epoch": 3.9021601016518423, + "grad_norm": 1.6910513589662268, + "learning_rate": 3.027400842887218e-08, + "loss": 1.3907, + "step": 3071 + }, + { + "epoch": 3.9034307496823377, + "grad_norm": 1.6053862098839753, + "learning_rate": 2.9493160884035422e-08, + "loss": 1.2571, + "step": 3072 + }, + { + "epoch": 3.9047013977128335, + "grad_norm": 1.6766860443136364, + "learning_rate": 2.8722500541340515e-08, + "loss": 1.2078, + "step": 3073 + }, + { + "epoch": 3.9059720457433293, + "grad_norm": 1.700314508038029, + "learning_rate": 2.796202818819871e-08, + "loss": 1.2652, + "step": 3074 + }, + { + "epoch": 3.9072426937738247, + "grad_norm": 1.7644853472221245, + "learning_rate": 2.721174460160958e-08, + "loss": 1.5189, + "step": 3075 + }, + { + "epoch": 3.90851334180432, + "grad_norm": 1.587777721341335, + "learning_rate": 2.6471650548163253e-08, + "loss": 1.2959, + "step": 3076 + }, + { + "epoch": 3.909783989834816, + "grad_norm": 1.6559481427920208, + "learning_rate": 2.574174678403818e-08, + "loss": 1.2242, + "step": 3077 + }, + { + "epoch": 3.9110546378653113, + "grad_norm": 1.650624585828068, + "learning_rate": 2.5022034055003363e-08, + "loss": 1.1872, + "step": 3078 + }, + { + "epoch": 3.9123252858958066, + "grad_norm": 1.6626045471929531, + "learning_rate": 2.4312513096410585e-08, + "loss": 1.1773, + "step": 3079 + }, + { + "epoch": 3.9135959339263025, + "grad_norm": 1.6733515413223996, + "learning_rate": 2.361318463320439e-08, + "loss": 1.102, + "step": 3080 + }, + { + "epoch": 3.914866581956798, + "grad_norm": 2.1408182193354253, + "learning_rate": 2.2924049379909884e-08, + "loss": 1.4914, + "step": 3081 + }, + { + "epoch": 3.9161372299872936, + "grad_norm": 1.8681977538957117, + "learning_rate": 2.2245108040640505e-08, + "loss": 1.2554, + "step": 3082 + }, + { + "epoch": 3.917407878017789, + "grad_norm": 1.9529352454215083, + "learning_rate": 2.1576361309093575e-08, + "loss": 1.4007, + "step": 3083 + }, + { + "epoch": 3.918678526048285, + "grad_norm": 1.631295188914433, + "learning_rate": 2.09178098685503e-08, + "loss": 1.3517, + "step": 3084 + }, + { + "epoch": 3.91994917407878, + "grad_norm": 1.709811335555246, + "learning_rate": 2.0269454391874665e-08, + "loss": 1.3244, + "step": 3085 + }, + { + "epoch": 3.9212198221092756, + "grad_norm": 1.7447655051383881, + "learning_rate": 1.963129554151344e-08, + "loss": 1.3004, + "step": 3086 + }, + { + "epoch": 3.9224904701397714, + "grad_norm": 1.7599315058217841, + "learning_rate": 1.9003333969493942e-08, + "loss": 1.4566, + "step": 3087 + }, + { + "epoch": 3.9237611181702667, + "grad_norm": 1.743197743728091, + "learning_rate": 1.8385570317427382e-08, + "loss": 1.2948, + "step": 3088 + }, + { + "epoch": 3.9250317662007626, + "grad_norm": 1.7696025370080466, + "learning_rate": 1.777800521650219e-08, + "loss": 1.5456, + "step": 3089 + }, + { + "epoch": 3.926302414231258, + "grad_norm": 1.7468091292083294, + "learning_rate": 1.7180639287488476e-08, + "loss": 1.532, + "step": 3090 + }, + { + "epoch": 3.9275730622617537, + "grad_norm": 1.6879458253212902, + "learning_rate": 1.6593473140734673e-08, + "loss": 1.346, + "step": 3091 + }, + { + "epoch": 3.928843710292249, + "grad_norm": 1.855691619109339, + "learning_rate": 1.6016507376169776e-08, + "loss": 1.1771, + "step": 3092 + }, + { + "epoch": 3.9301143583227445, + "grad_norm": 1.7090759637140573, + "learning_rate": 1.544974258329668e-08, + "loss": 1.2063, + "step": 3093 + }, + { + "epoch": 3.9313850063532403, + "grad_norm": 1.6552561885273969, + "learning_rate": 1.4893179341199936e-08, + "loss": 1.2813, + "step": 3094 + }, + { + "epoch": 3.9326556543837357, + "grad_norm": 1.4363593756042043, + "learning_rate": 1.4346818218539115e-08, + "loss": 1.145, + "step": 3095 + }, + { + "epoch": 3.933926302414231, + "grad_norm": 1.9013228076996314, + "learning_rate": 1.3810659773547675e-08, + "loss": 1.329, + "step": 3096 + }, + { + "epoch": 3.935196950444727, + "grad_norm": 1.8939443423681859, + "learning_rate": 1.328470455403963e-08, + "loss": 1.0988, + "step": 3097 + }, + { + "epoch": 3.936467598475222, + "grad_norm": 1.6738124704075703, + "learning_rate": 1.276895309739845e-08, + "loss": 1.512, + "step": 3098 + }, + { + "epoch": 3.937738246505718, + "grad_norm": 1.6652367861274973, + "learning_rate": 1.2263405930585947e-08, + "loss": 1.2814, + "step": 3099 + }, + { + "epoch": 3.9390088945362134, + "grad_norm": 1.6795036473859142, + "learning_rate": 1.1768063570136712e-08, + "loss": 1.3776, + "step": 3100 + }, + { + "epoch": 3.940279542566709, + "grad_norm": 1.7541917603843578, + "learning_rate": 1.1282926522158121e-08, + "loss": 1.3044, + "step": 3101 + }, + { + "epoch": 3.9415501905972046, + "grad_norm": 1.771550776590971, + "learning_rate": 1.0807995282332562e-08, + "loss": 1.3544, + "step": 3102 + }, + { + "epoch": 3.9428208386277, + "grad_norm": 1.7367784516596882, + "learning_rate": 1.034327033591076e-08, + "loss": 1.4199, + "step": 3103 + }, + { + "epoch": 3.9440914866581958, + "grad_norm": 1.9025994701869688, + "learning_rate": 9.888752157719562e-09, + "loss": 1.2801, + "step": 3104 + }, + { + "epoch": 3.945362134688691, + "grad_norm": 1.7790550525516902, + "learning_rate": 9.444441212155264e-09, + "loss": 1.3545, + "step": 3105 + }, + { + "epoch": 3.946632782719187, + "grad_norm": 1.6489191377760344, + "learning_rate": 9.010337953185843e-09, + "loss": 1.3388, + "step": 3106 + }, + { + "epoch": 3.9479034307496823, + "grad_norm": 1.6857591547682904, + "learning_rate": 8.586442824347618e-09, + "loss": 1.2036, + "step": 3107 + }, + { + "epoch": 3.949174078780178, + "grad_norm": 1.7276376614276525, + "learning_rate": 8.172756258748582e-09, + "loss": 1.2178, + "step": 3108 + }, + { + "epoch": 3.9504447268106735, + "grad_norm": 1.8403311129365838, + "learning_rate": 7.769278679068404e-09, + "loss": 1.2601, + "step": 3109 + }, + { + "epoch": 3.951715374841169, + "grad_norm": 1.8610845240663354, + "learning_rate": 7.3760104975517665e-09, + "loss": 1.1905, + "step": 3110 + }, + { + "epoch": 3.9529860228716647, + "grad_norm": 1.7683248465418167, + "learning_rate": 6.992952116013918e-09, + "loss": 1.1733, + "step": 3111 + }, + { + "epoch": 3.95425667090216, + "grad_norm": 1.978842698024534, + "learning_rate": 6.620103925840671e-09, + "loss": 1.2842, + "step": 3112 + }, + { + "epoch": 3.9555273189326554, + "grad_norm": 1.8693913246007345, + "learning_rate": 6.257466307980631e-09, + "loss": 1.2822, + "step": 3113 + }, + { + "epoch": 3.9567979669631512, + "grad_norm": 1.8968847407535872, + "learning_rate": 5.905039632954079e-09, + "loss": 1.231, + "step": 3114 + }, + { + "epoch": 3.9580686149936466, + "grad_norm": 1.8537024086868568, + "learning_rate": 5.562824260848532e-09, + "loss": 1.3544, + "step": 3115 + }, + { + "epoch": 3.9593392630241424, + "grad_norm": 1.4458633275440091, + "learning_rate": 5.230820541314296e-09, + "loss": 1.3089, + "step": 3116 + }, + { + "epoch": 3.9606099110546378, + "grad_norm": 1.9149880517081121, + "learning_rate": 4.909028813573358e-09, + "loss": 1.3764, + "step": 3117 + }, + { + "epoch": 3.9618805590851336, + "grad_norm": 2.0025213342886645, + "learning_rate": 4.597449406409382e-09, + "loss": 1.2419, + "step": 3118 + }, + { + "epoch": 3.963151207115629, + "grad_norm": 1.7262784322679252, + "learning_rate": 4.296082638173271e-09, + "loss": 1.4185, + "step": 3119 + }, + { + "epoch": 3.9644218551461243, + "grad_norm": 2.0961436951711696, + "learning_rate": 4.00492881678427e-09, + "loss": 1.4669, + "step": 3120 + }, + { + "epoch": 3.96569250317662, + "grad_norm": 1.705911916933581, + "learning_rate": 3.723988239721088e-09, + "loss": 1.3224, + "step": 3121 + }, + { + "epoch": 3.9669631512071155, + "grad_norm": 1.839944201594869, + "learning_rate": 3.453261194030777e-09, + "loss": 1.3765, + "step": 3122 + }, + { + "epoch": 3.9682337992376113, + "grad_norm": 1.7966456195324074, + "learning_rate": 3.1927479563254037e-09, + "loss": 1.2228, + "step": 3123 + }, + { + "epoch": 3.9695044472681067, + "grad_norm": 1.7698139775117516, + "learning_rate": 2.942448792778718e-09, + "loss": 1.1008, + "step": 3124 + }, + { + "epoch": 3.9707750952986025, + "grad_norm": 1.9453271358852795, + "learning_rate": 2.702363959131704e-09, + "loss": 1.3687, + "step": 3125 + }, + { + "epoch": 3.972045743329098, + "grad_norm": 1.6560152016888756, + "learning_rate": 2.4724937006848083e-09, + "loss": 1.3055, + "step": 3126 + }, + { + "epoch": 3.9733163913595932, + "grad_norm": 1.688241672688269, + "learning_rate": 2.2528382523057115e-09, + "loss": 1.3405, + "step": 3127 + }, + { + "epoch": 3.974587039390089, + "grad_norm": 1.689276490503405, + "learning_rate": 2.0433978384237772e-09, + "loss": 1.1854, + "step": 3128 + }, + { + "epoch": 3.9758576874205844, + "grad_norm": 1.6608383150125265, + "learning_rate": 1.8441726730300535e-09, + "loss": 1.2289, + "step": 3129 + }, + { + "epoch": 3.97712833545108, + "grad_norm": 1.8290640252525683, + "learning_rate": 1.6551629596817109e-09, + "loss": 1.2416, + "step": 3130 + }, + { + "epoch": 3.9783989834815756, + "grad_norm": 1.619845081875068, + "learning_rate": 1.4763688914942732e-09, + "loss": 1.3471, + "step": 3131 + }, + { + "epoch": 3.9796696315120714, + "grad_norm": 1.793203277408509, + "learning_rate": 1.3077906511482773e-09, + "loss": 1.1703, + "step": 3132 + }, + { + "epoch": 3.980940279542567, + "grad_norm": 1.7349015618821286, + "learning_rate": 1.1494284108859443e-09, + "loss": 1.2498, + "step": 3133 + }, + { + "epoch": 3.982210927573062, + "grad_norm": 1.9995653740531916, + "learning_rate": 1.0012823325111776e-09, + "loss": 1.4463, + "step": 3134 + }, + { + "epoch": 3.983481575603558, + "grad_norm": 1.9850571096043719, + "learning_rate": 8.63352567390674e-10, + "loss": 1.3383, + "step": 3135 + }, + { + "epoch": 3.9847522236340533, + "grad_norm": 1.732098526894926, + "learning_rate": 7.356392564505932e-10, + "loss": 1.2589, + "step": 3136 + }, + { + "epoch": 3.9860228716645487, + "grad_norm": 1.5875173750980283, + "learning_rate": 6.181425301809985e-10, + "loss": 1.3793, + "step": 3137 + }, + { + "epoch": 3.9872935196950445, + "grad_norm": 2.0291698208290225, + "learning_rate": 5.108625086314157e-10, + "loss": 1.1664, + "step": 3138 + }, + { + "epoch": 3.98856416772554, + "grad_norm": 1.7860671094093037, + "learning_rate": 4.137993014130537e-10, + "loss": 1.396, + "step": 3139 + }, + { + "epoch": 3.9898348157560357, + "grad_norm": 1.7815082903328416, + "learning_rate": 3.2695300769991503e-10, + "loss": 1.3193, + "step": 3140 + }, + { + "epoch": 3.991105463786531, + "grad_norm": 1.6699156756254965, + "learning_rate": 2.503237162254646e-10, + "loss": 1.3357, + "step": 3141 + }, + { + "epoch": 3.992376111817027, + "grad_norm": 1.8035271386920084, + "learning_rate": 1.8391150528485058e-10, + "loss": 1.4147, + "step": 3142 + }, + { + "epoch": 3.9936467598475223, + "grad_norm": 1.9501122932797061, + "learning_rate": 1.277164427326838e-10, + "loss": 1.4584, + "step": 3143 + }, + { + "epoch": 3.9949174078780176, + "grad_norm": 1.7768540849474679, + "learning_rate": 8.173858598525819e-11, + "loss": 1.2683, + "step": 3144 + }, + { + "epoch": 3.9961880559085134, + "grad_norm": 1.5464287026588919, + "learning_rate": 4.597798201944059e-11, + "loss": 1.105, + "step": 3145 + }, + { + "epoch": 3.997458703939009, + "grad_norm": 1.7342586128482869, + "learning_rate": 2.043466737489119e-11, + "loss": 1.1672, + "step": 3146 + }, + { + "epoch": 3.998729351969504, + "grad_norm": 1.7855077119093086, + "learning_rate": 5.108668148512408e-12, + "loss": 1.349, + "step": 3147 + }, + { + "epoch": 4.0, + "grad_norm": 1.9978620033593717, + "learning_rate": 0.0, + "loss": 1.4619, + "step": 3148 + } + ], + "logging_steps": 1, + "max_steps": 3148, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 394, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 988690734120960.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}