{ "best_global_step": 1550, "best_metric": 0.07311470806598663, "best_model_checkpoint": "outputs_qwq/checkpoint-1550", "epoch": 2.903891233005157, "eval_steps": 50, "global_step": 1550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001875293014533521, "grad_norm": 0.39477676153182983, "learning_rate": 0.0, "loss": 0.7899, "step": 1 }, { "epoch": 0.003750586029067042, "grad_norm": 0.3514685332775116, "learning_rate": 2.0000000000000002e-07, "loss": 0.9541, "step": 2 }, { "epoch": 0.005625879043600563, "grad_norm": 0.2947579622268677, "learning_rate": 4.0000000000000003e-07, "loss": 0.7068, "step": 3 }, { "epoch": 0.007501172058134084, "grad_norm": 0.26537010073661804, "learning_rate": 6.000000000000001e-07, "loss": 0.6839, "step": 4 }, { "epoch": 0.009376465072667605, "grad_norm": 0.26260894536972046, "learning_rate": 8.000000000000001e-07, "loss": 0.68, "step": 5 }, { "epoch": 0.011251758087201125, "grad_norm": 0.24916508793830872, "learning_rate": 1.0000000000000002e-06, "loss": 0.6905, "step": 6 }, { "epoch": 0.013127051101734646, "grad_norm": 0.3689679801464081, "learning_rate": 1.2000000000000002e-06, "loss": 0.9347, "step": 7 }, { "epoch": 0.015002344116268168, "grad_norm": 0.27197960019111633, "learning_rate": 1.4000000000000001e-06, "loss": 0.7164, "step": 8 }, { "epoch": 0.016877637130801686, "grad_norm": 0.305757611989975, "learning_rate": 1.6000000000000001e-06, "loss": 0.721, "step": 9 }, { "epoch": 0.01875293014533521, "grad_norm": 0.272778183221817, "learning_rate": 1.8000000000000001e-06, "loss": 0.6814, "step": 10 }, { "epoch": 0.02062822315986873, "grad_norm": 0.28655725717544556, "learning_rate": 2.0000000000000003e-06, "loss": 0.7508, "step": 11 }, { "epoch": 0.02250351617440225, "grad_norm": 0.32160523533821106, "learning_rate": 2.2e-06, "loss": 0.8391, "step": 12 }, { "epoch": 0.02437880918893577, "grad_norm": 0.2688426077365875, "learning_rate": 2.4000000000000003e-06, "loss": 0.6594, "step": 13 }, { "epoch": 0.02625410220346929, "grad_norm": 0.33779385685920715, "learning_rate": 2.6e-06, "loss": 0.7729, "step": 14 }, { "epoch": 0.02812939521800281, "grad_norm": 0.3078613877296448, "learning_rate": 2.8000000000000003e-06, "loss": 0.7372, "step": 15 }, { "epoch": 0.030004688232536336, "grad_norm": 0.2747291922569275, "learning_rate": 3e-06, "loss": 0.6716, "step": 16 }, { "epoch": 0.03187998124706985, "grad_norm": 0.30704063177108765, "learning_rate": 3.2000000000000003e-06, "loss": 0.7303, "step": 17 }, { "epoch": 0.03375527426160337, "grad_norm": 0.312160849571228, "learning_rate": 3.4000000000000005e-06, "loss": 0.7528, "step": 18 }, { "epoch": 0.03563056727613689, "grad_norm": 0.3484225571155548, "learning_rate": 3.6000000000000003e-06, "loss": 0.728, "step": 19 }, { "epoch": 0.03750586029067042, "grad_norm": 0.328665167093277, "learning_rate": 3.8000000000000005e-06, "loss": 0.8129, "step": 20 }, { "epoch": 0.03938115330520394, "grad_norm": 0.32902592420578003, "learning_rate": 4.000000000000001e-06, "loss": 0.7526, "step": 21 }, { "epoch": 0.04125644631973746, "grad_norm": 0.3969726860523224, "learning_rate": 4.2000000000000004e-06, "loss": 0.806, "step": 22 }, { "epoch": 0.04313173933427098, "grad_norm": 0.3924497067928314, "learning_rate": 4.4e-06, "loss": 0.6978, "step": 23 }, { "epoch": 0.0450070323488045, "grad_norm": 0.33922383189201355, "learning_rate": 4.600000000000001e-06, "loss": 0.7919, "step": 24 }, { "epoch": 0.04688232536333802, "grad_norm": 0.3359040319919586, "learning_rate": 4.800000000000001e-06, "loss": 0.7881, "step": 25 }, { "epoch": 0.04875761837787154, "grad_norm": 0.4279753863811493, "learning_rate": 5e-06, "loss": 0.9385, "step": 26 }, { "epoch": 0.05063291139240506, "grad_norm": 0.351744145154953, "learning_rate": 5.2e-06, "loss": 0.8571, "step": 27 }, { "epoch": 0.05250820440693858, "grad_norm": 0.3303033113479614, "learning_rate": 5.400000000000001e-06, "loss": 0.6374, "step": 28 }, { "epoch": 0.0543834974214721, "grad_norm": 0.36658555269241333, "learning_rate": 5.600000000000001e-06, "loss": 0.7102, "step": 29 }, { "epoch": 0.05625879043600562, "grad_norm": 0.3628396987915039, "learning_rate": 5.8e-06, "loss": 0.7783, "step": 30 }, { "epoch": 0.058134083450539144, "grad_norm": 0.38539397716522217, "learning_rate": 6e-06, "loss": 0.798, "step": 31 }, { "epoch": 0.06000937646507267, "grad_norm": 0.4560730755329132, "learning_rate": 6.200000000000001e-06, "loss": 0.716, "step": 32 }, { "epoch": 0.06188466947960619, "grad_norm": 0.35180985927581787, "learning_rate": 6.4000000000000006e-06, "loss": 0.7262, "step": 33 }, { "epoch": 0.0637599624941397, "grad_norm": 0.4624330997467041, "learning_rate": 6.600000000000001e-06, "loss": 0.8614, "step": 34 }, { "epoch": 0.06563525550867323, "grad_norm": 0.3922227621078491, "learning_rate": 6.800000000000001e-06, "loss": 0.7928, "step": 35 }, { "epoch": 0.06751054852320675, "grad_norm": 0.44666793942451477, "learning_rate": 7e-06, "loss": 0.8177, "step": 36 }, { "epoch": 0.06938584153774027, "grad_norm": 0.4159597158432007, "learning_rate": 7.2000000000000005e-06, "loss": 0.8009, "step": 37 }, { "epoch": 0.07126113455227379, "grad_norm": 0.442553848028183, "learning_rate": 7.4e-06, "loss": 0.6281, "step": 38 }, { "epoch": 0.07313642756680731, "grad_norm": 0.36248019337654114, "learning_rate": 7.600000000000001e-06, "loss": 0.6649, "step": 39 }, { "epoch": 0.07501172058134084, "grad_norm": 0.5066297650337219, "learning_rate": 7.800000000000002e-06, "loss": 0.8881, "step": 40 }, { "epoch": 0.07688701359587435, "grad_norm": 0.4364047944545746, "learning_rate": 8.000000000000001e-06, "loss": 0.8244, "step": 41 }, { "epoch": 0.07876230661040788, "grad_norm": 0.43310582637786865, "learning_rate": 8.2e-06, "loss": 0.6797, "step": 42 }, { "epoch": 0.0806375996249414, "grad_norm": 0.5400950908660889, "learning_rate": 8.400000000000001e-06, "loss": 0.7903, "step": 43 }, { "epoch": 0.08251289263947492, "grad_norm": 0.4437219500541687, "learning_rate": 8.6e-06, "loss": 0.6449, "step": 44 }, { "epoch": 0.08438818565400844, "grad_norm": 0.48232921957969666, "learning_rate": 8.8e-06, "loss": 0.6013, "step": 45 }, { "epoch": 0.08626347866854196, "grad_norm": 0.6040736436843872, "learning_rate": 9e-06, "loss": 0.5827, "step": 46 }, { "epoch": 0.08813877168307548, "grad_norm": 0.4945738613605499, "learning_rate": 9.200000000000002e-06, "loss": 0.7523, "step": 47 }, { "epoch": 0.090014064697609, "grad_norm": 0.44322240352630615, "learning_rate": 9.4e-06, "loss": 0.6732, "step": 48 }, { "epoch": 0.09188935771214252, "grad_norm": 0.42256736755371094, "learning_rate": 9.600000000000001e-06, "loss": 0.6088, "step": 49 }, { "epoch": 0.09376465072667604, "grad_norm": 0.4956076443195343, "learning_rate": 9.800000000000001e-06, "loss": 0.5818, "step": 50 }, { "epoch": 0.09376465072667604, "eval_loss": 0.6773348450660706, "eval_runtime": 678.2607, "eval_samples_per_second": 0.29, "eval_steps_per_second": 0.29, "step": 50 }, { "epoch": 0.09563994374120956, "grad_norm": 0.5553563237190247, "learning_rate": 1e-05, "loss": 0.7514, "step": 51 }, { "epoch": 0.09751523675574308, "grad_norm": 0.5681352019309998, "learning_rate": 1.02e-05, "loss": 0.6105, "step": 52 }, { "epoch": 0.09939052977027661, "grad_norm": 0.5460308790206909, "learning_rate": 1.04e-05, "loss": 0.5824, "step": 53 }, { "epoch": 0.10126582278481013, "grad_norm": 0.5295383930206299, "learning_rate": 1.0600000000000002e-05, "loss": 0.6622, "step": 54 }, { "epoch": 0.10314111579934365, "grad_norm": 0.5573945045471191, "learning_rate": 1.0800000000000002e-05, "loss": 0.604, "step": 55 }, { "epoch": 0.10501640881387717, "grad_norm": 0.5254002809524536, "learning_rate": 1.1000000000000001e-05, "loss": 0.5777, "step": 56 }, { "epoch": 0.10689170182841069, "grad_norm": 0.4916251003742218, "learning_rate": 1.1200000000000001e-05, "loss": 0.5605, "step": 57 }, { "epoch": 0.1087669948429442, "grad_norm": 0.44173622131347656, "learning_rate": 1.14e-05, "loss": 0.4921, "step": 58 }, { "epoch": 0.11064228785747773, "grad_norm": 0.7284818887710571, "learning_rate": 1.16e-05, "loss": 0.676, "step": 59 }, { "epoch": 0.11251758087201125, "grad_norm": 0.5018178224563599, "learning_rate": 1.18e-05, "loss": 0.5044, "step": 60 }, { "epoch": 0.11439287388654477, "grad_norm": 0.4238162040710449, "learning_rate": 1.2e-05, "loss": 0.4526, "step": 61 }, { "epoch": 0.11626816690107829, "grad_norm": 0.43392857909202576, "learning_rate": 1.22e-05, "loss": 0.554, "step": 62 }, { "epoch": 0.11814345991561181, "grad_norm": 0.37505975365638733, "learning_rate": 1.2400000000000002e-05, "loss": 0.4167, "step": 63 }, { "epoch": 0.12001875293014534, "grad_norm": 0.41996413469314575, "learning_rate": 1.2600000000000001e-05, "loss": 0.4457, "step": 64 }, { "epoch": 0.12189404594467886, "grad_norm": 0.35364946722984314, "learning_rate": 1.2800000000000001e-05, "loss": 0.397, "step": 65 }, { "epoch": 0.12376933895921238, "grad_norm": 0.3553512692451477, "learning_rate": 1.3000000000000001e-05, "loss": 0.4027, "step": 66 }, { "epoch": 0.1256446319737459, "grad_norm": 0.35016101598739624, "learning_rate": 1.3200000000000002e-05, "loss": 0.4234, "step": 67 }, { "epoch": 0.1275199249882794, "grad_norm": 0.28325924277305603, "learning_rate": 1.3400000000000002e-05, "loss": 0.3099, "step": 68 }, { "epoch": 0.12939521800281295, "grad_norm": 0.3461209237575531, "learning_rate": 1.3600000000000002e-05, "loss": 0.4367, "step": 69 }, { "epoch": 0.13127051101734646, "grad_norm": 0.28789466619491577, "learning_rate": 1.38e-05, "loss": 0.3175, "step": 70 }, { "epoch": 0.13314580403187998, "grad_norm": 0.3014371693134308, "learning_rate": 1.4e-05, "loss": 0.3197, "step": 71 }, { "epoch": 0.1350210970464135, "grad_norm": 0.2781355082988739, "learning_rate": 1.4200000000000001e-05, "loss": 0.3437, "step": 72 }, { "epoch": 0.13689639006094703, "grad_norm": 0.2576352059841156, "learning_rate": 1.4400000000000001e-05, "loss": 0.2707, "step": 73 }, { "epoch": 0.13877168307548055, "grad_norm": 0.3152413070201874, "learning_rate": 1.46e-05, "loss": 0.3236, "step": 74 }, { "epoch": 0.14064697609001406, "grad_norm": 0.6653899550437927, "learning_rate": 1.48e-05, "loss": 0.3465, "step": 75 }, { "epoch": 0.14252226910454757, "grad_norm": 0.2475852519273758, "learning_rate": 1.5000000000000002e-05, "loss": 0.2468, "step": 76 }, { "epoch": 0.1443975621190811, "grad_norm": 0.28572630882263184, "learning_rate": 1.5200000000000002e-05, "loss": 0.3287, "step": 77 }, { "epoch": 0.14627285513361463, "grad_norm": 0.27325335144996643, "learning_rate": 1.54e-05, "loss": 0.2926, "step": 78 }, { "epoch": 0.14814814814814814, "grad_norm": 0.3255375027656555, "learning_rate": 1.5600000000000003e-05, "loss": 0.279, "step": 79 }, { "epoch": 0.15002344116268168, "grad_norm": 0.44983330368995667, "learning_rate": 1.58e-05, "loss": 0.3534, "step": 80 }, { "epoch": 0.1518987341772152, "grad_norm": 0.240912526845932, "learning_rate": 1.6000000000000003e-05, "loss": 0.2541, "step": 81 }, { "epoch": 0.1537740271917487, "grad_norm": 0.21851545572280884, "learning_rate": 1.62e-05, "loss": 0.2334, "step": 82 }, { "epoch": 0.15564932020628222, "grad_norm": 0.25758886337280273, "learning_rate": 1.64e-05, "loss": 0.2952, "step": 83 }, { "epoch": 0.15752461322081576, "grad_norm": 0.2865050435066223, "learning_rate": 1.66e-05, "loss": 0.2702, "step": 84 }, { "epoch": 0.15939990623534928, "grad_norm": 0.21041016280651093, "learning_rate": 1.6800000000000002e-05, "loss": 0.2565, "step": 85 }, { "epoch": 0.1612751992498828, "grad_norm": 0.21632152795791626, "learning_rate": 1.7e-05, "loss": 0.2643, "step": 86 }, { "epoch": 0.1631504922644163, "grad_norm": 0.1930413544178009, "learning_rate": 1.72e-05, "loss": 0.2386, "step": 87 }, { "epoch": 0.16502578527894984, "grad_norm": 0.21475085616111755, "learning_rate": 1.7400000000000003e-05, "loss": 0.3035, "step": 88 }, { "epoch": 0.16690107829348336, "grad_norm": 0.2499314695596695, "learning_rate": 1.76e-05, "loss": 0.3382, "step": 89 }, { "epoch": 0.16877637130801687, "grad_norm": 0.22533196210861206, "learning_rate": 1.7800000000000002e-05, "loss": 0.2262, "step": 90 }, { "epoch": 0.1706516643225504, "grad_norm": 0.22342143952846527, "learning_rate": 1.8e-05, "loss": 0.2489, "step": 91 }, { "epoch": 0.17252695733708392, "grad_norm": 0.2666771411895752, "learning_rate": 1.8200000000000002e-05, "loss": 0.2732, "step": 92 }, { "epoch": 0.17440225035161744, "grad_norm": 0.22881805896759033, "learning_rate": 1.8400000000000003e-05, "loss": 0.285, "step": 93 }, { "epoch": 0.17627754336615095, "grad_norm": 0.3157159984111786, "learning_rate": 1.86e-05, "loss": 0.2406, "step": 94 }, { "epoch": 0.1781528363806845, "grad_norm": 0.20557765662670135, "learning_rate": 1.88e-05, "loss": 0.2162, "step": 95 }, { "epoch": 0.180028129395218, "grad_norm": 0.26530343294143677, "learning_rate": 1.9e-05, "loss": 0.2527, "step": 96 }, { "epoch": 0.18190342240975152, "grad_norm": 0.21458247303962708, "learning_rate": 1.9200000000000003e-05, "loss": 0.1677, "step": 97 }, { "epoch": 0.18377871542428503, "grad_norm": 0.20788805186748505, "learning_rate": 1.94e-05, "loss": 0.2191, "step": 98 }, { "epoch": 0.18565400843881857, "grad_norm": 0.20019683241844177, "learning_rate": 1.9600000000000002e-05, "loss": 0.1821, "step": 99 }, { "epoch": 0.1875293014533521, "grad_norm": 0.264813631772995, "learning_rate": 1.98e-05, "loss": 0.2307, "step": 100 }, { "epoch": 0.1875293014533521, "eval_loss": 0.18159574270248413, "eval_runtime": 674.6463, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 100 }, { "epoch": 0.1894045944678856, "grad_norm": 0.2246488779783249, "learning_rate": 2e-05, "loss": 0.2421, "step": 101 }, { "epoch": 0.19127988748241911, "grad_norm": 0.24814476072788239, "learning_rate": 1.9992202729044836e-05, "loss": 0.1653, "step": 102 }, { "epoch": 0.19315518049695266, "grad_norm": 0.2789003849029541, "learning_rate": 1.998440545808967e-05, "loss": 0.2324, "step": 103 }, { "epoch": 0.19503047351148617, "grad_norm": 0.21182158589363098, "learning_rate": 1.9976608187134504e-05, "loss": 0.2226, "step": 104 }, { "epoch": 0.19690576652601968, "grad_norm": 0.299167662858963, "learning_rate": 1.996881091617934e-05, "loss": 0.2122, "step": 105 }, { "epoch": 0.19878105954055322, "grad_norm": 0.22616952657699585, "learning_rate": 1.9961013645224173e-05, "loss": 0.2174, "step": 106 }, { "epoch": 0.20065635255508674, "grad_norm": 0.19206267595291138, "learning_rate": 1.9953216374269007e-05, "loss": 0.1776, "step": 107 }, { "epoch": 0.20253164556962025, "grad_norm": 0.2957805097103119, "learning_rate": 1.994541910331384e-05, "loss": 0.1576, "step": 108 }, { "epoch": 0.20440693858415376, "grad_norm": 0.22284749150276184, "learning_rate": 1.9937621832358675e-05, "loss": 0.1495, "step": 109 }, { "epoch": 0.2062822315986873, "grad_norm": 0.21203336119651794, "learning_rate": 1.992982456140351e-05, "loss": 0.186, "step": 110 }, { "epoch": 0.20815752461322082, "grad_norm": 0.18476776778697968, "learning_rate": 1.9922027290448344e-05, "loss": 0.1626, "step": 111 }, { "epoch": 0.21003281762775433, "grad_norm": 0.19962036609649658, "learning_rate": 1.9914230019493178e-05, "loss": 0.1447, "step": 112 }, { "epoch": 0.21190811064228784, "grad_norm": 0.1857951283454895, "learning_rate": 1.9906432748538015e-05, "loss": 0.175, "step": 113 }, { "epoch": 0.21378340365682139, "grad_norm": 0.2035515159368515, "learning_rate": 1.989863547758285e-05, "loss": 0.1732, "step": 114 }, { "epoch": 0.2156586966713549, "grad_norm": 0.1710767149925232, "learning_rate": 1.9890838206627684e-05, "loss": 0.1317, "step": 115 }, { "epoch": 0.2175339896858884, "grad_norm": 0.2920154929161072, "learning_rate": 1.9883040935672515e-05, "loss": 0.1802, "step": 116 }, { "epoch": 0.21940928270042195, "grad_norm": 0.27223774790763855, "learning_rate": 1.987524366471735e-05, "loss": 0.1338, "step": 117 }, { "epoch": 0.22128457571495547, "grad_norm": 0.1965789496898651, "learning_rate": 1.9867446393762183e-05, "loss": 0.1751, "step": 118 }, { "epoch": 0.22315986872948898, "grad_norm": 0.19014707207679749, "learning_rate": 1.9859649122807017e-05, "loss": 0.1485, "step": 119 }, { "epoch": 0.2250351617440225, "grad_norm": 0.251869261264801, "learning_rate": 1.9851851851851855e-05, "loss": 0.1416, "step": 120 }, { "epoch": 0.22691045475855603, "grad_norm": 0.2957039475440979, "learning_rate": 1.984405458089669e-05, "loss": 0.1892, "step": 121 }, { "epoch": 0.22878574777308955, "grad_norm": 0.17241084575653076, "learning_rate": 1.9836257309941523e-05, "loss": 0.1412, "step": 122 }, { "epoch": 0.23066104078762306, "grad_norm": 0.2205045521259308, "learning_rate": 1.9828460038986357e-05, "loss": 0.1602, "step": 123 }, { "epoch": 0.23253633380215658, "grad_norm": 0.21944566071033478, "learning_rate": 1.982066276803119e-05, "loss": 0.146, "step": 124 }, { "epoch": 0.23441162681669012, "grad_norm": 0.21907442808151245, "learning_rate": 1.9812865497076026e-05, "loss": 0.1588, "step": 125 }, { "epoch": 0.23628691983122363, "grad_norm": 0.17742829024791718, "learning_rate": 1.980506822612086e-05, "loss": 0.151, "step": 126 }, { "epoch": 0.23816221284575714, "grad_norm": 0.23079413175582886, "learning_rate": 1.9797270955165694e-05, "loss": 0.2186, "step": 127 }, { "epoch": 0.24003750586029068, "grad_norm": 0.19806325435638428, "learning_rate": 1.9789473684210528e-05, "loss": 0.1352, "step": 128 }, { "epoch": 0.2419127988748242, "grad_norm": 0.1862531155347824, "learning_rate": 1.9781676413255362e-05, "loss": 0.1456, "step": 129 }, { "epoch": 0.2437880918893577, "grad_norm": 0.1777540147304535, "learning_rate": 1.9773879142300197e-05, "loss": 0.1331, "step": 130 }, { "epoch": 0.24566338490389122, "grad_norm": 0.29459261894226074, "learning_rate": 1.976608187134503e-05, "loss": 0.1421, "step": 131 }, { "epoch": 0.24753867791842477, "grad_norm": 0.20495347678661346, "learning_rate": 1.9758284600389865e-05, "loss": 0.1599, "step": 132 }, { "epoch": 0.24941397093295828, "grad_norm": 0.17535296082496643, "learning_rate": 1.97504873294347e-05, "loss": 0.17, "step": 133 }, { "epoch": 0.2512892639474918, "grad_norm": 0.16180671751499176, "learning_rate": 1.9742690058479533e-05, "loss": 0.1367, "step": 134 }, { "epoch": 0.25316455696202533, "grad_norm": 0.4137842357158661, "learning_rate": 1.9734892787524368e-05, "loss": 0.1536, "step": 135 }, { "epoch": 0.2550398499765588, "grad_norm": 0.174374058842659, "learning_rate": 1.9727095516569202e-05, "loss": 0.1562, "step": 136 }, { "epoch": 0.25691514299109236, "grad_norm": 0.1792580634355545, "learning_rate": 1.9719298245614036e-05, "loss": 0.174, "step": 137 }, { "epoch": 0.2587904360056259, "grad_norm": 0.40365007519721985, "learning_rate": 1.971150097465887e-05, "loss": 0.2197, "step": 138 }, { "epoch": 0.2606657290201594, "grad_norm": 0.1588585376739502, "learning_rate": 1.9703703703703704e-05, "loss": 0.1012, "step": 139 }, { "epoch": 0.26254102203469293, "grad_norm": 0.1797696053981781, "learning_rate": 1.969590643274854e-05, "loss": 0.123, "step": 140 }, { "epoch": 0.26441631504922647, "grad_norm": 0.41170960664749146, "learning_rate": 1.9688109161793373e-05, "loss": 0.2208, "step": 141 }, { "epoch": 0.26629160806375995, "grad_norm": 0.22392353415489197, "learning_rate": 1.9680311890838207e-05, "loss": 0.1634, "step": 142 }, { "epoch": 0.2681669010782935, "grad_norm": 0.2506747841835022, "learning_rate": 1.9672514619883044e-05, "loss": 0.1539, "step": 143 }, { "epoch": 0.270042194092827, "grad_norm": 0.3422393500804901, "learning_rate": 1.966471734892788e-05, "loss": 0.2705, "step": 144 }, { "epoch": 0.2719174871073605, "grad_norm": 0.15338042378425598, "learning_rate": 1.9656920077972713e-05, "loss": 0.1554, "step": 145 }, { "epoch": 0.27379278012189406, "grad_norm": 0.4089084267616272, "learning_rate": 1.9649122807017544e-05, "loss": 0.1756, "step": 146 }, { "epoch": 0.27566807313642755, "grad_norm": 0.19431771337985992, "learning_rate": 1.9641325536062378e-05, "loss": 0.1549, "step": 147 }, { "epoch": 0.2775433661509611, "grad_norm": 0.21856912970542908, "learning_rate": 1.9633528265107212e-05, "loss": 0.1854, "step": 148 }, { "epoch": 0.27941865916549463, "grad_norm": 0.21324169635772705, "learning_rate": 1.962573099415205e-05, "loss": 0.1348, "step": 149 }, { "epoch": 0.2812939521800281, "grad_norm": 0.18593250215053558, "learning_rate": 1.9617933723196884e-05, "loss": 0.1309, "step": 150 }, { "epoch": 0.2812939521800281, "eval_loss": 0.11937730759382248, "eval_runtime": 675.2131, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 150 }, { "epoch": 0.28316924519456166, "grad_norm": 0.17237016558647156, "learning_rate": 1.9610136452241718e-05, "loss": 0.1228, "step": 151 }, { "epoch": 0.28504453820909514, "grad_norm": 0.20051100850105286, "learning_rate": 1.9602339181286552e-05, "loss": 0.1028, "step": 152 }, { "epoch": 0.2869198312236287, "grad_norm": 0.20913535356521606, "learning_rate": 1.9594541910331386e-05, "loss": 0.1478, "step": 153 }, { "epoch": 0.2887951242381622, "grad_norm": 0.22947447001934052, "learning_rate": 1.958674463937622e-05, "loss": 0.1199, "step": 154 }, { "epoch": 0.2906704172526957, "grad_norm": 0.21503888070583344, "learning_rate": 1.9578947368421055e-05, "loss": 0.1469, "step": 155 }, { "epoch": 0.29254571026722925, "grad_norm": 0.15914572775363922, "learning_rate": 1.957115009746589e-05, "loss": 0.1126, "step": 156 }, { "epoch": 0.2944210032817628, "grad_norm": 0.1764422208070755, "learning_rate": 1.9563352826510723e-05, "loss": 0.142, "step": 157 }, { "epoch": 0.2962962962962963, "grad_norm": 0.20030224323272705, "learning_rate": 1.9555555555555557e-05, "loss": 0.1475, "step": 158 }, { "epoch": 0.2981715893108298, "grad_norm": 0.21742001175880432, "learning_rate": 1.954775828460039e-05, "loss": 0.1874, "step": 159 }, { "epoch": 0.30004688232536336, "grad_norm": 0.2284712940454483, "learning_rate": 1.9539961013645226e-05, "loss": 0.1532, "step": 160 }, { "epoch": 0.30192217533989685, "grad_norm": 0.18156972527503967, "learning_rate": 1.953216374269006e-05, "loss": 0.092, "step": 161 }, { "epoch": 0.3037974683544304, "grad_norm": 0.1754453182220459, "learning_rate": 1.9524366471734894e-05, "loss": 0.1348, "step": 162 }, { "epoch": 0.3056727613689639, "grad_norm": 0.16860631108283997, "learning_rate": 1.9516569200779728e-05, "loss": 0.1176, "step": 163 }, { "epoch": 0.3075480543834974, "grad_norm": 0.17378303408622742, "learning_rate": 1.9508771929824562e-05, "loss": 0.1372, "step": 164 }, { "epoch": 0.30942334739803096, "grad_norm": 0.23950371146202087, "learning_rate": 1.9500974658869397e-05, "loss": 0.1206, "step": 165 }, { "epoch": 0.31129864041256444, "grad_norm": 0.17646121978759766, "learning_rate": 1.949317738791423e-05, "loss": 0.1298, "step": 166 }, { "epoch": 0.313173933427098, "grad_norm": 0.1811673790216446, "learning_rate": 1.9485380116959065e-05, "loss": 0.188, "step": 167 }, { "epoch": 0.3150492264416315, "grad_norm": 0.27424830198287964, "learning_rate": 1.94775828460039e-05, "loss": 0.1422, "step": 168 }, { "epoch": 0.316924519456165, "grad_norm": 0.21313942968845367, "learning_rate": 1.9469785575048733e-05, "loss": 0.1201, "step": 169 }, { "epoch": 0.31879981247069855, "grad_norm": 0.1912909597158432, "learning_rate": 1.9461988304093568e-05, "loss": 0.0854, "step": 170 }, { "epoch": 0.3206751054852321, "grad_norm": 0.2451699674129486, "learning_rate": 1.9454191033138402e-05, "loss": 0.2009, "step": 171 }, { "epoch": 0.3225503984997656, "grad_norm": 0.1784246265888214, "learning_rate": 1.944639376218324e-05, "loss": 0.1587, "step": 172 }, { "epoch": 0.3244256915142991, "grad_norm": 0.19816836714744568, "learning_rate": 1.9438596491228074e-05, "loss": 0.1426, "step": 173 }, { "epoch": 0.3263009845288326, "grad_norm": 0.19529619812965393, "learning_rate": 1.9430799220272908e-05, "loss": 0.1431, "step": 174 }, { "epoch": 0.32817627754336615, "grad_norm": 0.2105475664138794, "learning_rate": 1.9423001949317742e-05, "loss": 0.1654, "step": 175 }, { "epoch": 0.3300515705578997, "grad_norm": 0.17807945609092712, "learning_rate": 1.9415204678362573e-05, "loss": 0.1366, "step": 176 }, { "epoch": 0.3319268635724332, "grad_norm": 0.2023920863866806, "learning_rate": 1.9407407407407407e-05, "loss": 0.1372, "step": 177 }, { "epoch": 0.3338021565869667, "grad_norm": 0.1807592213153839, "learning_rate": 1.939961013645224e-05, "loss": 0.1188, "step": 178 }, { "epoch": 0.33567744960150026, "grad_norm": 0.18859116733074188, "learning_rate": 1.939181286549708e-05, "loss": 0.1243, "step": 179 }, { "epoch": 0.33755274261603374, "grad_norm": 0.21937714517116547, "learning_rate": 1.9384015594541913e-05, "loss": 0.2015, "step": 180 }, { "epoch": 0.3394280356305673, "grad_norm": 0.1638198345899582, "learning_rate": 1.9376218323586747e-05, "loss": 0.1031, "step": 181 }, { "epoch": 0.3413033286451008, "grad_norm": 0.2563665807247162, "learning_rate": 1.936842105263158e-05, "loss": 0.1341, "step": 182 }, { "epoch": 0.3431786216596343, "grad_norm": 0.19014421105384827, "learning_rate": 1.9360623781676415e-05, "loss": 0.135, "step": 183 }, { "epoch": 0.34505391467416785, "grad_norm": 0.19648663699626923, "learning_rate": 1.935282651072125e-05, "loss": 0.1239, "step": 184 }, { "epoch": 0.34692920768870134, "grad_norm": 0.16469500958919525, "learning_rate": 1.9345029239766084e-05, "loss": 0.1142, "step": 185 }, { "epoch": 0.3488045007032349, "grad_norm": 0.17405149340629578, "learning_rate": 1.9337231968810918e-05, "loss": 0.1025, "step": 186 }, { "epoch": 0.3506797937177684, "grad_norm": 0.2042636275291443, "learning_rate": 1.9329434697855752e-05, "loss": 0.0976, "step": 187 }, { "epoch": 0.3525550867323019, "grad_norm": 0.2545097768306732, "learning_rate": 1.9321637426900586e-05, "loss": 0.1707, "step": 188 }, { "epoch": 0.35443037974683544, "grad_norm": 0.17079883813858032, "learning_rate": 1.931384015594542e-05, "loss": 0.1282, "step": 189 }, { "epoch": 0.356305672761369, "grad_norm": 0.18123818933963776, "learning_rate": 1.9306042884990255e-05, "loss": 0.1285, "step": 190 }, { "epoch": 0.35818096577590247, "grad_norm": 0.26533034443855286, "learning_rate": 1.929824561403509e-05, "loss": 0.1912, "step": 191 }, { "epoch": 0.360056258790436, "grad_norm": 0.21302203834056854, "learning_rate": 1.9290448343079923e-05, "loss": 0.152, "step": 192 }, { "epoch": 0.36193155180496955, "grad_norm": 0.16439904272556305, "learning_rate": 1.9282651072124757e-05, "loss": 0.1026, "step": 193 }, { "epoch": 0.36380684481950304, "grad_norm": 0.21921955049037933, "learning_rate": 1.927485380116959e-05, "loss": 0.1294, "step": 194 }, { "epoch": 0.3656821378340366, "grad_norm": 0.1894323229789734, "learning_rate": 1.9267056530214426e-05, "loss": 0.1307, "step": 195 }, { "epoch": 0.36755743084857007, "grad_norm": 0.18328344821929932, "learning_rate": 1.925925925925926e-05, "loss": 0.1168, "step": 196 }, { "epoch": 0.3694327238631036, "grad_norm": 0.19325962662696838, "learning_rate": 1.9251461988304094e-05, "loss": 0.0979, "step": 197 }, { "epoch": 0.37130801687763715, "grad_norm": 0.16475112736225128, "learning_rate": 1.924366471734893e-05, "loss": 0.11, "step": 198 }, { "epoch": 0.37318330989217063, "grad_norm": 0.24370963871479034, "learning_rate": 1.9235867446393763e-05, "loss": 0.1256, "step": 199 }, { "epoch": 0.3750586029067042, "grad_norm": 0.21204431354999542, "learning_rate": 1.9228070175438597e-05, "loss": 0.1498, "step": 200 }, { "epoch": 0.3750586029067042, "eval_loss": 0.10520625114440918, "eval_runtime": 675.3932, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 200 }, { "epoch": 0.3769338959212377, "grad_norm": 0.19303874671459198, "learning_rate": 1.922027290448343e-05, "loss": 0.1423, "step": 201 }, { "epoch": 0.3788091889357712, "grad_norm": 0.19636684656143188, "learning_rate": 1.921247563352827e-05, "loss": 0.1325, "step": 202 }, { "epoch": 0.38068448195030474, "grad_norm": 0.23951849341392517, "learning_rate": 1.9204678362573103e-05, "loss": 0.1276, "step": 203 }, { "epoch": 0.38255977496483823, "grad_norm": 0.229360893368721, "learning_rate": 1.9196881091617937e-05, "loss": 0.121, "step": 204 }, { "epoch": 0.38443506797937177, "grad_norm": 0.18153493106365204, "learning_rate": 1.918908382066277e-05, "loss": 0.1202, "step": 205 }, { "epoch": 0.3863103609939053, "grad_norm": 0.4398716986179352, "learning_rate": 1.9181286549707602e-05, "loss": 0.1189, "step": 206 }, { "epoch": 0.3881856540084388, "grad_norm": 0.20299677550792694, "learning_rate": 1.9173489278752436e-05, "loss": 0.1433, "step": 207 }, { "epoch": 0.39006094702297234, "grad_norm": 0.2434886395931244, "learning_rate": 1.916569200779727e-05, "loss": 0.1122, "step": 208 }, { "epoch": 0.3919362400375059, "grad_norm": 0.16485784947872162, "learning_rate": 1.9157894736842108e-05, "loss": 0.1505, "step": 209 }, { "epoch": 0.39381153305203936, "grad_norm": 0.2433251440525055, "learning_rate": 1.9150097465886942e-05, "loss": 0.1821, "step": 210 }, { "epoch": 0.3956868260665729, "grad_norm": 0.2099800854921341, "learning_rate": 1.9142300194931776e-05, "loss": 0.143, "step": 211 }, { "epoch": 0.39756211908110645, "grad_norm": 0.20996929705142975, "learning_rate": 1.913450292397661e-05, "loss": 0.1394, "step": 212 }, { "epoch": 0.39943741209563993, "grad_norm": 0.1913692057132721, "learning_rate": 1.9126705653021445e-05, "loss": 0.1408, "step": 213 }, { "epoch": 0.4013127051101735, "grad_norm": 0.1733906865119934, "learning_rate": 1.911890838206628e-05, "loss": 0.1024, "step": 214 }, { "epoch": 0.40318799812470696, "grad_norm": 0.1856503039598465, "learning_rate": 1.9111111111111113e-05, "loss": 0.1129, "step": 215 }, { "epoch": 0.4050632911392405, "grad_norm": 0.3084484338760376, "learning_rate": 1.9103313840155947e-05, "loss": 0.102, "step": 216 }, { "epoch": 0.40693858415377404, "grad_norm": 0.42217186093330383, "learning_rate": 1.909551656920078e-05, "loss": 0.1372, "step": 217 }, { "epoch": 0.4088138771683075, "grad_norm": 0.20602688193321228, "learning_rate": 1.9087719298245616e-05, "loss": 0.1585, "step": 218 }, { "epoch": 0.41068917018284107, "grad_norm": 0.524498701095581, "learning_rate": 1.907992202729045e-05, "loss": 0.1533, "step": 219 }, { "epoch": 0.4125644631973746, "grad_norm": 0.20144020020961761, "learning_rate": 1.9072124756335284e-05, "loss": 0.1539, "step": 220 }, { "epoch": 0.4144397562119081, "grad_norm": 0.17956125736236572, "learning_rate": 1.9064327485380118e-05, "loss": 0.1124, "step": 221 }, { "epoch": 0.41631504922644164, "grad_norm": 0.20834237337112427, "learning_rate": 1.9056530214424952e-05, "loss": 0.1151, "step": 222 }, { "epoch": 0.4181903422409752, "grad_norm": 0.24440927803516388, "learning_rate": 1.9048732943469787e-05, "loss": 0.1138, "step": 223 }, { "epoch": 0.42006563525550866, "grad_norm": 0.1995476335287094, "learning_rate": 1.904093567251462e-05, "loss": 0.0915, "step": 224 }, { "epoch": 0.4219409282700422, "grad_norm": 0.18664710223674774, "learning_rate": 1.9033138401559458e-05, "loss": 0.1295, "step": 225 }, { "epoch": 0.4238162212845757, "grad_norm": 0.21624010801315308, "learning_rate": 1.902534113060429e-05, "loss": 0.1861, "step": 226 }, { "epoch": 0.42569151429910923, "grad_norm": 0.27115607261657715, "learning_rate": 1.9017543859649123e-05, "loss": 0.0858, "step": 227 }, { "epoch": 0.42756680731364277, "grad_norm": 0.2523871660232544, "learning_rate": 1.9009746588693957e-05, "loss": 0.1371, "step": 228 }, { "epoch": 0.42944210032817626, "grad_norm": 0.20124512910842896, "learning_rate": 1.900194931773879e-05, "loss": 0.1079, "step": 229 }, { "epoch": 0.4313173933427098, "grad_norm": 0.15628211200237274, "learning_rate": 1.8994152046783626e-05, "loss": 0.0926, "step": 230 }, { "epoch": 0.43319268635724334, "grad_norm": 0.18558505177497864, "learning_rate": 1.898635477582846e-05, "loss": 0.1241, "step": 231 }, { "epoch": 0.4350679793717768, "grad_norm": 0.21066918969154358, "learning_rate": 1.8978557504873298e-05, "loss": 0.1148, "step": 232 }, { "epoch": 0.43694327238631037, "grad_norm": 0.24706971645355225, "learning_rate": 1.8970760233918132e-05, "loss": 0.1469, "step": 233 }, { "epoch": 0.4388185654008439, "grad_norm": 0.24075715243816376, "learning_rate": 1.8962962962962966e-05, "loss": 0.1078, "step": 234 }, { "epoch": 0.4406938584153774, "grad_norm": 0.9308987855911255, "learning_rate": 1.89551656920078e-05, "loss": 0.1242, "step": 235 }, { "epoch": 0.44256915142991093, "grad_norm": 0.23705029487609863, "learning_rate": 1.894736842105263e-05, "loss": 0.1246, "step": 236 }, { "epoch": 0.4444444444444444, "grad_norm": 0.20728544890880585, "learning_rate": 1.8939571150097465e-05, "loss": 0.1216, "step": 237 }, { "epoch": 0.44631973745897796, "grad_norm": 0.21318766474723816, "learning_rate": 1.8931773879142303e-05, "loss": 0.1153, "step": 238 }, { "epoch": 0.4481950304735115, "grad_norm": 0.22454850375652313, "learning_rate": 1.8923976608187137e-05, "loss": 0.1337, "step": 239 }, { "epoch": 0.450070323488045, "grad_norm": 0.23081591725349426, "learning_rate": 1.891617933723197e-05, "loss": 0.1014, "step": 240 }, { "epoch": 0.45194561650257853, "grad_norm": 0.2807827293872833, "learning_rate": 1.8908382066276805e-05, "loss": 0.1587, "step": 241 }, { "epoch": 0.45382090951711207, "grad_norm": 0.22218219935894012, "learning_rate": 1.890058479532164e-05, "loss": 0.1599, "step": 242 }, { "epoch": 0.45569620253164556, "grad_norm": 0.23275920748710632, "learning_rate": 1.8892787524366474e-05, "loss": 0.1336, "step": 243 }, { "epoch": 0.4575714955461791, "grad_norm": 0.20733550190925598, "learning_rate": 1.8884990253411308e-05, "loss": 0.1458, "step": 244 }, { "epoch": 0.45944678856071264, "grad_norm": 0.19315101206302643, "learning_rate": 1.8877192982456142e-05, "loss": 0.0923, "step": 245 }, { "epoch": 0.4613220815752461, "grad_norm": 0.22976315021514893, "learning_rate": 1.8869395711500976e-05, "loss": 0.141, "step": 246 }, { "epoch": 0.46319737458977966, "grad_norm": 0.40744727849960327, "learning_rate": 1.886159844054581e-05, "loss": 0.1416, "step": 247 }, { "epoch": 0.46507266760431315, "grad_norm": 0.16530601680278778, "learning_rate": 1.8853801169590645e-05, "loss": 0.0989, "step": 248 }, { "epoch": 0.4669479606188467, "grad_norm": 0.19157516956329346, "learning_rate": 1.884600389863548e-05, "loss": 0.1176, "step": 249 }, { "epoch": 0.46882325363338023, "grad_norm": 0.28270554542541504, "learning_rate": 1.8838206627680313e-05, "loss": 0.1039, "step": 250 }, { "epoch": 0.46882325363338023, "eval_loss": 0.09713947772979736, "eval_runtime": 675.3394, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 250 }, { "epoch": 0.4706985466479137, "grad_norm": 0.2453300952911377, "learning_rate": 1.8830409356725147e-05, "loss": 0.113, "step": 251 }, { "epoch": 0.47257383966244726, "grad_norm": 0.2129826545715332, "learning_rate": 1.882261208576998e-05, "loss": 0.143, "step": 252 }, { "epoch": 0.4744491326769808, "grad_norm": 0.20310461521148682, "learning_rate": 1.8814814814814816e-05, "loss": 0.1242, "step": 253 }, { "epoch": 0.4763244256915143, "grad_norm": 0.18484044075012207, "learning_rate": 1.880701754385965e-05, "loss": 0.1098, "step": 254 }, { "epoch": 0.4781997187060478, "grad_norm": 0.2547452747821808, "learning_rate": 1.8799220272904487e-05, "loss": 0.1461, "step": 255 }, { "epoch": 0.48007501172058137, "grad_norm": 0.2765617072582245, "learning_rate": 1.8791423001949318e-05, "loss": 0.1567, "step": 256 }, { "epoch": 0.48195030473511485, "grad_norm": 0.19878943264484406, "learning_rate": 1.8783625730994152e-05, "loss": 0.1102, "step": 257 }, { "epoch": 0.4838255977496484, "grad_norm": 0.3786822259426117, "learning_rate": 1.8775828460038987e-05, "loss": 0.1354, "step": 258 }, { "epoch": 0.4857008907641819, "grad_norm": 0.18465487658977509, "learning_rate": 1.876803118908382e-05, "loss": 0.0959, "step": 259 }, { "epoch": 0.4875761837787154, "grad_norm": 0.206305131316185, "learning_rate": 1.8760233918128655e-05, "loss": 0.0962, "step": 260 }, { "epoch": 0.48945147679324896, "grad_norm": 0.22003507614135742, "learning_rate": 1.8752436647173493e-05, "loss": 0.1274, "step": 261 }, { "epoch": 0.49132676980778245, "grad_norm": 0.17606805264949799, "learning_rate": 1.8744639376218327e-05, "loss": 0.0998, "step": 262 }, { "epoch": 0.493202062822316, "grad_norm": 0.27991971373558044, "learning_rate": 1.873684210526316e-05, "loss": 0.1564, "step": 263 }, { "epoch": 0.49507735583684953, "grad_norm": 0.23053331673145294, "learning_rate": 1.8729044834307995e-05, "loss": 0.1247, "step": 264 }, { "epoch": 0.496952648851383, "grad_norm": 0.20774593949317932, "learning_rate": 1.872124756335283e-05, "loss": 0.0893, "step": 265 }, { "epoch": 0.49882794186591656, "grad_norm": 0.20343434810638428, "learning_rate": 1.871345029239766e-05, "loss": 0.1096, "step": 266 }, { "epoch": 0.5007032348804501, "grad_norm": 0.2922403812408447, "learning_rate": 1.8705653021442494e-05, "loss": 0.151, "step": 267 }, { "epoch": 0.5025785278949836, "grad_norm": 0.21583294868469238, "learning_rate": 1.8697855750487332e-05, "loss": 0.0865, "step": 268 }, { "epoch": 0.5044538209095171, "grad_norm": 0.21538986265659332, "learning_rate": 1.8690058479532166e-05, "loss": 0.1181, "step": 269 }, { "epoch": 0.5063291139240507, "grad_norm": 0.2531646192073822, "learning_rate": 1.8682261208577e-05, "loss": 0.1502, "step": 270 }, { "epoch": 0.5082044069385842, "grad_norm": 0.28955358266830444, "learning_rate": 1.8674463937621834e-05, "loss": 0.1335, "step": 271 }, { "epoch": 0.5100796999531176, "grad_norm": 0.27051371335983276, "learning_rate": 1.866666666666667e-05, "loss": 0.1605, "step": 272 }, { "epoch": 0.5119549929676512, "grad_norm": 0.15731023252010345, "learning_rate": 1.8658869395711503e-05, "loss": 0.0682, "step": 273 }, { "epoch": 0.5138302859821847, "grad_norm": 0.20177946984767914, "learning_rate": 1.8651072124756337e-05, "loss": 0.1127, "step": 274 }, { "epoch": 0.5157055789967182, "grad_norm": 0.2223084568977356, "learning_rate": 1.864327485380117e-05, "loss": 0.1318, "step": 275 }, { "epoch": 0.5175808720112518, "grad_norm": 0.2421497255563736, "learning_rate": 1.8635477582846005e-05, "loss": 0.1372, "step": 276 }, { "epoch": 0.5194561650257853, "grad_norm": 0.3211030662059784, "learning_rate": 1.862768031189084e-05, "loss": 0.1556, "step": 277 }, { "epoch": 0.5213314580403188, "grad_norm": 0.2547368109226227, "learning_rate": 1.8619883040935674e-05, "loss": 0.1249, "step": 278 }, { "epoch": 0.5232067510548524, "grad_norm": 0.23231586813926697, "learning_rate": 1.8612085769980508e-05, "loss": 0.1412, "step": 279 }, { "epoch": 0.5250820440693859, "grad_norm": 0.21071861684322357, "learning_rate": 1.8604288499025342e-05, "loss": 0.105, "step": 280 }, { "epoch": 0.5269573370839193, "grad_norm": 0.22929051518440247, "learning_rate": 1.8596491228070176e-05, "loss": 0.11, "step": 281 }, { "epoch": 0.5288326300984529, "grad_norm": 0.22073444724082947, "learning_rate": 1.858869395711501e-05, "loss": 0.1031, "step": 282 }, { "epoch": 0.5307079231129864, "grad_norm": 0.20839901268482208, "learning_rate": 1.8580896686159845e-05, "loss": 0.099, "step": 283 }, { "epoch": 0.5325832161275199, "grad_norm": 0.2510814070701599, "learning_rate": 1.857309941520468e-05, "loss": 0.1614, "step": 284 }, { "epoch": 0.5344585091420534, "grad_norm": 0.23160187900066376, "learning_rate": 1.8565302144249517e-05, "loss": 0.1281, "step": 285 }, { "epoch": 0.536333802156587, "grad_norm": 0.3275841474533081, "learning_rate": 1.8557504873294347e-05, "loss": 0.0894, "step": 286 }, { "epoch": 0.5382090951711205, "grad_norm": 0.20695282518863678, "learning_rate": 1.854970760233918e-05, "loss": 0.0911, "step": 287 }, { "epoch": 0.540084388185654, "grad_norm": 0.19326826930046082, "learning_rate": 1.8541910331384016e-05, "loss": 0.1097, "step": 288 }, { "epoch": 0.5419596812001876, "grad_norm": 0.22065812349319458, "learning_rate": 1.853411306042885e-05, "loss": 0.1249, "step": 289 }, { "epoch": 0.543834974214721, "grad_norm": 0.3224574327468872, "learning_rate": 1.8526315789473684e-05, "loss": 0.1232, "step": 290 }, { "epoch": 0.5457102672292545, "grad_norm": 0.20936869084835052, "learning_rate": 1.851851851851852e-05, "loss": 0.1061, "step": 291 }, { "epoch": 0.5475855602437881, "grad_norm": 0.2805179953575134, "learning_rate": 1.8510721247563356e-05, "loss": 0.1176, "step": 292 }, { "epoch": 0.5494608532583216, "grad_norm": 0.2184048891067505, "learning_rate": 1.850292397660819e-05, "loss": 0.0922, "step": 293 }, { "epoch": 0.5513361462728551, "grad_norm": 0.2294335961341858, "learning_rate": 1.8495126705653024e-05, "loss": 0.1065, "step": 294 }, { "epoch": 0.5532114392873887, "grad_norm": 0.20929084718227386, "learning_rate": 1.848732943469786e-05, "loss": 0.1041, "step": 295 }, { "epoch": 0.5550867323019222, "grad_norm": 0.24733637273311615, "learning_rate": 1.847953216374269e-05, "loss": 0.1046, "step": 296 }, { "epoch": 0.5569620253164557, "grad_norm": 0.26208677887916565, "learning_rate": 1.8471734892787523e-05, "loss": 0.0933, "step": 297 }, { "epoch": 0.5588373183309893, "grad_norm": 0.20398394763469696, "learning_rate": 1.846393762183236e-05, "loss": 0.0967, "step": 298 }, { "epoch": 0.5607126113455227, "grad_norm": 0.2356574982404709, "learning_rate": 1.8456140350877195e-05, "loss": 0.1627, "step": 299 }, { "epoch": 0.5625879043600562, "grad_norm": 0.32398343086242676, "learning_rate": 1.844834307992203e-05, "loss": 0.1334, "step": 300 }, { "epoch": 0.5625879043600562, "eval_loss": 0.09722544252872467, "eval_runtime": 675.2264, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 300 }, { "epoch": 0.5644631973745898, "grad_norm": 0.25528204441070557, "learning_rate": 1.8440545808966864e-05, "loss": 0.1637, "step": 301 }, { "epoch": 0.5663384903891233, "grad_norm": 0.2690048813819885, "learning_rate": 1.8432748538011698e-05, "loss": 0.178, "step": 302 }, { "epoch": 0.5682137834036568, "grad_norm": 0.2270357310771942, "learning_rate": 1.8424951267056532e-05, "loss": 0.1252, "step": 303 }, { "epoch": 0.5700890764181903, "grad_norm": 0.3335007131099701, "learning_rate": 1.8417153996101366e-05, "loss": 0.1594, "step": 304 }, { "epoch": 0.5719643694327239, "grad_norm": 0.2951895594596863, "learning_rate": 1.84093567251462e-05, "loss": 0.1794, "step": 305 }, { "epoch": 0.5738396624472574, "grad_norm": 0.19075483083724976, "learning_rate": 1.8401559454191035e-05, "loss": 0.1237, "step": 306 }, { "epoch": 0.5757149554617909, "grad_norm": 0.25787994265556335, "learning_rate": 1.839376218323587e-05, "loss": 0.1151, "step": 307 }, { "epoch": 0.5775902484763245, "grad_norm": 0.20336200296878815, "learning_rate": 1.8385964912280703e-05, "loss": 0.0893, "step": 308 }, { "epoch": 0.5794655414908579, "grad_norm": 0.2570553123950958, "learning_rate": 1.8378167641325537e-05, "loss": 0.1197, "step": 309 }, { "epoch": 0.5813408345053914, "grad_norm": 0.24566973745822906, "learning_rate": 1.837037037037037e-05, "loss": 0.1383, "step": 310 }, { "epoch": 0.583216127519925, "grad_norm": 0.23258450627326965, "learning_rate": 1.8362573099415205e-05, "loss": 0.1182, "step": 311 }, { "epoch": 0.5850914205344585, "grad_norm": 0.2346324771642685, "learning_rate": 1.835477582846004e-05, "loss": 0.128, "step": 312 }, { "epoch": 0.586966713548992, "grad_norm": 0.21992027759552002, "learning_rate": 1.8346978557504874e-05, "loss": 0.0972, "step": 313 }, { "epoch": 0.5888420065635256, "grad_norm": 0.21227417886257172, "learning_rate": 1.833918128654971e-05, "loss": 0.1038, "step": 314 }, { "epoch": 0.5907172995780591, "grad_norm": 0.22464649379253387, "learning_rate": 1.8331384015594546e-05, "loss": 0.1452, "step": 315 }, { "epoch": 0.5925925925925926, "grad_norm": 0.22292515635490417, "learning_rate": 1.8323586744639376e-05, "loss": 0.1343, "step": 316 }, { "epoch": 0.5944678856071262, "grad_norm": 0.19903215765953064, "learning_rate": 1.831578947368421e-05, "loss": 0.144, "step": 317 }, { "epoch": 0.5963431786216596, "grad_norm": 0.327104389667511, "learning_rate": 1.8307992202729045e-05, "loss": 0.1041, "step": 318 }, { "epoch": 0.5982184716361931, "grad_norm": 0.25846344232559204, "learning_rate": 1.830019493177388e-05, "loss": 0.1504, "step": 319 }, { "epoch": 0.6000937646507267, "grad_norm": 0.23256655037403107, "learning_rate": 1.8292397660818713e-05, "loss": 0.1239, "step": 320 }, { "epoch": 0.6019690576652602, "grad_norm": 0.20890699326992035, "learning_rate": 1.828460038986355e-05, "loss": 0.112, "step": 321 }, { "epoch": 0.6038443506797937, "grad_norm": 0.23920990526676178, "learning_rate": 1.8276803118908385e-05, "loss": 0.0993, "step": 322 }, { "epoch": 0.6057196436943273, "grad_norm": 0.4082464575767517, "learning_rate": 1.826900584795322e-05, "loss": 0.1376, "step": 323 }, { "epoch": 0.6075949367088608, "grad_norm": 0.30590200424194336, "learning_rate": 1.8261208576998053e-05, "loss": 0.0958, "step": 324 }, { "epoch": 0.6094702297233943, "grad_norm": 0.2254190891981125, "learning_rate": 1.8253411306042888e-05, "loss": 0.1343, "step": 325 }, { "epoch": 0.6113455227379277, "grad_norm": 0.3132902979850769, "learning_rate": 1.824561403508772e-05, "loss": 0.1007, "step": 326 }, { "epoch": 0.6132208157524613, "grad_norm": 0.1695740818977356, "learning_rate": 1.8237816764132556e-05, "loss": 0.079, "step": 327 }, { "epoch": 0.6150961087669948, "grad_norm": 0.24086889624595642, "learning_rate": 1.823001949317739e-05, "loss": 0.132, "step": 328 }, { "epoch": 0.6169714017815283, "grad_norm": 0.30719810724258423, "learning_rate": 1.8222222222222224e-05, "loss": 0.1589, "step": 329 }, { "epoch": 0.6188466947960619, "grad_norm": 0.2731982171535492, "learning_rate": 1.821442495126706e-05, "loss": 0.1366, "step": 330 }, { "epoch": 0.6207219878105954, "grad_norm": 0.2092353105545044, "learning_rate": 1.8206627680311893e-05, "loss": 0.1234, "step": 331 }, { "epoch": 0.6225972808251289, "grad_norm": 0.21123865246772766, "learning_rate": 1.8198830409356727e-05, "loss": 0.129, "step": 332 }, { "epoch": 0.6244725738396625, "grad_norm": 0.3408866226673126, "learning_rate": 1.819103313840156e-05, "loss": 0.1415, "step": 333 }, { "epoch": 0.626347866854196, "grad_norm": 0.22271405160427094, "learning_rate": 1.8183235867446395e-05, "loss": 0.0728, "step": 334 }, { "epoch": 0.6282231598687295, "grad_norm": 0.2094489336013794, "learning_rate": 1.817543859649123e-05, "loss": 0.1083, "step": 335 }, { "epoch": 0.630098452883263, "grad_norm": 0.20245423913002014, "learning_rate": 1.8167641325536064e-05, "loss": 0.095, "step": 336 }, { "epoch": 0.6319737458977965, "grad_norm": 0.23320430517196655, "learning_rate": 1.8159844054580898e-05, "loss": 0.1613, "step": 337 }, { "epoch": 0.63384903891233, "grad_norm": 0.2643381655216217, "learning_rate": 1.8152046783625732e-05, "loss": 0.1424, "step": 338 }, { "epoch": 0.6357243319268636, "grad_norm": 0.2379055917263031, "learning_rate": 1.8144249512670566e-05, "loss": 0.1042, "step": 339 }, { "epoch": 0.6375996249413971, "grad_norm": 0.21585410833358765, "learning_rate": 1.81364522417154e-05, "loss": 0.1092, "step": 340 }, { "epoch": 0.6394749179559306, "grad_norm": 0.22150596976280212, "learning_rate": 1.8128654970760235e-05, "loss": 0.1266, "step": 341 }, { "epoch": 0.6413502109704642, "grad_norm": 0.3069721758365631, "learning_rate": 1.812085769980507e-05, "loss": 0.1232, "step": 342 }, { "epoch": 0.6432255039849977, "grad_norm": 0.22880715131759644, "learning_rate": 1.8113060428849903e-05, "loss": 0.1376, "step": 343 }, { "epoch": 0.6451007969995312, "grad_norm": 0.22147296369075775, "learning_rate": 1.810526315789474e-05, "loss": 0.0861, "step": 344 }, { "epoch": 0.6469760900140648, "grad_norm": 0.18794487416744232, "learning_rate": 1.8097465886939575e-05, "loss": 0.1047, "step": 345 }, { "epoch": 0.6488513830285982, "grad_norm": 0.19053122401237488, "learning_rate": 1.8089668615984406e-05, "loss": 0.0852, "step": 346 }, { "epoch": 0.6507266760431317, "grad_norm": 0.2369307428598404, "learning_rate": 1.808187134502924e-05, "loss": 0.1077, "step": 347 }, { "epoch": 0.6526019690576652, "grad_norm": 0.2687581479549408, "learning_rate": 1.8074074074074074e-05, "loss": 0.1187, "step": 348 }, { "epoch": 0.6544772620721988, "grad_norm": 0.32420602440834045, "learning_rate": 1.8066276803118908e-05, "loss": 0.1453, "step": 349 }, { "epoch": 0.6563525550867323, "grad_norm": 0.2944568693637848, "learning_rate": 1.8058479532163746e-05, "loss": 0.0945, "step": 350 }, { "epoch": 0.6563525550867323, "eval_loss": 0.08979390561580658, "eval_runtime": 675.9452, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 350 }, { "epoch": 0.6582278481012658, "grad_norm": 0.21131469309329987, "learning_rate": 1.805068226120858e-05, "loss": 0.0927, "step": 351 }, { "epoch": 0.6601031411157994, "grad_norm": 0.2300959676504135, "learning_rate": 1.8042884990253414e-05, "loss": 0.1187, "step": 352 }, { "epoch": 0.6619784341303329, "grad_norm": 0.21290843188762665, "learning_rate": 1.8035087719298248e-05, "loss": 0.0714, "step": 353 }, { "epoch": 0.6638537271448663, "grad_norm": 0.22140546143054962, "learning_rate": 1.8027290448343082e-05, "loss": 0.0918, "step": 354 }, { "epoch": 0.6657290201593999, "grad_norm": 0.21354801952838898, "learning_rate": 1.8019493177387917e-05, "loss": 0.081, "step": 355 }, { "epoch": 0.6676043131739334, "grad_norm": 0.25323230028152466, "learning_rate": 1.8011695906432747e-05, "loss": 0.1041, "step": 356 }, { "epoch": 0.6694796061884669, "grad_norm": 0.2276594042778015, "learning_rate": 1.8003898635477585e-05, "loss": 0.0924, "step": 357 }, { "epoch": 0.6713548992030005, "grad_norm": 0.35995563864707947, "learning_rate": 1.799610136452242e-05, "loss": 0.1373, "step": 358 }, { "epoch": 0.673230192217534, "grad_norm": 0.2169509083032608, "learning_rate": 1.7988304093567253e-05, "loss": 0.1004, "step": 359 }, { "epoch": 0.6751054852320675, "grad_norm": 0.221743643283844, "learning_rate": 1.7980506822612088e-05, "loss": 0.0883, "step": 360 }, { "epoch": 0.6769807782466011, "grad_norm": 0.1859385073184967, "learning_rate": 1.7972709551656922e-05, "loss": 0.0863, "step": 361 }, { "epoch": 0.6788560712611346, "grad_norm": 0.2101774513721466, "learning_rate": 1.7964912280701756e-05, "loss": 0.105, "step": 362 }, { "epoch": 0.680731364275668, "grad_norm": 0.2311290204524994, "learning_rate": 1.795711500974659e-05, "loss": 0.096, "step": 363 }, { "epoch": 0.6826066572902016, "grad_norm": 0.2699367105960846, "learning_rate": 1.7949317738791424e-05, "loss": 0.1301, "step": 364 }, { "epoch": 0.6844819503047351, "grad_norm": 0.31696122884750366, "learning_rate": 1.794152046783626e-05, "loss": 0.1594, "step": 365 }, { "epoch": 0.6863572433192686, "grad_norm": 0.31495556235313416, "learning_rate": 1.7933723196881093e-05, "loss": 0.094, "step": 366 }, { "epoch": 0.6882325363338021, "grad_norm": 0.2610797882080078, "learning_rate": 1.7925925925925927e-05, "loss": 0.1289, "step": 367 }, { "epoch": 0.6901078293483357, "grad_norm": 0.2139683961868286, "learning_rate": 1.791812865497076e-05, "loss": 0.1032, "step": 368 }, { "epoch": 0.6919831223628692, "grad_norm": 0.2474675327539444, "learning_rate": 1.7910331384015595e-05, "loss": 0.088, "step": 369 }, { "epoch": 0.6938584153774027, "grad_norm": 0.22533686459064484, "learning_rate": 1.790253411306043e-05, "loss": 0.0933, "step": 370 }, { "epoch": 0.6957337083919363, "grad_norm": 0.22728174924850464, "learning_rate": 1.7894736842105264e-05, "loss": 0.1225, "step": 371 }, { "epoch": 0.6976090014064698, "grad_norm": 0.23081286251544952, "learning_rate": 1.7886939571150098e-05, "loss": 0.1099, "step": 372 }, { "epoch": 0.6994842944210032, "grad_norm": 0.2340676188468933, "learning_rate": 1.7879142300194932e-05, "loss": 0.1011, "step": 373 }, { "epoch": 0.7013595874355368, "grad_norm": 0.2684282064437866, "learning_rate": 1.787134502923977e-05, "loss": 0.1667, "step": 374 }, { "epoch": 0.7032348804500703, "grad_norm": 0.19310195744037628, "learning_rate": 1.7863547758284604e-05, "loss": 0.0958, "step": 375 }, { "epoch": 0.7051101734646038, "grad_norm": 0.19192589819431305, "learning_rate": 1.7855750487329435e-05, "loss": 0.0769, "step": 376 }, { "epoch": 0.7069854664791374, "grad_norm": 0.25004175305366516, "learning_rate": 1.784795321637427e-05, "loss": 0.1112, "step": 377 }, { "epoch": 0.7088607594936709, "grad_norm": 0.2875761389732361, "learning_rate": 1.7840155945419103e-05, "loss": 0.1425, "step": 378 }, { "epoch": 0.7107360525082044, "grad_norm": 0.2135677933692932, "learning_rate": 1.7832358674463937e-05, "loss": 0.0945, "step": 379 }, { "epoch": 0.712611345522738, "grad_norm": 0.38285496830940247, "learning_rate": 1.7824561403508775e-05, "loss": 0.0824, "step": 380 }, { "epoch": 0.7144866385372715, "grad_norm": 0.26780804991722107, "learning_rate": 1.781676413255361e-05, "loss": 0.1429, "step": 381 }, { "epoch": 0.7163619315518049, "grad_norm": 0.2771541178226471, "learning_rate": 1.7808966861598443e-05, "loss": 0.1449, "step": 382 }, { "epoch": 0.7182372245663385, "grad_norm": 0.23967161774635315, "learning_rate": 1.7801169590643277e-05, "loss": 0.109, "step": 383 }, { "epoch": 0.720112517580872, "grad_norm": 0.2565579414367676, "learning_rate": 1.779337231968811e-05, "loss": 0.143, "step": 384 }, { "epoch": 0.7219878105954055, "grad_norm": 0.23335972428321838, "learning_rate": 1.7785575048732946e-05, "loss": 0.0836, "step": 385 }, { "epoch": 0.7238631036099391, "grad_norm": 0.21833084523677826, "learning_rate": 1.7777777777777777e-05, "loss": 0.0927, "step": 386 }, { "epoch": 0.7257383966244726, "grad_norm": 0.29433101415634155, "learning_rate": 1.7769980506822614e-05, "loss": 0.102, "step": 387 }, { "epoch": 0.7276136896390061, "grad_norm": 0.23135854303836823, "learning_rate": 1.776218323586745e-05, "loss": 0.1139, "step": 388 }, { "epoch": 0.7294889826535396, "grad_norm": 0.24812403321266174, "learning_rate": 1.7754385964912283e-05, "loss": 0.1098, "step": 389 }, { "epoch": 0.7313642756680732, "grad_norm": 0.2173132598400116, "learning_rate": 1.7746588693957117e-05, "loss": 0.1089, "step": 390 }, { "epoch": 0.7332395686826066, "grad_norm": 0.24950866401195526, "learning_rate": 1.773879142300195e-05, "loss": 0.0805, "step": 391 }, { "epoch": 0.7351148616971401, "grad_norm": 0.29122406244277954, "learning_rate": 1.7730994152046785e-05, "loss": 0.1481, "step": 392 }, { "epoch": 0.7369901547116737, "grad_norm": 0.2415425032377243, "learning_rate": 1.772319688109162e-05, "loss": 0.1364, "step": 393 }, { "epoch": 0.7388654477262072, "grad_norm": 0.2071705311536789, "learning_rate": 1.7715399610136454e-05, "loss": 0.1227, "step": 394 }, { "epoch": 0.7407407407407407, "grad_norm": 0.20239049196243286, "learning_rate": 1.7707602339181288e-05, "loss": 0.0864, "step": 395 }, { "epoch": 0.7426160337552743, "grad_norm": 0.25537100434303284, "learning_rate": 1.7699805068226122e-05, "loss": 0.1278, "step": 396 }, { "epoch": 0.7444913267698078, "grad_norm": 0.30780285596847534, "learning_rate": 1.7692007797270956e-05, "loss": 0.1586, "step": 397 }, { "epoch": 0.7463666197843413, "grad_norm": 0.36921605467796326, "learning_rate": 1.768421052631579e-05, "loss": 0.1244, "step": 398 }, { "epoch": 0.7482419127988749, "grad_norm": 0.2948741614818573, "learning_rate": 1.7676413255360624e-05, "loss": 0.1095, "step": 399 }, { "epoch": 0.7501172058134083, "grad_norm": 0.23762205243110657, "learning_rate": 1.766861598440546e-05, "loss": 0.1245, "step": 400 }, { "epoch": 0.7501172058134083, "eval_loss": 0.08483699709177017, "eval_runtime": 675.3388, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 400 }, { "epoch": 0.7519924988279418, "grad_norm": 0.24527454376220703, "learning_rate": 1.7660818713450293e-05, "loss": 0.1122, "step": 401 }, { "epoch": 0.7538677918424754, "grad_norm": 0.22449228167533875, "learning_rate": 1.7653021442495127e-05, "loss": 0.1044, "step": 402 }, { "epoch": 0.7557430848570089, "grad_norm": 0.285155713558197, "learning_rate": 1.7645224171539965e-05, "loss": 0.1141, "step": 403 }, { "epoch": 0.7576183778715424, "grad_norm": 0.247589111328125, "learning_rate": 1.76374269005848e-05, "loss": 0.1363, "step": 404 }, { "epoch": 0.759493670886076, "grad_norm": 0.26343780755996704, "learning_rate": 1.7629629629629633e-05, "loss": 0.1198, "step": 405 }, { "epoch": 0.7613689639006095, "grad_norm": 0.2533006966114044, "learning_rate": 1.7621832358674464e-05, "loss": 0.1137, "step": 406 }, { "epoch": 0.763244256915143, "grad_norm": 0.22146250307559967, "learning_rate": 1.7614035087719298e-05, "loss": 0.1152, "step": 407 }, { "epoch": 0.7651195499296765, "grad_norm": 0.3412543535232544, "learning_rate": 1.7606237816764132e-05, "loss": 0.1838, "step": 408 }, { "epoch": 0.76699484294421, "grad_norm": 0.2710413634777069, "learning_rate": 1.7598440545808966e-05, "loss": 0.1088, "step": 409 }, { "epoch": 0.7688701359587435, "grad_norm": 0.2602677047252655, "learning_rate": 1.7590643274853804e-05, "loss": 0.135, "step": 410 }, { "epoch": 0.770745428973277, "grad_norm": 0.22832591831684113, "learning_rate": 1.7582846003898638e-05, "loss": 0.099, "step": 411 }, { "epoch": 0.7726207219878106, "grad_norm": 0.20037035644054413, "learning_rate": 1.7575048732943472e-05, "loss": 0.0988, "step": 412 }, { "epoch": 0.7744960150023441, "grad_norm": 0.21016646921634674, "learning_rate": 1.7567251461988307e-05, "loss": 0.0859, "step": 413 }, { "epoch": 0.7763713080168776, "grad_norm": 0.25442710518836975, "learning_rate": 1.755945419103314e-05, "loss": 0.1192, "step": 414 }, { "epoch": 0.7782466010314112, "grad_norm": 0.2266324907541275, "learning_rate": 1.7551656920077975e-05, "loss": 0.106, "step": 415 }, { "epoch": 0.7801218940459447, "grad_norm": 0.24464181065559387, "learning_rate": 1.754385964912281e-05, "loss": 0.116, "step": 416 }, { "epoch": 0.7819971870604782, "grad_norm": 0.19099155068397522, "learning_rate": 1.7536062378167643e-05, "loss": 0.0759, "step": 417 }, { "epoch": 0.7838724800750118, "grad_norm": 0.23417051136493683, "learning_rate": 1.7528265107212477e-05, "loss": 0.0815, "step": 418 }, { "epoch": 0.7857477730895452, "grad_norm": 0.3102501630783081, "learning_rate": 1.752046783625731e-05, "loss": 0.1205, "step": 419 }, { "epoch": 0.7876230661040787, "grad_norm": 0.27138587832450867, "learning_rate": 1.7512670565302146e-05, "loss": 0.0957, "step": 420 }, { "epoch": 0.7894983591186123, "grad_norm": 0.3055587708950043, "learning_rate": 1.750487329434698e-05, "loss": 0.1181, "step": 421 }, { "epoch": 0.7913736521331458, "grad_norm": 0.24075333774089813, "learning_rate": 1.7497076023391814e-05, "loss": 0.1192, "step": 422 }, { "epoch": 0.7932489451476793, "grad_norm": 0.1806018054485321, "learning_rate": 1.748927875243665e-05, "loss": 0.0884, "step": 423 }, { "epoch": 0.7951242381622129, "grad_norm": 0.2315697968006134, "learning_rate": 1.7481481481481483e-05, "loss": 0.0696, "step": 424 }, { "epoch": 0.7969995311767464, "grad_norm": 0.23765219748020172, "learning_rate": 1.7473684210526317e-05, "loss": 0.1186, "step": 425 }, { "epoch": 0.7988748241912799, "grad_norm": 0.18785423040390015, "learning_rate": 1.746588693957115e-05, "loss": 0.0727, "step": 426 }, { "epoch": 0.8007501172058135, "grad_norm": 0.1897609829902649, "learning_rate": 1.7458089668615985e-05, "loss": 0.0926, "step": 427 }, { "epoch": 0.802625410220347, "grad_norm": 0.28181466460227966, "learning_rate": 1.745029239766082e-05, "loss": 0.1083, "step": 428 }, { "epoch": 0.8045007032348804, "grad_norm": 0.20873577892780304, "learning_rate": 1.7442495126705654e-05, "loss": 0.105, "step": 429 }, { "epoch": 0.8063759962494139, "grad_norm": 0.23470522463321686, "learning_rate": 1.7434697855750488e-05, "loss": 0.1103, "step": 430 }, { "epoch": 0.8082512892639475, "grad_norm": 0.26746684312820435, "learning_rate": 1.7426900584795322e-05, "loss": 0.1322, "step": 431 }, { "epoch": 0.810126582278481, "grad_norm": 0.2638936936855316, "learning_rate": 1.7419103313840156e-05, "loss": 0.1228, "step": 432 }, { "epoch": 0.8120018752930145, "grad_norm": 0.237047016620636, "learning_rate": 1.7411306042884994e-05, "loss": 0.1042, "step": 433 }, { "epoch": 0.8138771683075481, "grad_norm": 0.1885695904493332, "learning_rate": 1.7403508771929828e-05, "loss": 0.0723, "step": 434 }, { "epoch": 0.8157524613220816, "grad_norm": 0.21713057160377502, "learning_rate": 1.7395711500974662e-05, "loss": 0.0796, "step": 435 }, { "epoch": 0.817627754336615, "grad_norm": 0.3199704885482788, "learning_rate": 1.7387914230019493e-05, "loss": 0.1588, "step": 436 }, { "epoch": 0.8195030473511487, "grad_norm": 0.3154192566871643, "learning_rate": 1.7380116959064327e-05, "loss": 0.1365, "step": 437 }, { "epoch": 0.8213783403656821, "grad_norm": 0.2530040144920349, "learning_rate": 1.737231968810916e-05, "loss": 0.0948, "step": 438 }, { "epoch": 0.8232536333802156, "grad_norm": 0.23530352115631104, "learning_rate": 1.7364522417154e-05, "loss": 0.119, "step": 439 }, { "epoch": 0.8251289263947492, "grad_norm": 0.2744591534137726, "learning_rate": 1.7356725146198833e-05, "loss": 0.1177, "step": 440 }, { "epoch": 0.8270042194092827, "grad_norm": 0.21735382080078125, "learning_rate": 1.7348927875243667e-05, "loss": 0.0836, "step": 441 }, { "epoch": 0.8288795124238162, "grad_norm": 0.21691051125526428, "learning_rate": 1.73411306042885e-05, "loss": 0.1232, "step": 442 }, { "epoch": 0.8307548054383498, "grad_norm": 0.26144662499427795, "learning_rate": 1.7333333333333336e-05, "loss": 0.115, "step": 443 }, { "epoch": 0.8326300984528833, "grad_norm": 0.2164791226387024, "learning_rate": 1.732553606237817e-05, "loss": 0.1146, "step": 444 }, { "epoch": 0.8345053914674168, "grad_norm": 0.27034950256347656, "learning_rate": 1.7317738791423004e-05, "loss": 0.0644, "step": 445 }, { "epoch": 0.8363806844819504, "grad_norm": 0.19503502547740936, "learning_rate": 1.7309941520467838e-05, "loss": 0.0688, "step": 446 }, { "epoch": 0.8382559774964838, "grad_norm": 0.23584571480751038, "learning_rate": 1.7302144249512672e-05, "loss": 0.1256, "step": 447 }, { "epoch": 0.8401312705110173, "grad_norm": 0.2520376145839691, "learning_rate": 1.7294346978557507e-05, "loss": 0.1601, "step": 448 }, { "epoch": 0.8420065635255509, "grad_norm": 0.24150918424129486, "learning_rate": 1.728654970760234e-05, "loss": 0.1155, "step": 449 }, { "epoch": 0.8438818565400844, "grad_norm": 0.289753258228302, "learning_rate": 1.7278752436647175e-05, "loss": 0.1283, "step": 450 }, { "epoch": 0.8438818565400844, "eval_loss": 0.08511354774236679, "eval_runtime": 675.7661, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 450 }, { "epoch": 0.8457571495546179, "grad_norm": 0.27476435899734497, "learning_rate": 1.727095516569201e-05, "loss": 0.0989, "step": 451 }, { "epoch": 0.8476324425691514, "grad_norm": 0.20759902894496918, "learning_rate": 1.7263157894736843e-05, "loss": 0.0963, "step": 452 }, { "epoch": 0.849507735583685, "grad_norm": 0.3410794138908386, "learning_rate": 1.7255360623781678e-05, "loss": 0.1675, "step": 453 }, { "epoch": 0.8513830285982185, "grad_norm": 0.2817666828632355, "learning_rate": 1.7247563352826512e-05, "loss": 0.1458, "step": 454 }, { "epoch": 0.853258321612752, "grad_norm": 0.2924879491329193, "learning_rate": 1.7239766081871346e-05, "loss": 0.1393, "step": 455 }, { "epoch": 0.8551336146272855, "grad_norm": 0.3008408844470978, "learning_rate": 1.723196881091618e-05, "loss": 0.1051, "step": 456 }, { "epoch": 0.857008907641819, "grad_norm": 0.3666384518146515, "learning_rate": 1.7224171539961014e-05, "loss": 0.1031, "step": 457 }, { "epoch": 0.8588842006563525, "grad_norm": 0.2442241907119751, "learning_rate": 1.721637426900585e-05, "loss": 0.1333, "step": 458 }, { "epoch": 0.8607594936708861, "grad_norm": 0.3462783992290497, "learning_rate": 1.7208576998050683e-05, "loss": 0.115, "step": 459 }, { "epoch": 0.8626347866854196, "grad_norm": 0.25634944438934326, "learning_rate": 1.7200779727095517e-05, "loss": 0.0867, "step": 460 }, { "epoch": 0.8645100796999531, "grad_norm": 0.30906710028648376, "learning_rate": 1.719298245614035e-05, "loss": 0.1405, "step": 461 }, { "epoch": 0.8663853727144867, "grad_norm": 0.2611568570137024, "learning_rate": 1.7185185185185185e-05, "loss": 0.1071, "step": 462 }, { "epoch": 0.8682606657290202, "grad_norm": 0.2521813213825226, "learning_rate": 1.7177387914230023e-05, "loss": 0.0707, "step": 463 }, { "epoch": 0.8701359587435537, "grad_norm": 0.25580745935440063, "learning_rate": 1.7169590643274857e-05, "loss": 0.0854, "step": 464 }, { "epoch": 0.8720112517580872, "grad_norm": 0.2787662148475647, "learning_rate": 1.716179337231969e-05, "loss": 0.1018, "step": 465 }, { "epoch": 0.8738865447726207, "grad_norm": 0.21670454740524292, "learning_rate": 1.7153996101364522e-05, "loss": 0.1214, "step": 466 }, { "epoch": 0.8757618377871542, "grad_norm": 0.28645867109298706, "learning_rate": 1.7146198830409356e-05, "loss": 0.0877, "step": 467 }, { "epoch": 0.8776371308016878, "grad_norm": 0.20811931788921356, "learning_rate": 1.713840155945419e-05, "loss": 0.0846, "step": 468 }, { "epoch": 0.8795124238162213, "grad_norm": 0.301693320274353, "learning_rate": 1.7130604288499028e-05, "loss": 0.1215, "step": 469 }, { "epoch": 0.8813877168307548, "grad_norm": 0.21408531069755554, "learning_rate": 1.7122807017543862e-05, "loss": 0.0929, "step": 470 }, { "epoch": 0.8832630098452883, "grad_norm": 0.20539730787277222, "learning_rate": 1.7115009746588696e-05, "loss": 0.0689, "step": 471 }, { "epoch": 0.8851383028598219, "grad_norm": 0.2104424089193344, "learning_rate": 1.710721247563353e-05, "loss": 0.0997, "step": 472 }, { "epoch": 0.8870135958743554, "grad_norm": 0.7082309722900391, "learning_rate": 1.7099415204678365e-05, "loss": 0.0945, "step": 473 }, { "epoch": 0.8888888888888888, "grad_norm": 0.19709636270999908, "learning_rate": 1.70916179337232e-05, "loss": 0.0781, "step": 474 }, { "epoch": 0.8907641819034224, "grad_norm": 0.26817139983177185, "learning_rate": 1.7083820662768033e-05, "loss": 0.0823, "step": 475 }, { "epoch": 0.8926394749179559, "grad_norm": 0.26233142614364624, "learning_rate": 1.7076023391812867e-05, "loss": 0.0938, "step": 476 }, { "epoch": 0.8945147679324894, "grad_norm": 0.3621986508369446, "learning_rate": 1.70682261208577e-05, "loss": 0.1134, "step": 477 }, { "epoch": 0.896390060947023, "grad_norm": 0.2838749289512634, "learning_rate": 1.7060428849902536e-05, "loss": 0.1347, "step": 478 }, { "epoch": 0.8982653539615565, "grad_norm": 0.19387036561965942, "learning_rate": 1.705263157894737e-05, "loss": 0.065, "step": 479 }, { "epoch": 0.90014064697609, "grad_norm": 0.20621807873249054, "learning_rate": 1.7044834307992204e-05, "loss": 0.0731, "step": 480 }, { "epoch": 0.9020159399906236, "grad_norm": 0.2323528677225113, "learning_rate": 1.7037037037037038e-05, "loss": 0.0931, "step": 481 }, { "epoch": 0.9038912330051571, "grad_norm": 0.20820242166519165, "learning_rate": 1.7029239766081872e-05, "loss": 0.0853, "step": 482 }, { "epoch": 0.9057665260196905, "grad_norm": 0.27471327781677246, "learning_rate": 1.7021442495126707e-05, "loss": 0.1377, "step": 483 }, { "epoch": 0.9076418190342241, "grad_norm": 0.22734041512012482, "learning_rate": 1.701364522417154e-05, "loss": 0.1169, "step": 484 }, { "epoch": 0.9095171120487576, "grad_norm": 0.25026533007621765, "learning_rate": 1.7005847953216375e-05, "loss": 0.0976, "step": 485 }, { "epoch": 0.9113924050632911, "grad_norm": 0.24165751039981842, "learning_rate": 1.699805068226121e-05, "loss": 0.1008, "step": 486 }, { "epoch": 0.9132676980778247, "grad_norm": 0.24641317129135132, "learning_rate": 1.6990253411306043e-05, "loss": 0.1105, "step": 487 }, { "epoch": 0.9151429910923582, "grad_norm": 0.18702614307403564, "learning_rate": 1.6982456140350878e-05, "loss": 0.0711, "step": 488 }, { "epoch": 0.9170182841068917, "grad_norm": 0.31801968812942505, "learning_rate": 1.6974658869395712e-05, "loss": 0.1112, "step": 489 }, { "epoch": 0.9188935771214253, "grad_norm": 0.311109721660614, "learning_rate": 1.6966861598440546e-05, "loss": 0.128, "step": 490 }, { "epoch": 0.9207688701359588, "grad_norm": 0.24190711975097656, "learning_rate": 1.695906432748538e-05, "loss": 0.0873, "step": 491 }, { "epoch": 0.9226441631504922, "grad_norm": 0.32772934436798096, "learning_rate": 1.6951267056530218e-05, "loss": 0.0979, "step": 492 }, { "epoch": 0.9245194561650257, "grad_norm": 0.23775866627693176, "learning_rate": 1.6943469785575052e-05, "loss": 0.096, "step": 493 }, { "epoch": 0.9263947491795593, "grad_norm": 0.38035061955451965, "learning_rate": 1.6935672514619886e-05, "loss": 0.1299, "step": 494 }, { "epoch": 0.9282700421940928, "grad_norm": 0.25195756554603577, "learning_rate": 1.692787524366472e-05, "loss": 0.0958, "step": 495 }, { "epoch": 0.9301453352086263, "grad_norm": 0.24769015610218048, "learning_rate": 1.692007797270955e-05, "loss": 0.1028, "step": 496 }, { "epoch": 0.9320206282231599, "grad_norm": 0.2188967764377594, "learning_rate": 1.6912280701754385e-05, "loss": 0.0731, "step": 497 }, { "epoch": 0.9338959212376934, "grad_norm": 0.2325516939163208, "learning_rate": 1.690448343079922e-05, "loss": 0.1144, "step": 498 }, { "epoch": 0.9357712142522269, "grad_norm": 0.30458924174308777, "learning_rate": 1.6896686159844057e-05, "loss": 0.0813, "step": 499 }, { "epoch": 0.9376465072667605, "grad_norm": 0.22726097702980042, "learning_rate": 1.688888888888889e-05, "loss": 0.0839, "step": 500 }, { "epoch": 0.9376465072667605, "eval_loss": 0.08535026758909225, "eval_runtime": 676.335, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 500 }, { "epoch": 0.939521800281294, "grad_norm": 0.2691619098186493, "learning_rate": 1.6881091617933726e-05, "loss": 0.1266, "step": 501 }, { "epoch": 0.9413970932958274, "grad_norm": 0.28599363565444946, "learning_rate": 1.687329434697856e-05, "loss": 0.1125, "step": 502 }, { "epoch": 0.943272386310361, "grad_norm": 0.27564743161201477, "learning_rate": 1.6865497076023394e-05, "loss": 0.1029, "step": 503 }, { "epoch": 0.9451476793248945, "grad_norm": 0.23417291045188904, "learning_rate": 1.6857699805068228e-05, "loss": 0.087, "step": 504 }, { "epoch": 0.947022972339428, "grad_norm": 0.2351316511631012, "learning_rate": 1.6849902534113062e-05, "loss": 0.1024, "step": 505 }, { "epoch": 0.9488982653539616, "grad_norm": 0.2007894515991211, "learning_rate": 1.6842105263157896e-05, "loss": 0.0752, "step": 506 }, { "epoch": 0.9507735583684951, "grad_norm": 0.2415047585964203, "learning_rate": 1.683430799220273e-05, "loss": 0.0912, "step": 507 }, { "epoch": 0.9526488513830286, "grad_norm": 0.25994813442230225, "learning_rate": 1.6826510721247565e-05, "loss": 0.1331, "step": 508 }, { "epoch": 0.9545241443975622, "grad_norm": 0.2513566315174103, "learning_rate": 1.68187134502924e-05, "loss": 0.1031, "step": 509 }, { "epoch": 0.9563994374120957, "grad_norm": 0.2094557136297226, "learning_rate": 1.6810916179337233e-05, "loss": 0.0736, "step": 510 }, { "epoch": 0.9582747304266291, "grad_norm": 0.2421177178621292, "learning_rate": 1.6803118908382067e-05, "loss": 0.1123, "step": 511 }, { "epoch": 0.9601500234411627, "grad_norm": 0.2969476580619812, "learning_rate": 1.67953216374269e-05, "loss": 0.1175, "step": 512 }, { "epoch": 0.9620253164556962, "grad_norm": 0.3147030770778656, "learning_rate": 1.6787524366471736e-05, "loss": 0.1647, "step": 513 }, { "epoch": 0.9639006094702297, "grad_norm": 0.2841101586818695, "learning_rate": 1.677972709551657e-05, "loss": 0.1238, "step": 514 }, { "epoch": 0.9657759024847632, "grad_norm": 0.28984513878822327, "learning_rate": 1.6771929824561408e-05, "loss": 0.1257, "step": 515 }, { "epoch": 0.9676511954992968, "grad_norm": 0.32147666811943054, "learning_rate": 1.676413255360624e-05, "loss": 0.1446, "step": 516 }, { "epoch": 0.9695264885138303, "grad_norm": 0.17480434477329254, "learning_rate": 1.6756335282651073e-05, "loss": 0.079, "step": 517 }, { "epoch": 0.9714017815283638, "grad_norm": 0.2966823875904083, "learning_rate": 1.6748538011695907e-05, "loss": 0.1092, "step": 518 }, { "epoch": 0.9732770745428974, "grad_norm": 0.24680288136005402, "learning_rate": 1.674074074074074e-05, "loss": 0.0885, "step": 519 }, { "epoch": 0.9751523675574308, "grad_norm": 0.23044931888580322, "learning_rate": 1.6732943469785575e-05, "loss": 0.1615, "step": 520 }, { "epoch": 0.9770276605719643, "grad_norm": 0.2578699588775635, "learning_rate": 1.672514619883041e-05, "loss": 0.1323, "step": 521 }, { "epoch": 0.9789029535864979, "grad_norm": 0.2969329059123993, "learning_rate": 1.6717348927875247e-05, "loss": 0.1037, "step": 522 }, { "epoch": 0.9807782466010314, "grad_norm": 0.2269773781299591, "learning_rate": 1.670955165692008e-05, "loss": 0.1047, "step": 523 }, { "epoch": 0.9826535396155649, "grad_norm": 0.26680633425712585, "learning_rate": 1.6701754385964915e-05, "loss": 0.1249, "step": 524 }, { "epoch": 0.9845288326300985, "grad_norm": 0.24191637337207794, "learning_rate": 1.669395711500975e-05, "loss": 0.0922, "step": 525 }, { "epoch": 0.986404125644632, "grad_norm": 0.2023610770702362, "learning_rate": 1.668615984405458e-05, "loss": 0.0733, "step": 526 }, { "epoch": 0.9882794186591655, "grad_norm": 0.34420913457870483, "learning_rate": 1.6678362573099414e-05, "loss": 0.0859, "step": 527 }, { "epoch": 0.9901547116736991, "grad_norm": 0.29441869258880615, "learning_rate": 1.6670565302144252e-05, "loss": 0.1199, "step": 528 }, { "epoch": 0.9920300046882325, "grad_norm": 0.20998947322368622, "learning_rate": 1.6662768031189086e-05, "loss": 0.0767, "step": 529 }, { "epoch": 0.993905297702766, "grad_norm": 0.26571086049079895, "learning_rate": 1.665497076023392e-05, "loss": 0.1022, "step": 530 }, { "epoch": 0.9957805907172996, "grad_norm": 0.24049051105976105, "learning_rate": 1.6647173489278755e-05, "loss": 0.1158, "step": 531 }, { "epoch": 0.9976558837318331, "grad_norm": 0.3279871940612793, "learning_rate": 1.663937621832359e-05, "loss": 0.0895, "step": 532 }, { "epoch": 0.9995311767463666, "grad_norm": 0.19162967801094055, "learning_rate": 1.6631578947368423e-05, "loss": 0.0906, "step": 533 }, { "epoch": 1.0, "grad_norm": 0.5646827816963196, "learning_rate": 1.6623781676413257e-05, "loss": 0.1056, "step": 534 }, { "epoch": 1.0018752930145336, "grad_norm": 0.22089308500289917, "learning_rate": 1.661598440545809e-05, "loss": 0.081, "step": 535 }, { "epoch": 1.003750586029067, "grad_norm": 0.20040877163410187, "learning_rate": 1.6608187134502926e-05, "loss": 0.0746, "step": 536 }, { "epoch": 1.0056258790436006, "grad_norm": 0.23172321915626526, "learning_rate": 1.660038986354776e-05, "loss": 0.1107, "step": 537 }, { "epoch": 1.0075011720581342, "grad_norm": 0.22388094663619995, "learning_rate": 1.6592592592592594e-05, "loss": 0.1188, "step": 538 }, { "epoch": 1.0093764650726675, "grad_norm": 0.20831793546676636, "learning_rate": 1.6584795321637428e-05, "loss": 0.0724, "step": 539 }, { "epoch": 1.0112517580872011, "grad_norm": 0.2914353311061859, "learning_rate": 1.6576998050682262e-05, "loss": 0.1019, "step": 540 }, { "epoch": 1.0131270511017347, "grad_norm": 0.2762034833431244, "learning_rate": 1.6569200779727097e-05, "loss": 0.095, "step": 541 }, { "epoch": 1.015002344116268, "grad_norm": 0.22479356825351715, "learning_rate": 1.656140350877193e-05, "loss": 0.0883, "step": 542 }, { "epoch": 1.0168776371308017, "grad_norm": 0.18842089176177979, "learning_rate": 1.6553606237816765e-05, "loss": 0.0851, "step": 543 }, { "epoch": 1.0187529301453353, "grad_norm": 0.17845085263252258, "learning_rate": 1.65458089668616e-05, "loss": 0.0649, "step": 544 }, { "epoch": 1.0206282231598687, "grad_norm": 0.22942480444908142, "learning_rate": 1.6538011695906437e-05, "loss": 0.0968, "step": 545 }, { "epoch": 1.0225035161744023, "grad_norm": 0.3065761625766754, "learning_rate": 1.6530214424951268e-05, "loss": 0.098, "step": 546 }, { "epoch": 1.0243788091889359, "grad_norm": 0.41475629806518555, "learning_rate": 1.6522417153996102e-05, "loss": 0.0882, "step": 547 }, { "epoch": 1.0262541022034692, "grad_norm": 0.23140354454517365, "learning_rate": 1.6514619883040936e-05, "loss": 0.1071, "step": 548 }, { "epoch": 1.0281293952180028, "grad_norm": 0.26001206040382385, "learning_rate": 1.650682261208577e-05, "loss": 0.0984, "step": 549 }, { "epoch": 1.0300046882325364, "grad_norm": 0.27233394980430603, "learning_rate": 1.6499025341130604e-05, "loss": 0.1294, "step": 550 }, { "epoch": 1.0300046882325364, "eval_loss": 0.08262171596288681, "eval_runtime": 675.5684, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 550 }, { "epoch": 1.0318799812470698, "grad_norm": 0.28793230652809143, "learning_rate": 1.649122807017544e-05, "loss": 0.1191, "step": 551 }, { "epoch": 1.0337552742616034, "grad_norm": 0.23013058304786682, "learning_rate": 1.6483430799220276e-05, "loss": 0.0834, "step": 552 }, { "epoch": 1.035630567276137, "grad_norm": 0.2813061773777008, "learning_rate": 1.647563352826511e-05, "loss": 0.1116, "step": 553 }, { "epoch": 1.0375058602906704, "grad_norm": 0.27876320481300354, "learning_rate": 1.6467836257309944e-05, "loss": 0.1027, "step": 554 }, { "epoch": 1.039381153305204, "grad_norm": 0.2607191801071167, "learning_rate": 1.646003898635478e-05, "loss": 0.1119, "step": 555 }, { "epoch": 1.0412564463197373, "grad_norm": 0.31741422414779663, "learning_rate": 1.645224171539961e-05, "loss": 0.1435, "step": 556 }, { "epoch": 1.043131739334271, "grad_norm": 0.23251239955425262, "learning_rate": 1.6444444444444444e-05, "loss": 0.0574, "step": 557 }, { "epoch": 1.0450070323488045, "grad_norm": 0.2328520566225052, "learning_rate": 1.643664717348928e-05, "loss": 0.1157, "step": 558 }, { "epoch": 1.046882325363338, "grad_norm": 0.42328402400016785, "learning_rate": 1.6428849902534115e-05, "loss": 0.0988, "step": 559 }, { "epoch": 1.0487576183778715, "grad_norm": 0.570969820022583, "learning_rate": 1.642105263157895e-05, "loss": 0.0954, "step": 560 }, { "epoch": 1.0506329113924051, "grad_norm": 0.19645580649375916, "learning_rate": 1.6413255360623784e-05, "loss": 0.0751, "step": 561 }, { "epoch": 1.0525082044069385, "grad_norm": 0.25933393836021423, "learning_rate": 1.6405458089668618e-05, "loss": 0.1031, "step": 562 }, { "epoch": 1.054383497421472, "grad_norm": 0.2190171778202057, "learning_rate": 1.6397660818713452e-05, "loss": 0.0866, "step": 563 }, { "epoch": 1.0562587904360057, "grad_norm": 0.2852676808834076, "learning_rate": 1.6389863547758286e-05, "loss": 0.0901, "step": 564 }, { "epoch": 1.058134083450539, "grad_norm": 0.23386311531066895, "learning_rate": 1.638206627680312e-05, "loss": 0.0914, "step": 565 }, { "epoch": 1.0600093764650727, "grad_norm": 0.23497509956359863, "learning_rate": 1.6374269005847955e-05, "loss": 0.0885, "step": 566 }, { "epoch": 1.0618846694796062, "grad_norm": 0.2501085698604584, "learning_rate": 1.636647173489279e-05, "loss": 0.0555, "step": 567 }, { "epoch": 1.0637599624941396, "grad_norm": 0.2505752742290497, "learning_rate": 1.6358674463937623e-05, "loss": 0.0836, "step": 568 }, { "epoch": 1.0656352555086732, "grad_norm": 0.24521493911743164, "learning_rate": 1.6350877192982457e-05, "loss": 0.0907, "step": 569 }, { "epoch": 1.0675105485232068, "grad_norm": 0.3060048222541809, "learning_rate": 1.634307992202729e-05, "loss": 0.0901, "step": 570 }, { "epoch": 1.0693858415377402, "grad_norm": 0.2545836865901947, "learning_rate": 1.6335282651072126e-05, "loss": 0.1228, "step": 571 }, { "epoch": 1.0712611345522738, "grad_norm": 0.37637385725975037, "learning_rate": 1.632748538011696e-05, "loss": 0.1273, "step": 572 }, { "epoch": 1.0731364275668074, "grad_norm": 0.3298353850841522, "learning_rate": 1.6319688109161794e-05, "loss": 0.1297, "step": 573 }, { "epoch": 1.0750117205813408, "grad_norm": 0.18379832804203033, "learning_rate": 1.6311890838206628e-05, "loss": 0.0712, "step": 574 }, { "epoch": 1.0768870135958744, "grad_norm": 0.2380228340625763, "learning_rate": 1.6304093567251466e-05, "loss": 0.0645, "step": 575 }, { "epoch": 1.078762306610408, "grad_norm": 0.38280248641967773, "learning_rate": 1.6296296296296297e-05, "loss": 0.1449, "step": 576 }, { "epoch": 1.0806375996249413, "grad_norm": 0.24589546024799347, "learning_rate": 1.628849902534113e-05, "loss": 0.0954, "step": 577 }, { "epoch": 1.082512892639475, "grad_norm": 0.245429128408432, "learning_rate": 1.6280701754385965e-05, "loss": 0.0716, "step": 578 }, { "epoch": 1.0843881856540085, "grad_norm": 0.685253381729126, "learning_rate": 1.62729044834308e-05, "loss": 0.0688, "step": 579 }, { "epoch": 1.086263478668542, "grad_norm": 0.28492575883865356, "learning_rate": 1.6265107212475633e-05, "loss": 0.1057, "step": 580 }, { "epoch": 1.0881387716830755, "grad_norm": 0.2217700481414795, "learning_rate": 1.625730994152047e-05, "loss": 0.085, "step": 581 }, { "epoch": 1.090014064697609, "grad_norm": 0.30696970224380493, "learning_rate": 1.6249512670565305e-05, "loss": 0.0976, "step": 582 }, { "epoch": 1.0918893577121425, "grad_norm": 0.2384309470653534, "learning_rate": 1.624171539961014e-05, "loss": 0.0782, "step": 583 }, { "epoch": 1.093764650726676, "grad_norm": 0.2883552610874176, "learning_rate": 1.6233918128654974e-05, "loss": 0.1116, "step": 584 }, { "epoch": 1.0956399437412097, "grad_norm": 0.30440714955329895, "learning_rate": 1.6226120857699808e-05, "loss": 0.1204, "step": 585 }, { "epoch": 1.097515236755743, "grad_norm": 0.9227920174598694, "learning_rate": 1.621832358674464e-05, "loss": 0.1289, "step": 586 }, { "epoch": 1.0993905297702766, "grad_norm": 0.2924637198448181, "learning_rate": 1.6210526315789473e-05, "loss": 0.1018, "step": 587 }, { "epoch": 1.1012658227848102, "grad_norm": 0.28547459840774536, "learning_rate": 1.620272904483431e-05, "loss": 0.1105, "step": 588 }, { "epoch": 1.1031411157993436, "grad_norm": 0.2556934952735901, "learning_rate": 1.6194931773879144e-05, "loss": 0.0934, "step": 589 }, { "epoch": 1.1050164088138772, "grad_norm": 0.23647433519363403, "learning_rate": 1.618713450292398e-05, "loss": 0.0787, "step": 590 }, { "epoch": 1.1068917018284108, "grad_norm": 0.3279690146446228, "learning_rate": 1.6179337231968813e-05, "loss": 0.1185, "step": 591 }, { "epoch": 1.1087669948429442, "grad_norm": 0.22327874600887299, "learning_rate": 1.6171539961013647e-05, "loss": 0.0835, "step": 592 }, { "epoch": 1.1106422878574778, "grad_norm": 0.4429026246070862, "learning_rate": 1.616374269005848e-05, "loss": 0.12, "step": 593 }, { "epoch": 1.1125175808720114, "grad_norm": 0.27783477306365967, "learning_rate": 1.6155945419103315e-05, "loss": 0.1396, "step": 594 }, { "epoch": 1.1143928738865447, "grad_norm": 0.35669296979904175, "learning_rate": 1.614814814814815e-05, "loss": 0.0993, "step": 595 }, { "epoch": 1.1162681669010783, "grad_norm": 0.22483180463314056, "learning_rate": 1.6140350877192984e-05, "loss": 0.0727, "step": 596 }, { "epoch": 1.1181434599156117, "grad_norm": 0.2645404636859894, "learning_rate": 1.6132553606237818e-05, "loss": 0.1045, "step": 597 }, { "epoch": 1.1200187529301453, "grad_norm": 0.29490724205970764, "learning_rate": 1.6124756335282652e-05, "loss": 0.0936, "step": 598 }, { "epoch": 1.121894045944679, "grad_norm": 0.24751833081245422, "learning_rate": 1.6116959064327486e-05, "loss": 0.0827, "step": 599 }, { "epoch": 1.1237693389592125, "grad_norm": 0.27503255009651184, "learning_rate": 1.610916179337232e-05, "loss": 0.1093, "step": 600 }, { "epoch": 1.1237693389592125, "eval_loss": 0.08217223733663559, "eval_runtime": 675.2558, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 600 }, { "epoch": 1.1256446319737459, "grad_norm": 0.30064669251441956, "learning_rate": 1.6101364522417155e-05, "loss": 0.0869, "step": 601 }, { "epoch": 1.1275199249882795, "grad_norm": 0.2266855388879776, "learning_rate": 1.609356725146199e-05, "loss": 0.0804, "step": 602 }, { "epoch": 1.1293952180028128, "grad_norm": 0.2856343686580658, "learning_rate": 1.6085769980506823e-05, "loss": 0.0893, "step": 603 }, { "epoch": 1.1312705110173464, "grad_norm": 0.2303743064403534, "learning_rate": 1.607797270955166e-05, "loss": 0.0759, "step": 604 }, { "epoch": 1.13314580403188, "grad_norm": 0.3389296531677246, "learning_rate": 1.6070175438596495e-05, "loss": 0.1191, "step": 605 }, { "epoch": 1.1350210970464134, "grad_norm": 0.27957460284233093, "learning_rate": 1.6062378167641326e-05, "loss": 0.1205, "step": 606 }, { "epoch": 1.136896390060947, "grad_norm": 0.24907708168029785, "learning_rate": 1.605458089668616e-05, "loss": 0.0942, "step": 607 }, { "epoch": 1.1387716830754806, "grad_norm": 0.272223562002182, "learning_rate": 1.6046783625730994e-05, "loss": 0.1217, "step": 608 }, { "epoch": 1.140646976090014, "grad_norm": 0.290814608335495, "learning_rate": 1.603898635477583e-05, "loss": 0.1062, "step": 609 }, { "epoch": 1.1425222691045476, "grad_norm": 0.2468331754207611, "learning_rate": 1.6031189083820663e-05, "loss": 0.0969, "step": 610 }, { "epoch": 1.1443975621190812, "grad_norm": 0.23120653629302979, "learning_rate": 1.60233918128655e-05, "loss": 0.092, "step": 611 }, { "epoch": 1.1462728551336145, "grad_norm": 0.23789869248867035, "learning_rate": 1.6015594541910334e-05, "loss": 0.0992, "step": 612 }, { "epoch": 1.1481481481481481, "grad_norm": 0.3250608444213867, "learning_rate": 1.600779727095517e-05, "loss": 0.12, "step": 613 }, { "epoch": 1.1500234411626817, "grad_norm": 0.278442919254303, "learning_rate": 1.6000000000000003e-05, "loss": 0.1042, "step": 614 }, { "epoch": 1.1518987341772151, "grad_norm": 0.23886540532112122, "learning_rate": 1.5992202729044833e-05, "loss": 0.1048, "step": 615 }, { "epoch": 1.1537740271917487, "grad_norm": 0.3647238612174988, "learning_rate": 1.5984405458089668e-05, "loss": 0.0651, "step": 616 }, { "epoch": 1.1556493202062823, "grad_norm": 0.2553196847438812, "learning_rate": 1.5976608187134505e-05, "loss": 0.1011, "step": 617 }, { "epoch": 1.1575246132208157, "grad_norm": 0.26290223002433777, "learning_rate": 1.596881091617934e-05, "loss": 0.1024, "step": 618 }, { "epoch": 1.1593999062353493, "grad_norm": 0.3013964891433716, "learning_rate": 1.5961013645224174e-05, "loss": 0.1192, "step": 619 }, { "epoch": 1.1612751992498829, "grad_norm": 0.30323684215545654, "learning_rate": 1.5953216374269008e-05, "loss": 0.1065, "step": 620 }, { "epoch": 1.1631504922644162, "grad_norm": 0.28223493695259094, "learning_rate": 1.5945419103313842e-05, "loss": 0.1187, "step": 621 }, { "epoch": 1.1650257852789498, "grad_norm": 0.3298370838165283, "learning_rate": 1.5937621832358676e-05, "loss": 0.1322, "step": 622 }, { "epoch": 1.1669010782934834, "grad_norm": 0.2321697622537613, "learning_rate": 1.592982456140351e-05, "loss": 0.0828, "step": 623 }, { "epoch": 1.1687763713080168, "grad_norm": 0.27766016125679016, "learning_rate": 1.5922027290448345e-05, "loss": 0.1183, "step": 624 }, { "epoch": 1.1706516643225504, "grad_norm": 0.5540488362312317, "learning_rate": 1.591423001949318e-05, "loss": 0.1321, "step": 625 }, { "epoch": 1.172526957337084, "grad_norm": 0.24459047615528107, "learning_rate": 1.5906432748538013e-05, "loss": 0.1013, "step": 626 }, { "epoch": 1.1744022503516174, "grad_norm": 0.307108610868454, "learning_rate": 1.5898635477582847e-05, "loss": 0.1047, "step": 627 }, { "epoch": 1.176277543366151, "grad_norm": 0.36074694991111755, "learning_rate": 1.589083820662768e-05, "loss": 0.0792, "step": 628 }, { "epoch": 1.1781528363806846, "grad_norm": 0.28516885638237, "learning_rate": 1.5883040935672516e-05, "loss": 0.1324, "step": 629 }, { "epoch": 1.180028129395218, "grad_norm": 0.3291719853878021, "learning_rate": 1.587524366471735e-05, "loss": 0.0955, "step": 630 }, { "epoch": 1.1819034224097515, "grad_norm": 0.25097760558128357, "learning_rate": 1.5867446393762184e-05, "loss": 0.0849, "step": 631 }, { "epoch": 1.183778715424285, "grad_norm": 0.2444642335176468, "learning_rate": 1.5859649122807018e-05, "loss": 0.0694, "step": 632 }, { "epoch": 1.1856540084388185, "grad_norm": 0.27323952317237854, "learning_rate": 1.5851851851851852e-05, "loss": 0.0963, "step": 633 }, { "epoch": 1.1875293014533521, "grad_norm": 0.23469096422195435, "learning_rate": 1.584405458089669e-05, "loss": 0.0764, "step": 634 }, { "epoch": 1.1894045944678857, "grad_norm": 0.2885955572128296, "learning_rate": 1.583625730994152e-05, "loss": 0.0978, "step": 635 }, { "epoch": 1.191279887482419, "grad_norm": 0.29515987634658813, "learning_rate": 1.5828460038986355e-05, "loss": 0.1237, "step": 636 }, { "epoch": 1.1931551804969527, "grad_norm": 0.3769955039024353, "learning_rate": 1.582066276803119e-05, "loss": 0.0984, "step": 637 }, { "epoch": 1.195030473511486, "grad_norm": 0.3882874846458435, "learning_rate": 1.5812865497076023e-05, "loss": 0.1113, "step": 638 }, { "epoch": 1.1969057665260197, "grad_norm": 0.3134611248970032, "learning_rate": 1.5805068226120857e-05, "loss": 0.0973, "step": 639 }, { "epoch": 1.1987810595405533, "grad_norm": 0.23223522305488586, "learning_rate": 1.579727095516569e-05, "loss": 0.054, "step": 640 }, { "epoch": 1.2006563525550868, "grad_norm": 0.22435115277767181, "learning_rate": 1.578947368421053e-05, "loss": 0.0883, "step": 641 }, { "epoch": 1.2025316455696202, "grad_norm": 0.2723855674266815, "learning_rate": 1.5781676413255363e-05, "loss": 0.1132, "step": 642 }, { "epoch": 1.2044069385841538, "grad_norm": 0.32858818769454956, "learning_rate": 1.5773879142300198e-05, "loss": 0.0787, "step": 643 }, { "epoch": 1.2062822315986872, "grad_norm": 0.25390857458114624, "learning_rate": 1.5766081871345032e-05, "loss": 0.0659, "step": 644 }, { "epoch": 1.2081575246132208, "grad_norm": 0.24344322085380554, "learning_rate": 1.5758284600389863e-05, "loss": 0.0912, "step": 645 }, { "epoch": 1.2100328176277544, "grad_norm": 0.2637319564819336, "learning_rate": 1.5750487329434697e-05, "loss": 0.0915, "step": 646 }, { "epoch": 1.2119081106422878, "grad_norm": 0.31068336963653564, "learning_rate": 1.5742690058479534e-05, "loss": 0.1511, "step": 647 }, { "epoch": 1.2137834036568214, "grad_norm": 0.27969691157341003, "learning_rate": 1.573489278752437e-05, "loss": 0.0742, "step": 648 }, { "epoch": 1.215658696671355, "grad_norm": 0.31574878096580505, "learning_rate": 1.5727095516569203e-05, "loss": 0.0755, "step": 649 }, { "epoch": 1.2175339896858883, "grad_norm": 0.24576275050640106, "learning_rate": 1.5719298245614037e-05, "loss": 0.0803, "step": 650 }, { "epoch": 1.2175339896858883, "eval_loss": 0.07974553108215332, "eval_runtime": 676.0594, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 650 }, { "epoch": 1.219409282700422, "grad_norm": 0.31214672327041626, "learning_rate": 1.571150097465887e-05, "loss": 0.1131, "step": 651 }, { "epoch": 1.2212845757149555, "grad_norm": 0.268762469291687, "learning_rate": 1.5703703703703705e-05, "loss": 0.0896, "step": 652 }, { "epoch": 1.223159868729489, "grad_norm": 0.21026861667633057, "learning_rate": 1.569590643274854e-05, "loss": 0.056, "step": 653 }, { "epoch": 1.2250351617440225, "grad_norm": 0.4084342420101166, "learning_rate": 1.5688109161793374e-05, "loss": 0.0833, "step": 654 }, { "epoch": 1.226910454758556, "grad_norm": 0.28843924403190613, "learning_rate": 1.5680311890838208e-05, "loss": 0.0888, "step": 655 }, { "epoch": 1.2287857477730895, "grad_norm": 0.29009735584259033, "learning_rate": 1.5672514619883042e-05, "loss": 0.0992, "step": 656 }, { "epoch": 1.230661040787623, "grad_norm": 0.24224917590618134, "learning_rate": 1.5664717348927876e-05, "loss": 0.0624, "step": 657 }, { "epoch": 1.2325363338021567, "grad_norm": 0.29413849115371704, "learning_rate": 1.565692007797271e-05, "loss": 0.095, "step": 658 }, { "epoch": 1.23441162681669, "grad_norm": 0.21053080260753632, "learning_rate": 1.5649122807017545e-05, "loss": 0.0631, "step": 659 }, { "epoch": 1.2362869198312236, "grad_norm": 0.2499096691608429, "learning_rate": 1.564132553606238e-05, "loss": 0.0848, "step": 660 }, { "epoch": 1.2381622128457572, "grad_norm": 0.28722044825553894, "learning_rate": 1.5633528265107213e-05, "loss": 0.1162, "step": 661 }, { "epoch": 1.2400375058602906, "grad_norm": 0.3939790427684784, "learning_rate": 1.5625730994152047e-05, "loss": 0.0842, "step": 662 }, { "epoch": 1.2419127988748242, "grad_norm": 0.2812120318412781, "learning_rate": 1.561793372319688e-05, "loss": 0.06, "step": 663 }, { "epoch": 1.2437880918893578, "grad_norm": 0.2532125413417816, "learning_rate": 1.561013645224172e-05, "loss": 0.0866, "step": 664 }, { "epoch": 1.2456633849038912, "grad_norm": 0.3428124785423279, "learning_rate": 1.560233918128655e-05, "loss": 0.1119, "step": 665 }, { "epoch": 1.2475386779184248, "grad_norm": 0.3042241930961609, "learning_rate": 1.5594541910331384e-05, "loss": 0.1112, "step": 666 }, { "epoch": 1.2494139709329584, "grad_norm": 0.2961670458316803, "learning_rate": 1.5586744639376218e-05, "loss": 0.0964, "step": 667 }, { "epoch": 1.2512892639474917, "grad_norm": 0.403415322303772, "learning_rate": 1.5578947368421052e-05, "loss": 0.1721, "step": 668 }, { "epoch": 1.2531645569620253, "grad_norm": 0.27317744493484497, "learning_rate": 1.5571150097465887e-05, "loss": 0.096, "step": 669 }, { "epoch": 1.255039849976559, "grad_norm": 0.2543340027332306, "learning_rate": 1.5563352826510724e-05, "loss": 0.0775, "step": 670 }, { "epoch": 1.2569151429910923, "grad_norm": 0.4965609014034271, "learning_rate": 1.555555555555556e-05, "loss": 0.0798, "step": 671 }, { "epoch": 1.258790436005626, "grad_norm": 0.2571773827075958, "learning_rate": 1.5547758284600393e-05, "loss": 0.0865, "step": 672 }, { "epoch": 1.2606657290201593, "grad_norm": 0.2605682909488678, "learning_rate": 1.5539961013645227e-05, "loss": 0.1214, "step": 673 }, { "epoch": 1.2625410220346929, "grad_norm": 0.2747882902622223, "learning_rate": 1.553216374269006e-05, "loss": 0.115, "step": 674 }, { "epoch": 1.2644163150492265, "grad_norm": 0.2293609231710434, "learning_rate": 1.5524366471734892e-05, "loss": 0.0743, "step": 675 }, { "epoch": 1.26629160806376, "grad_norm": 0.3130941092967987, "learning_rate": 1.5516569200779726e-05, "loss": 0.0837, "step": 676 }, { "epoch": 1.2681669010782934, "grad_norm": 0.3052816092967987, "learning_rate": 1.5508771929824563e-05, "loss": 0.1217, "step": 677 }, { "epoch": 1.270042194092827, "grad_norm": 0.24826787412166595, "learning_rate": 1.5500974658869398e-05, "loss": 0.0881, "step": 678 }, { "epoch": 1.2719174871073604, "grad_norm": 0.30709272623062134, "learning_rate": 1.5493177387914232e-05, "loss": 0.0667, "step": 679 }, { "epoch": 1.273792780121894, "grad_norm": 0.2654632329940796, "learning_rate": 1.5485380116959066e-05, "loss": 0.0915, "step": 680 }, { "epoch": 1.2756680731364276, "grad_norm": 0.24949900805950165, "learning_rate": 1.54775828460039e-05, "loss": 0.0738, "step": 681 }, { "epoch": 1.2775433661509612, "grad_norm": 0.5046447515487671, "learning_rate": 1.5469785575048734e-05, "loss": 0.1017, "step": 682 }, { "epoch": 1.2794186591654946, "grad_norm": 0.31709209084510803, "learning_rate": 1.546198830409357e-05, "loss": 0.1129, "step": 683 }, { "epoch": 1.2812939521800282, "grad_norm": 0.30328288674354553, "learning_rate": 1.5454191033138403e-05, "loss": 0.1407, "step": 684 }, { "epoch": 1.2831692451945615, "grad_norm": 0.34941598773002625, "learning_rate": 1.5446393762183237e-05, "loss": 0.1595, "step": 685 }, { "epoch": 1.2850445382090951, "grad_norm": 0.25712135434150696, "learning_rate": 1.543859649122807e-05, "loss": 0.082, "step": 686 }, { "epoch": 1.2869198312236287, "grad_norm": 0.5184486508369446, "learning_rate": 1.5430799220272905e-05, "loss": 0.0826, "step": 687 }, { "epoch": 1.2887951242381623, "grad_norm": 0.23768484592437744, "learning_rate": 1.542300194931774e-05, "loss": 0.0893, "step": 688 }, { "epoch": 1.2906704172526957, "grad_norm": 0.2733663320541382, "learning_rate": 1.5415204678362574e-05, "loss": 0.1009, "step": 689 }, { "epoch": 1.2925457102672293, "grad_norm": 0.2970256805419922, "learning_rate": 1.5407407407407408e-05, "loss": 0.1103, "step": 690 }, { "epoch": 1.2944210032817627, "grad_norm": 0.29735323786735535, "learning_rate": 1.5399610136452242e-05, "loss": 0.1185, "step": 691 }, { "epoch": 1.2962962962962963, "grad_norm": 0.33100953698158264, "learning_rate": 1.5391812865497076e-05, "loss": 0.0833, "step": 692 }, { "epoch": 1.2981715893108299, "grad_norm": 0.30054280161857605, "learning_rate": 1.5384015594541914e-05, "loss": 0.058, "step": 693 }, { "epoch": 1.3000468823253635, "grad_norm": 0.2017904669046402, "learning_rate": 1.5376218323586748e-05, "loss": 0.0716, "step": 694 }, { "epoch": 1.3019221753398968, "grad_norm": 0.2905302047729492, "learning_rate": 1.536842105263158e-05, "loss": 0.1068, "step": 695 }, { "epoch": 1.3037974683544304, "grad_norm": 0.26433658599853516, "learning_rate": 1.5360623781676413e-05, "loss": 0.0923, "step": 696 }, { "epoch": 1.3056727613689638, "grad_norm": 0.24136176705360413, "learning_rate": 1.5352826510721247e-05, "loss": 0.0975, "step": 697 }, { "epoch": 1.3075480543834974, "grad_norm": 0.2888275384902954, "learning_rate": 1.534502923976608e-05, "loss": 0.0736, "step": 698 }, { "epoch": 1.309423347398031, "grad_norm": 0.28614625334739685, "learning_rate": 1.5337231968810916e-05, "loss": 0.0757, "step": 699 }, { "epoch": 1.3112986404125644, "grad_norm": 1.1438792943954468, "learning_rate": 1.5329434697855753e-05, "loss": 0.1105, "step": 700 }, { "epoch": 1.3112986404125644, "eval_loss": 0.08180084824562073, "eval_runtime": 675.7061, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 700 }, { "epoch": 1.313173933427098, "grad_norm": 0.24729672074317932, "learning_rate": 1.5321637426900587e-05, "loss": 0.1244, "step": 701 }, { "epoch": 1.3150492264416316, "grad_norm": 0.6661401391029358, "learning_rate": 1.531384015594542e-05, "loss": 0.0465, "step": 702 }, { "epoch": 1.316924519456165, "grad_norm": 0.2756960391998291, "learning_rate": 1.5306042884990256e-05, "loss": 0.0845, "step": 703 }, { "epoch": 1.3187998124706986, "grad_norm": 0.2175983488559723, "learning_rate": 1.529824561403509e-05, "loss": 0.0874, "step": 704 }, { "epoch": 1.3206751054852321, "grad_norm": 0.26682281494140625, "learning_rate": 1.529044834307992e-05, "loss": 0.0785, "step": 705 }, { "epoch": 1.3225503984997655, "grad_norm": 0.28904491662979126, "learning_rate": 1.528265107212476e-05, "loss": 0.0812, "step": 706 }, { "epoch": 1.3244256915142991, "grad_norm": 0.37207451462745667, "learning_rate": 1.5274853801169593e-05, "loss": 0.118, "step": 707 }, { "epoch": 1.3263009845288325, "grad_norm": 0.2251625806093216, "learning_rate": 1.5267056530214427e-05, "loss": 0.088, "step": 708 }, { "epoch": 1.328176277543366, "grad_norm": 0.2839013934135437, "learning_rate": 1.525925925925926e-05, "loss": 0.089, "step": 709 }, { "epoch": 1.3300515705578997, "grad_norm": 0.2906053960323334, "learning_rate": 1.5251461988304095e-05, "loss": 0.1056, "step": 710 }, { "epoch": 1.3319268635724333, "grad_norm": 0.29006117582321167, "learning_rate": 1.524366471734893e-05, "loss": 0.1284, "step": 711 }, { "epoch": 1.3338021565869667, "grad_norm": 0.23621053993701935, "learning_rate": 1.5235867446393764e-05, "loss": 0.0831, "step": 712 }, { "epoch": 1.3356774496015003, "grad_norm": 0.2675006091594696, "learning_rate": 1.5228070175438598e-05, "loss": 0.0979, "step": 713 }, { "epoch": 1.3375527426160336, "grad_norm": 0.25027772784233093, "learning_rate": 1.5220272904483434e-05, "loss": 0.0989, "step": 714 }, { "epoch": 1.3394280356305672, "grad_norm": 0.3723505437374115, "learning_rate": 1.5212475633528266e-05, "loss": 0.1134, "step": 715 }, { "epoch": 1.3413033286451008, "grad_norm": 0.2928392291069031, "learning_rate": 1.52046783625731e-05, "loss": 0.1118, "step": 716 }, { "epoch": 1.3431786216596344, "grad_norm": 0.23428718745708466, "learning_rate": 1.5196881091617935e-05, "loss": 0.056, "step": 717 }, { "epoch": 1.3450539146741678, "grad_norm": 0.23713499307632446, "learning_rate": 1.5189083820662769e-05, "loss": 0.0812, "step": 718 }, { "epoch": 1.3469292076887014, "grad_norm": 0.27535611391067505, "learning_rate": 1.5181286549707603e-05, "loss": 0.0926, "step": 719 }, { "epoch": 1.3488045007032348, "grad_norm": 0.29593226313591003, "learning_rate": 1.5173489278752439e-05, "loss": 0.1266, "step": 720 }, { "epoch": 1.3506797937177684, "grad_norm": 0.32769280672073364, "learning_rate": 1.5165692007797273e-05, "loss": 0.1462, "step": 721 }, { "epoch": 1.352555086732302, "grad_norm": 0.3472793996334076, "learning_rate": 1.5157894736842107e-05, "loss": 0.1547, "step": 722 }, { "epoch": 1.3544303797468356, "grad_norm": 0.2513481378555298, "learning_rate": 1.5150097465886941e-05, "loss": 0.093, "step": 723 }, { "epoch": 1.356305672761369, "grad_norm": 0.2549232244491577, "learning_rate": 1.5142300194931776e-05, "loss": 0.0858, "step": 724 }, { "epoch": 1.3581809657759025, "grad_norm": 0.33557018637657166, "learning_rate": 1.5134502923976608e-05, "loss": 0.1245, "step": 725 }, { "epoch": 1.360056258790436, "grad_norm": 0.2504093647003174, "learning_rate": 1.5126705653021442e-05, "loss": 0.0764, "step": 726 }, { "epoch": 1.3619315518049695, "grad_norm": 0.20211853086948395, "learning_rate": 1.5118908382066278e-05, "loss": 0.0634, "step": 727 }, { "epoch": 1.363806844819503, "grad_norm": 0.28420448303222656, "learning_rate": 1.5111111111111112e-05, "loss": 0.1096, "step": 728 }, { "epoch": 1.3656821378340367, "grad_norm": 0.22307127714157104, "learning_rate": 1.5103313840155947e-05, "loss": 0.0603, "step": 729 }, { "epoch": 1.36755743084857, "grad_norm": 0.21484006941318512, "learning_rate": 1.509551656920078e-05, "loss": 0.0784, "step": 730 }, { "epoch": 1.3694327238631037, "grad_norm": 0.23451459407806396, "learning_rate": 1.5087719298245615e-05, "loss": 0.1042, "step": 731 }, { "epoch": 1.371308016877637, "grad_norm": 0.3075193166732788, "learning_rate": 1.507992202729045e-05, "loss": 0.0924, "step": 732 }, { "epoch": 1.3731833098921706, "grad_norm": 0.25598907470703125, "learning_rate": 1.5072124756335285e-05, "loss": 0.0911, "step": 733 }, { "epoch": 1.3750586029067042, "grad_norm": 0.25740352272987366, "learning_rate": 1.5064327485380119e-05, "loss": 0.1088, "step": 734 }, { "epoch": 1.3769338959212378, "grad_norm": 0.3566450774669647, "learning_rate": 1.5056530214424952e-05, "loss": 0.1148, "step": 735 }, { "epoch": 1.3788091889357712, "grad_norm": 0.21709580719470978, "learning_rate": 1.5048732943469786e-05, "loss": 0.0762, "step": 736 }, { "epoch": 1.3806844819503048, "grad_norm": 0.36706310510635376, "learning_rate": 1.504093567251462e-05, "loss": 0.0655, "step": 737 }, { "epoch": 1.3825597749648382, "grad_norm": 0.2232268899679184, "learning_rate": 1.5033138401559454e-05, "loss": 0.0777, "step": 738 }, { "epoch": 1.3844350679793718, "grad_norm": 0.4871269762516022, "learning_rate": 1.502534113060429e-05, "loss": 0.1297, "step": 739 }, { "epoch": 1.3863103609939054, "grad_norm": 0.32851436734199524, "learning_rate": 1.5017543859649124e-05, "loss": 0.1113, "step": 740 }, { "epoch": 1.3881856540084387, "grad_norm": 0.28122517466545105, "learning_rate": 1.5009746588693958e-05, "loss": 0.1021, "step": 741 }, { "epoch": 1.3900609470229723, "grad_norm": 0.2887057065963745, "learning_rate": 1.5001949317738793e-05, "loss": 0.0871, "step": 742 }, { "epoch": 1.391936240037506, "grad_norm": 0.2595096230506897, "learning_rate": 1.4994152046783627e-05, "loss": 0.0708, "step": 743 }, { "epoch": 1.3938115330520393, "grad_norm": 0.232121542096138, "learning_rate": 1.4986354775828463e-05, "loss": 0.0727, "step": 744 }, { "epoch": 1.395686826066573, "grad_norm": 0.2849893569946289, "learning_rate": 1.4978557504873295e-05, "loss": 0.0861, "step": 745 }, { "epoch": 1.3975621190811065, "grad_norm": 0.2730804979801178, "learning_rate": 1.497076023391813e-05, "loss": 0.0973, "step": 746 }, { "epoch": 1.3994374120956399, "grad_norm": 0.24728837609291077, "learning_rate": 1.4962962962962964e-05, "loss": 0.0867, "step": 747 }, { "epoch": 1.4013127051101735, "grad_norm": 0.2925151288509369, "learning_rate": 1.4955165692007798e-05, "loss": 0.135, "step": 748 }, { "epoch": 1.4031879981247068, "grad_norm": 0.35172486305236816, "learning_rate": 1.4947368421052632e-05, "loss": 0.1115, "step": 749 }, { "epoch": 1.4050632911392404, "grad_norm": 0.24151886999607086, "learning_rate": 1.4939571150097468e-05, "loss": 0.0892, "step": 750 }, { "epoch": 1.4050632911392404, "eval_loss": 0.07957883179187775, "eval_runtime": 676.5902, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 750 }, { "epoch": 1.406938584153774, "grad_norm": 0.2599674463272095, "learning_rate": 1.4931773879142302e-05, "loss": 0.073, "step": 751 }, { "epoch": 1.4088138771683076, "grad_norm": 0.2866610586643219, "learning_rate": 1.4923976608187136e-05, "loss": 0.0743, "step": 752 }, { "epoch": 1.410689170182841, "grad_norm": 0.3928741216659546, "learning_rate": 1.491617933723197e-05, "loss": 0.1462, "step": 753 }, { "epoch": 1.4125644631973746, "grad_norm": 0.38174375891685486, "learning_rate": 1.4908382066276805e-05, "loss": 0.1081, "step": 754 }, { "epoch": 1.414439756211908, "grad_norm": 0.2848498821258545, "learning_rate": 1.4900584795321637e-05, "loss": 0.1071, "step": 755 }, { "epoch": 1.4163150492264416, "grad_norm": 0.26754605770111084, "learning_rate": 1.4892787524366471e-05, "loss": 0.0775, "step": 756 }, { "epoch": 1.4181903422409752, "grad_norm": 0.3248796761035919, "learning_rate": 1.4884990253411307e-05, "loss": 0.1093, "step": 757 }, { "epoch": 1.4200656352555088, "grad_norm": 0.2621667981147766, "learning_rate": 1.4877192982456141e-05, "loss": 0.0846, "step": 758 }, { "epoch": 1.4219409282700421, "grad_norm": 0.3511008024215698, "learning_rate": 1.4869395711500976e-05, "loss": 0.1343, "step": 759 }, { "epoch": 1.4238162212845757, "grad_norm": 0.23400309681892395, "learning_rate": 1.486159844054581e-05, "loss": 0.0795, "step": 760 }, { "epoch": 1.4256915142991091, "grad_norm": 0.3075447678565979, "learning_rate": 1.4853801169590644e-05, "loss": 0.1026, "step": 761 }, { "epoch": 1.4275668073136427, "grad_norm": 0.2749321162700653, "learning_rate": 1.484600389863548e-05, "loss": 0.0991, "step": 762 }, { "epoch": 1.4294421003281763, "grad_norm": 0.29317227005958557, "learning_rate": 1.4838206627680314e-05, "loss": 0.0857, "step": 763 }, { "epoch": 1.43131739334271, "grad_norm": 0.3031213879585266, "learning_rate": 1.4830409356725148e-05, "loss": 0.1196, "step": 764 }, { "epoch": 1.4331926863572433, "grad_norm": 0.2685413658618927, "learning_rate": 1.482261208576998e-05, "loss": 0.0958, "step": 765 }, { "epoch": 1.4350679793717769, "grad_norm": 0.28609031438827515, "learning_rate": 1.4814814814814815e-05, "loss": 0.1317, "step": 766 }, { "epoch": 1.4369432723863103, "grad_norm": 0.2767196595668793, "learning_rate": 1.4807017543859649e-05, "loss": 0.0985, "step": 767 }, { "epoch": 1.4388185654008439, "grad_norm": 0.29232558608055115, "learning_rate": 1.4799220272904485e-05, "loss": 0.1096, "step": 768 }, { "epoch": 1.4406938584153774, "grad_norm": 0.2960546016693115, "learning_rate": 1.479142300194932e-05, "loss": 0.0876, "step": 769 }, { "epoch": 1.442569151429911, "grad_norm": 0.43392062187194824, "learning_rate": 1.4783625730994153e-05, "loss": 0.1089, "step": 770 }, { "epoch": 1.4444444444444444, "grad_norm": 0.23281604051589966, "learning_rate": 1.4775828460038988e-05, "loss": 0.0941, "step": 771 }, { "epoch": 1.446319737458978, "grad_norm": 0.293990820646286, "learning_rate": 1.4768031189083822e-05, "loss": 0.1161, "step": 772 }, { "epoch": 1.4481950304735114, "grad_norm": 0.33511075377464294, "learning_rate": 1.4760233918128658e-05, "loss": 0.0917, "step": 773 }, { "epoch": 1.450070323488045, "grad_norm": 0.32499679923057556, "learning_rate": 1.4752436647173492e-05, "loss": 0.0784, "step": 774 }, { "epoch": 1.4519456165025786, "grad_norm": 0.27234625816345215, "learning_rate": 1.4744639376218324e-05, "loss": 0.0876, "step": 775 }, { "epoch": 1.4538209095171122, "grad_norm": 0.38885700702667236, "learning_rate": 1.4736842105263159e-05, "loss": 0.1354, "step": 776 }, { "epoch": 1.4556962025316456, "grad_norm": 0.28798046708106995, "learning_rate": 1.4729044834307993e-05, "loss": 0.0782, "step": 777 }, { "epoch": 1.4575714955461792, "grad_norm": 0.2976330518722534, "learning_rate": 1.4721247563352827e-05, "loss": 0.0898, "step": 778 }, { "epoch": 1.4594467885607125, "grad_norm": 0.31952551007270813, "learning_rate": 1.4713450292397661e-05, "loss": 0.128, "step": 779 }, { "epoch": 1.4613220815752461, "grad_norm": 0.27772435545921326, "learning_rate": 1.4705653021442497e-05, "loss": 0.0893, "step": 780 }, { "epoch": 1.4631973745897797, "grad_norm": 0.24229328334331512, "learning_rate": 1.4697855750487331e-05, "loss": 0.0684, "step": 781 }, { "epoch": 1.465072667604313, "grad_norm": 0.2713194489479065, "learning_rate": 1.4690058479532165e-05, "loss": 0.0987, "step": 782 }, { "epoch": 1.4669479606188467, "grad_norm": 0.37047621607780457, "learning_rate": 1.4682261208577e-05, "loss": 0.0901, "step": 783 }, { "epoch": 1.4688232536333803, "grad_norm": 0.5432588458061218, "learning_rate": 1.4674463937621834e-05, "loss": 0.1067, "step": 784 }, { "epoch": 1.4706985466479137, "grad_norm": 0.26890650391578674, "learning_rate": 1.4666666666666666e-05, "loss": 0.0755, "step": 785 }, { "epoch": 1.4725738396624473, "grad_norm": 0.30856049060821533, "learning_rate": 1.4658869395711502e-05, "loss": 0.0889, "step": 786 }, { "epoch": 1.4744491326769809, "grad_norm": 0.2999826669692993, "learning_rate": 1.4651072124756336e-05, "loss": 0.134, "step": 787 }, { "epoch": 1.4763244256915142, "grad_norm": 0.23444703221321106, "learning_rate": 1.464327485380117e-05, "loss": 0.0899, "step": 788 }, { "epoch": 1.4781997187060478, "grad_norm": 0.25326988101005554, "learning_rate": 1.4635477582846005e-05, "loss": 0.0889, "step": 789 }, { "epoch": 1.4800750117205814, "grad_norm": 0.4101651906967163, "learning_rate": 1.4627680311890839e-05, "loss": 0.0708, "step": 790 }, { "epoch": 1.4819503047351148, "grad_norm": 0.3386366069316864, "learning_rate": 1.4619883040935675e-05, "loss": 0.079, "step": 791 }, { "epoch": 1.4838255977496484, "grad_norm": 0.2348432093858719, "learning_rate": 1.4612085769980509e-05, "loss": 0.071, "step": 792 }, { "epoch": 1.485700890764182, "grad_norm": 0.2450290322303772, "learning_rate": 1.4604288499025343e-05, "loss": 0.0811, "step": 793 }, { "epoch": 1.4875761837787154, "grad_norm": 0.30053895711898804, "learning_rate": 1.4596491228070177e-05, "loss": 0.092, "step": 794 }, { "epoch": 1.489451476793249, "grad_norm": 0.3324088752269745, "learning_rate": 1.458869395711501e-05, "loss": 0.1003, "step": 795 }, { "epoch": 1.4913267698077823, "grad_norm": 0.31995347142219543, "learning_rate": 1.4580896686159844e-05, "loss": 0.1084, "step": 796 }, { "epoch": 1.493202062822316, "grad_norm": 0.2602950930595398, "learning_rate": 1.4573099415204678e-05, "loss": 0.0751, "step": 797 }, { "epoch": 1.4950773558368495, "grad_norm": 0.2805696725845337, "learning_rate": 1.4565302144249514e-05, "loss": 0.1025, "step": 798 }, { "epoch": 1.4969526488513831, "grad_norm": 0.29233241081237793, "learning_rate": 1.4557504873294348e-05, "loss": 0.1161, "step": 799 }, { "epoch": 1.4988279418659165, "grad_norm": 0.31460389494895935, "learning_rate": 1.4549707602339183e-05, "loss": 0.0975, "step": 800 }, { "epoch": 1.4988279418659165, "eval_loss": 0.07664016634225845, "eval_runtime": 675.2627, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 800 }, { "epoch": 1.50070323488045, "grad_norm": 0.3660965859889984, "learning_rate": 1.4541910331384017e-05, "loss": 0.1098, "step": 801 }, { "epoch": 1.5025785278949835, "grad_norm": 0.26959264278411865, "learning_rate": 1.4534113060428851e-05, "loss": 0.0855, "step": 802 }, { "epoch": 1.504453820909517, "grad_norm": 0.3515307605266571, "learning_rate": 1.4526315789473687e-05, "loss": 0.1272, "step": 803 }, { "epoch": 1.5063291139240507, "grad_norm": 0.2386811077594757, "learning_rate": 1.4518518518518521e-05, "loss": 0.0612, "step": 804 }, { "epoch": 1.5082044069385843, "grad_norm": 0.34232282638549805, "learning_rate": 1.4510721247563353e-05, "loss": 0.061, "step": 805 }, { "epoch": 1.5100796999531176, "grad_norm": 0.6131661534309387, "learning_rate": 1.4502923976608188e-05, "loss": 0.0817, "step": 806 }, { "epoch": 1.5119549929676512, "grad_norm": 0.20144400000572205, "learning_rate": 1.4495126705653022e-05, "loss": 0.0456, "step": 807 }, { "epoch": 1.5138302859821846, "grad_norm": 0.2816990315914154, "learning_rate": 1.4487329434697856e-05, "loss": 0.0858, "step": 808 }, { "epoch": 1.5157055789967182, "grad_norm": 0.3411908447742462, "learning_rate": 1.447953216374269e-05, "loss": 0.0866, "step": 809 }, { "epoch": 1.5175808720112518, "grad_norm": 0.2759881615638733, "learning_rate": 1.4471734892787526e-05, "loss": 0.0947, "step": 810 }, { "epoch": 1.5194561650257854, "grad_norm": 0.26385390758514404, "learning_rate": 1.446393762183236e-05, "loss": 0.0853, "step": 811 }, { "epoch": 1.5213314580403188, "grad_norm": 0.20965765416622162, "learning_rate": 1.4456140350877195e-05, "loss": 0.0716, "step": 812 }, { "epoch": 1.5232067510548524, "grad_norm": 0.33767369389533997, "learning_rate": 1.4448343079922029e-05, "loss": 0.1221, "step": 813 }, { "epoch": 1.5250820440693857, "grad_norm": 0.9217659831047058, "learning_rate": 1.4440545808966863e-05, "loss": 0.0984, "step": 814 }, { "epoch": 1.5269573370839193, "grad_norm": 0.2534090280532837, "learning_rate": 1.4432748538011695e-05, "loss": 0.0744, "step": 815 }, { "epoch": 1.528832630098453, "grad_norm": 0.27146193385124207, "learning_rate": 1.4424951267056531e-05, "loss": 0.0876, "step": 816 }, { "epoch": 1.5307079231129865, "grad_norm": 0.23886770009994507, "learning_rate": 1.4417153996101365e-05, "loss": 0.0823, "step": 817 }, { "epoch": 1.53258321612752, "grad_norm": 0.29205018281936646, "learning_rate": 1.44093567251462e-05, "loss": 0.1128, "step": 818 }, { "epoch": 1.5344585091420533, "grad_norm": 0.2402828186750412, "learning_rate": 1.4401559454191034e-05, "loss": 0.0853, "step": 819 }, { "epoch": 1.5363338021565869, "grad_norm": 0.28632691502571106, "learning_rate": 1.4393762183235868e-05, "loss": 0.0977, "step": 820 }, { "epoch": 1.5382090951711205, "grad_norm": 0.296055406332016, "learning_rate": 1.4385964912280704e-05, "loss": 0.1045, "step": 821 }, { "epoch": 1.540084388185654, "grad_norm": 0.2865302264690399, "learning_rate": 1.4378167641325538e-05, "loss": 0.0779, "step": 822 }, { "epoch": 1.5419596812001877, "grad_norm": 0.25754040479660034, "learning_rate": 1.4370370370370372e-05, "loss": 0.0892, "step": 823 }, { "epoch": 1.543834974214721, "grad_norm": 0.2635495960712433, "learning_rate": 1.4362573099415207e-05, "loss": 0.0716, "step": 824 }, { "epoch": 1.5457102672292544, "grad_norm": 0.42626357078552246, "learning_rate": 1.4354775828460039e-05, "loss": 0.1188, "step": 825 }, { "epoch": 1.547585560243788, "grad_norm": 0.2752715051174164, "learning_rate": 1.4346978557504873e-05, "loss": 0.0927, "step": 826 }, { "epoch": 1.5494608532583216, "grad_norm": 0.20132949948310852, "learning_rate": 1.4339181286549707e-05, "loss": 0.0541, "step": 827 }, { "epoch": 1.5513361462728552, "grad_norm": 0.3923582434654236, "learning_rate": 1.4331384015594543e-05, "loss": 0.0871, "step": 828 }, { "epoch": 1.5532114392873888, "grad_norm": 0.2865050137042999, "learning_rate": 1.4323586744639377e-05, "loss": 0.0802, "step": 829 }, { "epoch": 1.5550867323019222, "grad_norm": 0.2447250485420227, "learning_rate": 1.4315789473684212e-05, "loss": 0.0699, "step": 830 }, { "epoch": 1.5569620253164556, "grad_norm": 0.2569245398044586, "learning_rate": 1.4307992202729046e-05, "loss": 0.0736, "step": 831 }, { "epoch": 1.5588373183309892, "grad_norm": 0.27975255250930786, "learning_rate": 1.430019493177388e-05, "loss": 0.0841, "step": 832 }, { "epoch": 1.5607126113455227, "grad_norm": 0.246305450797081, "learning_rate": 1.4292397660818716e-05, "loss": 0.0722, "step": 833 }, { "epoch": 1.5625879043600563, "grad_norm": 0.3482731580734253, "learning_rate": 1.428460038986355e-05, "loss": 0.1454, "step": 834 }, { "epoch": 1.56446319737459, "grad_norm": 0.8472810983657837, "learning_rate": 1.4276803118908383e-05, "loss": 0.1386, "step": 835 }, { "epoch": 1.5663384903891233, "grad_norm": 0.2856524884700775, "learning_rate": 1.4269005847953217e-05, "loss": 0.0976, "step": 836 }, { "epoch": 1.5682137834036567, "grad_norm": 0.22626835107803345, "learning_rate": 1.4261208576998051e-05, "loss": 0.07, "step": 837 }, { "epoch": 1.5700890764181903, "grad_norm": 0.28921768069267273, "learning_rate": 1.4253411306042885e-05, "loss": 0.0653, "step": 838 }, { "epoch": 1.5719643694327239, "grad_norm": 0.3061649799346924, "learning_rate": 1.4245614035087721e-05, "loss": 0.1024, "step": 839 }, { "epoch": 1.5738396624472575, "grad_norm": 0.23194481432437897, "learning_rate": 1.4237816764132555e-05, "loss": 0.0542, "step": 840 }, { "epoch": 1.5757149554617909, "grad_norm": 0.21323058009147644, "learning_rate": 1.423001949317739e-05, "loss": 0.0413, "step": 841 }, { "epoch": 1.5775902484763245, "grad_norm": 0.6626904010772705, "learning_rate": 1.4222222222222224e-05, "loss": 0.061, "step": 842 }, { "epoch": 1.5794655414908578, "grad_norm": 0.279397577047348, "learning_rate": 1.4214424951267058e-05, "loss": 0.069, "step": 843 }, { "epoch": 1.5813408345053914, "grad_norm": 0.35950684547424316, "learning_rate": 1.4206627680311894e-05, "loss": 0.0628, "step": 844 }, { "epoch": 1.583216127519925, "grad_norm": 0.35051649808883667, "learning_rate": 1.4198830409356725e-05, "loss": 0.1164, "step": 845 }, { "epoch": 1.5850914205344586, "grad_norm": 0.23908878862857819, "learning_rate": 1.419103313840156e-05, "loss": 0.0745, "step": 846 }, { "epoch": 1.586966713548992, "grad_norm": 0.293130487203598, "learning_rate": 1.4183235867446395e-05, "loss": 0.0875, "step": 847 }, { "epoch": 1.5888420065635256, "grad_norm": 0.2919827699661255, "learning_rate": 1.4175438596491229e-05, "loss": 0.0886, "step": 848 }, { "epoch": 1.590717299578059, "grad_norm": 0.3345141112804413, "learning_rate": 1.4167641325536063e-05, "loss": 0.1071, "step": 849 }, { "epoch": 1.5925925925925926, "grad_norm": 0.6707221269607544, "learning_rate": 1.4159844054580897e-05, "loss": 0.1002, "step": 850 }, { "epoch": 1.5925925925925926, "eval_loss": 0.07790966331958771, "eval_runtime": 676.4736, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 850 }, { "epoch": 1.5944678856071262, "grad_norm": 0.27356359362602234, "learning_rate": 1.4152046783625733e-05, "loss": 0.0801, "step": 851 }, { "epoch": 1.5963431786216598, "grad_norm": 0.3435034155845642, "learning_rate": 1.4144249512670567e-05, "loss": 0.1108, "step": 852 }, { "epoch": 1.5982184716361931, "grad_norm": 0.32657772302627563, "learning_rate": 1.4136452241715401e-05, "loss": 0.113, "step": 853 }, { "epoch": 1.6000937646507267, "grad_norm": 0.3669753074645996, "learning_rate": 1.4128654970760236e-05, "loss": 0.1343, "step": 854 }, { "epoch": 1.60196905766526, "grad_norm": 0.23169514536857605, "learning_rate": 1.4120857699805068e-05, "loss": 0.0894, "step": 855 }, { "epoch": 1.6038443506797937, "grad_norm": 0.2689126431941986, "learning_rate": 1.4113060428849902e-05, "loss": 0.0753, "step": 856 }, { "epoch": 1.6057196436943273, "grad_norm": 0.2116389125585556, "learning_rate": 1.4105263157894738e-05, "loss": 0.0592, "step": 857 }, { "epoch": 1.6075949367088609, "grad_norm": 0.2922620475292206, "learning_rate": 1.4097465886939572e-05, "loss": 0.0944, "step": 858 }, { "epoch": 1.6094702297233943, "grad_norm": 0.26720038056373596, "learning_rate": 1.4089668615984407e-05, "loss": 0.0805, "step": 859 }, { "epoch": 1.6113455227379276, "grad_norm": 0.36932405829429626, "learning_rate": 1.408187134502924e-05, "loss": 0.0871, "step": 860 }, { "epoch": 1.6132208157524612, "grad_norm": 0.29181620478630066, "learning_rate": 1.4074074074074075e-05, "loss": 0.0903, "step": 861 }, { "epoch": 1.6150961087669948, "grad_norm": 0.24898619949817657, "learning_rate": 1.4066276803118911e-05, "loss": 0.0612, "step": 862 }, { "epoch": 1.6169714017815284, "grad_norm": 0.3132479786872864, "learning_rate": 1.4058479532163745e-05, "loss": 0.0962, "step": 863 }, { "epoch": 1.618846694796062, "grad_norm": 0.27776429057121277, "learning_rate": 1.405068226120858e-05, "loss": 0.0734, "step": 864 }, { "epoch": 1.6207219878105954, "grad_norm": 0.3017600476741791, "learning_rate": 1.4042884990253412e-05, "loss": 0.0802, "step": 865 }, { "epoch": 1.6225972808251288, "grad_norm": 0.2887416183948517, "learning_rate": 1.4035087719298246e-05, "loss": 0.098, "step": 866 }, { "epoch": 1.6244725738396624, "grad_norm": 0.3379286527633667, "learning_rate": 1.402729044834308e-05, "loss": 0.107, "step": 867 }, { "epoch": 1.626347866854196, "grad_norm": 0.33945974707603455, "learning_rate": 1.4019493177387914e-05, "loss": 0.0949, "step": 868 }, { "epoch": 1.6282231598687296, "grad_norm": 0.5351037979125977, "learning_rate": 1.401169590643275e-05, "loss": 0.0974, "step": 869 }, { "epoch": 1.6300984528832632, "grad_norm": 0.28477218747138977, "learning_rate": 1.4003898635477584e-05, "loss": 0.1049, "step": 870 }, { "epoch": 1.6319737458977965, "grad_norm": 0.34814324975013733, "learning_rate": 1.3996101364522419e-05, "loss": 0.1052, "step": 871 }, { "epoch": 1.63384903891233, "grad_norm": 0.3389275074005127, "learning_rate": 1.3988304093567253e-05, "loss": 0.1081, "step": 872 }, { "epoch": 1.6357243319268635, "grad_norm": 0.3551498353481293, "learning_rate": 1.3980506822612087e-05, "loss": 0.1016, "step": 873 }, { "epoch": 1.637599624941397, "grad_norm": 0.43945154547691345, "learning_rate": 1.3972709551656923e-05, "loss": 0.0631, "step": 874 }, { "epoch": 1.6394749179559307, "grad_norm": 0.7714282870292664, "learning_rate": 1.3964912280701755e-05, "loss": 0.0969, "step": 875 }, { "epoch": 1.6413502109704643, "grad_norm": 0.2278236597776413, "learning_rate": 1.395711500974659e-05, "loss": 0.0711, "step": 876 }, { "epoch": 1.6432255039849977, "grad_norm": 0.3024490773677826, "learning_rate": 1.3949317738791424e-05, "loss": 0.0727, "step": 877 }, { "epoch": 1.645100796999531, "grad_norm": 0.31009918451309204, "learning_rate": 1.3941520467836258e-05, "loss": 0.1222, "step": 878 }, { "epoch": 1.6469760900140646, "grad_norm": 0.402402400970459, "learning_rate": 1.3933723196881092e-05, "loss": 0.1404, "step": 879 }, { "epoch": 1.6488513830285982, "grad_norm": 0.3172832727432251, "learning_rate": 1.3925925925925928e-05, "loss": 0.1052, "step": 880 }, { "epoch": 1.6507266760431318, "grad_norm": 0.6237524151802063, "learning_rate": 1.3918128654970762e-05, "loss": 0.0804, "step": 881 }, { "epoch": 1.6526019690576652, "grad_norm": 0.27731725573539734, "learning_rate": 1.3910331384015596e-05, "loss": 0.0988, "step": 882 }, { "epoch": 1.6544772620721988, "grad_norm": 0.2532290518283844, "learning_rate": 1.390253411306043e-05, "loss": 0.1025, "step": 883 }, { "epoch": 1.6563525550867322, "grad_norm": 0.3791520297527313, "learning_rate": 1.3894736842105265e-05, "loss": 0.1178, "step": 884 }, { "epoch": 1.6582278481012658, "grad_norm": 0.24422794580459595, "learning_rate": 1.3886939571150097e-05, "loss": 0.0764, "step": 885 }, { "epoch": 1.6601031411157994, "grad_norm": 0.3019620478153229, "learning_rate": 1.3879142300194931e-05, "loss": 0.103, "step": 886 }, { "epoch": 1.661978434130333, "grad_norm": 0.33067408204078674, "learning_rate": 1.3871345029239767e-05, "loss": 0.114, "step": 887 }, { "epoch": 1.6638537271448663, "grad_norm": 0.30137330293655396, "learning_rate": 1.3863547758284602e-05, "loss": 0.0538, "step": 888 }, { "epoch": 1.6657290201594, "grad_norm": 0.3999065160751343, "learning_rate": 1.3855750487329436e-05, "loss": 0.0878, "step": 889 }, { "epoch": 1.6676043131739333, "grad_norm": 0.23727497458457947, "learning_rate": 1.384795321637427e-05, "loss": 0.0762, "step": 890 }, { "epoch": 1.669479606188467, "grad_norm": 0.3073793053627014, "learning_rate": 1.3840155945419104e-05, "loss": 0.1009, "step": 891 }, { "epoch": 1.6713548992030005, "grad_norm": 0.3737054467201233, "learning_rate": 1.383235867446394e-05, "loss": 0.118, "step": 892 }, { "epoch": 1.673230192217534, "grad_norm": 0.2168496549129486, "learning_rate": 1.3824561403508774e-05, "loss": 0.0733, "step": 893 }, { "epoch": 1.6751054852320675, "grad_norm": 0.3298211395740509, "learning_rate": 1.3816764132553608e-05, "loss": 0.1167, "step": 894 }, { "epoch": 1.676980778246601, "grad_norm": 0.2308388352394104, "learning_rate": 1.3808966861598441e-05, "loss": 0.0744, "step": 895 }, { "epoch": 1.6788560712611345, "grad_norm": 0.27769342064857483, "learning_rate": 1.3801169590643275e-05, "loss": 0.0651, "step": 896 }, { "epoch": 1.680731364275668, "grad_norm": 0.2548845708370209, "learning_rate": 1.379337231968811e-05, "loss": 0.0993, "step": 897 }, { "epoch": 1.6826066572902016, "grad_norm": 0.28797003626823425, "learning_rate": 1.3785575048732943e-05, "loss": 0.0773, "step": 898 }, { "epoch": 1.6844819503047352, "grad_norm": 0.32892584800720215, "learning_rate": 1.377777777777778e-05, "loss": 0.0944, "step": 899 }, { "epoch": 1.6863572433192686, "grad_norm": 0.40558120608329773, "learning_rate": 1.3769980506822614e-05, "loss": 0.1006, "step": 900 }, { "epoch": 1.6863572433192686, "eval_loss": 0.07565333694219589, "eval_runtime": 675.1107, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 900 }, { "epoch": 1.688232536333802, "grad_norm": 0.3913367986679077, "learning_rate": 1.3762183235867448e-05, "loss": 0.0936, "step": 901 }, { "epoch": 1.6901078293483356, "grad_norm": 0.2699122726917267, "learning_rate": 1.3754385964912282e-05, "loss": 0.0829, "step": 902 }, { "epoch": 1.6919831223628692, "grad_norm": 0.35245972871780396, "learning_rate": 1.3746588693957116e-05, "loss": 0.0932, "step": 903 }, { "epoch": 1.6938584153774028, "grad_norm": 0.25840988755226135, "learning_rate": 1.3738791423001952e-05, "loss": 0.086, "step": 904 }, { "epoch": 1.6957337083919364, "grad_norm": 0.44727715849876404, "learning_rate": 1.3730994152046784e-05, "loss": 0.0939, "step": 905 }, { "epoch": 1.6976090014064698, "grad_norm": 0.2222200334072113, "learning_rate": 1.3723196881091619e-05, "loss": 0.0859, "step": 906 }, { "epoch": 1.6994842944210031, "grad_norm": 0.27306899428367615, "learning_rate": 1.3715399610136453e-05, "loss": 0.0877, "step": 907 }, { "epoch": 1.7013595874355367, "grad_norm": 0.41138342022895813, "learning_rate": 1.3707602339181287e-05, "loss": 0.0812, "step": 908 }, { "epoch": 1.7032348804500703, "grad_norm": 0.29449108242988586, "learning_rate": 1.3699805068226121e-05, "loss": 0.0887, "step": 909 }, { "epoch": 1.705110173464604, "grad_norm": 0.25317761301994324, "learning_rate": 1.3692007797270957e-05, "loss": 0.0921, "step": 910 }, { "epoch": 1.7069854664791375, "grad_norm": 0.24713647365570068, "learning_rate": 1.3684210526315791e-05, "loss": 0.0646, "step": 911 }, { "epoch": 1.7088607594936709, "grad_norm": 0.2317000776529312, "learning_rate": 1.3676413255360625e-05, "loss": 0.0596, "step": 912 }, { "epoch": 1.7107360525082043, "grad_norm": 0.2771907150745392, "learning_rate": 1.366861598440546e-05, "loss": 0.091, "step": 913 }, { "epoch": 1.7126113455227379, "grad_norm": 0.23459120094776154, "learning_rate": 1.3660818713450294e-05, "loss": 0.0701, "step": 914 }, { "epoch": 1.7144866385372715, "grad_norm": 0.30127042531967163, "learning_rate": 1.3653021442495126e-05, "loss": 0.088, "step": 915 }, { "epoch": 1.716361931551805, "grad_norm": 0.24772456288337708, "learning_rate": 1.364522417153996e-05, "loss": 0.0774, "step": 916 }, { "epoch": 1.7182372245663387, "grad_norm": 0.2913759648799896, "learning_rate": 1.3637426900584796e-05, "loss": 0.0826, "step": 917 }, { "epoch": 1.720112517580872, "grad_norm": 0.28088027238845825, "learning_rate": 1.362962962962963e-05, "loss": 0.0881, "step": 918 }, { "epoch": 1.7219878105954054, "grad_norm": 0.30184197425842285, "learning_rate": 1.3621832358674465e-05, "loss": 0.0942, "step": 919 }, { "epoch": 1.723863103609939, "grad_norm": 0.2570498287677765, "learning_rate": 1.3614035087719299e-05, "loss": 0.0693, "step": 920 }, { "epoch": 1.7257383966244726, "grad_norm": 0.31009960174560547, "learning_rate": 1.3606237816764133e-05, "loss": 0.068, "step": 921 }, { "epoch": 1.7276136896390062, "grad_norm": 0.23270297050476074, "learning_rate": 1.3598440545808969e-05, "loss": 0.0619, "step": 922 }, { "epoch": 1.7294889826535396, "grad_norm": 0.33265721797943115, "learning_rate": 1.3590643274853803e-05, "loss": 0.0995, "step": 923 }, { "epoch": 1.7313642756680732, "grad_norm": 0.38637009263038635, "learning_rate": 1.3582846003898637e-05, "loss": 0.1298, "step": 924 }, { "epoch": 1.7332395686826065, "grad_norm": 0.27367690205574036, "learning_rate": 1.357504873294347e-05, "loss": 0.0852, "step": 925 }, { "epoch": 1.7351148616971401, "grad_norm": 0.3776531517505646, "learning_rate": 1.3567251461988304e-05, "loss": 0.1081, "step": 926 }, { "epoch": 1.7369901547116737, "grad_norm": 0.25184816122055054, "learning_rate": 1.3559454191033138e-05, "loss": 0.0747, "step": 927 }, { "epoch": 1.7388654477262073, "grad_norm": 0.26781925559043884, "learning_rate": 1.3551656920077974e-05, "loss": 0.0974, "step": 928 }, { "epoch": 1.7407407407407407, "grad_norm": 0.38023102283477783, "learning_rate": 1.3543859649122808e-05, "loss": 0.0989, "step": 929 }, { "epoch": 1.7426160337552743, "grad_norm": 0.30040284991264343, "learning_rate": 1.3536062378167643e-05, "loss": 0.098, "step": 930 }, { "epoch": 1.7444913267698077, "grad_norm": 0.41722437739372253, "learning_rate": 1.3528265107212477e-05, "loss": 0.0788, "step": 931 }, { "epoch": 1.7463666197843413, "grad_norm": 0.31635239720344543, "learning_rate": 1.3520467836257311e-05, "loss": 0.131, "step": 932 }, { "epoch": 1.7482419127988749, "grad_norm": 0.34196072816848755, "learning_rate": 1.3512670565302147e-05, "loss": 0.1302, "step": 933 }, { "epoch": 1.7501172058134085, "grad_norm": 0.24944040179252625, "learning_rate": 1.3504873294346981e-05, "loss": 0.078, "step": 934 }, { "epoch": 1.7519924988279418, "grad_norm": 0.2601966857910156, "learning_rate": 1.3497076023391814e-05, "loss": 0.0774, "step": 935 }, { "epoch": 1.7538677918424754, "grad_norm": 0.27114635705947876, "learning_rate": 1.3489278752436648e-05, "loss": 0.0871, "step": 936 }, { "epoch": 1.7557430848570088, "grad_norm": 0.27622026205062866, "learning_rate": 1.3481481481481482e-05, "loss": 0.0668, "step": 937 }, { "epoch": 1.7576183778715424, "grad_norm": 0.32424378395080566, "learning_rate": 1.3473684210526316e-05, "loss": 0.1001, "step": 938 }, { "epoch": 1.759493670886076, "grad_norm": 0.36716216802597046, "learning_rate": 1.346588693957115e-05, "loss": 0.1308, "step": 939 }, { "epoch": 1.7613689639006096, "grad_norm": 0.29691779613494873, "learning_rate": 1.3458089668615986e-05, "loss": 0.0736, "step": 940 }, { "epoch": 1.763244256915143, "grad_norm": 0.36653903126716614, "learning_rate": 1.345029239766082e-05, "loss": 0.1169, "step": 941 }, { "epoch": 1.7651195499296763, "grad_norm": 0.21092906594276428, "learning_rate": 1.3442495126705655e-05, "loss": 0.0722, "step": 942 }, { "epoch": 1.76699484294421, "grad_norm": 0.34096020460128784, "learning_rate": 1.3434697855750489e-05, "loss": 0.1135, "step": 943 }, { "epoch": 1.7688701359587435, "grad_norm": 0.2584519386291504, "learning_rate": 1.3426900584795323e-05, "loss": 0.0712, "step": 944 }, { "epoch": 1.7707454289732771, "grad_norm": 0.24627022445201874, "learning_rate": 1.3419103313840155e-05, "loss": 0.0683, "step": 945 }, { "epoch": 1.7726207219878107, "grad_norm": 0.2898278534412384, "learning_rate": 1.3411306042884991e-05, "loss": 0.0833, "step": 946 }, { "epoch": 1.774496015002344, "grad_norm": 0.2810303568840027, "learning_rate": 1.3403508771929826e-05, "loss": 0.1173, "step": 947 }, { "epoch": 1.7763713080168775, "grad_norm": 0.2736395597457886, "learning_rate": 1.339571150097466e-05, "loss": 0.0681, "step": 948 }, { "epoch": 1.778246601031411, "grad_norm": 0.31029412150382996, "learning_rate": 1.3387914230019494e-05, "loss": 0.0915, "step": 949 }, { "epoch": 1.7801218940459447, "grad_norm": 0.24978873133659363, "learning_rate": 1.3380116959064328e-05, "loss": 0.091, "step": 950 }, { "epoch": 1.7801218940459447, "eval_loss": 0.07832063734531403, "eval_runtime": 676.2932, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 950 }, { "epoch": 1.7819971870604783, "grad_norm": 0.3023127615451813, "learning_rate": 1.3372319688109164e-05, "loss": 0.0887, "step": 951 }, { "epoch": 1.7838724800750119, "grad_norm": 0.4247094988822937, "learning_rate": 1.3364522417153998e-05, "loss": 0.1705, "step": 952 }, { "epoch": 1.7857477730895452, "grad_norm": 0.3702313005924225, "learning_rate": 1.3356725146198832e-05, "loss": 0.1087, "step": 953 }, { "epoch": 1.7876230661040786, "grad_norm": 0.26381444931030273, "learning_rate": 1.3348927875243667e-05, "loss": 0.072, "step": 954 }, { "epoch": 1.7894983591186122, "grad_norm": 0.32169631123542786, "learning_rate": 1.3341130604288499e-05, "loss": 0.0828, "step": 955 }, { "epoch": 1.7913736521331458, "grad_norm": 0.37914732098579407, "learning_rate": 1.3333333333333333e-05, "loss": 0.0998, "step": 956 }, { "epoch": 1.7932489451476794, "grad_norm": 0.2996065020561218, "learning_rate": 1.3325536062378167e-05, "loss": 0.1056, "step": 957 }, { "epoch": 1.795124238162213, "grad_norm": 0.2926497161388397, "learning_rate": 1.3317738791423003e-05, "loss": 0.1248, "step": 958 }, { "epoch": 1.7969995311767464, "grad_norm": 0.26383379101753235, "learning_rate": 1.3309941520467838e-05, "loss": 0.095, "step": 959 }, { "epoch": 1.7988748241912798, "grad_norm": 0.334276020526886, "learning_rate": 1.3302144249512672e-05, "loss": 0.0806, "step": 960 }, { "epoch": 1.8007501172058133, "grad_norm": 0.31996652483940125, "learning_rate": 1.3294346978557506e-05, "loss": 0.0691, "step": 961 }, { "epoch": 1.802625410220347, "grad_norm": 0.3317575752735138, "learning_rate": 1.328654970760234e-05, "loss": 0.0823, "step": 962 }, { "epoch": 1.8045007032348805, "grad_norm": 0.307790070772171, "learning_rate": 1.3278752436647176e-05, "loss": 0.0916, "step": 963 }, { "epoch": 1.806375996249414, "grad_norm": 0.3371288478374481, "learning_rate": 1.327095516569201e-05, "loss": 0.0917, "step": 964 }, { "epoch": 1.8082512892639475, "grad_norm": 0.4027099907398224, "learning_rate": 1.3263157894736843e-05, "loss": 0.1157, "step": 965 }, { "epoch": 1.810126582278481, "grad_norm": 0.28519630432128906, "learning_rate": 1.3255360623781677e-05, "loss": 0.0841, "step": 966 }, { "epoch": 1.8120018752930145, "grad_norm": 0.2463994026184082, "learning_rate": 1.3247563352826511e-05, "loss": 0.0894, "step": 967 }, { "epoch": 1.813877168307548, "grad_norm": 0.2853533625602722, "learning_rate": 1.3239766081871345e-05, "loss": 0.0667, "step": 968 }, { "epoch": 1.8157524613220817, "grad_norm": 0.25355201959609985, "learning_rate": 1.3231968810916181e-05, "loss": 0.0765, "step": 969 }, { "epoch": 1.817627754336615, "grad_norm": 0.3773306608200073, "learning_rate": 1.3224171539961015e-05, "loss": 0.0854, "step": 970 }, { "epoch": 1.8195030473511487, "grad_norm": 0.29040902853012085, "learning_rate": 1.321637426900585e-05, "loss": 0.0893, "step": 971 }, { "epoch": 1.821378340365682, "grad_norm": 0.35382455587387085, "learning_rate": 1.3208576998050684e-05, "loss": 0.1021, "step": 972 }, { "epoch": 1.8232536333802156, "grad_norm": 0.4576570391654968, "learning_rate": 1.3200779727095518e-05, "loss": 0.1179, "step": 973 }, { "epoch": 1.8251289263947492, "grad_norm": 0.37235504388809204, "learning_rate": 1.3192982456140354e-05, "loss": 0.1344, "step": 974 }, { "epoch": 1.8270042194092828, "grad_norm": 0.30962350964546204, "learning_rate": 1.3185185185185185e-05, "loss": 0.0751, "step": 975 }, { "epoch": 1.8288795124238162, "grad_norm": 0.2378237396478653, "learning_rate": 1.317738791423002e-05, "loss": 0.0714, "step": 976 }, { "epoch": 1.8307548054383498, "grad_norm": 0.2367488443851471, "learning_rate": 1.3169590643274855e-05, "loss": 0.0599, "step": 977 }, { "epoch": 1.8326300984528832, "grad_norm": 0.2833852171897888, "learning_rate": 1.3161793372319689e-05, "loss": 0.0995, "step": 978 }, { "epoch": 1.8345053914674168, "grad_norm": 1.0794099569320679, "learning_rate": 1.3153996101364523e-05, "loss": 0.1319, "step": 979 }, { "epoch": 1.8363806844819504, "grad_norm": 0.29689860343933105, "learning_rate": 1.3146198830409357e-05, "loss": 0.0849, "step": 980 }, { "epoch": 1.838255977496484, "grad_norm": 0.3836843967437744, "learning_rate": 1.3138401559454193e-05, "loss": 0.1075, "step": 981 }, { "epoch": 1.8401312705110173, "grad_norm": 0.27197182178497314, "learning_rate": 1.3130604288499027e-05, "loss": 0.08, "step": 982 }, { "epoch": 1.842006563525551, "grad_norm": 0.3288334012031555, "learning_rate": 1.3122807017543862e-05, "loss": 0.0604, "step": 983 }, { "epoch": 1.8438818565400843, "grad_norm": 0.3458631932735443, "learning_rate": 1.3115009746588696e-05, "loss": 0.1051, "step": 984 }, { "epoch": 1.845757149554618, "grad_norm": 0.28444722294807434, "learning_rate": 1.3107212475633528e-05, "loss": 0.0876, "step": 985 }, { "epoch": 1.8476324425691515, "grad_norm": 0.3890294134616852, "learning_rate": 1.3099415204678362e-05, "loss": 0.1209, "step": 986 }, { "epoch": 1.849507735583685, "grad_norm": 0.2536354064941406, "learning_rate": 1.3091617933723197e-05, "loss": 0.0615, "step": 987 }, { "epoch": 1.8513830285982185, "grad_norm": 0.2301822453737259, "learning_rate": 1.3083820662768032e-05, "loss": 0.0802, "step": 988 }, { "epoch": 1.8532583216127518, "grad_norm": 0.5876390337944031, "learning_rate": 1.3076023391812867e-05, "loss": 0.134, "step": 989 }, { "epoch": 1.8551336146272854, "grad_norm": 0.2957145571708679, "learning_rate": 1.3068226120857701e-05, "loss": 0.0834, "step": 990 }, { "epoch": 1.857008907641819, "grad_norm": 0.321350634098053, "learning_rate": 1.3060428849902535e-05, "loss": 0.0998, "step": 991 }, { "epoch": 1.8588842006563526, "grad_norm": 0.32503458857536316, "learning_rate": 1.305263157894737e-05, "loss": 0.1065, "step": 992 }, { "epoch": 1.8607594936708862, "grad_norm": 0.2906481623649597, "learning_rate": 1.3044834307992205e-05, "loss": 0.0878, "step": 993 }, { "epoch": 1.8626347866854196, "grad_norm": 0.25748249888420105, "learning_rate": 1.303703703703704e-05, "loss": 0.0784, "step": 994 }, { "epoch": 1.864510079699953, "grad_norm": 0.2645077407360077, "learning_rate": 1.3029239766081872e-05, "loss": 0.0827, "step": 995 }, { "epoch": 1.8663853727144866, "grad_norm": 0.28453439474105835, "learning_rate": 1.3021442495126706e-05, "loss": 0.0792, "step": 996 }, { "epoch": 1.8682606657290202, "grad_norm": 0.2839300036430359, "learning_rate": 1.301364522417154e-05, "loss": 0.069, "step": 997 }, { "epoch": 1.8701359587435538, "grad_norm": 0.40391039848327637, "learning_rate": 1.3005847953216374e-05, "loss": 0.068, "step": 998 }, { "epoch": 1.8720112517580874, "grad_norm": 0.29251784086227417, "learning_rate": 1.299805068226121e-05, "loss": 0.0731, "step": 999 }, { "epoch": 1.8738865447726207, "grad_norm": 0.2935086786746979, "learning_rate": 1.2990253411306044e-05, "loss": 0.0984, "step": 1000 }, { "epoch": 1.8738865447726207, "eval_loss": 0.07738383859395981, "eval_runtime": 676.7683, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 1000 }, { "epoch": 1.875761837787154, "grad_norm": 0.5241721868515015, "learning_rate": 1.2982456140350879e-05, "loss": 0.1288, "step": 1001 }, { "epoch": 1.8776371308016877, "grad_norm": 0.2855282425880432, "learning_rate": 1.2974658869395713e-05, "loss": 0.0962, "step": 1002 }, { "epoch": 1.8795124238162213, "grad_norm": 0.26559796929359436, "learning_rate": 1.2966861598440547e-05, "loss": 0.0838, "step": 1003 }, { "epoch": 1.881387716830755, "grad_norm": 0.31060707569122314, "learning_rate": 1.2959064327485383e-05, "loss": 0.1008, "step": 1004 }, { "epoch": 1.8832630098452883, "grad_norm": 0.30378690361976624, "learning_rate": 1.2951267056530214e-05, "loss": 0.0783, "step": 1005 }, { "epoch": 1.8851383028598219, "grad_norm": 0.3393579125404358, "learning_rate": 1.294346978557505e-05, "loss": 0.075, "step": 1006 }, { "epoch": 1.8870135958743552, "grad_norm": 0.36604905128479004, "learning_rate": 1.2935672514619884e-05, "loss": 0.0771, "step": 1007 }, { "epoch": 1.8888888888888888, "grad_norm": 0.3377784192562103, "learning_rate": 1.2927875243664718e-05, "loss": 0.1062, "step": 1008 }, { "epoch": 1.8907641819034224, "grad_norm": 0.29350435733795166, "learning_rate": 1.2920077972709552e-05, "loss": 0.0703, "step": 1009 }, { "epoch": 1.892639474917956, "grad_norm": 0.2543967366218567, "learning_rate": 1.2912280701754386e-05, "loss": 0.0736, "step": 1010 }, { "epoch": 1.8945147679324894, "grad_norm": 0.29992133378982544, "learning_rate": 1.2904483430799222e-05, "loss": 0.0945, "step": 1011 }, { "epoch": 1.896390060947023, "grad_norm": 0.26045867800712585, "learning_rate": 1.2896686159844056e-05, "loss": 0.0823, "step": 1012 }, { "epoch": 1.8982653539615564, "grad_norm": 0.37396302819252014, "learning_rate": 1.288888888888889e-05, "loss": 0.1008, "step": 1013 }, { "epoch": 1.90014064697609, "grad_norm": 0.3353418707847595, "learning_rate": 1.2881091617933725e-05, "loss": 0.1059, "step": 1014 }, { "epoch": 1.9020159399906236, "grad_norm": 0.2745090126991272, "learning_rate": 1.2873294346978557e-05, "loss": 0.0838, "step": 1015 }, { "epoch": 1.9038912330051572, "grad_norm": 0.25266072154045105, "learning_rate": 1.2865497076023392e-05, "loss": 0.0638, "step": 1016 }, { "epoch": 1.9057665260196905, "grad_norm": 0.41703373193740845, "learning_rate": 1.2857699805068227e-05, "loss": 0.1085, "step": 1017 }, { "epoch": 1.9076418190342241, "grad_norm": 0.35806798934936523, "learning_rate": 1.2849902534113062e-05, "loss": 0.0803, "step": 1018 }, { "epoch": 1.9095171120487575, "grad_norm": 0.36233675479888916, "learning_rate": 1.2842105263157896e-05, "loss": 0.1004, "step": 1019 }, { "epoch": 1.9113924050632911, "grad_norm": 0.2726714611053467, "learning_rate": 1.283430799220273e-05, "loss": 0.0771, "step": 1020 }, { "epoch": 1.9132676980778247, "grad_norm": 0.25889918208122253, "learning_rate": 1.2826510721247564e-05, "loss": 0.0729, "step": 1021 }, { "epoch": 1.9151429910923583, "grad_norm": 0.27719736099243164, "learning_rate": 1.28187134502924e-05, "loss": 0.0926, "step": 1022 }, { "epoch": 1.9170182841068917, "grad_norm": 0.6307101249694824, "learning_rate": 1.2810916179337234e-05, "loss": 0.1319, "step": 1023 }, { "epoch": 1.9188935771214253, "grad_norm": 0.3898720443248749, "learning_rate": 1.2803118908382068e-05, "loss": 0.0881, "step": 1024 }, { "epoch": 1.9207688701359587, "grad_norm": 0.33780285716056824, "learning_rate": 1.2795321637426901e-05, "loss": 0.1037, "step": 1025 }, { "epoch": 1.9226441631504922, "grad_norm": 0.28749212622642517, "learning_rate": 1.2787524366471735e-05, "loss": 0.093, "step": 1026 }, { "epoch": 1.9245194561650258, "grad_norm": 0.33152034878730774, "learning_rate": 1.277972709551657e-05, "loss": 0.0945, "step": 1027 }, { "epoch": 1.9263947491795594, "grad_norm": 0.28936097025871277, "learning_rate": 1.2771929824561404e-05, "loss": 0.0916, "step": 1028 }, { "epoch": 1.9282700421940928, "grad_norm": 0.2941216826438904, "learning_rate": 1.276413255360624e-05, "loss": 0.0994, "step": 1029 }, { "epoch": 1.9301453352086262, "grad_norm": 0.30549120903015137, "learning_rate": 1.2756335282651074e-05, "loss": 0.0899, "step": 1030 }, { "epoch": 1.9320206282231598, "grad_norm": 0.24775011837482452, "learning_rate": 1.2748538011695908e-05, "loss": 0.0623, "step": 1031 }, { "epoch": 1.9338959212376934, "grad_norm": 0.27423885464668274, "learning_rate": 1.2740740740740742e-05, "loss": 0.0614, "step": 1032 }, { "epoch": 1.935771214252227, "grad_norm": 0.33300426602363586, "learning_rate": 1.2732943469785576e-05, "loss": 0.1014, "step": 1033 }, { "epoch": 1.9376465072667606, "grad_norm": 0.318990558385849, "learning_rate": 1.2725146198830412e-05, "loss": 0.0904, "step": 1034 }, { "epoch": 1.939521800281294, "grad_norm": 0.2824667692184448, "learning_rate": 1.2717348927875245e-05, "loss": 0.0735, "step": 1035 }, { "epoch": 1.9413970932958273, "grad_norm": 0.2873893976211548, "learning_rate": 1.2709551656920079e-05, "loss": 0.066, "step": 1036 }, { "epoch": 1.943272386310361, "grad_norm": 0.33205756545066833, "learning_rate": 1.2701754385964913e-05, "loss": 0.1017, "step": 1037 }, { "epoch": 1.9451476793248945, "grad_norm": 0.5657170414924622, "learning_rate": 1.2693957115009747e-05, "loss": 0.08, "step": 1038 }, { "epoch": 1.9470229723394281, "grad_norm": 0.5216059684753418, "learning_rate": 1.2686159844054581e-05, "loss": 0.0942, "step": 1039 }, { "epoch": 1.9488982653539617, "grad_norm": 0.45388665795326233, "learning_rate": 1.2678362573099417e-05, "loss": 0.0876, "step": 1040 }, { "epoch": 1.950773558368495, "grad_norm": 0.28133487701416016, "learning_rate": 1.2670565302144251e-05, "loss": 0.0829, "step": 1041 }, { "epoch": 1.9526488513830285, "grad_norm": 0.32736194133758545, "learning_rate": 1.2662768031189086e-05, "loss": 0.1283, "step": 1042 }, { "epoch": 1.954524144397562, "grad_norm": 0.307423859834671, "learning_rate": 1.265497076023392e-05, "loss": 0.0954, "step": 1043 }, { "epoch": 1.9563994374120957, "grad_norm": 0.3167324662208557, "learning_rate": 1.2647173489278754e-05, "loss": 0.0948, "step": 1044 }, { "epoch": 1.9582747304266293, "grad_norm": 0.2761521637439728, "learning_rate": 1.2639376218323586e-05, "loss": 0.0632, "step": 1045 }, { "epoch": 1.9601500234411628, "grad_norm": 0.3254704475402832, "learning_rate": 1.263157894736842e-05, "loss": 0.068, "step": 1046 }, { "epoch": 1.9620253164556962, "grad_norm": 0.41552290320396423, "learning_rate": 1.2623781676413257e-05, "loss": 0.1075, "step": 1047 }, { "epoch": 1.9639006094702296, "grad_norm": 0.31078511476516724, "learning_rate": 1.261598440545809e-05, "loss": 0.0812, "step": 1048 }, { "epoch": 1.9657759024847632, "grad_norm": 0.3442966639995575, "learning_rate": 1.2608187134502925e-05, "loss": 0.0926, "step": 1049 }, { "epoch": 1.9676511954992968, "grad_norm": 0.2290525734424591, "learning_rate": 1.2600389863547759e-05, "loss": 0.069, "step": 1050 }, { "epoch": 1.9676511954992968, "eval_loss": 0.07501588761806488, "eval_runtime": 675.7387, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1050 }, { "epoch": 1.9695264885138304, "grad_norm": 0.2715151309967041, "learning_rate": 1.2592592592592593e-05, "loss": 0.0701, "step": 1051 }, { "epoch": 1.9714017815283638, "grad_norm": 0.3169614374637604, "learning_rate": 1.258479532163743e-05, "loss": 0.1119, "step": 1052 }, { "epoch": 1.9732770745428974, "grad_norm": 0.6188083291053772, "learning_rate": 1.2576998050682263e-05, "loss": 0.0969, "step": 1053 }, { "epoch": 1.9751523675574307, "grad_norm": 0.28429678082466125, "learning_rate": 1.2569200779727098e-05, "loss": 0.058, "step": 1054 }, { "epoch": 1.9770276605719643, "grad_norm": 0.7146753072738647, "learning_rate": 1.256140350877193e-05, "loss": 0.0636, "step": 1055 }, { "epoch": 1.978902953586498, "grad_norm": 0.5578083992004395, "learning_rate": 1.2553606237816764e-05, "loss": 0.0935, "step": 1056 }, { "epoch": 1.9807782466010315, "grad_norm": 0.30978304147720337, "learning_rate": 1.2545808966861598e-05, "loss": 0.0784, "step": 1057 }, { "epoch": 1.982653539615565, "grad_norm": 0.42586255073547363, "learning_rate": 1.2538011695906434e-05, "loss": 0.0982, "step": 1058 }, { "epoch": 1.9845288326300985, "grad_norm": 0.3126954138278961, "learning_rate": 1.2530214424951269e-05, "loss": 0.0776, "step": 1059 }, { "epoch": 1.9864041256446319, "grad_norm": 0.37667107582092285, "learning_rate": 1.2522417153996103e-05, "loss": 0.1329, "step": 1060 }, { "epoch": 1.9882794186591655, "grad_norm": 0.484842985868454, "learning_rate": 1.2514619883040937e-05, "loss": 0.0538, "step": 1061 }, { "epoch": 1.990154711673699, "grad_norm": 0.3426309823989868, "learning_rate": 1.2506822612085771e-05, "loss": 0.1018, "step": 1062 }, { "epoch": 1.9920300046882327, "grad_norm": 0.2878342866897583, "learning_rate": 1.2499025341130607e-05, "loss": 0.094, "step": 1063 }, { "epoch": 1.993905297702766, "grad_norm": 0.30334293842315674, "learning_rate": 1.2491228070175441e-05, "loss": 0.0705, "step": 1064 }, { "epoch": 1.9957805907172996, "grad_norm": 0.31606927514076233, "learning_rate": 1.2483430799220274e-05, "loss": 0.0955, "step": 1065 }, { "epoch": 1.997655883731833, "grad_norm": 0.3537469804286957, "learning_rate": 1.2475633528265108e-05, "loss": 0.0897, "step": 1066 }, { "epoch": 1.9995311767463666, "grad_norm": 0.30905476212501526, "learning_rate": 1.2467836257309942e-05, "loss": 0.1066, "step": 1067 }, { "epoch": 2.0, "grad_norm": 0.5351856350898743, "learning_rate": 1.2460038986354776e-05, "loss": 0.0439, "step": 1068 }, { "epoch": 2.0018752930145336, "grad_norm": 0.3859151601791382, "learning_rate": 1.245224171539961e-05, "loss": 0.086, "step": 1069 }, { "epoch": 2.003750586029067, "grad_norm": 0.3007124066352844, "learning_rate": 1.2444444444444446e-05, "loss": 0.0868, "step": 1070 }, { "epoch": 2.0056258790436003, "grad_norm": 0.26716360449790955, "learning_rate": 1.243664717348928e-05, "loss": 0.1024, "step": 1071 }, { "epoch": 2.007501172058134, "grad_norm": 0.33358505368232727, "learning_rate": 1.2428849902534115e-05, "loss": 0.097, "step": 1072 }, { "epoch": 2.0093764650726675, "grad_norm": 0.30429476499557495, "learning_rate": 1.2421052631578949e-05, "loss": 0.0773, "step": 1073 }, { "epoch": 2.011251758087201, "grad_norm": 0.2670055627822876, "learning_rate": 1.2413255360623783e-05, "loss": 0.0705, "step": 1074 }, { "epoch": 2.0131270511017347, "grad_norm": 0.30095306038856506, "learning_rate": 1.2405458089668616e-05, "loss": 0.0999, "step": 1075 }, { "epoch": 2.0150023441162683, "grad_norm": 0.3201741874217987, "learning_rate": 1.239766081871345e-05, "loss": 0.1148, "step": 1076 }, { "epoch": 2.0168776371308015, "grad_norm": 0.31769925355911255, "learning_rate": 1.2389863547758286e-05, "loss": 0.0756, "step": 1077 }, { "epoch": 2.018752930145335, "grad_norm": 0.3209584951400757, "learning_rate": 1.238206627680312e-05, "loss": 0.077, "step": 1078 }, { "epoch": 2.0206282231598687, "grad_norm": 0.2504936754703522, "learning_rate": 1.2374269005847954e-05, "loss": 0.0819, "step": 1079 }, { "epoch": 2.0225035161744023, "grad_norm": 0.31677699089050293, "learning_rate": 1.2366471734892788e-05, "loss": 0.0874, "step": 1080 }, { "epoch": 2.024378809188936, "grad_norm": 0.35840892791748047, "learning_rate": 1.2358674463937622e-05, "loss": 0.0859, "step": 1081 }, { "epoch": 2.0262541022034695, "grad_norm": 0.3263484537601471, "learning_rate": 1.2350877192982458e-05, "loss": 0.1069, "step": 1082 }, { "epoch": 2.0281293952180026, "grad_norm": 0.23933690786361694, "learning_rate": 1.2343079922027293e-05, "loss": 0.0628, "step": 1083 }, { "epoch": 2.030004688232536, "grad_norm": 0.2623092532157898, "learning_rate": 1.2335282651072127e-05, "loss": 0.0698, "step": 1084 }, { "epoch": 2.03187998124707, "grad_norm": 0.30900081992149353, "learning_rate": 1.232748538011696e-05, "loss": 0.0713, "step": 1085 }, { "epoch": 2.0337552742616034, "grad_norm": 0.3421262502670288, "learning_rate": 1.2319688109161793e-05, "loss": 0.1117, "step": 1086 }, { "epoch": 2.035630567276137, "grad_norm": 0.22926147282123566, "learning_rate": 1.2311890838206628e-05, "loss": 0.0522, "step": 1087 }, { "epoch": 2.0375058602906706, "grad_norm": 0.2777685821056366, "learning_rate": 1.2304093567251463e-05, "loss": 0.0524, "step": 1088 }, { "epoch": 2.0393811533052038, "grad_norm": 0.3484315276145935, "learning_rate": 1.2296296296296298e-05, "loss": 0.0844, "step": 1089 }, { "epoch": 2.0412564463197373, "grad_norm": 0.3067864775657654, "learning_rate": 1.2288499025341132e-05, "loss": 0.0889, "step": 1090 }, { "epoch": 2.043131739334271, "grad_norm": 0.3049250543117523, "learning_rate": 1.2280701754385966e-05, "loss": 0.0874, "step": 1091 }, { "epoch": 2.0450070323488045, "grad_norm": 0.371250718832016, "learning_rate": 1.22729044834308e-05, "loss": 0.0608, "step": 1092 }, { "epoch": 2.046882325363338, "grad_norm": 0.35286757349967957, "learning_rate": 1.2265107212475636e-05, "loss": 0.0617, "step": 1093 }, { "epoch": 2.0487576183778717, "grad_norm": 0.31813135743141174, "learning_rate": 1.225730994152047e-05, "loss": 0.072, "step": 1094 }, { "epoch": 2.050632911392405, "grad_norm": 0.3504825830459595, "learning_rate": 1.2249512670565303e-05, "loss": 0.0917, "step": 1095 }, { "epoch": 2.0525082044069385, "grad_norm": 0.3622145354747772, "learning_rate": 1.2241715399610137e-05, "loss": 0.1072, "step": 1096 }, { "epoch": 2.054383497421472, "grad_norm": 0.2720160186290741, "learning_rate": 1.2233918128654971e-05, "loss": 0.0644, "step": 1097 }, { "epoch": 2.0562587904360057, "grad_norm": 0.2766894996166229, "learning_rate": 1.2226120857699805e-05, "loss": 0.0764, "step": 1098 }, { "epoch": 2.0581340834505393, "grad_norm": 0.3748610019683838, "learning_rate": 1.221832358674464e-05, "loss": 0.074, "step": 1099 }, { "epoch": 2.060009376465073, "grad_norm": 0.2557353377342224, "learning_rate": 1.2210526315789475e-05, "loss": 0.0731, "step": 1100 }, { "epoch": 2.060009376465073, "eval_loss": 0.07578001916408539, "eval_runtime": 676.4414, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 1100 }, { "epoch": 2.061884669479606, "grad_norm": 0.4064513146877289, "learning_rate": 1.220272904483431e-05, "loss": 0.0999, "step": 1101 }, { "epoch": 2.0637599624941396, "grad_norm": 0.3794358968734741, "learning_rate": 1.2194931773879144e-05, "loss": 0.1305, "step": 1102 }, { "epoch": 2.065635255508673, "grad_norm": 0.3337356448173523, "learning_rate": 1.2187134502923978e-05, "loss": 0.073, "step": 1103 }, { "epoch": 2.067510548523207, "grad_norm": 0.310234010219574, "learning_rate": 1.2179337231968812e-05, "loss": 0.0899, "step": 1104 }, { "epoch": 2.0693858415377404, "grad_norm": 0.28629547357559204, "learning_rate": 1.2171539961013645e-05, "loss": 0.0674, "step": 1105 }, { "epoch": 2.071261134552274, "grad_norm": 0.31278061866760254, "learning_rate": 1.216374269005848e-05, "loss": 0.0724, "step": 1106 }, { "epoch": 2.073136427566807, "grad_norm": 0.3247126638889313, "learning_rate": 1.2155945419103315e-05, "loss": 0.0771, "step": 1107 }, { "epoch": 2.0750117205813408, "grad_norm": 0.3694022297859192, "learning_rate": 1.2148148148148149e-05, "loss": 0.0897, "step": 1108 }, { "epoch": 2.0768870135958744, "grad_norm": 0.3070647120475769, "learning_rate": 1.2140350877192983e-05, "loss": 0.0817, "step": 1109 }, { "epoch": 2.078762306610408, "grad_norm": 0.3457229733467102, "learning_rate": 1.2132553606237817e-05, "loss": 0.0743, "step": 1110 }, { "epoch": 2.0806375996249415, "grad_norm": 0.31315433979034424, "learning_rate": 1.2124756335282653e-05, "loss": 0.0742, "step": 1111 }, { "epoch": 2.0825128926394747, "grad_norm": 0.31154918670654297, "learning_rate": 1.2116959064327487e-05, "loss": 0.0748, "step": 1112 }, { "epoch": 2.0843881856540083, "grad_norm": 0.3989027738571167, "learning_rate": 1.2109161793372322e-05, "loss": 0.1073, "step": 1113 }, { "epoch": 2.086263478668542, "grad_norm": 0.3276286721229553, "learning_rate": 1.2101364522417156e-05, "loss": 0.1067, "step": 1114 }, { "epoch": 2.0881387716830755, "grad_norm": 0.32193440198898315, "learning_rate": 1.2093567251461988e-05, "loss": 0.0636, "step": 1115 }, { "epoch": 2.090014064697609, "grad_norm": 0.34374260902404785, "learning_rate": 1.2085769980506823e-05, "loss": 0.092, "step": 1116 }, { "epoch": 2.0918893577121427, "grad_norm": 0.24487826228141785, "learning_rate": 1.2077972709551657e-05, "loss": 0.0617, "step": 1117 }, { "epoch": 2.093764650726676, "grad_norm": 0.3080878257751465, "learning_rate": 1.2070175438596493e-05, "loss": 0.0682, "step": 1118 }, { "epoch": 2.0956399437412094, "grad_norm": 0.2777683138847351, "learning_rate": 1.2062378167641327e-05, "loss": 0.0716, "step": 1119 }, { "epoch": 2.097515236755743, "grad_norm": 0.3162023723125458, "learning_rate": 1.2054580896686161e-05, "loss": 0.0841, "step": 1120 }, { "epoch": 2.0993905297702766, "grad_norm": 0.4254573881626129, "learning_rate": 1.2046783625730995e-05, "loss": 0.0511, "step": 1121 }, { "epoch": 2.1012658227848102, "grad_norm": 0.33648309111595154, "learning_rate": 1.203898635477583e-05, "loss": 0.0949, "step": 1122 }, { "epoch": 2.103141115799344, "grad_norm": 0.24535001814365387, "learning_rate": 1.2031189083820665e-05, "loss": 0.0637, "step": 1123 }, { "epoch": 2.105016408813877, "grad_norm": 0.2758919894695282, "learning_rate": 1.20233918128655e-05, "loss": 0.0825, "step": 1124 }, { "epoch": 2.1068917018284106, "grad_norm": 0.29626914858818054, "learning_rate": 1.2015594541910332e-05, "loss": 0.0829, "step": 1125 }, { "epoch": 2.108766994842944, "grad_norm": 0.43505001068115234, "learning_rate": 1.2007797270955166e-05, "loss": 0.1068, "step": 1126 }, { "epoch": 2.1106422878574778, "grad_norm": 0.29316994547843933, "learning_rate": 1.2e-05, "loss": 0.0632, "step": 1127 }, { "epoch": 2.1125175808720114, "grad_norm": 0.3550972640514374, "learning_rate": 1.1992202729044834e-05, "loss": 0.0824, "step": 1128 }, { "epoch": 2.114392873886545, "grad_norm": 0.34816864132881165, "learning_rate": 1.198440545808967e-05, "loss": 0.0885, "step": 1129 }, { "epoch": 2.116268166901078, "grad_norm": 0.3325449526309967, "learning_rate": 1.1976608187134505e-05, "loss": 0.0932, "step": 1130 }, { "epoch": 2.1181434599156117, "grad_norm": 0.3157937228679657, "learning_rate": 1.1968810916179339e-05, "loss": 0.0735, "step": 1131 }, { "epoch": 2.1200187529301453, "grad_norm": 0.32261866331100464, "learning_rate": 1.1961013645224173e-05, "loss": 0.0657, "step": 1132 }, { "epoch": 2.121894045944679, "grad_norm": 0.36068305373191833, "learning_rate": 1.1953216374269007e-05, "loss": 0.0977, "step": 1133 }, { "epoch": 2.1237693389592125, "grad_norm": 0.27126020193099976, "learning_rate": 1.194541910331384e-05, "loss": 0.0554, "step": 1134 }, { "epoch": 2.125644631973746, "grad_norm": 0.3004317879676819, "learning_rate": 1.1937621832358674e-05, "loss": 0.0741, "step": 1135 }, { "epoch": 2.1275199249882792, "grad_norm": 0.33724191784858704, "learning_rate": 1.192982456140351e-05, "loss": 0.1176, "step": 1136 }, { "epoch": 2.129395218002813, "grad_norm": 0.34397128224372864, "learning_rate": 1.1922027290448344e-05, "loss": 0.0782, "step": 1137 }, { "epoch": 2.1312705110173464, "grad_norm": 0.34579211473464966, "learning_rate": 1.1914230019493178e-05, "loss": 0.091, "step": 1138 }, { "epoch": 2.13314580403188, "grad_norm": 0.27181315422058105, "learning_rate": 1.1906432748538012e-05, "loss": 0.0536, "step": 1139 }, { "epoch": 2.1350210970464136, "grad_norm": 0.3022139370441437, "learning_rate": 1.1898635477582846e-05, "loss": 0.0658, "step": 1140 }, { "epoch": 2.1368963900609472, "grad_norm": 0.38249850273132324, "learning_rate": 1.1890838206627682e-05, "loss": 0.0708, "step": 1141 }, { "epoch": 2.1387716830754804, "grad_norm": 0.3176434636116028, "learning_rate": 1.1883040935672517e-05, "loss": 0.0811, "step": 1142 }, { "epoch": 2.140646976090014, "grad_norm": 0.40170273184776306, "learning_rate": 1.187524366471735e-05, "loss": 0.1152, "step": 1143 }, { "epoch": 2.1425222691045476, "grad_norm": 0.3950449824333191, "learning_rate": 1.1867446393762183e-05, "loss": 0.1099, "step": 1144 }, { "epoch": 2.144397562119081, "grad_norm": 0.28338244557380676, "learning_rate": 1.1859649122807017e-05, "loss": 0.0821, "step": 1145 }, { "epoch": 2.1462728551336148, "grad_norm": 0.6058647036552429, "learning_rate": 1.1851851851851852e-05, "loss": 0.0856, "step": 1146 }, { "epoch": 2.148148148148148, "grad_norm": 0.28024426102638245, "learning_rate": 1.1844054580896688e-05, "loss": 0.0695, "step": 1147 }, { "epoch": 2.1500234411626815, "grad_norm": 0.3102171719074249, "learning_rate": 1.1836257309941522e-05, "loss": 0.0646, "step": 1148 }, { "epoch": 2.151898734177215, "grad_norm": 0.30384719371795654, "learning_rate": 1.1828460038986356e-05, "loss": 0.0779, "step": 1149 }, { "epoch": 2.1537740271917487, "grad_norm": 0.31122729182243347, "learning_rate": 1.182066276803119e-05, "loss": 0.0802, "step": 1150 }, { "epoch": 2.1537740271917487, "eval_loss": 0.07541442662477493, "eval_runtime": 675.8929, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 1150 }, { "epoch": 2.1556493202062823, "grad_norm": 0.39089834690093994, "learning_rate": 1.1812865497076024e-05, "loss": 0.1295, "step": 1151 }, { "epoch": 2.157524613220816, "grad_norm": 0.286997526884079, "learning_rate": 1.180506822612086e-05, "loss": 0.0582, "step": 1152 }, { "epoch": 2.1593999062353495, "grad_norm": 0.2970065772533417, "learning_rate": 1.1797270955165694e-05, "loss": 0.0862, "step": 1153 }, { "epoch": 2.1612751992498827, "grad_norm": 0.2717902660369873, "learning_rate": 1.1789473684210527e-05, "loss": 0.055, "step": 1154 }, { "epoch": 2.1631504922644162, "grad_norm": 0.3172938823699951, "learning_rate": 1.1781676413255361e-05, "loss": 0.0869, "step": 1155 }, { "epoch": 2.16502578527895, "grad_norm": 0.42429736256599426, "learning_rate": 1.1773879142300195e-05, "loss": 0.0802, "step": 1156 }, { "epoch": 2.1669010782934834, "grad_norm": 0.31430354714393616, "learning_rate": 1.176608187134503e-05, "loss": 0.0894, "step": 1157 }, { "epoch": 2.168776371308017, "grad_norm": 0.33213984966278076, "learning_rate": 1.1758284600389864e-05, "loss": 0.1002, "step": 1158 }, { "epoch": 2.17065166432255, "grad_norm": 0.35632964968681335, "learning_rate": 1.17504873294347e-05, "loss": 0.0793, "step": 1159 }, { "epoch": 2.172526957337084, "grad_norm": 0.376068651676178, "learning_rate": 1.1742690058479534e-05, "loss": 0.0858, "step": 1160 }, { "epoch": 2.1744022503516174, "grad_norm": 0.3594323694705963, "learning_rate": 1.1734892787524368e-05, "loss": 0.0589, "step": 1161 }, { "epoch": 2.176277543366151, "grad_norm": 0.37725338339805603, "learning_rate": 1.1727095516569202e-05, "loss": 0.1046, "step": 1162 }, { "epoch": 2.1781528363806846, "grad_norm": 0.28712713718414307, "learning_rate": 1.1719298245614036e-05, "loss": 0.0629, "step": 1163 }, { "epoch": 2.180028129395218, "grad_norm": 0.31834840774536133, "learning_rate": 1.1711500974658869e-05, "loss": 0.0971, "step": 1164 }, { "epoch": 2.1819034224097513, "grad_norm": 0.38184770941734314, "learning_rate": 1.1703703703703703e-05, "loss": 0.0971, "step": 1165 }, { "epoch": 2.183778715424285, "grad_norm": 0.3888046145439148, "learning_rate": 1.1695906432748539e-05, "loss": 0.1172, "step": 1166 }, { "epoch": 2.1856540084388185, "grad_norm": 0.3301868736743927, "learning_rate": 1.1688109161793373e-05, "loss": 0.0927, "step": 1167 }, { "epoch": 2.187529301453352, "grad_norm": 0.3630843460559845, "learning_rate": 1.1680311890838207e-05, "loss": 0.0839, "step": 1168 }, { "epoch": 2.1894045944678857, "grad_norm": 0.6889258623123169, "learning_rate": 1.1672514619883041e-05, "loss": 0.1217, "step": 1169 }, { "epoch": 2.1912798874824193, "grad_norm": 0.6070849299430847, "learning_rate": 1.1664717348927876e-05, "loss": 0.0926, "step": 1170 }, { "epoch": 2.1931551804969525, "grad_norm": 0.3465521037578583, "learning_rate": 1.1656920077972711e-05, "loss": 0.086, "step": 1171 }, { "epoch": 2.195030473511486, "grad_norm": 0.3908824622631073, "learning_rate": 1.1649122807017546e-05, "loss": 0.1146, "step": 1172 }, { "epoch": 2.1969057665260197, "grad_norm": 0.28279078006744385, "learning_rate": 1.164132553606238e-05, "loss": 0.072, "step": 1173 }, { "epoch": 2.1987810595405533, "grad_norm": 0.31403693556785583, "learning_rate": 1.1633528265107212e-05, "loss": 0.0871, "step": 1174 }, { "epoch": 2.200656352555087, "grad_norm": 0.33991819620132446, "learning_rate": 1.1625730994152047e-05, "loss": 0.0876, "step": 1175 }, { "epoch": 2.2025316455696204, "grad_norm": 0.3142626881599426, "learning_rate": 1.161793372319688e-05, "loss": 0.0724, "step": 1176 }, { "epoch": 2.2044069385841536, "grad_norm": 0.3219679594039917, "learning_rate": 1.1610136452241717e-05, "loss": 0.0659, "step": 1177 }, { "epoch": 2.206282231598687, "grad_norm": 0.3814351260662079, "learning_rate": 1.160233918128655e-05, "loss": 0.0708, "step": 1178 }, { "epoch": 2.208157524613221, "grad_norm": 0.380666583776474, "learning_rate": 1.1594541910331385e-05, "loss": 0.0952, "step": 1179 }, { "epoch": 2.2100328176277544, "grad_norm": 0.2872433662414551, "learning_rate": 1.158674463937622e-05, "loss": 0.0794, "step": 1180 }, { "epoch": 2.211908110642288, "grad_norm": 0.28936418890953064, "learning_rate": 1.1578947368421053e-05, "loss": 0.0606, "step": 1181 }, { "epoch": 2.2137834036568216, "grad_norm": 0.3666747808456421, "learning_rate": 1.157115009746589e-05, "loss": 0.0965, "step": 1182 }, { "epoch": 2.2156586966713547, "grad_norm": 0.3257901072502136, "learning_rate": 1.1563352826510723e-05, "loss": 0.0796, "step": 1183 }, { "epoch": 2.2175339896858883, "grad_norm": 0.47882843017578125, "learning_rate": 1.1555555555555556e-05, "loss": 0.0816, "step": 1184 }, { "epoch": 2.219409282700422, "grad_norm": 0.2858003079891205, "learning_rate": 1.154775828460039e-05, "loss": 0.069, "step": 1185 }, { "epoch": 2.2212845757149555, "grad_norm": 0.31669747829437256, "learning_rate": 1.1539961013645224e-05, "loss": 0.0842, "step": 1186 }, { "epoch": 2.223159868729489, "grad_norm": 0.3197777271270752, "learning_rate": 1.1532163742690059e-05, "loss": 0.099, "step": 1187 }, { "epoch": 2.2250351617440227, "grad_norm": 0.34864693880081177, "learning_rate": 1.1524366471734893e-05, "loss": 0.0849, "step": 1188 }, { "epoch": 2.226910454758556, "grad_norm": 0.7603491544723511, "learning_rate": 1.1516569200779729e-05, "loss": 0.0865, "step": 1189 }, { "epoch": 2.2287857477730895, "grad_norm": 0.3353845775127411, "learning_rate": 1.1508771929824563e-05, "loss": 0.0724, "step": 1190 }, { "epoch": 2.230661040787623, "grad_norm": 0.27294620871543884, "learning_rate": 1.1500974658869397e-05, "loss": 0.0718, "step": 1191 }, { "epoch": 2.2325363338021567, "grad_norm": 0.30225276947021484, "learning_rate": 1.1493177387914231e-05, "loss": 0.0752, "step": 1192 }, { "epoch": 2.2344116268166903, "grad_norm": 0.30490773916244507, "learning_rate": 1.1485380116959065e-05, "loss": 0.0653, "step": 1193 }, { "epoch": 2.2362869198312234, "grad_norm": 0.306281715631485, "learning_rate": 1.1477582846003898e-05, "loss": 0.0747, "step": 1194 }, { "epoch": 2.238162212845757, "grad_norm": 0.3963259756565094, "learning_rate": 1.1469785575048734e-05, "loss": 0.1006, "step": 1195 }, { "epoch": 2.2400375058602906, "grad_norm": 0.3330558240413666, "learning_rate": 1.1461988304093568e-05, "loss": 0.0925, "step": 1196 }, { "epoch": 2.241912798874824, "grad_norm": 0.27601566910743713, "learning_rate": 1.1454191033138402e-05, "loss": 0.0686, "step": 1197 }, { "epoch": 2.243788091889358, "grad_norm": 0.412579208612442, "learning_rate": 1.1446393762183236e-05, "loss": 0.0789, "step": 1198 }, { "epoch": 2.2456633849038914, "grad_norm": 0.39170923829078674, "learning_rate": 1.143859649122807e-05, "loss": 0.095, "step": 1199 }, { "epoch": 2.247538677918425, "grad_norm": 0.3268628716468811, "learning_rate": 1.1430799220272906e-05, "loss": 0.0772, "step": 1200 }, { "epoch": 2.247538677918425, "eval_loss": 0.07522810995578766, "eval_runtime": 675.3854, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1200 }, { "epoch": 2.249413970932958, "grad_norm": 0.34962981939315796, "learning_rate": 1.142300194931774e-05, "loss": 0.0791, "step": 1201 }, { "epoch": 2.2512892639474917, "grad_norm": 0.3399597704410553, "learning_rate": 1.1415204678362575e-05, "loss": 0.0807, "step": 1202 }, { "epoch": 2.2531645569620253, "grad_norm": 0.27295050024986267, "learning_rate": 1.1407407407407409e-05, "loss": 0.0607, "step": 1203 }, { "epoch": 2.255039849976559, "grad_norm": 0.32791128754615784, "learning_rate": 1.1399610136452241e-05, "loss": 0.0548, "step": 1204 }, { "epoch": 2.2569151429910925, "grad_norm": 0.35572582483291626, "learning_rate": 1.1391812865497076e-05, "loss": 0.0806, "step": 1205 }, { "epoch": 2.2587904360056257, "grad_norm": 0.3451082408428192, "learning_rate": 1.138401559454191e-05, "loss": 0.0599, "step": 1206 }, { "epoch": 2.2606657290201593, "grad_norm": 0.25782617926597595, "learning_rate": 1.1376218323586746e-05, "loss": 0.0584, "step": 1207 }, { "epoch": 2.262541022034693, "grad_norm": 0.2698267996311188, "learning_rate": 1.136842105263158e-05, "loss": 0.067, "step": 1208 }, { "epoch": 2.2644163150492265, "grad_norm": 0.2824949026107788, "learning_rate": 1.1360623781676414e-05, "loss": 0.0325, "step": 1209 }, { "epoch": 2.26629160806376, "grad_norm": 0.30985549092292786, "learning_rate": 1.1352826510721248e-05, "loss": 0.0591, "step": 1210 }, { "epoch": 2.2681669010782937, "grad_norm": 0.3595530688762665, "learning_rate": 1.1345029239766083e-05, "loss": 0.0821, "step": 1211 }, { "epoch": 2.270042194092827, "grad_norm": 0.3911292552947998, "learning_rate": 1.1337231968810918e-05, "loss": 0.0942, "step": 1212 }, { "epoch": 2.2719174871073604, "grad_norm": 0.31286779046058655, "learning_rate": 1.1329434697855753e-05, "loss": 0.0703, "step": 1213 }, { "epoch": 2.273792780121894, "grad_norm": 0.6129016876220703, "learning_rate": 1.1321637426900585e-05, "loss": 0.0904, "step": 1214 }, { "epoch": 2.2756680731364276, "grad_norm": 0.35265544056892395, "learning_rate": 1.131384015594542e-05, "loss": 0.102, "step": 1215 }, { "epoch": 2.277543366150961, "grad_norm": 0.49612244963645935, "learning_rate": 1.1306042884990253e-05, "loss": 0.0851, "step": 1216 }, { "epoch": 2.279418659165495, "grad_norm": 2.8707265853881836, "learning_rate": 1.1298245614035088e-05, "loss": 0.0575, "step": 1217 }, { "epoch": 2.281293952180028, "grad_norm": 0.676102340221405, "learning_rate": 1.1290448343079924e-05, "loss": 0.0946, "step": 1218 }, { "epoch": 2.2831692451945615, "grad_norm": 0.3254333436489105, "learning_rate": 1.1282651072124758e-05, "loss": 0.0722, "step": 1219 }, { "epoch": 2.285044538209095, "grad_norm": 0.2682390809059143, "learning_rate": 1.1274853801169592e-05, "loss": 0.0674, "step": 1220 }, { "epoch": 2.2869198312236287, "grad_norm": 0.2639862298965454, "learning_rate": 1.1267056530214426e-05, "loss": 0.0515, "step": 1221 }, { "epoch": 2.2887951242381623, "grad_norm": 0.30463695526123047, "learning_rate": 1.125925925925926e-05, "loss": 0.0683, "step": 1222 }, { "epoch": 2.290670417252696, "grad_norm": 0.2600836753845215, "learning_rate": 1.1251461988304096e-05, "loss": 0.0588, "step": 1223 }, { "epoch": 2.292545710267229, "grad_norm": 0.2697446644306183, "learning_rate": 1.1243664717348927e-05, "loss": 0.0519, "step": 1224 }, { "epoch": 2.2944210032817627, "grad_norm": 0.34204694628715515, "learning_rate": 1.1235867446393763e-05, "loss": 0.0839, "step": 1225 }, { "epoch": 2.2962962962962963, "grad_norm": 0.30241382122039795, "learning_rate": 1.1228070175438597e-05, "loss": 0.0793, "step": 1226 }, { "epoch": 2.29817158931083, "grad_norm": 0.36129674315452576, "learning_rate": 1.1220272904483431e-05, "loss": 0.0572, "step": 1227 }, { "epoch": 2.3000468823253635, "grad_norm": 0.3861963450908661, "learning_rate": 1.1212475633528265e-05, "loss": 0.1096, "step": 1228 }, { "epoch": 2.3019221753398966, "grad_norm": 0.2883913516998291, "learning_rate": 1.12046783625731e-05, "loss": 0.0614, "step": 1229 }, { "epoch": 2.3037974683544302, "grad_norm": 0.3693699538707733, "learning_rate": 1.1196881091617936e-05, "loss": 0.0957, "step": 1230 }, { "epoch": 2.305672761368964, "grad_norm": 0.35850393772125244, "learning_rate": 1.118908382066277e-05, "loss": 0.0849, "step": 1231 }, { "epoch": 2.3075480543834974, "grad_norm": 0.35511448979377747, "learning_rate": 1.1181286549707604e-05, "loss": 0.1032, "step": 1232 }, { "epoch": 2.309423347398031, "grad_norm": 0.5250701308250427, "learning_rate": 1.1173489278752438e-05, "loss": 0.1174, "step": 1233 }, { "epoch": 2.3112986404125646, "grad_norm": 0.28343549370765686, "learning_rate": 1.116569200779727e-05, "loss": 0.0702, "step": 1234 }, { "epoch": 2.313173933427098, "grad_norm": 0.3174719512462616, "learning_rate": 1.1157894736842105e-05, "loss": 0.0628, "step": 1235 }, { "epoch": 2.3150492264416314, "grad_norm": 0.4021863341331482, "learning_rate": 1.115009746588694e-05, "loss": 0.0856, "step": 1236 }, { "epoch": 2.316924519456165, "grad_norm": 0.4704308807849884, "learning_rate": 1.1142300194931775e-05, "loss": 0.0977, "step": 1237 }, { "epoch": 2.3187998124706986, "grad_norm": 0.319234162569046, "learning_rate": 1.1134502923976609e-05, "loss": 0.0647, "step": 1238 }, { "epoch": 2.320675105485232, "grad_norm": 0.3543676435947418, "learning_rate": 1.1126705653021443e-05, "loss": 0.0791, "step": 1239 }, { "epoch": 2.3225503984997657, "grad_norm": 0.32436403632164, "learning_rate": 1.1118908382066277e-05, "loss": 0.0613, "step": 1240 }, { "epoch": 2.324425691514299, "grad_norm": 0.33297768235206604, "learning_rate": 1.1111111111111113e-05, "loss": 0.0684, "step": 1241 }, { "epoch": 2.3263009845288325, "grad_norm": 0.46087008714675903, "learning_rate": 1.1103313840155948e-05, "loss": 0.0744, "step": 1242 }, { "epoch": 2.328176277543366, "grad_norm": 0.3725678622722626, "learning_rate": 1.1095516569200782e-05, "loss": 0.0878, "step": 1243 }, { "epoch": 2.3300515705578997, "grad_norm": 0.6072844862937927, "learning_rate": 1.1087719298245614e-05, "loss": 0.1077, "step": 1244 }, { "epoch": 2.3319268635724333, "grad_norm": 0.32860544323921204, "learning_rate": 1.1079922027290448e-05, "loss": 0.0694, "step": 1245 }, { "epoch": 2.333802156586967, "grad_norm": 0.2839890122413635, "learning_rate": 1.1072124756335283e-05, "loss": 0.0402, "step": 1246 }, { "epoch": 2.3356774496015005, "grad_norm": 0.4085495173931122, "learning_rate": 1.1064327485380117e-05, "loss": 0.0641, "step": 1247 }, { "epoch": 2.3375527426160336, "grad_norm": 0.3415165841579437, "learning_rate": 1.1056530214424953e-05, "loss": 0.0643, "step": 1248 }, { "epoch": 2.3394280356305672, "grad_norm": 0.34704262018203735, "learning_rate": 1.1048732943469787e-05, "loss": 0.0792, "step": 1249 }, { "epoch": 2.341303328645101, "grad_norm": 0.2980154752731323, "learning_rate": 1.1040935672514621e-05, "loss": 0.0673, "step": 1250 }, { "epoch": 2.341303328645101, "eval_loss": 0.07502703368663788, "eval_runtime": 675.207, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1250 }, { "epoch": 2.3431786216596344, "grad_norm": 0.27935025095939636, "learning_rate": 1.1033138401559455e-05, "loss": 0.0512, "step": 1251 }, { "epoch": 2.345053914674168, "grad_norm": 0.42243000864982605, "learning_rate": 1.102534113060429e-05, "loss": 0.1033, "step": 1252 }, { "epoch": 2.346929207688701, "grad_norm": 0.36166539788246155, "learning_rate": 1.1017543859649125e-05, "loss": 0.0787, "step": 1253 }, { "epoch": 2.3488045007032348, "grad_norm": 1.874434232711792, "learning_rate": 1.1009746588693956e-05, "loss": 0.1049, "step": 1254 }, { "epoch": 2.3506797937177684, "grad_norm": 0.30916956067085266, "learning_rate": 1.1001949317738792e-05, "loss": 0.0895, "step": 1255 }, { "epoch": 2.352555086732302, "grad_norm": 0.37780481576919556, "learning_rate": 1.0994152046783626e-05, "loss": 0.089, "step": 1256 }, { "epoch": 2.3544303797468356, "grad_norm": 0.3470967411994934, "learning_rate": 1.098635477582846e-05, "loss": 0.0827, "step": 1257 }, { "epoch": 2.356305672761369, "grad_norm": 0.28173500299453735, "learning_rate": 1.0978557504873295e-05, "loss": 0.0831, "step": 1258 }, { "epoch": 2.3581809657759023, "grad_norm": 0.32230475544929504, "learning_rate": 1.0970760233918129e-05, "loss": 0.0706, "step": 1259 }, { "epoch": 2.360056258790436, "grad_norm": 0.2897712290287018, "learning_rate": 1.0962962962962965e-05, "loss": 0.0614, "step": 1260 }, { "epoch": 2.3619315518049695, "grad_norm": 0.46456751227378845, "learning_rate": 1.0955165692007799e-05, "loss": 0.1074, "step": 1261 }, { "epoch": 2.363806844819503, "grad_norm": 0.35259947180747986, "learning_rate": 1.0947368421052633e-05, "loss": 0.09, "step": 1262 }, { "epoch": 2.3656821378340367, "grad_norm": 0.4690414071083069, "learning_rate": 1.0939571150097467e-05, "loss": 0.0667, "step": 1263 }, { "epoch": 2.36755743084857, "grad_norm": 0.3024637997150421, "learning_rate": 1.09317738791423e-05, "loss": 0.0723, "step": 1264 }, { "epoch": 2.3694327238631034, "grad_norm": 0.2506920397281647, "learning_rate": 1.0923976608187134e-05, "loss": 0.0584, "step": 1265 }, { "epoch": 2.371308016877637, "grad_norm": 0.3608229160308838, "learning_rate": 1.091617933723197e-05, "loss": 0.0659, "step": 1266 }, { "epoch": 2.3731833098921706, "grad_norm": 0.3218965232372284, "learning_rate": 1.0908382066276804e-05, "loss": 0.0803, "step": 1267 }, { "epoch": 2.3750586029067042, "grad_norm": 0.8374189734458923, "learning_rate": 1.0900584795321638e-05, "loss": 0.0654, "step": 1268 }, { "epoch": 2.376933895921238, "grad_norm": 0.3754996955394745, "learning_rate": 1.0892787524366472e-05, "loss": 0.0868, "step": 1269 }, { "epoch": 2.3788091889357714, "grad_norm": 0.928810179233551, "learning_rate": 1.0884990253411307e-05, "loss": 0.1044, "step": 1270 }, { "epoch": 2.3806844819503046, "grad_norm": 0.3189548850059509, "learning_rate": 1.0877192982456142e-05, "loss": 0.0773, "step": 1271 }, { "epoch": 2.382559774964838, "grad_norm": 0.3129134178161621, "learning_rate": 1.0869395711500977e-05, "loss": 0.0769, "step": 1272 }, { "epoch": 2.3844350679793718, "grad_norm": 0.2928052544593811, "learning_rate": 1.086159844054581e-05, "loss": 0.0783, "step": 1273 }, { "epoch": 2.3863103609939054, "grad_norm": 0.380834698677063, "learning_rate": 1.0853801169590643e-05, "loss": 0.0734, "step": 1274 }, { "epoch": 2.388185654008439, "grad_norm": 0.28873157501220703, "learning_rate": 1.0846003898635478e-05, "loss": 0.0697, "step": 1275 }, { "epoch": 2.390060947022972, "grad_norm": 0.2910199463367462, "learning_rate": 1.0838206627680312e-05, "loss": 0.0691, "step": 1276 }, { "epoch": 2.3919362400375057, "grad_norm": 0.3059178292751312, "learning_rate": 1.0830409356725146e-05, "loss": 0.064, "step": 1277 }, { "epoch": 2.3938115330520393, "grad_norm": 0.33945995569229126, "learning_rate": 1.0822612085769982e-05, "loss": 0.0589, "step": 1278 }, { "epoch": 2.395686826066573, "grad_norm": 0.35901668667793274, "learning_rate": 1.0814814814814816e-05, "loss": 0.0798, "step": 1279 }, { "epoch": 2.3975621190811065, "grad_norm": 0.2913598418235779, "learning_rate": 1.080701754385965e-05, "loss": 0.0609, "step": 1280 }, { "epoch": 2.39943741209564, "grad_norm": 0.30235838890075684, "learning_rate": 1.0799220272904484e-05, "loss": 0.0696, "step": 1281 }, { "epoch": 2.4013127051101737, "grad_norm": 0.29583993554115295, "learning_rate": 1.0791423001949319e-05, "loss": 0.0813, "step": 1282 }, { "epoch": 2.403187998124707, "grad_norm": 0.34998810291290283, "learning_rate": 1.0783625730994154e-05, "loss": 0.0782, "step": 1283 }, { "epoch": 2.4050632911392404, "grad_norm": 0.3048896789550781, "learning_rate": 1.0775828460038987e-05, "loss": 0.0579, "step": 1284 }, { "epoch": 2.406938584153774, "grad_norm": 0.22718416154384613, "learning_rate": 1.0768031189083821e-05, "loss": 0.0662, "step": 1285 }, { "epoch": 2.4088138771683076, "grad_norm": 0.2834322154521942, "learning_rate": 1.0760233918128655e-05, "loss": 0.0615, "step": 1286 }, { "epoch": 2.4106891701828412, "grad_norm": 0.36230021715164185, "learning_rate": 1.075243664717349e-05, "loss": 0.0743, "step": 1287 }, { "epoch": 2.4125644631973744, "grad_norm": 0.2996331453323364, "learning_rate": 1.0744639376218324e-05, "loss": 0.0763, "step": 1288 }, { "epoch": 2.414439756211908, "grad_norm": 0.3829980790615082, "learning_rate": 1.073684210526316e-05, "loss": 0.06, "step": 1289 }, { "epoch": 2.4163150492264416, "grad_norm": 0.36401882767677307, "learning_rate": 1.0729044834307994e-05, "loss": 0.091, "step": 1290 }, { "epoch": 2.418190342240975, "grad_norm": 0.38041242957115173, "learning_rate": 1.0721247563352828e-05, "loss": 0.073, "step": 1291 }, { "epoch": 2.4200656352555088, "grad_norm": 0.4183158576488495, "learning_rate": 1.0713450292397662e-05, "loss": 0.0889, "step": 1292 }, { "epoch": 2.4219409282700424, "grad_norm": 0.30840641260147095, "learning_rate": 1.0705653021442496e-05, "loss": 0.0878, "step": 1293 }, { "epoch": 2.4238162212845755, "grad_norm": 1.0156400203704834, "learning_rate": 1.0697855750487329e-05, "loss": 0.0942, "step": 1294 }, { "epoch": 2.425691514299109, "grad_norm": 0.26748737692832947, "learning_rate": 1.0690058479532163e-05, "loss": 0.0535, "step": 1295 }, { "epoch": 2.4275668073136427, "grad_norm": 0.3611546456813812, "learning_rate": 1.0682261208576999e-05, "loss": 0.0684, "step": 1296 }, { "epoch": 2.4294421003281763, "grad_norm": 0.4616575539112091, "learning_rate": 1.0674463937621833e-05, "loss": 0.0916, "step": 1297 }, { "epoch": 2.43131739334271, "grad_norm": 0.37064328789711, "learning_rate": 1.0666666666666667e-05, "loss": 0.0871, "step": 1298 }, { "epoch": 2.4331926863572435, "grad_norm": 0.30871546268463135, "learning_rate": 1.0658869395711502e-05, "loss": 0.0725, "step": 1299 }, { "epoch": 2.4350679793717767, "grad_norm": 0.45006898045539856, "learning_rate": 1.0651072124756336e-05, "loss": 0.1114, "step": 1300 }, { "epoch": 2.4350679793717767, "eval_loss": 0.07475950568914413, "eval_runtime": 674.4121, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1300 }, { "epoch": 2.4369432723863103, "grad_norm": 0.410637229681015, "learning_rate": 1.0643274853801172e-05, "loss": 0.0854, "step": 1301 }, { "epoch": 2.438818565400844, "grad_norm": 0.40461596846580505, "learning_rate": 1.0635477582846006e-05, "loss": 0.1023, "step": 1302 }, { "epoch": 2.4406938584153774, "grad_norm": 0.47095608711242676, "learning_rate": 1.062768031189084e-05, "loss": 0.097, "step": 1303 }, { "epoch": 2.442569151429911, "grad_norm": 0.3717558979988098, "learning_rate": 1.0619883040935672e-05, "loss": 0.076, "step": 1304 }, { "epoch": 2.4444444444444446, "grad_norm": 0.3064993619918823, "learning_rate": 1.0612085769980507e-05, "loss": 0.06, "step": 1305 }, { "epoch": 2.446319737458978, "grad_norm": 0.380657821893692, "learning_rate": 1.060428849902534e-05, "loss": 0.0997, "step": 1306 }, { "epoch": 2.4481950304735114, "grad_norm": 0.3160291612148285, "learning_rate": 1.0596491228070177e-05, "loss": 0.0548, "step": 1307 }, { "epoch": 2.450070323488045, "grad_norm": 0.377288818359375, "learning_rate": 1.0588693957115011e-05, "loss": 0.0803, "step": 1308 }, { "epoch": 2.4519456165025786, "grad_norm": 0.27472472190856934, "learning_rate": 1.0580896686159845e-05, "loss": 0.0477, "step": 1309 }, { "epoch": 2.453820909517112, "grad_norm": 0.3660586476325989, "learning_rate": 1.057309941520468e-05, "loss": 0.075, "step": 1310 }, { "epoch": 2.4556962025316453, "grad_norm": 1.1128591299057007, "learning_rate": 1.0565302144249513e-05, "loss": 0.1068, "step": 1311 }, { "epoch": 2.457571495546179, "grad_norm": 0.3397810757160187, "learning_rate": 1.055750487329435e-05, "loss": 0.0689, "step": 1312 }, { "epoch": 2.4594467885607125, "grad_norm": 0.28418809175491333, "learning_rate": 1.0549707602339184e-05, "loss": 0.078, "step": 1313 }, { "epoch": 2.461322081575246, "grad_norm": 0.30030253529548645, "learning_rate": 1.0541910331384016e-05, "loss": 0.0504, "step": 1314 }, { "epoch": 2.4631973745897797, "grad_norm": 0.306923508644104, "learning_rate": 1.053411306042885e-05, "loss": 0.0516, "step": 1315 }, { "epoch": 2.4650726676043133, "grad_norm": 0.2753802239894867, "learning_rate": 1.0526315789473684e-05, "loss": 0.0733, "step": 1316 }, { "epoch": 2.466947960618847, "grad_norm": 0.4096115231513977, "learning_rate": 1.0518518518518519e-05, "loss": 0.1417, "step": 1317 }, { "epoch": 2.46882325363338, "grad_norm": 0.45057594776153564, "learning_rate": 1.0510721247563353e-05, "loss": 0.082, "step": 1318 }, { "epoch": 2.4706985466479137, "grad_norm": 0.2660753130912781, "learning_rate": 1.0502923976608189e-05, "loss": 0.0762, "step": 1319 }, { "epoch": 2.4725738396624473, "grad_norm": 0.3073916435241699, "learning_rate": 1.0495126705653023e-05, "loss": 0.0603, "step": 1320 }, { "epoch": 2.474449132676981, "grad_norm": 0.45714837312698364, "learning_rate": 1.0487329434697857e-05, "loss": 0.0975, "step": 1321 }, { "epoch": 2.4763244256915145, "grad_norm": 0.34037506580352783, "learning_rate": 1.0479532163742691e-05, "loss": 0.0837, "step": 1322 }, { "epoch": 2.4781997187060476, "grad_norm": 0.41261813044548035, "learning_rate": 1.0471734892787525e-05, "loss": 0.1105, "step": 1323 }, { "epoch": 2.480075011720581, "grad_norm": 0.3210998475551605, "learning_rate": 1.0463937621832358e-05, "loss": 0.0582, "step": 1324 }, { "epoch": 2.481950304735115, "grad_norm": 0.37079358100891113, "learning_rate": 1.0456140350877194e-05, "loss": 0.0842, "step": 1325 }, { "epoch": 2.4838255977496484, "grad_norm": 0.3284546434879303, "learning_rate": 1.0448343079922028e-05, "loss": 0.0634, "step": 1326 }, { "epoch": 2.485700890764182, "grad_norm": 0.28334537148475647, "learning_rate": 1.0440545808966862e-05, "loss": 0.0451, "step": 1327 }, { "epoch": 2.4875761837787156, "grad_norm": 0.3325977027416229, "learning_rate": 1.0432748538011696e-05, "loss": 0.0798, "step": 1328 }, { "epoch": 2.489451476793249, "grad_norm": 0.4519757330417633, "learning_rate": 1.042495126705653e-05, "loss": 0.1076, "step": 1329 }, { "epoch": 2.4913267698077823, "grad_norm": 0.313209593296051, "learning_rate": 1.0417153996101367e-05, "loss": 0.068, "step": 1330 }, { "epoch": 2.493202062822316, "grad_norm": 0.35849788784980774, "learning_rate": 1.04093567251462e-05, "loss": 0.1017, "step": 1331 }, { "epoch": 2.4950773558368495, "grad_norm": 0.34800082445144653, "learning_rate": 1.0401559454191035e-05, "loss": 0.0732, "step": 1332 }, { "epoch": 2.496952648851383, "grad_norm": 0.41697457432746887, "learning_rate": 1.0393762183235869e-05, "loss": 0.0898, "step": 1333 }, { "epoch": 2.4988279418659167, "grad_norm": 0.9164847731590271, "learning_rate": 1.0385964912280702e-05, "loss": 0.0843, "step": 1334 }, { "epoch": 2.50070323488045, "grad_norm": 0.3565974533557892, "learning_rate": 1.0378167641325536e-05, "loss": 0.0821, "step": 1335 }, { "epoch": 2.5025785278949835, "grad_norm": 0.37837761640548706, "learning_rate": 1.037037037037037e-05, "loss": 0.0807, "step": 1336 }, { "epoch": 2.504453820909517, "grad_norm": 0.30958202481269836, "learning_rate": 1.0362573099415206e-05, "loss": 0.0623, "step": 1337 }, { "epoch": 2.5063291139240507, "grad_norm": 0.29366958141326904, "learning_rate": 1.035477582846004e-05, "loss": 0.0592, "step": 1338 }, { "epoch": 2.5082044069385843, "grad_norm": 0.3304692506790161, "learning_rate": 1.0346978557504874e-05, "loss": 0.0606, "step": 1339 }, { "epoch": 2.510079699953118, "grad_norm": 0.3314734697341919, "learning_rate": 1.0339181286549708e-05, "loss": 0.078, "step": 1340 }, { "epoch": 2.5119549929676515, "grad_norm": 0.2976376712322235, "learning_rate": 1.0331384015594543e-05, "loss": 0.0737, "step": 1341 }, { "epoch": 2.5138302859821846, "grad_norm": 0.3934020400047302, "learning_rate": 1.0323586744639378e-05, "loss": 0.0567, "step": 1342 }, { "epoch": 2.515705578996718, "grad_norm": 0.35475224256515503, "learning_rate": 1.0315789473684213e-05, "loss": 0.0785, "step": 1343 }, { "epoch": 2.517580872011252, "grad_norm": 0.2962830364704132, "learning_rate": 1.0307992202729045e-05, "loss": 0.0693, "step": 1344 }, { "epoch": 2.5194561650257854, "grad_norm": 0.4114619195461273, "learning_rate": 1.030019493177388e-05, "loss": 0.1023, "step": 1345 }, { "epoch": 2.5213314580403186, "grad_norm": 0.3815486431121826, "learning_rate": 1.0292397660818714e-05, "loss": 0.079, "step": 1346 }, { "epoch": 2.523206751054852, "grad_norm": 0.4123891592025757, "learning_rate": 1.0284600389863548e-05, "loss": 0.1021, "step": 1347 }, { "epoch": 2.5250820440693857, "grad_norm": 0.3692156672477722, "learning_rate": 1.0276803118908382e-05, "loss": 0.0983, "step": 1348 }, { "epoch": 2.5269573370839193, "grad_norm": 0.3262293338775635, "learning_rate": 1.0269005847953218e-05, "loss": 0.096, "step": 1349 }, { "epoch": 2.528832630098453, "grad_norm": 1.2034567594528198, "learning_rate": 1.0261208576998052e-05, "loss": 0.0772, "step": 1350 }, { "epoch": 2.528832630098453, "eval_loss": 0.07395470887422562, "eval_runtime": 674.8172, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1350 }, { "epoch": 2.5307079231129865, "grad_norm": 0.3334695100784302, "learning_rate": 1.0253411306042886e-05, "loss": 0.0581, "step": 1351 }, { "epoch": 2.53258321612752, "grad_norm": 0.4798605442047119, "learning_rate": 1.024561403508772e-05, "loss": 0.0992, "step": 1352 }, { "epoch": 2.5344585091420533, "grad_norm": 0.3962152600288391, "learning_rate": 1.0237816764132555e-05, "loss": 0.0782, "step": 1353 }, { "epoch": 2.536333802156587, "grad_norm": 0.3626305162906647, "learning_rate": 1.0230019493177387e-05, "loss": 0.1039, "step": 1354 }, { "epoch": 2.5382090951711205, "grad_norm": 0.6571470499038696, "learning_rate": 1.0222222222222223e-05, "loss": 0.0758, "step": 1355 }, { "epoch": 2.540084388185654, "grad_norm": 0.26701807975769043, "learning_rate": 1.0214424951267057e-05, "loss": 0.0533, "step": 1356 }, { "epoch": 2.5419596812001877, "grad_norm": 0.23627153038978577, "learning_rate": 1.0206627680311891e-05, "loss": 0.0486, "step": 1357 }, { "epoch": 2.543834974214721, "grad_norm": 0.2959972620010376, "learning_rate": 1.0198830409356726e-05, "loss": 0.0725, "step": 1358 }, { "epoch": 2.5457102672292544, "grad_norm": 0.9190336465835571, "learning_rate": 1.019103313840156e-05, "loss": 0.0876, "step": 1359 }, { "epoch": 2.547585560243788, "grad_norm": 0.3755362033843994, "learning_rate": 1.0183235867446396e-05, "loss": 0.088, "step": 1360 }, { "epoch": 2.5494608532583216, "grad_norm": 0.3958858847618103, "learning_rate": 1.017543859649123e-05, "loss": 0.1089, "step": 1361 }, { "epoch": 2.551336146272855, "grad_norm": 0.26009872555732727, "learning_rate": 1.0167641325536064e-05, "loss": 0.0635, "step": 1362 }, { "epoch": 2.553211439287389, "grad_norm": 0.3636474311351776, "learning_rate": 1.0159844054580898e-05, "loss": 0.0619, "step": 1363 }, { "epoch": 2.5550867323019224, "grad_norm": 0.4022858440876007, "learning_rate": 1.015204678362573e-05, "loss": 0.0611, "step": 1364 }, { "epoch": 2.5569620253164556, "grad_norm": 0.44928935170173645, "learning_rate": 1.0144249512670565e-05, "loss": 0.0787, "step": 1365 }, { "epoch": 2.558837318330989, "grad_norm": 0.40881165862083435, "learning_rate": 1.0136452241715399e-05, "loss": 0.0693, "step": 1366 }, { "epoch": 2.5607126113455227, "grad_norm": 0.3600151836872101, "learning_rate": 1.0128654970760235e-05, "loss": 0.1056, "step": 1367 }, { "epoch": 2.5625879043600563, "grad_norm": 0.4044286608695984, "learning_rate": 1.012085769980507e-05, "loss": 0.1205, "step": 1368 }, { "epoch": 2.56446319737459, "grad_norm": 0.3752521872520447, "learning_rate": 1.0113060428849903e-05, "loss": 0.0906, "step": 1369 }, { "epoch": 2.566338490389123, "grad_norm": 0.4615623652935028, "learning_rate": 1.0105263157894738e-05, "loss": 0.1037, "step": 1370 }, { "epoch": 2.5682137834036567, "grad_norm": 0.27481046319007874, "learning_rate": 1.0097465886939572e-05, "loss": 0.0464, "step": 1371 }, { "epoch": 2.5700890764181903, "grad_norm": 0.4228805601596832, "learning_rate": 1.0089668615984408e-05, "loss": 0.1014, "step": 1372 }, { "epoch": 2.571964369432724, "grad_norm": 0.33466285467147827, "learning_rate": 1.0081871345029242e-05, "loss": 0.0808, "step": 1373 }, { "epoch": 2.5738396624472575, "grad_norm": 0.2553812861442566, "learning_rate": 1.0074074074074074e-05, "loss": 0.0516, "step": 1374 }, { "epoch": 2.575714955461791, "grad_norm": 0.400551438331604, "learning_rate": 1.0066276803118908e-05, "loss": 0.07, "step": 1375 }, { "epoch": 2.5775902484763247, "grad_norm": 0.3270658552646637, "learning_rate": 1.0058479532163743e-05, "loss": 0.0657, "step": 1376 }, { "epoch": 2.579465541490858, "grad_norm": 0.3232525587081909, "learning_rate": 1.0050682261208577e-05, "loss": 0.0741, "step": 1377 }, { "epoch": 2.5813408345053914, "grad_norm": 0.3367152810096741, "learning_rate": 1.0042884990253413e-05, "loss": 0.0884, "step": 1378 }, { "epoch": 2.583216127519925, "grad_norm": 0.4030762314796448, "learning_rate": 1.0035087719298247e-05, "loss": 0.0941, "step": 1379 }, { "epoch": 2.5850914205344586, "grad_norm": 0.31161463260650635, "learning_rate": 1.0027290448343081e-05, "loss": 0.0782, "step": 1380 }, { "epoch": 2.5869667135489918, "grad_norm": 0.39888834953308105, "learning_rate": 1.0019493177387915e-05, "loss": 0.0821, "step": 1381 }, { "epoch": 2.5888420065635254, "grad_norm": 0.3201531767845154, "learning_rate": 1.001169590643275e-05, "loss": 0.0638, "step": 1382 }, { "epoch": 2.590717299578059, "grad_norm": 0.32830819487571716, "learning_rate": 1.0003898635477585e-05, "loss": 0.0777, "step": 1383 }, { "epoch": 2.5925925925925926, "grad_norm": 0.2944835424423218, "learning_rate": 9.996101364522418e-06, "loss": 0.0653, "step": 1384 }, { "epoch": 2.594467885607126, "grad_norm": 0.45982295274734497, "learning_rate": 9.988304093567252e-06, "loss": 0.1088, "step": 1385 }, { "epoch": 2.5963431786216598, "grad_norm": 0.3153984248638153, "learning_rate": 9.980506822612086e-06, "loss": 0.0653, "step": 1386 }, { "epoch": 2.5982184716361933, "grad_norm": 0.41158658266067505, "learning_rate": 9.97270955165692e-06, "loss": 0.0848, "step": 1387 }, { "epoch": 2.600093764650727, "grad_norm": 0.4037891924381256, "learning_rate": 9.964912280701755e-06, "loss": 0.1319, "step": 1388 }, { "epoch": 2.60196905766526, "grad_norm": 0.4108717739582062, "learning_rate": 9.957115009746589e-06, "loss": 0.1257, "step": 1389 }, { "epoch": 2.6038443506797937, "grad_norm": 0.30166369676589966, "learning_rate": 9.949317738791425e-06, "loss": 0.0695, "step": 1390 }, { "epoch": 2.6057196436943273, "grad_norm": 0.3084668517112732, "learning_rate": 9.941520467836257e-06, "loss": 0.0787, "step": 1391 }, { "epoch": 2.607594936708861, "grad_norm": 0.5487121939659119, "learning_rate": 9.933723196881091e-06, "loss": 0.1363, "step": 1392 }, { "epoch": 2.609470229723394, "grad_norm": 0.3409046530723572, "learning_rate": 9.925925925925927e-06, "loss": 0.0781, "step": 1393 }, { "epoch": 2.6113455227379276, "grad_norm": 0.35762616991996765, "learning_rate": 9.918128654970762e-06, "loss": 0.077, "step": 1394 }, { "epoch": 2.6132208157524612, "grad_norm": 0.3606860935688019, "learning_rate": 9.910331384015596e-06, "loss": 0.0701, "step": 1395 }, { "epoch": 2.615096108766995, "grad_norm": 0.33565056324005127, "learning_rate": 9.90253411306043e-06, "loss": 0.0705, "step": 1396 }, { "epoch": 2.6169714017815284, "grad_norm": 0.35233068466186523, "learning_rate": 9.894736842105264e-06, "loss": 0.0727, "step": 1397 }, { "epoch": 2.618846694796062, "grad_norm": 0.7528280019760132, "learning_rate": 9.886939571150098e-06, "loss": 0.0846, "step": 1398 }, { "epoch": 2.6207219878105956, "grad_norm": 0.3784114420413971, "learning_rate": 9.879142300194932e-06, "loss": 0.0894, "step": 1399 }, { "epoch": 2.6225972808251288, "grad_norm": 0.2998042106628418, "learning_rate": 9.871345029239767e-06, "loss": 0.0729, "step": 1400 }, { "epoch": 2.6225972808251288, "eval_loss": 0.07494800537824631, "eval_runtime": 675.6374, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1400 }, { "epoch": 2.6244725738396624, "grad_norm": 0.45033907890319824, "learning_rate": 9.863547758284601e-06, "loss": 0.0543, "step": 1401 }, { "epoch": 2.626347866854196, "grad_norm": 0.33740511536598206, "learning_rate": 9.855750487329435e-06, "loss": 0.0887, "step": 1402 }, { "epoch": 2.6282231598687296, "grad_norm": 0.4127645492553711, "learning_rate": 9.84795321637427e-06, "loss": 0.0915, "step": 1403 }, { "epoch": 2.630098452883263, "grad_norm": 0.3304232656955719, "learning_rate": 9.840155945419103e-06, "loss": 0.0788, "step": 1404 }, { "epoch": 2.6319737458977963, "grad_norm": 0.36797669529914856, "learning_rate": 9.83235867446394e-06, "loss": 0.0873, "step": 1405 }, { "epoch": 2.63384903891233, "grad_norm": 0.2819061577320099, "learning_rate": 9.824561403508772e-06, "loss": 0.0618, "step": 1406 }, { "epoch": 2.6357243319268635, "grad_norm": 0.52513587474823, "learning_rate": 9.816764132553606e-06, "loss": 0.0824, "step": 1407 }, { "epoch": 2.637599624941397, "grad_norm": 0.37794506549835205, "learning_rate": 9.808966861598442e-06, "loss": 0.0629, "step": 1408 }, { "epoch": 2.6394749179559307, "grad_norm": 0.35513389110565186, "learning_rate": 9.801169590643276e-06, "loss": 0.1138, "step": 1409 }, { "epoch": 2.6413502109704643, "grad_norm": 0.36185532808303833, "learning_rate": 9.79337231968811e-06, "loss": 0.0829, "step": 1410 }, { "epoch": 2.643225503984998, "grad_norm": 0.3326586186885834, "learning_rate": 9.785575048732944e-06, "loss": 0.0579, "step": 1411 }, { "epoch": 2.645100796999531, "grad_norm": 0.364624947309494, "learning_rate": 9.777777777777779e-06, "loss": 0.095, "step": 1412 }, { "epoch": 2.6469760900140646, "grad_norm": 0.3595339059829712, "learning_rate": 9.769980506822613e-06, "loss": 0.0816, "step": 1413 }, { "epoch": 2.6488513830285982, "grad_norm": 0.25556787848472595, "learning_rate": 9.762183235867447e-06, "loss": 0.0403, "step": 1414 }, { "epoch": 2.650726676043132, "grad_norm": 0.3111402094364166, "learning_rate": 9.754385964912281e-06, "loss": 0.0652, "step": 1415 }, { "epoch": 2.652601969057665, "grad_norm": 0.3296256363391876, "learning_rate": 9.746588693957115e-06, "loss": 0.0722, "step": 1416 }, { "epoch": 2.6544772620721986, "grad_norm": 0.38634583353996277, "learning_rate": 9.73879142300195e-06, "loss": 0.0659, "step": 1417 }, { "epoch": 2.656352555086732, "grad_norm": 0.35213443636894226, "learning_rate": 9.730994152046784e-06, "loss": 0.0671, "step": 1418 }, { "epoch": 2.6582278481012658, "grad_norm": 0.3570035696029663, "learning_rate": 9.72319688109162e-06, "loss": 0.061, "step": 1419 }, { "epoch": 2.6601031411157994, "grad_norm": 0.33103039860725403, "learning_rate": 9.715399610136454e-06, "loss": 0.0693, "step": 1420 }, { "epoch": 2.661978434130333, "grad_norm": 0.33688971400260925, "learning_rate": 9.707602339181286e-06, "loss": 0.0913, "step": 1421 }, { "epoch": 2.6638537271448666, "grad_norm": 0.4284787178039551, "learning_rate": 9.69980506822612e-06, "loss": 0.0908, "step": 1422 }, { "epoch": 2.6657290201594, "grad_norm": 0.46354052424430847, "learning_rate": 9.692007797270956e-06, "loss": 0.0702, "step": 1423 }, { "epoch": 2.6676043131739333, "grad_norm": 0.32590043544769287, "learning_rate": 9.68421052631579e-06, "loss": 0.0664, "step": 1424 }, { "epoch": 2.669479606188467, "grad_norm": 0.39951497316360474, "learning_rate": 9.676413255360625e-06, "loss": 0.1049, "step": 1425 }, { "epoch": 2.6713548992030005, "grad_norm": 0.38672441244125366, "learning_rate": 9.668615984405459e-06, "loss": 0.0707, "step": 1426 }, { "epoch": 2.673230192217534, "grad_norm": 0.33832356333732605, "learning_rate": 9.660818713450293e-06, "loss": 0.0766, "step": 1427 }, { "epoch": 2.6751054852320673, "grad_norm": 0.38057902455329895, "learning_rate": 9.653021442495127e-06, "loss": 0.0622, "step": 1428 }, { "epoch": 2.676980778246601, "grad_norm": 0.2721557021141052, "learning_rate": 9.645224171539962e-06, "loss": 0.0585, "step": 1429 }, { "epoch": 2.6788560712611345, "grad_norm": 0.3114902973175049, "learning_rate": 9.637426900584796e-06, "loss": 0.0599, "step": 1430 }, { "epoch": 2.680731364275668, "grad_norm": 0.4293680787086487, "learning_rate": 9.62962962962963e-06, "loss": 0.095, "step": 1431 }, { "epoch": 2.6826066572902016, "grad_norm": 0.4264834523200989, "learning_rate": 9.621832358674464e-06, "loss": 0.0989, "step": 1432 }, { "epoch": 2.6844819503047352, "grad_norm": 0.45689934492111206, "learning_rate": 9.614035087719298e-06, "loss": 0.1043, "step": 1433 }, { "epoch": 2.686357243319269, "grad_norm": 0.3924713432788849, "learning_rate": 9.606237816764134e-06, "loss": 0.0931, "step": 1434 }, { "epoch": 2.688232536333802, "grad_norm": 0.4446573853492737, "learning_rate": 9.598440545808968e-06, "loss": 0.1063, "step": 1435 }, { "epoch": 2.6901078293483356, "grad_norm": 0.45865702629089355, "learning_rate": 9.590643274853801e-06, "loss": 0.1034, "step": 1436 }, { "epoch": 2.691983122362869, "grad_norm": 0.39059358835220337, "learning_rate": 9.582846003898635e-06, "loss": 0.0587, "step": 1437 }, { "epoch": 2.693858415377403, "grad_norm": 0.678318202495575, "learning_rate": 9.575048732943471e-06, "loss": 0.1233, "step": 1438 }, { "epoch": 2.6957337083919364, "grad_norm": 0.3813665211200714, "learning_rate": 9.567251461988305e-06, "loss": 0.1088, "step": 1439 }, { "epoch": 2.6976090014064695, "grad_norm": 0.43635404109954834, "learning_rate": 9.55945419103314e-06, "loss": 0.111, "step": 1440 }, { "epoch": 2.699484294421003, "grad_norm": 0.5471066236495972, "learning_rate": 9.551656920077974e-06, "loss": 0.082, "step": 1441 }, { "epoch": 2.7013595874355367, "grad_norm": 0.31651976704597473, "learning_rate": 9.543859649122808e-06, "loss": 0.0533, "step": 1442 }, { "epoch": 2.7032348804500703, "grad_norm": 0.3988685607910156, "learning_rate": 9.536062378167642e-06, "loss": 0.0786, "step": 1443 }, { "epoch": 2.705110173464604, "grad_norm": 1.229805827140808, "learning_rate": 9.528265107212476e-06, "loss": 0.0704, "step": 1444 }, { "epoch": 2.7069854664791375, "grad_norm": 0.3627307415008545, "learning_rate": 9.52046783625731e-06, "loss": 0.0857, "step": 1445 }, { "epoch": 2.708860759493671, "grad_norm": 0.2482980489730835, "learning_rate": 9.512670565302145e-06, "loss": 0.0583, "step": 1446 }, { "epoch": 2.7107360525082043, "grad_norm": 0.33016133308410645, "learning_rate": 9.504873294346979e-06, "loss": 0.0678, "step": 1447 }, { "epoch": 2.712611345522738, "grad_norm": 0.3658045828342438, "learning_rate": 9.497076023391813e-06, "loss": 0.0926, "step": 1448 }, { "epoch": 2.7144866385372715, "grad_norm": 0.3729887008666992, "learning_rate": 9.489278752436649e-06, "loss": 0.0698, "step": 1449 }, { "epoch": 2.716361931551805, "grad_norm": 0.36187222599983215, "learning_rate": 9.481481481481483e-06, "loss": 0.0651, "step": 1450 }, { "epoch": 2.716361931551805, "eval_loss": 0.07403497397899628, "eval_runtime": 673.9675, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1450 }, { "epoch": 2.7182372245663387, "grad_norm": 0.3599952757358551, "learning_rate": 9.473684210526315e-06, "loss": 0.0779, "step": 1451 }, { "epoch": 2.720112517580872, "grad_norm": 0.6131613850593567, "learning_rate": 9.465886939571151e-06, "loss": 0.1098, "step": 1452 }, { "epoch": 2.7219878105954054, "grad_norm": 0.32765620946884155, "learning_rate": 9.458089668615986e-06, "loss": 0.0596, "step": 1453 }, { "epoch": 2.723863103609939, "grad_norm": 0.34976986050605774, "learning_rate": 9.45029239766082e-06, "loss": 0.0649, "step": 1454 }, { "epoch": 2.7257383966244726, "grad_norm": 0.39676281809806824, "learning_rate": 9.442495126705654e-06, "loss": 0.0835, "step": 1455 }, { "epoch": 2.727613689639006, "grad_norm": 0.3270690143108368, "learning_rate": 9.434697855750488e-06, "loss": 0.0987, "step": 1456 }, { "epoch": 2.72948898265354, "grad_norm": 0.40614816546440125, "learning_rate": 9.426900584795322e-06, "loss": 0.0847, "step": 1457 }, { "epoch": 2.7313642756680734, "grad_norm": 0.42256006598472595, "learning_rate": 9.419103313840157e-06, "loss": 0.08, "step": 1458 }, { "epoch": 2.7332395686826065, "grad_norm": 0.34114959836006165, "learning_rate": 9.41130604288499e-06, "loss": 0.0618, "step": 1459 }, { "epoch": 2.73511486169714, "grad_norm": 0.30372029542922974, "learning_rate": 9.403508771929825e-06, "loss": 0.0598, "step": 1460 }, { "epoch": 2.7369901547116737, "grad_norm": 0.439656525850296, "learning_rate": 9.395711500974659e-06, "loss": 0.0857, "step": 1461 }, { "epoch": 2.7388654477262073, "grad_norm": 0.36488088965415955, "learning_rate": 9.387914230019493e-06, "loss": 0.081, "step": 1462 }, { "epoch": 2.7407407407407405, "grad_norm": 0.2475002259016037, "learning_rate": 9.380116959064327e-06, "loss": 0.0479, "step": 1463 }, { "epoch": 2.742616033755274, "grad_norm": 0.3404149115085602, "learning_rate": 9.372319688109163e-06, "loss": 0.07, "step": 1464 }, { "epoch": 2.7444913267698077, "grad_norm": 0.4210808575153351, "learning_rate": 9.364522417153998e-06, "loss": 0.094, "step": 1465 }, { "epoch": 2.7463666197843413, "grad_norm": 0.4397883713245392, "learning_rate": 9.35672514619883e-06, "loss": 0.0938, "step": 1466 }, { "epoch": 2.748241912798875, "grad_norm": 0.4252859950065613, "learning_rate": 9.348927875243666e-06, "loss": 0.0688, "step": 1467 }, { "epoch": 2.7501172058134085, "grad_norm": 0.39042210578918457, "learning_rate": 9.3411306042885e-06, "loss": 0.0929, "step": 1468 }, { "epoch": 2.751992498827942, "grad_norm": 0.29327455163002014, "learning_rate": 9.333333333333334e-06, "loss": 0.0375, "step": 1469 }, { "epoch": 2.7538677918424757, "grad_norm": 0.5631712675094604, "learning_rate": 9.325536062378169e-06, "loss": 0.0884, "step": 1470 }, { "epoch": 2.755743084857009, "grad_norm": 0.9923895001411438, "learning_rate": 9.317738791423003e-06, "loss": 0.1044, "step": 1471 }, { "epoch": 2.7576183778715424, "grad_norm": 0.4112682640552521, "learning_rate": 9.309941520467837e-06, "loss": 0.1121, "step": 1472 }, { "epoch": 2.759493670886076, "grad_norm": 0.3854548931121826, "learning_rate": 9.302144249512671e-06, "loss": 0.0859, "step": 1473 }, { "epoch": 2.7613689639006096, "grad_norm": 0.46496057510375977, "learning_rate": 9.294346978557505e-06, "loss": 0.1247, "step": 1474 }, { "epoch": 2.7632442569151427, "grad_norm": 0.4549020230770111, "learning_rate": 9.28654970760234e-06, "loss": 0.0826, "step": 1475 }, { "epoch": 2.7651195499296763, "grad_norm": 0.3205399811267853, "learning_rate": 9.278752436647174e-06, "loss": 0.0791, "step": 1476 }, { "epoch": 2.76699484294421, "grad_norm": 0.41249391436576843, "learning_rate": 9.270955165692008e-06, "loss": 0.101, "step": 1477 }, { "epoch": 2.7688701359587435, "grad_norm": 0.3202146291732788, "learning_rate": 9.263157894736842e-06, "loss": 0.0538, "step": 1478 }, { "epoch": 2.770745428973277, "grad_norm": 0.34748685359954834, "learning_rate": 9.255360623781678e-06, "loss": 0.0803, "step": 1479 }, { "epoch": 2.7726207219878107, "grad_norm": 0.3739032745361328, "learning_rate": 9.247563352826512e-06, "loss": 0.0908, "step": 1480 }, { "epoch": 2.7744960150023443, "grad_norm": 0.420016884803772, "learning_rate": 9.239766081871345e-06, "loss": 0.0991, "step": 1481 }, { "epoch": 2.7763713080168775, "grad_norm": 0.4077822268009186, "learning_rate": 9.23196881091618e-06, "loss": 0.0954, "step": 1482 }, { "epoch": 2.778246601031411, "grad_norm": 0.30607473850250244, "learning_rate": 9.224171539961015e-06, "loss": 0.0585, "step": 1483 }, { "epoch": 2.7801218940459447, "grad_norm": 0.4178578853607178, "learning_rate": 9.216374269005849e-06, "loss": 0.0904, "step": 1484 }, { "epoch": 2.7819971870604783, "grad_norm": 0.37330979108810425, "learning_rate": 9.208576998050683e-06, "loss": 0.0743, "step": 1485 }, { "epoch": 2.783872480075012, "grad_norm": 0.3077203631401062, "learning_rate": 9.200779727095517e-06, "loss": 0.0803, "step": 1486 }, { "epoch": 2.785747773089545, "grad_norm": 0.3057253360748291, "learning_rate": 9.192982456140351e-06, "loss": 0.0621, "step": 1487 }, { "epoch": 2.7876230661040786, "grad_norm": 0.31388983130455017, "learning_rate": 9.185185185185186e-06, "loss": 0.074, "step": 1488 }, { "epoch": 2.789498359118612, "grad_norm": 0.3483256995677948, "learning_rate": 9.17738791423002e-06, "loss": 0.0828, "step": 1489 }, { "epoch": 2.791373652133146, "grad_norm": 0.2892071306705475, "learning_rate": 9.169590643274856e-06, "loss": 0.0716, "step": 1490 }, { "epoch": 2.7932489451476794, "grad_norm": 0.39768117666244507, "learning_rate": 9.161793372319688e-06, "loss": 0.093, "step": 1491 }, { "epoch": 2.795124238162213, "grad_norm": 0.408591628074646, "learning_rate": 9.153996101364522e-06, "loss": 0.0962, "step": 1492 }, { "epoch": 2.7969995311767466, "grad_norm": 1.1204290390014648, "learning_rate": 9.146198830409357e-06, "loss": 0.0881, "step": 1493 }, { "epoch": 2.7988748241912798, "grad_norm": 0.38295555114746094, "learning_rate": 9.138401559454192e-06, "loss": 0.0959, "step": 1494 }, { "epoch": 2.8007501172058133, "grad_norm": 0.33753833174705505, "learning_rate": 9.130604288499027e-06, "loss": 0.1013, "step": 1495 }, { "epoch": 2.802625410220347, "grad_norm": 0.32718968391418457, "learning_rate": 9.12280701754386e-06, "loss": 0.0674, "step": 1496 }, { "epoch": 2.8045007032348805, "grad_norm": 0.3198351562023163, "learning_rate": 9.115009746588695e-06, "loss": 0.0626, "step": 1497 }, { "epoch": 2.8063759962494137, "grad_norm": 0.5296013951301575, "learning_rate": 9.10721247563353e-06, "loss": 0.1014, "step": 1498 }, { "epoch": 2.8082512892639473, "grad_norm": 0.4616522789001465, "learning_rate": 9.099415204678363e-06, "loss": 0.0927, "step": 1499 }, { "epoch": 2.810126582278481, "grad_norm": 0.41021260619163513, "learning_rate": 9.091617933723198e-06, "loss": 0.0658, "step": 1500 }, { "epoch": 2.810126582278481, "eval_loss": 0.07440079748630524, "eval_runtime": 671.7286, "eval_samples_per_second": 0.293, "eval_steps_per_second": 0.293, "step": 1500 }, { "epoch": 2.8120018752930145, "grad_norm": 0.30366870760917664, "learning_rate": 9.083820662768032e-06, "loss": 0.0542, "step": 1501 }, { "epoch": 2.813877168307548, "grad_norm": 0.484829843044281, "learning_rate": 9.076023391812866e-06, "loss": 0.1142, "step": 1502 }, { "epoch": 2.8157524613220817, "grad_norm": 0.4134244918823242, "learning_rate": 9.0682261208577e-06, "loss": 0.1089, "step": 1503 }, { "epoch": 2.8176277543366153, "grad_norm": 0.34767600893974304, "learning_rate": 9.060428849902534e-06, "loss": 0.0834, "step": 1504 }, { "epoch": 2.819503047351149, "grad_norm": 0.32123830914497375, "learning_rate": 9.05263157894737e-06, "loss": 0.0647, "step": 1505 }, { "epoch": 2.821378340365682, "grad_norm": 0.2995455861091614, "learning_rate": 9.044834307992203e-06, "loss": 0.051, "step": 1506 }, { "epoch": 2.8232536333802156, "grad_norm": 0.2944452464580536, "learning_rate": 9.037037037037037e-06, "loss": 0.0508, "step": 1507 }, { "epoch": 2.825128926394749, "grad_norm": 0.4798865020275116, "learning_rate": 9.029239766081873e-06, "loss": 0.0894, "step": 1508 }, { "epoch": 2.827004219409283, "grad_norm": 0.3530484437942505, "learning_rate": 9.021442495126707e-06, "loss": 0.0788, "step": 1509 }, { "epoch": 2.828879512423816, "grad_norm": 0.328976571559906, "learning_rate": 9.013645224171541e-06, "loss": 0.0723, "step": 1510 }, { "epoch": 2.8307548054383496, "grad_norm": 0.4133010506629944, "learning_rate": 9.005847953216374e-06, "loss": 0.041, "step": 1511 }, { "epoch": 2.832630098452883, "grad_norm": 0.407266765832901, "learning_rate": 8.99805068226121e-06, "loss": 0.0664, "step": 1512 }, { "epoch": 2.8345053914674168, "grad_norm": 0.3167998194694519, "learning_rate": 8.990253411306044e-06, "loss": 0.0653, "step": 1513 }, { "epoch": 2.8363806844819504, "grad_norm": 0.36208438873291016, "learning_rate": 8.982456140350878e-06, "loss": 0.0538, "step": 1514 }, { "epoch": 2.838255977496484, "grad_norm": 0.5785827040672302, "learning_rate": 8.974658869395712e-06, "loss": 0.1072, "step": 1515 }, { "epoch": 2.8401312705110175, "grad_norm": 0.4494905471801758, "learning_rate": 8.966861598440546e-06, "loss": 0.0961, "step": 1516 }, { "epoch": 2.842006563525551, "grad_norm": 0.2944582998752594, "learning_rate": 8.95906432748538e-06, "loss": 0.0481, "step": 1517 }, { "epoch": 2.8438818565400843, "grad_norm": 0.31694504618644714, "learning_rate": 8.951267056530215e-06, "loss": 0.07, "step": 1518 }, { "epoch": 2.845757149554618, "grad_norm": 0.33348628878593445, "learning_rate": 8.943469785575049e-06, "loss": 0.0719, "step": 1519 }, { "epoch": 2.8476324425691515, "grad_norm": 0.26419350504875183, "learning_rate": 8.935672514619885e-06, "loss": 0.0496, "step": 1520 }, { "epoch": 2.849507735583685, "grad_norm": 0.422234445810318, "learning_rate": 8.927875243664717e-06, "loss": 0.0972, "step": 1521 }, { "epoch": 2.8513830285982182, "grad_norm": 0.4871450364589691, "learning_rate": 8.920077972709552e-06, "loss": 0.1091, "step": 1522 }, { "epoch": 2.853258321612752, "grad_norm": 0.39795181155204773, "learning_rate": 8.912280701754387e-06, "loss": 0.0972, "step": 1523 }, { "epoch": 2.8551336146272854, "grad_norm": 0.3796606957912445, "learning_rate": 8.904483430799222e-06, "loss": 0.071, "step": 1524 }, { "epoch": 2.857008907641819, "grad_norm": 0.3433877229690552, "learning_rate": 8.896686159844056e-06, "loss": 0.087, "step": 1525 }, { "epoch": 2.8588842006563526, "grad_norm": 0.3441942036151886, "learning_rate": 8.888888888888888e-06, "loss": 0.0537, "step": 1526 }, { "epoch": 2.8607594936708862, "grad_norm": 0.5122886896133423, "learning_rate": 8.881091617933724e-06, "loss": 0.104, "step": 1527 }, { "epoch": 2.86263478668542, "grad_norm": 0.3636258542537689, "learning_rate": 8.873294346978558e-06, "loss": 0.0622, "step": 1528 }, { "epoch": 2.864510079699953, "grad_norm": 0.39862120151519775, "learning_rate": 8.865497076023393e-06, "loss": 0.0827, "step": 1529 }, { "epoch": 2.8663853727144866, "grad_norm": 0.4351373612880707, "learning_rate": 8.857699805068227e-06, "loss": 0.1058, "step": 1530 }, { "epoch": 2.86826066572902, "grad_norm": 1.1896051168441772, "learning_rate": 8.849902534113061e-06, "loss": 0.0846, "step": 1531 }, { "epoch": 2.8701359587435538, "grad_norm": 0.4099143147468567, "learning_rate": 8.842105263157895e-06, "loss": 0.0858, "step": 1532 }, { "epoch": 2.8720112517580874, "grad_norm": 0.4123772978782654, "learning_rate": 8.83430799220273e-06, "loss": 0.0738, "step": 1533 }, { "epoch": 2.8738865447726205, "grad_norm": 0.39457976818084717, "learning_rate": 8.826510721247564e-06, "loss": 0.0598, "step": 1534 }, { "epoch": 2.875761837787154, "grad_norm": 0.36988794803619385, "learning_rate": 8.8187134502924e-06, "loss": 0.0887, "step": 1535 }, { "epoch": 2.8776371308016877, "grad_norm": 0.4902806282043457, "learning_rate": 8.810916179337232e-06, "loss": 0.0715, "step": 1536 }, { "epoch": 2.8795124238162213, "grad_norm": 0.31671905517578125, "learning_rate": 8.803118908382066e-06, "loss": 0.0586, "step": 1537 }, { "epoch": 2.881387716830755, "grad_norm": 0.33224424719810486, "learning_rate": 8.795321637426902e-06, "loss": 0.0644, "step": 1538 }, { "epoch": 2.8832630098452885, "grad_norm": 0.3493894338607788, "learning_rate": 8.787524366471736e-06, "loss": 0.0737, "step": 1539 }, { "epoch": 2.885138302859822, "grad_norm": 0.3095395863056183, "learning_rate": 8.77972709551657e-06, "loss": 0.0632, "step": 1540 }, { "epoch": 2.8870135958743552, "grad_norm": 0.283744215965271, "learning_rate": 8.771929824561405e-06, "loss": 0.0652, "step": 1541 }, { "epoch": 2.888888888888889, "grad_norm": 0.487179696559906, "learning_rate": 8.764132553606239e-06, "loss": 0.0984, "step": 1542 }, { "epoch": 2.8907641819034224, "grad_norm": 0.287124365568161, "learning_rate": 8.756335282651073e-06, "loss": 0.0511, "step": 1543 }, { "epoch": 2.892639474917956, "grad_norm": 0.3281533718109131, "learning_rate": 8.748538011695907e-06, "loss": 0.0639, "step": 1544 }, { "epoch": 2.894514767932489, "grad_norm": 0.399172306060791, "learning_rate": 8.740740740740741e-06, "loss": 0.0539, "step": 1545 }, { "epoch": 2.896390060947023, "grad_norm": 0.35018855333328247, "learning_rate": 8.732943469785576e-06, "loss": 0.0751, "step": 1546 }, { "epoch": 2.8982653539615564, "grad_norm": 0.3269667327404022, "learning_rate": 8.72514619883041e-06, "loss": 0.0528, "step": 1547 }, { "epoch": 2.90014064697609, "grad_norm": 0.4104063808917999, "learning_rate": 8.717348927875244e-06, "loss": 0.0827, "step": 1548 }, { "epoch": 2.9020159399906236, "grad_norm": 0.4464809000492096, "learning_rate": 8.709551656920078e-06, "loss": 0.1076, "step": 1549 }, { "epoch": 2.903891233005157, "grad_norm": 0.3798385560512543, "learning_rate": 8.701754385964914e-06, "loss": 0.0625, "step": 1550 }, { "epoch": 2.903891233005157, "eval_loss": 0.07311470806598663, "eval_runtime": 675.3623, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 1550 } ], "logging_steps": 1, "max_steps": 2665, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3036945883587664e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }