diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11132 @@ +{ + "best_global_step": 1550, + "best_metric": 0.07311470806598663, + "best_model_checkpoint": "outputs_qwq/checkpoint-1550", + "epoch": 2.903891233005157, + "eval_steps": 50, + "global_step": 1550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001875293014533521, + "grad_norm": 0.39477676153182983, + "learning_rate": 0.0, + "loss": 0.7899, + "step": 1 + }, + { + "epoch": 0.003750586029067042, + "grad_norm": 0.3514685332775116, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.9541, + "step": 2 + }, + { + "epoch": 0.005625879043600563, + "grad_norm": 0.2947579622268677, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.7068, + "step": 3 + }, + { + "epoch": 0.007501172058134084, + "grad_norm": 0.26537010073661804, + "learning_rate": 6.000000000000001e-07, + "loss": 0.6839, + "step": 4 + }, + { + "epoch": 0.009376465072667605, + "grad_norm": 0.26260894536972046, + "learning_rate": 8.000000000000001e-07, + "loss": 0.68, + "step": 5 + }, + { + "epoch": 0.011251758087201125, + "grad_norm": 0.24916508793830872, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.6905, + "step": 6 + }, + { + "epoch": 0.013127051101734646, + "grad_norm": 0.3689679801464081, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9347, + "step": 7 + }, + { + "epoch": 0.015002344116268168, + "grad_norm": 0.27197960019111633, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.7164, + "step": 8 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 0.305757611989975, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.721, + "step": 9 + }, + { + "epoch": 0.01875293014533521, + "grad_norm": 0.272778183221817, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.6814, + "step": 10 + }, + { + "epoch": 0.02062822315986873, + "grad_norm": 0.28655725717544556, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7508, + "step": 11 + }, + { + "epoch": 0.02250351617440225, + "grad_norm": 0.32160523533821106, + "learning_rate": 2.2e-06, + "loss": 0.8391, + "step": 12 + }, + { + "epoch": 0.02437880918893577, + "grad_norm": 0.2688426077365875, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.6594, + "step": 13 + }, + { + "epoch": 0.02625410220346929, + "grad_norm": 0.33779385685920715, + "learning_rate": 2.6e-06, + "loss": 0.7729, + "step": 14 + }, + { + "epoch": 0.02812939521800281, + "grad_norm": 0.3078613877296448, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.7372, + "step": 15 + }, + { + "epoch": 0.030004688232536336, + "grad_norm": 0.2747291922569275, + "learning_rate": 3e-06, + "loss": 0.6716, + "step": 16 + }, + { + "epoch": 0.03187998124706985, + "grad_norm": 0.30704063177108765, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.7303, + "step": 17 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 0.312160849571228, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.7528, + "step": 18 + }, + { + "epoch": 0.03563056727613689, + "grad_norm": 0.3484225571155548, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.728, + "step": 19 + }, + { + "epoch": 0.03750586029067042, + "grad_norm": 0.328665167093277, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8129, + "step": 20 + }, + { + "epoch": 0.03938115330520394, + "grad_norm": 0.32902592420578003, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7526, + "step": 21 + }, + { + "epoch": 0.04125644631973746, + "grad_norm": 0.3969726860523224, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.806, + "step": 22 + }, + { + "epoch": 0.04313173933427098, + "grad_norm": 0.3924497067928314, + "learning_rate": 4.4e-06, + "loss": 0.6978, + "step": 23 + }, + { + "epoch": 0.0450070323488045, + "grad_norm": 0.33922383189201355, + "learning_rate": 4.600000000000001e-06, + "loss": 0.7919, + "step": 24 + }, + { + "epoch": 0.04688232536333802, + "grad_norm": 0.3359040319919586, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7881, + "step": 25 + }, + { + "epoch": 0.04875761837787154, + "grad_norm": 0.4279753863811493, + "learning_rate": 5e-06, + "loss": 0.9385, + "step": 26 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 0.351744145154953, + "learning_rate": 5.2e-06, + "loss": 0.8571, + "step": 27 + }, + { + "epoch": 0.05250820440693858, + "grad_norm": 0.3303033113479614, + "learning_rate": 5.400000000000001e-06, + "loss": 0.6374, + "step": 28 + }, + { + "epoch": 0.0543834974214721, + "grad_norm": 0.36658555269241333, + "learning_rate": 5.600000000000001e-06, + "loss": 0.7102, + "step": 29 + }, + { + "epoch": 0.05625879043600562, + "grad_norm": 0.3628396987915039, + "learning_rate": 5.8e-06, + "loss": 0.7783, + "step": 30 + }, + { + "epoch": 0.058134083450539144, + "grad_norm": 0.38539397716522217, + "learning_rate": 6e-06, + "loss": 0.798, + "step": 31 + }, + { + "epoch": 0.06000937646507267, + "grad_norm": 0.4560730755329132, + "learning_rate": 6.200000000000001e-06, + "loss": 0.716, + "step": 32 + }, + { + "epoch": 0.06188466947960619, + "grad_norm": 0.35180985927581787, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.7262, + "step": 33 + }, + { + "epoch": 0.0637599624941397, + "grad_norm": 0.4624330997467041, + "learning_rate": 6.600000000000001e-06, + "loss": 0.8614, + "step": 34 + }, + { + "epoch": 0.06563525550867323, + "grad_norm": 0.3922227621078491, + "learning_rate": 6.800000000000001e-06, + "loss": 0.7928, + "step": 35 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 0.44666793942451477, + "learning_rate": 7e-06, + "loss": 0.8177, + "step": 36 + }, + { + "epoch": 0.06938584153774027, + "grad_norm": 0.4159597158432007, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.8009, + "step": 37 + }, + { + "epoch": 0.07126113455227379, + "grad_norm": 0.442553848028183, + "learning_rate": 7.4e-06, + "loss": 0.6281, + "step": 38 + }, + { + "epoch": 0.07313642756680731, + "grad_norm": 0.36248019337654114, + "learning_rate": 7.600000000000001e-06, + "loss": 0.6649, + "step": 39 + }, + { + "epoch": 0.07501172058134084, + "grad_norm": 0.5066297650337219, + "learning_rate": 7.800000000000002e-06, + "loss": 0.8881, + "step": 40 + }, + { + "epoch": 0.07688701359587435, + "grad_norm": 0.4364047944545746, + "learning_rate": 8.000000000000001e-06, + "loss": 0.8244, + "step": 41 + }, + { + "epoch": 0.07876230661040788, + "grad_norm": 0.43310582637786865, + "learning_rate": 8.2e-06, + "loss": 0.6797, + "step": 42 + }, + { + "epoch": 0.0806375996249414, + "grad_norm": 0.5400950908660889, + "learning_rate": 8.400000000000001e-06, + "loss": 0.7903, + "step": 43 + }, + { + "epoch": 0.08251289263947492, + "grad_norm": 0.4437219500541687, + "learning_rate": 8.6e-06, + "loss": 0.6449, + "step": 44 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 0.48232921957969666, + "learning_rate": 8.8e-06, + "loss": 0.6013, + "step": 45 + }, + { + "epoch": 0.08626347866854196, + "grad_norm": 0.6040736436843872, + "learning_rate": 9e-06, + "loss": 0.5827, + "step": 46 + }, + { + "epoch": 0.08813877168307548, + "grad_norm": 0.4945738613605499, + "learning_rate": 9.200000000000002e-06, + "loss": 0.7523, + "step": 47 + }, + { + "epoch": 0.090014064697609, + "grad_norm": 0.44322240352630615, + "learning_rate": 9.4e-06, + "loss": 0.6732, + "step": 48 + }, + { + "epoch": 0.09188935771214252, + "grad_norm": 0.42256736755371094, + "learning_rate": 9.600000000000001e-06, + "loss": 0.6088, + "step": 49 + }, + { + "epoch": 0.09376465072667604, + "grad_norm": 0.4956076443195343, + "learning_rate": 9.800000000000001e-06, + "loss": 0.5818, + "step": 50 + }, + { + "epoch": 0.09376465072667604, + "eval_loss": 0.6773348450660706, + "eval_runtime": 678.2607, + "eval_samples_per_second": 0.29, + "eval_steps_per_second": 0.29, + "step": 50 + }, + { + "epoch": 0.09563994374120956, + "grad_norm": 0.5553563237190247, + "learning_rate": 1e-05, + "loss": 0.7514, + "step": 51 + }, + { + "epoch": 0.09751523675574308, + "grad_norm": 0.5681352019309998, + "learning_rate": 1.02e-05, + "loss": 0.6105, + "step": 52 + }, + { + "epoch": 0.09939052977027661, + "grad_norm": 0.5460308790206909, + "learning_rate": 1.04e-05, + "loss": 0.5824, + "step": 53 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 0.5295383930206299, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.6622, + "step": 54 + }, + { + "epoch": 0.10314111579934365, + "grad_norm": 0.5573945045471191, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.604, + "step": 55 + }, + { + "epoch": 0.10501640881387717, + "grad_norm": 0.5254002809524536, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.5777, + "step": 56 + }, + { + "epoch": 0.10689170182841069, + "grad_norm": 0.4916251003742218, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.5605, + "step": 57 + }, + { + "epoch": 0.1087669948429442, + "grad_norm": 0.44173622131347656, + "learning_rate": 1.14e-05, + "loss": 0.4921, + "step": 58 + }, + { + "epoch": 0.11064228785747773, + "grad_norm": 0.7284818887710571, + "learning_rate": 1.16e-05, + "loss": 0.676, + "step": 59 + }, + { + "epoch": 0.11251758087201125, + "grad_norm": 0.5018178224563599, + "learning_rate": 1.18e-05, + "loss": 0.5044, + "step": 60 + }, + { + "epoch": 0.11439287388654477, + "grad_norm": 0.4238162040710449, + "learning_rate": 1.2e-05, + "loss": 0.4526, + "step": 61 + }, + { + "epoch": 0.11626816690107829, + "grad_norm": 0.43392857909202576, + "learning_rate": 1.22e-05, + "loss": 0.554, + "step": 62 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 0.37505975365638733, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.4167, + "step": 63 + }, + { + "epoch": 0.12001875293014534, + "grad_norm": 0.41996413469314575, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.4457, + "step": 64 + }, + { + "epoch": 0.12189404594467886, + "grad_norm": 0.35364946722984314, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.397, + "step": 65 + }, + { + "epoch": 0.12376933895921238, + "grad_norm": 0.3553512692451477, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.4027, + "step": 66 + }, + { + "epoch": 0.1256446319737459, + "grad_norm": 0.35016101598739624, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.4234, + "step": 67 + }, + { + "epoch": 0.1275199249882794, + "grad_norm": 0.28325924277305603, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.3099, + "step": 68 + }, + { + "epoch": 0.12939521800281295, + "grad_norm": 0.3461209237575531, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.4367, + "step": 69 + }, + { + "epoch": 0.13127051101734646, + "grad_norm": 0.28789466619491577, + "learning_rate": 1.38e-05, + "loss": 0.3175, + "step": 70 + }, + { + "epoch": 0.13314580403187998, + "grad_norm": 0.3014371693134308, + "learning_rate": 1.4e-05, + "loss": 0.3197, + "step": 71 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 0.2781355082988739, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.3437, + "step": 72 + }, + { + "epoch": 0.13689639006094703, + "grad_norm": 0.2576352059841156, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.2707, + "step": 73 + }, + { + "epoch": 0.13877168307548055, + "grad_norm": 0.3152413070201874, + "learning_rate": 1.46e-05, + "loss": 0.3236, + "step": 74 + }, + { + "epoch": 0.14064697609001406, + "grad_norm": 0.6653899550437927, + "learning_rate": 1.48e-05, + "loss": 0.3465, + "step": 75 + }, + { + "epoch": 0.14252226910454757, + "grad_norm": 0.2475852519273758, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.2468, + "step": 76 + }, + { + "epoch": 0.1443975621190811, + "grad_norm": 0.28572630882263184, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.3287, + "step": 77 + }, + { + "epoch": 0.14627285513361463, + "grad_norm": 0.27325335144996643, + "learning_rate": 1.54e-05, + "loss": 0.2926, + "step": 78 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.3255375027656555, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.279, + "step": 79 + }, + { + "epoch": 0.15002344116268168, + "grad_norm": 0.44983330368995667, + "learning_rate": 1.58e-05, + "loss": 0.3534, + "step": 80 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 0.240912526845932, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.2541, + "step": 81 + }, + { + "epoch": 0.1537740271917487, + "grad_norm": 0.21851545572280884, + "learning_rate": 1.62e-05, + "loss": 0.2334, + "step": 82 + }, + { + "epoch": 0.15564932020628222, + "grad_norm": 0.25758886337280273, + "learning_rate": 1.64e-05, + "loss": 0.2952, + "step": 83 + }, + { + "epoch": 0.15752461322081576, + "grad_norm": 0.2865050435066223, + "learning_rate": 1.66e-05, + "loss": 0.2702, + "step": 84 + }, + { + "epoch": 0.15939990623534928, + "grad_norm": 0.21041016280651093, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.2565, + "step": 85 + }, + { + "epoch": 0.1612751992498828, + "grad_norm": 0.21632152795791626, + "learning_rate": 1.7e-05, + "loss": 0.2643, + "step": 86 + }, + { + "epoch": 0.1631504922644163, + "grad_norm": 0.1930413544178009, + "learning_rate": 1.72e-05, + "loss": 0.2386, + "step": 87 + }, + { + "epoch": 0.16502578527894984, + "grad_norm": 0.21475085616111755, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.3035, + "step": 88 + }, + { + "epoch": 0.16690107829348336, + "grad_norm": 0.2499314695596695, + "learning_rate": 1.76e-05, + "loss": 0.3382, + "step": 89 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 0.22533196210861206, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.2262, + "step": 90 + }, + { + "epoch": 0.1706516643225504, + "grad_norm": 0.22342143952846527, + "learning_rate": 1.8e-05, + "loss": 0.2489, + "step": 91 + }, + { + "epoch": 0.17252695733708392, + "grad_norm": 0.2666771411895752, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.2732, + "step": 92 + }, + { + "epoch": 0.17440225035161744, + "grad_norm": 0.22881805896759033, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.285, + "step": 93 + }, + { + "epoch": 0.17627754336615095, + "grad_norm": 0.3157159984111786, + "learning_rate": 1.86e-05, + "loss": 0.2406, + "step": 94 + }, + { + "epoch": 0.1781528363806845, + "grad_norm": 0.20557765662670135, + "learning_rate": 1.88e-05, + "loss": 0.2162, + "step": 95 + }, + { + "epoch": 0.180028129395218, + "grad_norm": 0.26530343294143677, + "learning_rate": 1.9e-05, + "loss": 0.2527, + "step": 96 + }, + { + "epoch": 0.18190342240975152, + "grad_norm": 0.21458247303962708, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.1677, + "step": 97 + }, + { + "epoch": 0.18377871542428503, + "grad_norm": 0.20788805186748505, + "learning_rate": 1.94e-05, + "loss": 0.2191, + "step": 98 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 0.20019683241844177, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.1821, + "step": 99 + }, + { + "epoch": 0.1875293014533521, + "grad_norm": 0.264813631772995, + "learning_rate": 1.98e-05, + "loss": 0.2307, + "step": 100 + }, + { + "epoch": 0.1875293014533521, + "eval_loss": 0.18159574270248413, + "eval_runtime": 674.6463, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 100 + }, + { + "epoch": 0.1894045944678856, + "grad_norm": 0.2246488779783249, + "learning_rate": 2e-05, + "loss": 0.2421, + "step": 101 + }, + { + "epoch": 0.19127988748241911, + "grad_norm": 0.24814476072788239, + "learning_rate": 1.9992202729044836e-05, + "loss": 0.1653, + "step": 102 + }, + { + "epoch": 0.19315518049695266, + "grad_norm": 0.2789003849029541, + "learning_rate": 1.998440545808967e-05, + "loss": 0.2324, + "step": 103 + }, + { + "epoch": 0.19503047351148617, + "grad_norm": 0.21182158589363098, + "learning_rate": 1.9976608187134504e-05, + "loss": 0.2226, + "step": 104 + }, + { + "epoch": 0.19690576652601968, + "grad_norm": 0.299167662858963, + "learning_rate": 1.996881091617934e-05, + "loss": 0.2122, + "step": 105 + }, + { + "epoch": 0.19878105954055322, + "grad_norm": 0.22616952657699585, + "learning_rate": 1.9961013645224173e-05, + "loss": 0.2174, + "step": 106 + }, + { + "epoch": 0.20065635255508674, + "grad_norm": 0.19206267595291138, + "learning_rate": 1.9953216374269007e-05, + "loss": 0.1776, + "step": 107 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 0.2957805097103119, + "learning_rate": 1.994541910331384e-05, + "loss": 0.1576, + "step": 108 + }, + { + "epoch": 0.20440693858415376, + "grad_norm": 0.22284749150276184, + "learning_rate": 1.9937621832358675e-05, + "loss": 0.1495, + "step": 109 + }, + { + "epoch": 0.2062822315986873, + "grad_norm": 0.21203336119651794, + "learning_rate": 1.992982456140351e-05, + "loss": 0.186, + "step": 110 + }, + { + "epoch": 0.20815752461322082, + "grad_norm": 0.18476776778697968, + "learning_rate": 1.9922027290448344e-05, + "loss": 0.1626, + "step": 111 + }, + { + "epoch": 0.21003281762775433, + "grad_norm": 0.19962036609649658, + "learning_rate": 1.9914230019493178e-05, + "loss": 0.1447, + "step": 112 + }, + { + "epoch": 0.21190811064228784, + "grad_norm": 0.1857951283454895, + "learning_rate": 1.9906432748538015e-05, + "loss": 0.175, + "step": 113 + }, + { + "epoch": 0.21378340365682139, + "grad_norm": 0.2035515159368515, + "learning_rate": 1.989863547758285e-05, + "loss": 0.1732, + "step": 114 + }, + { + "epoch": 0.2156586966713549, + "grad_norm": 0.1710767149925232, + "learning_rate": 1.9890838206627684e-05, + "loss": 0.1317, + "step": 115 + }, + { + "epoch": 0.2175339896858884, + "grad_norm": 0.2920154929161072, + "learning_rate": 1.9883040935672515e-05, + "loss": 0.1802, + "step": 116 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 0.27223774790763855, + "learning_rate": 1.987524366471735e-05, + "loss": 0.1338, + "step": 117 + }, + { + "epoch": 0.22128457571495547, + "grad_norm": 0.1965789496898651, + "learning_rate": 1.9867446393762183e-05, + "loss": 0.1751, + "step": 118 + }, + { + "epoch": 0.22315986872948898, + "grad_norm": 0.19014707207679749, + "learning_rate": 1.9859649122807017e-05, + "loss": 0.1485, + "step": 119 + }, + { + "epoch": 0.2250351617440225, + "grad_norm": 0.251869261264801, + "learning_rate": 1.9851851851851855e-05, + "loss": 0.1416, + "step": 120 + }, + { + "epoch": 0.22691045475855603, + "grad_norm": 0.2957039475440979, + "learning_rate": 1.984405458089669e-05, + "loss": 0.1892, + "step": 121 + }, + { + "epoch": 0.22878574777308955, + "grad_norm": 0.17241084575653076, + "learning_rate": 1.9836257309941523e-05, + "loss": 0.1412, + "step": 122 + }, + { + "epoch": 0.23066104078762306, + "grad_norm": 0.2205045521259308, + "learning_rate": 1.9828460038986357e-05, + "loss": 0.1602, + "step": 123 + }, + { + "epoch": 0.23253633380215658, + "grad_norm": 0.21944566071033478, + "learning_rate": 1.982066276803119e-05, + "loss": 0.146, + "step": 124 + }, + { + "epoch": 0.23441162681669012, + "grad_norm": 0.21907442808151245, + "learning_rate": 1.9812865497076026e-05, + "loss": 0.1588, + "step": 125 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 0.17742829024791718, + "learning_rate": 1.980506822612086e-05, + "loss": 0.151, + "step": 126 + }, + { + "epoch": 0.23816221284575714, + "grad_norm": 0.23079413175582886, + "learning_rate": 1.9797270955165694e-05, + "loss": 0.2186, + "step": 127 + }, + { + "epoch": 0.24003750586029068, + "grad_norm": 0.19806325435638428, + "learning_rate": 1.9789473684210528e-05, + "loss": 0.1352, + "step": 128 + }, + { + "epoch": 0.2419127988748242, + "grad_norm": 0.1862531155347824, + "learning_rate": 1.9781676413255362e-05, + "loss": 0.1456, + "step": 129 + }, + { + "epoch": 0.2437880918893577, + "grad_norm": 0.1777540147304535, + "learning_rate": 1.9773879142300197e-05, + "loss": 0.1331, + "step": 130 + }, + { + "epoch": 0.24566338490389122, + "grad_norm": 0.29459261894226074, + "learning_rate": 1.976608187134503e-05, + "loss": 0.1421, + "step": 131 + }, + { + "epoch": 0.24753867791842477, + "grad_norm": 0.20495347678661346, + "learning_rate": 1.9758284600389865e-05, + "loss": 0.1599, + "step": 132 + }, + { + "epoch": 0.24941397093295828, + "grad_norm": 0.17535296082496643, + "learning_rate": 1.97504873294347e-05, + "loss": 0.17, + "step": 133 + }, + { + "epoch": 0.2512892639474918, + "grad_norm": 0.16180671751499176, + "learning_rate": 1.9742690058479533e-05, + "loss": 0.1367, + "step": 134 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 0.4137842357158661, + "learning_rate": 1.9734892787524368e-05, + "loss": 0.1536, + "step": 135 + }, + { + "epoch": 0.2550398499765588, + "grad_norm": 0.174374058842659, + "learning_rate": 1.9727095516569202e-05, + "loss": 0.1562, + "step": 136 + }, + { + "epoch": 0.25691514299109236, + "grad_norm": 0.1792580634355545, + "learning_rate": 1.9719298245614036e-05, + "loss": 0.174, + "step": 137 + }, + { + "epoch": 0.2587904360056259, + "grad_norm": 0.40365007519721985, + "learning_rate": 1.971150097465887e-05, + "loss": 0.2197, + "step": 138 + }, + { + "epoch": 0.2606657290201594, + "grad_norm": 0.1588585376739502, + "learning_rate": 1.9703703703703704e-05, + "loss": 0.1012, + "step": 139 + }, + { + "epoch": 0.26254102203469293, + "grad_norm": 0.1797696053981781, + "learning_rate": 1.969590643274854e-05, + "loss": 0.123, + "step": 140 + }, + { + "epoch": 0.26441631504922647, + "grad_norm": 0.41170960664749146, + "learning_rate": 1.9688109161793373e-05, + "loss": 0.2208, + "step": 141 + }, + { + "epoch": 0.26629160806375995, + "grad_norm": 0.22392353415489197, + "learning_rate": 1.9680311890838207e-05, + "loss": 0.1634, + "step": 142 + }, + { + "epoch": 0.2681669010782935, + "grad_norm": 0.2506747841835022, + "learning_rate": 1.9672514619883044e-05, + "loss": 0.1539, + "step": 143 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 0.3422393500804901, + "learning_rate": 1.966471734892788e-05, + "loss": 0.2705, + "step": 144 + }, + { + "epoch": 0.2719174871073605, + "grad_norm": 0.15338042378425598, + "learning_rate": 1.9656920077972713e-05, + "loss": 0.1554, + "step": 145 + }, + { + "epoch": 0.27379278012189406, + "grad_norm": 0.4089084267616272, + "learning_rate": 1.9649122807017544e-05, + "loss": 0.1756, + "step": 146 + }, + { + "epoch": 0.27566807313642755, + "grad_norm": 0.19431771337985992, + "learning_rate": 1.9641325536062378e-05, + "loss": 0.1549, + "step": 147 + }, + { + "epoch": 0.2775433661509611, + "grad_norm": 0.21856912970542908, + "learning_rate": 1.9633528265107212e-05, + "loss": 0.1854, + "step": 148 + }, + { + "epoch": 0.27941865916549463, + "grad_norm": 0.21324169635772705, + "learning_rate": 1.962573099415205e-05, + "loss": 0.1348, + "step": 149 + }, + { + "epoch": 0.2812939521800281, + "grad_norm": 0.18593250215053558, + "learning_rate": 1.9617933723196884e-05, + "loss": 0.1309, + "step": 150 + }, + { + "epoch": 0.2812939521800281, + "eval_loss": 0.11937730759382248, + "eval_runtime": 675.2131, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 150 + }, + { + "epoch": 0.28316924519456166, + "grad_norm": 0.17237016558647156, + "learning_rate": 1.9610136452241718e-05, + "loss": 0.1228, + "step": 151 + }, + { + "epoch": 0.28504453820909514, + "grad_norm": 0.20051100850105286, + "learning_rate": 1.9602339181286552e-05, + "loss": 0.1028, + "step": 152 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 0.20913535356521606, + "learning_rate": 1.9594541910331386e-05, + "loss": 0.1478, + "step": 153 + }, + { + "epoch": 0.2887951242381622, + "grad_norm": 0.22947447001934052, + "learning_rate": 1.958674463937622e-05, + "loss": 0.1199, + "step": 154 + }, + { + "epoch": 0.2906704172526957, + "grad_norm": 0.21503888070583344, + "learning_rate": 1.9578947368421055e-05, + "loss": 0.1469, + "step": 155 + }, + { + "epoch": 0.29254571026722925, + "grad_norm": 0.15914572775363922, + "learning_rate": 1.957115009746589e-05, + "loss": 0.1126, + "step": 156 + }, + { + "epoch": 0.2944210032817628, + "grad_norm": 0.1764422208070755, + "learning_rate": 1.9563352826510723e-05, + "loss": 0.142, + "step": 157 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.20030224323272705, + "learning_rate": 1.9555555555555557e-05, + "loss": 0.1475, + "step": 158 + }, + { + "epoch": 0.2981715893108298, + "grad_norm": 0.21742001175880432, + "learning_rate": 1.954775828460039e-05, + "loss": 0.1874, + "step": 159 + }, + { + "epoch": 0.30004688232536336, + "grad_norm": 0.2284712940454483, + "learning_rate": 1.9539961013645226e-05, + "loss": 0.1532, + "step": 160 + }, + { + "epoch": 0.30192217533989685, + "grad_norm": 0.18156972527503967, + "learning_rate": 1.953216374269006e-05, + "loss": 0.092, + "step": 161 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 0.1754453182220459, + "learning_rate": 1.9524366471734894e-05, + "loss": 0.1348, + "step": 162 + }, + { + "epoch": 0.3056727613689639, + "grad_norm": 0.16860631108283997, + "learning_rate": 1.9516569200779728e-05, + "loss": 0.1176, + "step": 163 + }, + { + "epoch": 0.3075480543834974, + "grad_norm": 0.17378303408622742, + "learning_rate": 1.9508771929824562e-05, + "loss": 0.1372, + "step": 164 + }, + { + "epoch": 0.30942334739803096, + "grad_norm": 0.23950371146202087, + "learning_rate": 1.9500974658869397e-05, + "loss": 0.1206, + "step": 165 + }, + { + "epoch": 0.31129864041256444, + "grad_norm": 0.17646121978759766, + "learning_rate": 1.949317738791423e-05, + "loss": 0.1298, + "step": 166 + }, + { + "epoch": 0.313173933427098, + "grad_norm": 0.1811673790216446, + "learning_rate": 1.9485380116959065e-05, + "loss": 0.188, + "step": 167 + }, + { + "epoch": 0.3150492264416315, + "grad_norm": 0.27424830198287964, + "learning_rate": 1.94775828460039e-05, + "loss": 0.1422, + "step": 168 + }, + { + "epoch": 0.316924519456165, + "grad_norm": 0.21313942968845367, + "learning_rate": 1.9469785575048733e-05, + "loss": 0.1201, + "step": 169 + }, + { + "epoch": 0.31879981247069855, + "grad_norm": 0.1912909597158432, + "learning_rate": 1.9461988304093568e-05, + "loss": 0.0854, + "step": 170 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 0.2451699674129486, + "learning_rate": 1.9454191033138402e-05, + "loss": 0.2009, + "step": 171 + }, + { + "epoch": 0.3225503984997656, + "grad_norm": 0.1784246265888214, + "learning_rate": 1.944639376218324e-05, + "loss": 0.1587, + "step": 172 + }, + { + "epoch": 0.3244256915142991, + "grad_norm": 0.19816836714744568, + "learning_rate": 1.9438596491228074e-05, + "loss": 0.1426, + "step": 173 + }, + { + "epoch": 0.3263009845288326, + "grad_norm": 0.19529619812965393, + "learning_rate": 1.9430799220272908e-05, + "loss": 0.1431, + "step": 174 + }, + { + "epoch": 0.32817627754336615, + "grad_norm": 0.2105475664138794, + "learning_rate": 1.9423001949317742e-05, + "loss": 0.1654, + "step": 175 + }, + { + "epoch": 0.3300515705578997, + "grad_norm": 0.17807945609092712, + "learning_rate": 1.9415204678362573e-05, + "loss": 0.1366, + "step": 176 + }, + { + "epoch": 0.3319268635724332, + "grad_norm": 0.2023920863866806, + "learning_rate": 1.9407407407407407e-05, + "loss": 0.1372, + "step": 177 + }, + { + "epoch": 0.3338021565869667, + "grad_norm": 0.1807592213153839, + "learning_rate": 1.939961013645224e-05, + "loss": 0.1188, + "step": 178 + }, + { + "epoch": 0.33567744960150026, + "grad_norm": 0.18859116733074188, + "learning_rate": 1.939181286549708e-05, + "loss": 0.1243, + "step": 179 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 0.21937714517116547, + "learning_rate": 1.9384015594541913e-05, + "loss": 0.2015, + "step": 180 + }, + { + "epoch": 0.3394280356305673, + "grad_norm": 0.1638198345899582, + "learning_rate": 1.9376218323586747e-05, + "loss": 0.1031, + "step": 181 + }, + { + "epoch": 0.3413033286451008, + "grad_norm": 0.2563665807247162, + "learning_rate": 1.936842105263158e-05, + "loss": 0.1341, + "step": 182 + }, + { + "epoch": 0.3431786216596343, + "grad_norm": 0.19014421105384827, + "learning_rate": 1.9360623781676415e-05, + "loss": 0.135, + "step": 183 + }, + { + "epoch": 0.34505391467416785, + "grad_norm": 0.19648663699626923, + "learning_rate": 1.935282651072125e-05, + "loss": 0.1239, + "step": 184 + }, + { + "epoch": 0.34692920768870134, + "grad_norm": 0.16469500958919525, + "learning_rate": 1.9345029239766084e-05, + "loss": 0.1142, + "step": 185 + }, + { + "epoch": 0.3488045007032349, + "grad_norm": 0.17405149340629578, + "learning_rate": 1.9337231968810918e-05, + "loss": 0.1025, + "step": 186 + }, + { + "epoch": 0.3506797937177684, + "grad_norm": 0.2042636275291443, + "learning_rate": 1.9329434697855752e-05, + "loss": 0.0976, + "step": 187 + }, + { + "epoch": 0.3525550867323019, + "grad_norm": 0.2545097768306732, + "learning_rate": 1.9321637426900586e-05, + "loss": 0.1707, + "step": 188 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 0.17079883813858032, + "learning_rate": 1.931384015594542e-05, + "loss": 0.1282, + "step": 189 + }, + { + "epoch": 0.356305672761369, + "grad_norm": 0.18123818933963776, + "learning_rate": 1.9306042884990255e-05, + "loss": 0.1285, + "step": 190 + }, + { + "epoch": 0.35818096577590247, + "grad_norm": 0.26533034443855286, + "learning_rate": 1.929824561403509e-05, + "loss": 0.1912, + "step": 191 + }, + { + "epoch": 0.360056258790436, + "grad_norm": 0.21302203834056854, + "learning_rate": 1.9290448343079923e-05, + "loss": 0.152, + "step": 192 + }, + { + "epoch": 0.36193155180496955, + "grad_norm": 0.16439904272556305, + "learning_rate": 1.9282651072124757e-05, + "loss": 0.1026, + "step": 193 + }, + { + "epoch": 0.36380684481950304, + "grad_norm": 0.21921955049037933, + "learning_rate": 1.927485380116959e-05, + "loss": 0.1294, + "step": 194 + }, + { + "epoch": 0.3656821378340366, + "grad_norm": 0.1894323229789734, + "learning_rate": 1.9267056530214426e-05, + "loss": 0.1307, + "step": 195 + }, + { + "epoch": 0.36755743084857007, + "grad_norm": 0.18328344821929932, + "learning_rate": 1.925925925925926e-05, + "loss": 0.1168, + "step": 196 + }, + { + "epoch": 0.3694327238631036, + "grad_norm": 0.19325962662696838, + "learning_rate": 1.9251461988304094e-05, + "loss": 0.0979, + "step": 197 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 0.16475112736225128, + "learning_rate": 1.924366471734893e-05, + "loss": 0.11, + "step": 198 + }, + { + "epoch": 0.37318330989217063, + "grad_norm": 0.24370963871479034, + "learning_rate": 1.9235867446393763e-05, + "loss": 0.1256, + "step": 199 + }, + { + "epoch": 0.3750586029067042, + "grad_norm": 0.21204431354999542, + "learning_rate": 1.9228070175438597e-05, + "loss": 0.1498, + "step": 200 + }, + { + "epoch": 0.3750586029067042, + "eval_loss": 0.10520625114440918, + "eval_runtime": 675.3932, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 200 + }, + { + "epoch": 0.3769338959212377, + "grad_norm": 0.19303874671459198, + "learning_rate": 1.922027290448343e-05, + "loss": 0.1423, + "step": 201 + }, + { + "epoch": 0.3788091889357712, + "grad_norm": 0.19636684656143188, + "learning_rate": 1.921247563352827e-05, + "loss": 0.1325, + "step": 202 + }, + { + "epoch": 0.38068448195030474, + "grad_norm": 0.23951849341392517, + "learning_rate": 1.9204678362573103e-05, + "loss": 0.1276, + "step": 203 + }, + { + "epoch": 0.38255977496483823, + "grad_norm": 0.229360893368721, + "learning_rate": 1.9196881091617937e-05, + "loss": 0.121, + "step": 204 + }, + { + "epoch": 0.38443506797937177, + "grad_norm": 0.18153493106365204, + "learning_rate": 1.918908382066277e-05, + "loss": 0.1202, + "step": 205 + }, + { + "epoch": 0.3863103609939053, + "grad_norm": 0.4398716986179352, + "learning_rate": 1.9181286549707602e-05, + "loss": 0.1189, + "step": 206 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 0.20299677550792694, + "learning_rate": 1.9173489278752436e-05, + "loss": 0.1433, + "step": 207 + }, + { + "epoch": 0.39006094702297234, + "grad_norm": 0.2434886395931244, + "learning_rate": 1.916569200779727e-05, + "loss": 0.1122, + "step": 208 + }, + { + "epoch": 0.3919362400375059, + "grad_norm": 0.16485784947872162, + "learning_rate": 1.9157894736842108e-05, + "loss": 0.1505, + "step": 209 + }, + { + "epoch": 0.39381153305203936, + "grad_norm": 0.2433251440525055, + "learning_rate": 1.9150097465886942e-05, + "loss": 0.1821, + "step": 210 + }, + { + "epoch": 0.3956868260665729, + "grad_norm": 0.2099800854921341, + "learning_rate": 1.9142300194931776e-05, + "loss": 0.143, + "step": 211 + }, + { + "epoch": 0.39756211908110645, + "grad_norm": 0.20996929705142975, + "learning_rate": 1.913450292397661e-05, + "loss": 0.1394, + "step": 212 + }, + { + "epoch": 0.39943741209563993, + "grad_norm": 0.1913692057132721, + "learning_rate": 1.9126705653021445e-05, + "loss": 0.1408, + "step": 213 + }, + { + "epoch": 0.4013127051101735, + "grad_norm": 0.1733906865119934, + "learning_rate": 1.911890838206628e-05, + "loss": 0.1024, + "step": 214 + }, + { + "epoch": 0.40318799812470696, + "grad_norm": 0.1856503039598465, + "learning_rate": 1.9111111111111113e-05, + "loss": 0.1129, + "step": 215 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 0.3084484338760376, + "learning_rate": 1.9103313840155947e-05, + "loss": 0.102, + "step": 216 + }, + { + "epoch": 0.40693858415377404, + "grad_norm": 0.42217186093330383, + "learning_rate": 1.909551656920078e-05, + "loss": 0.1372, + "step": 217 + }, + { + "epoch": 0.4088138771683075, + "grad_norm": 0.20602688193321228, + "learning_rate": 1.9087719298245616e-05, + "loss": 0.1585, + "step": 218 + }, + { + "epoch": 0.41068917018284107, + "grad_norm": 0.524498701095581, + "learning_rate": 1.907992202729045e-05, + "loss": 0.1533, + "step": 219 + }, + { + "epoch": 0.4125644631973746, + "grad_norm": 0.20144020020961761, + "learning_rate": 1.9072124756335284e-05, + "loss": 0.1539, + "step": 220 + }, + { + "epoch": 0.4144397562119081, + "grad_norm": 0.17956125736236572, + "learning_rate": 1.9064327485380118e-05, + "loss": 0.1124, + "step": 221 + }, + { + "epoch": 0.41631504922644164, + "grad_norm": 0.20834237337112427, + "learning_rate": 1.9056530214424952e-05, + "loss": 0.1151, + "step": 222 + }, + { + "epoch": 0.4181903422409752, + "grad_norm": 0.24440927803516388, + "learning_rate": 1.9048732943469787e-05, + "loss": 0.1138, + "step": 223 + }, + { + "epoch": 0.42006563525550866, + "grad_norm": 0.1995476335287094, + "learning_rate": 1.904093567251462e-05, + "loss": 0.0915, + "step": 224 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 0.18664710223674774, + "learning_rate": 1.9033138401559458e-05, + "loss": 0.1295, + "step": 225 + }, + { + "epoch": 0.4238162212845757, + "grad_norm": 0.21624010801315308, + "learning_rate": 1.902534113060429e-05, + "loss": 0.1861, + "step": 226 + }, + { + "epoch": 0.42569151429910923, + "grad_norm": 0.27115607261657715, + "learning_rate": 1.9017543859649123e-05, + "loss": 0.0858, + "step": 227 + }, + { + "epoch": 0.42756680731364277, + "grad_norm": 0.2523871660232544, + "learning_rate": 1.9009746588693957e-05, + "loss": 0.1371, + "step": 228 + }, + { + "epoch": 0.42944210032817626, + "grad_norm": 0.20124512910842896, + "learning_rate": 1.900194931773879e-05, + "loss": 0.1079, + "step": 229 + }, + { + "epoch": 0.4313173933427098, + "grad_norm": 0.15628211200237274, + "learning_rate": 1.8994152046783626e-05, + "loss": 0.0926, + "step": 230 + }, + { + "epoch": 0.43319268635724334, + "grad_norm": 0.18558505177497864, + "learning_rate": 1.898635477582846e-05, + "loss": 0.1241, + "step": 231 + }, + { + "epoch": 0.4350679793717768, + "grad_norm": 0.21066918969154358, + "learning_rate": 1.8978557504873298e-05, + "loss": 0.1148, + "step": 232 + }, + { + "epoch": 0.43694327238631037, + "grad_norm": 0.24706971645355225, + "learning_rate": 1.8970760233918132e-05, + "loss": 0.1469, + "step": 233 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 0.24075715243816376, + "learning_rate": 1.8962962962962966e-05, + "loss": 0.1078, + "step": 234 + }, + { + "epoch": 0.4406938584153774, + "grad_norm": 0.9308987855911255, + "learning_rate": 1.89551656920078e-05, + "loss": 0.1242, + "step": 235 + }, + { + "epoch": 0.44256915142991093, + "grad_norm": 0.23705029487609863, + "learning_rate": 1.894736842105263e-05, + "loss": 0.1246, + "step": 236 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.20728544890880585, + "learning_rate": 1.8939571150097465e-05, + "loss": 0.1216, + "step": 237 + }, + { + "epoch": 0.44631973745897796, + "grad_norm": 0.21318766474723816, + "learning_rate": 1.8931773879142303e-05, + "loss": 0.1153, + "step": 238 + }, + { + "epoch": 0.4481950304735115, + "grad_norm": 0.22454850375652313, + "learning_rate": 1.8923976608187137e-05, + "loss": 0.1337, + "step": 239 + }, + { + "epoch": 0.450070323488045, + "grad_norm": 0.23081591725349426, + "learning_rate": 1.891617933723197e-05, + "loss": 0.1014, + "step": 240 + }, + { + "epoch": 0.45194561650257853, + "grad_norm": 0.2807827293872833, + "learning_rate": 1.8908382066276805e-05, + "loss": 0.1587, + "step": 241 + }, + { + "epoch": 0.45382090951711207, + "grad_norm": 0.22218219935894012, + "learning_rate": 1.890058479532164e-05, + "loss": 0.1599, + "step": 242 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 0.23275920748710632, + "learning_rate": 1.8892787524366474e-05, + "loss": 0.1336, + "step": 243 + }, + { + "epoch": 0.4575714955461791, + "grad_norm": 0.20733550190925598, + "learning_rate": 1.8884990253411308e-05, + "loss": 0.1458, + "step": 244 + }, + { + "epoch": 0.45944678856071264, + "grad_norm": 0.19315101206302643, + "learning_rate": 1.8877192982456142e-05, + "loss": 0.0923, + "step": 245 + }, + { + "epoch": 0.4613220815752461, + "grad_norm": 0.22976315021514893, + "learning_rate": 1.8869395711500976e-05, + "loss": 0.141, + "step": 246 + }, + { + "epoch": 0.46319737458977966, + "grad_norm": 0.40744727849960327, + "learning_rate": 1.886159844054581e-05, + "loss": 0.1416, + "step": 247 + }, + { + "epoch": 0.46507266760431315, + "grad_norm": 0.16530601680278778, + "learning_rate": 1.8853801169590645e-05, + "loss": 0.0989, + "step": 248 + }, + { + "epoch": 0.4669479606188467, + "grad_norm": 0.19157516956329346, + "learning_rate": 1.884600389863548e-05, + "loss": 0.1176, + "step": 249 + }, + { + "epoch": 0.46882325363338023, + "grad_norm": 0.28270554542541504, + "learning_rate": 1.8838206627680313e-05, + "loss": 0.1039, + "step": 250 + }, + { + "epoch": 0.46882325363338023, + "eval_loss": 0.09713947772979736, + "eval_runtime": 675.3394, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 250 + }, + { + "epoch": 0.4706985466479137, + "grad_norm": 0.2453300952911377, + "learning_rate": 1.8830409356725147e-05, + "loss": 0.113, + "step": 251 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 0.2129826545715332, + "learning_rate": 1.882261208576998e-05, + "loss": 0.143, + "step": 252 + }, + { + "epoch": 0.4744491326769808, + "grad_norm": 0.20310461521148682, + "learning_rate": 1.8814814814814816e-05, + "loss": 0.1242, + "step": 253 + }, + { + "epoch": 0.4763244256915143, + "grad_norm": 0.18484044075012207, + "learning_rate": 1.880701754385965e-05, + "loss": 0.1098, + "step": 254 + }, + { + "epoch": 0.4781997187060478, + "grad_norm": 0.2547452747821808, + "learning_rate": 1.8799220272904487e-05, + "loss": 0.1461, + "step": 255 + }, + { + "epoch": 0.48007501172058137, + "grad_norm": 0.2765617072582245, + "learning_rate": 1.8791423001949318e-05, + "loss": 0.1567, + "step": 256 + }, + { + "epoch": 0.48195030473511485, + "grad_norm": 0.19878943264484406, + "learning_rate": 1.8783625730994152e-05, + "loss": 0.1102, + "step": 257 + }, + { + "epoch": 0.4838255977496484, + "grad_norm": 0.3786822259426117, + "learning_rate": 1.8775828460038987e-05, + "loss": 0.1354, + "step": 258 + }, + { + "epoch": 0.4857008907641819, + "grad_norm": 0.18465487658977509, + "learning_rate": 1.876803118908382e-05, + "loss": 0.0959, + "step": 259 + }, + { + "epoch": 0.4875761837787154, + "grad_norm": 0.206305131316185, + "learning_rate": 1.8760233918128655e-05, + "loss": 0.0962, + "step": 260 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 0.22003507614135742, + "learning_rate": 1.8752436647173493e-05, + "loss": 0.1274, + "step": 261 + }, + { + "epoch": 0.49132676980778245, + "grad_norm": 0.17606805264949799, + "learning_rate": 1.8744639376218327e-05, + "loss": 0.0998, + "step": 262 + }, + { + "epoch": 0.493202062822316, + "grad_norm": 0.27991971373558044, + "learning_rate": 1.873684210526316e-05, + "loss": 0.1564, + "step": 263 + }, + { + "epoch": 0.49507735583684953, + "grad_norm": 0.23053331673145294, + "learning_rate": 1.8729044834307995e-05, + "loss": 0.1247, + "step": 264 + }, + { + "epoch": 0.496952648851383, + "grad_norm": 0.20774593949317932, + "learning_rate": 1.872124756335283e-05, + "loss": 0.0893, + "step": 265 + }, + { + "epoch": 0.49882794186591656, + "grad_norm": 0.20343434810638428, + "learning_rate": 1.871345029239766e-05, + "loss": 0.1096, + "step": 266 + }, + { + "epoch": 0.5007032348804501, + "grad_norm": 0.2922403812408447, + "learning_rate": 1.8705653021442494e-05, + "loss": 0.151, + "step": 267 + }, + { + "epoch": 0.5025785278949836, + "grad_norm": 0.21583294868469238, + "learning_rate": 1.8697855750487332e-05, + "loss": 0.0865, + "step": 268 + }, + { + "epoch": 0.5044538209095171, + "grad_norm": 0.21538986265659332, + "learning_rate": 1.8690058479532166e-05, + "loss": 0.1181, + "step": 269 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 0.2531646192073822, + "learning_rate": 1.8682261208577e-05, + "loss": 0.1502, + "step": 270 + }, + { + "epoch": 0.5082044069385842, + "grad_norm": 0.28955358266830444, + "learning_rate": 1.8674463937621834e-05, + "loss": 0.1335, + "step": 271 + }, + { + "epoch": 0.5100796999531176, + "grad_norm": 0.27051371335983276, + "learning_rate": 1.866666666666667e-05, + "loss": 0.1605, + "step": 272 + }, + { + "epoch": 0.5119549929676512, + "grad_norm": 0.15731023252010345, + "learning_rate": 1.8658869395711503e-05, + "loss": 0.0682, + "step": 273 + }, + { + "epoch": 0.5138302859821847, + "grad_norm": 0.20177946984767914, + "learning_rate": 1.8651072124756337e-05, + "loss": 0.1127, + "step": 274 + }, + { + "epoch": 0.5157055789967182, + "grad_norm": 0.2223084568977356, + "learning_rate": 1.864327485380117e-05, + "loss": 0.1318, + "step": 275 + }, + { + "epoch": 0.5175808720112518, + "grad_norm": 0.2421497255563736, + "learning_rate": 1.8635477582846005e-05, + "loss": 0.1372, + "step": 276 + }, + { + "epoch": 0.5194561650257853, + "grad_norm": 0.3211030662059784, + "learning_rate": 1.862768031189084e-05, + "loss": 0.1556, + "step": 277 + }, + { + "epoch": 0.5213314580403188, + "grad_norm": 0.2547368109226227, + "learning_rate": 1.8619883040935674e-05, + "loss": 0.1249, + "step": 278 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 0.23231586813926697, + "learning_rate": 1.8612085769980508e-05, + "loss": 0.1412, + "step": 279 + }, + { + "epoch": 0.5250820440693859, + "grad_norm": 0.21071861684322357, + "learning_rate": 1.8604288499025342e-05, + "loss": 0.105, + "step": 280 + }, + { + "epoch": 0.5269573370839193, + "grad_norm": 0.22929051518440247, + "learning_rate": 1.8596491228070176e-05, + "loss": 0.11, + "step": 281 + }, + { + "epoch": 0.5288326300984529, + "grad_norm": 0.22073444724082947, + "learning_rate": 1.858869395711501e-05, + "loss": 0.1031, + "step": 282 + }, + { + "epoch": 0.5307079231129864, + "grad_norm": 0.20839901268482208, + "learning_rate": 1.8580896686159845e-05, + "loss": 0.099, + "step": 283 + }, + { + "epoch": 0.5325832161275199, + "grad_norm": 0.2510814070701599, + "learning_rate": 1.857309941520468e-05, + "loss": 0.1614, + "step": 284 + }, + { + "epoch": 0.5344585091420534, + "grad_norm": 0.23160187900066376, + "learning_rate": 1.8565302144249517e-05, + "loss": 0.1281, + "step": 285 + }, + { + "epoch": 0.536333802156587, + "grad_norm": 0.3275841474533081, + "learning_rate": 1.8557504873294347e-05, + "loss": 0.0894, + "step": 286 + }, + { + "epoch": 0.5382090951711205, + "grad_norm": 0.20695282518863678, + "learning_rate": 1.854970760233918e-05, + "loss": 0.0911, + "step": 287 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 0.19326826930046082, + "learning_rate": 1.8541910331384016e-05, + "loss": 0.1097, + "step": 288 + }, + { + "epoch": 0.5419596812001876, + "grad_norm": 0.22065812349319458, + "learning_rate": 1.853411306042885e-05, + "loss": 0.1249, + "step": 289 + }, + { + "epoch": 0.543834974214721, + "grad_norm": 0.3224574327468872, + "learning_rate": 1.8526315789473684e-05, + "loss": 0.1232, + "step": 290 + }, + { + "epoch": 0.5457102672292545, + "grad_norm": 0.20936869084835052, + "learning_rate": 1.851851851851852e-05, + "loss": 0.1061, + "step": 291 + }, + { + "epoch": 0.5475855602437881, + "grad_norm": 0.2805179953575134, + "learning_rate": 1.8510721247563356e-05, + "loss": 0.1176, + "step": 292 + }, + { + "epoch": 0.5494608532583216, + "grad_norm": 0.2184048891067505, + "learning_rate": 1.850292397660819e-05, + "loss": 0.0922, + "step": 293 + }, + { + "epoch": 0.5513361462728551, + "grad_norm": 0.2294335961341858, + "learning_rate": 1.8495126705653024e-05, + "loss": 0.1065, + "step": 294 + }, + { + "epoch": 0.5532114392873887, + "grad_norm": 0.20929084718227386, + "learning_rate": 1.848732943469786e-05, + "loss": 0.1041, + "step": 295 + }, + { + "epoch": 0.5550867323019222, + "grad_norm": 0.24733637273311615, + "learning_rate": 1.847953216374269e-05, + "loss": 0.1046, + "step": 296 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 0.26208677887916565, + "learning_rate": 1.8471734892787523e-05, + "loss": 0.0933, + "step": 297 + }, + { + "epoch": 0.5588373183309893, + "grad_norm": 0.20398394763469696, + "learning_rate": 1.846393762183236e-05, + "loss": 0.0967, + "step": 298 + }, + { + "epoch": 0.5607126113455227, + "grad_norm": 0.2356574982404709, + "learning_rate": 1.8456140350877195e-05, + "loss": 0.1627, + "step": 299 + }, + { + "epoch": 0.5625879043600562, + "grad_norm": 0.32398343086242676, + "learning_rate": 1.844834307992203e-05, + "loss": 0.1334, + "step": 300 + }, + { + "epoch": 0.5625879043600562, + "eval_loss": 0.09722544252872467, + "eval_runtime": 675.2264, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 300 + }, + { + "epoch": 0.5644631973745898, + "grad_norm": 0.25528204441070557, + "learning_rate": 1.8440545808966864e-05, + "loss": 0.1637, + "step": 301 + }, + { + "epoch": 0.5663384903891233, + "grad_norm": 0.2690048813819885, + "learning_rate": 1.8432748538011698e-05, + "loss": 0.178, + "step": 302 + }, + { + "epoch": 0.5682137834036568, + "grad_norm": 0.2270357310771942, + "learning_rate": 1.8424951267056532e-05, + "loss": 0.1252, + "step": 303 + }, + { + "epoch": 0.5700890764181903, + "grad_norm": 0.3335007131099701, + "learning_rate": 1.8417153996101366e-05, + "loss": 0.1594, + "step": 304 + }, + { + "epoch": 0.5719643694327239, + "grad_norm": 0.2951895594596863, + "learning_rate": 1.84093567251462e-05, + "loss": 0.1794, + "step": 305 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 0.19075483083724976, + "learning_rate": 1.8401559454191035e-05, + "loss": 0.1237, + "step": 306 + }, + { + "epoch": 0.5757149554617909, + "grad_norm": 0.25787994265556335, + "learning_rate": 1.839376218323587e-05, + "loss": 0.1151, + "step": 307 + }, + { + "epoch": 0.5775902484763245, + "grad_norm": 0.20336200296878815, + "learning_rate": 1.8385964912280703e-05, + "loss": 0.0893, + "step": 308 + }, + { + "epoch": 0.5794655414908579, + "grad_norm": 0.2570553123950958, + "learning_rate": 1.8378167641325537e-05, + "loss": 0.1197, + "step": 309 + }, + { + "epoch": 0.5813408345053914, + "grad_norm": 0.24566973745822906, + "learning_rate": 1.837037037037037e-05, + "loss": 0.1383, + "step": 310 + }, + { + "epoch": 0.583216127519925, + "grad_norm": 0.23258450627326965, + "learning_rate": 1.8362573099415205e-05, + "loss": 0.1182, + "step": 311 + }, + { + "epoch": 0.5850914205344585, + "grad_norm": 0.2346324771642685, + "learning_rate": 1.835477582846004e-05, + "loss": 0.128, + "step": 312 + }, + { + "epoch": 0.586966713548992, + "grad_norm": 0.21992027759552002, + "learning_rate": 1.8346978557504874e-05, + "loss": 0.0972, + "step": 313 + }, + { + "epoch": 0.5888420065635256, + "grad_norm": 0.21227417886257172, + "learning_rate": 1.833918128654971e-05, + "loss": 0.1038, + "step": 314 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 0.22464649379253387, + "learning_rate": 1.8331384015594546e-05, + "loss": 0.1452, + "step": 315 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.22292515635490417, + "learning_rate": 1.8323586744639376e-05, + "loss": 0.1343, + "step": 316 + }, + { + "epoch": 0.5944678856071262, + "grad_norm": 0.19903215765953064, + "learning_rate": 1.831578947368421e-05, + "loss": 0.144, + "step": 317 + }, + { + "epoch": 0.5963431786216596, + "grad_norm": 0.327104389667511, + "learning_rate": 1.8307992202729045e-05, + "loss": 0.1041, + "step": 318 + }, + { + "epoch": 0.5982184716361931, + "grad_norm": 0.25846344232559204, + "learning_rate": 1.830019493177388e-05, + "loss": 0.1504, + "step": 319 + }, + { + "epoch": 0.6000937646507267, + "grad_norm": 0.23256655037403107, + "learning_rate": 1.8292397660818713e-05, + "loss": 0.1239, + "step": 320 + }, + { + "epoch": 0.6019690576652602, + "grad_norm": 0.20890699326992035, + "learning_rate": 1.828460038986355e-05, + "loss": 0.112, + "step": 321 + }, + { + "epoch": 0.6038443506797937, + "grad_norm": 0.23920990526676178, + "learning_rate": 1.8276803118908385e-05, + "loss": 0.0993, + "step": 322 + }, + { + "epoch": 0.6057196436943273, + "grad_norm": 0.4082464575767517, + "learning_rate": 1.826900584795322e-05, + "loss": 0.1376, + "step": 323 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 0.30590200424194336, + "learning_rate": 1.8261208576998053e-05, + "loss": 0.0958, + "step": 324 + }, + { + "epoch": 0.6094702297233943, + "grad_norm": 0.2254190891981125, + "learning_rate": 1.8253411306042888e-05, + "loss": 0.1343, + "step": 325 + }, + { + "epoch": 0.6113455227379277, + "grad_norm": 0.3132902979850769, + "learning_rate": 1.824561403508772e-05, + "loss": 0.1007, + "step": 326 + }, + { + "epoch": 0.6132208157524613, + "grad_norm": 0.1695740818977356, + "learning_rate": 1.8237816764132556e-05, + "loss": 0.079, + "step": 327 + }, + { + "epoch": 0.6150961087669948, + "grad_norm": 0.24086889624595642, + "learning_rate": 1.823001949317739e-05, + "loss": 0.132, + "step": 328 + }, + { + "epoch": 0.6169714017815283, + "grad_norm": 0.30719810724258423, + "learning_rate": 1.8222222222222224e-05, + "loss": 0.1589, + "step": 329 + }, + { + "epoch": 0.6188466947960619, + "grad_norm": 0.2731982171535492, + "learning_rate": 1.821442495126706e-05, + "loss": 0.1366, + "step": 330 + }, + { + "epoch": 0.6207219878105954, + "grad_norm": 0.2092353105545044, + "learning_rate": 1.8206627680311893e-05, + "loss": 0.1234, + "step": 331 + }, + { + "epoch": 0.6225972808251289, + "grad_norm": 0.21123865246772766, + "learning_rate": 1.8198830409356727e-05, + "loss": 0.129, + "step": 332 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 0.3408866226673126, + "learning_rate": 1.819103313840156e-05, + "loss": 0.1415, + "step": 333 + }, + { + "epoch": 0.626347866854196, + "grad_norm": 0.22271405160427094, + "learning_rate": 1.8183235867446395e-05, + "loss": 0.0728, + "step": 334 + }, + { + "epoch": 0.6282231598687295, + "grad_norm": 0.2094489336013794, + "learning_rate": 1.817543859649123e-05, + "loss": 0.1083, + "step": 335 + }, + { + "epoch": 0.630098452883263, + "grad_norm": 0.20245423913002014, + "learning_rate": 1.8167641325536064e-05, + "loss": 0.095, + "step": 336 + }, + { + "epoch": 0.6319737458977965, + "grad_norm": 0.23320430517196655, + "learning_rate": 1.8159844054580898e-05, + "loss": 0.1613, + "step": 337 + }, + { + "epoch": 0.63384903891233, + "grad_norm": 0.2643381655216217, + "learning_rate": 1.8152046783625732e-05, + "loss": 0.1424, + "step": 338 + }, + { + "epoch": 0.6357243319268636, + "grad_norm": 0.2379055917263031, + "learning_rate": 1.8144249512670566e-05, + "loss": 0.1042, + "step": 339 + }, + { + "epoch": 0.6375996249413971, + "grad_norm": 0.21585410833358765, + "learning_rate": 1.81364522417154e-05, + "loss": 0.1092, + "step": 340 + }, + { + "epoch": 0.6394749179559306, + "grad_norm": 0.22150596976280212, + "learning_rate": 1.8128654970760235e-05, + "loss": 0.1266, + "step": 341 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 0.3069721758365631, + "learning_rate": 1.812085769980507e-05, + "loss": 0.1232, + "step": 342 + }, + { + "epoch": 0.6432255039849977, + "grad_norm": 0.22880715131759644, + "learning_rate": 1.8113060428849903e-05, + "loss": 0.1376, + "step": 343 + }, + { + "epoch": 0.6451007969995312, + "grad_norm": 0.22147296369075775, + "learning_rate": 1.810526315789474e-05, + "loss": 0.0861, + "step": 344 + }, + { + "epoch": 0.6469760900140648, + "grad_norm": 0.18794487416744232, + "learning_rate": 1.8097465886939575e-05, + "loss": 0.1047, + "step": 345 + }, + { + "epoch": 0.6488513830285982, + "grad_norm": 0.19053122401237488, + "learning_rate": 1.8089668615984406e-05, + "loss": 0.0852, + "step": 346 + }, + { + "epoch": 0.6507266760431317, + "grad_norm": 0.2369307428598404, + "learning_rate": 1.808187134502924e-05, + "loss": 0.1077, + "step": 347 + }, + { + "epoch": 0.6526019690576652, + "grad_norm": 0.2687581479549408, + "learning_rate": 1.8074074074074074e-05, + "loss": 0.1187, + "step": 348 + }, + { + "epoch": 0.6544772620721988, + "grad_norm": 0.32420602440834045, + "learning_rate": 1.8066276803118908e-05, + "loss": 0.1453, + "step": 349 + }, + { + "epoch": 0.6563525550867323, + "grad_norm": 0.2944568693637848, + "learning_rate": 1.8058479532163746e-05, + "loss": 0.0945, + "step": 350 + }, + { + "epoch": 0.6563525550867323, + "eval_loss": 0.08979390561580658, + "eval_runtime": 675.9452, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 350 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 0.21131469309329987, + "learning_rate": 1.805068226120858e-05, + "loss": 0.0927, + "step": 351 + }, + { + "epoch": 0.6601031411157994, + "grad_norm": 0.2300959676504135, + "learning_rate": 1.8042884990253414e-05, + "loss": 0.1187, + "step": 352 + }, + { + "epoch": 0.6619784341303329, + "grad_norm": 0.21290843188762665, + "learning_rate": 1.8035087719298248e-05, + "loss": 0.0714, + "step": 353 + }, + { + "epoch": 0.6638537271448663, + "grad_norm": 0.22140546143054962, + "learning_rate": 1.8027290448343082e-05, + "loss": 0.0918, + "step": 354 + }, + { + "epoch": 0.6657290201593999, + "grad_norm": 0.21354801952838898, + "learning_rate": 1.8019493177387917e-05, + "loss": 0.081, + "step": 355 + }, + { + "epoch": 0.6676043131739334, + "grad_norm": 0.25323230028152466, + "learning_rate": 1.8011695906432747e-05, + "loss": 0.1041, + "step": 356 + }, + { + "epoch": 0.6694796061884669, + "grad_norm": 0.2276594042778015, + "learning_rate": 1.8003898635477585e-05, + "loss": 0.0924, + "step": 357 + }, + { + "epoch": 0.6713548992030005, + "grad_norm": 0.35995563864707947, + "learning_rate": 1.799610136452242e-05, + "loss": 0.1373, + "step": 358 + }, + { + "epoch": 0.673230192217534, + "grad_norm": 0.2169509083032608, + "learning_rate": 1.7988304093567253e-05, + "loss": 0.1004, + "step": 359 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 0.221743643283844, + "learning_rate": 1.7980506822612088e-05, + "loss": 0.0883, + "step": 360 + }, + { + "epoch": 0.6769807782466011, + "grad_norm": 0.1859385073184967, + "learning_rate": 1.7972709551656922e-05, + "loss": 0.0863, + "step": 361 + }, + { + "epoch": 0.6788560712611346, + "grad_norm": 0.2101774513721466, + "learning_rate": 1.7964912280701756e-05, + "loss": 0.105, + "step": 362 + }, + { + "epoch": 0.680731364275668, + "grad_norm": 0.2311290204524994, + "learning_rate": 1.795711500974659e-05, + "loss": 0.096, + "step": 363 + }, + { + "epoch": 0.6826066572902016, + "grad_norm": 0.2699367105960846, + "learning_rate": 1.7949317738791424e-05, + "loss": 0.1301, + "step": 364 + }, + { + "epoch": 0.6844819503047351, + "grad_norm": 0.31696122884750366, + "learning_rate": 1.794152046783626e-05, + "loss": 0.1594, + "step": 365 + }, + { + "epoch": 0.6863572433192686, + "grad_norm": 0.31495556235313416, + "learning_rate": 1.7933723196881093e-05, + "loss": 0.094, + "step": 366 + }, + { + "epoch": 0.6882325363338021, + "grad_norm": 0.2610797882080078, + "learning_rate": 1.7925925925925927e-05, + "loss": 0.1289, + "step": 367 + }, + { + "epoch": 0.6901078293483357, + "grad_norm": 0.2139683961868286, + "learning_rate": 1.791812865497076e-05, + "loss": 0.1032, + "step": 368 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 0.2474675327539444, + "learning_rate": 1.7910331384015595e-05, + "loss": 0.088, + "step": 369 + }, + { + "epoch": 0.6938584153774027, + "grad_norm": 0.22533686459064484, + "learning_rate": 1.790253411306043e-05, + "loss": 0.0933, + "step": 370 + }, + { + "epoch": 0.6957337083919363, + "grad_norm": 0.22728174924850464, + "learning_rate": 1.7894736842105264e-05, + "loss": 0.1225, + "step": 371 + }, + { + "epoch": 0.6976090014064698, + "grad_norm": 0.23081286251544952, + "learning_rate": 1.7886939571150098e-05, + "loss": 0.1099, + "step": 372 + }, + { + "epoch": 0.6994842944210032, + "grad_norm": 0.2340676188468933, + "learning_rate": 1.7879142300194932e-05, + "loss": 0.1011, + "step": 373 + }, + { + "epoch": 0.7013595874355368, + "grad_norm": 0.2684282064437866, + "learning_rate": 1.787134502923977e-05, + "loss": 0.1667, + "step": 374 + }, + { + "epoch": 0.7032348804500703, + "grad_norm": 0.19310195744037628, + "learning_rate": 1.7863547758284604e-05, + "loss": 0.0958, + "step": 375 + }, + { + "epoch": 0.7051101734646038, + "grad_norm": 0.19192589819431305, + "learning_rate": 1.7855750487329435e-05, + "loss": 0.0769, + "step": 376 + }, + { + "epoch": 0.7069854664791374, + "grad_norm": 0.25004175305366516, + "learning_rate": 1.784795321637427e-05, + "loss": 0.1112, + "step": 377 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 0.2875761389732361, + "learning_rate": 1.7840155945419103e-05, + "loss": 0.1425, + "step": 378 + }, + { + "epoch": 0.7107360525082044, + "grad_norm": 0.2135677933692932, + "learning_rate": 1.7832358674463937e-05, + "loss": 0.0945, + "step": 379 + }, + { + "epoch": 0.712611345522738, + "grad_norm": 0.38285496830940247, + "learning_rate": 1.7824561403508775e-05, + "loss": 0.0824, + "step": 380 + }, + { + "epoch": 0.7144866385372715, + "grad_norm": 0.26780804991722107, + "learning_rate": 1.781676413255361e-05, + "loss": 0.1429, + "step": 381 + }, + { + "epoch": 0.7163619315518049, + "grad_norm": 0.2771541178226471, + "learning_rate": 1.7808966861598443e-05, + "loss": 0.1449, + "step": 382 + }, + { + "epoch": 0.7182372245663385, + "grad_norm": 0.23967161774635315, + "learning_rate": 1.7801169590643277e-05, + "loss": 0.109, + "step": 383 + }, + { + "epoch": 0.720112517580872, + "grad_norm": 0.2565579414367676, + "learning_rate": 1.779337231968811e-05, + "loss": 0.143, + "step": 384 + }, + { + "epoch": 0.7219878105954055, + "grad_norm": 0.23335972428321838, + "learning_rate": 1.7785575048732946e-05, + "loss": 0.0836, + "step": 385 + }, + { + "epoch": 0.7238631036099391, + "grad_norm": 0.21833084523677826, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.0927, + "step": 386 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 0.29433101415634155, + "learning_rate": 1.7769980506822614e-05, + "loss": 0.102, + "step": 387 + }, + { + "epoch": 0.7276136896390061, + "grad_norm": 0.23135854303836823, + "learning_rate": 1.776218323586745e-05, + "loss": 0.1139, + "step": 388 + }, + { + "epoch": 0.7294889826535396, + "grad_norm": 0.24812403321266174, + "learning_rate": 1.7754385964912283e-05, + "loss": 0.1098, + "step": 389 + }, + { + "epoch": 0.7313642756680732, + "grad_norm": 0.2173132598400116, + "learning_rate": 1.7746588693957117e-05, + "loss": 0.1089, + "step": 390 + }, + { + "epoch": 0.7332395686826066, + "grad_norm": 0.24950866401195526, + "learning_rate": 1.773879142300195e-05, + "loss": 0.0805, + "step": 391 + }, + { + "epoch": 0.7351148616971401, + "grad_norm": 0.29122406244277954, + "learning_rate": 1.7730994152046785e-05, + "loss": 0.1481, + "step": 392 + }, + { + "epoch": 0.7369901547116737, + "grad_norm": 0.2415425032377243, + "learning_rate": 1.772319688109162e-05, + "loss": 0.1364, + "step": 393 + }, + { + "epoch": 0.7388654477262072, + "grad_norm": 0.2071705311536789, + "learning_rate": 1.7715399610136454e-05, + "loss": 0.1227, + "step": 394 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.20239049196243286, + "learning_rate": 1.7707602339181288e-05, + "loss": 0.0864, + "step": 395 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 0.25537100434303284, + "learning_rate": 1.7699805068226122e-05, + "loss": 0.1278, + "step": 396 + }, + { + "epoch": 0.7444913267698078, + "grad_norm": 0.30780285596847534, + "learning_rate": 1.7692007797270956e-05, + "loss": 0.1586, + "step": 397 + }, + { + "epoch": 0.7463666197843413, + "grad_norm": 0.36921605467796326, + "learning_rate": 1.768421052631579e-05, + "loss": 0.1244, + "step": 398 + }, + { + "epoch": 0.7482419127988749, + "grad_norm": 0.2948741614818573, + "learning_rate": 1.7676413255360624e-05, + "loss": 0.1095, + "step": 399 + }, + { + "epoch": 0.7501172058134083, + "grad_norm": 0.23762205243110657, + "learning_rate": 1.766861598440546e-05, + "loss": 0.1245, + "step": 400 + }, + { + "epoch": 0.7501172058134083, + "eval_loss": 0.08483699709177017, + "eval_runtime": 675.3388, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 400 + }, + { + "epoch": 0.7519924988279418, + "grad_norm": 0.24527454376220703, + "learning_rate": 1.7660818713450293e-05, + "loss": 0.1122, + "step": 401 + }, + { + "epoch": 0.7538677918424754, + "grad_norm": 0.22449228167533875, + "learning_rate": 1.7653021442495127e-05, + "loss": 0.1044, + "step": 402 + }, + { + "epoch": 0.7557430848570089, + "grad_norm": 0.285155713558197, + "learning_rate": 1.7645224171539965e-05, + "loss": 0.1141, + "step": 403 + }, + { + "epoch": 0.7576183778715424, + "grad_norm": 0.247589111328125, + "learning_rate": 1.76374269005848e-05, + "loss": 0.1363, + "step": 404 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.26343780755996704, + "learning_rate": 1.7629629629629633e-05, + "loss": 0.1198, + "step": 405 + }, + { + "epoch": 0.7613689639006095, + "grad_norm": 0.2533006966114044, + "learning_rate": 1.7621832358674464e-05, + "loss": 0.1137, + "step": 406 + }, + { + "epoch": 0.763244256915143, + "grad_norm": 0.22146250307559967, + "learning_rate": 1.7614035087719298e-05, + "loss": 0.1152, + "step": 407 + }, + { + "epoch": 0.7651195499296765, + "grad_norm": 0.3412543535232544, + "learning_rate": 1.7606237816764132e-05, + "loss": 0.1838, + "step": 408 + }, + { + "epoch": 0.76699484294421, + "grad_norm": 0.2710413634777069, + "learning_rate": 1.7598440545808966e-05, + "loss": 0.1088, + "step": 409 + }, + { + "epoch": 0.7688701359587435, + "grad_norm": 0.2602677047252655, + "learning_rate": 1.7590643274853804e-05, + "loss": 0.135, + "step": 410 + }, + { + "epoch": 0.770745428973277, + "grad_norm": 0.22832591831684113, + "learning_rate": 1.7582846003898638e-05, + "loss": 0.099, + "step": 411 + }, + { + "epoch": 0.7726207219878106, + "grad_norm": 0.20037035644054413, + "learning_rate": 1.7575048732943472e-05, + "loss": 0.0988, + "step": 412 + }, + { + "epoch": 0.7744960150023441, + "grad_norm": 0.21016646921634674, + "learning_rate": 1.7567251461988307e-05, + "loss": 0.0859, + "step": 413 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 0.25442710518836975, + "learning_rate": 1.755945419103314e-05, + "loss": 0.1192, + "step": 414 + }, + { + "epoch": 0.7782466010314112, + "grad_norm": 0.2266324907541275, + "learning_rate": 1.7551656920077975e-05, + "loss": 0.106, + "step": 415 + }, + { + "epoch": 0.7801218940459447, + "grad_norm": 0.24464181065559387, + "learning_rate": 1.754385964912281e-05, + "loss": 0.116, + "step": 416 + }, + { + "epoch": 0.7819971870604782, + "grad_norm": 0.19099155068397522, + "learning_rate": 1.7536062378167643e-05, + "loss": 0.0759, + "step": 417 + }, + { + "epoch": 0.7838724800750118, + "grad_norm": 0.23417051136493683, + "learning_rate": 1.7528265107212477e-05, + "loss": 0.0815, + "step": 418 + }, + { + "epoch": 0.7857477730895452, + "grad_norm": 0.3102501630783081, + "learning_rate": 1.752046783625731e-05, + "loss": 0.1205, + "step": 419 + }, + { + "epoch": 0.7876230661040787, + "grad_norm": 0.27138587832450867, + "learning_rate": 1.7512670565302146e-05, + "loss": 0.0957, + "step": 420 + }, + { + "epoch": 0.7894983591186123, + "grad_norm": 0.3055587708950043, + "learning_rate": 1.750487329434698e-05, + "loss": 0.1181, + "step": 421 + }, + { + "epoch": 0.7913736521331458, + "grad_norm": 0.24075333774089813, + "learning_rate": 1.7497076023391814e-05, + "loss": 0.1192, + "step": 422 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 0.1806018054485321, + "learning_rate": 1.748927875243665e-05, + "loss": 0.0884, + "step": 423 + }, + { + "epoch": 0.7951242381622129, + "grad_norm": 0.2315697968006134, + "learning_rate": 1.7481481481481483e-05, + "loss": 0.0696, + "step": 424 + }, + { + "epoch": 0.7969995311767464, + "grad_norm": 0.23765219748020172, + "learning_rate": 1.7473684210526317e-05, + "loss": 0.1186, + "step": 425 + }, + { + "epoch": 0.7988748241912799, + "grad_norm": 0.18785423040390015, + "learning_rate": 1.746588693957115e-05, + "loss": 0.0727, + "step": 426 + }, + { + "epoch": 0.8007501172058135, + "grad_norm": 0.1897609829902649, + "learning_rate": 1.7458089668615985e-05, + "loss": 0.0926, + "step": 427 + }, + { + "epoch": 0.802625410220347, + "grad_norm": 0.28181466460227966, + "learning_rate": 1.745029239766082e-05, + "loss": 0.1083, + "step": 428 + }, + { + "epoch": 0.8045007032348804, + "grad_norm": 0.20873577892780304, + "learning_rate": 1.7442495126705654e-05, + "loss": 0.105, + "step": 429 + }, + { + "epoch": 0.8063759962494139, + "grad_norm": 0.23470522463321686, + "learning_rate": 1.7434697855750488e-05, + "loss": 0.1103, + "step": 430 + }, + { + "epoch": 0.8082512892639475, + "grad_norm": 0.26746684312820435, + "learning_rate": 1.7426900584795322e-05, + "loss": 0.1322, + "step": 431 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.2638936936855316, + "learning_rate": 1.7419103313840156e-05, + "loss": 0.1228, + "step": 432 + }, + { + "epoch": 0.8120018752930145, + "grad_norm": 0.237047016620636, + "learning_rate": 1.7411306042884994e-05, + "loss": 0.1042, + "step": 433 + }, + { + "epoch": 0.8138771683075481, + "grad_norm": 0.1885695904493332, + "learning_rate": 1.7403508771929828e-05, + "loss": 0.0723, + "step": 434 + }, + { + "epoch": 0.8157524613220816, + "grad_norm": 0.21713057160377502, + "learning_rate": 1.7395711500974662e-05, + "loss": 0.0796, + "step": 435 + }, + { + "epoch": 0.817627754336615, + "grad_norm": 0.3199704885482788, + "learning_rate": 1.7387914230019493e-05, + "loss": 0.1588, + "step": 436 + }, + { + "epoch": 0.8195030473511487, + "grad_norm": 0.3154192566871643, + "learning_rate": 1.7380116959064327e-05, + "loss": 0.1365, + "step": 437 + }, + { + "epoch": 0.8213783403656821, + "grad_norm": 0.2530040144920349, + "learning_rate": 1.737231968810916e-05, + "loss": 0.0948, + "step": 438 + }, + { + "epoch": 0.8232536333802156, + "grad_norm": 0.23530352115631104, + "learning_rate": 1.7364522417154e-05, + "loss": 0.119, + "step": 439 + }, + { + "epoch": 0.8251289263947492, + "grad_norm": 0.2744591534137726, + "learning_rate": 1.7356725146198833e-05, + "loss": 0.1177, + "step": 440 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 0.21735382080078125, + "learning_rate": 1.7348927875243667e-05, + "loss": 0.0836, + "step": 441 + }, + { + "epoch": 0.8288795124238162, + "grad_norm": 0.21691051125526428, + "learning_rate": 1.73411306042885e-05, + "loss": 0.1232, + "step": 442 + }, + { + "epoch": 0.8307548054383498, + "grad_norm": 0.26144662499427795, + "learning_rate": 1.7333333333333336e-05, + "loss": 0.115, + "step": 443 + }, + { + "epoch": 0.8326300984528833, + "grad_norm": 0.2164791226387024, + "learning_rate": 1.732553606237817e-05, + "loss": 0.1146, + "step": 444 + }, + { + "epoch": 0.8345053914674168, + "grad_norm": 0.27034950256347656, + "learning_rate": 1.7317738791423004e-05, + "loss": 0.0644, + "step": 445 + }, + { + "epoch": 0.8363806844819504, + "grad_norm": 0.19503502547740936, + "learning_rate": 1.7309941520467838e-05, + "loss": 0.0688, + "step": 446 + }, + { + "epoch": 0.8382559774964838, + "grad_norm": 0.23584571480751038, + "learning_rate": 1.7302144249512672e-05, + "loss": 0.1256, + "step": 447 + }, + { + "epoch": 0.8401312705110173, + "grad_norm": 0.2520376145839691, + "learning_rate": 1.7294346978557507e-05, + "loss": 0.1601, + "step": 448 + }, + { + "epoch": 0.8420065635255509, + "grad_norm": 0.24150918424129486, + "learning_rate": 1.728654970760234e-05, + "loss": 0.1155, + "step": 449 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 0.289753258228302, + "learning_rate": 1.7278752436647175e-05, + "loss": 0.1283, + "step": 450 + }, + { + "epoch": 0.8438818565400844, + "eval_loss": 0.08511354774236679, + "eval_runtime": 675.7661, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 450 + }, + { + "epoch": 0.8457571495546179, + "grad_norm": 0.27476435899734497, + "learning_rate": 1.727095516569201e-05, + "loss": 0.0989, + "step": 451 + }, + { + "epoch": 0.8476324425691514, + "grad_norm": 0.20759902894496918, + "learning_rate": 1.7263157894736843e-05, + "loss": 0.0963, + "step": 452 + }, + { + "epoch": 0.849507735583685, + "grad_norm": 0.3410794138908386, + "learning_rate": 1.7255360623781678e-05, + "loss": 0.1675, + "step": 453 + }, + { + "epoch": 0.8513830285982185, + "grad_norm": 0.2817666828632355, + "learning_rate": 1.7247563352826512e-05, + "loss": 0.1458, + "step": 454 + }, + { + "epoch": 0.853258321612752, + "grad_norm": 0.2924879491329193, + "learning_rate": 1.7239766081871346e-05, + "loss": 0.1393, + "step": 455 + }, + { + "epoch": 0.8551336146272855, + "grad_norm": 0.3008408844470978, + "learning_rate": 1.723196881091618e-05, + "loss": 0.1051, + "step": 456 + }, + { + "epoch": 0.857008907641819, + "grad_norm": 0.3666384518146515, + "learning_rate": 1.7224171539961014e-05, + "loss": 0.1031, + "step": 457 + }, + { + "epoch": 0.8588842006563525, + "grad_norm": 0.2442241907119751, + "learning_rate": 1.721637426900585e-05, + "loss": 0.1333, + "step": 458 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 0.3462783992290497, + "learning_rate": 1.7208576998050683e-05, + "loss": 0.115, + "step": 459 + }, + { + "epoch": 0.8626347866854196, + "grad_norm": 0.25634944438934326, + "learning_rate": 1.7200779727095517e-05, + "loss": 0.0867, + "step": 460 + }, + { + "epoch": 0.8645100796999531, + "grad_norm": 0.30906710028648376, + "learning_rate": 1.719298245614035e-05, + "loss": 0.1405, + "step": 461 + }, + { + "epoch": 0.8663853727144867, + "grad_norm": 0.2611568570137024, + "learning_rate": 1.7185185185185185e-05, + "loss": 0.1071, + "step": 462 + }, + { + "epoch": 0.8682606657290202, + "grad_norm": 0.2521813213825226, + "learning_rate": 1.7177387914230023e-05, + "loss": 0.0707, + "step": 463 + }, + { + "epoch": 0.8701359587435537, + "grad_norm": 0.25580745935440063, + "learning_rate": 1.7169590643274857e-05, + "loss": 0.0854, + "step": 464 + }, + { + "epoch": 0.8720112517580872, + "grad_norm": 0.2787662148475647, + "learning_rate": 1.716179337231969e-05, + "loss": 0.1018, + "step": 465 + }, + { + "epoch": 0.8738865447726207, + "grad_norm": 0.21670454740524292, + "learning_rate": 1.7153996101364522e-05, + "loss": 0.1214, + "step": 466 + }, + { + "epoch": 0.8757618377871542, + "grad_norm": 0.28645867109298706, + "learning_rate": 1.7146198830409356e-05, + "loss": 0.0877, + "step": 467 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 0.20811931788921356, + "learning_rate": 1.713840155945419e-05, + "loss": 0.0846, + "step": 468 + }, + { + "epoch": 0.8795124238162213, + "grad_norm": 0.301693320274353, + "learning_rate": 1.7130604288499028e-05, + "loss": 0.1215, + "step": 469 + }, + { + "epoch": 0.8813877168307548, + "grad_norm": 0.21408531069755554, + "learning_rate": 1.7122807017543862e-05, + "loss": 0.0929, + "step": 470 + }, + { + "epoch": 0.8832630098452883, + "grad_norm": 0.20539730787277222, + "learning_rate": 1.7115009746588696e-05, + "loss": 0.0689, + "step": 471 + }, + { + "epoch": 0.8851383028598219, + "grad_norm": 0.2104424089193344, + "learning_rate": 1.710721247563353e-05, + "loss": 0.0997, + "step": 472 + }, + { + "epoch": 0.8870135958743554, + "grad_norm": 0.7082309722900391, + "learning_rate": 1.7099415204678365e-05, + "loss": 0.0945, + "step": 473 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.19709636270999908, + "learning_rate": 1.70916179337232e-05, + "loss": 0.0781, + "step": 474 + }, + { + "epoch": 0.8907641819034224, + "grad_norm": 0.26817139983177185, + "learning_rate": 1.7083820662768033e-05, + "loss": 0.0823, + "step": 475 + }, + { + "epoch": 0.8926394749179559, + "grad_norm": 0.26233142614364624, + "learning_rate": 1.7076023391812867e-05, + "loss": 0.0938, + "step": 476 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 0.3621986508369446, + "learning_rate": 1.70682261208577e-05, + "loss": 0.1134, + "step": 477 + }, + { + "epoch": 0.896390060947023, + "grad_norm": 0.2838749289512634, + "learning_rate": 1.7060428849902536e-05, + "loss": 0.1347, + "step": 478 + }, + { + "epoch": 0.8982653539615565, + "grad_norm": 0.19387036561965942, + "learning_rate": 1.705263157894737e-05, + "loss": 0.065, + "step": 479 + }, + { + "epoch": 0.90014064697609, + "grad_norm": 0.20621807873249054, + "learning_rate": 1.7044834307992204e-05, + "loss": 0.0731, + "step": 480 + }, + { + "epoch": 0.9020159399906236, + "grad_norm": 0.2323528677225113, + "learning_rate": 1.7037037037037038e-05, + "loss": 0.0931, + "step": 481 + }, + { + "epoch": 0.9038912330051571, + "grad_norm": 0.20820242166519165, + "learning_rate": 1.7029239766081872e-05, + "loss": 0.0853, + "step": 482 + }, + { + "epoch": 0.9057665260196905, + "grad_norm": 0.27471327781677246, + "learning_rate": 1.7021442495126707e-05, + "loss": 0.1377, + "step": 483 + }, + { + "epoch": 0.9076418190342241, + "grad_norm": 0.22734041512012482, + "learning_rate": 1.701364522417154e-05, + "loss": 0.1169, + "step": 484 + }, + { + "epoch": 0.9095171120487576, + "grad_norm": 0.25026533007621765, + "learning_rate": 1.7005847953216375e-05, + "loss": 0.0976, + "step": 485 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 0.24165751039981842, + "learning_rate": 1.699805068226121e-05, + "loss": 0.1008, + "step": 486 + }, + { + "epoch": 0.9132676980778247, + "grad_norm": 0.24641317129135132, + "learning_rate": 1.6990253411306043e-05, + "loss": 0.1105, + "step": 487 + }, + { + "epoch": 0.9151429910923582, + "grad_norm": 0.18702614307403564, + "learning_rate": 1.6982456140350878e-05, + "loss": 0.0711, + "step": 488 + }, + { + "epoch": 0.9170182841068917, + "grad_norm": 0.31801968812942505, + "learning_rate": 1.6974658869395712e-05, + "loss": 0.1112, + "step": 489 + }, + { + "epoch": 0.9188935771214253, + "grad_norm": 0.311109721660614, + "learning_rate": 1.6966861598440546e-05, + "loss": 0.128, + "step": 490 + }, + { + "epoch": 0.9207688701359588, + "grad_norm": 0.24190711975097656, + "learning_rate": 1.695906432748538e-05, + "loss": 0.0873, + "step": 491 + }, + { + "epoch": 0.9226441631504922, + "grad_norm": 0.32772934436798096, + "learning_rate": 1.6951267056530218e-05, + "loss": 0.0979, + "step": 492 + }, + { + "epoch": 0.9245194561650257, + "grad_norm": 0.23775866627693176, + "learning_rate": 1.6943469785575052e-05, + "loss": 0.096, + "step": 493 + }, + { + "epoch": 0.9263947491795593, + "grad_norm": 0.38035061955451965, + "learning_rate": 1.6935672514619886e-05, + "loss": 0.1299, + "step": 494 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 0.25195756554603577, + "learning_rate": 1.692787524366472e-05, + "loss": 0.0958, + "step": 495 + }, + { + "epoch": 0.9301453352086263, + "grad_norm": 0.24769015610218048, + "learning_rate": 1.692007797270955e-05, + "loss": 0.1028, + "step": 496 + }, + { + "epoch": 0.9320206282231599, + "grad_norm": 0.2188967764377594, + "learning_rate": 1.6912280701754385e-05, + "loss": 0.0731, + "step": 497 + }, + { + "epoch": 0.9338959212376934, + "grad_norm": 0.2325516939163208, + "learning_rate": 1.690448343079922e-05, + "loss": 0.1144, + "step": 498 + }, + { + "epoch": 0.9357712142522269, + "grad_norm": 0.30458924174308777, + "learning_rate": 1.6896686159844057e-05, + "loss": 0.0813, + "step": 499 + }, + { + "epoch": 0.9376465072667605, + "grad_norm": 0.22726097702980042, + "learning_rate": 1.688888888888889e-05, + "loss": 0.0839, + "step": 500 + }, + { + "epoch": 0.9376465072667605, + "eval_loss": 0.08535026758909225, + "eval_runtime": 676.335, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 500 + }, + { + "epoch": 0.939521800281294, + "grad_norm": 0.2691619098186493, + "learning_rate": 1.6881091617933726e-05, + "loss": 0.1266, + "step": 501 + }, + { + "epoch": 0.9413970932958274, + "grad_norm": 0.28599363565444946, + "learning_rate": 1.687329434697856e-05, + "loss": 0.1125, + "step": 502 + }, + { + "epoch": 0.943272386310361, + "grad_norm": 0.27564743161201477, + "learning_rate": 1.6865497076023394e-05, + "loss": 0.1029, + "step": 503 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 0.23417291045188904, + "learning_rate": 1.6857699805068228e-05, + "loss": 0.087, + "step": 504 + }, + { + "epoch": 0.947022972339428, + "grad_norm": 0.2351316511631012, + "learning_rate": 1.6849902534113062e-05, + "loss": 0.1024, + "step": 505 + }, + { + "epoch": 0.9488982653539616, + "grad_norm": 0.2007894515991211, + "learning_rate": 1.6842105263157896e-05, + "loss": 0.0752, + "step": 506 + }, + { + "epoch": 0.9507735583684951, + "grad_norm": 0.2415047585964203, + "learning_rate": 1.683430799220273e-05, + "loss": 0.0912, + "step": 507 + }, + { + "epoch": 0.9526488513830286, + "grad_norm": 0.25994813442230225, + "learning_rate": 1.6826510721247565e-05, + "loss": 0.1331, + "step": 508 + }, + { + "epoch": 0.9545241443975622, + "grad_norm": 0.2513566315174103, + "learning_rate": 1.68187134502924e-05, + "loss": 0.1031, + "step": 509 + }, + { + "epoch": 0.9563994374120957, + "grad_norm": 0.2094557136297226, + "learning_rate": 1.6810916179337233e-05, + "loss": 0.0736, + "step": 510 + }, + { + "epoch": 0.9582747304266291, + "grad_norm": 0.2421177178621292, + "learning_rate": 1.6803118908382067e-05, + "loss": 0.1123, + "step": 511 + }, + { + "epoch": 0.9601500234411627, + "grad_norm": 0.2969476580619812, + "learning_rate": 1.67953216374269e-05, + "loss": 0.1175, + "step": 512 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 0.3147030770778656, + "learning_rate": 1.6787524366471736e-05, + "loss": 0.1647, + "step": 513 + }, + { + "epoch": 0.9639006094702297, + "grad_norm": 0.2841101586818695, + "learning_rate": 1.677972709551657e-05, + "loss": 0.1238, + "step": 514 + }, + { + "epoch": 0.9657759024847632, + "grad_norm": 0.28984513878822327, + "learning_rate": 1.6771929824561408e-05, + "loss": 0.1257, + "step": 515 + }, + { + "epoch": 0.9676511954992968, + "grad_norm": 0.32147666811943054, + "learning_rate": 1.676413255360624e-05, + "loss": 0.1446, + "step": 516 + }, + { + "epoch": 0.9695264885138303, + "grad_norm": 0.17480434477329254, + "learning_rate": 1.6756335282651073e-05, + "loss": 0.079, + "step": 517 + }, + { + "epoch": 0.9714017815283638, + "grad_norm": 0.2966823875904083, + "learning_rate": 1.6748538011695907e-05, + "loss": 0.1092, + "step": 518 + }, + { + "epoch": 0.9732770745428974, + "grad_norm": 0.24680288136005402, + "learning_rate": 1.674074074074074e-05, + "loss": 0.0885, + "step": 519 + }, + { + "epoch": 0.9751523675574308, + "grad_norm": 0.23044931888580322, + "learning_rate": 1.6732943469785575e-05, + "loss": 0.1615, + "step": 520 + }, + { + "epoch": 0.9770276605719643, + "grad_norm": 0.2578699588775635, + "learning_rate": 1.672514619883041e-05, + "loss": 0.1323, + "step": 521 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 0.2969329059123993, + "learning_rate": 1.6717348927875247e-05, + "loss": 0.1037, + "step": 522 + }, + { + "epoch": 0.9807782466010314, + "grad_norm": 0.2269773781299591, + "learning_rate": 1.670955165692008e-05, + "loss": 0.1047, + "step": 523 + }, + { + "epoch": 0.9826535396155649, + "grad_norm": 0.26680633425712585, + "learning_rate": 1.6701754385964915e-05, + "loss": 0.1249, + "step": 524 + }, + { + "epoch": 0.9845288326300985, + "grad_norm": 0.24191637337207794, + "learning_rate": 1.669395711500975e-05, + "loss": 0.0922, + "step": 525 + }, + { + "epoch": 0.986404125644632, + "grad_norm": 0.2023610770702362, + "learning_rate": 1.668615984405458e-05, + "loss": 0.0733, + "step": 526 + }, + { + "epoch": 0.9882794186591655, + "grad_norm": 0.34420913457870483, + "learning_rate": 1.6678362573099414e-05, + "loss": 0.0859, + "step": 527 + }, + { + "epoch": 0.9901547116736991, + "grad_norm": 0.29441869258880615, + "learning_rate": 1.6670565302144252e-05, + "loss": 0.1199, + "step": 528 + }, + { + "epoch": 0.9920300046882325, + "grad_norm": 0.20998947322368622, + "learning_rate": 1.6662768031189086e-05, + "loss": 0.0767, + "step": 529 + }, + { + "epoch": 0.993905297702766, + "grad_norm": 0.26571086049079895, + "learning_rate": 1.665497076023392e-05, + "loss": 0.1022, + "step": 530 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 0.24049051105976105, + "learning_rate": 1.6647173489278755e-05, + "loss": 0.1158, + "step": 531 + }, + { + "epoch": 0.9976558837318331, + "grad_norm": 0.3279871940612793, + "learning_rate": 1.663937621832359e-05, + "loss": 0.0895, + "step": 532 + }, + { + "epoch": 0.9995311767463666, + "grad_norm": 0.19162967801094055, + "learning_rate": 1.6631578947368423e-05, + "loss": 0.0906, + "step": 533 + }, + { + "epoch": 1.0, + "grad_norm": 0.5646827816963196, + "learning_rate": 1.6623781676413257e-05, + "loss": 0.1056, + "step": 534 + }, + { + "epoch": 1.0018752930145336, + "grad_norm": 0.22089308500289917, + "learning_rate": 1.661598440545809e-05, + "loss": 0.081, + "step": 535 + }, + { + "epoch": 1.003750586029067, + "grad_norm": 0.20040877163410187, + "learning_rate": 1.6608187134502926e-05, + "loss": 0.0746, + "step": 536 + }, + { + "epoch": 1.0056258790436006, + "grad_norm": 0.23172321915626526, + "learning_rate": 1.660038986354776e-05, + "loss": 0.1107, + "step": 537 + }, + { + "epoch": 1.0075011720581342, + "grad_norm": 0.22388094663619995, + "learning_rate": 1.6592592592592594e-05, + "loss": 0.1188, + "step": 538 + }, + { + "epoch": 1.0093764650726675, + "grad_norm": 0.20831793546676636, + "learning_rate": 1.6584795321637428e-05, + "loss": 0.0724, + "step": 539 + }, + { + "epoch": 1.0112517580872011, + "grad_norm": 0.2914353311061859, + "learning_rate": 1.6576998050682262e-05, + "loss": 0.1019, + "step": 540 + }, + { + "epoch": 1.0131270511017347, + "grad_norm": 0.2762034833431244, + "learning_rate": 1.6569200779727097e-05, + "loss": 0.095, + "step": 541 + }, + { + "epoch": 1.015002344116268, + "grad_norm": 0.22479356825351715, + "learning_rate": 1.656140350877193e-05, + "loss": 0.0883, + "step": 542 + }, + { + "epoch": 1.0168776371308017, + "grad_norm": 0.18842089176177979, + "learning_rate": 1.6553606237816765e-05, + "loss": 0.0851, + "step": 543 + }, + { + "epoch": 1.0187529301453353, + "grad_norm": 0.17845085263252258, + "learning_rate": 1.65458089668616e-05, + "loss": 0.0649, + "step": 544 + }, + { + "epoch": 1.0206282231598687, + "grad_norm": 0.22942480444908142, + "learning_rate": 1.6538011695906437e-05, + "loss": 0.0968, + "step": 545 + }, + { + "epoch": 1.0225035161744023, + "grad_norm": 0.3065761625766754, + "learning_rate": 1.6530214424951268e-05, + "loss": 0.098, + "step": 546 + }, + { + "epoch": 1.0243788091889359, + "grad_norm": 0.41475629806518555, + "learning_rate": 1.6522417153996102e-05, + "loss": 0.0882, + "step": 547 + }, + { + "epoch": 1.0262541022034692, + "grad_norm": 0.23140354454517365, + "learning_rate": 1.6514619883040936e-05, + "loss": 0.1071, + "step": 548 + }, + { + "epoch": 1.0281293952180028, + "grad_norm": 0.26001206040382385, + "learning_rate": 1.650682261208577e-05, + "loss": 0.0984, + "step": 549 + }, + { + "epoch": 1.0300046882325364, + "grad_norm": 0.27233394980430603, + "learning_rate": 1.6499025341130604e-05, + "loss": 0.1294, + "step": 550 + }, + { + "epoch": 1.0300046882325364, + "eval_loss": 0.08262171596288681, + "eval_runtime": 675.5684, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 550 + }, + { + "epoch": 1.0318799812470698, + "grad_norm": 0.28793230652809143, + "learning_rate": 1.649122807017544e-05, + "loss": 0.1191, + "step": 551 + }, + { + "epoch": 1.0337552742616034, + "grad_norm": 0.23013058304786682, + "learning_rate": 1.6483430799220276e-05, + "loss": 0.0834, + "step": 552 + }, + { + "epoch": 1.035630567276137, + "grad_norm": 0.2813061773777008, + "learning_rate": 1.647563352826511e-05, + "loss": 0.1116, + "step": 553 + }, + { + "epoch": 1.0375058602906704, + "grad_norm": 0.27876320481300354, + "learning_rate": 1.6467836257309944e-05, + "loss": 0.1027, + "step": 554 + }, + { + "epoch": 1.039381153305204, + "grad_norm": 0.2607191801071167, + "learning_rate": 1.646003898635478e-05, + "loss": 0.1119, + "step": 555 + }, + { + "epoch": 1.0412564463197373, + "grad_norm": 0.31741422414779663, + "learning_rate": 1.645224171539961e-05, + "loss": 0.1435, + "step": 556 + }, + { + "epoch": 1.043131739334271, + "grad_norm": 0.23251239955425262, + "learning_rate": 1.6444444444444444e-05, + "loss": 0.0574, + "step": 557 + }, + { + "epoch": 1.0450070323488045, + "grad_norm": 0.2328520566225052, + "learning_rate": 1.643664717348928e-05, + "loss": 0.1157, + "step": 558 + }, + { + "epoch": 1.046882325363338, + "grad_norm": 0.42328402400016785, + "learning_rate": 1.6428849902534115e-05, + "loss": 0.0988, + "step": 559 + }, + { + "epoch": 1.0487576183778715, + "grad_norm": 0.570969820022583, + "learning_rate": 1.642105263157895e-05, + "loss": 0.0954, + "step": 560 + }, + { + "epoch": 1.0506329113924051, + "grad_norm": 0.19645580649375916, + "learning_rate": 1.6413255360623784e-05, + "loss": 0.0751, + "step": 561 + }, + { + "epoch": 1.0525082044069385, + "grad_norm": 0.25933393836021423, + "learning_rate": 1.6405458089668618e-05, + "loss": 0.1031, + "step": 562 + }, + { + "epoch": 1.054383497421472, + "grad_norm": 0.2190171778202057, + "learning_rate": 1.6397660818713452e-05, + "loss": 0.0866, + "step": 563 + }, + { + "epoch": 1.0562587904360057, + "grad_norm": 0.2852676808834076, + "learning_rate": 1.6389863547758286e-05, + "loss": 0.0901, + "step": 564 + }, + { + "epoch": 1.058134083450539, + "grad_norm": 0.23386311531066895, + "learning_rate": 1.638206627680312e-05, + "loss": 0.0914, + "step": 565 + }, + { + "epoch": 1.0600093764650727, + "grad_norm": 0.23497509956359863, + "learning_rate": 1.6374269005847955e-05, + "loss": 0.0885, + "step": 566 + }, + { + "epoch": 1.0618846694796062, + "grad_norm": 0.2501085698604584, + "learning_rate": 1.636647173489279e-05, + "loss": 0.0555, + "step": 567 + }, + { + "epoch": 1.0637599624941396, + "grad_norm": 0.2505752742290497, + "learning_rate": 1.6358674463937623e-05, + "loss": 0.0836, + "step": 568 + }, + { + "epoch": 1.0656352555086732, + "grad_norm": 0.24521493911743164, + "learning_rate": 1.6350877192982457e-05, + "loss": 0.0907, + "step": 569 + }, + { + "epoch": 1.0675105485232068, + "grad_norm": 0.3060048222541809, + "learning_rate": 1.634307992202729e-05, + "loss": 0.0901, + "step": 570 + }, + { + "epoch": 1.0693858415377402, + "grad_norm": 0.2545836865901947, + "learning_rate": 1.6335282651072126e-05, + "loss": 0.1228, + "step": 571 + }, + { + "epoch": 1.0712611345522738, + "grad_norm": 0.37637385725975037, + "learning_rate": 1.632748538011696e-05, + "loss": 0.1273, + "step": 572 + }, + { + "epoch": 1.0731364275668074, + "grad_norm": 0.3298353850841522, + "learning_rate": 1.6319688109161794e-05, + "loss": 0.1297, + "step": 573 + }, + { + "epoch": 1.0750117205813408, + "grad_norm": 0.18379832804203033, + "learning_rate": 1.6311890838206628e-05, + "loss": 0.0712, + "step": 574 + }, + { + "epoch": 1.0768870135958744, + "grad_norm": 0.2380228340625763, + "learning_rate": 1.6304093567251466e-05, + "loss": 0.0645, + "step": 575 + }, + { + "epoch": 1.078762306610408, + "grad_norm": 0.38280248641967773, + "learning_rate": 1.6296296296296297e-05, + "loss": 0.1449, + "step": 576 + }, + { + "epoch": 1.0806375996249413, + "grad_norm": 0.24589546024799347, + "learning_rate": 1.628849902534113e-05, + "loss": 0.0954, + "step": 577 + }, + { + "epoch": 1.082512892639475, + "grad_norm": 0.245429128408432, + "learning_rate": 1.6280701754385965e-05, + "loss": 0.0716, + "step": 578 + }, + { + "epoch": 1.0843881856540085, + "grad_norm": 0.685253381729126, + "learning_rate": 1.62729044834308e-05, + "loss": 0.0688, + "step": 579 + }, + { + "epoch": 1.086263478668542, + "grad_norm": 0.28492575883865356, + "learning_rate": 1.6265107212475633e-05, + "loss": 0.1057, + "step": 580 + }, + { + "epoch": 1.0881387716830755, + "grad_norm": 0.2217700481414795, + "learning_rate": 1.625730994152047e-05, + "loss": 0.085, + "step": 581 + }, + { + "epoch": 1.090014064697609, + "grad_norm": 0.30696970224380493, + "learning_rate": 1.6249512670565305e-05, + "loss": 0.0976, + "step": 582 + }, + { + "epoch": 1.0918893577121425, + "grad_norm": 0.2384309470653534, + "learning_rate": 1.624171539961014e-05, + "loss": 0.0782, + "step": 583 + }, + { + "epoch": 1.093764650726676, + "grad_norm": 0.2883552610874176, + "learning_rate": 1.6233918128654974e-05, + "loss": 0.1116, + "step": 584 + }, + { + "epoch": 1.0956399437412097, + "grad_norm": 0.30440714955329895, + "learning_rate": 1.6226120857699808e-05, + "loss": 0.1204, + "step": 585 + }, + { + "epoch": 1.097515236755743, + "grad_norm": 0.9227920174598694, + "learning_rate": 1.621832358674464e-05, + "loss": 0.1289, + "step": 586 + }, + { + "epoch": 1.0993905297702766, + "grad_norm": 0.2924637198448181, + "learning_rate": 1.6210526315789473e-05, + "loss": 0.1018, + "step": 587 + }, + { + "epoch": 1.1012658227848102, + "grad_norm": 0.28547459840774536, + "learning_rate": 1.620272904483431e-05, + "loss": 0.1105, + "step": 588 + }, + { + "epoch": 1.1031411157993436, + "grad_norm": 0.2556934952735901, + "learning_rate": 1.6194931773879144e-05, + "loss": 0.0934, + "step": 589 + }, + { + "epoch": 1.1050164088138772, + "grad_norm": 0.23647433519363403, + "learning_rate": 1.618713450292398e-05, + "loss": 0.0787, + "step": 590 + }, + { + "epoch": 1.1068917018284108, + "grad_norm": 0.3279690146446228, + "learning_rate": 1.6179337231968813e-05, + "loss": 0.1185, + "step": 591 + }, + { + "epoch": 1.1087669948429442, + "grad_norm": 0.22327874600887299, + "learning_rate": 1.6171539961013647e-05, + "loss": 0.0835, + "step": 592 + }, + { + "epoch": 1.1106422878574778, + "grad_norm": 0.4429026246070862, + "learning_rate": 1.616374269005848e-05, + "loss": 0.12, + "step": 593 + }, + { + "epoch": 1.1125175808720114, + "grad_norm": 0.27783477306365967, + "learning_rate": 1.6155945419103315e-05, + "loss": 0.1396, + "step": 594 + }, + { + "epoch": 1.1143928738865447, + "grad_norm": 0.35669296979904175, + "learning_rate": 1.614814814814815e-05, + "loss": 0.0993, + "step": 595 + }, + { + "epoch": 1.1162681669010783, + "grad_norm": 0.22483180463314056, + "learning_rate": 1.6140350877192984e-05, + "loss": 0.0727, + "step": 596 + }, + { + "epoch": 1.1181434599156117, + "grad_norm": 0.2645404636859894, + "learning_rate": 1.6132553606237818e-05, + "loss": 0.1045, + "step": 597 + }, + { + "epoch": 1.1200187529301453, + "grad_norm": 0.29490724205970764, + "learning_rate": 1.6124756335282652e-05, + "loss": 0.0936, + "step": 598 + }, + { + "epoch": 1.121894045944679, + "grad_norm": 0.24751833081245422, + "learning_rate": 1.6116959064327486e-05, + "loss": 0.0827, + "step": 599 + }, + { + "epoch": 1.1237693389592125, + "grad_norm": 0.27503255009651184, + "learning_rate": 1.610916179337232e-05, + "loss": 0.1093, + "step": 600 + }, + { + "epoch": 1.1237693389592125, + "eval_loss": 0.08217223733663559, + "eval_runtime": 675.2558, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 600 + }, + { + "epoch": 1.1256446319737459, + "grad_norm": 0.30064669251441956, + "learning_rate": 1.6101364522417155e-05, + "loss": 0.0869, + "step": 601 + }, + { + "epoch": 1.1275199249882795, + "grad_norm": 0.2266855388879776, + "learning_rate": 1.609356725146199e-05, + "loss": 0.0804, + "step": 602 + }, + { + "epoch": 1.1293952180028128, + "grad_norm": 0.2856343686580658, + "learning_rate": 1.6085769980506823e-05, + "loss": 0.0893, + "step": 603 + }, + { + "epoch": 1.1312705110173464, + "grad_norm": 0.2303743064403534, + "learning_rate": 1.607797270955166e-05, + "loss": 0.0759, + "step": 604 + }, + { + "epoch": 1.13314580403188, + "grad_norm": 0.3389296531677246, + "learning_rate": 1.6070175438596495e-05, + "loss": 0.1191, + "step": 605 + }, + { + "epoch": 1.1350210970464134, + "grad_norm": 0.27957460284233093, + "learning_rate": 1.6062378167641326e-05, + "loss": 0.1205, + "step": 606 + }, + { + "epoch": 1.136896390060947, + "grad_norm": 0.24907708168029785, + "learning_rate": 1.605458089668616e-05, + "loss": 0.0942, + "step": 607 + }, + { + "epoch": 1.1387716830754806, + "grad_norm": 0.272223562002182, + "learning_rate": 1.6046783625730994e-05, + "loss": 0.1217, + "step": 608 + }, + { + "epoch": 1.140646976090014, + "grad_norm": 0.290814608335495, + "learning_rate": 1.603898635477583e-05, + "loss": 0.1062, + "step": 609 + }, + { + "epoch": 1.1425222691045476, + "grad_norm": 0.2468331754207611, + "learning_rate": 1.6031189083820663e-05, + "loss": 0.0969, + "step": 610 + }, + { + "epoch": 1.1443975621190812, + "grad_norm": 0.23120653629302979, + "learning_rate": 1.60233918128655e-05, + "loss": 0.092, + "step": 611 + }, + { + "epoch": 1.1462728551336145, + "grad_norm": 0.23789869248867035, + "learning_rate": 1.6015594541910334e-05, + "loss": 0.0992, + "step": 612 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.3250608444213867, + "learning_rate": 1.600779727095517e-05, + "loss": 0.12, + "step": 613 + }, + { + "epoch": 1.1500234411626817, + "grad_norm": 0.278442919254303, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.1042, + "step": 614 + }, + { + "epoch": 1.1518987341772151, + "grad_norm": 0.23886540532112122, + "learning_rate": 1.5992202729044833e-05, + "loss": 0.1048, + "step": 615 + }, + { + "epoch": 1.1537740271917487, + "grad_norm": 0.3647238612174988, + "learning_rate": 1.5984405458089668e-05, + "loss": 0.0651, + "step": 616 + }, + { + "epoch": 1.1556493202062823, + "grad_norm": 0.2553196847438812, + "learning_rate": 1.5976608187134505e-05, + "loss": 0.1011, + "step": 617 + }, + { + "epoch": 1.1575246132208157, + "grad_norm": 0.26290223002433777, + "learning_rate": 1.596881091617934e-05, + "loss": 0.1024, + "step": 618 + }, + { + "epoch": 1.1593999062353493, + "grad_norm": 0.3013964891433716, + "learning_rate": 1.5961013645224174e-05, + "loss": 0.1192, + "step": 619 + }, + { + "epoch": 1.1612751992498829, + "grad_norm": 0.30323684215545654, + "learning_rate": 1.5953216374269008e-05, + "loss": 0.1065, + "step": 620 + }, + { + "epoch": 1.1631504922644162, + "grad_norm": 0.28223493695259094, + "learning_rate": 1.5945419103313842e-05, + "loss": 0.1187, + "step": 621 + }, + { + "epoch": 1.1650257852789498, + "grad_norm": 0.3298370838165283, + "learning_rate": 1.5937621832358676e-05, + "loss": 0.1322, + "step": 622 + }, + { + "epoch": 1.1669010782934834, + "grad_norm": 0.2321697622537613, + "learning_rate": 1.592982456140351e-05, + "loss": 0.0828, + "step": 623 + }, + { + "epoch": 1.1687763713080168, + "grad_norm": 0.27766016125679016, + "learning_rate": 1.5922027290448345e-05, + "loss": 0.1183, + "step": 624 + }, + { + "epoch": 1.1706516643225504, + "grad_norm": 0.5540488362312317, + "learning_rate": 1.591423001949318e-05, + "loss": 0.1321, + "step": 625 + }, + { + "epoch": 1.172526957337084, + "grad_norm": 0.24459047615528107, + "learning_rate": 1.5906432748538013e-05, + "loss": 0.1013, + "step": 626 + }, + { + "epoch": 1.1744022503516174, + "grad_norm": 0.307108610868454, + "learning_rate": 1.5898635477582847e-05, + "loss": 0.1047, + "step": 627 + }, + { + "epoch": 1.176277543366151, + "grad_norm": 0.36074694991111755, + "learning_rate": 1.589083820662768e-05, + "loss": 0.0792, + "step": 628 + }, + { + "epoch": 1.1781528363806846, + "grad_norm": 0.28516885638237, + "learning_rate": 1.5883040935672516e-05, + "loss": 0.1324, + "step": 629 + }, + { + "epoch": 1.180028129395218, + "grad_norm": 0.3291719853878021, + "learning_rate": 1.587524366471735e-05, + "loss": 0.0955, + "step": 630 + }, + { + "epoch": 1.1819034224097515, + "grad_norm": 0.25097760558128357, + "learning_rate": 1.5867446393762184e-05, + "loss": 0.0849, + "step": 631 + }, + { + "epoch": 1.183778715424285, + "grad_norm": 0.2444642335176468, + "learning_rate": 1.5859649122807018e-05, + "loss": 0.0694, + "step": 632 + }, + { + "epoch": 1.1856540084388185, + "grad_norm": 0.27323952317237854, + "learning_rate": 1.5851851851851852e-05, + "loss": 0.0963, + "step": 633 + }, + { + "epoch": 1.1875293014533521, + "grad_norm": 0.23469096422195435, + "learning_rate": 1.584405458089669e-05, + "loss": 0.0764, + "step": 634 + }, + { + "epoch": 1.1894045944678857, + "grad_norm": 0.2885955572128296, + "learning_rate": 1.583625730994152e-05, + "loss": 0.0978, + "step": 635 + }, + { + "epoch": 1.191279887482419, + "grad_norm": 0.29515987634658813, + "learning_rate": 1.5828460038986355e-05, + "loss": 0.1237, + "step": 636 + }, + { + "epoch": 1.1931551804969527, + "grad_norm": 0.3769955039024353, + "learning_rate": 1.582066276803119e-05, + "loss": 0.0984, + "step": 637 + }, + { + "epoch": 1.195030473511486, + "grad_norm": 0.3882874846458435, + "learning_rate": 1.5812865497076023e-05, + "loss": 0.1113, + "step": 638 + }, + { + "epoch": 1.1969057665260197, + "grad_norm": 0.3134611248970032, + "learning_rate": 1.5805068226120857e-05, + "loss": 0.0973, + "step": 639 + }, + { + "epoch": 1.1987810595405533, + "grad_norm": 0.23223522305488586, + "learning_rate": 1.579727095516569e-05, + "loss": 0.054, + "step": 640 + }, + { + "epoch": 1.2006563525550868, + "grad_norm": 0.22435115277767181, + "learning_rate": 1.578947368421053e-05, + "loss": 0.0883, + "step": 641 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 0.2723855674266815, + "learning_rate": 1.5781676413255363e-05, + "loss": 0.1132, + "step": 642 + }, + { + "epoch": 1.2044069385841538, + "grad_norm": 0.32858818769454956, + "learning_rate": 1.5773879142300198e-05, + "loss": 0.0787, + "step": 643 + }, + { + "epoch": 1.2062822315986872, + "grad_norm": 0.25390857458114624, + "learning_rate": 1.5766081871345032e-05, + "loss": 0.0659, + "step": 644 + }, + { + "epoch": 1.2081575246132208, + "grad_norm": 0.24344322085380554, + "learning_rate": 1.5758284600389863e-05, + "loss": 0.0912, + "step": 645 + }, + { + "epoch": 1.2100328176277544, + "grad_norm": 0.2637319564819336, + "learning_rate": 1.5750487329434697e-05, + "loss": 0.0915, + "step": 646 + }, + { + "epoch": 1.2119081106422878, + "grad_norm": 0.31068336963653564, + "learning_rate": 1.5742690058479534e-05, + "loss": 0.1511, + "step": 647 + }, + { + "epoch": 1.2137834036568214, + "grad_norm": 0.27969691157341003, + "learning_rate": 1.573489278752437e-05, + "loss": 0.0742, + "step": 648 + }, + { + "epoch": 1.215658696671355, + "grad_norm": 0.31574878096580505, + "learning_rate": 1.5727095516569203e-05, + "loss": 0.0755, + "step": 649 + }, + { + "epoch": 1.2175339896858883, + "grad_norm": 0.24576275050640106, + "learning_rate": 1.5719298245614037e-05, + "loss": 0.0803, + "step": 650 + }, + { + "epoch": 1.2175339896858883, + "eval_loss": 0.07974553108215332, + "eval_runtime": 676.0594, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 650 + }, + { + "epoch": 1.219409282700422, + "grad_norm": 0.31214672327041626, + "learning_rate": 1.571150097465887e-05, + "loss": 0.1131, + "step": 651 + }, + { + "epoch": 1.2212845757149555, + "grad_norm": 0.268762469291687, + "learning_rate": 1.5703703703703705e-05, + "loss": 0.0896, + "step": 652 + }, + { + "epoch": 1.223159868729489, + "grad_norm": 0.21026861667633057, + "learning_rate": 1.569590643274854e-05, + "loss": 0.056, + "step": 653 + }, + { + "epoch": 1.2250351617440225, + "grad_norm": 0.4084342420101166, + "learning_rate": 1.5688109161793374e-05, + "loss": 0.0833, + "step": 654 + }, + { + "epoch": 1.226910454758556, + "grad_norm": 0.28843924403190613, + "learning_rate": 1.5680311890838208e-05, + "loss": 0.0888, + "step": 655 + }, + { + "epoch": 1.2287857477730895, + "grad_norm": 0.29009735584259033, + "learning_rate": 1.5672514619883042e-05, + "loss": 0.0992, + "step": 656 + }, + { + "epoch": 1.230661040787623, + "grad_norm": 0.24224917590618134, + "learning_rate": 1.5664717348927876e-05, + "loss": 0.0624, + "step": 657 + }, + { + "epoch": 1.2325363338021567, + "grad_norm": 0.29413849115371704, + "learning_rate": 1.565692007797271e-05, + "loss": 0.095, + "step": 658 + }, + { + "epoch": 1.23441162681669, + "grad_norm": 0.21053080260753632, + "learning_rate": 1.5649122807017545e-05, + "loss": 0.0631, + "step": 659 + }, + { + "epoch": 1.2362869198312236, + "grad_norm": 0.2499096691608429, + "learning_rate": 1.564132553606238e-05, + "loss": 0.0848, + "step": 660 + }, + { + "epoch": 1.2381622128457572, + "grad_norm": 0.28722044825553894, + "learning_rate": 1.5633528265107213e-05, + "loss": 0.1162, + "step": 661 + }, + { + "epoch": 1.2400375058602906, + "grad_norm": 0.3939790427684784, + "learning_rate": 1.5625730994152047e-05, + "loss": 0.0842, + "step": 662 + }, + { + "epoch": 1.2419127988748242, + "grad_norm": 0.2812120318412781, + "learning_rate": 1.561793372319688e-05, + "loss": 0.06, + "step": 663 + }, + { + "epoch": 1.2437880918893578, + "grad_norm": 0.2532125413417816, + "learning_rate": 1.561013645224172e-05, + "loss": 0.0866, + "step": 664 + }, + { + "epoch": 1.2456633849038912, + "grad_norm": 0.3428124785423279, + "learning_rate": 1.560233918128655e-05, + "loss": 0.1119, + "step": 665 + }, + { + "epoch": 1.2475386779184248, + "grad_norm": 0.3042241930961609, + "learning_rate": 1.5594541910331384e-05, + "loss": 0.1112, + "step": 666 + }, + { + "epoch": 1.2494139709329584, + "grad_norm": 0.2961670458316803, + "learning_rate": 1.5586744639376218e-05, + "loss": 0.0964, + "step": 667 + }, + { + "epoch": 1.2512892639474917, + "grad_norm": 0.403415322303772, + "learning_rate": 1.5578947368421052e-05, + "loss": 0.1721, + "step": 668 + }, + { + "epoch": 1.2531645569620253, + "grad_norm": 0.27317744493484497, + "learning_rate": 1.5571150097465887e-05, + "loss": 0.096, + "step": 669 + }, + { + "epoch": 1.255039849976559, + "grad_norm": 0.2543340027332306, + "learning_rate": 1.5563352826510724e-05, + "loss": 0.0775, + "step": 670 + }, + { + "epoch": 1.2569151429910923, + "grad_norm": 0.4965609014034271, + "learning_rate": 1.555555555555556e-05, + "loss": 0.0798, + "step": 671 + }, + { + "epoch": 1.258790436005626, + "grad_norm": 0.2571773827075958, + "learning_rate": 1.5547758284600393e-05, + "loss": 0.0865, + "step": 672 + }, + { + "epoch": 1.2606657290201593, + "grad_norm": 0.2605682909488678, + "learning_rate": 1.5539961013645227e-05, + "loss": 0.1214, + "step": 673 + }, + { + "epoch": 1.2625410220346929, + "grad_norm": 0.2747882902622223, + "learning_rate": 1.553216374269006e-05, + "loss": 0.115, + "step": 674 + }, + { + "epoch": 1.2644163150492265, + "grad_norm": 0.2293609231710434, + "learning_rate": 1.5524366471734892e-05, + "loss": 0.0743, + "step": 675 + }, + { + "epoch": 1.26629160806376, + "grad_norm": 0.3130941092967987, + "learning_rate": 1.5516569200779726e-05, + "loss": 0.0837, + "step": 676 + }, + { + "epoch": 1.2681669010782934, + "grad_norm": 0.3052816092967987, + "learning_rate": 1.5508771929824563e-05, + "loss": 0.1217, + "step": 677 + }, + { + "epoch": 1.270042194092827, + "grad_norm": 0.24826787412166595, + "learning_rate": 1.5500974658869398e-05, + "loss": 0.0881, + "step": 678 + }, + { + "epoch": 1.2719174871073604, + "grad_norm": 0.30709272623062134, + "learning_rate": 1.5493177387914232e-05, + "loss": 0.0667, + "step": 679 + }, + { + "epoch": 1.273792780121894, + "grad_norm": 0.2654632329940796, + "learning_rate": 1.5485380116959066e-05, + "loss": 0.0915, + "step": 680 + }, + { + "epoch": 1.2756680731364276, + "grad_norm": 0.24949900805950165, + "learning_rate": 1.54775828460039e-05, + "loss": 0.0738, + "step": 681 + }, + { + "epoch": 1.2775433661509612, + "grad_norm": 0.5046447515487671, + "learning_rate": 1.5469785575048734e-05, + "loss": 0.1017, + "step": 682 + }, + { + "epoch": 1.2794186591654946, + "grad_norm": 0.31709209084510803, + "learning_rate": 1.546198830409357e-05, + "loss": 0.1129, + "step": 683 + }, + { + "epoch": 1.2812939521800282, + "grad_norm": 0.30328288674354553, + "learning_rate": 1.5454191033138403e-05, + "loss": 0.1407, + "step": 684 + }, + { + "epoch": 1.2831692451945615, + "grad_norm": 0.34941598773002625, + "learning_rate": 1.5446393762183237e-05, + "loss": 0.1595, + "step": 685 + }, + { + "epoch": 1.2850445382090951, + "grad_norm": 0.25712135434150696, + "learning_rate": 1.543859649122807e-05, + "loss": 0.082, + "step": 686 + }, + { + "epoch": 1.2869198312236287, + "grad_norm": 0.5184486508369446, + "learning_rate": 1.5430799220272905e-05, + "loss": 0.0826, + "step": 687 + }, + { + "epoch": 1.2887951242381623, + "grad_norm": 0.23768484592437744, + "learning_rate": 1.542300194931774e-05, + "loss": 0.0893, + "step": 688 + }, + { + "epoch": 1.2906704172526957, + "grad_norm": 0.2733663320541382, + "learning_rate": 1.5415204678362574e-05, + "loss": 0.1009, + "step": 689 + }, + { + "epoch": 1.2925457102672293, + "grad_norm": 0.2970256805419922, + "learning_rate": 1.5407407407407408e-05, + "loss": 0.1103, + "step": 690 + }, + { + "epoch": 1.2944210032817627, + "grad_norm": 0.29735323786735535, + "learning_rate": 1.5399610136452242e-05, + "loss": 0.1185, + "step": 691 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.33100953698158264, + "learning_rate": 1.5391812865497076e-05, + "loss": 0.0833, + "step": 692 + }, + { + "epoch": 1.2981715893108299, + "grad_norm": 0.30054280161857605, + "learning_rate": 1.5384015594541914e-05, + "loss": 0.058, + "step": 693 + }, + { + "epoch": 1.3000468823253635, + "grad_norm": 0.2017904669046402, + "learning_rate": 1.5376218323586748e-05, + "loss": 0.0716, + "step": 694 + }, + { + "epoch": 1.3019221753398968, + "grad_norm": 0.2905302047729492, + "learning_rate": 1.536842105263158e-05, + "loss": 0.1068, + "step": 695 + }, + { + "epoch": 1.3037974683544304, + "grad_norm": 0.26433658599853516, + "learning_rate": 1.5360623781676413e-05, + "loss": 0.0923, + "step": 696 + }, + { + "epoch": 1.3056727613689638, + "grad_norm": 0.24136176705360413, + "learning_rate": 1.5352826510721247e-05, + "loss": 0.0975, + "step": 697 + }, + { + "epoch": 1.3075480543834974, + "grad_norm": 0.2888275384902954, + "learning_rate": 1.534502923976608e-05, + "loss": 0.0736, + "step": 698 + }, + { + "epoch": 1.309423347398031, + "grad_norm": 0.28614625334739685, + "learning_rate": 1.5337231968810916e-05, + "loss": 0.0757, + "step": 699 + }, + { + "epoch": 1.3112986404125644, + "grad_norm": 1.1438792943954468, + "learning_rate": 1.5329434697855753e-05, + "loss": 0.1105, + "step": 700 + }, + { + "epoch": 1.3112986404125644, + "eval_loss": 0.08180084824562073, + "eval_runtime": 675.7061, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 700 + }, + { + "epoch": 1.313173933427098, + "grad_norm": 0.24729672074317932, + "learning_rate": 1.5321637426900587e-05, + "loss": 0.1244, + "step": 701 + }, + { + "epoch": 1.3150492264416316, + "grad_norm": 0.6661401391029358, + "learning_rate": 1.531384015594542e-05, + "loss": 0.0465, + "step": 702 + }, + { + "epoch": 1.316924519456165, + "grad_norm": 0.2756960391998291, + "learning_rate": 1.5306042884990256e-05, + "loss": 0.0845, + "step": 703 + }, + { + "epoch": 1.3187998124706986, + "grad_norm": 0.2175983488559723, + "learning_rate": 1.529824561403509e-05, + "loss": 0.0874, + "step": 704 + }, + { + "epoch": 1.3206751054852321, + "grad_norm": 0.26682281494140625, + "learning_rate": 1.529044834307992e-05, + "loss": 0.0785, + "step": 705 + }, + { + "epoch": 1.3225503984997655, + "grad_norm": 0.28904491662979126, + "learning_rate": 1.528265107212476e-05, + "loss": 0.0812, + "step": 706 + }, + { + "epoch": 1.3244256915142991, + "grad_norm": 0.37207451462745667, + "learning_rate": 1.5274853801169593e-05, + "loss": 0.118, + "step": 707 + }, + { + "epoch": 1.3263009845288325, + "grad_norm": 0.2251625806093216, + "learning_rate": 1.5267056530214427e-05, + "loss": 0.088, + "step": 708 + }, + { + "epoch": 1.328176277543366, + "grad_norm": 0.2839013934135437, + "learning_rate": 1.525925925925926e-05, + "loss": 0.089, + "step": 709 + }, + { + "epoch": 1.3300515705578997, + "grad_norm": 0.2906053960323334, + "learning_rate": 1.5251461988304095e-05, + "loss": 0.1056, + "step": 710 + }, + { + "epoch": 1.3319268635724333, + "grad_norm": 0.29006117582321167, + "learning_rate": 1.524366471734893e-05, + "loss": 0.1284, + "step": 711 + }, + { + "epoch": 1.3338021565869667, + "grad_norm": 0.23621053993701935, + "learning_rate": 1.5235867446393764e-05, + "loss": 0.0831, + "step": 712 + }, + { + "epoch": 1.3356774496015003, + "grad_norm": 0.2675006091594696, + "learning_rate": 1.5228070175438598e-05, + "loss": 0.0979, + "step": 713 + }, + { + "epoch": 1.3375527426160336, + "grad_norm": 0.25027772784233093, + "learning_rate": 1.5220272904483434e-05, + "loss": 0.0989, + "step": 714 + }, + { + "epoch": 1.3394280356305672, + "grad_norm": 0.3723505437374115, + "learning_rate": 1.5212475633528266e-05, + "loss": 0.1134, + "step": 715 + }, + { + "epoch": 1.3413033286451008, + "grad_norm": 0.2928392291069031, + "learning_rate": 1.52046783625731e-05, + "loss": 0.1118, + "step": 716 + }, + { + "epoch": 1.3431786216596344, + "grad_norm": 0.23428718745708466, + "learning_rate": 1.5196881091617935e-05, + "loss": 0.056, + "step": 717 + }, + { + "epoch": 1.3450539146741678, + "grad_norm": 0.23713499307632446, + "learning_rate": 1.5189083820662769e-05, + "loss": 0.0812, + "step": 718 + }, + { + "epoch": 1.3469292076887014, + "grad_norm": 0.27535611391067505, + "learning_rate": 1.5181286549707603e-05, + "loss": 0.0926, + "step": 719 + }, + { + "epoch": 1.3488045007032348, + "grad_norm": 0.29593226313591003, + "learning_rate": 1.5173489278752439e-05, + "loss": 0.1266, + "step": 720 + }, + { + "epoch": 1.3506797937177684, + "grad_norm": 0.32769280672073364, + "learning_rate": 1.5165692007797273e-05, + "loss": 0.1462, + "step": 721 + }, + { + "epoch": 1.352555086732302, + "grad_norm": 0.3472793996334076, + "learning_rate": 1.5157894736842107e-05, + "loss": 0.1547, + "step": 722 + }, + { + "epoch": 1.3544303797468356, + "grad_norm": 0.2513481378555298, + "learning_rate": 1.5150097465886941e-05, + "loss": 0.093, + "step": 723 + }, + { + "epoch": 1.356305672761369, + "grad_norm": 0.2549232244491577, + "learning_rate": 1.5142300194931776e-05, + "loss": 0.0858, + "step": 724 + }, + { + "epoch": 1.3581809657759025, + "grad_norm": 0.33557018637657166, + "learning_rate": 1.5134502923976608e-05, + "loss": 0.1245, + "step": 725 + }, + { + "epoch": 1.360056258790436, + "grad_norm": 0.2504093647003174, + "learning_rate": 1.5126705653021442e-05, + "loss": 0.0764, + "step": 726 + }, + { + "epoch": 1.3619315518049695, + "grad_norm": 0.20211853086948395, + "learning_rate": 1.5118908382066278e-05, + "loss": 0.0634, + "step": 727 + }, + { + "epoch": 1.363806844819503, + "grad_norm": 0.28420448303222656, + "learning_rate": 1.5111111111111112e-05, + "loss": 0.1096, + "step": 728 + }, + { + "epoch": 1.3656821378340367, + "grad_norm": 0.22307127714157104, + "learning_rate": 1.5103313840155947e-05, + "loss": 0.0603, + "step": 729 + }, + { + "epoch": 1.36755743084857, + "grad_norm": 0.21484006941318512, + "learning_rate": 1.509551656920078e-05, + "loss": 0.0784, + "step": 730 + }, + { + "epoch": 1.3694327238631037, + "grad_norm": 0.23451459407806396, + "learning_rate": 1.5087719298245615e-05, + "loss": 0.1042, + "step": 731 + }, + { + "epoch": 1.371308016877637, + "grad_norm": 0.3075193166732788, + "learning_rate": 1.507992202729045e-05, + "loss": 0.0924, + "step": 732 + }, + { + "epoch": 1.3731833098921706, + "grad_norm": 0.25598907470703125, + "learning_rate": 1.5072124756335285e-05, + "loss": 0.0911, + "step": 733 + }, + { + "epoch": 1.3750586029067042, + "grad_norm": 0.25740352272987366, + "learning_rate": 1.5064327485380119e-05, + "loss": 0.1088, + "step": 734 + }, + { + "epoch": 1.3769338959212378, + "grad_norm": 0.3566450774669647, + "learning_rate": 1.5056530214424952e-05, + "loss": 0.1148, + "step": 735 + }, + { + "epoch": 1.3788091889357712, + "grad_norm": 0.21709580719470978, + "learning_rate": 1.5048732943469786e-05, + "loss": 0.0762, + "step": 736 + }, + { + "epoch": 1.3806844819503048, + "grad_norm": 0.36706310510635376, + "learning_rate": 1.504093567251462e-05, + "loss": 0.0655, + "step": 737 + }, + { + "epoch": 1.3825597749648382, + "grad_norm": 0.2232268899679184, + "learning_rate": 1.5033138401559454e-05, + "loss": 0.0777, + "step": 738 + }, + { + "epoch": 1.3844350679793718, + "grad_norm": 0.4871269762516022, + "learning_rate": 1.502534113060429e-05, + "loss": 0.1297, + "step": 739 + }, + { + "epoch": 1.3863103609939054, + "grad_norm": 0.32851436734199524, + "learning_rate": 1.5017543859649124e-05, + "loss": 0.1113, + "step": 740 + }, + { + "epoch": 1.3881856540084387, + "grad_norm": 0.28122517466545105, + "learning_rate": 1.5009746588693958e-05, + "loss": 0.1021, + "step": 741 + }, + { + "epoch": 1.3900609470229723, + "grad_norm": 0.2887057065963745, + "learning_rate": 1.5001949317738793e-05, + "loss": 0.0871, + "step": 742 + }, + { + "epoch": 1.391936240037506, + "grad_norm": 0.2595096230506897, + "learning_rate": 1.4994152046783627e-05, + "loss": 0.0708, + "step": 743 + }, + { + "epoch": 1.3938115330520393, + "grad_norm": 0.232121542096138, + "learning_rate": 1.4986354775828463e-05, + "loss": 0.0727, + "step": 744 + }, + { + "epoch": 1.395686826066573, + "grad_norm": 0.2849893569946289, + "learning_rate": 1.4978557504873295e-05, + "loss": 0.0861, + "step": 745 + }, + { + "epoch": 1.3975621190811065, + "grad_norm": 0.2730804979801178, + "learning_rate": 1.497076023391813e-05, + "loss": 0.0973, + "step": 746 + }, + { + "epoch": 1.3994374120956399, + "grad_norm": 0.24728837609291077, + "learning_rate": 1.4962962962962964e-05, + "loss": 0.0867, + "step": 747 + }, + { + "epoch": 1.4013127051101735, + "grad_norm": 0.2925151288509369, + "learning_rate": 1.4955165692007798e-05, + "loss": 0.135, + "step": 748 + }, + { + "epoch": 1.4031879981247068, + "grad_norm": 0.35172486305236816, + "learning_rate": 1.4947368421052632e-05, + "loss": 0.1115, + "step": 749 + }, + { + "epoch": 1.4050632911392404, + "grad_norm": 0.24151886999607086, + "learning_rate": 1.4939571150097468e-05, + "loss": 0.0892, + "step": 750 + }, + { + "epoch": 1.4050632911392404, + "eval_loss": 0.07957883179187775, + "eval_runtime": 676.5902, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 750 + }, + { + "epoch": 1.406938584153774, + "grad_norm": 0.2599674463272095, + "learning_rate": 1.4931773879142302e-05, + "loss": 0.073, + "step": 751 + }, + { + "epoch": 1.4088138771683076, + "grad_norm": 0.2866610586643219, + "learning_rate": 1.4923976608187136e-05, + "loss": 0.0743, + "step": 752 + }, + { + "epoch": 1.410689170182841, + "grad_norm": 0.3928741216659546, + "learning_rate": 1.491617933723197e-05, + "loss": 0.1462, + "step": 753 + }, + { + "epoch": 1.4125644631973746, + "grad_norm": 0.38174375891685486, + "learning_rate": 1.4908382066276805e-05, + "loss": 0.1081, + "step": 754 + }, + { + "epoch": 1.414439756211908, + "grad_norm": 0.2848498821258545, + "learning_rate": 1.4900584795321637e-05, + "loss": 0.1071, + "step": 755 + }, + { + "epoch": 1.4163150492264416, + "grad_norm": 0.26754605770111084, + "learning_rate": 1.4892787524366471e-05, + "loss": 0.0775, + "step": 756 + }, + { + "epoch": 1.4181903422409752, + "grad_norm": 0.3248796761035919, + "learning_rate": 1.4884990253411307e-05, + "loss": 0.1093, + "step": 757 + }, + { + "epoch": 1.4200656352555088, + "grad_norm": 0.2621667981147766, + "learning_rate": 1.4877192982456141e-05, + "loss": 0.0846, + "step": 758 + }, + { + "epoch": 1.4219409282700421, + "grad_norm": 0.3511008024215698, + "learning_rate": 1.4869395711500976e-05, + "loss": 0.1343, + "step": 759 + }, + { + "epoch": 1.4238162212845757, + "grad_norm": 0.23400309681892395, + "learning_rate": 1.486159844054581e-05, + "loss": 0.0795, + "step": 760 + }, + { + "epoch": 1.4256915142991091, + "grad_norm": 0.3075447678565979, + "learning_rate": 1.4853801169590644e-05, + "loss": 0.1026, + "step": 761 + }, + { + "epoch": 1.4275668073136427, + "grad_norm": 0.2749321162700653, + "learning_rate": 1.484600389863548e-05, + "loss": 0.0991, + "step": 762 + }, + { + "epoch": 1.4294421003281763, + "grad_norm": 0.29317227005958557, + "learning_rate": 1.4838206627680314e-05, + "loss": 0.0857, + "step": 763 + }, + { + "epoch": 1.43131739334271, + "grad_norm": 0.3031213879585266, + "learning_rate": 1.4830409356725148e-05, + "loss": 0.1196, + "step": 764 + }, + { + "epoch": 1.4331926863572433, + "grad_norm": 0.2685413658618927, + "learning_rate": 1.482261208576998e-05, + "loss": 0.0958, + "step": 765 + }, + { + "epoch": 1.4350679793717769, + "grad_norm": 0.28609031438827515, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.1317, + "step": 766 + }, + { + "epoch": 1.4369432723863103, + "grad_norm": 0.2767196595668793, + "learning_rate": 1.4807017543859649e-05, + "loss": 0.0985, + "step": 767 + }, + { + "epoch": 1.4388185654008439, + "grad_norm": 0.29232558608055115, + "learning_rate": 1.4799220272904485e-05, + "loss": 0.1096, + "step": 768 + }, + { + "epoch": 1.4406938584153774, + "grad_norm": 0.2960546016693115, + "learning_rate": 1.479142300194932e-05, + "loss": 0.0876, + "step": 769 + }, + { + "epoch": 1.442569151429911, + "grad_norm": 0.43392062187194824, + "learning_rate": 1.4783625730994153e-05, + "loss": 0.1089, + "step": 770 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.23281604051589966, + "learning_rate": 1.4775828460038988e-05, + "loss": 0.0941, + "step": 771 + }, + { + "epoch": 1.446319737458978, + "grad_norm": 0.293990820646286, + "learning_rate": 1.4768031189083822e-05, + "loss": 0.1161, + "step": 772 + }, + { + "epoch": 1.4481950304735114, + "grad_norm": 0.33511075377464294, + "learning_rate": 1.4760233918128658e-05, + "loss": 0.0917, + "step": 773 + }, + { + "epoch": 1.450070323488045, + "grad_norm": 0.32499679923057556, + "learning_rate": 1.4752436647173492e-05, + "loss": 0.0784, + "step": 774 + }, + { + "epoch": 1.4519456165025786, + "grad_norm": 0.27234625816345215, + "learning_rate": 1.4744639376218324e-05, + "loss": 0.0876, + "step": 775 + }, + { + "epoch": 1.4538209095171122, + "grad_norm": 0.38885700702667236, + "learning_rate": 1.4736842105263159e-05, + "loss": 0.1354, + "step": 776 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 0.28798046708106995, + "learning_rate": 1.4729044834307993e-05, + "loss": 0.0782, + "step": 777 + }, + { + "epoch": 1.4575714955461792, + "grad_norm": 0.2976330518722534, + "learning_rate": 1.4721247563352827e-05, + "loss": 0.0898, + "step": 778 + }, + { + "epoch": 1.4594467885607125, + "grad_norm": 0.31952551007270813, + "learning_rate": 1.4713450292397661e-05, + "loss": 0.128, + "step": 779 + }, + { + "epoch": 1.4613220815752461, + "grad_norm": 0.27772435545921326, + "learning_rate": 1.4705653021442497e-05, + "loss": 0.0893, + "step": 780 + }, + { + "epoch": 1.4631973745897797, + "grad_norm": 0.24229328334331512, + "learning_rate": 1.4697855750487331e-05, + "loss": 0.0684, + "step": 781 + }, + { + "epoch": 1.465072667604313, + "grad_norm": 0.2713194489479065, + "learning_rate": 1.4690058479532165e-05, + "loss": 0.0987, + "step": 782 + }, + { + "epoch": 1.4669479606188467, + "grad_norm": 0.37047621607780457, + "learning_rate": 1.4682261208577e-05, + "loss": 0.0901, + "step": 783 + }, + { + "epoch": 1.4688232536333803, + "grad_norm": 0.5432588458061218, + "learning_rate": 1.4674463937621834e-05, + "loss": 0.1067, + "step": 784 + }, + { + "epoch": 1.4706985466479137, + "grad_norm": 0.26890650391578674, + "learning_rate": 1.4666666666666666e-05, + "loss": 0.0755, + "step": 785 + }, + { + "epoch": 1.4725738396624473, + "grad_norm": 0.30856049060821533, + "learning_rate": 1.4658869395711502e-05, + "loss": 0.0889, + "step": 786 + }, + { + "epoch": 1.4744491326769809, + "grad_norm": 0.2999826669692993, + "learning_rate": 1.4651072124756336e-05, + "loss": 0.134, + "step": 787 + }, + { + "epoch": 1.4763244256915142, + "grad_norm": 0.23444703221321106, + "learning_rate": 1.464327485380117e-05, + "loss": 0.0899, + "step": 788 + }, + { + "epoch": 1.4781997187060478, + "grad_norm": 0.25326988101005554, + "learning_rate": 1.4635477582846005e-05, + "loss": 0.0889, + "step": 789 + }, + { + "epoch": 1.4800750117205814, + "grad_norm": 0.4101651906967163, + "learning_rate": 1.4627680311890839e-05, + "loss": 0.0708, + "step": 790 + }, + { + "epoch": 1.4819503047351148, + "grad_norm": 0.3386366069316864, + "learning_rate": 1.4619883040935675e-05, + "loss": 0.079, + "step": 791 + }, + { + "epoch": 1.4838255977496484, + "grad_norm": 0.2348432093858719, + "learning_rate": 1.4612085769980509e-05, + "loss": 0.071, + "step": 792 + }, + { + "epoch": 1.485700890764182, + "grad_norm": 0.2450290322303772, + "learning_rate": 1.4604288499025343e-05, + "loss": 0.0811, + "step": 793 + }, + { + "epoch": 1.4875761837787154, + "grad_norm": 0.30053895711898804, + "learning_rate": 1.4596491228070177e-05, + "loss": 0.092, + "step": 794 + }, + { + "epoch": 1.489451476793249, + "grad_norm": 0.3324088752269745, + "learning_rate": 1.458869395711501e-05, + "loss": 0.1003, + "step": 795 + }, + { + "epoch": 1.4913267698077823, + "grad_norm": 0.31995347142219543, + "learning_rate": 1.4580896686159844e-05, + "loss": 0.1084, + "step": 796 + }, + { + "epoch": 1.493202062822316, + "grad_norm": 0.2602950930595398, + "learning_rate": 1.4573099415204678e-05, + "loss": 0.0751, + "step": 797 + }, + { + "epoch": 1.4950773558368495, + "grad_norm": 0.2805696725845337, + "learning_rate": 1.4565302144249514e-05, + "loss": 0.1025, + "step": 798 + }, + { + "epoch": 1.4969526488513831, + "grad_norm": 0.29233241081237793, + "learning_rate": 1.4557504873294348e-05, + "loss": 0.1161, + "step": 799 + }, + { + "epoch": 1.4988279418659165, + "grad_norm": 0.31460389494895935, + "learning_rate": 1.4549707602339183e-05, + "loss": 0.0975, + "step": 800 + }, + { + "epoch": 1.4988279418659165, + "eval_loss": 0.07664016634225845, + "eval_runtime": 675.2627, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 800 + }, + { + "epoch": 1.50070323488045, + "grad_norm": 0.3660965859889984, + "learning_rate": 1.4541910331384017e-05, + "loss": 0.1098, + "step": 801 + }, + { + "epoch": 1.5025785278949835, + "grad_norm": 0.26959264278411865, + "learning_rate": 1.4534113060428851e-05, + "loss": 0.0855, + "step": 802 + }, + { + "epoch": 1.504453820909517, + "grad_norm": 0.3515307605266571, + "learning_rate": 1.4526315789473687e-05, + "loss": 0.1272, + "step": 803 + }, + { + "epoch": 1.5063291139240507, + "grad_norm": 0.2386811077594757, + "learning_rate": 1.4518518518518521e-05, + "loss": 0.0612, + "step": 804 + }, + { + "epoch": 1.5082044069385843, + "grad_norm": 0.34232282638549805, + "learning_rate": 1.4510721247563353e-05, + "loss": 0.061, + "step": 805 + }, + { + "epoch": 1.5100796999531176, + "grad_norm": 0.6131661534309387, + "learning_rate": 1.4502923976608188e-05, + "loss": 0.0817, + "step": 806 + }, + { + "epoch": 1.5119549929676512, + "grad_norm": 0.20144400000572205, + "learning_rate": 1.4495126705653022e-05, + "loss": 0.0456, + "step": 807 + }, + { + "epoch": 1.5138302859821846, + "grad_norm": 0.2816990315914154, + "learning_rate": 1.4487329434697856e-05, + "loss": 0.0858, + "step": 808 + }, + { + "epoch": 1.5157055789967182, + "grad_norm": 0.3411908447742462, + "learning_rate": 1.447953216374269e-05, + "loss": 0.0866, + "step": 809 + }, + { + "epoch": 1.5175808720112518, + "grad_norm": 0.2759881615638733, + "learning_rate": 1.4471734892787526e-05, + "loss": 0.0947, + "step": 810 + }, + { + "epoch": 1.5194561650257854, + "grad_norm": 0.26385390758514404, + "learning_rate": 1.446393762183236e-05, + "loss": 0.0853, + "step": 811 + }, + { + "epoch": 1.5213314580403188, + "grad_norm": 0.20965765416622162, + "learning_rate": 1.4456140350877195e-05, + "loss": 0.0716, + "step": 812 + }, + { + "epoch": 1.5232067510548524, + "grad_norm": 0.33767369389533997, + "learning_rate": 1.4448343079922029e-05, + "loss": 0.1221, + "step": 813 + }, + { + "epoch": 1.5250820440693857, + "grad_norm": 0.9217659831047058, + "learning_rate": 1.4440545808966863e-05, + "loss": 0.0984, + "step": 814 + }, + { + "epoch": 1.5269573370839193, + "grad_norm": 0.2534090280532837, + "learning_rate": 1.4432748538011695e-05, + "loss": 0.0744, + "step": 815 + }, + { + "epoch": 1.528832630098453, + "grad_norm": 0.27146193385124207, + "learning_rate": 1.4424951267056531e-05, + "loss": 0.0876, + "step": 816 + }, + { + "epoch": 1.5307079231129865, + "grad_norm": 0.23886770009994507, + "learning_rate": 1.4417153996101365e-05, + "loss": 0.0823, + "step": 817 + }, + { + "epoch": 1.53258321612752, + "grad_norm": 0.29205018281936646, + "learning_rate": 1.44093567251462e-05, + "loss": 0.1128, + "step": 818 + }, + { + "epoch": 1.5344585091420533, + "grad_norm": 0.2402828186750412, + "learning_rate": 1.4401559454191034e-05, + "loss": 0.0853, + "step": 819 + }, + { + "epoch": 1.5363338021565869, + "grad_norm": 0.28632691502571106, + "learning_rate": 1.4393762183235868e-05, + "loss": 0.0977, + "step": 820 + }, + { + "epoch": 1.5382090951711205, + "grad_norm": 0.296055406332016, + "learning_rate": 1.4385964912280704e-05, + "loss": 0.1045, + "step": 821 + }, + { + "epoch": 1.540084388185654, + "grad_norm": 0.2865302264690399, + "learning_rate": 1.4378167641325538e-05, + "loss": 0.0779, + "step": 822 + }, + { + "epoch": 1.5419596812001877, + "grad_norm": 0.25754040479660034, + "learning_rate": 1.4370370370370372e-05, + "loss": 0.0892, + "step": 823 + }, + { + "epoch": 1.543834974214721, + "grad_norm": 0.2635495960712433, + "learning_rate": 1.4362573099415207e-05, + "loss": 0.0716, + "step": 824 + }, + { + "epoch": 1.5457102672292544, + "grad_norm": 0.42626357078552246, + "learning_rate": 1.4354775828460039e-05, + "loss": 0.1188, + "step": 825 + }, + { + "epoch": 1.547585560243788, + "grad_norm": 0.2752715051174164, + "learning_rate": 1.4346978557504873e-05, + "loss": 0.0927, + "step": 826 + }, + { + "epoch": 1.5494608532583216, + "grad_norm": 0.20132949948310852, + "learning_rate": 1.4339181286549707e-05, + "loss": 0.0541, + "step": 827 + }, + { + "epoch": 1.5513361462728552, + "grad_norm": 0.3923582434654236, + "learning_rate": 1.4331384015594543e-05, + "loss": 0.0871, + "step": 828 + }, + { + "epoch": 1.5532114392873888, + "grad_norm": 0.2865050137042999, + "learning_rate": 1.4323586744639377e-05, + "loss": 0.0802, + "step": 829 + }, + { + "epoch": 1.5550867323019222, + "grad_norm": 0.2447250485420227, + "learning_rate": 1.4315789473684212e-05, + "loss": 0.0699, + "step": 830 + }, + { + "epoch": 1.5569620253164556, + "grad_norm": 0.2569245398044586, + "learning_rate": 1.4307992202729046e-05, + "loss": 0.0736, + "step": 831 + }, + { + "epoch": 1.5588373183309892, + "grad_norm": 0.27975255250930786, + "learning_rate": 1.430019493177388e-05, + "loss": 0.0841, + "step": 832 + }, + { + "epoch": 1.5607126113455227, + "grad_norm": 0.246305450797081, + "learning_rate": 1.4292397660818716e-05, + "loss": 0.0722, + "step": 833 + }, + { + "epoch": 1.5625879043600563, + "grad_norm": 0.3482731580734253, + "learning_rate": 1.428460038986355e-05, + "loss": 0.1454, + "step": 834 + }, + { + "epoch": 1.56446319737459, + "grad_norm": 0.8472810983657837, + "learning_rate": 1.4276803118908383e-05, + "loss": 0.1386, + "step": 835 + }, + { + "epoch": 1.5663384903891233, + "grad_norm": 0.2856524884700775, + "learning_rate": 1.4269005847953217e-05, + "loss": 0.0976, + "step": 836 + }, + { + "epoch": 1.5682137834036567, + "grad_norm": 0.22626835107803345, + "learning_rate": 1.4261208576998051e-05, + "loss": 0.07, + "step": 837 + }, + { + "epoch": 1.5700890764181903, + "grad_norm": 0.28921768069267273, + "learning_rate": 1.4253411306042885e-05, + "loss": 0.0653, + "step": 838 + }, + { + "epoch": 1.5719643694327239, + "grad_norm": 0.3061649799346924, + "learning_rate": 1.4245614035087721e-05, + "loss": 0.1024, + "step": 839 + }, + { + "epoch": 1.5738396624472575, + "grad_norm": 0.23194481432437897, + "learning_rate": 1.4237816764132555e-05, + "loss": 0.0542, + "step": 840 + }, + { + "epoch": 1.5757149554617909, + "grad_norm": 0.21323058009147644, + "learning_rate": 1.423001949317739e-05, + "loss": 0.0413, + "step": 841 + }, + { + "epoch": 1.5775902484763245, + "grad_norm": 0.6626904010772705, + "learning_rate": 1.4222222222222224e-05, + "loss": 0.061, + "step": 842 + }, + { + "epoch": 1.5794655414908578, + "grad_norm": 0.279397577047348, + "learning_rate": 1.4214424951267058e-05, + "loss": 0.069, + "step": 843 + }, + { + "epoch": 1.5813408345053914, + "grad_norm": 0.35950684547424316, + "learning_rate": 1.4206627680311894e-05, + "loss": 0.0628, + "step": 844 + }, + { + "epoch": 1.583216127519925, + "grad_norm": 0.35051649808883667, + "learning_rate": 1.4198830409356725e-05, + "loss": 0.1164, + "step": 845 + }, + { + "epoch": 1.5850914205344586, + "grad_norm": 0.23908878862857819, + "learning_rate": 1.419103313840156e-05, + "loss": 0.0745, + "step": 846 + }, + { + "epoch": 1.586966713548992, + "grad_norm": 0.293130487203598, + "learning_rate": 1.4183235867446395e-05, + "loss": 0.0875, + "step": 847 + }, + { + "epoch": 1.5888420065635256, + "grad_norm": 0.2919827699661255, + "learning_rate": 1.4175438596491229e-05, + "loss": 0.0886, + "step": 848 + }, + { + "epoch": 1.590717299578059, + "grad_norm": 0.3345141112804413, + "learning_rate": 1.4167641325536063e-05, + "loss": 0.1071, + "step": 849 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.6707221269607544, + "learning_rate": 1.4159844054580897e-05, + "loss": 0.1002, + "step": 850 + }, + { + "epoch": 1.5925925925925926, + "eval_loss": 0.07790966331958771, + "eval_runtime": 676.4736, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 850 + }, + { + "epoch": 1.5944678856071262, + "grad_norm": 0.27356359362602234, + "learning_rate": 1.4152046783625733e-05, + "loss": 0.0801, + "step": 851 + }, + { + "epoch": 1.5963431786216598, + "grad_norm": 0.3435034155845642, + "learning_rate": 1.4144249512670567e-05, + "loss": 0.1108, + "step": 852 + }, + { + "epoch": 1.5982184716361931, + "grad_norm": 0.32657772302627563, + "learning_rate": 1.4136452241715401e-05, + "loss": 0.113, + "step": 853 + }, + { + "epoch": 1.6000937646507267, + "grad_norm": 0.3669753074645996, + "learning_rate": 1.4128654970760236e-05, + "loss": 0.1343, + "step": 854 + }, + { + "epoch": 1.60196905766526, + "grad_norm": 0.23169514536857605, + "learning_rate": 1.4120857699805068e-05, + "loss": 0.0894, + "step": 855 + }, + { + "epoch": 1.6038443506797937, + "grad_norm": 0.2689126431941986, + "learning_rate": 1.4113060428849902e-05, + "loss": 0.0753, + "step": 856 + }, + { + "epoch": 1.6057196436943273, + "grad_norm": 0.2116389125585556, + "learning_rate": 1.4105263157894738e-05, + "loss": 0.0592, + "step": 857 + }, + { + "epoch": 1.6075949367088609, + "grad_norm": 0.2922620475292206, + "learning_rate": 1.4097465886939572e-05, + "loss": 0.0944, + "step": 858 + }, + { + "epoch": 1.6094702297233943, + "grad_norm": 0.26720038056373596, + "learning_rate": 1.4089668615984407e-05, + "loss": 0.0805, + "step": 859 + }, + { + "epoch": 1.6113455227379276, + "grad_norm": 0.36932405829429626, + "learning_rate": 1.408187134502924e-05, + "loss": 0.0871, + "step": 860 + }, + { + "epoch": 1.6132208157524612, + "grad_norm": 0.29181620478630066, + "learning_rate": 1.4074074074074075e-05, + "loss": 0.0903, + "step": 861 + }, + { + "epoch": 1.6150961087669948, + "grad_norm": 0.24898619949817657, + "learning_rate": 1.4066276803118911e-05, + "loss": 0.0612, + "step": 862 + }, + { + "epoch": 1.6169714017815284, + "grad_norm": 0.3132479786872864, + "learning_rate": 1.4058479532163745e-05, + "loss": 0.0962, + "step": 863 + }, + { + "epoch": 1.618846694796062, + "grad_norm": 0.27776429057121277, + "learning_rate": 1.405068226120858e-05, + "loss": 0.0734, + "step": 864 + }, + { + "epoch": 1.6207219878105954, + "grad_norm": 0.3017600476741791, + "learning_rate": 1.4042884990253412e-05, + "loss": 0.0802, + "step": 865 + }, + { + "epoch": 1.6225972808251288, + "grad_norm": 0.2887416183948517, + "learning_rate": 1.4035087719298246e-05, + "loss": 0.098, + "step": 866 + }, + { + "epoch": 1.6244725738396624, + "grad_norm": 0.3379286527633667, + "learning_rate": 1.402729044834308e-05, + "loss": 0.107, + "step": 867 + }, + { + "epoch": 1.626347866854196, + "grad_norm": 0.33945974707603455, + "learning_rate": 1.4019493177387914e-05, + "loss": 0.0949, + "step": 868 + }, + { + "epoch": 1.6282231598687296, + "grad_norm": 0.5351037979125977, + "learning_rate": 1.401169590643275e-05, + "loss": 0.0974, + "step": 869 + }, + { + "epoch": 1.6300984528832632, + "grad_norm": 0.28477218747138977, + "learning_rate": 1.4003898635477584e-05, + "loss": 0.1049, + "step": 870 + }, + { + "epoch": 1.6319737458977965, + "grad_norm": 0.34814324975013733, + "learning_rate": 1.3996101364522419e-05, + "loss": 0.1052, + "step": 871 + }, + { + "epoch": 1.63384903891233, + "grad_norm": 0.3389275074005127, + "learning_rate": 1.3988304093567253e-05, + "loss": 0.1081, + "step": 872 + }, + { + "epoch": 1.6357243319268635, + "grad_norm": 0.3551498353481293, + "learning_rate": 1.3980506822612087e-05, + "loss": 0.1016, + "step": 873 + }, + { + "epoch": 1.637599624941397, + "grad_norm": 0.43945154547691345, + "learning_rate": 1.3972709551656923e-05, + "loss": 0.0631, + "step": 874 + }, + { + "epoch": 1.6394749179559307, + "grad_norm": 0.7714282870292664, + "learning_rate": 1.3964912280701755e-05, + "loss": 0.0969, + "step": 875 + }, + { + "epoch": 1.6413502109704643, + "grad_norm": 0.2278236597776413, + "learning_rate": 1.395711500974659e-05, + "loss": 0.0711, + "step": 876 + }, + { + "epoch": 1.6432255039849977, + "grad_norm": 0.3024490773677826, + "learning_rate": 1.3949317738791424e-05, + "loss": 0.0727, + "step": 877 + }, + { + "epoch": 1.645100796999531, + "grad_norm": 0.31009918451309204, + "learning_rate": 1.3941520467836258e-05, + "loss": 0.1222, + "step": 878 + }, + { + "epoch": 1.6469760900140646, + "grad_norm": 0.402402400970459, + "learning_rate": 1.3933723196881092e-05, + "loss": 0.1404, + "step": 879 + }, + { + "epoch": 1.6488513830285982, + "grad_norm": 0.3172832727432251, + "learning_rate": 1.3925925925925928e-05, + "loss": 0.1052, + "step": 880 + }, + { + "epoch": 1.6507266760431318, + "grad_norm": 0.6237524151802063, + "learning_rate": 1.3918128654970762e-05, + "loss": 0.0804, + "step": 881 + }, + { + "epoch": 1.6526019690576652, + "grad_norm": 0.27731725573539734, + "learning_rate": 1.3910331384015596e-05, + "loss": 0.0988, + "step": 882 + }, + { + "epoch": 1.6544772620721988, + "grad_norm": 0.2532290518283844, + "learning_rate": 1.390253411306043e-05, + "loss": 0.1025, + "step": 883 + }, + { + "epoch": 1.6563525550867322, + "grad_norm": 0.3791520297527313, + "learning_rate": 1.3894736842105265e-05, + "loss": 0.1178, + "step": 884 + }, + { + "epoch": 1.6582278481012658, + "grad_norm": 0.24422794580459595, + "learning_rate": 1.3886939571150097e-05, + "loss": 0.0764, + "step": 885 + }, + { + "epoch": 1.6601031411157994, + "grad_norm": 0.3019620478153229, + "learning_rate": 1.3879142300194931e-05, + "loss": 0.103, + "step": 886 + }, + { + "epoch": 1.661978434130333, + "grad_norm": 0.33067408204078674, + "learning_rate": 1.3871345029239767e-05, + "loss": 0.114, + "step": 887 + }, + { + "epoch": 1.6638537271448663, + "grad_norm": 0.30137330293655396, + "learning_rate": 1.3863547758284602e-05, + "loss": 0.0538, + "step": 888 + }, + { + "epoch": 1.6657290201594, + "grad_norm": 0.3999065160751343, + "learning_rate": 1.3855750487329436e-05, + "loss": 0.0878, + "step": 889 + }, + { + "epoch": 1.6676043131739333, + "grad_norm": 0.23727497458457947, + "learning_rate": 1.384795321637427e-05, + "loss": 0.0762, + "step": 890 + }, + { + "epoch": 1.669479606188467, + "grad_norm": 0.3073793053627014, + "learning_rate": 1.3840155945419104e-05, + "loss": 0.1009, + "step": 891 + }, + { + "epoch": 1.6713548992030005, + "grad_norm": 0.3737054467201233, + "learning_rate": 1.383235867446394e-05, + "loss": 0.118, + "step": 892 + }, + { + "epoch": 1.673230192217534, + "grad_norm": 0.2168496549129486, + "learning_rate": 1.3824561403508774e-05, + "loss": 0.0733, + "step": 893 + }, + { + "epoch": 1.6751054852320675, + "grad_norm": 0.3298211395740509, + "learning_rate": 1.3816764132553608e-05, + "loss": 0.1167, + "step": 894 + }, + { + "epoch": 1.676980778246601, + "grad_norm": 0.2308388352394104, + "learning_rate": 1.3808966861598441e-05, + "loss": 0.0744, + "step": 895 + }, + { + "epoch": 1.6788560712611345, + "grad_norm": 0.27769342064857483, + "learning_rate": 1.3801169590643275e-05, + "loss": 0.0651, + "step": 896 + }, + { + "epoch": 1.680731364275668, + "grad_norm": 0.2548845708370209, + "learning_rate": 1.379337231968811e-05, + "loss": 0.0993, + "step": 897 + }, + { + "epoch": 1.6826066572902016, + "grad_norm": 0.28797003626823425, + "learning_rate": 1.3785575048732943e-05, + "loss": 0.0773, + "step": 898 + }, + { + "epoch": 1.6844819503047352, + "grad_norm": 0.32892584800720215, + "learning_rate": 1.377777777777778e-05, + "loss": 0.0944, + "step": 899 + }, + { + "epoch": 1.6863572433192686, + "grad_norm": 0.40558120608329773, + "learning_rate": 1.3769980506822614e-05, + "loss": 0.1006, + "step": 900 + }, + { + "epoch": 1.6863572433192686, + "eval_loss": 0.07565333694219589, + "eval_runtime": 675.1107, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 900 + }, + { + "epoch": 1.688232536333802, + "grad_norm": 0.3913367986679077, + "learning_rate": 1.3762183235867448e-05, + "loss": 0.0936, + "step": 901 + }, + { + "epoch": 1.6901078293483356, + "grad_norm": 0.2699122726917267, + "learning_rate": 1.3754385964912282e-05, + "loss": 0.0829, + "step": 902 + }, + { + "epoch": 1.6919831223628692, + "grad_norm": 0.35245972871780396, + "learning_rate": 1.3746588693957116e-05, + "loss": 0.0932, + "step": 903 + }, + { + "epoch": 1.6938584153774028, + "grad_norm": 0.25840988755226135, + "learning_rate": 1.3738791423001952e-05, + "loss": 0.086, + "step": 904 + }, + { + "epoch": 1.6957337083919364, + "grad_norm": 0.44727715849876404, + "learning_rate": 1.3730994152046784e-05, + "loss": 0.0939, + "step": 905 + }, + { + "epoch": 1.6976090014064698, + "grad_norm": 0.2222200334072113, + "learning_rate": 1.3723196881091619e-05, + "loss": 0.0859, + "step": 906 + }, + { + "epoch": 1.6994842944210031, + "grad_norm": 0.27306899428367615, + "learning_rate": 1.3715399610136453e-05, + "loss": 0.0877, + "step": 907 + }, + { + "epoch": 1.7013595874355367, + "grad_norm": 0.41138342022895813, + "learning_rate": 1.3707602339181287e-05, + "loss": 0.0812, + "step": 908 + }, + { + "epoch": 1.7032348804500703, + "grad_norm": 0.29449108242988586, + "learning_rate": 1.3699805068226121e-05, + "loss": 0.0887, + "step": 909 + }, + { + "epoch": 1.705110173464604, + "grad_norm": 0.25317761301994324, + "learning_rate": 1.3692007797270957e-05, + "loss": 0.0921, + "step": 910 + }, + { + "epoch": 1.7069854664791375, + "grad_norm": 0.24713647365570068, + "learning_rate": 1.3684210526315791e-05, + "loss": 0.0646, + "step": 911 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 0.2317000776529312, + "learning_rate": 1.3676413255360625e-05, + "loss": 0.0596, + "step": 912 + }, + { + "epoch": 1.7107360525082043, + "grad_norm": 0.2771907150745392, + "learning_rate": 1.366861598440546e-05, + "loss": 0.091, + "step": 913 + }, + { + "epoch": 1.7126113455227379, + "grad_norm": 0.23459120094776154, + "learning_rate": 1.3660818713450294e-05, + "loss": 0.0701, + "step": 914 + }, + { + "epoch": 1.7144866385372715, + "grad_norm": 0.30127042531967163, + "learning_rate": 1.3653021442495126e-05, + "loss": 0.088, + "step": 915 + }, + { + "epoch": 1.716361931551805, + "grad_norm": 0.24772456288337708, + "learning_rate": 1.364522417153996e-05, + "loss": 0.0774, + "step": 916 + }, + { + "epoch": 1.7182372245663387, + "grad_norm": 0.2913759648799896, + "learning_rate": 1.3637426900584796e-05, + "loss": 0.0826, + "step": 917 + }, + { + "epoch": 1.720112517580872, + "grad_norm": 0.28088027238845825, + "learning_rate": 1.362962962962963e-05, + "loss": 0.0881, + "step": 918 + }, + { + "epoch": 1.7219878105954054, + "grad_norm": 0.30184197425842285, + "learning_rate": 1.3621832358674465e-05, + "loss": 0.0942, + "step": 919 + }, + { + "epoch": 1.723863103609939, + "grad_norm": 0.2570498287677765, + "learning_rate": 1.3614035087719299e-05, + "loss": 0.0693, + "step": 920 + }, + { + "epoch": 1.7257383966244726, + "grad_norm": 0.31009960174560547, + "learning_rate": 1.3606237816764133e-05, + "loss": 0.068, + "step": 921 + }, + { + "epoch": 1.7276136896390062, + "grad_norm": 0.23270297050476074, + "learning_rate": 1.3598440545808969e-05, + "loss": 0.0619, + "step": 922 + }, + { + "epoch": 1.7294889826535396, + "grad_norm": 0.33265721797943115, + "learning_rate": 1.3590643274853803e-05, + "loss": 0.0995, + "step": 923 + }, + { + "epoch": 1.7313642756680732, + "grad_norm": 0.38637009263038635, + "learning_rate": 1.3582846003898637e-05, + "loss": 0.1298, + "step": 924 + }, + { + "epoch": 1.7332395686826065, + "grad_norm": 0.27367690205574036, + "learning_rate": 1.357504873294347e-05, + "loss": 0.0852, + "step": 925 + }, + { + "epoch": 1.7351148616971401, + "grad_norm": 0.3776531517505646, + "learning_rate": 1.3567251461988304e-05, + "loss": 0.1081, + "step": 926 + }, + { + "epoch": 1.7369901547116737, + "grad_norm": 0.25184816122055054, + "learning_rate": 1.3559454191033138e-05, + "loss": 0.0747, + "step": 927 + }, + { + "epoch": 1.7388654477262073, + "grad_norm": 0.26781925559043884, + "learning_rate": 1.3551656920077974e-05, + "loss": 0.0974, + "step": 928 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.38023102283477783, + "learning_rate": 1.3543859649122808e-05, + "loss": 0.0989, + "step": 929 + }, + { + "epoch": 1.7426160337552743, + "grad_norm": 0.30040284991264343, + "learning_rate": 1.3536062378167643e-05, + "loss": 0.098, + "step": 930 + }, + { + "epoch": 1.7444913267698077, + "grad_norm": 0.41722437739372253, + "learning_rate": 1.3528265107212477e-05, + "loss": 0.0788, + "step": 931 + }, + { + "epoch": 1.7463666197843413, + "grad_norm": 0.31635239720344543, + "learning_rate": 1.3520467836257311e-05, + "loss": 0.131, + "step": 932 + }, + { + "epoch": 1.7482419127988749, + "grad_norm": 0.34196072816848755, + "learning_rate": 1.3512670565302147e-05, + "loss": 0.1302, + "step": 933 + }, + { + "epoch": 1.7501172058134085, + "grad_norm": 0.24944040179252625, + "learning_rate": 1.3504873294346981e-05, + "loss": 0.078, + "step": 934 + }, + { + "epoch": 1.7519924988279418, + "grad_norm": 0.2601966857910156, + "learning_rate": 1.3497076023391814e-05, + "loss": 0.0774, + "step": 935 + }, + { + "epoch": 1.7538677918424754, + "grad_norm": 0.27114635705947876, + "learning_rate": 1.3489278752436648e-05, + "loss": 0.0871, + "step": 936 + }, + { + "epoch": 1.7557430848570088, + "grad_norm": 0.27622026205062866, + "learning_rate": 1.3481481481481482e-05, + "loss": 0.0668, + "step": 937 + }, + { + "epoch": 1.7576183778715424, + "grad_norm": 0.32424378395080566, + "learning_rate": 1.3473684210526316e-05, + "loss": 0.1001, + "step": 938 + }, + { + "epoch": 1.759493670886076, + "grad_norm": 0.36716216802597046, + "learning_rate": 1.346588693957115e-05, + "loss": 0.1308, + "step": 939 + }, + { + "epoch": 1.7613689639006096, + "grad_norm": 0.29691779613494873, + "learning_rate": 1.3458089668615986e-05, + "loss": 0.0736, + "step": 940 + }, + { + "epoch": 1.763244256915143, + "grad_norm": 0.36653903126716614, + "learning_rate": 1.345029239766082e-05, + "loss": 0.1169, + "step": 941 + }, + { + "epoch": 1.7651195499296763, + "grad_norm": 0.21092906594276428, + "learning_rate": 1.3442495126705655e-05, + "loss": 0.0722, + "step": 942 + }, + { + "epoch": 1.76699484294421, + "grad_norm": 0.34096020460128784, + "learning_rate": 1.3434697855750489e-05, + "loss": 0.1135, + "step": 943 + }, + { + "epoch": 1.7688701359587435, + "grad_norm": 0.2584519386291504, + "learning_rate": 1.3426900584795323e-05, + "loss": 0.0712, + "step": 944 + }, + { + "epoch": 1.7707454289732771, + "grad_norm": 0.24627022445201874, + "learning_rate": 1.3419103313840155e-05, + "loss": 0.0683, + "step": 945 + }, + { + "epoch": 1.7726207219878107, + "grad_norm": 0.2898278534412384, + "learning_rate": 1.3411306042884991e-05, + "loss": 0.0833, + "step": 946 + }, + { + "epoch": 1.774496015002344, + "grad_norm": 0.2810303568840027, + "learning_rate": 1.3403508771929826e-05, + "loss": 0.1173, + "step": 947 + }, + { + "epoch": 1.7763713080168775, + "grad_norm": 0.2736395597457886, + "learning_rate": 1.339571150097466e-05, + "loss": 0.0681, + "step": 948 + }, + { + "epoch": 1.778246601031411, + "grad_norm": 0.31029412150382996, + "learning_rate": 1.3387914230019494e-05, + "loss": 0.0915, + "step": 949 + }, + { + "epoch": 1.7801218940459447, + "grad_norm": 0.24978873133659363, + "learning_rate": 1.3380116959064328e-05, + "loss": 0.091, + "step": 950 + }, + { + "epoch": 1.7801218940459447, + "eval_loss": 0.07832063734531403, + "eval_runtime": 676.2932, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 950 + }, + { + "epoch": 1.7819971870604783, + "grad_norm": 0.3023127615451813, + "learning_rate": 1.3372319688109164e-05, + "loss": 0.0887, + "step": 951 + }, + { + "epoch": 1.7838724800750119, + "grad_norm": 0.4247094988822937, + "learning_rate": 1.3364522417153998e-05, + "loss": 0.1705, + "step": 952 + }, + { + "epoch": 1.7857477730895452, + "grad_norm": 0.3702313005924225, + "learning_rate": 1.3356725146198832e-05, + "loss": 0.1087, + "step": 953 + }, + { + "epoch": 1.7876230661040786, + "grad_norm": 0.26381444931030273, + "learning_rate": 1.3348927875243667e-05, + "loss": 0.072, + "step": 954 + }, + { + "epoch": 1.7894983591186122, + "grad_norm": 0.32169631123542786, + "learning_rate": 1.3341130604288499e-05, + "loss": 0.0828, + "step": 955 + }, + { + "epoch": 1.7913736521331458, + "grad_norm": 0.37914732098579407, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0998, + "step": 956 + }, + { + "epoch": 1.7932489451476794, + "grad_norm": 0.2996065020561218, + "learning_rate": 1.3325536062378167e-05, + "loss": 0.1056, + "step": 957 + }, + { + "epoch": 1.795124238162213, + "grad_norm": 0.2926497161388397, + "learning_rate": 1.3317738791423003e-05, + "loss": 0.1248, + "step": 958 + }, + { + "epoch": 1.7969995311767464, + "grad_norm": 0.26383379101753235, + "learning_rate": 1.3309941520467838e-05, + "loss": 0.095, + "step": 959 + }, + { + "epoch": 1.7988748241912798, + "grad_norm": 0.334276020526886, + "learning_rate": 1.3302144249512672e-05, + "loss": 0.0806, + "step": 960 + }, + { + "epoch": 1.8007501172058133, + "grad_norm": 0.31996652483940125, + "learning_rate": 1.3294346978557506e-05, + "loss": 0.0691, + "step": 961 + }, + { + "epoch": 1.802625410220347, + "grad_norm": 0.3317575752735138, + "learning_rate": 1.328654970760234e-05, + "loss": 0.0823, + "step": 962 + }, + { + "epoch": 1.8045007032348805, + "grad_norm": 0.307790070772171, + "learning_rate": 1.3278752436647176e-05, + "loss": 0.0916, + "step": 963 + }, + { + "epoch": 1.806375996249414, + "grad_norm": 0.3371288478374481, + "learning_rate": 1.327095516569201e-05, + "loss": 0.0917, + "step": 964 + }, + { + "epoch": 1.8082512892639475, + "grad_norm": 0.4027099907398224, + "learning_rate": 1.3263157894736843e-05, + "loss": 0.1157, + "step": 965 + }, + { + "epoch": 1.810126582278481, + "grad_norm": 0.28519630432128906, + "learning_rate": 1.3255360623781677e-05, + "loss": 0.0841, + "step": 966 + }, + { + "epoch": 1.8120018752930145, + "grad_norm": 0.2463994026184082, + "learning_rate": 1.3247563352826511e-05, + "loss": 0.0894, + "step": 967 + }, + { + "epoch": 1.813877168307548, + "grad_norm": 0.2853533625602722, + "learning_rate": 1.3239766081871345e-05, + "loss": 0.0667, + "step": 968 + }, + { + "epoch": 1.8157524613220817, + "grad_norm": 0.25355201959609985, + "learning_rate": 1.3231968810916181e-05, + "loss": 0.0765, + "step": 969 + }, + { + "epoch": 1.817627754336615, + "grad_norm": 0.3773306608200073, + "learning_rate": 1.3224171539961015e-05, + "loss": 0.0854, + "step": 970 + }, + { + "epoch": 1.8195030473511487, + "grad_norm": 0.29040902853012085, + "learning_rate": 1.321637426900585e-05, + "loss": 0.0893, + "step": 971 + }, + { + "epoch": 1.821378340365682, + "grad_norm": 0.35382455587387085, + "learning_rate": 1.3208576998050684e-05, + "loss": 0.1021, + "step": 972 + }, + { + "epoch": 1.8232536333802156, + "grad_norm": 0.4576570391654968, + "learning_rate": 1.3200779727095518e-05, + "loss": 0.1179, + "step": 973 + }, + { + "epoch": 1.8251289263947492, + "grad_norm": 0.37235504388809204, + "learning_rate": 1.3192982456140354e-05, + "loss": 0.1344, + "step": 974 + }, + { + "epoch": 1.8270042194092828, + "grad_norm": 0.30962350964546204, + "learning_rate": 1.3185185185185185e-05, + "loss": 0.0751, + "step": 975 + }, + { + "epoch": 1.8288795124238162, + "grad_norm": 0.2378237396478653, + "learning_rate": 1.317738791423002e-05, + "loss": 0.0714, + "step": 976 + }, + { + "epoch": 1.8307548054383498, + "grad_norm": 0.2367488443851471, + "learning_rate": 1.3169590643274855e-05, + "loss": 0.0599, + "step": 977 + }, + { + "epoch": 1.8326300984528832, + "grad_norm": 0.2833852171897888, + "learning_rate": 1.3161793372319689e-05, + "loss": 0.0995, + "step": 978 + }, + { + "epoch": 1.8345053914674168, + "grad_norm": 1.0794099569320679, + "learning_rate": 1.3153996101364523e-05, + "loss": 0.1319, + "step": 979 + }, + { + "epoch": 1.8363806844819504, + "grad_norm": 0.29689860343933105, + "learning_rate": 1.3146198830409357e-05, + "loss": 0.0849, + "step": 980 + }, + { + "epoch": 1.838255977496484, + "grad_norm": 0.3836843967437744, + "learning_rate": 1.3138401559454193e-05, + "loss": 0.1075, + "step": 981 + }, + { + "epoch": 1.8401312705110173, + "grad_norm": 0.27197182178497314, + "learning_rate": 1.3130604288499027e-05, + "loss": 0.08, + "step": 982 + }, + { + "epoch": 1.842006563525551, + "grad_norm": 0.3288334012031555, + "learning_rate": 1.3122807017543862e-05, + "loss": 0.0604, + "step": 983 + }, + { + "epoch": 1.8438818565400843, + "grad_norm": 0.3458631932735443, + "learning_rate": 1.3115009746588696e-05, + "loss": 0.1051, + "step": 984 + }, + { + "epoch": 1.845757149554618, + "grad_norm": 0.28444722294807434, + "learning_rate": 1.3107212475633528e-05, + "loss": 0.0876, + "step": 985 + }, + { + "epoch": 1.8476324425691515, + "grad_norm": 0.3890294134616852, + "learning_rate": 1.3099415204678362e-05, + "loss": 0.1209, + "step": 986 + }, + { + "epoch": 1.849507735583685, + "grad_norm": 0.2536354064941406, + "learning_rate": 1.3091617933723197e-05, + "loss": 0.0615, + "step": 987 + }, + { + "epoch": 1.8513830285982185, + "grad_norm": 0.2301822453737259, + "learning_rate": 1.3083820662768032e-05, + "loss": 0.0802, + "step": 988 + }, + { + "epoch": 1.8532583216127518, + "grad_norm": 0.5876390337944031, + "learning_rate": 1.3076023391812867e-05, + "loss": 0.134, + "step": 989 + }, + { + "epoch": 1.8551336146272854, + "grad_norm": 0.2957145571708679, + "learning_rate": 1.3068226120857701e-05, + "loss": 0.0834, + "step": 990 + }, + { + "epoch": 1.857008907641819, + "grad_norm": 0.321350634098053, + "learning_rate": 1.3060428849902535e-05, + "loss": 0.0998, + "step": 991 + }, + { + "epoch": 1.8588842006563526, + "grad_norm": 0.32503458857536316, + "learning_rate": 1.305263157894737e-05, + "loss": 0.1065, + "step": 992 + }, + { + "epoch": 1.8607594936708862, + "grad_norm": 0.2906481623649597, + "learning_rate": 1.3044834307992205e-05, + "loss": 0.0878, + "step": 993 + }, + { + "epoch": 1.8626347866854196, + "grad_norm": 0.25748249888420105, + "learning_rate": 1.303703703703704e-05, + "loss": 0.0784, + "step": 994 + }, + { + "epoch": 1.864510079699953, + "grad_norm": 0.2645077407360077, + "learning_rate": 1.3029239766081872e-05, + "loss": 0.0827, + "step": 995 + }, + { + "epoch": 1.8663853727144866, + "grad_norm": 0.28453439474105835, + "learning_rate": 1.3021442495126706e-05, + "loss": 0.0792, + "step": 996 + }, + { + "epoch": 1.8682606657290202, + "grad_norm": 0.2839300036430359, + "learning_rate": 1.301364522417154e-05, + "loss": 0.069, + "step": 997 + }, + { + "epoch": 1.8701359587435538, + "grad_norm": 0.40391039848327637, + "learning_rate": 1.3005847953216374e-05, + "loss": 0.068, + "step": 998 + }, + { + "epoch": 1.8720112517580874, + "grad_norm": 0.29251784086227417, + "learning_rate": 1.299805068226121e-05, + "loss": 0.0731, + "step": 999 + }, + { + "epoch": 1.8738865447726207, + "grad_norm": 0.2935086786746979, + "learning_rate": 1.2990253411306044e-05, + "loss": 0.0984, + "step": 1000 + }, + { + "epoch": 1.8738865447726207, + "eval_loss": 0.07738383859395981, + "eval_runtime": 676.7683, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 1000 + }, + { + "epoch": 1.875761837787154, + "grad_norm": 0.5241721868515015, + "learning_rate": 1.2982456140350879e-05, + "loss": 0.1288, + "step": 1001 + }, + { + "epoch": 1.8776371308016877, + "grad_norm": 0.2855282425880432, + "learning_rate": 1.2974658869395713e-05, + "loss": 0.0962, + "step": 1002 + }, + { + "epoch": 1.8795124238162213, + "grad_norm": 0.26559796929359436, + "learning_rate": 1.2966861598440547e-05, + "loss": 0.0838, + "step": 1003 + }, + { + "epoch": 1.881387716830755, + "grad_norm": 0.31060707569122314, + "learning_rate": 1.2959064327485383e-05, + "loss": 0.1008, + "step": 1004 + }, + { + "epoch": 1.8832630098452883, + "grad_norm": 0.30378690361976624, + "learning_rate": 1.2951267056530214e-05, + "loss": 0.0783, + "step": 1005 + }, + { + "epoch": 1.8851383028598219, + "grad_norm": 0.3393579125404358, + "learning_rate": 1.294346978557505e-05, + "loss": 0.075, + "step": 1006 + }, + { + "epoch": 1.8870135958743552, + "grad_norm": 0.36604905128479004, + "learning_rate": 1.2935672514619884e-05, + "loss": 0.0771, + "step": 1007 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.3377784192562103, + "learning_rate": 1.2927875243664718e-05, + "loss": 0.1062, + "step": 1008 + }, + { + "epoch": 1.8907641819034224, + "grad_norm": 0.29350435733795166, + "learning_rate": 1.2920077972709552e-05, + "loss": 0.0703, + "step": 1009 + }, + { + "epoch": 1.892639474917956, + "grad_norm": 0.2543967366218567, + "learning_rate": 1.2912280701754386e-05, + "loss": 0.0736, + "step": 1010 + }, + { + "epoch": 1.8945147679324894, + "grad_norm": 0.29992133378982544, + "learning_rate": 1.2904483430799222e-05, + "loss": 0.0945, + "step": 1011 + }, + { + "epoch": 1.896390060947023, + "grad_norm": 0.26045867800712585, + "learning_rate": 1.2896686159844056e-05, + "loss": 0.0823, + "step": 1012 + }, + { + "epoch": 1.8982653539615564, + "grad_norm": 0.37396302819252014, + "learning_rate": 1.288888888888889e-05, + "loss": 0.1008, + "step": 1013 + }, + { + "epoch": 1.90014064697609, + "grad_norm": 0.3353418707847595, + "learning_rate": 1.2881091617933725e-05, + "loss": 0.1059, + "step": 1014 + }, + { + "epoch": 1.9020159399906236, + "grad_norm": 0.2745090126991272, + "learning_rate": 1.2873294346978557e-05, + "loss": 0.0838, + "step": 1015 + }, + { + "epoch": 1.9038912330051572, + "grad_norm": 0.25266072154045105, + "learning_rate": 1.2865497076023392e-05, + "loss": 0.0638, + "step": 1016 + }, + { + "epoch": 1.9057665260196905, + "grad_norm": 0.41703373193740845, + "learning_rate": 1.2857699805068227e-05, + "loss": 0.1085, + "step": 1017 + }, + { + "epoch": 1.9076418190342241, + "grad_norm": 0.35806798934936523, + "learning_rate": 1.2849902534113062e-05, + "loss": 0.0803, + "step": 1018 + }, + { + "epoch": 1.9095171120487575, + "grad_norm": 0.36233675479888916, + "learning_rate": 1.2842105263157896e-05, + "loss": 0.1004, + "step": 1019 + }, + { + "epoch": 1.9113924050632911, + "grad_norm": 0.2726714611053467, + "learning_rate": 1.283430799220273e-05, + "loss": 0.0771, + "step": 1020 + }, + { + "epoch": 1.9132676980778247, + "grad_norm": 0.25889918208122253, + "learning_rate": 1.2826510721247564e-05, + "loss": 0.0729, + "step": 1021 + }, + { + "epoch": 1.9151429910923583, + "grad_norm": 0.27719736099243164, + "learning_rate": 1.28187134502924e-05, + "loss": 0.0926, + "step": 1022 + }, + { + "epoch": 1.9170182841068917, + "grad_norm": 0.6307101249694824, + "learning_rate": 1.2810916179337234e-05, + "loss": 0.1319, + "step": 1023 + }, + { + "epoch": 1.9188935771214253, + "grad_norm": 0.3898720443248749, + "learning_rate": 1.2803118908382068e-05, + "loss": 0.0881, + "step": 1024 + }, + { + "epoch": 1.9207688701359587, + "grad_norm": 0.33780285716056824, + "learning_rate": 1.2795321637426901e-05, + "loss": 0.1037, + "step": 1025 + }, + { + "epoch": 1.9226441631504922, + "grad_norm": 0.28749212622642517, + "learning_rate": 1.2787524366471735e-05, + "loss": 0.093, + "step": 1026 + }, + { + "epoch": 1.9245194561650258, + "grad_norm": 0.33152034878730774, + "learning_rate": 1.277972709551657e-05, + "loss": 0.0945, + "step": 1027 + }, + { + "epoch": 1.9263947491795594, + "grad_norm": 0.28936097025871277, + "learning_rate": 1.2771929824561404e-05, + "loss": 0.0916, + "step": 1028 + }, + { + "epoch": 1.9282700421940928, + "grad_norm": 0.2941216826438904, + "learning_rate": 1.276413255360624e-05, + "loss": 0.0994, + "step": 1029 + }, + { + "epoch": 1.9301453352086262, + "grad_norm": 0.30549120903015137, + "learning_rate": 1.2756335282651074e-05, + "loss": 0.0899, + "step": 1030 + }, + { + "epoch": 1.9320206282231598, + "grad_norm": 0.24775011837482452, + "learning_rate": 1.2748538011695908e-05, + "loss": 0.0623, + "step": 1031 + }, + { + "epoch": 1.9338959212376934, + "grad_norm": 0.27423885464668274, + "learning_rate": 1.2740740740740742e-05, + "loss": 0.0614, + "step": 1032 + }, + { + "epoch": 1.935771214252227, + "grad_norm": 0.33300426602363586, + "learning_rate": 1.2732943469785576e-05, + "loss": 0.1014, + "step": 1033 + }, + { + "epoch": 1.9376465072667606, + "grad_norm": 0.318990558385849, + "learning_rate": 1.2725146198830412e-05, + "loss": 0.0904, + "step": 1034 + }, + { + "epoch": 1.939521800281294, + "grad_norm": 0.2824667692184448, + "learning_rate": 1.2717348927875245e-05, + "loss": 0.0735, + "step": 1035 + }, + { + "epoch": 1.9413970932958273, + "grad_norm": 0.2873893976211548, + "learning_rate": 1.2709551656920079e-05, + "loss": 0.066, + "step": 1036 + }, + { + "epoch": 1.943272386310361, + "grad_norm": 0.33205756545066833, + "learning_rate": 1.2701754385964913e-05, + "loss": 0.1017, + "step": 1037 + }, + { + "epoch": 1.9451476793248945, + "grad_norm": 0.5657170414924622, + "learning_rate": 1.2693957115009747e-05, + "loss": 0.08, + "step": 1038 + }, + { + "epoch": 1.9470229723394281, + "grad_norm": 0.5216059684753418, + "learning_rate": 1.2686159844054581e-05, + "loss": 0.0942, + "step": 1039 + }, + { + "epoch": 1.9488982653539617, + "grad_norm": 0.45388665795326233, + "learning_rate": 1.2678362573099417e-05, + "loss": 0.0876, + "step": 1040 + }, + { + "epoch": 1.950773558368495, + "grad_norm": 0.28133487701416016, + "learning_rate": 1.2670565302144251e-05, + "loss": 0.0829, + "step": 1041 + }, + { + "epoch": 1.9526488513830285, + "grad_norm": 0.32736194133758545, + "learning_rate": 1.2662768031189086e-05, + "loss": 0.1283, + "step": 1042 + }, + { + "epoch": 1.954524144397562, + "grad_norm": 0.307423859834671, + "learning_rate": 1.265497076023392e-05, + "loss": 0.0954, + "step": 1043 + }, + { + "epoch": 1.9563994374120957, + "grad_norm": 0.3167324662208557, + "learning_rate": 1.2647173489278754e-05, + "loss": 0.0948, + "step": 1044 + }, + { + "epoch": 1.9582747304266293, + "grad_norm": 0.2761521637439728, + "learning_rate": 1.2639376218323586e-05, + "loss": 0.0632, + "step": 1045 + }, + { + "epoch": 1.9601500234411628, + "grad_norm": 0.3254704475402832, + "learning_rate": 1.263157894736842e-05, + "loss": 0.068, + "step": 1046 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 0.41552290320396423, + "learning_rate": 1.2623781676413257e-05, + "loss": 0.1075, + "step": 1047 + }, + { + "epoch": 1.9639006094702296, + "grad_norm": 0.31078511476516724, + "learning_rate": 1.261598440545809e-05, + "loss": 0.0812, + "step": 1048 + }, + { + "epoch": 1.9657759024847632, + "grad_norm": 0.3442966639995575, + "learning_rate": 1.2608187134502925e-05, + "loss": 0.0926, + "step": 1049 + }, + { + "epoch": 1.9676511954992968, + "grad_norm": 0.2290525734424591, + "learning_rate": 1.2600389863547759e-05, + "loss": 0.069, + "step": 1050 + }, + { + "epoch": 1.9676511954992968, + "eval_loss": 0.07501588761806488, + "eval_runtime": 675.7387, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1050 + }, + { + "epoch": 1.9695264885138304, + "grad_norm": 0.2715151309967041, + "learning_rate": 1.2592592592592593e-05, + "loss": 0.0701, + "step": 1051 + }, + { + "epoch": 1.9714017815283638, + "grad_norm": 0.3169614374637604, + "learning_rate": 1.258479532163743e-05, + "loss": 0.1119, + "step": 1052 + }, + { + "epoch": 1.9732770745428974, + "grad_norm": 0.6188083291053772, + "learning_rate": 1.2576998050682263e-05, + "loss": 0.0969, + "step": 1053 + }, + { + "epoch": 1.9751523675574307, + "grad_norm": 0.28429678082466125, + "learning_rate": 1.2569200779727098e-05, + "loss": 0.058, + "step": 1054 + }, + { + "epoch": 1.9770276605719643, + "grad_norm": 0.7146753072738647, + "learning_rate": 1.256140350877193e-05, + "loss": 0.0636, + "step": 1055 + }, + { + "epoch": 1.978902953586498, + "grad_norm": 0.5578083992004395, + "learning_rate": 1.2553606237816764e-05, + "loss": 0.0935, + "step": 1056 + }, + { + "epoch": 1.9807782466010315, + "grad_norm": 0.30978304147720337, + "learning_rate": 1.2545808966861598e-05, + "loss": 0.0784, + "step": 1057 + }, + { + "epoch": 1.982653539615565, + "grad_norm": 0.42586255073547363, + "learning_rate": 1.2538011695906434e-05, + "loss": 0.0982, + "step": 1058 + }, + { + "epoch": 1.9845288326300985, + "grad_norm": 0.3126954138278961, + "learning_rate": 1.2530214424951269e-05, + "loss": 0.0776, + "step": 1059 + }, + { + "epoch": 1.9864041256446319, + "grad_norm": 0.37667107582092285, + "learning_rate": 1.2522417153996103e-05, + "loss": 0.1329, + "step": 1060 + }, + { + "epoch": 1.9882794186591655, + "grad_norm": 0.484842985868454, + "learning_rate": 1.2514619883040937e-05, + "loss": 0.0538, + "step": 1061 + }, + { + "epoch": 1.990154711673699, + "grad_norm": 0.3426309823989868, + "learning_rate": 1.2506822612085771e-05, + "loss": 0.1018, + "step": 1062 + }, + { + "epoch": 1.9920300046882327, + "grad_norm": 0.2878342866897583, + "learning_rate": 1.2499025341130607e-05, + "loss": 0.094, + "step": 1063 + }, + { + "epoch": 1.993905297702766, + "grad_norm": 0.30334293842315674, + "learning_rate": 1.2491228070175441e-05, + "loss": 0.0705, + "step": 1064 + }, + { + "epoch": 1.9957805907172996, + "grad_norm": 0.31606927514076233, + "learning_rate": 1.2483430799220274e-05, + "loss": 0.0955, + "step": 1065 + }, + { + "epoch": 1.997655883731833, + "grad_norm": 0.3537469804286957, + "learning_rate": 1.2475633528265108e-05, + "loss": 0.0897, + "step": 1066 + }, + { + "epoch": 1.9995311767463666, + "grad_norm": 0.30905476212501526, + "learning_rate": 1.2467836257309942e-05, + "loss": 0.1066, + "step": 1067 + }, + { + "epoch": 2.0, + "grad_norm": 0.5351856350898743, + "learning_rate": 1.2460038986354776e-05, + "loss": 0.0439, + "step": 1068 + }, + { + "epoch": 2.0018752930145336, + "grad_norm": 0.3859151601791382, + "learning_rate": 1.245224171539961e-05, + "loss": 0.086, + "step": 1069 + }, + { + "epoch": 2.003750586029067, + "grad_norm": 0.3007124066352844, + "learning_rate": 1.2444444444444446e-05, + "loss": 0.0868, + "step": 1070 + }, + { + "epoch": 2.0056258790436003, + "grad_norm": 0.26716360449790955, + "learning_rate": 1.243664717348928e-05, + "loss": 0.1024, + "step": 1071 + }, + { + "epoch": 2.007501172058134, + "grad_norm": 0.33358505368232727, + "learning_rate": 1.2428849902534115e-05, + "loss": 0.097, + "step": 1072 + }, + { + "epoch": 2.0093764650726675, + "grad_norm": 0.30429476499557495, + "learning_rate": 1.2421052631578949e-05, + "loss": 0.0773, + "step": 1073 + }, + { + "epoch": 2.011251758087201, + "grad_norm": 0.2670055627822876, + "learning_rate": 1.2413255360623783e-05, + "loss": 0.0705, + "step": 1074 + }, + { + "epoch": 2.0131270511017347, + "grad_norm": 0.30095306038856506, + "learning_rate": 1.2405458089668616e-05, + "loss": 0.0999, + "step": 1075 + }, + { + "epoch": 2.0150023441162683, + "grad_norm": 0.3201741874217987, + "learning_rate": 1.239766081871345e-05, + "loss": 0.1148, + "step": 1076 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 0.31769925355911255, + "learning_rate": 1.2389863547758286e-05, + "loss": 0.0756, + "step": 1077 + }, + { + "epoch": 2.018752930145335, + "grad_norm": 0.3209584951400757, + "learning_rate": 1.238206627680312e-05, + "loss": 0.077, + "step": 1078 + }, + { + "epoch": 2.0206282231598687, + "grad_norm": 0.2504936754703522, + "learning_rate": 1.2374269005847954e-05, + "loss": 0.0819, + "step": 1079 + }, + { + "epoch": 2.0225035161744023, + "grad_norm": 0.31677699089050293, + "learning_rate": 1.2366471734892788e-05, + "loss": 0.0874, + "step": 1080 + }, + { + "epoch": 2.024378809188936, + "grad_norm": 0.35840892791748047, + "learning_rate": 1.2358674463937622e-05, + "loss": 0.0859, + "step": 1081 + }, + { + "epoch": 2.0262541022034695, + "grad_norm": 0.3263484537601471, + "learning_rate": 1.2350877192982458e-05, + "loss": 0.1069, + "step": 1082 + }, + { + "epoch": 2.0281293952180026, + "grad_norm": 0.23933690786361694, + "learning_rate": 1.2343079922027293e-05, + "loss": 0.0628, + "step": 1083 + }, + { + "epoch": 2.030004688232536, + "grad_norm": 0.2623092532157898, + "learning_rate": 1.2335282651072127e-05, + "loss": 0.0698, + "step": 1084 + }, + { + "epoch": 2.03187998124707, + "grad_norm": 0.30900081992149353, + "learning_rate": 1.232748538011696e-05, + "loss": 0.0713, + "step": 1085 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 0.3421262502670288, + "learning_rate": 1.2319688109161793e-05, + "loss": 0.1117, + "step": 1086 + }, + { + "epoch": 2.035630567276137, + "grad_norm": 0.22926147282123566, + "learning_rate": 1.2311890838206628e-05, + "loss": 0.0522, + "step": 1087 + }, + { + "epoch": 2.0375058602906706, + "grad_norm": 0.2777685821056366, + "learning_rate": 1.2304093567251463e-05, + "loss": 0.0524, + "step": 1088 + }, + { + "epoch": 2.0393811533052038, + "grad_norm": 0.3484315276145935, + "learning_rate": 1.2296296296296298e-05, + "loss": 0.0844, + "step": 1089 + }, + { + "epoch": 2.0412564463197373, + "grad_norm": 0.3067864775657654, + "learning_rate": 1.2288499025341132e-05, + "loss": 0.0889, + "step": 1090 + }, + { + "epoch": 2.043131739334271, + "grad_norm": 0.3049250543117523, + "learning_rate": 1.2280701754385966e-05, + "loss": 0.0874, + "step": 1091 + }, + { + "epoch": 2.0450070323488045, + "grad_norm": 0.371250718832016, + "learning_rate": 1.22729044834308e-05, + "loss": 0.0608, + "step": 1092 + }, + { + "epoch": 2.046882325363338, + "grad_norm": 0.35286757349967957, + "learning_rate": 1.2265107212475636e-05, + "loss": 0.0617, + "step": 1093 + }, + { + "epoch": 2.0487576183778717, + "grad_norm": 0.31813135743141174, + "learning_rate": 1.225730994152047e-05, + "loss": 0.072, + "step": 1094 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 0.3504825830459595, + "learning_rate": 1.2249512670565303e-05, + "loss": 0.0917, + "step": 1095 + }, + { + "epoch": 2.0525082044069385, + "grad_norm": 0.3622145354747772, + "learning_rate": 1.2241715399610137e-05, + "loss": 0.1072, + "step": 1096 + }, + { + "epoch": 2.054383497421472, + "grad_norm": 0.2720160186290741, + "learning_rate": 1.2233918128654971e-05, + "loss": 0.0644, + "step": 1097 + }, + { + "epoch": 2.0562587904360057, + "grad_norm": 0.2766894996166229, + "learning_rate": 1.2226120857699805e-05, + "loss": 0.0764, + "step": 1098 + }, + { + "epoch": 2.0581340834505393, + "grad_norm": 0.3748610019683838, + "learning_rate": 1.221832358674464e-05, + "loss": 0.074, + "step": 1099 + }, + { + "epoch": 2.060009376465073, + "grad_norm": 0.2557353377342224, + "learning_rate": 1.2210526315789475e-05, + "loss": 0.0731, + "step": 1100 + }, + { + "epoch": 2.060009376465073, + "eval_loss": 0.07578001916408539, + "eval_runtime": 676.4414, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 1100 + }, + { + "epoch": 2.061884669479606, + "grad_norm": 0.4064513146877289, + "learning_rate": 1.220272904483431e-05, + "loss": 0.0999, + "step": 1101 + }, + { + "epoch": 2.0637599624941396, + "grad_norm": 0.3794358968734741, + "learning_rate": 1.2194931773879144e-05, + "loss": 0.1305, + "step": 1102 + }, + { + "epoch": 2.065635255508673, + "grad_norm": 0.3337356448173523, + "learning_rate": 1.2187134502923978e-05, + "loss": 0.073, + "step": 1103 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 0.310234010219574, + "learning_rate": 1.2179337231968812e-05, + "loss": 0.0899, + "step": 1104 + }, + { + "epoch": 2.0693858415377404, + "grad_norm": 0.28629547357559204, + "learning_rate": 1.2171539961013645e-05, + "loss": 0.0674, + "step": 1105 + }, + { + "epoch": 2.071261134552274, + "grad_norm": 0.31278061866760254, + "learning_rate": 1.216374269005848e-05, + "loss": 0.0724, + "step": 1106 + }, + { + "epoch": 2.073136427566807, + "grad_norm": 0.3247126638889313, + "learning_rate": 1.2155945419103315e-05, + "loss": 0.0771, + "step": 1107 + }, + { + "epoch": 2.0750117205813408, + "grad_norm": 0.3694022297859192, + "learning_rate": 1.2148148148148149e-05, + "loss": 0.0897, + "step": 1108 + }, + { + "epoch": 2.0768870135958744, + "grad_norm": 0.3070647120475769, + "learning_rate": 1.2140350877192983e-05, + "loss": 0.0817, + "step": 1109 + }, + { + "epoch": 2.078762306610408, + "grad_norm": 0.3457229733467102, + "learning_rate": 1.2132553606237817e-05, + "loss": 0.0743, + "step": 1110 + }, + { + "epoch": 2.0806375996249415, + "grad_norm": 0.31315433979034424, + "learning_rate": 1.2124756335282653e-05, + "loss": 0.0742, + "step": 1111 + }, + { + "epoch": 2.0825128926394747, + "grad_norm": 0.31154918670654297, + "learning_rate": 1.2116959064327487e-05, + "loss": 0.0748, + "step": 1112 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 0.3989027738571167, + "learning_rate": 1.2109161793372322e-05, + "loss": 0.1073, + "step": 1113 + }, + { + "epoch": 2.086263478668542, + "grad_norm": 0.3276286721229553, + "learning_rate": 1.2101364522417156e-05, + "loss": 0.1067, + "step": 1114 + }, + { + "epoch": 2.0881387716830755, + "grad_norm": 0.32193440198898315, + "learning_rate": 1.2093567251461988e-05, + "loss": 0.0636, + "step": 1115 + }, + { + "epoch": 2.090014064697609, + "grad_norm": 0.34374260902404785, + "learning_rate": 1.2085769980506823e-05, + "loss": 0.092, + "step": 1116 + }, + { + "epoch": 2.0918893577121427, + "grad_norm": 0.24487826228141785, + "learning_rate": 1.2077972709551657e-05, + "loss": 0.0617, + "step": 1117 + }, + { + "epoch": 2.093764650726676, + "grad_norm": 0.3080878257751465, + "learning_rate": 1.2070175438596493e-05, + "loss": 0.0682, + "step": 1118 + }, + { + "epoch": 2.0956399437412094, + "grad_norm": 0.2777683138847351, + "learning_rate": 1.2062378167641327e-05, + "loss": 0.0716, + "step": 1119 + }, + { + "epoch": 2.097515236755743, + "grad_norm": 0.3162023723125458, + "learning_rate": 1.2054580896686161e-05, + "loss": 0.0841, + "step": 1120 + }, + { + "epoch": 2.0993905297702766, + "grad_norm": 0.4254573881626129, + "learning_rate": 1.2046783625730995e-05, + "loss": 0.0511, + "step": 1121 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 0.33648309111595154, + "learning_rate": 1.203898635477583e-05, + "loss": 0.0949, + "step": 1122 + }, + { + "epoch": 2.103141115799344, + "grad_norm": 0.24535001814365387, + "learning_rate": 1.2031189083820665e-05, + "loss": 0.0637, + "step": 1123 + }, + { + "epoch": 2.105016408813877, + "grad_norm": 0.2758919894695282, + "learning_rate": 1.20233918128655e-05, + "loss": 0.0825, + "step": 1124 + }, + { + "epoch": 2.1068917018284106, + "grad_norm": 0.29626914858818054, + "learning_rate": 1.2015594541910332e-05, + "loss": 0.0829, + "step": 1125 + }, + { + "epoch": 2.108766994842944, + "grad_norm": 0.43505001068115234, + "learning_rate": 1.2007797270955166e-05, + "loss": 0.1068, + "step": 1126 + }, + { + "epoch": 2.1106422878574778, + "grad_norm": 0.29316994547843933, + "learning_rate": 1.2e-05, + "loss": 0.0632, + "step": 1127 + }, + { + "epoch": 2.1125175808720114, + "grad_norm": 0.3550972640514374, + "learning_rate": 1.1992202729044834e-05, + "loss": 0.0824, + "step": 1128 + }, + { + "epoch": 2.114392873886545, + "grad_norm": 0.34816864132881165, + "learning_rate": 1.198440545808967e-05, + "loss": 0.0885, + "step": 1129 + }, + { + "epoch": 2.116268166901078, + "grad_norm": 0.3325449526309967, + "learning_rate": 1.1976608187134505e-05, + "loss": 0.0932, + "step": 1130 + }, + { + "epoch": 2.1181434599156117, + "grad_norm": 0.3157937228679657, + "learning_rate": 1.1968810916179339e-05, + "loss": 0.0735, + "step": 1131 + }, + { + "epoch": 2.1200187529301453, + "grad_norm": 0.32261866331100464, + "learning_rate": 1.1961013645224173e-05, + "loss": 0.0657, + "step": 1132 + }, + { + "epoch": 2.121894045944679, + "grad_norm": 0.36068305373191833, + "learning_rate": 1.1953216374269007e-05, + "loss": 0.0977, + "step": 1133 + }, + { + "epoch": 2.1237693389592125, + "grad_norm": 0.27126020193099976, + "learning_rate": 1.194541910331384e-05, + "loss": 0.0554, + "step": 1134 + }, + { + "epoch": 2.125644631973746, + "grad_norm": 0.3004317879676819, + "learning_rate": 1.1937621832358674e-05, + "loss": 0.0741, + "step": 1135 + }, + { + "epoch": 2.1275199249882792, + "grad_norm": 0.33724191784858704, + "learning_rate": 1.192982456140351e-05, + "loss": 0.1176, + "step": 1136 + }, + { + "epoch": 2.129395218002813, + "grad_norm": 0.34397128224372864, + "learning_rate": 1.1922027290448344e-05, + "loss": 0.0782, + "step": 1137 + }, + { + "epoch": 2.1312705110173464, + "grad_norm": 0.34579211473464966, + "learning_rate": 1.1914230019493178e-05, + "loss": 0.091, + "step": 1138 + }, + { + "epoch": 2.13314580403188, + "grad_norm": 0.27181315422058105, + "learning_rate": 1.1906432748538012e-05, + "loss": 0.0536, + "step": 1139 + }, + { + "epoch": 2.1350210970464136, + "grad_norm": 0.3022139370441437, + "learning_rate": 1.1898635477582846e-05, + "loss": 0.0658, + "step": 1140 + }, + { + "epoch": 2.1368963900609472, + "grad_norm": 0.38249850273132324, + "learning_rate": 1.1890838206627682e-05, + "loss": 0.0708, + "step": 1141 + }, + { + "epoch": 2.1387716830754804, + "grad_norm": 0.3176434636116028, + "learning_rate": 1.1883040935672517e-05, + "loss": 0.0811, + "step": 1142 + }, + { + "epoch": 2.140646976090014, + "grad_norm": 0.40170273184776306, + "learning_rate": 1.187524366471735e-05, + "loss": 0.1152, + "step": 1143 + }, + { + "epoch": 2.1425222691045476, + "grad_norm": 0.3950449824333191, + "learning_rate": 1.1867446393762183e-05, + "loss": 0.1099, + "step": 1144 + }, + { + "epoch": 2.144397562119081, + "grad_norm": 0.28338244557380676, + "learning_rate": 1.1859649122807017e-05, + "loss": 0.0821, + "step": 1145 + }, + { + "epoch": 2.1462728551336148, + "grad_norm": 0.6058647036552429, + "learning_rate": 1.1851851851851852e-05, + "loss": 0.0856, + "step": 1146 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.28024426102638245, + "learning_rate": 1.1844054580896688e-05, + "loss": 0.0695, + "step": 1147 + }, + { + "epoch": 2.1500234411626815, + "grad_norm": 0.3102171719074249, + "learning_rate": 1.1836257309941522e-05, + "loss": 0.0646, + "step": 1148 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 0.30384719371795654, + "learning_rate": 1.1828460038986356e-05, + "loss": 0.0779, + "step": 1149 + }, + { + "epoch": 2.1537740271917487, + "grad_norm": 0.31122729182243347, + "learning_rate": 1.182066276803119e-05, + "loss": 0.0802, + "step": 1150 + }, + { + "epoch": 2.1537740271917487, + "eval_loss": 0.07541442662477493, + "eval_runtime": 675.8929, + "eval_samples_per_second": 0.291, + "eval_steps_per_second": 0.291, + "step": 1150 + }, + { + "epoch": 2.1556493202062823, + "grad_norm": 0.39089834690093994, + "learning_rate": 1.1812865497076024e-05, + "loss": 0.1295, + "step": 1151 + }, + { + "epoch": 2.157524613220816, + "grad_norm": 0.286997526884079, + "learning_rate": 1.180506822612086e-05, + "loss": 0.0582, + "step": 1152 + }, + { + "epoch": 2.1593999062353495, + "grad_norm": 0.2970065772533417, + "learning_rate": 1.1797270955165694e-05, + "loss": 0.0862, + "step": 1153 + }, + { + "epoch": 2.1612751992498827, + "grad_norm": 0.2717902660369873, + "learning_rate": 1.1789473684210527e-05, + "loss": 0.055, + "step": 1154 + }, + { + "epoch": 2.1631504922644162, + "grad_norm": 0.3172938823699951, + "learning_rate": 1.1781676413255361e-05, + "loss": 0.0869, + "step": 1155 + }, + { + "epoch": 2.16502578527895, + "grad_norm": 0.42429736256599426, + "learning_rate": 1.1773879142300195e-05, + "loss": 0.0802, + "step": 1156 + }, + { + "epoch": 2.1669010782934834, + "grad_norm": 0.31430354714393616, + "learning_rate": 1.176608187134503e-05, + "loss": 0.0894, + "step": 1157 + }, + { + "epoch": 2.168776371308017, + "grad_norm": 0.33213984966278076, + "learning_rate": 1.1758284600389864e-05, + "loss": 0.1002, + "step": 1158 + }, + { + "epoch": 2.17065166432255, + "grad_norm": 0.35632964968681335, + "learning_rate": 1.17504873294347e-05, + "loss": 0.0793, + "step": 1159 + }, + { + "epoch": 2.172526957337084, + "grad_norm": 0.376068651676178, + "learning_rate": 1.1742690058479534e-05, + "loss": 0.0858, + "step": 1160 + }, + { + "epoch": 2.1744022503516174, + "grad_norm": 0.3594323694705963, + "learning_rate": 1.1734892787524368e-05, + "loss": 0.0589, + "step": 1161 + }, + { + "epoch": 2.176277543366151, + "grad_norm": 0.37725338339805603, + "learning_rate": 1.1727095516569202e-05, + "loss": 0.1046, + "step": 1162 + }, + { + "epoch": 2.1781528363806846, + "grad_norm": 0.28712713718414307, + "learning_rate": 1.1719298245614036e-05, + "loss": 0.0629, + "step": 1163 + }, + { + "epoch": 2.180028129395218, + "grad_norm": 0.31834840774536133, + "learning_rate": 1.1711500974658869e-05, + "loss": 0.0971, + "step": 1164 + }, + { + "epoch": 2.1819034224097513, + "grad_norm": 0.38184770941734314, + "learning_rate": 1.1703703703703703e-05, + "loss": 0.0971, + "step": 1165 + }, + { + "epoch": 2.183778715424285, + "grad_norm": 0.3888046145439148, + "learning_rate": 1.1695906432748539e-05, + "loss": 0.1172, + "step": 1166 + }, + { + "epoch": 2.1856540084388185, + "grad_norm": 0.3301868736743927, + "learning_rate": 1.1688109161793373e-05, + "loss": 0.0927, + "step": 1167 + }, + { + "epoch": 2.187529301453352, + "grad_norm": 0.3630843460559845, + "learning_rate": 1.1680311890838207e-05, + "loss": 0.0839, + "step": 1168 + }, + { + "epoch": 2.1894045944678857, + "grad_norm": 0.6889258623123169, + "learning_rate": 1.1672514619883041e-05, + "loss": 0.1217, + "step": 1169 + }, + { + "epoch": 2.1912798874824193, + "grad_norm": 0.6070849299430847, + "learning_rate": 1.1664717348927876e-05, + "loss": 0.0926, + "step": 1170 + }, + { + "epoch": 2.1931551804969525, + "grad_norm": 0.3465521037578583, + "learning_rate": 1.1656920077972711e-05, + "loss": 0.086, + "step": 1171 + }, + { + "epoch": 2.195030473511486, + "grad_norm": 0.3908824622631073, + "learning_rate": 1.1649122807017546e-05, + "loss": 0.1146, + "step": 1172 + }, + { + "epoch": 2.1969057665260197, + "grad_norm": 0.28279078006744385, + "learning_rate": 1.164132553606238e-05, + "loss": 0.072, + "step": 1173 + }, + { + "epoch": 2.1987810595405533, + "grad_norm": 0.31403693556785583, + "learning_rate": 1.1633528265107212e-05, + "loss": 0.0871, + "step": 1174 + }, + { + "epoch": 2.200656352555087, + "grad_norm": 0.33991819620132446, + "learning_rate": 1.1625730994152047e-05, + "loss": 0.0876, + "step": 1175 + }, + { + "epoch": 2.2025316455696204, + "grad_norm": 0.3142626881599426, + "learning_rate": 1.161793372319688e-05, + "loss": 0.0724, + "step": 1176 + }, + { + "epoch": 2.2044069385841536, + "grad_norm": 0.3219679594039917, + "learning_rate": 1.1610136452241717e-05, + "loss": 0.0659, + "step": 1177 + }, + { + "epoch": 2.206282231598687, + "grad_norm": 0.3814351260662079, + "learning_rate": 1.160233918128655e-05, + "loss": 0.0708, + "step": 1178 + }, + { + "epoch": 2.208157524613221, + "grad_norm": 0.380666583776474, + "learning_rate": 1.1594541910331385e-05, + "loss": 0.0952, + "step": 1179 + }, + { + "epoch": 2.2100328176277544, + "grad_norm": 0.2872433662414551, + "learning_rate": 1.158674463937622e-05, + "loss": 0.0794, + "step": 1180 + }, + { + "epoch": 2.211908110642288, + "grad_norm": 0.28936418890953064, + "learning_rate": 1.1578947368421053e-05, + "loss": 0.0606, + "step": 1181 + }, + { + "epoch": 2.2137834036568216, + "grad_norm": 0.3666747808456421, + "learning_rate": 1.157115009746589e-05, + "loss": 0.0965, + "step": 1182 + }, + { + "epoch": 2.2156586966713547, + "grad_norm": 0.3257901072502136, + "learning_rate": 1.1563352826510723e-05, + "loss": 0.0796, + "step": 1183 + }, + { + "epoch": 2.2175339896858883, + "grad_norm": 0.47882843017578125, + "learning_rate": 1.1555555555555556e-05, + "loss": 0.0816, + "step": 1184 + }, + { + "epoch": 2.219409282700422, + "grad_norm": 0.2858003079891205, + "learning_rate": 1.154775828460039e-05, + "loss": 0.069, + "step": 1185 + }, + { + "epoch": 2.2212845757149555, + "grad_norm": 0.31669747829437256, + "learning_rate": 1.1539961013645224e-05, + "loss": 0.0842, + "step": 1186 + }, + { + "epoch": 2.223159868729489, + "grad_norm": 0.3197777271270752, + "learning_rate": 1.1532163742690059e-05, + "loss": 0.099, + "step": 1187 + }, + { + "epoch": 2.2250351617440227, + "grad_norm": 0.34864693880081177, + "learning_rate": 1.1524366471734893e-05, + "loss": 0.0849, + "step": 1188 + }, + { + "epoch": 2.226910454758556, + "grad_norm": 0.7603491544723511, + "learning_rate": 1.1516569200779729e-05, + "loss": 0.0865, + "step": 1189 + }, + { + "epoch": 2.2287857477730895, + "grad_norm": 0.3353845775127411, + "learning_rate": 1.1508771929824563e-05, + "loss": 0.0724, + "step": 1190 + }, + { + "epoch": 2.230661040787623, + "grad_norm": 0.27294620871543884, + "learning_rate": 1.1500974658869397e-05, + "loss": 0.0718, + "step": 1191 + }, + { + "epoch": 2.2325363338021567, + "grad_norm": 0.30225276947021484, + "learning_rate": 1.1493177387914231e-05, + "loss": 0.0752, + "step": 1192 + }, + { + "epoch": 2.2344116268166903, + "grad_norm": 0.30490773916244507, + "learning_rate": 1.1485380116959065e-05, + "loss": 0.0653, + "step": 1193 + }, + { + "epoch": 2.2362869198312234, + "grad_norm": 0.306281715631485, + "learning_rate": 1.1477582846003898e-05, + "loss": 0.0747, + "step": 1194 + }, + { + "epoch": 2.238162212845757, + "grad_norm": 0.3963259756565094, + "learning_rate": 1.1469785575048734e-05, + "loss": 0.1006, + "step": 1195 + }, + { + "epoch": 2.2400375058602906, + "grad_norm": 0.3330558240413666, + "learning_rate": 1.1461988304093568e-05, + "loss": 0.0925, + "step": 1196 + }, + { + "epoch": 2.241912798874824, + "grad_norm": 0.27601566910743713, + "learning_rate": 1.1454191033138402e-05, + "loss": 0.0686, + "step": 1197 + }, + { + "epoch": 2.243788091889358, + "grad_norm": 0.412579208612442, + "learning_rate": 1.1446393762183236e-05, + "loss": 0.0789, + "step": 1198 + }, + { + "epoch": 2.2456633849038914, + "grad_norm": 0.39170923829078674, + "learning_rate": 1.143859649122807e-05, + "loss": 0.095, + "step": 1199 + }, + { + "epoch": 2.247538677918425, + "grad_norm": 0.3268628716468811, + "learning_rate": 1.1430799220272906e-05, + "loss": 0.0772, + "step": 1200 + }, + { + "epoch": 2.247538677918425, + "eval_loss": 0.07522810995578766, + "eval_runtime": 675.3854, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1200 + }, + { + "epoch": 2.249413970932958, + "grad_norm": 0.34962981939315796, + "learning_rate": 1.142300194931774e-05, + "loss": 0.0791, + "step": 1201 + }, + { + "epoch": 2.2512892639474917, + "grad_norm": 0.3399597704410553, + "learning_rate": 1.1415204678362575e-05, + "loss": 0.0807, + "step": 1202 + }, + { + "epoch": 2.2531645569620253, + "grad_norm": 0.27295050024986267, + "learning_rate": 1.1407407407407409e-05, + "loss": 0.0607, + "step": 1203 + }, + { + "epoch": 2.255039849976559, + "grad_norm": 0.32791128754615784, + "learning_rate": 1.1399610136452241e-05, + "loss": 0.0548, + "step": 1204 + }, + { + "epoch": 2.2569151429910925, + "grad_norm": 0.35572582483291626, + "learning_rate": 1.1391812865497076e-05, + "loss": 0.0806, + "step": 1205 + }, + { + "epoch": 2.2587904360056257, + "grad_norm": 0.3451082408428192, + "learning_rate": 1.138401559454191e-05, + "loss": 0.0599, + "step": 1206 + }, + { + "epoch": 2.2606657290201593, + "grad_norm": 0.25782617926597595, + "learning_rate": 1.1376218323586746e-05, + "loss": 0.0584, + "step": 1207 + }, + { + "epoch": 2.262541022034693, + "grad_norm": 0.2698267996311188, + "learning_rate": 1.136842105263158e-05, + "loss": 0.067, + "step": 1208 + }, + { + "epoch": 2.2644163150492265, + "grad_norm": 0.2824949026107788, + "learning_rate": 1.1360623781676414e-05, + "loss": 0.0325, + "step": 1209 + }, + { + "epoch": 2.26629160806376, + "grad_norm": 0.30985549092292786, + "learning_rate": 1.1352826510721248e-05, + "loss": 0.0591, + "step": 1210 + }, + { + "epoch": 2.2681669010782937, + "grad_norm": 0.3595530688762665, + "learning_rate": 1.1345029239766083e-05, + "loss": 0.0821, + "step": 1211 + }, + { + "epoch": 2.270042194092827, + "grad_norm": 0.3911292552947998, + "learning_rate": 1.1337231968810918e-05, + "loss": 0.0942, + "step": 1212 + }, + { + "epoch": 2.2719174871073604, + "grad_norm": 0.31286779046058655, + "learning_rate": 1.1329434697855753e-05, + "loss": 0.0703, + "step": 1213 + }, + { + "epoch": 2.273792780121894, + "grad_norm": 0.6129016876220703, + "learning_rate": 1.1321637426900585e-05, + "loss": 0.0904, + "step": 1214 + }, + { + "epoch": 2.2756680731364276, + "grad_norm": 0.35265544056892395, + "learning_rate": 1.131384015594542e-05, + "loss": 0.102, + "step": 1215 + }, + { + "epoch": 2.277543366150961, + "grad_norm": 0.49612244963645935, + "learning_rate": 1.1306042884990253e-05, + "loss": 0.0851, + "step": 1216 + }, + { + "epoch": 2.279418659165495, + "grad_norm": 2.8707265853881836, + "learning_rate": 1.1298245614035088e-05, + "loss": 0.0575, + "step": 1217 + }, + { + "epoch": 2.281293952180028, + "grad_norm": 0.676102340221405, + "learning_rate": 1.1290448343079924e-05, + "loss": 0.0946, + "step": 1218 + }, + { + "epoch": 2.2831692451945615, + "grad_norm": 0.3254333436489105, + "learning_rate": 1.1282651072124758e-05, + "loss": 0.0722, + "step": 1219 + }, + { + "epoch": 2.285044538209095, + "grad_norm": 0.2682390809059143, + "learning_rate": 1.1274853801169592e-05, + "loss": 0.0674, + "step": 1220 + }, + { + "epoch": 2.2869198312236287, + "grad_norm": 0.2639862298965454, + "learning_rate": 1.1267056530214426e-05, + "loss": 0.0515, + "step": 1221 + }, + { + "epoch": 2.2887951242381623, + "grad_norm": 0.30463695526123047, + "learning_rate": 1.125925925925926e-05, + "loss": 0.0683, + "step": 1222 + }, + { + "epoch": 2.290670417252696, + "grad_norm": 0.2600836753845215, + "learning_rate": 1.1251461988304096e-05, + "loss": 0.0588, + "step": 1223 + }, + { + "epoch": 2.292545710267229, + "grad_norm": 0.2697446644306183, + "learning_rate": 1.1243664717348927e-05, + "loss": 0.0519, + "step": 1224 + }, + { + "epoch": 2.2944210032817627, + "grad_norm": 0.34204694628715515, + "learning_rate": 1.1235867446393763e-05, + "loss": 0.0839, + "step": 1225 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.30241382122039795, + "learning_rate": 1.1228070175438597e-05, + "loss": 0.0793, + "step": 1226 + }, + { + "epoch": 2.29817158931083, + "grad_norm": 0.36129674315452576, + "learning_rate": 1.1220272904483431e-05, + "loss": 0.0572, + "step": 1227 + }, + { + "epoch": 2.3000468823253635, + "grad_norm": 0.3861963450908661, + "learning_rate": 1.1212475633528265e-05, + "loss": 0.1096, + "step": 1228 + }, + { + "epoch": 2.3019221753398966, + "grad_norm": 0.2883913516998291, + "learning_rate": 1.12046783625731e-05, + "loss": 0.0614, + "step": 1229 + }, + { + "epoch": 2.3037974683544302, + "grad_norm": 0.3693699538707733, + "learning_rate": 1.1196881091617936e-05, + "loss": 0.0957, + "step": 1230 + }, + { + "epoch": 2.305672761368964, + "grad_norm": 0.35850393772125244, + "learning_rate": 1.118908382066277e-05, + "loss": 0.0849, + "step": 1231 + }, + { + "epoch": 2.3075480543834974, + "grad_norm": 0.35511448979377747, + "learning_rate": 1.1181286549707604e-05, + "loss": 0.1032, + "step": 1232 + }, + { + "epoch": 2.309423347398031, + "grad_norm": 0.5250701308250427, + "learning_rate": 1.1173489278752438e-05, + "loss": 0.1174, + "step": 1233 + }, + { + "epoch": 2.3112986404125646, + "grad_norm": 0.28343549370765686, + "learning_rate": 1.116569200779727e-05, + "loss": 0.0702, + "step": 1234 + }, + { + "epoch": 2.313173933427098, + "grad_norm": 0.3174719512462616, + "learning_rate": 1.1157894736842105e-05, + "loss": 0.0628, + "step": 1235 + }, + { + "epoch": 2.3150492264416314, + "grad_norm": 0.4021863341331482, + "learning_rate": 1.115009746588694e-05, + "loss": 0.0856, + "step": 1236 + }, + { + "epoch": 2.316924519456165, + "grad_norm": 0.4704308807849884, + "learning_rate": 1.1142300194931775e-05, + "loss": 0.0977, + "step": 1237 + }, + { + "epoch": 2.3187998124706986, + "grad_norm": 0.319234162569046, + "learning_rate": 1.1134502923976609e-05, + "loss": 0.0647, + "step": 1238 + }, + { + "epoch": 2.320675105485232, + "grad_norm": 0.3543676435947418, + "learning_rate": 1.1126705653021443e-05, + "loss": 0.0791, + "step": 1239 + }, + { + "epoch": 2.3225503984997657, + "grad_norm": 0.32436403632164, + "learning_rate": 1.1118908382066277e-05, + "loss": 0.0613, + "step": 1240 + }, + { + "epoch": 2.324425691514299, + "grad_norm": 0.33297768235206604, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.0684, + "step": 1241 + }, + { + "epoch": 2.3263009845288325, + "grad_norm": 0.46087008714675903, + "learning_rate": 1.1103313840155948e-05, + "loss": 0.0744, + "step": 1242 + }, + { + "epoch": 2.328176277543366, + "grad_norm": 0.3725678622722626, + "learning_rate": 1.1095516569200782e-05, + "loss": 0.0878, + "step": 1243 + }, + { + "epoch": 2.3300515705578997, + "grad_norm": 0.6072844862937927, + "learning_rate": 1.1087719298245614e-05, + "loss": 0.1077, + "step": 1244 + }, + { + "epoch": 2.3319268635724333, + "grad_norm": 0.32860544323921204, + "learning_rate": 1.1079922027290448e-05, + "loss": 0.0694, + "step": 1245 + }, + { + "epoch": 2.333802156586967, + "grad_norm": 0.2839890122413635, + "learning_rate": 1.1072124756335283e-05, + "loss": 0.0402, + "step": 1246 + }, + { + "epoch": 2.3356774496015005, + "grad_norm": 0.4085495173931122, + "learning_rate": 1.1064327485380117e-05, + "loss": 0.0641, + "step": 1247 + }, + { + "epoch": 2.3375527426160336, + "grad_norm": 0.3415165841579437, + "learning_rate": 1.1056530214424953e-05, + "loss": 0.0643, + "step": 1248 + }, + { + "epoch": 2.3394280356305672, + "grad_norm": 0.34704262018203735, + "learning_rate": 1.1048732943469787e-05, + "loss": 0.0792, + "step": 1249 + }, + { + "epoch": 2.341303328645101, + "grad_norm": 0.2980154752731323, + "learning_rate": 1.1040935672514621e-05, + "loss": 0.0673, + "step": 1250 + }, + { + "epoch": 2.341303328645101, + "eval_loss": 0.07502703368663788, + "eval_runtime": 675.207, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1250 + }, + { + "epoch": 2.3431786216596344, + "grad_norm": 0.27935025095939636, + "learning_rate": 1.1033138401559455e-05, + "loss": 0.0512, + "step": 1251 + }, + { + "epoch": 2.345053914674168, + "grad_norm": 0.42243000864982605, + "learning_rate": 1.102534113060429e-05, + "loss": 0.1033, + "step": 1252 + }, + { + "epoch": 2.346929207688701, + "grad_norm": 0.36166539788246155, + "learning_rate": 1.1017543859649125e-05, + "loss": 0.0787, + "step": 1253 + }, + { + "epoch": 2.3488045007032348, + "grad_norm": 1.874434232711792, + "learning_rate": 1.1009746588693956e-05, + "loss": 0.1049, + "step": 1254 + }, + { + "epoch": 2.3506797937177684, + "grad_norm": 0.30916956067085266, + "learning_rate": 1.1001949317738792e-05, + "loss": 0.0895, + "step": 1255 + }, + { + "epoch": 2.352555086732302, + "grad_norm": 0.37780481576919556, + "learning_rate": 1.0994152046783626e-05, + "loss": 0.089, + "step": 1256 + }, + { + "epoch": 2.3544303797468356, + "grad_norm": 0.3470967411994934, + "learning_rate": 1.098635477582846e-05, + "loss": 0.0827, + "step": 1257 + }, + { + "epoch": 2.356305672761369, + "grad_norm": 0.28173500299453735, + "learning_rate": 1.0978557504873295e-05, + "loss": 0.0831, + "step": 1258 + }, + { + "epoch": 2.3581809657759023, + "grad_norm": 0.32230475544929504, + "learning_rate": 1.0970760233918129e-05, + "loss": 0.0706, + "step": 1259 + }, + { + "epoch": 2.360056258790436, + "grad_norm": 0.2897712290287018, + "learning_rate": 1.0962962962962965e-05, + "loss": 0.0614, + "step": 1260 + }, + { + "epoch": 2.3619315518049695, + "grad_norm": 0.46456751227378845, + "learning_rate": 1.0955165692007799e-05, + "loss": 0.1074, + "step": 1261 + }, + { + "epoch": 2.363806844819503, + "grad_norm": 0.35259947180747986, + "learning_rate": 1.0947368421052633e-05, + "loss": 0.09, + "step": 1262 + }, + { + "epoch": 2.3656821378340367, + "grad_norm": 0.4690414071083069, + "learning_rate": 1.0939571150097467e-05, + "loss": 0.0667, + "step": 1263 + }, + { + "epoch": 2.36755743084857, + "grad_norm": 0.3024637997150421, + "learning_rate": 1.09317738791423e-05, + "loss": 0.0723, + "step": 1264 + }, + { + "epoch": 2.3694327238631034, + "grad_norm": 0.2506920397281647, + "learning_rate": 1.0923976608187134e-05, + "loss": 0.0584, + "step": 1265 + }, + { + "epoch": 2.371308016877637, + "grad_norm": 0.3608229160308838, + "learning_rate": 1.091617933723197e-05, + "loss": 0.0659, + "step": 1266 + }, + { + "epoch": 2.3731833098921706, + "grad_norm": 0.3218965232372284, + "learning_rate": 1.0908382066276804e-05, + "loss": 0.0803, + "step": 1267 + }, + { + "epoch": 2.3750586029067042, + "grad_norm": 0.8374189734458923, + "learning_rate": 1.0900584795321638e-05, + "loss": 0.0654, + "step": 1268 + }, + { + "epoch": 2.376933895921238, + "grad_norm": 0.3754996955394745, + "learning_rate": 1.0892787524366472e-05, + "loss": 0.0868, + "step": 1269 + }, + { + "epoch": 2.3788091889357714, + "grad_norm": 0.928810179233551, + "learning_rate": 1.0884990253411307e-05, + "loss": 0.1044, + "step": 1270 + }, + { + "epoch": 2.3806844819503046, + "grad_norm": 0.3189548850059509, + "learning_rate": 1.0877192982456142e-05, + "loss": 0.0773, + "step": 1271 + }, + { + "epoch": 2.382559774964838, + "grad_norm": 0.3129134178161621, + "learning_rate": 1.0869395711500977e-05, + "loss": 0.0769, + "step": 1272 + }, + { + "epoch": 2.3844350679793718, + "grad_norm": 0.2928052544593811, + "learning_rate": 1.086159844054581e-05, + "loss": 0.0783, + "step": 1273 + }, + { + "epoch": 2.3863103609939054, + "grad_norm": 0.380834698677063, + "learning_rate": 1.0853801169590643e-05, + "loss": 0.0734, + "step": 1274 + }, + { + "epoch": 2.388185654008439, + "grad_norm": 0.28873157501220703, + "learning_rate": 1.0846003898635478e-05, + "loss": 0.0697, + "step": 1275 + }, + { + "epoch": 2.390060947022972, + "grad_norm": 0.2910199463367462, + "learning_rate": 1.0838206627680312e-05, + "loss": 0.0691, + "step": 1276 + }, + { + "epoch": 2.3919362400375057, + "grad_norm": 0.3059178292751312, + "learning_rate": 1.0830409356725146e-05, + "loss": 0.064, + "step": 1277 + }, + { + "epoch": 2.3938115330520393, + "grad_norm": 0.33945995569229126, + "learning_rate": 1.0822612085769982e-05, + "loss": 0.0589, + "step": 1278 + }, + { + "epoch": 2.395686826066573, + "grad_norm": 0.35901668667793274, + "learning_rate": 1.0814814814814816e-05, + "loss": 0.0798, + "step": 1279 + }, + { + "epoch": 2.3975621190811065, + "grad_norm": 0.2913598418235779, + "learning_rate": 1.080701754385965e-05, + "loss": 0.0609, + "step": 1280 + }, + { + "epoch": 2.39943741209564, + "grad_norm": 0.30235838890075684, + "learning_rate": 1.0799220272904484e-05, + "loss": 0.0696, + "step": 1281 + }, + { + "epoch": 2.4013127051101737, + "grad_norm": 0.29583993554115295, + "learning_rate": 1.0791423001949319e-05, + "loss": 0.0813, + "step": 1282 + }, + { + "epoch": 2.403187998124707, + "grad_norm": 0.34998810291290283, + "learning_rate": 1.0783625730994154e-05, + "loss": 0.0782, + "step": 1283 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 0.3048896789550781, + "learning_rate": 1.0775828460038987e-05, + "loss": 0.0579, + "step": 1284 + }, + { + "epoch": 2.406938584153774, + "grad_norm": 0.22718416154384613, + "learning_rate": 1.0768031189083821e-05, + "loss": 0.0662, + "step": 1285 + }, + { + "epoch": 2.4088138771683076, + "grad_norm": 0.2834322154521942, + "learning_rate": 1.0760233918128655e-05, + "loss": 0.0615, + "step": 1286 + }, + { + "epoch": 2.4106891701828412, + "grad_norm": 0.36230021715164185, + "learning_rate": 1.075243664717349e-05, + "loss": 0.0743, + "step": 1287 + }, + { + "epoch": 2.4125644631973744, + "grad_norm": 0.2996331453323364, + "learning_rate": 1.0744639376218324e-05, + "loss": 0.0763, + "step": 1288 + }, + { + "epoch": 2.414439756211908, + "grad_norm": 0.3829980790615082, + "learning_rate": 1.073684210526316e-05, + "loss": 0.06, + "step": 1289 + }, + { + "epoch": 2.4163150492264416, + "grad_norm": 0.36401882767677307, + "learning_rate": 1.0729044834307994e-05, + "loss": 0.091, + "step": 1290 + }, + { + "epoch": 2.418190342240975, + "grad_norm": 0.38041242957115173, + "learning_rate": 1.0721247563352828e-05, + "loss": 0.073, + "step": 1291 + }, + { + "epoch": 2.4200656352555088, + "grad_norm": 0.4183158576488495, + "learning_rate": 1.0713450292397662e-05, + "loss": 0.0889, + "step": 1292 + }, + { + "epoch": 2.4219409282700424, + "grad_norm": 0.30840641260147095, + "learning_rate": 1.0705653021442496e-05, + "loss": 0.0878, + "step": 1293 + }, + { + "epoch": 2.4238162212845755, + "grad_norm": 1.0156400203704834, + "learning_rate": 1.0697855750487329e-05, + "loss": 0.0942, + "step": 1294 + }, + { + "epoch": 2.425691514299109, + "grad_norm": 0.26748737692832947, + "learning_rate": 1.0690058479532163e-05, + "loss": 0.0535, + "step": 1295 + }, + { + "epoch": 2.4275668073136427, + "grad_norm": 0.3611546456813812, + "learning_rate": 1.0682261208576999e-05, + "loss": 0.0684, + "step": 1296 + }, + { + "epoch": 2.4294421003281763, + "grad_norm": 0.4616575539112091, + "learning_rate": 1.0674463937621833e-05, + "loss": 0.0916, + "step": 1297 + }, + { + "epoch": 2.43131739334271, + "grad_norm": 0.37064328789711, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.0871, + "step": 1298 + }, + { + "epoch": 2.4331926863572435, + "grad_norm": 0.30871546268463135, + "learning_rate": 1.0658869395711502e-05, + "loss": 0.0725, + "step": 1299 + }, + { + "epoch": 2.4350679793717767, + "grad_norm": 0.45006898045539856, + "learning_rate": 1.0651072124756336e-05, + "loss": 0.1114, + "step": 1300 + }, + { + "epoch": 2.4350679793717767, + "eval_loss": 0.07475950568914413, + "eval_runtime": 674.4121, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1300 + }, + { + "epoch": 2.4369432723863103, + "grad_norm": 0.410637229681015, + "learning_rate": 1.0643274853801172e-05, + "loss": 0.0854, + "step": 1301 + }, + { + "epoch": 2.438818565400844, + "grad_norm": 0.40461596846580505, + "learning_rate": 1.0635477582846006e-05, + "loss": 0.1023, + "step": 1302 + }, + { + "epoch": 2.4406938584153774, + "grad_norm": 0.47095608711242676, + "learning_rate": 1.062768031189084e-05, + "loss": 0.097, + "step": 1303 + }, + { + "epoch": 2.442569151429911, + "grad_norm": 0.3717558979988098, + "learning_rate": 1.0619883040935672e-05, + "loss": 0.076, + "step": 1304 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.3064993619918823, + "learning_rate": 1.0612085769980507e-05, + "loss": 0.06, + "step": 1305 + }, + { + "epoch": 2.446319737458978, + "grad_norm": 0.380657821893692, + "learning_rate": 1.060428849902534e-05, + "loss": 0.0997, + "step": 1306 + }, + { + "epoch": 2.4481950304735114, + "grad_norm": 0.3160291612148285, + "learning_rate": 1.0596491228070177e-05, + "loss": 0.0548, + "step": 1307 + }, + { + "epoch": 2.450070323488045, + "grad_norm": 0.377288818359375, + "learning_rate": 1.0588693957115011e-05, + "loss": 0.0803, + "step": 1308 + }, + { + "epoch": 2.4519456165025786, + "grad_norm": 0.27472472190856934, + "learning_rate": 1.0580896686159845e-05, + "loss": 0.0477, + "step": 1309 + }, + { + "epoch": 2.453820909517112, + "grad_norm": 0.3660586476325989, + "learning_rate": 1.057309941520468e-05, + "loss": 0.075, + "step": 1310 + }, + { + "epoch": 2.4556962025316453, + "grad_norm": 1.1128591299057007, + "learning_rate": 1.0565302144249513e-05, + "loss": 0.1068, + "step": 1311 + }, + { + "epoch": 2.457571495546179, + "grad_norm": 0.3397810757160187, + "learning_rate": 1.055750487329435e-05, + "loss": 0.0689, + "step": 1312 + }, + { + "epoch": 2.4594467885607125, + "grad_norm": 0.28418809175491333, + "learning_rate": 1.0549707602339184e-05, + "loss": 0.078, + "step": 1313 + }, + { + "epoch": 2.461322081575246, + "grad_norm": 0.30030253529548645, + "learning_rate": 1.0541910331384016e-05, + "loss": 0.0504, + "step": 1314 + }, + { + "epoch": 2.4631973745897797, + "grad_norm": 0.306923508644104, + "learning_rate": 1.053411306042885e-05, + "loss": 0.0516, + "step": 1315 + }, + { + "epoch": 2.4650726676043133, + "grad_norm": 0.2753802239894867, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.0733, + "step": 1316 + }, + { + "epoch": 2.466947960618847, + "grad_norm": 0.4096115231513977, + "learning_rate": 1.0518518518518519e-05, + "loss": 0.1417, + "step": 1317 + }, + { + "epoch": 2.46882325363338, + "grad_norm": 0.45057594776153564, + "learning_rate": 1.0510721247563353e-05, + "loss": 0.082, + "step": 1318 + }, + { + "epoch": 2.4706985466479137, + "grad_norm": 0.2660753130912781, + "learning_rate": 1.0502923976608189e-05, + "loss": 0.0762, + "step": 1319 + }, + { + "epoch": 2.4725738396624473, + "grad_norm": 0.3073916435241699, + "learning_rate": 1.0495126705653023e-05, + "loss": 0.0603, + "step": 1320 + }, + { + "epoch": 2.474449132676981, + "grad_norm": 0.45714837312698364, + "learning_rate": 1.0487329434697857e-05, + "loss": 0.0975, + "step": 1321 + }, + { + "epoch": 2.4763244256915145, + "grad_norm": 0.34037506580352783, + "learning_rate": 1.0479532163742691e-05, + "loss": 0.0837, + "step": 1322 + }, + { + "epoch": 2.4781997187060476, + "grad_norm": 0.41261813044548035, + "learning_rate": 1.0471734892787525e-05, + "loss": 0.1105, + "step": 1323 + }, + { + "epoch": 2.480075011720581, + "grad_norm": 0.3210998475551605, + "learning_rate": 1.0463937621832358e-05, + "loss": 0.0582, + "step": 1324 + }, + { + "epoch": 2.481950304735115, + "grad_norm": 0.37079358100891113, + "learning_rate": 1.0456140350877194e-05, + "loss": 0.0842, + "step": 1325 + }, + { + "epoch": 2.4838255977496484, + "grad_norm": 0.3284546434879303, + "learning_rate": 1.0448343079922028e-05, + "loss": 0.0634, + "step": 1326 + }, + { + "epoch": 2.485700890764182, + "grad_norm": 0.28334537148475647, + "learning_rate": 1.0440545808966862e-05, + "loss": 0.0451, + "step": 1327 + }, + { + "epoch": 2.4875761837787156, + "grad_norm": 0.3325977027416229, + "learning_rate": 1.0432748538011696e-05, + "loss": 0.0798, + "step": 1328 + }, + { + "epoch": 2.489451476793249, + "grad_norm": 0.4519757330417633, + "learning_rate": 1.042495126705653e-05, + "loss": 0.1076, + "step": 1329 + }, + { + "epoch": 2.4913267698077823, + "grad_norm": 0.313209593296051, + "learning_rate": 1.0417153996101367e-05, + "loss": 0.068, + "step": 1330 + }, + { + "epoch": 2.493202062822316, + "grad_norm": 0.35849788784980774, + "learning_rate": 1.04093567251462e-05, + "loss": 0.1017, + "step": 1331 + }, + { + "epoch": 2.4950773558368495, + "grad_norm": 0.34800082445144653, + "learning_rate": 1.0401559454191035e-05, + "loss": 0.0732, + "step": 1332 + }, + { + "epoch": 2.496952648851383, + "grad_norm": 0.41697457432746887, + "learning_rate": 1.0393762183235869e-05, + "loss": 0.0898, + "step": 1333 + }, + { + "epoch": 2.4988279418659167, + "grad_norm": 0.9164847731590271, + "learning_rate": 1.0385964912280702e-05, + "loss": 0.0843, + "step": 1334 + }, + { + "epoch": 2.50070323488045, + "grad_norm": 0.3565974533557892, + "learning_rate": 1.0378167641325536e-05, + "loss": 0.0821, + "step": 1335 + }, + { + "epoch": 2.5025785278949835, + "grad_norm": 0.37837761640548706, + "learning_rate": 1.037037037037037e-05, + "loss": 0.0807, + "step": 1336 + }, + { + "epoch": 2.504453820909517, + "grad_norm": 0.30958202481269836, + "learning_rate": 1.0362573099415206e-05, + "loss": 0.0623, + "step": 1337 + }, + { + "epoch": 2.5063291139240507, + "grad_norm": 0.29366958141326904, + "learning_rate": 1.035477582846004e-05, + "loss": 0.0592, + "step": 1338 + }, + { + "epoch": 2.5082044069385843, + "grad_norm": 0.3304692506790161, + "learning_rate": 1.0346978557504874e-05, + "loss": 0.0606, + "step": 1339 + }, + { + "epoch": 2.510079699953118, + "grad_norm": 0.3314734697341919, + "learning_rate": 1.0339181286549708e-05, + "loss": 0.078, + "step": 1340 + }, + { + "epoch": 2.5119549929676515, + "grad_norm": 0.2976376712322235, + "learning_rate": 1.0331384015594543e-05, + "loss": 0.0737, + "step": 1341 + }, + { + "epoch": 2.5138302859821846, + "grad_norm": 0.3934020400047302, + "learning_rate": 1.0323586744639378e-05, + "loss": 0.0567, + "step": 1342 + }, + { + "epoch": 2.515705578996718, + "grad_norm": 0.35475224256515503, + "learning_rate": 1.0315789473684213e-05, + "loss": 0.0785, + "step": 1343 + }, + { + "epoch": 2.517580872011252, + "grad_norm": 0.2962830364704132, + "learning_rate": 1.0307992202729045e-05, + "loss": 0.0693, + "step": 1344 + }, + { + "epoch": 2.5194561650257854, + "grad_norm": 0.4114619195461273, + "learning_rate": 1.030019493177388e-05, + "loss": 0.1023, + "step": 1345 + }, + { + "epoch": 2.5213314580403186, + "grad_norm": 0.3815486431121826, + "learning_rate": 1.0292397660818714e-05, + "loss": 0.079, + "step": 1346 + }, + { + "epoch": 2.523206751054852, + "grad_norm": 0.4123891592025757, + "learning_rate": 1.0284600389863548e-05, + "loss": 0.1021, + "step": 1347 + }, + { + "epoch": 2.5250820440693857, + "grad_norm": 0.3692156672477722, + "learning_rate": 1.0276803118908382e-05, + "loss": 0.0983, + "step": 1348 + }, + { + "epoch": 2.5269573370839193, + "grad_norm": 0.3262293338775635, + "learning_rate": 1.0269005847953218e-05, + "loss": 0.096, + "step": 1349 + }, + { + "epoch": 2.528832630098453, + "grad_norm": 1.2034567594528198, + "learning_rate": 1.0261208576998052e-05, + "loss": 0.0772, + "step": 1350 + }, + { + "epoch": 2.528832630098453, + "eval_loss": 0.07395470887422562, + "eval_runtime": 674.8172, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1350 + }, + { + "epoch": 2.5307079231129865, + "grad_norm": 0.3334695100784302, + "learning_rate": 1.0253411306042886e-05, + "loss": 0.0581, + "step": 1351 + }, + { + "epoch": 2.53258321612752, + "grad_norm": 0.4798605442047119, + "learning_rate": 1.024561403508772e-05, + "loss": 0.0992, + "step": 1352 + }, + { + "epoch": 2.5344585091420533, + "grad_norm": 0.3962152600288391, + "learning_rate": 1.0237816764132555e-05, + "loss": 0.0782, + "step": 1353 + }, + { + "epoch": 2.536333802156587, + "grad_norm": 0.3626305162906647, + "learning_rate": 1.0230019493177387e-05, + "loss": 0.1039, + "step": 1354 + }, + { + "epoch": 2.5382090951711205, + "grad_norm": 0.6571470499038696, + "learning_rate": 1.0222222222222223e-05, + "loss": 0.0758, + "step": 1355 + }, + { + "epoch": 2.540084388185654, + "grad_norm": 0.26701807975769043, + "learning_rate": 1.0214424951267057e-05, + "loss": 0.0533, + "step": 1356 + }, + { + "epoch": 2.5419596812001877, + "grad_norm": 0.23627153038978577, + "learning_rate": 1.0206627680311891e-05, + "loss": 0.0486, + "step": 1357 + }, + { + "epoch": 2.543834974214721, + "grad_norm": 0.2959972620010376, + "learning_rate": 1.0198830409356726e-05, + "loss": 0.0725, + "step": 1358 + }, + { + "epoch": 2.5457102672292544, + "grad_norm": 0.9190336465835571, + "learning_rate": 1.019103313840156e-05, + "loss": 0.0876, + "step": 1359 + }, + { + "epoch": 2.547585560243788, + "grad_norm": 0.3755362033843994, + "learning_rate": 1.0183235867446396e-05, + "loss": 0.088, + "step": 1360 + }, + { + "epoch": 2.5494608532583216, + "grad_norm": 0.3958858847618103, + "learning_rate": 1.017543859649123e-05, + "loss": 0.1089, + "step": 1361 + }, + { + "epoch": 2.551336146272855, + "grad_norm": 0.26009872555732727, + "learning_rate": 1.0167641325536064e-05, + "loss": 0.0635, + "step": 1362 + }, + { + "epoch": 2.553211439287389, + "grad_norm": 0.3636474311351776, + "learning_rate": 1.0159844054580898e-05, + "loss": 0.0619, + "step": 1363 + }, + { + "epoch": 2.5550867323019224, + "grad_norm": 0.4022858440876007, + "learning_rate": 1.015204678362573e-05, + "loss": 0.0611, + "step": 1364 + }, + { + "epoch": 2.5569620253164556, + "grad_norm": 0.44928935170173645, + "learning_rate": 1.0144249512670565e-05, + "loss": 0.0787, + "step": 1365 + }, + { + "epoch": 2.558837318330989, + "grad_norm": 0.40881165862083435, + "learning_rate": 1.0136452241715399e-05, + "loss": 0.0693, + "step": 1366 + }, + { + "epoch": 2.5607126113455227, + "grad_norm": 0.3600151836872101, + "learning_rate": 1.0128654970760235e-05, + "loss": 0.1056, + "step": 1367 + }, + { + "epoch": 2.5625879043600563, + "grad_norm": 0.4044286608695984, + "learning_rate": 1.012085769980507e-05, + "loss": 0.1205, + "step": 1368 + }, + { + "epoch": 2.56446319737459, + "grad_norm": 0.3752521872520447, + "learning_rate": 1.0113060428849903e-05, + "loss": 0.0906, + "step": 1369 + }, + { + "epoch": 2.566338490389123, + "grad_norm": 0.4615623652935028, + "learning_rate": 1.0105263157894738e-05, + "loss": 0.1037, + "step": 1370 + }, + { + "epoch": 2.5682137834036567, + "grad_norm": 0.27481046319007874, + "learning_rate": 1.0097465886939572e-05, + "loss": 0.0464, + "step": 1371 + }, + { + "epoch": 2.5700890764181903, + "grad_norm": 0.4228805601596832, + "learning_rate": 1.0089668615984408e-05, + "loss": 0.1014, + "step": 1372 + }, + { + "epoch": 2.571964369432724, + "grad_norm": 0.33466285467147827, + "learning_rate": 1.0081871345029242e-05, + "loss": 0.0808, + "step": 1373 + }, + { + "epoch": 2.5738396624472575, + "grad_norm": 0.2553812861442566, + "learning_rate": 1.0074074074074074e-05, + "loss": 0.0516, + "step": 1374 + }, + { + "epoch": 2.575714955461791, + "grad_norm": 0.400551438331604, + "learning_rate": 1.0066276803118908e-05, + "loss": 0.07, + "step": 1375 + }, + { + "epoch": 2.5775902484763247, + "grad_norm": 0.3270658552646637, + "learning_rate": 1.0058479532163743e-05, + "loss": 0.0657, + "step": 1376 + }, + { + "epoch": 2.579465541490858, + "grad_norm": 0.3232525587081909, + "learning_rate": 1.0050682261208577e-05, + "loss": 0.0741, + "step": 1377 + }, + { + "epoch": 2.5813408345053914, + "grad_norm": 0.3367152810096741, + "learning_rate": 1.0042884990253413e-05, + "loss": 0.0884, + "step": 1378 + }, + { + "epoch": 2.583216127519925, + "grad_norm": 0.4030762314796448, + "learning_rate": 1.0035087719298247e-05, + "loss": 0.0941, + "step": 1379 + }, + { + "epoch": 2.5850914205344586, + "grad_norm": 0.31161463260650635, + "learning_rate": 1.0027290448343081e-05, + "loss": 0.0782, + "step": 1380 + }, + { + "epoch": 2.5869667135489918, + "grad_norm": 0.39888834953308105, + "learning_rate": 1.0019493177387915e-05, + "loss": 0.0821, + "step": 1381 + }, + { + "epoch": 2.5888420065635254, + "grad_norm": 0.3201531767845154, + "learning_rate": 1.001169590643275e-05, + "loss": 0.0638, + "step": 1382 + }, + { + "epoch": 2.590717299578059, + "grad_norm": 0.32830819487571716, + "learning_rate": 1.0003898635477585e-05, + "loss": 0.0777, + "step": 1383 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.2944835424423218, + "learning_rate": 9.996101364522418e-06, + "loss": 0.0653, + "step": 1384 + }, + { + "epoch": 2.594467885607126, + "grad_norm": 0.45982295274734497, + "learning_rate": 9.988304093567252e-06, + "loss": 0.1088, + "step": 1385 + }, + { + "epoch": 2.5963431786216598, + "grad_norm": 0.3153984248638153, + "learning_rate": 9.980506822612086e-06, + "loss": 0.0653, + "step": 1386 + }, + { + "epoch": 2.5982184716361933, + "grad_norm": 0.41158658266067505, + "learning_rate": 9.97270955165692e-06, + "loss": 0.0848, + "step": 1387 + }, + { + "epoch": 2.600093764650727, + "grad_norm": 0.4037891924381256, + "learning_rate": 9.964912280701755e-06, + "loss": 0.1319, + "step": 1388 + }, + { + "epoch": 2.60196905766526, + "grad_norm": 0.4108717739582062, + "learning_rate": 9.957115009746589e-06, + "loss": 0.1257, + "step": 1389 + }, + { + "epoch": 2.6038443506797937, + "grad_norm": 0.30166369676589966, + "learning_rate": 9.949317738791425e-06, + "loss": 0.0695, + "step": 1390 + }, + { + "epoch": 2.6057196436943273, + "grad_norm": 0.3084668517112732, + "learning_rate": 9.941520467836257e-06, + "loss": 0.0787, + "step": 1391 + }, + { + "epoch": 2.607594936708861, + "grad_norm": 0.5487121939659119, + "learning_rate": 9.933723196881091e-06, + "loss": 0.1363, + "step": 1392 + }, + { + "epoch": 2.609470229723394, + "grad_norm": 0.3409046530723572, + "learning_rate": 9.925925925925927e-06, + "loss": 0.0781, + "step": 1393 + }, + { + "epoch": 2.6113455227379276, + "grad_norm": 0.35762616991996765, + "learning_rate": 9.918128654970762e-06, + "loss": 0.077, + "step": 1394 + }, + { + "epoch": 2.6132208157524612, + "grad_norm": 0.3606860935688019, + "learning_rate": 9.910331384015596e-06, + "loss": 0.0701, + "step": 1395 + }, + { + "epoch": 2.615096108766995, + "grad_norm": 0.33565056324005127, + "learning_rate": 9.90253411306043e-06, + "loss": 0.0705, + "step": 1396 + }, + { + "epoch": 2.6169714017815284, + "grad_norm": 0.35233068466186523, + "learning_rate": 9.894736842105264e-06, + "loss": 0.0727, + "step": 1397 + }, + { + "epoch": 2.618846694796062, + "grad_norm": 0.7528280019760132, + "learning_rate": 9.886939571150098e-06, + "loss": 0.0846, + "step": 1398 + }, + { + "epoch": 2.6207219878105956, + "grad_norm": 0.3784114420413971, + "learning_rate": 9.879142300194932e-06, + "loss": 0.0894, + "step": 1399 + }, + { + "epoch": 2.6225972808251288, + "grad_norm": 0.2998042106628418, + "learning_rate": 9.871345029239767e-06, + "loss": 0.0729, + "step": 1400 + }, + { + "epoch": 2.6225972808251288, + "eval_loss": 0.07494800537824631, + "eval_runtime": 675.6374, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1400 + }, + { + "epoch": 2.6244725738396624, + "grad_norm": 0.45033907890319824, + "learning_rate": 9.863547758284601e-06, + "loss": 0.0543, + "step": 1401 + }, + { + "epoch": 2.626347866854196, + "grad_norm": 0.33740511536598206, + "learning_rate": 9.855750487329435e-06, + "loss": 0.0887, + "step": 1402 + }, + { + "epoch": 2.6282231598687296, + "grad_norm": 0.4127645492553711, + "learning_rate": 9.84795321637427e-06, + "loss": 0.0915, + "step": 1403 + }, + { + "epoch": 2.630098452883263, + "grad_norm": 0.3304232656955719, + "learning_rate": 9.840155945419103e-06, + "loss": 0.0788, + "step": 1404 + }, + { + "epoch": 2.6319737458977963, + "grad_norm": 0.36797669529914856, + "learning_rate": 9.83235867446394e-06, + "loss": 0.0873, + "step": 1405 + }, + { + "epoch": 2.63384903891233, + "grad_norm": 0.2819061577320099, + "learning_rate": 9.824561403508772e-06, + "loss": 0.0618, + "step": 1406 + }, + { + "epoch": 2.6357243319268635, + "grad_norm": 0.52513587474823, + "learning_rate": 9.816764132553606e-06, + "loss": 0.0824, + "step": 1407 + }, + { + "epoch": 2.637599624941397, + "grad_norm": 0.37794506549835205, + "learning_rate": 9.808966861598442e-06, + "loss": 0.0629, + "step": 1408 + }, + { + "epoch": 2.6394749179559307, + "grad_norm": 0.35513389110565186, + "learning_rate": 9.801169590643276e-06, + "loss": 0.1138, + "step": 1409 + }, + { + "epoch": 2.6413502109704643, + "grad_norm": 0.36185532808303833, + "learning_rate": 9.79337231968811e-06, + "loss": 0.0829, + "step": 1410 + }, + { + "epoch": 2.643225503984998, + "grad_norm": 0.3326586186885834, + "learning_rate": 9.785575048732944e-06, + "loss": 0.0579, + "step": 1411 + }, + { + "epoch": 2.645100796999531, + "grad_norm": 0.364624947309494, + "learning_rate": 9.777777777777779e-06, + "loss": 0.095, + "step": 1412 + }, + { + "epoch": 2.6469760900140646, + "grad_norm": 0.3595339059829712, + "learning_rate": 9.769980506822613e-06, + "loss": 0.0816, + "step": 1413 + }, + { + "epoch": 2.6488513830285982, + "grad_norm": 0.25556787848472595, + "learning_rate": 9.762183235867447e-06, + "loss": 0.0403, + "step": 1414 + }, + { + "epoch": 2.650726676043132, + "grad_norm": 0.3111402094364166, + "learning_rate": 9.754385964912281e-06, + "loss": 0.0652, + "step": 1415 + }, + { + "epoch": 2.652601969057665, + "grad_norm": 0.3296256363391876, + "learning_rate": 9.746588693957115e-06, + "loss": 0.0722, + "step": 1416 + }, + { + "epoch": 2.6544772620721986, + "grad_norm": 0.38634583353996277, + "learning_rate": 9.73879142300195e-06, + "loss": 0.0659, + "step": 1417 + }, + { + "epoch": 2.656352555086732, + "grad_norm": 0.35213443636894226, + "learning_rate": 9.730994152046784e-06, + "loss": 0.0671, + "step": 1418 + }, + { + "epoch": 2.6582278481012658, + "grad_norm": 0.3570035696029663, + "learning_rate": 9.72319688109162e-06, + "loss": 0.061, + "step": 1419 + }, + { + "epoch": 2.6601031411157994, + "grad_norm": 0.33103039860725403, + "learning_rate": 9.715399610136454e-06, + "loss": 0.0693, + "step": 1420 + }, + { + "epoch": 2.661978434130333, + "grad_norm": 0.33688971400260925, + "learning_rate": 9.707602339181286e-06, + "loss": 0.0913, + "step": 1421 + }, + { + "epoch": 2.6638537271448666, + "grad_norm": 0.4284787178039551, + "learning_rate": 9.69980506822612e-06, + "loss": 0.0908, + "step": 1422 + }, + { + "epoch": 2.6657290201594, + "grad_norm": 0.46354052424430847, + "learning_rate": 9.692007797270956e-06, + "loss": 0.0702, + "step": 1423 + }, + { + "epoch": 2.6676043131739333, + "grad_norm": 0.32590043544769287, + "learning_rate": 9.68421052631579e-06, + "loss": 0.0664, + "step": 1424 + }, + { + "epoch": 2.669479606188467, + "grad_norm": 0.39951497316360474, + "learning_rate": 9.676413255360625e-06, + "loss": 0.1049, + "step": 1425 + }, + { + "epoch": 2.6713548992030005, + "grad_norm": 0.38672441244125366, + "learning_rate": 9.668615984405459e-06, + "loss": 0.0707, + "step": 1426 + }, + { + "epoch": 2.673230192217534, + "grad_norm": 0.33832356333732605, + "learning_rate": 9.660818713450293e-06, + "loss": 0.0766, + "step": 1427 + }, + { + "epoch": 2.6751054852320673, + "grad_norm": 0.38057902455329895, + "learning_rate": 9.653021442495127e-06, + "loss": 0.0622, + "step": 1428 + }, + { + "epoch": 2.676980778246601, + "grad_norm": 0.2721557021141052, + "learning_rate": 9.645224171539962e-06, + "loss": 0.0585, + "step": 1429 + }, + { + "epoch": 2.6788560712611345, + "grad_norm": 0.3114902973175049, + "learning_rate": 9.637426900584796e-06, + "loss": 0.0599, + "step": 1430 + }, + { + "epoch": 2.680731364275668, + "grad_norm": 0.4293680787086487, + "learning_rate": 9.62962962962963e-06, + "loss": 0.095, + "step": 1431 + }, + { + "epoch": 2.6826066572902016, + "grad_norm": 0.4264834523200989, + "learning_rate": 9.621832358674464e-06, + "loss": 0.0989, + "step": 1432 + }, + { + "epoch": 2.6844819503047352, + "grad_norm": 0.45689934492111206, + "learning_rate": 9.614035087719298e-06, + "loss": 0.1043, + "step": 1433 + }, + { + "epoch": 2.686357243319269, + "grad_norm": 0.3924713432788849, + "learning_rate": 9.606237816764134e-06, + "loss": 0.0931, + "step": 1434 + }, + { + "epoch": 2.688232536333802, + "grad_norm": 0.4446573853492737, + "learning_rate": 9.598440545808968e-06, + "loss": 0.1063, + "step": 1435 + }, + { + "epoch": 2.6901078293483356, + "grad_norm": 0.45865702629089355, + "learning_rate": 9.590643274853801e-06, + "loss": 0.1034, + "step": 1436 + }, + { + "epoch": 2.691983122362869, + "grad_norm": 0.39059358835220337, + "learning_rate": 9.582846003898635e-06, + "loss": 0.0587, + "step": 1437 + }, + { + "epoch": 2.693858415377403, + "grad_norm": 0.678318202495575, + "learning_rate": 9.575048732943471e-06, + "loss": 0.1233, + "step": 1438 + }, + { + "epoch": 2.6957337083919364, + "grad_norm": 0.3813665211200714, + "learning_rate": 9.567251461988305e-06, + "loss": 0.1088, + "step": 1439 + }, + { + "epoch": 2.6976090014064695, + "grad_norm": 0.43635404109954834, + "learning_rate": 9.55945419103314e-06, + "loss": 0.111, + "step": 1440 + }, + { + "epoch": 2.699484294421003, + "grad_norm": 0.5471066236495972, + "learning_rate": 9.551656920077974e-06, + "loss": 0.082, + "step": 1441 + }, + { + "epoch": 2.7013595874355367, + "grad_norm": 0.31651976704597473, + "learning_rate": 9.543859649122808e-06, + "loss": 0.0533, + "step": 1442 + }, + { + "epoch": 2.7032348804500703, + "grad_norm": 0.3988685607910156, + "learning_rate": 9.536062378167642e-06, + "loss": 0.0786, + "step": 1443 + }, + { + "epoch": 2.705110173464604, + "grad_norm": 1.229805827140808, + "learning_rate": 9.528265107212476e-06, + "loss": 0.0704, + "step": 1444 + }, + { + "epoch": 2.7069854664791375, + "grad_norm": 0.3627307415008545, + "learning_rate": 9.52046783625731e-06, + "loss": 0.0857, + "step": 1445 + }, + { + "epoch": 2.708860759493671, + "grad_norm": 0.2482980489730835, + "learning_rate": 9.512670565302145e-06, + "loss": 0.0583, + "step": 1446 + }, + { + "epoch": 2.7107360525082043, + "grad_norm": 0.33016133308410645, + "learning_rate": 9.504873294346979e-06, + "loss": 0.0678, + "step": 1447 + }, + { + "epoch": 2.712611345522738, + "grad_norm": 0.3658045828342438, + "learning_rate": 9.497076023391813e-06, + "loss": 0.0926, + "step": 1448 + }, + { + "epoch": 2.7144866385372715, + "grad_norm": 0.3729887008666992, + "learning_rate": 9.489278752436649e-06, + "loss": 0.0698, + "step": 1449 + }, + { + "epoch": 2.716361931551805, + "grad_norm": 0.36187222599983215, + "learning_rate": 9.481481481481483e-06, + "loss": 0.0651, + "step": 1450 + }, + { + "epoch": 2.716361931551805, + "eval_loss": 0.07403497397899628, + "eval_runtime": 673.9675, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1450 + }, + { + "epoch": 2.7182372245663387, + "grad_norm": 0.3599952757358551, + "learning_rate": 9.473684210526315e-06, + "loss": 0.0779, + "step": 1451 + }, + { + "epoch": 2.720112517580872, + "grad_norm": 0.6131613850593567, + "learning_rate": 9.465886939571151e-06, + "loss": 0.1098, + "step": 1452 + }, + { + "epoch": 2.7219878105954054, + "grad_norm": 0.32765620946884155, + "learning_rate": 9.458089668615986e-06, + "loss": 0.0596, + "step": 1453 + }, + { + "epoch": 2.723863103609939, + "grad_norm": 0.34976986050605774, + "learning_rate": 9.45029239766082e-06, + "loss": 0.0649, + "step": 1454 + }, + { + "epoch": 2.7257383966244726, + "grad_norm": 0.39676281809806824, + "learning_rate": 9.442495126705654e-06, + "loss": 0.0835, + "step": 1455 + }, + { + "epoch": 2.727613689639006, + "grad_norm": 0.3270690143108368, + "learning_rate": 9.434697855750488e-06, + "loss": 0.0987, + "step": 1456 + }, + { + "epoch": 2.72948898265354, + "grad_norm": 0.40614816546440125, + "learning_rate": 9.426900584795322e-06, + "loss": 0.0847, + "step": 1457 + }, + { + "epoch": 2.7313642756680734, + "grad_norm": 0.42256006598472595, + "learning_rate": 9.419103313840157e-06, + "loss": 0.08, + "step": 1458 + }, + { + "epoch": 2.7332395686826065, + "grad_norm": 0.34114959836006165, + "learning_rate": 9.41130604288499e-06, + "loss": 0.0618, + "step": 1459 + }, + { + "epoch": 2.73511486169714, + "grad_norm": 0.30372029542922974, + "learning_rate": 9.403508771929825e-06, + "loss": 0.0598, + "step": 1460 + }, + { + "epoch": 2.7369901547116737, + "grad_norm": 0.439656525850296, + "learning_rate": 9.395711500974659e-06, + "loss": 0.0857, + "step": 1461 + }, + { + "epoch": 2.7388654477262073, + "grad_norm": 0.36488088965415955, + "learning_rate": 9.387914230019493e-06, + "loss": 0.081, + "step": 1462 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 0.2475002259016037, + "learning_rate": 9.380116959064327e-06, + "loss": 0.0479, + "step": 1463 + }, + { + "epoch": 2.742616033755274, + "grad_norm": 0.3404149115085602, + "learning_rate": 9.372319688109163e-06, + "loss": 0.07, + "step": 1464 + }, + { + "epoch": 2.7444913267698077, + "grad_norm": 0.4210808575153351, + "learning_rate": 9.364522417153998e-06, + "loss": 0.094, + "step": 1465 + }, + { + "epoch": 2.7463666197843413, + "grad_norm": 0.4397883713245392, + "learning_rate": 9.35672514619883e-06, + "loss": 0.0938, + "step": 1466 + }, + { + "epoch": 2.748241912798875, + "grad_norm": 0.4252859950065613, + "learning_rate": 9.348927875243666e-06, + "loss": 0.0688, + "step": 1467 + }, + { + "epoch": 2.7501172058134085, + "grad_norm": 0.39042210578918457, + "learning_rate": 9.3411306042885e-06, + "loss": 0.0929, + "step": 1468 + }, + { + "epoch": 2.751992498827942, + "grad_norm": 0.29327455163002014, + "learning_rate": 9.333333333333334e-06, + "loss": 0.0375, + "step": 1469 + }, + { + "epoch": 2.7538677918424757, + "grad_norm": 0.5631712675094604, + "learning_rate": 9.325536062378169e-06, + "loss": 0.0884, + "step": 1470 + }, + { + "epoch": 2.755743084857009, + "grad_norm": 0.9923895001411438, + "learning_rate": 9.317738791423003e-06, + "loss": 0.1044, + "step": 1471 + }, + { + "epoch": 2.7576183778715424, + "grad_norm": 0.4112682640552521, + "learning_rate": 9.309941520467837e-06, + "loss": 0.1121, + "step": 1472 + }, + { + "epoch": 2.759493670886076, + "grad_norm": 0.3854548931121826, + "learning_rate": 9.302144249512671e-06, + "loss": 0.0859, + "step": 1473 + }, + { + "epoch": 2.7613689639006096, + "grad_norm": 0.46496057510375977, + "learning_rate": 9.294346978557505e-06, + "loss": 0.1247, + "step": 1474 + }, + { + "epoch": 2.7632442569151427, + "grad_norm": 0.4549020230770111, + "learning_rate": 9.28654970760234e-06, + "loss": 0.0826, + "step": 1475 + }, + { + "epoch": 2.7651195499296763, + "grad_norm": 0.3205399811267853, + "learning_rate": 9.278752436647174e-06, + "loss": 0.0791, + "step": 1476 + }, + { + "epoch": 2.76699484294421, + "grad_norm": 0.41249391436576843, + "learning_rate": 9.270955165692008e-06, + "loss": 0.101, + "step": 1477 + }, + { + "epoch": 2.7688701359587435, + "grad_norm": 0.3202146291732788, + "learning_rate": 9.263157894736842e-06, + "loss": 0.0538, + "step": 1478 + }, + { + "epoch": 2.770745428973277, + "grad_norm": 0.34748685359954834, + "learning_rate": 9.255360623781678e-06, + "loss": 0.0803, + "step": 1479 + }, + { + "epoch": 2.7726207219878107, + "grad_norm": 0.3739032745361328, + "learning_rate": 9.247563352826512e-06, + "loss": 0.0908, + "step": 1480 + }, + { + "epoch": 2.7744960150023443, + "grad_norm": 0.420016884803772, + "learning_rate": 9.239766081871345e-06, + "loss": 0.0991, + "step": 1481 + }, + { + "epoch": 2.7763713080168775, + "grad_norm": 0.4077822268009186, + "learning_rate": 9.23196881091618e-06, + "loss": 0.0954, + "step": 1482 + }, + { + "epoch": 2.778246601031411, + "grad_norm": 0.30607473850250244, + "learning_rate": 9.224171539961015e-06, + "loss": 0.0585, + "step": 1483 + }, + { + "epoch": 2.7801218940459447, + "grad_norm": 0.4178578853607178, + "learning_rate": 9.216374269005849e-06, + "loss": 0.0904, + "step": 1484 + }, + { + "epoch": 2.7819971870604783, + "grad_norm": 0.37330979108810425, + "learning_rate": 9.208576998050683e-06, + "loss": 0.0743, + "step": 1485 + }, + { + "epoch": 2.783872480075012, + "grad_norm": 0.3077203631401062, + "learning_rate": 9.200779727095517e-06, + "loss": 0.0803, + "step": 1486 + }, + { + "epoch": 2.785747773089545, + "grad_norm": 0.3057253360748291, + "learning_rate": 9.192982456140351e-06, + "loss": 0.0621, + "step": 1487 + }, + { + "epoch": 2.7876230661040786, + "grad_norm": 0.31388983130455017, + "learning_rate": 9.185185185185186e-06, + "loss": 0.074, + "step": 1488 + }, + { + "epoch": 2.789498359118612, + "grad_norm": 0.3483256995677948, + "learning_rate": 9.17738791423002e-06, + "loss": 0.0828, + "step": 1489 + }, + { + "epoch": 2.791373652133146, + "grad_norm": 0.2892071306705475, + "learning_rate": 9.169590643274856e-06, + "loss": 0.0716, + "step": 1490 + }, + { + "epoch": 2.7932489451476794, + "grad_norm": 0.39768117666244507, + "learning_rate": 9.161793372319688e-06, + "loss": 0.093, + "step": 1491 + }, + { + "epoch": 2.795124238162213, + "grad_norm": 0.408591628074646, + "learning_rate": 9.153996101364522e-06, + "loss": 0.0962, + "step": 1492 + }, + { + "epoch": 2.7969995311767466, + "grad_norm": 1.1204290390014648, + "learning_rate": 9.146198830409357e-06, + "loss": 0.0881, + "step": 1493 + }, + { + "epoch": 2.7988748241912798, + "grad_norm": 0.38295555114746094, + "learning_rate": 9.138401559454192e-06, + "loss": 0.0959, + "step": 1494 + }, + { + "epoch": 2.8007501172058133, + "grad_norm": 0.33753833174705505, + "learning_rate": 9.130604288499027e-06, + "loss": 0.1013, + "step": 1495 + }, + { + "epoch": 2.802625410220347, + "grad_norm": 0.32718968391418457, + "learning_rate": 9.12280701754386e-06, + "loss": 0.0674, + "step": 1496 + }, + { + "epoch": 2.8045007032348805, + "grad_norm": 0.3198351562023163, + "learning_rate": 9.115009746588695e-06, + "loss": 0.0626, + "step": 1497 + }, + { + "epoch": 2.8063759962494137, + "grad_norm": 0.5296013951301575, + "learning_rate": 9.10721247563353e-06, + "loss": 0.1014, + "step": 1498 + }, + { + "epoch": 2.8082512892639473, + "grad_norm": 0.4616522789001465, + "learning_rate": 9.099415204678363e-06, + "loss": 0.0927, + "step": 1499 + }, + { + "epoch": 2.810126582278481, + "grad_norm": 0.41021260619163513, + "learning_rate": 9.091617933723198e-06, + "loss": 0.0658, + "step": 1500 + }, + { + "epoch": 2.810126582278481, + "eval_loss": 0.07440079748630524, + "eval_runtime": 671.7286, + "eval_samples_per_second": 0.293, + "eval_steps_per_second": 0.293, + "step": 1500 + }, + { + "epoch": 2.8120018752930145, + "grad_norm": 0.30366870760917664, + "learning_rate": 9.083820662768032e-06, + "loss": 0.0542, + "step": 1501 + }, + { + "epoch": 2.813877168307548, + "grad_norm": 0.484829843044281, + "learning_rate": 9.076023391812866e-06, + "loss": 0.1142, + "step": 1502 + }, + { + "epoch": 2.8157524613220817, + "grad_norm": 0.4134244918823242, + "learning_rate": 9.0682261208577e-06, + "loss": 0.1089, + "step": 1503 + }, + { + "epoch": 2.8176277543366153, + "grad_norm": 0.34767600893974304, + "learning_rate": 9.060428849902534e-06, + "loss": 0.0834, + "step": 1504 + }, + { + "epoch": 2.819503047351149, + "grad_norm": 0.32123830914497375, + "learning_rate": 9.05263157894737e-06, + "loss": 0.0647, + "step": 1505 + }, + { + "epoch": 2.821378340365682, + "grad_norm": 0.2995455861091614, + "learning_rate": 9.044834307992203e-06, + "loss": 0.051, + "step": 1506 + }, + { + "epoch": 2.8232536333802156, + "grad_norm": 0.2944452464580536, + "learning_rate": 9.037037037037037e-06, + "loss": 0.0508, + "step": 1507 + }, + { + "epoch": 2.825128926394749, + "grad_norm": 0.4798865020275116, + "learning_rate": 9.029239766081873e-06, + "loss": 0.0894, + "step": 1508 + }, + { + "epoch": 2.827004219409283, + "grad_norm": 0.3530484437942505, + "learning_rate": 9.021442495126707e-06, + "loss": 0.0788, + "step": 1509 + }, + { + "epoch": 2.828879512423816, + "grad_norm": 0.328976571559906, + "learning_rate": 9.013645224171541e-06, + "loss": 0.0723, + "step": 1510 + }, + { + "epoch": 2.8307548054383496, + "grad_norm": 0.4133010506629944, + "learning_rate": 9.005847953216374e-06, + "loss": 0.041, + "step": 1511 + }, + { + "epoch": 2.832630098452883, + "grad_norm": 0.407266765832901, + "learning_rate": 8.99805068226121e-06, + "loss": 0.0664, + "step": 1512 + }, + { + "epoch": 2.8345053914674168, + "grad_norm": 0.3167998194694519, + "learning_rate": 8.990253411306044e-06, + "loss": 0.0653, + "step": 1513 + }, + { + "epoch": 2.8363806844819504, + "grad_norm": 0.36208438873291016, + "learning_rate": 8.982456140350878e-06, + "loss": 0.0538, + "step": 1514 + }, + { + "epoch": 2.838255977496484, + "grad_norm": 0.5785827040672302, + "learning_rate": 8.974658869395712e-06, + "loss": 0.1072, + "step": 1515 + }, + { + "epoch": 2.8401312705110175, + "grad_norm": 0.4494905471801758, + "learning_rate": 8.966861598440546e-06, + "loss": 0.0961, + "step": 1516 + }, + { + "epoch": 2.842006563525551, + "grad_norm": 0.2944582998752594, + "learning_rate": 8.95906432748538e-06, + "loss": 0.0481, + "step": 1517 + }, + { + "epoch": 2.8438818565400843, + "grad_norm": 0.31694504618644714, + "learning_rate": 8.951267056530215e-06, + "loss": 0.07, + "step": 1518 + }, + { + "epoch": 2.845757149554618, + "grad_norm": 0.33348628878593445, + "learning_rate": 8.943469785575049e-06, + "loss": 0.0719, + "step": 1519 + }, + { + "epoch": 2.8476324425691515, + "grad_norm": 0.26419350504875183, + "learning_rate": 8.935672514619885e-06, + "loss": 0.0496, + "step": 1520 + }, + { + "epoch": 2.849507735583685, + "grad_norm": 0.422234445810318, + "learning_rate": 8.927875243664717e-06, + "loss": 0.0972, + "step": 1521 + }, + { + "epoch": 2.8513830285982182, + "grad_norm": 0.4871450364589691, + "learning_rate": 8.920077972709552e-06, + "loss": 0.1091, + "step": 1522 + }, + { + "epoch": 2.853258321612752, + "grad_norm": 0.39795181155204773, + "learning_rate": 8.912280701754387e-06, + "loss": 0.0972, + "step": 1523 + }, + { + "epoch": 2.8551336146272854, + "grad_norm": 0.3796606957912445, + "learning_rate": 8.904483430799222e-06, + "loss": 0.071, + "step": 1524 + }, + { + "epoch": 2.857008907641819, + "grad_norm": 0.3433877229690552, + "learning_rate": 8.896686159844056e-06, + "loss": 0.087, + "step": 1525 + }, + { + "epoch": 2.8588842006563526, + "grad_norm": 0.3441942036151886, + "learning_rate": 8.888888888888888e-06, + "loss": 0.0537, + "step": 1526 + }, + { + "epoch": 2.8607594936708862, + "grad_norm": 0.5122886896133423, + "learning_rate": 8.881091617933724e-06, + "loss": 0.104, + "step": 1527 + }, + { + "epoch": 2.86263478668542, + "grad_norm": 0.3636258542537689, + "learning_rate": 8.873294346978558e-06, + "loss": 0.0622, + "step": 1528 + }, + { + "epoch": 2.864510079699953, + "grad_norm": 0.39862120151519775, + "learning_rate": 8.865497076023393e-06, + "loss": 0.0827, + "step": 1529 + }, + { + "epoch": 2.8663853727144866, + "grad_norm": 0.4351373612880707, + "learning_rate": 8.857699805068227e-06, + "loss": 0.1058, + "step": 1530 + }, + { + "epoch": 2.86826066572902, + "grad_norm": 1.1896051168441772, + "learning_rate": 8.849902534113061e-06, + "loss": 0.0846, + "step": 1531 + }, + { + "epoch": 2.8701359587435538, + "grad_norm": 0.4099143147468567, + "learning_rate": 8.842105263157895e-06, + "loss": 0.0858, + "step": 1532 + }, + { + "epoch": 2.8720112517580874, + "grad_norm": 0.4123772978782654, + "learning_rate": 8.83430799220273e-06, + "loss": 0.0738, + "step": 1533 + }, + { + "epoch": 2.8738865447726205, + "grad_norm": 0.39457976818084717, + "learning_rate": 8.826510721247564e-06, + "loss": 0.0598, + "step": 1534 + }, + { + "epoch": 2.875761837787154, + "grad_norm": 0.36988794803619385, + "learning_rate": 8.8187134502924e-06, + "loss": 0.0887, + "step": 1535 + }, + { + "epoch": 2.8776371308016877, + "grad_norm": 0.4902806282043457, + "learning_rate": 8.810916179337232e-06, + "loss": 0.0715, + "step": 1536 + }, + { + "epoch": 2.8795124238162213, + "grad_norm": 0.31671905517578125, + "learning_rate": 8.803118908382066e-06, + "loss": 0.0586, + "step": 1537 + }, + { + "epoch": 2.881387716830755, + "grad_norm": 0.33224424719810486, + "learning_rate": 8.795321637426902e-06, + "loss": 0.0644, + "step": 1538 + }, + { + "epoch": 2.8832630098452885, + "grad_norm": 0.3493894338607788, + "learning_rate": 8.787524366471736e-06, + "loss": 0.0737, + "step": 1539 + }, + { + "epoch": 2.885138302859822, + "grad_norm": 0.3095395863056183, + "learning_rate": 8.77972709551657e-06, + "loss": 0.0632, + "step": 1540 + }, + { + "epoch": 2.8870135958743552, + "grad_norm": 0.283744215965271, + "learning_rate": 8.771929824561405e-06, + "loss": 0.0652, + "step": 1541 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.487179696559906, + "learning_rate": 8.764132553606239e-06, + "loss": 0.0984, + "step": 1542 + }, + { + "epoch": 2.8907641819034224, + "grad_norm": 0.287124365568161, + "learning_rate": 8.756335282651073e-06, + "loss": 0.0511, + "step": 1543 + }, + { + "epoch": 2.892639474917956, + "grad_norm": 0.3281533718109131, + "learning_rate": 8.748538011695907e-06, + "loss": 0.0639, + "step": 1544 + }, + { + "epoch": 2.894514767932489, + "grad_norm": 0.399172306060791, + "learning_rate": 8.740740740740741e-06, + "loss": 0.0539, + "step": 1545 + }, + { + "epoch": 2.896390060947023, + "grad_norm": 0.35018855333328247, + "learning_rate": 8.732943469785576e-06, + "loss": 0.0751, + "step": 1546 + }, + { + "epoch": 2.8982653539615564, + "grad_norm": 0.3269667327404022, + "learning_rate": 8.72514619883041e-06, + "loss": 0.0528, + "step": 1547 + }, + { + "epoch": 2.90014064697609, + "grad_norm": 0.4104063808917999, + "learning_rate": 8.717348927875244e-06, + "loss": 0.0827, + "step": 1548 + }, + { + "epoch": 2.9020159399906236, + "grad_norm": 0.4464809000492096, + "learning_rate": 8.709551656920078e-06, + "loss": 0.1076, + "step": 1549 + }, + { + "epoch": 2.903891233005157, + "grad_norm": 0.3798385560512543, + "learning_rate": 8.701754385964914e-06, + "loss": 0.0625, + "step": 1550 + }, + { + "epoch": 2.903891233005157, + "eval_loss": 0.07311470806598663, + "eval_runtime": 675.3623, + "eval_samples_per_second": 0.292, + "eval_steps_per_second": 0.292, + "step": 1550 + } + ], + "logging_steps": 1, + "max_steps": 2665, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3036945883587664e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}