| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9924139799512326, | |
| "eval_steps": 58, | |
| "global_step": 460, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004336630979807562, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 2.173913043478261e-07, | |
| "loss": 1.0297, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.004336630979807562, | |
| "eval_loss": 1.1468182802200317, | |
| "eval_runtime": 109.2361, | |
| "eval_samples_per_second": 7.909, | |
| "eval_steps_per_second": 1.977, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008673261959615123, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 4.347826086956522e-07, | |
| "loss": 1.0442, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.013009892939422686, | |
| "grad_norm": 8.75, | |
| "learning_rate": 6.521739130434783e-07, | |
| "loss": 1.0301, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.017346523919230247, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8.695652173913044e-07, | |
| "loss": 1.0477, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.02168315489903781, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 1.0869565217391306e-06, | |
| "loss": 1.038, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02601978587884537, | |
| "grad_norm": 6.0, | |
| "learning_rate": 1.3043478260869566e-06, | |
| "loss": 1.0415, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.030356416858652934, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 1.521739130434783e-06, | |
| "loss": 1.0229, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.03469304783846049, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 1.7391304347826088e-06, | |
| "loss": 1.0266, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.039029678818268056, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 1.956521739130435e-06, | |
| "loss": 1.0237, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.04336630979807562, | |
| "grad_norm": 6.15625, | |
| "learning_rate": 2.173913043478261e-06, | |
| "loss": 1.0316, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04770294077788318, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 2.391304347826087e-06, | |
| "loss": 1.0134, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.05203957175769074, | |
| "grad_norm": 6.375, | |
| "learning_rate": 2.6086956521739132e-06, | |
| "loss": 1.0288, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.056376202737498306, | |
| "grad_norm": 4.71875, | |
| "learning_rate": 2.8260869565217393e-06, | |
| "loss": 1.0208, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.06071283371730587, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 3.043478260869566e-06, | |
| "loss": 0.9986, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06504946469711342, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 3.2608695652173914e-06, | |
| "loss": 1.0102, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06938609567692099, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 3.4782608695652175e-06, | |
| "loss": 0.991, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.07372272665672855, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 3.6956521739130436e-06, | |
| "loss": 0.9974, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.07805935763653611, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 3.91304347826087e-06, | |
| "loss": 0.9997, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.08239598861634367, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.130434782608696e-06, | |
| "loss": 0.9896, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08673261959615124, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 4.347826086956522e-06, | |
| "loss": 0.973, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0910692505759588, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.565217391304348e-06, | |
| "loss": 0.9764, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.09540588155576636, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 4.782608695652174e-06, | |
| "loss": 0.9461, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.09974251253557392, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 5e-06, | |
| "loss": 0.9355, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.10407914351538149, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 5.2173913043478265e-06, | |
| "loss": 0.9725, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.10841577449518905, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 5.4347826086956525e-06, | |
| "loss": 0.9244, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.11275240547499661, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 5.652173913043479e-06, | |
| "loss": 0.929, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.11708903645480417, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 5.8695652173913055e-06, | |
| "loss": 0.9434, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.12142566743461174, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 6.086956521739132e-06, | |
| "loss": 0.9331, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.1257622984144193, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 6.304347826086958e-06, | |
| "loss": 0.9264, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.13009892939422685, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 6.521739130434783e-06, | |
| "loss": 0.9114, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.13443556037403442, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 6.739130434782609e-06, | |
| "loss": 0.9277, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.13877219135384197, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 6.956521739130435e-06, | |
| "loss": 0.9154, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.14310882233364955, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 7.173913043478261e-06, | |
| "loss": 0.939, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1474454533134571, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 7.391304347826087e-06, | |
| "loss": 0.9146, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.15178208429326467, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 7.608695652173914e-06, | |
| "loss": 0.9063, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.15611871527307222, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 7.82608695652174e-06, | |
| "loss": 0.9178, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.1604553462528798, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 8.043478260869566e-06, | |
| "loss": 0.9184, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.16479197723268735, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 8.260869565217392e-06, | |
| "loss": 0.911, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.16912860821249492, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 8.478260869565218e-06, | |
| "loss": 0.9031, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.17346523919230247, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 8.695652173913044e-06, | |
| "loss": 0.8881, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.17780187017211005, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 8.91304347826087e-06, | |
| "loss": 0.8846, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1821385011519176, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 9.130434782608697e-06, | |
| "loss": 0.8895, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.18647513213172517, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 9.347826086956523e-06, | |
| "loss": 0.8683, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.19081176311153272, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 9.565217391304349e-06, | |
| "loss": 0.8795, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.1951483940913403, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.782608695652175e-06, | |
| "loss": 0.8829, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.19948502507114785, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8703, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.20382165605095542, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 9.999856041607732e-06, | |
| "loss": 0.8702, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.20815828703076297, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 9.99942417472053e-06, | |
| "loss": 0.869, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.21249491801057055, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 9.998704424206747e-06, | |
| "loss": 0.8748, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.2168315489903781, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 9.997696831512027e-06, | |
| "loss": 0.8737, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22116817997018567, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 9.996401454656941e-06, | |
| "loss": 0.8745, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.22550481094999322, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 9.994818368233639e-06, | |
| "loss": 0.8677, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.2298414419298008, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 9.992947663401548e-06, | |
| "loss": 0.863, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.23417807290960835, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 9.990789447882136e-06, | |
| "loss": 0.8709, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.2385147038894159, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 9.988343845952697e-06, | |
| "loss": 0.8543, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.24285133486922347, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 9.985610998439198e-06, | |
| "loss": 0.8735, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.24718796584903102, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 9.982591062708172e-06, | |
| "loss": 0.8631, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.2515245968288386, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 9.979284212657658e-06, | |
| "loss": 0.8512, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.2515245968288386, | |
| "eval_loss": 0.8729492425918579, | |
| "eval_runtime": 109.2389, | |
| "eval_samples_per_second": 7.909, | |
| "eval_steps_per_second": 1.977, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.2558612278086462, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 9.97569063870718e-06, | |
| "loss": 0.8554, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.2601978587884537, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 9.971810547786794e-06, | |
| "loss": 0.8661, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.26453448976826127, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 9.967644163325157e-06, | |
| "loss": 0.8592, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.26887112074806885, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 9.963191725236672e-06, | |
| "loss": 0.8614, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.2732077517278764, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 9.958453489907673e-06, | |
| "loss": 0.8555, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.27754438270768395, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 9.953429730181653e-06, | |
| "loss": 0.8572, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.2818810136874915, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 9.948120735343566e-06, | |
| "loss": 0.8583, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2862176446672991, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 9.942526811103153e-06, | |
| "loss": 0.8433, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.2905542756471067, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 9.93664827957735e-06, | |
| "loss": 0.8505, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.2948909066269142, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 9.930485479271735e-06, | |
| "loss": 0.8403, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.29922753760672177, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 9.924038765061042e-06, | |
| "loss": 0.8585, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.30356416858652935, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.917308508168712e-06, | |
| "loss": 0.8567, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3079007995663369, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 9.91029509614553e-06, | |
| "loss": 0.8543, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.31223743054614445, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 9.902998932847308e-06, | |
| "loss": 0.8752, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.316574061525952, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 9.895420438411616e-06, | |
| "loss": 0.8535, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.3209106925057596, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 9.887560049233606e-06, | |
| "loss": 0.8601, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.3252473234855672, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 9.879418217940872e-06, | |
| "loss": 0.8543, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3295839544653747, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 9.870995413367397e-06, | |
| "loss": 0.8113, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.33392058544518227, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 9.862292120526536e-06, | |
| "loss": 0.8583, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.33825721642498985, | |
| "grad_norm": 0.447265625, | |
| "learning_rate": 9.85330884058311e-06, | |
| "loss": 0.832, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.3425938474047974, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 9.844046090824533e-06, | |
| "loss": 0.8271, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.34693047838460495, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 9.834504404631032e-06, | |
| "loss": 0.8503, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3512671093644125, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 9.824684331444926e-06, | |
| "loss": 0.8189, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.3556037403442201, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 9.814586436738998e-06, | |
| "loss": 0.8465, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3599403713240276, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 9.804211301983919e-06, | |
| "loss": 0.8159, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.3642770023038352, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 9.793559524614779e-06, | |
| "loss": 0.8392, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.36861363328364277, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 9.782631717996675e-06, | |
| "loss": 0.8291, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.37295026426345035, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 9.771428511389395e-06, | |
| "loss": 0.8398, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.37728689524325787, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 9.759950549911185e-06, | |
| "loss": 0.8499, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.38162352622306545, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 9.748198494501598e-06, | |
| "loss": 0.8244, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.385960157202873, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 9.736173021883433e-06, | |
| "loss": 0.8281, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.3902967881826806, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 9.72387482452377e-06, | |
| "loss": 0.8165, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3946334191624881, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 9.711304610594104e-06, | |
| "loss": 0.8329, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.3989700501422957, | |
| "grad_norm": 0.375, | |
| "learning_rate": 9.698463103929542e-06, | |
| "loss": 0.8218, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.40330668112210327, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 9.685351043987151e-06, | |
| "loss": 0.8132, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.40764331210191085, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 9.671969185803357e-06, | |
| "loss": 0.8357, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.41197994308171837, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 9.658318299950473e-06, | |
| "loss": 0.8352, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.41631657406152595, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 9.644399172492337e-06, | |
| "loss": 0.8112, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.4206532050413335, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 9.630212604939026e-06, | |
| "loss": 0.8376, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4249898360211411, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 9.615759414200729e-06, | |
| "loss": 0.8304, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.4293264670009486, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 9.601040432540684e-06, | |
| "loss": 0.8403, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.4336630979807562, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 9.586056507527266e-06, | |
| "loss": 0.8331, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4379997289605638, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 9.570808501985176e-06, | |
| "loss": 0.8268, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.44233635994037135, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 9.55529729394576e-06, | |
| "loss": 0.8264, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.44667299092017887, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 9.539523776596446e-06, | |
| "loss": 0.8235, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.45100962189998645, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 9.523488858229313e-06, | |
| "loss": 0.8276, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.455346252879794, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 9.507193462188791e-06, | |
| "loss": 0.8142, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4596828838596016, | |
| "grad_norm": 0.375, | |
| "learning_rate": 9.490638526818482e-06, | |
| "loss": 0.8092, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.4640195148394091, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 9.47382500540714e-06, | |
| "loss": 0.8256, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.4683561458192167, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 9.45675386613377e-06, | |
| "loss": 0.8342, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.4726927767990243, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 9.439426092011877e-06, | |
| "loss": 0.8087, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.4770294077788318, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 9.421842680832862e-06, | |
| "loss": 0.8316, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.48136603875863937, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 9.40400464510857e-06, | |
| "loss": 0.8257, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.48570266973844695, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 9.385913012012972e-06, | |
| "loss": 0.8246, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.4900393007182545, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 9.367568823323039e-06, | |
| "loss": 0.8206, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.49437593169806204, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 9.348973135358734e-06, | |
| "loss": 0.8358, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.4987125626778696, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 9.330127018922195e-06, | |
| "loss": 0.8017, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5030491936576772, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 9.311031559236067e-06, | |
| "loss": 0.8496, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.5030491936576772, | |
| "eval_loss": 0.8192870616912842, | |
| "eval_runtime": 109.0108, | |
| "eval_samples_per_second": 7.926, | |
| "eval_steps_per_second": 1.981, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.5073858246374847, | |
| "grad_norm": 0.404296875, | |
| "learning_rate": 9.291687855881027e-06, | |
| "loss": 0.8147, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.5117224556172923, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 9.272097022732444e-06, | |
| "loss": 0.8277, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5160590865970999, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 9.252260187896257e-06, | |
| "loss": 0.8212, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5203957175769074, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 9.232178493644006e-06, | |
| "loss": 0.8375, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.524732348556715, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 9.211853096347059e-06, | |
| "loss": 0.8386, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.5290689795365225, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 9.191285166410023e-06, | |
| "loss": 0.8118, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.5334056105163302, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 9.170475888203348e-06, | |
| "loss": 0.8181, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5377422414961377, | |
| "grad_norm": 0.375, | |
| "learning_rate": 9.149426459995127e-06, | |
| "loss": 0.8213, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5420788724759452, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 9.128138093882098e-06, | |
| "loss": 0.8392, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5464155034557528, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 9.106612015719845e-06, | |
| "loss": 0.8286, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5507521344355604, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 9.08484946505221e-06, | |
| "loss": 0.8324, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5550887654153679, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 9.062851695039915e-06, | |
| "loss": 0.8271, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.5594253963951755, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 9.040619972388402e-06, | |
| "loss": 0.8179, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.563762027374983, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 9.018155577274891e-06, | |
| "loss": 0.8214, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5680986583547907, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 8.995459803274664e-06, | |
| "loss": 0.8255, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.5724352893345982, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 8.972533957286574e-06, | |
| "loss": 0.8167, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.5767719203144057, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 8.949379359457795e-06, | |
| "loss": 0.8012, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5811085512942133, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 8.925997343107796e-06, | |
| "loss": 0.8182, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.5854451822740209, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 8.902389254651568e-06, | |
| "loss": 0.8073, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5897818132538284, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 8.8785564535221e-06, | |
| "loss": 0.8195, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.594118444233636, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 8.854500312092081e-06, | |
| "loss": 0.8292, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.5984550752134435, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 8.83022221559489e-06, | |
| "loss": 0.8204, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.6027917061932511, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 8.805723562044825e-06, | |
| "loss": 0.8175, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.6071283371730587, | |
| "grad_norm": 0.375, | |
| "learning_rate": 8.781005762156593e-06, | |
| "loss": 0.8044, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6114649681528662, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 8.756070239264089e-06, | |
| "loss": 0.8187, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.6158015991326738, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 8.730918429238429e-06, | |
| "loss": 0.8164, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.6201382301124814, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 8.705551780405264e-06, | |
| "loss": 0.8051, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.6244748610922889, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 8.679971753461388e-06, | |
| "loss": 0.8127, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.6288114920720965, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 8.65417982139062e-06, | |
| "loss": 0.8283, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.633148123051904, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 8.628177469378995e-06, | |
| "loss": 0.8169, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.6374847540317116, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 8.601966194729228e-06, | |
| "loss": 0.8209, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.6418213850115192, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 8.575547506774498e-06, | |
| "loss": 0.8388, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6461580159913267, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 8.548922926791545e-06, | |
| "loss": 0.8129, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6504946469711343, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 8.522093987913063e-06, | |
| "loss": 0.8282, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6548312779509419, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 8.49506223503941e-06, | |
| "loss": 0.813, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.6591679089307494, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 8.467829224749665e-06, | |
| "loss": 0.8313, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.663504539910557, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 8.440396525211976e-06, | |
| "loss": 0.828, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6678411708903645, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 8.412765716093273e-06, | |
| "loss": 0.8247, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.6721778018701721, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 8.384938388468296e-06, | |
| "loss": 0.8046, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6765144328499797, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 8.356916144727985e-06, | |
| "loss": 0.814, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.6808510638297872, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 8.328700598487203e-06, | |
| "loss": 0.8147, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.6851876948095948, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 8.300293374491821e-06, | |
| "loss": 0.8083, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.6895243257894024, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 8.271696108525156e-06, | |
| "loss": 0.801, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.6938609567692099, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 8.24291044731378e-06, | |
| "loss": 0.8155, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6981975877490175, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 8.213938048432697e-06, | |
| "loss": 0.7988, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.702534218728825, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 8.184780580209892e-06, | |
| "loss": 0.8184, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.7068708497086326, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 8.155439721630265e-06, | |
| "loss": 0.8212, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.7112074806884402, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 8.125917162238945e-06, | |
| "loss": 0.8401, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.7155441116682477, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 8.096214602044011e-06, | |
| "loss": 0.7886, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7198807426480552, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 8.066333751418582e-06, | |
| "loss": 0.8181, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.7242173736278629, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 8.036276331002348e-06, | |
| "loss": 0.8188, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.7285540046076704, | |
| "grad_norm": 0.375, | |
| "learning_rate": 8.006044071602476e-06, | |
| "loss": 0.7999, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.732890635587478, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 7.97563871409395e-06, | |
| "loss": 0.8273, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.7372272665672855, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 7.94506200931932e-06, | |
| "loss": 0.7848, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7415638975470931, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 7.914315717987892e-06, | |
| "loss": 0.82, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.7459005285269007, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 7.883401610574338e-06, | |
| "loss": 0.805, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.7502371595067082, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 7.85232146721673e-06, | |
| "loss": 0.8017, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7545737904865157, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 7.821077077614062e-06, | |
| "loss": 0.8175, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7545737904865157, | |
| "eval_loss": 0.8033392429351807, | |
| "eval_runtime": 109.1503, | |
| "eval_samples_per_second": 7.916, | |
| "eval_steps_per_second": 1.979, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7589104214663234, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 7.789670240923169e-06, | |
| "loss": 0.825, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7632470524461309, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 7.758102765655136e-06, | |
| "loss": 0.8155, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7675836834259385, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 7.726376469571165e-06, | |
| "loss": 0.8138, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.771920314405746, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 7.69449317957788e-06, | |
| "loss": 0.8055, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.7762569453855536, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 7.66245473162215e-06, | |
| "loss": 0.8046, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.7805935763653612, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 7.630262970585355e-06, | |
| "loss": 0.8138, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7849302073451687, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 7.597919750177168e-06, | |
| "loss": 0.8366, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.7892668383249762, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 7.56542693282879e-06, | |
| "loss": 0.8303, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.7936034693047839, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 7.532786389585715e-06, | |
| "loss": 0.8139, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.7979401002845914, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.8098, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.802276731264399, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 7.467069652022017e-06, | |
| "loss": 0.8116, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.8066133622442065, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 7.433997241891743e-06, | |
| "loss": 0.7941, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.8109499932240141, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 7.400784674029579e-06, | |
| "loss": 0.8123, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.8152866242038217, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 7.3674338609266705e-06, | |
| "loss": 0.8237, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.8196232551836292, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 7.333946723034794e-06, | |
| "loss": 0.8241, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.8239598861634367, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 7.300325188655762e-06, | |
| "loss": 0.8072, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8282965171432444, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 7.266571193830387e-06, | |
| "loss": 0.8027, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.8326331481230519, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 7.232686682227001e-06, | |
| "loss": 0.8351, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.8369697791028594, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 7.198673605029529e-06, | |
| "loss": 0.8108, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.841306410082667, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 7.164533920825137e-06, | |
| "loss": 0.8248, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.8456430410624746, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 7.130269595491443e-06, | |
| "loss": 0.8117, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8499796720422822, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 7.095882602083321e-06, | |
| "loss": 0.832, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.8543163030220897, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 7.061374920719288e-06, | |
| "loss": 0.8196, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.8586529340018972, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 7.026748538467474e-06, | |
| "loss": 0.8023, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.8629895649817049, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 6.9920054492312086e-06, | |
| "loss": 0.8149, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8673261959615124, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 6.957147653634198e-06, | |
| "loss": 0.8166, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8716628269413199, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 6.922177158905326e-06, | |
| "loss": 0.8198, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.8759994579211275, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 6.887095978763072e-06, | |
| "loss": 0.797, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.8803360889009351, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 6.851906133299556e-06, | |
| "loss": 0.8162, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.8846727198807427, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 6.816609648864208e-06, | |
| "loss": 0.8272, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.8890093508605502, | |
| "grad_norm": 0.38671875, | |
| "learning_rate": 6.781208557947085e-06, | |
| "loss": 0.7975, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8933459818403577, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 6.745704899061843e-06, | |
| "loss": 0.8349, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.8976826128201654, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 6.710100716628345e-06, | |
| "loss": 0.7963, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.9020192437999729, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 6.674398060854931e-06, | |
| "loss": 0.8233, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.9063558747797804, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 6.638598987620375e-06, | |
| "loss": 0.8137, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.910692505759588, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 6.6027055583554865e-06, | |
| "loss": 0.8076, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.9150291367393956, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 6.566719839924412e-06, | |
| "loss": 0.8046, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.9193657677192032, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 6.530643904505622e-06, | |
| "loss": 0.8211, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.9237023986990107, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 6.49447982947258e-06, | |
| "loss": 0.8135, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.9280390296788182, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 6.458229697274125e-06, | |
| "loss": 0.7993, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.9323756606586259, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 6.42189559531456e-06, | |
| "loss": 0.7944, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.9367122916384334, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 6.385479615833445e-06, | |
| "loss": 0.8078, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.9410489226182409, | |
| "grad_norm": 0.375, | |
| "learning_rate": 6.348983855785122e-06, | |
| "loss": 0.7926, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.9453855535980485, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 6.312410416717969e-06, | |
| "loss": 0.8212, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.9497221845778561, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 6.275761404653381e-06, | |
| "loss": 0.7814, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.9540588155576636, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 6.2390389299645e-06, | |
| "loss": 0.8039, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9583954465374712, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 6.2022451072546926e-06, | |
| "loss": 0.802, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.9627320775172787, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 6.165382055235784e-06, | |
| "loss": 0.7972, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.9670687084970864, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 6.128451896606054e-06, | |
| "loss": 0.7882, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.9714053394768939, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 6.091456757928008e-06, | |
| "loss": 0.7859, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.9757419704567014, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 6.0543987695059236e-06, | |
| "loss": 0.7966, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.980078601436509, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 6.0172800652631706e-06, | |
| "loss": 0.8079, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.9844152324163166, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 5.980102782619343e-06, | |
| "loss": 0.8123, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.9887518633961241, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 5.9428690623671796e-06, | |
| "loss": 0.8359, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.9930884943759317, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 5.905581048549279e-06, | |
| "loss": 0.8287, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.9974251253557392, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 5.8682408883346535e-06, | |
| "loss": 0.8032, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0017617563355468, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 5.830850731895071e-06, | |
| "loss": 0.8129, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.0040639393118396, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 5.793412732281258e-06, | |
| "loss": 0.7868, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.0040639393118396, | |
| "eval_loss": 0.7960610389709473, | |
| "eval_runtime": 110.4506, | |
| "eval_samples_per_second": 7.822, | |
| "eval_steps_per_second": 1.956, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.0083988079111352, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 5.755929045298905e-06, | |
| "loss": 0.8008, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.0127336765104307, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 5.718401829384541e-06, | |
| "loss": 0.8084, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.0170685451097263, | |
| "grad_norm": 0.375, | |
| "learning_rate": 5.680833245481234e-06, | |
| "loss": 0.8068, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.021403413709022, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 5.6432254569141565e-06, | |
| "loss": 0.796, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.0257382823083174, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 5.605580629266021e-06, | |
| "loss": 0.8198, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.030073150907613, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 5.567900930252375e-06, | |
| "loss": 0.7929, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.0344080195069087, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 5.530188529596774e-06, | |
| "loss": 0.8029, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.0387428881062042, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 5.492445598905843e-06, | |
| "loss": 0.8121, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.0430777567054998, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 5.454674311544236e-06, | |
| "loss": 0.7917, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.0474126253047955, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 5.416876842509468e-06, | |
| "loss": 0.7988, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.051747493904091, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 5.379055368306693e-06, | |
| "loss": 0.7804, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.0560823625033866, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 5.341212066823356e-06, | |
| "loss": 0.8167, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.0604172311026823, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 5.3033491172037935e-06, | |
| "loss": 0.8158, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.0647520997019777, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 5.265468699723748e-06, | |
| "loss": 0.7957, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.0690869683012734, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 5.227572995664819e-06, | |
| "loss": 0.7902, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.073421836900569, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 5.189664187188857e-06, | |
| "loss": 0.7994, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.0777567054998645, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 5.151744457212312e-06, | |
| "loss": 0.809, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.0820915740991601, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 5.113815989280528e-06, | |
| "loss": 0.7849, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.0864264426984558, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 5.075880967442014e-06, | |
| "loss": 0.8067, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.0907613112977512, | |
| "grad_norm": 0.375, | |
| "learning_rate": 5.037941576122667e-06, | |
| "loss": 0.798, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.0950961798970469, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7891, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.0994310484963425, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 4.962058423877335e-06, | |
| "loss": 0.8044, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.103765917095638, | |
| "grad_norm": 0.375, | |
| "learning_rate": 4.924119032557988e-06, | |
| "loss": 0.7842, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.1081007856949336, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 4.886184010719472e-06, | |
| "loss": 0.7962, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.1124356542942293, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 4.848255542787689e-06, | |
| "loss": 0.8043, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.1167705228935247, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 4.8103358128111435e-06, | |
| "loss": 0.8075, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.1211053914928204, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 4.772427004335183e-06, | |
| "loss": 0.8023, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.125440260092116, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.7345313002762545e-06, | |
| "loss": 0.7959, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.1297751286914115, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 4.696650882796207e-06, | |
| "loss": 0.7883, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.1341099972907072, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.6587879331766465e-06, | |
| "loss": 0.8036, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.1384448658900026, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 4.620944631693309e-06, | |
| "loss": 0.8016, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.1427797344892983, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 4.583123157490533e-06, | |
| "loss": 0.7982, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.147114603088594, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 4.545325688455766e-06, | |
| "loss": 0.794, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.1514494716878896, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.507554401094157e-06, | |
| "loss": 0.7905, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.155784340287185, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 4.469811470403228e-06, | |
| "loss": 0.7941, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.1601192088864807, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 4.432099069747625e-06, | |
| "loss": 0.801, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.1644540774857761, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 4.394419370733981e-06, | |
| "loss": 0.7985, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.1687889460850718, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 4.356774543085845e-06, | |
| "loss": 0.7837, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.1731238146843674, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 4.319166754518768e-06, | |
| "loss": 0.8008, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.1774586832836629, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 4.28159817061546e-06, | |
| "loss": 0.8059, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.1817935518829585, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.244070954701096e-06, | |
| "loss": 0.812, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.1861284204822542, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 4.206587267718743e-06, | |
| "loss": 0.7948, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.1904632890815496, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 4.1691492681049305e-06, | |
| "loss": 0.8005, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.1947981576808453, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 4.131759111665349e-06, | |
| "loss": 0.7992, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.199133026280141, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.094418951450721e-06, | |
| "loss": 0.8091, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.2034678948794364, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 4.057130937632821e-06, | |
| "loss": 0.799, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.207802763478732, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 4.01989721738066e-06, | |
| "loss": 0.8093, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.2121376320780277, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 3.982719934736832e-06, | |
| "loss": 0.8073, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.2164725006773232, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 3.945601230494079e-06, | |
| "loss": 0.8099, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.2208073692766188, | |
| "grad_norm": 0.400390625, | |
| "learning_rate": 3.9085432420719934e-06, | |
| "loss": 0.7912, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.2251422378759145, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 3.871548103393947e-06, | |
| "loss": 0.8105, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.22947710647521, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 3.834617944764218e-06, | |
| "loss": 0.7751, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.2338119750745056, | |
| "grad_norm": 0.375, | |
| "learning_rate": 3.797754892745309e-06, | |
| "loss": 0.8028, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.2381468436738012, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 3.7609610700355014e-06, | |
| "loss": 0.7939, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.2424817122730967, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 3.724238595346619e-06, | |
| "loss": 0.809, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.2468165808723923, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 3.687589583282031e-06, | |
| "loss": 0.8082, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.251151449471688, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 3.6510161442148783e-06, | |
| "loss": 0.7822, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.2554863180709834, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 3.6145203841665577e-06, | |
| "loss": 0.8119, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.2554863180709834, | |
| "eval_loss": 0.7933911681175232, | |
| "eval_runtime": 110.3505, | |
| "eval_samples_per_second": 7.83, | |
| "eval_steps_per_second": 1.957, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.259821186670279, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 3.578104404685442e-06, | |
| "loss": 0.806, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.2641560552695745, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 3.5417703027258752e-06, | |
| "loss": 0.8055, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.2684909238688702, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 3.5055201705274223e-06, | |
| "loss": 0.8039, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.2728257924681659, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 3.46935609549438e-06, | |
| "loss": 0.8149, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.2771606610674615, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 3.4332801600755895e-06, | |
| "loss": 0.7849, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.281495529666757, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 3.397294441644515e-06, | |
| "loss": 0.7956, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.2858303982660526, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 3.3614010123796257e-06, | |
| "loss": 0.7933, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.290165266865348, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 3.3256019391450696e-06, | |
| "loss": 0.8174, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.2945001354646437, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 3.289899283371657e-06, | |
| "loss": 0.7988, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.2988350040639394, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 3.2542951009381584e-06, | |
| "loss": 0.8037, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.303169872663235, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 3.2187914420529176e-06, | |
| "loss": 0.782, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.3075047412625305, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 3.1833903511357943e-06, | |
| "loss": 0.8037, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.3118396098618261, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 3.148093866700445e-06, | |
| "loss": 0.8053, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.3161744784611216, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 3.1129040212369286e-06, | |
| "loss": 0.7896, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.3205093470604172, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 3.077822841094675e-06, | |
| "loss": 0.8078, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.3248442156597129, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 3.0428523463658046e-06, | |
| "loss": 0.8084, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.3291790842590083, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 3.007994550768793e-06, | |
| "loss": 0.8277, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.333513952858304, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 2.973251461532527e-06, | |
| "loss": 0.8079, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.3378488214575996, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 2.9386250792807124e-06, | |
| "loss": 0.8168, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.342183690056895, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 2.9041173979166813e-06, | |
| "loss": 0.8047, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3465185586561907, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 2.86973040450856e-06, | |
| "loss": 0.8037, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.3508534272554864, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.835466079174866e-06, | |
| "loss": 0.8001, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.3551882958547818, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.8013263949704706e-06, | |
| "loss": 0.8006, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.3595231644540775, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 2.767313317773e-06, | |
| "loss": 0.8156, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.363858033053373, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 2.7334288061696146e-06, | |
| "loss": 0.7992, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.3681929016526686, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 2.6996748113442397e-06, | |
| "loss": 0.7812, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.3725277702519643, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 2.666053276965207e-06, | |
| "loss": 0.7857, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.37686263885126, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 2.6325661390733303e-06, | |
| "loss": 0.7985, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.3811975074505554, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 2.599215325970423e-06, | |
| "loss": 0.7811, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.385532376049851, | |
| "grad_norm": 0.412109375, | |
| "learning_rate": 2.566002758108256e-06, | |
| "loss": 0.7975, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.3898672446491465, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.5329303479779855e-06, | |
| "loss": 0.8305, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.3942021132484421, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 2.5000000000000015e-06, | |
| "loss": 0.8006, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.3985369818477378, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 2.467213610414286e-06, | |
| "loss": 0.791, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.4028718504470334, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.434573067171213e-06, | |
| "loss": 0.7853, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.4072067190463289, | |
| "grad_norm": 0.375, | |
| "learning_rate": 2.4020802498228333e-06, | |
| "loss": 0.8011, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.4115415876456245, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 2.369737029414644e-06, | |
| "loss": 0.7996, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.41587645624492, | |
| "grad_norm": 0.39453125, | |
| "learning_rate": 2.337545268377853e-06, | |
| "loss": 0.8144, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.4202113248442156, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 2.3055068204221226e-06, | |
| "loss": 0.8064, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.4245461934435113, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 2.2736235304288373e-06, | |
| "loss": 0.7983, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.428881062042807, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.241897234344864e-06, | |
| "loss": 0.7919, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.4332159306421024, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.2103297590768334e-06, | |
| "loss": 0.785, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.437550799241398, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 2.1789229223859403e-06, | |
| "loss": 0.789, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.4418856678406935, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 2.1476785327832715e-06, | |
| "loss": 0.8104, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.4462205364399892, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.1165983894256647e-06, | |
| "loss": 0.7929, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.4505554050392848, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 2.085684282012108e-06, | |
| "loss": 0.8129, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.4548902736385803, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 2.0549379906806816e-06, | |
| "loss": 0.7983, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.459225142237876, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 2.0243612859060526e-06, | |
| "loss": 0.7915, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.4635600108371716, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.9939559283975237e-06, | |
| "loss": 0.8021, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.467894879436467, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 1.9637236689976517e-06, | |
| "loss": 0.8164, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.4722297480357627, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1.933666248581418e-06, | |
| "loss": 0.7876, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.4765646166350583, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.9037853979559923e-06, | |
| "loss": 0.7911, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.4808994852343538, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 1.8740828377610564e-06, | |
| "loss": 0.786, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.4852343538336494, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 1.8445602783697375e-06, | |
| "loss": 0.8243, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.4895692224329449, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 1.8152194197901086e-06, | |
| "loss": 0.8162, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.4939040910322405, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.7860619515673034e-06, | |
| "loss": 0.8081, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.4982389596315362, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1.7570895526862202e-06, | |
| "loss": 0.814, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.5025738282308319, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 1.7283038914748446e-06, | |
| "loss": 0.7814, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.5069086968301273, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 1.6997066255081795e-06, | |
| "loss": 0.799, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.5069086968301273, | |
| "eval_loss": 0.7925707697868347, | |
| "eval_runtime": 110.4666, | |
| "eval_samples_per_second": 7.821, | |
| "eval_steps_per_second": 1.955, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.511243565429423, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 1.6712994015127976e-06, | |
| "loss": 0.798, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.5155784340287184, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 1.6430838552720168e-06, | |
| "loss": 0.8019, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.519913302628014, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1.6150616115317052e-06, | |
| "loss": 0.77, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.5242481712273097, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.5872342839067305e-06, | |
| "loss": 0.7969, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.5285830398266054, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 1.5596034747880263e-06, | |
| "loss": 0.8047, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.5329179084259008, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 1.5321707752503367e-06, | |
| "loss": 0.7922, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.5372527770251965, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 1.5049377649605906e-06, | |
| "loss": 0.8011, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.541587645624492, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.4779060120869393e-06, | |
| "loss": 0.7937, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.5459225142237876, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1.451077073208455e-06, | |
| "loss": 0.7822, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.5502573828230832, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 1.4244524932255026e-06, | |
| "loss": 0.7985, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.554592251422379, | |
| "grad_norm": 0.392578125, | |
| "learning_rate": 1.3980338052707737e-06, | |
| "loss": 0.7968, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.5589271200216743, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.3718225306210049e-06, | |
| "loss": 0.8111, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.5632619886209698, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.3458201786093795e-06, | |
| "loss": 0.7918, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.5675968572202654, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1.3200282465386156e-06, | |
| "loss": 0.8026, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.571931725819561, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 1.2944482195947384e-06, | |
| "loss": 0.8124, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.5762665944188567, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 1.2690815707615727e-06, | |
| "loss": 0.7961, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.5806014630181524, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 1.2439297607359118e-06, | |
| "loss": 0.8055, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.5849363316174478, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 1.2189942378434083e-06, | |
| "loss": 0.786, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.5892712002167433, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.194276437955177e-06, | |
| "loss": 0.8009, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.593606068816039, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1.1697777844051105e-06, | |
| "loss": 0.8016, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.5979409374153346, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 1.1454996879079205e-06, | |
| "loss": 0.7954, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.6022758060146303, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 1.1214435464779006e-06, | |
| "loss": 0.8051, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.606610674613926, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 1.0976107453484314e-06, | |
| "loss": 0.7912, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.6109455432132214, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 1.0740026568922058e-06, | |
| "loss": 0.8041, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.6152804118125168, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 1.050620640542208e-06, | |
| "loss": 0.7959, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.6196152804118125, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 1.027466042713428e-06, | |
| "loss": 0.8097, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.6239501490111081, | |
| "grad_norm": 0.375, | |
| "learning_rate": 1.0045401967253382e-06, | |
| "loss": 0.7924, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.6282850176104038, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 9.81844422725109e-07, | |
| "loss": 0.8068, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.6326198862096992, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 9.593800276115978e-07, | |
| "loss": 0.8052, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.6369547548089949, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 9.371483049600849e-07, | |
| "loss": 0.7862, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.6412896234082903, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 9.151505349477901e-07, | |
| "loss": 0.8059, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.645624492007586, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 8.933879842801558e-07, | |
| "loss": 0.785, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.6499593606068816, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 8.718619061179029e-07, | |
| "loss": 0.7866, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.6542942292061773, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 8.505735400048748e-07, | |
| "loss": 0.7948, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.6586290978054727, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 8.29524111796654e-07, | |
| "loss": 0.8076, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.6629639664047684, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 8.087148335899786e-07, | |
| "loss": 0.8034, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.6672988350040638, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 7.881469036529427e-07, | |
| "loss": 0.7956, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.6716337036033595, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 7.678215063559957e-07, | |
| "loss": 0.797, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.6759685722026552, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 7.477398121037449e-07, | |
| "loss": 0.777, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.6803034408019508, | |
| "grad_norm": 0.390625, | |
| "learning_rate": 7.279029772675572e-07, | |
| "loss": 0.8072, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.6846383094012463, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 7.083121441189739e-07, | |
| "loss": 0.7878, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.6889731780005417, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 6.889684407639324e-07, | |
| "loss": 0.8186, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.6933080465998374, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 6.698729810778065e-07, | |
| "loss": 0.8195, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.697642915199133, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 6.510268646412665e-07, | |
| "loss": 0.7844, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.7019777837984287, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 6.324311766769631e-07, | |
| "loss": 0.7936, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.7063126523977243, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 6.140869879870287e-07, | |
| "loss": 0.795, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.7106475209970198, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 5.959953548914327e-07, | |
| "loss": 0.7961, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.7149823895963152, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 5.781573191671386e-07, | |
| "loss": 0.7819, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.7193172581956109, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 5.60573907988124e-07, | |
| "loss": 0.8076, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.7236521267949065, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 5.43246133866231e-07, | |
| "loss": 0.8044, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.7279869953942022, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 5.261749945928613e-07, | |
| "loss": 0.8001, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.7323218639934979, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 5.0936147318152e-07, | |
| "loss": 0.7955, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.7366567325927933, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 4.928065378112107e-07, | |
| "loss": 0.7974, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.7409916011920887, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 4.7651114177068694e-07, | |
| "loss": 0.8025, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.7453264697913844, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 4.604762234035548e-07, | |
| "loss": 0.7857, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.74966133839068, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 4.4470270605424195e-07, | |
| "loss": 0.8064, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.7539962069899757, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 4.2919149801482596e-07, | |
| "loss": 0.7966, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.7583310755892712, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 4.139434924727359e-07, | |
| "loss": 0.7891, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.7583310755892712, | |
| "eval_loss": 0.7922915816307068, | |
| "eval_runtime": 110.2564, | |
| "eval_samples_per_second": 7.836, | |
| "eval_steps_per_second": 1.959, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.7626659441885668, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 3.989595674593161e-07, | |
| "loss": 0.7935, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.7670008127878623, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 3.8424058579927147e-07, | |
| "loss": 0.7943, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.771335681387158, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 3.697873950609737e-07, | |
| "loss": 0.796, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.7756705499864536, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 3.55600827507665e-07, | |
| "loss": 0.8139, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.7800054185857492, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 3.416817000495271e-07, | |
| "loss": 0.7962, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.7843402871850447, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 3.2803081419664483e-07, | |
| "loss": 0.8059, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.7886751557843403, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 3.146489560128496e-07, | |
| "loss": 0.8073, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.7930100243836358, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 3.015368960704584e-07, | |
| "loss": 0.7965, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.7973448929829314, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 2.88695389405898e-07, | |
| "loss": 0.8057, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.801679761582227, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.7612517547622955e-07, | |
| "loss": 0.7942, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.8060146301815228, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 2.638269781165692e-07, | |
| "loss": 0.7904, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.8103494987808182, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.518015054984041e-07, | |
| "loss": 0.8075, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.8146843673801136, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 2.4004945008881617e-07, | |
| "loss": 0.8082, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.8190192359794093, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 2.2857148861060552e-07, | |
| "loss": 0.7803, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.823354104578705, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 2.1736828200332628e-07, | |
| "loss": 0.7705, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.8276889731780006, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 2.0644047538522226e-07, | |
| "loss": 0.8031, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.8320238417772963, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1.9578869801608168e-07, | |
| "loss": 0.7753, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.8363587103765917, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1.8541356326100436e-07, | |
| "loss": 0.8151, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.8406935789758871, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 1.7531566855507442e-07, | |
| "loss": 0.7754, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.8450284475751828, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 1.6549559536896964e-07, | |
| "loss": 0.795, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.8493633161744785, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.559539091754686e-07, | |
| "loss": 0.7999, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.8536981847737741, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.4669115941689182e-07, | |
| "loss": 0.7965, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.8580330533730698, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 1.3770787947346597e-07, | |
| "loss": 0.8072, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.8623679219723652, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.2900458663260506e-07, | |
| "loss": 0.8134, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.8667027905716607, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 1.2058178205912763e-07, | |
| "loss": 0.8142, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.8710376591709563, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 1.1243995076639535e-07, | |
| "loss": 0.7983, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.875372527770252, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 1.0457956158838545e-07, | |
| "loss": 0.7892, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.8797073963695476, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 9.700106715269386e-08, | |
| "loss": 0.793, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.884042264968843, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 8.970490385447061e-08, | |
| "loss": 0.8028, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.8883771335681387, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 8.269149183128988e-08, | |
| "loss": 0.8004, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.8927120021674342, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 7.59612349389599e-08, | |
| "loss": 0.7909, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.8970468707667298, | |
| "grad_norm": 0.3671875, | |
| "learning_rate": 6.951452072826547e-08, | |
| "loss": 0.7832, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.9013817393660255, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 6.335172042265192e-08, | |
| "loss": 0.794, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.9057166079653212, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 5.747318889684883e-08, | |
| "loss": 0.763, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.9100514765646166, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 5.187926465643478e-08, | |
| "loss": 0.7852, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.9143863451639123, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 4.657026981834623e-08, | |
| "loss": 0.8118, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.9187212137632077, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 4.1546510092327906e-08, | |
| "loss": 0.8019, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.9230560823625034, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 3.680827476332804e-08, | |
| "loss": 0.8194, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.927390950961799, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 3.235583667484443e-08, | |
| "loss": 0.8032, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.9317258195610947, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 2.8189452213207014e-08, | |
| "loss": 0.7975, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.9360606881603901, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 2.4309361292820245e-08, | |
| "loss": 0.8016, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.9403955567596856, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 2.0715787342343586e-08, | |
| "loss": 0.8265, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.9447304253589812, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 1.7408937291829575e-08, | |
| "loss": 0.8057, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.9490652939582769, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.4389001560803917e-08, | |
| "loss": 0.7954, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.9534001625575725, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.1656154047303691e-08, | |
| "loss": 0.797, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.9577350311568682, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 9.210552117863703e-09, | |
| "loss": 0.7966, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.9620698997561636, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 7.052336598451504e-09, | |
| "loss": 0.8071, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.966404768355459, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 5.181631766362216e-09, | |
| "loss": 0.8095, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.9707396369547547, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 3.5985453430598115e-09, | |
| "loss": 0.7737, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.9750745055540504, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 2.3031684879742944e-09, | |
| "loss": 0.8073, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.979409374153346, | |
| "grad_norm": 0.35546875, | |
| "learning_rate": 1.2955757932542334e-09, | |
| "loss": 0.7904, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.9837442427526417, | |
| "grad_norm": 0.353515625, | |
| "learning_rate": 5.758252794690888e-10, | |
| "loss": 0.8063, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.9880791113519372, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.4395839226910568e-10, | |
| "loss": 0.8058, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.9924139799512326, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 0.0, | |
| "loss": 0.7995, | |
| "step": 460 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 460, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 115, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.046286453205369e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
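
The record above appears to be the `trainer_state.json` that the Hugging Face `Trainer` writes next to each checkpoint. Below is a minimal, illustrative sketch of how such a state file could be inspected after the run finishes; it assumes the JSON is saved locally as `trainer_state.json`, and the only key names it relies on (`log_history`, `loss`, `eval_loss`, `step`, `epoch`, `max_steps`) are the ones visible in the state itself.

```python
# Sketch only: load a saved trainer_state.json and pull out the loss curves.
# Assumes the file sits in the working directory; adjust the path as needed.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-loss entries carry a "loss" key; evaluation entries carry "eval_loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"logged {len(train_points)} training steps out of max_steps={state['max_steps']}, "
      f"final epoch {state['epoch']:.4f}")
print("final training loss:", train_points[-1][1])
print("evaluation checkpoints (step, eval_loss):", eval_points)
```

For this run the script would report the last logged training loss (0.7995 at step 460) and the periodic evaluation losses recorded every `eval_steps` entries in `log_history`.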