diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,4168 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 5.0, - "eval_steps": 500, - "global_step": 510, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.00984009840098401, - "grad_norm": 3.7944442389822073, - "learning_rate": 0.0, - "loss": 1.2501, - "num_tokens": 456505.0, - "step": 1 - }, - { - "epoch": 0.01968019680196802, - "grad_norm": 3.778041640972742, - "learning_rate": 6.25e-07, - "loss": 1.2343, - "num_tokens": 915160.0, - "step": 2 - }, - { - "epoch": 0.02952029520295203, - "grad_norm": 3.8325814879608386, - "learning_rate": 1.25e-06, - "loss": 1.254, - "num_tokens": 1365315.0, - "step": 3 - }, - { - "epoch": 0.03936039360393604, - "grad_norm": 3.582565733113683, - "learning_rate": 1.8750000000000003e-06, - "loss": 1.1869, - "num_tokens": 1841763.0, - "step": 4 - }, - { - "epoch": 0.04920049200492005, - "grad_norm": 3.5604969753172315, - "learning_rate": 2.5e-06, - "loss": 1.2394, - "num_tokens": 2301606.0, - "step": 5 - }, - { - "epoch": 0.05904059040590406, - "grad_norm": 3.105374395878177, - "learning_rate": 3.125e-06, - "loss": 1.2366, - "num_tokens": 2755825.0, - "step": 6 - }, - { - "epoch": 0.06888068880688807, - "grad_norm": 2.316426838717515, - "learning_rate": 3.7500000000000005e-06, - "loss": 1.1101, - "num_tokens": 3196409.0, - "step": 7 - }, - { - "epoch": 0.07872078720787208, - "grad_norm": 2.281060366927676, - "learning_rate": 4.3750000000000005e-06, - "loss": 1.1078, - "num_tokens": 3622733.0, - "step": 8 - }, - { - "epoch": 0.08856088560885608, - "grad_norm": 1.934577354985982, - "learning_rate": 5e-06, - "loss": 0.9014, - "num_tokens": 4055914.0, - "step": 9 - }, - { - "epoch": 0.0984009840098401, - "grad_norm": 1.9176079459138344, - "learning_rate": 5.625e-06, - "loss": 0.8745, - "num_tokens": 4485159.0, - "step": 10 - }, - { - "epoch": 0.10824108241082411, - "grad_norm": 1.786754010375736, - "learning_rate": 6.25e-06, - "loss": 0.7922, - "num_tokens": 4933514.0, - "step": 11 - }, - { - "epoch": 0.11808118081180811, - "grad_norm": 1.9933574737759214, - "learning_rate": 6.875e-06, - "loss": 0.4878, - "num_tokens": 5383658.0, - "step": 12 - }, - { - "epoch": 0.12792127921279212, - "grad_norm": 2.123289554302906, - "learning_rate": 7.500000000000001e-06, - "loss": 0.428, - "num_tokens": 5839838.0, - "step": 13 - }, - { - "epoch": 0.13776137761377613, - "grad_norm": 1.7562448014521572, - "learning_rate": 8.125000000000001e-06, - "loss": 0.3337, - "num_tokens": 6286175.0, - "step": 14 - }, - { - "epoch": 0.14760147601476015, - "grad_norm": 1.4384357290512548, - "learning_rate": 8.750000000000001e-06, - "loss": 0.2497, - "num_tokens": 6725821.0, - "step": 15 - }, - { - "epoch": 0.15744157441574416, - "grad_norm": 0.8232923354453182, - "learning_rate": 9.375000000000001e-06, - "loss": 0.1317, - "num_tokens": 7169854.0, - "step": 16 - }, - { - "epoch": 0.16728167281672818, - "grad_norm": 0.5262014955492348, - "learning_rate": 1e-05, - "loss": 0.1158, - "num_tokens": 7602261.0, - "step": 17 - }, - { - "epoch": 0.17712177121771217, - "grad_norm": 0.47218735378476806, - "learning_rate": 9.999909003036192e-06, - "loss": 0.098, - "num_tokens": 8040457.0, - "step": 18 - }, - { - "epoch": 0.18696186961869618, - "grad_norm": 0.22950756131023575, - "learning_rate": 9.99963601582496e-06, - "loss": 0.0827, - "num_tokens": 8484210.0, - "step": 19 - }, - { - "epoch": 0.1968019680196802, - "grad_norm": 0.24069667769460337, - "learning_rate": 9.999181049406756e-06, - "loss": 0.0733, - "num_tokens": 8913622.0, - "step": 20 - }, - { - "epoch": 0.2066420664206642, - "grad_norm": 0.19079800364724872, - "learning_rate": 9.998544122181829e-06, - "loss": 0.0851, - "num_tokens": 9379389.0, - "step": 21 - }, - { - "epoch": 0.21648216482164823, - "grad_norm": 0.17418458793254618, - "learning_rate": 9.997725259909487e-06, - "loss": 0.0734, - "num_tokens": 9803100.0, - "step": 22 - }, - { - "epoch": 0.22632226322263221, - "grad_norm": 0.19184343474298712, - "learning_rate": 9.996724495707056e-06, - "loss": 0.083, - "num_tokens": 10247767.0, - "step": 23 - }, - { - "epoch": 0.23616236162361623, - "grad_norm": 0.1475443750251538, - "learning_rate": 9.995541870048537e-06, - "loss": 0.0738, - "num_tokens": 10691516.0, - "step": 24 - }, - { - "epoch": 0.24600246002460024, - "grad_norm": 0.14419500643952865, - "learning_rate": 9.994177430762971e-06, - "loss": 0.0646, - "num_tokens": 11149524.0, - "step": 25 - }, - { - "epoch": 0.25584255842558423, - "grad_norm": 1.2190794556868674, - "learning_rate": 9.992631233032507e-06, - "loss": 0.0721, - "num_tokens": 11589958.0, - "step": 26 - }, - { - "epoch": 0.2656826568265683, - "grad_norm": 0.49893871677873436, - "learning_rate": 9.990903339390164e-06, - "loss": 0.0691, - "num_tokens": 12050102.0, - "step": 27 - }, - { - "epoch": 0.27552275522755226, - "grad_norm": 0.4422022706582718, - "learning_rate": 9.988993819717312e-06, - "loss": 0.0605, - "num_tokens": 12508433.0, - "step": 28 - }, - { - "epoch": 0.2853628536285363, - "grad_norm": 0.13441092661822238, - "learning_rate": 9.986902751240836e-06, - "loss": 0.0692, - "num_tokens": 12939960.0, - "step": 29 - }, - { - "epoch": 0.2952029520295203, - "grad_norm": 0.11858771432621444, - "learning_rate": 9.984630218530014e-06, - "loss": 0.0492, - "num_tokens": 13387850.0, - "step": 30 - }, - { - "epoch": 0.3050430504305043, - "grad_norm": 0.12696361103470127, - "learning_rate": 9.982176313493108e-06, - "loss": 0.0624, - "num_tokens": 13866894.0, - "step": 31 - }, - { - "epoch": 0.3148831488314883, - "grad_norm": 0.14678245947256616, - "learning_rate": 9.979541135373628e-06, - "loss": 0.0483, - "num_tokens": 14314553.0, - "step": 32 - }, - { - "epoch": 0.3247232472324723, - "grad_norm": 0.12403518011628543, - "learning_rate": 9.976724790746333e-06, - "loss": 0.0488, - "num_tokens": 14747330.0, - "step": 33 - }, - { - "epoch": 0.33456334563345635, - "grad_norm": 0.10283847292091904, - "learning_rate": 9.973727393512921e-06, - "loss": 0.0582, - "num_tokens": 15215873.0, - "step": 34 - }, - { - "epoch": 0.34440344403444034, - "grad_norm": 0.09860235260078455, - "learning_rate": 9.970549064897407e-06, - "loss": 0.0446, - "num_tokens": 15653849.0, - "step": 35 - }, - { - "epoch": 0.35424354243542433, - "grad_norm": 0.10274919661024226, - "learning_rate": 9.967189933441243e-06, - "loss": 0.0439, - "num_tokens": 16112913.0, - "step": 36 - }, - { - "epoch": 0.3640836408364084, - "grad_norm": 0.0918843632134462, - "learning_rate": 9.9636501349981e-06, - "loss": 0.0585, - "num_tokens": 16570588.0, - "step": 37 - }, - { - "epoch": 0.37392373923739236, - "grad_norm": 0.08618611894284056, - "learning_rate": 9.95992981272838e-06, - "loss": 0.0477, - "num_tokens": 17028395.0, - "step": 38 - }, - { - "epoch": 0.3837638376383764, - "grad_norm": 0.0915069403325355, - "learning_rate": 9.956029117093432e-06, - "loss": 0.045, - "num_tokens": 17477681.0, - "step": 39 - }, - { - "epoch": 0.3936039360393604, - "grad_norm": 0.09093140650787605, - "learning_rate": 9.951948205849457e-06, - "loss": 0.0444, - "num_tokens": 17940049.0, - "step": 40 - }, - { - "epoch": 0.4034440344403444, - "grad_norm": 0.08271507354884283, - "learning_rate": 9.947687244041143e-06, - "loss": 0.0401, - "num_tokens": 18360868.0, - "step": 41 - }, - { - "epoch": 0.4132841328413284, - "grad_norm": 0.08588968137159211, - "learning_rate": 9.943246403994969e-06, - "loss": 0.0358, - "num_tokens": 18811281.0, - "step": 42 - }, - { - "epoch": 0.4231242312423124, - "grad_norm": 0.08965565515357603, - "learning_rate": 9.938625865312252e-06, - "loss": 0.044, - "num_tokens": 19236998.0, - "step": 43 - }, - { - "epoch": 0.43296432964329645, - "grad_norm": 0.09636661290222473, - "learning_rate": 9.933825814861877e-06, - "loss": 0.0431, - "num_tokens": 19689363.0, - "step": 44 - }, - { - "epoch": 0.44280442804428044, - "grad_norm": 0.08912873391359938, - "learning_rate": 9.928846446772737e-06, - "loss": 0.0377, - "num_tokens": 20129602.0, - "step": 45 - }, - { - "epoch": 0.45264452644526443, - "grad_norm": 0.09271503002492597, - "learning_rate": 9.923687962425895e-06, - "loss": 0.0365, - "num_tokens": 20566055.0, - "step": 46 - }, - { - "epoch": 0.46248462484624847, - "grad_norm": 0.08617267288782972, - "learning_rate": 9.91835057044642e-06, - "loss": 0.0582, - "num_tokens": 21035837.0, - "step": 47 - }, - { - "epoch": 0.47232472324723246, - "grad_norm": 0.07942181409157618, - "learning_rate": 9.912834486694963e-06, - "loss": 0.0341, - "num_tokens": 21490681.0, - "step": 48 - }, - { - "epoch": 0.4821648216482165, - "grad_norm": 0.08409285833406879, - "learning_rate": 9.907139934259025e-06, - "loss": 0.0464, - "num_tokens": 21949736.0, - "step": 49 - }, - { - "epoch": 0.4920049200492005, - "grad_norm": 0.08981746101624732, - "learning_rate": 9.90126714344393e-06, - "loss": 0.0479, - "num_tokens": 22408345.0, - "step": 50 - }, - { - "epoch": 0.5018450184501845, - "grad_norm": 0.08557538109120558, - "learning_rate": 9.895216351763515e-06, - "loss": 0.04, - "num_tokens": 22869507.0, - "step": 51 - }, - { - "epoch": 0.5116851168511685, - "grad_norm": 0.08873060518107122, - "learning_rate": 9.888987803930523e-06, - "loss": 0.0359, - "num_tokens": 23337492.0, - "step": 52 - }, - { - "epoch": 0.5215252152521526, - "grad_norm": 0.08508195964995854, - "learning_rate": 9.882581751846707e-06, - "loss": 0.0338, - "num_tokens": 23788038.0, - "step": 53 - }, - { - "epoch": 0.5313653136531366, - "grad_norm": 0.076418318161816, - "learning_rate": 9.87599845459264e-06, - "loss": 0.0344, - "num_tokens": 24233994.0, - "step": 54 - }, - { - "epoch": 0.5412054120541205, - "grad_norm": 0.2889818789713905, - "learning_rate": 9.869238178417235e-06, - "loss": 0.2599, - "num_tokens": 24697351.0, - "step": 55 - }, - { - "epoch": 0.5510455104551045, - "grad_norm": 0.08884780995830746, - "learning_rate": 9.862301196726988e-06, - "loss": 0.0465, - "num_tokens": 25183095.0, - "step": 56 - }, - { - "epoch": 0.5608856088560885, - "grad_norm": 0.07990815808329678, - "learning_rate": 9.855187790074906e-06, - "loss": 0.0353, - "num_tokens": 25651971.0, - "step": 57 - }, - { - "epoch": 0.5707257072570726, - "grad_norm": 0.06894407429892842, - "learning_rate": 9.847898246149173e-06, - "loss": 0.0316, - "num_tokens": 26129683.0, - "step": 58 - }, - { - "epoch": 0.5805658056580566, - "grad_norm": 0.08216971413705307, - "learning_rate": 9.840432859761504e-06, - "loss": 0.0306, - "num_tokens": 26548348.0, - "step": 59 - }, - { - "epoch": 0.5904059040590406, - "grad_norm": 0.079031679127037, - "learning_rate": 9.832791932835232e-06, - "loss": 0.0362, - "num_tokens": 26977631.0, - "step": 60 - }, - { - "epoch": 0.6002460024600246, - "grad_norm": 0.07450412090133855, - "learning_rate": 9.824975774393089e-06, - "loss": 0.0276, - "num_tokens": 27421323.0, - "step": 61 - }, - { - "epoch": 0.6100861008610086, - "grad_norm": 0.08014735253624648, - "learning_rate": 9.816984700544714e-06, - "loss": 0.0286, - "num_tokens": 27882356.0, - "step": 62 - }, - { - "epoch": 0.6199261992619927, - "grad_norm": 0.08455294660438158, - "learning_rate": 9.808819034473869e-06, - "loss": 0.0407, - "num_tokens": 28343854.0, - "step": 63 - }, - { - "epoch": 0.6297662976629766, - "grad_norm": 0.08019778537515825, - "learning_rate": 9.800479106425356e-06, - "loss": 0.0299, - "num_tokens": 28790695.0, - "step": 64 - }, - { - "epoch": 0.6396063960639606, - "grad_norm": 0.08340888167507048, - "learning_rate": 9.791965253691687e-06, - "loss": 0.0353, - "num_tokens": 29220825.0, - "step": 65 - }, - { - "epoch": 0.6494464944649446, - "grad_norm": 0.08252486402936965, - "learning_rate": 9.783277820599408e-06, - "loss": 0.0367, - "num_tokens": 29686358.0, - "step": 66 - }, - { - "epoch": 0.6592865928659286, - "grad_norm": 0.08632773276059842, - "learning_rate": 9.774417158495208e-06, - "loss": 0.0331, - "num_tokens": 30120521.0, - "step": 67 - }, - { - "epoch": 0.6691266912669127, - "grad_norm": 0.082343171890358, - "learning_rate": 9.765383625731683e-06, - "loss": 0.0329, - "num_tokens": 30573947.0, - "step": 68 - }, - { - "epoch": 0.6789667896678967, - "grad_norm": 0.08874468637210653, - "learning_rate": 9.756177587652857e-06, - "loss": 0.0329, - "num_tokens": 30999244.0, - "step": 69 - }, - { - "epoch": 0.6888068880688807, - "grad_norm": 0.07673402020991506, - "learning_rate": 9.746799416579403e-06, - "loss": 0.0306, - "num_tokens": 31468786.0, - "step": 70 - }, - { - "epoch": 0.6986469864698647, - "grad_norm": 0.09204922624438575, - "learning_rate": 9.737249491793587e-06, - "loss": 0.0273, - "num_tokens": 31905019.0, - "step": 71 - }, - { - "epoch": 0.7084870848708487, - "grad_norm": 0.08145687118724444, - "learning_rate": 9.727528199523923e-06, - "loss": 0.029, - "num_tokens": 32340154.0, - "step": 72 - }, - { - "epoch": 0.7183271832718328, - "grad_norm": 0.09506872052374568, - "learning_rate": 9.717635932929556e-06, - "loss": 0.0373, - "num_tokens": 32789598.0, - "step": 73 - }, - { - "epoch": 0.7281672816728167, - "grad_norm": 0.08326889230017241, - "learning_rate": 9.707573092084368e-06, - "loss": 0.0286, - "num_tokens": 33239225.0, - "step": 74 - }, - { - "epoch": 0.7380073800738007, - "grad_norm": 0.07636964575035168, - "learning_rate": 9.697340083960785e-06, - "loss": 0.0291, - "num_tokens": 33718797.0, - "step": 75 - }, - { - "epoch": 0.7478474784747847, - "grad_norm": 0.09488168094776525, - "learning_rate": 9.686937322413325e-06, - "loss": 0.0328, - "num_tokens": 34155674.0, - "step": 76 - }, - { - "epoch": 0.7576875768757687, - "grad_norm": 0.0778086138359463, - "learning_rate": 9.676365228161869e-06, - "loss": 0.0252, - "num_tokens": 34584921.0, - "step": 77 - }, - { - "epoch": 0.7675276752767528, - "grad_norm": 0.08557737550120906, - "learning_rate": 9.66562422877462e-06, - "loss": 0.0338, - "num_tokens": 35049146.0, - "step": 78 - }, - { - "epoch": 0.7773677736777368, - "grad_norm": 0.09181023650151289, - "learning_rate": 9.654714758650844e-06, - "loss": 0.0299, - "num_tokens": 35519987.0, - "step": 79 - }, - { - "epoch": 0.7872078720787208, - "grad_norm": 0.07639914292637208, - "learning_rate": 9.643637259003276e-06, - "loss": 0.0242, - "num_tokens": 35959127.0, - "step": 80 - }, - { - "epoch": 0.7970479704797048, - "grad_norm": 0.08200922089613671, - "learning_rate": 9.632392177840286e-06, - "loss": 0.0317, - "num_tokens": 36416651.0, - "step": 81 - }, - { - "epoch": 0.8068880688806888, - "grad_norm": 0.07954028434263948, - "learning_rate": 9.620979969947759e-06, - "loss": 0.0293, - "num_tokens": 36864154.0, - "step": 82 - }, - { - "epoch": 0.8167281672816729, - "grad_norm": 0.07878375949867687, - "learning_rate": 9.609401096870707e-06, - "loss": 0.0237, - "num_tokens": 37310281.0, - "step": 83 - }, - { - "epoch": 0.8265682656826568, - "grad_norm": 0.07728168843840597, - "learning_rate": 9.597656026894591e-06, - "loss": 0.0322, - "num_tokens": 37746606.0, - "step": 84 - }, - { - "epoch": 0.8364083640836408, - "grad_norm": 0.07855221188672869, - "learning_rate": 9.585745235026391e-06, - "loss": 0.0258, - "num_tokens": 38189615.0, - "step": 85 - }, - { - "epoch": 0.8462484624846248, - "grad_norm": 0.07691630967258262, - "learning_rate": 9.5736692029754e-06, - "loss": 0.0293, - "num_tokens": 38637318.0, - "step": 86 - }, - { - "epoch": 0.8560885608856088, - "grad_norm": 0.07209047793755496, - "learning_rate": 9.561428419133723e-06, - "loss": 0.0235, - "num_tokens": 39102853.0, - "step": 87 - }, - { - "epoch": 0.8659286592865929, - "grad_norm": 0.0802072339239599, - "learning_rate": 9.549023378556548e-06, - "loss": 0.0311, - "num_tokens": 39538535.0, - "step": 88 - }, - { - "epoch": 0.8757687576875769, - "grad_norm": 0.09334524313401625, - "learning_rate": 9.53645458294211e-06, - "loss": 0.0484, - "num_tokens": 40020296.0, - "step": 89 - }, - { - "epoch": 0.8856088560885609, - "grad_norm": 2.3961901610996605, - "learning_rate": 9.523722540611403e-06, - "loss": 0.3276, - "num_tokens": 40506093.0, - "step": 90 - }, - { - "epoch": 0.8954489544895449, - "grad_norm": 0.09376957957757263, - "learning_rate": 9.510827766487625e-06, - "loss": 0.0288, - "num_tokens": 40937880.0, - "step": 91 - }, - { - "epoch": 0.9052890528905289, - "grad_norm": 0.08607984794603309, - "learning_rate": 9.497770782075353e-06, - "loss": 0.0247, - "num_tokens": 41374337.0, - "step": 92 - }, - { - "epoch": 0.915129151291513, - "grad_norm": 0.07253858203781333, - "learning_rate": 9.484552115439445e-06, - "loss": 0.0293, - "num_tokens": 41811558.0, - "step": 93 - }, - { - "epoch": 0.9249692496924969, - "grad_norm": 0.07768364358007782, - "learning_rate": 9.471172301183695e-06, - "loss": 0.0257, - "num_tokens": 42259726.0, - "step": 94 - }, - { - "epoch": 0.9348093480934809, - "grad_norm": 0.0769153663260077, - "learning_rate": 9.4576318804292e-06, - "loss": 0.0232, - "num_tokens": 42684319.0, - "step": 95 - }, - { - "epoch": 0.9446494464944649, - "grad_norm": 0.08163342042509363, - "learning_rate": 9.443931400792486e-06, - "loss": 0.0256, - "num_tokens": 43113589.0, - "step": 96 - }, - { - "epoch": 0.9544895448954489, - "grad_norm": 0.06536764982172343, - "learning_rate": 9.430071416363352e-06, - "loss": 0.0218, - "num_tokens": 43575488.0, - "step": 97 - }, - { - "epoch": 0.964329643296433, - "grad_norm": 0.08195099679978833, - "learning_rate": 9.416052487682465e-06, - "loss": 0.0254, - "num_tokens": 44016216.0, - "step": 98 - }, - { - "epoch": 0.974169741697417, - "grad_norm": 0.1266005657397246, - "learning_rate": 9.401875181718686e-06, - "loss": 0.0454, - "num_tokens": 44497742.0, - "step": 99 - }, - { - "epoch": 0.984009840098401, - "grad_norm": 0.07988798247506342, - "learning_rate": 9.387540071846155e-06, - "loss": 0.024, - "num_tokens": 44935936.0, - "step": 100 - }, - { - "epoch": 0.993849938499385, - "grad_norm": 0.07277763654694067, - "learning_rate": 9.373047737821078e-06, - "loss": 0.0216, - "num_tokens": 45381042.0, - "step": 101 - }, - { - "epoch": 1.0, - "grad_norm": 0.07277763654694067, - "learning_rate": 9.358398765758296e-06, - "loss": 0.0229, - "num_tokens": 45593876.0, - "step": 102 - }, - { - "epoch": 1.0, - "eval_loss": 0.07811997085809708, - "eval_num_tokens": 45593876.0, - "eval_runtime": 54.709, - "eval_samples_per_second": 41.054, - "eval_steps_per_second": 5.136, - "step": 102 - }, - { - "epoch": 1.009840098400984, - "grad_norm": 0.10795878798324991, - "learning_rate": 9.34359374810758e-06, - "loss": 0.0201, - "num_tokens": 46020335.0, - "step": 103 - }, - { - "epoch": 1.019680196801968, - "grad_norm": 0.07593949135329942, - "learning_rate": 9.328633283629666e-06, - "loss": 0.0222, - "num_tokens": 46466853.0, - "step": 104 - }, - { - "epoch": 1.029520295202952, - "grad_norm": 0.07596980345063492, - "learning_rate": 9.31351797737204e-06, - "loss": 0.0253, - "num_tokens": 46900993.0, - "step": 105 - }, - { - "epoch": 1.039360393603936, - "grad_norm": 0.08317964089954727, - "learning_rate": 9.29824844064447e-06, - "loss": 0.0206, - "num_tokens": 47334869.0, - "step": 106 - }, - { - "epoch": 1.04920049200492, - "grad_norm": 0.0805362815127939, - "learning_rate": 9.282825290994282e-06, - "loss": 0.0213, - "num_tokens": 47797630.0, - "step": 107 - }, - { - "epoch": 1.0590405904059041, - "grad_norm": 0.07839099238240128, - "learning_rate": 9.267249152181379e-06, - "loss": 0.0454, - "num_tokens": 48281974.0, - "step": 108 - }, - { - "epoch": 1.068880688806888, - "grad_norm": 0.0757738535866923, - "learning_rate": 9.251520654153028e-06, - "loss": 0.022, - "num_tokens": 48730118.0, - "step": 109 - }, - { - "epoch": 1.0787207872078721, - "grad_norm": 0.08256710571520359, - "learning_rate": 9.235640433018363e-06, - "loss": 0.0195, - "num_tokens": 49197576.0, - "step": 110 - }, - { - "epoch": 1.088560885608856, - "grad_norm": 0.07849933177459094, - "learning_rate": 9.219609131022684e-06, - "loss": 0.0203, - "num_tokens": 49673054.0, - "step": 111 - }, - { - "epoch": 1.09840098400984, - "grad_norm": 0.08067924302373455, - "learning_rate": 9.203427396521454e-06, - "loss": 0.0219, - "num_tokens": 50130569.0, - "step": 112 - }, - { - "epoch": 1.1082410824108242, - "grad_norm": 0.07527801624664898, - "learning_rate": 9.187095883954104e-06, - "loss": 0.0195, - "num_tokens": 50574721.0, - "step": 113 - }, - { - "epoch": 1.118081180811808, - "grad_norm": 0.08229755724299215, - "learning_rate": 9.170615253817547e-06, - "loss": 0.0193, - "num_tokens": 51010865.0, - "step": 114 - }, - { - "epoch": 1.1279212792127922, - "grad_norm": 0.07673721236222701, - "learning_rate": 9.153986172639474e-06, - "loss": 0.0211, - "num_tokens": 51469765.0, - "step": 115 - }, - { - "epoch": 1.137761377613776, - "grad_norm": 0.0845900192373935, - "learning_rate": 9.137209312951395e-06, - "loss": 0.0226, - "num_tokens": 51906114.0, - "step": 116 - }, - { - "epoch": 1.1476014760147601, - "grad_norm": 0.08215860044207468, - "learning_rate": 9.12028535326144e-06, - "loss": 0.022, - "num_tokens": 52354068.0, - "step": 117 - }, - { - "epoch": 1.1574415744157442, - "grad_norm": 0.07420368746928867, - "learning_rate": 9.103214978026922e-06, - "loss": 0.0188, - "num_tokens": 52836346.0, - "step": 118 - }, - { - "epoch": 1.1672816728167281, - "grad_norm": 0.07450541307438634, - "learning_rate": 9.085998877626644e-06, - "loss": 0.0192, - "num_tokens": 53299172.0, - "step": 119 - }, - { - "epoch": 1.1771217712177122, - "grad_norm": 0.07878886229739003, - "learning_rate": 9.068637748332993e-06, - "loss": 0.0215, - "num_tokens": 53759861.0, - "step": 120 - }, - { - "epoch": 1.186961869618696, - "grad_norm": 0.08311056334441597, - "learning_rate": 9.051132292283772e-06, - "loss": 0.0208, - "num_tokens": 54228512.0, - "step": 121 - }, - { - "epoch": 1.1968019680196802, - "grad_norm": 0.07068781735081182, - "learning_rate": 9.033483217453801e-06, - "loss": 0.0812, - "num_tokens": 54692852.0, - "step": 122 - }, - { - "epoch": 1.2066420664206643, - "grad_norm": 0.596254901083269, - "learning_rate": 9.015691237626292e-06, - "loss": 0.0199, - "num_tokens": 55139782.0, - "step": 123 - }, - { - "epoch": 1.2164821648216482, - "grad_norm": 0.08202279255895727, - "learning_rate": 8.997757072363976e-06, - "loss": 0.0342, - "num_tokens": 55604658.0, - "step": 124 - }, - { - "epoch": 1.2263222632226323, - "grad_norm": 0.09057478290667956, - "learning_rate": 8.979681446980002e-06, - "loss": 0.0227, - "num_tokens": 56030690.0, - "step": 125 - }, - { - "epoch": 1.2361623616236161, - "grad_norm": 0.07661103115531635, - "learning_rate": 8.961465092508607e-06, - "loss": 0.0339, - "num_tokens": 56492821.0, - "step": 126 - }, - { - "epoch": 1.2460024600246002, - "grad_norm": 0.08310739437969392, - "learning_rate": 8.943108745675542e-06, - "loss": 0.0249, - "num_tokens": 56927699.0, - "step": 127 - }, - { - "epoch": 1.2558425584255843, - "grad_norm": 0.08009221352147507, - "learning_rate": 8.92461314886829e-06, - "loss": 0.0209, - "num_tokens": 57365827.0, - "step": 128 - }, - { - "epoch": 1.2656826568265682, - "grad_norm": 0.07973094836265254, - "learning_rate": 8.905979050106029e-06, - "loss": 0.0251, - "num_tokens": 57821453.0, - "step": 129 - }, - { - "epoch": 1.2755227552275523, - "grad_norm": 0.07600070319773061, - "learning_rate": 8.887207203009385e-06, - "loss": 0.0213, - "num_tokens": 58267867.0, - "step": 130 - }, - { - "epoch": 1.2853628536285364, - "grad_norm": 0.07258837000806613, - "learning_rate": 8.868298366769956e-06, - "loss": 0.0198, - "num_tokens": 58715078.0, - "step": 131 - }, - { - "epoch": 1.2952029520295203, - "grad_norm": 0.07826062337656157, - "learning_rate": 8.849253306119601e-06, - "loss": 0.0199, - "num_tokens": 59159310.0, - "step": 132 - }, - { - "epoch": 1.3050430504305042, - "grad_norm": 0.07770042002025847, - "learning_rate": 8.83007279129952e-06, - "loss": 0.027, - "num_tokens": 59594031.0, - "step": 133 - }, - { - "epoch": 1.3148831488314883, - "grad_norm": 0.07607344407726713, - "learning_rate": 8.810757598029094e-06, - "loss": 0.0342, - "num_tokens": 60038506.0, - "step": 134 - }, - { - "epoch": 1.3247232472324724, - "grad_norm": 0.08771686774228402, - "learning_rate": 8.79130850747452e-06, - "loss": 0.0234, - "num_tokens": 60492486.0, - "step": 135 - }, - { - "epoch": 1.3345633456334562, - "grad_norm": 0.07482147000786651, - "learning_rate": 8.771726306217217e-06, - "loss": 0.0196, - "num_tokens": 60925341.0, - "step": 136 - }, - { - "epoch": 1.3444034440344403, - "grad_norm": 0.07171750614547971, - "learning_rate": 8.752011786222011e-06, - "loss": 0.0224, - "num_tokens": 61401128.0, - "step": 137 - }, - { - "epoch": 1.3542435424354244, - "grad_norm": 0.07289189868770962, - "learning_rate": 8.732165744805107e-06, - "loss": 0.0198, - "num_tokens": 61845691.0, - "step": 138 - }, - { - "epoch": 1.3640836408364083, - "grad_norm": 0.07907747558023923, - "learning_rate": 8.712188984601845e-06, - "loss": 0.0185, - "num_tokens": 62286361.0, - "step": 139 - }, - { - "epoch": 1.3739237392373924, - "grad_norm": 0.06910414114179665, - "learning_rate": 8.692082313534233e-06, - "loss": 0.0179, - "num_tokens": 62727406.0, - "step": 140 - }, - { - "epoch": 1.3837638376383765, - "grad_norm": 0.07791959325829377, - "learning_rate": 8.671846544778284e-06, - "loss": 0.0204, - "num_tokens": 63182141.0, - "step": 141 - }, - { - "epoch": 1.3936039360393604, - "grad_norm": 0.0741558195977179, - "learning_rate": 8.651482496731116e-06, - "loss": 0.0178, - "num_tokens": 63600729.0, - "step": 142 - }, - { - "epoch": 1.4034440344403443, - "grad_norm": 0.07283375136096223, - "learning_rate": 8.630990992977854e-06, - "loss": 0.0198, - "num_tokens": 64066267.0, - "step": 143 - }, - { - "epoch": 1.4132841328413284, - "grad_norm": 0.0731783816547012, - "learning_rate": 8.61037286225834e-06, - "loss": 0.2547, - "num_tokens": 64515946.0, - "step": 144 - }, - { - "epoch": 1.4231242312423125, - "grad_norm": 1.0212050791856901, - "learning_rate": 8.589628938433587e-06, - "loss": 0.0192, - "num_tokens": 64949958.0, - "step": 145 - }, - { - "epoch": 1.4329643296432963, - "grad_norm": 0.09844320658741419, - "learning_rate": 8.56876006045208e-06, - "loss": 0.0176, - "num_tokens": 65381018.0, - "step": 146 - }, - { - "epoch": 1.4428044280442804, - "grad_norm": 0.07030907656382593, - "learning_rate": 8.547767072315835e-06, - "loss": 0.0241, - "num_tokens": 65814016.0, - "step": 147 - }, - { - "epoch": 1.4526445264452645, - "grad_norm": 0.0779412275694533, - "learning_rate": 8.526650823046266e-06, - "loss": 0.0265, - "num_tokens": 66252980.0, - "step": 148 - }, - { - "epoch": 1.4624846248462484, - "grad_norm": 0.09570533939331194, - "learning_rate": 8.505412166649847e-06, - "loss": 0.0199, - "num_tokens": 66718111.0, - "step": 149 - }, - { - "epoch": 1.4723247232472325, - "grad_norm": 0.07915246167438994, - "learning_rate": 8.484051962083579e-06, - "loss": 0.0204, - "num_tokens": 67163762.0, - "step": 150 - }, - { - "epoch": 1.4821648216482166, - "grad_norm": 0.07935176799416567, - "learning_rate": 8.462571073220243e-06, - "loss": 0.0225, - "num_tokens": 67624386.0, - "step": 151 - }, - { - "epoch": 1.4920049200492005, - "grad_norm": 0.07841589822630919, - "learning_rate": 8.44097036881347e-06, - "loss": 0.0392, - "num_tokens": 68065290.0, - "step": 152 - }, - { - "epoch": 1.5018450184501844, - "grad_norm": 0.3517146293571387, - "learning_rate": 8.419250722462603e-06, - "loss": 0.0178, - "num_tokens": 68519107.0, - "step": 153 - }, - { - "epoch": 1.5116851168511685, - "grad_norm": 0.0764909788834621, - "learning_rate": 8.39741301257736e-06, - "loss": 0.0194, - "num_tokens": 68971128.0, - "step": 154 - }, - { - "epoch": 1.5215252152521526, - "grad_norm": 0.08078822036852527, - "learning_rate": 8.375458122342317e-06, - "loss": 0.0206, - "num_tokens": 69403792.0, - "step": 155 - }, - { - "epoch": 1.5313653136531364, - "grad_norm": 0.08235320219175549, - "learning_rate": 8.353386939681186e-06, - "loss": 0.0175, - "num_tokens": 69836602.0, - "step": 156 - }, - { - "epoch": 1.5412054120541205, - "grad_norm": 0.0735540837139594, - "learning_rate": 8.331200357220908e-06, - "loss": 0.0194, - "num_tokens": 70283814.0, - "step": 157 - }, - { - "epoch": 1.5510455104551046, - "grad_norm": 0.07322399084658018, - "learning_rate": 8.308899272255542e-06, - "loss": 0.0184, - "num_tokens": 70726284.0, - "step": 158 - }, - { - "epoch": 1.5608856088560885, - "grad_norm": 0.07790348390650517, - "learning_rate": 8.286484586709989e-06, - "loss": 0.0183, - "num_tokens": 71155169.0, - "step": 159 - }, - { - "epoch": 1.5707257072570726, - "grad_norm": 0.08611809383964489, - "learning_rate": 8.263957207103506e-06, - "loss": 0.0205, - "num_tokens": 71591204.0, - "step": 160 - }, - { - "epoch": 1.5805658056580567, - "grad_norm": 0.0706229845173915, - "learning_rate": 8.241318044513046e-06, - "loss": 0.0277, - "num_tokens": 72032119.0, - "step": 161 - }, - { - "epoch": 1.5904059040590406, - "grad_norm": 0.09019039164269532, - "learning_rate": 8.218568014536414e-06, - "loss": 0.0176, - "num_tokens": 72492164.0, - "step": 162 - }, - { - "epoch": 1.6002460024600245, - "grad_norm": 0.07947315916491103, - "learning_rate": 8.195708037255233e-06, - "loss": 0.0202, - "num_tokens": 72962752.0, - "step": 163 - }, - { - "epoch": 1.6100861008610086, - "grad_norm": 0.06840189166732885, - "learning_rate": 8.172739037197739e-06, - "loss": 0.018, - "num_tokens": 73415974.0, - "step": 164 - }, - { - "epoch": 1.6199261992619927, - "grad_norm": 0.07366616747573093, - "learning_rate": 8.149661943301382e-06, - "loss": 0.0181, - "num_tokens": 73882834.0, - "step": 165 - }, - { - "epoch": 1.6297662976629765, - "grad_norm": 0.07081012920317416, - "learning_rate": 8.126477688875262e-06, - "loss": 0.0204, - "num_tokens": 74321580.0, - "step": 166 - }, - { - "epoch": 1.6396063960639606, - "grad_norm": 0.07863097311534642, - "learning_rate": 8.103187211562386e-06, - "loss": 0.0229, - "num_tokens": 74781751.0, - "step": 167 - }, - { - "epoch": 1.6494464944649447, - "grad_norm": 0.10797044478776457, - "learning_rate": 8.079791453301742e-06, - "loss": 0.0287, - "num_tokens": 75219935.0, - "step": 168 - }, - { - "epoch": 1.6592865928659286, - "grad_norm": 0.07041534985061697, - "learning_rate": 8.056291360290202e-06, - "loss": 0.0248, - "num_tokens": 75665232.0, - "step": 169 - }, - { - "epoch": 1.6691266912669127, - "grad_norm": 0.08695303118518641, - "learning_rate": 8.032687882944264e-06, - "loss": 0.0193, - "num_tokens": 76087411.0, - "step": 170 - }, - { - "epoch": 1.6789667896678968, - "grad_norm": 0.06704813880798238, - "learning_rate": 8.0089819758616e-06, - "loss": 0.0169, - "num_tokens": 76529931.0, - "step": 171 - }, - { - "epoch": 1.6888068880688807, - "grad_norm": 0.06935996975041725, - "learning_rate": 7.985174597782469e-06, - "loss": 0.0197, - "num_tokens": 76974869.0, - "step": 172 - }, - { - "epoch": 1.6986469864698646, - "grad_norm": 0.0812644475398725, - "learning_rate": 7.961266711550922e-06, - "loss": 0.0259, - "num_tokens": 77413009.0, - "step": 173 - }, - { - "epoch": 1.7084870848708487, - "grad_norm": 0.07469198601302375, - "learning_rate": 7.937259284075872e-06, - "loss": 0.0191, - "num_tokens": 77854298.0, - "step": 174 - }, - { - "epoch": 1.7183271832718328, - "grad_norm": 0.07554209425696685, - "learning_rate": 7.913153286291995e-06, - "loss": 0.025, - "num_tokens": 78299682.0, - "step": 175 - }, - { - "epoch": 1.7281672816728166, - "grad_norm": 0.07564661483692575, - "learning_rate": 7.888949693120443e-06, - "loss": 0.0172, - "num_tokens": 78723460.0, - "step": 176 - }, - { - "epoch": 1.7380073800738007, - "grad_norm": 0.6264202015289688, - "learning_rate": 7.864649483429442e-06, - "loss": 0.0402, - "num_tokens": 79151526.0, - "step": 177 - }, - { - "epoch": 1.7478474784747848, - "grad_norm": 0.07431323606896861, - "learning_rate": 7.840253639994676e-06, - "loss": 0.0182, - "num_tokens": 79591692.0, - "step": 178 - }, - { - "epoch": 1.7576875768757687, - "grad_norm": 0.07199128250127072, - "learning_rate": 7.815763149459563e-06, - "loss": 0.018, - "num_tokens": 80054397.0, - "step": 179 - }, - { - "epoch": 1.7675276752767528, - "grad_norm": 0.0736771332831437, - "learning_rate": 7.791179002295334e-06, - "loss": 0.0182, - "num_tokens": 80527436.0, - "step": 180 - }, - { - "epoch": 1.777367773677737, - "grad_norm": 0.0722896910687323, - "learning_rate": 7.766502192760995e-06, - "loss": 0.0299, - "num_tokens": 80984085.0, - "step": 181 - }, - { - "epoch": 1.7872078720787208, - "grad_norm": 0.13146348676004535, - "learning_rate": 7.741733718863096e-06, - "loss": 0.0172, - "num_tokens": 81417093.0, - "step": 182 - }, - { - "epoch": 1.7970479704797047, - "grad_norm": 0.07559775090622188, - "learning_rate": 7.71687458231538e-06, - "loss": 0.0173, - "num_tokens": 81857802.0, - "step": 183 - }, - { - "epoch": 1.8068880688806888, - "grad_norm": 0.07625026619956689, - "learning_rate": 7.69192578849827e-06, - "loss": 0.0174, - "num_tokens": 82314635.0, - "step": 184 - }, - { - "epoch": 1.8167281672816729, - "grad_norm": 0.07079163666898536, - "learning_rate": 7.666888346418205e-06, - "loss": 0.0255, - "num_tokens": 82774404.0, - "step": 185 - }, - { - "epoch": 1.8265682656826567, - "grad_norm": 0.07862230056744444, - "learning_rate": 7.641763268666832e-06, - "loss": 0.0166, - "num_tokens": 83224858.0, - "step": 186 - }, - { - "epoch": 1.8364083640836408, - "grad_norm": 0.07767548895299481, - "learning_rate": 7.616551571380061e-06, - "loss": 0.0303, - "num_tokens": 83685638.0, - "step": 187 - }, - { - "epoch": 1.846248462484625, - "grad_norm": 0.0767555813557926, - "learning_rate": 7.5912542741969585e-06, - "loss": 0.0173, - "num_tokens": 84118329.0, - "step": 188 - }, - { - "epoch": 1.8560885608856088, - "grad_norm": 0.06505326217418561, - "learning_rate": 7.5658724002185215e-06, - "loss": 0.2302, - "num_tokens": 84642441.0, - "step": 189 - }, - { - "epoch": 1.865928659286593, - "grad_norm": 0.9831912884395022, - "learning_rate": 7.54040697596629e-06, - "loss": 0.0173, - "num_tokens": 85075013.0, - "step": 190 - }, - { - "epoch": 1.875768757687577, - "grad_norm": 0.0852074767092427, - "learning_rate": 7.514859031340835e-06, - "loss": 0.0197, - "num_tokens": 85539398.0, - "step": 191 - }, - { - "epoch": 1.8856088560885609, - "grad_norm": 0.07502455159038045, - "learning_rate": 7.489229599580111e-06, - "loss": 0.0167, - "num_tokens": 85976652.0, - "step": 192 - }, - { - "epoch": 1.8954489544895448, - "grad_norm": 0.07796568336104527, - "learning_rate": 7.463519717217663e-06, - "loss": 0.0253, - "num_tokens": 86404836.0, - "step": 193 - }, - { - "epoch": 1.9052890528905289, - "grad_norm": 0.07733304316410633, - "learning_rate": 7.437730424040702e-06, - "loss": 0.0232, - "num_tokens": 86871021.0, - "step": 194 - }, - { - "epoch": 1.915129151291513, - "grad_norm": 0.07837311923363188, - "learning_rate": 7.411862763048068e-06, - "loss": 0.0228, - "num_tokens": 87328297.0, - "step": 195 - }, - { - "epoch": 1.9249692496924968, - "grad_norm": 0.07159308881612252, - "learning_rate": 7.38591778040803e-06, - "loss": 0.0178, - "num_tokens": 87780478.0, - "step": 196 - }, - { - "epoch": 1.934809348093481, - "grad_norm": 0.06995284279442164, - "learning_rate": 7.359896525415986e-06, - "loss": 0.0166, - "num_tokens": 88245218.0, - "step": 197 - }, - { - "epoch": 1.944649446494465, - "grad_norm": 0.074185946727602, - "learning_rate": 7.333800050452024e-06, - "loss": 0.0335, - "num_tokens": 88720048.0, - "step": 198 - }, - { - "epoch": 1.954489544895449, - "grad_norm": 0.0936664061322253, - "learning_rate": 7.307629410938364e-06, - "loss": 0.0156, - "num_tokens": 89171687.0, - "step": 199 - }, - { - "epoch": 1.964329643296433, - "grad_norm": 0.06592479834851843, - "learning_rate": 7.281385665296663e-06, - "loss": 0.0162, - "num_tokens": 89636320.0, - "step": 200 - }, - { - "epoch": 1.974169741697417, - "grad_norm": 0.08486840853612633, - "learning_rate": 7.255069874905221e-06, - "loss": 0.0177, - "num_tokens": 90074778.0, - "step": 201 - }, - { - "epoch": 1.984009840098401, - "grad_norm": 0.06923307537599123, - "learning_rate": 7.228683104056051e-06, - "loss": 0.0168, - "num_tokens": 90519743.0, - "step": 202 - }, - { - "epoch": 1.9938499384993849, - "grad_norm": 0.09250588119689185, - "learning_rate": 7.202226419911832e-06, - "loss": 0.0266, - "num_tokens": 90971202.0, - "step": 203 - }, - { - "epoch": 2.0, - "grad_norm": 0.08932358174959376, - "learning_rate": 7.175700892462757e-06, - "loss": 0.0167, - "num_tokens": 91183681.0, - "step": 204 - }, - { - "epoch": 2.0, - "eval_loss": 0.08808860927820206, - "eval_num_tokens": 91183681.0, - "eval_runtime": 53.9315, - "eval_samples_per_second": 41.645, - "eval_steps_per_second": 5.21, - "step": 204 - }, - { - "epoch": 2.009840098400984, - "grad_norm": 0.07874869315833909, - "learning_rate": 7.149107594483251e-06, - "loss": 0.0142, - "num_tokens": 91625671.0, - "step": 205 - }, - { - "epoch": 2.019680196801968, - "grad_norm": 0.06385620551213778, - "learning_rate": 7.122447601488592e-06, - "loss": 0.0132, - "num_tokens": 92071488.0, - "step": 206 - }, - { - "epoch": 2.029520295202952, - "grad_norm": 0.06846197400142105, - "learning_rate": 7.095721991691411e-06, - "loss": 0.0149, - "num_tokens": 92542156.0, - "step": 207 - }, - { - "epoch": 2.039360393603936, - "grad_norm": 0.07424945414823086, - "learning_rate": 7.0689318459580845e-06, - "loss": 0.0156, - "num_tokens": 93002703.0, - "step": 208 - }, - { - "epoch": 2.0492004920049203, - "grad_norm": 0.06687580312011086, - "learning_rate": 7.042078247765019e-06, - "loss": 0.0135, - "num_tokens": 93436834.0, - "step": 209 - }, - { - "epoch": 2.059040590405904, - "grad_norm": 0.07720021453648518, - "learning_rate": 7.015162283154843e-06, - "loss": 0.0137, - "num_tokens": 93871635.0, - "step": 210 - }, - { - "epoch": 2.068880688806888, - "grad_norm": 0.13453391743262458, - "learning_rate": 6.988185040692469e-06, - "loss": 0.0221, - "num_tokens": 94314058.0, - "step": 211 - }, - { - "epoch": 2.078720787207872, - "grad_norm": 0.07982223152072775, - "learning_rate": 6.961147611421076e-06, - "loss": 0.017, - "num_tokens": 94750976.0, - "step": 212 - }, - { - "epoch": 2.088560885608856, - "grad_norm": 0.06995730861373262, - "learning_rate": 6.934051088817988e-06, - "loss": 0.0137, - "num_tokens": 95193789.0, - "step": 213 - }, - { - "epoch": 2.09840098400984, - "grad_norm": 0.07438600726959783, - "learning_rate": 6.906896568750441e-06, - "loss": 0.0193, - "num_tokens": 95676386.0, - "step": 214 - }, - { - "epoch": 2.108241082410824, - "grad_norm": 0.09331884860488432, - "learning_rate": 6.87968514943127e-06, - "loss": 0.0154, - "num_tokens": 96137917.0, - "step": 215 - }, - { - "epoch": 2.1180811808118083, - "grad_norm": 0.06703452835053635, - "learning_rate": 6.852417931374494e-06, - "loss": 0.0134, - "num_tokens": 96568059.0, - "step": 216 - }, - { - "epoch": 2.127921279212792, - "grad_norm": 0.07093081986870549, - "learning_rate": 6.825096017350807e-06, - "loss": 0.0138, - "num_tokens": 97019588.0, - "step": 217 - }, - { - "epoch": 2.137761377613776, - "grad_norm": 0.0650948479503258, - "learning_rate": 6.797720512342967e-06, - "loss": 0.0137, - "num_tokens": 97456418.0, - "step": 218 - }, - { - "epoch": 2.14760147601476, - "grad_norm": 0.06693139683273135, - "learning_rate": 6.77029252350113e-06, - "loss": 0.0142, - "num_tokens": 97874765.0, - "step": 219 - }, - { - "epoch": 2.1574415744157442, - "grad_norm": 0.07881816970778455, - "learning_rate": 6.742813160098054e-06, - "loss": 0.0188, - "num_tokens": 98322373.0, - "step": 220 - }, - { - "epoch": 2.167281672816728, - "grad_norm": 0.07381706020969016, - "learning_rate": 6.715283533484242e-06, - "loss": 0.0125, - "num_tokens": 98762055.0, - "step": 221 - }, - { - "epoch": 2.177121771217712, - "grad_norm": 0.06829050170688594, - "learning_rate": 6.6877047570430044e-06, - "loss": 0.0147, - "num_tokens": 99212257.0, - "step": 222 - }, - { - "epoch": 2.1869618696186963, - "grad_norm": 0.0726323898489312, - "learning_rate": 6.660077946145412e-06, - "loss": 0.0149, - "num_tokens": 99651696.0, - "step": 223 - }, - { - "epoch": 2.19680196801968, - "grad_norm": 0.06996376101830218, - "learning_rate": 6.632404218105205e-06, - "loss": 0.014, - "num_tokens": 100115333.0, - "step": 224 - }, - { - "epoch": 2.206642066420664, - "grad_norm": 0.07058857975728597, - "learning_rate": 6.604684692133597e-06, - "loss": 0.0128, - "num_tokens": 100567168.0, - "step": 225 - }, - { - "epoch": 2.2164821648216484, - "grad_norm": 0.06705830086377462, - "learning_rate": 6.576920489294011e-06, - "loss": 0.014, - "num_tokens": 101017414.0, - "step": 226 - }, - { - "epoch": 2.2263222632226323, - "grad_norm": 0.08216121325842957, - "learning_rate": 6.549112732456739e-06, - "loss": 0.0244, - "num_tokens": 101478653.0, - "step": 227 - }, - { - "epoch": 2.236162361623616, - "grad_norm": 0.06604918422838713, - "learning_rate": 6.5212625462535365e-06, - "loss": 0.0133, - "num_tokens": 101922998.0, - "step": 228 - }, - { - "epoch": 2.2460024600246005, - "grad_norm": 0.06450225948970358, - "learning_rate": 6.493371057032129e-06, - "loss": 0.0149, - "num_tokens": 102357947.0, - "step": 229 - }, - { - "epoch": 2.2558425584255843, - "grad_norm": 0.07514996917424294, - "learning_rate": 6.465439392810664e-06, - "loss": 0.0167, - "num_tokens": 102803832.0, - "step": 230 - }, - { - "epoch": 2.265682656826568, - "grad_norm": 0.06462428507734051, - "learning_rate": 6.4374686832320944e-06, - "loss": 0.0142, - "num_tokens": 103241692.0, - "step": 231 - }, - { - "epoch": 2.275522755227552, - "grad_norm": 0.06485952063828938, - "learning_rate": 6.409460059518482e-06, - "loss": 0.0136, - "num_tokens": 103688326.0, - "step": 232 - }, - { - "epoch": 2.2853628536285364, - "grad_norm": 0.06533997999817706, - "learning_rate": 6.381414654425261e-06, - "loss": 0.0131, - "num_tokens": 104139997.0, - "step": 233 - }, - { - "epoch": 2.2952029520295203, - "grad_norm": 0.06878268907753365, - "learning_rate": 6.353333602195414e-06, - "loss": 0.0138, - "num_tokens": 104583247.0, - "step": 234 - }, - { - "epoch": 2.305043050430504, - "grad_norm": 0.061527579151490784, - "learning_rate": 6.325218038513604e-06, - "loss": 0.0129, - "num_tokens": 105013546.0, - "step": 235 - }, - { - "epoch": 2.3148831488314885, - "grad_norm": 0.0688594189041464, - "learning_rate": 6.2970691004602425e-06, - "loss": 0.0147, - "num_tokens": 105469533.0, - "step": 236 - }, - { - "epoch": 2.3247232472324724, - "grad_norm": 0.07212293085873876, - "learning_rate": 6.26888792646551e-06, - "loss": 0.0138, - "num_tokens": 105902012.0, - "step": 237 - }, - { - "epoch": 2.3345633456334562, - "grad_norm": 0.07097729248579715, - "learning_rate": 6.240675656263303e-06, - "loss": 0.0133, - "num_tokens": 106319708.0, - "step": 238 - }, - { - "epoch": 2.34440344403444, - "grad_norm": 0.0702207231329528, - "learning_rate": 6.212433430845145e-06, - "loss": 0.0136, - "num_tokens": 106767770.0, - "step": 239 - }, - { - "epoch": 2.3542435424354244, - "grad_norm": 0.06717197740035392, - "learning_rate": 6.184162392414044e-06, - "loss": 0.0127, - "num_tokens": 107230010.0, - "step": 240 - }, - { - "epoch": 2.3640836408364083, - "grad_norm": 0.09206853570190297, - "learning_rate": 6.155863684338294e-06, - "loss": 0.0182, - "num_tokens": 107696665.0, - "step": 241 - }, - { - "epoch": 2.373923739237392, - "grad_norm": 0.07931539686074184, - "learning_rate": 6.127538451105232e-06, - "loss": 0.0156, - "num_tokens": 108145998.0, - "step": 242 - }, - { - "epoch": 2.3837638376383765, - "grad_norm": 0.0845167365221342, - "learning_rate": 6.099187838274959e-06, - "loss": 0.0304, - "num_tokens": 108605347.0, - "step": 243 - }, - { - "epoch": 2.3936039360393604, - "grad_norm": 0.8319925155014395, - "learning_rate": 6.070812992434003e-06, - "loss": 0.077, - "num_tokens": 109053120.0, - "step": 244 - }, - { - "epoch": 2.4034440344403443, - "grad_norm": 0.08254084053779843, - "learning_rate": 6.042415061148954e-06, - "loss": 0.0153, - "num_tokens": 109511574.0, - "step": 245 - }, - { - "epoch": 2.4132841328413286, - "grad_norm": 0.07621464852457635, - "learning_rate": 6.013995192920044e-06, - "loss": 0.013, - "num_tokens": 109961861.0, - "step": 246 - }, - { - "epoch": 2.4231242312423125, - "grad_norm": 0.06290755400921484, - "learning_rate": 5.985554537134702e-06, - "loss": 0.0133, - "num_tokens": 110439530.0, - "step": 247 - }, - { - "epoch": 2.4329643296432963, - "grad_norm": 0.06549923207889226, - "learning_rate": 5.957094244021071e-06, - "loss": 0.0133, - "num_tokens": 110902468.0, - "step": 248 - }, - { - "epoch": 2.4428044280442807, - "grad_norm": 0.06398296126869986, - "learning_rate": 5.928615464601497e-06, - "loss": 0.0128, - "num_tokens": 111361759.0, - "step": 249 - }, - { - "epoch": 2.4526445264452645, - "grad_norm": 0.062244715362799644, - "learning_rate": 5.900119350645956e-06, - "loss": 0.0128, - "num_tokens": 111799435.0, - "step": 250 - }, - { - "epoch": 2.4624846248462484, - "grad_norm": 0.06503161600374163, - "learning_rate": 5.871607054625497e-06, - "loss": 0.0128, - "num_tokens": 112244747.0, - "step": 251 - }, - { - "epoch": 2.4723247232472323, - "grad_norm": 0.08086590997362891, - "learning_rate": 5.8430797296656125e-06, - "loss": 0.0184, - "num_tokens": 112678903.0, - "step": 252 - }, - { - "epoch": 2.4821648216482166, - "grad_norm": 0.07239451855920867, - "learning_rate": 5.814538529499622e-06, - "loss": 0.0149, - "num_tokens": 113132832.0, - "step": 253 - }, - { - "epoch": 2.4920049200492005, - "grad_norm": 0.06030312987290577, - "learning_rate": 5.785984608421993e-06, - "loss": 0.0127, - "num_tokens": 113568429.0, - "step": 254 - }, - { - "epoch": 2.5018450184501844, - "grad_norm": 0.06349775541516244, - "learning_rate": 5.757419121241667e-06, - "loss": 0.0125, - "num_tokens": 114042240.0, - "step": 255 - }, - { - "epoch": 2.5116851168511687, - "grad_norm": 0.06952013750985335, - "learning_rate": 5.7288432232353615e-06, - "loss": 0.0204, - "num_tokens": 114496441.0, - "step": 256 - }, - { - "epoch": 2.5215252152521526, - "grad_norm": 0.0958262233433174, - "learning_rate": 5.7002580701008325e-06, - "loss": 0.0149, - "num_tokens": 114936236.0, - "step": 257 - }, - { - "epoch": 2.5313653136531364, - "grad_norm": 0.06572975411347728, - "learning_rate": 5.6716648179101445e-06, - "loss": 0.0123, - "num_tokens": 115365529.0, - "step": 258 - }, - { - "epoch": 2.5412054120541203, - "grad_norm": 0.07287254897275752, - "learning_rate": 5.64306462306291e-06, - "loss": 0.0177, - "num_tokens": 115812361.0, - "step": 259 - }, - { - "epoch": 2.5510455104551046, - "grad_norm": 0.0677506186552676, - "learning_rate": 5.614458642239534e-06, - "loss": 0.0126, - "num_tokens": 116269752.0, - "step": 260 - }, - { - "epoch": 2.5608856088560885, - "grad_norm": 0.07088790175345892, - "learning_rate": 5.585848032354411e-06, - "loss": 0.0139, - "num_tokens": 116739082.0, - "step": 261 - }, - { - "epoch": 2.570725707257073, - "grad_norm": 2.483507979054926, - "learning_rate": 5.557233950509159e-06, - "loss": 0.3298, - "num_tokens": 117236975.0, - "step": 262 - }, - { - "epoch": 2.5805658056580567, - "grad_norm": 0.6712341553033803, - "learning_rate": 5.528617553945807e-06, - "loss": 0.0131, - "num_tokens": 117701799.0, - "step": 263 - }, - { - "epoch": 2.5904059040590406, - "grad_norm": 0.070379027103792, - "learning_rate": 5.500000000000001e-06, - "loss": 0.019, - "num_tokens": 118190544.0, - "step": 264 - }, - { - "epoch": 2.6002460024600245, - "grad_norm": 0.09944926431551483, - "learning_rate": 5.4713824460541964e-06, - "loss": 0.0153, - "num_tokens": 118625146.0, - "step": 265 - }, - { - "epoch": 2.6100861008610083, - "grad_norm": 0.07370939155932825, - "learning_rate": 5.442766049490843e-06, - "loss": 0.0138, - "num_tokens": 119077739.0, - "step": 266 - }, - { - "epoch": 2.6199261992619927, - "grad_norm": 0.06555516765204612, - "learning_rate": 5.414151967645591e-06, - "loss": 0.0136, - "num_tokens": 119502701.0, - "step": 267 - }, - { - "epoch": 2.6297662976629765, - "grad_norm": 0.060577987544993946, - "learning_rate": 5.385541357760469e-06, - "loss": 0.0121, - "num_tokens": 119956823.0, - "step": 268 - }, - { - "epoch": 2.639606396063961, - "grad_norm": 0.06969958736256228, - "learning_rate": 5.35693537693709e-06, - "loss": 0.0131, - "num_tokens": 120410284.0, - "step": 269 - }, - { - "epoch": 2.6494464944649447, - "grad_norm": 0.08178808292429539, - "learning_rate": 5.3283351820898586e-06, - "loss": 0.0183, - "num_tokens": 120837514.0, - "step": 270 - }, - { - "epoch": 2.6592865928659286, - "grad_norm": 0.12228602708630738, - "learning_rate": 5.299741929899171e-06, - "loss": 0.0206, - "num_tokens": 121266377.0, - "step": 271 - }, - { - "epoch": 2.6691266912669125, - "grad_norm": 0.07647057417070459, - "learning_rate": 5.27115677676464e-06, - "loss": 0.0154, - "num_tokens": 121730907.0, - "step": 272 - }, - { - "epoch": 2.678966789667897, - "grad_norm": 0.07263570161343703, - "learning_rate": 5.242580878758334e-06, - "loss": 0.0138, - "num_tokens": 122162564.0, - "step": 273 - }, - { - "epoch": 2.6888068880688807, - "grad_norm": 0.07390794347850005, - "learning_rate": 5.21401539157801e-06, - "loss": 0.0131, - "num_tokens": 122644233.0, - "step": 274 - }, - { - "epoch": 2.6986469864698646, - "grad_norm": 0.05624120433704004, - "learning_rate": 5.1854614705003796e-06, - "loss": 0.0114, - "num_tokens": 123070674.0, - "step": 275 - }, - { - "epoch": 2.708487084870849, - "grad_norm": 0.07371873132309133, - "learning_rate": 5.156920270334389e-06, - "loss": 0.0194, - "num_tokens": 123517476.0, - "step": 276 - }, - { - "epoch": 2.7183271832718328, - "grad_norm": 0.06758978472435712, - "learning_rate": 5.1283929453745055e-06, - "loss": 0.0129, - "num_tokens": 123957650.0, - "step": 277 - }, - { - "epoch": 2.7281672816728166, - "grad_norm": 0.06857276382476074, - "learning_rate": 5.099880649354044e-06, - "loss": 0.0125, - "num_tokens": 124423561.0, - "step": 278 - }, - { - "epoch": 2.7380073800738005, - "grad_norm": 0.06198166285648246, - "learning_rate": 5.071384535398505e-06, - "loss": 0.0119, - "num_tokens": 124871204.0, - "step": 279 - }, - { - "epoch": 2.747847478474785, - "grad_norm": 0.05801997208341688, - "learning_rate": 5.04290575597893e-06, - "loss": 0.0119, - "num_tokens": 125320936.0, - "step": 280 - }, - { - "epoch": 2.7576875768757687, - "grad_norm": 0.09983800531852628, - "learning_rate": 5.0144454628653015e-06, - "loss": 0.0157, - "num_tokens": 125785587.0, - "step": 281 - }, - { - "epoch": 2.767527675276753, - "grad_norm": 0.05961861980322237, - "learning_rate": 4.986004807079959e-06, - "loss": 0.0119, - "num_tokens": 126223799.0, - "step": 282 - }, - { - "epoch": 2.777367773677737, - "grad_norm": 0.06887056012305312, - "learning_rate": 4.957584938851048e-06, - "loss": 0.0127, - "num_tokens": 126674560.0, - "step": 283 - }, - { - "epoch": 2.787207872078721, - "grad_norm": 0.06432285678662777, - "learning_rate": 4.929187007565996e-06, - "loss": 0.0124, - "num_tokens": 127121758.0, - "step": 284 - }, - { - "epoch": 2.7970479704797047, - "grad_norm": 0.06283306903955838, - "learning_rate": 4.9008121617250425e-06, - "loss": 0.0122, - "num_tokens": 127564319.0, - "step": 285 - }, - { - "epoch": 2.8068880688806885, - "grad_norm": 0.07395862495517919, - "learning_rate": 4.87246154889477e-06, - "loss": 0.0125, - "num_tokens": 128014723.0, - "step": 286 - }, - { - "epoch": 2.816728167281673, - "grad_norm": 0.06772968868173306, - "learning_rate": 4.8441363156617085e-06, - "loss": 0.026, - "num_tokens": 128456573.0, - "step": 287 - }, - { - "epoch": 2.8265682656826567, - "grad_norm": 0.2058477599150272, - "learning_rate": 4.815837607585957e-06, - "loss": 0.0313, - "num_tokens": 128888085.0, - "step": 288 - }, - { - "epoch": 2.836408364083641, - "grad_norm": 0.05983028509302605, - "learning_rate": 4.787566569154855e-06, - "loss": 0.0136, - "num_tokens": 129344186.0, - "step": 289 - }, - { - "epoch": 2.846248462484625, - "grad_norm": 0.1679165256737002, - "learning_rate": 4.759324343736698e-06, - "loss": 0.0268, - "num_tokens": 129820337.0, - "step": 290 - }, - { - "epoch": 2.856088560885609, - "grad_norm": 0.069693981729958, - "learning_rate": 4.731112073534491e-06, - "loss": 0.012, - "num_tokens": 130264132.0, - "step": 291 - }, - { - "epoch": 2.8659286592865927, - "grad_norm": 0.05673801969192786, - "learning_rate": 4.70293089953976e-06, - "loss": 0.237, - "num_tokens": 130747367.0, - "step": 292 - }, - { - "epoch": 2.875768757687577, - "grad_norm": 0.9244716369700087, - "learning_rate": 4.674781961486399e-06, - "loss": 0.0129, - "num_tokens": 131189544.0, - "step": 293 - }, - { - "epoch": 2.885608856088561, - "grad_norm": 0.0670539720853974, - "learning_rate": 4.646666397804586e-06, - "loss": 0.0127, - "num_tokens": 131615817.0, - "step": 294 - }, - { - "epoch": 2.8954489544895448, - "grad_norm": 0.07778029323101539, - "learning_rate": 4.618585345574741e-06, - "loss": 0.0136, - "num_tokens": 132065833.0, - "step": 295 - }, - { - "epoch": 2.905289052890529, - "grad_norm": 0.06633645417900966, - "learning_rate": 4.5905399404815196e-06, - "loss": 0.0119, - "num_tokens": 132513181.0, - "step": 296 - }, - { - "epoch": 2.915129151291513, - "grad_norm": 0.06604742202311176, - "learning_rate": 4.562531316767908e-06, - "loss": 0.0178, - "num_tokens": 132975979.0, - "step": 297 - }, - { - "epoch": 2.924969249692497, - "grad_norm": 0.06375772945002761, - "learning_rate": 4.534560607189338e-06, - "loss": 0.0121, - "num_tokens": 133411946.0, - "step": 298 - }, - { - "epoch": 2.9348093480934807, - "grad_norm": 0.0644873715390372, - "learning_rate": 4.506628942967874e-06, - "loss": 0.0226, - "num_tokens": 133882037.0, - "step": 299 - }, - { - "epoch": 2.944649446494465, - "grad_norm": 0.06122403707300358, - "learning_rate": 4.478737453746464e-06, - "loss": 0.0111, - "num_tokens": 134338580.0, - "step": 300 - }, - { - "epoch": 2.954489544895449, - "grad_norm": 0.06192995198797032, - "learning_rate": 4.450887267543261e-06, - "loss": 0.023, - "num_tokens": 134806429.0, - "step": 301 - }, - { - "epoch": 2.9643296432964332, - "grad_norm": 0.06577423487360488, - "learning_rate": 4.423079510705992e-06, - "loss": 0.0127, - "num_tokens": 135253050.0, - "step": 302 - }, - { - "epoch": 2.974169741697417, - "grad_norm": 0.061821762890230156, - "learning_rate": 4.395315307866404e-06, - "loss": 0.0118, - "num_tokens": 135701900.0, - "step": 303 - }, - { - "epoch": 2.984009840098401, - "grad_norm": 0.060295397517859534, - "learning_rate": 4.3675957818947965e-06, - "loss": 0.0112, - "num_tokens": 136134539.0, - "step": 304 - }, - { - "epoch": 2.993849938499385, - "grad_norm": 0.06204359834906306, - "learning_rate": 4.33992205385459e-06, - "loss": 0.0119, - "num_tokens": 136581981.0, - "step": 305 - }, - { - "epoch": 3.0, - "grad_norm": 0.06204359834906306, - "learning_rate": 4.312295242956998e-06, - "loss": 0.0109, - "num_tokens": 136774441.0, - "step": 306 - }, - { - "epoch": 3.0, - "eval_loss": 0.0963606908917427, - "eval_num_tokens": 136774441.0, - "eval_runtime": 53.9214, - "eval_samples_per_second": 41.653, - "eval_steps_per_second": 5.211, - "step": 306 - }, - { - "epoch": 3.009840098400984, - "grad_norm": 0.08266586517900253, - "learning_rate": 4.284716466515759e-06, - "loss": 0.0218, - "num_tokens": 137235846.0, - "step": 307 - }, - { - "epoch": 3.019680196801968, - "grad_norm": 0.06025259361613064, - "learning_rate": 4.257186839901948e-06, - "loss": 0.01, - "num_tokens": 137676575.0, - "step": 308 - }, - { - "epoch": 3.029520295202952, - "grad_norm": 0.059520087712568295, - "learning_rate": 4.229707476498871e-06, - "loss": 0.0107, - "num_tokens": 138127277.0, - "step": 309 - }, - { - "epoch": 3.039360393603936, - "grad_norm": 0.060007105121960225, - "learning_rate": 4.2022794876570335e-06, - "loss": 0.0099, - "num_tokens": 138558346.0, - "step": 310 - }, - { - "epoch": 3.0492004920049203, - "grad_norm": 0.05765555936281279, - "learning_rate": 4.1749039826491956e-06, - "loss": 0.2021, - "num_tokens": 139029117.0, - "step": 311 - }, - { - "epoch": 3.059040590405904, - "grad_norm": 0.25549047851203505, - "learning_rate": 4.1475820686255055e-06, - "loss": 0.01, - "num_tokens": 139465608.0, - "step": 312 - }, - { - "epoch": 3.068880688806888, - "grad_norm": 0.05745397404349778, - "learning_rate": 4.120314850568731e-06, - "loss": 0.0291, - "num_tokens": 139932040.0, - "step": 313 - }, - { - "epoch": 3.078720787207872, - "grad_norm": 0.21571060654935606, - "learning_rate": 4.093103431249563e-06, - "loss": 0.011, - "num_tokens": 140393810.0, - "step": 314 - }, - { - "epoch": 3.088560885608856, - "grad_norm": 0.06271676867820344, - "learning_rate": 4.065948911182015e-06, - "loss": 0.018, - "num_tokens": 140853306.0, - "step": 315 - }, - { - "epoch": 3.09840098400984, - "grad_norm": 0.06529992912597996, - "learning_rate": 4.038852388578925e-06, - "loss": 0.0102, - "num_tokens": 141293974.0, - "step": 316 - }, - { - "epoch": 3.108241082410824, - "grad_norm": 0.0613594667302306, - "learning_rate": 4.011814959307533e-06, - "loss": 0.0101, - "num_tokens": 141739396.0, - "step": 317 - }, - { - "epoch": 3.1180811808118083, - "grad_norm": 0.06143281774280475, - "learning_rate": 3.984837716845157e-06, - "loss": 0.0098, - "num_tokens": 142181417.0, - "step": 318 - }, - { - "epoch": 3.127921279212792, - "grad_norm": 0.06065540767441434, - "learning_rate": 3.957921752234982e-06, - "loss": 0.0095, - "num_tokens": 142615273.0, - "step": 319 - }, - { - "epoch": 3.137761377613776, - "grad_norm": 0.0565367496699821, - "learning_rate": 3.931068154041919e-06, - "loss": 0.0156, - "num_tokens": 143066695.0, - "step": 320 - }, - { - "epoch": 3.14760147601476, - "grad_norm": 0.0928817994214938, - "learning_rate": 3.904278008308589e-06, - "loss": 0.0093, - "num_tokens": 143543314.0, - "step": 321 - }, - { - "epoch": 3.1574415744157442, - "grad_norm": 0.05348206917431186, - "learning_rate": 3.877552398511409e-06, - "loss": 0.0102, - "num_tokens": 143978640.0, - "step": 322 - }, - { - "epoch": 3.167281672816728, - "grad_norm": 0.05744861837720995, - "learning_rate": 3.85089240551675e-06, - "loss": 0.0096, - "num_tokens": 144437143.0, - "step": 323 - }, - { - "epoch": 3.177121771217712, - "grad_norm": 0.05917730480215664, - "learning_rate": 3.8242991075372436e-06, - "loss": 0.0103, - "num_tokens": 144882614.0, - "step": 324 - }, - { - "epoch": 3.1869618696186963, - "grad_norm": 0.06138753989215512, - "learning_rate": 3.7977735800881687e-06, - "loss": 0.01, - "num_tokens": 145336615.0, - "step": 325 - }, - { - "epoch": 3.19680196801968, - "grad_norm": 0.057934477141044834, - "learning_rate": 3.7713168959439515e-06, - "loss": 0.0097, - "num_tokens": 145791703.0, - "step": 326 - }, - { - "epoch": 3.206642066420664, - "grad_norm": 0.062311400511582536, - "learning_rate": 3.74493012509478e-06, - "loss": 0.0163, - "num_tokens": 146256588.0, - "step": 327 - }, - { - "epoch": 3.2164821648216484, - "grad_norm": 0.11046706497961999, - "learning_rate": 3.718614334703339e-06, - "loss": 0.0096, - "num_tokens": 146704790.0, - "step": 328 - }, - { - "epoch": 3.2263222632226323, - "grad_norm": 0.06040935915809342, - "learning_rate": 3.692370589061639e-06, - "loss": 0.0161, - "num_tokens": 147150851.0, - "step": 329 - }, - { - "epoch": 3.236162361623616, - "grad_norm": 0.06309596528426079, - "learning_rate": 3.6661999495479772e-06, - "loss": 0.0116, - "num_tokens": 147586533.0, - "step": 330 - }, - { - "epoch": 3.2460024600246005, - "grad_norm": 0.0775947611650109, - "learning_rate": 3.640103474584016e-06, - "loss": 0.0102, - "num_tokens": 148012817.0, - "step": 331 - }, - { - "epoch": 3.2558425584255843, - "grad_norm": 0.060442066581616015, - "learning_rate": 3.614082219591972e-06, - "loss": 0.0094, - "num_tokens": 148454349.0, - "step": 332 - }, - { - "epoch": 3.265682656826568, - "grad_norm": 0.0599277899760194, - "learning_rate": 3.588137236951934e-06, - "loss": 0.0096, - "num_tokens": 148908837.0, - "step": 333 - }, - { - "epoch": 3.275522755227552, - "grad_norm": 0.06389649266611047, - "learning_rate": 3.5622695759592996e-06, - "loss": 0.0091, - "num_tokens": 149387409.0, - "step": 334 - }, - { - "epoch": 3.2853628536285364, - "grad_norm": 0.059031876557593344, - "learning_rate": 3.5364802827823397e-06, - "loss": 0.0124, - "num_tokens": 149842184.0, - "step": 335 - }, - { - "epoch": 3.2952029520295203, - "grad_norm": 0.06425762134540147, - "learning_rate": 3.5107704004198904e-06, - "loss": 0.0096, - "num_tokens": 150294624.0, - "step": 336 - }, - { - "epoch": 3.305043050430504, - "grad_norm": 0.060359900802863305, - "learning_rate": 3.485140968659166e-06, - "loss": 0.0156, - "num_tokens": 150757952.0, - "step": 337 - }, - { - "epoch": 3.3148831488314885, - "grad_norm": 0.06451910432321761, - "learning_rate": 3.4595930240337115e-06, - "loss": 0.0093, - "num_tokens": 151210941.0, - "step": 338 - }, - { - "epoch": 3.3247232472324724, - "grad_norm": 0.05771756769585445, - "learning_rate": 3.4341275997814795e-06, - "loss": 0.0311, - "num_tokens": 151659703.0, - "step": 339 - }, - { - "epoch": 3.3345633456334562, - "grad_norm": 0.2709101034464869, - "learning_rate": 3.408745725803042e-06, - "loss": 0.0198, - "num_tokens": 152096656.0, - "step": 340 - }, - { - "epoch": 3.34440344403444, - "grad_norm": 0.2165805542100797, - "learning_rate": 3.383448428619941e-06, - "loss": 0.0109, - "num_tokens": 152535937.0, - "step": 341 - }, - { - "epoch": 3.3542435424354244, - "grad_norm": 0.06249104678860667, - "learning_rate": 3.3582367313331692e-06, - "loss": 0.0241, - "num_tokens": 153012481.0, - "step": 342 - }, - { - "epoch": 3.3640836408364083, - "grad_norm": 0.07444091538512662, - "learning_rate": 3.3331116535817974e-06, - "loss": 0.0096, - "num_tokens": 153457239.0, - "step": 343 - }, - { - "epoch": 3.373923739237392, - "grad_norm": 0.05744783875540723, - "learning_rate": 3.308074211501732e-06, - "loss": 0.0112, - "num_tokens": 153885310.0, - "step": 344 - }, - { - "epoch": 3.3837638376383765, - "grad_norm": 0.062108203142145886, - "learning_rate": 3.2831254176846205e-06, - "loss": 0.0102, - "num_tokens": 154315565.0, - "step": 345 - }, - { - "epoch": 3.3936039360393604, - "grad_norm": 0.06493988486024563, - "learning_rate": 3.258266281136905e-06, - "loss": 0.0154, - "num_tokens": 154761237.0, - "step": 346 - }, - { - "epoch": 3.4034440344403443, - "grad_norm": 0.07703452506780802, - "learning_rate": 3.233497807239008e-06, - "loss": 0.0149, - "num_tokens": 155219079.0, - "step": 347 - }, - { - "epoch": 3.4132841328413286, - "grad_norm": 0.07716474025857703, - "learning_rate": 3.2088209977046657e-06, - "loss": 0.0099, - "num_tokens": 155672847.0, - "step": 348 - }, - { - "epoch": 3.4231242312423125, - "grad_norm": 0.0598011605849924, - "learning_rate": 3.1842368505404388e-06, - "loss": 0.0097, - "num_tokens": 156097592.0, - "step": 349 - }, - { - "epoch": 3.4329643296432963, - "grad_norm": 0.06067024127693304, - "learning_rate": 3.1597463600053258e-06, - "loss": 0.0097, - "num_tokens": 156543931.0, - "step": 350 - }, - { - "epoch": 3.4428044280442807, - "grad_norm": 0.06276348610439125, - "learning_rate": 3.135350516570559e-06, - "loss": 0.0115, - "num_tokens": 156993093.0, - "step": 351 - }, - { - "epoch": 3.4526445264452645, - "grad_norm": 0.07056305058653452, - "learning_rate": 3.111050306879556e-06, - "loss": 0.0161, - "num_tokens": 157435895.0, - "step": 352 - }, - { - "epoch": 3.4624846248462484, - "grad_norm": 0.0692853066303934, - "learning_rate": 3.0868467137080075e-06, - "loss": 0.0124, - "num_tokens": 157859703.0, - "step": 353 - }, - { - "epoch": 3.4723247232472323, - "grad_norm": 0.06622059827297899, - "learning_rate": 3.0627407159241273e-06, - "loss": 0.0098, - "num_tokens": 158319159.0, - "step": 354 - }, - { - "epoch": 3.4821648216482166, - "grad_norm": 0.06424105970441871, - "learning_rate": 3.0387332884490806e-06, - "loss": 0.0105, - "num_tokens": 158768974.0, - "step": 355 - }, - { - "epoch": 3.4920049200492005, - "grad_norm": 0.06970655480927966, - "learning_rate": 3.014825402217533e-06, - "loss": 0.0099, - "num_tokens": 159221319.0, - "step": 356 - }, - { - "epoch": 3.5018450184501844, - "grad_norm": 0.06231852234082556, - "learning_rate": 2.9910180241384014e-06, - "loss": 0.0099, - "num_tokens": 159657431.0, - "step": 357 - }, - { - "epoch": 3.5116851168511687, - "grad_norm": 0.06403174372575768, - "learning_rate": 2.9673121170557396e-06, - "loss": 0.0099, - "num_tokens": 160091184.0, - "step": 358 - }, - { - "epoch": 3.5215252152521526, - "grad_norm": 0.06050506427522611, - "learning_rate": 2.9437086397097996e-06, - "loss": 0.0095, - "num_tokens": 160538104.0, - "step": 359 - }, - { - "epoch": 3.5313653136531364, - "grad_norm": 0.05914580967848918, - "learning_rate": 2.92020854669826e-06, - "loss": 0.0151, - "num_tokens": 160984800.0, - "step": 360 - }, - { - "epoch": 3.5412054120541203, - "grad_norm": 0.06615551474859403, - "learning_rate": 2.896812788437615e-06, - "loss": 0.0102, - "num_tokens": 161437908.0, - "step": 361 - }, - { - "epoch": 3.5510455104551046, - "grad_norm": 0.05688142632929498, - "learning_rate": 2.8735223111247402e-06, - "loss": 0.0094, - "num_tokens": 161900209.0, - "step": 362 - }, - { - "epoch": 3.5608856088560885, - "grad_norm": 0.05805719882416427, - "learning_rate": 2.850338056698621e-06, - "loss": 0.0094, - "num_tokens": 162381378.0, - "step": 363 - }, - { - "epoch": 3.570725707257073, - "grad_norm": 0.05665394777981862, - "learning_rate": 2.827260962802263e-06, - "loss": 0.0089, - "num_tokens": 162818401.0, - "step": 364 - }, - { - "epoch": 3.5805658056580567, - "grad_norm": 0.058540688861597474, - "learning_rate": 2.804291962744768e-06, - "loss": 0.0102, - "num_tokens": 163261663.0, - "step": 365 - }, - { - "epoch": 3.5904059040590406, - "grad_norm": 0.06068364561780823, - "learning_rate": 2.7814319854635875e-06, - "loss": 0.0096, - "num_tokens": 163706510.0, - "step": 366 - }, - { - "epoch": 3.6002460024600245, - "grad_norm": 0.0593859542792967, - "learning_rate": 2.758681955486955e-06, - "loss": 0.0097, - "num_tokens": 164145145.0, - "step": 367 - }, - { - "epoch": 3.6100861008610083, - "grad_norm": 0.059439587082302694, - "learning_rate": 2.736042792896495e-06, - "loss": 0.0104, - "num_tokens": 164588218.0, - "step": 368 - }, - { - "epoch": 3.6199261992619927, - "grad_norm": 0.06426940128348262, - "learning_rate": 2.7135154132900133e-06, - "loss": 0.0203, - "num_tokens": 165039642.0, - "step": 369 - }, - { - "epoch": 3.6297662976629765, - "grad_norm": 0.059031373381084176, - "learning_rate": 2.691100727744458e-06, - "loss": 0.0091, - "num_tokens": 165502439.0, - "step": 370 - }, - { - "epoch": 3.639606396063961, - "grad_norm": 0.05706397506461239, - "learning_rate": 2.668799642779093e-06, - "loss": 0.0106, - "num_tokens": 165957611.0, - "step": 371 - }, - { - "epoch": 3.6494464944649447, - "grad_norm": 0.06337690848780857, - "learning_rate": 2.6466130603188157e-06, - "loss": 0.01, - "num_tokens": 166404741.0, - "step": 372 - }, - { - "epoch": 3.6592865928659286, - "grad_norm": 0.057865704503962175, - "learning_rate": 2.624541877657685e-06, - "loss": 0.1951, - "num_tokens": 166908892.0, - "step": 373 - }, - { - "epoch": 3.6691266912669125, - "grad_norm": 0.6748913551790232, - "learning_rate": 2.602586987422643e-06, - "loss": 0.0094, - "num_tokens": 167346017.0, - "step": 374 - }, - { - "epoch": 3.678966789667897, - "grad_norm": 0.06271310429727074, - "learning_rate": 2.580749277537399e-06, - "loss": 0.0093, - "num_tokens": 167795779.0, - "step": 375 - }, - { - "epoch": 3.6888068880688807, - "grad_norm": 0.05728241738284472, - "learning_rate": 2.5590296311865294e-06, - "loss": 0.0092, - "num_tokens": 168246613.0, - "step": 376 - }, - { - "epoch": 3.6986469864698646, - "grad_norm": 0.05730319671770116, - "learning_rate": 2.537428926779758e-06, - "loss": 0.0104, - "num_tokens": 168703193.0, - "step": 377 - }, - { - "epoch": 3.708487084870849, - "grad_norm": 0.061789009881383514, - "learning_rate": 2.515948037916423e-06, - "loss": 0.0104, - "num_tokens": 169166239.0, - "step": 378 - }, - { - "epoch": 3.7183271832718328, - "grad_norm": 0.05958784070544453, - "learning_rate": 2.494587833350153e-06, - "loss": 0.0564, - "num_tokens": 169618415.0, - "step": 379 - }, - { - "epoch": 3.7281672816728166, - "grad_norm": 0.22039415728368103, - "learning_rate": 2.473349176953736e-06, - "loss": 0.0094, - "num_tokens": 170079318.0, - "step": 380 - }, - { - "epoch": 3.7380073800738005, - "grad_norm": 0.05930397129828618, - "learning_rate": 2.4522329276841664e-06, - "loss": 0.0198, - "num_tokens": 170524571.0, - "step": 381 - }, - { - "epoch": 3.747847478474785, - "grad_norm": 0.06047568038440854, - "learning_rate": 2.431239939547921e-06, - "loss": 0.0094, - "num_tokens": 170983016.0, - "step": 382 - }, - { - "epoch": 3.7576875768757687, - "grad_norm": 0.061680315681806853, - "learning_rate": 2.4103710615664145e-06, - "loss": 0.0089, - "num_tokens": 171426486.0, - "step": 383 - }, - { - "epoch": 3.767527675276753, - "grad_norm": 0.05588539351574886, - "learning_rate": 2.389627137741662e-06, - "loss": 0.0094, - "num_tokens": 171871834.0, - "step": 384 - }, - { - "epoch": 3.777367773677737, - "grad_norm": 0.061780123368904795, - "learning_rate": 2.369009007022146e-06, - "loss": 0.0093, - "num_tokens": 172337523.0, - "step": 385 - }, - { - "epoch": 3.787207872078721, - "grad_norm": 0.05632561272908436, - "learning_rate": 2.3485175032688865e-06, - "loss": 0.0088, - "num_tokens": 172775826.0, - "step": 386 - }, - { - "epoch": 3.7970479704797047, - "grad_norm": 0.058782272770165275, - "learning_rate": 2.328153455221717e-06, - "loss": 0.0095, - "num_tokens": 173234709.0, - "step": 387 - }, - { - "epoch": 3.8068880688806885, - "grad_norm": 0.057526356469471435, - "learning_rate": 2.3079176864657673e-06, - "loss": 0.0097, - "num_tokens": 173700055.0, - "step": 388 - }, - { - "epoch": 3.816728167281673, - "grad_norm": 0.06609619441495819, - "learning_rate": 2.2878110153981565e-06, - "loss": 0.0111, - "num_tokens": 174147961.0, - "step": 389 - }, - { - "epoch": 3.8265682656826567, - "grad_norm": 0.06703233332357492, - "learning_rate": 2.267834255194894e-06, - "loss": 0.0116, - "num_tokens": 174586991.0, - "step": 390 - }, - { - "epoch": 3.836408364083641, - "grad_norm": 0.06522848493729735, - "learning_rate": 2.2479882137779903e-06, - "loss": 0.0106, - "num_tokens": 175006875.0, - "step": 391 - }, - { - "epoch": 3.846248462484625, - "grad_norm": 0.06306752932488521, - "learning_rate": 2.228273693782784e-06, - "loss": 0.0094, - "num_tokens": 175451007.0, - "step": 392 - }, - { - "epoch": 3.856088560885609, - "grad_norm": 0.062263756072231294, - "learning_rate": 2.208691492525481e-06, - "loss": 0.0135, - "num_tokens": 175896902.0, - "step": 393 - }, - { - "epoch": 3.8659286592865927, - "grad_norm": 0.06835430681220003, - "learning_rate": 2.189242401970908e-06, - "loss": 0.0092, - "num_tokens": 176346616.0, - "step": 394 - }, - { - "epoch": 3.875768757687577, - "grad_norm": 0.05728313379563115, - "learning_rate": 2.169927208700482e-06, - "loss": 0.0098, - "num_tokens": 176802124.0, - "step": 395 - }, - { - "epoch": 3.885608856088561, - "grad_norm": 0.06299115193931754, - "learning_rate": 2.1507466938804013e-06, - "loss": 0.0089, - "num_tokens": 177233961.0, - "step": 396 - }, - { - "epoch": 3.8954489544895448, - "grad_norm": 0.060076198285498296, - "learning_rate": 2.131701633230045e-06, - "loss": 0.0098, - "num_tokens": 177684662.0, - "step": 397 - }, - { - "epoch": 3.905289052890529, - "grad_norm": 0.06517531508961912, - "learning_rate": 2.112792796990616e-06, - "loss": 0.0095, - "num_tokens": 178123825.0, - "step": 398 - }, - { - "epoch": 3.915129151291513, - "grad_norm": 0.05863263973572925, - "learning_rate": 2.0940209498939732e-06, - "loss": 0.009, - "num_tokens": 178562641.0, - "step": 399 - }, - { - "epoch": 3.924969249692497, - "grad_norm": 0.05798991563312477, - "learning_rate": 2.075386851131711e-06, - "loss": 0.0094, - "num_tokens": 179007017.0, - "step": 400 - }, - { - "epoch": 3.9348093480934807, - "grad_norm": 0.06118488260559937, - "learning_rate": 2.056891254324459e-06, - "loss": 0.0095, - "num_tokens": 179449125.0, - "step": 401 - }, - { - "epoch": 3.944649446494465, - "grad_norm": 0.06403534407994695, - "learning_rate": 2.038534907491396e-06, - "loss": 0.009, - "num_tokens": 179887646.0, - "step": 402 - }, - { - "epoch": 3.954489544895449, - "grad_norm": 0.08058699039926022, - "learning_rate": 2.0203185530199983e-06, - "loss": 0.0138, - "num_tokens": 180341944.0, - "step": 403 - }, - { - "epoch": 3.9643296432964332, - "grad_norm": 0.056026267406971995, - "learning_rate": 2.0022429276360256e-06, - "loss": 0.0097, - "num_tokens": 180787775.0, - "step": 404 - }, - { - "epoch": 3.974169741697417, - "grad_norm": 0.058787256460149456, - "learning_rate": 1.9843087623737097e-06, - "loss": 0.0088, - "num_tokens": 181276015.0, - "step": 405 - }, - { - "epoch": 3.984009840098401, - "grad_norm": 0.054638072869340186, - "learning_rate": 1.966516782546199e-06, - "loss": 0.009, - "num_tokens": 181724759.0, - "step": 406 - }, - { - "epoch": 3.993849938499385, - "grad_norm": 0.05931097745374889, - "learning_rate": 1.94886770771623e-06, - "loss": 0.0098, - "num_tokens": 182165821.0, - "step": 407 - }, - { - "epoch": 4.0, - "grad_norm": 0.06697953375930626, - "learning_rate": 1.931362251667008e-06, - "loss": 0.027, - "num_tokens": 182364260.0, - "step": 408 - }, - { - "epoch": 4.0, - "eval_loss": 0.1028980016708374, - "eval_num_tokens": 182364260.0, - "eval_runtime": 53.8919, - "eval_samples_per_second": 41.676, - "eval_steps_per_second": 5.214, - "step": 408 - }, - { - "epoch": 4.009840098400984, - "grad_norm": 0.07632643003764507, - "learning_rate": 1.9140011223733576e-06, - "loss": 0.0082, - "num_tokens": 182806025.0, - "step": 409 - }, - { - "epoch": 4.019680196801968, - "grad_norm": 0.05437436276939388, - "learning_rate": 1.8967850219730799e-06, - "loss": 0.0081, - "num_tokens": 183278654.0, - "step": 410 - }, - { - "epoch": 4.029520295202952, - "grad_norm": 0.05114318878211908, - "learning_rate": 1.8797146467385604e-06, - "loss": 0.0076, - "num_tokens": 183720645.0, - "step": 411 - }, - { - "epoch": 4.039360393603936, - "grad_norm": 0.053465044974803935, - "learning_rate": 1.8627906870486063e-06, - "loss": 0.0082, - "num_tokens": 184191637.0, - "step": 412 - }, - { - "epoch": 4.04920049200492, - "grad_norm": 0.054542981072468875, - "learning_rate": 1.8460138273605265e-06, - "loss": 0.008, - "num_tokens": 184634141.0, - "step": 413 - }, - { - "epoch": 4.059040590405904, - "grad_norm": 0.052414283521576004, - "learning_rate": 1.8293847461824538e-06, - "loss": 0.0079, - "num_tokens": 185081741.0, - "step": 414 - }, - { - "epoch": 4.068880688806888, - "grad_norm": 0.05289967674124652, - "learning_rate": 1.8129041160458966e-06, - "loss": 0.008, - "num_tokens": 185495440.0, - "step": 415 - }, - { - "epoch": 4.078720787207872, - "grad_norm": 0.0584668942852983, - "learning_rate": 1.7965726034785466e-06, - "loss": 0.0081, - "num_tokens": 185938291.0, - "step": 416 - }, - { - "epoch": 4.088560885608856, - "grad_norm": 0.05897150659800833, - "learning_rate": 1.780390868977318e-06, - "loss": 0.0086, - "num_tokens": 186409542.0, - "step": 417 - }, - { - "epoch": 4.0984009840098405, - "grad_norm": 0.05118034680985974, - "learning_rate": 1.7643595669816378e-06, - "loss": 0.0077, - "num_tokens": 186852482.0, - "step": 418 - }, - { - "epoch": 4.108241082410824, - "grad_norm": 0.05911903344070817, - "learning_rate": 1.7484793458469745e-06, - "loss": 0.0081, - "num_tokens": 187306570.0, - "step": 419 - }, - { - "epoch": 4.118081180811808, - "grad_norm": 0.058617479568280846, - "learning_rate": 1.7327508478186216e-06, - "loss": 0.0075, - "num_tokens": 187738802.0, - "step": 420 - }, - { - "epoch": 4.127921279212792, - "grad_norm": 0.05743950460862962, - "learning_rate": 1.7171747090057201e-06, - "loss": 0.0081, - "num_tokens": 188188275.0, - "step": 421 - }, - { - "epoch": 4.137761377613776, - "grad_norm": 0.0578427653677817, - "learning_rate": 1.7017515593555295e-06, - "loss": 0.008, - "num_tokens": 188626310.0, - "step": 422 - }, - { - "epoch": 4.14760147601476, - "grad_norm": 0.055381917249045204, - "learning_rate": 1.6864820226279607e-06, - "loss": 0.0079, - "num_tokens": 189058824.0, - "step": 423 - }, - { - "epoch": 4.157441574415744, - "grad_norm": 0.0566904301682134, - "learning_rate": 1.6713667163703348e-06, - "loss": 0.008, - "num_tokens": 189488025.0, - "step": 424 - }, - { - "epoch": 4.167281672816729, - "grad_norm": 0.0591657691393218, - "learning_rate": 1.6564062518924202e-06, - "loss": 0.0093, - "num_tokens": 189949176.0, - "step": 425 - }, - { - "epoch": 4.177121771217712, - "grad_norm": 0.058609260537066755, - "learning_rate": 1.6416012342417056e-06, - "loss": 0.0075, - "num_tokens": 190405187.0, - "step": 426 - }, - { - "epoch": 4.186961869618696, - "grad_norm": 0.05376660491247955, - "learning_rate": 1.6269522621789246e-06, - "loss": 0.0094, - "num_tokens": 190839466.0, - "step": 427 - }, - { - "epoch": 4.19680196801968, - "grad_norm": 0.062048025442225076, - "learning_rate": 1.6124599281538452e-06, - "loss": 0.02, - "num_tokens": 191280153.0, - "step": 428 - }, - { - "epoch": 4.206642066420664, - "grad_norm": 0.06071173185238267, - "learning_rate": 1.5981248182813136e-06, - "loss": 0.0073, - "num_tokens": 191734314.0, - "step": 429 - }, - { - "epoch": 4.216482164821648, - "grad_norm": 0.05301725414979279, - "learning_rate": 1.583947512317537e-06, - "loss": 0.0117, - "num_tokens": 192202492.0, - "step": 430 - }, - { - "epoch": 4.226322263222632, - "grad_norm": 0.06832062526218917, - "learning_rate": 1.5699285836366488e-06, - "loss": 0.0093, - "num_tokens": 192667915.0, - "step": 431 - }, - { - "epoch": 4.236162361623617, - "grad_norm": 0.05748762603533909, - "learning_rate": 1.5560685992075141e-06, - "loss": 0.0078, - "num_tokens": 193136794.0, - "step": 432 - }, - { - "epoch": 4.2460024600246005, - "grad_norm": 0.0737572203685775, - "learning_rate": 1.5423681195707997e-06, - "loss": 0.0073, - "num_tokens": 193598491.0, - "step": 433 - }, - { - "epoch": 4.255842558425584, - "grad_norm": 0.05225082250599676, - "learning_rate": 1.528827698816306e-06, - "loss": 0.0077, - "num_tokens": 194023980.0, - "step": 434 - }, - { - "epoch": 4.265682656826568, - "grad_norm": 0.05296466266803098, - "learning_rate": 1.515447884560556e-06, - "loss": 0.0074, - "num_tokens": 194481167.0, - "step": 435 - }, - { - "epoch": 4.275522755227552, - "grad_norm": 0.05336380722303185, - "learning_rate": 1.502229217924649e-06, - "loss": 0.0075, - "num_tokens": 194915312.0, - "step": 436 - }, - { - "epoch": 4.285362853628536, - "grad_norm": 0.05458180686808586, - "learning_rate": 1.489172233512376e-06, - "loss": 0.0076, - "num_tokens": 195368266.0, - "step": 437 - }, - { - "epoch": 4.29520295202952, - "grad_norm": 0.05542603913086383, - "learning_rate": 1.4762774593885986e-06, - "loss": 0.0081, - "num_tokens": 195810914.0, - "step": 438 - }, - { - "epoch": 4.305043050430505, - "grad_norm": 0.054344537083576325, - "learning_rate": 1.4635454170578917e-06, - "loss": 0.0072, - "num_tokens": 196263940.0, - "step": 439 - }, - { - "epoch": 4.3148831488314885, - "grad_norm": 0.052701156778993646, - "learning_rate": 1.4509766214434535e-06, - "loss": 0.0077, - "num_tokens": 196718774.0, - "step": 440 - }, - { - "epoch": 4.324723247232472, - "grad_norm": 0.05423178707270067, - "learning_rate": 1.4385715808662787e-06, - "loss": 0.008, - "num_tokens": 197161519.0, - "step": 441 - }, - { - "epoch": 4.334563345633456, - "grad_norm": 0.055354896441224044, - "learning_rate": 1.4263307970246027e-06, - "loss": 0.008, - "num_tokens": 197621081.0, - "step": 442 - }, - { - "epoch": 4.34440344403444, - "grad_norm": 0.05816305513011695, - "learning_rate": 1.41425476497361e-06, - "loss": 0.0078, - "num_tokens": 198087857.0, - "step": 443 - }, - { - "epoch": 4.354243542435424, - "grad_norm": 0.05127845466920968, - "learning_rate": 1.4023439731054112e-06, - "loss": 0.0077, - "num_tokens": 198533672.0, - "step": 444 - }, - { - "epoch": 4.364083640836409, - "grad_norm": 0.07067731738580797, - "learning_rate": 1.390598903129296e-06, - "loss": 0.0322, - "num_tokens": 199022227.0, - "step": 445 - }, - { - "epoch": 4.373923739237393, - "grad_norm": 0.05511218194004341, - "learning_rate": 1.3790200300522413e-06, - "loss": 0.0077, - "num_tokens": 199462215.0, - "step": 446 - }, - { - "epoch": 4.3837638376383765, - "grad_norm": 0.05735730379081794, - "learning_rate": 1.3676078221597157e-06, - "loss": 0.0074, - "num_tokens": 199907231.0, - "step": 447 - }, - { - "epoch": 4.39360393603936, - "grad_norm": 0.05442936039834661, - "learning_rate": 1.3563627409967257e-06, - "loss": 0.1955, - "num_tokens": 200376904.0, - "step": 448 - }, - { - "epoch": 4.403444034440344, - "grad_norm": 0.5930661652942222, - "learning_rate": 1.3452852413491563e-06, - "loss": 0.0074, - "num_tokens": 200853967.0, - "step": 449 - }, - { - "epoch": 4.413284132841328, - "grad_norm": 0.05077867679984549, - "learning_rate": 1.3343757712253804e-06, - "loss": 0.0076, - "num_tokens": 201323621.0, - "step": 450 - }, - { - "epoch": 4.423124231242312, - "grad_norm": 0.058807424527887606, - "learning_rate": 1.3236347718381338e-06, - "loss": 0.0096, - "num_tokens": 201753687.0, - "step": 451 - }, - { - "epoch": 4.432964329643297, - "grad_norm": 0.06001374322910319, - "learning_rate": 1.3130626775866743e-06, - "loss": 0.0081, - "num_tokens": 202203799.0, - "step": 452 - }, - { - "epoch": 4.442804428044281, - "grad_norm": 0.06273437087252197, - "learning_rate": 1.3026599160392173e-06, - "loss": 0.0092, - "num_tokens": 202627243.0, - "step": 453 - }, - { - "epoch": 4.4526445264452645, - "grad_norm": 0.06372618537836224, - "learning_rate": 1.292426907915634e-06, - "loss": 0.0076, - "num_tokens": 203077433.0, - "step": 454 - }, - { - "epoch": 4.462484624846248, - "grad_norm": 0.057948321757535656, - "learning_rate": 1.2823640670704443e-06, - "loss": 0.0229, - "num_tokens": 203532517.0, - "step": 455 - }, - { - "epoch": 4.472324723247232, - "grad_norm": 0.06607138604150303, - "learning_rate": 1.2724718004760794e-06, - "loss": 0.0078, - "num_tokens": 203967752.0, - "step": 456 - }, - { - "epoch": 4.482164821648216, - "grad_norm": 0.05725783304801458, - "learning_rate": 1.2627505082064144e-06, - "loss": 0.0076, - "num_tokens": 204424349.0, - "step": 457 - }, - { - "epoch": 4.492004920049201, - "grad_norm": 0.055427831791831646, - "learning_rate": 1.2532005834205976e-06, - "loss": 0.0079, - "num_tokens": 204846138.0, - "step": 458 - }, - { - "epoch": 4.501845018450185, - "grad_norm": 0.05460191637217484, - "learning_rate": 1.2438224123471442e-06, - "loss": 0.0192, - "num_tokens": 205306730.0, - "step": 459 - }, - { - "epoch": 4.511685116851169, - "grad_norm": 0.06279438477449967, - "learning_rate": 1.2346163742683185e-06, - "loss": 0.0117, - "num_tokens": 205759609.0, - "step": 460 - }, - { - "epoch": 4.521525215252153, - "grad_norm": 0.05702285396092694, - "learning_rate": 1.2255828415047932e-06, - "loss": 0.0076, - "num_tokens": 206171295.0, - "step": 461 - }, - { - "epoch": 4.531365313653136, - "grad_norm": 0.054521558454890394, - "learning_rate": 1.216722179400592e-06, - "loss": 0.0076, - "num_tokens": 206639148.0, - "step": 462 - }, - { - "epoch": 4.54120541205412, - "grad_norm": 0.05168283263697403, - "learning_rate": 1.208034746308315e-06, - "loss": 0.0068, - "num_tokens": 207094260.0, - "step": 463 - }, - { - "epoch": 4.551045510455104, - "grad_norm": 0.05161429329359664, - "learning_rate": 1.1995208935746437e-06, - "loss": 0.0081, - "num_tokens": 207533375.0, - "step": 464 - }, - { - "epoch": 4.560885608856088, - "grad_norm": 0.058514508257411606, - "learning_rate": 1.1911809655261333e-06, - "loss": 0.0081, - "num_tokens": 207969517.0, - "step": 465 - }, - { - "epoch": 4.570725707257073, - "grad_norm": 0.056665893017668854, - "learning_rate": 1.1830152994552866e-06, - "loss": 0.0086, - "num_tokens": 208408117.0, - "step": 466 - }, - { - "epoch": 4.580565805658057, - "grad_norm": 0.056163462620316754, - "learning_rate": 1.175024225606912e-06, - "loss": 0.0074, - "num_tokens": 208879227.0, - "step": 467 - }, - { - "epoch": 4.590405904059041, - "grad_norm": 0.05409385523794747, - "learning_rate": 1.1672080671647695e-06, - "loss": 0.0078, - "num_tokens": 209325103.0, - "step": 468 - }, - { - "epoch": 4.6002460024600245, - "grad_norm": 0.05629255243399504, - "learning_rate": 1.1595671402384966e-06, - "loss": 0.0102, - "num_tokens": 209791894.0, - "step": 469 - }, - { - "epoch": 4.610086100861008, - "grad_norm": 0.051104203707396316, - "learning_rate": 1.152101753850828e-06, - "loss": 0.0072, - "num_tokens": 210254182.0, - "step": 470 - }, - { - "epoch": 4.619926199261993, - "grad_norm": 0.05229454749737629, - "learning_rate": 1.1448122099250946e-06, - "loss": 0.0104, - "num_tokens": 210702900.0, - "step": 471 - }, - { - "epoch": 4.629766297662977, - "grad_norm": 0.060177504722208404, - "learning_rate": 1.1376988032730135e-06, - "loss": 0.0079, - "num_tokens": 211151465.0, - "step": 472 - }, - { - "epoch": 4.639606396063961, - "grad_norm": 0.05182456184289124, - "learning_rate": 1.130761821582766e-06, - "loss": 0.0072, - "num_tokens": 211619464.0, - "step": 473 - }, - { - "epoch": 4.649446494464945, - "grad_norm": 0.05574225668849545, - "learning_rate": 1.1240015454073622e-06, - "loss": 0.0085, - "num_tokens": 212064266.0, - "step": 474 - }, - { - "epoch": 4.659286592865929, - "grad_norm": 0.06359820975154429, - "learning_rate": 1.1174182481532943e-06, - "loss": 0.0081, - "num_tokens": 212499724.0, - "step": 475 - }, - { - "epoch": 4.6691266912669125, - "grad_norm": 0.05622656000305094, - "learning_rate": 1.1110121960694773e-06, - "loss": 0.0079, - "num_tokens": 212945879.0, - "step": 476 - }, - { - "epoch": 4.678966789667896, - "grad_norm": 0.06093763072714235, - "learning_rate": 1.104783648236486e-06, - "loss": 0.0084, - "num_tokens": 213379787.0, - "step": 477 - }, - { - "epoch": 4.68880688806888, - "grad_norm": 0.0543614373855231, - "learning_rate": 1.0987328565560711e-06, - "loss": 0.0075, - "num_tokens": 213824263.0, - "step": 478 - }, - { - "epoch": 4.698646986469865, - "grad_norm": 0.056905167227697236, - "learning_rate": 1.0928600657409751e-06, - "loss": 0.0082, - "num_tokens": 214265208.0, - "step": 479 - }, - { - "epoch": 4.708487084870849, - "grad_norm": 0.057351833542733925, - "learning_rate": 1.0871655133050372e-06, - "loss": 0.0082, - "num_tokens": 214744301.0, - "step": 480 - }, - { - "epoch": 4.718327183271833, - "grad_norm": 0.29349816338215157, - "learning_rate": 1.081649429553581e-06, - "loss": 0.0553, - "num_tokens": 215194355.0, - "step": 481 - }, - { - "epoch": 4.728167281672817, - "grad_norm": 0.051057953015104116, - "learning_rate": 1.076312037574106e-06, - "loss": 0.0074, - "num_tokens": 215632060.0, - "step": 482 - }, - { - "epoch": 4.7380073800738005, - "grad_norm": 0.056594540815463674, - "learning_rate": 1.0711535532272632e-06, - "loss": 0.0235, - "num_tokens": 216097276.0, - "step": 483 - }, - { - "epoch": 4.747847478474784, - "grad_norm": 0.068871190152495, - "learning_rate": 1.0661741851381256e-06, - "loss": 0.0077, - "num_tokens": 216544463.0, - "step": 484 - }, - { - "epoch": 4.757687576875769, - "grad_norm": 0.05907548729697175, - "learning_rate": 1.0613741346877498e-06, - "loss": 0.0084, - "num_tokens": 216972058.0, - "step": 485 - }, - { - "epoch": 4.767527675276753, - "grad_norm": 0.055592377746762095, - "learning_rate": 1.056753596005032e-06, - "loss": 0.0074, - "num_tokens": 217401900.0, - "step": 486 - }, - { - "epoch": 4.777367773677737, - "grad_norm": 0.05562394957573223, - "learning_rate": 1.0523127559588579e-06, - "loss": 0.0075, - "num_tokens": 217845453.0, - "step": 487 - }, - { - "epoch": 4.787207872078721, - "grad_norm": 0.05258367575789477, - "learning_rate": 1.0480517941505428e-06, - "loss": 0.0073, - "num_tokens": 218272871.0, - "step": 488 - }, - { - "epoch": 4.797047970479705, - "grad_norm": 0.05390618674507445, - "learning_rate": 1.0439708829065708e-06, - "loss": 0.0078, - "num_tokens": 218732597.0, - "step": 489 - }, - { - "epoch": 4.8068880688806885, - "grad_norm": 0.06946151381547928, - "learning_rate": 1.0400701872716227e-06, - "loss": 0.0223, - "num_tokens": 219194340.0, - "step": 490 - }, - { - "epoch": 4.816728167281672, - "grad_norm": 0.05582170906207444, - "learning_rate": 1.0363498650019023e-06, - "loss": 0.0077, - "num_tokens": 219673692.0, - "step": 491 - }, - { - "epoch": 4.826568265682657, - "grad_norm": 0.05244987983803676, - "learning_rate": 1.0328100665587573e-06, - "loss": 0.0073, - "num_tokens": 220118246.0, - "step": 492 - }, - { - "epoch": 4.836408364083641, - "grad_norm": 0.055024340070040305, - "learning_rate": 1.029450935102592e-06, - "loss": 0.0077, - "num_tokens": 220555806.0, - "step": 493 - }, - { - "epoch": 4.846248462484625, - "grad_norm": 0.05338628090134423, - "learning_rate": 1.0262726064870801e-06, - "loss": 0.0073, - "num_tokens": 220997187.0, - "step": 494 - }, - { - "epoch": 4.856088560885609, - "grad_norm": 0.058254094197714025, - "learning_rate": 1.0232752092536666e-06, - "loss": 0.0074, - "num_tokens": 221434681.0, - "step": 495 - }, - { - "epoch": 4.865928659286593, - "grad_norm": 0.05261616134189719, - "learning_rate": 1.0204588646263731e-06, - "loss": 0.0074, - "num_tokens": 221884850.0, - "step": 496 - }, - { - "epoch": 4.875768757687577, - "grad_norm": 0.052167915998619634, - "learning_rate": 1.0178236865068933e-06, - "loss": 0.0072, - "num_tokens": 222333225.0, - "step": 497 - }, - { - "epoch": 4.885608856088561, - "grad_norm": 0.06187153122740552, - "learning_rate": 1.0153697814699858e-06, - "loss": 0.0106, - "num_tokens": 222774591.0, - "step": 498 - }, - { - "epoch": 4.895448954489545, - "grad_norm": 0.054905669170180534, - "learning_rate": 1.0130972487591658e-06, - "loss": 0.0112, - "num_tokens": 223227943.0, - "step": 499 - }, - { - "epoch": 4.905289052890529, - "grad_norm": 0.06206228565326619, - "learning_rate": 1.0110061802826889e-06, - "loss": 0.0076, - "num_tokens": 223680989.0, - "step": 500 - }, - { - "epoch": 4.915129151291513, - "grad_norm": 0.05437071230251554, - "learning_rate": 1.009096660609837e-06, - "loss": 0.1789, - "num_tokens": 224171724.0, - "step": 501 - }, - { - "epoch": 4.924969249692497, - "grad_norm": 0.12358300885271949, - "learning_rate": 1.0073687669674949e-06, - "loss": 0.0081, - "num_tokens": 224621243.0, - "step": 502 - }, - { - "epoch": 4.934809348093481, - "grad_norm": 0.05743551551374671, - "learning_rate": 1.0058225692370299e-06, - "loss": 0.0077, - "num_tokens": 225053570.0, - "step": 503 - }, - { - "epoch": 4.944649446494465, - "grad_norm": 0.05705289715957623, - "learning_rate": 1.0044581299514638e-06, - "loss": 0.0077, - "num_tokens": 225475922.0, - "step": 504 - }, - { - "epoch": 4.9544895448954485, - "grad_norm": 0.052608564457681, - "learning_rate": 1.003275504292944e-06, - "loss": 0.0072, - "num_tokens": 225944888.0, - "step": 505 - }, - { - "epoch": 4.964329643296433, - "grad_norm": 0.05546452983023311, - "learning_rate": 1.0022747400905126e-06, - "loss": 0.0079, - "num_tokens": 226384045.0, - "step": 506 - }, - { - "epoch": 4.974169741697417, - "grad_norm": 0.05754539826487939, - "learning_rate": 1.0014558778181714e-06, - "loss": 0.0073, - "num_tokens": 226815343.0, - "step": 507 - }, - { - "epoch": 4.984009840098401, - "grad_norm": 0.05456913560891108, - "learning_rate": 1.0008189505932444e-06, - "loss": 0.0084, - "num_tokens": 227286168.0, - "step": 508 - }, - { - "epoch": 4.993849938499385, - "grad_norm": 0.053799541560384294, - "learning_rate": 1.0003639841750404e-06, - "loss": 0.0076, - "num_tokens": 227746824.0, - "step": 509 - }, - { - "epoch": 5.0, - "grad_norm": 0.07884368824115337, - "learning_rate": 1.0000909969638097e-06, - "loss": 0.0089, - "num_tokens": 227957450.0, - "step": 510 - }, - { - "epoch": 5.0, - "eval_loss": 0.11205815523862839, - "eval_num_tokens": 227957450.0, - "eval_runtime": 53.843, - "eval_samples_per_second": 41.714, - "eval_steps_per_second": 5.219, - "step": 510 - }, - { - "epoch": 5.0, - "step": 510, - "total_flos": 7.689061516716278e+17, - "train_loss": 0.0504409685922677, - "train_runtime": 7612.3259, - "train_samples_per_second": 8.537, - "train_steps_per_second": 0.067 - } - ], - "logging_steps": 1, - "max_steps": 510, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 7.689061516716278e+17, - "train_batch_size": 2, - "trial_name": null, - "trial_params": null -}