diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4168 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 510, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00984009840098401, + "grad_norm": 3.7944442389822073, + "learning_rate": 0.0, + "loss": 1.2501, + "num_tokens": 456505.0, + "step": 1 + }, + { + "epoch": 0.01968019680196802, + "grad_norm": 3.778041640972742, + "learning_rate": 6.25e-07, + "loss": 1.2343, + "num_tokens": 915160.0, + "step": 2 + }, + { + "epoch": 0.02952029520295203, + "grad_norm": 3.8325814879608386, + "learning_rate": 1.25e-06, + "loss": 1.254, + "num_tokens": 1365315.0, + "step": 3 + }, + { + "epoch": 0.03936039360393604, + "grad_norm": 3.582565733113683, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.1869, + "num_tokens": 1841763.0, + "step": 4 + }, + { + "epoch": 0.04920049200492005, + "grad_norm": 3.5604969753172315, + "learning_rate": 2.5e-06, + "loss": 1.2394, + "num_tokens": 2301606.0, + "step": 5 + }, + { + "epoch": 0.05904059040590406, + "grad_norm": 3.105374395878177, + "learning_rate": 3.125e-06, + "loss": 1.2366, + "num_tokens": 2755825.0, + "step": 6 + }, + { + "epoch": 0.06888068880688807, + "grad_norm": 2.316426838717515, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.1101, + "num_tokens": 3196409.0, + "step": 7 + }, + { + "epoch": 0.07872078720787208, + "grad_norm": 2.281060366927676, + "learning_rate": 4.3750000000000005e-06, + "loss": 1.1078, + "num_tokens": 3622733.0, + "step": 8 + }, + { + "epoch": 0.08856088560885608, + "grad_norm": 1.934577354985982, + "learning_rate": 5e-06, + "loss": 0.9014, + "num_tokens": 4055914.0, + "step": 9 + }, + { + "epoch": 0.0984009840098401, + "grad_norm": 1.9176079459138344, + "learning_rate": 5.625e-06, + "loss": 0.8745, + "num_tokens": 4485159.0, + "step": 10 + }, + { + "epoch": 0.10824108241082411, + "grad_norm": 1.786754010375736, + "learning_rate": 6.25e-06, + "loss": 0.7922, + "num_tokens": 4933514.0, + "step": 11 + }, + { + "epoch": 0.11808118081180811, + "grad_norm": 1.9933574737759214, + "learning_rate": 6.875e-06, + "loss": 0.4878, + "num_tokens": 5383658.0, + "step": 12 + }, + { + "epoch": 0.12792127921279212, + "grad_norm": 2.123289554302906, + "learning_rate": 7.500000000000001e-06, + "loss": 0.428, + "num_tokens": 5839838.0, + "step": 13 + }, + { + "epoch": 0.13776137761377613, + "grad_norm": 1.7562448014521572, + "learning_rate": 8.125000000000001e-06, + "loss": 0.3337, + "num_tokens": 6286175.0, + "step": 14 + }, + { + "epoch": 0.14760147601476015, + "grad_norm": 1.4384357290512548, + "learning_rate": 8.750000000000001e-06, + "loss": 0.2497, + "num_tokens": 6725821.0, + "step": 15 + }, + { + "epoch": 0.15744157441574416, + "grad_norm": 0.8232923354453182, + "learning_rate": 9.375000000000001e-06, + "loss": 0.1317, + "num_tokens": 7169854.0, + "step": 16 + }, + { + "epoch": 0.16728167281672818, + "grad_norm": 0.5262014955492348, + "learning_rate": 1e-05, + "loss": 0.1158, + "num_tokens": 7602261.0, + "step": 17 + }, + { + "epoch": 0.17712177121771217, + "grad_norm": 0.47218735378476806, + "learning_rate": 9.999909003036192e-06, + "loss": 0.098, + "num_tokens": 8040457.0, + "step": 18 + }, + { + "epoch": 0.18696186961869618, + "grad_norm": 0.22950756131023575, + "learning_rate": 9.99963601582496e-06, + "loss": 0.0827, + "num_tokens": 8484210.0, + "step": 19 + }, + { + "epoch": 0.1968019680196802, + "grad_norm": 0.24069667769460337, + "learning_rate": 9.999181049406756e-06, + "loss": 0.0733, + "num_tokens": 8913622.0, + "step": 20 + }, + { + "epoch": 0.2066420664206642, + "grad_norm": 0.19079800364724872, + "learning_rate": 9.998544122181829e-06, + "loss": 0.0851, + "num_tokens": 9379389.0, + "step": 21 + }, + { + "epoch": 0.21648216482164823, + "grad_norm": 0.17418458793254618, + "learning_rate": 9.997725259909487e-06, + "loss": 0.0734, + "num_tokens": 9803100.0, + "step": 22 + }, + { + "epoch": 0.22632226322263221, + "grad_norm": 0.19184343474298712, + "learning_rate": 9.996724495707056e-06, + "loss": 0.083, + "num_tokens": 10247767.0, + "step": 23 + }, + { + "epoch": 0.23616236162361623, + "grad_norm": 0.1475443750251538, + "learning_rate": 9.995541870048537e-06, + "loss": 0.0738, + "num_tokens": 10691516.0, + "step": 24 + }, + { + "epoch": 0.24600246002460024, + "grad_norm": 0.14419500643952865, + "learning_rate": 9.994177430762971e-06, + "loss": 0.0646, + "num_tokens": 11149524.0, + "step": 25 + }, + { + "epoch": 0.25584255842558423, + "grad_norm": 1.2190794556868674, + "learning_rate": 9.992631233032507e-06, + "loss": 0.0721, + "num_tokens": 11589958.0, + "step": 26 + }, + { + "epoch": 0.2656826568265683, + "grad_norm": 0.49893871677873436, + "learning_rate": 9.990903339390164e-06, + "loss": 0.0691, + "num_tokens": 12050102.0, + "step": 27 + }, + { + "epoch": 0.27552275522755226, + "grad_norm": 0.4422022706582718, + "learning_rate": 9.988993819717312e-06, + "loss": 0.0605, + "num_tokens": 12508433.0, + "step": 28 + }, + { + "epoch": 0.2853628536285363, + "grad_norm": 0.13441092661822238, + "learning_rate": 9.986902751240836e-06, + "loss": 0.0692, + "num_tokens": 12939960.0, + "step": 29 + }, + { + "epoch": 0.2952029520295203, + "grad_norm": 0.11858771432621444, + "learning_rate": 9.984630218530014e-06, + "loss": 0.0492, + "num_tokens": 13387850.0, + "step": 30 + }, + { + "epoch": 0.3050430504305043, + "grad_norm": 0.12696361103470127, + "learning_rate": 9.982176313493108e-06, + "loss": 0.0624, + "num_tokens": 13866894.0, + "step": 31 + }, + { + "epoch": 0.3148831488314883, + "grad_norm": 0.14678245947256616, + "learning_rate": 9.979541135373628e-06, + "loss": 0.0483, + "num_tokens": 14314553.0, + "step": 32 + }, + { + "epoch": 0.3247232472324723, + "grad_norm": 0.12403518011628543, + "learning_rate": 9.976724790746333e-06, + "loss": 0.0488, + "num_tokens": 14747330.0, + "step": 33 + }, + { + "epoch": 0.33456334563345635, + "grad_norm": 0.10283847292091904, + "learning_rate": 9.973727393512921e-06, + "loss": 0.0582, + "num_tokens": 15215873.0, + "step": 34 + }, + { + "epoch": 0.34440344403444034, + "grad_norm": 0.09860235260078455, + "learning_rate": 9.970549064897407e-06, + "loss": 0.0446, + "num_tokens": 15653849.0, + "step": 35 + }, + { + "epoch": 0.35424354243542433, + "grad_norm": 0.10274919661024226, + "learning_rate": 9.967189933441243e-06, + "loss": 0.0439, + "num_tokens": 16112913.0, + "step": 36 + }, + { + "epoch": 0.3640836408364084, + "grad_norm": 0.0918843632134462, + "learning_rate": 9.9636501349981e-06, + "loss": 0.0585, + "num_tokens": 16570588.0, + "step": 37 + }, + { + "epoch": 0.37392373923739236, + "grad_norm": 0.08618611894284056, + "learning_rate": 9.95992981272838e-06, + "loss": 0.0477, + "num_tokens": 17028395.0, + "step": 38 + }, + { + "epoch": 0.3837638376383764, + "grad_norm": 0.0915069403325355, + "learning_rate": 9.956029117093432e-06, + "loss": 0.045, + "num_tokens": 17477681.0, + "step": 39 + }, + { + "epoch": 0.3936039360393604, + "grad_norm": 0.09093140650787605, + "learning_rate": 9.951948205849457e-06, + "loss": 0.0444, + "num_tokens": 17940049.0, + "step": 40 + }, + { + "epoch": 0.4034440344403444, + "grad_norm": 0.08271507354884283, + "learning_rate": 9.947687244041143e-06, + "loss": 0.0401, + "num_tokens": 18360868.0, + "step": 41 + }, + { + "epoch": 0.4132841328413284, + "grad_norm": 0.08588968137159211, + "learning_rate": 9.943246403994969e-06, + "loss": 0.0358, + "num_tokens": 18811281.0, + "step": 42 + }, + { + "epoch": 0.4231242312423124, + "grad_norm": 0.08965565515357603, + "learning_rate": 9.938625865312252e-06, + "loss": 0.044, + "num_tokens": 19236998.0, + "step": 43 + }, + { + "epoch": 0.43296432964329645, + "grad_norm": 0.09636661290222473, + "learning_rate": 9.933825814861877e-06, + "loss": 0.0431, + "num_tokens": 19689363.0, + "step": 44 + }, + { + "epoch": 0.44280442804428044, + "grad_norm": 0.08912873391359938, + "learning_rate": 9.928846446772737e-06, + "loss": 0.0377, + "num_tokens": 20129602.0, + "step": 45 + }, + { + "epoch": 0.45264452644526443, + "grad_norm": 0.09271503002492597, + "learning_rate": 9.923687962425895e-06, + "loss": 0.0365, + "num_tokens": 20566055.0, + "step": 46 + }, + { + "epoch": 0.46248462484624847, + "grad_norm": 0.08617267288782972, + "learning_rate": 9.91835057044642e-06, + "loss": 0.0582, + "num_tokens": 21035837.0, + "step": 47 + }, + { + "epoch": 0.47232472324723246, + "grad_norm": 0.07942181409157618, + "learning_rate": 9.912834486694963e-06, + "loss": 0.0341, + "num_tokens": 21490681.0, + "step": 48 + }, + { + "epoch": 0.4821648216482165, + "grad_norm": 0.08409285833406879, + "learning_rate": 9.907139934259025e-06, + "loss": 0.0464, + "num_tokens": 21949736.0, + "step": 49 + }, + { + "epoch": 0.4920049200492005, + "grad_norm": 0.08981746101624732, + "learning_rate": 9.90126714344393e-06, + "loss": 0.0479, + "num_tokens": 22408345.0, + "step": 50 + }, + { + "epoch": 0.5018450184501845, + "grad_norm": 0.08557538109120558, + "learning_rate": 9.895216351763515e-06, + "loss": 0.04, + "num_tokens": 22869507.0, + "step": 51 + }, + { + "epoch": 0.5116851168511685, + "grad_norm": 0.08873060518107122, + "learning_rate": 9.888987803930523e-06, + "loss": 0.0359, + "num_tokens": 23337492.0, + "step": 52 + }, + { + "epoch": 0.5215252152521526, + "grad_norm": 0.08508195964995854, + "learning_rate": 9.882581751846707e-06, + "loss": 0.0338, + "num_tokens": 23788038.0, + "step": 53 + }, + { + "epoch": 0.5313653136531366, + "grad_norm": 0.076418318161816, + "learning_rate": 9.87599845459264e-06, + "loss": 0.0344, + "num_tokens": 24233994.0, + "step": 54 + }, + { + "epoch": 0.5412054120541205, + "grad_norm": 0.2889818789713905, + "learning_rate": 9.869238178417235e-06, + "loss": 0.2599, + "num_tokens": 24697351.0, + "step": 55 + }, + { + "epoch": 0.5510455104551045, + "grad_norm": 0.08884780995830746, + "learning_rate": 9.862301196726988e-06, + "loss": 0.0465, + "num_tokens": 25183095.0, + "step": 56 + }, + { + "epoch": 0.5608856088560885, + "grad_norm": 0.07990815808329678, + "learning_rate": 9.855187790074906e-06, + "loss": 0.0353, + "num_tokens": 25651971.0, + "step": 57 + }, + { + "epoch": 0.5707257072570726, + "grad_norm": 0.06894407429892842, + "learning_rate": 9.847898246149173e-06, + "loss": 0.0316, + "num_tokens": 26129683.0, + "step": 58 + }, + { + "epoch": 0.5805658056580566, + "grad_norm": 0.08216971413705307, + "learning_rate": 9.840432859761504e-06, + "loss": 0.0306, + "num_tokens": 26548348.0, + "step": 59 + }, + { + "epoch": 0.5904059040590406, + "grad_norm": 0.079031679127037, + "learning_rate": 9.832791932835232e-06, + "loss": 0.0362, + "num_tokens": 26977631.0, + "step": 60 + }, + { + "epoch": 0.6002460024600246, + "grad_norm": 0.07450412090133855, + "learning_rate": 9.824975774393089e-06, + "loss": 0.0276, + "num_tokens": 27421323.0, + "step": 61 + }, + { + "epoch": 0.6100861008610086, + "grad_norm": 0.08014735253624648, + "learning_rate": 9.816984700544714e-06, + "loss": 0.0286, + "num_tokens": 27882356.0, + "step": 62 + }, + { + "epoch": 0.6199261992619927, + "grad_norm": 0.08455294660438158, + "learning_rate": 9.808819034473869e-06, + "loss": 0.0407, + "num_tokens": 28343854.0, + "step": 63 + }, + { + "epoch": 0.6297662976629766, + "grad_norm": 0.08019778537515825, + "learning_rate": 9.800479106425356e-06, + "loss": 0.0299, + "num_tokens": 28790695.0, + "step": 64 + }, + { + "epoch": 0.6396063960639606, + "grad_norm": 0.08340888167507048, + "learning_rate": 9.791965253691687e-06, + "loss": 0.0353, + "num_tokens": 29220825.0, + "step": 65 + }, + { + "epoch": 0.6494464944649446, + "grad_norm": 0.08252486402936965, + "learning_rate": 9.783277820599408e-06, + "loss": 0.0367, + "num_tokens": 29686358.0, + "step": 66 + }, + { + "epoch": 0.6592865928659286, + "grad_norm": 0.08632773276059842, + "learning_rate": 9.774417158495208e-06, + "loss": 0.0331, + "num_tokens": 30120521.0, + "step": 67 + }, + { + "epoch": 0.6691266912669127, + "grad_norm": 0.082343171890358, + "learning_rate": 9.765383625731683e-06, + "loss": 0.0329, + "num_tokens": 30573947.0, + "step": 68 + }, + { + "epoch": 0.6789667896678967, + "grad_norm": 0.08874468637210653, + "learning_rate": 9.756177587652857e-06, + "loss": 0.0329, + "num_tokens": 30999244.0, + "step": 69 + }, + { + "epoch": 0.6888068880688807, + "grad_norm": 0.07673402020991506, + "learning_rate": 9.746799416579403e-06, + "loss": 0.0306, + "num_tokens": 31468786.0, + "step": 70 + }, + { + "epoch": 0.6986469864698647, + "grad_norm": 0.09204922624438575, + "learning_rate": 9.737249491793587e-06, + "loss": 0.0273, + "num_tokens": 31905019.0, + "step": 71 + }, + { + "epoch": 0.7084870848708487, + "grad_norm": 0.08145687118724444, + "learning_rate": 9.727528199523923e-06, + "loss": 0.029, + "num_tokens": 32340154.0, + "step": 72 + }, + { + "epoch": 0.7183271832718328, + "grad_norm": 0.09506872052374568, + "learning_rate": 9.717635932929556e-06, + "loss": 0.0373, + "num_tokens": 32789598.0, + "step": 73 + }, + { + "epoch": 0.7281672816728167, + "grad_norm": 0.08326889230017241, + "learning_rate": 9.707573092084368e-06, + "loss": 0.0286, + "num_tokens": 33239225.0, + "step": 74 + }, + { + "epoch": 0.7380073800738007, + "grad_norm": 0.07636964575035168, + "learning_rate": 9.697340083960785e-06, + "loss": 0.0291, + "num_tokens": 33718797.0, + "step": 75 + }, + { + "epoch": 0.7478474784747847, + "grad_norm": 0.09488168094776525, + "learning_rate": 9.686937322413325e-06, + "loss": 0.0328, + "num_tokens": 34155674.0, + "step": 76 + }, + { + "epoch": 0.7576875768757687, + "grad_norm": 0.0778086138359463, + "learning_rate": 9.676365228161869e-06, + "loss": 0.0252, + "num_tokens": 34584921.0, + "step": 77 + }, + { + "epoch": 0.7675276752767528, + "grad_norm": 0.08557737550120906, + "learning_rate": 9.66562422877462e-06, + "loss": 0.0338, + "num_tokens": 35049146.0, + "step": 78 + }, + { + "epoch": 0.7773677736777368, + "grad_norm": 0.09181023650151289, + "learning_rate": 9.654714758650844e-06, + "loss": 0.0299, + "num_tokens": 35519987.0, + "step": 79 + }, + { + "epoch": 0.7872078720787208, + "grad_norm": 0.07639914292637208, + "learning_rate": 9.643637259003276e-06, + "loss": 0.0242, + "num_tokens": 35959127.0, + "step": 80 + }, + { + "epoch": 0.7970479704797048, + "grad_norm": 0.08200922089613671, + "learning_rate": 9.632392177840286e-06, + "loss": 0.0317, + "num_tokens": 36416651.0, + "step": 81 + }, + { + "epoch": 0.8068880688806888, + "grad_norm": 0.07954028434263948, + "learning_rate": 9.620979969947759e-06, + "loss": 0.0293, + "num_tokens": 36864154.0, + "step": 82 + }, + { + "epoch": 0.8167281672816729, + "grad_norm": 0.07878375949867687, + "learning_rate": 9.609401096870707e-06, + "loss": 0.0237, + "num_tokens": 37310281.0, + "step": 83 + }, + { + "epoch": 0.8265682656826568, + "grad_norm": 0.07728168843840597, + "learning_rate": 9.597656026894591e-06, + "loss": 0.0322, + "num_tokens": 37746606.0, + "step": 84 + }, + { + "epoch": 0.8364083640836408, + "grad_norm": 0.07855221188672869, + "learning_rate": 9.585745235026391e-06, + "loss": 0.0258, + "num_tokens": 38189615.0, + "step": 85 + }, + { + "epoch": 0.8462484624846248, + "grad_norm": 0.07691630967258262, + "learning_rate": 9.5736692029754e-06, + "loss": 0.0293, + "num_tokens": 38637318.0, + "step": 86 + }, + { + "epoch": 0.8560885608856088, + "grad_norm": 0.07209047793755496, + "learning_rate": 9.561428419133723e-06, + "loss": 0.0235, + "num_tokens": 39102853.0, + "step": 87 + }, + { + "epoch": 0.8659286592865929, + "grad_norm": 0.0802072339239599, + "learning_rate": 9.549023378556548e-06, + "loss": 0.0311, + "num_tokens": 39538535.0, + "step": 88 + }, + { + "epoch": 0.8757687576875769, + "grad_norm": 0.09334524313401625, + "learning_rate": 9.53645458294211e-06, + "loss": 0.0484, + "num_tokens": 40020296.0, + "step": 89 + }, + { + "epoch": 0.8856088560885609, + "grad_norm": 2.3961901610996605, + "learning_rate": 9.523722540611403e-06, + "loss": 0.3276, + "num_tokens": 40506093.0, + "step": 90 + }, + { + "epoch": 0.8954489544895449, + "grad_norm": 0.09376957957757263, + "learning_rate": 9.510827766487625e-06, + "loss": 0.0288, + "num_tokens": 40937880.0, + "step": 91 + }, + { + "epoch": 0.9052890528905289, + "grad_norm": 0.08607984794603309, + "learning_rate": 9.497770782075353e-06, + "loss": 0.0247, + "num_tokens": 41374337.0, + "step": 92 + }, + { + "epoch": 0.915129151291513, + "grad_norm": 0.07253858203781333, + "learning_rate": 9.484552115439445e-06, + "loss": 0.0293, + "num_tokens": 41811558.0, + "step": 93 + }, + { + "epoch": 0.9249692496924969, + "grad_norm": 0.07768364358007782, + "learning_rate": 9.471172301183695e-06, + "loss": 0.0257, + "num_tokens": 42259726.0, + "step": 94 + }, + { + "epoch": 0.9348093480934809, + "grad_norm": 0.0769153663260077, + "learning_rate": 9.4576318804292e-06, + "loss": 0.0232, + "num_tokens": 42684319.0, + "step": 95 + }, + { + "epoch": 0.9446494464944649, + "grad_norm": 0.08163342042509363, + "learning_rate": 9.443931400792486e-06, + "loss": 0.0256, + "num_tokens": 43113589.0, + "step": 96 + }, + { + "epoch": 0.9544895448954489, + "grad_norm": 0.06536764982172343, + "learning_rate": 9.430071416363352e-06, + "loss": 0.0218, + "num_tokens": 43575488.0, + "step": 97 + }, + { + "epoch": 0.964329643296433, + "grad_norm": 0.08195099679978833, + "learning_rate": 9.416052487682465e-06, + "loss": 0.0254, + "num_tokens": 44016216.0, + "step": 98 + }, + { + "epoch": 0.974169741697417, + "grad_norm": 0.1266005657397246, + "learning_rate": 9.401875181718686e-06, + "loss": 0.0454, + "num_tokens": 44497742.0, + "step": 99 + }, + { + "epoch": 0.984009840098401, + "grad_norm": 0.07988798247506342, + "learning_rate": 9.387540071846155e-06, + "loss": 0.024, + "num_tokens": 44935936.0, + "step": 100 + }, + { + "epoch": 0.993849938499385, + "grad_norm": 0.07277763654694067, + "learning_rate": 9.373047737821078e-06, + "loss": 0.0216, + "num_tokens": 45381042.0, + "step": 101 + }, + { + "epoch": 1.0, + "grad_norm": 0.07277763654694067, + "learning_rate": 9.358398765758296e-06, + "loss": 0.0229, + "num_tokens": 45593876.0, + "step": 102 + }, + { + "epoch": 1.0, + "eval_loss": 0.07811997085809708, + "eval_num_tokens": 45593876.0, + "eval_runtime": 54.709, + "eval_samples_per_second": 41.054, + "eval_steps_per_second": 5.136, + "step": 102 + }, + { + "epoch": 1.009840098400984, + "grad_norm": 0.10795878798324991, + "learning_rate": 9.34359374810758e-06, + "loss": 0.0201, + "num_tokens": 46020335.0, + "step": 103 + }, + { + "epoch": 1.019680196801968, + "grad_norm": 0.07593949135329942, + "learning_rate": 9.328633283629666e-06, + "loss": 0.0222, + "num_tokens": 46466853.0, + "step": 104 + }, + { + "epoch": 1.029520295202952, + "grad_norm": 0.07596980345063492, + "learning_rate": 9.31351797737204e-06, + "loss": 0.0253, + "num_tokens": 46900993.0, + "step": 105 + }, + { + "epoch": 1.039360393603936, + "grad_norm": 0.08317964089954727, + "learning_rate": 9.29824844064447e-06, + "loss": 0.0206, + "num_tokens": 47334869.0, + "step": 106 + }, + { + "epoch": 1.04920049200492, + "grad_norm": 0.0805362815127939, + "learning_rate": 9.282825290994282e-06, + "loss": 0.0213, + "num_tokens": 47797630.0, + "step": 107 + }, + { + "epoch": 1.0590405904059041, + "grad_norm": 0.07839099238240128, + "learning_rate": 9.267249152181379e-06, + "loss": 0.0454, + "num_tokens": 48281974.0, + "step": 108 + }, + { + "epoch": 1.068880688806888, + "grad_norm": 0.0757738535866923, + "learning_rate": 9.251520654153028e-06, + "loss": 0.022, + "num_tokens": 48730118.0, + "step": 109 + }, + { + "epoch": 1.0787207872078721, + "grad_norm": 0.08256710571520359, + "learning_rate": 9.235640433018363e-06, + "loss": 0.0195, + "num_tokens": 49197576.0, + "step": 110 + }, + { + "epoch": 1.088560885608856, + "grad_norm": 0.07849933177459094, + "learning_rate": 9.219609131022684e-06, + "loss": 0.0203, + "num_tokens": 49673054.0, + "step": 111 + }, + { + "epoch": 1.09840098400984, + "grad_norm": 0.08067924302373455, + "learning_rate": 9.203427396521454e-06, + "loss": 0.0219, + "num_tokens": 50130569.0, + "step": 112 + }, + { + "epoch": 1.1082410824108242, + "grad_norm": 0.07527801624664898, + "learning_rate": 9.187095883954104e-06, + "loss": 0.0195, + "num_tokens": 50574721.0, + "step": 113 + }, + { + "epoch": 1.118081180811808, + "grad_norm": 0.08229755724299215, + "learning_rate": 9.170615253817547e-06, + "loss": 0.0193, + "num_tokens": 51010865.0, + "step": 114 + }, + { + "epoch": 1.1279212792127922, + "grad_norm": 0.07673721236222701, + "learning_rate": 9.153986172639474e-06, + "loss": 0.0211, + "num_tokens": 51469765.0, + "step": 115 + }, + { + "epoch": 1.137761377613776, + "grad_norm": 0.0845900192373935, + "learning_rate": 9.137209312951395e-06, + "loss": 0.0226, + "num_tokens": 51906114.0, + "step": 116 + }, + { + "epoch": 1.1476014760147601, + "grad_norm": 0.08215860044207468, + "learning_rate": 9.12028535326144e-06, + "loss": 0.022, + "num_tokens": 52354068.0, + "step": 117 + }, + { + "epoch": 1.1574415744157442, + "grad_norm": 0.07420368746928867, + "learning_rate": 9.103214978026922e-06, + "loss": 0.0188, + "num_tokens": 52836346.0, + "step": 118 + }, + { + "epoch": 1.1672816728167281, + "grad_norm": 0.07450541307438634, + "learning_rate": 9.085998877626644e-06, + "loss": 0.0192, + "num_tokens": 53299172.0, + "step": 119 + }, + { + "epoch": 1.1771217712177122, + "grad_norm": 0.07878886229739003, + "learning_rate": 9.068637748332993e-06, + "loss": 0.0215, + "num_tokens": 53759861.0, + "step": 120 + }, + { + "epoch": 1.186961869618696, + "grad_norm": 0.08311056334441597, + "learning_rate": 9.051132292283772e-06, + "loss": 0.0208, + "num_tokens": 54228512.0, + "step": 121 + }, + { + "epoch": 1.1968019680196802, + "grad_norm": 0.07068781735081182, + "learning_rate": 9.033483217453801e-06, + "loss": 0.0812, + "num_tokens": 54692852.0, + "step": 122 + }, + { + "epoch": 1.2066420664206643, + "grad_norm": 0.596254901083269, + "learning_rate": 9.015691237626292e-06, + "loss": 0.0199, + "num_tokens": 55139782.0, + "step": 123 + }, + { + "epoch": 1.2164821648216482, + "grad_norm": 0.08202279255895727, + "learning_rate": 8.997757072363976e-06, + "loss": 0.0342, + "num_tokens": 55604658.0, + "step": 124 + }, + { + "epoch": 1.2263222632226323, + "grad_norm": 0.09057478290667956, + "learning_rate": 8.979681446980002e-06, + "loss": 0.0227, + "num_tokens": 56030690.0, + "step": 125 + }, + { + "epoch": 1.2361623616236161, + "grad_norm": 0.07661103115531635, + "learning_rate": 8.961465092508607e-06, + "loss": 0.0339, + "num_tokens": 56492821.0, + "step": 126 + }, + { + "epoch": 1.2460024600246002, + "grad_norm": 0.08310739437969392, + "learning_rate": 8.943108745675542e-06, + "loss": 0.0249, + "num_tokens": 56927699.0, + "step": 127 + }, + { + "epoch": 1.2558425584255843, + "grad_norm": 0.08009221352147507, + "learning_rate": 8.92461314886829e-06, + "loss": 0.0209, + "num_tokens": 57365827.0, + "step": 128 + }, + { + "epoch": 1.2656826568265682, + "grad_norm": 0.07973094836265254, + "learning_rate": 8.905979050106029e-06, + "loss": 0.0251, + "num_tokens": 57821453.0, + "step": 129 + }, + { + "epoch": 1.2755227552275523, + "grad_norm": 0.07600070319773061, + "learning_rate": 8.887207203009385e-06, + "loss": 0.0213, + "num_tokens": 58267867.0, + "step": 130 + }, + { + "epoch": 1.2853628536285364, + "grad_norm": 0.07258837000806613, + "learning_rate": 8.868298366769956e-06, + "loss": 0.0198, + "num_tokens": 58715078.0, + "step": 131 + }, + { + "epoch": 1.2952029520295203, + "grad_norm": 0.07826062337656157, + "learning_rate": 8.849253306119601e-06, + "loss": 0.0199, + "num_tokens": 59159310.0, + "step": 132 + }, + { + "epoch": 1.3050430504305042, + "grad_norm": 0.07770042002025847, + "learning_rate": 8.83007279129952e-06, + "loss": 0.027, + "num_tokens": 59594031.0, + "step": 133 + }, + { + "epoch": 1.3148831488314883, + "grad_norm": 0.07607344407726713, + "learning_rate": 8.810757598029094e-06, + "loss": 0.0342, + "num_tokens": 60038506.0, + "step": 134 + }, + { + "epoch": 1.3247232472324724, + "grad_norm": 0.08771686774228402, + "learning_rate": 8.79130850747452e-06, + "loss": 0.0234, + "num_tokens": 60492486.0, + "step": 135 + }, + { + "epoch": 1.3345633456334562, + "grad_norm": 0.07482147000786651, + "learning_rate": 8.771726306217217e-06, + "loss": 0.0196, + "num_tokens": 60925341.0, + "step": 136 + }, + { + "epoch": 1.3444034440344403, + "grad_norm": 0.07171750614547971, + "learning_rate": 8.752011786222011e-06, + "loss": 0.0224, + "num_tokens": 61401128.0, + "step": 137 + }, + { + "epoch": 1.3542435424354244, + "grad_norm": 0.07289189868770962, + "learning_rate": 8.732165744805107e-06, + "loss": 0.0198, + "num_tokens": 61845691.0, + "step": 138 + }, + { + "epoch": 1.3640836408364083, + "grad_norm": 0.07907747558023923, + "learning_rate": 8.712188984601845e-06, + "loss": 0.0185, + "num_tokens": 62286361.0, + "step": 139 + }, + { + "epoch": 1.3739237392373924, + "grad_norm": 0.06910414114179665, + "learning_rate": 8.692082313534233e-06, + "loss": 0.0179, + "num_tokens": 62727406.0, + "step": 140 + }, + { + "epoch": 1.3837638376383765, + "grad_norm": 0.07791959325829377, + "learning_rate": 8.671846544778284e-06, + "loss": 0.0204, + "num_tokens": 63182141.0, + "step": 141 + }, + { + "epoch": 1.3936039360393604, + "grad_norm": 0.0741558195977179, + "learning_rate": 8.651482496731116e-06, + "loss": 0.0178, + "num_tokens": 63600729.0, + "step": 142 + }, + { + "epoch": 1.4034440344403443, + "grad_norm": 0.07283375136096223, + "learning_rate": 8.630990992977854e-06, + "loss": 0.0198, + "num_tokens": 64066267.0, + "step": 143 + }, + { + "epoch": 1.4132841328413284, + "grad_norm": 0.0731783816547012, + "learning_rate": 8.61037286225834e-06, + "loss": 0.2547, + "num_tokens": 64515946.0, + "step": 144 + }, + { + "epoch": 1.4231242312423125, + "grad_norm": 1.0212050791856901, + "learning_rate": 8.589628938433587e-06, + "loss": 0.0192, + "num_tokens": 64949958.0, + "step": 145 + }, + { + "epoch": 1.4329643296432963, + "grad_norm": 0.09844320658741419, + "learning_rate": 8.56876006045208e-06, + "loss": 0.0176, + "num_tokens": 65381018.0, + "step": 146 + }, + { + "epoch": 1.4428044280442804, + "grad_norm": 0.07030907656382593, + "learning_rate": 8.547767072315835e-06, + "loss": 0.0241, + "num_tokens": 65814016.0, + "step": 147 + }, + { + "epoch": 1.4526445264452645, + "grad_norm": 0.0779412275694533, + "learning_rate": 8.526650823046266e-06, + "loss": 0.0265, + "num_tokens": 66252980.0, + "step": 148 + }, + { + "epoch": 1.4624846248462484, + "grad_norm": 0.09570533939331194, + "learning_rate": 8.505412166649847e-06, + "loss": 0.0199, + "num_tokens": 66718111.0, + "step": 149 + }, + { + "epoch": 1.4723247232472325, + "grad_norm": 0.07915246167438994, + "learning_rate": 8.484051962083579e-06, + "loss": 0.0204, + "num_tokens": 67163762.0, + "step": 150 + }, + { + "epoch": 1.4821648216482166, + "grad_norm": 0.07935176799416567, + "learning_rate": 8.462571073220243e-06, + "loss": 0.0225, + "num_tokens": 67624386.0, + "step": 151 + }, + { + "epoch": 1.4920049200492005, + "grad_norm": 0.07841589822630919, + "learning_rate": 8.44097036881347e-06, + "loss": 0.0392, + "num_tokens": 68065290.0, + "step": 152 + }, + { + "epoch": 1.5018450184501844, + "grad_norm": 0.3517146293571387, + "learning_rate": 8.419250722462603e-06, + "loss": 0.0178, + "num_tokens": 68519107.0, + "step": 153 + }, + { + "epoch": 1.5116851168511685, + "grad_norm": 0.0764909788834621, + "learning_rate": 8.39741301257736e-06, + "loss": 0.0194, + "num_tokens": 68971128.0, + "step": 154 + }, + { + "epoch": 1.5215252152521526, + "grad_norm": 0.08078822036852527, + "learning_rate": 8.375458122342317e-06, + "loss": 0.0206, + "num_tokens": 69403792.0, + "step": 155 + }, + { + "epoch": 1.5313653136531364, + "grad_norm": 0.08235320219175549, + "learning_rate": 8.353386939681186e-06, + "loss": 0.0175, + "num_tokens": 69836602.0, + "step": 156 + }, + { + "epoch": 1.5412054120541205, + "grad_norm": 0.0735540837139594, + "learning_rate": 8.331200357220908e-06, + "loss": 0.0194, + "num_tokens": 70283814.0, + "step": 157 + }, + { + "epoch": 1.5510455104551046, + "grad_norm": 0.07322399084658018, + "learning_rate": 8.308899272255542e-06, + "loss": 0.0184, + "num_tokens": 70726284.0, + "step": 158 + }, + { + "epoch": 1.5608856088560885, + "grad_norm": 0.07790348390650517, + "learning_rate": 8.286484586709989e-06, + "loss": 0.0183, + "num_tokens": 71155169.0, + "step": 159 + }, + { + "epoch": 1.5707257072570726, + "grad_norm": 0.08611809383964489, + "learning_rate": 8.263957207103506e-06, + "loss": 0.0205, + "num_tokens": 71591204.0, + "step": 160 + }, + { + "epoch": 1.5805658056580567, + "grad_norm": 0.0706229845173915, + "learning_rate": 8.241318044513046e-06, + "loss": 0.0277, + "num_tokens": 72032119.0, + "step": 161 + }, + { + "epoch": 1.5904059040590406, + "grad_norm": 0.09019039164269532, + "learning_rate": 8.218568014536414e-06, + "loss": 0.0176, + "num_tokens": 72492164.0, + "step": 162 + }, + { + "epoch": 1.6002460024600245, + "grad_norm": 0.07947315916491103, + "learning_rate": 8.195708037255233e-06, + "loss": 0.0202, + "num_tokens": 72962752.0, + "step": 163 + }, + { + "epoch": 1.6100861008610086, + "grad_norm": 0.06840189166732885, + "learning_rate": 8.172739037197739e-06, + "loss": 0.018, + "num_tokens": 73415974.0, + "step": 164 + }, + { + "epoch": 1.6199261992619927, + "grad_norm": 0.07366616747573093, + "learning_rate": 8.149661943301382e-06, + "loss": 0.0181, + "num_tokens": 73882834.0, + "step": 165 + }, + { + "epoch": 1.6297662976629765, + "grad_norm": 0.07081012920317416, + "learning_rate": 8.126477688875262e-06, + "loss": 0.0204, + "num_tokens": 74321580.0, + "step": 166 + }, + { + "epoch": 1.6396063960639606, + "grad_norm": 0.07863097311534642, + "learning_rate": 8.103187211562386e-06, + "loss": 0.0229, + "num_tokens": 74781751.0, + "step": 167 + }, + { + "epoch": 1.6494464944649447, + "grad_norm": 0.10797044478776457, + "learning_rate": 8.079791453301742e-06, + "loss": 0.0287, + "num_tokens": 75219935.0, + "step": 168 + }, + { + "epoch": 1.6592865928659286, + "grad_norm": 0.07041534985061697, + "learning_rate": 8.056291360290202e-06, + "loss": 0.0248, + "num_tokens": 75665232.0, + "step": 169 + }, + { + "epoch": 1.6691266912669127, + "grad_norm": 0.08695303118518641, + "learning_rate": 8.032687882944264e-06, + "loss": 0.0193, + "num_tokens": 76087411.0, + "step": 170 + }, + { + "epoch": 1.6789667896678968, + "grad_norm": 0.06704813880798238, + "learning_rate": 8.0089819758616e-06, + "loss": 0.0169, + "num_tokens": 76529931.0, + "step": 171 + }, + { + "epoch": 1.6888068880688807, + "grad_norm": 0.06935996975041725, + "learning_rate": 7.985174597782469e-06, + "loss": 0.0197, + "num_tokens": 76974869.0, + "step": 172 + }, + { + "epoch": 1.6986469864698646, + "grad_norm": 0.0812644475398725, + "learning_rate": 7.961266711550922e-06, + "loss": 0.0259, + "num_tokens": 77413009.0, + "step": 173 + }, + { + "epoch": 1.7084870848708487, + "grad_norm": 0.07469198601302375, + "learning_rate": 7.937259284075872e-06, + "loss": 0.0191, + "num_tokens": 77854298.0, + "step": 174 + }, + { + "epoch": 1.7183271832718328, + "grad_norm": 0.07554209425696685, + "learning_rate": 7.913153286291995e-06, + "loss": 0.025, + "num_tokens": 78299682.0, + "step": 175 + }, + { + "epoch": 1.7281672816728166, + "grad_norm": 0.07564661483692575, + "learning_rate": 7.888949693120443e-06, + "loss": 0.0172, + "num_tokens": 78723460.0, + "step": 176 + }, + { + "epoch": 1.7380073800738007, + "grad_norm": 0.6264202015289688, + "learning_rate": 7.864649483429442e-06, + "loss": 0.0402, + "num_tokens": 79151526.0, + "step": 177 + }, + { + "epoch": 1.7478474784747848, + "grad_norm": 0.07431323606896861, + "learning_rate": 7.840253639994676e-06, + "loss": 0.0182, + "num_tokens": 79591692.0, + "step": 178 + }, + { + "epoch": 1.7576875768757687, + "grad_norm": 0.07199128250127072, + "learning_rate": 7.815763149459563e-06, + "loss": 0.018, + "num_tokens": 80054397.0, + "step": 179 + }, + { + "epoch": 1.7675276752767528, + "grad_norm": 0.0736771332831437, + "learning_rate": 7.791179002295334e-06, + "loss": 0.0182, + "num_tokens": 80527436.0, + "step": 180 + }, + { + "epoch": 1.777367773677737, + "grad_norm": 0.0722896910687323, + "learning_rate": 7.766502192760995e-06, + "loss": 0.0299, + "num_tokens": 80984085.0, + "step": 181 + }, + { + "epoch": 1.7872078720787208, + "grad_norm": 0.13146348676004535, + "learning_rate": 7.741733718863096e-06, + "loss": 0.0172, + "num_tokens": 81417093.0, + "step": 182 + }, + { + "epoch": 1.7970479704797047, + "grad_norm": 0.07559775090622188, + "learning_rate": 7.71687458231538e-06, + "loss": 0.0173, + "num_tokens": 81857802.0, + "step": 183 + }, + { + "epoch": 1.8068880688806888, + "grad_norm": 0.07625026619956689, + "learning_rate": 7.69192578849827e-06, + "loss": 0.0174, + "num_tokens": 82314635.0, + "step": 184 + }, + { + "epoch": 1.8167281672816729, + "grad_norm": 0.07079163666898536, + "learning_rate": 7.666888346418205e-06, + "loss": 0.0255, + "num_tokens": 82774404.0, + "step": 185 + }, + { + "epoch": 1.8265682656826567, + "grad_norm": 0.07862230056744444, + "learning_rate": 7.641763268666832e-06, + "loss": 0.0166, + "num_tokens": 83224858.0, + "step": 186 + }, + { + "epoch": 1.8364083640836408, + "grad_norm": 0.07767548895299481, + "learning_rate": 7.616551571380061e-06, + "loss": 0.0303, + "num_tokens": 83685638.0, + "step": 187 + }, + { + "epoch": 1.846248462484625, + "grad_norm": 0.0767555813557926, + "learning_rate": 7.5912542741969585e-06, + "loss": 0.0173, + "num_tokens": 84118329.0, + "step": 188 + }, + { + "epoch": 1.8560885608856088, + "grad_norm": 0.06505326217418561, + "learning_rate": 7.5658724002185215e-06, + "loss": 0.2302, + "num_tokens": 84642441.0, + "step": 189 + }, + { + "epoch": 1.865928659286593, + "grad_norm": 0.9831912884395022, + "learning_rate": 7.54040697596629e-06, + "loss": 0.0173, + "num_tokens": 85075013.0, + "step": 190 + }, + { + "epoch": 1.875768757687577, + "grad_norm": 0.0852074767092427, + "learning_rate": 7.514859031340835e-06, + "loss": 0.0197, + "num_tokens": 85539398.0, + "step": 191 + }, + { + "epoch": 1.8856088560885609, + "grad_norm": 0.07502455159038045, + "learning_rate": 7.489229599580111e-06, + "loss": 0.0167, + "num_tokens": 85976652.0, + "step": 192 + }, + { + "epoch": 1.8954489544895448, + "grad_norm": 0.07796568336104527, + "learning_rate": 7.463519717217663e-06, + "loss": 0.0253, + "num_tokens": 86404836.0, + "step": 193 + }, + { + "epoch": 1.9052890528905289, + "grad_norm": 0.07733304316410633, + "learning_rate": 7.437730424040702e-06, + "loss": 0.0232, + "num_tokens": 86871021.0, + "step": 194 + }, + { + "epoch": 1.915129151291513, + "grad_norm": 0.07837311923363188, + "learning_rate": 7.411862763048068e-06, + "loss": 0.0228, + "num_tokens": 87328297.0, + "step": 195 + }, + { + "epoch": 1.9249692496924968, + "grad_norm": 0.07159308881612252, + "learning_rate": 7.38591778040803e-06, + "loss": 0.0178, + "num_tokens": 87780478.0, + "step": 196 + }, + { + "epoch": 1.934809348093481, + "grad_norm": 0.06995284279442164, + "learning_rate": 7.359896525415986e-06, + "loss": 0.0166, + "num_tokens": 88245218.0, + "step": 197 + }, + { + "epoch": 1.944649446494465, + "grad_norm": 0.074185946727602, + "learning_rate": 7.333800050452024e-06, + "loss": 0.0335, + "num_tokens": 88720048.0, + "step": 198 + }, + { + "epoch": 1.954489544895449, + "grad_norm": 0.0936664061322253, + "learning_rate": 7.307629410938364e-06, + "loss": 0.0156, + "num_tokens": 89171687.0, + "step": 199 + }, + { + "epoch": 1.964329643296433, + "grad_norm": 0.06592479834851843, + "learning_rate": 7.281385665296663e-06, + "loss": 0.0162, + "num_tokens": 89636320.0, + "step": 200 + }, + { + "epoch": 1.974169741697417, + "grad_norm": 0.08486840853612633, + "learning_rate": 7.255069874905221e-06, + "loss": 0.0177, + "num_tokens": 90074778.0, + "step": 201 + }, + { + "epoch": 1.984009840098401, + "grad_norm": 0.06923307537599123, + "learning_rate": 7.228683104056051e-06, + "loss": 0.0168, + "num_tokens": 90519743.0, + "step": 202 + }, + { + "epoch": 1.9938499384993849, + "grad_norm": 0.09250588119689185, + "learning_rate": 7.202226419911832e-06, + "loss": 0.0266, + "num_tokens": 90971202.0, + "step": 203 + }, + { + "epoch": 2.0, + "grad_norm": 0.08932358174959376, + "learning_rate": 7.175700892462757e-06, + "loss": 0.0167, + "num_tokens": 91183681.0, + "step": 204 + }, + { + "epoch": 2.0, + "eval_loss": 0.08808860927820206, + "eval_num_tokens": 91183681.0, + "eval_runtime": 53.9315, + "eval_samples_per_second": 41.645, + "eval_steps_per_second": 5.21, + "step": 204 + }, + { + "epoch": 2.009840098400984, + "grad_norm": 0.07874869315833909, + "learning_rate": 7.149107594483251e-06, + "loss": 0.0142, + "num_tokens": 91625671.0, + "step": 205 + }, + { + "epoch": 2.019680196801968, + "grad_norm": 0.06385620551213778, + "learning_rate": 7.122447601488592e-06, + "loss": 0.0132, + "num_tokens": 92071488.0, + "step": 206 + }, + { + "epoch": 2.029520295202952, + "grad_norm": 0.06846197400142105, + "learning_rate": 7.095721991691411e-06, + "loss": 0.0149, + "num_tokens": 92542156.0, + "step": 207 + }, + { + "epoch": 2.039360393603936, + "grad_norm": 0.07424945414823086, + "learning_rate": 7.0689318459580845e-06, + "loss": 0.0156, + "num_tokens": 93002703.0, + "step": 208 + }, + { + "epoch": 2.0492004920049203, + "grad_norm": 0.06687580312011086, + "learning_rate": 7.042078247765019e-06, + "loss": 0.0135, + "num_tokens": 93436834.0, + "step": 209 + }, + { + "epoch": 2.059040590405904, + "grad_norm": 0.07720021453648518, + "learning_rate": 7.015162283154843e-06, + "loss": 0.0137, + "num_tokens": 93871635.0, + "step": 210 + }, + { + "epoch": 2.068880688806888, + "grad_norm": 0.13453391743262458, + "learning_rate": 6.988185040692469e-06, + "loss": 0.0221, + "num_tokens": 94314058.0, + "step": 211 + }, + { + "epoch": 2.078720787207872, + "grad_norm": 0.07982223152072775, + "learning_rate": 6.961147611421076e-06, + "loss": 0.017, + "num_tokens": 94750976.0, + "step": 212 + }, + { + "epoch": 2.088560885608856, + "grad_norm": 0.06995730861373262, + "learning_rate": 6.934051088817988e-06, + "loss": 0.0137, + "num_tokens": 95193789.0, + "step": 213 + }, + { + "epoch": 2.09840098400984, + "grad_norm": 0.07438600726959783, + "learning_rate": 6.906896568750441e-06, + "loss": 0.0193, + "num_tokens": 95676386.0, + "step": 214 + }, + { + "epoch": 2.108241082410824, + "grad_norm": 0.09331884860488432, + "learning_rate": 6.87968514943127e-06, + "loss": 0.0154, + "num_tokens": 96137917.0, + "step": 215 + }, + { + "epoch": 2.1180811808118083, + "grad_norm": 0.06703452835053635, + "learning_rate": 6.852417931374494e-06, + "loss": 0.0134, + "num_tokens": 96568059.0, + "step": 216 + }, + { + "epoch": 2.127921279212792, + "grad_norm": 0.07093081986870549, + "learning_rate": 6.825096017350807e-06, + "loss": 0.0138, + "num_tokens": 97019588.0, + "step": 217 + }, + { + "epoch": 2.137761377613776, + "grad_norm": 0.0650948479503258, + "learning_rate": 6.797720512342967e-06, + "loss": 0.0137, + "num_tokens": 97456418.0, + "step": 218 + }, + { + "epoch": 2.14760147601476, + "grad_norm": 0.06693139683273135, + "learning_rate": 6.77029252350113e-06, + "loss": 0.0142, + "num_tokens": 97874765.0, + "step": 219 + }, + { + "epoch": 2.1574415744157442, + "grad_norm": 0.07881816970778455, + "learning_rate": 6.742813160098054e-06, + "loss": 0.0188, + "num_tokens": 98322373.0, + "step": 220 + }, + { + "epoch": 2.167281672816728, + "grad_norm": 0.07381706020969016, + "learning_rate": 6.715283533484242e-06, + "loss": 0.0125, + "num_tokens": 98762055.0, + "step": 221 + }, + { + "epoch": 2.177121771217712, + "grad_norm": 0.06829050170688594, + "learning_rate": 6.6877047570430044e-06, + "loss": 0.0147, + "num_tokens": 99212257.0, + "step": 222 + }, + { + "epoch": 2.1869618696186963, + "grad_norm": 0.0726323898489312, + "learning_rate": 6.660077946145412e-06, + "loss": 0.0149, + "num_tokens": 99651696.0, + "step": 223 + }, + { + "epoch": 2.19680196801968, + "grad_norm": 0.06996376101830218, + "learning_rate": 6.632404218105205e-06, + "loss": 0.014, + "num_tokens": 100115333.0, + "step": 224 + }, + { + "epoch": 2.206642066420664, + "grad_norm": 0.07058857975728597, + "learning_rate": 6.604684692133597e-06, + "loss": 0.0128, + "num_tokens": 100567168.0, + "step": 225 + }, + { + "epoch": 2.2164821648216484, + "grad_norm": 0.06705830086377462, + "learning_rate": 6.576920489294011e-06, + "loss": 0.014, + "num_tokens": 101017414.0, + "step": 226 + }, + { + "epoch": 2.2263222632226323, + "grad_norm": 0.08216121325842957, + "learning_rate": 6.549112732456739e-06, + "loss": 0.0244, + "num_tokens": 101478653.0, + "step": 227 + }, + { + "epoch": 2.236162361623616, + "grad_norm": 0.06604918422838713, + "learning_rate": 6.5212625462535365e-06, + "loss": 0.0133, + "num_tokens": 101922998.0, + "step": 228 + }, + { + "epoch": 2.2460024600246005, + "grad_norm": 0.06450225948970358, + "learning_rate": 6.493371057032129e-06, + "loss": 0.0149, + "num_tokens": 102357947.0, + "step": 229 + }, + { + "epoch": 2.2558425584255843, + "grad_norm": 0.07514996917424294, + "learning_rate": 6.465439392810664e-06, + "loss": 0.0167, + "num_tokens": 102803832.0, + "step": 230 + }, + { + "epoch": 2.265682656826568, + "grad_norm": 0.06462428507734051, + "learning_rate": 6.4374686832320944e-06, + "loss": 0.0142, + "num_tokens": 103241692.0, + "step": 231 + }, + { + "epoch": 2.275522755227552, + "grad_norm": 0.06485952063828938, + "learning_rate": 6.409460059518482e-06, + "loss": 0.0136, + "num_tokens": 103688326.0, + "step": 232 + }, + { + "epoch": 2.2853628536285364, + "grad_norm": 0.06533997999817706, + "learning_rate": 6.381414654425261e-06, + "loss": 0.0131, + "num_tokens": 104139997.0, + "step": 233 + }, + { + "epoch": 2.2952029520295203, + "grad_norm": 0.06878268907753365, + "learning_rate": 6.353333602195414e-06, + "loss": 0.0138, + "num_tokens": 104583247.0, + "step": 234 + }, + { + "epoch": 2.305043050430504, + "grad_norm": 0.061527579151490784, + "learning_rate": 6.325218038513604e-06, + "loss": 0.0129, + "num_tokens": 105013546.0, + "step": 235 + }, + { + "epoch": 2.3148831488314885, + "grad_norm": 0.0688594189041464, + "learning_rate": 6.2970691004602425e-06, + "loss": 0.0147, + "num_tokens": 105469533.0, + "step": 236 + }, + { + "epoch": 2.3247232472324724, + "grad_norm": 0.07212293085873876, + "learning_rate": 6.26888792646551e-06, + "loss": 0.0138, + "num_tokens": 105902012.0, + "step": 237 + }, + { + "epoch": 2.3345633456334562, + "grad_norm": 0.07097729248579715, + "learning_rate": 6.240675656263303e-06, + "loss": 0.0133, + "num_tokens": 106319708.0, + "step": 238 + }, + { + "epoch": 2.34440344403444, + "grad_norm": 0.0702207231329528, + "learning_rate": 6.212433430845145e-06, + "loss": 0.0136, + "num_tokens": 106767770.0, + "step": 239 + }, + { + "epoch": 2.3542435424354244, + "grad_norm": 0.06717197740035392, + "learning_rate": 6.184162392414044e-06, + "loss": 0.0127, + "num_tokens": 107230010.0, + "step": 240 + }, + { + "epoch": 2.3640836408364083, + "grad_norm": 0.09206853570190297, + "learning_rate": 6.155863684338294e-06, + "loss": 0.0182, + "num_tokens": 107696665.0, + "step": 241 + }, + { + "epoch": 2.373923739237392, + "grad_norm": 0.07931539686074184, + "learning_rate": 6.127538451105232e-06, + "loss": 0.0156, + "num_tokens": 108145998.0, + "step": 242 + }, + { + "epoch": 2.3837638376383765, + "grad_norm": 0.0845167365221342, + "learning_rate": 6.099187838274959e-06, + "loss": 0.0304, + "num_tokens": 108605347.0, + "step": 243 + }, + { + "epoch": 2.3936039360393604, + "grad_norm": 0.8319925155014395, + "learning_rate": 6.070812992434003e-06, + "loss": 0.077, + "num_tokens": 109053120.0, + "step": 244 + }, + { + "epoch": 2.4034440344403443, + "grad_norm": 0.08254084053779843, + "learning_rate": 6.042415061148954e-06, + "loss": 0.0153, + "num_tokens": 109511574.0, + "step": 245 + }, + { + "epoch": 2.4132841328413286, + "grad_norm": 0.07621464852457635, + "learning_rate": 6.013995192920044e-06, + "loss": 0.013, + "num_tokens": 109961861.0, + "step": 246 + }, + { + "epoch": 2.4231242312423125, + "grad_norm": 0.06290755400921484, + "learning_rate": 5.985554537134702e-06, + "loss": 0.0133, + "num_tokens": 110439530.0, + "step": 247 + }, + { + "epoch": 2.4329643296432963, + "grad_norm": 0.06549923207889226, + "learning_rate": 5.957094244021071e-06, + "loss": 0.0133, + "num_tokens": 110902468.0, + "step": 248 + }, + { + "epoch": 2.4428044280442807, + "grad_norm": 0.06398296126869986, + "learning_rate": 5.928615464601497e-06, + "loss": 0.0128, + "num_tokens": 111361759.0, + "step": 249 + }, + { + "epoch": 2.4526445264452645, + "grad_norm": 0.062244715362799644, + "learning_rate": 5.900119350645956e-06, + "loss": 0.0128, + "num_tokens": 111799435.0, + "step": 250 + }, + { + "epoch": 2.4624846248462484, + "grad_norm": 0.06503161600374163, + "learning_rate": 5.871607054625497e-06, + "loss": 0.0128, + "num_tokens": 112244747.0, + "step": 251 + }, + { + "epoch": 2.4723247232472323, + "grad_norm": 0.08086590997362891, + "learning_rate": 5.8430797296656125e-06, + "loss": 0.0184, + "num_tokens": 112678903.0, + "step": 252 + }, + { + "epoch": 2.4821648216482166, + "grad_norm": 0.07239451855920867, + "learning_rate": 5.814538529499622e-06, + "loss": 0.0149, + "num_tokens": 113132832.0, + "step": 253 + }, + { + "epoch": 2.4920049200492005, + "grad_norm": 0.06030312987290577, + "learning_rate": 5.785984608421993e-06, + "loss": 0.0127, + "num_tokens": 113568429.0, + "step": 254 + }, + { + "epoch": 2.5018450184501844, + "grad_norm": 0.06349775541516244, + "learning_rate": 5.757419121241667e-06, + "loss": 0.0125, + "num_tokens": 114042240.0, + "step": 255 + }, + { + "epoch": 2.5116851168511687, + "grad_norm": 0.06952013750985335, + "learning_rate": 5.7288432232353615e-06, + "loss": 0.0204, + "num_tokens": 114496441.0, + "step": 256 + }, + { + "epoch": 2.5215252152521526, + "grad_norm": 0.0958262233433174, + "learning_rate": 5.7002580701008325e-06, + "loss": 0.0149, + "num_tokens": 114936236.0, + "step": 257 + }, + { + "epoch": 2.5313653136531364, + "grad_norm": 0.06572975411347728, + "learning_rate": 5.6716648179101445e-06, + "loss": 0.0123, + "num_tokens": 115365529.0, + "step": 258 + }, + { + "epoch": 2.5412054120541203, + "grad_norm": 0.07287254897275752, + "learning_rate": 5.64306462306291e-06, + "loss": 0.0177, + "num_tokens": 115812361.0, + "step": 259 + }, + { + "epoch": 2.5510455104551046, + "grad_norm": 0.0677506186552676, + "learning_rate": 5.614458642239534e-06, + "loss": 0.0126, + "num_tokens": 116269752.0, + "step": 260 + }, + { + "epoch": 2.5608856088560885, + "grad_norm": 0.07088790175345892, + "learning_rate": 5.585848032354411e-06, + "loss": 0.0139, + "num_tokens": 116739082.0, + "step": 261 + }, + { + "epoch": 2.570725707257073, + "grad_norm": 2.483507979054926, + "learning_rate": 5.557233950509159e-06, + "loss": 0.3298, + "num_tokens": 117236975.0, + "step": 262 + }, + { + "epoch": 2.5805658056580567, + "grad_norm": 0.6712341553033803, + "learning_rate": 5.528617553945807e-06, + "loss": 0.0131, + "num_tokens": 117701799.0, + "step": 263 + }, + { + "epoch": 2.5904059040590406, + "grad_norm": 0.070379027103792, + "learning_rate": 5.500000000000001e-06, + "loss": 0.019, + "num_tokens": 118190544.0, + "step": 264 + }, + { + "epoch": 2.6002460024600245, + "grad_norm": 0.09944926431551483, + "learning_rate": 5.4713824460541964e-06, + "loss": 0.0153, + "num_tokens": 118625146.0, + "step": 265 + }, + { + "epoch": 2.6100861008610083, + "grad_norm": 0.07370939155932825, + "learning_rate": 5.442766049490843e-06, + "loss": 0.0138, + "num_tokens": 119077739.0, + "step": 266 + }, + { + "epoch": 2.6199261992619927, + "grad_norm": 0.06555516765204612, + "learning_rate": 5.414151967645591e-06, + "loss": 0.0136, + "num_tokens": 119502701.0, + "step": 267 + }, + { + "epoch": 2.6297662976629765, + "grad_norm": 0.060577987544993946, + "learning_rate": 5.385541357760469e-06, + "loss": 0.0121, + "num_tokens": 119956823.0, + "step": 268 + }, + { + "epoch": 2.639606396063961, + "grad_norm": 0.06969958736256228, + "learning_rate": 5.35693537693709e-06, + "loss": 0.0131, + "num_tokens": 120410284.0, + "step": 269 + }, + { + "epoch": 2.6494464944649447, + "grad_norm": 0.08178808292429539, + "learning_rate": 5.3283351820898586e-06, + "loss": 0.0183, + "num_tokens": 120837514.0, + "step": 270 + }, + { + "epoch": 2.6592865928659286, + "grad_norm": 0.12228602708630738, + "learning_rate": 5.299741929899171e-06, + "loss": 0.0206, + "num_tokens": 121266377.0, + "step": 271 + }, + { + "epoch": 2.6691266912669125, + "grad_norm": 0.07647057417070459, + "learning_rate": 5.27115677676464e-06, + "loss": 0.0154, + "num_tokens": 121730907.0, + "step": 272 + }, + { + "epoch": 2.678966789667897, + "grad_norm": 0.07263570161343703, + "learning_rate": 5.242580878758334e-06, + "loss": 0.0138, + "num_tokens": 122162564.0, + "step": 273 + }, + { + "epoch": 2.6888068880688807, + "grad_norm": 0.07390794347850005, + "learning_rate": 5.21401539157801e-06, + "loss": 0.0131, + "num_tokens": 122644233.0, + "step": 274 + }, + { + "epoch": 2.6986469864698646, + "grad_norm": 0.05624120433704004, + "learning_rate": 5.1854614705003796e-06, + "loss": 0.0114, + "num_tokens": 123070674.0, + "step": 275 + }, + { + "epoch": 2.708487084870849, + "grad_norm": 0.07371873132309133, + "learning_rate": 5.156920270334389e-06, + "loss": 0.0194, + "num_tokens": 123517476.0, + "step": 276 + }, + { + "epoch": 2.7183271832718328, + "grad_norm": 0.06758978472435712, + "learning_rate": 5.1283929453745055e-06, + "loss": 0.0129, + "num_tokens": 123957650.0, + "step": 277 + }, + { + "epoch": 2.7281672816728166, + "grad_norm": 0.06857276382476074, + "learning_rate": 5.099880649354044e-06, + "loss": 0.0125, + "num_tokens": 124423561.0, + "step": 278 + }, + { + "epoch": 2.7380073800738005, + "grad_norm": 0.06198166285648246, + "learning_rate": 5.071384535398505e-06, + "loss": 0.0119, + "num_tokens": 124871204.0, + "step": 279 + }, + { + "epoch": 2.747847478474785, + "grad_norm": 0.05801997208341688, + "learning_rate": 5.04290575597893e-06, + "loss": 0.0119, + "num_tokens": 125320936.0, + "step": 280 + }, + { + "epoch": 2.7576875768757687, + "grad_norm": 0.09983800531852628, + "learning_rate": 5.0144454628653015e-06, + "loss": 0.0157, + "num_tokens": 125785587.0, + "step": 281 + }, + { + "epoch": 2.767527675276753, + "grad_norm": 0.05961861980322237, + "learning_rate": 4.986004807079959e-06, + "loss": 0.0119, + "num_tokens": 126223799.0, + "step": 282 + }, + { + "epoch": 2.777367773677737, + "grad_norm": 0.06887056012305312, + "learning_rate": 4.957584938851048e-06, + "loss": 0.0127, + "num_tokens": 126674560.0, + "step": 283 + }, + { + "epoch": 2.787207872078721, + "grad_norm": 0.06432285678662777, + "learning_rate": 4.929187007565996e-06, + "loss": 0.0124, + "num_tokens": 127121758.0, + "step": 284 + }, + { + "epoch": 2.7970479704797047, + "grad_norm": 0.06283306903955838, + "learning_rate": 4.9008121617250425e-06, + "loss": 0.0122, + "num_tokens": 127564319.0, + "step": 285 + }, + { + "epoch": 2.8068880688806885, + "grad_norm": 0.07395862495517919, + "learning_rate": 4.87246154889477e-06, + "loss": 0.0125, + "num_tokens": 128014723.0, + "step": 286 + }, + { + "epoch": 2.816728167281673, + "grad_norm": 0.06772968868173306, + "learning_rate": 4.8441363156617085e-06, + "loss": 0.026, + "num_tokens": 128456573.0, + "step": 287 + }, + { + "epoch": 2.8265682656826567, + "grad_norm": 0.2058477599150272, + "learning_rate": 4.815837607585957e-06, + "loss": 0.0313, + "num_tokens": 128888085.0, + "step": 288 + }, + { + "epoch": 2.836408364083641, + "grad_norm": 0.05983028509302605, + "learning_rate": 4.787566569154855e-06, + "loss": 0.0136, + "num_tokens": 129344186.0, + "step": 289 + }, + { + "epoch": 2.846248462484625, + "grad_norm": 0.1679165256737002, + "learning_rate": 4.759324343736698e-06, + "loss": 0.0268, + "num_tokens": 129820337.0, + "step": 290 + }, + { + "epoch": 2.856088560885609, + "grad_norm": 0.069693981729958, + "learning_rate": 4.731112073534491e-06, + "loss": 0.012, + "num_tokens": 130264132.0, + "step": 291 + }, + { + "epoch": 2.8659286592865927, + "grad_norm": 0.05673801969192786, + "learning_rate": 4.70293089953976e-06, + "loss": 0.237, + "num_tokens": 130747367.0, + "step": 292 + }, + { + "epoch": 2.875768757687577, + "grad_norm": 0.9244716369700087, + "learning_rate": 4.674781961486399e-06, + "loss": 0.0129, + "num_tokens": 131189544.0, + "step": 293 + }, + { + "epoch": 2.885608856088561, + "grad_norm": 0.0670539720853974, + "learning_rate": 4.646666397804586e-06, + "loss": 0.0127, + "num_tokens": 131615817.0, + "step": 294 + }, + { + "epoch": 2.8954489544895448, + "grad_norm": 0.07778029323101539, + "learning_rate": 4.618585345574741e-06, + "loss": 0.0136, + "num_tokens": 132065833.0, + "step": 295 + }, + { + "epoch": 2.905289052890529, + "grad_norm": 0.06633645417900966, + "learning_rate": 4.5905399404815196e-06, + "loss": 0.0119, + "num_tokens": 132513181.0, + "step": 296 + }, + { + "epoch": 2.915129151291513, + "grad_norm": 0.06604742202311176, + "learning_rate": 4.562531316767908e-06, + "loss": 0.0178, + "num_tokens": 132975979.0, + "step": 297 + }, + { + "epoch": 2.924969249692497, + "grad_norm": 0.06375772945002761, + "learning_rate": 4.534560607189338e-06, + "loss": 0.0121, + "num_tokens": 133411946.0, + "step": 298 + }, + { + "epoch": 2.9348093480934807, + "grad_norm": 0.0644873715390372, + "learning_rate": 4.506628942967874e-06, + "loss": 0.0226, + "num_tokens": 133882037.0, + "step": 299 + }, + { + "epoch": 2.944649446494465, + "grad_norm": 0.06122403707300358, + "learning_rate": 4.478737453746464e-06, + "loss": 0.0111, + "num_tokens": 134338580.0, + "step": 300 + }, + { + "epoch": 2.954489544895449, + "grad_norm": 0.06192995198797032, + "learning_rate": 4.450887267543261e-06, + "loss": 0.023, + "num_tokens": 134806429.0, + "step": 301 + }, + { + "epoch": 2.9643296432964332, + "grad_norm": 0.06577423487360488, + "learning_rate": 4.423079510705992e-06, + "loss": 0.0127, + "num_tokens": 135253050.0, + "step": 302 + }, + { + "epoch": 2.974169741697417, + "grad_norm": 0.061821762890230156, + "learning_rate": 4.395315307866404e-06, + "loss": 0.0118, + "num_tokens": 135701900.0, + "step": 303 + }, + { + "epoch": 2.984009840098401, + "grad_norm": 0.060295397517859534, + "learning_rate": 4.3675957818947965e-06, + "loss": 0.0112, + "num_tokens": 136134539.0, + "step": 304 + }, + { + "epoch": 2.993849938499385, + "grad_norm": 0.06204359834906306, + "learning_rate": 4.33992205385459e-06, + "loss": 0.0119, + "num_tokens": 136581981.0, + "step": 305 + }, + { + "epoch": 3.0, + "grad_norm": 0.06204359834906306, + "learning_rate": 4.312295242956998e-06, + "loss": 0.0109, + "num_tokens": 136774441.0, + "step": 306 + }, + { + "epoch": 3.0, + "eval_loss": 0.0963606908917427, + "eval_num_tokens": 136774441.0, + "eval_runtime": 53.9214, + "eval_samples_per_second": 41.653, + "eval_steps_per_second": 5.211, + "step": 306 + }, + { + "epoch": 3.009840098400984, + "grad_norm": 0.08266586517900253, + "learning_rate": 4.284716466515759e-06, + "loss": 0.0218, + "num_tokens": 137235846.0, + "step": 307 + }, + { + "epoch": 3.019680196801968, + "grad_norm": 0.06025259361613064, + "learning_rate": 4.257186839901948e-06, + "loss": 0.01, + "num_tokens": 137676575.0, + "step": 308 + }, + { + "epoch": 3.029520295202952, + "grad_norm": 0.059520087712568295, + "learning_rate": 4.229707476498871e-06, + "loss": 0.0107, + "num_tokens": 138127277.0, + "step": 309 + }, + { + "epoch": 3.039360393603936, + "grad_norm": 0.060007105121960225, + "learning_rate": 4.2022794876570335e-06, + "loss": 0.0099, + "num_tokens": 138558346.0, + "step": 310 + }, + { + "epoch": 3.0492004920049203, + "grad_norm": 0.05765555936281279, + "learning_rate": 4.1749039826491956e-06, + "loss": 0.2021, + "num_tokens": 139029117.0, + "step": 311 + }, + { + "epoch": 3.059040590405904, + "grad_norm": 0.25549047851203505, + "learning_rate": 4.1475820686255055e-06, + "loss": 0.01, + "num_tokens": 139465608.0, + "step": 312 + }, + { + "epoch": 3.068880688806888, + "grad_norm": 0.05745397404349778, + "learning_rate": 4.120314850568731e-06, + "loss": 0.0291, + "num_tokens": 139932040.0, + "step": 313 + }, + { + "epoch": 3.078720787207872, + "grad_norm": 0.21571060654935606, + "learning_rate": 4.093103431249563e-06, + "loss": 0.011, + "num_tokens": 140393810.0, + "step": 314 + }, + { + "epoch": 3.088560885608856, + "grad_norm": 0.06271676867820344, + "learning_rate": 4.065948911182015e-06, + "loss": 0.018, + "num_tokens": 140853306.0, + "step": 315 + }, + { + "epoch": 3.09840098400984, + "grad_norm": 0.06529992912597996, + "learning_rate": 4.038852388578925e-06, + "loss": 0.0102, + "num_tokens": 141293974.0, + "step": 316 + }, + { + "epoch": 3.108241082410824, + "grad_norm": 0.0613594667302306, + "learning_rate": 4.011814959307533e-06, + "loss": 0.0101, + "num_tokens": 141739396.0, + "step": 317 + }, + { + "epoch": 3.1180811808118083, + "grad_norm": 0.06143281774280475, + "learning_rate": 3.984837716845157e-06, + "loss": 0.0098, + "num_tokens": 142181417.0, + "step": 318 + }, + { + "epoch": 3.127921279212792, + "grad_norm": 0.06065540767441434, + "learning_rate": 3.957921752234982e-06, + "loss": 0.0095, + "num_tokens": 142615273.0, + "step": 319 + }, + { + "epoch": 3.137761377613776, + "grad_norm": 0.0565367496699821, + "learning_rate": 3.931068154041919e-06, + "loss": 0.0156, + "num_tokens": 143066695.0, + "step": 320 + }, + { + "epoch": 3.14760147601476, + "grad_norm": 0.0928817994214938, + "learning_rate": 3.904278008308589e-06, + "loss": 0.0093, + "num_tokens": 143543314.0, + "step": 321 + }, + { + "epoch": 3.1574415744157442, + "grad_norm": 0.05348206917431186, + "learning_rate": 3.877552398511409e-06, + "loss": 0.0102, + "num_tokens": 143978640.0, + "step": 322 + }, + { + "epoch": 3.167281672816728, + "grad_norm": 0.05744861837720995, + "learning_rate": 3.85089240551675e-06, + "loss": 0.0096, + "num_tokens": 144437143.0, + "step": 323 + }, + { + "epoch": 3.177121771217712, + "grad_norm": 0.05917730480215664, + "learning_rate": 3.8242991075372436e-06, + "loss": 0.0103, + "num_tokens": 144882614.0, + "step": 324 + }, + { + "epoch": 3.1869618696186963, + "grad_norm": 0.06138753989215512, + "learning_rate": 3.7977735800881687e-06, + "loss": 0.01, + "num_tokens": 145336615.0, + "step": 325 + }, + { + "epoch": 3.19680196801968, + "grad_norm": 0.057934477141044834, + "learning_rate": 3.7713168959439515e-06, + "loss": 0.0097, + "num_tokens": 145791703.0, + "step": 326 + }, + { + "epoch": 3.206642066420664, + "grad_norm": 0.062311400511582536, + "learning_rate": 3.74493012509478e-06, + "loss": 0.0163, + "num_tokens": 146256588.0, + "step": 327 + }, + { + "epoch": 3.2164821648216484, + "grad_norm": 0.11046706497961999, + "learning_rate": 3.718614334703339e-06, + "loss": 0.0096, + "num_tokens": 146704790.0, + "step": 328 + }, + { + "epoch": 3.2263222632226323, + "grad_norm": 0.06040935915809342, + "learning_rate": 3.692370589061639e-06, + "loss": 0.0161, + "num_tokens": 147150851.0, + "step": 329 + }, + { + "epoch": 3.236162361623616, + "grad_norm": 0.06309596528426079, + "learning_rate": 3.6661999495479772e-06, + "loss": 0.0116, + "num_tokens": 147586533.0, + "step": 330 + }, + { + "epoch": 3.2460024600246005, + "grad_norm": 0.0775947611650109, + "learning_rate": 3.640103474584016e-06, + "loss": 0.0102, + "num_tokens": 148012817.0, + "step": 331 + }, + { + "epoch": 3.2558425584255843, + "grad_norm": 0.060442066581616015, + "learning_rate": 3.614082219591972e-06, + "loss": 0.0094, + "num_tokens": 148454349.0, + "step": 332 + }, + { + "epoch": 3.265682656826568, + "grad_norm": 0.0599277899760194, + "learning_rate": 3.588137236951934e-06, + "loss": 0.0096, + "num_tokens": 148908837.0, + "step": 333 + }, + { + "epoch": 3.275522755227552, + "grad_norm": 0.06389649266611047, + "learning_rate": 3.5622695759592996e-06, + "loss": 0.0091, + "num_tokens": 149387409.0, + "step": 334 + }, + { + "epoch": 3.2853628536285364, + "grad_norm": 0.059031876557593344, + "learning_rate": 3.5364802827823397e-06, + "loss": 0.0124, + "num_tokens": 149842184.0, + "step": 335 + }, + { + "epoch": 3.2952029520295203, + "grad_norm": 0.06425762134540147, + "learning_rate": 3.5107704004198904e-06, + "loss": 0.0096, + "num_tokens": 150294624.0, + "step": 336 + }, + { + "epoch": 3.305043050430504, + "grad_norm": 0.060359900802863305, + "learning_rate": 3.485140968659166e-06, + "loss": 0.0156, + "num_tokens": 150757952.0, + "step": 337 + }, + { + "epoch": 3.3148831488314885, + "grad_norm": 0.06451910432321761, + "learning_rate": 3.4595930240337115e-06, + "loss": 0.0093, + "num_tokens": 151210941.0, + "step": 338 + }, + { + "epoch": 3.3247232472324724, + "grad_norm": 0.05771756769585445, + "learning_rate": 3.4341275997814795e-06, + "loss": 0.0311, + "num_tokens": 151659703.0, + "step": 339 + }, + { + "epoch": 3.3345633456334562, + "grad_norm": 0.2709101034464869, + "learning_rate": 3.408745725803042e-06, + "loss": 0.0198, + "num_tokens": 152096656.0, + "step": 340 + }, + { + "epoch": 3.34440344403444, + "grad_norm": 0.2165805542100797, + "learning_rate": 3.383448428619941e-06, + "loss": 0.0109, + "num_tokens": 152535937.0, + "step": 341 + }, + { + "epoch": 3.3542435424354244, + "grad_norm": 0.06249104678860667, + "learning_rate": 3.3582367313331692e-06, + "loss": 0.0241, + "num_tokens": 153012481.0, + "step": 342 + }, + { + "epoch": 3.3640836408364083, + "grad_norm": 0.07444091538512662, + "learning_rate": 3.3331116535817974e-06, + "loss": 0.0096, + "num_tokens": 153457239.0, + "step": 343 + }, + { + "epoch": 3.373923739237392, + "grad_norm": 0.05744783875540723, + "learning_rate": 3.308074211501732e-06, + "loss": 0.0112, + "num_tokens": 153885310.0, + "step": 344 + }, + { + "epoch": 3.3837638376383765, + "grad_norm": 0.062108203142145886, + "learning_rate": 3.2831254176846205e-06, + "loss": 0.0102, + "num_tokens": 154315565.0, + "step": 345 + }, + { + "epoch": 3.3936039360393604, + "grad_norm": 0.06493988486024563, + "learning_rate": 3.258266281136905e-06, + "loss": 0.0154, + "num_tokens": 154761237.0, + "step": 346 + }, + { + "epoch": 3.4034440344403443, + "grad_norm": 0.07703452506780802, + "learning_rate": 3.233497807239008e-06, + "loss": 0.0149, + "num_tokens": 155219079.0, + "step": 347 + }, + { + "epoch": 3.4132841328413286, + "grad_norm": 0.07716474025857703, + "learning_rate": 3.2088209977046657e-06, + "loss": 0.0099, + "num_tokens": 155672847.0, + "step": 348 + }, + { + "epoch": 3.4231242312423125, + "grad_norm": 0.0598011605849924, + "learning_rate": 3.1842368505404388e-06, + "loss": 0.0097, + "num_tokens": 156097592.0, + "step": 349 + }, + { + "epoch": 3.4329643296432963, + "grad_norm": 0.06067024127693304, + "learning_rate": 3.1597463600053258e-06, + "loss": 0.0097, + "num_tokens": 156543931.0, + "step": 350 + }, + { + "epoch": 3.4428044280442807, + "grad_norm": 0.06276348610439125, + "learning_rate": 3.135350516570559e-06, + "loss": 0.0115, + "num_tokens": 156993093.0, + "step": 351 + }, + { + "epoch": 3.4526445264452645, + "grad_norm": 0.07056305058653452, + "learning_rate": 3.111050306879556e-06, + "loss": 0.0161, + "num_tokens": 157435895.0, + "step": 352 + }, + { + "epoch": 3.4624846248462484, + "grad_norm": 0.0692853066303934, + "learning_rate": 3.0868467137080075e-06, + "loss": 0.0124, + "num_tokens": 157859703.0, + "step": 353 + }, + { + "epoch": 3.4723247232472323, + "grad_norm": 0.06622059827297899, + "learning_rate": 3.0627407159241273e-06, + "loss": 0.0098, + "num_tokens": 158319159.0, + "step": 354 + }, + { + "epoch": 3.4821648216482166, + "grad_norm": 0.06424105970441871, + "learning_rate": 3.0387332884490806e-06, + "loss": 0.0105, + "num_tokens": 158768974.0, + "step": 355 + }, + { + "epoch": 3.4920049200492005, + "grad_norm": 0.06970655480927966, + "learning_rate": 3.014825402217533e-06, + "loss": 0.0099, + "num_tokens": 159221319.0, + "step": 356 + }, + { + "epoch": 3.5018450184501844, + "grad_norm": 0.06231852234082556, + "learning_rate": 2.9910180241384014e-06, + "loss": 0.0099, + "num_tokens": 159657431.0, + "step": 357 + }, + { + "epoch": 3.5116851168511687, + "grad_norm": 0.06403174372575768, + "learning_rate": 2.9673121170557396e-06, + "loss": 0.0099, + "num_tokens": 160091184.0, + "step": 358 + }, + { + "epoch": 3.5215252152521526, + "grad_norm": 0.06050506427522611, + "learning_rate": 2.9437086397097996e-06, + "loss": 0.0095, + "num_tokens": 160538104.0, + "step": 359 + }, + { + "epoch": 3.5313653136531364, + "grad_norm": 0.05914580967848918, + "learning_rate": 2.92020854669826e-06, + "loss": 0.0151, + "num_tokens": 160984800.0, + "step": 360 + }, + { + "epoch": 3.5412054120541203, + "grad_norm": 0.06615551474859403, + "learning_rate": 2.896812788437615e-06, + "loss": 0.0102, + "num_tokens": 161437908.0, + "step": 361 + }, + { + "epoch": 3.5510455104551046, + "grad_norm": 0.05688142632929498, + "learning_rate": 2.8735223111247402e-06, + "loss": 0.0094, + "num_tokens": 161900209.0, + "step": 362 + }, + { + "epoch": 3.5608856088560885, + "grad_norm": 0.05805719882416427, + "learning_rate": 2.850338056698621e-06, + "loss": 0.0094, + "num_tokens": 162381378.0, + "step": 363 + }, + { + "epoch": 3.570725707257073, + "grad_norm": 0.05665394777981862, + "learning_rate": 2.827260962802263e-06, + "loss": 0.0089, + "num_tokens": 162818401.0, + "step": 364 + }, + { + "epoch": 3.5805658056580567, + "grad_norm": 0.058540688861597474, + "learning_rate": 2.804291962744768e-06, + "loss": 0.0102, + "num_tokens": 163261663.0, + "step": 365 + }, + { + "epoch": 3.5904059040590406, + "grad_norm": 0.06068364561780823, + "learning_rate": 2.7814319854635875e-06, + "loss": 0.0096, + "num_tokens": 163706510.0, + "step": 366 + }, + { + "epoch": 3.6002460024600245, + "grad_norm": 0.0593859542792967, + "learning_rate": 2.758681955486955e-06, + "loss": 0.0097, + "num_tokens": 164145145.0, + "step": 367 + }, + { + "epoch": 3.6100861008610083, + "grad_norm": 0.059439587082302694, + "learning_rate": 2.736042792896495e-06, + "loss": 0.0104, + "num_tokens": 164588218.0, + "step": 368 + }, + { + "epoch": 3.6199261992619927, + "grad_norm": 0.06426940128348262, + "learning_rate": 2.7135154132900133e-06, + "loss": 0.0203, + "num_tokens": 165039642.0, + "step": 369 + }, + { + "epoch": 3.6297662976629765, + "grad_norm": 0.059031373381084176, + "learning_rate": 2.691100727744458e-06, + "loss": 0.0091, + "num_tokens": 165502439.0, + "step": 370 + }, + { + "epoch": 3.639606396063961, + "grad_norm": 0.05706397506461239, + "learning_rate": 2.668799642779093e-06, + "loss": 0.0106, + "num_tokens": 165957611.0, + "step": 371 + }, + { + "epoch": 3.6494464944649447, + "grad_norm": 0.06337690848780857, + "learning_rate": 2.6466130603188157e-06, + "loss": 0.01, + "num_tokens": 166404741.0, + "step": 372 + }, + { + "epoch": 3.6592865928659286, + "grad_norm": 0.057865704503962175, + "learning_rate": 2.624541877657685e-06, + "loss": 0.1951, + "num_tokens": 166908892.0, + "step": 373 + }, + { + "epoch": 3.6691266912669125, + "grad_norm": 0.6748913551790232, + "learning_rate": 2.602586987422643e-06, + "loss": 0.0094, + "num_tokens": 167346017.0, + "step": 374 + }, + { + "epoch": 3.678966789667897, + "grad_norm": 0.06271310429727074, + "learning_rate": 2.580749277537399e-06, + "loss": 0.0093, + "num_tokens": 167795779.0, + "step": 375 + }, + { + "epoch": 3.6888068880688807, + "grad_norm": 0.05728241738284472, + "learning_rate": 2.5590296311865294e-06, + "loss": 0.0092, + "num_tokens": 168246613.0, + "step": 376 + }, + { + "epoch": 3.6986469864698646, + "grad_norm": 0.05730319671770116, + "learning_rate": 2.537428926779758e-06, + "loss": 0.0104, + "num_tokens": 168703193.0, + "step": 377 + }, + { + "epoch": 3.708487084870849, + "grad_norm": 0.061789009881383514, + "learning_rate": 2.515948037916423e-06, + "loss": 0.0104, + "num_tokens": 169166239.0, + "step": 378 + }, + { + "epoch": 3.7183271832718328, + "grad_norm": 0.05958784070544453, + "learning_rate": 2.494587833350153e-06, + "loss": 0.0564, + "num_tokens": 169618415.0, + "step": 379 + }, + { + "epoch": 3.7281672816728166, + "grad_norm": 0.22039415728368103, + "learning_rate": 2.473349176953736e-06, + "loss": 0.0094, + "num_tokens": 170079318.0, + "step": 380 + }, + { + "epoch": 3.7380073800738005, + "grad_norm": 0.05930397129828618, + "learning_rate": 2.4522329276841664e-06, + "loss": 0.0198, + "num_tokens": 170524571.0, + "step": 381 + }, + { + "epoch": 3.747847478474785, + "grad_norm": 0.06047568038440854, + "learning_rate": 2.431239939547921e-06, + "loss": 0.0094, + "num_tokens": 170983016.0, + "step": 382 + }, + { + "epoch": 3.7576875768757687, + "grad_norm": 0.061680315681806853, + "learning_rate": 2.4103710615664145e-06, + "loss": 0.0089, + "num_tokens": 171426486.0, + "step": 383 + }, + { + "epoch": 3.767527675276753, + "grad_norm": 0.05588539351574886, + "learning_rate": 2.389627137741662e-06, + "loss": 0.0094, + "num_tokens": 171871834.0, + "step": 384 + }, + { + "epoch": 3.777367773677737, + "grad_norm": 0.061780123368904795, + "learning_rate": 2.369009007022146e-06, + "loss": 0.0093, + "num_tokens": 172337523.0, + "step": 385 + }, + { + "epoch": 3.787207872078721, + "grad_norm": 0.05632561272908436, + "learning_rate": 2.3485175032688865e-06, + "loss": 0.0088, + "num_tokens": 172775826.0, + "step": 386 + }, + { + "epoch": 3.7970479704797047, + "grad_norm": 0.058782272770165275, + "learning_rate": 2.328153455221717e-06, + "loss": 0.0095, + "num_tokens": 173234709.0, + "step": 387 + }, + { + "epoch": 3.8068880688806885, + "grad_norm": 0.057526356469471435, + "learning_rate": 2.3079176864657673e-06, + "loss": 0.0097, + "num_tokens": 173700055.0, + "step": 388 + }, + { + "epoch": 3.816728167281673, + "grad_norm": 0.06609619441495819, + "learning_rate": 2.2878110153981565e-06, + "loss": 0.0111, + "num_tokens": 174147961.0, + "step": 389 + }, + { + "epoch": 3.8265682656826567, + "grad_norm": 0.06703233332357492, + "learning_rate": 2.267834255194894e-06, + "loss": 0.0116, + "num_tokens": 174586991.0, + "step": 390 + }, + { + "epoch": 3.836408364083641, + "grad_norm": 0.06522848493729735, + "learning_rate": 2.2479882137779903e-06, + "loss": 0.0106, + "num_tokens": 175006875.0, + "step": 391 + }, + { + "epoch": 3.846248462484625, + "grad_norm": 0.06306752932488521, + "learning_rate": 2.228273693782784e-06, + "loss": 0.0094, + "num_tokens": 175451007.0, + "step": 392 + }, + { + "epoch": 3.856088560885609, + "grad_norm": 0.062263756072231294, + "learning_rate": 2.208691492525481e-06, + "loss": 0.0135, + "num_tokens": 175896902.0, + "step": 393 + }, + { + "epoch": 3.8659286592865927, + "grad_norm": 0.06835430681220003, + "learning_rate": 2.189242401970908e-06, + "loss": 0.0092, + "num_tokens": 176346616.0, + "step": 394 + }, + { + "epoch": 3.875768757687577, + "grad_norm": 0.05728313379563115, + "learning_rate": 2.169927208700482e-06, + "loss": 0.0098, + "num_tokens": 176802124.0, + "step": 395 + }, + { + "epoch": 3.885608856088561, + "grad_norm": 0.06299115193931754, + "learning_rate": 2.1507466938804013e-06, + "loss": 0.0089, + "num_tokens": 177233961.0, + "step": 396 + }, + { + "epoch": 3.8954489544895448, + "grad_norm": 0.060076198285498296, + "learning_rate": 2.131701633230045e-06, + "loss": 0.0098, + "num_tokens": 177684662.0, + "step": 397 + }, + { + "epoch": 3.905289052890529, + "grad_norm": 0.06517531508961912, + "learning_rate": 2.112792796990616e-06, + "loss": 0.0095, + "num_tokens": 178123825.0, + "step": 398 + }, + { + "epoch": 3.915129151291513, + "grad_norm": 0.05863263973572925, + "learning_rate": 2.0940209498939732e-06, + "loss": 0.009, + "num_tokens": 178562641.0, + "step": 399 + }, + { + "epoch": 3.924969249692497, + "grad_norm": 0.05798991563312477, + "learning_rate": 2.075386851131711e-06, + "loss": 0.0094, + "num_tokens": 179007017.0, + "step": 400 + }, + { + "epoch": 3.9348093480934807, + "grad_norm": 0.06118488260559937, + "learning_rate": 2.056891254324459e-06, + "loss": 0.0095, + "num_tokens": 179449125.0, + "step": 401 + }, + { + "epoch": 3.944649446494465, + "grad_norm": 0.06403534407994695, + "learning_rate": 2.038534907491396e-06, + "loss": 0.009, + "num_tokens": 179887646.0, + "step": 402 + }, + { + "epoch": 3.954489544895449, + "grad_norm": 0.08058699039926022, + "learning_rate": 2.0203185530199983e-06, + "loss": 0.0138, + "num_tokens": 180341944.0, + "step": 403 + }, + { + "epoch": 3.9643296432964332, + "grad_norm": 0.056026267406971995, + "learning_rate": 2.0022429276360256e-06, + "loss": 0.0097, + "num_tokens": 180787775.0, + "step": 404 + }, + { + "epoch": 3.974169741697417, + "grad_norm": 0.058787256460149456, + "learning_rate": 1.9843087623737097e-06, + "loss": 0.0088, + "num_tokens": 181276015.0, + "step": 405 + }, + { + "epoch": 3.984009840098401, + "grad_norm": 0.054638072869340186, + "learning_rate": 1.966516782546199e-06, + "loss": 0.009, + "num_tokens": 181724759.0, + "step": 406 + }, + { + "epoch": 3.993849938499385, + "grad_norm": 0.05931097745374889, + "learning_rate": 1.94886770771623e-06, + "loss": 0.0098, + "num_tokens": 182165821.0, + "step": 407 + }, + { + "epoch": 4.0, + "grad_norm": 0.06697953375930626, + "learning_rate": 1.931362251667008e-06, + "loss": 0.027, + "num_tokens": 182364260.0, + "step": 408 + }, + { + "epoch": 4.0, + "eval_loss": 0.1028980016708374, + "eval_num_tokens": 182364260.0, + "eval_runtime": 53.8919, + "eval_samples_per_second": 41.676, + "eval_steps_per_second": 5.214, + "step": 408 + }, + { + "epoch": 4.009840098400984, + "grad_norm": 0.07632643003764507, + "learning_rate": 1.9140011223733576e-06, + "loss": 0.0082, + "num_tokens": 182806025.0, + "step": 409 + }, + { + "epoch": 4.019680196801968, + "grad_norm": 0.05437436276939388, + "learning_rate": 1.8967850219730799e-06, + "loss": 0.0081, + "num_tokens": 183278654.0, + "step": 410 + }, + { + "epoch": 4.029520295202952, + "grad_norm": 0.05114318878211908, + "learning_rate": 1.8797146467385604e-06, + "loss": 0.0076, + "num_tokens": 183720645.0, + "step": 411 + }, + { + "epoch": 4.039360393603936, + "grad_norm": 0.053465044974803935, + "learning_rate": 1.8627906870486063e-06, + "loss": 0.0082, + "num_tokens": 184191637.0, + "step": 412 + }, + { + "epoch": 4.04920049200492, + "grad_norm": 0.054542981072468875, + "learning_rate": 1.8460138273605265e-06, + "loss": 0.008, + "num_tokens": 184634141.0, + "step": 413 + }, + { + "epoch": 4.059040590405904, + "grad_norm": 0.052414283521576004, + "learning_rate": 1.8293847461824538e-06, + "loss": 0.0079, + "num_tokens": 185081741.0, + "step": 414 + }, + { + "epoch": 4.068880688806888, + "grad_norm": 0.05289967674124652, + "learning_rate": 1.8129041160458966e-06, + "loss": 0.008, + "num_tokens": 185495440.0, + "step": 415 + }, + { + "epoch": 4.078720787207872, + "grad_norm": 0.0584668942852983, + "learning_rate": 1.7965726034785466e-06, + "loss": 0.0081, + "num_tokens": 185938291.0, + "step": 416 + }, + { + "epoch": 4.088560885608856, + "grad_norm": 0.05897150659800833, + "learning_rate": 1.780390868977318e-06, + "loss": 0.0086, + "num_tokens": 186409542.0, + "step": 417 + }, + { + "epoch": 4.0984009840098405, + "grad_norm": 0.05118034680985974, + "learning_rate": 1.7643595669816378e-06, + "loss": 0.0077, + "num_tokens": 186852482.0, + "step": 418 + }, + { + "epoch": 4.108241082410824, + "grad_norm": 0.05911903344070817, + "learning_rate": 1.7484793458469745e-06, + "loss": 0.0081, + "num_tokens": 187306570.0, + "step": 419 + }, + { + "epoch": 4.118081180811808, + "grad_norm": 0.058617479568280846, + "learning_rate": 1.7327508478186216e-06, + "loss": 0.0075, + "num_tokens": 187738802.0, + "step": 420 + }, + { + "epoch": 4.127921279212792, + "grad_norm": 0.05743950460862962, + "learning_rate": 1.7171747090057201e-06, + "loss": 0.0081, + "num_tokens": 188188275.0, + "step": 421 + }, + { + "epoch": 4.137761377613776, + "grad_norm": 0.0578427653677817, + "learning_rate": 1.7017515593555295e-06, + "loss": 0.008, + "num_tokens": 188626310.0, + "step": 422 + }, + { + "epoch": 4.14760147601476, + "grad_norm": 0.055381917249045204, + "learning_rate": 1.6864820226279607e-06, + "loss": 0.0079, + "num_tokens": 189058824.0, + "step": 423 + }, + { + "epoch": 4.157441574415744, + "grad_norm": 0.0566904301682134, + "learning_rate": 1.6713667163703348e-06, + "loss": 0.008, + "num_tokens": 189488025.0, + "step": 424 + }, + { + "epoch": 4.167281672816729, + "grad_norm": 0.0591657691393218, + "learning_rate": 1.6564062518924202e-06, + "loss": 0.0093, + "num_tokens": 189949176.0, + "step": 425 + }, + { + "epoch": 4.177121771217712, + "grad_norm": 0.058609260537066755, + "learning_rate": 1.6416012342417056e-06, + "loss": 0.0075, + "num_tokens": 190405187.0, + "step": 426 + }, + { + "epoch": 4.186961869618696, + "grad_norm": 0.05376660491247955, + "learning_rate": 1.6269522621789246e-06, + "loss": 0.0094, + "num_tokens": 190839466.0, + "step": 427 + }, + { + "epoch": 4.19680196801968, + "grad_norm": 0.062048025442225076, + "learning_rate": 1.6124599281538452e-06, + "loss": 0.02, + "num_tokens": 191280153.0, + "step": 428 + }, + { + "epoch": 4.206642066420664, + "grad_norm": 0.06071173185238267, + "learning_rate": 1.5981248182813136e-06, + "loss": 0.0073, + "num_tokens": 191734314.0, + "step": 429 + }, + { + "epoch": 4.216482164821648, + "grad_norm": 0.05301725414979279, + "learning_rate": 1.583947512317537e-06, + "loss": 0.0117, + "num_tokens": 192202492.0, + "step": 430 + }, + { + "epoch": 4.226322263222632, + "grad_norm": 0.06832062526218917, + "learning_rate": 1.5699285836366488e-06, + "loss": 0.0093, + "num_tokens": 192667915.0, + "step": 431 + }, + { + "epoch": 4.236162361623617, + "grad_norm": 0.05748762603533909, + "learning_rate": 1.5560685992075141e-06, + "loss": 0.0078, + "num_tokens": 193136794.0, + "step": 432 + }, + { + "epoch": 4.2460024600246005, + "grad_norm": 0.0737572203685775, + "learning_rate": 1.5423681195707997e-06, + "loss": 0.0073, + "num_tokens": 193598491.0, + "step": 433 + }, + { + "epoch": 4.255842558425584, + "grad_norm": 0.05225082250599676, + "learning_rate": 1.528827698816306e-06, + "loss": 0.0077, + "num_tokens": 194023980.0, + "step": 434 + }, + { + "epoch": 4.265682656826568, + "grad_norm": 0.05296466266803098, + "learning_rate": 1.515447884560556e-06, + "loss": 0.0074, + "num_tokens": 194481167.0, + "step": 435 + }, + { + "epoch": 4.275522755227552, + "grad_norm": 0.05336380722303185, + "learning_rate": 1.502229217924649e-06, + "loss": 0.0075, + "num_tokens": 194915312.0, + "step": 436 + }, + { + "epoch": 4.285362853628536, + "grad_norm": 0.05458180686808586, + "learning_rate": 1.489172233512376e-06, + "loss": 0.0076, + "num_tokens": 195368266.0, + "step": 437 + }, + { + "epoch": 4.29520295202952, + "grad_norm": 0.05542603913086383, + "learning_rate": 1.4762774593885986e-06, + "loss": 0.0081, + "num_tokens": 195810914.0, + "step": 438 + }, + { + "epoch": 4.305043050430505, + "grad_norm": 0.054344537083576325, + "learning_rate": 1.4635454170578917e-06, + "loss": 0.0072, + "num_tokens": 196263940.0, + "step": 439 + }, + { + "epoch": 4.3148831488314885, + "grad_norm": 0.052701156778993646, + "learning_rate": 1.4509766214434535e-06, + "loss": 0.0077, + "num_tokens": 196718774.0, + "step": 440 + }, + { + "epoch": 4.324723247232472, + "grad_norm": 0.05423178707270067, + "learning_rate": 1.4385715808662787e-06, + "loss": 0.008, + "num_tokens": 197161519.0, + "step": 441 + }, + { + "epoch": 4.334563345633456, + "grad_norm": 0.055354896441224044, + "learning_rate": 1.4263307970246027e-06, + "loss": 0.008, + "num_tokens": 197621081.0, + "step": 442 + }, + { + "epoch": 4.34440344403444, + "grad_norm": 0.05816305513011695, + "learning_rate": 1.41425476497361e-06, + "loss": 0.0078, + "num_tokens": 198087857.0, + "step": 443 + }, + { + "epoch": 4.354243542435424, + "grad_norm": 0.05127845466920968, + "learning_rate": 1.4023439731054112e-06, + "loss": 0.0077, + "num_tokens": 198533672.0, + "step": 444 + }, + { + "epoch": 4.364083640836409, + "grad_norm": 0.07067731738580797, + "learning_rate": 1.390598903129296e-06, + "loss": 0.0322, + "num_tokens": 199022227.0, + "step": 445 + }, + { + "epoch": 4.373923739237393, + "grad_norm": 0.05511218194004341, + "learning_rate": 1.3790200300522413e-06, + "loss": 0.0077, + "num_tokens": 199462215.0, + "step": 446 + }, + { + "epoch": 4.3837638376383765, + "grad_norm": 0.05735730379081794, + "learning_rate": 1.3676078221597157e-06, + "loss": 0.0074, + "num_tokens": 199907231.0, + "step": 447 + }, + { + "epoch": 4.39360393603936, + "grad_norm": 0.05442936039834661, + "learning_rate": 1.3563627409967257e-06, + "loss": 0.1955, + "num_tokens": 200376904.0, + "step": 448 + }, + { + "epoch": 4.403444034440344, + "grad_norm": 0.5930661652942222, + "learning_rate": 1.3452852413491563e-06, + "loss": 0.0074, + "num_tokens": 200853967.0, + "step": 449 + }, + { + "epoch": 4.413284132841328, + "grad_norm": 0.05077867679984549, + "learning_rate": 1.3343757712253804e-06, + "loss": 0.0076, + "num_tokens": 201323621.0, + "step": 450 + }, + { + "epoch": 4.423124231242312, + "grad_norm": 0.058807424527887606, + "learning_rate": 1.3236347718381338e-06, + "loss": 0.0096, + "num_tokens": 201753687.0, + "step": 451 + }, + { + "epoch": 4.432964329643297, + "grad_norm": 0.06001374322910319, + "learning_rate": 1.3130626775866743e-06, + "loss": 0.0081, + "num_tokens": 202203799.0, + "step": 452 + }, + { + "epoch": 4.442804428044281, + "grad_norm": 0.06273437087252197, + "learning_rate": 1.3026599160392173e-06, + "loss": 0.0092, + "num_tokens": 202627243.0, + "step": 453 + }, + { + "epoch": 4.4526445264452645, + "grad_norm": 0.06372618537836224, + "learning_rate": 1.292426907915634e-06, + "loss": 0.0076, + "num_tokens": 203077433.0, + "step": 454 + }, + { + "epoch": 4.462484624846248, + "grad_norm": 0.057948321757535656, + "learning_rate": 1.2823640670704443e-06, + "loss": 0.0229, + "num_tokens": 203532517.0, + "step": 455 + }, + { + "epoch": 4.472324723247232, + "grad_norm": 0.06607138604150303, + "learning_rate": 1.2724718004760794e-06, + "loss": 0.0078, + "num_tokens": 203967752.0, + "step": 456 + }, + { + "epoch": 4.482164821648216, + "grad_norm": 0.05725783304801458, + "learning_rate": 1.2627505082064144e-06, + "loss": 0.0076, + "num_tokens": 204424349.0, + "step": 457 + }, + { + "epoch": 4.492004920049201, + "grad_norm": 0.055427831791831646, + "learning_rate": 1.2532005834205976e-06, + "loss": 0.0079, + "num_tokens": 204846138.0, + "step": 458 + }, + { + "epoch": 4.501845018450185, + "grad_norm": 0.05460191637217484, + "learning_rate": 1.2438224123471442e-06, + "loss": 0.0192, + "num_tokens": 205306730.0, + "step": 459 + }, + { + "epoch": 4.511685116851169, + "grad_norm": 0.06279438477449967, + "learning_rate": 1.2346163742683185e-06, + "loss": 0.0117, + "num_tokens": 205759609.0, + "step": 460 + }, + { + "epoch": 4.521525215252153, + "grad_norm": 0.05702285396092694, + "learning_rate": 1.2255828415047932e-06, + "loss": 0.0076, + "num_tokens": 206171295.0, + "step": 461 + }, + { + "epoch": 4.531365313653136, + "grad_norm": 0.054521558454890394, + "learning_rate": 1.216722179400592e-06, + "loss": 0.0076, + "num_tokens": 206639148.0, + "step": 462 + }, + { + "epoch": 4.54120541205412, + "grad_norm": 0.05168283263697403, + "learning_rate": 1.208034746308315e-06, + "loss": 0.0068, + "num_tokens": 207094260.0, + "step": 463 + }, + { + "epoch": 4.551045510455104, + "grad_norm": 0.05161429329359664, + "learning_rate": 1.1995208935746437e-06, + "loss": 0.0081, + "num_tokens": 207533375.0, + "step": 464 + }, + { + "epoch": 4.560885608856088, + "grad_norm": 0.058514508257411606, + "learning_rate": 1.1911809655261333e-06, + "loss": 0.0081, + "num_tokens": 207969517.0, + "step": 465 + }, + { + "epoch": 4.570725707257073, + "grad_norm": 0.056665893017668854, + "learning_rate": 1.1830152994552866e-06, + "loss": 0.0086, + "num_tokens": 208408117.0, + "step": 466 + }, + { + "epoch": 4.580565805658057, + "grad_norm": 0.056163462620316754, + "learning_rate": 1.175024225606912e-06, + "loss": 0.0074, + "num_tokens": 208879227.0, + "step": 467 + }, + { + "epoch": 4.590405904059041, + "grad_norm": 0.05409385523794747, + "learning_rate": 1.1672080671647695e-06, + "loss": 0.0078, + "num_tokens": 209325103.0, + "step": 468 + }, + { + "epoch": 4.6002460024600245, + "grad_norm": 0.05629255243399504, + "learning_rate": 1.1595671402384966e-06, + "loss": 0.0102, + "num_tokens": 209791894.0, + "step": 469 + }, + { + "epoch": 4.610086100861008, + "grad_norm": 0.051104203707396316, + "learning_rate": 1.152101753850828e-06, + "loss": 0.0072, + "num_tokens": 210254182.0, + "step": 470 + }, + { + "epoch": 4.619926199261993, + "grad_norm": 0.05229454749737629, + "learning_rate": 1.1448122099250946e-06, + "loss": 0.0104, + "num_tokens": 210702900.0, + "step": 471 + }, + { + "epoch": 4.629766297662977, + "grad_norm": 0.060177504722208404, + "learning_rate": 1.1376988032730135e-06, + "loss": 0.0079, + "num_tokens": 211151465.0, + "step": 472 + }, + { + "epoch": 4.639606396063961, + "grad_norm": 0.05182456184289124, + "learning_rate": 1.130761821582766e-06, + "loss": 0.0072, + "num_tokens": 211619464.0, + "step": 473 + }, + { + "epoch": 4.649446494464945, + "grad_norm": 0.05574225668849545, + "learning_rate": 1.1240015454073622e-06, + "loss": 0.0085, + "num_tokens": 212064266.0, + "step": 474 + }, + { + "epoch": 4.659286592865929, + "grad_norm": 0.06359820975154429, + "learning_rate": 1.1174182481532943e-06, + "loss": 0.0081, + "num_tokens": 212499724.0, + "step": 475 + }, + { + "epoch": 4.6691266912669125, + "grad_norm": 0.05622656000305094, + "learning_rate": 1.1110121960694773e-06, + "loss": 0.0079, + "num_tokens": 212945879.0, + "step": 476 + }, + { + "epoch": 4.678966789667896, + "grad_norm": 0.06093763072714235, + "learning_rate": 1.104783648236486e-06, + "loss": 0.0084, + "num_tokens": 213379787.0, + "step": 477 + }, + { + "epoch": 4.68880688806888, + "grad_norm": 0.0543614373855231, + "learning_rate": 1.0987328565560711e-06, + "loss": 0.0075, + "num_tokens": 213824263.0, + "step": 478 + }, + { + "epoch": 4.698646986469865, + "grad_norm": 0.056905167227697236, + "learning_rate": 1.0928600657409751e-06, + "loss": 0.0082, + "num_tokens": 214265208.0, + "step": 479 + }, + { + "epoch": 4.708487084870849, + "grad_norm": 0.057351833542733925, + "learning_rate": 1.0871655133050372e-06, + "loss": 0.0082, + "num_tokens": 214744301.0, + "step": 480 + }, + { + "epoch": 4.718327183271833, + "grad_norm": 0.29349816338215157, + "learning_rate": 1.081649429553581e-06, + "loss": 0.0553, + "num_tokens": 215194355.0, + "step": 481 + }, + { + "epoch": 4.728167281672817, + "grad_norm": 0.051057953015104116, + "learning_rate": 1.076312037574106e-06, + "loss": 0.0074, + "num_tokens": 215632060.0, + "step": 482 + }, + { + "epoch": 4.7380073800738005, + "grad_norm": 0.056594540815463674, + "learning_rate": 1.0711535532272632e-06, + "loss": 0.0235, + "num_tokens": 216097276.0, + "step": 483 + }, + { + "epoch": 4.747847478474784, + "grad_norm": 0.068871190152495, + "learning_rate": 1.0661741851381256e-06, + "loss": 0.0077, + "num_tokens": 216544463.0, + "step": 484 + }, + { + "epoch": 4.757687576875769, + "grad_norm": 0.05907548729697175, + "learning_rate": 1.0613741346877498e-06, + "loss": 0.0084, + "num_tokens": 216972058.0, + "step": 485 + }, + { + "epoch": 4.767527675276753, + "grad_norm": 0.055592377746762095, + "learning_rate": 1.056753596005032e-06, + "loss": 0.0074, + "num_tokens": 217401900.0, + "step": 486 + }, + { + "epoch": 4.777367773677737, + "grad_norm": 0.05562394957573223, + "learning_rate": 1.0523127559588579e-06, + "loss": 0.0075, + "num_tokens": 217845453.0, + "step": 487 + }, + { + "epoch": 4.787207872078721, + "grad_norm": 0.05258367575789477, + "learning_rate": 1.0480517941505428e-06, + "loss": 0.0073, + "num_tokens": 218272871.0, + "step": 488 + }, + { + "epoch": 4.797047970479705, + "grad_norm": 0.05390618674507445, + "learning_rate": 1.0439708829065708e-06, + "loss": 0.0078, + "num_tokens": 218732597.0, + "step": 489 + }, + { + "epoch": 4.8068880688806885, + "grad_norm": 0.06946151381547928, + "learning_rate": 1.0400701872716227e-06, + "loss": 0.0223, + "num_tokens": 219194340.0, + "step": 490 + }, + { + "epoch": 4.816728167281672, + "grad_norm": 0.05582170906207444, + "learning_rate": 1.0363498650019023e-06, + "loss": 0.0077, + "num_tokens": 219673692.0, + "step": 491 + }, + { + "epoch": 4.826568265682657, + "grad_norm": 0.05244987983803676, + "learning_rate": 1.0328100665587573e-06, + "loss": 0.0073, + "num_tokens": 220118246.0, + "step": 492 + }, + { + "epoch": 4.836408364083641, + "grad_norm": 0.055024340070040305, + "learning_rate": 1.029450935102592e-06, + "loss": 0.0077, + "num_tokens": 220555806.0, + "step": 493 + }, + { + "epoch": 4.846248462484625, + "grad_norm": 0.05338628090134423, + "learning_rate": 1.0262726064870801e-06, + "loss": 0.0073, + "num_tokens": 220997187.0, + "step": 494 + }, + { + "epoch": 4.856088560885609, + "grad_norm": 0.058254094197714025, + "learning_rate": 1.0232752092536666e-06, + "loss": 0.0074, + "num_tokens": 221434681.0, + "step": 495 + }, + { + "epoch": 4.865928659286593, + "grad_norm": 0.05261616134189719, + "learning_rate": 1.0204588646263731e-06, + "loss": 0.0074, + "num_tokens": 221884850.0, + "step": 496 + }, + { + "epoch": 4.875768757687577, + "grad_norm": 0.052167915998619634, + "learning_rate": 1.0178236865068933e-06, + "loss": 0.0072, + "num_tokens": 222333225.0, + "step": 497 + }, + { + "epoch": 4.885608856088561, + "grad_norm": 0.06187153122740552, + "learning_rate": 1.0153697814699858e-06, + "loss": 0.0106, + "num_tokens": 222774591.0, + "step": 498 + }, + { + "epoch": 4.895448954489545, + "grad_norm": 0.054905669170180534, + "learning_rate": 1.0130972487591658e-06, + "loss": 0.0112, + "num_tokens": 223227943.0, + "step": 499 + }, + { + "epoch": 4.905289052890529, + "grad_norm": 0.06206228565326619, + "learning_rate": 1.0110061802826889e-06, + "loss": 0.0076, + "num_tokens": 223680989.0, + "step": 500 + }, + { + "epoch": 4.915129151291513, + "grad_norm": 0.05437071230251554, + "learning_rate": 1.009096660609837e-06, + "loss": 0.1789, + "num_tokens": 224171724.0, + "step": 501 + }, + { + "epoch": 4.924969249692497, + "grad_norm": 0.12358300885271949, + "learning_rate": 1.0073687669674949e-06, + "loss": 0.0081, + "num_tokens": 224621243.0, + "step": 502 + }, + { + "epoch": 4.934809348093481, + "grad_norm": 0.05743551551374671, + "learning_rate": 1.0058225692370299e-06, + "loss": 0.0077, + "num_tokens": 225053570.0, + "step": 503 + }, + { + "epoch": 4.944649446494465, + "grad_norm": 0.05705289715957623, + "learning_rate": 1.0044581299514638e-06, + "loss": 0.0077, + "num_tokens": 225475922.0, + "step": 504 + }, + { + "epoch": 4.9544895448954485, + "grad_norm": 0.052608564457681, + "learning_rate": 1.003275504292944e-06, + "loss": 0.0072, + "num_tokens": 225944888.0, + "step": 505 + }, + { + "epoch": 4.964329643296433, + "grad_norm": 0.05546452983023311, + "learning_rate": 1.0022747400905126e-06, + "loss": 0.0079, + "num_tokens": 226384045.0, + "step": 506 + }, + { + "epoch": 4.974169741697417, + "grad_norm": 0.05754539826487939, + "learning_rate": 1.0014558778181714e-06, + "loss": 0.0073, + "num_tokens": 226815343.0, + "step": 507 + }, + { + "epoch": 4.984009840098401, + "grad_norm": 0.05456913560891108, + "learning_rate": 1.0008189505932444e-06, + "loss": 0.0084, + "num_tokens": 227286168.0, + "step": 508 + }, + { + "epoch": 4.993849938499385, + "grad_norm": 0.053799541560384294, + "learning_rate": 1.0003639841750404e-06, + "loss": 0.0076, + "num_tokens": 227746824.0, + "step": 509 + }, + { + "epoch": 5.0, + "grad_norm": 0.07884368824115337, + "learning_rate": 1.0000909969638097e-06, + "loss": 0.0089, + "num_tokens": 227957450.0, + "step": 510 + }, + { + "epoch": 5.0, + "eval_loss": 0.11205815523862839, + "eval_num_tokens": 227957450.0, + "eval_runtime": 53.843, + "eval_samples_per_second": 41.714, + "eval_steps_per_second": 5.219, + "step": 510 + }, + { + "epoch": 5.0, + "step": 510, + "total_flos": 7.689061516716278e+17, + "train_loss": 0.0504409685922677, + "train_runtime": 7612.3259, + "train_samples_per_second": 8.537, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 1, + "max_steps": 510, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.689061516716278e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}