{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00984009840098401, "grad_norm": 3.7944442389822073, "learning_rate": 0.0, "loss": 1.2501, "num_tokens": 456505.0, "step": 1 }, { "epoch": 0.01968019680196802, "grad_norm": 3.778041640972742, "learning_rate": 6.25e-07, "loss": 1.2343, "num_tokens": 915160.0, "step": 2 }, { "epoch": 0.02952029520295203, "grad_norm": 3.8325814879608386, "learning_rate": 1.25e-06, "loss": 1.254, "num_tokens": 1365315.0, "step": 3 }, { "epoch": 0.03936039360393604, "grad_norm": 3.582565733113683, "learning_rate": 1.8750000000000003e-06, "loss": 1.1869, "num_tokens": 1841763.0, "step": 4 }, { "epoch": 0.04920049200492005, "grad_norm": 3.5604969753172315, "learning_rate": 2.5e-06, "loss": 1.2394, "num_tokens": 2301606.0, "step": 5 }, { "epoch": 0.05904059040590406, "grad_norm": 3.105374395878177, "learning_rate": 3.125e-06, "loss": 1.2366, "num_tokens": 2755825.0, "step": 6 }, { "epoch": 0.06888068880688807, "grad_norm": 2.316426838717515, "learning_rate": 3.7500000000000005e-06, "loss": 1.1101, "num_tokens": 3196409.0, "step": 7 }, { "epoch": 0.07872078720787208, "grad_norm": 2.281060366927676, "learning_rate": 4.3750000000000005e-06, "loss": 1.1078, "num_tokens": 3622733.0, "step": 8 }, { "epoch": 0.08856088560885608, "grad_norm": 1.934577354985982, "learning_rate": 5e-06, "loss": 0.9014, "num_tokens": 4055914.0, "step": 9 }, { "epoch": 0.0984009840098401, "grad_norm": 1.9176079459138344, "learning_rate": 5.625e-06, "loss": 0.8745, "num_tokens": 4485159.0, "step": 10 }, { "epoch": 0.10824108241082411, "grad_norm": 1.786754010375736, "learning_rate": 6.25e-06, "loss": 0.7922, "num_tokens": 4933514.0, "step": 11 }, { "epoch": 0.11808118081180811, "grad_norm": 1.9933574737759214, "learning_rate": 6.875e-06, "loss": 0.4878, "num_tokens": 5383658.0, "step": 12 }, { "epoch": 0.12792127921279212, "grad_norm": 2.123289554302906, "learning_rate": 7.500000000000001e-06, "loss": 0.428, "num_tokens": 5839838.0, "step": 13 }, { "epoch": 0.13776137761377613, "grad_norm": 1.7562448014521572, "learning_rate": 8.125000000000001e-06, "loss": 0.3337, "num_tokens": 6286175.0, "step": 14 }, { "epoch": 0.14760147601476015, "grad_norm": 1.4384357290512548, "learning_rate": 8.750000000000001e-06, "loss": 0.2497, "num_tokens": 6725821.0, "step": 15 }, { "epoch": 0.15744157441574416, "grad_norm": 0.8232923354453182, "learning_rate": 9.375000000000001e-06, "loss": 0.1317, "num_tokens": 7169854.0, "step": 16 }, { "epoch": 0.16728167281672818, "grad_norm": 0.5262014955492348, "learning_rate": 1e-05, "loss": 0.1158, "num_tokens": 7602261.0, "step": 17 }, { "epoch": 0.17712177121771217, "grad_norm": 0.47218735378476806, "learning_rate": 9.999909003036192e-06, "loss": 0.098, "num_tokens": 8040457.0, "step": 18 }, { "epoch": 0.18696186961869618, "grad_norm": 0.22950756131023575, "learning_rate": 9.99963601582496e-06, "loss": 0.0827, "num_tokens": 8484210.0, "step": 19 }, { "epoch": 0.1968019680196802, "grad_norm": 0.24069667769460337, "learning_rate": 9.999181049406756e-06, "loss": 0.0733, "num_tokens": 8913622.0, "step": 20 }, { "epoch": 0.2066420664206642, "grad_norm": 0.19079800364724872, "learning_rate": 9.998544122181829e-06, "loss": 0.0851, "num_tokens": 9379389.0, "step": 21 }, { "epoch": 0.21648216482164823, "grad_norm": 0.17418458793254618, "learning_rate": 9.997725259909487e-06, "loss": 0.0734, "num_tokens": 9803100.0, "step": 22 }, { "epoch": 0.22632226322263221, "grad_norm": 0.19184343474298712, "learning_rate": 9.996724495707056e-06, "loss": 0.083, "num_tokens": 10247767.0, "step": 23 }, { "epoch": 0.23616236162361623, "grad_norm": 0.1475443750251538, "learning_rate": 9.995541870048537e-06, "loss": 0.0738, "num_tokens": 10691516.0, "step": 24 }, { "epoch": 0.24600246002460024, "grad_norm": 0.14419500643952865, "learning_rate": 9.994177430762971e-06, "loss": 0.0646, "num_tokens": 11149524.0, "step": 25 }, { "epoch": 0.25584255842558423, "grad_norm": 1.2190794556868674, "learning_rate": 9.992631233032507e-06, "loss": 0.0721, "num_tokens": 11589958.0, "step": 26 }, { "epoch": 0.2656826568265683, "grad_norm": 0.49893871677873436, "learning_rate": 9.990903339390164e-06, "loss": 0.0691, "num_tokens": 12050102.0, "step": 27 }, { "epoch": 0.27552275522755226, "grad_norm": 0.4422022706582718, "learning_rate": 9.988993819717312e-06, "loss": 0.0605, "num_tokens": 12508433.0, "step": 28 }, { "epoch": 0.2853628536285363, "grad_norm": 0.13441092661822238, "learning_rate": 9.986902751240836e-06, "loss": 0.0692, "num_tokens": 12939960.0, "step": 29 }, { "epoch": 0.2952029520295203, "grad_norm": 0.11858771432621444, "learning_rate": 9.984630218530014e-06, "loss": 0.0492, "num_tokens": 13387850.0, "step": 30 }, { "epoch": 0.3050430504305043, "grad_norm": 0.12696361103470127, "learning_rate": 9.982176313493108e-06, "loss": 0.0624, "num_tokens": 13866894.0, "step": 31 }, { "epoch": 0.3148831488314883, "grad_norm": 0.14678245947256616, "learning_rate": 9.979541135373628e-06, "loss": 0.0483, "num_tokens": 14314553.0, "step": 32 }, { "epoch": 0.3247232472324723, "grad_norm": 0.12403518011628543, "learning_rate": 9.976724790746333e-06, "loss": 0.0488, "num_tokens": 14747330.0, "step": 33 }, { "epoch": 0.33456334563345635, "grad_norm": 0.10283847292091904, "learning_rate": 9.973727393512921e-06, "loss": 0.0582, "num_tokens": 15215873.0, "step": 34 }, { "epoch": 0.34440344403444034, "grad_norm": 0.09860235260078455, "learning_rate": 9.970549064897407e-06, "loss": 0.0446, "num_tokens": 15653849.0, "step": 35 }, { "epoch": 0.35424354243542433, "grad_norm": 0.10274919661024226, "learning_rate": 9.967189933441243e-06, "loss": 0.0439, "num_tokens": 16112913.0, "step": 36 }, { "epoch": 0.3640836408364084, "grad_norm": 0.0918843632134462, "learning_rate": 9.9636501349981e-06, "loss": 0.0585, "num_tokens": 16570588.0, "step": 37 }, { "epoch": 0.37392373923739236, "grad_norm": 0.08618611894284056, "learning_rate": 9.95992981272838e-06, "loss": 0.0477, "num_tokens": 17028395.0, "step": 38 }, { "epoch": 0.3837638376383764, "grad_norm": 0.0915069403325355, "learning_rate": 9.956029117093432e-06, "loss": 0.045, "num_tokens": 17477681.0, "step": 39 }, { "epoch": 0.3936039360393604, "grad_norm": 0.09093140650787605, "learning_rate": 9.951948205849457e-06, "loss": 0.0444, "num_tokens": 17940049.0, "step": 40 }, { "epoch": 0.4034440344403444, "grad_norm": 0.08271507354884283, "learning_rate": 9.947687244041143e-06, "loss": 0.0401, "num_tokens": 18360868.0, "step": 41 }, { "epoch": 0.4132841328413284, "grad_norm": 0.08588968137159211, "learning_rate": 9.943246403994969e-06, "loss": 0.0358, "num_tokens": 18811281.0, "step": 42 }, { "epoch": 0.4231242312423124, "grad_norm": 0.08965565515357603, "learning_rate": 9.938625865312252e-06, "loss": 0.044, "num_tokens": 19236998.0, "step": 43 }, { "epoch": 0.43296432964329645, "grad_norm": 0.09636661290222473, "learning_rate": 9.933825814861877e-06, "loss": 0.0431, "num_tokens": 19689363.0, "step": 44 }, { "epoch": 0.44280442804428044, "grad_norm": 0.08912873391359938, "learning_rate": 9.928846446772737e-06, "loss": 0.0377, "num_tokens": 20129602.0, "step": 45 }, { "epoch": 0.45264452644526443, "grad_norm": 0.09271503002492597, "learning_rate": 9.923687962425895e-06, "loss": 0.0365, "num_tokens": 20566055.0, "step": 46 }, { "epoch": 0.46248462484624847, "grad_norm": 0.08617267288782972, "learning_rate": 9.91835057044642e-06, "loss": 0.0582, "num_tokens": 21035837.0, "step": 47 }, { "epoch": 0.47232472324723246, "grad_norm": 0.07942181409157618, "learning_rate": 9.912834486694963e-06, "loss": 0.0341, "num_tokens": 21490681.0, "step": 48 }, { "epoch": 0.4821648216482165, "grad_norm": 0.08409285833406879, "learning_rate": 9.907139934259025e-06, "loss": 0.0464, "num_tokens": 21949736.0, "step": 49 }, { "epoch": 0.4920049200492005, "grad_norm": 0.08981746101624732, "learning_rate": 9.90126714344393e-06, "loss": 0.0479, "num_tokens": 22408345.0, "step": 50 }, { "epoch": 0.5018450184501845, "grad_norm": 0.08557538109120558, "learning_rate": 9.895216351763515e-06, "loss": 0.04, "num_tokens": 22869507.0, "step": 51 }, { "epoch": 0.5116851168511685, "grad_norm": 0.08873060518107122, "learning_rate": 9.888987803930523e-06, "loss": 0.0359, "num_tokens": 23337492.0, "step": 52 }, { "epoch": 0.5215252152521526, "grad_norm": 0.08508195964995854, "learning_rate": 9.882581751846707e-06, "loss": 0.0338, "num_tokens": 23788038.0, "step": 53 }, { "epoch": 0.5313653136531366, "grad_norm": 0.076418318161816, "learning_rate": 9.87599845459264e-06, "loss": 0.0344, "num_tokens": 24233994.0, "step": 54 }, { "epoch": 0.5412054120541205, "grad_norm": 0.2889818789713905, "learning_rate": 9.869238178417235e-06, "loss": 0.2599, "num_tokens": 24697351.0, "step": 55 }, { "epoch": 0.5510455104551045, "grad_norm": 0.08884780995830746, "learning_rate": 9.862301196726988e-06, "loss": 0.0465, "num_tokens": 25183095.0, "step": 56 }, { "epoch": 0.5608856088560885, "grad_norm": 0.07990815808329678, "learning_rate": 9.855187790074906e-06, "loss": 0.0353, "num_tokens": 25651971.0, "step": 57 }, { "epoch": 0.5707257072570726, "grad_norm": 0.06894407429892842, "learning_rate": 9.847898246149173e-06, "loss": 0.0316, "num_tokens": 26129683.0, "step": 58 }, { "epoch": 0.5805658056580566, "grad_norm": 0.08216971413705307, "learning_rate": 9.840432859761504e-06, "loss": 0.0306, "num_tokens": 26548348.0, "step": 59 }, { "epoch": 0.5904059040590406, "grad_norm": 0.079031679127037, "learning_rate": 9.832791932835232e-06, "loss": 0.0362, "num_tokens": 26977631.0, "step": 60 }, { "epoch": 0.6002460024600246, "grad_norm": 0.07450412090133855, "learning_rate": 9.824975774393089e-06, "loss": 0.0276, "num_tokens": 27421323.0, "step": 61 }, { "epoch": 0.6100861008610086, "grad_norm": 0.08014735253624648, "learning_rate": 9.816984700544714e-06, "loss": 0.0286, "num_tokens": 27882356.0, "step": 62 }, { "epoch": 0.6199261992619927, "grad_norm": 0.08455294660438158, "learning_rate": 9.808819034473869e-06, "loss": 0.0407, "num_tokens": 28343854.0, "step": 63 }, { "epoch": 0.6297662976629766, "grad_norm": 0.08019778537515825, "learning_rate": 9.800479106425356e-06, "loss": 0.0299, "num_tokens": 28790695.0, "step": 64 }, { "epoch": 0.6396063960639606, "grad_norm": 0.08340888167507048, "learning_rate": 9.791965253691687e-06, "loss": 0.0353, "num_tokens": 29220825.0, "step": 65 }, { "epoch": 0.6494464944649446, "grad_norm": 0.08252486402936965, "learning_rate": 9.783277820599408e-06, "loss": 0.0367, "num_tokens": 29686358.0, "step": 66 }, { "epoch": 0.6592865928659286, "grad_norm": 0.08632773276059842, "learning_rate": 9.774417158495208e-06, "loss": 0.0331, "num_tokens": 30120521.0, "step": 67 }, { "epoch": 0.6691266912669127, "grad_norm": 0.082343171890358, "learning_rate": 9.765383625731683e-06, "loss": 0.0329, "num_tokens": 30573947.0, "step": 68 }, { "epoch": 0.6789667896678967, "grad_norm": 0.08874468637210653, "learning_rate": 9.756177587652857e-06, "loss": 0.0329, "num_tokens": 30999244.0, "step": 69 }, { "epoch": 0.6888068880688807, "grad_norm": 0.07673402020991506, "learning_rate": 9.746799416579403e-06, "loss": 0.0306, "num_tokens": 31468786.0, "step": 70 }, { "epoch": 0.6986469864698647, "grad_norm": 0.09204922624438575, "learning_rate": 9.737249491793587e-06, "loss": 0.0273, "num_tokens": 31905019.0, "step": 71 }, { "epoch": 0.7084870848708487, "grad_norm": 0.08145687118724444, "learning_rate": 9.727528199523923e-06, "loss": 0.029, "num_tokens": 32340154.0, "step": 72 }, { "epoch": 0.7183271832718328, "grad_norm": 0.09506872052374568, "learning_rate": 9.717635932929556e-06, "loss": 0.0373, "num_tokens": 32789598.0, "step": 73 }, { "epoch": 0.7281672816728167, "grad_norm": 0.08326889230017241, "learning_rate": 9.707573092084368e-06, "loss": 0.0286, "num_tokens": 33239225.0, "step": 74 }, { "epoch": 0.7380073800738007, "grad_norm": 0.07636964575035168, "learning_rate": 9.697340083960785e-06, "loss": 0.0291, "num_tokens": 33718797.0, "step": 75 }, { "epoch": 0.7478474784747847, "grad_norm": 0.09488168094776525, "learning_rate": 9.686937322413325e-06, "loss": 0.0328, "num_tokens": 34155674.0, "step": 76 }, { "epoch": 0.7576875768757687, "grad_norm": 0.0778086138359463, "learning_rate": 9.676365228161869e-06, "loss": 0.0252, "num_tokens": 34584921.0, "step": 77 }, { "epoch": 0.7675276752767528, "grad_norm": 0.08557737550120906, "learning_rate": 9.66562422877462e-06, "loss": 0.0338, "num_tokens": 35049146.0, "step": 78 }, { "epoch": 0.7773677736777368, "grad_norm": 0.09181023650151289, "learning_rate": 9.654714758650844e-06, "loss": 0.0299, "num_tokens": 35519987.0, "step": 79 }, { "epoch": 0.7872078720787208, "grad_norm": 0.07639914292637208, "learning_rate": 9.643637259003276e-06, "loss": 0.0242, "num_tokens": 35959127.0, "step": 80 }, { "epoch": 0.7970479704797048, "grad_norm": 0.08200922089613671, "learning_rate": 9.632392177840286e-06, "loss": 0.0317, "num_tokens": 36416651.0, "step": 81 }, { "epoch": 0.8068880688806888, "grad_norm": 0.07954028434263948, "learning_rate": 9.620979969947759e-06, "loss": 0.0293, "num_tokens": 36864154.0, "step": 82 }, { "epoch": 0.8167281672816729, "grad_norm": 0.07878375949867687, "learning_rate": 9.609401096870707e-06, "loss": 0.0237, "num_tokens": 37310281.0, "step": 83 }, { "epoch": 0.8265682656826568, "grad_norm": 0.07728168843840597, "learning_rate": 9.597656026894591e-06, "loss": 0.0322, "num_tokens": 37746606.0, "step": 84 }, { "epoch": 0.8364083640836408, "grad_norm": 0.07855221188672869, "learning_rate": 9.585745235026391e-06, "loss": 0.0258, "num_tokens": 38189615.0, "step": 85 }, { "epoch": 0.8462484624846248, "grad_norm": 0.07691630967258262, "learning_rate": 9.5736692029754e-06, "loss": 0.0293, "num_tokens": 38637318.0, "step": 86 }, { "epoch": 0.8560885608856088, "grad_norm": 0.07209047793755496, "learning_rate": 9.561428419133723e-06, "loss": 0.0235, "num_tokens": 39102853.0, "step": 87 }, { "epoch": 0.8659286592865929, "grad_norm": 0.0802072339239599, "learning_rate": 9.549023378556548e-06, "loss": 0.0311, "num_tokens": 39538535.0, "step": 88 }, { "epoch": 0.8757687576875769, "grad_norm": 0.09334524313401625, "learning_rate": 9.53645458294211e-06, "loss": 0.0484, "num_tokens": 40020296.0, "step": 89 }, { "epoch": 0.8856088560885609, "grad_norm": 2.3961901610996605, "learning_rate": 9.523722540611403e-06, "loss": 0.3276, "num_tokens": 40506093.0, "step": 90 }, { "epoch": 0.8954489544895449, "grad_norm": 0.09376957957757263, "learning_rate": 9.510827766487625e-06, "loss": 0.0288, "num_tokens": 40937880.0, "step": 91 }, { "epoch": 0.9052890528905289, "grad_norm": 0.08607984794603309, "learning_rate": 9.497770782075353e-06, "loss": 0.0247, "num_tokens": 41374337.0, "step": 92 }, { "epoch": 0.915129151291513, "grad_norm": 0.07253858203781333, "learning_rate": 9.484552115439445e-06, "loss": 0.0293, "num_tokens": 41811558.0, "step": 93 }, { "epoch": 0.9249692496924969, "grad_norm": 0.07768364358007782, "learning_rate": 9.471172301183695e-06, "loss": 0.0257, "num_tokens": 42259726.0, "step": 94 }, { "epoch": 0.9348093480934809, "grad_norm": 0.0769153663260077, "learning_rate": 9.4576318804292e-06, "loss": 0.0232, "num_tokens": 42684319.0, "step": 95 }, { "epoch": 0.9446494464944649, "grad_norm": 0.08163342042509363, "learning_rate": 9.443931400792486e-06, "loss": 0.0256, "num_tokens": 43113589.0, "step": 96 }, { "epoch": 0.9544895448954489, "grad_norm": 0.06536764982172343, "learning_rate": 9.430071416363352e-06, "loss": 0.0218, "num_tokens": 43575488.0, "step": 97 }, { "epoch": 0.964329643296433, "grad_norm": 0.08195099679978833, "learning_rate": 9.416052487682465e-06, "loss": 0.0254, "num_tokens": 44016216.0, "step": 98 }, { "epoch": 0.974169741697417, "grad_norm": 0.1266005657397246, "learning_rate": 9.401875181718686e-06, "loss": 0.0454, "num_tokens": 44497742.0, "step": 99 }, { "epoch": 0.984009840098401, "grad_norm": 0.07988798247506342, "learning_rate": 9.387540071846155e-06, "loss": 0.024, "num_tokens": 44935936.0, "step": 100 }, { "epoch": 0.993849938499385, "grad_norm": 0.07277763654694067, "learning_rate": 9.373047737821078e-06, "loss": 0.0216, "num_tokens": 45381042.0, "step": 101 }, { "epoch": 1.0, "grad_norm": 0.07277763654694067, "learning_rate": 9.358398765758296e-06, "loss": 0.0229, "num_tokens": 45593876.0, "step": 102 }, { "epoch": 1.0, "eval_loss": 0.07811997085809708, "eval_num_tokens": 45593876.0, "eval_runtime": 54.709, "eval_samples_per_second": 41.054, "eval_steps_per_second": 5.136, "step": 102 }, { "epoch": 1.009840098400984, "grad_norm": 0.10795878798324991, "learning_rate": 9.34359374810758e-06, "loss": 0.0201, "num_tokens": 46020335.0, "step": 103 }, { "epoch": 1.019680196801968, "grad_norm": 0.07593949135329942, "learning_rate": 9.328633283629666e-06, "loss": 0.0222, "num_tokens": 46466853.0, "step": 104 }, { "epoch": 1.029520295202952, "grad_norm": 0.07596980345063492, "learning_rate": 9.31351797737204e-06, "loss": 0.0253, "num_tokens": 46900993.0, "step": 105 }, { "epoch": 1.039360393603936, "grad_norm": 0.08317964089954727, "learning_rate": 9.29824844064447e-06, "loss": 0.0206, "num_tokens": 47334869.0, "step": 106 }, { "epoch": 1.04920049200492, "grad_norm": 0.0805362815127939, "learning_rate": 9.282825290994282e-06, "loss": 0.0213, "num_tokens": 47797630.0, "step": 107 }, { "epoch": 1.0590405904059041, "grad_norm": 0.07839099238240128, "learning_rate": 9.267249152181379e-06, "loss": 0.0454, "num_tokens": 48281974.0, "step": 108 }, { "epoch": 1.068880688806888, "grad_norm": 0.0757738535866923, "learning_rate": 9.251520654153028e-06, "loss": 0.022, "num_tokens": 48730118.0, "step": 109 }, { "epoch": 1.0787207872078721, "grad_norm": 0.08256710571520359, "learning_rate": 9.235640433018363e-06, "loss": 0.0195, "num_tokens": 49197576.0, "step": 110 }, { "epoch": 1.088560885608856, "grad_norm": 0.07849933177459094, "learning_rate": 9.219609131022684e-06, "loss": 0.0203, "num_tokens": 49673054.0, "step": 111 }, { "epoch": 1.09840098400984, "grad_norm": 0.08067924302373455, "learning_rate": 9.203427396521454e-06, "loss": 0.0219, "num_tokens": 50130569.0, "step": 112 }, { "epoch": 1.1082410824108242, "grad_norm": 0.07527801624664898, "learning_rate": 9.187095883954104e-06, "loss": 0.0195, "num_tokens": 50574721.0, "step": 113 }, { "epoch": 1.118081180811808, "grad_norm": 0.08229755724299215, "learning_rate": 9.170615253817547e-06, "loss": 0.0193, "num_tokens": 51010865.0, "step": 114 }, { "epoch": 1.1279212792127922, "grad_norm": 0.07673721236222701, "learning_rate": 9.153986172639474e-06, "loss": 0.0211, "num_tokens": 51469765.0, "step": 115 }, { "epoch": 1.137761377613776, "grad_norm": 0.0845900192373935, "learning_rate": 9.137209312951395e-06, "loss": 0.0226, "num_tokens": 51906114.0, "step": 116 }, { "epoch": 1.1476014760147601, "grad_norm": 0.08215860044207468, "learning_rate": 9.12028535326144e-06, "loss": 0.022, "num_tokens": 52354068.0, "step": 117 }, { "epoch": 1.1574415744157442, "grad_norm": 0.07420368746928867, "learning_rate": 9.103214978026922e-06, "loss": 0.0188, "num_tokens": 52836346.0, "step": 118 }, { "epoch": 1.1672816728167281, "grad_norm": 0.07450541307438634, "learning_rate": 9.085998877626644e-06, "loss": 0.0192, "num_tokens": 53299172.0, "step": 119 }, { "epoch": 1.1771217712177122, "grad_norm": 0.07878886229739003, "learning_rate": 9.068637748332993e-06, "loss": 0.0215, "num_tokens": 53759861.0, "step": 120 }, { "epoch": 1.186961869618696, "grad_norm": 0.08311056334441597, "learning_rate": 9.051132292283772e-06, "loss": 0.0208, "num_tokens": 54228512.0, "step": 121 }, { "epoch": 1.1968019680196802, "grad_norm": 0.07068781735081182, "learning_rate": 9.033483217453801e-06, "loss": 0.0812, "num_tokens": 54692852.0, "step": 122 }, { "epoch": 1.2066420664206643, "grad_norm": 0.596254901083269, "learning_rate": 9.015691237626292e-06, "loss": 0.0199, "num_tokens": 55139782.0, "step": 123 }, { "epoch": 1.2164821648216482, "grad_norm": 0.08202279255895727, "learning_rate": 8.997757072363976e-06, "loss": 0.0342, "num_tokens": 55604658.0, "step": 124 }, { "epoch": 1.2263222632226323, "grad_norm": 0.09057478290667956, "learning_rate": 8.979681446980002e-06, "loss": 0.0227, "num_tokens": 56030690.0, "step": 125 }, { "epoch": 1.2361623616236161, "grad_norm": 0.07661103115531635, "learning_rate": 8.961465092508607e-06, "loss": 0.0339, "num_tokens": 56492821.0, "step": 126 }, { "epoch": 1.2460024600246002, "grad_norm": 0.08310739437969392, "learning_rate": 8.943108745675542e-06, "loss": 0.0249, "num_tokens": 56927699.0, "step": 127 }, { "epoch": 1.2558425584255843, "grad_norm": 0.08009221352147507, "learning_rate": 8.92461314886829e-06, "loss": 0.0209, "num_tokens": 57365827.0, "step": 128 }, { "epoch": 1.2656826568265682, "grad_norm": 0.07973094836265254, "learning_rate": 8.905979050106029e-06, "loss": 0.0251, "num_tokens": 57821453.0, "step": 129 }, { "epoch": 1.2755227552275523, "grad_norm": 0.07600070319773061, "learning_rate": 8.887207203009385e-06, "loss": 0.0213, "num_tokens": 58267867.0, "step": 130 }, { "epoch": 1.2853628536285364, "grad_norm": 0.07258837000806613, "learning_rate": 8.868298366769956e-06, "loss": 0.0198, "num_tokens": 58715078.0, "step": 131 }, { "epoch": 1.2952029520295203, "grad_norm": 0.07826062337656157, "learning_rate": 8.849253306119601e-06, "loss": 0.0199, "num_tokens": 59159310.0, "step": 132 }, { "epoch": 1.3050430504305042, "grad_norm": 0.07770042002025847, "learning_rate": 8.83007279129952e-06, "loss": 0.027, "num_tokens": 59594031.0, "step": 133 }, { "epoch": 1.3148831488314883, "grad_norm": 0.07607344407726713, "learning_rate": 8.810757598029094e-06, "loss": 0.0342, "num_tokens": 60038506.0, "step": 134 }, { "epoch": 1.3247232472324724, "grad_norm": 0.08771686774228402, "learning_rate": 8.79130850747452e-06, "loss": 0.0234, "num_tokens": 60492486.0, "step": 135 }, { "epoch": 1.3345633456334562, "grad_norm": 0.07482147000786651, "learning_rate": 8.771726306217217e-06, "loss": 0.0196, "num_tokens": 60925341.0, "step": 136 }, { "epoch": 1.3444034440344403, "grad_norm": 0.07171750614547971, "learning_rate": 8.752011786222011e-06, "loss": 0.0224, "num_tokens": 61401128.0, "step": 137 }, { "epoch": 1.3542435424354244, "grad_norm": 0.07289189868770962, "learning_rate": 8.732165744805107e-06, "loss": 0.0198, "num_tokens": 61845691.0, "step": 138 }, { "epoch": 1.3640836408364083, "grad_norm": 0.07907747558023923, "learning_rate": 8.712188984601845e-06, "loss": 0.0185, "num_tokens": 62286361.0, "step": 139 }, { "epoch": 1.3739237392373924, "grad_norm": 0.06910414114179665, "learning_rate": 8.692082313534233e-06, "loss": 0.0179, "num_tokens": 62727406.0, "step": 140 }, { "epoch": 1.3837638376383765, "grad_norm": 0.07791959325829377, "learning_rate": 8.671846544778284e-06, "loss": 0.0204, "num_tokens": 63182141.0, "step": 141 }, { "epoch": 1.3936039360393604, "grad_norm": 0.0741558195977179, "learning_rate": 8.651482496731116e-06, "loss": 0.0178, "num_tokens": 63600729.0, "step": 142 }, { "epoch": 1.4034440344403443, "grad_norm": 0.07283375136096223, "learning_rate": 8.630990992977854e-06, "loss": 0.0198, "num_tokens": 64066267.0, "step": 143 }, { "epoch": 1.4132841328413284, "grad_norm": 0.0731783816547012, "learning_rate": 8.61037286225834e-06, "loss": 0.2547, "num_tokens": 64515946.0, "step": 144 }, { "epoch": 1.4231242312423125, "grad_norm": 1.0212050791856901, "learning_rate": 8.589628938433587e-06, "loss": 0.0192, "num_tokens": 64949958.0, "step": 145 }, { "epoch": 1.4329643296432963, "grad_norm": 0.09844320658741419, "learning_rate": 8.56876006045208e-06, "loss": 0.0176, "num_tokens": 65381018.0, "step": 146 }, { "epoch": 1.4428044280442804, "grad_norm": 0.07030907656382593, "learning_rate": 8.547767072315835e-06, "loss": 0.0241, "num_tokens": 65814016.0, "step": 147 }, { "epoch": 1.4526445264452645, "grad_norm": 0.0779412275694533, "learning_rate": 8.526650823046266e-06, "loss": 0.0265, "num_tokens": 66252980.0, "step": 148 }, { "epoch": 1.4624846248462484, "grad_norm": 0.09570533939331194, "learning_rate": 8.505412166649847e-06, "loss": 0.0199, "num_tokens": 66718111.0, "step": 149 }, { "epoch": 1.4723247232472325, "grad_norm": 0.07915246167438994, "learning_rate": 8.484051962083579e-06, "loss": 0.0204, "num_tokens": 67163762.0, "step": 150 }, { "epoch": 1.4821648216482166, "grad_norm": 0.07935176799416567, "learning_rate": 8.462571073220243e-06, "loss": 0.0225, "num_tokens": 67624386.0, "step": 151 }, { "epoch": 1.4920049200492005, "grad_norm": 0.07841589822630919, "learning_rate": 8.44097036881347e-06, "loss": 0.0392, "num_tokens": 68065290.0, "step": 152 }, { "epoch": 1.5018450184501844, "grad_norm": 0.3517146293571387, "learning_rate": 8.419250722462603e-06, "loss": 0.0178, "num_tokens": 68519107.0, "step": 153 }, { "epoch": 1.5116851168511685, "grad_norm": 0.0764909788834621, "learning_rate": 8.39741301257736e-06, "loss": 0.0194, "num_tokens": 68971128.0, "step": 154 }, { "epoch": 1.5215252152521526, "grad_norm": 0.08078822036852527, "learning_rate": 8.375458122342317e-06, "loss": 0.0206, "num_tokens": 69403792.0, "step": 155 }, { "epoch": 1.5313653136531364, "grad_norm": 0.08235320219175549, "learning_rate": 8.353386939681186e-06, "loss": 0.0175, "num_tokens": 69836602.0, "step": 156 }, { "epoch": 1.5412054120541205, "grad_norm": 0.0735540837139594, "learning_rate": 8.331200357220908e-06, "loss": 0.0194, "num_tokens": 70283814.0, "step": 157 }, { "epoch": 1.5510455104551046, "grad_norm": 0.07322399084658018, "learning_rate": 8.308899272255542e-06, "loss": 0.0184, "num_tokens": 70726284.0, "step": 158 }, { "epoch": 1.5608856088560885, "grad_norm": 0.07790348390650517, "learning_rate": 8.286484586709989e-06, "loss": 0.0183, "num_tokens": 71155169.0, "step": 159 }, { "epoch": 1.5707257072570726, "grad_norm": 0.08611809383964489, "learning_rate": 8.263957207103506e-06, "loss": 0.0205, "num_tokens": 71591204.0, "step": 160 }, { "epoch": 1.5805658056580567, "grad_norm": 0.0706229845173915, "learning_rate": 8.241318044513046e-06, "loss": 0.0277, "num_tokens": 72032119.0, "step": 161 }, { "epoch": 1.5904059040590406, "grad_norm": 0.09019039164269532, "learning_rate": 8.218568014536414e-06, "loss": 0.0176, "num_tokens": 72492164.0, "step": 162 }, { "epoch": 1.6002460024600245, "grad_norm": 0.07947315916491103, "learning_rate": 8.195708037255233e-06, "loss": 0.0202, "num_tokens": 72962752.0, "step": 163 }, { "epoch": 1.6100861008610086, "grad_norm": 0.06840189166732885, "learning_rate": 8.172739037197739e-06, "loss": 0.018, "num_tokens": 73415974.0, "step": 164 }, { "epoch": 1.6199261992619927, "grad_norm": 0.07366616747573093, "learning_rate": 8.149661943301382e-06, "loss": 0.0181, "num_tokens": 73882834.0, "step": 165 }, { "epoch": 1.6297662976629765, "grad_norm": 0.07081012920317416, "learning_rate": 8.126477688875262e-06, "loss": 0.0204, "num_tokens": 74321580.0, "step": 166 }, { "epoch": 1.6396063960639606, "grad_norm": 0.07863097311534642, "learning_rate": 8.103187211562386e-06, "loss": 0.0229, "num_tokens": 74781751.0, "step": 167 }, { "epoch": 1.6494464944649447, "grad_norm": 0.10797044478776457, "learning_rate": 8.079791453301742e-06, "loss": 0.0287, "num_tokens": 75219935.0, "step": 168 }, { "epoch": 1.6592865928659286, "grad_norm": 0.07041534985061697, "learning_rate": 8.056291360290202e-06, "loss": 0.0248, "num_tokens": 75665232.0, "step": 169 }, { "epoch": 1.6691266912669127, "grad_norm": 0.08695303118518641, "learning_rate": 8.032687882944264e-06, "loss": 0.0193, "num_tokens": 76087411.0, "step": 170 }, { "epoch": 1.6789667896678968, "grad_norm": 0.06704813880798238, "learning_rate": 8.0089819758616e-06, "loss": 0.0169, "num_tokens": 76529931.0, "step": 171 }, { "epoch": 1.6888068880688807, "grad_norm": 0.06935996975041725, "learning_rate": 7.985174597782469e-06, "loss": 0.0197, "num_tokens": 76974869.0, "step": 172 }, { "epoch": 1.6986469864698646, "grad_norm": 0.0812644475398725, "learning_rate": 7.961266711550922e-06, "loss": 0.0259, "num_tokens": 77413009.0, "step": 173 }, { "epoch": 1.7084870848708487, "grad_norm": 0.07469198601302375, "learning_rate": 7.937259284075872e-06, "loss": 0.0191, "num_tokens": 77854298.0, "step": 174 }, { "epoch": 1.7183271832718328, "grad_norm": 0.07554209425696685, "learning_rate": 7.913153286291995e-06, "loss": 0.025, "num_tokens": 78299682.0, "step": 175 }, { "epoch": 1.7281672816728166, "grad_norm": 0.07564661483692575, "learning_rate": 7.888949693120443e-06, "loss": 0.0172, "num_tokens": 78723460.0, "step": 176 }, { "epoch": 1.7380073800738007, "grad_norm": 0.6264202015289688, "learning_rate": 7.864649483429442e-06, "loss": 0.0402, "num_tokens": 79151526.0, "step": 177 }, { "epoch": 1.7478474784747848, "grad_norm": 0.07431323606896861, "learning_rate": 7.840253639994676e-06, "loss": 0.0182, "num_tokens": 79591692.0, "step": 178 }, { "epoch": 1.7576875768757687, "grad_norm": 0.07199128250127072, "learning_rate": 7.815763149459563e-06, "loss": 0.018, "num_tokens": 80054397.0, "step": 179 }, { "epoch": 1.7675276752767528, "grad_norm": 0.0736771332831437, "learning_rate": 7.791179002295334e-06, "loss": 0.0182, "num_tokens": 80527436.0, "step": 180 }, { "epoch": 1.777367773677737, "grad_norm": 0.0722896910687323, "learning_rate": 7.766502192760995e-06, "loss": 0.0299, "num_tokens": 80984085.0, "step": 181 }, { "epoch": 1.7872078720787208, "grad_norm": 0.13146348676004535, "learning_rate": 7.741733718863096e-06, "loss": 0.0172, "num_tokens": 81417093.0, "step": 182 }, { "epoch": 1.7970479704797047, "grad_norm": 0.07559775090622188, "learning_rate": 7.71687458231538e-06, "loss": 0.0173, "num_tokens": 81857802.0, "step": 183 }, { "epoch": 1.8068880688806888, "grad_norm": 0.07625026619956689, "learning_rate": 7.69192578849827e-06, "loss": 0.0174, "num_tokens": 82314635.0, "step": 184 }, { "epoch": 1.8167281672816729, "grad_norm": 0.07079163666898536, "learning_rate": 7.666888346418205e-06, "loss": 0.0255, "num_tokens": 82774404.0, "step": 185 }, { "epoch": 1.8265682656826567, "grad_norm": 0.07862230056744444, "learning_rate": 7.641763268666832e-06, "loss": 0.0166, "num_tokens": 83224858.0, "step": 186 }, { "epoch": 1.8364083640836408, "grad_norm": 0.07767548895299481, "learning_rate": 7.616551571380061e-06, "loss": 0.0303, "num_tokens": 83685638.0, "step": 187 }, { "epoch": 1.846248462484625, "grad_norm": 0.0767555813557926, "learning_rate": 7.5912542741969585e-06, "loss": 0.0173, "num_tokens": 84118329.0, "step": 188 }, { "epoch": 1.8560885608856088, "grad_norm": 0.06505326217418561, "learning_rate": 7.5658724002185215e-06, "loss": 0.2302, "num_tokens": 84642441.0, "step": 189 }, { "epoch": 1.865928659286593, "grad_norm": 0.9831912884395022, "learning_rate": 7.54040697596629e-06, "loss": 0.0173, "num_tokens": 85075013.0, "step": 190 }, { "epoch": 1.875768757687577, "grad_norm": 0.0852074767092427, "learning_rate": 7.514859031340835e-06, "loss": 0.0197, "num_tokens": 85539398.0, "step": 191 }, { "epoch": 1.8856088560885609, "grad_norm": 0.07502455159038045, "learning_rate": 7.489229599580111e-06, "loss": 0.0167, "num_tokens": 85976652.0, "step": 192 }, { "epoch": 1.8954489544895448, "grad_norm": 0.07796568336104527, "learning_rate": 7.463519717217663e-06, "loss": 0.0253, "num_tokens": 86404836.0, "step": 193 }, { "epoch": 1.9052890528905289, "grad_norm": 0.07733304316410633, "learning_rate": 7.437730424040702e-06, "loss": 0.0232, "num_tokens": 86871021.0, "step": 194 }, { "epoch": 1.915129151291513, "grad_norm": 0.07837311923363188, "learning_rate": 7.411862763048068e-06, "loss": 0.0228, "num_tokens": 87328297.0, "step": 195 }, { "epoch": 1.9249692496924968, "grad_norm": 0.07159308881612252, "learning_rate": 7.38591778040803e-06, "loss": 0.0178, "num_tokens": 87780478.0, "step": 196 }, { "epoch": 1.934809348093481, "grad_norm": 0.06995284279442164, "learning_rate": 7.359896525415986e-06, "loss": 0.0166, "num_tokens": 88245218.0, "step": 197 }, { "epoch": 1.944649446494465, "grad_norm": 0.074185946727602, "learning_rate": 7.333800050452024e-06, "loss": 0.0335, "num_tokens": 88720048.0, "step": 198 }, { "epoch": 1.954489544895449, "grad_norm": 0.0936664061322253, "learning_rate": 7.307629410938364e-06, "loss": 0.0156, "num_tokens": 89171687.0, "step": 199 }, { "epoch": 1.964329643296433, "grad_norm": 0.06592479834851843, "learning_rate": 7.281385665296663e-06, "loss": 0.0162, "num_tokens": 89636320.0, "step": 200 }, { "epoch": 1.974169741697417, "grad_norm": 0.08486840853612633, "learning_rate": 7.255069874905221e-06, "loss": 0.0177, "num_tokens": 90074778.0, "step": 201 }, { "epoch": 1.984009840098401, "grad_norm": 0.06923307537599123, "learning_rate": 7.228683104056051e-06, "loss": 0.0168, "num_tokens": 90519743.0, "step": 202 }, { "epoch": 1.9938499384993849, "grad_norm": 0.09250588119689185, "learning_rate": 7.202226419911832e-06, "loss": 0.0266, "num_tokens": 90971202.0, "step": 203 }, { "epoch": 2.0, "grad_norm": 0.08932358174959376, "learning_rate": 7.175700892462757e-06, "loss": 0.0167, "num_tokens": 91183681.0, "step": 204 }, { "epoch": 2.0, "eval_loss": 0.08808860927820206, "eval_num_tokens": 91183681.0, "eval_runtime": 53.9315, "eval_samples_per_second": 41.645, "eval_steps_per_second": 5.21, "step": 204 }, { "epoch": 2.009840098400984, "grad_norm": 0.07874869315833909, "learning_rate": 7.149107594483251e-06, "loss": 0.0142, "num_tokens": 91625671.0, "step": 205 }, { "epoch": 2.019680196801968, "grad_norm": 0.06385620551213778, "learning_rate": 7.122447601488592e-06, "loss": 0.0132, "num_tokens": 92071488.0, "step": 206 }, { "epoch": 2.029520295202952, "grad_norm": 0.06846197400142105, "learning_rate": 7.095721991691411e-06, "loss": 0.0149, "num_tokens": 92542156.0, "step": 207 }, { "epoch": 2.039360393603936, "grad_norm": 0.07424945414823086, "learning_rate": 7.0689318459580845e-06, "loss": 0.0156, "num_tokens": 93002703.0, "step": 208 }, { "epoch": 2.0492004920049203, "grad_norm": 0.06687580312011086, "learning_rate": 7.042078247765019e-06, "loss": 0.0135, "num_tokens": 93436834.0, "step": 209 }, { "epoch": 2.059040590405904, "grad_norm": 0.07720021453648518, "learning_rate": 7.015162283154843e-06, "loss": 0.0137, "num_tokens": 93871635.0, "step": 210 }, { "epoch": 2.068880688806888, "grad_norm": 0.13453391743262458, "learning_rate": 6.988185040692469e-06, "loss": 0.0221, "num_tokens": 94314058.0, "step": 211 }, { "epoch": 2.078720787207872, "grad_norm": 0.07982223152072775, "learning_rate": 6.961147611421076e-06, "loss": 0.017, "num_tokens": 94750976.0, "step": 212 }, { "epoch": 2.088560885608856, "grad_norm": 0.06995730861373262, "learning_rate": 6.934051088817988e-06, "loss": 0.0137, "num_tokens": 95193789.0, "step": 213 }, { "epoch": 2.09840098400984, "grad_norm": 0.07438600726959783, "learning_rate": 6.906896568750441e-06, "loss": 0.0193, "num_tokens": 95676386.0, "step": 214 }, { "epoch": 2.108241082410824, "grad_norm": 0.09331884860488432, "learning_rate": 6.87968514943127e-06, "loss": 0.0154, "num_tokens": 96137917.0, "step": 215 }, { "epoch": 2.1180811808118083, "grad_norm": 0.06703452835053635, "learning_rate": 6.852417931374494e-06, "loss": 0.0134, "num_tokens": 96568059.0, "step": 216 }, { "epoch": 2.127921279212792, "grad_norm": 0.07093081986870549, "learning_rate": 6.825096017350807e-06, "loss": 0.0138, "num_tokens": 97019588.0, "step": 217 }, { "epoch": 2.137761377613776, "grad_norm": 0.0650948479503258, "learning_rate": 6.797720512342967e-06, "loss": 0.0137, "num_tokens": 97456418.0, "step": 218 }, { "epoch": 2.14760147601476, "grad_norm": 0.06693139683273135, "learning_rate": 6.77029252350113e-06, "loss": 0.0142, "num_tokens": 97874765.0, "step": 219 }, { "epoch": 2.1574415744157442, "grad_norm": 0.07881816970778455, "learning_rate": 6.742813160098054e-06, "loss": 0.0188, "num_tokens": 98322373.0, "step": 220 }, { "epoch": 2.167281672816728, "grad_norm": 0.07381706020969016, "learning_rate": 6.715283533484242e-06, "loss": 0.0125, "num_tokens": 98762055.0, "step": 221 }, { "epoch": 2.177121771217712, "grad_norm": 0.06829050170688594, "learning_rate": 6.6877047570430044e-06, "loss": 0.0147, "num_tokens": 99212257.0, "step": 222 }, { "epoch": 2.1869618696186963, "grad_norm": 0.0726323898489312, "learning_rate": 6.660077946145412e-06, "loss": 0.0149, "num_tokens": 99651696.0, "step": 223 }, { "epoch": 2.19680196801968, "grad_norm": 0.06996376101830218, "learning_rate": 6.632404218105205e-06, "loss": 0.014, "num_tokens": 100115333.0, "step": 224 }, { "epoch": 2.206642066420664, "grad_norm": 0.07058857975728597, "learning_rate": 6.604684692133597e-06, "loss": 0.0128, "num_tokens": 100567168.0, "step": 225 }, { "epoch": 2.2164821648216484, "grad_norm": 0.06705830086377462, "learning_rate": 6.576920489294011e-06, "loss": 0.014, "num_tokens": 101017414.0, "step": 226 }, { "epoch": 2.2263222632226323, "grad_norm": 0.08216121325842957, "learning_rate": 6.549112732456739e-06, "loss": 0.0244, "num_tokens": 101478653.0, "step": 227 }, { "epoch": 2.236162361623616, "grad_norm": 0.06604918422838713, "learning_rate": 6.5212625462535365e-06, "loss": 0.0133, "num_tokens": 101922998.0, "step": 228 }, { "epoch": 2.2460024600246005, "grad_norm": 0.06450225948970358, "learning_rate": 6.493371057032129e-06, "loss": 0.0149, "num_tokens": 102357947.0, "step": 229 }, { "epoch": 2.2558425584255843, "grad_norm": 0.07514996917424294, "learning_rate": 6.465439392810664e-06, "loss": 0.0167, "num_tokens": 102803832.0, "step": 230 }, { "epoch": 2.265682656826568, "grad_norm": 0.06462428507734051, "learning_rate": 6.4374686832320944e-06, "loss": 0.0142, "num_tokens": 103241692.0, "step": 231 }, { "epoch": 2.275522755227552, "grad_norm": 0.06485952063828938, "learning_rate": 6.409460059518482e-06, "loss": 0.0136, "num_tokens": 103688326.0, "step": 232 }, { "epoch": 2.2853628536285364, "grad_norm": 0.06533997999817706, "learning_rate": 6.381414654425261e-06, "loss": 0.0131, "num_tokens": 104139997.0, "step": 233 }, { "epoch": 2.2952029520295203, "grad_norm": 0.06878268907753365, "learning_rate": 6.353333602195414e-06, "loss": 0.0138, "num_tokens": 104583247.0, "step": 234 }, { "epoch": 2.305043050430504, "grad_norm": 0.061527579151490784, "learning_rate": 6.325218038513604e-06, "loss": 0.0129, "num_tokens": 105013546.0, "step": 235 }, { "epoch": 2.3148831488314885, "grad_norm": 0.0688594189041464, "learning_rate": 6.2970691004602425e-06, "loss": 0.0147, "num_tokens": 105469533.0, "step": 236 }, { "epoch": 2.3247232472324724, "grad_norm": 0.07212293085873876, "learning_rate": 6.26888792646551e-06, "loss": 0.0138, "num_tokens": 105902012.0, "step": 237 }, { "epoch": 2.3345633456334562, "grad_norm": 0.07097729248579715, "learning_rate": 6.240675656263303e-06, "loss": 0.0133, "num_tokens": 106319708.0, "step": 238 }, { "epoch": 2.34440344403444, "grad_norm": 0.0702207231329528, "learning_rate": 6.212433430845145e-06, "loss": 0.0136, "num_tokens": 106767770.0, "step": 239 }, { "epoch": 2.3542435424354244, "grad_norm": 0.06717197740035392, "learning_rate": 6.184162392414044e-06, "loss": 0.0127, "num_tokens": 107230010.0, "step": 240 }, { "epoch": 2.3640836408364083, "grad_norm": 0.09206853570190297, "learning_rate": 6.155863684338294e-06, "loss": 0.0182, "num_tokens": 107696665.0, "step": 241 }, { "epoch": 2.373923739237392, "grad_norm": 0.07931539686074184, "learning_rate": 6.127538451105232e-06, "loss": 0.0156, "num_tokens": 108145998.0, "step": 242 }, { "epoch": 2.3837638376383765, "grad_norm": 0.0845167365221342, "learning_rate": 6.099187838274959e-06, "loss": 0.0304, "num_tokens": 108605347.0, "step": 243 }, { "epoch": 2.3936039360393604, "grad_norm": 0.8319925155014395, "learning_rate": 6.070812992434003e-06, "loss": 0.077, "num_tokens": 109053120.0, "step": 244 }, { "epoch": 2.4034440344403443, "grad_norm": 0.08254084053779843, "learning_rate": 6.042415061148954e-06, "loss": 0.0153, "num_tokens": 109511574.0, "step": 245 }, { "epoch": 2.4132841328413286, "grad_norm": 0.07621464852457635, "learning_rate": 6.013995192920044e-06, "loss": 0.013, "num_tokens": 109961861.0, "step": 246 }, { "epoch": 2.4231242312423125, "grad_norm": 0.06290755400921484, "learning_rate": 5.985554537134702e-06, "loss": 0.0133, "num_tokens": 110439530.0, "step": 247 }, { "epoch": 2.4329643296432963, "grad_norm": 0.06549923207889226, "learning_rate": 5.957094244021071e-06, "loss": 0.0133, "num_tokens": 110902468.0, "step": 248 }, { "epoch": 2.4428044280442807, "grad_norm": 0.06398296126869986, "learning_rate": 5.928615464601497e-06, "loss": 0.0128, "num_tokens": 111361759.0, "step": 249 }, { "epoch": 2.4526445264452645, "grad_norm": 0.062244715362799644, "learning_rate": 5.900119350645956e-06, "loss": 0.0128, "num_tokens": 111799435.0, "step": 250 }, { "epoch": 2.4624846248462484, "grad_norm": 0.06503161600374163, "learning_rate": 5.871607054625497e-06, "loss": 0.0128, "num_tokens": 112244747.0, "step": 251 }, { "epoch": 2.4723247232472323, "grad_norm": 0.08086590997362891, "learning_rate": 5.8430797296656125e-06, "loss": 0.0184, "num_tokens": 112678903.0, "step": 252 }, { "epoch": 2.4821648216482166, "grad_norm": 0.07239451855920867, "learning_rate": 5.814538529499622e-06, "loss": 0.0149, "num_tokens": 113132832.0, "step": 253 }, { "epoch": 2.4920049200492005, "grad_norm": 0.06030312987290577, "learning_rate": 5.785984608421993e-06, "loss": 0.0127, "num_tokens": 113568429.0, "step": 254 }, { "epoch": 2.5018450184501844, "grad_norm": 0.06349775541516244, "learning_rate": 5.757419121241667e-06, "loss": 0.0125, "num_tokens": 114042240.0, "step": 255 }, { "epoch": 2.5116851168511687, "grad_norm": 0.06952013750985335, "learning_rate": 5.7288432232353615e-06, "loss": 0.0204, "num_tokens": 114496441.0, "step": 256 }, { "epoch": 2.5215252152521526, "grad_norm": 0.0958262233433174, "learning_rate": 5.7002580701008325e-06, "loss": 0.0149, "num_tokens": 114936236.0, "step": 257 }, { "epoch": 2.5313653136531364, "grad_norm": 0.06572975411347728, "learning_rate": 5.6716648179101445e-06, "loss": 0.0123, "num_tokens": 115365529.0, "step": 258 }, { "epoch": 2.5412054120541203, "grad_norm": 0.07287254897275752, "learning_rate": 5.64306462306291e-06, "loss": 0.0177, "num_tokens": 115812361.0, "step": 259 }, { "epoch": 2.5510455104551046, "grad_norm": 0.0677506186552676, "learning_rate": 5.614458642239534e-06, "loss": 0.0126, "num_tokens": 116269752.0, "step": 260 }, { "epoch": 2.5608856088560885, "grad_norm": 0.07088790175345892, "learning_rate": 5.585848032354411e-06, "loss": 0.0139, "num_tokens": 116739082.0, "step": 261 }, { "epoch": 2.570725707257073, "grad_norm": 2.483507979054926, "learning_rate": 5.557233950509159e-06, "loss": 0.3298, "num_tokens": 117236975.0, "step": 262 }, { "epoch": 2.5805658056580567, "grad_norm": 0.6712341553033803, "learning_rate": 5.528617553945807e-06, "loss": 0.0131, "num_tokens": 117701799.0, "step": 263 }, { "epoch": 2.5904059040590406, "grad_norm": 0.070379027103792, "learning_rate": 5.500000000000001e-06, "loss": 0.019, "num_tokens": 118190544.0, "step": 264 }, { "epoch": 2.6002460024600245, "grad_norm": 0.09944926431551483, "learning_rate": 5.4713824460541964e-06, "loss": 0.0153, "num_tokens": 118625146.0, "step": 265 }, { "epoch": 2.6100861008610083, "grad_norm": 0.07370939155932825, "learning_rate": 5.442766049490843e-06, "loss": 0.0138, "num_tokens": 119077739.0, "step": 266 }, { "epoch": 2.6199261992619927, "grad_norm": 0.06555516765204612, "learning_rate": 5.414151967645591e-06, "loss": 0.0136, "num_tokens": 119502701.0, "step": 267 }, { "epoch": 2.6297662976629765, "grad_norm": 0.060577987544993946, "learning_rate": 5.385541357760469e-06, "loss": 0.0121, "num_tokens": 119956823.0, "step": 268 }, { "epoch": 2.639606396063961, "grad_norm": 0.06969958736256228, "learning_rate": 5.35693537693709e-06, "loss": 0.0131, "num_tokens": 120410284.0, "step": 269 }, { "epoch": 2.6494464944649447, "grad_norm": 0.08178808292429539, "learning_rate": 5.3283351820898586e-06, "loss": 0.0183, "num_tokens": 120837514.0, "step": 270 }, { "epoch": 2.6592865928659286, "grad_norm": 0.12228602708630738, "learning_rate": 5.299741929899171e-06, "loss": 0.0206, "num_tokens": 121266377.0, "step": 271 }, { "epoch": 2.6691266912669125, "grad_norm": 0.07647057417070459, "learning_rate": 5.27115677676464e-06, "loss": 0.0154, "num_tokens": 121730907.0, "step": 272 }, { "epoch": 2.678966789667897, "grad_norm": 0.07263570161343703, "learning_rate": 5.242580878758334e-06, "loss": 0.0138, "num_tokens": 122162564.0, "step": 273 }, { "epoch": 2.6888068880688807, "grad_norm": 0.07390794347850005, "learning_rate": 5.21401539157801e-06, "loss": 0.0131, "num_tokens": 122644233.0, "step": 274 }, { "epoch": 2.6986469864698646, "grad_norm": 0.05624120433704004, "learning_rate": 5.1854614705003796e-06, "loss": 0.0114, "num_tokens": 123070674.0, "step": 275 }, { "epoch": 2.708487084870849, "grad_norm": 0.07371873132309133, "learning_rate": 5.156920270334389e-06, "loss": 0.0194, "num_tokens": 123517476.0, "step": 276 }, { "epoch": 2.7183271832718328, "grad_norm": 0.06758978472435712, "learning_rate": 5.1283929453745055e-06, "loss": 0.0129, "num_tokens": 123957650.0, "step": 277 }, { "epoch": 2.7281672816728166, "grad_norm": 0.06857276382476074, "learning_rate": 5.099880649354044e-06, "loss": 0.0125, "num_tokens": 124423561.0, "step": 278 }, { "epoch": 2.7380073800738005, "grad_norm": 0.06198166285648246, "learning_rate": 5.071384535398505e-06, "loss": 0.0119, "num_tokens": 124871204.0, "step": 279 }, { "epoch": 2.747847478474785, "grad_norm": 0.05801997208341688, "learning_rate": 5.04290575597893e-06, "loss": 0.0119, "num_tokens": 125320936.0, "step": 280 }, { "epoch": 2.7576875768757687, "grad_norm": 0.09983800531852628, "learning_rate": 5.0144454628653015e-06, "loss": 0.0157, "num_tokens": 125785587.0, "step": 281 }, { "epoch": 2.767527675276753, "grad_norm": 0.05961861980322237, "learning_rate": 4.986004807079959e-06, "loss": 0.0119, "num_tokens": 126223799.0, "step": 282 }, { "epoch": 2.777367773677737, "grad_norm": 0.06887056012305312, "learning_rate": 4.957584938851048e-06, "loss": 0.0127, "num_tokens": 126674560.0, "step": 283 }, { "epoch": 2.787207872078721, "grad_norm": 0.06432285678662777, "learning_rate": 4.929187007565996e-06, "loss": 0.0124, "num_tokens": 127121758.0, "step": 284 }, { "epoch": 2.7970479704797047, "grad_norm": 0.06283306903955838, "learning_rate": 4.9008121617250425e-06, "loss": 0.0122, "num_tokens": 127564319.0, "step": 285 }, { "epoch": 2.8068880688806885, "grad_norm": 0.07395862495517919, "learning_rate": 4.87246154889477e-06, "loss": 0.0125, "num_tokens": 128014723.0, "step": 286 }, { "epoch": 2.816728167281673, "grad_norm": 0.06772968868173306, "learning_rate": 4.8441363156617085e-06, "loss": 0.026, "num_tokens": 128456573.0, "step": 287 }, { "epoch": 2.8265682656826567, "grad_norm": 0.2058477599150272, "learning_rate": 4.815837607585957e-06, "loss": 0.0313, "num_tokens": 128888085.0, "step": 288 }, { "epoch": 2.836408364083641, "grad_norm": 0.05983028509302605, "learning_rate": 4.787566569154855e-06, "loss": 0.0136, "num_tokens": 129344186.0, "step": 289 }, { "epoch": 2.846248462484625, "grad_norm": 0.1679165256737002, "learning_rate": 4.759324343736698e-06, "loss": 0.0268, "num_tokens": 129820337.0, "step": 290 }, { "epoch": 2.856088560885609, "grad_norm": 0.069693981729958, "learning_rate": 4.731112073534491e-06, "loss": 0.012, "num_tokens": 130264132.0, "step": 291 }, { "epoch": 2.8659286592865927, "grad_norm": 0.05673801969192786, "learning_rate": 4.70293089953976e-06, "loss": 0.237, "num_tokens": 130747367.0, "step": 292 }, { "epoch": 2.875768757687577, "grad_norm": 0.9244716369700087, "learning_rate": 4.674781961486399e-06, "loss": 0.0129, "num_tokens": 131189544.0, "step": 293 }, { "epoch": 2.885608856088561, "grad_norm": 0.0670539720853974, "learning_rate": 4.646666397804586e-06, "loss": 0.0127, "num_tokens": 131615817.0, "step": 294 }, { "epoch": 2.8954489544895448, "grad_norm": 0.07778029323101539, "learning_rate": 4.618585345574741e-06, "loss": 0.0136, "num_tokens": 132065833.0, "step": 295 }, { "epoch": 2.905289052890529, "grad_norm": 0.06633645417900966, "learning_rate": 4.5905399404815196e-06, "loss": 0.0119, "num_tokens": 132513181.0, "step": 296 }, { "epoch": 2.915129151291513, "grad_norm": 0.06604742202311176, "learning_rate": 4.562531316767908e-06, "loss": 0.0178, "num_tokens": 132975979.0, "step": 297 }, { "epoch": 2.924969249692497, "grad_norm": 0.06375772945002761, "learning_rate": 4.534560607189338e-06, "loss": 0.0121, "num_tokens": 133411946.0, "step": 298 }, { "epoch": 2.9348093480934807, "grad_norm": 0.0644873715390372, "learning_rate": 4.506628942967874e-06, "loss": 0.0226, "num_tokens": 133882037.0, "step": 299 }, { "epoch": 2.944649446494465, "grad_norm": 0.06122403707300358, "learning_rate": 4.478737453746464e-06, "loss": 0.0111, "num_tokens": 134338580.0, "step": 300 }, { "epoch": 2.954489544895449, "grad_norm": 0.06192995198797032, "learning_rate": 4.450887267543261e-06, "loss": 0.023, "num_tokens": 134806429.0, "step": 301 }, { "epoch": 2.9643296432964332, "grad_norm": 0.06577423487360488, "learning_rate": 4.423079510705992e-06, "loss": 0.0127, "num_tokens": 135253050.0, "step": 302 }, { "epoch": 2.974169741697417, "grad_norm": 0.061821762890230156, "learning_rate": 4.395315307866404e-06, "loss": 0.0118, "num_tokens": 135701900.0, "step": 303 }, { "epoch": 2.984009840098401, "grad_norm": 0.060295397517859534, "learning_rate": 4.3675957818947965e-06, "loss": 0.0112, "num_tokens": 136134539.0, "step": 304 }, { "epoch": 2.993849938499385, "grad_norm": 0.06204359834906306, "learning_rate": 4.33992205385459e-06, "loss": 0.0119, "num_tokens": 136581981.0, "step": 305 }, { "epoch": 3.0, "grad_norm": 0.06204359834906306, "learning_rate": 4.312295242956998e-06, "loss": 0.0109, "num_tokens": 136774441.0, "step": 306 }, { "epoch": 3.0, "eval_loss": 0.0963606908917427, "eval_num_tokens": 136774441.0, "eval_runtime": 53.9214, "eval_samples_per_second": 41.653, "eval_steps_per_second": 5.211, "step": 306 }, { "epoch": 3.009840098400984, "grad_norm": 0.08266586517900253, "learning_rate": 4.284716466515759e-06, "loss": 0.0218, "num_tokens": 137235846.0, "step": 307 }, { "epoch": 3.019680196801968, "grad_norm": 0.06025259361613064, "learning_rate": 4.257186839901948e-06, "loss": 0.01, "num_tokens": 137676575.0, "step": 308 }, { "epoch": 3.029520295202952, "grad_norm": 0.059520087712568295, "learning_rate": 4.229707476498871e-06, "loss": 0.0107, "num_tokens": 138127277.0, "step": 309 }, { "epoch": 3.039360393603936, "grad_norm": 0.060007105121960225, "learning_rate": 4.2022794876570335e-06, "loss": 0.0099, "num_tokens": 138558346.0, "step": 310 }, { "epoch": 3.0492004920049203, "grad_norm": 0.05765555936281279, "learning_rate": 4.1749039826491956e-06, "loss": 0.2021, "num_tokens": 139029117.0, "step": 311 }, { "epoch": 3.059040590405904, "grad_norm": 0.25549047851203505, "learning_rate": 4.1475820686255055e-06, "loss": 0.01, "num_tokens": 139465608.0, "step": 312 }, { "epoch": 3.068880688806888, "grad_norm": 0.05745397404349778, "learning_rate": 4.120314850568731e-06, "loss": 0.0291, "num_tokens": 139932040.0, "step": 313 }, { "epoch": 3.078720787207872, "grad_norm": 0.21571060654935606, "learning_rate": 4.093103431249563e-06, "loss": 0.011, "num_tokens": 140393810.0, "step": 314 }, { "epoch": 3.088560885608856, "grad_norm": 0.06271676867820344, "learning_rate": 4.065948911182015e-06, "loss": 0.018, "num_tokens": 140853306.0, "step": 315 }, { "epoch": 3.09840098400984, "grad_norm": 0.06529992912597996, "learning_rate": 4.038852388578925e-06, "loss": 0.0102, "num_tokens": 141293974.0, "step": 316 }, { "epoch": 3.108241082410824, "grad_norm": 0.0613594667302306, "learning_rate": 4.011814959307533e-06, "loss": 0.0101, "num_tokens": 141739396.0, "step": 317 }, { "epoch": 3.1180811808118083, "grad_norm": 0.06143281774280475, "learning_rate": 3.984837716845157e-06, "loss": 0.0098, "num_tokens": 142181417.0, "step": 318 }, { "epoch": 3.127921279212792, "grad_norm": 0.06065540767441434, "learning_rate": 3.957921752234982e-06, "loss": 0.0095, "num_tokens": 142615273.0, "step": 319 }, { "epoch": 3.137761377613776, "grad_norm": 0.0565367496699821, "learning_rate": 3.931068154041919e-06, "loss": 0.0156, "num_tokens": 143066695.0, "step": 320 }, { "epoch": 3.14760147601476, "grad_norm": 0.0928817994214938, "learning_rate": 3.904278008308589e-06, "loss": 0.0093, "num_tokens": 143543314.0, "step": 321 }, { "epoch": 3.1574415744157442, "grad_norm": 0.05348206917431186, "learning_rate": 3.877552398511409e-06, "loss": 0.0102, "num_tokens": 143978640.0, "step": 322 }, { "epoch": 3.167281672816728, "grad_norm": 0.05744861837720995, "learning_rate": 3.85089240551675e-06, "loss": 0.0096, "num_tokens": 144437143.0, "step": 323 }, { "epoch": 3.177121771217712, "grad_norm": 0.05917730480215664, "learning_rate": 3.8242991075372436e-06, "loss": 0.0103, "num_tokens": 144882614.0, "step": 324 }, { "epoch": 3.1869618696186963, "grad_norm": 0.06138753989215512, "learning_rate": 3.7977735800881687e-06, "loss": 0.01, "num_tokens": 145336615.0, "step": 325 }, { "epoch": 3.19680196801968, "grad_norm": 0.057934477141044834, "learning_rate": 3.7713168959439515e-06, "loss": 0.0097, "num_tokens": 145791703.0, "step": 326 }, { "epoch": 3.206642066420664, "grad_norm": 0.062311400511582536, "learning_rate": 3.74493012509478e-06, "loss": 0.0163, "num_tokens": 146256588.0, "step": 327 }, { "epoch": 3.2164821648216484, "grad_norm": 0.11046706497961999, "learning_rate": 3.718614334703339e-06, "loss": 0.0096, "num_tokens": 146704790.0, "step": 328 }, { "epoch": 3.2263222632226323, "grad_norm": 0.06040935915809342, "learning_rate": 3.692370589061639e-06, "loss": 0.0161, "num_tokens": 147150851.0, "step": 329 }, { "epoch": 3.236162361623616, "grad_norm": 0.06309596528426079, "learning_rate": 3.6661999495479772e-06, "loss": 0.0116, "num_tokens": 147586533.0, "step": 330 }, { "epoch": 3.2460024600246005, "grad_norm": 0.0775947611650109, "learning_rate": 3.640103474584016e-06, "loss": 0.0102, "num_tokens": 148012817.0, "step": 331 }, { "epoch": 3.2558425584255843, "grad_norm": 0.060442066581616015, "learning_rate": 3.614082219591972e-06, "loss": 0.0094, "num_tokens": 148454349.0, "step": 332 }, { "epoch": 3.265682656826568, "grad_norm": 0.0599277899760194, "learning_rate": 3.588137236951934e-06, "loss": 0.0096, "num_tokens": 148908837.0, "step": 333 }, { "epoch": 3.275522755227552, "grad_norm": 0.06389649266611047, "learning_rate": 3.5622695759592996e-06, "loss": 0.0091, "num_tokens": 149387409.0, "step": 334 }, { "epoch": 3.2853628536285364, "grad_norm": 0.059031876557593344, "learning_rate": 3.5364802827823397e-06, "loss": 0.0124, "num_tokens": 149842184.0, "step": 335 }, { "epoch": 3.2952029520295203, "grad_norm": 0.06425762134540147, "learning_rate": 3.5107704004198904e-06, "loss": 0.0096, "num_tokens": 150294624.0, "step": 336 }, { "epoch": 3.305043050430504, "grad_norm": 0.060359900802863305, "learning_rate": 3.485140968659166e-06, "loss": 0.0156, "num_tokens": 150757952.0, "step": 337 }, { "epoch": 3.3148831488314885, "grad_norm": 0.06451910432321761, "learning_rate": 3.4595930240337115e-06, "loss": 0.0093, "num_tokens": 151210941.0, "step": 338 }, { "epoch": 3.3247232472324724, "grad_norm": 0.05771756769585445, "learning_rate": 3.4341275997814795e-06, "loss": 0.0311, "num_tokens": 151659703.0, "step": 339 }, { "epoch": 3.3345633456334562, "grad_norm": 0.2709101034464869, "learning_rate": 3.408745725803042e-06, "loss": 0.0198, "num_tokens": 152096656.0, "step": 340 }, { "epoch": 3.34440344403444, "grad_norm": 0.2165805542100797, "learning_rate": 3.383448428619941e-06, "loss": 0.0109, "num_tokens": 152535937.0, "step": 341 }, { "epoch": 3.3542435424354244, "grad_norm": 0.06249104678860667, "learning_rate": 3.3582367313331692e-06, "loss": 0.0241, "num_tokens": 153012481.0, "step": 342 }, { "epoch": 3.3640836408364083, "grad_norm": 0.07444091538512662, "learning_rate": 3.3331116535817974e-06, "loss": 0.0096, "num_tokens": 153457239.0, "step": 343 }, { "epoch": 3.373923739237392, "grad_norm": 0.05744783875540723, "learning_rate": 3.308074211501732e-06, "loss": 0.0112, "num_tokens": 153885310.0, "step": 344 }, { "epoch": 3.3837638376383765, "grad_norm": 0.062108203142145886, "learning_rate": 3.2831254176846205e-06, "loss": 0.0102, "num_tokens": 154315565.0, "step": 345 }, { "epoch": 3.3936039360393604, "grad_norm": 0.06493988486024563, "learning_rate": 3.258266281136905e-06, "loss": 0.0154, "num_tokens": 154761237.0, "step": 346 }, { "epoch": 3.4034440344403443, "grad_norm": 0.07703452506780802, "learning_rate": 3.233497807239008e-06, "loss": 0.0149, "num_tokens": 155219079.0, "step": 347 }, { "epoch": 3.4132841328413286, "grad_norm": 0.07716474025857703, "learning_rate": 3.2088209977046657e-06, "loss": 0.0099, "num_tokens": 155672847.0, "step": 348 }, { "epoch": 3.4231242312423125, "grad_norm": 0.0598011605849924, "learning_rate": 3.1842368505404388e-06, "loss": 0.0097, "num_tokens": 156097592.0, "step": 349 }, { "epoch": 3.4329643296432963, "grad_norm": 0.06067024127693304, "learning_rate": 3.1597463600053258e-06, "loss": 0.0097, "num_tokens": 156543931.0, "step": 350 }, { "epoch": 3.4428044280442807, "grad_norm": 0.06276348610439125, "learning_rate": 3.135350516570559e-06, "loss": 0.0115, "num_tokens": 156993093.0, "step": 351 }, { "epoch": 3.4526445264452645, "grad_norm": 0.07056305058653452, "learning_rate": 3.111050306879556e-06, "loss": 0.0161, "num_tokens": 157435895.0, "step": 352 }, { "epoch": 3.4624846248462484, "grad_norm": 0.0692853066303934, "learning_rate": 3.0868467137080075e-06, "loss": 0.0124, "num_tokens": 157859703.0, "step": 353 }, { "epoch": 3.4723247232472323, "grad_norm": 0.06622059827297899, "learning_rate": 3.0627407159241273e-06, "loss": 0.0098, "num_tokens": 158319159.0, "step": 354 }, { "epoch": 3.4821648216482166, "grad_norm": 0.06424105970441871, "learning_rate": 3.0387332884490806e-06, "loss": 0.0105, "num_tokens": 158768974.0, "step": 355 }, { "epoch": 3.4920049200492005, "grad_norm": 0.06970655480927966, "learning_rate": 3.014825402217533e-06, "loss": 0.0099, "num_tokens": 159221319.0, "step": 356 }, { "epoch": 3.5018450184501844, "grad_norm": 0.06231852234082556, "learning_rate": 2.9910180241384014e-06, "loss": 0.0099, "num_tokens": 159657431.0, "step": 357 }, { "epoch": 3.5116851168511687, "grad_norm": 0.06403174372575768, "learning_rate": 2.9673121170557396e-06, "loss": 0.0099, "num_tokens": 160091184.0, "step": 358 }, { "epoch": 3.5215252152521526, "grad_norm": 0.06050506427522611, "learning_rate": 2.9437086397097996e-06, "loss": 0.0095, "num_tokens": 160538104.0, "step": 359 }, { "epoch": 3.5313653136531364, "grad_norm": 0.05914580967848918, "learning_rate": 2.92020854669826e-06, "loss": 0.0151, "num_tokens": 160984800.0, "step": 360 }, { "epoch": 3.5412054120541203, "grad_norm": 0.06615551474859403, "learning_rate": 2.896812788437615e-06, "loss": 0.0102, "num_tokens": 161437908.0, "step": 361 }, { "epoch": 3.5510455104551046, "grad_norm": 0.05688142632929498, "learning_rate": 2.8735223111247402e-06, "loss": 0.0094, "num_tokens": 161900209.0, "step": 362 }, { "epoch": 3.5608856088560885, "grad_norm": 0.05805719882416427, "learning_rate": 2.850338056698621e-06, "loss": 0.0094, "num_tokens": 162381378.0, "step": 363 }, { "epoch": 3.570725707257073, "grad_norm": 0.05665394777981862, "learning_rate": 2.827260962802263e-06, "loss": 0.0089, "num_tokens": 162818401.0, "step": 364 }, { "epoch": 3.5805658056580567, "grad_norm": 0.058540688861597474, "learning_rate": 2.804291962744768e-06, "loss": 0.0102, "num_tokens": 163261663.0, "step": 365 }, { "epoch": 3.5904059040590406, "grad_norm": 0.06068364561780823, "learning_rate": 2.7814319854635875e-06, "loss": 0.0096, "num_tokens": 163706510.0, "step": 366 }, { "epoch": 3.6002460024600245, "grad_norm": 0.0593859542792967, "learning_rate": 2.758681955486955e-06, "loss": 0.0097, "num_tokens": 164145145.0, "step": 367 }, { "epoch": 3.6100861008610083, "grad_norm": 0.059439587082302694, "learning_rate": 2.736042792896495e-06, "loss": 0.0104, "num_tokens": 164588218.0, "step": 368 }, { "epoch": 3.6199261992619927, "grad_norm": 0.06426940128348262, "learning_rate": 2.7135154132900133e-06, "loss": 0.0203, "num_tokens": 165039642.0, "step": 369 }, { "epoch": 3.6297662976629765, "grad_norm": 0.059031373381084176, "learning_rate": 2.691100727744458e-06, "loss": 0.0091, "num_tokens": 165502439.0, "step": 370 }, { "epoch": 3.639606396063961, "grad_norm": 0.05706397506461239, "learning_rate": 2.668799642779093e-06, "loss": 0.0106, "num_tokens": 165957611.0, "step": 371 }, { "epoch": 3.6494464944649447, "grad_norm": 0.06337690848780857, "learning_rate": 2.6466130603188157e-06, "loss": 0.01, "num_tokens": 166404741.0, "step": 372 }, { "epoch": 3.6592865928659286, "grad_norm": 0.057865704503962175, "learning_rate": 2.624541877657685e-06, "loss": 0.1951, "num_tokens": 166908892.0, "step": 373 }, { "epoch": 3.6691266912669125, "grad_norm": 0.6748913551790232, "learning_rate": 2.602586987422643e-06, "loss": 0.0094, "num_tokens": 167346017.0, "step": 374 }, { "epoch": 3.678966789667897, "grad_norm": 0.06271310429727074, "learning_rate": 2.580749277537399e-06, "loss": 0.0093, "num_tokens": 167795779.0, "step": 375 }, { "epoch": 3.6888068880688807, "grad_norm": 0.05728241738284472, "learning_rate": 2.5590296311865294e-06, "loss": 0.0092, "num_tokens": 168246613.0, "step": 376 }, { "epoch": 3.6986469864698646, "grad_norm": 0.05730319671770116, "learning_rate": 2.537428926779758e-06, "loss": 0.0104, "num_tokens": 168703193.0, "step": 377 }, { "epoch": 3.708487084870849, "grad_norm": 0.061789009881383514, "learning_rate": 2.515948037916423e-06, "loss": 0.0104, "num_tokens": 169166239.0, "step": 378 }, { "epoch": 3.7183271832718328, "grad_norm": 0.05958784070544453, "learning_rate": 2.494587833350153e-06, "loss": 0.0564, "num_tokens": 169618415.0, "step": 379 }, { "epoch": 3.7281672816728166, "grad_norm": 0.22039415728368103, "learning_rate": 2.473349176953736e-06, "loss": 0.0094, "num_tokens": 170079318.0, "step": 380 }, { "epoch": 3.7380073800738005, "grad_norm": 0.05930397129828618, "learning_rate": 2.4522329276841664e-06, "loss": 0.0198, "num_tokens": 170524571.0, "step": 381 }, { "epoch": 3.747847478474785, "grad_norm": 0.06047568038440854, "learning_rate": 2.431239939547921e-06, "loss": 0.0094, "num_tokens": 170983016.0, "step": 382 }, { "epoch": 3.7576875768757687, "grad_norm": 0.061680315681806853, "learning_rate": 2.4103710615664145e-06, "loss": 0.0089, "num_tokens": 171426486.0, "step": 383 }, { "epoch": 3.767527675276753, "grad_norm": 0.05588539351574886, "learning_rate": 2.389627137741662e-06, "loss": 0.0094, "num_tokens": 171871834.0, "step": 384 }, { "epoch": 3.777367773677737, "grad_norm": 0.061780123368904795, "learning_rate": 2.369009007022146e-06, "loss": 0.0093, "num_tokens": 172337523.0, "step": 385 }, { "epoch": 3.787207872078721, "grad_norm": 0.05632561272908436, "learning_rate": 2.3485175032688865e-06, "loss": 0.0088, "num_tokens": 172775826.0, "step": 386 }, { "epoch": 3.7970479704797047, "grad_norm": 0.058782272770165275, "learning_rate": 2.328153455221717e-06, "loss": 0.0095, "num_tokens": 173234709.0, "step": 387 }, { "epoch": 3.8068880688806885, "grad_norm": 0.057526356469471435, "learning_rate": 2.3079176864657673e-06, "loss": 0.0097, "num_tokens": 173700055.0, "step": 388 }, { "epoch": 3.816728167281673, "grad_norm": 0.06609619441495819, "learning_rate": 2.2878110153981565e-06, "loss": 0.0111, "num_tokens": 174147961.0, "step": 389 }, { "epoch": 3.8265682656826567, "grad_norm": 0.06703233332357492, "learning_rate": 2.267834255194894e-06, "loss": 0.0116, "num_tokens": 174586991.0, "step": 390 }, { "epoch": 3.836408364083641, "grad_norm": 0.06522848493729735, "learning_rate": 2.2479882137779903e-06, "loss": 0.0106, "num_tokens": 175006875.0, "step": 391 }, { "epoch": 3.846248462484625, "grad_norm": 0.06306752932488521, "learning_rate": 2.228273693782784e-06, "loss": 0.0094, "num_tokens": 175451007.0, "step": 392 }, { "epoch": 3.856088560885609, "grad_norm": 0.062263756072231294, "learning_rate": 2.208691492525481e-06, "loss": 0.0135, "num_tokens": 175896902.0, "step": 393 }, { "epoch": 3.8659286592865927, "grad_norm": 0.06835430681220003, "learning_rate": 2.189242401970908e-06, "loss": 0.0092, "num_tokens": 176346616.0, "step": 394 }, { "epoch": 3.875768757687577, "grad_norm": 0.05728313379563115, "learning_rate": 2.169927208700482e-06, "loss": 0.0098, "num_tokens": 176802124.0, "step": 395 }, { "epoch": 3.885608856088561, "grad_norm": 0.06299115193931754, "learning_rate": 2.1507466938804013e-06, "loss": 0.0089, "num_tokens": 177233961.0, "step": 396 }, { "epoch": 3.8954489544895448, "grad_norm": 0.060076198285498296, "learning_rate": 2.131701633230045e-06, "loss": 0.0098, "num_tokens": 177684662.0, "step": 397 }, { "epoch": 3.905289052890529, "grad_norm": 0.06517531508961912, "learning_rate": 2.112792796990616e-06, "loss": 0.0095, "num_tokens": 178123825.0, "step": 398 }, { "epoch": 3.915129151291513, "grad_norm": 0.05863263973572925, "learning_rate": 2.0940209498939732e-06, "loss": 0.009, "num_tokens": 178562641.0, "step": 399 }, { "epoch": 3.924969249692497, "grad_norm": 0.05798991563312477, "learning_rate": 2.075386851131711e-06, "loss": 0.0094, "num_tokens": 179007017.0, "step": 400 }, { "epoch": 3.9348093480934807, "grad_norm": 0.06118488260559937, "learning_rate": 2.056891254324459e-06, "loss": 0.0095, "num_tokens": 179449125.0, "step": 401 }, { "epoch": 3.944649446494465, "grad_norm": 0.06403534407994695, "learning_rate": 2.038534907491396e-06, "loss": 0.009, "num_tokens": 179887646.0, "step": 402 }, { "epoch": 3.954489544895449, "grad_norm": 0.08058699039926022, "learning_rate": 2.0203185530199983e-06, "loss": 0.0138, "num_tokens": 180341944.0, "step": 403 }, { "epoch": 3.9643296432964332, "grad_norm": 0.056026267406971995, "learning_rate": 2.0022429276360256e-06, "loss": 0.0097, "num_tokens": 180787775.0, "step": 404 }, { "epoch": 3.974169741697417, "grad_norm": 0.058787256460149456, "learning_rate": 1.9843087623737097e-06, "loss": 0.0088, "num_tokens": 181276015.0, "step": 405 }, { "epoch": 3.984009840098401, "grad_norm": 0.054638072869340186, "learning_rate": 1.966516782546199e-06, "loss": 0.009, "num_tokens": 181724759.0, "step": 406 }, { "epoch": 3.993849938499385, "grad_norm": 0.05931097745374889, "learning_rate": 1.94886770771623e-06, "loss": 0.0098, "num_tokens": 182165821.0, "step": 407 }, { "epoch": 4.0, "grad_norm": 0.06697953375930626, "learning_rate": 1.931362251667008e-06, "loss": 0.027, "num_tokens": 182364260.0, "step": 408 }, { "epoch": 4.0, "eval_loss": 0.1028980016708374, "eval_num_tokens": 182364260.0, "eval_runtime": 53.8919, "eval_samples_per_second": 41.676, "eval_steps_per_second": 5.214, "step": 408 }, { "epoch": 4.009840098400984, "grad_norm": 0.07632643003764507, "learning_rate": 1.9140011223733576e-06, "loss": 0.0082, "num_tokens": 182806025.0, "step": 409 }, { "epoch": 4.019680196801968, "grad_norm": 0.05437436276939388, "learning_rate": 1.8967850219730799e-06, "loss": 0.0081, "num_tokens": 183278654.0, "step": 410 }, { "epoch": 4.029520295202952, "grad_norm": 0.05114318878211908, "learning_rate": 1.8797146467385604e-06, "loss": 0.0076, "num_tokens": 183720645.0, "step": 411 }, { "epoch": 4.039360393603936, "grad_norm": 0.053465044974803935, "learning_rate": 1.8627906870486063e-06, "loss": 0.0082, "num_tokens": 184191637.0, "step": 412 }, { "epoch": 4.04920049200492, "grad_norm": 0.054542981072468875, "learning_rate": 1.8460138273605265e-06, "loss": 0.008, "num_tokens": 184634141.0, "step": 413 }, { "epoch": 4.059040590405904, "grad_norm": 0.052414283521576004, "learning_rate": 1.8293847461824538e-06, "loss": 0.0079, "num_tokens": 185081741.0, "step": 414 }, { "epoch": 4.068880688806888, "grad_norm": 0.05289967674124652, "learning_rate": 1.8129041160458966e-06, "loss": 0.008, "num_tokens": 185495440.0, "step": 415 }, { "epoch": 4.078720787207872, "grad_norm": 0.0584668942852983, "learning_rate": 1.7965726034785466e-06, "loss": 0.0081, "num_tokens": 185938291.0, "step": 416 }, { "epoch": 4.088560885608856, "grad_norm": 0.05897150659800833, "learning_rate": 1.780390868977318e-06, "loss": 0.0086, "num_tokens": 186409542.0, "step": 417 }, { "epoch": 4.0984009840098405, "grad_norm": 0.05118034680985974, "learning_rate": 1.7643595669816378e-06, "loss": 0.0077, "num_tokens": 186852482.0, "step": 418 }, { "epoch": 4.108241082410824, "grad_norm": 0.05911903344070817, "learning_rate": 1.7484793458469745e-06, "loss": 0.0081, "num_tokens": 187306570.0, "step": 419 }, { "epoch": 4.118081180811808, "grad_norm": 0.058617479568280846, "learning_rate": 1.7327508478186216e-06, "loss": 0.0075, "num_tokens": 187738802.0, "step": 420 }, { "epoch": 4.127921279212792, "grad_norm": 0.05743950460862962, "learning_rate": 1.7171747090057201e-06, "loss": 0.0081, "num_tokens": 188188275.0, "step": 421 }, { "epoch": 4.137761377613776, "grad_norm": 0.0578427653677817, "learning_rate": 1.7017515593555295e-06, "loss": 0.008, "num_tokens": 188626310.0, "step": 422 }, { "epoch": 4.14760147601476, "grad_norm": 0.055381917249045204, "learning_rate": 1.6864820226279607e-06, "loss": 0.0079, "num_tokens": 189058824.0, "step": 423 }, { "epoch": 4.157441574415744, "grad_norm": 0.0566904301682134, "learning_rate": 1.6713667163703348e-06, "loss": 0.008, "num_tokens": 189488025.0, "step": 424 }, { "epoch": 4.167281672816729, "grad_norm": 0.0591657691393218, "learning_rate": 1.6564062518924202e-06, "loss": 0.0093, "num_tokens": 189949176.0, "step": 425 }, { "epoch": 4.177121771217712, "grad_norm": 0.058609260537066755, "learning_rate": 1.6416012342417056e-06, "loss": 0.0075, "num_tokens": 190405187.0, "step": 426 }, { "epoch": 4.186961869618696, "grad_norm": 0.05376660491247955, "learning_rate": 1.6269522621789246e-06, "loss": 0.0094, "num_tokens": 190839466.0, "step": 427 }, { "epoch": 4.19680196801968, "grad_norm": 0.062048025442225076, "learning_rate": 1.6124599281538452e-06, "loss": 0.02, "num_tokens": 191280153.0, "step": 428 }, { "epoch": 4.206642066420664, "grad_norm": 0.06071173185238267, "learning_rate": 1.5981248182813136e-06, "loss": 0.0073, "num_tokens": 191734314.0, "step": 429 }, { "epoch": 4.216482164821648, "grad_norm": 0.05301725414979279, "learning_rate": 1.583947512317537e-06, "loss": 0.0117, "num_tokens": 192202492.0, "step": 430 }, { "epoch": 4.226322263222632, "grad_norm": 0.06832062526218917, "learning_rate": 1.5699285836366488e-06, "loss": 0.0093, "num_tokens": 192667915.0, "step": 431 }, { "epoch": 4.236162361623617, "grad_norm": 0.05748762603533909, "learning_rate": 1.5560685992075141e-06, "loss": 0.0078, "num_tokens": 193136794.0, "step": 432 }, { "epoch": 4.2460024600246005, "grad_norm": 0.0737572203685775, "learning_rate": 1.5423681195707997e-06, "loss": 0.0073, "num_tokens": 193598491.0, "step": 433 }, { "epoch": 4.255842558425584, "grad_norm": 0.05225082250599676, "learning_rate": 1.528827698816306e-06, "loss": 0.0077, "num_tokens": 194023980.0, "step": 434 }, { "epoch": 4.265682656826568, "grad_norm": 0.05296466266803098, "learning_rate": 1.515447884560556e-06, "loss": 0.0074, "num_tokens": 194481167.0, "step": 435 }, { "epoch": 4.275522755227552, "grad_norm": 0.05336380722303185, "learning_rate": 1.502229217924649e-06, "loss": 0.0075, "num_tokens": 194915312.0, "step": 436 }, { "epoch": 4.285362853628536, "grad_norm": 0.05458180686808586, "learning_rate": 1.489172233512376e-06, "loss": 0.0076, "num_tokens": 195368266.0, "step": 437 }, { "epoch": 4.29520295202952, "grad_norm": 0.05542603913086383, "learning_rate": 1.4762774593885986e-06, "loss": 0.0081, "num_tokens": 195810914.0, "step": 438 }, { "epoch": 4.305043050430505, "grad_norm": 0.054344537083576325, "learning_rate": 1.4635454170578917e-06, "loss": 0.0072, "num_tokens": 196263940.0, "step": 439 }, { "epoch": 4.3148831488314885, "grad_norm": 0.052701156778993646, "learning_rate": 1.4509766214434535e-06, "loss": 0.0077, "num_tokens": 196718774.0, "step": 440 }, { "epoch": 4.324723247232472, "grad_norm": 0.05423178707270067, "learning_rate": 1.4385715808662787e-06, "loss": 0.008, "num_tokens": 197161519.0, "step": 441 }, { "epoch": 4.334563345633456, "grad_norm": 0.055354896441224044, "learning_rate": 1.4263307970246027e-06, "loss": 0.008, "num_tokens": 197621081.0, "step": 442 }, { "epoch": 4.34440344403444, "grad_norm": 0.05816305513011695, "learning_rate": 1.41425476497361e-06, "loss": 0.0078, "num_tokens": 198087857.0, "step": 443 }, { "epoch": 4.354243542435424, "grad_norm": 0.05127845466920968, "learning_rate": 1.4023439731054112e-06, "loss": 0.0077, "num_tokens": 198533672.0, "step": 444 }, { "epoch": 4.364083640836409, "grad_norm": 0.07067731738580797, "learning_rate": 1.390598903129296e-06, "loss": 0.0322, "num_tokens": 199022227.0, "step": 445 }, { "epoch": 4.373923739237393, "grad_norm": 0.05511218194004341, "learning_rate": 1.3790200300522413e-06, "loss": 0.0077, "num_tokens": 199462215.0, "step": 446 }, { "epoch": 4.3837638376383765, "grad_norm": 0.05735730379081794, "learning_rate": 1.3676078221597157e-06, "loss": 0.0074, "num_tokens": 199907231.0, "step": 447 }, { "epoch": 4.39360393603936, "grad_norm": 0.05442936039834661, "learning_rate": 1.3563627409967257e-06, "loss": 0.1955, "num_tokens": 200376904.0, "step": 448 }, { "epoch": 4.403444034440344, "grad_norm": 0.5930661652942222, "learning_rate": 1.3452852413491563e-06, "loss": 0.0074, "num_tokens": 200853967.0, "step": 449 }, { "epoch": 4.413284132841328, "grad_norm": 0.05077867679984549, "learning_rate": 1.3343757712253804e-06, "loss": 0.0076, "num_tokens": 201323621.0, "step": 450 }, { "epoch": 4.423124231242312, "grad_norm": 0.058807424527887606, "learning_rate": 1.3236347718381338e-06, "loss": 0.0096, "num_tokens": 201753687.0, "step": 451 }, { "epoch": 4.432964329643297, "grad_norm": 0.06001374322910319, "learning_rate": 1.3130626775866743e-06, "loss": 0.0081, "num_tokens": 202203799.0, "step": 452 }, { "epoch": 4.442804428044281, "grad_norm": 0.06273437087252197, "learning_rate": 1.3026599160392173e-06, "loss": 0.0092, "num_tokens": 202627243.0, "step": 453 }, { "epoch": 4.4526445264452645, "grad_norm": 0.06372618537836224, "learning_rate": 1.292426907915634e-06, "loss": 0.0076, "num_tokens": 203077433.0, "step": 454 }, { "epoch": 4.462484624846248, "grad_norm": 0.057948321757535656, "learning_rate": 1.2823640670704443e-06, "loss": 0.0229, "num_tokens": 203532517.0, "step": 455 }, { "epoch": 4.472324723247232, "grad_norm": 0.06607138604150303, "learning_rate": 1.2724718004760794e-06, "loss": 0.0078, "num_tokens": 203967752.0, "step": 456 }, { "epoch": 4.482164821648216, "grad_norm": 0.05725783304801458, "learning_rate": 1.2627505082064144e-06, "loss": 0.0076, "num_tokens": 204424349.0, "step": 457 }, { "epoch": 4.492004920049201, "grad_norm": 0.055427831791831646, "learning_rate": 1.2532005834205976e-06, "loss": 0.0079, "num_tokens": 204846138.0, "step": 458 }, { "epoch": 4.501845018450185, "grad_norm": 0.05460191637217484, "learning_rate": 1.2438224123471442e-06, "loss": 0.0192, "num_tokens": 205306730.0, "step": 459 }, { "epoch": 4.511685116851169, "grad_norm": 0.06279438477449967, "learning_rate": 1.2346163742683185e-06, "loss": 0.0117, "num_tokens": 205759609.0, "step": 460 }, { "epoch": 4.521525215252153, "grad_norm": 0.05702285396092694, "learning_rate": 1.2255828415047932e-06, "loss": 0.0076, "num_tokens": 206171295.0, "step": 461 }, { "epoch": 4.531365313653136, "grad_norm": 0.054521558454890394, "learning_rate": 1.216722179400592e-06, "loss": 0.0076, "num_tokens": 206639148.0, "step": 462 }, { "epoch": 4.54120541205412, "grad_norm": 0.05168283263697403, "learning_rate": 1.208034746308315e-06, "loss": 0.0068, "num_tokens": 207094260.0, "step": 463 }, { "epoch": 4.551045510455104, "grad_norm": 0.05161429329359664, "learning_rate": 1.1995208935746437e-06, "loss": 0.0081, "num_tokens": 207533375.0, "step": 464 }, { "epoch": 4.560885608856088, "grad_norm": 0.058514508257411606, "learning_rate": 1.1911809655261333e-06, "loss": 0.0081, "num_tokens": 207969517.0, "step": 465 }, { "epoch": 4.570725707257073, "grad_norm": 0.056665893017668854, "learning_rate": 1.1830152994552866e-06, "loss": 0.0086, "num_tokens": 208408117.0, "step": 466 }, { "epoch": 4.580565805658057, "grad_norm": 0.056163462620316754, "learning_rate": 1.175024225606912e-06, "loss": 0.0074, "num_tokens": 208879227.0, "step": 467 }, { "epoch": 4.590405904059041, "grad_norm": 0.05409385523794747, "learning_rate": 1.1672080671647695e-06, "loss": 0.0078, "num_tokens": 209325103.0, "step": 468 }, { "epoch": 4.6002460024600245, "grad_norm": 0.05629255243399504, "learning_rate": 1.1595671402384966e-06, "loss": 0.0102, "num_tokens": 209791894.0, "step": 469 }, { "epoch": 4.610086100861008, "grad_norm": 0.051104203707396316, "learning_rate": 1.152101753850828e-06, "loss": 0.0072, "num_tokens": 210254182.0, "step": 470 }, { "epoch": 4.619926199261993, "grad_norm": 0.05229454749737629, "learning_rate": 1.1448122099250946e-06, "loss": 0.0104, "num_tokens": 210702900.0, "step": 471 }, { "epoch": 4.629766297662977, "grad_norm": 0.060177504722208404, "learning_rate": 1.1376988032730135e-06, "loss": 0.0079, "num_tokens": 211151465.0, "step": 472 }, { "epoch": 4.639606396063961, "grad_norm": 0.05182456184289124, "learning_rate": 1.130761821582766e-06, "loss": 0.0072, "num_tokens": 211619464.0, "step": 473 }, { "epoch": 4.649446494464945, "grad_norm": 0.05574225668849545, "learning_rate": 1.1240015454073622e-06, "loss": 0.0085, "num_tokens": 212064266.0, "step": 474 }, { "epoch": 4.659286592865929, "grad_norm": 0.06359820975154429, "learning_rate": 1.1174182481532943e-06, "loss": 0.0081, "num_tokens": 212499724.0, "step": 475 }, { "epoch": 4.6691266912669125, "grad_norm": 0.05622656000305094, "learning_rate": 1.1110121960694773e-06, "loss": 0.0079, "num_tokens": 212945879.0, "step": 476 }, { "epoch": 4.678966789667896, "grad_norm": 0.06093763072714235, "learning_rate": 1.104783648236486e-06, "loss": 0.0084, "num_tokens": 213379787.0, "step": 477 }, { "epoch": 4.68880688806888, "grad_norm": 0.0543614373855231, "learning_rate": 1.0987328565560711e-06, "loss": 0.0075, "num_tokens": 213824263.0, "step": 478 }, { "epoch": 4.698646986469865, "grad_norm": 0.056905167227697236, "learning_rate": 1.0928600657409751e-06, "loss": 0.0082, "num_tokens": 214265208.0, "step": 479 }, { "epoch": 4.708487084870849, "grad_norm": 0.057351833542733925, "learning_rate": 1.0871655133050372e-06, "loss": 0.0082, "num_tokens": 214744301.0, "step": 480 }, { "epoch": 4.718327183271833, "grad_norm": 0.29349816338215157, "learning_rate": 1.081649429553581e-06, "loss": 0.0553, "num_tokens": 215194355.0, "step": 481 }, { "epoch": 4.728167281672817, "grad_norm": 0.051057953015104116, "learning_rate": 1.076312037574106e-06, "loss": 0.0074, "num_tokens": 215632060.0, "step": 482 }, { "epoch": 4.7380073800738005, "grad_norm": 0.056594540815463674, "learning_rate": 1.0711535532272632e-06, "loss": 0.0235, "num_tokens": 216097276.0, "step": 483 }, { "epoch": 4.747847478474784, "grad_norm": 0.068871190152495, "learning_rate": 1.0661741851381256e-06, "loss": 0.0077, "num_tokens": 216544463.0, "step": 484 }, { "epoch": 4.757687576875769, "grad_norm": 0.05907548729697175, "learning_rate": 1.0613741346877498e-06, "loss": 0.0084, "num_tokens": 216972058.0, "step": 485 }, { "epoch": 4.767527675276753, "grad_norm": 0.055592377746762095, "learning_rate": 1.056753596005032e-06, "loss": 0.0074, "num_tokens": 217401900.0, "step": 486 }, { "epoch": 4.777367773677737, "grad_norm": 0.05562394957573223, "learning_rate": 1.0523127559588579e-06, "loss": 0.0075, "num_tokens": 217845453.0, "step": 487 }, { "epoch": 4.787207872078721, "grad_norm": 0.05258367575789477, "learning_rate": 1.0480517941505428e-06, "loss": 0.0073, "num_tokens": 218272871.0, "step": 488 }, { "epoch": 4.797047970479705, "grad_norm": 0.05390618674507445, "learning_rate": 1.0439708829065708e-06, "loss": 0.0078, "num_tokens": 218732597.0, "step": 489 }, { "epoch": 4.8068880688806885, "grad_norm": 0.06946151381547928, "learning_rate": 1.0400701872716227e-06, "loss": 0.0223, "num_tokens": 219194340.0, "step": 490 }, { "epoch": 4.816728167281672, "grad_norm": 0.05582170906207444, "learning_rate": 1.0363498650019023e-06, "loss": 0.0077, "num_tokens": 219673692.0, "step": 491 }, { "epoch": 4.826568265682657, "grad_norm": 0.05244987983803676, "learning_rate": 1.0328100665587573e-06, "loss": 0.0073, "num_tokens": 220118246.0, "step": 492 }, { "epoch": 4.836408364083641, "grad_norm": 0.055024340070040305, "learning_rate": 1.029450935102592e-06, "loss": 0.0077, "num_tokens": 220555806.0, "step": 493 }, { "epoch": 4.846248462484625, "grad_norm": 0.05338628090134423, "learning_rate": 1.0262726064870801e-06, "loss": 0.0073, "num_tokens": 220997187.0, "step": 494 }, { "epoch": 4.856088560885609, "grad_norm": 0.058254094197714025, "learning_rate": 1.0232752092536666e-06, "loss": 0.0074, "num_tokens": 221434681.0, "step": 495 }, { "epoch": 4.865928659286593, "grad_norm": 0.05261616134189719, "learning_rate": 1.0204588646263731e-06, "loss": 0.0074, "num_tokens": 221884850.0, "step": 496 }, { "epoch": 4.875768757687577, "grad_norm": 0.052167915998619634, "learning_rate": 1.0178236865068933e-06, "loss": 0.0072, "num_tokens": 222333225.0, "step": 497 }, { "epoch": 4.885608856088561, "grad_norm": 0.06187153122740552, "learning_rate": 1.0153697814699858e-06, "loss": 0.0106, "num_tokens": 222774591.0, "step": 498 }, { "epoch": 4.895448954489545, "grad_norm": 0.054905669170180534, "learning_rate": 1.0130972487591658e-06, "loss": 0.0112, "num_tokens": 223227943.0, "step": 499 }, { "epoch": 4.905289052890529, "grad_norm": 0.06206228565326619, "learning_rate": 1.0110061802826889e-06, "loss": 0.0076, "num_tokens": 223680989.0, "step": 500 }, { "epoch": 4.915129151291513, "grad_norm": 0.05437071230251554, "learning_rate": 1.009096660609837e-06, "loss": 0.1789, "num_tokens": 224171724.0, "step": 501 }, { "epoch": 4.924969249692497, "grad_norm": 0.12358300885271949, "learning_rate": 1.0073687669674949e-06, "loss": 0.0081, "num_tokens": 224621243.0, "step": 502 }, { "epoch": 4.934809348093481, "grad_norm": 0.05743551551374671, "learning_rate": 1.0058225692370299e-06, "loss": 0.0077, "num_tokens": 225053570.0, "step": 503 }, { "epoch": 4.944649446494465, "grad_norm": 0.05705289715957623, "learning_rate": 1.0044581299514638e-06, "loss": 0.0077, "num_tokens": 225475922.0, "step": 504 }, { "epoch": 4.9544895448954485, "grad_norm": 0.052608564457681, "learning_rate": 1.003275504292944e-06, "loss": 0.0072, "num_tokens": 225944888.0, "step": 505 }, { "epoch": 4.964329643296433, "grad_norm": 0.05546452983023311, "learning_rate": 1.0022747400905126e-06, "loss": 0.0079, "num_tokens": 226384045.0, "step": 506 }, { "epoch": 4.974169741697417, "grad_norm": 0.05754539826487939, "learning_rate": 1.0014558778181714e-06, "loss": 0.0073, "num_tokens": 226815343.0, "step": 507 }, { "epoch": 4.984009840098401, "grad_norm": 0.05456913560891108, "learning_rate": 1.0008189505932444e-06, "loss": 0.0084, "num_tokens": 227286168.0, "step": 508 }, { "epoch": 4.993849938499385, "grad_norm": 0.053799541560384294, "learning_rate": 1.0003639841750404e-06, "loss": 0.0076, "num_tokens": 227746824.0, "step": 509 }, { "epoch": 5.0, "grad_norm": 0.07884368824115337, "learning_rate": 1.0000909969638097e-06, "loss": 0.0089, "num_tokens": 227957450.0, "step": 510 }, { "epoch": 5.0, "eval_loss": 0.11205815523862839, "eval_num_tokens": 227957450.0, "eval_runtime": 53.843, "eval_samples_per_second": 41.714, "eval_steps_per_second": 5.219, "step": 510 }, { "epoch": 5.0, "step": 510, "total_flos": 7.689061516716278e+17, "train_loss": 0.0504409685922677, "train_runtime": 7612.3259, "train_samples_per_second": 8.537, "train_steps_per_second": 0.067 } ], "logging_steps": 1, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.689061516716278e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }