{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 510,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00984009840098401,
      "grad_norm": 3.7944442389822073,
      "learning_rate": 0.0,
      "loss": 1.2501,
      "num_tokens": 456505.0,
      "step": 1
    },
    {
      "epoch": 0.01968019680196802,
      "grad_norm": 3.778041640972742,
      "learning_rate": 6.25e-07,
      "loss": 1.2343,
      "num_tokens": 915160.0,
      "step": 2
    },
    {
      "epoch": 0.02952029520295203,
      "grad_norm": 3.8325814879608386,
      "learning_rate": 1.25e-06,
      "loss": 1.254,
      "num_tokens": 1365315.0,
      "step": 3
    },
    {
      "epoch": 0.03936039360393604,
      "grad_norm": 3.582565733113683,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.1869,
      "num_tokens": 1841763.0,
      "step": 4
    },
    {
      "epoch": 0.04920049200492005,
      "grad_norm": 3.5604969753172315,
      "learning_rate": 2.5e-06,
      "loss": 1.2394,
      "num_tokens": 2301606.0,
      "step": 5
    },
    {
      "epoch": 0.05904059040590406,
      "grad_norm": 3.105374395878177,
      "learning_rate": 3.125e-06,
      "loss": 1.2366,
      "num_tokens": 2755825.0,
      "step": 6
    },
    {
      "epoch": 0.06888068880688807,
      "grad_norm": 2.316426838717515,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.1101,
      "num_tokens": 3196409.0,
      "step": 7
    },
    {
      "epoch": 0.07872078720787208,
      "grad_norm": 2.281060366927676,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.1078,
      "num_tokens": 3622733.0,
      "step": 8
    },
    {
      "epoch": 0.08856088560885608,
      "grad_norm": 1.934577354985982,
      "learning_rate": 5e-06,
      "loss": 0.9014,
      "num_tokens": 4055914.0,
      "step": 9
    },
    {
      "epoch": 0.0984009840098401,
      "grad_norm": 1.9176079459138344,
      "learning_rate": 5.625e-06,
      "loss": 0.8745,
      "num_tokens": 4485159.0,
      "step": 10
    },
    {
      "epoch": 0.10824108241082411,
      "grad_norm": 1.786754010375736,
      "learning_rate": 6.25e-06,
      "loss": 0.7922,
      "num_tokens": 4933514.0,
      "step": 11
    },
    {
      "epoch": 0.11808118081180811,
      "grad_norm": 1.9933574737759214,
      "learning_rate": 6.875e-06,
      "loss": 0.4878,
      "num_tokens": 5383658.0,
      "step": 12
    },
    {
      "epoch": 0.12792127921279212,
      "grad_norm": 2.123289554302906,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.428,
      "num_tokens": 5839838.0,
      "step": 13
    },
    {
      "epoch": 0.13776137761377613,
      "grad_norm": 1.7562448014521572,
      "learning_rate": 8.125000000000001e-06,
      "loss": 0.3337,
      "num_tokens": 6286175.0,
      "step": 14
    },
    {
      "epoch": 0.14760147601476015,
      "grad_norm": 1.4384357290512548,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.2497,
      "num_tokens": 6725821.0,
      "step": 15
    },
    {
      "epoch": 0.15744157441574416,
      "grad_norm": 0.8232923354453182,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.1317,
      "num_tokens": 7169854.0,
      "step": 16
    },
    {
      "epoch": 0.16728167281672818,
      "grad_norm": 0.5262014955492348,
      "learning_rate": 1e-05,
      "loss": 0.1158,
      "num_tokens": 7602261.0,
      "step": 17
    },
    {
      "epoch": 0.17712177121771217,
      "grad_norm": 0.47218735378476806,
      "learning_rate": 9.999909003036192e-06,
      "loss": 0.098,
      "num_tokens": 8040457.0,
      "step": 18
    },
    {
      "epoch": 0.18696186961869618,
      "grad_norm": 0.22950756131023575,
      "learning_rate": 9.99963601582496e-06,
      "loss": 0.0827,
      "num_tokens": 8484210.0,
      "step": 19
    },
    {
      "epoch": 0.1968019680196802,
      "grad_norm": 0.24069667769460337,
      "learning_rate": 9.999181049406756e-06,
      "loss": 0.0733,
      "num_tokens": 8913622.0,
      "step": 20
    },
    {
      "epoch": 0.2066420664206642,
      "grad_norm": 0.19079800364724872,
      "learning_rate": 9.998544122181829e-06,
      "loss": 0.0851,
      "num_tokens": 9379389.0,
      "step": 21
    },
    {
      "epoch": 0.21648216482164823,
      "grad_norm": 0.17418458793254618,
      "learning_rate": 9.997725259909487e-06,
      "loss": 0.0734,
      "num_tokens": 9803100.0,
      "step": 22
    },
    {
      "epoch": 0.22632226322263221,
      "grad_norm": 0.19184343474298712,
      "learning_rate": 9.996724495707056e-06,
      "loss": 0.083,
      "num_tokens": 10247767.0,
      "step": 23
    },
    {
      "epoch": 0.23616236162361623,
      "grad_norm": 0.1475443750251538,
      "learning_rate": 9.995541870048537e-06,
      "loss": 0.0738,
      "num_tokens": 10691516.0,
      "step": 24
    },
    {
      "epoch": 0.24600246002460024,
      "grad_norm": 0.14419500643952865,
      "learning_rate": 9.994177430762971e-06,
      "loss": 0.0646,
      "num_tokens": 11149524.0,
      "step": 25
    },
    {
      "epoch": 0.25584255842558423,
      "grad_norm": 1.2190794556868674,
      "learning_rate": 9.992631233032507e-06,
      "loss": 0.0721,
      "num_tokens": 11589958.0,
      "step": 26
    },
    {
      "epoch": 0.2656826568265683,
      "grad_norm": 0.49893871677873436,
      "learning_rate": 9.990903339390164e-06,
      "loss": 0.0691,
      "num_tokens": 12050102.0,
      "step": 27
    },
    {
      "epoch": 0.27552275522755226,
      "grad_norm": 0.4422022706582718,
      "learning_rate": 9.988993819717312e-06,
      "loss": 0.0605,
      "num_tokens": 12508433.0,
      "step": 28
    },
    {
      "epoch": 0.2853628536285363,
      "grad_norm": 0.13441092661822238,
      "learning_rate": 9.986902751240836e-06,
      "loss": 0.0692,
      "num_tokens": 12939960.0,
      "step": 29
    },
    {
      "epoch": 0.2952029520295203,
      "grad_norm": 0.11858771432621444,
      "learning_rate": 9.984630218530014e-06,
      "loss": 0.0492,
      "num_tokens": 13387850.0,
      "step": 30
    },
    {
      "epoch": 0.3050430504305043,
      "grad_norm": 0.12696361103470127,
      "learning_rate": 9.982176313493108e-06,
      "loss": 0.0624,
      "num_tokens": 13866894.0,
      "step": 31
    },
    {
      "epoch": 0.3148831488314883,
      "grad_norm": 0.14678245947256616,
      "learning_rate": 9.979541135373628e-06,
      "loss": 0.0483,
      "num_tokens": 14314553.0,
      "step": 32
    },
    {
      "epoch": 0.3247232472324723,
      "grad_norm": 0.12403518011628543,
      "learning_rate": 9.976724790746333e-06,
      "loss": 0.0488,
      "num_tokens": 14747330.0,
      "step": 33
    },
    {
      "epoch": 0.33456334563345635,
      "grad_norm": 0.10283847292091904,
      "learning_rate": 9.973727393512921e-06,
      "loss": 0.0582,
      "num_tokens": 15215873.0,
      "step": 34
    },
    {
      "epoch": 0.34440344403444034,
      "grad_norm": 0.09860235260078455,
      "learning_rate": 9.970549064897407e-06,
      "loss": 0.0446,
      "num_tokens": 15653849.0,
      "step": 35
    },
    {
      "epoch": 0.35424354243542433,
      "grad_norm": 0.10274919661024226,
      "learning_rate": 9.967189933441243e-06,
      "loss": 0.0439,
      "num_tokens": 16112913.0,
      "step": 36
    },
    {
      "epoch": 0.3640836408364084,
      "grad_norm": 0.0918843632134462,
      "learning_rate": 9.9636501349981e-06,
      "loss": 0.0585,
      "num_tokens": 16570588.0,
      "step": 37
    },
    {
      "epoch": 0.37392373923739236,
      "grad_norm": 0.08618611894284056,
      "learning_rate": 9.95992981272838e-06,
      "loss": 0.0477,
      "num_tokens": 17028395.0,
      "step": 38
    },
    {
      "epoch": 0.3837638376383764,
      "grad_norm": 0.0915069403325355,
      "learning_rate": 9.956029117093432e-06,
      "loss": 0.045,
      "num_tokens": 17477681.0,
      "step": 39
    },
    {
      "epoch": 0.3936039360393604,
      "grad_norm": 0.09093140650787605,
      "learning_rate": 9.951948205849457e-06,
      "loss": 0.0444,
      "num_tokens": 17940049.0,
      "step": 40
    },
    {
      "epoch": 0.4034440344403444,
      "grad_norm": 0.08271507354884283,
      "learning_rate": 9.947687244041143e-06,
      "loss": 0.0401,
      "num_tokens": 18360868.0,
      "step": 41
    },
    {
      "epoch": 0.4132841328413284,
      "grad_norm": 0.08588968137159211,
      "learning_rate": 9.943246403994969e-06,
      "loss": 0.0358,
      "num_tokens": 18811281.0,
      "step": 42
    },
    {
      "epoch": 0.4231242312423124,
      "grad_norm": 0.08965565515357603,
      "learning_rate": 9.938625865312252e-06,
      "loss": 0.044,
      "num_tokens": 19236998.0,
      "step": 43
    },
    {
      "epoch": 0.43296432964329645,
      "grad_norm": 0.09636661290222473,
      "learning_rate": 9.933825814861877e-06,
      "loss": 0.0431,
      "num_tokens": 19689363.0,
      "step": 44
    },
    {
      "epoch": 0.44280442804428044,
      "grad_norm": 0.08912873391359938,
      "learning_rate": 9.928846446772737e-06,
      "loss": 0.0377,
      "num_tokens": 20129602.0,
      "step": 45
    },
    {
      "epoch": 0.45264452644526443,
      "grad_norm": 0.09271503002492597,
      "learning_rate": 9.923687962425895e-06,
      "loss": 0.0365,
      "num_tokens": 20566055.0,
      "step": 46
    },
    {
      "epoch": 0.46248462484624847,
      "grad_norm": 0.08617267288782972,
      "learning_rate": 9.91835057044642e-06,
      "loss": 0.0582,
      "num_tokens": 21035837.0,
      "step": 47
    },
    {
      "epoch": 0.47232472324723246,
      "grad_norm": 0.07942181409157618,
      "learning_rate": 9.912834486694963e-06,
      "loss": 0.0341,
      "num_tokens": 21490681.0,
      "step": 48
    },
    {
      "epoch": 0.4821648216482165,
      "grad_norm": 0.08409285833406879,
      "learning_rate": 9.907139934259025e-06,
      "loss": 0.0464,
      "num_tokens": 21949736.0,
      "step": 49
    },
    {
      "epoch": 0.4920049200492005,
      "grad_norm": 0.08981746101624732,
      "learning_rate": 9.90126714344393e-06,
      "loss": 0.0479,
      "num_tokens": 22408345.0,
      "step": 50
    },
    {
      "epoch": 0.5018450184501845,
      "grad_norm": 0.08557538109120558,
      "learning_rate": 9.895216351763515e-06,
      "loss": 0.04,
      "num_tokens": 22869507.0,
      "step": 51
    },
    {
      "epoch": 0.5116851168511685,
      "grad_norm": 0.08873060518107122,
      "learning_rate": 9.888987803930523e-06,
      "loss": 0.0359,
      "num_tokens": 23337492.0,
      "step": 52
    },
    {
      "epoch": 0.5215252152521526,
      "grad_norm": 0.08508195964995854,
      "learning_rate": 9.882581751846707e-06,
      "loss": 0.0338,
      "num_tokens": 23788038.0,
      "step": 53
    },
    {
      "epoch": 0.5313653136531366,
      "grad_norm": 0.076418318161816,
      "learning_rate": 9.87599845459264e-06,
      "loss": 0.0344,
      "num_tokens": 24233994.0,
      "step": 54
    },
    {
      "epoch": 0.5412054120541205,
      "grad_norm": 0.2889818789713905,
      "learning_rate": 9.869238178417235e-06,
      "loss": 0.2599,
      "num_tokens": 24697351.0,
      "step": 55
    },
    {
      "epoch": 0.5510455104551045,
      "grad_norm": 0.08884780995830746,
      "learning_rate": 9.862301196726988e-06,
      "loss": 0.0465,
      "num_tokens": 25183095.0,
      "step": 56
    },
    {
      "epoch": 0.5608856088560885,
      "grad_norm": 0.07990815808329678,
      "learning_rate": 9.855187790074906e-06,
      "loss": 0.0353,
      "num_tokens": 25651971.0,
      "step": 57
    },
    {
      "epoch": 0.5707257072570726,
      "grad_norm": 0.06894407429892842,
      "learning_rate": 9.847898246149173e-06,
      "loss": 0.0316,
      "num_tokens": 26129683.0,
      "step": 58
    },
    {
      "epoch": 0.5805658056580566,
      "grad_norm": 0.08216971413705307,
      "learning_rate": 9.840432859761504e-06,
      "loss": 0.0306,
      "num_tokens": 26548348.0,
      "step": 59
    },
    {
      "epoch": 0.5904059040590406,
      "grad_norm": 0.079031679127037,
      "learning_rate": 9.832791932835232e-06,
      "loss": 0.0362,
      "num_tokens": 26977631.0,
      "step": 60
    },
    {
      "epoch": 0.6002460024600246,
      "grad_norm": 0.07450412090133855,
      "learning_rate": 9.824975774393089e-06,
      "loss": 0.0276,
      "num_tokens": 27421323.0,
      "step": 61
    },
    {
      "epoch": 0.6100861008610086,
      "grad_norm": 0.08014735253624648,
      "learning_rate": 9.816984700544714e-06,
      "loss": 0.0286,
      "num_tokens": 27882356.0,
      "step": 62
    },
    {
      "epoch": 0.6199261992619927,
      "grad_norm": 0.08455294660438158,
      "learning_rate": 9.808819034473869e-06,
      "loss": 0.0407,
      "num_tokens": 28343854.0,
      "step": 63
    },
    {
      "epoch": 0.6297662976629766,
      "grad_norm": 0.08019778537515825,
      "learning_rate": 9.800479106425356e-06,
      "loss": 0.0299,
      "num_tokens": 28790695.0,
      "step": 64
    },
    {
      "epoch": 0.6396063960639606,
      "grad_norm": 0.08340888167507048,
      "learning_rate": 9.791965253691687e-06,
      "loss": 0.0353,
      "num_tokens": 29220825.0,
      "step": 65
    },
    {
      "epoch": 0.6494464944649446,
      "grad_norm": 0.08252486402936965,
      "learning_rate": 9.783277820599408e-06,
      "loss": 0.0367,
      "num_tokens": 29686358.0,
      "step": 66
    },
    {
      "epoch": 0.6592865928659286,
      "grad_norm": 0.08632773276059842,
      "learning_rate": 9.774417158495208e-06,
      "loss": 0.0331,
      "num_tokens": 30120521.0,
      "step": 67
    },
    {
      "epoch": 0.6691266912669127,
      "grad_norm": 0.082343171890358,
      "learning_rate": 9.765383625731683e-06,
      "loss": 0.0329,
      "num_tokens": 30573947.0,
      "step": 68
    },
    {
      "epoch": 0.6789667896678967,
      "grad_norm": 0.08874468637210653,
      "learning_rate": 9.756177587652857e-06,
      "loss": 0.0329,
      "num_tokens": 30999244.0,
      "step": 69
    },
    {
      "epoch": 0.6888068880688807,
      "grad_norm": 0.07673402020991506,
      "learning_rate": 9.746799416579403e-06,
      "loss": 0.0306,
      "num_tokens": 31468786.0,
      "step": 70
    },
    {
      "epoch": 0.6986469864698647,
      "grad_norm": 0.09204922624438575,
      "learning_rate": 9.737249491793587e-06,
      "loss": 0.0273,
      "num_tokens": 31905019.0,
      "step": 71
    },
    {
      "epoch": 0.7084870848708487,
      "grad_norm": 0.08145687118724444,
      "learning_rate": 9.727528199523923e-06,
      "loss": 0.029,
      "num_tokens": 32340154.0,
      "step": 72
    },
    {
      "epoch": 0.7183271832718328,
      "grad_norm": 0.09506872052374568,
      "learning_rate": 9.717635932929556e-06,
      "loss": 0.0373,
      "num_tokens": 32789598.0,
      "step": 73
    },
    {
      "epoch": 0.7281672816728167,
      "grad_norm": 0.08326889230017241,
      "learning_rate": 9.707573092084368e-06,
      "loss": 0.0286,
      "num_tokens": 33239225.0,
      "step": 74
    },
    {
      "epoch": 0.7380073800738007,
      "grad_norm": 0.07636964575035168,
      "learning_rate": 9.697340083960785e-06,
      "loss": 0.0291,
      "num_tokens": 33718797.0,
      "step": 75
    },
    {
      "epoch": 0.7478474784747847,
      "grad_norm": 0.09488168094776525,
      "learning_rate": 9.686937322413325e-06,
      "loss": 0.0328,
      "num_tokens": 34155674.0,
      "step": 76
    },
    {
      "epoch": 0.7576875768757687,
      "grad_norm": 0.0778086138359463,
      "learning_rate": 9.676365228161869e-06,
      "loss": 0.0252,
      "num_tokens": 34584921.0,
      "step": 77
    },
    {
      "epoch": 0.7675276752767528,
      "grad_norm": 0.08557737550120906,
      "learning_rate": 9.66562422877462e-06,
      "loss": 0.0338,
      "num_tokens": 35049146.0,
      "step": 78
    },
    {
      "epoch": 0.7773677736777368,
      "grad_norm": 0.09181023650151289,
      "learning_rate": 9.654714758650844e-06,
      "loss": 0.0299,
      "num_tokens": 35519987.0,
      "step": 79
    },
    {
      "epoch": 0.7872078720787208,
      "grad_norm": 0.07639914292637208,
      "learning_rate": 9.643637259003276e-06,
      "loss": 0.0242,
      "num_tokens": 35959127.0,
      "step": 80
    },
    {
      "epoch": 0.7970479704797048,
      "grad_norm": 0.08200922089613671,
      "learning_rate": 9.632392177840286e-06,
      "loss": 0.0317,
      "num_tokens": 36416651.0,
      "step": 81
    },
    {
      "epoch": 0.8068880688806888,
      "grad_norm": 0.07954028434263948,
      "learning_rate": 9.620979969947759e-06,
      "loss": 0.0293,
      "num_tokens": 36864154.0,
      "step": 82
    },
    {
      "epoch": 0.8167281672816729,
      "grad_norm": 0.07878375949867687,
      "learning_rate": 9.609401096870707e-06,
      "loss": 0.0237,
      "num_tokens": 37310281.0,
      "step": 83
    },
    {
      "epoch": 0.8265682656826568,
      "grad_norm": 0.07728168843840597,
      "learning_rate": 9.597656026894591e-06,
      "loss": 0.0322,
      "num_tokens": 37746606.0,
      "step": 84
    },
    {
      "epoch": 0.8364083640836408,
      "grad_norm": 0.07855221188672869,
      "learning_rate": 9.585745235026391e-06,
      "loss": 0.0258,
      "num_tokens": 38189615.0,
      "step": 85
    },
    {
      "epoch": 0.8462484624846248,
      "grad_norm": 0.07691630967258262,
      "learning_rate": 9.5736692029754e-06,
      "loss": 0.0293,
      "num_tokens": 38637318.0,
      "step": 86
    },
    {
      "epoch": 0.8560885608856088,
      "grad_norm": 0.07209047793755496,
      "learning_rate": 9.561428419133723e-06,
      "loss": 0.0235,
      "num_tokens": 39102853.0,
      "step": 87
    },
    {
      "epoch": 0.8659286592865929,
      "grad_norm": 0.0802072339239599,
      "learning_rate": 9.549023378556548e-06,
      "loss": 0.0311,
      "num_tokens": 39538535.0,
      "step": 88
    },
    {
      "epoch": 0.8757687576875769,
      "grad_norm": 0.09334524313401625,
      "learning_rate": 9.53645458294211e-06,
      "loss": 0.0484,
      "num_tokens": 40020296.0,
      "step": 89
    },
    {
      "epoch": 0.8856088560885609,
      "grad_norm": 2.3961901610996605,
      "learning_rate": 9.523722540611403e-06,
      "loss": 0.3276,
      "num_tokens": 40506093.0,
      "step": 90
    },
    {
      "epoch": 0.8954489544895449,
      "grad_norm": 0.09376957957757263,
      "learning_rate": 9.510827766487625e-06,
      "loss": 0.0288,
      "num_tokens": 40937880.0,
      "step": 91
    },
    {
      "epoch": 0.9052890528905289,
      "grad_norm": 0.08607984794603309,
      "learning_rate": 9.497770782075353e-06,
      "loss": 0.0247,
      "num_tokens": 41374337.0,
      "step": 92
    },
    {
      "epoch": 0.915129151291513,
      "grad_norm": 0.07253858203781333,
      "learning_rate": 9.484552115439445e-06,
      "loss": 0.0293,
      "num_tokens": 41811558.0,
      "step": 93
    },
    {
      "epoch": 0.9249692496924969,
      "grad_norm": 0.07768364358007782,
      "learning_rate": 9.471172301183695e-06,
      "loss": 0.0257,
      "num_tokens": 42259726.0,
      "step": 94
    },
    {
      "epoch": 0.9348093480934809,
      "grad_norm": 0.0769153663260077,
      "learning_rate": 9.4576318804292e-06,
      "loss": 0.0232,
      "num_tokens": 42684319.0,
      "step": 95
    },
    {
      "epoch": 0.9446494464944649,
      "grad_norm": 0.08163342042509363,
      "learning_rate": 9.443931400792486e-06,
      "loss": 0.0256,
      "num_tokens": 43113589.0,
      "step": 96
    },
    {
      "epoch": 0.9544895448954489,
      "grad_norm": 0.06536764982172343,
      "learning_rate": 9.430071416363352e-06,
      "loss": 0.0218,
      "num_tokens": 43575488.0,
      "step": 97
    },
    {
      "epoch": 0.964329643296433,
      "grad_norm": 0.08195099679978833,
      "learning_rate": 9.416052487682465e-06,
      "loss": 0.0254,
      "num_tokens": 44016216.0,
      "step": 98
    },
    {
      "epoch": 0.974169741697417,
      "grad_norm": 0.1266005657397246,
      "learning_rate": 9.401875181718686e-06,
      "loss": 0.0454,
      "num_tokens": 44497742.0,
      "step": 99
    },
    {
      "epoch": 0.984009840098401,
      "grad_norm": 0.07988798247506342,
      "learning_rate": 9.387540071846155e-06,
      "loss": 0.024,
      "num_tokens": 44935936.0,
      "step": 100
    },
    {
      "epoch": 0.993849938499385,
      "grad_norm": 0.07277763654694067,
      "learning_rate": 9.373047737821078e-06,
      "loss": 0.0216,
      "num_tokens": 45381042.0,
      "step": 101
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.07277763654694067,
      "learning_rate": 9.358398765758296e-06,
      "loss": 0.0229,
      "num_tokens": 45593876.0,
      "step": 102
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.07811997085809708,
      "eval_num_tokens": 45593876.0,
      "eval_runtime": 54.709,
      "eval_samples_per_second": 41.054,
      "eval_steps_per_second": 5.136,
      "step": 102
    },
    {
      "epoch": 1.009840098400984,
      "grad_norm": 0.10795878798324991,
      "learning_rate": 9.34359374810758e-06,
      "loss": 0.0201,
      "num_tokens": 46020335.0,
      "step": 103
    },
    {
      "epoch": 1.019680196801968,
      "grad_norm": 0.07593949135329942,
      "learning_rate": 9.328633283629666e-06,
      "loss": 0.0222,
      "num_tokens": 46466853.0,
      "step": 104
    },
    {
      "epoch": 1.029520295202952,
      "grad_norm": 0.07596980345063492,
      "learning_rate": 9.31351797737204e-06,
      "loss": 0.0253,
      "num_tokens": 46900993.0,
      "step": 105
    },
    {
      "epoch": 1.039360393603936,
      "grad_norm": 0.08317964089954727,
      "learning_rate": 9.29824844064447e-06,
      "loss": 0.0206,
      "num_tokens": 47334869.0,
      "step": 106
    },
    {
      "epoch": 1.04920049200492,
      "grad_norm": 0.0805362815127939,
      "learning_rate": 9.282825290994282e-06,
      "loss": 0.0213,
      "num_tokens": 47797630.0,
      "step": 107
    },
    {
      "epoch": 1.0590405904059041,
      "grad_norm": 0.07839099238240128,
      "learning_rate": 9.267249152181379e-06,
      "loss": 0.0454,
      "num_tokens": 48281974.0,
      "step": 108
    },
    {
      "epoch": 1.068880688806888,
      "grad_norm": 0.0757738535866923,
      "learning_rate": 9.251520654153028e-06,
      "loss": 0.022,
      "num_tokens": 48730118.0,
      "step": 109
    },
    {
      "epoch": 1.0787207872078721,
      "grad_norm": 0.08256710571520359,
      "learning_rate": 9.235640433018363e-06,
      "loss": 0.0195,
      "num_tokens": 49197576.0,
      "step": 110
    },
    {
      "epoch": 1.088560885608856,
      "grad_norm": 0.07849933177459094,
      "learning_rate": 9.219609131022684e-06,
      "loss": 0.0203,
      "num_tokens": 49673054.0,
      "step": 111
    },
    {
      "epoch": 1.09840098400984,
      "grad_norm": 0.08067924302373455,
      "learning_rate": 9.203427396521454e-06,
      "loss": 0.0219,
      "num_tokens": 50130569.0,
      "step": 112
    },
    {
      "epoch": 1.1082410824108242,
      "grad_norm": 0.07527801624664898,
      "learning_rate": 9.187095883954104e-06,
      "loss": 0.0195,
      "num_tokens": 50574721.0,
      "step": 113
    },
    {
      "epoch": 1.118081180811808,
      "grad_norm": 0.08229755724299215,
      "learning_rate": 9.170615253817547e-06,
      "loss": 0.0193,
      "num_tokens": 51010865.0,
      "step": 114
    },
    {
      "epoch": 1.1279212792127922,
      "grad_norm": 0.07673721236222701,
      "learning_rate": 9.153986172639474e-06,
      "loss": 0.0211,
      "num_tokens": 51469765.0,
      "step": 115
    },
    {
      "epoch": 1.137761377613776,
      "grad_norm": 0.0845900192373935,
      "learning_rate": 9.137209312951395e-06,
      "loss": 0.0226,
      "num_tokens": 51906114.0,
      "step": 116
    },
    {
      "epoch": 1.1476014760147601,
      "grad_norm": 0.08215860044207468,
      "learning_rate": 9.12028535326144e-06,
      "loss": 0.022,
      "num_tokens": 52354068.0,
      "step": 117
    },
    {
      "epoch": 1.1574415744157442,
      "grad_norm": 0.07420368746928867,
      "learning_rate": 9.103214978026922e-06,
      "loss": 0.0188,
      "num_tokens": 52836346.0,
      "step": 118
    },
    {
      "epoch": 1.1672816728167281,
      "grad_norm": 0.07450541307438634,
      "learning_rate": 9.085998877626644e-06,
      "loss": 0.0192,
      "num_tokens": 53299172.0,
      "step": 119
    },
    {
      "epoch": 1.1771217712177122,
      "grad_norm": 0.07878886229739003,
      "learning_rate": 9.068637748332993e-06,
      "loss": 0.0215,
      "num_tokens": 53759861.0,
      "step": 120
    },
    {
      "epoch": 1.186961869618696,
      "grad_norm": 0.08311056334441597,
      "learning_rate": 9.051132292283772e-06,
      "loss": 0.0208,
      "num_tokens": 54228512.0,
      "step": 121
    },
    {
      "epoch": 1.1968019680196802,
      "grad_norm": 0.07068781735081182,
      "learning_rate": 9.033483217453801e-06,
      "loss": 0.0812,
      "num_tokens": 54692852.0,
      "step": 122
    },
    {
      "epoch": 1.2066420664206643,
      "grad_norm": 0.596254901083269,
      "learning_rate": 9.015691237626292e-06,
      "loss": 0.0199,
      "num_tokens": 55139782.0,
      "step": 123
    },
    {
      "epoch": 1.2164821648216482,
      "grad_norm": 0.08202279255895727,
      "learning_rate": 8.997757072363976e-06,
      "loss": 0.0342,
      "num_tokens": 55604658.0,
      "step": 124
    },
    {
      "epoch": 1.2263222632226323,
      "grad_norm": 0.09057478290667956,
      "learning_rate": 8.979681446980002e-06,
      "loss": 0.0227,
      "num_tokens": 56030690.0,
      "step": 125
    },
    {
      "epoch": 1.2361623616236161,
      "grad_norm": 0.07661103115531635,
      "learning_rate": 8.961465092508607e-06,
      "loss": 0.0339,
      "num_tokens": 56492821.0,
      "step": 126
    },
    {
      "epoch": 1.2460024600246002,
      "grad_norm": 0.08310739437969392,
      "learning_rate": 8.943108745675542e-06,
      "loss": 0.0249,
      "num_tokens": 56927699.0,
      "step": 127
    },
    {
      "epoch": 1.2558425584255843,
      "grad_norm": 0.08009221352147507,
      "learning_rate": 8.92461314886829e-06,
      "loss": 0.0209,
      "num_tokens": 57365827.0,
      "step": 128
    },
    {
      "epoch": 1.2656826568265682,
      "grad_norm": 0.07973094836265254,
      "learning_rate": 8.905979050106029e-06,
      "loss": 0.0251,
      "num_tokens": 57821453.0,
      "step": 129
    },
    {
      "epoch": 1.2755227552275523,
      "grad_norm": 0.07600070319773061,
      "learning_rate": 8.887207203009385e-06,
      "loss": 0.0213,
      "num_tokens": 58267867.0,
      "step": 130
    },
    {
      "epoch": 1.2853628536285364,
      "grad_norm": 0.07258837000806613,
      "learning_rate": 8.868298366769956e-06,
      "loss": 0.0198,
      "num_tokens": 58715078.0,
      "step": 131
    },
    {
      "epoch": 1.2952029520295203,
      "grad_norm": 0.07826062337656157,
      "learning_rate": 8.849253306119601e-06,
      "loss": 0.0199,
      "num_tokens": 59159310.0,
      "step": 132
    },
    {
      "epoch": 1.3050430504305042,
      "grad_norm": 0.07770042002025847,
      "learning_rate": 8.83007279129952e-06,
      "loss": 0.027,
      "num_tokens": 59594031.0,
      "step": 133
    },
    {
      "epoch": 1.3148831488314883,
      "grad_norm": 0.07607344407726713,
      "learning_rate": 8.810757598029094e-06,
      "loss": 0.0342,
      "num_tokens": 60038506.0,
      "step": 134
    },
    {
      "epoch": 1.3247232472324724,
      "grad_norm": 0.08771686774228402,
      "learning_rate": 8.79130850747452e-06,
      "loss": 0.0234,
      "num_tokens": 60492486.0,
      "step": 135
    },
    {
      "epoch": 1.3345633456334562,
      "grad_norm": 0.07482147000786651,
      "learning_rate": 8.771726306217217e-06,
      "loss": 0.0196,
      "num_tokens": 60925341.0,
      "step": 136
    },
    {
      "epoch": 1.3444034440344403,
      "grad_norm": 0.07171750614547971,
      "learning_rate": 8.752011786222011e-06,
      "loss": 0.0224,
      "num_tokens": 61401128.0,
      "step": 137
    },
    {
      "epoch": 1.3542435424354244,
      "grad_norm": 0.07289189868770962,
      "learning_rate": 8.732165744805107e-06,
      "loss": 0.0198,
      "num_tokens": 61845691.0,
      "step": 138
    },
    {
      "epoch": 1.3640836408364083,
      "grad_norm": 0.07907747558023923,
      "learning_rate": 8.712188984601845e-06,
      "loss": 0.0185,
      "num_tokens": 62286361.0,
      "step": 139
    },
    {
      "epoch": 1.3739237392373924,
      "grad_norm": 0.06910414114179665,
      "learning_rate": 8.692082313534233e-06,
      "loss": 0.0179,
      "num_tokens": 62727406.0,
      "step": 140
    },
    {
      "epoch": 1.3837638376383765,
      "grad_norm": 0.07791959325829377,
      "learning_rate": 8.671846544778284e-06,
      "loss": 0.0204,
      "num_tokens": 63182141.0,
      "step": 141
    },
    {
      "epoch": 1.3936039360393604,
      "grad_norm": 0.0741558195977179,
      "learning_rate": 8.651482496731116e-06,
      "loss": 0.0178,
      "num_tokens": 63600729.0,
      "step": 142
    },
    {
      "epoch": 1.4034440344403443,
      "grad_norm": 0.07283375136096223,
      "learning_rate": 8.630990992977854e-06,
      "loss": 0.0198,
      "num_tokens": 64066267.0,
      "step": 143
    },
    {
      "epoch": 1.4132841328413284,
      "grad_norm": 0.0731783816547012,
      "learning_rate": 8.61037286225834e-06,
      "loss": 0.2547,
      "num_tokens": 64515946.0,
      "step": 144
    },
    {
      "epoch": 1.4231242312423125,
      "grad_norm": 1.0212050791856901,
      "learning_rate": 8.589628938433587e-06,
      "loss": 0.0192,
      "num_tokens": 64949958.0,
      "step": 145
    },
    {
      "epoch": 1.4329643296432963,
      "grad_norm": 0.09844320658741419,
      "learning_rate": 8.56876006045208e-06,
      "loss": 0.0176,
      "num_tokens": 65381018.0,
      "step": 146
    },
    {
      "epoch": 1.4428044280442804,
      "grad_norm": 0.07030907656382593,
      "learning_rate": 8.547767072315835e-06,
      "loss": 0.0241,
      "num_tokens": 65814016.0,
      "step": 147
    },
    {
      "epoch": 1.4526445264452645,
      "grad_norm": 0.0779412275694533,
      "learning_rate": 8.526650823046266e-06,
      "loss": 0.0265,
      "num_tokens": 66252980.0,
      "step": 148
    },
    {
      "epoch": 1.4624846248462484,
      "grad_norm": 0.09570533939331194,
      "learning_rate": 8.505412166649847e-06,
      "loss": 0.0199,
      "num_tokens": 66718111.0,
      "step": 149
    },
    {
      "epoch": 1.4723247232472325,
      "grad_norm": 0.07915246167438994,
      "learning_rate": 8.484051962083579e-06,
      "loss": 0.0204,
      "num_tokens": 67163762.0,
      "step": 150
    },
    {
      "epoch": 1.4821648216482166,
      "grad_norm": 0.07935176799416567,
      "learning_rate": 8.462571073220243e-06,
      "loss": 0.0225,
      "num_tokens": 67624386.0,
      "step": 151
    },
    {
      "epoch": 1.4920049200492005,
      "grad_norm": 0.07841589822630919,
      "learning_rate": 8.44097036881347e-06,
      "loss": 0.0392,
      "num_tokens": 68065290.0,
      "step": 152
    },
    {
      "epoch": 1.5018450184501844,
      "grad_norm": 0.3517146293571387,
      "learning_rate": 8.419250722462603e-06,
      "loss": 0.0178,
      "num_tokens": 68519107.0,
      "step": 153
    },
    {
      "epoch": 1.5116851168511685,
      "grad_norm": 0.0764909788834621,
      "learning_rate": 8.39741301257736e-06,
      "loss": 0.0194,
      "num_tokens": 68971128.0,
      "step": 154
    },
    {
      "epoch": 1.5215252152521526,
      "grad_norm": 0.08078822036852527,
      "learning_rate": 8.375458122342317e-06,
      "loss": 0.0206,
      "num_tokens": 69403792.0,
      "step": 155
    },
    {
      "epoch": 1.5313653136531364,
      "grad_norm": 0.08235320219175549,
      "learning_rate": 8.353386939681186e-06,
      "loss": 0.0175,
      "num_tokens": 69836602.0,
      "step": 156
    },
    {
      "epoch": 1.5412054120541205,
      "grad_norm": 0.0735540837139594,
      "learning_rate": 8.331200357220908e-06,
      "loss": 0.0194,
      "num_tokens": 70283814.0,
      "step": 157
    },
    {
      "epoch": 1.5510455104551046,
      "grad_norm": 0.07322399084658018,
      "learning_rate": 8.308899272255542e-06,
      "loss": 0.0184,
      "num_tokens": 70726284.0,
      "step": 158
    },
    {
      "epoch": 1.5608856088560885,
      "grad_norm": 0.07790348390650517,
      "learning_rate": 8.286484586709989e-06,
      "loss": 0.0183,
      "num_tokens": 71155169.0,
      "step": 159
    },
    {
      "epoch": 1.5707257072570726,
      "grad_norm": 0.08611809383964489,
      "learning_rate": 8.263957207103506e-06,
      "loss": 0.0205,
      "num_tokens": 71591204.0,
      "step": 160
    },
    {
      "epoch": 1.5805658056580567,
      "grad_norm": 0.0706229845173915,
      "learning_rate": 8.241318044513046e-06,
      "loss": 0.0277,
      "num_tokens": 72032119.0,
      "step": 161
    },
    {
      "epoch": 1.5904059040590406,
      "grad_norm": 0.09019039164269532,
      "learning_rate": 8.218568014536414e-06,
      "loss": 0.0176,
      "num_tokens": 72492164.0,
      "step": 162
    },
    {
      "epoch": 1.6002460024600245,
      "grad_norm": 0.07947315916491103,
      "learning_rate": 8.195708037255233e-06,
      "loss": 0.0202,
      "num_tokens": 72962752.0,
      "step": 163
    },
    {
      "epoch": 1.6100861008610086,
      "grad_norm": 0.06840189166732885,
      "learning_rate": 8.172739037197739e-06,
      "loss": 0.018,
      "num_tokens": 73415974.0,
      "step": 164
    },
    {
      "epoch": 1.6199261992619927,
      "grad_norm": 0.07366616747573093,
      "learning_rate": 8.149661943301382e-06,
      "loss": 0.0181,
      "num_tokens": 73882834.0,
      "step": 165
    },
    {
      "epoch": 1.6297662976629765,
      "grad_norm": 0.07081012920317416,
      "learning_rate": 8.126477688875262e-06,
      "loss": 0.0204,
      "num_tokens": 74321580.0,
      "step": 166
    },
    {
      "epoch": 1.6396063960639606,
      "grad_norm": 0.07863097311534642,
      "learning_rate": 8.103187211562386e-06,
      "loss": 0.0229,
      "num_tokens": 74781751.0,
      "step": 167
    },
    {
      "epoch": 1.6494464944649447,
      "grad_norm": 0.10797044478776457,
      "learning_rate": 8.079791453301742e-06,
      "loss": 0.0287,
      "num_tokens": 75219935.0,
      "step": 168
    },
    {
      "epoch": 1.6592865928659286,
      "grad_norm": 0.07041534985061697,
      "learning_rate": 8.056291360290202e-06,
      "loss": 0.0248,
      "num_tokens": 75665232.0,
      "step": 169
    },
    {
      "epoch": 1.6691266912669127,
      "grad_norm": 0.08695303118518641,
      "learning_rate": 8.032687882944264e-06,
      "loss": 0.0193,
      "num_tokens": 76087411.0,
      "step": 170
    },
    {
      "epoch": 1.6789667896678968,
      "grad_norm": 0.06704813880798238,
      "learning_rate": 8.0089819758616e-06,
      "loss": 0.0169,
      "num_tokens": 76529931.0,
      "step": 171
    },
    {
      "epoch": 1.6888068880688807,
      "grad_norm": 0.06935996975041725,
      "learning_rate": 7.985174597782469e-06,
      "loss": 0.0197,
      "num_tokens": 76974869.0,
      "step": 172
    },
    {
      "epoch": 1.6986469864698646,
      "grad_norm": 0.0812644475398725,
      "learning_rate": 7.961266711550922e-06,
      "loss": 0.0259,
      "num_tokens": 77413009.0,
      "step": 173
    },
    {
      "epoch": 1.7084870848708487,
      "grad_norm": 0.07469198601302375,
      "learning_rate": 7.937259284075872e-06,
      "loss": 0.0191,
      "num_tokens": 77854298.0,
      "step": 174
    },
    {
      "epoch": 1.7183271832718328,
      "grad_norm": 0.07554209425696685,
      "learning_rate": 7.913153286291995e-06,
      "loss": 0.025,
      "num_tokens": 78299682.0,
      "step": 175
    },
    {
      "epoch": 1.7281672816728166,
      "grad_norm": 0.07564661483692575,
      "learning_rate": 7.888949693120443e-06,
      "loss": 0.0172,
      "num_tokens": 78723460.0,
      "step": 176
    },
    {
      "epoch": 1.7380073800738007,
      "grad_norm": 0.6264202015289688,
      "learning_rate": 7.864649483429442e-06,
      "loss": 0.0402,
      "num_tokens": 79151526.0,
      "step": 177
    },
    {
      "epoch": 1.7478474784747848,
      "grad_norm": 0.07431323606896861,
      "learning_rate": 7.840253639994676e-06,
      "loss": 0.0182,
      "num_tokens": 79591692.0,
      "step": 178
    },
    {
      "epoch": 1.7576875768757687,
      "grad_norm": 0.07199128250127072,
      "learning_rate": 7.815763149459563e-06,
      "loss": 0.018,
      "num_tokens": 80054397.0,
      "step": 179
    },
    {
      "epoch": 1.7675276752767528,
      "grad_norm": 0.0736771332831437,
      "learning_rate": 7.791179002295334e-06,
      "loss": 0.0182,
      "num_tokens": 80527436.0,
      "step": 180
    },
    {
      "epoch": 1.777367773677737,
      "grad_norm": 0.0722896910687323,
      "learning_rate": 7.766502192760995e-06,
      "loss": 0.0299,
      "num_tokens": 80984085.0,
      "step": 181
    },
    {
      "epoch": 1.7872078720787208,
      "grad_norm": 0.13146348676004535,
      "learning_rate": 7.741733718863096e-06,
      "loss": 0.0172,
      "num_tokens": 81417093.0,
      "step": 182
    },
    {
      "epoch": 1.7970479704797047,
      "grad_norm": 0.07559775090622188,
      "learning_rate": 7.71687458231538e-06,
      "loss": 0.0173,
      "num_tokens": 81857802.0,
      "step": 183
    },
    {
      "epoch": 1.8068880688806888,
      "grad_norm": 0.07625026619956689,
      "learning_rate": 7.69192578849827e-06,
      "loss": 0.0174,
      "num_tokens": 82314635.0,
      "step": 184
    },
    {
      "epoch": 1.8167281672816729,
      "grad_norm": 0.07079163666898536,
      "learning_rate": 7.666888346418205e-06,
      "loss": 0.0255,
      "num_tokens": 82774404.0,
      "step": 185
    },
    {
      "epoch": 1.8265682656826567,
      "grad_norm": 0.07862230056744444,
      "learning_rate": 7.641763268666832e-06,
      "loss": 0.0166,
      "num_tokens": 83224858.0,
      "step": 186
    },
    {
      "epoch": 1.8364083640836408,
      "grad_norm": 0.07767548895299481,
      "learning_rate": 7.616551571380061e-06,
      "loss": 0.0303,
      "num_tokens": 83685638.0,
      "step": 187
    },
    {
      "epoch": 1.846248462484625,
      "grad_norm": 0.0767555813557926,
      "learning_rate": 7.5912542741969585e-06,
      "loss": 0.0173,
      "num_tokens": 84118329.0,
      "step": 188
    },
    {
      "epoch": 1.8560885608856088,
      "grad_norm": 0.06505326217418561,
      "learning_rate": 7.5658724002185215e-06,
      "loss": 0.2302,
      "num_tokens": 84642441.0,
      "step": 189
    },
    {
      "epoch": 1.865928659286593,
      "grad_norm": 0.9831912884395022,
      "learning_rate": 7.54040697596629e-06,
      "loss": 0.0173,
      "num_tokens": 85075013.0,
      "step": 190
    },
    {
      "epoch": 1.875768757687577,
      "grad_norm": 0.0852074767092427,
      "learning_rate": 7.514859031340835e-06,
      "loss": 0.0197,
      "num_tokens": 85539398.0,
      "step": 191
    },
    {
      "epoch": 1.8856088560885609,
      "grad_norm": 0.07502455159038045,
      "learning_rate": 7.489229599580111e-06,
      "loss": 0.0167,
      "num_tokens": 85976652.0,
      "step": 192
    },
    {
      "epoch": 1.8954489544895448,
      "grad_norm": 0.07796568336104527,
      "learning_rate": 7.463519717217663e-06,
      "loss": 0.0253,
      "num_tokens": 86404836.0,
      "step": 193
    },
    {
      "epoch": 1.9052890528905289,
      "grad_norm": 0.07733304316410633,
      "learning_rate": 7.437730424040702e-06,
      "loss": 0.0232,
      "num_tokens": 86871021.0,
      "step": 194
    },
    {
      "epoch": 1.915129151291513,
      "grad_norm": 0.07837311923363188,
      "learning_rate": 7.411862763048068e-06,
      "loss": 0.0228,
      "num_tokens": 87328297.0,
      "step": 195
    },
    {
      "epoch": 1.9249692496924968,
      "grad_norm": 0.07159308881612252,
      "learning_rate": 7.38591778040803e-06,
      "loss": 0.0178,
      "num_tokens": 87780478.0,
      "step": 196
    },
    {
      "epoch": 1.934809348093481,
      "grad_norm": 0.06995284279442164,
      "learning_rate": 7.359896525415986e-06,
      "loss": 0.0166,
      "num_tokens": 88245218.0,
      "step": 197
    },
    {
      "epoch": 1.944649446494465,
      "grad_norm": 0.074185946727602,
      "learning_rate": 7.333800050452024e-06,
      "loss": 0.0335,
      "num_tokens": 88720048.0,
      "step": 198
    },
    {
      "epoch": 1.954489544895449,
      "grad_norm": 0.0936664061322253,
      "learning_rate": 7.307629410938364e-06,
      "loss": 0.0156,
      "num_tokens": 89171687.0,
      "step": 199
    },
    {
      "epoch": 1.964329643296433,
      "grad_norm": 0.06592479834851843,
      "learning_rate": 7.281385665296663e-06,
      "loss": 0.0162,
      "num_tokens": 89636320.0,
      "step": 200
    },
    {
      "epoch": 1.974169741697417,
      "grad_norm": 0.08486840853612633,
      "learning_rate": 7.255069874905221e-06,
      "loss": 0.0177,
      "num_tokens": 90074778.0,
      "step": 201
    },
    {
      "epoch": 1.984009840098401,
      "grad_norm": 0.06923307537599123,
      "learning_rate": 7.228683104056051e-06,
      "loss": 0.0168,
      "num_tokens": 90519743.0,
      "step": 202
    },
    {
      "epoch": 1.9938499384993849,
      "grad_norm": 0.09250588119689185,
      "learning_rate": 7.202226419911832e-06,
      "loss": 0.0266,
      "num_tokens": 90971202.0,
      "step": 203
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.08932358174959376,
      "learning_rate": 7.175700892462757e-06,
      "loss": 0.0167,
      "num_tokens": 91183681.0,
      "step": 204
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.08808860927820206,
      "eval_num_tokens": 91183681.0,
      "eval_runtime": 53.9315,
      "eval_samples_per_second": 41.645,
      "eval_steps_per_second": 5.21,
      "step": 204
    },
    {
      "epoch": 2.009840098400984,
      "grad_norm": 0.07874869315833909,
      "learning_rate": 7.149107594483251e-06,
      "loss": 0.0142,
      "num_tokens": 91625671.0,
      "step": 205
    },
    {
      "epoch": 2.019680196801968,
      "grad_norm": 0.06385620551213778,
      "learning_rate": 7.122447601488592e-06,
      "loss": 0.0132,
      "num_tokens": 92071488.0,
      "step": 206
    },
    {
      "epoch": 2.029520295202952,
      "grad_norm": 0.06846197400142105,
      "learning_rate": 7.095721991691411e-06,
      "loss": 0.0149,
      "num_tokens": 92542156.0,
      "step": 207
    },
    {
      "epoch": 2.039360393603936,
      "grad_norm": 0.07424945414823086,
      "learning_rate": 7.0689318459580845e-06,
      "loss": 0.0156,
      "num_tokens": 93002703.0,
      "step": 208
    },
    {
      "epoch": 2.0492004920049203,
      "grad_norm": 0.06687580312011086,
      "learning_rate": 7.042078247765019e-06,
      "loss": 0.0135,
      "num_tokens": 93436834.0,
      "step": 209
    },
    {
      "epoch": 2.059040590405904,
      "grad_norm": 0.07720021453648518,
      "learning_rate": 7.015162283154843e-06,
      "loss": 0.0137,
      "num_tokens": 93871635.0,
      "step": 210
    },
    {
      "epoch": 2.068880688806888,
      "grad_norm": 0.13453391743262458,
      "learning_rate": 6.988185040692469e-06,
      "loss": 0.0221,
      "num_tokens": 94314058.0,
      "step": 211
    },
    {
      "epoch": 2.078720787207872,
      "grad_norm": 0.07982223152072775,
      "learning_rate": 6.961147611421076e-06,
      "loss": 0.017,
      "num_tokens": 94750976.0,
      "step": 212
    },
    {
      "epoch": 2.088560885608856,
      "grad_norm": 0.06995730861373262,
      "learning_rate": 6.934051088817988e-06,
      "loss": 0.0137,
      "num_tokens": 95193789.0,
      "step": 213
    },
    {
      "epoch": 2.09840098400984,
      "grad_norm": 0.07438600726959783,
      "learning_rate": 6.906896568750441e-06,
      "loss": 0.0193,
      "num_tokens": 95676386.0,
      "step": 214
    },
    {
      "epoch": 2.108241082410824,
      "grad_norm": 0.09331884860488432,
      "learning_rate": 6.87968514943127e-06,
      "loss": 0.0154,
      "num_tokens": 96137917.0,
      "step": 215
    },
    {
      "epoch": 2.1180811808118083,
      "grad_norm": 0.06703452835053635,
      "learning_rate": 6.852417931374494e-06,
      "loss": 0.0134,
      "num_tokens": 96568059.0,
      "step": 216
    },
    {
      "epoch": 2.127921279212792,
      "grad_norm": 0.07093081986870549,
      "learning_rate": 6.825096017350807e-06,
      "loss": 0.0138,
      "num_tokens": 97019588.0,
      "step": 217
    },
    {
      "epoch": 2.137761377613776,
      "grad_norm": 0.0650948479503258,
      "learning_rate": 6.797720512342967e-06,
      "loss": 0.0137,
      "num_tokens": 97456418.0,
      "step": 218
    },
    {
      "epoch": 2.14760147601476,
      "grad_norm": 0.06693139683273135,
      "learning_rate": 6.77029252350113e-06,
      "loss": 0.0142,
      "num_tokens": 97874765.0,
      "step": 219
    },
    {
      "epoch": 2.1574415744157442,
      "grad_norm": 0.07881816970778455,
      "learning_rate": 6.742813160098054e-06,
      "loss": 0.0188,
      "num_tokens": 98322373.0,
      "step": 220
    },
    {
      "epoch": 2.167281672816728,
      "grad_norm": 0.07381706020969016,
      "learning_rate": 6.715283533484242e-06,
      "loss": 0.0125,
      "num_tokens": 98762055.0,
      "step": 221
    },
    {
      "epoch": 2.177121771217712,
      "grad_norm": 0.06829050170688594,
      "learning_rate": 6.6877047570430044e-06,
      "loss": 0.0147,
      "num_tokens": 99212257.0,
      "step": 222
    },
    {
      "epoch": 2.1869618696186963,
      "grad_norm": 0.0726323898489312,
      "learning_rate": 6.660077946145412e-06,
      "loss": 0.0149,
      "num_tokens": 99651696.0,
      "step": 223
    },
    {
      "epoch": 2.19680196801968,
      "grad_norm": 0.06996376101830218,
      "learning_rate": 6.632404218105205e-06,
      "loss": 0.014,
      "num_tokens": 100115333.0,
      "step": 224
    },
    {
      "epoch": 2.206642066420664,
      "grad_norm": 0.07058857975728597,
      "learning_rate": 6.604684692133597e-06,
      "loss": 0.0128,
      "num_tokens": 100567168.0,
      "step": 225
    },
    {
      "epoch": 2.2164821648216484,
      "grad_norm": 0.06705830086377462,
      "learning_rate": 6.576920489294011e-06,
      "loss": 0.014,
      "num_tokens": 101017414.0,
      "step": 226
    },
    {
      "epoch": 2.2263222632226323,
      "grad_norm": 0.08216121325842957,
      "learning_rate": 6.549112732456739e-06,
      "loss": 0.0244,
      "num_tokens": 101478653.0,
      "step": 227
    },
    {
      "epoch": 2.236162361623616,
      "grad_norm": 0.06604918422838713,
      "learning_rate": 6.5212625462535365e-06,
      "loss": 0.0133,
      "num_tokens": 101922998.0,
      "step": 228
    },
    {
      "epoch": 2.2460024600246005,
      "grad_norm": 0.06450225948970358,
      "learning_rate": 6.493371057032129e-06,
      "loss": 0.0149,
      "num_tokens": 102357947.0,
      "step": 229
    },
    {
      "epoch": 2.2558425584255843,
      "grad_norm": 0.07514996917424294,
      "learning_rate": 6.465439392810664e-06,
      "loss": 0.0167,
      "num_tokens": 102803832.0,
      "step": 230
    },
    {
      "epoch": 2.265682656826568,
      "grad_norm": 0.06462428507734051,
      "learning_rate": 6.4374686832320944e-06,
      "loss": 0.0142,
      "num_tokens": 103241692.0,
      "step": 231
    },
    {
      "epoch": 2.275522755227552,
      "grad_norm": 0.06485952063828938,
      "learning_rate": 6.409460059518482e-06,
      "loss": 0.0136,
      "num_tokens": 103688326.0,
      "step": 232
    },
    {
      "epoch": 2.2853628536285364,
      "grad_norm": 0.06533997999817706,
      "learning_rate": 6.381414654425261e-06,
      "loss": 0.0131,
      "num_tokens": 104139997.0,
      "step": 233
    },
    {
      "epoch": 2.2952029520295203,
      "grad_norm": 0.06878268907753365,
      "learning_rate": 6.353333602195414e-06,
      "loss": 0.0138,
      "num_tokens": 104583247.0,
      "step": 234
    },
    {
      "epoch": 2.305043050430504,
      "grad_norm": 0.061527579151490784,
      "learning_rate": 6.325218038513604e-06,
      "loss": 0.0129,
      "num_tokens": 105013546.0,
      "step": 235
    },
    {
      "epoch": 2.3148831488314885,
      "grad_norm": 0.0688594189041464,
      "learning_rate": 6.2970691004602425e-06,
      "loss": 0.0147,
      "num_tokens": 105469533.0,
      "step": 236
    },
    {
      "epoch": 2.3247232472324724,
      "grad_norm": 0.07212293085873876,
      "learning_rate": 6.26888792646551e-06,
      "loss": 0.0138,
      "num_tokens": 105902012.0,
      "step": 237
    },
    {
      "epoch": 2.3345633456334562,
      "grad_norm": 0.07097729248579715,
      "learning_rate": 6.240675656263303e-06,
      "loss": 0.0133,
      "num_tokens": 106319708.0,
      "step": 238
    },
    {
      "epoch": 2.34440344403444,
      "grad_norm": 0.0702207231329528,
      "learning_rate": 6.212433430845145e-06,
      "loss": 0.0136,
      "num_tokens": 106767770.0,
      "step": 239
    },
    {
      "epoch": 2.3542435424354244,
      "grad_norm": 0.06717197740035392,
      "learning_rate": 6.184162392414044e-06,
      "loss": 0.0127,
      "num_tokens": 107230010.0,
      "step": 240
    },
    {
      "epoch": 2.3640836408364083,
      "grad_norm": 0.09206853570190297,
      "learning_rate": 6.155863684338294e-06,
      "loss": 0.0182,
      "num_tokens": 107696665.0,
      "step": 241
    },
    {
      "epoch": 2.373923739237392,
      "grad_norm": 0.07931539686074184,
      "learning_rate": 6.127538451105232e-06,
      "loss": 0.0156,
      "num_tokens": 108145998.0,
      "step": 242
    },
    {
      "epoch": 2.3837638376383765,
      "grad_norm": 0.0845167365221342,
      "learning_rate": 6.099187838274959e-06,
      "loss": 0.0304,
      "num_tokens": 108605347.0,
      "step": 243
    },
    {
      "epoch": 2.3936039360393604,
      "grad_norm": 0.8319925155014395,
      "learning_rate": 6.070812992434003e-06,
      "loss": 0.077,
      "num_tokens": 109053120.0,
      "step": 244
    },
    {
      "epoch": 2.4034440344403443,
      "grad_norm": 0.08254084053779843,
      "learning_rate": 6.042415061148954e-06,
      "loss": 0.0153,
      "num_tokens": 109511574.0,
      "step": 245
    },
    {
      "epoch": 2.4132841328413286,
      "grad_norm": 0.07621464852457635,
      "learning_rate": 6.013995192920044e-06,
      "loss": 0.013,
      "num_tokens": 109961861.0,
      "step": 246
    },
    {
      "epoch": 2.4231242312423125,
      "grad_norm": 0.06290755400921484,
      "learning_rate": 5.985554537134702e-06,
      "loss": 0.0133,
      "num_tokens": 110439530.0,
      "step": 247
    },
    {
      "epoch": 2.4329643296432963,
      "grad_norm": 0.06549923207889226,
      "learning_rate": 5.957094244021071e-06,
      "loss": 0.0133,
      "num_tokens": 110902468.0,
      "step": 248
    },
    {
      "epoch": 2.4428044280442807,
      "grad_norm": 0.06398296126869986,
      "learning_rate": 5.928615464601497e-06,
      "loss": 0.0128,
      "num_tokens": 111361759.0,
      "step": 249
    },
    {
      "epoch": 2.4526445264452645,
      "grad_norm": 0.062244715362799644,
      "learning_rate": 5.900119350645956e-06,
      "loss": 0.0128,
      "num_tokens": 111799435.0,
      "step": 250
    },
    {
      "epoch": 2.4624846248462484,
      "grad_norm": 0.06503161600374163,
      "learning_rate": 5.871607054625497e-06,
      "loss": 0.0128,
      "num_tokens": 112244747.0,
      "step": 251
    },
    {
      "epoch": 2.4723247232472323,
      "grad_norm": 0.08086590997362891,
      "learning_rate": 5.8430797296656125e-06,
      "loss": 0.0184,
      "num_tokens": 112678903.0,
      "step": 252
    },
    {
      "epoch": 2.4821648216482166,
      "grad_norm": 0.07239451855920867,
      "learning_rate": 5.814538529499622e-06,
      "loss": 0.0149,
      "num_tokens": 113132832.0,
      "step": 253
    },
    {
      "epoch": 2.4920049200492005,
      "grad_norm": 0.06030312987290577,
      "learning_rate": 5.785984608421993e-06,
      "loss": 0.0127,
      "num_tokens": 113568429.0,
      "step": 254
    },
    {
      "epoch": 2.5018450184501844,
      "grad_norm": 0.06349775541516244,
      "learning_rate": 5.757419121241667e-06,
      "loss": 0.0125,
      "num_tokens": 114042240.0,
      "step": 255
    },
    {
      "epoch": 2.5116851168511687,
      "grad_norm": 0.06952013750985335,
      "learning_rate": 5.7288432232353615e-06,
      "loss": 0.0204,
      "num_tokens": 114496441.0,
      "step": 256
    },
    {
      "epoch": 2.5215252152521526,
      "grad_norm": 0.0958262233433174,
      "learning_rate": 5.7002580701008325e-06,
      "loss": 0.0149,
      "num_tokens": 114936236.0,
      "step": 257
    },
    {
      "epoch": 2.5313653136531364,
      "grad_norm": 0.06572975411347728,
      "learning_rate": 5.6716648179101445e-06,
      "loss": 0.0123,
      "num_tokens": 115365529.0,
      "step": 258
    },
    {
      "epoch": 2.5412054120541203,
      "grad_norm": 0.07287254897275752,
      "learning_rate": 5.64306462306291e-06,
      "loss": 0.0177,
      "num_tokens": 115812361.0,
      "step": 259
    },
    {
      "epoch": 2.5510455104551046,
      "grad_norm": 0.0677506186552676,
      "learning_rate": 5.614458642239534e-06,
      "loss": 0.0126,
      "num_tokens": 116269752.0,
      "step": 260
    },
    {
      "epoch": 2.5608856088560885,
      "grad_norm": 0.07088790175345892,
      "learning_rate": 5.585848032354411e-06,
      "loss": 0.0139,
      "num_tokens": 116739082.0,
      "step": 261
    },
    {
      "epoch": 2.570725707257073,
      "grad_norm": 2.483507979054926,
      "learning_rate": 5.557233950509159e-06,
      "loss": 0.3298,
      "num_tokens": 117236975.0,
      "step": 262
    },
    {
      "epoch": 2.5805658056580567,
      "grad_norm": 0.6712341553033803,
      "learning_rate": 5.528617553945807e-06,
      "loss": 0.0131,
      "num_tokens": 117701799.0,
      "step": 263
    },
    {
      "epoch": 2.5904059040590406,
      "grad_norm": 0.070379027103792,
      "learning_rate": 5.500000000000001e-06,
      "loss": 0.019,
      "num_tokens": 118190544.0,
      "step": 264
    },
    {
      "epoch": 2.6002460024600245,
      "grad_norm": 0.09944926431551483,
      "learning_rate": 5.4713824460541964e-06,
      "loss": 0.0153,
      "num_tokens": 118625146.0,
      "step": 265
    },
    {
      "epoch": 2.6100861008610083,
      "grad_norm": 0.07370939155932825,
      "learning_rate": 5.442766049490843e-06,
      "loss": 0.0138,
      "num_tokens": 119077739.0,
      "step": 266
    },
    {
      "epoch": 2.6199261992619927,
      "grad_norm": 0.06555516765204612,
      "learning_rate": 5.414151967645591e-06,
      "loss": 0.0136,
      "num_tokens": 119502701.0,
      "step": 267
    },
    {
      "epoch": 2.6297662976629765,
      "grad_norm": 0.060577987544993946,
      "learning_rate": 5.385541357760469e-06,
      "loss": 0.0121,
      "num_tokens": 119956823.0,
      "step": 268
    },
    {
      "epoch": 2.639606396063961,
      "grad_norm": 0.06969958736256228,
      "learning_rate": 5.35693537693709e-06,
      "loss": 0.0131,
      "num_tokens": 120410284.0,
      "step": 269
    },
    {
      "epoch": 2.6494464944649447,
      "grad_norm": 0.08178808292429539,
      "learning_rate": 5.3283351820898586e-06,
      "loss": 0.0183,
      "num_tokens": 120837514.0,
      "step": 270
    },
    {
      "epoch": 2.6592865928659286,
      "grad_norm": 0.12228602708630738,
      "learning_rate": 5.299741929899171e-06,
      "loss": 0.0206,
      "num_tokens": 121266377.0,
      "step": 271
    },
    {
      "epoch": 2.6691266912669125,
      "grad_norm": 0.07647057417070459,
      "learning_rate": 5.27115677676464e-06,
      "loss": 0.0154,
      "num_tokens": 121730907.0,
      "step": 272
    },
    {
      "epoch": 2.678966789667897,
      "grad_norm": 0.07263570161343703,
      "learning_rate": 5.242580878758334e-06,
      "loss": 0.0138,
      "num_tokens": 122162564.0,
      "step": 273
    },
    {
      "epoch": 2.6888068880688807,
      "grad_norm": 0.07390794347850005,
      "learning_rate": 5.21401539157801e-06,
      "loss": 0.0131,
      "num_tokens": 122644233.0,
      "step": 274
    },
    {
      "epoch": 2.6986469864698646,
      "grad_norm": 0.05624120433704004,
      "learning_rate": 5.1854614705003796e-06,
      "loss": 0.0114,
      "num_tokens": 123070674.0,
      "step": 275
    },
    {
      "epoch": 2.708487084870849,
      "grad_norm": 0.07371873132309133,
      "learning_rate": 5.156920270334389e-06,
      "loss": 0.0194,
      "num_tokens": 123517476.0,
      "step": 276
    },
    {
      "epoch": 2.7183271832718328,
      "grad_norm": 0.06758978472435712,
      "learning_rate": 5.1283929453745055e-06,
      "loss": 0.0129,
      "num_tokens": 123957650.0,
      "step": 277
    },
    {
      "epoch": 2.7281672816728166,
      "grad_norm": 0.06857276382476074,
      "learning_rate": 5.099880649354044e-06,
      "loss": 0.0125,
      "num_tokens": 124423561.0,
      "step": 278
    },
    {
      "epoch": 2.7380073800738005,
      "grad_norm": 0.06198166285648246,
      "learning_rate": 5.071384535398505e-06,
      "loss": 0.0119,
      "num_tokens": 124871204.0,
      "step": 279
    },
    {
      "epoch": 2.747847478474785,
      "grad_norm": 0.05801997208341688,
      "learning_rate": 5.04290575597893e-06,
      "loss": 0.0119,
      "num_tokens": 125320936.0,
      "step": 280
    },
    {
      "epoch": 2.7576875768757687,
      "grad_norm": 0.09983800531852628,
      "learning_rate": 5.0144454628653015e-06,
      "loss": 0.0157,
      "num_tokens": 125785587.0,
      "step": 281
    },
    {
      "epoch": 2.767527675276753,
      "grad_norm": 0.05961861980322237,
      "learning_rate": 4.986004807079959e-06,
      "loss": 0.0119,
      "num_tokens": 126223799.0,
      "step": 282
    },
    {
      "epoch": 2.777367773677737,
      "grad_norm": 0.06887056012305312,
      "learning_rate": 4.957584938851048e-06,
      "loss": 0.0127,
      "num_tokens": 126674560.0,
      "step": 283
    },
    {
      "epoch": 2.787207872078721,
      "grad_norm": 0.06432285678662777,
      "learning_rate": 4.929187007565996e-06,
      "loss": 0.0124,
      "num_tokens": 127121758.0,
      "step": 284
    },
    {
      "epoch": 2.7970479704797047,
      "grad_norm": 0.06283306903955838,
      "learning_rate": 4.9008121617250425e-06,
      "loss": 0.0122,
      "num_tokens": 127564319.0,
      "step": 285
    },
    {
      "epoch": 2.8068880688806885,
      "grad_norm": 0.07395862495517919,
      "learning_rate": 4.87246154889477e-06,
      "loss": 0.0125,
      "num_tokens": 128014723.0,
      "step": 286
| }, | |
| { | |
| "epoch": 2.816728167281673, | |
| "grad_norm": 0.06772968868173306, | |
| "learning_rate": 4.8441363156617085e-06, | |
| "loss": 0.026, | |
| "num_tokens": 128456573.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.8265682656826567, | |
| "grad_norm": 0.2058477599150272, | |
| "learning_rate": 4.815837607585957e-06, | |
| "loss": 0.0313, | |
| "num_tokens": 128888085.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.836408364083641, | |
| "grad_norm": 0.05983028509302605, | |
| "learning_rate": 4.787566569154855e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 129344186.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.846248462484625, | |
| "grad_norm": 0.1679165256737002, | |
| "learning_rate": 4.759324343736698e-06, | |
| "loss": 0.0268, | |
| "num_tokens": 129820337.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.856088560885609, | |
| "grad_norm": 0.069693981729958, | |
| "learning_rate": 4.731112073534491e-06, | |
| "loss": 0.012, | |
| "num_tokens": 130264132.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.8659286592865927, | |
| "grad_norm": 0.05673801969192786, | |
| "learning_rate": 4.70293089953976e-06, | |
| "loss": 0.237, | |
| "num_tokens": 130747367.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.875768757687577, | |
| "grad_norm": 0.9244716369700087, | |
| "learning_rate": 4.674781961486399e-06, | |
| "loss": 0.0129, | |
| "num_tokens": 131189544.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.885608856088561, | |
| "grad_norm": 0.0670539720853974, | |
| "learning_rate": 4.646666397804586e-06, | |
| "loss": 0.0127, | |
| "num_tokens": 131615817.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.8954489544895448, | |
| "grad_norm": 0.07778029323101539, | |
| "learning_rate": 4.618585345574741e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 132065833.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.905289052890529, | |
| "grad_norm": 0.06633645417900966, | |
| "learning_rate": 4.5905399404815196e-06, | |
| "loss": 0.0119, | |
| "num_tokens": 132513181.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.915129151291513, | |
| "grad_norm": 0.06604742202311176, | |
| "learning_rate": 4.562531316767908e-06, | |
| "loss": 0.0178, | |
| "num_tokens": 132975979.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.924969249692497, | |
| "grad_norm": 0.06375772945002761, | |
| "learning_rate": 4.534560607189338e-06, | |
| "loss": 0.0121, | |
| "num_tokens": 133411946.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.9348093480934807, | |
| "grad_norm": 0.0644873715390372, | |
| "learning_rate": 4.506628942967874e-06, | |
| "loss": 0.0226, | |
| "num_tokens": 133882037.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.944649446494465, | |
| "grad_norm": 0.06122403707300358, | |
| "learning_rate": 4.478737453746464e-06, | |
| "loss": 0.0111, | |
| "num_tokens": 134338580.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.954489544895449, | |
| "grad_norm": 0.06192995198797032, | |
| "learning_rate": 4.450887267543261e-06, | |
| "loss": 0.023, | |
| "num_tokens": 134806429.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.9643296432964332, | |
| "grad_norm": 0.06577423487360488, | |
| "learning_rate": 4.423079510705992e-06, | |
| "loss": 0.0127, | |
| "num_tokens": 135253050.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.974169741697417, | |
| "grad_norm": 0.061821762890230156, | |
| "learning_rate": 4.395315307866404e-06, | |
| "loss": 0.0118, | |
| "num_tokens": 135701900.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.984009840098401, | |
| "grad_norm": 0.060295397517859534, | |
| "learning_rate": 4.3675957818947965e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 136134539.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.993849938499385, | |
| "grad_norm": 0.06204359834906306, | |
| "learning_rate": 4.33992205385459e-06, | |
| "loss": 0.0119, | |
| "num_tokens": 136581981.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.06204359834906306, | |
| "learning_rate": 4.312295242956998e-06, | |
| "loss": 0.0109, | |
| "num_tokens": 136774441.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.0963606908917427, | |
| "eval_num_tokens": 136774441.0, | |
| "eval_runtime": 53.9214, | |
| "eval_samples_per_second": 41.653, | |
| "eval_steps_per_second": 5.211, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 3.009840098400984, | |
| "grad_norm": 0.08266586517900253, | |
| "learning_rate": 4.284716466515759e-06, | |
| "loss": 0.0218, | |
| "num_tokens": 137235846.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 3.019680196801968, | |
| "grad_norm": 0.06025259361613064, | |
| "learning_rate": 4.257186839901948e-06, | |
| "loss": 0.01, | |
| "num_tokens": 137676575.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 3.029520295202952, | |
| "grad_norm": 0.059520087712568295, | |
| "learning_rate": 4.229707476498871e-06, | |
| "loss": 0.0107, | |
| "num_tokens": 138127277.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 3.039360393603936, | |
| "grad_norm": 0.060007105121960225, | |
| "learning_rate": 4.2022794876570335e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 138558346.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.0492004920049203, | |
| "grad_norm": 0.05765555936281279, | |
| "learning_rate": 4.1749039826491956e-06, | |
| "loss": 0.2021, | |
| "num_tokens": 139029117.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 3.059040590405904, | |
| "grad_norm": 0.25549047851203505, | |
| "learning_rate": 4.1475820686255055e-06, | |
| "loss": 0.01, | |
| "num_tokens": 139465608.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 3.068880688806888, | |
| "grad_norm": 0.05745397404349778, | |
| "learning_rate": 4.120314850568731e-06, | |
| "loss": 0.0291, | |
| "num_tokens": 139932040.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 3.078720787207872, | |
| "grad_norm": 0.21571060654935606, | |
| "learning_rate": 4.093103431249563e-06, | |
| "loss": 0.011, | |
| "num_tokens": 140393810.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 3.088560885608856, | |
| "grad_norm": 0.06271676867820344, | |
| "learning_rate": 4.065948911182015e-06, | |
| "loss": 0.018, | |
| "num_tokens": 140853306.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.09840098400984, | |
| "grad_norm": 0.06529992912597996, | |
| "learning_rate": 4.038852388578925e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 141293974.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 3.108241082410824, | |
| "grad_norm": 0.0613594667302306, | |
| "learning_rate": 4.011814959307533e-06, | |
| "loss": 0.0101, | |
| "num_tokens": 141739396.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 3.1180811808118083, | |
| "grad_norm": 0.06143281774280475, | |
| "learning_rate": 3.984837716845157e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 142181417.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 3.127921279212792, | |
| "grad_norm": 0.06065540767441434, | |
| "learning_rate": 3.957921752234982e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 142615273.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 3.137761377613776, | |
| "grad_norm": 0.0565367496699821, | |
| "learning_rate": 3.931068154041919e-06, | |
| "loss": 0.0156, | |
| "num_tokens": 143066695.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.14760147601476, | |
| "grad_norm": 0.0928817994214938, | |
| "learning_rate": 3.904278008308589e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 143543314.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 3.1574415744157442, | |
| "grad_norm": 0.05348206917431186, | |
| "learning_rate": 3.877552398511409e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 143978640.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 3.167281672816728, | |
| "grad_norm": 0.05744861837720995, | |
| "learning_rate": 3.85089240551675e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 144437143.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 3.177121771217712, | |
| "grad_norm": 0.05917730480215664, | |
| "learning_rate": 3.8242991075372436e-06, | |
| "loss": 0.0103, | |
| "num_tokens": 144882614.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 3.1869618696186963, | |
| "grad_norm": 0.06138753989215512, | |
| "learning_rate": 3.7977735800881687e-06, | |
| "loss": 0.01, | |
| "num_tokens": 145336615.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.19680196801968, | |
| "grad_norm": 0.057934477141044834, | |
| "learning_rate": 3.7713168959439515e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 145791703.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 3.206642066420664, | |
| "grad_norm": 0.062311400511582536, | |
| "learning_rate": 3.74493012509478e-06, | |
| "loss": 0.0163, | |
| "num_tokens": 146256588.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.2164821648216484, | |
| "grad_norm": 0.11046706497961999, | |
| "learning_rate": 3.718614334703339e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 146704790.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 3.2263222632226323, | |
| "grad_norm": 0.06040935915809342, | |
| "learning_rate": 3.692370589061639e-06, | |
| "loss": 0.0161, | |
| "num_tokens": 147150851.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 3.236162361623616, | |
| "grad_norm": 0.06309596528426079, | |
| "learning_rate": 3.6661999495479772e-06, | |
| "loss": 0.0116, | |
| "num_tokens": 147586533.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.2460024600246005, | |
| "grad_norm": 0.0775947611650109, | |
| "learning_rate": 3.640103474584016e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 148012817.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 3.2558425584255843, | |
| "grad_norm": 0.060442066581616015, | |
| "learning_rate": 3.614082219591972e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 148454349.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 3.265682656826568, | |
| "grad_norm": 0.0599277899760194, | |
| "learning_rate": 3.588137236951934e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 148908837.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 3.275522755227552, | |
| "grad_norm": 0.06389649266611047, | |
| "learning_rate": 3.5622695759592996e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 149387409.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 3.2853628536285364, | |
| "grad_norm": 0.059031876557593344, | |
| "learning_rate": 3.5364802827823397e-06, | |
| "loss": 0.0124, | |
| "num_tokens": 149842184.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.2952029520295203, | |
| "grad_norm": 0.06425762134540147, | |
| "learning_rate": 3.5107704004198904e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 150294624.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 3.305043050430504, | |
| "grad_norm": 0.060359900802863305, | |
| "learning_rate": 3.485140968659166e-06, | |
| "loss": 0.0156, | |
| "num_tokens": 150757952.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 3.3148831488314885, | |
| "grad_norm": 0.06451910432321761, | |
| "learning_rate": 3.4595930240337115e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 151210941.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 3.3247232472324724, | |
| "grad_norm": 0.05771756769585445, | |
| "learning_rate": 3.4341275997814795e-06, | |
| "loss": 0.0311, | |
| "num_tokens": 151659703.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.3345633456334562, | |
| "grad_norm": 0.2709101034464869, | |
| "learning_rate": 3.408745725803042e-06, | |
| "loss": 0.0198, | |
| "num_tokens": 152096656.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.34440344403444, | |
| "grad_norm": 0.2165805542100797, | |
| "learning_rate": 3.383448428619941e-06, | |
| "loss": 0.0109, | |
| "num_tokens": 152535937.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 3.3542435424354244, | |
| "grad_norm": 0.06249104678860667, | |
| "learning_rate": 3.3582367313331692e-06, | |
| "loss": 0.0241, | |
| "num_tokens": 153012481.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.3640836408364083, | |
| "grad_norm": 0.07444091538512662, | |
| "learning_rate": 3.3331116535817974e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 153457239.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.373923739237392, | |
| "grad_norm": 0.05744783875540723, | |
| "learning_rate": 3.308074211501732e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 153885310.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.3837638376383765, | |
| "grad_norm": 0.062108203142145886, | |
| "learning_rate": 3.2831254176846205e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 154315565.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.3936039360393604, | |
| "grad_norm": 0.06493988486024563, | |
| "learning_rate": 3.258266281136905e-06, | |
| "loss": 0.0154, | |
| "num_tokens": 154761237.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.4034440344403443, | |
| "grad_norm": 0.07703452506780802, | |
| "learning_rate": 3.233497807239008e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 155219079.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.4132841328413286, | |
| "grad_norm": 0.07716474025857703, | |
| "learning_rate": 3.2088209977046657e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 155672847.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.4231242312423125, | |
| "grad_norm": 0.0598011605849924, | |
| "learning_rate": 3.1842368505404388e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 156097592.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.4329643296432963, | |
| "grad_norm": 0.06067024127693304, | |
| "learning_rate": 3.1597463600053258e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 156543931.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.4428044280442807, | |
| "grad_norm": 0.06276348610439125, | |
| "learning_rate": 3.135350516570559e-06, | |
| "loss": 0.0115, | |
| "num_tokens": 156993093.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.4526445264452645, | |
| "grad_norm": 0.07056305058653452, | |
| "learning_rate": 3.111050306879556e-06, | |
| "loss": 0.0161, | |
| "num_tokens": 157435895.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.4624846248462484, | |
| "grad_norm": 0.0692853066303934, | |
| "learning_rate": 3.0868467137080075e-06, | |
| "loss": 0.0124, | |
| "num_tokens": 157859703.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.4723247232472323, | |
| "grad_norm": 0.06622059827297899, | |
| "learning_rate": 3.0627407159241273e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 158319159.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.4821648216482166, | |
| "grad_norm": 0.06424105970441871, | |
| "learning_rate": 3.0387332884490806e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 158768974.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.4920049200492005, | |
| "grad_norm": 0.06970655480927966, | |
| "learning_rate": 3.014825402217533e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 159221319.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.5018450184501844, | |
| "grad_norm": 0.06231852234082556, | |
| "learning_rate": 2.9910180241384014e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 159657431.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.5116851168511687, | |
| "grad_norm": 0.06403174372575768, | |
| "learning_rate": 2.9673121170557396e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 160091184.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.5215252152521526, | |
| "grad_norm": 0.06050506427522611, | |
| "learning_rate": 2.9437086397097996e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 160538104.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.5313653136531364, | |
| "grad_norm": 0.05914580967848918, | |
| "learning_rate": 2.92020854669826e-06, | |
| "loss": 0.0151, | |
| "num_tokens": 160984800.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.5412054120541203, | |
| "grad_norm": 0.06615551474859403, | |
| "learning_rate": 2.896812788437615e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 161437908.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.5510455104551046, | |
| "grad_norm": 0.05688142632929498, | |
| "learning_rate": 2.8735223111247402e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 161900209.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.5608856088560885, | |
| "grad_norm": 0.05805719882416427, | |
| "learning_rate": 2.850338056698621e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 162381378.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.570725707257073, | |
| "grad_norm": 0.05665394777981862, | |
| "learning_rate": 2.827260962802263e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 162818401.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.5805658056580567, | |
| "grad_norm": 0.058540688861597474, | |
| "learning_rate": 2.804291962744768e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 163261663.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.5904059040590406, | |
| "grad_norm": 0.06068364561780823, | |
| "learning_rate": 2.7814319854635875e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 163706510.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.6002460024600245, | |
| "grad_norm": 0.0593859542792967, | |
| "learning_rate": 2.758681955486955e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 164145145.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.6100861008610083, | |
| "grad_norm": 0.059439587082302694, | |
| "learning_rate": 2.736042792896495e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 164588218.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.6199261992619927, | |
| "grad_norm": 0.06426940128348262, | |
| "learning_rate": 2.7135154132900133e-06, | |
| "loss": 0.0203, | |
| "num_tokens": 165039642.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.6297662976629765, | |
| "grad_norm": 0.059031373381084176, | |
| "learning_rate": 2.691100727744458e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 165502439.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.639606396063961, | |
| "grad_norm": 0.05706397506461239, | |
| "learning_rate": 2.668799642779093e-06, | |
| "loss": 0.0106, | |
| "num_tokens": 165957611.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.6494464944649447, | |
| "grad_norm": 0.06337690848780857, | |
| "learning_rate": 2.6466130603188157e-06, | |
| "loss": 0.01, | |
| "num_tokens": 166404741.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.6592865928659286, | |
| "grad_norm": 0.057865704503962175, | |
| "learning_rate": 2.624541877657685e-06, | |
| "loss": 0.1951, | |
| "num_tokens": 166908892.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.6691266912669125, | |
| "grad_norm": 0.6748913551790232, | |
| "learning_rate": 2.602586987422643e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 167346017.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.678966789667897, | |
| "grad_norm": 0.06271310429727074, | |
| "learning_rate": 2.580749277537399e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 167795779.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.6888068880688807, | |
| "grad_norm": 0.05728241738284472, | |
| "learning_rate": 2.5590296311865294e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 168246613.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.6986469864698646, | |
| "grad_norm": 0.05730319671770116, | |
| "learning_rate": 2.537428926779758e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 168703193.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.708487084870849, | |
| "grad_norm": 0.061789009881383514, | |
| "learning_rate": 2.515948037916423e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 169166239.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.7183271832718328, | |
| "grad_norm": 0.05958784070544453, | |
| "learning_rate": 2.494587833350153e-06, | |
| "loss": 0.0564, | |
| "num_tokens": 169618415.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.7281672816728166, | |
| "grad_norm": 0.22039415728368103, | |
| "learning_rate": 2.473349176953736e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 170079318.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.7380073800738005, | |
| "grad_norm": 0.05930397129828618, | |
| "learning_rate": 2.4522329276841664e-06, | |
| "loss": 0.0198, | |
| "num_tokens": 170524571.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.747847478474785, | |
| "grad_norm": 0.06047568038440854, | |
| "learning_rate": 2.431239939547921e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 170983016.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.7576875768757687, | |
| "grad_norm": 0.061680315681806853, | |
| "learning_rate": 2.4103710615664145e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 171426486.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.767527675276753, | |
| "grad_norm": 0.05588539351574886, | |
| "learning_rate": 2.389627137741662e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 171871834.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.777367773677737, | |
| "grad_norm": 0.061780123368904795, | |
| "learning_rate": 2.369009007022146e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 172337523.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.787207872078721, | |
| "grad_norm": 0.05632561272908436, | |
| "learning_rate": 2.3485175032688865e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 172775826.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.7970479704797047, | |
| "grad_norm": 0.058782272770165275, | |
| "learning_rate": 2.328153455221717e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 173234709.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.8068880688806885, | |
| "grad_norm": 0.057526356469471435, | |
| "learning_rate": 2.3079176864657673e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 173700055.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.816728167281673, | |
| "grad_norm": 0.06609619441495819, | |
| "learning_rate": 2.2878110153981565e-06, | |
| "loss": 0.0111, | |
| "num_tokens": 174147961.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.8265682656826567, | |
| "grad_norm": 0.06703233332357492, | |
| "learning_rate": 2.267834255194894e-06, | |
| "loss": 0.0116, | |
| "num_tokens": 174586991.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.836408364083641, | |
| "grad_norm": 0.06522848493729735, | |
| "learning_rate": 2.2479882137779903e-06, | |
| "loss": 0.0106, | |
| "num_tokens": 175006875.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.846248462484625, | |
| "grad_norm": 0.06306752932488521, | |
| "learning_rate": 2.228273693782784e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 175451007.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.856088560885609, | |
| "grad_norm": 0.062263756072231294, | |
| "learning_rate": 2.208691492525481e-06, | |
| "loss": 0.0135, | |
| "num_tokens": 175896902.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.8659286592865927, | |
| "grad_norm": 0.06835430681220003, | |
| "learning_rate": 2.189242401970908e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 176346616.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.875768757687577, | |
| "grad_norm": 0.05728313379563115, | |
| "learning_rate": 2.169927208700482e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 176802124.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.885608856088561, | |
| "grad_norm": 0.06299115193931754, | |
| "learning_rate": 2.1507466938804013e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 177233961.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.8954489544895448, | |
| "grad_norm": 0.060076198285498296, | |
| "learning_rate": 2.131701633230045e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 177684662.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.905289052890529, | |
| "grad_norm": 0.06517531508961912, | |
| "learning_rate": 2.112792796990616e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 178123825.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.915129151291513, | |
| "grad_norm": 0.05863263973572925, | |
| "learning_rate": 2.0940209498939732e-06, | |
| "loss": 0.009, | |
| "num_tokens": 178562641.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.924969249692497, | |
| "grad_norm": 0.05798991563312477, | |
| "learning_rate": 2.075386851131711e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 179007017.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.9348093480934807, | |
| "grad_norm": 0.06118488260559937, | |
| "learning_rate": 2.056891254324459e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 179449125.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.944649446494465, | |
| "grad_norm": 0.06403534407994695, | |
| "learning_rate": 2.038534907491396e-06, | |
| "loss": 0.009, | |
| "num_tokens": 179887646.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.954489544895449, | |
| "grad_norm": 0.08058699039926022, | |
| "learning_rate": 2.0203185530199983e-06, | |
| "loss": 0.0138, | |
| "num_tokens": 180341944.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.9643296432964332, | |
| "grad_norm": 0.056026267406971995, | |
| "learning_rate": 2.0022429276360256e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 180787775.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.974169741697417, | |
| "grad_norm": 0.058787256460149456, | |
| "learning_rate": 1.9843087623737097e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 181276015.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.984009840098401, | |
| "grad_norm": 0.054638072869340186, | |
| "learning_rate": 1.966516782546199e-06, | |
| "loss": 0.009, | |
| "num_tokens": 181724759.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.993849938499385, | |
| "grad_norm": 0.05931097745374889, | |
| "learning_rate": 1.94886770771623e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 182165821.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.06697953375930626, | |
| "learning_rate": 1.931362251667008e-06, | |
| "loss": 0.027, | |
| "num_tokens": 182364260.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.1028980016708374, | |
| "eval_num_tokens": 182364260.0, | |
| "eval_runtime": 53.8919, | |
| "eval_samples_per_second": 41.676, | |
| "eval_steps_per_second": 5.214, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 4.009840098400984, | |
| "grad_norm": 0.07632643003764507, | |
| "learning_rate": 1.9140011223733576e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 182806025.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 4.019680196801968, | |
| "grad_norm": 0.05437436276939388, | |
| "learning_rate": 1.8967850219730799e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 183278654.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.029520295202952, | |
| "grad_norm": 0.05114318878211908, | |
| "learning_rate": 1.8797146467385604e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 183720645.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 4.039360393603936, | |
| "grad_norm": 0.053465044974803935, | |
| "learning_rate": 1.8627906870486063e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 184191637.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 4.04920049200492, | |
| "grad_norm": 0.054542981072468875, | |
| "learning_rate": 1.8460138273605265e-06, | |
| "loss": 0.008, | |
| "num_tokens": 184634141.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 4.059040590405904, | |
| "grad_norm": 0.052414283521576004, | |
| "learning_rate": 1.8293847461824538e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 185081741.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 4.068880688806888, | |
| "grad_norm": 0.05289967674124652, | |
| "learning_rate": 1.8129041160458966e-06, | |
| "loss": 0.008, | |
| "num_tokens": 185495440.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.078720787207872, | |
| "grad_norm": 0.0584668942852983, | |
| "learning_rate": 1.7965726034785466e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 185938291.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 4.088560885608856, | |
| "grad_norm": 0.05897150659800833, | |
| "learning_rate": 1.780390868977318e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 186409542.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 4.0984009840098405, | |
| "grad_norm": 0.05118034680985974, | |
| "learning_rate": 1.7643595669816378e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 186852482.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 4.108241082410824, | |
| "grad_norm": 0.05911903344070817, | |
| "learning_rate": 1.7484793458469745e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 187306570.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 4.118081180811808, | |
| "grad_norm": 0.058617479568280846, | |
| "learning_rate": 1.7327508478186216e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 187738802.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.127921279212792, | |
| "grad_norm": 0.05743950460862962, | |
| "learning_rate": 1.7171747090057201e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 188188275.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 4.137761377613776, | |
| "grad_norm": 0.0578427653677817, | |
| "learning_rate": 1.7017515593555295e-06, | |
| "loss": 0.008, | |
| "num_tokens": 188626310.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 4.14760147601476, | |
| "grad_norm": 0.055381917249045204, | |
| "learning_rate": 1.6864820226279607e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 189058824.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 4.157441574415744, | |
| "grad_norm": 0.0566904301682134, | |
| "learning_rate": 1.6713667163703348e-06, | |
| "loss": 0.008, | |
| "num_tokens": 189488025.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 4.167281672816729, | |
| "grad_norm": 0.0591657691393218, | |
| "learning_rate": 1.6564062518924202e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 189949176.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.177121771217712, | |
| "grad_norm": 0.058609260537066755, | |
| "learning_rate": 1.6416012342417056e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 190405187.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 4.186961869618696, | |
| "grad_norm": 0.05376660491247955, | |
| "learning_rate": 1.6269522621789246e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 190839466.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 4.19680196801968, | |
| "grad_norm": 0.062048025442225076, | |
| "learning_rate": 1.6124599281538452e-06, | |
| "loss": 0.02, | |
| "num_tokens": 191280153.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 4.206642066420664, | |
| "grad_norm": 0.06071173185238267, | |
| "learning_rate": 1.5981248182813136e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 191734314.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 4.216482164821648, | |
| "grad_norm": 0.05301725414979279, | |
| "learning_rate": 1.583947512317537e-06, | |
| "loss": 0.0117, | |
| "num_tokens": 192202492.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.226322263222632, | |
| "grad_norm": 0.06832062526218917, | |
| "learning_rate": 1.5699285836366488e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 192667915.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 4.236162361623617, | |
| "grad_norm": 0.05748762603533909, | |
| "learning_rate": 1.5560685992075141e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 193136794.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.2460024600246005, | |
| "grad_norm": 0.0737572203685775, | |
| "learning_rate": 1.5423681195707997e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 193598491.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 4.255842558425584, | |
| "grad_norm": 0.05225082250599676, | |
| "learning_rate": 1.528827698816306e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 194023980.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 4.265682656826568, | |
| "grad_norm": 0.05296466266803098, | |
| "learning_rate": 1.515447884560556e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 194481167.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.275522755227552, | |
| "grad_norm": 0.05336380722303185, | |
| "learning_rate": 1.502229217924649e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 194915312.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 4.285362853628536, | |
| "grad_norm": 0.05458180686808586, | |
| "learning_rate": 1.489172233512376e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 195368266.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 4.29520295202952, | |
| "grad_norm": 0.05542603913086383, | |
| "learning_rate": 1.4762774593885986e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 195810914.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 4.305043050430505, | |
| "grad_norm": 0.054344537083576325, | |
| "learning_rate": 1.4635454170578917e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 196263940.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 4.3148831488314885, | |
| "grad_norm": 0.052701156778993646, | |
| "learning_rate": 1.4509766214434535e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 196718774.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.324723247232472, | |
| "grad_norm": 0.05423178707270067, | |
| "learning_rate": 1.4385715808662787e-06, | |
| "loss": 0.008, | |
| "num_tokens": 197161519.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 4.334563345633456, | |
| "grad_norm": 0.055354896441224044, | |
| "learning_rate": 1.4263307970246027e-06, | |
| "loss": 0.008, | |
| "num_tokens": 197621081.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 4.34440344403444, | |
| "grad_norm": 0.05816305513011695, | |
| "learning_rate": 1.41425476497361e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 198087857.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 4.354243542435424, | |
| "grad_norm": 0.05127845466920968, | |
| "learning_rate": 1.4023439731054112e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 198533672.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 4.364083640836409, | |
| "grad_norm": 0.07067731738580797, | |
| "learning_rate": 1.390598903129296e-06, | |
| "loss": 0.0322, | |
| "num_tokens": 199022227.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 4.373923739237393, | |
| "grad_norm": 0.05511218194004341, | |
| "learning_rate": 1.3790200300522413e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 199462215.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 4.3837638376383765, | |
| "grad_norm": 0.05735730379081794, | |
| "learning_rate": 1.3676078221597157e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 199907231.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 4.39360393603936, | |
| "grad_norm": 0.05442936039834661, | |
| "learning_rate": 1.3563627409967257e-06, | |
| "loss": 0.1955, | |
| "num_tokens": 200376904.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 4.403444034440344, | |
| "grad_norm": 0.5930661652942222, | |
| "learning_rate": 1.3452852413491563e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 200853967.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 4.413284132841328, | |
| "grad_norm": 0.05077867679984549, | |
| "learning_rate": 1.3343757712253804e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 201323621.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.423124231242312, | |
| "grad_norm": 0.058807424527887606, | |
| "learning_rate": 1.3236347718381338e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 201753687.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 4.432964329643297, | |
| "grad_norm": 0.06001374322910319, | |
| "learning_rate": 1.3130626775866743e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 202203799.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 4.442804428044281, | |
| "grad_norm": 0.06273437087252197, | |
| "learning_rate": 1.3026599160392173e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 202627243.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 4.4526445264452645, | |
| "grad_norm": 0.06372618537836224, | |
| "learning_rate": 1.292426907915634e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 203077433.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 4.462484624846248, | |
| "grad_norm": 0.057948321757535656, | |
| "learning_rate": 1.2823640670704443e-06, | |
| "loss": 0.0229, | |
| "num_tokens": 203532517.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.472324723247232, | |
| "grad_norm": 0.06607138604150303, | |
| "learning_rate": 1.2724718004760794e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 203967752.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 4.482164821648216, | |
| "grad_norm": 0.05725783304801458, | |
| "learning_rate": 1.2627505082064144e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 204424349.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 4.492004920049201, | |
| "grad_norm": 0.055427831791831646, | |
| "learning_rate": 1.2532005834205976e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 204846138.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 4.501845018450185, | |
| "grad_norm": 0.05460191637217484, | |
| "learning_rate": 1.2438224123471442e-06, | |
| "loss": 0.0192, | |
| "num_tokens": 205306730.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 4.511685116851169, | |
| "grad_norm": 0.06279438477449967, | |
| "learning_rate": 1.2346163742683185e-06, | |
| "loss": 0.0117, | |
| "num_tokens": 205759609.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.521525215252153, | |
| "grad_norm": 0.05702285396092694, | |
| "learning_rate": 1.2255828415047932e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 206171295.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 4.531365313653136, | |
| "grad_norm": 0.054521558454890394, | |
| "learning_rate": 1.216722179400592e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 206639148.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 4.54120541205412, | |
| "grad_norm": 0.05168283263697403, | |
| "learning_rate": 1.208034746308315e-06, | |
| "loss": 0.0068, | |
| "num_tokens": 207094260.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 4.551045510455104, | |
| "grad_norm": 0.05161429329359664, | |
| "learning_rate": 1.1995208935746437e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 207533375.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 4.560885608856088, | |
| "grad_norm": 0.058514508257411606, | |
| "learning_rate": 1.1911809655261333e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 207969517.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.570725707257073, | |
| "grad_norm": 0.056665893017668854, | |
| "learning_rate": 1.1830152994552866e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 208408117.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 4.580565805658057, | |
| "grad_norm": 0.056163462620316754, | |
| "learning_rate": 1.175024225606912e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 208879227.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 4.590405904059041, | |
| "grad_norm": 0.05409385523794747, | |
| "learning_rate": 1.1672080671647695e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 209325103.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 4.6002460024600245, | |
| "grad_norm": 0.05629255243399504, | |
| "learning_rate": 1.1595671402384966e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 209791894.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 4.610086100861008, | |
| "grad_norm": 0.051104203707396316, | |
| "learning_rate": 1.152101753850828e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 210254182.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.619926199261993, | |
| "grad_norm": 0.05229454749737629, | |
| "learning_rate": 1.1448122099250946e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 210702900.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 4.629766297662977, | |
| "grad_norm": 0.060177504722208404, | |
| "learning_rate": 1.1376988032730135e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 211151465.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 4.639606396063961, | |
| "grad_norm": 0.05182456184289124, | |
| "learning_rate": 1.130761821582766e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 211619464.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 4.649446494464945, | |
| "grad_norm": 0.05574225668849545, | |
| "learning_rate": 1.1240015454073622e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 212064266.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 4.659286592865929, | |
| "grad_norm": 0.06359820975154429, | |
| "learning_rate": 1.1174182481532943e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 212499724.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.6691266912669125, | |
| "grad_norm": 0.05622656000305094, | |
| "learning_rate": 1.1110121960694773e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 212945879.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 4.678966789667896, | |
| "grad_norm": 0.06093763072714235, | |
| "learning_rate": 1.104783648236486e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 213379787.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 4.68880688806888, | |
| "grad_norm": 0.0543614373855231, | |
| "learning_rate": 1.0987328565560711e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 213824263.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 4.698646986469865, | |
| "grad_norm": 0.056905167227697236, | |
| "learning_rate": 1.0928600657409751e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 214265208.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 4.708487084870849, | |
| "grad_norm": 0.057351833542733925, | |
| "learning_rate": 1.0871655133050372e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 214744301.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.718327183271833, | |
| "grad_norm": 0.29349816338215157, | |
| "learning_rate": 1.081649429553581e-06, | |
| "loss": 0.0553, | |
| "num_tokens": 215194355.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 4.728167281672817, | |
| "grad_norm": 0.051057953015104116, | |
| "learning_rate": 1.076312037574106e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 215632060.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 4.7380073800738005, | |
| "grad_norm": 0.056594540815463674, | |
| "learning_rate": 1.0711535532272632e-06, | |
| "loss": 0.0235, | |
| "num_tokens": 216097276.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 4.747847478474784, | |
| "grad_norm": 0.068871190152495, | |
| "learning_rate": 1.0661741851381256e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 216544463.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 4.757687576875769, | |
| "grad_norm": 0.05907548729697175, | |
| "learning_rate": 1.0613741346877498e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 216972058.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.767527675276753, | |
| "grad_norm": 0.055592377746762095, | |
| "learning_rate": 1.056753596005032e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 217401900.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 4.777367773677737, | |
| "grad_norm": 0.05562394957573223, | |
| "learning_rate": 1.0523127559588579e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 217845453.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 4.787207872078721, | |
| "grad_norm": 0.05258367575789477, | |
| "learning_rate": 1.0480517941505428e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 218272871.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 4.797047970479705, | |
| "grad_norm": 0.05390618674507445, | |
| "learning_rate": 1.0439708829065708e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 218732597.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 4.8068880688806885, | |
| "grad_norm": 0.06946151381547928, | |
| "learning_rate": 1.0400701872716227e-06, | |
| "loss": 0.0223, | |
| "num_tokens": 219194340.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.816728167281672, | |
| "grad_norm": 0.05582170906207444, | |
| "learning_rate": 1.0363498650019023e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 219673692.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 4.826568265682657, | |
| "grad_norm": 0.05244987983803676, | |
| "learning_rate": 1.0328100665587573e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 220118246.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.836408364083641, | |
| "grad_norm": 0.055024340070040305, | |
| "learning_rate": 1.029450935102592e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 220555806.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 4.846248462484625, | |
| "grad_norm": 0.05338628090134423, | |
| "learning_rate": 1.0262726064870801e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 220997187.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 4.856088560885609, | |
| "grad_norm": 0.058254094197714025, | |
| "learning_rate": 1.0232752092536666e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 221434681.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.865928659286593, | |
| "grad_norm": 0.05261616134189719, | |
| "learning_rate": 1.0204588646263731e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 221884850.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 4.875768757687577, | |
| "grad_norm": 0.052167915998619634, | |
| "learning_rate": 1.0178236865068933e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 222333225.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 4.885608856088561, | |
| "grad_norm": 0.06187153122740552, | |
| "learning_rate": 1.0153697814699858e-06, | |
| "loss": 0.0106, | |
| "num_tokens": 222774591.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 4.895448954489545, | |
| "grad_norm": 0.054905669170180534, | |
| "learning_rate": 1.0130972487591658e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 223227943.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.905289052890529, | |
| "grad_norm": 0.06206228565326619, | |
| "learning_rate": 1.0110061802826889e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 223680989.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.915129151291513, | |
| "grad_norm": 0.05437071230251554, | |
| "learning_rate": 1.009096660609837e-06, | |
| "loss": 0.1789, | |
| "num_tokens": 224171724.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.924969249692497, | |
| "grad_norm": 0.12358300885271949, | |
| "learning_rate": 1.0073687669674949e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 224621243.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.934809348093481, | |
| "grad_norm": 0.05743551551374671, | |
| "learning_rate": 1.0058225692370299e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 225053570.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.944649446494465, | |
| "grad_norm": 0.05705289715957623, | |
| "learning_rate": 1.0044581299514638e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 225475922.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.9544895448954485, | |
| "grad_norm": 0.052608564457681, | |
| "learning_rate": 1.003275504292944e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 225944888.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.964329643296433, | |
| "grad_norm": 0.05546452983023311, | |
| "learning_rate": 1.0022747400905126e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 226384045.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 4.974169741697417, | |
| "grad_norm": 0.05754539826487939, | |
| "learning_rate": 1.0014558778181714e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 226815343.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 4.984009840098401, | |
| "grad_norm": 0.05456913560891108, | |
| "learning_rate": 1.0008189505932444e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 227286168.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 4.993849938499385, | |
| "grad_norm": 0.053799541560384294, | |
| "learning_rate": 1.0003639841750404e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 227746824.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.07884368824115337, | |
| "learning_rate": 1.0000909969638097e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 227957450.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.11205815523862839, | |
| "eval_num_tokens": 227957450.0, | |
| "eval_runtime": 53.843, | |
| "eval_samples_per_second": 41.714, | |
| "eval_steps_per_second": 5.219, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 510, | |
| "total_flos": 7.689061516716278e+17, | |
| "train_loss": 0.0504409685922677, | |
| "train_runtime": 7612.3259, | |
| "train_samples_per_second": 8.537, | |
| "train_steps_per_second": 0.067 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 510, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.689061516716278e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |