{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 510,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00984009840098401,
      "grad_norm": 4.0806973528031865,
      "learning_rate": 0.0,
      "loss": 1.3287,
      "num_tokens": 423926.0,
      "step": 1
    },
    {
      "epoch": 0.01968019680196802,
      "grad_norm": 3.67747034614091,
      "learning_rate": 6.25e-07,
      "loss": 1.2135,
      "num_tokens": 900553.0,
      "step": 2
    },
    {
      "epoch": 0.02952029520295203,
      "grad_norm": 3.8171114533122408,
      "learning_rate": 1.25e-06,
      "loss": 1.2489,
      "num_tokens": 1354101.0,
      "step": 3
    },
    {
      "epoch": 0.03936039360393604,
      "grad_norm": 3.601160453205568,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.2057,
      "num_tokens": 1831336.0,
      "step": 4
    },
    {
      "epoch": 0.04920049200492005,
      "grad_norm": 3.5064220048415957,
      "learning_rate": 2.5e-06,
      "loss": 1.2032,
      "num_tokens": 2299763.0,
      "step": 5
    },
    {
      "epoch": 0.05904059040590406,
      "grad_norm": 3.1555410715542345,
      "learning_rate": 3.125e-06,
      "loss": 1.2117,
      "num_tokens": 2739130.0,
      "step": 6
    },
    {
      "epoch": 0.06888068880688807,
      "grad_norm": 2.268607117088524,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.0872,
      "num_tokens": 3186646.0,
      "step": 7
    },
    {
      "epoch": 0.07872078720787208,
      "grad_norm": 2.2099777833656136,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.0835,
      "num_tokens": 3629799.0,
      "step": 8
    },
    {
      "epoch": 0.08856088560885608,
      "grad_norm": 1.8443437689000939,
      "learning_rate": 5e-06,
      "loss": 0.8645,
      "num_tokens": 4083162.0,
      "step": 9
    },
    {
      "epoch": 0.0984009840098401,
      "grad_norm": 1.9131458467328875,
      "learning_rate": 5.625e-06,
      "loss": 0.884,
      "num_tokens": 4514571.0,
      "step": 10
    },
    {
      "epoch": 0.10824108241082411,
      "grad_norm": 1.6995323396674717,
      "learning_rate": 6.25e-06,
      "loss": 0.7743,
      "num_tokens": 4990034.0,
      "step": 11
    },
    {
      "epoch": 0.11808118081180811,
      "grad_norm": 1.9772572417592447,
      "learning_rate": 6.875e-06,
      "loss": 0.53,
      "num_tokens": 5446252.0,
      "step": 12
    },
    {
      "epoch": 0.12792127921279212,
      "grad_norm": 2.235467875434991,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.4443,
      "num_tokens": 5880398.0,
      "step": 13
    },
    {
      "epoch": 0.13776137761377613,
      "grad_norm": 1.779888369557262,
      "learning_rate": 8.125000000000001e-06,
      "loss": 0.3535,
      "num_tokens": 6317527.0,
      "step": 14
    },
    {
      "epoch": 0.14760147601476015,
      "grad_norm": 1.4174646561758966,
      "learning_rate": 8.750000000000001e-06,
      "loss": 0.257,
      "num_tokens": 6768988.0,
      "step": 15
    },
    {
      "epoch": 0.15744157441574416,
      "grad_norm": 0.825210223865691,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.1351,
      "num_tokens": 7212525.0,
      "step": 16
    },
    {
      "epoch": 0.16728167281672818,
      "grad_norm": 0.5272789553691977,
      "learning_rate": 1e-05,
      "loss": 0.1092,
      "num_tokens": 7660418.0,
      "step": 17
    },
    {
      "epoch": 0.17712177121771217,
      "grad_norm": 0.4895013664227461,
      "learning_rate": 9.999909003036192e-06,
      "loss": 0.3213,
      "num_tokens": 8107247.0,
      "step": 18
    },
    {
      "epoch": 0.18696186961869618,
      "grad_norm": 0.2368744364092584,
      "learning_rate": 9.99963601582496e-06,
      "loss": 0.0892,
      "num_tokens": 8547828.0,
      "step": 19
    },
    {
      "epoch": 0.1968019680196802,
      "grad_norm": 0.17827168672763916,
      "learning_rate": 9.999181049406756e-06,
      "loss": 0.0814,
      "num_tokens": 8995945.0,
      "step": 20
    },
    {
      "epoch": 0.2066420664206642,
      "grad_norm": 0.1790643674004139,
      "learning_rate": 9.998544122181829e-06,
      "loss": 0.07,
      "num_tokens": 9428292.0,
      "step": 21
    },
    {
      "epoch": 0.21648216482164823,
      "grad_norm": 0.17378868386224117,
      "learning_rate": 9.997725259909487e-06,
      "loss": 0.0822,
      "num_tokens": 9872391.0,
      "step": 22
    },
    {
      "epoch": 0.22632226322263221,
      "grad_norm": 0.16172795763425857,
      "learning_rate": 9.996724495707056e-06,
      "loss": 0.0695,
      "num_tokens": 10321265.0,
      "step": 23
    },
    {
      "epoch": 0.23616236162361623,
      "grad_norm": 0.15374263186536802,
      "learning_rate": 9.995541870048537e-06,
      "loss": 0.0699,
      "num_tokens": 10756941.0,
      "step": 24
    },
    {
      "epoch": 0.24600246002460024,
      "grad_norm": 0.13904547469173764,
      "learning_rate": 9.994177430762971e-06,
      "loss": 0.0654,
      "num_tokens": 11211102.0,
      "step": 25
    },
    {
      "epoch": 0.25584255842558423,
      "grad_norm": 0.1313831102836863,
      "learning_rate": 9.992631233032507e-06,
      "loss": 0.0575,
      "num_tokens": 11655921.0,
      "step": 26
    },
    {
      "epoch": 0.2656826568265683,
      "grad_norm": 0.12358184868008147,
      "learning_rate": 9.990903339390164e-06,
      "loss": 0.0654,
      "num_tokens": 12123406.0,
      "step": 27
    },
    {
      "epoch": 0.27552275522755226,
      "grad_norm": 0.13102947055348427,
      "learning_rate": 9.988993819717312e-06,
      "loss": 0.0576,
      "num_tokens": 12571100.0,
      "step": 28
    },
    {
      "epoch": 0.2853628536285363,
      "grad_norm": 0.12370667408412227,
      "learning_rate": 9.986902751240836e-06,
      "loss": 0.0672,
      "num_tokens": 13004426.0,
      "step": 29
    },
    {
      "epoch": 0.2952029520295203,
      "grad_norm": 0.11805846922575773,
      "learning_rate": 9.984630218530014e-06,
      "loss": 0.0653,
      "num_tokens": 13453098.0,
      "step": 30
    },
    {
      "epoch": 0.3050430504305043,
      "grad_norm": 0.12596387207866033,
      "learning_rate": 9.982176313493108e-06,
      "loss": 0.0531,
      "num_tokens": 13910634.0,
      "step": 31
    },
    {
      "epoch": 0.3148831488314883,
      "grad_norm": 0.10677675254365898,
      "learning_rate": 9.979541135373628e-06,
      "loss": 0.0422,
      "num_tokens": 14359546.0,
      "step": 32
    },
    {
      "epoch": 0.3247232472324723,
      "grad_norm": 0.11973442758561412,
      "learning_rate": 9.976724790746333e-06,
      "loss": 0.0681,
      "num_tokens": 14810794.0,
      "step": 33
    },
    {
      "epoch": 0.33456334563345635,
      "grad_norm": 0.09765768004768276,
      "learning_rate": 9.973727393512921e-06,
      "loss": 0.0513,
      "num_tokens": 15282306.0,
      "step": 34
    },
    {
      "epoch": 0.34440344403444034,
      "grad_norm": 0.09901017562060792,
      "learning_rate": 9.970549064897407e-06,
      "loss": 0.0461,
      "num_tokens": 15719140.0,
      "step": 35
    },
    {
      "epoch": 0.35424354243542433,
      "grad_norm": 0.08726438112480077,
      "learning_rate": 9.967189933441243e-06,
      "loss": 0.0546,
      "num_tokens": 16160794.0,
      "step": 36
    },
    {
      "epoch": 0.3640836408364084,
      "grad_norm": 0.0862429330782484,
      "learning_rate": 9.9636501349981e-06,
      "loss": 0.0482,
      "num_tokens": 16594103.0,
      "step": 37
    },
    {
      "epoch": 0.37392373923739236,
      "grad_norm": 0.08703197270042312,
      "learning_rate": 9.95992981272838e-06,
      "loss": 0.0431,
      "num_tokens": 17050487.0,
      "step": 38
    },
    {
      "epoch": 0.3837638376383764,
      "grad_norm": 0.09107427503262026,
      "learning_rate": 9.956029117093432e-06,
      "loss": 0.0437,
      "num_tokens": 17495118.0,
      "step": 39
    },
    {
      "epoch": 0.3936039360393604,
      "grad_norm": 0.0937444613029256,
      "learning_rate": 9.951948205849457e-06,
      "loss": 0.0454,
      "num_tokens": 17970094.0,
      "step": 40
    },
    {
      "epoch": 0.4034440344403444,
      "grad_norm": 0.08311104466178655,
      "learning_rate": 9.947687244041143e-06,
      "loss": 0.035,
      "num_tokens": 18410339.0,
      "step": 41
    },
    {
      "epoch": 0.4132841328413284,
      "grad_norm": 0.08793054938548499,
      "learning_rate": 9.943246403994969e-06,
      "loss": 0.0398,
      "num_tokens": 18827350.0,
      "step": 42
    },
    {
      "epoch": 0.4231242312423124,
      "grad_norm": 0.0870730212694011,
      "learning_rate": 9.938625865312252e-06,
      "loss": 0.0399,
      "num_tokens": 19263988.0,
      "step": 43
    },
    {
      "epoch": 0.43296432964329645,
      "grad_norm": 0.09378264156348792,
      "learning_rate": 9.933825814861877e-06,
      "loss": 0.0417,
      "num_tokens": 19720928.0,
      "step": 44
    },
    {
      "epoch": 0.44280442804428044,
      "grad_norm": 0.09291535955010499,
      "learning_rate": 9.928846446772737e-06,
      "loss": 0.0503,
      "num_tokens": 20158462.0,
      "step": 45
    },
    {
      "epoch": 0.45264452644526443,
      "grad_norm": 0.09075012323845133,
      "learning_rate": 9.923687962425895e-06,
      "loss": 0.0383,
      "num_tokens": 20601362.0,
      "step": 46
    },
    {
      "epoch": 0.46248462484624847,
      "grad_norm": 0.08004209717985471,
      "learning_rate": 9.91835057044642e-06,
      "loss": 0.037,
      "num_tokens": 21072330.0,
      "step": 47
    },
    {
      "epoch": 0.47232472324723246,
      "grad_norm": 0.08493198697322972,
      "learning_rate": 9.912834486694963e-06,
      "loss": 0.0391,
      "num_tokens": 21525288.0,
      "step": 48
    },
    {
      "epoch": 0.4821648216482165,
      "grad_norm": 0.09568253792719325,
      "learning_rate": 9.907139934259025e-06,
      "loss": 0.0343,
      "num_tokens": 21953640.0,
      "step": 49
    },
    {
      "epoch": 0.4920049200492005,
      "grad_norm": 0.08303424847503821,
      "learning_rate": 9.90126714344393e-06,
      "loss": 0.0376,
      "num_tokens": 22435895.0,
      "step": 50
    },
    {
      "epoch": 0.5018450184501845,
      "grad_norm": 0.08402276200759969,
      "learning_rate": 9.895216351763515e-06,
      "loss": 0.0329,
      "num_tokens": 22883079.0,
      "step": 51
    },
    {
      "epoch": 0.5116851168511685,
      "grad_norm": 0.08739650107232434,
      "learning_rate": 9.888987803930523e-06,
      "loss": 0.0434,
      "num_tokens": 23334165.0,
      "step": 52
    },
    {
      "epoch": 0.5215252152521526,
      "grad_norm": 0.08000069499125344,
      "learning_rate": 9.882581751846707e-06,
      "loss": 0.0292,
      "num_tokens": 23797014.0,
      "step": 53
    },
    {
      "epoch": 0.5313653136531366,
      "grad_norm": 0.08649372481980258,
      "learning_rate": 9.87599845459264e-06,
      "loss": 0.0347,
      "num_tokens": 24288663.0,
      "step": 54
    },
    {
      "epoch": 0.5412054120541205,
      "grad_norm": 0.08523000136251592,
      "learning_rate": 9.869238178417235e-06,
      "loss": 0.0341,
      "num_tokens": 24710186.0,
      "step": 55
    },
    {
      "epoch": 0.5510455104551045,
      "grad_norm": 0.07465459840939084,
      "learning_rate": 9.862301196726988e-06,
      "loss": 0.0304,
      "num_tokens": 25160056.0,
      "step": 56
    },
    {
      "epoch": 0.5608856088560885,
      "grad_norm": 0.08503608654454574,
      "learning_rate": 9.855187790074906e-06,
      "loss": 0.0396,
      "num_tokens": 25598946.0,
      "step": 57
    },
    {
      "epoch": 0.5707257072570726,
      "grad_norm": 0.09884072621149297,
      "learning_rate": 9.847898246149173e-06,
      "loss": 0.0379,
      "num_tokens": 26066191.0,
      "step": 58
    },
    {
      "epoch": 0.5805658056580566,
      "grad_norm": 0.08138600445295054,
      "learning_rate": 9.840432859761504e-06,
      "loss": 0.0346,
      "num_tokens": 26518390.0,
      "step": 59
    },
    {
      "epoch": 0.5904059040590406,
      "grad_norm": 0.0914956119948303,
      "learning_rate": 9.832791932835232e-06,
      "loss": 0.0338,
      "num_tokens": 26971703.0,
      "step": 60
    },
    {
      "epoch": 0.6002460024600246,
      "grad_norm": 0.07667618947866506,
      "learning_rate": 9.824975774393089e-06,
      "loss": 0.0311,
      "num_tokens": 27413893.0,
      "step": 61
    },
    {
      "epoch": 0.6100861008610086,
      "grad_norm": 0.07239904062409902,
      "learning_rate": 9.816984700544714e-06,
      "loss": 0.0281,
      "num_tokens": 27868849.0,
      "step": 62
    },
    {
      "epoch": 0.6199261992619927,
      "grad_norm": 0.07354472191772266,
      "learning_rate": 9.808819034473869e-06,
      "loss": 0.0286,
      "num_tokens": 28286253.0,
      "step": 63
    },
    {
      "epoch": 0.6297662976629766,
      "grad_norm": 0.07644175488051343,
      "learning_rate": 9.800479106425356e-06,
      "loss": 0.0259,
      "num_tokens": 28699755.0,
      "step": 64
    },
    {
      "epoch": 0.6396063960639606,
      "grad_norm": 0.07705355988067097,
      "learning_rate": 9.791965253691687e-06,
      "loss": 0.0273,
      "num_tokens": 29142779.0,
      "step": 65
    },
    {
      "epoch": 0.6494464944649446,
      "grad_norm": 0.09001368693559736,
      "learning_rate": 9.783277820599408e-06,
      "loss": 0.0324,
      "num_tokens": 29564256.0,
      "step": 66
    },
    {
      "epoch": 0.6592865928659286,
      "grad_norm": 0.07998998288422189,
      "learning_rate": 9.774417158495208e-06,
      "loss": 0.0311,
      "num_tokens": 30035831.0,
      "step": 67
    },
    {
      "epoch": 0.6691266912669127,
      "grad_norm": 0.08094973090726079,
      "learning_rate": 9.765383625731683e-06,
      "loss": 0.0306,
      "num_tokens": 30471581.0,
      "step": 68
    },
    {
      "epoch": 0.6789667896678967,
      "grad_norm": 0.07575991095164336,
      "learning_rate": 9.756177587652857e-06,
      "loss": 0.0259,
      "num_tokens": 30914761.0,
      "step": 69
    },
    {
      "epoch": 0.6888068880688807,
      "grad_norm": 0.08255507372333148,
      "learning_rate": 9.746799416579403e-06,
      "loss": 0.0288,
      "num_tokens": 31351700.0,
      "step": 70
    },
    {
      "epoch": 0.6986469864698647,
      "grad_norm": 0.07383179182825,
      "learning_rate": 9.737249491793587e-06,
      "loss": 0.0322,
      "num_tokens": 31832339.0,
      "step": 71
    },
    {
      "epoch": 0.7084870848708487,
      "grad_norm": 0.0779807473775348,
      "learning_rate": 9.727528199523923e-06,
      "loss": 0.0384,
      "num_tokens": 32297687.0,
      "step": 72
    },
    {
      "epoch": 0.7183271832718328,
      "grad_norm": 0.9900117683369877,
      "learning_rate": 9.717635932929556e-06,
      "loss": 0.2712,
      "num_tokens": 32795919.0,
      "step": 73
    },
    {
      "epoch": 0.7281672816728167,
      "grad_norm": 0.0991868351288853,
      "learning_rate": 9.707573092084368e-06,
      "loss": 0.0419,
      "num_tokens": 33250913.0,
      "step": 74
    },
    {
      "epoch": 0.7380073800738007,
      "grad_norm": 0.08222641622667337,
      "learning_rate": 9.697340083960785e-06,
      "loss": 0.0289,
      "num_tokens": 33700970.0,
      "step": 75
    },
    {
      "epoch": 0.7478474784747847,
      "grad_norm": 0.0807953175546777,
      "learning_rate": 9.686937322413325e-06,
      "loss": 0.0286,
      "num_tokens": 34160528.0,
      "step": 76
    },
    {
      "epoch": 0.7576875768757687,
      "grad_norm": 0.07804479191069146,
      "learning_rate": 9.676365228161869e-06,
      "loss": 0.0288,
      "num_tokens": 34612256.0,
      "step": 77
    },
    {
      "epoch": 0.7675276752767528,
      "grad_norm": 0.0766842054326196,
      "learning_rate": 9.66562422877462e-06,
      "loss": 0.0232,
      "num_tokens": 35040279.0,
      "step": 78
    },
    {
      "epoch": 0.7773677736777368,
      "grad_norm": 0.0891517560198986,
      "learning_rate": 9.654714758650844e-06,
      "loss": 0.0297,
      "num_tokens": 35479921.0,
      "step": 79
    },
    {
      "epoch": 0.7872078720787208,
      "grad_norm": 0.07866162600023792,
      "learning_rate": 9.643637259003276e-06,
      "loss": 0.0246,
      "num_tokens": 35916545.0,
      "step": 80
    },
    {
      "epoch": 0.7970479704797048,
      "grad_norm": 0.09304350719642641,
      "learning_rate": 9.632392177840286e-06,
      "loss": 0.034,
      "num_tokens": 36366132.0,
      "step": 81
    },
    {
      "epoch": 0.8068880688806888,
      "grad_norm": 0.07700162361294739,
      "learning_rate": 9.620979969947759e-06,
      "loss": 0.0253,
      "num_tokens": 36826531.0,
      "step": 82
    },
    {
      "epoch": 0.8167281672816729,
      "grad_norm": 0.07988889172787667,
      "learning_rate": 9.609401096870707e-06,
      "loss": 0.0218,
      "num_tokens": 37250803.0,
      "step": 83
    },
    {
      "epoch": 0.8265682656826568,
      "grad_norm": 0.07662932628344148,
      "learning_rate": 9.597656026894591e-06,
      "loss": 0.0246,
      "num_tokens": 37707551.0,
      "step": 84
    },
    {
      "epoch": 0.8364083640836408,
      "grad_norm": 0.08311607561866212,
      "learning_rate": 9.585745235026391e-06,
      "loss": 0.0288,
      "num_tokens": 38157805.0,
      "step": 85
    },
    {
      "epoch": 0.8462484624846248,
      "grad_norm": 0.08297888399480316,
      "learning_rate": 9.5736692029754e-06,
      "loss": 0.04,
      "num_tokens": 38632172.0,
      "step": 86
    },
    {
      "epoch": 0.8560885608856088,
      "grad_norm": 0.07156880458651095,
      "learning_rate": 9.561428419133723e-06,
      "loss": 0.0244,
      "num_tokens": 39106655.0,
      "step": 87
    },
    {
      "epoch": 0.8659286592865929,
      "grad_norm": 0.07908760460903244,
      "learning_rate": 9.549023378556548e-06,
      "loss": 0.0283,
      "num_tokens": 39539107.0,
      "step": 88
    },
    {
      "epoch": 0.8757687576875769,
      "grad_norm": 0.07754860550749794,
      "learning_rate": 9.53645458294211e-06,
      "loss": 0.0267,
      "num_tokens": 39991573.0,
      "step": 89
    },
    {
      "epoch": 0.8856088560885609,
      "grad_norm": 0.07382048908783427,
      "learning_rate": 9.523722540611403e-06,
      "loss": 0.0302,
      "num_tokens": 40424055.0,
      "step": 90
    },
    {
      "epoch": 0.8954489544895449,
      "grad_norm": 0.07273833859487783,
      "learning_rate": 9.510827766487625e-06,
      "loss": 0.0256,
      "num_tokens": 40896577.0,
      "step": 91
    },
    {
      "epoch": 0.9052890528905289,
      "grad_norm": 0.07964369978108742,
      "learning_rate": 9.497770782075353e-06,
      "loss": 0.035,
      "num_tokens": 41366216.0,
      "step": 92
    },
    {
      "epoch": 0.915129151291513,
      "grad_norm": 0.07401188987280478,
      "learning_rate": 9.484552115439445e-06,
      "loss": 0.0245,
      "num_tokens": 41809244.0,
      "step": 93
    },
    {
      "epoch": 0.9249692496924969,
      "grad_norm": 0.07840443321039077,
      "learning_rate": 9.471172301183695e-06,
      "loss": 0.0289,
      "num_tokens": 42248913.0,
      "step": 94
    },
    {
      "epoch": 0.9348093480934809,
      "grad_norm": 0.08325366129705902,
      "learning_rate": 9.4576318804292e-06,
      "loss": 0.0231,
      "num_tokens": 42692031.0,
      "step": 95
    },
    {
      "epoch": 0.9446494464944649,
      "grad_norm": 0.07523171897596297,
      "learning_rate": 9.443931400792486e-06,
      "loss": 0.0231,
      "num_tokens": 43152827.0,
      "step": 96
    },
    {
      "epoch": 0.9544895448954489,
      "grad_norm": 0.08414955879961411,
      "learning_rate": 9.430071416363352e-06,
      "loss": 0.0361,
      "num_tokens": 43589003.0,
      "step": 97
    },
    {
      "epoch": 0.964329643296433,
      "grad_norm": 0.07780017154939398,
      "learning_rate": 9.416052487682465e-06,
      "loss": 0.0325,
      "num_tokens": 44039888.0,
      "step": 98
    },
    {
      "epoch": 0.974169741697417,
      "grad_norm": 0.07812625211660923,
      "learning_rate": 9.401875181718686e-06,
      "loss": 0.0269,
      "num_tokens": 44477755.0,
      "step": 99
    },
    {
      "epoch": 0.984009840098401,
      "grad_norm": 0.08986184340836745,
      "learning_rate": 9.387540071846155e-06,
      "loss": 0.028,
      "num_tokens": 44942597.0,
      "step": 100
    },
    {
      "epoch": 0.993849938499385,
      "grad_norm": 0.07602588052869161,
      "learning_rate": 9.373047737821078e-06,
      "loss": 0.0263,
      "num_tokens": 45381474.0,
      "step": 101
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.07602588052869161,
      "learning_rate": 9.358398765758296e-06,
      "loss": 0.0233,
      "num_tokens": 45594452.0,
      "step": 102
    },
    {
      "epoch": 1.009840098400984,
      "grad_norm": 0.10983513675872132,
      "learning_rate": 9.34359374810758e-06,
      "loss": 0.0203,
      "num_tokens": 46030876.0,
      "step": 103
    },
    {
      "epoch": 1.019680196801968,
      "grad_norm": 0.06573014575776571,
      "learning_rate": 9.328633283629666e-06,
      "loss": 0.0227,
      "num_tokens": 46486828.0,
      "step": 104
    },
    {
      "epoch": 1.029520295202952,
      "grad_norm": 0.08034715377363097,
      "learning_rate": 9.31351797737204e-06,
      "loss": 0.0318,
      "num_tokens": 46952208.0,
      "step": 105
    },
    {
      "epoch": 1.039360393603936,
      "grad_norm": 0.07429105498387009,
      "learning_rate": 9.29824844064447e-06,
      "loss": 0.0303,
      "num_tokens": 47410999.0,
      "step": 106
    },
    {
      "epoch": 1.04920049200492,
      "grad_norm": 0.08198831738294611,
      "learning_rate": 9.282825290994282e-06,
      "loss": 0.0214,
      "num_tokens": 47859777.0,
      "step": 107
    },
    {
      "epoch": 1.0590405904059041,
      "grad_norm": 0.08882279229683457,
      "learning_rate": 9.267249152181379e-06,
      "loss": 0.0348,
      "num_tokens": 48313998.0,
      "step": 108
    },
    {
      "epoch": 1.068880688806888,
      "grad_norm": 0.07979621896563821,
      "learning_rate": 9.251520654153028e-06,
      "loss": 0.0217,
      "num_tokens": 48768989.0,
      "step": 109
    },
    {
      "epoch": 1.0787207872078721,
      "grad_norm": 0.07487008128556895,
      "learning_rate": 9.235640433018363e-06,
      "loss": 0.0312,
      "num_tokens": 49228144.0,
      "step": 110
    },
    {
      "epoch": 1.088560885608856,
      "grad_norm": 0.07679612898308323,
      "learning_rate": 9.219609131022684e-06,
      "loss": 0.0931,
      "num_tokens": 49660848.0,
      "step": 111
    },
    {
      "epoch": 1.09840098400984,
      "grad_norm": 0.2760363965156351,
      "learning_rate": 9.203427396521454e-06,
      "loss": 0.0199,
      "num_tokens": 50101951.0,
      "step": 112
    },
    {
      "epoch": 1.1082410824108242,
      "grad_norm": 0.0785172085157219,
      "learning_rate": 9.187095883954104e-06,
      "loss": 0.0249,
      "num_tokens": 50561884.0,
      "step": 113
    },
    {
      "epoch": 1.118081180811808,
      "grad_norm": 0.07511192184574486,
      "learning_rate": 9.170615253817547e-06,
      "loss": 0.0202,
      "num_tokens": 51023121.0,
      "step": 114
    },
    {
      "epoch": 1.1279212792127922,
      "grad_norm": 0.0788369940517159,
      "learning_rate": 9.153986172639474e-06,
      "loss": 0.0242,
      "num_tokens": 51475717.0,
      "step": 115
    },
    {
      "epoch": 1.137761377613776,
      "grad_norm": 0.08125072620630153,
      "learning_rate": 9.137209312951395e-06,
      "loss": 0.023,
      "num_tokens": 51936171.0,
      "step": 116
    },
    {
      "epoch": 1.1476014760147601,
      "grad_norm": 0.07976122735928637,
      "learning_rate": 9.12028535326144e-06,
      "loss": 0.0243,
      "num_tokens": 52386546.0,
      "step": 117
    },
    {
      "epoch": 1.1574415744157442,
      "grad_norm": 0.08346012008591713,
      "learning_rate": 9.103214978026922e-06,
      "loss": 0.0325,
      "num_tokens": 52836033.0,
      "step": 118
    },
    {
      "epoch": 1.1672816728167281,
      "grad_norm": 0.11906435467388006,
      "learning_rate": 9.085998877626644e-06,
      "loss": 0.0322,
      "num_tokens": 53268007.0,
      "step": 119
    },
    {
      "epoch": 1.1771217712177122,
      "grad_norm": 0.07249411043595039,
      "learning_rate": 9.068637748332993e-06,
      "loss": 0.022,
      "num_tokens": 53704420.0,
      "step": 120
    },
    {
      "epoch": 1.186961869618696,
      "grad_norm": 0.07720155000392087,
      "learning_rate": 9.051132292283772e-06,
      "loss": 0.0175,
      "num_tokens": 54139993.0,
      "step": 121
    },
    {
      "epoch": 1.1968019680196802,
      "grad_norm": 0.07940921390702492,
      "learning_rate": 9.033483217453801e-06,
      "loss": 0.028,
      "num_tokens": 54570992.0,
      "step": 122
    },
    {
      "epoch": 1.2066420664206643,
      "grad_norm": 0.07451157010744122,
      "learning_rate": 9.015691237626292e-06,
      "loss": 0.0372,
      "num_tokens": 55010129.0,
      "step": 123
    },
    {
      "epoch": 1.2164821648216482,
      "grad_norm": 0.0847625901422177,
      "learning_rate": 8.997757072363976e-06,
      "loss": 0.026,
      "num_tokens": 55484006.0,
      "step": 124
    },
    {
      "epoch": 1.2263222632226323,
      "grad_norm": 0.0734447382667372,
      "learning_rate": 8.979681446980002e-06,
      "loss": 0.0187,
      "num_tokens": 55916003.0,
      "step": 125
    },
    {
      "epoch": 1.2361623616236161,
      "grad_norm": 0.07720210069754488,
      "learning_rate": 8.961465092508607e-06,
      "loss": 0.0204,
      "num_tokens": 56367051.0,
      "step": 126
    },
    {
      "epoch": 1.2460024600246002,
      "grad_norm": 0.07005770488907263,
      "learning_rate": 8.943108745675542e-06,
      "loss": 0.0191,
      "num_tokens": 56809919.0,
      "step": 127
    },
    {
      "epoch": 1.2558425584255843,
      "grad_norm": 0.070753857700872,
      "learning_rate": 8.92461314886829e-06,
      "loss": 0.0166,
      "num_tokens": 57239307.0,
      "step": 128
    },
    {
      "epoch": 1.2656826568265682,
      "grad_norm": 0.07412229123559273,
      "learning_rate": 8.905979050106029e-06,
      "loss": 0.0182,
      "num_tokens": 57659139.0,
      "step": 129
    },
    {
      "epoch": 1.2755227552275523,
      "grad_norm": 0.07409420522842904,
      "learning_rate": 8.887207203009385e-06,
      "loss": 0.0182,
      "num_tokens": 58106309.0,
      "step": 130
    },
    {
      "epoch": 1.2853628536285364,
      "grad_norm": 0.07792555021983492,
      "learning_rate": 8.868298366769956e-06,
      "loss": 0.02,
      "num_tokens": 58533301.0,
      "step": 131
    },
    {
      "epoch": 1.2952029520295203,
      "grad_norm": 0.07755261181847745,
      "learning_rate": 8.849253306119601e-06,
      "loss": 0.018,
      "num_tokens": 58966567.0,
      "step": 132
    },
    {
      "epoch": 1.3050430504305042,
      "grad_norm": 0.07236047683074463,
      "learning_rate": 8.83007279129952e-06,
      "loss": 0.0231,
      "num_tokens": 59420421.0,
      "step": 133
    },
    {
      "epoch": 1.3148831488314883,
      "grad_norm": 0.07698814451544558,
      "learning_rate": 8.810757598029094e-06,
      "loss": 0.0171,
      "num_tokens": 59841646.0,
      "step": 134
    },
    {
      "epoch": 1.3247232472324724,
      "grad_norm": 0.07814813953683186,
      "learning_rate": 8.79130850747452e-06,
      "loss": 0.02,
      "num_tokens": 60262792.0,
      "step": 135
    },
    {
      "epoch": 1.3345633456334562,
      "grad_norm": 0.06873838482423762,
      "learning_rate": 8.771726306217217e-06,
      "loss": 0.0185,
      "num_tokens": 60732922.0,
      "step": 136
    },
    {
      "epoch": 1.3444034440344403,
      "grad_norm": 0.09687913725758614,
      "learning_rate": 8.752011786222011e-06,
      "loss": 0.0176,
      "num_tokens": 61205715.0,
      "step": 137
    },
    {
      "epoch": 1.3542435424354244,
      "grad_norm": 0.07245054518431464,
      "learning_rate": 8.732165744805107e-06,
      "loss": 0.0235,
      "num_tokens": 61643580.0,
      "step": 138
    },
    {
      "epoch": 1.3640836408364083,
      "grad_norm": 0.0751731684247356,
      "learning_rate": 8.712188984601845e-06,
      "loss": 0.0204,
      "num_tokens": 62086664.0,
      "step": 139
    },
    {
      "epoch": 1.3739237392373924,
      "grad_norm": 0.0765957432328115,
      "learning_rate": 8.692082313534233e-06,
      "loss": 0.0246,
      "num_tokens": 62556237.0,
      "step": 140
    },
    {
      "epoch": 1.3837638376383765,
      "grad_norm": 0.07942748233411417,
      "learning_rate": 8.671846544778284e-06,
      "loss": 0.0262,
      "num_tokens": 63014753.0,
      "step": 141
    },
    {
      "epoch": 1.3936039360393604,
      "grad_norm": 0.07220660807543852,
      "learning_rate": 8.651482496731116e-06,
      "loss": 0.0206,
      "num_tokens": 63462646.0,
      "step": 142
    },
    {
      "epoch": 1.4034440344403443,
      "grad_norm": 0.07123468502943975,
      "learning_rate": 8.630990992977854e-06,
      "loss": 0.0171,
      "num_tokens": 63953362.0,
      "step": 143
    },
    {
      "epoch": 1.4132841328413284,
      "grad_norm": 0.06845259344800432,
      "learning_rate": 8.61037286225834e-06,
      "loss": 0.0176,
      "num_tokens": 64407104.0,
      "step": 144
    },
    {
      "epoch": 1.4231242312423125,
      "grad_norm": 0.0712728854892911,
      "learning_rate": 8.589628938433587e-06,
      "loss": 0.0198,
      "num_tokens": 64843926.0,
      "step": 145
    },
    {
      "epoch": 1.4329643296432963,
      "grad_norm": 0.07477061506965738,
      "learning_rate": 8.56876006045208e-06,
      "loss": 0.0186,
      "num_tokens": 65296311.0,
      "step": 146
    },
    {
      "epoch": 1.4428044280442804,
      "grad_norm": 0.06777146676082023,
      "learning_rate": 8.547767072315835e-06,
      "loss": 0.019,
      "num_tokens": 65756092.0,
      "step": 147
    },
    {
      "epoch": 1.4526445264452645,
      "grad_norm": 0.07114975862038111,
      "learning_rate": 8.526650823046266e-06,
      "loss": 0.0178,
      "num_tokens": 66217555.0,
      "step": 148
    },
    {
      "epoch": 1.4624846248462484,
      "grad_norm": 0.07525940320393967,
      "learning_rate": 8.505412166649847e-06,
      "loss": 0.0179,
      "num_tokens": 66685889.0,
      "step": 149
    },
    {
      "epoch": 1.4723247232472325,
      "grad_norm": 0.06408400112895582,
      "learning_rate": 8.484051962083579e-06,
      "loss": 0.0218,
      "num_tokens": 67133723.0,
      "step": 150
    },
    {
      "epoch": 1.4821648216482166,
      "grad_norm": 0.28687790559136056,
      "learning_rate": 8.462571073220243e-06,
      "loss": 0.0384,
      "num_tokens": 67578744.0,
      "step": 151
    },
    {
      "epoch": 1.4920049200492005,
      "grad_norm": 0.0793310962273527,
      "learning_rate": 8.44097036881347e-06,
      "loss": 0.0229,
      "num_tokens": 68014169.0,
      "step": 152
    },
    {
      "epoch": 1.5018450184501844,
      "grad_norm": 0.06714316191575523,
      "learning_rate": 8.419250722462603e-06,
      "loss": 0.0171,
      "num_tokens": 68463185.0,
      "step": 153
    },
    {
      "epoch": 1.5116851168511685,
      "grad_norm": 0.07317824154830675,
      "learning_rate": 8.39741301257736e-06,
      "loss": 0.0168,
      "num_tokens": 68915214.0,
      "step": 154
    },
    {
      "epoch": 1.5215252152521526,
      "grad_norm": 0.07498605748005185,
      "learning_rate": 8.375458122342317e-06,
      "loss": 0.0218,
      "num_tokens": 69372682.0,
      "step": 155
    },
    {
      "epoch": 1.5313653136531364,
      "grad_norm": 0.07930974741179314,
      "learning_rate": 8.353386939681186e-06,
      "loss": 0.0197,
      "num_tokens": 69856887.0,
      "step": 156
    },
    {
      "epoch": 1.5412054120541205,
      "grad_norm": 0.06557036848250264,
      "learning_rate": 8.331200357220908e-06,
      "loss": 0.0169,
      "num_tokens": 70281087.0,
      "step": 157
    },
    {
      "epoch": 1.5510455104551046,
      "grad_norm": 0.06911145883255942,
      "learning_rate": 8.308899272255542e-06,
      "loss": 0.0258,
      "num_tokens": 70703636.0,
      "step": 158
    },
    {
      "epoch": 1.5608856088560885,
      "grad_norm": 0.07694897194945108,
      "learning_rate": 8.286484586709989e-06,
      "loss": 0.0195,
      "num_tokens": 71151769.0,
      "step": 159
    },
    {
      "epoch": 1.5707257072570726,
      "grad_norm": 0.07509068702973927,
      "learning_rate": 8.263957207103506e-06,
      "loss": 0.0163,
      "num_tokens": 71608512.0,
      "step": 160
    },
    {
      "epoch": 1.5805658056580567,
      "grad_norm": 0.06708625688834685,
      "learning_rate": 8.241318044513046e-06,
      "loss": 0.0182,
      "num_tokens": 72064241.0,
      "step": 161
    },
    {
      "epoch": 1.5904059040590406,
      "grad_norm": 0.07409751654155339,
      "learning_rate": 8.218568014536414e-06,
      "loss": 0.0214,
      "num_tokens": 72529560.0,
      "step": 162
    },
    {
      "epoch": 1.6002460024600245,
      "grad_norm": 0.08398773553611014,
      "learning_rate": 8.195708037255233e-06,
      "loss": 0.0219,
      "num_tokens": 72983545.0,
      "step": 163
    },
    {
      "epoch": 1.6100861008610086,
      "grad_norm": 0.0924163975250339,
      "learning_rate": 8.172739037197739e-06,
      "loss": 0.0199,
      "num_tokens": 73420020.0,
      "step": 164
    },
    {
      "epoch": 1.6199261992619927,
      "grad_norm": 0.07341361509627292,
      "learning_rate": 8.149661943301382e-06,
      "loss": 0.0193,
      "num_tokens": 73867089.0,
      "step": 165
    },
    {
      "epoch": 1.6297662976629765,
      "grad_norm": 0.09734659605562107,
      "learning_rate": 8.126477688875262e-06,
      "loss": 0.0393,
      "num_tokens": 74337960.0,
      "step": 166
    },
    {
      "epoch": 1.6396063960639606,
      "grad_norm": 0.0653922306331562,
      "learning_rate": 8.103187211562386e-06,
      "loss": 0.0169,
      "num_tokens": 74779558.0,
      "step": 167
    },
    {
      "epoch": 1.6494464944649447,
      "grad_norm": 0.07532255669592061,
      "learning_rate": 8.079791453301742e-06,
      "loss": 0.0181,
      "num_tokens": 75249832.0,
      "step": 168
    },
    {
      "epoch": 1.6592865928659286,
      "grad_norm": 0.06592599083206258,
      "learning_rate": 8.056291360290202e-06,
      "loss": 0.0189,
      "num_tokens": 75680939.0,
      "step": 169
    },
    {
      "epoch": 1.6691266912669127,
      "grad_norm": 0.07466148209100053,
      "learning_rate": 8.032687882944264e-06,
      "loss": 0.0197,
      "num_tokens": 76131450.0,
      "step": 170
    },
    {
      "epoch": 1.6789667896678968,
      "grad_norm": 0.06887573727890169,
      "learning_rate": 8.0089819758616e-06,
      "loss": 0.0192,
      "num_tokens": 76593750.0,
      "step": 171
    },
    {
      "epoch": 1.6888068880688807,
      "grad_norm": 0.06693500545746488,
      "learning_rate": 7.985174597782469e-06,
      "loss": 0.0184,
      "num_tokens": 77050866.0,
      "step": 172
    },
    {
      "epoch": 1.6986469864698646,
      "grad_norm": 0.0743814094089007,
      "learning_rate": 7.961266711550922e-06,
      "loss": 0.018,
      "num_tokens": 77464839.0,
      "step": 173
    },
    {
      "epoch": 1.7084870848708487,
      "grad_norm": 0.07491525806289706,
      "learning_rate": 7.937259284075872e-06,
      "loss": 0.0242,
      "num_tokens": 77916642.0,
      "step": 174
    },
    {
      "epoch": 1.7183271832718328,
      "grad_norm": 0.06409329354600994,
      "learning_rate": 7.913153286291995e-06,
      "loss": 0.0159,
      "num_tokens": 78389781.0,
      "step": 175
    },
    {
      "epoch": 1.7281672816728166,
      "grad_norm": 0.07234553574283793,
      "learning_rate": 7.888949693120443e-06,
      "loss": 0.0185,
      "num_tokens": 78840356.0,
      "step": 176
    },
    {
      "epoch": 1.7380073800738007,
      "grad_norm": 0.06775336526687877,
      "learning_rate": 7.864649483429442e-06,
      "loss": 0.0222,
      "num_tokens": 79280780.0,
      "step": 177
    },
    {
      "epoch": 1.7478474784747848,
      "grad_norm": 0.06875670670455086,
      "learning_rate": 7.840253639994676e-06,
      "loss": 0.2282,
      "num_tokens": 79776798.0,
      "step": 178
    },
    {
      "epoch": 1.7576875768757687,
      "grad_norm": 0.6287911227006427,
      "learning_rate": 7.815763149459563e-06,
      "loss": 0.0278,
      "num_tokens": 80233063.0,
      "step": 179
    },
    {
      "epoch": 1.7675276752767528,
      "grad_norm": 0.0846354199885941,
      "learning_rate": 7.791179002295334e-06,
      "loss": 0.0176,
      "num_tokens": 80677429.0,
      "step": 180
    },
    {
      "epoch": 1.777367773677737,
      "grad_norm": 0.07041805151413434,
      "learning_rate": 7.766502192760995e-06,
      "loss": 0.019,
      "num_tokens": 81122406.0,
      "step": 181
    },
    {
      "epoch": 1.7872078720787208,
      "grad_norm": 0.0643818791121022,
      "learning_rate": 7.741733718863096e-06,
      "loss": 0.021,
      "num_tokens": 81570725.0,
      "step": 182
    },
    {
      "epoch": 1.7970479704797047,
      "grad_norm": 0.09698204542146324,
      "learning_rate": 7.71687458231538e-06,
      "loss": 0.0164,
      "num_tokens": 82009994.0,
      "step": 183
    },
    {
      "epoch": 1.8068880688806888,
      "grad_norm": 0.06854732624004896,
      "learning_rate": 7.69192578849827e-06,
      "loss": 0.0166,
      "num_tokens": 82455315.0,
      "step": 184
    },
    {
      "epoch": 1.8167281672816729,
      "grad_norm": 0.08363256711677616,
      "learning_rate": 7.666888346418205e-06,
      "loss": 0.026,
      "num_tokens": 82901866.0,
      "step": 185
    },
    {
      "epoch": 1.8265682656826567,
      "grad_norm": 0.07243825921308177,
      "learning_rate": 7.641763268666832e-06,
      "loss": 0.019,
      "num_tokens": 83346981.0,
      "step": 186
    },
    {
      "epoch": 1.8364083640836408,
      "grad_norm": 0.06930096159612371,
      "learning_rate": 7.616551571380061e-06,
      "loss": 0.0214,
      "num_tokens": 83806206.0,
      "step": 187
    },
    {
      "epoch": 1.846248462484625,
      "grad_norm": 0.0681854243495615,
      "learning_rate": 7.5912542741969585e-06,
      "loss": 0.0163,
      "num_tokens": 84274704.0,
      "step": 188
    },
    {
      "epoch": 1.8560885608856088,
      "grad_norm": 0.07242016185622592,
      "learning_rate": 7.5658724002185215e-06,
      "loss": 0.0176,
      "num_tokens": 84711534.0,
      "step": 189
    },
    {
      "epoch": 1.865928659286593,
      "grad_norm": 0.06579105822967504,
      "learning_rate": 7.54040697596629e-06,
      "loss": 0.0283,
      "num_tokens": 85160170.0,
      "step": 190
    },
    {
      "epoch": 1.875768757687577,
      "grad_norm": 0.07108784887648237,
      "learning_rate": 7.514859031340835e-06,
      "loss": 0.0396,
      "num_tokens": 85613402.0,
      "step": 191
    },
    {
      "epoch": 1.8856088560885609,
      "grad_norm": 0.1388795021837048,
      "learning_rate": 7.489229599580111e-06,
      "loss": 0.0246,
      "num_tokens": 86052381.0,
      "step": 192
    },
    {
      "epoch": 1.8954489544895448,
      "grad_norm": 0.07205397955257539,
      "learning_rate": 7.463519717217663e-06,
      "loss": 0.0168,
      "num_tokens": 86528412.0,
      "step": 193
    },
    {
      "epoch": 1.9052890528905289,
      "grad_norm": 0.06405950191637912,
      "learning_rate": 7.437730424040702e-06,
      "loss": 0.0172,
      "num_tokens": 86961573.0,
      "step": 194
    },
    {
      "epoch": 1.915129151291513,
      "grad_norm": 0.06777861356344914,
      "learning_rate": 7.411862763048068e-06,
      "loss": 0.2543,
      "num_tokens": 87398765.0,
      "step": 195
    },
    {
      "epoch": 1.9249692496924968,
      "grad_norm": 0.6838887307240747,
      "learning_rate": 7.38591778040803e-06,
      "loss": 0.0173,
      "num_tokens": 87844520.0,
      "step": 196
    },
    {
      "epoch": 1.934809348093481,
      "grad_norm": 0.0776240582840387,
      "learning_rate": 7.359896525415986e-06,
      "loss": 0.017,
      "num_tokens": 88283922.0,
      "step": 197
    },
    {
      "epoch": 1.944649446494465,
      "grad_norm": 0.07000567204619491,
      "learning_rate": 7.333800050452024e-06,
      "loss": 0.017,
      "num_tokens": 88751425.0,
      "step": 198
    },
    {
      "epoch": 1.954489544895449,
      "grad_norm": 0.06845270762016023,
      "learning_rate": 7.307629410938364e-06,
      "loss": 0.0156,
      "num_tokens": 89205645.0,
      "step": 199
    },
    {
      "epoch": 1.964329643296433,
      "grad_norm": 0.06368141241550528,
      "learning_rate": 7.281385665296663e-06,
      "loss": 0.0241,
      "num_tokens": 89660304.0,
      "step": 200
    },
    {
      "epoch": 1.974169741697417,
      "grad_norm": 0.07288122003316398,
      "learning_rate": 7.255069874905221e-06,
      "loss": 0.0177,
      "num_tokens": 90107418.0,
      "step": 201
    },
    {
      "epoch": 1.984009840098401,
      "grad_norm": 0.06287805035179808,
      "learning_rate": 7.228683104056051e-06,
      "loss": 0.0162,
      "num_tokens": 90539742.0,
      "step": 202
    },
    {
      "epoch": 1.9938499384993849,
      "grad_norm": 0.0689550244527601,
      "learning_rate": 7.202226419911832e-06,
      "loss": 0.0158,
      "num_tokens": 90980046.0,
      "step": 203
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.08227420282080453,
      "learning_rate": 7.175700892462757e-06,
      "loss": 0.0146,
      "num_tokens": 91185077.0,
      "step": 204
    },
    {
      "epoch": 2.009840098400984,
      "grad_norm": 0.07497487377417848,
      "learning_rate": 7.149107594483251e-06,
      "loss": 0.0179,
      "num_tokens": 91614882.0,
      "step": 205
    },
    {
      "epoch": 2.019680196801968,
      "grad_norm": 0.06696426999273204,
      "learning_rate": 7.122447601488592e-06,
      "loss": 0.0138,
      "num_tokens": 92080175.0,
      "step": 206
    },
    {
      "epoch": 2.029520295202952,
      "grad_norm": 0.6914326541667255,
      "learning_rate": 7.095721991691411e-06,
      "loss": 0.0906,
      "num_tokens": 92537610.0,
      "step": 207
    },
    {
      "epoch": 2.039360393603936,
      "grad_norm": 0.07698410304489973,
      "learning_rate": 7.0689318459580845e-06,
      "loss": 0.014,
      "num_tokens": 93006593.0,
      "step": 208
    },
    {
      "epoch": 2.0492004920049203,
      "grad_norm": 0.06605656882870746,
      "learning_rate": 7.042078247765019e-06,
      "loss": 0.0135,
      "num_tokens": 93476972.0,
      "step": 209
    },
    {
      "epoch": 2.059040590405904,
      "grad_norm": 0.20519577755096738,
      "learning_rate": 7.015162283154843e-06,
      "loss": 0.0448,
      "num_tokens": 93923596.0,
      "step": 210
    },
    {
      "epoch": 2.068880688806888,
      "grad_norm": 0.07138995831165522,
      "learning_rate": 6.988185040692469e-06,
      "loss": 0.0147,
      "num_tokens": 94363955.0,
      "step": 211
    },
    {
      "epoch": 2.078720787207872,
      "grad_norm": 0.07243575038647761,
      "learning_rate": 6.961147611421076e-06,
      "loss": 0.0133,
      "num_tokens": 94822454.0,
      "step": 212
    },
    {
      "epoch": 2.088560885608856,
      "grad_norm": 0.06470408361373992,
      "learning_rate": 6.934051088817988e-06,
      "loss": 0.0129,
      "num_tokens": 95281989.0,
      "step": 213
    },
    {
      "epoch": 2.09840098400984,
      "grad_norm": 0.06874194166286132,
      "learning_rate": 6.906896568750441e-06,
      "loss": 0.0133,
      "num_tokens": 95718050.0,
      "step": 214
    },
    {
      "epoch": 2.108241082410824,
      "grad_norm": 0.06876988398180649,
      "learning_rate": 6.87968514943127e-06,
      "loss": 0.0131,
      "num_tokens": 96162240.0,
      "step": 215
    },
    {
      "epoch": 2.1180811808118083,
      "grad_norm": 0.06407419572232166,
      "learning_rate": 6.852417931374494e-06,
      "loss": 0.0154,
      "num_tokens": 96606411.0,
      "step": 216
    },
    {
      "epoch": 2.127921279212792,
      "grad_norm": 0.07884002350292682,
      "learning_rate": 6.825096017350807e-06,
      "loss": 0.0145,
      "num_tokens": 97056671.0,
      "step": 217
    },
    {
      "epoch": 2.137761377613776,
      "grad_norm": 0.9981711556732346,
      "learning_rate": 6.797720512342967e-06,
      "loss": 0.2386,
      "num_tokens": 97514395.0,
      "step": 218
    },
    {
      "epoch": 2.14760147601476,
      "grad_norm": 0.08409368119046862,
      "learning_rate": 6.77029252350113e-06,
      "loss": 0.0131,
      "num_tokens": 97963358.0,
      "step": 219
    },
    {
      "epoch": 2.1574415744157442,
      "grad_norm": 0.07248619207767666,
      "learning_rate": 6.742813160098054e-06,
      "loss": 0.0209,
      "num_tokens": 98418943.0,
      "step": 220
    },
    {
      "epoch": 2.167281672816728,
      "grad_norm": 0.0644855598598356,
      "learning_rate": 6.715283533484242e-06,
      "loss": 0.013,
      "num_tokens": 98877761.0,
      "step": 221
    },
    {
      "epoch": 2.177121771217712,
      "grad_norm": 0.06622877203564365,
      "learning_rate": 6.6877047570430044e-06,
      "loss": 0.0142,
      "num_tokens": 99319699.0,
      "step": 222
    },
    {
      "epoch": 2.1869618696186963,
      "grad_norm": 0.07112191399232934,
      "learning_rate": 6.660077946145412e-06,
      "loss": 0.0141,
      "num_tokens": 99741161.0,
      "step": 223
    },
    {
      "epoch": 2.19680196801968,
      "grad_norm": 0.06421873614899783,
      "learning_rate": 6.632404218105205e-06,
      "loss": 0.0125,
      "num_tokens": 100166108.0,
      "step": 224
    },
    {
      "epoch": 2.206642066420664,
      "grad_norm": 0.10033810460897824,
      "learning_rate": 6.604684692133597e-06,
      "loss": 0.0252,
      "num_tokens": 100619482.0,
      "step": 225
    },
    {
      "epoch": 2.2164821648216484,
      "grad_norm": 0.06530150422818083,
      "learning_rate": 6.576920489294011e-06,
      "loss": 0.0127,
      "num_tokens": 101046330.0,
      "step": 226
    },
    {
      "epoch": 2.2263222632226323,
      "grad_norm": 0.07185919142366148,
      "learning_rate": 6.549112732456739e-06,
      "loss": 0.016,
      "num_tokens": 101495504.0,
      "step": 227
    },
    {
      "epoch": 2.236162361623616,
      "grad_norm": 0.06818636079419191,
      "learning_rate": 6.5212625462535365e-06,
      "loss": 0.0213,
      "num_tokens": 101968574.0,
      "step": 228
    },
    {
      "epoch": 2.2460024600246005,
      "grad_norm": 0.06992643360327205,
      "learning_rate": 6.493371057032129e-06,
      "loss": 0.0129,
      "num_tokens": 102400777.0,
      "step": 229
    },
    {
      "epoch": 2.2558425584255843,
      "grad_norm": 0.07704204356207317,
      "learning_rate": 6.465439392810664e-06,
      "loss": 0.015,
      "num_tokens": 102843075.0,
      "step": 230
    },
    {
      "epoch": 2.265682656826568,
      "grad_norm": 0.06544957942318019,
      "learning_rate": 6.4374686832320944e-06,
      "loss": 0.0123,
      "num_tokens": 103261288.0,
      "step": 231
    },
    {
      "epoch": 2.275522755227552,
      "grad_norm": 0.062334728387161344,
      "learning_rate": 6.409460059518482e-06,
      "loss": 0.0129,
      "num_tokens": 103717825.0,
      "step": 232
    },
    {
      "epoch": 2.2853628536285364,
      "grad_norm": 0.06332858275476305,
      "learning_rate": 6.381414654425261e-06,
      "loss": 0.0206,
      "num_tokens": 104165764.0,
      "step": 233
    },
    {
      "epoch": 2.2952029520295203,
      "grad_norm": 0.09448999187172064,
      "learning_rate": 6.353333602195414e-06,
      "loss": 0.0124,
      "num_tokens": 104606652.0,
      "step": 234
    },
    {
      "epoch": 2.305043050430504,
      "grad_norm": 0.0668177771084121,
      "learning_rate": 6.325218038513604e-06,
      "loss": 0.0126,
      "num_tokens": 105039670.0,
      "step": 235
    },
    {
      "epoch": 2.3148831488314885,
      "grad_norm": 0.06130143066732146,
      "learning_rate": 6.2970691004602425e-06,
      "loss": 0.0128,
      "num_tokens": 105498133.0,
      "step": 236
    },
    {
      "epoch": 2.3247232472324724,
      "grad_norm": 0.06353571099299025,
      "learning_rate": 6.26888792646551e-06,
      "loss": 0.0132,
      "num_tokens": 105955337.0,
      "step": 237
    },
    {
      "epoch": 2.3345633456334562,
      "grad_norm": 0.07072744488591255,
      "learning_rate": 6.240675656263303e-06,
      "loss": 0.0206,
      "num_tokens": 106398065.0,
      "step": 238
    },
    {
      "epoch": 2.34440344403444,
      "grad_norm": 0.0701393753248028,
      "learning_rate": 6.212433430845145e-06,
      "loss": 0.0207,
      "num_tokens": 106835350.0,
      "step": 239
    },
    {
      "epoch": 2.3542435424354244,
      "grad_norm": 0.06848300507150402,
      "learning_rate": 6.184162392414044e-06,
      "loss": 0.0145,
      "num_tokens": 107296896.0,
      "step": 240
    },
    {
      "epoch": 2.3640836408364083,
      "grad_norm": 0.07142535679112567,
      "learning_rate": 6.155863684338294e-06,
      "loss": 0.0142,
      "num_tokens": 107740689.0,
      "step": 241
    },
    {
      "epoch": 2.373923739237392,
      "grad_norm": 0.061123970206678674,
      "learning_rate": 6.127538451105232e-06,
      "loss": 0.0127,
      "num_tokens": 108196343.0,
      "step": 242
    },
    {
      "epoch": 2.3837638376383765,
      "grad_norm": 0.0699480717937599,
      "learning_rate": 6.099187838274959e-06,
      "loss": 0.0136,
      "num_tokens": 108646263.0,
      "step": 243
    },
    {
      "epoch": 2.3936039360393604,
      "grad_norm": 0.06593159068712609,
      "learning_rate": 6.070812992434003e-06,
      "loss": 0.0127,
      "num_tokens": 109089367.0,
      "step": 244
    },
    {
      "epoch": 2.4034440344403443,
      "grad_norm": 0.07004342519131408,
      "learning_rate": 6.042415061148954e-06,
      "loss": 0.015,
      "num_tokens": 109534992.0,
      "step": 245
    },
    {
      "epoch": 2.4132841328413286,
      "grad_norm": 0.08012326478595262,
      "learning_rate": 6.013995192920044e-06,
      "loss": 0.0181,
      "num_tokens": 110020906.0,
      "step": 246
    },
    {
      "epoch": 2.4231242312423125,
      "grad_norm": 0.06662874441753898,
      "learning_rate": 5.985554537134702e-06,
      "loss": 0.0125,
      "num_tokens": 110456126.0,
      "step": 247
    },
    {
      "epoch": 2.4329643296432963,
      "grad_norm": 0.06935993203158428,
      "learning_rate": 5.957094244021071e-06,
      "loss": 0.0124,
      "num_tokens": 110889342.0,
      "step": 248
    },
    {
      "epoch": 2.4428044280442807,
      "grad_norm": 0.06948156606308496,
      "learning_rate": 5.928615464601497e-06,
      "loss": 0.0143,
      "num_tokens": 111329838.0,
      "step": 249
    },
    {
      "epoch": 2.4526445264452645,
      "grad_norm": 0.08174081643789106,
      "learning_rate": 5.900119350645956e-06,
      "loss": 0.0158,
      "num_tokens": 111789840.0,
      "step": 250
    },
    {
      "epoch": 2.4624846248462484,
      "grad_norm": 0.07150394939916561,
      "learning_rate": 5.871607054625497e-06,
      "loss": 0.0189,
      "num_tokens": 112240250.0,
      "step": 251
    },
    {
      "epoch": 2.4723247232472323,
      "grad_norm": 0.0795332410978433,
      "learning_rate": 5.8430797296656125e-06,
      "loss": 0.0211,
      "num_tokens": 112673933.0,
      "step": 252
    },
    {
      "epoch": 2.4821648216482166,
      "grad_norm": 0.0689891322716241,
      "learning_rate": 5.814538529499622e-06,
      "loss": 0.0148,
      "num_tokens": 113134668.0,
      "step": 253
    },
    {
      "epoch": 2.4920049200492005,
      "grad_norm": 0.07982357004600245,
      "learning_rate": 5.785984608421993e-06,
      "loss": 0.016,
      "num_tokens": 113596645.0,
      "step": 254
    },
    {
      "epoch": 2.5018450184501844,
      "grad_norm": 0.07075114056461625,
      "learning_rate": 5.757419121241667e-06,
      "loss": 0.0184,
      "num_tokens": 114038969.0,
      "step": 255
    },
    {
      "epoch": 2.5116851168511687,
      "grad_norm": 0.07123055043084275,
      "learning_rate": 5.7288432232353615e-06,
      "loss": 0.0172,
      "num_tokens": 114476716.0,
      "step": 256
    },
    {
      "epoch": 2.5215252152521526,
      "grad_norm": 0.06238876313147613,
      "learning_rate": 5.7002580701008325e-06,
      "loss": 0.0134,
      "num_tokens": 114958206.0,
      "step": 257
    },
    {
      "epoch": 2.5313653136531364,
      "grad_norm": 0.06444872364781601,
      "learning_rate": 5.6716648179101445e-06,
      "loss": 0.0122,
      "num_tokens": 115415134.0,
      "step": 258
    },
    {
      "epoch": 2.5412054120541203,
      "grad_norm": 0.06263958198396931,
      "learning_rate": 5.64306462306291e-06,
      "loss": 0.0142,
      "num_tokens": 115898609.0,
      "step": 259
    },
    {
      "epoch": 2.5510455104551046,
      "grad_norm": 0.07604741062856764,
      "learning_rate": 5.614458642239534e-06,
      "loss": 0.0121,
      "num_tokens": 116348874.0,
      "step": 260
    },
    {
      "epoch": 2.5608856088560885,
      "grad_norm": 0.07535603766213132,
      "learning_rate": 5.585848032354411e-06,
      "loss": 0.0187,
      "num_tokens": 116822359.0,
      "step": 261
    },
    {
      "epoch": 2.570725707257073,
      "grad_norm": 0.07043618175110689,
      "learning_rate": 5.557233950509159e-06,
      "loss": 0.0332,
      "num_tokens": 117246814.0,
      "step": 262
    },
    {
      "epoch": 2.5805658056580567,
      "grad_norm": 0.20105726012087,
      "learning_rate": 5.528617553945807e-06,
      "loss": 0.0128,
      "num_tokens": 117708546.0,
      "step": 263
    },
    {
      "epoch": 2.5904059040590406,
      "grad_norm": 0.07781337694485964,
      "learning_rate": 5.500000000000001e-06,
      "loss": 0.0138,
      "num_tokens": 118145045.0,
      "step": 264
    },
    {
      "epoch": 2.6002460024600245,
      "grad_norm": 0.06125312067867148,
      "learning_rate": 5.4713824460541964e-06,
      "loss": 0.0122,
      "num_tokens": 118586755.0,
      "step": 265
    },
    {
      "epoch": 2.6100861008610083,
      "grad_norm": 0.06778243090705331,
      "learning_rate": 5.442766049490843e-06,
      "loss": 0.0242,
      "num_tokens": 119025525.0,
      "step": 266
    },
    {
      "epoch": 2.6199261992619927,
      "grad_norm": 0.06153415408443227,
      "learning_rate": 5.414151967645591e-06,
      "loss": 0.0125,
      "num_tokens": 119479378.0,
      "step": 267
    },
    {
      "epoch": 2.6297662976629765,
      "grad_norm": 0.06942223993965259,
      "learning_rate": 5.385541357760469e-06,
      "loss": 0.0243,
      "num_tokens": 119937308.0,
      "step": 268
    },
    {
      "epoch": 2.639606396063961,
      "grad_norm": 0.06214738625923835,
      "learning_rate": 5.35693537693709e-06,
      "loss": 0.0131,
      "num_tokens": 120395822.0,
      "step": 269
    },
    {
      "epoch": 2.6494464944649447,
      "grad_norm": 0.0669104820126879,
      "learning_rate": 5.3283351820898586e-06,
      "loss": 0.013,
      "num_tokens": 120868183.0,
      "step": 270
    },
    {
      "epoch": 2.6592865928659286,
      "grad_norm": 0.0647734711741491,
      "learning_rate": 5.299741929899171e-06,
      "loss": 0.0123,
      "num_tokens": 121321350.0,
      "step": 271
    },
    {
      "epoch": 2.6691266912669125,
      "grad_norm": 0.07530586770614496,
      "learning_rate": 5.27115677676464e-06,
      "loss": 0.0148,
      "num_tokens": 121767518.0,
      "step": 272
    },
    {
      "epoch": 2.678966789667897,
      "grad_norm": 0.060785296202459226,
      "learning_rate": 5.242580878758334e-06,
      "loss": 0.0113,
      "num_tokens": 122214099.0,
      "step": 273
    },
    {
      "epoch": 2.6888068880688807,
      "grad_norm": 0.06051260316507515,
      "learning_rate": 5.21401539157801e-06,
      "loss": 0.0118,
      "num_tokens": 122641818.0,
      "step": 274
    },
    {
      "epoch": 2.6986469864698646,
      "grad_norm": 0.06697146315594861,
      "learning_rate": 5.1854614705003796e-06,
      "loss": 0.0129,
      "num_tokens": 123073540.0,
      "step": 275
    },
    {
      "epoch": 2.708487084870849,
      "grad_norm": 0.06445807540084068,
      "learning_rate": 5.156920270334389e-06,
      "loss": 0.011,
      "num_tokens": 123534092.0,
      "step": 276
    },
    {
      "epoch": 2.7183271832718328,
      "grad_norm": 0.06525833653570826,
      "learning_rate": 5.1283929453745055e-06,
      "loss": 0.0134,
      "num_tokens": 123958744.0,
      "step": 277
    },
    {
      "epoch": 2.7281672816728166,
      "grad_norm": 0.05852868843858968,
      "learning_rate": 5.099880649354044e-06,
      "loss": 0.0111,
      "num_tokens": 124403705.0,
      "step": 278
    },
    {
      "epoch": 2.7380073800738005,
      "grad_norm": 0.07241672197036282,
      "learning_rate": 5.071384535398505e-06,
      "loss": 0.0127,
      "num_tokens": 124889693.0,
      "step": 279
    },
    {
      "epoch": 2.747847478474785,
      "grad_norm": 0.06559889162939202,
      "learning_rate": 5.04290575597893e-06,
      "loss": 0.0141,
      "num_tokens": 125351837.0,
      "step": 280
    },
    {
      "epoch": 2.7576875768757687,
      "grad_norm": 0.06353700618816872,
      "learning_rate": 5.0144454628653015e-06,
      "loss": 0.0119,
      "num_tokens": 125793504.0,
      "step": 281
    },
    {
      "epoch": 2.767527675276753,
      "grad_norm": 0.06683615854915595,
      "learning_rate": 4.986004807079959e-06,
      "loss": 0.0129,
      "num_tokens": 126251447.0,
      "step": 282
    },
    {
      "epoch": 2.777367773677737,
      "grad_norm": 0.07018116547650956,
      "learning_rate": 4.957584938851048e-06,
      "loss": 0.0302,
      "num_tokens": 126702532.0,
      "step": 283
    },
    {
      "epoch": 2.787207872078721,
      "grad_norm": 0.0779053076124061,
      "learning_rate": 4.929187007565996e-06,
      "loss": 0.0135,
      "num_tokens": 127176249.0,
      "step": 284
    },
    {
      "epoch": 2.7970479704797047,
      "grad_norm": 0.05851676194024864,
      "learning_rate": 4.9008121617250425e-06,
      "loss": 0.0119,
      "num_tokens": 127624082.0,
      "step": 285
    },
    {
      "epoch": 2.8068880688806885,
      "grad_norm": 0.06198574153406469,
      "learning_rate": 4.87246154889477e-06,
      "loss": 0.0122,
      "num_tokens": 128044479.0,
      "step": 286
    },
    {
      "epoch": 2.816728167281673,
      "grad_norm": 0.06543548304815791,
      "learning_rate": 4.8441363156617085e-06,
      "loss": 0.0116,
      "num_tokens": 128480925.0,
      "step": 287
    },
    {
      "epoch": 2.8265682656826567,
      "grad_norm": 0.06877942595907044,
      "learning_rate": 4.815837607585957e-06,
      "loss": 0.0129,
      "num_tokens": 128924136.0,
      "step": 288
    },
    {
      "epoch": 2.836408364083641,
      "grad_norm": 0.07130567586435052,
| "learning_rate": 4.787566569154855e-06, | |
| "loss": 0.014, | |
| "num_tokens": 129349471.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.846248462484625, | |
| "grad_norm": 0.059470596149140166, | |
| "learning_rate": 4.759324343736698e-06, | |
| "loss": 0.0114, | |
| "num_tokens": 129789536.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.856088560885609, | |
| "grad_norm": 0.06813104970883538, | |
| "learning_rate": 4.731112073534491e-06, | |
| "loss": 0.0134, | |
| "num_tokens": 130231279.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.8659286592865927, | |
| "grad_norm": 0.44224478447995313, | |
| "learning_rate": 4.70293089953976e-06, | |
| "loss": 0.2098, | |
| "num_tokens": 130687907.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.875768757687577, | |
| "grad_norm": 0.06521058331073082, | |
| "learning_rate": 4.674781961486399e-06, | |
| "loss": 0.0121, | |
| "num_tokens": 131139639.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.885608856088561, | |
| "grad_norm": 0.0669166944603376, | |
| "learning_rate": 4.646666397804586e-06, | |
| "loss": 0.0115, | |
| "num_tokens": 131558019.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.8954489544895448, | |
| "grad_norm": 0.05844491203018172, | |
| "learning_rate": 4.618585345574741e-06, | |
| "loss": 0.0116, | |
| "num_tokens": 132015109.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.905289052890529, | |
| "grad_norm": 0.06568591959992806, | |
| "learning_rate": 4.5905399404815196e-06, | |
| "loss": 0.0139, | |
| "num_tokens": 132473250.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.915129151291513, | |
| "grad_norm": 0.0740965654131703, | |
| "learning_rate": 4.562531316767908e-06, | |
| "loss": 0.0142, | |
| "num_tokens": 132924223.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.924969249692497, | |
| "grad_norm": 0.06249511498433694, | |
| "learning_rate": 4.534560607189338e-06, | |
| "loss": 0.0116, | |
| "num_tokens": 133401598.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.9348093480934807, | |
| "grad_norm": 0.06283720689157825, | |
| "learning_rate": 4.506628942967874e-06, | |
| "loss": 0.0111, | |
| "num_tokens": 133858797.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.944649446494465, | |
| "grad_norm": 0.056149678938915315, | |
| "learning_rate": 4.478737453746464e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 134308841.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.954489544895449, | |
| "grad_norm": 0.09917979372278035, | |
| "learning_rate": 4.450887267543261e-06, | |
| "loss": 0.016, | |
| "num_tokens": 134737382.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.9643296432964332, | |
| "grad_norm": 0.0640380581075781, | |
| "learning_rate": 4.423079510705992e-06, | |
| "loss": 0.0122, | |
| "num_tokens": 135161876.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.974169741697417, | |
| "grad_norm": 0.06467022053195733, | |
| "learning_rate": 4.395315307866404e-06, | |
| "loss": 0.0165, | |
| "num_tokens": 135642143.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.984009840098401, | |
| "grad_norm": 0.06355259176583768, | |
| "learning_rate": 4.3675957818947965e-06, | |
| "loss": 0.0122, | |
| "num_tokens": 136085866.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.993849938499385, | |
| "grad_norm": 0.06019771768539855, | |
| "learning_rate": 4.33992205385459e-06, | |
| "loss": 0.0192, | |
| "num_tokens": 136558605.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.06019771768539855, | |
| "learning_rate": 4.312295242956998e-06, | |
| "loss": 0.0196, | |
| "num_tokens": 136769120.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 3.009840098400984, | |
| "grad_norm": 0.09281584655295379, | |
| "learning_rate": 4.284716466515759e-06, | |
| "loss": 0.015, | |
| "num_tokens": 137221619.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 3.019680196801968, | |
| "grad_norm": 0.06328942540239005, | |
| "learning_rate": 4.257186839901948e-06, | |
| "loss": 0.0242, | |
| "num_tokens": 137668031.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 3.029520295202952, | |
| "grad_norm": 0.1013363943890701, | |
| "learning_rate": 4.229707476498871e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 138123076.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 3.039360393603936, | |
| "grad_norm": 0.05368561892248246, | |
| "learning_rate": 4.2022794876570335e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 138584469.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.0492004920049203, | |
| "grad_norm": 0.055594058845563385, | |
| "learning_rate": 4.1749039826491956e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 139040295.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 3.059040590405904, | |
| "grad_norm": 0.061365365973882896, | |
| "learning_rate": 4.1475820686255055e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 139467398.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 3.068880688806888, | |
| "grad_norm": 0.07576400254078287, | |
| "learning_rate": 4.120314850568731e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 139915959.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 3.078720787207872, | |
| "grad_norm": 0.06609887375125617, | |
| "learning_rate": 4.093103431249563e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 140383093.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 3.088560885608856, | |
| "grad_norm": 0.06075307137850355, | |
| "learning_rate": 4.065948911182015e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 140822538.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.09840098400984, | |
| "grad_norm": 0.062126435434214644, | |
| "learning_rate": 4.038852388578925e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 141294868.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 3.108241082410824, | |
| "grad_norm": 0.06817279237293263, | |
| "learning_rate": 4.011814959307533e-06, | |
| "loss": 0.0168, | |
| "num_tokens": 141736810.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 3.1180811808118083, | |
| "grad_norm": 0.06959931879800538, | |
| "learning_rate": 3.984837716845157e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 142184603.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 3.127921279212792, | |
| "grad_norm": 0.06353490194683095, | |
| "learning_rate": 3.957921752234982e-06, | |
| "loss": 0.0167, | |
| "num_tokens": 142622524.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 3.137761377613776, | |
| "grad_norm": 0.06947695018548046, | |
| "learning_rate": 3.931068154041919e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 143083493.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.14760147601476, | |
| "grad_norm": 0.059043509631747315, | |
| "learning_rate": 3.904278008308589e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 143533589.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 3.1574415744157442, | |
| "grad_norm": 0.06496286878562947, | |
| "learning_rate": 3.877552398511409e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 143992909.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 3.167281672816728, | |
| "grad_norm": 0.05772743455179762, | |
| "learning_rate": 3.85089240551675e-06, | |
| "loss": 0.0116, | |
| "num_tokens": 144432232.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 3.177121771217712, | |
| "grad_norm": 0.06067527859066343, | |
| "learning_rate": 3.8242991075372436e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 144872986.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 3.1869618696186963, | |
| "grad_norm": 0.06092970409513084, | |
| "learning_rate": 3.7977735800881687e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 145338330.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.19680196801968, | |
| "grad_norm": 0.0599181858398082, | |
| "learning_rate": 3.7713168959439515e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 145779615.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 3.206642066420664, | |
| "grad_norm": 0.06222020359396952, | |
| "learning_rate": 3.74493012509478e-06, | |
| "loss": 0.0111, | |
| "num_tokens": 146242526.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.2164821648216484, | |
| "grad_norm": 0.06718711950500762, | |
| "learning_rate": 3.718614334703339e-06, | |
| "loss": 0.011, | |
| "num_tokens": 146706718.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 3.2263222632226323, | |
| "grad_norm": 0.06582492337813067, | |
| "learning_rate": 3.692370589061639e-06, | |
| "loss": 0.0176, | |
| "num_tokens": 147136544.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 3.236162361623616, | |
| "grad_norm": 0.11649214601244208, | |
| "learning_rate": 3.6661999495479772e-06, | |
| "loss": 0.0143, | |
| "num_tokens": 147581972.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.2460024600246005, | |
| "grad_norm": 0.07623567009845389, | |
| "learning_rate": 3.640103474584016e-06, | |
| "loss": 0.0109, | |
| "num_tokens": 148046352.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 3.2558425584255843, | |
| "grad_norm": 0.10991461635206767, | |
| "learning_rate": 3.614082219591972e-06, | |
| "loss": 0.0125, | |
| "num_tokens": 148512074.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 3.265682656826568, | |
| "grad_norm": 0.0553413140200664, | |
| "learning_rate": 3.588137236951934e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 148974289.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 3.275522755227552, | |
| "grad_norm": 0.0647893470114748, | |
| "learning_rate": 3.5622695759592996e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 149432821.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 3.2853628536285364, | |
| "grad_norm": 0.05909492699311915, | |
| "learning_rate": 3.5364802827823397e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 149885306.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.2952029520295203, | |
| "grad_norm": 0.06091784855812255, | |
| "learning_rate": 3.5107704004198904e-06, | |
| "loss": 0.012, | |
| "num_tokens": 150329823.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 3.305043050430504, | |
| "grad_norm": 0.0653285630713288, | |
| "learning_rate": 3.485140968659166e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 150762193.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 3.3148831488314885, | |
| "grad_norm": 0.059323407369363576, | |
| "learning_rate": 3.4595930240337115e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 151238234.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 3.3247232472324724, | |
| "grad_norm": 0.057695587877706345, | |
| "learning_rate": 3.4341275997814795e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 151665989.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.3345633456334562, | |
| "grad_norm": 0.05732212076245395, | |
| "learning_rate": 3.408745725803042e-06, | |
| "loss": 0.0098, | |
| "num_tokens": 152102577.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.34440344403444, | |
| "grad_norm": 0.0595487114131121, | |
| "learning_rate": 3.383448428619941e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 152531136.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 3.3542435424354244, | |
| "grad_norm": 0.06136757636985544, | |
| "learning_rate": 3.3582367313331692e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 152996747.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.3640836408364083, | |
| "grad_norm": 0.05958558106516815, | |
| "learning_rate": 3.3331116535817974e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 153435759.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.373923739237392, | |
| "grad_norm": 0.05874672952511224, | |
| "learning_rate": 3.308074211501732e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 153870082.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.3837638376383765, | |
| "grad_norm": 0.06528615293660638, | |
| "learning_rate": 3.2831254176846205e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 154317864.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.3936039360393604, | |
| "grad_norm": 0.06940648713659847, | |
| "learning_rate": 3.258266281136905e-06, | |
| "loss": 0.0152, | |
| "num_tokens": 154756769.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.4034440344403443, | |
| "grad_norm": 0.061886970984610115, | |
| "learning_rate": 3.233497807239008e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 155205894.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.4132841328413286, | |
| "grad_norm": 0.06025802060896222, | |
| "learning_rate": 3.2088209977046657e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 155650235.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.4231242312423125, | |
| "grad_norm": 0.079505096478584, | |
| "learning_rate": 3.1842368505404388e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 156097609.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.4329643296432963, | |
| "grad_norm": 0.05748478767611536, | |
| "learning_rate": 3.1597463600053258e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 156538234.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.4428044280442807, | |
| "grad_norm": 0.06024044990886995, | |
| "learning_rate": 3.135350516570559e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 156983361.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.4526445264452645, | |
| "grad_norm": 0.05743125939934545, | |
| "learning_rate": 3.111050306879556e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 157418360.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.4624846248462484, | |
| "grad_norm": 0.05732681074233502, | |
| "learning_rate": 3.0868467137080075e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 157832487.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.4723247232472323, | |
| "grad_norm": 0.05983498966527194, | |
| "learning_rate": 3.0627407159241273e-06, | |
| "loss": 0.0134, | |
| "num_tokens": 158300222.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.4821648216482166, | |
| "grad_norm": 0.11937622237492422, | |
| "learning_rate": 3.0387332884490806e-06, | |
| "loss": 0.0101, | |
| "num_tokens": 158757341.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.4920049200492005, | |
| "grad_norm": 0.06131645683210505, | |
| "learning_rate": 3.014825402217533e-06, | |
| "loss": 0.0162, | |
| "num_tokens": 159204107.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.5018450184501844, | |
| "grad_norm": 0.07487264837463524, | |
| "learning_rate": 2.9910180241384014e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 159649846.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.5116851168511687, | |
| "grad_norm": 0.05978234516994627, | |
| "learning_rate": 2.9673121170557396e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 160075112.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.5215252152521526, | |
| "grad_norm": 0.05949296353246161, | |
| "learning_rate": 2.9437086397097996e-06, | |
| "loss": 0.0654, | |
| "num_tokens": 160529018.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.5313653136531364, | |
| "grad_norm": 0.49227073434692264, | |
| "learning_rate": 2.92020854669826e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 160983786.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.5412054120541203, | |
| "grad_norm": 0.06622940201032448, | |
| "learning_rate": 2.896812788437615e-06, | |
| "loss": 0.01, | |
| "num_tokens": 161416791.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.5510455104551046, | |
| "grad_norm": 0.06226773442108338, | |
| "learning_rate": 2.8735223111247402e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 161878966.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.5608856088560885, | |
| "grad_norm": 0.052597494365304226, | |
| "learning_rate": 2.850338056698621e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 162327927.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.570725707257073, | |
| "grad_norm": 0.07012270383857575, | |
| "learning_rate": 2.827260962802263e-06, | |
| "loss": 0.0147, | |
| "num_tokens": 162786081.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.5805658056580567, | |
| "grad_norm": 0.055063275741049224, | |
| "learning_rate": 2.804291962744768e-06, | |
| "loss": 0.0094, | |
| "num_tokens": 163225787.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.5904059040590406, | |
| "grad_norm": 0.05925152600247852, | |
| "learning_rate": 2.7814319854635875e-06, | |
| "loss": 0.0103, | |
| "num_tokens": 163657501.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.6002460024600245, | |
| "grad_norm": 0.06722393012249571, | |
| "learning_rate": 2.758681955486955e-06, | |
| "loss": 0.0144, | |
| "num_tokens": 164141801.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.6100861008610083, | |
| "grad_norm": 0.06258536118771861, | |
| "learning_rate": 2.736042792896495e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 164583522.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.6199261992619927, | |
| "grad_norm": 0.058745565835135405, | |
| "learning_rate": 2.7135154132900133e-06, | |
| "loss": 0.2081, | |
| "num_tokens": 165059903.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.6297662976629765, | |
| "grad_norm": 0.45572028974101275, | |
| "learning_rate": 2.691100727744458e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 165501531.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.639606396063961, | |
| "grad_norm": 0.06413891247366597, | |
| "learning_rate": 2.668799642779093e-06, | |
| "loss": 0.0092, | |
| "num_tokens": 165943767.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.6494464944649447, | |
| "grad_norm": 0.06190519506649364, | |
| "learning_rate": 2.6466130603188157e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 166367316.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.6592865928659286, | |
| "grad_norm": 0.05986849692187042, | |
| "learning_rate": 2.624541877657685e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 166834863.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.6691266912669125, | |
| "grad_norm": 0.05660848820396227, | |
| "learning_rate": 2.602586987422643e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 167276182.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.678966789667897, | |
| "grad_norm": 0.06018671949798086, | |
| "learning_rate": 2.580749277537399e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 167713298.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.6888068880688807, | |
| "grad_norm": 0.05866547399139435, | |
| "learning_rate": 2.5590296311865294e-06, | |
| "loss": 0.0089, | |
| "num_tokens": 168174626.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.6986469864698646, | |
| "grad_norm": 0.05484667079392053, | |
| "learning_rate": 2.537428926779758e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 168614618.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.708487084870849, | |
| "grad_norm": 0.05383142631084436, | |
| "learning_rate": 2.515948037916423e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 169020836.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.7183271832718328, | |
| "grad_norm": 0.06002011881446048, | |
| "learning_rate": 2.494587833350153e-06, | |
| "loss": 0.0087, | |
| "num_tokens": 169486382.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.7281672816728166, | |
| "grad_norm": 0.05744381429509693, | |
| "learning_rate": 2.473349176953736e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 169927283.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.7380073800738005, | |
| "grad_norm": 0.060208708524887654, | |
| "learning_rate": 2.4522329276841664e-06, | |
| "loss": 0.0199, | |
| "num_tokens": 170406677.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.747847478474785, | |
| "grad_norm": 0.07568634621301189, | |
| "learning_rate": 2.431239939547921e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 170848564.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.7576875768757687, | |
| "grad_norm": 0.05568928633830086, | |
| "learning_rate": 2.4103710615664145e-06, | |
| "loss": 0.0087, | |
| "num_tokens": 171300323.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.767527675276753, | |
| "grad_norm": 0.05797358351819658, | |
| "learning_rate": 2.389627137741662e-06, | |
| "loss": 0.009, | |
| "num_tokens": 171756442.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.777367773677737, | |
| "grad_norm": 0.05921869424481757, | |
| "learning_rate": 2.369009007022146e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 172205755.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.787207872078721, | |
| "grad_norm": 0.05977666911207599, | |
| "learning_rate": 2.3485175032688865e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 172657182.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.7970479704797047, | |
| "grad_norm": 0.05932546877174744, | |
| "learning_rate": 2.328153455221717e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 173095889.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.8068880688806885, | |
| "grad_norm": 0.06042663443540837, | |
| "learning_rate": 2.3079176864657673e-06, | |
| "loss": 0.0126, | |
| "num_tokens": 173547333.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.816728167281673, | |
| "grad_norm": 0.15319976378374434, | |
| "learning_rate": 2.2878110153981565e-06, | |
| "loss": 0.0127, | |
| "num_tokens": 174010201.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.8265682656826567, | |
| "grad_norm": 0.06053526576608028, | |
| "learning_rate": 2.267834255194894e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 174451094.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.836408364083641, | |
| "grad_norm": 0.056105090205726134, | |
| "learning_rate": 2.2479882137779903e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 174912611.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.846248462484625, | |
| "grad_norm": 0.05451931809215225, | |
| "learning_rate": 2.228273693782784e-06, | |
| "loss": 0.181, | |
| "num_tokens": 175406884.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.856088560885609, | |
| "grad_norm": 0.21723127868567568, | |
| "learning_rate": 2.208691492525481e-06, | |
| "loss": 0.0201, | |
| "num_tokens": 175906140.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.8659286592865927, | |
| "grad_norm": 0.0674311563870576, | |
| "learning_rate": 2.189242401970908e-06, | |
| "loss": 0.0109, | |
| "num_tokens": 176321003.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.875768757687577, | |
| "grad_norm": 0.06352093720920256, | |
| "learning_rate": 2.169927208700482e-06, | |
| "loss": 0.0105, | |
| "num_tokens": 176774697.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.885608856088561, | |
| "grad_norm": 0.07829494568911512, | |
| "learning_rate": 2.1507466938804013e-06, | |
| "loss": 0.009, | |
| "num_tokens": 177239096.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.8954489544895448, | |
| "grad_norm": 0.05295977087497385, | |
| "learning_rate": 2.131701633230045e-06, | |
| "loss": 0.0108, | |
| "num_tokens": 177705420.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.905289052890529, | |
| "grad_norm": 0.06253022869487425, | |
| "learning_rate": 2.112792796990616e-06, | |
| "loss": 0.011, | |
| "num_tokens": 178159440.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.915129151291513, | |
| "grad_norm": 0.06311743420915392, | |
| "learning_rate": 2.0940209498939732e-06, | |
| "loss": 0.0087, | |
| "num_tokens": 178600759.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.924969249692497, | |
| "grad_norm": 0.05932091295852782, | |
| "learning_rate": 2.075386851131711e-06, | |
| "loss": 0.022, | |
| "num_tokens": 179053675.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.9348093480934807, | |
| "grad_norm": 0.07035371257735856, | |
| "learning_rate": 2.056891254324459e-06, | |
| "loss": 0.0099, | |
| "num_tokens": 179505855.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.944649446494465, | |
| "grad_norm": 0.06484414999103219, | |
| "learning_rate": 2.038534907491396e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 179936013.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.954489544895449, | |
| "grad_norm": 0.060396196023032395, | |
| "learning_rate": 2.0203185530199983e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 180382727.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.9643296432964332, | |
| "grad_norm": 0.05481446314958498, | |
| "learning_rate": 2.0022429276360256e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 180829894.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.974169741697417, | |
| "grad_norm": 0.05579653515238233, | |
| "learning_rate": 1.9843087623737097e-06, | |
| "loss": 0.0084, | |
| "num_tokens": 181285317.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.984009840098401, | |
| "grad_norm": 0.08234263765839406, | |
| "learning_rate": 1.966516782546199e-06, | |
| "loss": 0.0147, | |
| "num_tokens": 181726465.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.993849938499385, | |
| "grad_norm": 0.061989197697927254, | |
| "learning_rate": 1.94886770771623e-06, | |
| "loss": 0.021, | |
| "num_tokens": 182137592.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.06561678793095134, | |
| "learning_rate": 1.931362251667008e-06, | |
| "loss": 0.01, | |
| "num_tokens": 182350141.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 4.009840098400984, | |
| "grad_norm": 0.08249843123562729, | |
| "learning_rate": 1.9140011223733576e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 182808332.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 4.019680196801968, | |
| "grad_norm": 0.051015416231373206, | |
| "learning_rate": 1.8967850219730799e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 183234541.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.029520295202952, | |
| "grad_norm": 0.05624434271690936, | |
| "learning_rate": 1.8797146467385604e-06, | |
| "loss": 0.008, | |
| "num_tokens": 183657861.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 4.039360393603936, | |
| "grad_norm": 0.055439304665308706, | |
| "learning_rate": 1.8627906870486063e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 184114899.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 4.04920049200492, | |
| "grad_norm": 0.053200752263272366, | |
| "learning_rate": 1.8460138273605265e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 184540917.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 4.059040590405904, | |
| "grad_norm": 0.05493004000264093, | |
| "learning_rate": 1.8293847461824538e-06, | |
| "loss": 0.0182, | |
| "num_tokens": 184993388.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 4.068880688806888, | |
| "grad_norm": 0.08372361370144696, | |
| "learning_rate": 1.8129041160458966e-06, | |
| "loss": 0.0136, | |
| "num_tokens": 185443466.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.078720787207872, | |
| "grad_norm": 0.05966232822391394, | |
| "learning_rate": 1.7965726034785466e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 185884446.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 4.088560885608856, | |
| "grad_norm": 0.05964634351715403, | |
| "learning_rate": 1.780390868977318e-06, | |
| "loss": 0.0109, | |
| "num_tokens": 186324584.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 4.0984009840098405, | |
| "grad_norm": 0.077920426753446, | |
| "learning_rate": 1.7643595669816378e-06, | |
| "loss": 0.0125, | |
| "num_tokens": 186805249.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 4.108241082410824, | |
| "grad_norm": 0.052268159950947905, | |
| "learning_rate": 1.7484793458469745e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 187242236.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 4.118081180811808, | |
| "grad_norm": 0.055826923432114865, | |
| "learning_rate": 1.7327508478186216e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 187710452.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.127921279212792, | |
| "grad_norm": 0.05075579193844761, | |
| "learning_rate": 1.7171747090057201e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 188166192.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 4.137761377613776, | |
| "grad_norm": 0.05526562542826341, | |
| "learning_rate": 1.7017515593555295e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 188609825.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 4.14760147601476, | |
| "grad_norm": 0.05657081985879626, | |
| "learning_rate": 1.6864820226279607e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 189068636.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 4.157441574415744, | |
| "grad_norm": 0.05470927216680097, | |
| "learning_rate": 1.6713667163703348e-06, | |
| "loss": 0.008, | |
| "num_tokens": 189501962.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 4.167281672816729, | |
| "grad_norm": 0.06396161695980077, | |
| "learning_rate": 1.6564062518924202e-06, | |
| "loss": 0.0134, | |
| "num_tokens": 189933564.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.177121771217712, | |
| "grad_norm": 0.06708045538135683, | |
| "learning_rate": 1.6416012342417056e-06, | |
| "loss": 0.013, | |
| "num_tokens": 190376512.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 4.186961869618696, | |
| "grad_norm": 0.06902938106259363, | |
| "learning_rate": 1.6269522621789246e-06, | |
| "loss": 0.0185, | |
| "num_tokens": 190824286.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 4.19680196801968, | |
| "grad_norm": 0.06764173670337073, | |
| "learning_rate": 1.6124599281538452e-06, | |
| "loss": 0.023, | |
| "num_tokens": 191293501.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 4.206642066420664, | |
| "grad_norm": 0.055114986132894145, | |
| "learning_rate": 1.5981248182813136e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 191738984.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 4.216482164821648, | |
| "grad_norm": 0.051261855942856496, | |
| "learning_rate": 1.583947512317537e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 192188472.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.226322263222632, | |
| "grad_norm": 0.05658108779272827, | |
| "learning_rate": 1.5699285836366488e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 192610188.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 4.236162361623617, | |
| "grad_norm": 0.05505791232716452, | |
| "learning_rate": 1.5560685992075141e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 193063967.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.2460024600246005, | |
| "grad_norm": 0.054672853412556996, | |
| "learning_rate": 1.5423681195707997e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 193546556.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 4.255842558425584, | |
| "grad_norm": 0.05581920906854521, | |
| "learning_rate": 1.528827698816306e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 193976678.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 4.265682656826568, | |
| "grad_norm": 0.05717281375700824, | |
| "learning_rate": 1.515447884560556e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 194451959.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.275522755227552, | |
| "grad_norm": 0.05353729617839062, | |
| "learning_rate": 1.502229217924649e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 194919769.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 4.285362853628536, | |
| "grad_norm": 0.05747591456155956, | |
| "learning_rate": 1.489172233512376e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 195385061.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 4.29520295202952, | |
| "grad_norm": 0.0605171649231424, | |
| "learning_rate": 1.4762774593885986e-06, | |
| "loss": 0.008, | |
| "num_tokens": 195829302.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 4.305043050430505, | |
| "grad_norm": 0.057478821326489744, | |
| "learning_rate": 1.4635454170578917e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 196272086.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 4.3148831488314885, | |
| "grad_norm": 0.05677684124604302, | |
| "learning_rate": 1.4509766214434535e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 196705376.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.324723247232472, | |
| "grad_norm": 0.05308761925199772, | |
| "learning_rate": 1.4385715808662787e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 197144202.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 4.334563345633456, | |
| "grad_norm": 0.0537126730210605, | |
| "learning_rate": 1.4263307970246027e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 197583484.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 4.34440344403444, | |
| "grad_norm": 0.0581676639275268, | |
| "learning_rate": 1.41425476497361e-06, | |
| "loss": 0.01, | |
| "num_tokens": 198038237.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 4.354243542435424, | |
| "grad_norm": 0.05692004762795615, | |
| "learning_rate": 1.4023439731054112e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 198478749.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 4.364083640836409, | |
| "grad_norm": 0.05533945838827064, | |
| "learning_rate": 1.390598903129296e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 198919863.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 4.373923739237393, | |
| "grad_norm": 0.08520786010753936, | |
| "learning_rate": 1.3790200300522413e-06, | |
| "loss": 0.021, | |
| "num_tokens": 199382929.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 4.3837638376383765, | |
| "grad_norm": 0.05139822843194828, | |
| "learning_rate": 1.3676078221597157e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 199843263.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 4.39360393603936, | |
| "grad_norm": 0.31624336801190883, | |
| "learning_rate": 1.3563627409967257e-06, | |
| "loss": 0.0554, | |
| "num_tokens": 200287378.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 4.403444034440344, | |
| "grad_norm": 0.05264132661612972, | |
| "learning_rate": 1.3452852413491563e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 200793892.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 4.413284132841328, | |
| "grad_norm": 0.05900297532446137, | |
| "learning_rate": 1.3343757712253804e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 201237298.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.423124231242312, | |
| "grad_norm": 0.28018993976510653, | |
| "learning_rate": 1.3236347718381338e-06, | |
| "loss": 0.1846, | |
| "num_tokens": 201713169.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 4.432964329643297, | |
| "grad_norm": 0.056807303328480156, | |
| "learning_rate": 1.3130626775866743e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 202186125.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 4.442804428044281, | |
| "grad_norm": 0.050176964423526066, | |
| "learning_rate": 1.3026599160392173e-06, | |
| "loss": 0.0069, | |
| "num_tokens": 202642625.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 4.4526445264452645, | |
| "grad_norm": 0.09261733517736204, | |
| "learning_rate": 1.292426907915634e-06, | |
| "loss": 0.0116, | |
| "num_tokens": 203084429.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 4.462484624846248, | |
| "grad_norm": 0.05582692526106092, | |
| "learning_rate": 1.2823640670704443e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 203531332.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.472324723247232, | |
| "grad_norm": 0.0544386302493359, | |
| "learning_rate": 1.2724718004760794e-06, | |
| "loss": 0.007, | |
| "num_tokens": 203977923.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 4.482164821648216, | |
| "grad_norm": 0.0485924079874789, | |
| "learning_rate": 1.2627505082064144e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 204447835.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 4.492004920049201, | |
| "grad_norm": 0.05314912137402164, | |
| "learning_rate": 1.2532005834205976e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 204889787.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 4.501845018450185, | |
| "grad_norm": 0.059487474238797314, | |
| "learning_rate": 1.2438224123471442e-06, | |
| "loss": 0.0081, | |
| "num_tokens": 205319753.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 4.511685116851169, | |
| "grad_norm": 0.053210906319883816, | |
| "learning_rate": 1.2346163742683185e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 205765375.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.521525215252153, | |
| "grad_norm": 0.052123105650746834, | |
| "learning_rate": 1.2255828415047932e-06, | |
| "loss": 0.0215, | |
| "num_tokens": 206223886.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 4.531365313653136, | |
| "grad_norm": 0.058022484093730155, | |
| "learning_rate": 1.216722179400592e-06, | |
| "loss": 0.0079, | |
| "num_tokens": 206657932.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 4.54120541205412, | |
| "grad_norm": 0.06046936274806646, | |
| "learning_rate": 1.208034746308315e-06, | |
| "loss": 0.0119, | |
| "num_tokens": 207113562.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 4.551045510455104, | |
| "grad_norm": 0.05202796714609154, | |
| "learning_rate": 1.1995208935746437e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 207577701.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 4.560885608856088, | |
| "grad_norm": 0.053010352876086804, | |
| "learning_rate": 1.1911809655261333e-06, | |
| "loss": 0.0071, | |
| "num_tokens": 208021678.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.570725707257073, | |
| "grad_norm": 0.05123561999347148, | |
| "learning_rate": 1.1830152994552866e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 208467211.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 4.580565805658057, | |
| "grad_norm": 0.06003956537674327, | |
| "learning_rate": 1.175024225606912e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 208909702.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 4.590405904059041, | |
| "grad_norm": 0.056238323571208206, | |
| "learning_rate": 1.1672080671647695e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 209342529.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 4.6002460024600245, | |
| "grad_norm": 0.0552115239217723, | |
| "learning_rate": 1.1595671402384966e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 209805083.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 4.610086100861008, | |
| "grad_norm": 0.07199003274369836, | |
| "learning_rate": 1.152101753850828e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 210267459.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.619926199261993, | |
| "grad_norm": 0.050805856130265635, | |
| "learning_rate": 1.1448122099250946e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 210732019.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 4.629766297662977, | |
| "grad_norm": 0.054524548819372444, | |
| "learning_rate": 1.1376988032730135e-06, | |
| "loss": 0.0082, | |
| "num_tokens": 211158328.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 4.639606396063961, | |
| "grad_norm": 0.061253234718759104, | |
| "learning_rate": 1.130761821582766e-06, | |
| "loss": 0.0068, | |
| "num_tokens": 211615451.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 4.649446494464945, | |
| "grad_norm": 0.054438039734835346, | |
| "learning_rate": 1.1240015454073622e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 212052321.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 4.659286592865929, | |
| "grad_norm": 0.05428969184748025, | |
| "learning_rate": 1.1174182481532943e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 212511116.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.6691266912669125, | |
| "grad_norm": 0.06405292334946296, | |
| "learning_rate": 1.1110121960694773e-06, | |
| "loss": 0.015, | |
| "num_tokens": 212943028.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 4.678966789667896, | |
| "grad_norm": 0.05682101787290208, | |
| "learning_rate": 1.104783648236486e-06, | |
| "loss": 0.0175, | |
| "num_tokens": 213395996.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 4.68880688806888, | |
| "grad_norm": 0.05953289126520201, | |
| "learning_rate": 1.0987328565560711e-06, | |
| "loss": 0.007, | |
| "num_tokens": 213829686.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 4.698646986469865, | |
| "grad_norm": 0.05168917084897981, | |
| "learning_rate": 1.0928600657409751e-06, | |
| "loss": 0.007, | |
| "num_tokens": 214277322.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 4.708487084870849, | |
| "grad_norm": 0.1608212490455815, | |
| "learning_rate": 1.0871655133050372e-06, | |
| "loss": 0.1771, | |
| "num_tokens": 214763908.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.718327183271833, | |
| "grad_norm": 0.06395186276533307, | |
| "learning_rate": 1.081649429553581e-06, | |
| "loss": 0.0096, | |
| "num_tokens": 215194438.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 4.728167281672817, | |
| "grad_norm": 0.05084061327021202, | |
| "learning_rate": 1.076312037574106e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 215654297.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 4.7380073800738005, | |
| "grad_norm": 0.05497766832460128, | |
| "learning_rate": 1.0711535532272632e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 216086613.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 4.747847478474784, | |
| "grad_norm": 0.055956248844709046, | |
| "learning_rate": 1.0661741851381256e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 216536848.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 4.757687576875769, | |
| "grad_norm": 0.05660105878886696, | |
| "learning_rate": 1.0613741346877498e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 216997743.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.767527675276753, | |
| "grad_norm": 0.06132538554048896, | |
| "learning_rate": 1.056753596005032e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 217458807.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 4.777367773677737, | |
| "grad_norm": 0.055422872986755904, | |
| "learning_rate": 1.0523127559588579e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 217917380.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 4.787207872078721, | |
| "grad_norm": 0.05522528856299787, | |
| "learning_rate": 1.0480517941505428e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 218379436.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 4.797047970479705, | |
| "grad_norm": 0.06958848204521575, | |
| "learning_rate": 1.0439708829065708e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 218813896.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 4.8068880688806885, | |
| "grad_norm": 0.06386162989494891, | |
| "learning_rate": 1.0400701872716227e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 219264589.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.816728167281672, | |
| "grad_norm": 0.05787632062480919, | |
| "learning_rate": 1.0363498650019023e-06, | |
| "loss": 0.0076, | |
| "num_tokens": 219704673.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 4.826568265682657, | |
| "grad_norm": 0.06292874524087964, | |
| "learning_rate": 1.0328100665587573e-06, | |
| "loss": 0.0127, | |
| "num_tokens": 220132557.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.836408364083641, | |
| "grad_norm": 0.05664304451978608, | |
| "learning_rate": 1.029450935102592e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 220586640.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 4.846248462484625, | |
| "grad_norm": 0.06939065112948692, | |
| "learning_rate": 1.0262726064870801e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 221016982.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 4.856088560885609, | |
| "grad_norm": 0.052898898903196025, | |
| "learning_rate": 1.0232752092536666e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 221462667.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.865928659286593, | |
| "grad_norm": 0.05776273104035291, | |
| "learning_rate": 1.0204588646263731e-06, | |
| "loss": 0.0071, | |
| "num_tokens": 221913595.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 4.875768757687577, | |
| "grad_norm": 0.05461381324686126, | |
| "learning_rate": 1.0178236865068933e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 222326904.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 4.885608856088561, | |
| "grad_norm": 0.055498699411421645, | |
| "learning_rate": 1.0153697814699858e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 222774557.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 4.895448954489545, | |
| "grad_norm": 0.053031884611377415, | |
| "learning_rate": 1.0130972487591658e-06, | |
| "loss": 0.0068, | |
| "num_tokens": 223236906.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.905289052890529, | |
| "grad_norm": 0.054483452664515604, | |
| "learning_rate": 1.0110061802826889e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 223673165.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.915129151291513, | |
| "grad_norm": 0.05035207860411025, | |
| "learning_rate": 1.009096660609837e-06, | |
| "loss": 0.0068, | |
| "num_tokens": 224139613.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.924969249692497, | |
| "grad_norm": 0.05491435698714458, | |
| "learning_rate": 1.0073687669674949e-06, | |
| "loss": 0.0108, | |
| "num_tokens": 224623785.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.934809348093481, | |
| "grad_norm": 0.05816593959272977, | |
| "learning_rate": 1.0058225692370299e-06, | |
| "loss": 0.0074, | |
| "num_tokens": 225060586.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.944649446494465, | |
| "grad_norm": 0.05535566978108373, | |
| "learning_rate": 1.0044581299514638e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 225504012.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.9544895448954485, | |
| "grad_norm": 0.053032498372300854, | |
| "learning_rate": 1.003275504292944e-06, | |
| "loss": 0.0073, | |
| "num_tokens": 225944573.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.964329643296433, | |
| "grad_norm": 0.051644596347508585, | |
| "learning_rate": 1.0022747400905126e-06, | |
| "loss": 0.0072, | |
| "num_tokens": 226383598.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 4.974169741697417, | |
| "grad_norm": 0.05140289748972471, | |
| "learning_rate": 1.0014558778181714e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 226851896.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 4.984009840098401, | |
| "grad_norm": 0.05762796975847249, | |
| "learning_rate": 1.0008189505932444e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 227269800.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 4.993849938499385, | |
| "grad_norm": 0.05745101659790941, | |
| "learning_rate": 1.0003639841750404e-06, | |
| "loss": 0.007, | |
| "num_tokens": 227720300.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.0720275822622563, | |
| "learning_rate": 1.0000909969638097e-06, | |
| "loss": 0.007, | |
| "num_tokens": 227931822.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "step": 510, | |
| "total_flos": 7.699310209956577e+17, | |
| "train_loss": 0.04971654405360859, | |
| "train_runtime": 7469.8311, | |
| "train_samples_per_second": 8.7, | |
| "train_steps_per_second": 0.068 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 510, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.699310209956577e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
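
The state above can be consumed programmatically. Below is a minimal sketch, assuming the file is saved locally as `trainer_state.json` and `matplotlib` is installed (neither the filename nor the plotting choice comes from the log itself): it reads `log_history` and plots the per-step training loss, skipping the final summary entry, which carries aggregate fields (`train_loss`, `train_runtime`, ...) rather than a per-step `loss`.

```python
# Minimal sketch: plot per-step training loss from a trainer_state.json.
# Assumptions: the file is saved locally as "trainer_state.json" and
# matplotlib is installed; neither is specified by the log itself.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry is one logging step; the last entry is the run
# summary (train_loss, train_runtime, ...) and has no per-step "loss" key.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.yscale("log")  # loss falls from ~1.3 to ~0.007 over 510 steps
plt.show()
```

A log y-axis is used because the loss drops by more than two orders of magnitude over the run; on a linear scale the later epochs would look flat.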