{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 0,
  "global_step": 797,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012547051442910915,
      "grad_norm": 0.4921875,
      "learning_rate": 9.98745294855709e-06,
      "loss": 1.8811,
      "step": 1
    },
    {
      "epoch": 0.002509410288582183,
      "grad_norm": 0.50390625,
      "learning_rate": 9.974905897114179e-06,
      "loss": 1.9066,
      "step": 2
    },
    {
      "epoch": 0.0037641154328732747,
      "grad_norm": 0.48046875,
      "learning_rate": 9.962358845671269e-06,
      "loss": 1.838,
      "step": 3
    },
    {
      "epoch": 0.005018820577164366,
      "grad_norm": 0.474609375,
      "learning_rate": 9.949811794228357e-06,
      "loss": 1.9337,
      "step": 4
    },
    {
      "epoch": 0.006273525721455458,
      "grad_norm": 0.453125,
      "learning_rate": 9.937264742785447e-06,
      "loss": 1.7792,
      "step": 5
    },
    {
      "epoch": 0.0075282308657465494,
      "grad_norm": 0.486328125,
      "learning_rate": 9.924717691342535e-06,
      "loss": 1.8373,
      "step": 6
    },
    {
      "epoch": 0.00878293601003764,
      "grad_norm": 0.4375,
      "learning_rate": 9.912170639899625e-06,
      "loss": 1.8733,
      "step": 7
    },
    {
      "epoch": 0.010037641154328732,
      "grad_norm": 0.4296875,
      "learning_rate": 9.899623588456713e-06,
      "loss": 1.9003,
      "step": 8
    },
    {
      "epoch": 0.011292346298619825,
      "grad_norm": 0.427734375,
      "learning_rate": 9.887076537013803e-06,
      "loss": 1.839,
      "step": 9
    },
    {
      "epoch": 0.012547051442910916,
      "grad_norm": 0.390625,
      "learning_rate": 9.874529485570891e-06,
      "loss": 1.8247,
      "step": 10
    },
    {
      "epoch": 0.013801756587202008,
      "grad_norm": 0.353515625,
      "learning_rate": 9.861982434127981e-06,
      "loss": 1.7166,
      "step": 11
    },
    {
      "epoch": 0.015056461731493099,
      "grad_norm": 0.365234375,
      "learning_rate": 9.849435382685069e-06,
      "loss": 1.7286,
      "step": 12
    },
    {
      "epoch": 0.01631116687578419,
      "grad_norm": 0.38671875,
      "learning_rate": 9.836888331242159e-06,
      "loss": 1.7676,
      "step": 13
    },
    {
      "epoch": 0.01756587202007528,
      "grad_norm": 0.353515625,
      "learning_rate": 9.824341279799247e-06,
      "loss": 1.7642,
      "step": 14
    },
    {
      "epoch": 0.018820577164366373,
      "grad_norm": 0.357421875,
      "learning_rate": 9.811794228356337e-06,
      "loss": 1.6793,
      "step": 15
    },
    {
      "epoch": 0.020075282308657464,
      "grad_norm": 0.31640625,
      "learning_rate": 9.799247176913425e-06,
      "loss": 1.5586,
      "step": 16
    },
    {
      "epoch": 0.02132998745294856,
      "grad_norm": 0.326171875,
      "learning_rate": 9.786700125470515e-06,
      "loss": 1.6833,
      "step": 17
    },
    {
      "epoch": 0.02258469259723965,
      "grad_norm": 0.302734375,
      "learning_rate": 9.774153074027605e-06,
      "loss": 1.5879,
      "step": 18
    },
    {
      "epoch": 0.02383939774153074,
      "grad_norm": 0.306640625,
      "learning_rate": 9.761606022584693e-06,
      "loss": 1.5705,
      "step": 19
    },
    {
      "epoch": 0.025094102885821833,
      "grad_norm": 0.29296875,
      "learning_rate": 9.749058971141783e-06,
      "loss": 1.5896,
      "step": 20
    },
    {
      "epoch": 0.026348808030112924,
      "grad_norm": 0.31640625,
      "learning_rate": 9.736511919698871e-06,
      "loss": 1.6392,
      "step": 21
    },
    {
      "epoch": 0.027603513174404015,
      "grad_norm": 0.435546875,
      "learning_rate": 9.723964868255961e-06,
      "loss": 1.5772,
      "step": 22
    },
    {
      "epoch": 0.028858218318695106,
      "grad_norm": 0.265625,
      "learning_rate": 9.711417816813051e-06,
      "loss": 1.5175,
      "step": 23
    },
    {
      "epoch": 0.030112923462986198,
      "grad_norm": 0.25390625,
      "learning_rate": 9.69887076537014e-06,
      "loss": 1.5083,
      "step": 24
    },
    {
      "epoch": 0.03136762860727729,
      "grad_norm": 0.25,
      "learning_rate": 9.686323713927227e-06,
      "loss": 1.5575,
      "step": 25
    },
    {
      "epoch": 0.03262233375156838,
      "grad_norm": 0.251953125,
      "learning_rate": 9.673776662484317e-06,
      "loss": 1.549,
      "step": 26
    },
    {
      "epoch": 0.033877038895859475,
      "grad_norm": 0.244140625,
      "learning_rate": 9.661229611041405e-06,
      "loss": 1.5112,
      "step": 27
    },
    {
      "epoch": 0.03513174404015056,
      "grad_norm": 0.25390625,
      "learning_rate": 9.648682559598495e-06,
      "loss": 1.5569,
      "step": 28
    },
    {
      "epoch": 0.03638644918444166,
      "grad_norm": 0.2412109375,
      "learning_rate": 9.636135508155583e-06,
      "loss": 1.4399,
      "step": 29
    },
    {
      "epoch": 0.037641154328732745,
      "grad_norm": 0.2314453125,
      "learning_rate": 9.623588456712673e-06,
      "loss": 1.4761,
      "step": 30
    },
    {
      "epoch": 0.03889585947302384,
      "grad_norm": 0.224609375,
      "learning_rate": 9.611041405269761e-06,
      "loss": 1.4108,
      "step": 31
    },
    {
      "epoch": 0.04015056461731493,
      "grad_norm": 0.234375,
      "learning_rate": 9.598494353826851e-06,
      "loss": 1.5262,
      "step": 32
    },
    {
      "epoch": 0.04140526976160602,
      "grad_norm": 0.232421875,
      "learning_rate": 9.585947302383941e-06,
      "loss": 1.4889,
      "step": 33
    },
    {
      "epoch": 0.04265997490589712,
      "grad_norm": 0.228515625,
      "learning_rate": 9.57340025094103e-06,
      "loss": 1.456,
      "step": 34
    },
    {
      "epoch": 0.043914680050188205,
      "grad_norm": 0.2265625,
      "learning_rate": 9.56085319949812e-06,
      "loss": 1.4607,
      "step": 35
    },
    {
      "epoch": 0.0451693851944793,
      "grad_norm": 0.240234375,
      "learning_rate": 9.548306148055207e-06,
      "loss": 1.4645,
      "step": 36
    },
    {
      "epoch": 0.04642409033877039,
      "grad_norm": 0.1953125,
      "learning_rate": 9.535759096612297e-06,
      "loss": 1.342,
      "step": 37
    },
    {
      "epoch": 0.04767879548306148,
      "grad_norm": 0.2109375,
      "learning_rate": 9.523212045169386e-06,
      "loss": 1.4266,
      "step": 38
    },
    {
      "epoch": 0.04893350062735257,
      "grad_norm": 0.1884765625,
      "learning_rate": 9.510664993726475e-06,
      "loss": 1.3732,
      "step": 39
    },
    {
      "epoch": 0.050188205771643665,
      "grad_norm": 0.2333984375,
      "learning_rate": 9.498117942283565e-06,
      "loss": 1.5087,
      "step": 40
    },
    {
      "epoch": 0.05144291091593475,
      "grad_norm": 0.2275390625,
      "learning_rate": 9.485570890840653e-06,
      "loss": 1.4479,
      "step": 41
    },
    {
      "epoch": 0.05269761606022585,
      "grad_norm": 0.197265625,
      "learning_rate": 9.473023839397743e-06,
      "loss": 1.4427,
      "step": 42
    },
    {
      "epoch": 0.053952321204516936,
      "grad_norm": 0.1904296875,
      "learning_rate": 9.460476787954832e-06,
      "loss": 1.3319,
      "step": 43
    },
    {
      "epoch": 0.05520702634880803,
      "grad_norm": 0.2451171875,
      "learning_rate": 9.44792973651192e-06,
      "loss": 1.4298,
      "step": 44
    },
    {
      "epoch": 0.056461731493099125,
      "grad_norm": 0.1875,
      "learning_rate": 9.43538268506901e-06,
      "loss": 1.3957,
      "step": 45
    },
    {
      "epoch": 0.05771643663739021,
      "grad_norm": 0.1669921875,
      "learning_rate": 9.422835633626098e-06,
      "loss": 1.3846,
      "step": 46
    },
    {
      "epoch": 0.05897114178168131,
      "grad_norm": 0.1708984375,
      "learning_rate": 9.410288582183188e-06,
      "loss": 1.2841,
      "step": 47
    },
    {
      "epoch": 0.060225846925972396,
      "grad_norm": 0.185546875,
      "learning_rate": 9.397741530740276e-06,
      "loss": 1.3474,
      "step": 48
    },
    {
      "epoch": 0.06148055207026349,
      "grad_norm": 0.1669921875,
      "learning_rate": 9.385194479297366e-06,
      "loss": 1.3209,
      "step": 49
    },
    {
      "epoch": 0.06273525721455459,
      "grad_norm": 0.1962890625,
      "learning_rate": 9.372647427854456e-06,
      "loss": 1.3605,
      "step": 50
    },
    {
      "epoch": 0.06398996235884567,
      "grad_norm": 0.1728515625,
      "learning_rate": 9.360100376411544e-06,
      "loss": 1.284,
      "step": 51
    },
    {
      "epoch": 0.06524466750313676,
      "grad_norm": 0.19140625,
      "learning_rate": 9.347553324968634e-06,
      "loss": 1.3667,
      "step": 52
    },
    {
      "epoch": 0.06649937264742785,
      "grad_norm": 0.177734375,
      "learning_rate": 9.335006273525722e-06,
      "loss": 1.3511,
      "step": 53
    },
    {
      "epoch": 0.06775407779171895,
      "grad_norm": 0.166015625,
      "learning_rate": 9.322459222082812e-06,
      "loss": 1.3237,
      "step": 54
    },
    {
      "epoch": 0.06900878293601004,
      "grad_norm": 0.189453125,
      "learning_rate": 9.309912170639902e-06,
      "loss": 1.4086,
      "step": 55
    },
    {
      "epoch": 0.07026348808030113,
      "grad_norm": 0.1689453125,
      "learning_rate": 9.29736511919699e-06,
      "loss": 1.2681,
      "step": 56
    },
    {
      "epoch": 0.07151819322459223,
      "grad_norm": 0.1640625,
      "learning_rate": 9.28481806775408e-06,
      "loss": 1.2862,
      "step": 57
    },
    {
      "epoch": 0.07277289836888332,
      "grad_norm": 0.1767578125,
      "learning_rate": 9.272271016311168e-06,
      "loss": 1.2984,
      "step": 58
    },
    {
      "epoch": 0.0740276035131744,
      "grad_norm": 0.1650390625,
      "learning_rate": 9.259723964868258e-06,
      "loss": 1.2213,
      "step": 59
    },
    {
      "epoch": 0.07528230865746549,
      "grad_norm": 0.1591796875,
      "learning_rate": 9.247176913425346e-06,
      "loss": 1.2776,
      "step": 60
    },
    {
      "epoch": 0.07653701380175659,
      "grad_norm": 0.1611328125,
      "learning_rate": 9.234629861982434e-06,
      "loss": 1.2724,
      "step": 61
    },
    {
      "epoch": 0.07779171894604768,
      "grad_norm": 0.1591796875,
      "learning_rate": 9.222082810539524e-06,
      "loss": 1.2754,
      "step": 62
    },
    {
      "epoch": 0.07904642409033877,
      "grad_norm": 0.1572265625,
      "learning_rate": 9.209535759096612e-06,
      "loss": 1.2553,
      "step": 63
    },
    {
      "epoch": 0.08030112923462986,
      "grad_norm": 0.1552734375,
      "learning_rate": 9.196988707653702e-06,
      "loss": 1.2712,
      "step": 64
    },
    {
      "epoch": 0.08155583437892096,
      "grad_norm": 0.17578125,
      "learning_rate": 9.18444165621079e-06,
      "loss": 1.3042,
      "step": 65
    },
    {
      "epoch": 0.08281053952321205,
      "grad_norm": 0.2431640625,
      "learning_rate": 9.17189460476788e-06,
      "loss": 1.3251,
      "step": 66
    },
    {
      "epoch": 0.08406524466750313,
      "grad_norm": 0.1845703125,
      "learning_rate": 9.15934755332497e-06,
      "loss": 1.2872,
      "step": 67
    },
    {
      "epoch": 0.08531994981179424,
      "grad_norm": 0.1806640625,
      "learning_rate": 9.146800501882058e-06,
      "loss": 1.3151,
      "step": 68
    },
    {
      "epoch": 0.08657465495608532,
      "grad_norm": 0.146484375,
      "learning_rate": 9.134253450439148e-06,
      "loss": 1.2665,
      "step": 69
    },
    {
      "epoch": 0.08782936010037641,
      "grad_norm": 0.1513671875,
      "learning_rate": 9.121706398996236e-06,
      "loss": 1.253,
      "step": 70
    },
    {
      "epoch": 0.0890840652446675,
      "grad_norm": 0.1533203125,
      "learning_rate": 9.109159347553326e-06,
      "loss": 1.2587,
      "step": 71
    },
    {
      "epoch": 0.0903387703889586,
      "grad_norm": 0.150390625,
      "learning_rate": 9.096612296110416e-06,
      "loss": 1.266,
      "step": 72
    },
    {
      "epoch": 0.09159347553324969,
      "grad_norm": 0.154296875,
      "learning_rate": 9.084065244667504e-06,
      "loss": 1.2809,
      "step": 73
    },
    {
      "epoch": 0.09284818067754078,
      "grad_norm": 0.154296875,
      "learning_rate": 9.071518193224594e-06,
      "loss": 1.2475,
      "step": 74
    },
    {
      "epoch": 0.09410288582183186,
      "grad_norm": 0.1552734375,
      "learning_rate": 9.058971141781682e-06,
      "loss": 1.2244,
      "step": 75
    },
    {
      "epoch": 0.09535759096612297,
      "grad_norm": 0.1611328125,
      "learning_rate": 9.046424090338772e-06,
      "loss": 1.315,
      "step": 76
    },
    {
      "epoch": 0.09661229611041405,
      "grad_norm": 0.162109375,
      "learning_rate": 9.03387703889586e-06,
      "loss": 1.2624,
      "step": 77
    },
    {
      "epoch": 0.09786700125470514,
      "grad_norm": 0.1494140625,
      "learning_rate": 9.02132998745295e-06,
      "loss": 1.2259,
      "step": 78
    },
    {
      "epoch": 0.09912170639899624,
      "grad_norm": 0.1494140625,
      "learning_rate": 9.008782936010038e-06,
      "loss": 1.2157,
      "step": 79
    },
    {
      "epoch": 0.10037641154328733,
      "grad_norm": 0.1611328125,
      "learning_rate": 8.996235884567126e-06,
      "loss": 1.2593,
      "step": 80
    },
    {
      "epoch": 0.10163111668757842,
      "grad_norm": 0.17578125,
      "learning_rate": 8.983688833124216e-06,
      "loss": 1.2992,
      "step": 81
    },
    {
      "epoch": 0.1028858218318695,
      "grad_norm": 0.15625,
      "learning_rate": 8.971141781681304e-06,
      "loss": 1.2249,
      "step": 82
    },
    {
      "epoch": 0.10414052697616061,
      "grad_norm": 0.15234375,
      "learning_rate": 8.958594730238394e-06,
      "loss": 1.2319,
      "step": 83
    },
    {
      "epoch": 0.1053952321204517,
      "grad_norm": 0.154296875,
      "learning_rate": 8.946047678795484e-06,
      "loss": 1.2112,
      "step": 84
    },
    {
      "epoch": 0.10664993726474278,
      "grad_norm": 0.1640625,
      "learning_rate": 8.933500627352572e-06,
      "loss": 1.264,
      "step": 85
    },
    {
      "epoch": 0.10790464240903387,
      "grad_norm": 0.173828125,
      "learning_rate": 8.920953575909662e-06,
      "loss": 1.2381,
      "step": 86
    },
    {
      "epoch": 0.10915934755332497,
      "grad_norm": 0.15625,
      "learning_rate": 8.90840652446675e-06,
      "loss": 1.2084,
      "step": 87
    },
    {
      "epoch": 0.11041405269761606,
      "grad_norm": 0.16015625,
      "learning_rate": 8.89585947302384e-06,
      "loss": 1.2211,
      "step": 88
    },
    {
      "epoch": 0.11166875784190715,
      "grad_norm": 0.1923828125,
      "learning_rate": 8.88331242158093e-06,
      "loss": 1.2351,
      "step": 89
    },
    {
      "epoch": 0.11292346298619825,
      "grad_norm": 0.16015625,
      "learning_rate": 8.870765370138018e-06,
      "loss": 1.2459,
      "step": 90
    },
    {
      "epoch": 0.11417816813048934,
      "grad_norm": 0.1640625,
      "learning_rate": 8.858218318695108e-06,
      "loss": 1.2756,
      "step": 91
    },
    {
      "epoch": 0.11543287327478043,
      "grad_norm": 0.244140625,
      "learning_rate": 8.845671267252196e-06,
      "loss": 1.1769,
      "step": 92
    },
    {
      "epoch": 0.11668757841907151,
      "grad_norm": 0.166015625,
      "learning_rate": 8.833124215809286e-06,
      "loss": 1.2025,
      "step": 93
    },
    {
      "epoch": 0.11794228356336262,
      "grad_norm": 0.177734375,
      "learning_rate": 8.820577164366374e-06,
      "loss": 1.1827,
      "step": 94
    },
    {
      "epoch": 0.1191969887076537,
      "grad_norm": 0.166015625,
      "learning_rate": 8.808030112923464e-06,
      "loss": 1.2374,
      "step": 95
    },
    {
      "epoch": 0.12045169385194479,
      "grad_norm": 0.1572265625,
      "learning_rate": 8.795483061480552e-06,
      "loss": 1.2126,
      "step": 96
    },
    {
      "epoch": 0.12170639899623588,
      "grad_norm": 0.1513671875,
      "learning_rate": 8.782936010037642e-06,
      "loss": 1.2181,
      "step": 97
    },
    {
      "epoch": 0.12296110414052698,
      "grad_norm": 0.15234375,
      "learning_rate": 8.77038895859473e-06,
      "loss": 1.1734,
      "step": 98
    },
    {
      "epoch": 0.12421580928481807,
      "grad_norm": 0.2314453125,
      "learning_rate": 8.75784190715182e-06,
      "loss": 1.1591,
      "step": 99
    },
    {
      "epoch": 0.12547051442910917,
      "grad_norm": 0.1796875,
      "learning_rate": 8.745294855708909e-06,
      "loss": 1.1996,
      "step": 100
    },
    {
      "epoch": 0.12672521957340024,
      "grad_norm": 0.1572265625,
      "learning_rate": 8.732747804265998e-06,
      "loss": 1.2239,
      "step": 101
    },
    {
      "epoch": 0.12797992471769135,
      "grad_norm": 0.1533203125,
      "learning_rate": 8.720200752823087e-06,
      "loss": 1.1277,
      "step": 102
    },
    {
      "epoch": 0.12923462986198245,
      "grad_norm": 0.1669921875,
      "learning_rate": 8.707653701380176e-06,
      "loss": 1.1678,
      "step": 103
    },
    {
      "epoch": 0.13048933500627352,
      "grad_norm": 0.251953125,
      "learning_rate": 8.695106649937265e-06,
      "loss": 1.1846,
      "step": 104
    },
    {
      "epoch": 0.13174404015056462,
      "grad_norm": 0.1591796875,
      "learning_rate": 8.682559598494355e-06,
      "loss": 1.164,
      "step": 105
    },
    {
      "epoch": 0.1329987452948557,
      "grad_norm": 0.1513671875,
      "learning_rate": 8.670012547051444e-06,
      "loss": 1.1754,
      "step": 106
    },
    {
      "epoch": 0.1342534504391468,
      "grad_norm": 0.1748046875,
      "learning_rate": 8.657465495608533e-06,
      "loss": 1.2038,
      "step": 107
    },
    {
      "epoch": 0.1355081555834379,
      "grad_norm": 0.1728515625,
      "learning_rate": 8.644918444165622e-06,
      "loss": 1.2131,
      "step": 108
    },
    {
      "epoch": 0.13676286072772897,
      "grad_norm": 0.1640625,
      "learning_rate": 8.63237139272271e-06,
      "loss": 1.2102,
      "step": 109
    },
    {
      "epoch": 0.13801756587202008,
      "grad_norm": 0.158203125,
      "learning_rate": 8.6198243412798e-06,
      "loss": 1.2427,
      "step": 110
    },
    {
      "epoch": 0.13927227101631118,
      "grad_norm": 0.158203125,
      "learning_rate": 8.607277289836889e-06,
      "loss": 1.1832,
      "step": 111
    },
    {
      "epoch": 0.14052697616060225,
      "grad_norm": 0.16015625,
      "learning_rate": 8.594730238393979e-06,
      "loss": 1.2198,
      "step": 112
    },
    {
      "epoch": 0.14178168130489335,
      "grad_norm": 0.16796875,
      "learning_rate": 8.582183186951067e-06,
      "loss": 1.117,
      "step": 113
    },
    {
      "epoch": 0.14303638644918445,
      "grad_norm": 0.1591796875,
      "learning_rate": 8.569636135508157e-06,
      "loss": 1.166,
      "step": 114
    },
    {
      "epoch": 0.14429109159347553,
      "grad_norm": 0.1552734375,
      "learning_rate": 8.557089084065245e-06,
      "loss": 1.1694,
      "step": 115
    },
    {
      "epoch": 0.14554579673776663,
      "grad_norm": 0.1513671875,
      "learning_rate": 8.544542032622335e-06,
      "loss": 1.1551,
      "step": 116
    },
    {
      "epoch": 0.1468005018820577,
      "grad_norm": 0.1572265625,
      "learning_rate": 8.531994981179423e-06,
      "loss": 1.1501,
      "step": 117
    },
    {
      "epoch": 0.1480552070263488,
      "grad_norm": 0.171875,
      "learning_rate": 8.519447929736513e-06,
      "loss": 1.1958,
      "step": 118
    },
    {
      "epoch": 0.1493099121706399,
      "grad_norm": 0.169921875,
      "learning_rate": 8.506900878293601e-06,
      "loss": 1.1376,
      "step": 119
    },
    {
      "epoch": 0.15056461731493098,
      "grad_norm": 0.1708984375,
      "learning_rate": 8.49435382685069e-06,
      "loss": 1.2132,
      "step": 120
    },
    {
      "epoch": 0.15181932245922208,
      "grad_norm": 0.1708984375,
      "learning_rate": 8.48180677540778e-06,
      "loss": 1.181,
      "step": 121
    },
    {
      "epoch": 0.15307402760351319,
      "grad_norm": 0.203125,
      "learning_rate": 8.469259723964869e-06,
      "loss": 1.2168,
      "step": 122
    },
    {
      "epoch": 0.15432873274780426,
      "grad_norm": 0.1640625,
      "learning_rate": 8.456712672521959e-06,
      "loss": 1.165,
      "step": 123
    },
    {
      "epoch": 0.15558343789209536,
      "grad_norm": 0.181640625,
      "learning_rate": 8.444165621079047e-06,
      "loss": 1.1765,
      "step": 124
    },
    {
      "epoch": 0.15683814303638646,
      "grad_norm": 0.1611328125,
      "learning_rate": 8.431618569636137e-06,
      "loss": 1.1852,
      "step": 125
    },
    {
      "epoch": 0.15809284818067754,
      "grad_norm": 0.1689453125,
      "learning_rate": 8.419071518193225e-06,
      "loss": 1.1537,
      "step": 126
    },
    {
      "epoch": 0.15934755332496864,
      "grad_norm": 0.1640625,
      "learning_rate": 8.406524466750315e-06,
      "loss": 1.1517,
      "step": 127
    },
    {
      "epoch": 0.1606022584692597,
      "grad_norm": 0.15234375,
      "learning_rate": 8.393977415307403e-06,
      "loss": 1.1485,
      "step": 128
    },
    {
      "epoch": 0.1618569636135508,
      "grad_norm": 0.1640625,
      "learning_rate": 8.381430363864493e-06,
      "loss": 1.1894,
      "step": 129
    },
    {
      "epoch": 0.16311166875784192,
      "grad_norm": 0.1650390625,
      "learning_rate": 8.368883312421581e-06,
      "loss": 1.1028,
      "step": 130
    },
    {
      "epoch": 0.164366373902133,
      "grad_norm": 0.255859375,
      "learning_rate": 8.356336260978671e-06,
      "loss": 1.1623,
      "step": 131
    },
    {
      "epoch": 0.1656210790464241,
      "grad_norm": 0.17578125,
      "learning_rate": 8.343789209535759e-06,
      "loss": 1.1596,
      "step": 132
    },
    {
      "epoch": 0.1668757841907152,
      "grad_norm": 0.1640625,
      "learning_rate": 8.331242158092849e-06,
      "loss": 1.1794,
      "step": 133
    },
    {
      "epoch": 0.16813048933500627,
      "grad_norm": 0.162109375,
      "learning_rate": 8.318695106649937e-06,
      "loss": 1.1584,
      "step": 134
    },
    {
      "epoch": 0.16938519447929737,
      "grad_norm": 0.1591796875,
      "learning_rate": 8.306148055207027e-06,
      "loss": 1.1652,
      "step": 135
    },
    {
      "epoch": 0.17063989962358847,
      "grad_norm": 0.154296875,
      "learning_rate": 8.293601003764115e-06,
      "loss": 1.1535,
      "step": 136
    },
    {
      "epoch": 0.17189460476787954,
      "grad_norm": 0.2021484375,
      "learning_rate": 8.281053952321205e-06,
      "loss": 1.1373,
      "step": 137
    },
    {
      "epoch": 0.17314930991217065,
      "grad_norm": 0.1640625,
      "learning_rate": 8.268506900878295e-06,
      "loss": 1.1891,
      "step": 138
    },
    {
      "epoch": 0.17440401505646172,
      "grad_norm": 0.166015625,
      "learning_rate": 8.255959849435383e-06,
      "loss": 1.1435,
      "step": 139
    },
    {
      "epoch": 0.17565872020075282,
      "grad_norm": 0.1826171875,
      "learning_rate": 8.243412797992473e-06,
      "loss": 1.2206,
      "step": 140
    },
    {
      "epoch": 0.17691342534504392,
      "grad_norm": 0.1728515625,
      "learning_rate": 8.230865746549561e-06,
      "loss": 1.1836,
      "step": 141
    },
    {
      "epoch": 0.178168130489335,
      "grad_norm": 0.216796875,
      "learning_rate": 8.218318695106651e-06,
      "loss": 1.163,
      "step": 142
    },
    {
      "epoch": 0.1794228356336261,
      "grad_norm": 0.16796875,
      "learning_rate": 8.205771643663741e-06,
      "loss": 1.1532,
      "step": 143
    },
    {
      "epoch": 0.1806775407779172,
      "grad_norm": 0.185546875,
      "learning_rate": 8.193224592220829e-06,
      "loss": 1.1273,
      "step": 144
    },
    {
      "epoch": 0.18193224592220827,
      "grad_norm": 0.1728515625,
      "learning_rate": 8.180677540777919e-06,
      "loss": 1.1758,
      "step": 145
    },
    {
      "epoch": 0.18318695106649938,
      "grad_norm": 0.15625,
      "learning_rate": 8.168130489335007e-06,
      "loss": 1.1591,
      "step": 146
    },
    {
      "epoch": 0.18444165621079048,
      "grad_norm": 0.166015625,
      "learning_rate": 8.155583437892095e-06,
      "loss": 1.1518,
      "step": 147
    },
    {
      "epoch": 0.18569636135508155,
      "grad_norm": 0.1748046875,
      "learning_rate": 8.143036386449185e-06,
      "loss": 1.083,
      "step": 148
    },
    {
      "epoch": 0.18695106649937265,
      "grad_norm": 0.17578125,
      "learning_rate": 8.130489335006273e-06,
      "loss": 1.2089,
      "step": 149
    },
    {
      "epoch": 0.18820577164366373,
      "grad_norm": 0.171875,
      "learning_rate": 8.117942283563363e-06,
      "loss": 1.1721,
      "step": 150
    },
    {
      "epoch": 0.18946047678795483,
      "grad_norm": 0.2060546875,
      "learning_rate": 8.105395232120451e-06,
      "loss": 1.0729,
      "step": 151
    },
    {
      "epoch": 0.19071518193224593,
      "grad_norm": 0.1640625,
      "learning_rate": 8.092848180677541e-06,
      "loss": 1.1459,
      "step": 152
    },
    {
      "epoch": 0.191969887076537,
      "grad_norm": 0.88671875,
      "learning_rate": 8.08030112923463e-06,
      "loss": 1.0966,
      "step": 153
    },
    {
      "epoch": 0.1932245922208281,
      "grad_norm": 0.1748046875,
      "learning_rate": 8.06775407779172e-06,
      "loss": 1.1786,
      "step": 154
    },
    {
      "epoch": 0.1944792973651192,
      "grad_norm": 0.1728515625,
      "learning_rate": 8.05520702634881e-06,
      "loss": 1.0695,
      "step": 155
    },
    {
      "epoch": 0.19573400250941028,
      "grad_norm": 0.1708984375,
      "learning_rate": 8.042659974905897e-06,
      "loss": 1.1277,
      "step": 156
    },
    {
      "epoch": 0.19698870765370138,
      "grad_norm": 0.181640625,
      "learning_rate": 8.030112923462987e-06,
      "loss": 1.1343,
      "step": 157
    },
    {
      "epoch": 0.19824341279799249,
      "grad_norm": 0.1728515625,
      "learning_rate": 8.017565872020076e-06,
      "loss": 1.1395,
      "step": 158
    },
    {
      "epoch": 0.19949811794228356,
      "grad_norm": 0.310546875,
      "learning_rate": 8.005018820577165e-06,
      "loss": 1.1521,
      "step": 159
    },
    {
      "epoch": 0.20075282308657466,
      "grad_norm": 0.166015625,
      "learning_rate": 7.992471769134255e-06,
      "loss": 1.1283,
      "step": 160
    },
    {
      "epoch": 0.20200752823086573,
      "grad_norm": 0.205078125,
      "learning_rate": 7.979924717691343e-06,
      "loss": 1.2036,
      "step": 161
    },
    {
      "epoch": 0.20326223337515684,
      "grad_norm": 0.1953125,
      "learning_rate": 7.967377666248433e-06,
      "loss": 1.0924,
      "step": 162
    },
    {
      "epoch": 0.20451693851944794,
      "grad_norm": 0.1875,
      "learning_rate": 7.954830614805521e-06,
      "loss": 1.1782,
      "step": 163
    },
    {
      "epoch": 0.205771643663739,
      "grad_norm": 0.16796875,
      "learning_rate": 7.942283563362611e-06,
      "loss": 1.1477,
      "step": 164
    },
    {
      "epoch": 0.20702634880803011,
      "grad_norm": 0.21875,
      "learning_rate": 7.9297365119197e-06,
      "loss": 1.0975,
      "step": 165
    },
    {
      "epoch": 0.20828105395232122,
      "grad_norm": 0.173828125,
      "learning_rate": 7.917189460476788e-06,
      "loss": 1.1117,
      "step": 166
    },
    {
      "epoch": 0.2095357590966123,
      "grad_norm": 0.2060546875,
      "learning_rate": 7.904642409033878e-06,
      "loss": 1.1155,
      "step": 167
    },
    {
      "epoch": 0.2107904642409034,
      "grad_norm": 0.177734375,
      "learning_rate": 7.892095357590966e-06,
      "loss": 1.1577,
      "step": 168
    },
    {
      "epoch": 0.2120451693851945,
      "grad_norm": 0.1806640625,
      "learning_rate": 7.879548306148056e-06,
      "loss": 1.0996,
      "step": 169
    },
    {
      "epoch": 0.21329987452948557,
      "grad_norm": 0.16796875,
      "learning_rate": 7.867001254705144e-06,
      "loss": 1.1129,
      "step": 170
    },
    {
      "epoch": 0.21455457967377667,
      "grad_norm": 0.177734375,
      "learning_rate": 7.854454203262234e-06,
      "loss": 1.1069,
      "step": 171
    },
    {
      "epoch": 0.21580928481806774,
      "grad_norm": 0.1708984375,
      "learning_rate": 7.841907151819324e-06,
      "loss": 1.112,
      "step": 172
    },
    {
      "epoch": 0.21706398996235884,
      "grad_norm": 0.171875,
      "learning_rate": 7.829360100376412e-06,
      "loss": 1.1032,
      "step": 173
    },
    {
      "epoch": 0.21831869510664995,
      "grad_norm": 0.208984375,
      "learning_rate": 7.816813048933502e-06,
      "loss": 1.1991,
      "step": 174
    },
    {
      "epoch": 0.21957340025094102,
      "grad_norm": 0.3359375,
      "learning_rate": 7.80426599749059e-06,
      "loss": 1.1184,
      "step": 175
    },
    {
      "epoch": 0.22082810539523212,
      "grad_norm": 0.18359375,
      "learning_rate": 7.79171894604768e-06,
      "loss": 1.0521,
      "step": 176
    },
    {
      "epoch": 0.22208281053952322,
      "grad_norm": 0.1728515625,
      "learning_rate": 7.77917189460477e-06,
      "loss": 1.1179,
      "step": 177
    },
    {
      "epoch": 0.2233375156838143,
      "grad_norm": 0.1787109375,
      "learning_rate": 7.766624843161858e-06,
      "loss": 1.1509,
      "step": 178
    },
    {
      "epoch": 0.2245922208281054,
      "grad_norm": 0.2734375,
      "learning_rate": 7.754077791718948e-06,
      "loss": 1.1356,
      "step": 179
    },
    {
      "epoch": 0.2258469259723965,
      "grad_norm": 0.2138671875,
      "learning_rate": 7.741530740276036e-06,
      "loss": 1.1048,
      "step": 180
    },
    {
      "epoch": 0.22710163111668757,
      "grad_norm": 0.228515625,
      "learning_rate": 7.728983688833126e-06,
      "loss": 1.1204,
      "step": 181
    },
    {
      "epoch": 0.22835633626097868,
      "grad_norm": 0.2109375,
      "learning_rate": 7.716436637390214e-06,
      "loss": 1.1351,
      "step": 182
    },
    {
      "epoch": 0.22961104140526975,
      "grad_norm": 0.203125,
      "learning_rate": 7.703889585947302e-06,
      "loss": 1.14,
      "step": 183
    },
    {
      "epoch": 0.23086574654956085,
      "grad_norm": 0.16015625,
      "learning_rate": 7.691342534504392e-06,
      "loss": 1.0908,
      "step": 184
    },
    {
      "epoch": 0.23212045169385195,
      "grad_norm": 0.185546875,
      "learning_rate": 7.67879548306148e-06,
      "loss": 1.073,
      "step": 185
    },
    {
      "epoch": 0.23337515683814303,
      "grad_norm": 0.1748046875,
      "learning_rate": 7.66624843161857e-06,
      "loss": 1.1242,
      "step": 186
    },
    {
      "epoch": 0.23462986198243413,
      "grad_norm": 0.177734375,
      "learning_rate": 7.65370138017566e-06,
      "loss": 1.0843,
      "step": 187
    },
    {
      "epoch": 0.23588456712672523,
      "grad_norm": 0.1767578125,
      "learning_rate": 7.641154328732748e-06,
      "loss": 1.1049,
      "step": 188
    },
    {
      "epoch": 0.2371392722710163,
      "grad_norm": 0.1884765625,
      "learning_rate": 7.628607277289838e-06,
      "loss": 1.1389,
      "step": 189
    },
    {
      "epoch": 0.2383939774153074,
      "grad_norm": 0.1650390625,
      "learning_rate": 7.616060225846926e-06,
      "loss": 1.1214,
      "step": 190
    },
    {
      "epoch": 0.2396486825595985,
      "grad_norm": 0.171875,
      "learning_rate": 7.603513174404016e-06,
      "loss": 1.1295,
      "step": 191
    },
    {
      "epoch": 0.24090338770388958,
      "grad_norm": 0.1689453125,
      "learning_rate": 7.590966122961104e-06,
      "loss": 1.1018,
      "step": 192
    },
    {
      "epoch": 0.24215809284818068,
      "grad_norm": 0.1708984375,
      "learning_rate": 7.578419071518194e-06,
      "loss": 1.0607,
      "step": 193
    },
    {
      "epoch": 0.24341279799247176,
      "grad_norm": 0.1884765625,
      "learning_rate": 7.565872020075283e-06,
      "loss": 1.2116,
      "step": 194
    },
    {
      "epoch": 0.24466750313676286,
      "grad_norm": 0.1767578125,
      "learning_rate": 7.553324968632372e-06,
      "loss": 1.0891,
      "step": 195
    },
    {
      "epoch": 0.24592220828105396,
      "grad_norm": 0.18359375,
      "learning_rate": 7.540777917189461e-06,
      "loss": 1.217,
      "step": 196
    },
    {
      "epoch": 0.24717691342534504,
      "grad_norm": 0.1708984375,
      "learning_rate": 7.52823086574655e-06,
      "loss": 1.1137,
      "step": 197
    },
    {
      "epoch": 0.24843161856963614,
      "grad_norm": 0.1796875,
      "learning_rate": 7.515683814303639e-06,
      "loss": 1.0802,
      "step": 198
    },
    {
      "epoch": 0.24968632371392724,
      "grad_norm": 0.1806640625,
      "learning_rate": 7.503136762860729e-06,
      "loss": 1.0763,
      "step": 199
    },
    {
      "epoch": 0.25094102885821834,
      "grad_norm": 0.1943359375,
      "learning_rate": 7.490589711417817e-06,
      "loss": 1.0652,
      "step": 200
    },
    {
      "epoch": 0.2521957340025094,
      "grad_norm": 0.1787109375,
      "learning_rate": 7.478042659974907e-06,
      "loss": 1.0527,
      "step": 201
    },
    {
      "epoch": 0.2534504391468005,
      "grad_norm": 0.189453125,
      "learning_rate": 7.465495608531995e-06,
      "loss": 1.1129,
      "step": 202
    },
    {
      "epoch": 0.2547051442910916,
      "grad_norm": 0.189453125,
      "learning_rate": 7.452948557089085e-06,
      "loss": 1.0906,
      "step": 203
    },
    {
      "epoch": 0.2559598494353827,
      "grad_norm": 0.181640625,
      "learning_rate": 7.440401505646174e-06,
      "loss": 1.0946,
      "step": 204
    },
    {
      "epoch": 0.2572145545796738,
      "grad_norm": 0.169921875,
      "learning_rate": 7.427854454203262e-06,
      "loss": 1.0753,
      "step": 205
    },
    {
      "epoch": 0.2584692597239649,
      "grad_norm": 0.189453125,
      "learning_rate": 7.415307402760352e-06,
      "loss": 1.143,
      "step": 206
    },
    {
      "epoch": 0.25972396486825594,
      "grad_norm": 0.185546875,
      "learning_rate": 7.40276035131744e-06,
      "loss": 1.0865,
      "step": 207
    },
    {
      "epoch": 0.26097867001254704,
      "grad_norm": 0.1845703125,
      "learning_rate": 7.39021329987453e-06,
      "loss": 1.1062,
      "step": 208
    },
    {
      "epoch": 0.26223337515683814,
      "grad_norm": 0.220703125,
      "learning_rate": 7.37766624843162e-06,
      "loss": 1.0765,
      "step": 209
    },
    {
      "epoch": 0.26348808030112925,
      "grad_norm": 0.267578125,
      "learning_rate": 7.365119196988708e-06,
      "loss": 1.0659,
      "step": 210
    },
    {
      "epoch": 0.26474278544542035,
      "grad_norm": 0.205078125,
      "learning_rate": 7.352572145545797e-06,
      "loss": 1.0919,
      "step": 211
    },
    {
      "epoch": 0.2659974905897114,
      "grad_norm": 0.1884765625,
      "learning_rate": 7.340025094102886e-06,
      "loss": 1.0782,
      "step": 212
    },
    {
      "epoch": 0.2672521957340025,
      "grad_norm": 0.1826171875,
      "learning_rate": 7.327478042659975e-06,
      "loss": 1.0881,
      "step": 213
    },
    {
      "epoch": 0.2685069008782936,
      "grad_norm": 0.1767578125,
      "learning_rate": 7.314930991217064e-06,
      "loss": 1.1003,
      "step": 214
    },
    {
      "epoch": 0.2697616060225847,
      "grad_norm": 0.203125,
      "learning_rate": 7.302383939774153e-06,
      "loss": 1.1336,
      "step": 215
    },
    {
      "epoch": 0.2710163111668758,
      "grad_norm": 0.1796875,
      "learning_rate": 7.289836888331243e-06,
      "loss": 1.0792,
      "step": 216
    },
    {
      "epoch": 0.2722710163111669,
      "grad_norm": 0.1689453125,
      "learning_rate": 7.2772898368883315e-06,
      "loss": 1.0272,
      "step": 217
    },
    {
      "epoch": 0.27352572145545795,
      "grad_norm": 0.283203125,
      "learning_rate": 7.264742785445421e-06,
      "loss": 1.0626,
      "step": 218
    },
    {
      "epoch": 0.27478042659974905,
      "grad_norm": 0.1865234375,
      "learning_rate": 7.2521957340025095e-06,
      "loss": 1.0846,
      "step": 219
    },
    {
      "epoch": 0.27603513174404015,
      "grad_norm": 0.181640625,
      "learning_rate": 7.239648682559599e-06,
      "loss": 1.0261,
      "step": 220
    },
    {
      "epoch": 0.27728983688833125,
      "grad_norm": 0.1845703125,
      "learning_rate": 7.2271016311166884e-06,
      "loss": 1.0981,
      "step": 221
    },
    {
      "epoch": 0.27854454203262236,
      "grad_norm": 0.1943359375,
      "learning_rate": 7.2145545796737775e-06,
      "loss": 1.1451,
      "step": 222
    },
    {
      "epoch": 0.2797992471769134,
      "grad_norm": 0.1787109375,
      "learning_rate": 7.2020075282308665e-06,
      "loss": 1.0633,
      "step": 223
    },
    {
      "epoch": 0.2810539523212045,
      "grad_norm": 0.1806640625,
      "learning_rate": 7.189460476787955e-06,
      "loss": 1.0107,
      "step": 224
    },
    {
      "epoch": 0.2823086574654956,
      "grad_norm": 0.232421875,
      "learning_rate": 7.1769134253450445e-06,
      "loss": 1.077,
      "step": 225
    },
    {
      "epoch": 0.2835633626097867,
      "grad_norm": 0.177734375,
      "learning_rate": 7.164366373902134e-06,
      "loss": 1.0807,
      "step": 226
    },
    {
      "epoch": 0.2848180677540778,
      "grad_norm": 0.2080078125,
      "learning_rate": 7.151819322459223e-06,
      "loss": 1.0934,
      "step": 227
    },
    {
      "epoch": 0.2860727728983689,
      "grad_norm": 0.1923828125,
      "learning_rate": 7.1392722710163125e-06,
      "loss": 1.06,
      "step": 228
    },
    {
      "epoch": 0.28732747804265996,
      "grad_norm": 0.1689453125,
      "learning_rate": 7.126725219573401e-06,
      "loss": 1.0939,
      "step": 229
    },
    {
      "epoch": 0.28858218318695106,
      "grad_norm": 0.1982421875,
      "learning_rate": 7.11417816813049e-06,
      "loss": 1.099,
      "step": 230
    },
    {
      "epoch": 0.28983688833124216,
      "grad_norm": 0.1904296875,
      "learning_rate": 7.1016311166875795e-06,
      "loss": 1.1347,
      "step": 231
    },
    {
      "epoch": 0.29109159347553326,
      "grad_norm": 0.1884765625,
      "learning_rate": 7.089084065244668e-06,
      "loss": 1.1201,
      "step": 232
    },
    {
      "epoch": 0.29234629861982436,
      "grad_norm": 0.212890625,
      "learning_rate": 7.076537013801758e-06,
      "loss": 1.1353,
      "step": 233
    },
    {
      "epoch": 0.2936010037641154,
      "grad_norm": 0.1953125,
      "learning_rate": 7.063989962358846e-06,
      "loss": 1.107,
      "step": 234
    },
    {
      "epoch": 0.2948557089084065,
      "grad_norm": 0.2119140625,
      "learning_rate": 7.051442910915936e-06,
      "loss": 1.0844,
      "step": 235
    },
    {
      "epoch": 0.2961104140526976,
      "grad_norm": 0.330078125,
      "learning_rate": 7.038895859473024e-06,
      "loss": 1.1499,
      "step": 236
    },
    {
      "epoch": 0.2973651191969887,
      "grad_norm": 0.201171875,
      "learning_rate": 7.026348808030114e-06,
      "loss": 1.1032,
      "step": 237
    },
    {
      "epoch": 0.2986198243412798,
      "grad_norm": 0.1923828125,
      "learning_rate": 7.013801756587203e-06,
      "loss": 1.0374,
      "step": 238
    },
    {
      "epoch": 0.2998745294855709,
      "grad_norm": 0.1923828125,
      "learning_rate": 7.001254705144292e-06,
      "loss": 1.0793,
      "step": 239
    },
    {
      "epoch": 0.30112923462986196,
      "grad_norm": 0.181640625,
      "learning_rate": 6.988707653701381e-06,
      "loss": 1.0437,
      "step": 240
    },
    {
      "epoch": 0.30238393977415307,
      "grad_norm": 0.201171875,
      "learning_rate": 6.976160602258469e-06,
      "loss": 1.056,
      "step": 241
    },
    {
      "epoch": 0.30363864491844417,
      "grad_norm": 0.212890625,
      "learning_rate": 6.963613550815559e-06,
      "loss": 1.0532,
      "step": 242
    },
    {
      "epoch": 0.30489335006273527,
      "grad_norm": 0.1787109375,
      "learning_rate": 6.951066499372649e-06,
      "loss": 1.0622,
      "step": 243
    },
    {
      "epoch": 0.30614805520702637,
      "grad_norm": 0.1826171875,
      "learning_rate": 6.938519447929737e-06,
      "loss": 1.0593,
      "step": 244
    },
    {
      "epoch": 0.3074027603513174,
      "grad_norm": 0.2158203125,
      "learning_rate": 6.925972396486827e-06,
      "loss": 1.143,
      "step": 245
    },
    {
      "epoch": 0.3086574654956085,
      "grad_norm": 0.20703125,
      "learning_rate": 6.913425345043915e-06,
      "loss": 1.1176,
      "step": 246
    },
    {
      "epoch": 0.3099121706398996,
      "grad_norm": 0.2060546875,
      "learning_rate": 6.900878293601004e-06,
      "loss": 1.0449,
      "step": 247
    },
    {
      "epoch": 0.3111668757841907,
      "grad_norm": 0.197265625,
      "learning_rate": 6.888331242158094e-06,
      "loss": 1.1068,
      "step": 248
    },
    {
      "epoch": 0.3124215809284818,
      "grad_norm": 0.203125,
      "learning_rate": 6.875784190715182e-06,
      "loss": 1.0431,
      "step": 249
    },
    {
      "epoch": 0.3136762860727729,
      "grad_norm": 0.1962890625,
      "learning_rate": 6.863237139272272e-06,
      "loss": 1.0766,
      "step": 250
    },
    {
      "epoch": 0.31493099121706397,
      "grad_norm": 0.22265625,
      "learning_rate": 6.85069008782936e-06,
      "loss": 1.1234,
      "step": 251
    },
    {
      "epoch": 0.3161856963613551,
      "grad_norm": 0.19140625,
      "learning_rate": 6.83814303638645e-06,
      "loss": 1.0631,
      "step": 252
    },
    {
      "epoch": 0.3174404015056462,
      "grad_norm": 0.185546875,
      "learning_rate": 6.825595984943539e-06,
      "loss": 1.0232,
      "step": 253
    },
    {
      "epoch": 0.3186951066499373,
      "grad_norm": 0.216796875,
      "learning_rate": 6.813048933500628e-06,
      "loss": 1.0325,
      "step": 254
    },
    {
      "epoch": 0.3199498117942284,
      "grad_norm": 0.1806640625,
      "learning_rate": 6.800501882057717e-06,
      "loss": 1.0789,
      "step": 255
    },
    {
      "epoch": 0.3212045169385194,
      "grad_norm": 0.1796875,
      "learning_rate": 6.787954830614806e-06,
      "loss": 1.0603,
      "step": 256
    },
    {
      "epoch": 0.3224592220828105,
      "grad_norm": 0.173828125,
      "learning_rate": 6.775407779171895e-06,
      "loss": 1.0139,
      "step": 257
    },
    {
      "epoch": 0.3237139272271016,
      "grad_norm": 0.177734375,
      "learning_rate": 6.762860727728984e-06,
      "loss": 1.074,
      "step": 258
    },
    {
      "epoch": 0.32496863237139273,
      "grad_norm": 0.193359375,
      "learning_rate": 6.750313676286073e-06,
      "loss": 1.1028,
      "step": 259
    },
    {
      "epoch": 0.32622333751568383,
      "grad_norm": 0.19140625,
      "learning_rate": 6.737766624843163e-06,
      "loss": 1.0827,
      "step": 260
    },
    {
      "epoch": 0.32747804265997493,
      "grad_norm": 0.19921875,
      "learning_rate": 6.725219573400251e-06,
      "loss": 1.125,
      "step": 261
    },
    {
      "epoch": 0.328732747804266,
      "grad_norm": 0.189453125,
      "learning_rate": 6.712672521957341e-06,
      "loss": 1.0249,
      "step": 262
    },
    {
      "epoch": 0.3299874529485571,
      "grad_norm": 0.197265625,
      "learning_rate": 6.700125470514429e-06,
      "loss": 1.1003,
      "step": 263
    },
    {
      "epoch": 0.3312421580928482,
      "grad_norm": 0.224609375,
      "learning_rate": 6.687578419071519e-06,
      "loss": 1.1257,
      "step": 264
    },
    {
      "epoch": 0.3324968632371393,
      "grad_norm": 0.1826171875,
      "learning_rate": 6.675031367628608e-06,
      "loss": 1.0234,
      "step": 265
    },
    {
      "epoch": 0.3337515683814304,
      "grad_norm": 0.19921875,
      "learning_rate": 6.662484316185696e-06,
      "loss": 1.0556,
      "step": 266
    },
    {
      "epoch": 0.33500627352572143,
      "grad_norm": 0.197265625,
      "learning_rate": 6.649937264742786e-06,
      "loss": 1.0651,
      "step": 267
    },
    {
      "epoch": 0.33626097867001253,
      "grad_norm": 0.1845703125,
      "learning_rate": 6.637390213299874e-06,
      "loss": 1.0517,
      "step": 268
    },
    {
      "epoch": 0.33751568381430364,
      "grad_norm": 0.1884765625,
      "learning_rate": 6.624843161856964e-06,
      "loss": 1.1119,
      "step": 269
    },
    {
      "epoch": 0.33877038895859474,
      "grad_norm": 0.2109375,
      "learning_rate": 6.612296110414054e-06,
      "loss": 1.0818,
      "step": 270
    },
    {
      "epoch": 0.34002509410288584,
      "grad_norm": 0.18359375,
      "learning_rate": 6.599749058971142e-06,
      "loss": 1.0845,
      "step": 271
    },
    {
      "epoch": 0.34127979924717694,
      "grad_norm": 0.203125,
      "learning_rate": 6.587202007528231e-06,
      "loss": 1.0434,
      "step": 272
    },
    {
      "epoch": 0.342534504391468,
      "grad_norm": 0.1865234375,
      "learning_rate": 6.57465495608532e-06,
      "loss": 1.0789,
      "step": 273
    },
    {
      "epoch": 0.3437892095357591,
      "grad_norm": 0.2177734375,
      "learning_rate": 6.562107904642409e-06,
      "loss": 1.1049,
      "step": 274
    },
    {
      "epoch": 0.3450439146800502,
      "grad_norm": 0.1904296875,
      "learning_rate": 6.549560853199499e-06,
      "loss": 1.0722,
      "step": 275
    },
    {
      "epoch": 0.3462986198243413,
      "grad_norm": 0.1904296875,
      "learning_rate": 6.5370138017565874e-06,
      "loss": 1.0139,
      "step": 276
    },
    {
      "epoch": 0.3475533249686324,
      "grad_norm": 0.326171875,
      "learning_rate": 6.524466750313677e-06,
      "loss": 1.061,
      "step": 277
    },
    {
      "epoch": 0.34880803011292344,
      "grad_norm": 0.1884765625,
      "learning_rate": 6.5119196988707655e-06,
      "loss": 1.0736,
      "step": 278
    },
    {
      "epoch": 0.35006273525721454,
      "grad_norm": 0.296875,
      "learning_rate": 6.499372647427855e-06,
      "loss": 1.0735,
      "step": 279
    },
    {
      "epoch": 0.35131744040150564,
      "grad_norm": 0.2177734375,
      "learning_rate": 6.4868255959849435e-06,
      "loss": 1.1039,
      "step": 280
    },
    {
      "epoch": 0.35257214554579674,
      "grad_norm": 0.3203125,
      "learning_rate": 6.474278544542033e-06,
      "loss": 1.0102,
      "step": 281
    },
    {
      "epoch": 0.35382685069008785,
      "grad_norm": 0.1943359375,
      "learning_rate": 6.4617314930991224e-06,
      "loss": 1.1021,
      "step": 282
    },
    {
      "epoch": 0.35508155583437895,
      "grad_norm": 0.1982421875,
      "learning_rate": 6.4491844416562115e-06,
      "loss": 1.0137,
      "step": 283
    },
    {
      "epoch": 0.35633626097867,
      "grad_norm": 0.1845703125,
      "learning_rate": 6.4366373902133005e-06,
      "loss": 1.1087,
      "step": 284
    },
    {
      "epoch": 0.3575909661229611,
      "grad_norm": 0.1982421875,
      "learning_rate": 6.424090338770389e-06,
      "loss": 1.0688,
      "step": 285
    },
    {
      "epoch": 0.3588456712672522,
      "grad_norm": 0.193359375,
      "learning_rate": 6.4115432873274786e-06,
      "loss": 1.087,
      "step": 286
    },
    {
      "epoch": 0.3601003764115433,
      "grad_norm": 0.3359375,
      "learning_rate": 6.3989962358845684e-06,
      "loss": 1.1308,
      "step": 287
    },
    {
      "epoch": 0.3613550815558344,
      "grad_norm": 0.208984375,
      "learning_rate": 6.386449184441657e-06,
      "loss": 1.1111,
      "step": 288
    },
    {
      "epoch": 0.36260978670012545,
      "grad_norm": 0.24609375,
      "learning_rate": 6.373902132998746e-06,
      "loss": 1.0542,
      "step": 289
    },
    {
      "epoch": 0.36386449184441655,
      "grad_norm": 0.20703125,
      "learning_rate": 6.361355081555835e-06,
      "loss": 1.0971,
      "step": 290
    },
    {
      "epoch": 0.36511919698870765,
      "grad_norm": 0.1982421875,
      "learning_rate": 6.348808030112924e-06,
      "loss": 1.0325,
      "step": 291
    },
    {
      "epoch": 0.36637390213299875,
      "grad_norm": 0.1962890625,
      "learning_rate": 6.3362609786700136e-06,
      "loss": 1.044,
      "step": 292
    },
    {
      "epoch": 0.36762860727728985,
      "grad_norm": 0.7265625,
      "learning_rate": 6.323713927227102e-06,
      "loss": 1.1222,
      "step": 293
    },
    {
      "epoch": 0.36888331242158096,
      "grad_norm": 0.18359375,
      "learning_rate": 6.311166875784192e-06,
      "loss": 1.0249,
      "step": 294
    },
    {
      "epoch": 0.370138017565872,
      "grad_norm": 0.1904296875,
      "learning_rate": 6.29861982434128e-06,
      "loss": 1.0361,
      "step": 295
    },
    {
      "epoch": 0.3713927227101631,
      "grad_norm": 0.19140625,
      "learning_rate": 6.28607277289837e-06,
      "loss": 1.1183,
      "step": 296
    },
    {
      "epoch": 0.3726474278544542,
      "grad_norm": 0.220703125,
      "learning_rate": 6.273525721455459e-06,
      "loss": 1.0808,
      "step": 297
    },
    {
      "epoch": 0.3739021329987453,
      "grad_norm": 0.1953125,
      "learning_rate": 6.260978670012548e-06,
      "loss": 1.0988,
      "step": 298
    },
    {
      "epoch": 0.3751568381430364,
      "grad_norm": 0.1904296875,
      "learning_rate": 6.248431618569637e-06,
      "loss": 1.0883,
      "step": 299
    },
    {
      "epoch": 0.37641154328732745,
      "grad_norm": 0.185546875,
      "learning_rate": 6.235884567126726e-06,
      "loss": 1.053,
      "step": 300
    },
    {
      "epoch": 0.37766624843161856,
      "grad_norm": 0.25,
      "learning_rate": 6.223337515683815e-06,
      "loss": 1.1065,
      "step": 301
    },
    {
      "epoch": 0.37892095357590966,
      "grad_norm": 0.1953125,
      "learning_rate": 6.210790464240903e-06,
      "loss": 1.0535,
      "step": 302
    },
    {
      "epoch": 0.38017565872020076,
      "grad_norm": 0.22265625,
      "learning_rate": 6.198243412797993e-06,
      "loss": 1.1443,
      "step": 303
    },
    {
      "epoch": 0.38143036386449186,
      "grad_norm": 0.18359375,
      "learning_rate": 6.185696361355083e-06,
      "loss": 1.0134,
      "step": 304
    },
    {
      "epoch": 0.38268506900878296,
      "grad_norm": 0.2109375,
      "learning_rate": 6.173149309912171e-06,
      "loss": 1.0806,
      "step": 305
    },
    {
      "epoch": 0.383939774153074,
      "grad_norm": 0.2041015625,
      "learning_rate": 6.160602258469261e-06,
      "loss": 1.0563,
      "step": 306
    },
    {
      "epoch": 0.3851944792973651,
      "grad_norm": 0.1904296875,
      "learning_rate": 6.148055207026349e-06,
      "loss": 1.0994,
      "step": 307
    },
    {
      "epoch": 0.3864491844416562,
      "grad_norm": 0.203125,
      "learning_rate": 6.135508155583438e-06,
      "loss": 1.0843,
      "step": 308
    },
    {
      "epoch": 0.3877038895859473,
      "grad_norm": 0.248046875,
      "learning_rate": 6.122961104140528e-06,
      "loss": 0.9856,
      "step": 309
    },
    {
      "epoch": 0.3889585947302384,
      "grad_norm": 0.357421875,
      "learning_rate": 6.110414052697616e-06,
      "loss": 1.0472,
      "step": 310
    },
    {
      "epoch": 0.39021329987452946,
      "grad_norm": 0.1923828125,
      "learning_rate": 6.097867001254706e-06,
      "loss": 1.0711,
      "step": 311
    },
    {
      "epoch": 0.39146800501882056,
      "grad_norm": 0.197265625,
      "learning_rate": 6.085319949811794e-06,
      "loss": 1.1034,
      "step": 312
    },
    {
      "epoch": 0.39272271016311167,
      "grad_norm": 0.2060546875,
      "learning_rate": 6.072772898368884e-06,
      "loss": 1.0568,
      "step": 313
    },
    {
      "epoch": 0.39397741530740277,
      "grad_norm": 0.2373046875,
      "learning_rate": 6.060225846925973e-06,
      "loss": 1.0413,
      "step": 314
    },
    {
      "epoch": 0.39523212045169387,
      "grad_norm": 0.2314453125,
      "learning_rate": 6.047678795483062e-06,
      "loss": 1.0047,
      "step": 315
    },
    {
      "epoch": 0.39648682559598497,
      "grad_norm": 0.1923828125,
      "learning_rate": 6.035131744040151e-06,
      "loss": 1.0121,
      "step": 316
    },
    {
      "epoch": 0.397741530740276,
      "grad_norm": 0.2470703125,
      "learning_rate": 6.02258469259724e-06,
      "loss": 1.0732,
      "step": 317
    },
    {
      "epoch": 0.3989962358845671,
      "grad_norm": 0.1982421875,
      "learning_rate": 6.010037641154329e-06,
      "loss": 1.092,
      "step": 318
    },
    {
      "epoch": 0.4002509410288582,
      "grad_norm": 0.19140625,
      "learning_rate": 5.997490589711419e-06,
      "loss": 1.0383,
      "step": 319
    },
    {
      "epoch": 0.4015056461731493,
      "grad_norm": 0.2119140625,
      "learning_rate": 5.984943538268507e-06,
      "loss": 1.069,
      "step": 320
    },
    {
      "epoch": 0.4027603513174404,
      "grad_norm": 0.1953125,
      "learning_rate": 5.972396486825597e-06,
      "loss": 0.9865,
      "step": 321
    },
    {
      "epoch": 0.40401505646173147,
      "grad_norm": 0.1904296875,
      "learning_rate": 5.959849435382685e-06,
      "loss": 1.0732,
      "step": 322
    },
    {
      "epoch": 0.40526976160602257,
      "grad_norm": 0.2158203125,
      "learning_rate": 5.947302383939775e-06,
      "loss": 1.1649,
      "step": 323
    },
    {
      "epoch": 0.4065244667503137,
      "grad_norm": 0.1884765625,
      "learning_rate": 5.934755332496863e-06,
      "loss": 1.0262,
      "step": 324
    },
    {
      "epoch": 0.4077791718946048,
      "grad_norm": 0.1923828125,
      "learning_rate": 5.922208281053953e-06,
      "loss": 1.0525,
      "step": 325
    },
    {
      "epoch": 0.4090338770388959,
      "grad_norm": 0.1845703125,
      "learning_rate": 5.909661229611042e-06,
      "loss": 1.0554,
      "step": 326
    },
    {
      "epoch": 0.410288582183187,
      "grad_norm": 0.1943359375,
      "learning_rate": 5.89711417816813e-06,
      "loss": 1.0965,
      "step": 327
    },
    {
      "epoch": 0.411543287327478,
      "grad_norm": 0.2021484375,
      "learning_rate": 5.88456712672522e-06,
      "loss": 1.0792,
      "step": 328
    },
    {
      "epoch": 0.4127979924717691,
      "grad_norm": 0.1953125,
      "learning_rate": 5.872020075282308e-06,
      "loss": 1.056,
      "step": 329
    },
    {
      "epoch": 0.41405269761606023,
      "grad_norm": 0.19140625,
      "learning_rate": 5.859473023839398e-06,
      "loss": 1.0459,
      "step": 330
    },
    {
      "epoch": 0.41530740276035133,
      "grad_norm": 0.19921875,
      "learning_rate": 5.846925972396488e-06,
      "loss": 1.0796,
      "step": 331
    },
    {
      "epoch": 0.41656210790464243,
      "grad_norm": 0.21875,
      "learning_rate": 5.834378920953576e-06,
      "loss": 1.0094,
      "step": 332
    },
    {
      "epoch": 0.4178168130489335,
      "grad_norm": 0.208984375,
      "learning_rate": 5.821831869510665e-06,
      "loss": 1.0475,
      "step": 333
    },
    {
      "epoch": 0.4190715181932246,
      "grad_norm": 0.32421875,
      "learning_rate": 5.809284818067754e-06,
      "loss": 1.0524,
      "step": 334
    },
    {
      "epoch": 0.4203262233375157,
      "grad_norm": 0.1845703125,
      "learning_rate": 5.796737766624843e-06,
      "loss": 1.0325,
      "step": 335
    },
    {
      "epoch": 0.4215809284818068,
      "grad_norm": 0.203125,
      "learning_rate": 5.784190715181933e-06,
      "loss": 1.0461,
      "step": 336
    },
    {
      "epoch": 0.4228356336260979,
      "grad_norm": 0.2265625,
      "learning_rate": 5.7716436637390215e-06,
      "loss": 1.0828,
      "step": 337
    },
    {
      "epoch": 0.424090338770389,
      "grad_norm": 0.2060546875,
      "learning_rate": 5.759096612296111e-06,
      "loss": 1.119,
      "step": 338
    },
    {
      "epoch": 0.42534504391468003,
      "grad_norm": 0.1962890625,
      "learning_rate": 5.7465495608531995e-06,
      "loss": 1.025,
      "step": 339
    },
    {
      "epoch": 0.42659974905897113,
      "grad_norm": 0.20703125,
      "learning_rate": 5.734002509410289e-06,
      "loss": 1.0919,
      "step": 340
    },
    {
      "epoch": 0.42785445420326224,
      "grad_norm": 0.2373046875,
      "learning_rate": 5.721455457967378e-06,
      "loss": 1.081,
      "step": 341
    },
    {
      "epoch": 0.42910915934755334,
      "grad_norm": 0.287109375,
      "learning_rate": 5.7089084065244674e-06,
      "loss": 1.0661,
      "step": 342
    },
    {
      "epoch": 0.43036386449184444,
      "grad_norm": 0.1923828125,
      "learning_rate": 5.6963613550815565e-06,
      "loss": 1.059,
      "step": 343
    },
    {
      "epoch": 0.4316185696361355,
      "grad_norm": 0.1982421875,
      "learning_rate": 5.683814303638645e-06,
      "loss": 1.0253,
      "step": 344
    },
    {
      "epoch": 0.4328732747804266,
      "grad_norm": 0.201171875,
      "learning_rate": 5.6712672521957345e-06,
      "loss": 1.0943,
      "step": 345
    },
    {
      "epoch": 0.4341279799247177,
      "grad_norm": 0.2236328125,
      "learning_rate": 5.658720200752823e-06,
      "loss": 1.0899,
      "step": 346
    },
    {
      "epoch": 0.4353826850690088,
      "grad_norm": 0.2333984375,
      "learning_rate": 5.6461731493099126e-06,
      "loss": 1.0111,
      "step": 347
    },
    {
      "epoch": 0.4366373902132999,
      "grad_norm": 0.1953125,
      "learning_rate": 5.6336260978670024e-06,
      "loss": 1.0214,
      "step": 348
    },
    {
      "epoch": 0.437892095357591,
      "grad_norm": 0.1982421875,
      "learning_rate": 5.621079046424091e-06,
      "loss": 1.1007,
      "step": 349
    },
    {
      "epoch": 0.43914680050188204,
      "grad_norm": 0.197265625,
      "learning_rate": 5.60853199498118e-06,
      "loss": 1.1226,
      "step": 350
    },
    {
      "epoch": 0.44040150564617314,
      "grad_norm": 0.197265625,
      "learning_rate": 5.595984943538269e-06,
      "loss": 1.0233,
      "step": 351
    },
    {
      "epoch": 0.44165621079046424,
      "grad_norm": 0.201171875,
      "learning_rate": 5.583437892095358e-06,
      "loss": 1.0142,
      "step": 352
    },
    {
      "epoch": 0.44291091593475534,
      "grad_norm": 0.2314453125,
      "learning_rate": 5.5708908406524476e-06,
      "loss": 1.0528,
      "step": 353
    },
    {
      "epoch": 0.44416562107904645,
      "grad_norm": 0.234375,
      "learning_rate": 5.558343789209536e-06,
      "loss": 1.0586,
      "step": 354
| }, | |
| { | |
| "epoch": 0.4454203262233375, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 5.545796737766626e-06, | |
| "loss": 1.0455, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.4466750313676286, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 5.533249686323714e-06, | |
| "loss": 1.0394, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.4479297365119197, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 5.520702634880804e-06, | |
| "loss": 1.055, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.4491844416562108, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 5.508155583437893e-06, | |
| "loss": 1.0775, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.4504391468005019, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 5.495608531994982e-06, | |
| "loss": 1.0259, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.451693851944793, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 5.483061480552071e-06, | |
| "loss": 1.0071, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.45294855708908405, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 5.47051442910916e-06, | |
| "loss": 1.0958, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.45420326223337515, | |
| "grad_norm": 0.1875, | |
| "learning_rate": 5.457967377666249e-06, | |
| "loss": 1.0126, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.45545796737766625, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 5.445420326223339e-06, | |
| "loss": 1.0417, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.45671267252195735, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 5.432873274780427e-06, | |
| "loss": 1.1278, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.45796737766624845, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 5.420326223337517e-06, | |
| "loss": 1.0332, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.4592220828105395, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 5.407779171894605e-06, | |
| "loss": 1.0645, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.4604767879548306, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 5.395232120451695e-06, | |
| "loss": 1.0541, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.4617314930991217, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 5.382685069008783e-06, | |
| "loss": 1.05, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.4629861982434128, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 5.370138017565872e-06, | |
| "loss": 1.0079, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.4642409033877039, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 5.357590966122962e-06, | |
| "loss": 1.0665, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.465495608531995, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 5.34504391468005e-06, | |
| "loss": 1.0672, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.46675031367628605, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 5.33249686323714e-06, | |
| "loss": 1.0404, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.46800501882057716, | |
| "grad_norm": 0.19140625, | |
| "learning_rate": 5.319949811794228e-06, | |
| "loss": 1.049, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.46925972396486826, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 5.307402760351318e-06, | |
| "loss": 1.0869, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.47051442910915936, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 5.294855708908407e-06, | |
| "loss": 1.0311, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.47176913425345046, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 5.282308657465496e-06, | |
| "loss": 0.9855, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.4730238393977415, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 5.269761606022585e-06, | |
| "loss": 1.1071, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.4742785445420326, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 5.257214554579674e-06, | |
| "loss": 1.0345, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.4755332496863237, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 5.244667503136763e-06, | |
| "loss": 1.0342, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.4767879548306148, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 5.232120451693853e-06, | |
| "loss": 1.1377, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.4780426599749059, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 5.219573400250941e-06, | |
| "loss": 1.0074, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.479297365119197, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 5.207026348808031e-06, | |
| "loss": 1.1016, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.48055207026348806, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 5.194479297365119e-06, | |
| "loss": 0.9935, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.48180677540777916, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 5.181932245922209e-06, | |
| "loss": 0.9686, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.48306148055207027, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 5.169385194479298e-06, | |
| "loss": 1.0541, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.48431618569636137, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 5.156838143036387e-06, | |
| "loss": 1.0873, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.48557089084065247, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 5.144291091593476e-06, | |
| "loss": 1.0733, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.4868255959849435, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 5.131744040150564e-06, | |
| "loss": 1.0454, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.4880803011292346, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 5.119196988707654e-06, | |
| "loss": 1.0617, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.4893350062735257, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 5.106649937264742e-06, | |
| "loss": 1.0762, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4905897114178168, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 5.094102885821832e-06, | |
| "loss": 1.0593, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.4918444165621079, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 5.081555834378922e-06, | |
| "loss": 1.0697, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.493099121706399, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 5.06900878293601e-06, | |
| "loss": 1.0755, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.49435382685069007, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 5.056461731493099e-06, | |
| "loss": 1.0863, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.49560853199498117, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 5.043914680050188e-06, | |
| "loss": 1.075, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.4968632371392723, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 5.031367628607277e-06, | |
| "loss": 1.0718, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.4981179422835634, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 5.018820577164367e-06, | |
| "loss": 1.0541, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.4993726474278545, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 5.0062735257214555e-06, | |
| "loss": 1.11, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.5006273525721455, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 4.993726474278545e-06, | |
| "loss": 1.0366, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.5018820577164367, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 4.981179422835634e-06, | |
| "loss": 1.0649, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5031367628607277, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 4.968632371392723e-06, | |
| "loss": 1.0277, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.5043914680050188, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 4.9560853199498124e-06, | |
| "loss": 1.0862, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.5056461731493099, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 4.9435382685069015e-06, | |
| "loss": 1.0414, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.506900878293601, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.9309912170639905e-06, | |
| "loss": 1.033, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.5081555834378921, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 4.9184441656210795e-06, | |
| "loss": 1.0621, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5094102885821832, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 4.9058971141781685e-06, | |
| "loss": 1.0961, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.5106649937264742, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 4.8933500627352576e-06, | |
| "loss": 1.0633, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.5119196988707654, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 4.880803011292347e-06, | |
| "loss": 1.0757, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.5131744040150564, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 4.868255959849436e-06, | |
| "loss": 1.0588, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.5144291091593476, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 4.8557089084065255e-06, | |
| "loss": 1.0291, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5156838143036386, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 4.843161856963614e-06, | |
| "loss": 1.0779, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.5169385194479298, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 4.830614805520703e-06, | |
| "loss": 1.0104, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.5181932245922208, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 4.818067754077792e-06, | |
| "loss": 1.0218, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.5194479297365119, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 4.805520702634881e-06, | |
| "loss": 1.0131, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.520702634880803, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 4.792973651191971e-06, | |
| "loss": 1.0428, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5219573400250941, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 4.78042659974906e-06, | |
| "loss": 1.0789, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.5232120451693852, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 4.767879548306149e-06, | |
| "loss": 1.0057, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.5244667503136763, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.755332496863238e-06, | |
| "loss": 1.0565, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.5257214554579673, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 4.742785445420327e-06, | |
| "loss": 1.105, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.5269761606022585, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.730238393977416e-06, | |
| "loss": 1.0279, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5282308657465495, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 4.717691342534505e-06, | |
| "loss": 1.0793, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.5294855708908407, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 4.705144291091594e-06, | |
| "loss": 1.0478, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.5307402760351317, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 4.692597239648683e-06, | |
| "loss": 1.0389, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.5319949811794228, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 4.680050188205772e-06, | |
| "loss": 1.0813, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.533249686323714, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 4.667503136762861e-06, | |
| "loss": 1.0383, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.534504391468005, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.654956085319951e-06, | |
| "loss": 1.0483, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.5357590966122961, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 4.64240903387704e-06, | |
| "loss": 1.06, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.5370138017565872, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 4.629861982434129e-06, | |
| "loss": 1.0391, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.5382685069008782, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 4.617314930991217e-06, | |
| "loss": 1.0405, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.5395232120451694, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 4.604767879548306e-06, | |
| "loss": 1.0204, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5407779171894604, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 4.592220828105395e-06, | |
| "loss": 1.0687, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.5420326223337516, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 4.579673776662485e-06, | |
| "loss": 1.0194, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.5432873274780426, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 4.567126725219574e-06, | |
| "loss": 1.1113, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.5445420326223338, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.554579673776663e-06, | |
| "loss": 1.0448, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.5457967377666249, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 4.542032622333752e-06, | |
| "loss": 1.0532, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.5470514429109159, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 4.529485570890841e-06, | |
| "loss": 1.0412, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.548306148055207, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 4.51693851944793e-06, | |
| "loss": 1.0333, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.5495608531994981, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 4.504391468005019e-06, | |
| "loss": 1.0322, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.5508155583437893, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 4.491844416562108e-06, | |
| "loss": 1.0335, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.5520702634880803, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 4.479297365119197e-06, | |
| "loss": 0.9846, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5533249686323714, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 4.466750313676286e-06, | |
| "loss": 1.0333, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.5545796737766625, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.454203262233375e-06, | |
| "loss": 0.9969, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.5558343789209536, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 4.441656210790465e-06, | |
| "loss": 1.0541, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.5570890840652447, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 4.429109159347554e-06, | |
| "loss": 1.0073, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.5583437892095358, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 4.416562107904643e-06, | |
| "loss": 0.9851, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.5595984943538268, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 4.404015056461732e-06, | |
| "loss": 1.0577, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.560853199498118, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.391468005018821e-06, | |
| "loss": 1.0376, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.562107904642409, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 4.37892095357591e-06, | |
| "loss": 1.0417, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.5633626097867002, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 4.366373902132999e-06, | |
| "loss": 1.018, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.5646173149309912, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 4.353826850690088e-06, | |
| "loss": 1.0216, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5658720200752823, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 4.341279799247177e-06, | |
| "loss": 1.0128, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.5671267252195734, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 4.328732747804266e-06, | |
| "loss": 1.043, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.5683814303638645, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 4.316185696361355e-06, | |
| "loss": 1.0468, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.5696361355081556, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 4.303638644918444e-06, | |
| "loss": 1.1028, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.5708908406524467, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 4.291091593475533e-06, | |
| "loss": 1.0701, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.5721455457967378, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.278544542032622e-06, | |
| "loss": 1.0275, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.5734002509410289, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 4.2659974905897114e-06, | |
| "loss": 1.0543, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.5746549560853199, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 4.2534504391468005e-06, | |
| "loss": 1.0299, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.5759096612296111, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 4.24090338770389e-06, | |
| "loss": 1.0248, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.5771643663739021, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 4.228356336260979e-06, | |
| "loss": 1.084, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5784190715181933, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 4.215809284818068e-06, | |
| "loss": 1.0025, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.5796737766624843, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 4.203262233375157e-06, | |
| "loss": 1.0101, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.5809284818067754, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 4.1907151819322464e-06, | |
| "loss": 1.1198, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.5821831869510665, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 4.1781681304893355e-06, | |
| "loss": 1.0278, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.5834378920953576, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 4.1656210790464245e-06, | |
| "loss": 1.0495, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.5846925972396487, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 4.1530740276035135e-06, | |
| "loss": 1.0356, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.5859473023839398, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 4.1405269761606026e-06, | |
| "loss": 1.0484, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.5872020075282308, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.127979924717692e-06, | |
| "loss": 1.0519, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.588456712672522, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 4.115432873274781e-06, | |
| "loss": 1.0387, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.589711417816813, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 4.1028858218318705e-06, | |
| "loss": 1.0847, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5909661229611042, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 4.0903387703889595e-06, | |
| "loss": 1.0396, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.5922208281053952, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 4.077791718946048e-06, | |
| "loss": 1.0742, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.5934755332496863, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 4.065244667503137e-06, | |
| "loss": 1.0868, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.5947302383939774, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 4.052697616060226e-06, | |
| "loss": 1.0198, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.5959849435382685, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.040150564617315e-06, | |
| "loss": 1.0183, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.5972396486825596, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.027603513174405e-06, | |
| "loss": 0.9979, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.5984943538268507, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 4.015056461731494e-06, | |
| "loss": 1.0757, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.5997490589711418, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 4.002509410288583e-06, | |
| "loss": 1.0298, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.6010037641154329, | |
| "grad_norm": 0.1884765625, | |
| "learning_rate": 3.989962358845672e-06, | |
| "loss": 1.0244, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.6022584692597239, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 3.977415307402761e-06, | |
| "loss": 1.1217, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6035131744040151, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 3.96486825595985e-06, | |
| "loss": 1.0211, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.6047678795483061, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 3.952321204516939e-06, | |
| "loss": 1.0317, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.6060225846925973, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 3.939774153074028e-06, | |
| "loss": 1.0053, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.6072772898368883, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 3.927227101631117e-06, | |
| "loss": 1.0749, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.6085319949811794, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 3.914680050188206e-06, | |
| "loss": 1.0847, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6097867001254705, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 3.902132998745295e-06, | |
| "loss": 1.0502, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.6110414052697616, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 3.889585947302385e-06, | |
| "loss": 0.9839, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.6122961104140527, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 3.877038895859474e-06, | |
| "loss": 1.0372, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.6135508155583438, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 3.864491844416563e-06, | |
| "loss": 1.025, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.6148055207026348, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 3.851944792973651e-06, | |
| "loss": 1.0403, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.616060225846926, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 3.83939774153074e-06, | |
| "loss": 0.9886, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.617314930991217, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 3.82685069008783e-06, | |
| "loss": 1.0598, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.6185696361355082, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.814303638644919e-06, | |
| "loss": 0.9854, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.6198243412797992, | |
| "grad_norm": 0.1923828125, | |
| "learning_rate": 3.801756587202008e-06, | |
| "loss": 1.0404, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.6210790464240903, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 3.789209535759097e-06, | |
| "loss": 1.0512, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.6223337515683814, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 3.776662484316186e-06, | |
| "loss": 1.0205, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.6235884567126725, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 3.764115432873275e-06, | |
| "loss": 1.0123, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.6248431618569636, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 3.7515683814303645e-06, | |
| "loss": 1.0438, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.6260978670012547, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.7390213299874535e-06, | |
| "loss": 0.9792, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.6273525721455459, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 3.7264742785445425e-06, | |
| "loss": 1.057, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6286072772898369, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.713927227101631e-06, | |
| "loss": 1.0221, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.6298619824341279, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 3.70138017565872e-06, | |
| "loss": 1.0133, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.6311166875784191, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 3.68883312421581e-06, | |
| "loss": 1.0232, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.6323713927227101, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 3.6762860727728987e-06, | |
| "loss": 1.0509, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.6336260978670013, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 3.6637390213299877e-06, | |
| "loss": 0.9949, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.6348808030112923, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 3.6511919698870767e-06, | |
| "loss": 1.0611, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.6361355081555834, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 3.6386449184441657e-06, | |
| "loss": 1.0192, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.6373902132998746, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.6260978670012548e-06, | |
| "loss": 1.0595, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.6386449184441656, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 3.6135508155583442e-06, | |
| "loss": 1.004, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.6398996235884568, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.6010037641154332e-06, | |
| "loss": 1.0088, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6411543287327478, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 3.5884567126725223e-06, | |
| "loss": 1.0476, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.6424090338770388, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 3.5759096612296113e-06, | |
| "loss": 1.0425, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.64366373902133, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 3.5633626097867003e-06, | |
| "loss": 1.0592, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.644918444165621, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 3.5508155583437898e-06, | |
| "loss": 0.9867, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.6461731493099122, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 3.538268506900879e-06, | |
| "loss": 1.0498, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.6474278544542033, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.525721455457968e-06, | |
| "loss": 1.0369, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.6486825595984943, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 3.513174404015057e-06, | |
| "loss": 1.0112, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.6499372647427855, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 3.500627352572146e-06, | |
| "loss": 1.0099, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.6511919698870765, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 3.4880803011292345e-06, | |
| "loss": 1.0667, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.6524466750313677, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.4755332496863244e-06, | |
| "loss": 1.0835, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6537013801756587, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 3.4629861982434134e-06, | |
| "loss": 1.0616, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.6549560853199499, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 3.450439146800502e-06, | |
| "loss": 1.0777, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.6562107904642409, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 3.437892095357591e-06, | |
| "loss": 1.0241, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.657465495608532, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 3.42534504391468e-06, | |
| "loss": 1.0476, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.6587202007528231, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 3.4127979924717695e-06, | |
| "loss": 1.0474, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.6599749058971142, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 3.4002509410288585e-06, | |
| "loss": 1.0541, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.6612296110414053, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.3877038895859475e-06, | |
| "loss": 1.0815, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.6624843161856964, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 3.3751568381430366e-06, | |
| "loss": 1.0137, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.6637390213299874, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 3.3626097867001256e-06, | |
| "loss": 1.0295, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.6649937264742786, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 3.3500627352572146e-06, | |
| "loss": 1.0171, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.6662484316185696, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 3.337515683814304e-06, | |
| "loss": 1.0739, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.6675031367628608, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 3.324968632371393e-06, | |
| "loss": 1.0432, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.6687578419071518, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 3.312421580928482e-06, | |
| "loss": 1.0359, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.6700125470514429, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 3.299874529485571e-06, | |
| "loss": 1.0066, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.671267252195734, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 3.28732747804266e-06, | |
| "loss": 1.0246, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.6725219573400251, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 3.2747804265997496e-06, | |
| "loss": 1.0027, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.6737766624843162, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 3.2622333751568387e-06, | |
| "loss": 1.0538, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.6750313676286073, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.2496863237139277e-06, | |
| "loss": 1.0506, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.6762860727728983, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 3.2371392722710167e-06, | |
| "loss": 0.9803, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.6775407779171895, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 3.2245922208281057e-06, | |
| "loss": 1.0177, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6787954830614805, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 3.2120451693851943e-06, | |
| "loss": 1.0454, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.6800501882057717, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 3.1994981179422842e-06, | |
| "loss": 1.0216, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.6813048933500627, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 3.186951066499373e-06, | |
| "loss": 1.0246, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.6825595984943539, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 3.174404015056462e-06, | |
| "loss": 0.9709, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.6838143036386449, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 3.161856963613551e-06, | |
| "loss": 0.9927, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.685069008782936, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 3.14930991217064e-06, | |
| "loss": 1.0039, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.6863237139272271, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 3.1367628607277293e-06, | |
| "loss": 1.0182, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.6875784190715182, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 3.1242158092848184e-06, | |
| "loss": 1.0093, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.6888331242158093, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 3.1116687578419074e-06, | |
| "loss": 1.037, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.6900878293601004, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 3.0991217063989964e-06, | |
| "loss": 0.9981, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6913425345043914, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 3.0865746549560855e-06, | |
| "loss": 1.0494, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.6925972396486826, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.0740276035131745e-06, | |
| "loss": 1.028, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.6938519447929736, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.061480552070264e-06, | |
| "loss": 1.0407, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.6951066499372648, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 3.048933500627353e-06, | |
| "loss": 1.0021, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.6963613550815558, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 3.036386449184442e-06, | |
| "loss": 1.042, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.6976160602258469, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 3.023839397741531e-06, | |
| "loss": 1.0004, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.698870765370138, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 3.01129234629862e-06, | |
| "loss": 1.0578, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.7001254705144291, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 2.9987452948557095e-06, | |
| "loss": 1.0015, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.7013801756587202, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 2.9861982434127985e-06, | |
| "loss": 0.9725, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.7026348808030113, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 2.9736511919698875e-06, | |
| "loss": 1.0287, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.7038895859473023, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 2.9611041405269766e-06, | |
| "loss": 0.9984, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.7051442910915935, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 2.948557089084065e-06, | |
| "loss": 1.0141, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.7063989962358845, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 2.936010037641154e-06, | |
| "loss": 0.9633, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.7076537013801757, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.923462986198244e-06, | |
| "loss": 1.0358, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.7089084065244667, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 2.9109159347553327e-06, | |
| "loss": 1.052, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.7101631116687579, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 2.8983688833124217e-06, | |
| "loss": 1.0014, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.7114178168130489, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 2.8858218318695107e-06, | |
| "loss": 1.0039, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.71267252195734, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 2.8732747804265998e-06, | |
| "loss": 1.0154, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.7139272271016311, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 2.860727728983689e-06, | |
| "loss": 1.0154, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.7151819322459222, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 2.8481806775407782e-06, | |
| "loss": 0.993, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7164366373902133, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 2.8356336260978673e-06, | |
| "loss": 1.0333, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.7176913425345044, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 2.8230865746549563e-06, | |
| "loss": 1.0426, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.7189460476787954, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 2.8105395232120453e-06, | |
| "loss": 1.0889, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.7202007528230866, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 2.7979924717691343e-06, | |
| "loss": 1.0128, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.7214554579673776, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 2.7854454203262238e-06, | |
| "loss": 1.0716, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.7227101631116688, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 2.772898368883313e-06, | |
| "loss": 1.0305, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.7239648682559598, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 2.760351317440402e-06, | |
| "loss": 0.9754, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.7252195734002509, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 2.747804265997491e-06, | |
| "loss": 1.0193, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.726474278544542, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 2.73525721455458e-06, | |
| "loss": 1.0029, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.7277289836888331, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 2.7227101631116693e-06, | |
| "loss": 1.0143, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7289836888331243, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 2.7101631116687584e-06, | |
| "loss": 1.1078, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.7302383939774153, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 2.6976160602258474e-06, | |
| "loss": 0.995, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.7314930991217063, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 2.685069008782936e-06, | |
| "loss": 1.0182, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.7327478042659975, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 2.672521957340025e-06, | |
| "loss": 1.0709, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.7340025094102886, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 2.659974905897114e-06, | |
| "loss": 0.9936, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.7352572145545797, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 2.6474278544542035e-06, | |
| "loss": 1.0406, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.7365119196988708, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 2.6348808030112925e-06, | |
| "loss": 1.0011, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.7377666248431619, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 2.6223337515683816e-06, | |
| "loss": 1.0088, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.739021329987453, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 2.6097867001254706e-06, | |
| "loss": 1.033, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.740276035131744, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 2.5972396486825596e-06, | |
| "loss": 0.9935, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7415307402760352, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 2.584692597239649e-06, | |
| "loss": 1.025, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.7427854454203262, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 2.572145545796738e-06, | |
| "loss": 1.042, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.7440401505646174, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 2.559598494353827e-06, | |
| "loss": 0.9731, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.7452948557089084, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.547051442910916e-06, | |
| "loss": 0.9729, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.7465495608531995, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 2.534504391468005e-06, | |
| "loss": 1.03, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.7478042659974906, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 2.521957340025094e-06, | |
| "loss": 1.0294, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.7490589711417817, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.5094102885821836e-06, | |
| "loss": 1.076, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.7503136762860728, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.4968632371392727e-06, | |
| "loss": 1.0643, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.7515683814303639, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 2.4843161856963617e-06, | |
| "loss": 1.0265, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.7528230865746549, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 2.4717691342534507e-06, | |
| "loss": 1.0221, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7540777917189461, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 2.4592220828105398e-06, | |
| "loss": 1.0333, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 0.7553324968632371, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 2.4466750313676288e-06, | |
| "loss": 0.9877, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.7565872020075283, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.434127979924718e-06, | |
| "loss": 1.0679, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.7578419071518193, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 2.421580928481807e-06, | |
| "loss": 1.066, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.7590966122961104, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 2.409033877038896e-06, | |
| "loss": 1.0166, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.7603513174404015, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 2.3964868255959853e-06, | |
| "loss": 1.0421, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.7616060225846926, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 2.3839397741530743e-06, | |
| "loss": 1.0617, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 0.7628607277289837, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 2.3713927227101634e-06, | |
| "loss": 0.9719, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.7641154328732748, | |
| "grad_norm": 0.25, | |
| "learning_rate": 2.3588456712672524e-06, | |
| "loss": 1.0267, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.7653701380175659, | |
| "grad_norm": 0.25, | |
| "learning_rate": 2.3462986198243414e-06, | |
| "loss": 1.0067, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.766624843161857, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 2.3337515683814304e-06, | |
| "loss": 1.0346, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 0.767879548306148, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 2.32120451693852e-06, | |
| "loss": 0.9765, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.7691342534504392, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 2.3086574654956085e-06, | |
| "loss": 1.0025, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 0.7703889585947302, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 2.2961104140526975e-06, | |
| "loss": 1.0229, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.7716436637390214, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 2.283563362609787e-06, | |
| "loss": 0.9822, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.7728983688833124, | |
| "grad_norm": 0.25, | |
| "learning_rate": 2.271016311166876e-06, | |
| "loss": 1.077, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.7741530740276035, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 2.258469259723965e-06, | |
| "loss": 1.044, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 0.7754077791718946, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 2.245922208281054e-06, | |
| "loss": 1.0348, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.7766624843161857, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 2.233375156838143e-06, | |
| "loss": 0.9906, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 0.7779171894604768, | |
| "grad_norm": 0.1962890625, | |
| "learning_rate": 2.2208281053952325e-06, | |
| "loss": 1.0023, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7791718946047679, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 2.2082810539523216e-06, | |
| "loss": 1.0452, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.7804265997490589, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 2.1957340025094106e-06, | |
| "loss": 1.0391, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.7816813048933501, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 2.1831869510664996e-06, | |
| "loss": 1.0097, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.7829360100376411, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 2.1706398996235886e-06, | |
| "loss": 1.0256, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.7841907151819323, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 2.1580928481806777e-06, | |
| "loss": 1.0621, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.7854454203262233, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 2.1455457967377667e-06, | |
| "loss": 1.0201, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.7867001254705144, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 2.1329987452948557e-06, | |
| "loss": 1.0165, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 0.7879548306148055, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 2.120451693851945e-06, | |
| "loss": 1.0409, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.7892095357590966, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 2.107904642409034e-06, | |
| "loss": 1.0204, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 0.7904642409033877, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 2.0953575909661232e-06, | |
| "loss": 1.0584, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.7917189460476788, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 2.0828105395232122e-06, | |
| "loss": 1.0017, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 0.7929736511919699, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 2.0702634880803013e-06, | |
| "loss": 1.016, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.794228356336261, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 2.0577164366373903e-06, | |
| "loss": 1.0895, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 0.795483061480552, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 2.0451693851944798e-06, | |
| "loss": 1.0615, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.7967377666248432, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 2.0326223337515684e-06, | |
| "loss": 1.0618, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.7979924717691342, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 2.0200752823086574e-06, | |
| "loss": 1.0634, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.7992471769134254, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 2.007528230865747e-06, | |
| "loss": 1.0281, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 0.8005018820577164, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.994981179422836e-06, | |
| "loss": 0.971, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 0.8017565872020075, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.982434127979925e-06, | |
| "loss": 1.0409, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 0.8030112923462986, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.969887076537014e-06, | |
| "loss": 1.0139, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.8042659974905897, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 1.957340025094103e-06, | |
| "loss": 1.0208, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 0.8055207026348808, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1.9447929736511924e-06, | |
| "loss": 1.0015, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.8067754077791719, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 1.9322459222082814e-06, | |
| "loss": 0.989, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 0.8080301129234629, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.91969887076537e-06, | |
| "loss": 1.0381, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.8092848180677541, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.9071518193224595e-06, | |
| "loss": 1.0115, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.8105395232120451, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 1.8946047678795485e-06, | |
| "loss": 1.0244, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 0.8117942283563363, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1.8820577164366375e-06, | |
| "loss": 0.9951, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 0.8130489335006273, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.8695106649937268e-06, | |
| "loss": 1.0914, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.8143036386449184, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.8569636135508156e-06, | |
| "loss": 1.0711, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 0.8155583437892095, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.844416562107905e-06, | |
| "loss": 1.0151, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.8168130489335006, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.8318695106649938e-06, | |
| "loss": 0.9684, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 0.8180677540777918, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 1.8193224592220829e-06, | |
| "loss": 0.9752, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 0.8193224592220828, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 1.8067754077791721e-06, | |
| "loss": 0.9948, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 0.820577164366374, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1.7942283563362611e-06, | |
| "loss": 0.9991, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.821831869510665, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.7816813048933502e-06, | |
| "loss": 0.9931, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.823086574654956, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.7691342534504394e-06, | |
| "loss": 1.0357, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 0.8243412797992472, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1.7565872020075284e-06, | |
| "loss": 1.0357, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 0.8255959849435383, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1.7440401505646172e-06, | |
| "loss": 1.0378, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.8268506900878294, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 1.7314930991217067e-06, | |
| "loss": 1.0116, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 0.8281053952321205, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.7189460476787955e-06, | |
| "loss": 0.9911, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.8293601003764115, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 1.7063989962358847e-06, | |
| "loss": 0.978, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 0.8306148055207027, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.6938519447929738e-06, | |
| "loss": 1.0247, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 0.8318695106649937, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 1.6813048933500628e-06, | |
| "loss": 1.0207, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 0.8331242158092849, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 1.668757841907152e-06, | |
| "loss": 0.9531, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 0.8343789209535759, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1.656210790464241e-06, | |
| "loss": 0.9747, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.835633626097867, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 1.64366373902133e-06, | |
| "loss": 1.0085, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.8368883312421581, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.6311166875784193e-06, | |
| "loss": 0.979, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 0.8381430363864492, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1.6185696361355084e-06, | |
| "loss": 1.0111, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 0.8393977415307403, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.6060225846925972e-06, | |
| "loss": 1.0272, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 0.8406524466750314, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.5934755332496864e-06, | |
| "loss": 1.028, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8419071518193224, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.5809284818067754e-06, | |
| "loss": 1.066, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 0.8431618569636136, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 1.5683814303638647e-06, | |
| "loss": 1.0011, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.8444165621079046, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1.5558343789209537e-06, | |
| "loss": 1.0091, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 0.8456712672521958, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.5432873274780427e-06, | |
| "loss": 0.9805, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 0.8469259723964868, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.530740276035132e-06, | |
| "loss": 1.05, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.848180677540778, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1.518193224592221e-06, | |
| "loss": 1.0343, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 0.849435382685069, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 1.50564617314931e-06, | |
| "loss": 1.058, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 0.8506900878293601, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 1.4930991217063993e-06, | |
| "loss": 1.0374, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 0.8519447929736512, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.4805520702634883e-06, | |
| "loss": 1.0525, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 0.8531994981179423, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.468005018820577e-06, | |
| "loss": 0.9897, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.8544542032622334, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 1.4554579673776663e-06, | |
| "loss": 0.9919, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 0.8557089084065245, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 1.4429109159347554e-06, | |
| "loss": 0.9903, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 0.8569636135508155, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 1.4303638644918446e-06, | |
| "loss": 1.0151, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 0.8582183186951067, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1.4178168130489336e-06, | |
| "loss": 1.0299, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 0.8594730238393977, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.4052697616060227e-06, | |
| "loss": 1.0317, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.8607277289836889, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.3927227101631119e-06, | |
| "loss": 1.0427, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.8619824341279799, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 1.380175658720201e-06, | |
| "loss": 1.0358, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 0.863237139272271, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 1.36762860727729e-06, | |
| "loss": 1.0435, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.8644918444165621, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1.3550815558343792e-06, | |
| "loss": 1.0404, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 0.8657465495608532, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.342534504391468e-06, | |
| "loss": 1.0497, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8670012547051443, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 1.329987452948557e-06, | |
| "loss": 0.9816, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 0.8682559598494354, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.3174404015056463e-06, | |
| "loss": 0.9882, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 0.8695106649937264, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 1.3048933500627353e-06, | |
| "loss": 1.0454, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 0.8707653701380176, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.2923462986198245e-06, | |
| "loss": 1.0426, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 0.8720200752823086, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1.2797992471769136e-06, | |
| "loss": 1.0392, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.8732747804265998, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 1.2672521957340026e-06, | |
| "loss": 1.0051, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 0.8745294855708908, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 1.2547051442910918e-06, | |
| "loss": 1.0095, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 0.875784190715182, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.2421580928481808e-06, | |
| "loss": 1.0611, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 0.877038895859473, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.2296110414052699e-06, | |
| "loss": 1.0009, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 0.8782936010037641, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 1.217063989962359e-06, | |
| "loss": 1.0206, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8795483061480552, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 1.204516938519448e-06, | |
| "loss": 1.0475, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 0.8808030112923463, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 1.1919698870765372e-06, | |
| "loss": 0.9919, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 0.8820577164366374, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 1.1794228356336262e-06, | |
| "loss": 1.0449, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 0.8833124215809285, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.1668757841907152e-06, | |
| "loss": 1.0439, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.8845671267252195, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 1.1543287327478042e-06, | |
| "loss": 1.0178, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.8858218318695107, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.1417816813048935e-06, | |
| "loss": 1.0516, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 0.8870765370138017, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.1292346298619825e-06, | |
| "loss": 1.1199, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 0.8883312421580929, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.1166875784190715e-06, | |
| "loss": 1.0612, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 0.8895859473023839, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.1041405269761608e-06, | |
| "loss": 1.0453, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 0.890840652446675, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 1.0915934755332498e-06, | |
| "loss": 0.9853, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.8920953575909661, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 1.0790464240903388e-06, | |
| "loss": 1.0574, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 0.8933500627352572, | |
| "grad_norm": 0.24609375, | |
| "learning_rate": 1.0664993726474279e-06, | |
| "loss": 1.0094, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.8946047678795483, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 1.053952321204517e-06, | |
| "loss": 1.0675, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 0.8958594730238394, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1.0414052697616061e-06, | |
| "loss": 1.0285, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.8971141781681304, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 1.0288582183186952e-06, | |
| "loss": 1.065, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.8983688833124216, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 1.0163111668757842e-06, | |
| "loss": 1.0149, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 0.8996235884567126, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.0037641154328734e-06, | |
| "loss": 1.0481, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 0.9008782936010038, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 9.912170639899624e-07, | |
| "loss": 1.0658, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 0.9021329987452948, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 9.786700125470515e-07, | |
| "loss": 1.0236, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 0.903387703889586, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 9.661229611041407e-07, | |
| "loss": 1.0063, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.904642409033877, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 9.535759096612297e-07, | |
| "loss": 1.0015, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 0.9058971141781681, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 9.410288582183188e-07, | |
| "loss": 0.9525, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 0.9071518193224593, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 9.284818067754078e-07, | |
| "loss": 1.0432, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 0.9084065244667503, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 9.159347553324969e-07, | |
| "loss": 1.0255, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 0.9096612296110415, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 9.033877038895861e-07, | |
| "loss": 1.0301, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.9109159347553325, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 8.908406524466751e-07, | |
| "loss": 1.0571, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 0.9121706398996235, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 8.782936010037642e-07, | |
| "loss": 1.0558, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 0.9134253450439147, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 8.657465495608533e-07, | |
| "loss": 1.045, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.9146800501882058, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.531994981179424e-07, | |
| "loss": 1.0567, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 0.9159347553324969, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 8.406524466750314e-07, | |
| "loss": 1.0789, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.917189460476788, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 8.281053952321205e-07, | |
| "loss": 1.0029, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 0.918444165621079, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 8.155583437892097e-07, | |
| "loss": 1.0012, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 0.9196988707653702, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 8.030112923462986e-07, | |
| "loss": 1.0303, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 0.9209535759096612, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 7.904642409033877e-07, | |
| "loss": 1.081, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 0.9222082810539524, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 7.779171894604768e-07, | |
| "loss": 0.973, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.9234629861982434, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 7.65370138017566e-07, | |
| "loss": 1.0737, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.9247176913425345, | |
| "grad_norm": 0.201171875, | |
| "learning_rate": 7.52823086574655e-07, | |
| "loss": 1.0283, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 0.9259723964868256, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 7.402760351317441e-07, | |
| "loss": 0.9872, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 0.9272271016311167, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 7.277289836888332e-07, | |
| "loss": 1.0571, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 0.9284818067754078, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 7.151819322459223e-07, | |
| "loss": 0.9945, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.9297365119196989, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 7.026348808030113e-07, | |
| "loss": 1.0531, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 0.93099121706399, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 6.900878293601005e-07, | |
| "loss": 1.014, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 0.9322459222082811, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 6.775407779171896e-07, | |
| "loss": 0.9942, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 0.9335006273525721, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 6.649937264742785e-07, | |
| "loss": 1.0488, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 0.9347553324968633, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 6.524466750313676e-07, | |
| "loss": 1.0216, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.9360100376411543, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 6.398996235884568e-07, | |
| "loss": 1.0371, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 0.9372647427854455, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 6.273525721455459e-07, | |
| "loss": 1.0918, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 0.9385194479297365, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 6.148055207026349e-07, | |
| "loss": 1.0376, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 0.9397741530740276, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 6.02258469259724e-07, | |
| "loss": 1.013, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 0.9410288582183187, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 5.897114178168131e-07, | |
| "loss": 1.0821, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9422835633626098, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 5.771643663739021e-07, | |
| "loss": 1.0363, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 0.9435382685069009, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 5.646173149309913e-07, | |
| "loss": 1.0259, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 0.944792973651192, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 5.520702634880804e-07, | |
| "loss": 1.0375, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 0.946047678795483, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 5.395232120451694e-07, | |
| "loss": 0.9884, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 0.9473023839397742, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 5.269761606022585e-07, | |
| "loss": 0.9806, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.9485570890840652, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 5.144291091593476e-07, | |
| "loss": 1.0393, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 0.9498117942283564, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 5.018820577164367e-07, | |
| "loss": 1.0035, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 0.9510664993726474, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 4.893350062735257e-07, | |
| "loss": 1.0313, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 0.9523212045169385, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 4.7678795483061487e-07, | |
| "loss": 1.0483, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 0.9535759096612296, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 4.642409033877039e-07, | |
| "loss": 1.026, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9548306148055207, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 4.5169385194479303e-07, | |
| "loss": 1.0188, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 0.9560853199498118, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 4.391468005018821e-07, | |
| "loss": 1.0507, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 0.9573400250941029, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 4.265997490589712e-07, | |
| "loss": 1.003, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 0.958594730238394, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.1405269761606027e-07, | |
| "loss": 1.0485, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 0.9598494353826851, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 4.015056461731493e-07, | |
| "loss": 1.1192, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.9611041405269761, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.889585947302384e-07, | |
| "loss": 0.9953, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 0.9623588456712673, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 3.764115432873275e-07, | |
| "loss": 1.0077, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 0.9636135508155583, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.638644918444166e-07, | |
| "loss": 1.0298, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.9648682559598495, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 3.5131744040150566e-07, | |
| "loss": 1.0681, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 0.9661229611041405, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 3.387703889585948e-07, | |
| "loss": 1.0081, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.9673776662484316, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 3.262233375156838e-07, | |
| "loss": 1.0426, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 0.9686323713927227, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 3.1367628607277296e-07, | |
| "loss": 1.0218, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 0.9698870765370138, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 3.01129234629862e-07, | |
| "loss": 1.0126, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 0.9711417816813049, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 2.8858218318695106e-07, | |
| "loss": 1.0691, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 0.972396486825596, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 2.760351317440402e-07, | |
| "loss": 0.963, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.973651191969887, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 2.634880803011293e-07, | |
| "loss": 1.0191, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 0.9749058971141782, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 2.5094102885821835e-07, | |
| "loss": 1.0333, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 0.9761606022584692, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 2.3839397741530743e-07, | |
| "loss": 1.0624, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 0.9774153074027604, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 2.2584692597239651e-07, | |
| "loss": 1.0546, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 0.9786700125470514, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 2.132998745294856e-07, | |
| "loss": 1.0582, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9799247176913425, | |
| "grad_norm": 0.19921875, | |
| "learning_rate": 2.0075282308657465e-07, | |
| "loss": 1.0658, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 0.9811794228356336, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.8820577164366375e-07, | |
| "loss": 1.0176, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 0.9824341279799247, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.7565872020075283e-07, | |
| "loss": 1.0045, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 0.9836888331242158, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1.631116687578419e-07, | |
| "loss": 1.0487, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 0.9849435382685069, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 1.50564617314931e-07, | |
| "loss": 0.9917, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.986198243412798, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.380175658720201e-07, | |
| "loss": 1.0878, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 0.9874529485570891, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.2547051442910918e-07, | |
| "loss": 1.0832, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 0.9887076537013801, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1.1292346298619826e-07, | |
| "loss": 1.0357, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 0.9899623588456713, | |
| "grad_norm": 0.2001953125, | |
| "learning_rate": 1.0037641154328732e-07, | |
| "loss": 0.9572, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 0.9912170639899623, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 8.782936010037642e-08, | |
| "loss": 1.0341, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9924717691342535, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 7.52823086574655e-08, | |
| "loss": 1.0047, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 0.9937264742785445, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 6.273525721455459e-08, | |
| "loss": 0.9957, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 0.9949811794228356, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 5.018820577164366e-08, | |
| "loss": 1.0317, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 0.9962358845671268, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 3.764115432873275e-08, | |
| "loss": 1.0379, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 0.9974905897114178, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 2.509410288582183e-08, | |
| "loss": 0.9841, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.998745294855709, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 1.2547051442910915e-08, | |
| "loss": 1.0625, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 0.0, | |
| "loss": 0.9426, | |
| "step": 797 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 797, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.5590252305802854e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
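
The object above is a complete training-state record: `log_history` holds one entry per optimizer step (`logging_steps` is 1.0), each carrying `epoch`, `grad_norm`, `learning_rate`, and `loss`, and the trailing fields describe the run as a whole (`max_steps`, `num_train_epochs`, `total_flos`, and so on). As a minimal sketch of how one might inspect it, the snippet below loads the file and summarizes the loss and learning-rate trajectory. The filename `trainer_state.json` is an assumption here, the name the Hugging Face Trainer conventionally writes, since the document itself does not say where this JSON lives.

```python
import json

# Assumption: the JSON object above is saved as "trainer_state.json",
# the conventional Hugging Face Trainer output name; adjust the path as needed.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Each entry was logged every step, so the list length should match max_steps.
first, last = history[0], history[-1]
print(f"steps logged : {len(history)} (max_steps = {state['max_steps']})")
print(f"loss         : {first['loss']:.4f} -> {last['loss']:.4f}")
print(f"learning rate: {first['learning_rate']:.3e} -> {last['learning_rate']:.3e}")

# Mean loss over the final 10% of steps, as a rough convergence check.
tail = history[-(len(history) // 10):]
mean_tail_loss = sum(entry["loss"] for entry in tail) / len(tail)
print(f"mean loss over final 10% of steps: {mean_tail_loss:.4f}")
```

For this particular log, such a summary would show the loss falling from roughly 1.88 at step 1 to about 0.94 at step 797 while the learning rate decays linearly to zero, consistent with a single-epoch run under a linear schedule.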