| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 5859, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0008533879501621437, | |
| "grad_norm": 3.0286532193579454, | |
| "learning_rate": 6.825938566552902e-07, | |
| "loss": 0.8686, | |
| "num_tokens": 495968.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0017067759003242873, | |
| "grad_norm": 2.525679240316591, | |
| "learning_rate": 1.5358361774744028e-06, | |
| "loss": 0.8704, | |
| "num_tokens": 949756.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.002560163850486431, | |
| "grad_norm": 1.9983918195326824, | |
| "learning_rate": 2.389078498293516e-06, | |
| "loss": 0.8775, | |
| "num_tokens": 1481214.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0034135518006485747, | |
| "grad_norm": 1.5158059434437512, | |
| "learning_rate": 3.242320819112628e-06, | |
| "loss": 0.7941, | |
| "num_tokens": 1934239.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.004266939750810718, | |
| "grad_norm": 1.185720928601531, | |
| "learning_rate": 4.095563139931741e-06, | |
| "loss": 0.7538, | |
| "num_tokens": 2404000.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.005120327700972862, | |
| "grad_norm": 1.0193202716561145, | |
| "learning_rate": 4.948805460750854e-06, | |
| "loss": 0.7231, | |
| "num_tokens": 2810536.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005973715651135006, | |
| "grad_norm": 0.7449002364466364, | |
| "learning_rate": 5.802047781569966e-06, | |
| "loss": 0.7001, | |
| "num_tokens": 3246200.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.006827103601297149, | |
| "grad_norm": 0.777521320667087, | |
| "learning_rate": 6.655290102389079e-06, | |
| "loss": 0.6787, | |
| "num_tokens": 3759778.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.007680491551459293, | |
| "grad_norm": 0.8546190268475989, | |
| "learning_rate": 7.508532423208192e-06, | |
| "loss": 0.659, | |
| "num_tokens": 4203683.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.008533879501621437, | |
| "grad_norm": 0.6293638972673229, | |
| "learning_rate": 8.361774744027304e-06, | |
| "loss": 0.6758, | |
| "num_tokens": 4719221.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.00938726745178358, | |
| "grad_norm": 0.7894895920979316, | |
| "learning_rate": 9.215017064846417e-06, | |
| "loss": 0.6915, | |
| "num_tokens": 5163232.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.010240655401945725, | |
| "grad_norm": 0.7040452581545864, | |
| "learning_rate": 1.006825938566553e-05, | |
| "loss": 0.6845, | |
| "num_tokens": 5645472.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.011094043352107869, | |
| "grad_norm": 0.7320437900527926, | |
| "learning_rate": 1.0921501706484643e-05, | |
| "loss": 0.6526, | |
| "num_tokens": 6152548.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.011947431302270013, | |
| "grad_norm": 0.6868940154580355, | |
| "learning_rate": 1.1774744027303754e-05, | |
| "loss": 0.6613, | |
| "num_tokens": 6675650.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.012800819252432157, | |
| "grad_norm": 0.72738951564176, | |
| "learning_rate": 1.2627986348122867e-05, | |
| "loss": 0.6508, | |
| "num_tokens": 7179896.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.013654207202594299, | |
| "grad_norm": 0.6694108895858425, | |
| "learning_rate": 1.348122866894198e-05, | |
| "loss": 0.6401, | |
| "num_tokens": 7670069.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.014507595152756443, | |
| "grad_norm": 0.6570142771571191, | |
| "learning_rate": 1.4334470989761092e-05, | |
| "loss": 0.6382, | |
| "num_tokens": 8133039.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.015360983102918587, | |
| "grad_norm": 0.7092691750950824, | |
| "learning_rate": 1.5187713310580206e-05, | |
| "loss": 0.6093, | |
| "num_tokens": 8562223.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.01621437105308073, | |
| "grad_norm": 0.7683319522376421, | |
| "learning_rate": 1.604095563139932e-05, | |
| "loss": 0.6582, | |
| "num_tokens": 9031725.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.017067759003242873, | |
| "grad_norm": 0.7853748225785034, | |
| "learning_rate": 1.689419795221843e-05, | |
| "loss": 0.62, | |
| "num_tokens": 9514590.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.017921146953405017, | |
| "grad_norm": 0.7913695039148809, | |
| "learning_rate": 1.7747440273037545e-05, | |
| "loss": 0.6074, | |
| "num_tokens": 9961118.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.01877453490356716, | |
| "grad_norm": 0.765278282758163, | |
| "learning_rate": 1.8600682593856656e-05, | |
| "loss": 0.6036, | |
| "num_tokens": 10423653.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.019627922853729305, | |
| "grad_norm": 0.737359678644543, | |
| "learning_rate": 1.945392491467577e-05, | |
| "loss": 0.6295, | |
| "num_tokens": 10873524.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.02048131080389145, | |
| "grad_norm": 0.8463903264435886, | |
| "learning_rate": 2.0307167235494882e-05, | |
| "loss": 0.5886, | |
| "num_tokens": 11370767.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.021334698754053593, | |
| "grad_norm": 0.8218570320834011, | |
| "learning_rate": 2.1160409556313997e-05, | |
| "loss": 0.6334, | |
| "num_tokens": 11821842.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.022188086704215737, | |
| "grad_norm": 0.8592640918273211, | |
| "learning_rate": 2.201365187713311e-05, | |
| "loss": 0.601, | |
| "num_tokens": 12205003.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.02304147465437788, | |
| "grad_norm": 0.839535180775933, | |
| "learning_rate": 2.286689419795222e-05, | |
| "loss": 0.6258, | |
| "num_tokens": 12736074.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.023894862604540025, | |
| "grad_norm": 0.7332931189874, | |
| "learning_rate": 2.3720136518771334e-05, | |
| "loss": 0.6046, | |
| "num_tokens": 13221634.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.02474825055470217, | |
| "grad_norm": 0.721466862687616, | |
| "learning_rate": 2.4573378839590446e-05, | |
| "loss": 0.5945, | |
| "num_tokens": 13677274.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.025601638504864313, | |
| "grad_norm": 0.7630811442307591, | |
| "learning_rate": 2.5426621160409557e-05, | |
| "loss": 0.6029, | |
| "num_tokens": 14132790.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.026455026455026454, | |
| "grad_norm": 0.8018808988597567, | |
| "learning_rate": 2.627986348122867e-05, | |
| "loss": 0.607, | |
| "num_tokens": 14642152.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.027308414405188598, | |
| "grad_norm": 0.7966504856806722, | |
| "learning_rate": 2.7133105802047783e-05, | |
| "loss": 0.6078, | |
| "num_tokens": 15120586.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.02816180235535074, | |
| "grad_norm": 0.7610090989738578, | |
| "learning_rate": 2.7986348122866894e-05, | |
| "loss": 0.6199, | |
| "num_tokens": 15590756.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.029015190305512886, | |
| "grad_norm": 0.8061028238209393, | |
| "learning_rate": 2.883959044368601e-05, | |
| "loss": 0.6317, | |
| "num_tokens": 16123604.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.02986857825567503, | |
| "grad_norm": 0.7197932440435224, | |
| "learning_rate": 2.969283276450512e-05, | |
| "loss": 0.6305, | |
| "num_tokens": 16624908.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.030721966205837174, | |
| "grad_norm": 0.7274806538295826, | |
| "learning_rate": 3.054607508532423e-05, | |
| "loss": 0.5712, | |
| "num_tokens": 17045024.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03157535415599932, | |
| "grad_norm": 0.7452276800406582, | |
| "learning_rate": 3.139931740614335e-05, | |
| "loss": 0.6119, | |
| "num_tokens": 17491604.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.03242874210616146, | |
| "grad_norm": 0.7247133102440203, | |
| "learning_rate": 3.225255972696246e-05, | |
| "loss": 0.6097, | |
| "num_tokens": 17979516.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.033282130056323606, | |
| "grad_norm": 0.7138442249840892, | |
| "learning_rate": 3.310580204778157e-05, | |
| "loss": 0.6164, | |
| "num_tokens": 18455308.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.034135518006485746, | |
| "grad_norm": 0.7082062304392969, | |
| "learning_rate": 3.395904436860068e-05, | |
| "loss": 0.5727, | |
| "num_tokens": 18909989.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.034988905956647894, | |
| "grad_norm": 0.8829540258966007, | |
| "learning_rate": 3.48122866894198e-05, | |
| "loss": 0.6191, | |
| "num_tokens": 19416675.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.035842293906810034, | |
| "grad_norm": 0.730098109447111, | |
| "learning_rate": 3.5665529010238906e-05, | |
| "loss": 0.6068, | |
| "num_tokens": 19893037.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.03669568185697218, | |
| "grad_norm": 0.7429557918229422, | |
| "learning_rate": 3.6518771331058024e-05, | |
| "loss": 0.5895, | |
| "num_tokens": 20366027.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.03754906980713432, | |
| "grad_norm": 0.7453517512344715, | |
| "learning_rate": 3.7372013651877135e-05, | |
| "loss": 0.5704, | |
| "num_tokens": 20794568.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.03840245775729647, | |
| "grad_norm": 0.7383975579411907, | |
| "learning_rate": 3.822525597269625e-05, | |
| "loss": 0.6044, | |
| "num_tokens": 21260320.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.03925584570745861, | |
| "grad_norm": 0.7219581885586872, | |
| "learning_rate": 3.907849829351536e-05, | |
| "loss": 0.609, | |
| "num_tokens": 21708539.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.04010923365762076, | |
| "grad_norm": 0.7434667782326754, | |
| "learning_rate": 3.9931740614334476e-05, | |
| "loss": 0.6185, | |
| "num_tokens": 22189255.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.0409626216077829, | |
| "grad_norm": 0.7471902731707051, | |
| "learning_rate": 4.078498293515359e-05, | |
| "loss": 0.6035, | |
| "num_tokens": 22645536.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.04181600955794504, | |
| "grad_norm": 0.8260924079758846, | |
| "learning_rate": 4.16382252559727e-05, | |
| "loss": 0.5939, | |
| "num_tokens": 23150858.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.042669397508107186, | |
| "grad_norm": 0.9770949190134531, | |
| "learning_rate": 4.249146757679181e-05, | |
| "loss": 0.6074, | |
| "num_tokens": 23599448.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.04352278545826933, | |
| "grad_norm": 0.6942914410312829, | |
| "learning_rate": 4.334470989761093e-05, | |
| "loss": 0.6448, | |
| "num_tokens": 24122236.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.044376173408431474, | |
| "grad_norm": 0.8358450078356706, | |
| "learning_rate": 4.419795221843004e-05, | |
| "loss": 0.6069, | |
| "num_tokens": 24609009.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.045229561358593615, | |
| "grad_norm": 0.6873369891467112, | |
| "learning_rate": 4.505119453924915e-05, | |
| "loss": 0.6102, | |
| "num_tokens": 25127844.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.04608294930875576, | |
| "grad_norm": 0.7986576789478481, | |
| "learning_rate": 4.590443686006826e-05, | |
| "loss": 0.614, | |
| "num_tokens": 25618632.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0469363372589179, | |
| "grad_norm": 0.715451742276675, | |
| "learning_rate": 4.675767918088737e-05, | |
| "loss": 0.6063, | |
| "num_tokens": 26169254.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.04778972520908005, | |
| "grad_norm": 0.8051537516590759, | |
| "learning_rate": 4.7610921501706484e-05, | |
| "loss": 0.6003, | |
| "num_tokens": 26641035.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.04864311315924219, | |
| "grad_norm": 0.7167582367990758, | |
| "learning_rate": 4.84641638225256e-05, | |
| "loss": 0.5607, | |
| "num_tokens": 27097384.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.04949650110940434, | |
| "grad_norm": 0.7808871755447675, | |
| "learning_rate": 4.931740614334471e-05, | |
| "loss": 0.6015, | |
| "num_tokens": 27517779.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.05034988905956648, | |
| "grad_norm": 0.7183612803121492, | |
| "learning_rate": 4.999999641601773e-05, | |
| "loss": 0.5638, | |
| "num_tokens": 27987518.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.051203277009728626, | |
| "grad_norm": 0.7548605219041664, | |
| "learning_rate": 4.999987097675823e-05, | |
| "loss": 0.5847, | |
| "num_tokens": 28445983.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05205666495989077, | |
| "grad_norm": 0.7817939751955983, | |
| "learning_rate": 4.99995663395271e-05, | |
| "loss": 0.603, | |
| "num_tokens": 28963399.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.05291005291005291, | |
| "grad_norm": 0.7552334057933794, | |
| "learning_rate": 4.999908250675058e-05, | |
| "loss": 0.5937, | |
| "num_tokens": 29489865.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.053763440860215055, | |
| "grad_norm": 0.6288087877883393, | |
| "learning_rate": 4.999841948228211e-05, | |
| "loss": 0.6033, | |
| "num_tokens": 29966875.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.054616828810377195, | |
| "grad_norm": 0.696835112652273, | |
| "learning_rate": 4.999757727140229e-05, | |
| "loss": 0.6113, | |
| "num_tokens": 30423007.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.05547021676053934, | |
| "grad_norm": 0.7890055201591463, | |
| "learning_rate": 4.999655588081883e-05, | |
| "loss": 0.6014, | |
| "num_tokens": 30910795.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.05632360471070148, | |
| "grad_norm": 0.6717095131099273, | |
| "learning_rate": 4.999535531866646e-05, | |
| "loss": 0.5751, | |
| "num_tokens": 31372257.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.05717699266086363, | |
| "grad_norm": 0.6081791595492333, | |
| "learning_rate": 4.9993975594506975e-05, | |
| "loss": 0.5976, | |
| "num_tokens": 31867525.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.05803038061102577, | |
| "grad_norm": 0.6396688201978674, | |
| "learning_rate": 4.999241671932903e-05, | |
| "loss": 0.6209, | |
| "num_tokens": 32356676.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.05888376856118792, | |
| "grad_norm": 0.7306473150506871, | |
| "learning_rate": 4.999067870554814e-05, | |
| "loss": 0.6044, | |
| "num_tokens": 32843045.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.05973715651135006, | |
| "grad_norm": 0.7021163516010843, | |
| "learning_rate": 4.9988761567006536e-05, | |
| "loss": 0.5942, | |
| "num_tokens": 33261796.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.06059054446151221, | |
| "grad_norm": 0.6838658059399225, | |
| "learning_rate": 4.998666531897308e-05, | |
| "loss": 0.6237, | |
| "num_tokens": 33766969.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.06144393241167435, | |
| "grad_norm": 0.6192632249147999, | |
| "learning_rate": 4.998438997814312e-05, | |
| "loss": 0.5752, | |
| "num_tokens": 34237001.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.06229732036183649, | |
| "grad_norm": 0.689236992875899, | |
| "learning_rate": 4.9981935562638395e-05, | |
| "loss": 0.609, | |
| "num_tokens": 34708958.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.06315070831199864, | |
| "grad_norm": 0.7007363174304856, | |
| "learning_rate": 4.997930209200684e-05, | |
| "loss": 0.5978, | |
| "num_tokens": 35167253.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.06400409626216078, | |
| "grad_norm": 0.6318579789369254, | |
| "learning_rate": 4.997648958722248e-05, | |
| "loss": 0.5881, | |
| "num_tokens": 35600959.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.06485748421232292, | |
| "grad_norm": 0.7209145192109564, | |
| "learning_rate": 4.997349807068521e-05, | |
| "loss": 0.5843, | |
| "num_tokens": 36074352.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.06571087216248507, | |
| "grad_norm": 0.6223675543509836, | |
| "learning_rate": 4.997032756622068e-05, | |
| "loss": 0.6161, | |
| "num_tokens": 36536345.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.06656426011264721, | |
| "grad_norm": 0.721034308790531, | |
| "learning_rate": 4.996697809908006e-05, | |
| "loss": 0.6537, | |
| "num_tokens": 37055456.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.06741764806280935, | |
| "grad_norm": 0.6039820100318718, | |
| "learning_rate": 4.9963449695939824e-05, | |
| "loss": 0.5821, | |
| "num_tokens": 37525136.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.06827103601297149, | |
| "grad_norm": 0.5705818697410519, | |
| "learning_rate": 4.995974238490161e-05, | |
| "loss": 0.5798, | |
| "num_tokens": 37967720.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06912442396313365, | |
| "grad_norm": 0.6099061850172808, | |
| "learning_rate": 4.9955856195491904e-05, | |
| "loss": 0.6119, | |
| "num_tokens": 38455711.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.06997781191329579, | |
| "grad_norm": 0.601509963047287, | |
| "learning_rate": 4.995179115866189e-05, | |
| "loss": 0.6038, | |
| "num_tokens": 38924279.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.07083119986345793, | |
| "grad_norm": 0.6516172840976665, | |
| "learning_rate": 4.994754730678713e-05, | |
| "loss": 0.5917, | |
| "num_tokens": 39365740.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.07168458781362007, | |
| "grad_norm": 0.6492304941088707, | |
| "learning_rate": 4.994312467366738e-05, | |
| "loss": 0.6142, | |
| "num_tokens": 39853351.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.07253797576378221, | |
| "grad_norm": 0.5717704530037617, | |
| "learning_rate": 4.9938523294526243e-05, | |
| "loss": 0.5829, | |
| "num_tokens": 40318042.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.07339136371394436, | |
| "grad_norm": 0.6834024044791193, | |
| "learning_rate": 4.993374320601095e-05, | |
| "loss": 0.5942, | |
| "num_tokens": 40811069.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.0742447516641065, | |
| "grad_norm": 0.6730108524901087, | |
| "learning_rate": 4.992878444619203e-05, | |
| "loss": 0.6059, | |
| "num_tokens": 41251105.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.07509813961426864, | |
| "grad_norm": 0.8960819502457008, | |
| "learning_rate": 4.992364705456304e-05, | |
| "loss": 0.6162, | |
| "num_tokens": 41833783.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.07595152756443078, | |
| "grad_norm": 0.617788988253723, | |
| "learning_rate": 4.991833107204022e-05, | |
| "loss": 0.5815, | |
| "num_tokens": 42286798.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.07680491551459294, | |
| "grad_norm": 0.6111038230390965, | |
| "learning_rate": 4.9912836540962165e-05, | |
| "loss": 0.5833, | |
| "num_tokens": 42744744.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.07765830346475508, | |
| "grad_norm": 0.5828108068064031, | |
| "learning_rate": 4.9907163505089535e-05, | |
| "loss": 0.5892, | |
| "num_tokens": 43207628.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.07851169141491722, | |
| "grad_norm": 0.6736375719669128, | |
| "learning_rate": 4.9901312009604665e-05, | |
| "loss": 0.6307, | |
| "num_tokens": 43682290.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.07936507936507936, | |
| "grad_norm": 0.7822391020414807, | |
| "learning_rate": 4.989528210111117e-05, | |
| "loss": 0.6117, | |
| "num_tokens": 44130246.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.08021846731524152, | |
| "grad_norm": 0.6483232346253414, | |
| "learning_rate": 4.98890738276337e-05, | |
| "loss": 0.5683, | |
| "num_tokens": 44609387.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.08107185526540366, | |
| "grad_norm": 0.6896598710950818, | |
| "learning_rate": 4.988268723861739e-05, | |
| "loss": 0.5636, | |
| "num_tokens": 45061108.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.0819252432155658, | |
| "grad_norm": 0.548581280039598, | |
| "learning_rate": 4.9876122384927606e-05, | |
| "loss": 0.6117, | |
| "num_tokens": 45550959.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.08277863116572794, | |
| "grad_norm": 0.6748364052208712, | |
| "learning_rate": 4.9869379318849456e-05, | |
| "loss": 0.6258, | |
| "num_tokens": 45986485.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.08363201911589008, | |
| "grad_norm": 0.6138711843161757, | |
| "learning_rate": 4.9862458094087435e-05, | |
| "loss": 0.5883, | |
| "num_tokens": 46433321.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.08448540706605223, | |
| "grad_norm": 0.6078598449693189, | |
| "learning_rate": 4.985535876576493e-05, | |
| "loss": 0.5866, | |
| "num_tokens": 46947467.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.08533879501621437, | |
| "grad_norm": 0.593344206253376, | |
| "learning_rate": 4.984808139042385e-05, | |
| "loss": 0.6042, | |
| "num_tokens": 47474526.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08619218296637651, | |
| "grad_norm": 0.5558189448035136, | |
| "learning_rate": 4.9840626026024094e-05, | |
| "loss": 0.59, | |
| "num_tokens": 47949962.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.08704557091653865, | |
| "grad_norm": 0.6750404265864367, | |
| "learning_rate": 4.983299273194318e-05, | |
| "loss": 0.5981, | |
| "num_tokens": 48425336.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.08789895886670081, | |
| "grad_norm": 0.6875523162723854, | |
| "learning_rate": 4.982518156897573e-05, | |
| "loss": 0.5865, | |
| "num_tokens": 48890592.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.08875234681686295, | |
| "grad_norm": 0.6263009826847921, | |
| "learning_rate": 4.981719259933295e-05, | |
| "loss": 0.5931, | |
| "num_tokens": 49377672.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.08960573476702509, | |
| "grad_norm": 0.5948003939336438, | |
| "learning_rate": 4.980902588664219e-05, | |
| "loss": 0.6157, | |
| "num_tokens": 49831838.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.09045912271718723, | |
| "grad_norm": 0.6697241517559412, | |
| "learning_rate": 4.9800681495946424e-05, | |
| "loss": 0.5649, | |
| "num_tokens": 50297482.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.09131251066734938, | |
| "grad_norm": 0.613163692284079, | |
| "learning_rate": 4.979215949370372e-05, | |
| "loss": 0.5586, | |
| "num_tokens": 50730374.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.09216589861751152, | |
| "grad_norm": 0.7206338791809827, | |
| "learning_rate": 4.9783459947786706e-05, | |
| "loss": 0.6116, | |
| "num_tokens": 51201531.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.09301928656767366, | |
| "grad_norm": 0.5933034005027407, | |
| "learning_rate": 4.977458292748204e-05, | |
| "loss": 0.5874, | |
| "num_tokens": 51684617.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.0938726745178358, | |
| "grad_norm": 0.5893663575107254, | |
| "learning_rate": 4.9765528503489875e-05, | |
| "loss": 0.5683, | |
| "num_tokens": 52135060.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.09472606246799795, | |
| "grad_norm": 0.6035532915383235, | |
| "learning_rate": 4.975629674792326e-05, | |
| "loss": 0.6048, | |
| "num_tokens": 52573799.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.0955794504181601, | |
| "grad_norm": 0.6144257714701433, | |
| "learning_rate": 4.974688773430759e-05, | |
| "loss": 0.5834, | |
| "num_tokens": 53019691.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.09643283836832224, | |
| "grad_norm": 0.5518110283195418, | |
| "learning_rate": 4.973730153758e-05, | |
| "loss": 0.5562, | |
| "num_tokens": 53511517.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.09728622631848438, | |
| "grad_norm": 0.5818056396954733, | |
| "learning_rate": 4.972753823408882e-05, | |
| "loss": 0.6097, | |
| "num_tokens": 54021567.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.09813961426864652, | |
| "grad_norm": 0.577461092437619, | |
| "learning_rate": 4.9717597901592886e-05, | |
| "loss": 0.5653, | |
| "num_tokens": 54461654.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.09899300221880868, | |
| "grad_norm": 0.6991934472224808, | |
| "learning_rate": 4.970748061926097e-05, | |
| "loss": 0.5922, | |
| "num_tokens": 54928756.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.09984639016897082, | |
| "grad_norm": 0.6358541800058106, | |
| "learning_rate": 4.9697186467671194e-05, | |
| "loss": 0.562, | |
| "num_tokens": 55386599.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.10069977811913296, | |
| "grad_norm": 0.5328728738798645, | |
| "learning_rate": 4.968671552881026e-05, | |
| "loss": 0.6014, | |
| "num_tokens": 55901721.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.1015531660692951, | |
| "grad_norm": 0.609398607060316, | |
| "learning_rate": 4.967606788607292e-05, | |
| "loss": 0.6196, | |
| "num_tokens": 56417852.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.10240655401945725, | |
| "grad_norm": 0.6065711379933088, | |
| "learning_rate": 4.966524362426128e-05, | |
| "loss": 0.6016, | |
| "num_tokens": 56919336.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.10325994196961939, | |
| "grad_norm": 0.5890448973594594, | |
| "learning_rate": 4.965424282958407e-05, | |
| "loss": 0.5595, | |
| "num_tokens": 57379202.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.10411332991978153, | |
| "grad_norm": 0.5742101726310437, | |
| "learning_rate": 4.964306558965604e-05, | |
| "loss": 0.547, | |
| "num_tokens": 57859693.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.10496671786994367, | |
| "grad_norm": 0.6231911585667101, | |
| "learning_rate": 4.963171199349718e-05, | |
| "loss": 0.5823, | |
| "num_tokens": 58341560.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.10582010582010581, | |
| "grad_norm": 0.6821003782356443, | |
| "learning_rate": 4.9620182131532074e-05, | |
| "loss": 0.5795, | |
| "num_tokens": 58769487.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.10667349377026797, | |
| "grad_norm": 0.5631906571826463, | |
| "learning_rate": 4.960847609558916e-05, | |
| "loss": 0.5721, | |
| "num_tokens": 59258506.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.10752688172043011, | |
| "grad_norm": 0.5142703050495413, | |
| "learning_rate": 4.959659397889998e-05, | |
| "loss": 0.5579, | |
| "num_tokens": 59686216.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.10838026967059225, | |
| "grad_norm": 0.5742893668014999, | |
| "learning_rate": 4.958453587609848e-05, | |
| "loss": 0.5956, | |
| "num_tokens": 60209917.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.10923365762075439, | |
| "grad_norm": 0.5905079706800682, | |
| "learning_rate": 4.9572301883220196e-05, | |
| "loss": 0.5738, | |
| "num_tokens": 60660649.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.11008704557091654, | |
| "grad_norm": 0.5413711607353467, | |
| "learning_rate": 4.955989209770155e-05, | |
| "loss": 0.5465, | |
| "num_tokens": 61104796.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.11094043352107869, | |
| "grad_norm": 0.5094102285050546, | |
| "learning_rate": 4.954730661837904e-05, | |
| "loss": 0.5816, | |
| "num_tokens": 61640951.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.11179382147124083, | |
| "grad_norm": 0.5144298015548795, | |
| "learning_rate": 4.9534545545488454e-05, | |
| "loss": 0.5393, | |
| "num_tokens": 62159397.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.11264720942140297, | |
| "grad_norm": 0.4640681049452894, | |
| "learning_rate": 4.952160898066408e-05, | |
| "loss": 0.5624, | |
| "num_tokens": 62673288.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.1135005973715651, | |
| "grad_norm": 0.5268826997576247, | |
| "learning_rate": 4.950849702693789e-05, | |
| "loss": 0.5647, | |
| "num_tokens": 63170258.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.11435398532172726, | |
| "grad_norm": 0.585871721899771, | |
| "learning_rate": 4.949520978873874e-05, | |
| "loss": 0.5866, | |
| "num_tokens": 63654789.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.1152073732718894, | |
| "grad_norm": 0.5375998830175728, | |
| "learning_rate": 4.9481747371891495e-05, | |
| "loss": 0.5436, | |
| "num_tokens": 64118880.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.11606076122205154, | |
| "grad_norm": 0.6293152238030233, | |
| "learning_rate": 4.946810988361623e-05, | |
| "loss": 0.5811, | |
| "num_tokens": 64575366.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.11691414917221368, | |
| "grad_norm": 0.5401915658803644, | |
| "learning_rate": 4.945429743252737e-05, | |
| "loss": 0.5771, | |
| "num_tokens": 65068665.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.11776753712237584, | |
| "grad_norm": 0.5692216727893749, | |
| "learning_rate": 4.9440310128632784e-05, | |
| "loss": 0.5731, | |
| "num_tokens": 65572577.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.11862092507253798, | |
| "grad_norm": 0.5320137928944675, | |
| "learning_rate": 4.942614808333296e-05, | |
| "loss": 0.581, | |
| "num_tokens": 66002894.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.11947431302270012, | |
| "grad_norm": 0.5640049851505579, | |
| "learning_rate": 4.9411811409420094e-05, | |
| "loss": 0.5693, | |
| "num_tokens": 66496209.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.12032770097286226, | |
| "grad_norm": 0.5458617864702557, | |
| "learning_rate": 4.9397300221077194e-05, | |
| "loss": 0.5978, | |
| "num_tokens": 66964609.0, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.12118108892302441, | |
| "grad_norm": 0.5866675467349187, | |
| "learning_rate": 4.9382614633877156e-05, | |
| "loss": 0.609, | |
| "num_tokens": 67477381.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.12203447687318655, | |
| "grad_norm": 0.4970303089416509, | |
| "learning_rate": 4.936775476478187e-05, | |
| "loss": 0.5697, | |
| "num_tokens": 67962305.0, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.1228878648233487, | |
| "grad_norm": 0.5614890511018131, | |
| "learning_rate": 4.93527207321413e-05, | |
| "loss": 0.576, | |
| "num_tokens": 68438217.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.12374125277351083, | |
| "grad_norm": 0.5120792577932561, | |
| "learning_rate": 4.933751265569247e-05, | |
| "loss": 0.6112, | |
| "num_tokens": 68941561.0, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.12459464072367298, | |
| "grad_norm": 0.5283082577260544, | |
| "learning_rate": 4.9322130656558604e-05, | |
| "loss": 0.5424, | |
| "num_tokens": 69455969.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.12544802867383512, | |
| "grad_norm": 0.5616205031638513, | |
| "learning_rate": 4.9306574857248065e-05, | |
| "loss": 0.5943, | |
| "num_tokens": 69925259.0, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.12630141662399727, | |
| "grad_norm": 0.5425785467585231, | |
| "learning_rate": 4.929084538165349e-05, | |
| "loss": 0.573, | |
| "num_tokens": 70344364.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.12715480457415942, | |
| "grad_norm": 0.6148651550045969, | |
| "learning_rate": 4.9274942355050705e-05, | |
| "loss": 0.5858, | |
| "num_tokens": 70825137.0, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.12800819252432155, | |
| "grad_norm": 0.5639134123568423, | |
| "learning_rate": 4.9258865904097775e-05, | |
| "loss": 0.6037, | |
| "num_tokens": 71311780.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.1288615804744837, | |
| "grad_norm": 0.4791384052443658, | |
| "learning_rate": 4.924261615683398e-05, | |
| "loss": 0.5797, | |
| "num_tokens": 71771403.0, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.12971496842464583, | |
| "grad_norm": 0.5283721678870286, | |
| "learning_rate": 4.922619324267881e-05, | |
| "loss": 0.5429, | |
| "num_tokens": 72264786.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.130568356374808, | |
| "grad_norm": 0.515474724310633, | |
| "learning_rate": 4.920959729243091e-05, | |
| "loss": 0.5451, | |
| "num_tokens": 72783486.0, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.13142174432497014, | |
| "grad_norm": 0.5063163947916296, | |
| "learning_rate": 4.919282843826709e-05, | |
| "loss": 0.5416, | |
| "num_tokens": 73254319.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.13227513227513227, | |
| "grad_norm": 0.578040826086601, | |
| "learning_rate": 4.91758868137412e-05, | |
| "loss": 0.5998, | |
| "num_tokens": 73798412.0, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.13312852022529442, | |
| "grad_norm": 0.5365069210268434, | |
| "learning_rate": 4.9158772553783105e-05, | |
| "loss": 0.524, | |
| "num_tokens": 74233642.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.13398190817545655, | |
| "grad_norm": 0.4746567806870269, | |
| "learning_rate": 4.914148579469763e-05, | |
| "loss": 0.5629, | |
| "num_tokens": 74755335.0, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.1348352961256187, | |
| "grad_norm": 0.6006250041714432, | |
| "learning_rate": 4.912402667416344e-05, | |
| "loss": 0.5873, | |
| "num_tokens": 75216559.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.13568868407578086, | |
| "grad_norm": 0.4987790879034688, | |
| "learning_rate": 4.910639533123193e-05, | |
| "loss": 0.5916, | |
| "num_tokens": 75696654.0, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.13654207202594298, | |
| "grad_norm": 0.5517797030367481, | |
| "learning_rate": 4.90885919063262e-05, | |
| "loss": 0.5581, | |
| "num_tokens": 76173511.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.13739545997610514, | |
| "grad_norm": 0.4744993493511542, | |
| "learning_rate": 4.907061654123982e-05, | |
| "loss": 0.5331, | |
| "num_tokens": 76679661.0, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.1382488479262673, | |
| "grad_norm": 0.5874392405495928, | |
| "learning_rate": 4.9052469379135796e-05, | |
| "loss": 0.5893, | |
| "num_tokens": 77137128.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.13910223587642942, | |
| "grad_norm": 0.5540441946788993, | |
| "learning_rate": 4.903415056454539e-05, | |
| "loss": 0.5728, | |
| "num_tokens": 77626807.0, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.13995562382659157, | |
| "grad_norm": 0.5853176802045126, | |
| "learning_rate": 4.901566024336696e-05, | |
| "loss": 0.5617, | |
| "num_tokens": 78183541.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.1408090117767537, | |
| "grad_norm": 0.5582202817600866, | |
| "learning_rate": 4.899699856286484e-05, | |
| "loss": 0.5866, | |
| "num_tokens": 78680973.0, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.14166239972691586, | |
| "grad_norm": 0.48999412544075355, | |
| "learning_rate": 4.8978165671668086e-05, | |
| "loss": 0.572, | |
| "num_tokens": 79163656.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.142515787677078, | |
| "grad_norm": 0.5187531268739033, | |
| "learning_rate": 4.8959161719769395e-05, | |
| "loss": 0.5298, | |
| "num_tokens": 79656446.0, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.14336917562724014, | |
| "grad_norm": 0.5331984170314205, | |
| "learning_rate": 4.893998685852385e-05, | |
| "loss": 0.5864, | |
| "num_tokens": 80166162.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.1442225635774023, | |
| "grad_norm": 0.5838027837650953, | |
| "learning_rate": 4.892064124064768e-05, | |
| "loss": 0.5505, | |
| "num_tokens": 80614083.0, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.14507595152756442, | |
| "grad_norm": 0.6154999668022753, | |
| "learning_rate": 4.8901125020217165e-05, | |
| "loss": 0.5638, | |
| "num_tokens": 81121834.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.14592933947772657, | |
| "grad_norm": 0.5329930140256497, | |
| "learning_rate": 4.888143835266726e-05, | |
| "loss": 0.5753, | |
| "num_tokens": 81548843.0, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.14678272742788873, | |
| "grad_norm": 0.778416962414645, | |
| "learning_rate": 4.8861581394790484e-05, | |
| "loss": 0.5789, | |
| "num_tokens": 81987283.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.14763611537805085, | |
| "grad_norm": 0.49093588854464204, | |
| "learning_rate": 4.884155430473557e-05, | |
| "loss": 0.5975, | |
| "num_tokens": 82483501.0, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.148489503328213, | |
| "grad_norm": 0.4738397778103487, | |
| "learning_rate": 4.882135724200628e-05, | |
| "loss": 0.5683, | |
| "num_tokens": 82994121.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.14934289127837516, | |
| "grad_norm": 0.5322703357793086, | |
| "learning_rate": 4.8800990367460106e-05, | |
| "loss": 0.5878, | |
| "num_tokens": 83455702.0, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.1501962792285373, | |
| "grad_norm": 0.5459376179054927, | |
| "learning_rate": 4.878045384330698e-05, | |
| "loss": 0.5473, | |
| "num_tokens": 83915537.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.15104966717869944, | |
| "grad_norm": 0.6439284154409579, | |
| "learning_rate": 4.875974783310799e-05, | |
| "loss": 0.5325, | |
| "num_tokens": 84333632.0, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.15190305512886157, | |
| "grad_norm": 0.5021062558868914, | |
| "learning_rate": 4.873887250177408e-05, | |
| "loss": 0.5691, | |
| "num_tokens": 84755157.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.15275644307902372, | |
| "grad_norm": 0.5055675191745028, | |
| "learning_rate": 4.871782801556476e-05, | |
| "loss": 0.5475, | |
| "num_tokens": 85242601.0, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.15360983102918588, | |
| "grad_norm": 0.48851252306585596, | |
| "learning_rate": 4.869661454208671e-05, | |
| "loss": 0.554, | |
| "num_tokens": 85690047.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.154463218979348, | |
| "grad_norm": 0.5915533823195631, | |
| "learning_rate": 4.867523225029253e-05, | |
| "loss": 0.6066, | |
| "num_tokens": 86206408.0, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.15531660692951016, | |
| "grad_norm": 0.5212785293408069, | |
| "learning_rate": 4.865368131047933e-05, | |
| "loss": 0.5962, | |
| "num_tokens": 86736902.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.15616999487967229, | |
| "grad_norm": 0.47624820822860364, | |
| "learning_rate": 4.8631961894287436e-05, | |
| "loss": 0.5952, | |
| "num_tokens": 87189432.0, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.15702338282983444, | |
| "grad_norm": 0.5351869092993649, | |
| "learning_rate": 4.861007417469895e-05, | |
| "loss": 0.5667, | |
| "num_tokens": 87705484.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.1578767707799966, | |
| "grad_norm": 0.545933784355766, | |
| "learning_rate": 4.858801832603643e-05, | |
| "loss": 0.5525, | |
| "num_tokens": 88156167.0, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.15873015873015872, | |
| "grad_norm": 0.5275104728583638, | |
| "learning_rate": 4.856579452396148e-05, | |
| "loss": 0.5576, | |
| "num_tokens": 88652585.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.15958354668032088, | |
| "grad_norm": 0.56997016737521, | |
| "learning_rate": 4.854340294547334e-05, | |
| "loss": 0.5508, | |
| "num_tokens": 89147990.0, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.16043693463048303, | |
| "grad_norm": 0.49798658559412606, | |
| "learning_rate": 4.85208437689075e-05, | |
| "loss": 0.5732, | |
| "num_tokens": 89594618.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.16129032258064516, | |
| "grad_norm": 0.578267430163061, | |
| "learning_rate": 4.8498117173934274e-05, | |
| "loss": 0.5675, | |
| "num_tokens": 90075195.0, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.1621437105308073, | |
| "grad_norm": 0.5391925616991625, | |
| "learning_rate": 4.847522334155734e-05, | |
| "loss": 0.5379, | |
| "num_tokens": 90594832.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.16299709848096944, | |
| "grad_norm": 0.5011668805866669, | |
| "learning_rate": 4.845216245411234e-05, | |
| "loss": 0.573, | |
| "num_tokens": 91053980.0, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.1638504864311316, | |
| "grad_norm": 0.5182953611048032, | |
| "learning_rate": 4.842893469526542e-05, | |
| "loss": 0.5516, | |
| "num_tokens": 91557529.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.16470387438129375, | |
| "grad_norm": 0.5770460360704944, | |
| "learning_rate": 4.840554025001172e-05, | |
| "loss": 0.5492, | |
| "num_tokens": 92020214.0, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.16555726233145587, | |
| "grad_norm": 0.5561913654702277, | |
| "learning_rate": 4.838197930467397e-05, | |
| "loss": 0.5821, | |
| "num_tokens": 92520759.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.16641065028161803, | |
| "grad_norm": 0.48523003455287067, | |
| "learning_rate": 4.835825204690096e-05, | |
| "loss": 0.5526, | |
| "num_tokens": 93047314.0, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.16726403823178015, | |
| "grad_norm": 0.513524516918407, | |
| "learning_rate": 4.833435866566607e-05, | |
| "loss": 0.5525, | |
| "num_tokens": 93508244.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.1681174261819423, | |
| "grad_norm": 0.4832545727823208, | |
| "learning_rate": 4.831029935126572e-05, | |
| "loss": 0.5404, | |
| "num_tokens": 93993449.0, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.16897081413210446, | |
| "grad_norm": 0.5892074842556879, | |
| "learning_rate": 4.828607429531795e-05, | |
| "loss": 0.621, | |
| "num_tokens": 94489856.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.1698242020822666, | |
| "grad_norm": 0.5541794654886316, | |
| "learning_rate": 4.826168369076076e-05, | |
| "loss": 0.563, | |
| "num_tokens": 94967128.0, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.17067759003242874, | |
| "grad_norm": 0.5224430795130109, | |
| "learning_rate": 4.82371277318507e-05, | |
| "loss": 0.5639, | |
| "num_tokens": 95453893.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1715309779825909, | |
| "grad_norm": 0.5615627948095356, | |
| "learning_rate": 4.8212406614161244e-05, | |
| "loss": 0.5503, | |
| "num_tokens": 95875261.0, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.17238436593275303, | |
| "grad_norm": 0.5901899830535561, | |
| "learning_rate": 4.818752053458126e-05, | |
| "loss": 0.5746, | |
| "num_tokens": 96333287.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.17323775388291518, | |
| "grad_norm": 0.519150664593853, | |
| "learning_rate": 4.816246969131342e-05, | |
| "loss": 0.5606, | |
| "num_tokens": 96788781.0, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.1740911418330773, | |
| "grad_norm": 0.5046766039074801, | |
| "learning_rate": 4.8137254283872696e-05, | |
| "loss": 0.5669, | |
| "num_tokens": 97319183.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.17494452978323946, | |
| "grad_norm": 0.5636413369566914, | |
| "learning_rate": 4.8111874513084656e-05, | |
| "loss": 0.5744, | |
| "num_tokens": 97854146.0, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.17579791773340162, | |
| "grad_norm": 0.47274352124747765, | |
| "learning_rate": 4.808633058108395e-05, | |
| "loss": 0.566, | |
| "num_tokens": 98374455.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.17665130568356374, | |
| "grad_norm": 0.5613604916818807, | |
| "learning_rate": 4.806062269131267e-05, | |
| "loss": 0.5322, | |
| "num_tokens": 98844756.0, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.1775046936337259, | |
| "grad_norm": 0.5275704920355538, | |
| "learning_rate": 4.803475104851872e-05, | |
| "loss": 0.5514, | |
| "num_tokens": 99400041.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.17835808158388802, | |
| "grad_norm": 0.5138342790090193, | |
| "learning_rate": 4.800871585875424e-05, | |
| "loss": 0.5821, | |
| "num_tokens": 99888979.0, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.17921146953405018, | |
| "grad_norm": 0.5116180287052348, | |
| "learning_rate": 4.798251732937387e-05, | |
| "loss": 0.5648, | |
| "num_tokens": 100419726.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.18006485748421233, | |
| "grad_norm": 0.5500103323286261, | |
| "learning_rate": 4.795615566903318e-05, | |
| "loss": 0.5454, | |
| "num_tokens": 100908650.0, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.18091824543437446, | |
| "grad_norm": 0.4817098430672225, | |
| "learning_rate": 4.792963108768698e-05, | |
| "loss": 0.5499, | |
| "num_tokens": 101373875.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.1817716333845366, | |
| "grad_norm": 0.4982062899819975, | |
| "learning_rate": 4.7902943796587645e-05, | |
| "loss": 0.5629, | |
| "num_tokens": 101870697.0, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.18262502133469877, | |
| "grad_norm": 0.4926411093603361, | |
| "learning_rate": 4.787609400828343e-05, | |
| "loss": 0.5613, | |
| "num_tokens": 102331658.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.1834784092848609, | |
| "grad_norm": 0.48860808911165377, | |
| "learning_rate": 4.78490819366168e-05, | |
| "loss": 0.5299, | |
| "num_tokens": 102812346.0, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.18433179723502305, | |
| "grad_norm": 0.5474671517734467, | |
| "learning_rate": 4.782190779672269e-05, | |
| "loss": 0.5667, | |
| "num_tokens": 103240516.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.18518518518518517, | |
| "grad_norm": 0.5116972696475446, | |
| "learning_rate": 4.779457180502682e-05, | |
| "loss": 0.579, | |
| "num_tokens": 103709603.0, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.18603857313534733, | |
| "grad_norm": 0.7311578763537683, | |
| "learning_rate": 4.7767074179243957e-05, | |
| "loss": 0.5738, | |
| "num_tokens": 104162605.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.18689196108550948, | |
| "grad_norm": 0.5175759169253928, | |
| "learning_rate": 4.77394151383762e-05, | |
| "loss": 0.5677, | |
| "num_tokens": 104606778.0, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.1877453490356716, | |
| "grad_norm": 0.5180750044567718, | |
| "learning_rate": 4.771159490271121e-05, | |
| "loss": 0.5519, | |
| "num_tokens": 105061908.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.18859873698583376, | |
| "grad_norm": 0.4995766211611613, | |
| "learning_rate": 4.768361369382046e-05, | |
| "loss": 0.5981, | |
| "num_tokens": 105501969.0, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.1894521249359959, | |
| "grad_norm": 0.7496734720125644, | |
| "learning_rate": 4.765547173455751e-05, | |
| "loss": 0.5441, | |
| "num_tokens": 105919969.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.19030551288615805, | |
| "grad_norm": 0.520475746556315, | |
| "learning_rate": 4.762716924905615e-05, | |
| "loss": 0.5675, | |
| "num_tokens": 106395115.0, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.1911589008363202, | |
| "grad_norm": 0.5350046475351589, | |
| "learning_rate": 4.7598706462728724e-05, | |
| "loss": 0.6038, | |
| "num_tokens": 106919294.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.19201228878648233, | |
| "grad_norm": 0.5153052033495796, | |
| "learning_rate": 4.757008360226423e-05, | |
| "loss": 0.5109, | |
| "num_tokens": 107397666.0, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.19286567673664448, | |
| "grad_norm": 0.4752166408192588, | |
| "learning_rate": 4.754130089562658e-05, | |
| "loss": 0.5255, | |
| "num_tokens": 107846976.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.19371906468680664, | |
| "grad_norm": 0.50577512343806, | |
| "learning_rate": 4.751235857205277e-05, | |
| "loss": 0.5665, | |
| "num_tokens": 108340980.0, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.19457245263696876, | |
| "grad_norm": 0.4848395588370321, | |
| "learning_rate": 4.748325686205103e-05, | |
| "loss": 0.5211, | |
| "num_tokens": 108819219.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.19542584058713092, | |
| "grad_norm": 0.4793120456541258, | |
| "learning_rate": 4.7453995997399025e-05, | |
| "loss": 0.5657, | |
| "num_tokens": 109356591.0, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.19627922853729304, | |
| "grad_norm": 0.49540833202809614, | |
| "learning_rate": 4.742457621114198e-05, | |
| "loss": 0.54, | |
| "num_tokens": 109845258.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.1971326164874552, | |
| "grad_norm": 0.5480350242504503, | |
| "learning_rate": 4.739499773759084e-05, | |
| "loss": 0.5639, | |
| "num_tokens": 110341036.0, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.19798600443761735, | |
| "grad_norm": 0.5378152017962325, | |
| "learning_rate": 4.7365260812320395e-05, | |
| "loss": 0.5496, | |
| "num_tokens": 110858993.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.19883939238777948, | |
| "grad_norm": 0.5427878056141996, | |
| "learning_rate": 4.733536567216742e-05, | |
| "loss": 0.5254, | |
| "num_tokens": 111271023.0, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.19969278033794163, | |
| "grad_norm": 0.47635814315525654, | |
| "learning_rate": 4.7305312555228764e-05, | |
| "loss": 0.6003, | |
| "num_tokens": 111788546.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.20054616828810376, | |
| "grad_norm": 0.5529022492767732, | |
| "learning_rate": 4.7275101700859476e-05, | |
| "loss": 0.5335, | |
| "num_tokens": 112240691.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.20139955623826591, | |
| "grad_norm": 0.476077946359243, | |
| "learning_rate": 4.724473334967087e-05, | |
| "loss": 0.5738, | |
| "num_tokens": 112732095.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.20225294418842807, | |
| "grad_norm": 0.49761849800094554, | |
| "learning_rate": 4.721420774352866e-05, | |
| "loss": 0.5755, | |
| "num_tokens": 113188980.0, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.2031063321385902, | |
| "grad_norm": 0.5033087511187714, | |
| "learning_rate": 4.7183525125550965e-05, | |
| "loss": 0.5337, | |
| "num_tokens": 113649022.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.20395972008875235, | |
| "grad_norm": 0.5757771930258121, | |
| "learning_rate": 4.715268574010644e-05, | |
| "loss": 0.5622, | |
| "num_tokens": 114112857.0, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.2048131080389145, | |
| "grad_norm": 0.5825769259450685, | |
| "learning_rate": 4.712168983281228e-05, | |
| "loss": 0.5657, | |
| "num_tokens": 114540376.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.20566649598907663, | |
| "grad_norm": 0.47066874905508205, | |
| "learning_rate": 4.709053765053228e-05, | |
| "loss": 0.562, | |
| "num_tokens": 115050544.0, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.20651988393923879, | |
| "grad_norm": 0.5339025505261117, | |
| "learning_rate": 4.7059229441374894e-05, | |
| "loss": 0.5556, | |
| "num_tokens": 115480997.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.2073732718894009, | |
| "grad_norm": 0.49655049131497414, | |
| "learning_rate": 4.7027765454691204e-05, | |
| "loss": 0.5171, | |
| "num_tokens": 115954998.0, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.20822665983956307, | |
| "grad_norm": 0.43630821074371373, | |
| "learning_rate": 4.6996145941073003e-05, | |
| "loss": 0.5473, | |
| "num_tokens": 116464215.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.20908004778972522, | |
| "grad_norm": 0.5212619535450396, | |
| "learning_rate": 4.6964371152350735e-05, | |
| "loss": 0.5528, | |
| "num_tokens": 116935328.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.20993343573988735, | |
| "grad_norm": 0.4996884621384485, | |
| "learning_rate": 4.693244134159153e-05, | |
| "loss": 0.539, | |
| "num_tokens": 117413382.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.2107868236900495, | |
| "grad_norm": 0.458178620585562, | |
| "learning_rate": 4.690035676309716e-05, | |
| "loss": 0.5565, | |
| "num_tokens": 117977956.0, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.21164021164021163, | |
| "grad_norm": 0.4810973401996453, | |
| "learning_rate": 4.686811767240206e-05, | |
| "loss": 0.5257, | |
| "num_tokens": 118465258.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.21249359959037378, | |
| "grad_norm": 0.5063837708262426, | |
| "learning_rate": 4.683572432627124e-05, | |
| "loss": 0.5784, | |
| "num_tokens": 118981265.0, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.21334698754053594, | |
| "grad_norm": 0.519038511710822, | |
| "learning_rate": 4.6803176982698244e-05, | |
| "loss": 0.5357, | |
| "num_tokens": 119368272.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.21420037549069806, | |
| "grad_norm": 0.446771124196734, | |
| "learning_rate": 4.677047590090315e-05, | |
| "loss": 0.5166, | |
| "num_tokens": 119831443.0, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.21505376344086022, | |
| "grad_norm": 0.5655966916582784, | |
| "learning_rate": 4.6737621341330454e-05, | |
| "loss": 0.569, | |
| "num_tokens": 120323497.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.21590715139102234, | |
| "grad_norm": 0.5284102053901687, | |
| "learning_rate": 4.6704613565647005e-05, | |
| "loss": 0.5829, | |
| "num_tokens": 120844764.0, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.2167605393411845, | |
| "grad_norm": 0.4660048972852806, | |
| "learning_rate": 4.667145283673993e-05, | |
| "loss": 0.5531, | |
| "num_tokens": 121336091.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.21761392729134665, | |
| "grad_norm": 0.5658927195405619, | |
| "learning_rate": 4.663813941871454e-05, | |
| "loss": 0.5391, | |
| "num_tokens": 121865752.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.21846731524150878, | |
| "grad_norm": 0.510725357391612, | |
| "learning_rate": 4.6604673576892216e-05, | |
| "loss": 0.54, | |
| "num_tokens": 122331818.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.21932070319167093, | |
| "grad_norm": 0.47505637007705503, | |
| "learning_rate": 4.657105557780831e-05, | |
| "loss": 0.554, | |
| "num_tokens": 122803686.0, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.2201740911418331, | |
| "grad_norm": 0.4713905901256551, | |
| "learning_rate": 4.653728568921001e-05, | |
| "loss": 0.549, | |
| "num_tokens": 123276577.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.22102747909199522, | |
| "grad_norm": 0.4950481169653495, | |
| "learning_rate": 4.650336418005423e-05, | |
| "loss": 0.5621, | |
| "num_tokens": 123742454.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.22188086704215737, | |
| "grad_norm": 0.4906341070206242, | |
| "learning_rate": 4.6469291320505423e-05, | |
| "loss": 0.5633, | |
| "num_tokens": 124260967.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.2227342549923195, | |
| "grad_norm": 0.49124294217247816, | |
| "learning_rate": 4.643506738193346e-05, | |
| "loss": 0.5869, | |
| "num_tokens": 124690528.0, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.22358764294248165, | |
| "grad_norm": 0.5318881576855438, | |
| "learning_rate": 4.64006926369115e-05, | |
| "loss": 0.5413, | |
| "num_tokens": 125178584.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.2244410308926438, | |
| "grad_norm": 0.4652041026309504, | |
| "learning_rate": 4.6366167359213744e-05, | |
| "loss": 0.5361, | |
| "num_tokens": 125630299.0, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.22529441884280593, | |
| "grad_norm": 0.5248570810580482, | |
| "learning_rate": 4.6331491823813325e-05, | |
| "loss": 0.5455, | |
| "num_tokens": 126115279.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.2261478067929681, | |
| "grad_norm": 0.44610795297132333, | |
| "learning_rate": 4.629666630688006e-05, | |
| "loss": 0.5416, | |
| "num_tokens": 126602086.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.2270011947431302, | |
| "grad_norm": 0.45970967320492034, | |
| "learning_rate": 4.6261691085778315e-05, | |
| "loss": 0.5296, | |
| "num_tokens": 127123510.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.22785458269329237, | |
| "grad_norm": 0.501960669685246, | |
| "learning_rate": 4.622656643906472e-05, | |
| "loss": 0.5317, | |
| "num_tokens": 127611722.0, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.22870797064345452, | |
| "grad_norm": 0.4498895905669684, | |
| "learning_rate": 4.619129264648602e-05, | |
| "loss": 0.5363, | |
| "num_tokens": 128096244.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.22956135859361665, | |
| "grad_norm": 0.47710140071837376, | |
| "learning_rate": 4.615586998897681e-05, | |
| "loss": 0.554, | |
| "num_tokens": 128550690.0, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.2304147465437788, | |
| "grad_norm": 0.42366560231241607, | |
| "learning_rate": 4.6120298748657295e-05, | |
| "loss": 0.5452, | |
| "num_tokens": 129070712.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.23126813449394096, | |
| "grad_norm": 0.4945094382228333, | |
| "learning_rate": 4.6084579208831066e-05, | |
| "loss": 0.5715, | |
| "num_tokens": 129563278.0, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.23212152244410308, | |
| "grad_norm": 0.5291283310269107, | |
| "learning_rate": 4.604871165398282e-05, | |
| "loss": 0.5687, | |
| "num_tokens": 129992439.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.23297491039426524, | |
| "grad_norm": 0.4577010259092311, | |
| "learning_rate": 4.601269636977611e-05, | |
| "loss": 0.5523, | |
| "num_tokens": 130475575.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.23382829834442737, | |
| "grad_norm": 0.5102967425327236, | |
| "learning_rate": 4.5976533643051076e-05, | |
| "loss": 0.5359, | |
| "num_tokens": 130917322.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.23468168629458952, | |
| "grad_norm": 0.45264858485919063, | |
| "learning_rate": 4.594022376182212e-05, | |
| "loss": 0.5688, | |
| "num_tokens": 131375215.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.23553507424475167, | |
| "grad_norm": 0.524032753944991, | |
| "learning_rate": 4.590376701527566e-05, | |
| "loss": 0.5803, | |
| "num_tokens": 131869001.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.2363884621949138, | |
| "grad_norm": 0.4905411069354665, | |
| "learning_rate": 4.586716369376782e-05, | |
| "loss": 0.584, | |
| "num_tokens": 132391436.0, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.23724185014507596, | |
| "grad_norm": 0.5367963638334747, | |
| "learning_rate": 4.5830414088822097e-05, | |
| "loss": 0.5413, | |
| "num_tokens": 132871820.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 0.4917565423882787, | |
| "learning_rate": 4.579351849312703e-05, | |
| "loss": 0.5536, | |
| "num_tokens": 133334905.0, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.23894862604540024, | |
| "grad_norm": 0.5413952248071385, | |
| "learning_rate": 4.575647720053389e-05, | |
| "loss": 0.5845, | |
| "num_tokens": 133787994.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2398020139955624, | |
| "grad_norm": 0.5066520213343823, | |
| "learning_rate": 4.5719290506054366e-05, | |
| "loss": 0.5264, | |
| "num_tokens": 134209133.0, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.24065540194572452, | |
| "grad_norm": 0.500077599616316, | |
| "learning_rate": 4.5681958705858155e-05, | |
| "loss": 0.5478, | |
| "num_tokens": 134665258.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.24150878989588667, | |
| "grad_norm": 0.46826801262032436, | |
| "learning_rate": 4.564448209727066e-05, | |
| "loss": 0.5316, | |
| "num_tokens": 135127929.0, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.24236217784604883, | |
| "grad_norm": 0.502988184034415, | |
| "learning_rate": 4.5606860978770554e-05, | |
| "loss": 0.5544, | |
| "num_tokens": 135563718.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.24321556579621095, | |
| "grad_norm": 0.4838611684916234, | |
| "learning_rate": 4.55690956499875e-05, | |
| "loss": 0.5317, | |
| "num_tokens": 136004025.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.2440689537463731, | |
| "grad_norm": 0.47457845014569755, | |
| "learning_rate": 4.553118641169967e-05, | |
| "loss": 0.5533, | |
| "num_tokens": 136530274.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.24492234169653523, | |
| "grad_norm": 0.4757198115466305, | |
| "learning_rate": 4.5493133565831395e-05, | |
| "loss": 0.5382, | |
| "num_tokens": 137055227.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.2457757296466974, | |
| "grad_norm": 0.4835580620018664, | |
| "learning_rate": 4.5454937415450774e-05, | |
| "loss": 0.5674, | |
| "num_tokens": 137544925.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.24662911759685954, | |
| "grad_norm": 0.44039668031361656, | |
| "learning_rate": 4.54165982647672e-05, | |
| "loss": 0.5397, | |
| "num_tokens": 138050313.0, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.24748250554702167, | |
| "grad_norm": 0.5126770204898458, | |
| "learning_rate": 4.5378116419129035e-05, | |
| "loss": 0.5502, | |
| "num_tokens": 138475799.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.24833589349718382, | |
| "grad_norm": 0.4307603526962706, | |
| "learning_rate": 4.5339492185021066e-05, | |
| "loss": 0.5207, | |
| "num_tokens": 138942999.0, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.24918928144734595, | |
| "grad_norm": 0.5335857575917428, | |
| "learning_rate": 4.5300725870062153e-05, | |
| "loss": 0.5273, | |
| "num_tokens": 139435802.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.25004266939750813, | |
| "grad_norm": 0.5251968934675055, | |
| "learning_rate": 4.5261817783002726e-05, | |
| "loss": 0.5481, | |
| "num_tokens": 139911719.0, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.25089605734767023, | |
| "grad_norm": 0.5152245828215201, | |
| "learning_rate": 4.522276823372236e-05, | |
| "loss": 0.5344, | |
| "num_tokens": 140314284.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.2517494452978324, | |
| "grad_norm": 0.4542739894879496, | |
| "learning_rate": 4.518357753322728e-05, | |
| "loss": 0.5206, | |
| "num_tokens": 140779886.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.25260283324799454, | |
| "grad_norm": 0.5155603955708363, | |
| "learning_rate": 4.5144245993647896e-05, | |
| "loss": 0.5526, | |
| "num_tokens": 141219364.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.2534562211981567, | |
| "grad_norm": 0.44093269982131145, | |
| "learning_rate": 4.5104773928236324e-05, | |
| "loss": 0.5207, | |
| "num_tokens": 141800333.0, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.25430960914831885, | |
| "grad_norm": 0.48199459338850753, | |
| "learning_rate": 4.506516165136388e-05, | |
| "loss": 0.5622, | |
| "num_tokens": 142303539.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.25516299709848095, | |
| "grad_norm": 0.4132333113143036, | |
| "learning_rate": 4.502540947851859e-05, | |
| "loss": 0.598, | |
| "num_tokens": 142813045.0, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.2560163850486431, | |
| "grad_norm": 0.48382201091933114, | |
| "learning_rate": 4.498551772630264e-05, | |
| "loss": 0.5587, | |
| "num_tokens": 143301347.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.25686977299880526, | |
| "grad_norm": 0.47222830447428116, | |
| "learning_rate": 4.494548671242991e-05, | |
| "loss": 0.563, | |
| "num_tokens": 143788000.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.2577231609489674, | |
| "grad_norm": 0.4699914540137336, | |
| "learning_rate": 4.490531675572341e-05, | |
| "loss": 0.5034, | |
| "num_tokens": 144231893.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.25857654889912957, | |
| "grad_norm": 0.4968882363081607, | |
| "learning_rate": 4.486500817611273e-05, | |
| "loss": 0.5356, | |
| "num_tokens": 144664097.0, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.25942993684929166, | |
| "grad_norm": 0.5388906065337659, | |
| "learning_rate": 4.482456129463153e-05, | |
| "loss": 0.5486, | |
| "num_tokens": 145160611.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.2602833247994538, | |
| "grad_norm": 0.5111134351287713, | |
| "learning_rate": 4.478397643341495e-05, | |
| "loss": 0.5244, | |
| "num_tokens": 145627893.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.261136712749616, | |
| "grad_norm": 0.49748496364336453, | |
| "learning_rate": 4.474325391569706e-05, | |
| "loss": 0.5419, | |
| "num_tokens": 146121565.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.26199010069977813, | |
| "grad_norm": 0.4522989758217813, | |
| "learning_rate": 4.470239406580827e-05, | |
| "loss": 0.5282, | |
| "num_tokens": 146626799.0, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.2628434886499403, | |
| "grad_norm": 0.5125478701143711, | |
| "learning_rate": 4.466139720917277e-05, | |
| "loss": 0.5145, | |
| "num_tokens": 147031619.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.2636968766001024, | |
| "grad_norm": 0.4535804289462772, | |
| "learning_rate": 4.4620263672305916e-05, | |
| "loss": 0.5454, | |
| "num_tokens": 147494090.0, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.26455026455026454, | |
| "grad_norm": 0.4217613245251448, | |
| "learning_rate": 4.457899378281167e-05, | |
| "loss": 0.5198, | |
| "num_tokens": 147981269.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.2654036525004267, | |
| "grad_norm": 0.41291485773870157, | |
| "learning_rate": 4.453758786937992e-05, | |
| "loss": 0.5519, | |
| "num_tokens": 148509932.0, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.26625704045058884, | |
| "grad_norm": 0.4486153856267517, | |
| "learning_rate": 4.449604626178393e-05, | |
| "loss": 0.556, | |
| "num_tokens": 148964661.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.267110428400751, | |
| "grad_norm": 0.4723417199697049, | |
| "learning_rate": 4.445436929087767e-05, | |
| "loss": 0.5772, | |
| "num_tokens": 149478665.0, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.2679638163509131, | |
| "grad_norm": 0.4266686602392304, | |
| "learning_rate": 4.441255728859321e-05, | |
| "loss": 0.5258, | |
| "num_tokens": 150008239.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.26881720430107525, | |
| "grad_norm": 0.5005521919161452, | |
| "learning_rate": 4.437061058793806e-05, | |
| "loss": 0.5535, | |
| "num_tokens": 150495483.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.2696705922512374, | |
| "grad_norm": 0.5162737209335702, | |
| "learning_rate": 4.432852952299252e-05, | |
| "loss": 0.547, | |
| "num_tokens": 150937307.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.27052398020139956, | |
| "grad_norm": 0.45763839660125666, | |
| "learning_rate": 4.428631442890702e-05, | |
| "loss": 0.5194, | |
| "num_tokens": 151380604.0, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.2713773681515617, | |
| "grad_norm": 0.5729890333681282, | |
| "learning_rate": 4.424396564189947e-05, | |
| "loss": 0.571, | |
| "num_tokens": 151860116.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.27223075610172387, | |
| "grad_norm": 0.44940050708117657, | |
| "learning_rate": 4.420148349925252e-05, | |
| "loss": 0.543, | |
| "num_tokens": 152386160.0, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.27308414405188597, | |
| "grad_norm": 0.5194440948783737, | |
| "learning_rate": 4.415886833931097e-05, | |
| "loss": 0.5253, | |
| "num_tokens": 152834849.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2739375320020481, | |
| "grad_norm": 0.4966999548537793, | |
| "learning_rate": 4.411612050147899e-05, | |
| "loss": 0.5077, | |
| "num_tokens": 153309626.0, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.2747909199522103, | |
| "grad_norm": 0.4326956188708584, | |
| "learning_rate": 4.4073240326217446e-05, | |
| "loss": 0.5218, | |
| "num_tokens": 153761800.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.27564430790237243, | |
| "grad_norm": 0.48106009012769335, | |
| "learning_rate": 4.403022815504122e-05, | |
| "loss": 0.5272, | |
| "num_tokens": 154253245.0, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.2764976958525346, | |
| "grad_norm": 0.4691088256150863, | |
| "learning_rate": 4.398708433051645e-05, | |
| "loss": 0.5341, | |
| "num_tokens": 154739335.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.2773510838026967, | |
| "grad_norm": 0.4789255818713284, | |
| "learning_rate": 4.3943809196257794e-05, | |
| "loss": 0.5752, | |
| "num_tokens": 155258537.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.27820447175285884, | |
| "grad_norm": 0.4384757162143022, | |
| "learning_rate": 4.390040309692574e-05, | |
| "loss": 0.5446, | |
| "num_tokens": 155726110.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.279057859703021, | |
| "grad_norm": 0.45029889086717195, | |
| "learning_rate": 4.385686637822382e-05, | |
| "loss": 0.5746, | |
| "num_tokens": 156219775.0, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.27991124765318315, | |
| "grad_norm": 0.483534434263141, | |
| "learning_rate": 4.381319938689588e-05, | |
| "loss": 0.5224, | |
| "num_tokens": 156709373.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.2807646356033453, | |
| "grad_norm": 0.526357163746985, | |
| "learning_rate": 4.376940247072331e-05, | |
| "loss": 0.5414, | |
| "num_tokens": 157153936.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.2816180235535074, | |
| "grad_norm": 0.46579995749091513, | |
| "learning_rate": 4.372547597852225e-05, | |
| "loss": 0.5318, | |
| "num_tokens": 157605031.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.28247141150366956, | |
| "grad_norm": 0.45067576856616254, | |
| "learning_rate": 4.368142026014086e-05, | |
| "loss": 0.5444, | |
| "num_tokens": 158069912.0, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.2833247994538317, | |
| "grad_norm": 0.4536061774265857, | |
| "learning_rate": 4.3637235666456506e-05, | |
| "loss": 0.4995, | |
| "num_tokens": 158541316.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.28417818740399386, | |
| "grad_norm": 0.4715512677986255, | |
| "learning_rate": 4.359292254937296e-05, | |
| "loss": 0.5587, | |
| "num_tokens": 159019433.0, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.285031575354156, | |
| "grad_norm": 0.45913895999678267, | |
| "learning_rate": 4.354848126181762e-05, | |
| "loss": 0.5482, | |
| "num_tokens": 159525622.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.2858849633043181, | |
| "grad_norm": 0.4717023641798634, | |
| "learning_rate": 4.350391215773867e-05, | |
| "loss": 0.5475, | |
| "num_tokens": 159986039.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.2867383512544803, | |
| "grad_norm": 0.4577457504041561, | |
| "learning_rate": 4.345921559210227e-05, | |
| "loss": 0.5449, | |
| "num_tokens": 160459756.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.2875917392046424, | |
| "grad_norm": 0.43006816683796123, | |
| "learning_rate": 4.341439192088976e-05, | |
| "loss": 0.5238, | |
| "num_tokens": 160900045.0, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.2884451271548046, | |
| "grad_norm": 0.44991390160606315, | |
| "learning_rate": 4.336944150109478e-05, | |
| "loss": 0.5309, | |
| "num_tokens": 161382919.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.28929851510496674, | |
| "grad_norm": 0.4612606886863633, | |
| "learning_rate": 4.332436469072044e-05, | |
| "loss": 0.5332, | |
| "num_tokens": 161904331.0, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.29015190305512883, | |
| "grad_norm": 0.4663086179207718, | |
| "learning_rate": 4.327916184877652e-05, | |
| "loss": 0.5466, | |
| "num_tokens": 162358277.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.291005291005291, | |
| "grad_norm": 0.4346433555739564, | |
| "learning_rate": 4.3233833335276494e-05, | |
| "loss": 0.5595, | |
| "num_tokens": 162862470.0, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.29185867895545314, | |
| "grad_norm": 0.481816835343307, | |
| "learning_rate": 4.31883795112348e-05, | |
| "loss": 0.5513, | |
| "num_tokens": 163332894.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2927120669056153, | |
| "grad_norm": 0.4703194323676184, | |
| "learning_rate": 4.314280073866386e-05, | |
| "loss": 0.5512, | |
| "num_tokens": 163862008.0, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.29356545485577745, | |
| "grad_norm": 0.42638506770745144, | |
| "learning_rate": 4.3097097380571256e-05, | |
| "loss": 0.5347, | |
| "num_tokens": 164401378.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.2944188428059396, | |
| "grad_norm": 0.4995382173633094, | |
| "learning_rate": 4.305126980095681e-05, | |
| "loss": 0.5127, | |
| "num_tokens": 164870602.0, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.2952722307561017, | |
| "grad_norm": 0.4770564355031365, | |
| "learning_rate": 4.300531836480968e-05, | |
| "loss": 0.5036, | |
| "num_tokens": 165316411.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.29612561870626386, | |
| "grad_norm": 0.48182669463964817, | |
| "learning_rate": 4.295924343810551e-05, | |
| "loss": 0.5245, | |
| "num_tokens": 165802439.0, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.296979006656426, | |
| "grad_norm": 0.4867773545583882, | |
| "learning_rate": 4.291304538780343e-05, | |
| "loss": 0.5214, | |
| "num_tokens": 166276916.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.29783239460658817, | |
| "grad_norm": 0.4496166244953182, | |
| "learning_rate": 4.286672458184319e-05, | |
| "loss": 0.5232, | |
| "num_tokens": 166798336.0, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.2986857825567503, | |
| "grad_norm": 0.41637643907648975, | |
| "learning_rate": 4.282028138914221e-05, | |
| "loss": 0.5095, | |
| "num_tokens": 167298025.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.2995391705069124, | |
| "grad_norm": 0.4439278611828928, | |
| "learning_rate": 4.2773716179592666e-05, | |
| "loss": 0.5268, | |
| "num_tokens": 167745361.0, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.3003925584570746, | |
| "grad_norm": 0.5502393148640106, | |
| "learning_rate": 4.27270293240585e-05, | |
| "loss": 0.5437, | |
| "num_tokens": 168250987.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.30124594640723673, | |
| "grad_norm": 0.4325350233469434, | |
| "learning_rate": 4.26802211943725e-05, | |
| "loss": 0.5414, | |
| "num_tokens": 168758170.0, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.3020993343573989, | |
| "grad_norm": 0.42907081172047984, | |
| "learning_rate": 4.263329216333335e-05, | |
| "loss": 0.5327, | |
| "num_tokens": 169245557.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.30295272230756104, | |
| "grad_norm": 0.43903165966889535, | |
| "learning_rate": 4.258624260470262e-05, | |
| "loss": 0.5547, | |
| "num_tokens": 169769436.0, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.30380611025772314, | |
| "grad_norm": 0.4508862158538626, | |
| "learning_rate": 4.253907289320179e-05, | |
| "loss": 0.5486, | |
| "num_tokens": 170217424.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.3046594982078853, | |
| "grad_norm": 0.4647907501488008, | |
| "learning_rate": 4.249178340450933e-05, | |
| "loss": 0.5378, | |
| "num_tokens": 170701196.0, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.30551288615804745, | |
| "grad_norm": 0.44253991898058176, | |
| "learning_rate": 4.244437451525764e-05, | |
| "loss": 0.5015, | |
| "num_tokens": 171143325.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.3063662741082096, | |
| "grad_norm": 0.4379537398466406, | |
| "learning_rate": 4.239684660303006e-05, | |
| "loss": 0.5084, | |
| "num_tokens": 171652194.0, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.30721966205837176, | |
| "grad_norm": 0.46000874925949437, | |
| "learning_rate": 4.234920004635792e-05, | |
| "loss": 0.5431, | |
| "num_tokens": 172160313.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.30807305000853386, | |
| "grad_norm": 0.40910104785596435, | |
| "learning_rate": 4.230143522471743e-05, | |
| "loss": 0.5258, | |
| "num_tokens": 172680319.0, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.308926437958696, | |
| "grad_norm": 0.43571435079507004, | |
| "learning_rate": 4.225355251852675e-05, | |
| "loss": 0.5283, | |
| "num_tokens": 173195128.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.30977982590885816, | |
| "grad_norm": 0.42399823343690013, | |
| "learning_rate": 4.2205552309142885e-05, | |
| "loss": 0.5213, | |
| "num_tokens": 173645350.0, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.3106332138590203, | |
| "grad_norm": 0.46125926953132673, | |
| "learning_rate": 4.215743497885873e-05, | |
| "loss": 0.5635, | |
| "num_tokens": 174112017.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.3114866018091825, | |
| "grad_norm": 0.46565221452673544, | |
| "learning_rate": 4.2109200910899916e-05, | |
| "loss": 0.5231, | |
| "num_tokens": 174616471.0, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.31233998975934457, | |
| "grad_norm": 0.4395652304186312, | |
| "learning_rate": 4.206085048942187e-05, | |
| "loss": 0.552, | |
| "num_tokens": 175151908.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.3131933777095067, | |
| "grad_norm": 0.49697312570667, | |
| "learning_rate": 4.2012384099506694e-05, | |
| "loss": 0.511, | |
| "num_tokens": 175662076.0, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.3140467656596689, | |
| "grad_norm": 0.46117830278274213, | |
| "learning_rate": 4.196380212716008e-05, | |
| "loss": 0.5068, | |
| "num_tokens": 176115268.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.31490015360983103, | |
| "grad_norm": 0.45172247098187684, | |
| "learning_rate": 4.191510495930831e-05, | |
| "loss": 0.5215, | |
| "num_tokens": 176622621.0, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.3157535415599932, | |
| "grad_norm": 0.4640298844165901, | |
| "learning_rate": 4.1866292983795084e-05, | |
| "loss": 0.5066, | |
| "num_tokens": 177084976.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.31660692951015534, | |
| "grad_norm": 0.4419322260337894, | |
| "learning_rate": 4.1817366589378526e-05, | |
| "loss": 0.5021, | |
| "num_tokens": 177584108.0, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 0.4192074815965821, | |
| "learning_rate": 4.1768326165727975e-05, | |
| "loss": 0.4974, | |
| "num_tokens": 178049137.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.3183137054104796, | |
| "grad_norm": 0.4566464241885361, | |
| "learning_rate": 4.171917210342101e-05, | |
| "loss": 0.5437, | |
| "num_tokens": 178525294.0, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.31916709336064175, | |
| "grad_norm": 0.4716287926199994, | |
| "learning_rate": 4.166990479394023e-05, | |
| "loss": 0.5284, | |
| "num_tokens": 178990489.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.3200204813108039, | |
| "grad_norm": 0.41462088247371975, | |
| "learning_rate": 4.1620524629670196e-05, | |
| "loss": 0.5453, | |
| "num_tokens": 179468312.0, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.32087386926096606, | |
| "grad_norm": 0.5204628417702931, | |
| "learning_rate": 4.157103200389428e-05, | |
| "loss": 0.5203, | |
| "num_tokens": 179947208.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.32172725721112816, | |
| "grad_norm": 0.4837431786776778, | |
| "learning_rate": 4.152142731079155e-05, | |
| "loss": 0.5571, | |
| "num_tokens": 180462505.0, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 0.5647864034147051, | |
| "learning_rate": 4.147171094543363e-05, | |
| "loss": 0.5245, | |
| "num_tokens": 180895300.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.32343403311145247, | |
| "grad_norm": 0.4234758612666593, | |
| "learning_rate": 4.142188330378155e-05, | |
| "loss": 0.5644, | |
| "num_tokens": 181428026.0, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.3242874210616146, | |
| "grad_norm": 0.4596616984560979, | |
| "learning_rate": 4.1371944782682536e-05, | |
| "loss": 0.5092, | |
| "num_tokens": 181891310.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3251408090117768, | |
| "grad_norm": 0.4626660287753584, | |
| "learning_rate": 4.132189577986699e-05, | |
| "loss": 0.5115, | |
| "num_tokens": 182350373.0, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.3259941969619389, | |
| "grad_norm": 0.5345026767665364, | |
| "learning_rate": 4.127173669394516e-05, | |
| "loss": 0.5544, | |
| "num_tokens": 182829455.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.32684758491210103, | |
| "grad_norm": 0.4613439204470022, | |
| "learning_rate": 4.1221467924404075e-05, | |
| "loss": 0.5166, | |
| "num_tokens": 183279486.0, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.3277009728622632, | |
| "grad_norm": 0.41655333813843026, | |
| "learning_rate": 4.117108987160432e-05, | |
| "loss": 0.5303, | |
| "num_tokens": 183776326.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.32855436081242534, | |
| "grad_norm": 0.44072941850460157, | |
| "learning_rate": 4.112060293677687e-05, | |
| "loss": 0.5236, | |
| "num_tokens": 184213641.0, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.3294077487625875, | |
| "grad_norm": 0.4398205817287364, | |
| "learning_rate": 4.107000752201984e-05, | |
| "loss": 0.5193, | |
| "num_tokens": 184739434.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.3302611367127496, | |
| "grad_norm": 0.4568072703857535, | |
| "learning_rate": 4.101930403029538e-05, | |
| "loss": 0.5683, | |
| "num_tokens": 185243928.0, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.33111452466291175, | |
| "grad_norm": 0.4599483174740539, | |
| "learning_rate": 4.0968492865426367e-05, | |
| "loss": 0.5709, | |
| "num_tokens": 185721070.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.3319679126130739, | |
| "grad_norm": 0.450461328517899, | |
| "learning_rate": 4.091757443209322e-05, | |
| "loss": 0.5392, | |
| "num_tokens": 186257893.0, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.33282130056323606, | |
| "grad_norm": 0.44213695984569323, | |
| "learning_rate": 4.0866549135830745e-05, | |
| "loss": 0.5331, | |
| "num_tokens": 186739113.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.3336746885133982, | |
| "grad_norm": 0.5223167386936513, | |
| "learning_rate": 4.081541738302478e-05, | |
| "loss": 0.5035, | |
| "num_tokens": 187241668.0, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.3345280764635603, | |
| "grad_norm": 0.5455186522510769, | |
| "learning_rate": 4.076417958090906e-05, | |
| "loss": 0.5821, | |
| "num_tokens": 187730193.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.33538146441372246, | |
| "grad_norm": 0.4973674571359577, | |
| "learning_rate": 4.071283613756196e-05, | |
| "loss": 0.5361, | |
| "num_tokens": 188188571.0, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.3362348523638846, | |
| "grad_norm": 0.4655740024728068, | |
| "learning_rate": 4.0661387461903154e-05, | |
| "loss": 0.5216, | |
| "num_tokens": 188652348.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.33708824031404677, | |
| "grad_norm": 0.4656912903064106, | |
| "learning_rate": 4.060983396369051e-05, | |
| "loss": 0.5244, | |
| "num_tokens": 189165486.0, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.3379416282642089, | |
| "grad_norm": 0.47877107601199326, | |
| "learning_rate": 4.055817605351669e-05, | |
| "loss": 0.526, | |
| "num_tokens": 189611145.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.3387950162143711, | |
| "grad_norm": 0.4552031837318823, | |
| "learning_rate": 4.050641414280597e-05, | |
| "loss": 0.5061, | |
| "num_tokens": 190052318.0, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.3396484041645332, | |
| "grad_norm": 0.3854951156758152, | |
| "learning_rate": 4.045454864381088e-05, | |
| "loss": 0.5572, | |
| "num_tokens": 190593764.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.34050179211469533, | |
| "grad_norm": 0.45772155466228287, | |
| "learning_rate": 4.0402579969609024e-05, | |
| "loss": 0.5188, | |
| "num_tokens": 191077021.0, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.3413551800648575, | |
| "grad_norm": 0.4207950845177105, | |
| "learning_rate": 4.035050853409969e-05, | |
| "loss": 0.5253, | |
| "num_tokens": 191543591.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.34220856801501964, | |
| "grad_norm": 0.404428279415364, | |
| "learning_rate": 4.0298334752000634e-05, | |
| "loss": 0.5096, | |
| "num_tokens": 192058677.0, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.3430619559651818, | |
| "grad_norm": 0.45414959506499775, | |
| "learning_rate": 4.0246059038844716e-05, | |
| "loss": 0.5249, | |
| "num_tokens": 192536034.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.3439153439153439, | |
| "grad_norm": 0.4333123098549116, | |
| "learning_rate": 4.019368181097663e-05, | |
| "loss": 0.5171, | |
| "num_tokens": 193003486.0, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.34476873186550605, | |
| "grad_norm": 0.4855501956895698, | |
| "learning_rate": 4.0141203485549564e-05, | |
| "loss": 0.5348, | |
| "num_tokens": 193451364.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.3456221198156682, | |
| "grad_norm": 0.4632195605910646, | |
| "learning_rate": 4.008862448052188e-05, | |
| "loss": 0.5382, | |
| "num_tokens": 193943468.0, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.34647550776583036, | |
| "grad_norm": 0.46101384114147687, | |
| "learning_rate": 4.003594521465381e-05, | |
| "loss": 0.5387, | |
| "num_tokens": 194391841.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.3473288957159925, | |
| "grad_norm": 0.47474364816213427, | |
| "learning_rate": 3.9983166107504114e-05, | |
| "loss": 0.5386, | |
| "num_tokens": 194886160.0, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.3481822836661546, | |
| "grad_norm": 0.43188631824791146, | |
| "learning_rate": 3.99302875794267e-05, | |
| "loss": 0.5043, | |
| "num_tokens": 195399830.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.34903567161631677, | |
| "grad_norm": 0.47043280245914776, | |
| "learning_rate": 3.987731005156731e-05, | |
| "loss": 0.5092, | |
| "num_tokens": 195898457.0, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.3498890595664789, | |
| "grad_norm": 0.45339879749871337, | |
| "learning_rate": 3.9824233945860165e-05, | |
| "loss": 0.5209, | |
| "num_tokens": 196363387.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.3507424475166411, | |
| "grad_norm": 0.4589728488254337, | |
| "learning_rate": 3.977105968502461e-05, | |
| "loss": 0.4889, | |
| "num_tokens": 196830932.0, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.35159583546680323, | |
| "grad_norm": 0.49851956812660997, | |
| "learning_rate": 3.971778769256172e-05, | |
| "loss": 0.5189, | |
| "num_tokens": 197289380.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.35244922341696533, | |
| "grad_norm": 0.47779288264546294, | |
| "learning_rate": 3.966441839275095e-05, | |
| "loss": 0.5588, | |
| "num_tokens": 197792417.0, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.3533026113671275, | |
| "grad_norm": 0.441451991083009, | |
| "learning_rate": 3.9610952210646746e-05, | |
| "loss": 0.5148, | |
| "num_tokens": 198263603.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.35415599931728964, | |
| "grad_norm": 0.46071108782977366, | |
| "learning_rate": 3.955738957207517e-05, | |
| "loss": 0.4939, | |
| "num_tokens": 198723475.0, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.3550093872674518, | |
| "grad_norm": 0.3837574742118743, | |
| "learning_rate": 3.95037309036305e-05, | |
| "loss": 0.508, | |
| "num_tokens": 199248080.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.35586277521761395, | |
| "grad_norm": 0.4052434261393795, | |
| "learning_rate": 3.944997663267183e-05, | |
| "loss": 0.5341, | |
| "num_tokens": 199732114.0, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.35671616316777605, | |
| "grad_norm": 0.5602646164699131, | |
| "learning_rate": 3.939612718731968e-05, | |
| "loss": 0.5203, | |
| "num_tokens": 200182740.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.3575695511179382, | |
| "grad_norm": 0.532351857979547, | |
| "learning_rate": 3.934218299645256e-05, | |
| "loss": 0.5608, | |
| "num_tokens": 200651895.0, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.35842293906810035, | |
| "grad_norm": 0.47985961386334514, | |
| "learning_rate": 3.9288144489703595e-05, | |
| "loss": 0.5664, | |
| "num_tokens": 201135153.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3592763270182625, | |
| "grad_norm": 0.49531554578307374, | |
| "learning_rate": 3.923401209745705e-05, | |
| "loss": 0.5259, | |
| "num_tokens": 201584248.0, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.36012971496842466, | |
| "grad_norm": 0.5406276048272152, | |
| "learning_rate": 3.917978625084497e-05, | |
| "loss": 0.5169, | |
| "num_tokens": 202034499.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.36098310291858676, | |
| "grad_norm": 0.45541739959251265, | |
| "learning_rate": 3.912546738174367e-05, | |
| "loss": 0.5569, | |
| "num_tokens": 202509062.0, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.3618364908687489, | |
| "grad_norm": 0.46222153430509844, | |
| "learning_rate": 3.907105592277035e-05, | |
| "loss": 0.4909, | |
| "num_tokens": 202936389.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.36268987881891107, | |
| "grad_norm": 0.516808792491393, | |
| "learning_rate": 3.901655230727964e-05, | |
| "loss": 0.5128, | |
| "num_tokens": 203333731.0, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.3635432667690732, | |
| "grad_norm": 0.44483238168649236, | |
| "learning_rate": 3.896195696936012e-05, | |
| "loss": 0.5174, | |
| "num_tokens": 203808949.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.3643966547192354, | |
| "grad_norm": 0.4577683314352761, | |
| "learning_rate": 3.890727034383092e-05, | |
| "loss": 0.5401, | |
| "num_tokens": 204294840.0, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.36525004266939753, | |
| "grad_norm": 0.44828714525129754, | |
| "learning_rate": 3.885249286623816e-05, | |
| "loss": 0.5209, | |
| "num_tokens": 204803047.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.36610343061955963, | |
| "grad_norm": 0.4206666647186255, | |
| "learning_rate": 3.879762497285162e-05, | |
| "loss": 0.5314, | |
| "num_tokens": 205303910.0, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.3669568185697218, | |
| "grad_norm": 0.5126654581434062, | |
| "learning_rate": 3.874266710066113e-05, | |
| "loss": 0.5258, | |
| "num_tokens": 205764413.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.36781020651988394, | |
| "grad_norm": 0.43032400879560057, | |
| "learning_rate": 3.868761968737318e-05, | |
| "loss": 0.5154, | |
| "num_tokens": 206317754.0, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.3686635944700461, | |
| "grad_norm": 0.43014533734753363, | |
| "learning_rate": 3.863248317140736e-05, | |
| "loss": 0.5151, | |
| "num_tokens": 206795183.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.36951698242020825, | |
| "grad_norm": 0.4205693285626258, | |
| "learning_rate": 3.857725799189296e-05, | |
| "loss": 0.5322, | |
| "num_tokens": 207296161.0, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.37037037037037035, | |
| "grad_norm": 0.4318178305652955, | |
| "learning_rate": 3.852194458866538e-05, | |
| "loss": 0.5144, | |
| "num_tokens": 207776278.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.3712237583205325, | |
| "grad_norm": 0.407889777426393, | |
| "learning_rate": 3.846654340226271e-05, | |
| "loss": 0.522, | |
| "num_tokens": 208274050.0, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.37207714627069466, | |
| "grad_norm": 0.4205578176652811, | |
| "learning_rate": 3.841105487392213e-05, | |
| "loss": 0.5066, | |
| "num_tokens": 208806680.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.3729305342208568, | |
| "grad_norm": 0.48015717553414355, | |
| "learning_rate": 3.835547944557648e-05, | |
| "loss": 0.5177, | |
| "num_tokens": 209260793.0, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.37378392217101897, | |
| "grad_norm": 0.4781192057056195, | |
| "learning_rate": 3.829981755985072e-05, | |
| "loss": 0.5109, | |
| "num_tokens": 209705965.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.37463731012118107, | |
| "grad_norm": 0.5114102149191294, | |
| "learning_rate": 3.824406966005835e-05, | |
| "loss": 0.5215, | |
| "num_tokens": 210126246.0, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.3754906980713432, | |
| "grad_norm": 0.41261103697656065, | |
| "learning_rate": 3.818823619019795e-05, | |
| "loss": 0.5194, | |
| "num_tokens": 210620229.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3763440860215054, | |
| "grad_norm": 0.4539516910573201, | |
| "learning_rate": 3.8132317594949593e-05, | |
| "loss": 0.5181, | |
| "num_tokens": 211067779.0, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.37719747397166753, | |
| "grad_norm": 0.4555163819259218, | |
| "learning_rate": 3.807631431967135e-05, | |
| "loss": 0.5428, | |
| "num_tokens": 211518941.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.3780508619218297, | |
| "grad_norm": 0.42925516663768193, | |
| "learning_rate": 3.8020226810395706e-05, | |
| "loss": 0.5221, | |
| "num_tokens": 212002020.0, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.3789042498719918, | |
| "grad_norm": 0.4255906391324076, | |
| "learning_rate": 3.796405551382602e-05, | |
| "loss": 0.5264, | |
| "num_tokens": 212481056.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.37975763782215394, | |
| "grad_norm": 0.4777451670858945, | |
| "learning_rate": 3.7907800877332945e-05, | |
| "loss": 0.5166, | |
| "num_tokens": 212948510.0, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.3806110257723161, | |
| "grad_norm": 0.42615424229465915, | |
| "learning_rate": 3.785146334895093e-05, | |
| "loss": 0.5036, | |
| "num_tokens": 213401212.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.38146441372247825, | |
| "grad_norm": 0.46819303230834364, | |
| "learning_rate": 3.779504337737456e-05, | |
| "loss": 0.52, | |
| "num_tokens": 213843324.0, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.3823178016726404, | |
| "grad_norm": 0.48118411702315234, | |
| "learning_rate": 3.7738541411955074e-05, | |
| "loss": 0.5385, | |
| "num_tokens": 214296967.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.3831711896228025, | |
| "grad_norm": 0.38919675381080204, | |
| "learning_rate": 3.768195790269672e-05, | |
| "loss": 0.5015, | |
| "num_tokens": 214829914.0, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.38402457757296465, | |
| "grad_norm": 0.4580291673628315, | |
| "learning_rate": 3.762529330025319e-05, | |
| "loss": 0.5628, | |
| "num_tokens": 215306580.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.3848779655231268, | |
| "grad_norm": 0.4176302349851398, | |
| "learning_rate": 3.756854805592405e-05, | |
| "loss": 0.5441, | |
| "num_tokens": 215772605.0, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.38573135347328896, | |
| "grad_norm": 0.5002607549855148, | |
| "learning_rate": 3.7511722621651116e-05, | |
| "loss": 0.527, | |
| "num_tokens": 216266686.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.3865847414234511, | |
| "grad_norm": 0.42680908786848415, | |
| "learning_rate": 3.745481745001488e-05, | |
| "loss": 0.5346, | |
| "num_tokens": 216760935.0, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.38743812937361327, | |
| "grad_norm": 0.45877841243851086, | |
| "learning_rate": 3.7397832994230886e-05, | |
| "loss": 0.5355, | |
| "num_tokens": 217244190.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.38829151732377537, | |
| "grad_norm": 0.4150413457105324, | |
| "learning_rate": 3.734076970814613e-05, | |
| "loss": 0.5029, | |
| "num_tokens": 217688335.0, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.3891449052739375, | |
| "grad_norm": 0.4932503698824527, | |
| "learning_rate": 3.728362804623544e-05, | |
| "loss": 0.5279, | |
| "num_tokens": 218179883.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.3899982932240997, | |
| "grad_norm": 0.6277158666293738, | |
| "learning_rate": 3.7226408463597885e-05, | |
| "loss": 0.5558, | |
| "num_tokens": 218665407.0, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.39085168117426183, | |
| "grad_norm": 0.41124114551004265, | |
| "learning_rate": 3.716911141595309e-05, | |
| "loss": 0.5054, | |
| "num_tokens": 219217201.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.391705069124424, | |
| "grad_norm": 0.4687995128004619, | |
| "learning_rate": 3.711173735963767e-05, | |
| "loss": 0.5165, | |
| "num_tokens": 219676229.0, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.3925584570745861, | |
| "grad_norm": 0.478813879091618, | |
| "learning_rate": 3.705428675160154e-05, | |
| "loss": 0.515, | |
| "num_tokens": 220144855.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.39341184502474824, | |
| "grad_norm": 0.384005573880658, | |
| "learning_rate": 3.6996760049404344e-05, | |
| "loss": 0.4964, | |
| "num_tokens": 220640390.0, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.3942652329749104, | |
| "grad_norm": 0.4329076649233892, | |
| "learning_rate": 3.693915771121173e-05, | |
| "loss": 0.5377, | |
| "num_tokens": 221131861.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.39511862092507255, | |
| "grad_norm": 0.4999542478455457, | |
| "learning_rate": 3.688148019579177e-05, | |
| "loss": 0.5412, | |
| "num_tokens": 221608729.0, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.3959720088752347, | |
| "grad_norm": 0.42106136892975193, | |
| "learning_rate": 3.682372796251125e-05, | |
| "loss": 0.5219, | |
| "num_tokens": 222093783.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.3968253968253968, | |
| "grad_norm": 0.46042824206480915, | |
| "learning_rate": 3.676590147133206e-05, | |
| "loss": 0.5298, | |
| "num_tokens": 222584614.0, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.39767878477555896, | |
| "grad_norm": 0.48738424108068756, | |
| "learning_rate": 3.670800118280753e-05, | |
| "loss": 0.5273, | |
| "num_tokens": 223054352.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.3985321727257211, | |
| "grad_norm": 0.45523944559285684, | |
| "learning_rate": 3.665002755807868e-05, | |
| "loss": 0.5034, | |
| "num_tokens": 223479949.0, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.39938556067588327, | |
| "grad_norm": 0.4254276509166681, | |
| "learning_rate": 3.6591981058870666e-05, | |
| "loss": 0.4771, | |
| "num_tokens": 223929151.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.4002389486260454, | |
| "grad_norm": 0.4329601091567961, | |
| "learning_rate": 3.653386214748902e-05, | |
| "loss": 0.557, | |
| "num_tokens": 224431940.0, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.4010923365762075, | |
| "grad_norm": 0.4323959450216349, | |
| "learning_rate": 3.647567128681598e-05, | |
| "loss": 0.4956, | |
| "num_tokens": 224865728.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.4019457245263697, | |
| "grad_norm": 0.46146583139208747, | |
| "learning_rate": 3.641740894030687e-05, | |
| "loss": 0.5107, | |
| "num_tokens": 225347058.0, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.40279911247653183, | |
| "grad_norm": 0.4242680306945169, | |
| "learning_rate": 3.635907557198629e-05, | |
| "loss": 0.5118, | |
| "num_tokens": 225865685.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.403652500426694, | |
| "grad_norm": 0.45647269857115413, | |
| "learning_rate": 3.630067164644453e-05, | |
| "loss": 0.4901, | |
| "num_tokens": 226268257.0, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.40450588837685614, | |
| "grad_norm": 0.43391408115164065, | |
| "learning_rate": 3.624219762883381e-05, | |
| "loss": 0.5022, | |
| "num_tokens": 226749326.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.40535927632701824, | |
| "grad_norm": 0.426842917443231, | |
| "learning_rate": 3.61836539848646e-05, | |
| "loss": 0.5578, | |
| "num_tokens": 227284246.0, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.4062126642771804, | |
| "grad_norm": 0.45662158044710444, | |
| "learning_rate": 3.6125041180801876e-05, | |
| "loss": 0.5035, | |
| "num_tokens": 227786006.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.40706605222734255, | |
| "grad_norm": 0.3867133176606606, | |
| "learning_rate": 3.606635968346148e-05, | |
| "loss": 0.4959, | |
| "num_tokens": 228251754.0, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.4079194401775047, | |
| "grad_norm": 0.422237589150487, | |
| "learning_rate": 3.6007609960206316e-05, | |
| "loss": 0.5269, | |
| "num_tokens": 228763631.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.40877282812766685, | |
| "grad_norm": 0.46197352048338103, | |
| "learning_rate": 3.5948792478942666e-05, | |
| "loss": 0.5079, | |
| "num_tokens": 229273998.0, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.409626216077829, | |
| "grad_norm": 0.424857545739237, | |
| "learning_rate": 3.588990770811649e-05, | |
| "loss": 0.5143, | |
| "num_tokens": 229748231.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4104796040279911, | |
| "grad_norm": 0.4513797224935267, | |
| "learning_rate": 3.583095611670965e-05, | |
| "loss": 0.5024, | |
| "num_tokens": 230231250.0, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.41133299197815326, | |
| "grad_norm": 0.4163414034805104, | |
| "learning_rate": 3.57719381742362e-05, | |
| "loss": 0.5239, | |
| "num_tokens": 230738165.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.4121863799283154, | |
| "grad_norm": 0.4434748873450728, | |
| "learning_rate": 3.571285435073865e-05, | |
| "loss": 0.5274, | |
| "num_tokens": 231200420.0, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.41303976787847757, | |
| "grad_norm": 0.41052060311046834, | |
| "learning_rate": 3.5653705116784174e-05, | |
| "loss": 0.5193, | |
| "num_tokens": 231716185.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.4138931558286397, | |
| "grad_norm": 0.41357132890327114, | |
| "learning_rate": 3.559449094346096e-05, | |
| "loss": 0.5299, | |
| "num_tokens": 232181406.0, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.4147465437788018, | |
| "grad_norm": 0.3877767349743217, | |
| "learning_rate": 3.5535212302374334e-05, | |
| "loss": 0.5256, | |
| "num_tokens": 232698142.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.415599931728964, | |
| "grad_norm": 0.4709889203746464, | |
| "learning_rate": 3.547586966564314e-05, | |
| "loss": 0.5572, | |
| "num_tokens": 233160411.0, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.41645331967912613, | |
| "grad_norm": 0.41266056756499886, | |
| "learning_rate": 3.5416463505895836e-05, | |
| "loss": 0.5303, | |
| "num_tokens": 233655900.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.4173067076292883, | |
| "grad_norm": 0.4038647167080695, | |
| "learning_rate": 3.5356994296266874e-05, | |
| "loss": 0.5127, | |
| "num_tokens": 234147593.0, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.41816009557945044, | |
| "grad_norm": 0.476972494478868, | |
| "learning_rate": 3.5297462510392796e-05, | |
| "loss": 0.5234, | |
| "num_tokens": 234596302.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.41901348352961254, | |
| "grad_norm": 0.48830304398338387, | |
| "learning_rate": 3.5237868622408574e-05, | |
| "loss": 0.5506, | |
| "num_tokens": 235067909.0, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.4198668714797747, | |
| "grad_norm": 0.4192067255569544, | |
| "learning_rate": 3.5178213106943754e-05, | |
| "loss": 0.5015, | |
| "num_tokens": 235502719.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.42072025942993685, | |
| "grad_norm": 0.4643629521107101, | |
| "learning_rate": 3.5118496439118734e-05, | |
| "loss": 0.5221, | |
| "num_tokens": 235975405.0, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.421573647380099, | |
| "grad_norm": 0.4304948248518558, | |
| "learning_rate": 3.505871909454093e-05, | |
| "loss": 0.5376, | |
| "num_tokens": 236475023.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.42242703533026116, | |
| "grad_norm": 0.4049982974393234, | |
| "learning_rate": 3.4998881549301025e-05, | |
| "loss": 0.5132, | |
| "num_tokens": 237014646.0, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.42328042328042326, | |
| "grad_norm": 0.4396169324109435, | |
| "learning_rate": 3.493898427996917e-05, | |
| "loss": 0.5367, | |
| "num_tokens": 237433481.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.4241338112305854, | |
| "grad_norm": 0.4548416698299291, | |
| "learning_rate": 3.487902776359118e-05, | |
| "loss": 0.5149, | |
| "num_tokens": 237905824.0, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.42498719918074757, | |
| "grad_norm": 0.45370411213783196, | |
| "learning_rate": 3.481901247768471e-05, | |
| "loss": 0.494, | |
| "num_tokens": 238378423.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.4258405871309097, | |
| "grad_norm": 0.43255936627460106, | |
| "learning_rate": 3.4758938900235514e-05, | |
| "loss": 0.5416, | |
| "num_tokens": 238870652.0, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.4266939750810719, | |
| "grad_norm": 0.48454443459996355, | |
| "learning_rate": 3.46988075096936e-05, | |
| "loss": 0.5199, | |
| "num_tokens": 239310469.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.427547363031234, | |
| "grad_norm": 0.42164293312333906, | |
| "learning_rate": 3.463861878496939e-05, | |
| "loss": 0.5108, | |
| "num_tokens": 239810904.0, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.42840075098139613, | |
| "grad_norm": 0.45579788720621434, | |
| "learning_rate": 3.457837320542998e-05, | |
| "loss": 0.5264, | |
| "num_tokens": 240265629.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.4292541389315583, | |
| "grad_norm": 0.4370006161671363, | |
| "learning_rate": 3.451807125089525e-05, | |
| "loss": 0.5298, | |
| "num_tokens": 240726331.0, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.43010752688172044, | |
| "grad_norm": 0.4520018287866162, | |
| "learning_rate": 3.445771340163409e-05, | |
| "loss": 0.5146, | |
| "num_tokens": 241153898.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4309609148318826, | |
| "grad_norm": 0.43492790670886017, | |
| "learning_rate": 3.4397300138360565e-05, | |
| "loss": 0.5295, | |
| "num_tokens": 241625072.0, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.4318143027820447, | |
| "grad_norm": 0.4443784590192856, | |
| "learning_rate": 3.433683194223004e-05, | |
| "loss": 0.542, | |
| "num_tokens": 242074086.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.43266769073220684, | |
| "grad_norm": 0.566094322849903, | |
| "learning_rate": 3.427630929483543e-05, | |
| "loss": 0.5437, | |
| "num_tokens": 242572145.0, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.433521078682369, | |
| "grad_norm": 0.48862510524184777, | |
| "learning_rate": 3.42157326782033e-05, | |
| "loss": 0.5169, | |
| "num_tokens": 243032696.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.43437446663253115, | |
| "grad_norm": 0.45479152949087676, | |
| "learning_rate": 3.415510257479008e-05, | |
| "loss": 0.5451, | |
| "num_tokens": 243549917.0, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.4352278545826933, | |
| "grad_norm": 0.43265925421094587, | |
| "learning_rate": 3.409441946747813e-05, | |
| "loss": 0.5057, | |
| "num_tokens": 244052484.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.43608124253285546, | |
| "grad_norm": 0.40494057790118937, | |
| "learning_rate": 3.403368383957199e-05, | |
| "loss": 0.4883, | |
| "num_tokens": 244496332.0, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.43693463048301756, | |
| "grad_norm": 0.42437156906773316, | |
| "learning_rate": 3.39728961747945e-05, | |
| "loss": 0.4869, | |
| "num_tokens": 245008254.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.4377880184331797, | |
| "grad_norm": 0.38732592887378, | |
| "learning_rate": 3.391205695728295e-05, | |
| "loss": 0.5297, | |
| "num_tokens": 245566459.0, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.43864140638334187, | |
| "grad_norm": 0.3683635369809261, | |
| "learning_rate": 3.3851166671585176e-05, | |
| "loss": 0.521, | |
| "num_tokens": 246076299.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.439494794333504, | |
| "grad_norm": 0.42088704640799, | |
| "learning_rate": 3.3790225802655796e-05, | |
| "loss": 0.5769, | |
| "num_tokens": 246583492.0, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.4403481822836662, | |
| "grad_norm": 0.46665532337470306, | |
| "learning_rate": 3.3729234835852236e-05, | |
| "loss": 0.5365, | |
| "num_tokens": 247016964.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.4412015702338283, | |
| "grad_norm": 0.45592142218192566, | |
| "learning_rate": 3.3668194256930966e-05, | |
| "loss": 0.5145, | |
| "num_tokens": 247509731.0, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.44205495818399043, | |
| "grad_norm": 0.45361752931985455, | |
| "learning_rate": 3.360710455204357e-05, | |
| "loss": 0.4864, | |
| "num_tokens": 247913243.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.4429083461341526, | |
| "grad_norm": 0.40568695596331544, | |
| "learning_rate": 3.354596620773288e-05, | |
| "loss": 0.5155, | |
| "num_tokens": 248374918.0, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.44376173408431474, | |
| "grad_norm": 0.42150082834320685, | |
| "learning_rate": 3.348477971092914e-05, | |
| "loss": 0.5217, | |
| "num_tokens": 248791992.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4446151220344769, | |
| "grad_norm": 0.4025765145454328, | |
| "learning_rate": 3.3423545548946074e-05, | |
| "loss": 0.4981, | |
| "num_tokens": 249300317.0, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.445468509984639, | |
| "grad_norm": 0.47790378494764735, | |
| "learning_rate": 3.336226420947704e-05, | |
| "loss": 0.5357, | |
| "num_tokens": 249736240.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.44632189793480115, | |
| "grad_norm": 0.3786578220504516, | |
| "learning_rate": 3.330093618059114e-05, | |
| "loss": 0.5352, | |
| "num_tokens": 250229897.0, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.4471752858849633, | |
| "grad_norm": 0.4335834827562002, | |
| "learning_rate": 3.323956195072932e-05, | |
| "loss": 0.5054, | |
| "num_tokens": 250713070.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.44802867383512546, | |
| "grad_norm": 0.47402600153196595, | |
| "learning_rate": 3.3178142008700494e-05, | |
| "loss": 0.5106, | |
| "num_tokens": 251151646.0, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.4488820617852876, | |
| "grad_norm": 0.45317934952926076, | |
| "learning_rate": 3.311667684367765e-05, | |
| "loss": 0.5275, | |
| "num_tokens": 251635307.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.4497354497354497, | |
| "grad_norm": 0.39315106805520583, | |
| "learning_rate": 3.3055166945193944e-05, | |
| "loss": 0.4996, | |
| "num_tokens": 252132086.0, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.45058883768561186, | |
| "grad_norm": 0.4143629204736371, | |
| "learning_rate": 3.299361280313881e-05, | |
| "loss": 0.4849, | |
| "num_tokens": 252648795.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.451442225635774, | |
| "grad_norm": 0.44400289513216434, | |
| "learning_rate": 3.293201490775406e-05, | |
| "loss": 0.5171, | |
| "num_tokens": 253139442.0, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.4522956135859362, | |
| "grad_norm": 0.45522752917796416, | |
| "learning_rate": 3.2870373749629954e-05, | |
| "loss": 0.5502, | |
| "num_tokens": 253618343.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.45314900153609833, | |
| "grad_norm": 0.38876691549496806, | |
| "learning_rate": 3.280868981970134e-05, | |
| "loss": 0.4981, | |
| "num_tokens": 254046961.0, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.4540023894862604, | |
| "grad_norm": 0.47317345694440643, | |
| "learning_rate": 3.2746963609243716e-05, | |
| "loss": 0.5105, | |
| "num_tokens": 254539106.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.4548557774364226, | |
| "grad_norm": 0.366524222877937, | |
| "learning_rate": 3.26851956098693e-05, | |
| "loss": 0.5178, | |
| "num_tokens": 255015832.0, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.45570916538658474, | |
| "grad_norm": 0.4349193539753097, | |
| "learning_rate": 3.2623386313523145e-05, | |
| "loss": 0.5114, | |
| "num_tokens": 255475551.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.4565625533367469, | |
| "grad_norm": 0.3904044605439651, | |
| "learning_rate": 3.256153621247921e-05, | |
| "loss": 0.5348, | |
| "num_tokens": 256012259.0, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.45741594128690904, | |
| "grad_norm": 0.4327658112630048, | |
| "learning_rate": 3.249964579933644e-05, | |
| "loss": 0.5086, | |
| "num_tokens": 256460069.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4582693292370712, | |
| "grad_norm": 0.4428069600064318, | |
| "learning_rate": 3.2437715567014836e-05, | |
| "loss": 0.4924, | |
| "num_tokens": 256926548.0, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.4591227171872333, | |
| "grad_norm": 0.460066832634619, | |
| "learning_rate": 3.237574600875154e-05, | |
| "loss": 0.5063, | |
| "num_tokens": 257384246.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.45997610513739545, | |
| "grad_norm": 0.40855459105660263, | |
| "learning_rate": 3.231373761809689e-05, | |
| "loss": 0.5196, | |
| "num_tokens": 257950343.0, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.4608294930875576, | |
| "grad_norm": 0.4172932873551889, | |
| "learning_rate": 3.2251690888910535e-05, | |
| "loss": 0.4813, | |
| "num_tokens": 258439462.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.46168288103771976, | |
| "grad_norm": 0.4248784573748959, | |
| "learning_rate": 3.218960631535742e-05, | |
| "loss": 0.5116, | |
| "num_tokens": 258898015.0, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.4625362689878819, | |
| "grad_norm": 0.4564197798862648, | |
| "learning_rate": 3.212748439190392e-05, | |
| "loss": 0.5142, | |
| "num_tokens": 259419905.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.463389656938044, | |
| "grad_norm": 0.39895871159504365, | |
| "learning_rate": 3.20653256133139e-05, | |
| "loss": 0.5095, | |
| "num_tokens": 259945956.0, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.46424304488820617, | |
| "grad_norm": 0.40644909219196834, | |
| "learning_rate": 3.200313047464471e-05, | |
| "loss": 0.5489, | |
| "num_tokens": 260499223.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.4650964328383683, | |
| "grad_norm": 0.44281326283922634, | |
| "learning_rate": 3.194089947124333e-05, | |
| "loss": 0.4883, | |
| "num_tokens": 260905757.0, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.4659498207885305, | |
| "grad_norm": 0.37821784550445875, | |
| "learning_rate": 3.1878633098742344e-05, | |
| "loss": 0.5025, | |
| "num_tokens": 261442637.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.46680320873869263, | |
| "grad_norm": 0.41587702085637157, | |
| "learning_rate": 3.1816331853056063e-05, | |
| "loss": 0.5058, | |
| "num_tokens": 261881998.0, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.46765659668885473, | |
| "grad_norm": 0.3579938113507809, | |
| "learning_rate": 3.175399623037652e-05, | |
| "loss": 0.5053, | |
| "num_tokens": 262428104.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.4685099846390169, | |
| "grad_norm": 0.4228513819003615, | |
| "learning_rate": 3.169162672716954e-05, | |
| "loss": 0.486, | |
| "num_tokens": 262835596.0, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.46936337258917904, | |
| "grad_norm": 0.39234185689036544, | |
| "learning_rate": 3.162922384017081e-05, | |
| "loss": 0.4938, | |
| "num_tokens": 263310660.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.4702167605393412, | |
| "grad_norm": 0.4421974022376107, | |
| "learning_rate": 3.156678806638186e-05, | |
| "loss": 0.4997, | |
| "num_tokens": 263785113.0, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.47107014848950335, | |
| "grad_norm": 0.40859274360519093, | |
| "learning_rate": 3.150431990306617e-05, | |
| "loss": 0.5395, | |
| "num_tokens": 264331327.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.47192353643966545, | |
| "grad_norm": 0.42573162840619944, | |
| "learning_rate": 3.1441819847745186e-05, | |
| "loss": 0.496, | |
| "num_tokens": 264761907.0, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.4727769243898276, | |
| "grad_norm": 0.3852077729248029, | |
| "learning_rate": 3.137928839819434e-05, | |
| "loss": 0.5266, | |
| "num_tokens": 265270147.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.47363031233998976, | |
| "grad_norm": 0.4596895535171644, | |
| "learning_rate": 3.131672605243911e-05, | |
| "loss": 0.5056, | |
| "num_tokens": 265726159.0, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.4744837002901519, | |
| "grad_norm": 0.4347272943279631, | |
| "learning_rate": 3.125413330875104e-05, | |
| "loss": 0.5209, | |
| "num_tokens": 266242185.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.47533708824031407, | |
| "grad_norm": 0.4156213578123365, | |
| "learning_rate": 3.119151066564378e-05, | |
| "loss": 0.538, | |
| "num_tokens": 266708386.0, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 0.4049702981972163, | |
| "learning_rate": 3.1128858621869084e-05, | |
| "loss": 0.4874, | |
| "num_tokens": 267191130.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.4770438641406383, | |
| "grad_norm": 0.4348670360779557, | |
| "learning_rate": 3.106617767641291e-05, | |
| "loss": 0.4975, | |
| "num_tokens": 267663197.0, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.4778972520908005, | |
| "grad_norm": 0.43225104742799214, | |
| "learning_rate": 3.100346832849137e-05, | |
| "loss": 0.5238, | |
| "num_tokens": 268142845.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4787506400409626, | |
| "grad_norm": 0.4088305738694034, | |
| "learning_rate": 3.0940731077546806e-05, | |
| "loss": 0.5295, | |
| "num_tokens": 268630880.0, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.4796040279911248, | |
| "grad_norm": 0.40782563301753116, | |
| "learning_rate": 3.087796642324376e-05, | |
| "loss": 0.5257, | |
| "num_tokens": 269116555.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.48045741594128694, | |
| "grad_norm": 0.42105056058475027, | |
| "learning_rate": 3.0815174865465046e-05, | |
| "loss": 0.5226, | |
| "num_tokens": 269632490.0, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.48131080389144903, | |
| "grad_norm": 0.4076093239232857, | |
| "learning_rate": 3.075235690430775e-05, | |
| "loss": 0.5127, | |
| "num_tokens": 270078357.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.4821641918416112, | |
| "grad_norm": 0.4301254693302114, | |
| "learning_rate": 3.0689513040079235e-05, | |
| "loss": 0.5001, | |
| "num_tokens": 270555837.0, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.48301757979177334, | |
| "grad_norm": 0.4332880792431796, | |
| "learning_rate": 3.062664377329317e-05, | |
| "loss": 0.5367, | |
| "num_tokens": 271065401.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.4838709677419355, | |
| "grad_norm": 0.45368572120343953, | |
| "learning_rate": 3.0563749604665556e-05, | |
| "loss": 0.5408, | |
| "num_tokens": 271556701.0, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.48472435569209765, | |
| "grad_norm": 0.4520673606943377, | |
| "learning_rate": 3.0500831035110677e-05, | |
| "loss": 0.4989, | |
| "num_tokens": 272018748.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.48557774364225975, | |
| "grad_norm": 0.39523619712392105, | |
| "learning_rate": 3.0437888565737215e-05, | |
| "loss": 0.5439, | |
| "num_tokens": 272549900.0, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.4864311315924219, | |
| "grad_norm": 0.43000518990498643, | |
| "learning_rate": 3.0374922697844167e-05, | |
| "loss": 0.4952, | |
| "num_tokens": 273051158.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.48728451954258406, | |
| "grad_norm": 0.38808168747132776, | |
| "learning_rate": 3.0311933932916874e-05, | |
| "loss": 0.5456, | |
| "num_tokens": 273517952.0, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.4881379074927462, | |
| "grad_norm": 0.4977367232996969, | |
| "learning_rate": 3.0248922772623066e-05, | |
| "loss": 0.5267, | |
| "num_tokens": 274027158.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.48899129544290837, | |
| "grad_norm": 0.49433388548273255, | |
| "learning_rate": 3.0185889718808813e-05, | |
| "loss": 0.5234, | |
| "num_tokens": 274522673.0, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.48984468339307047, | |
| "grad_norm": 0.47900994253841755, | |
| "learning_rate": 3.012283527349458e-05, | |
| "loss": 0.4906, | |
| "num_tokens": 274976608.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.4906980713432326, | |
| "grad_norm": 0.4511536975657782, | |
| "learning_rate": 3.0059759938871194e-05, | |
| "loss": 0.5451, | |
| "num_tokens": 275448649.0, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.4915514592933948, | |
| "grad_norm": 0.4737249004161396, | |
| "learning_rate": 2.9996664217295832e-05, | |
| "loss": 0.5252, | |
| "num_tokens": 275897357.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.49240484724355693, | |
| "grad_norm": 0.4102917765501351, | |
| "learning_rate": 2.9933548611288064e-05, | |
| "loss": 0.4952, | |
| "num_tokens": 276347055.0, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.4932582351937191, | |
| "grad_norm": 0.5020992671306612, | |
| "learning_rate": 2.987041362352581e-05, | |
| "loss": 0.4826, | |
| "num_tokens": 276767022.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.4941116231438812, | |
| "grad_norm": 0.4420055358517792, | |
| "learning_rate": 2.9807259756841383e-05, | |
| "loss": 0.489, | |
| "num_tokens": 277212119.0, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.49496501109404334, | |
| "grad_norm": 0.4477313795217654, | |
| "learning_rate": 2.974408751421743e-05, | |
| "loss": 0.497, | |
| "num_tokens": 277733458.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.4958183990442055, | |
| "grad_norm": 0.47208064434768, | |
| "learning_rate": 2.9680897398782946e-05, | |
| "loss": 0.5309, | |
| "num_tokens": 278170013.0, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.49667178699436765, | |
| "grad_norm": 0.4585105698260265, | |
| "learning_rate": 2.9617689913809304e-05, | |
| "loss": 0.507, | |
| "num_tokens": 278655070.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.4975251749445298, | |
| "grad_norm": 0.43664634631684907, | |
| "learning_rate": 2.955446556270618e-05, | |
| "loss": 0.5132, | |
| "num_tokens": 279102554.0, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.4983785628946919, | |
| "grad_norm": 0.45462237543409334, | |
| "learning_rate": 2.9491224849017602e-05, | |
| "loss": 0.5061, | |
| "num_tokens": 279550428.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.49923195084485406, | |
| "grad_norm": 0.43393219433167807, | |
| "learning_rate": 2.94279682764179e-05, | |
| "loss": 0.5009, | |
| "num_tokens": 280040967.0, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.5000853387950163, | |
| "grad_norm": 0.4016235110349555, | |
| "learning_rate": 2.9364696348707726e-05, | |
| "loss": 0.5065, | |
| "num_tokens": 280592599.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.5009387267451784, | |
| "grad_norm": 0.44604082171147685, | |
| "learning_rate": 2.930140956981002e-05, | |
| "loss": 0.4968, | |
| "num_tokens": 281007234.0, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.5017921146953405, | |
| "grad_norm": 0.3972539608962464, | |
| "learning_rate": 2.9238108443765988e-05, | |
| "loss": 0.4917, | |
| "num_tokens": 281502549.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.5026455026455027, | |
| "grad_norm": 0.4068618524764002, | |
| "learning_rate": 2.9174793474731133e-05, | |
| "loss": 0.5169, | |
| "num_tokens": 282007423.0, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.5034988905956648, | |
| "grad_norm": 0.3934284698838445, | |
| "learning_rate": 2.911146516697118e-05, | |
| "loss": 0.5206, | |
| "num_tokens": 282554594.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.504352278545827, | |
| "grad_norm": 0.49794445260102854, | |
| "learning_rate": 2.904812402485811e-05, | |
| "loss": 0.5336, | |
| "num_tokens": 283018404.0, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.5052056664959891, | |
| "grad_norm": 0.4301520522221805, | |
| "learning_rate": 2.8984770552866108e-05, | |
| "loss": 0.4652, | |
| "num_tokens": 283508120.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.5060590544461512, | |
| "grad_norm": 0.3630491480984147, | |
| "learning_rate": 2.8921405255567578e-05, | |
| "loss": 0.4703, | |
| "num_tokens": 283980181.0, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.5069124423963134, | |
| "grad_norm": 0.4383642773880878, | |
| "learning_rate": 2.8858028637629063e-05, | |
| "loss": 0.5036, | |
| "num_tokens": 284430674.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.5077658303464755, | |
| "grad_norm": 0.39546928553841354, | |
| "learning_rate": 2.8794641203807314e-05, | |
| "loss": 0.516, | |
| "num_tokens": 284959792.0, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.5086192182966377, | |
| "grad_norm": 0.37660609263719025, | |
| "learning_rate": 2.873124345894521e-05, | |
| "loss": 0.5281, | |
| "num_tokens": 285480194.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.5094726062467998, | |
| "grad_norm": 0.4024974092986853, | |
| "learning_rate": 2.8667835907967748e-05, | |
| "loss": 0.5225, | |
| "num_tokens": 285984736.0, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.5103259941969619, | |
| "grad_norm": 0.44800596936588594, | |
| "learning_rate": 2.8604419055878017e-05, | |
| "loss": 0.4856, | |
| "num_tokens": 286426656.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.5111793821471241, | |
| "grad_norm": 0.4590106934442063, | |
| "learning_rate": 2.854099340775319e-05, | |
| "loss": 0.5204, | |
| "num_tokens": 286879976.0, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.5120327700972862, | |
| "grad_norm": 0.4437459388280179, | |
| "learning_rate": 2.8477559468740506e-05, | |
| "loss": 0.5419, | |
| "num_tokens": 287419124.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5128861580474484, | |
| "grad_norm": 0.3941391853137448, | |
| "learning_rate": 2.8414117744053225e-05, | |
| "loss": 0.4996, | |
| "num_tokens": 287889044.0, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 0.5137395459976105, | |
| "grad_norm": 0.4486424754845704, | |
| "learning_rate": 2.8350668738966612e-05, | |
| "loss": 0.4796, | |
| "num_tokens": 288343301.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.5145929339477726, | |
| "grad_norm": 0.41559997831283824, | |
| "learning_rate": 2.8287212958813926e-05, | |
| "loss": 0.4688, | |
| "num_tokens": 288795850.0, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 0.5154463218979348, | |
| "grad_norm": 0.44299000952386297, | |
| "learning_rate": 2.8223750908982378e-05, | |
| "loss": 0.5127, | |
| "num_tokens": 289306762.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.5162997098480969, | |
| "grad_norm": 0.4599537421363437, | |
| "learning_rate": 2.8160283094909105e-05, | |
| "loss": 0.508, | |
| "num_tokens": 289781437.0, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.5171530977982591, | |
| "grad_norm": 0.40069114544445816, | |
| "learning_rate": 2.8096810022077184e-05, | |
| "loss": 0.5014, | |
| "num_tokens": 290236538.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.5180064857484212, | |
| "grad_norm": 0.45676370908777364, | |
| "learning_rate": 2.8033332196011548e-05, | |
| "loss": 0.5094, | |
| "num_tokens": 290654343.0, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 0.5188598736985833, | |
| "grad_norm": 0.42700441443187453, | |
| "learning_rate": 2.7969850122274977e-05, | |
| "loss": 0.5073, | |
| "num_tokens": 291105578.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.5197132616487455, | |
| "grad_norm": 0.4437302855763285, | |
| "learning_rate": 2.7906364306464116e-05, | |
| "loss": 0.5484, | |
| "num_tokens": 291563085.0, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 0.5205666495989076, | |
| "grad_norm": 0.4080110251430444, | |
| "learning_rate": 2.784287525420538e-05, | |
| "loss": 0.5068, | |
| "num_tokens": 292089939.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.5214200375490698, | |
| "grad_norm": 0.4410861917351765, | |
| "learning_rate": 2.777938347115098e-05, | |
| "loss": 0.4886, | |
| "num_tokens": 292575735.0, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 0.522273425499232, | |
| "grad_norm": 0.3785526744465806, | |
| "learning_rate": 2.771588946297488e-05, | |
| "loss": 0.5033, | |
| "num_tokens": 293082816.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.523126813449394, | |
| "grad_norm": 0.40308065766510176, | |
| "learning_rate": 2.7652393735368747e-05, | |
| "loss": 0.5058, | |
| "num_tokens": 293551824.0, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 0.5239802013995563, | |
| "grad_norm": 0.4339117207480613, | |
| "learning_rate": 2.758889679403795e-05, | |
| "loss": 0.5021, | |
| "num_tokens": 293957075.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.5248335893497184, | |
| "grad_norm": 0.4522692729196487, | |
| "learning_rate": 2.7525399144697534e-05, | |
| "loss": 0.5023, | |
| "num_tokens": 294462942.0, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.5256869772998806, | |
| "grad_norm": 0.42818343974439493, | |
| "learning_rate": 2.746190129306816e-05, | |
| "loss": 0.4971, | |
| "num_tokens": 294930434.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.5265403652500427, | |
| "grad_norm": 0.4360784349237638, | |
| "learning_rate": 2.739840374487214e-05, | |
| "loss": 0.5238, | |
| "num_tokens": 295431615.0, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 0.5273937532002048, | |
| "grad_norm": 0.4196174080934708, | |
| "learning_rate": 2.733490700582932e-05, | |
| "loss": 0.5098, | |
| "num_tokens": 295898443.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.528247141150367, | |
| "grad_norm": 0.3852913637071565, | |
| "learning_rate": 2.7271411581653145e-05, | |
| "loss": 0.4832, | |
| "num_tokens": 296353814.0, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 0.5291005291005291, | |
| "grad_norm": 0.40367490819418617, | |
| "learning_rate": 2.720791797804656e-05, | |
| "loss": 0.525, | |
| "num_tokens": 296842807.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5299539170506913, | |
| "grad_norm": 0.38224241516328433, | |
| "learning_rate": 2.7144426700698012e-05, | |
| "loss": 0.5212, | |
| "num_tokens": 297377978.0, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 0.5308073050008534, | |
| "grad_norm": 0.38884647817248724, | |
| "learning_rate": 2.708093825527745e-05, | |
| "loss": 0.5062, | |
| "num_tokens": 297874811.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.5316606929510155, | |
| "grad_norm": 0.3824119289882588, | |
| "learning_rate": 2.7017453147432225e-05, | |
| "loss": 0.485, | |
| "num_tokens": 298297913.0, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 0.5325140809011777, | |
| "grad_norm": 0.46386419421635433, | |
| "learning_rate": 2.6953971882783142e-05, | |
| "loss": 0.5173, | |
| "num_tokens": 298726998.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5333674688513398, | |
| "grad_norm": 0.43321631161233604, | |
| "learning_rate": 2.689049496692037e-05, | |
| "loss": 0.5044, | |
| "num_tokens": 299231719.0, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.534220856801502, | |
| "grad_norm": 0.38820410814314277, | |
| "learning_rate": 2.6827022905399456e-05, | |
| "loss": 0.5169, | |
| "num_tokens": 299723782.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.5350742447516641, | |
| "grad_norm": 0.4802499836124574, | |
| "learning_rate": 2.676355620373731e-05, | |
| "loss": 0.5016, | |
| "num_tokens": 300205069.0, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 0.5359276327018262, | |
| "grad_norm": 0.41530827448297597, | |
| "learning_rate": 2.67000953674081e-05, | |
| "loss": 0.4705, | |
| "num_tokens": 300660125.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.5367810206519884, | |
| "grad_norm": 0.4627794473994795, | |
| "learning_rate": 2.663664090183932e-05, | |
| "loss": 0.5069, | |
| "num_tokens": 301065861.0, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 0.5376344086021505, | |
| "grad_norm": 0.4215990049412115, | |
| "learning_rate": 2.657319331240771e-05, | |
| "loss": 0.4913, | |
| "num_tokens": 301489572.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.5384877965523127, | |
| "grad_norm": 0.3817969956139297, | |
| "learning_rate": 2.650975310443525e-05, | |
| "loss": 0.4677, | |
| "num_tokens": 301998215.0, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 0.5393411845024748, | |
| "grad_norm": 0.39589002477034635, | |
| "learning_rate": 2.644632078318513e-05, | |
| "loss": 0.4861, | |
| "num_tokens": 302437719.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.540194572452637, | |
| "grad_norm": 0.41456292279502605, | |
| "learning_rate": 2.6382896853857736e-05, | |
| "loss": 0.5216, | |
| "num_tokens": 302939376.0, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 0.5410479604027991, | |
| "grad_norm": 0.38635741541653823, | |
| "learning_rate": 2.63194818215866e-05, | |
| "loss": 0.5238, | |
| "num_tokens": 303463770.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.5419013483529612, | |
| "grad_norm": 0.42334131939782493, | |
| "learning_rate": 2.625607619143439e-05, | |
| "loss": 0.5185, | |
| "num_tokens": 303913234.0, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.5427547363031234, | |
| "grad_norm": 0.42056612217199596, | |
| "learning_rate": 2.619268046838893e-05, | |
| "loss": 0.4923, | |
| "num_tokens": 304343518.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.5436081242532855, | |
| "grad_norm": 0.397540864182293, | |
| "learning_rate": 2.61292951573591e-05, | |
| "loss": 0.519, | |
| "num_tokens": 304816425.0, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 0.5444615122034477, | |
| "grad_norm": 0.4452678145317302, | |
| "learning_rate": 2.606592076317087e-05, | |
| "loss": 0.5183, | |
| "num_tokens": 305270656.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.5453149001536098, | |
| "grad_norm": 0.36481720174114934, | |
| "learning_rate": 2.6002557790563276e-05, | |
| "loss": 0.5148, | |
| "num_tokens": 305784206.0, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 0.5461682881037719, | |
| "grad_norm": 0.43334773061691706, | |
| "learning_rate": 2.5939206744184354e-05, | |
| "loss": 0.4937, | |
| "num_tokens": 306180961.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5470216760539341, | |
| "grad_norm": 0.40557388342408573, | |
| "learning_rate": 2.5875868128587177e-05, | |
| "loss": 0.5024, | |
| "num_tokens": 306669118.0, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 0.5478750640040962, | |
| "grad_norm": 0.45164418625731906, | |
| "learning_rate": 2.5812542448225836e-05, | |
| "loss": 0.5587, | |
| "num_tokens": 307152419.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.5487284519542585, | |
| "grad_norm": 0.42125446183736254, | |
| "learning_rate": 2.574923020745135e-05, | |
| "loss": 0.5196, | |
| "num_tokens": 307606425.0, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 0.5495818399044206, | |
| "grad_norm": 0.4334062162332435, | |
| "learning_rate": 2.5685931910507756e-05, | |
| "loss": 0.4842, | |
| "num_tokens": 308076053.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5504352278545827, | |
| "grad_norm": 0.39639612499559884, | |
| "learning_rate": 2.562264806152798e-05, | |
| "loss": 0.531, | |
| "num_tokens": 308553946.0, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.5512886158047449, | |
| "grad_norm": 0.9850518699011881, | |
| "learning_rate": 2.5559379164529916e-05, | |
| "loss": 0.5016, | |
| "num_tokens": 309029777.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.552142003754907, | |
| "grad_norm": 0.4378674201988131, | |
| "learning_rate": 2.5496125723412378e-05, | |
| "loss": 0.4953, | |
| "num_tokens": 309539091.0, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 0.5529953917050692, | |
| "grad_norm": 0.4690543065354333, | |
| "learning_rate": 2.5432888241951047e-05, | |
| "loss": 0.5015, | |
| "num_tokens": 310003006.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.5538487796552313, | |
| "grad_norm": 0.4369823724960186, | |
| "learning_rate": 2.5369667223794546e-05, | |
| "loss": 0.4817, | |
| "num_tokens": 310493436.0, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 0.5547021676053934, | |
| "grad_norm": 0.44917439930869113, | |
| "learning_rate": 2.5306463172460327e-05, | |
| "loss": 0.4945, | |
| "num_tokens": 310946881.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5555555555555556, | |
| "grad_norm": 0.4284766174041252, | |
| "learning_rate": 2.524327659133075e-05, | |
| "loss": 0.5183, | |
| "num_tokens": 311432512.0, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 0.5564089435057177, | |
| "grad_norm": 0.42777974813645614, | |
| "learning_rate": 2.5180107983649e-05, | |
| "loss": 0.4823, | |
| "num_tokens": 311908102.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.5572623314558799, | |
| "grad_norm": 0.4354339701125081, | |
| "learning_rate": 2.5116957852515144e-05, | |
| "loss": 0.5117, | |
| "num_tokens": 312411813.0, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 0.558115719406042, | |
| "grad_norm": 0.4373291085879072, | |
| "learning_rate": 2.50538267008821e-05, | |
| "loss": 0.5042, | |
| "num_tokens": 312808887.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.5589691073562041, | |
| "grad_norm": 0.4705347746627223, | |
| "learning_rate": 2.4990715031551576e-05, | |
| "loss": 0.4805, | |
| "num_tokens": 313245307.0, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.5598224953063663, | |
| "grad_norm": 0.3923081123015428, | |
| "learning_rate": 2.4927623347170187e-05, | |
| "loss": 0.5039, | |
| "num_tokens": 313777123.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5606758832565284, | |
| "grad_norm": 0.4311798687682826, | |
| "learning_rate": 2.4864552150225313e-05, | |
| "loss": 0.4972, | |
| "num_tokens": 314292359.0, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 0.5615292712066906, | |
| "grad_norm": 0.44410065167660334, | |
| "learning_rate": 2.4801501943041207e-05, | |
| "loss": 0.5191, | |
| "num_tokens": 314739056.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.5623826591568527, | |
| "grad_norm": 0.4304815272397994, | |
| "learning_rate": 2.473847322777494e-05, | |
| "loss": 0.4745, | |
| "num_tokens": 315237748.0, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 0.5632360471070148, | |
| "grad_norm": 0.40916157233641587, | |
| "learning_rate": 2.467546650641239e-05, | |
| "loss": 0.5034, | |
| "num_tokens": 315742426.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.564089435057177, | |
| "grad_norm": 0.45059898318172864, | |
| "learning_rate": 2.461248228076431e-05, | |
| "loss": 0.5157, | |
| "num_tokens": 316189696.0, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 0.5649428230073391, | |
| "grad_norm": 0.40355953359125624, | |
| "learning_rate": 2.454952105246225e-05, | |
| "loss": 0.516, | |
| "num_tokens": 316737000.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.5657962109575013, | |
| "grad_norm": 0.4492863148515123, | |
| "learning_rate": 2.4486583322954615e-05, | |
| "loss": 0.491, | |
| "num_tokens": 317169178.0, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 0.5666495989076634, | |
| "grad_norm": 0.450129107023297, | |
| "learning_rate": 2.4423669593502674e-05, | |
| "loss": 0.4967, | |
| "num_tokens": 317577413.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.5675029868578255, | |
| "grad_norm": 0.43455560755021194, | |
| "learning_rate": 2.43607803651765e-05, | |
| "loss": 0.5366, | |
| "num_tokens": 318049538.0, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.5683563748079877, | |
| "grad_norm": 0.3750935484263295, | |
| "learning_rate": 2.429791613885109e-05, | |
| "loss": 0.4701, | |
| "num_tokens": 318577669.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.5692097627581498, | |
| "grad_norm": 0.3897674804665693, | |
| "learning_rate": 2.4235077415202267e-05, | |
| "loss": 0.509, | |
| "num_tokens": 319096463.0, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 0.570063150708312, | |
| "grad_norm": 0.4683941092191944, | |
| "learning_rate": 2.4172264694702766e-05, | |
| "loss": 0.482, | |
| "num_tokens": 319557900.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5709165386584741, | |
| "grad_norm": 0.4355962046233121, | |
| "learning_rate": 2.4109478477618226e-05, | |
| "loss": 0.5074, | |
| "num_tokens": 319972387.0, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 0.5717699266086362, | |
| "grad_norm": 0.40015806498212714, | |
| "learning_rate": 2.404671926400317e-05, | |
| "loss": 0.4922, | |
| "num_tokens": 320511805.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5726233145587984, | |
| "grad_norm": 0.46297355892665293, | |
| "learning_rate": 2.39839875536971e-05, | |
| "loss": 0.5178, | |
| "num_tokens": 320967264.0, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 0.5734767025089605, | |
| "grad_norm": 0.45633772588444144, | |
| "learning_rate": 2.3921283846320434e-05, | |
| "loss": 0.4936, | |
| "num_tokens": 321462994.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5743300904591228, | |
| "grad_norm": 0.4059712174597122, | |
| "learning_rate": 2.385860864127057e-05, | |
| "loss": 0.4844, | |
| "num_tokens": 321933448.0, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 0.5751834784092849, | |
| "grad_norm": 0.4227906432017419, | |
| "learning_rate": 2.3795962437717933e-05, | |
| "loss": 0.4806, | |
| "num_tokens": 322380429.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.576036866359447, | |
| "grad_norm": 0.42185673678737146, | |
| "learning_rate": 2.3733345734601926e-05, | |
| "loss": 0.5041, | |
| "num_tokens": 322853511.0, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.5768902543096092, | |
| "grad_norm": 0.4494646688158493, | |
| "learning_rate": 2.3670759030627026e-05, | |
| "loss": 0.5054, | |
| "num_tokens": 323355363.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5777436422597713, | |
| "grad_norm": 0.387393693793285, | |
| "learning_rate": 2.3608202824258756e-05, | |
| "loss": 0.4813, | |
| "num_tokens": 323812185.0, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 0.5785970302099335, | |
| "grad_norm": 0.4280421796882521, | |
| "learning_rate": 2.3545677613719796e-05, | |
| "loss": 0.5274, | |
| "num_tokens": 324276849.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.5794504181600956, | |
| "grad_norm": 0.40866942244292914, | |
| "learning_rate": 2.3483183896985905e-05, | |
| "loss": 0.5184, | |
| "num_tokens": 324753875.0, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 0.5803038061102577, | |
| "grad_norm": 0.44181206333894957, | |
| "learning_rate": 2.3420722171782044e-05, | |
| "loss": 0.4629, | |
| "num_tokens": 325155802.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5811571940604199, | |
| "grad_norm": 0.4355223116026911, | |
| "learning_rate": 2.335829293557839e-05, | |
| "loss": 0.4631, | |
| "num_tokens": 325595711.0, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 0.582010582010582, | |
| "grad_norm": 0.439684162961097, | |
| "learning_rate": 2.3295896685586327e-05, | |
| "loss": 0.4848, | |
| "num_tokens": 325996402.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5828639699607442, | |
| "grad_norm": 0.43142934478354206, | |
| "learning_rate": 2.3233533918754568e-05, | |
| "loss": 0.5191, | |
| "num_tokens": 326497583.0, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 0.5837173579109063, | |
| "grad_norm": 0.4030134147551671, | |
| "learning_rate": 2.3171205131765106e-05, | |
| "loss": 0.541, | |
| "num_tokens": 326956942.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5845707458610685, | |
| "grad_norm": 0.3987803523433034, | |
| "learning_rate": 2.3108910821029357e-05, | |
| "loss": 0.4948, | |
| "num_tokens": 327486192.0, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.5854241338112306, | |
| "grad_norm": 0.5229502470149472, | |
| "learning_rate": 2.304665148268411e-05, | |
| "loss": 0.5277, | |
| "num_tokens": 327985119.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5862775217613927, | |
| "grad_norm": 0.4072552891208659, | |
| "learning_rate": 2.2984427612587638e-05, | |
| "loss": 0.482, | |
| "num_tokens": 328466172.0, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 0.5871309097115549, | |
| "grad_norm": 0.43876678179089584, | |
| "learning_rate": 2.2922239706315745e-05, | |
| "loss": 0.4928, | |
| "num_tokens": 328885222.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.587984297661717, | |
| "grad_norm": 0.4427289792806618, | |
| "learning_rate": 2.2860088259157776e-05, | |
| "loss": 0.5126, | |
| "num_tokens": 329384750.0, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 0.5888376856118792, | |
| "grad_norm": 0.40099651188178703, | |
| "learning_rate": 2.2797973766112702e-05, | |
| "loss": 0.4972, | |
| "num_tokens": 329923181.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5896910735620413, | |
| "grad_norm": 0.44199703682750713, | |
| "learning_rate": 2.2735896721885218e-05, | |
| "loss": 0.4843, | |
| "num_tokens": 330370757.0, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 0.5905444615122034, | |
| "grad_norm": 0.3875820648420032, | |
| "learning_rate": 2.2673857620881712e-05, | |
| "loss": 0.4928, | |
| "num_tokens": 330878184.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5913978494623656, | |
| "grad_norm": 0.3906829837925773, | |
| "learning_rate": 2.2611856957206413e-05, | |
| "loss": 0.4651, | |
| "num_tokens": 331388657.0, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 0.5922512374125277, | |
| "grad_norm": 0.4171132760042612, | |
| "learning_rate": 2.2549895224657392e-05, | |
| "loss": 0.4672, | |
| "num_tokens": 331849770.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.5931046253626899, | |
| "grad_norm": 0.4926802679183523, | |
| "learning_rate": 2.248797291672267e-05, | |
| "loss": 0.458, | |
| "num_tokens": 332306509.0, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.593958013312852, | |
| "grad_norm": 0.4438284770275958, | |
| "learning_rate": 2.2426090526576288e-05, | |
| "loss": 0.5335, | |
| "num_tokens": 332821824.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5948114012630141, | |
| "grad_norm": 0.429161217863249, | |
| "learning_rate": 2.2364248547074335e-05, | |
| "loss": 0.4872, | |
| "num_tokens": 333290729.0, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 0.5956647892131763, | |
| "grad_norm": 0.44213140160826603, | |
| "learning_rate": 2.2302447470751087e-05, | |
| "loss": 0.529, | |
| "num_tokens": 333794769.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.5965181771633384, | |
| "grad_norm": 0.43565989583206144, | |
| "learning_rate": 2.224068778981501e-05, | |
| "loss": 0.4774, | |
| "num_tokens": 334272989.0, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 0.5973715651135006, | |
| "grad_norm": 0.43153159761159265, | |
| "learning_rate": 2.2178969996144933e-05, | |
| "loss": 0.5055, | |
| "num_tokens": 334798404.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5982249530636627, | |
| "grad_norm": 0.3936911105507658, | |
| "learning_rate": 2.211729458128603e-05, | |
| "loss": 0.4825, | |
| "num_tokens": 335242815.0, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 0.5990783410138248, | |
| "grad_norm": 0.40461704815260285, | |
| "learning_rate": 2.205566203644598e-05, | |
| "loss": 0.48, | |
| "num_tokens": 335687213.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.599931728963987, | |
| "grad_norm": 0.41232065635764376, | |
| "learning_rate": 2.1994072852491028e-05, | |
| "loss": 0.4693, | |
| "num_tokens": 336191478.0, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 0.6007851169141492, | |
| "grad_norm": 0.40859255614048423, | |
| "learning_rate": 2.1932527519942048e-05, | |
| "loss": 0.4556, | |
| "num_tokens": 336651049.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.6016385048643114, | |
| "grad_norm": 0.3860371330029888, | |
| "learning_rate": 2.1871026528970706e-05, | |
| "loss": 0.4746, | |
| "num_tokens": 337083217.0, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.6024918928144735, | |
| "grad_norm": 0.38986526001575333, | |
| "learning_rate": 2.1809570369395476e-05, | |
| "loss": 0.474, | |
| "num_tokens": 337519072.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.6033452807646356, | |
| "grad_norm": 0.44931040690751284, | |
| "learning_rate": 2.1748159530677808e-05, | |
| "loss": 0.5264, | |
| "num_tokens": 338002665.0, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 0.6041986687147978, | |
| "grad_norm": 0.4606111195784462, | |
| "learning_rate": 2.1686794501918183e-05, | |
| "loss": 0.5002, | |
| "num_tokens": 338438000.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.6050520566649599, | |
| "grad_norm": 0.4343116454898236, | |
| "learning_rate": 2.1625475771852217e-05, | |
| "loss": 0.5234, | |
| "num_tokens": 338954730.0, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 0.6059054446151221, | |
| "grad_norm": 0.36571207174430276, | |
| "learning_rate": 2.156420382884682e-05, | |
| "loss": 0.5047, | |
| "num_tokens": 339506233.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.6067588325652842, | |
| "grad_norm": 0.4724470415541178, | |
| "learning_rate": 2.1502979160896243e-05, | |
| "loss": 0.4888, | |
| "num_tokens": 339873756.0, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 0.6076122205154463, | |
| "grad_norm": 0.39960460130385483, | |
| "learning_rate": 2.1441802255618227e-05, | |
| "loss": 0.4978, | |
| "num_tokens": 340414967.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.6084656084656085, | |
| "grad_norm": 0.45186062496041596, | |
| "learning_rate": 2.138067360025012e-05, | |
| "loss": 0.5229, | |
| "num_tokens": 340894702.0, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 0.6093189964157706, | |
| "grad_norm": 0.494901828326181, | |
| "learning_rate": 2.1319593681644983e-05, | |
| "loss": 0.515, | |
| "num_tokens": 341348316.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.6101723843659328, | |
| "grad_norm": 0.43091909255115424, | |
| "learning_rate": 2.125856298626772e-05, | |
| "loss": 0.5015, | |
| "num_tokens": 341820736.0, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.6110257723160949, | |
| "grad_norm": 0.4063753043945408, | |
| "learning_rate": 2.1197582000191195e-05, | |
| "loss": 0.4703, | |
| "num_tokens": 342244656.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.611879160266257, | |
| "grad_norm": 0.4108958691439959, | |
| "learning_rate": 2.1136651209092366e-05, | |
| "loss": 0.4786, | |
| "num_tokens": 342721666.0, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 0.6127325482164192, | |
| "grad_norm": 0.41180781515360393, | |
| "learning_rate": 2.1075771098248435e-05, | |
| "loss": 0.4866, | |
| "num_tokens": 343244894.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.6135859361665813, | |
| "grad_norm": 0.44021612429673157, | |
| "learning_rate": 2.101494215253295e-05, | |
| "loss": 0.495, | |
| "num_tokens": 343763510.0, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 0.6144393241167435, | |
| "grad_norm": 0.40395073765164075, | |
| "learning_rate": 2.095416485641197e-05, | |
| "loss": 0.5139, | |
| "num_tokens": 344295726.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.6152927120669056, | |
| "grad_norm": 0.3992658043522841, | |
| "learning_rate": 2.0893439693940164e-05, | |
| "loss": 0.491, | |
| "num_tokens": 344746249.0, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 0.6161461000170677, | |
| "grad_norm": 0.33870138861934956, | |
| "learning_rate": 2.083276714875704e-05, | |
| "loss": 0.5072, | |
| "num_tokens": 345312852.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.6169994879672299, | |
| "grad_norm": 0.4157359060165468, | |
| "learning_rate": 2.0772147704083018e-05, | |
| "loss": 0.5341, | |
| "num_tokens": 345800450.0, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 0.617852875917392, | |
| "grad_norm": 0.45979732968981735, | |
| "learning_rate": 2.071158184271558e-05, | |
| "loss": 0.4822, | |
| "num_tokens": 346263433.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.6187062638675542, | |
| "grad_norm": 0.38443832685536994, | |
| "learning_rate": 2.0651070047025495e-05, | |
| "loss": 0.5348, | |
| "num_tokens": 346763563.0, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.6195596518177163, | |
| "grad_norm": 0.4064339928295911, | |
| "learning_rate": 2.059061279895288e-05, | |
| "loss": 0.4678, | |
| "num_tokens": 347219412.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.6204130397678784, | |
| "grad_norm": 0.4160833808900383, | |
| "learning_rate": 2.0530210580003462e-05, | |
| "loss": 0.5079, | |
| "num_tokens": 347665780.0, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 0.6212664277180406, | |
| "grad_norm": 0.4136263508722369, | |
| "learning_rate": 2.0469863871244653e-05, | |
| "loss": 0.483, | |
| "num_tokens": 348137882.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.6221198156682027, | |
| "grad_norm": 0.42856928518680726, | |
| "learning_rate": 2.040957315330179e-05, | |
| "loss": 0.5212, | |
| "num_tokens": 348604011.0, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 0.622973203618365, | |
| "grad_norm": 0.4520541804811179, | |
| "learning_rate": 2.0349338906354265e-05, | |
| "loss": 0.5078, | |
| "num_tokens": 349064203.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.623826591568527, | |
| "grad_norm": 0.38464465664614655, | |
| "learning_rate": 2.028916161013171e-05, | |
| "loss": 0.4952, | |
| "num_tokens": 349521350.0, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 0.6246799795186891, | |
| "grad_norm": 0.403256075111667, | |
| "learning_rate": 2.0229041743910177e-05, | |
| "loss": 0.4968, | |
| "num_tokens": 349987787.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.6255333674688514, | |
| "grad_norm": 0.38972525835767646, | |
| "learning_rate": 2.016897978650833e-05, | |
| "loss": 0.4832, | |
| "num_tokens": 350463547.0, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 0.6263867554190135, | |
| "grad_norm": 0.3973077826690484, | |
| "learning_rate": 2.010897621628362e-05, | |
| "loss": 0.5045, | |
| "num_tokens": 350963679.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.6272401433691757, | |
| "grad_norm": 0.4219646857753937, | |
| "learning_rate": 2.0049031511128485e-05, | |
| "loss": 0.4852, | |
| "num_tokens": 351415526.0, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.6280935313193378, | |
| "grad_norm": 0.4234500540568865, | |
| "learning_rate": 1.998914614846652e-05, | |
| "loss": 0.4584, | |
| "num_tokens": 351875623.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.6289469192694999, | |
| "grad_norm": 0.38952452727654746, | |
| "learning_rate": 1.9929320605248724e-05, | |
| "loss": 0.466, | |
| "num_tokens": 352342024.0, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 0.6298003072196621, | |
| "grad_norm": 0.4385777392451615, | |
| "learning_rate": 1.9869555357949632e-05, | |
| "loss": 0.4999, | |
| "num_tokens": 352834128.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.6306536951698242, | |
| "grad_norm": 0.4384686773256983, | |
| "learning_rate": 1.980985088256358e-05, | |
| "loss": 0.4936, | |
| "num_tokens": 353310049.0, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 0.6315070831199864, | |
| "grad_norm": 0.4141761563151494, | |
| "learning_rate": 1.975020765460091e-05, | |
| "loss": 0.4913, | |
| "num_tokens": 353796065.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6323604710701485, | |
| "grad_norm": 0.38564920163833905, | |
| "learning_rate": 1.9690626149084123e-05, | |
| "loss": 0.5046, | |
| "num_tokens": 354279883.0, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 0.6332138590203107, | |
| "grad_norm": 0.4058764925597797, | |
| "learning_rate": 1.9631106840544172e-05, | |
| "loss": 0.4633, | |
| "num_tokens": 354752780.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.6340672469704728, | |
| "grad_norm": 0.37450769133321365, | |
| "learning_rate": 1.9571650203016617e-05, | |
| "loss": 0.4965, | |
| "num_tokens": 355219150.0, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 0.3970402834053551, | |
| "learning_rate": 1.9512256710037917e-05, | |
| "loss": 0.4894, | |
| "num_tokens": 355697743.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.6357740228707971, | |
| "grad_norm": 0.41823811548195566, | |
| "learning_rate": 1.9452926834641617e-05, | |
| "loss": 0.482, | |
| "num_tokens": 356230572.0, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.6366274108209592, | |
| "grad_norm": 0.3630709405801741, | |
| "learning_rate": 1.939366104935455e-05, | |
| "loss": 0.4931, | |
| "num_tokens": 356707970.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.6374807987711214, | |
| "grad_norm": 0.43632558640318736, | |
| "learning_rate": 1.9334459826193145e-05, | |
| "loss": 0.5162, | |
| "num_tokens": 357088634.0, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 0.6383341867212835, | |
| "grad_norm": 0.4164889544558645, | |
| "learning_rate": 1.927532363665962e-05, | |
| "loss": 0.54, | |
| "num_tokens": 357607500.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.6391875746714456, | |
| "grad_norm": 0.4082119878390868, | |
| "learning_rate": 1.921625295173824e-05, | |
| "loss": 0.4926, | |
| "num_tokens": 358079716.0, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 0.6400409626216078, | |
| "grad_norm": 0.3968680927941563, | |
| "learning_rate": 1.9157248241891574e-05, | |
| "loss": 0.516, | |
| "num_tokens": 358549752.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6408943505717699, | |
| "grad_norm": 0.44736923235379816, | |
| "learning_rate": 1.9098309977056717e-05, | |
| "loss": 0.4861, | |
| "num_tokens": 358969098.0, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 0.6417477385219321, | |
| "grad_norm": 0.5186802492473832, | |
| "learning_rate": 1.9039438626641598e-05, | |
| "loss": 0.5245, | |
| "num_tokens": 359496768.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6426011264720942, | |
| "grad_norm": 0.3894432728535987, | |
| "learning_rate": 1.8980634659521183e-05, | |
| "loss": 0.4541, | |
| "num_tokens": 359999142.0, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 0.6434545144222563, | |
| "grad_norm": 0.41321092803361514, | |
| "learning_rate": 1.89218985440338e-05, | |
| "loss": 0.4736, | |
| "num_tokens": 360449477.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.6443079023724185, | |
| "grad_norm": 0.4635651256421096, | |
| "learning_rate": 1.886323074797736e-05, | |
| "loss": 0.4891, | |
| "num_tokens": 360931824.0, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 0.40598948114437067, | |
| "learning_rate": 1.880463173860565e-05, | |
| "loss": 0.5027, | |
| "num_tokens": 361389017.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6460146782727428, | |
| "grad_norm": 0.40642791661819405, | |
| "learning_rate": 1.8746101982624632e-05, | |
| "loss": 0.4866, | |
| "num_tokens": 361845979.0, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 0.6468680662229049, | |
| "grad_norm": 0.4211594985748512, | |
| "learning_rate": 1.8687641946188673e-05, | |
| "loss": 0.4581, | |
| "num_tokens": 362299106.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.647721454173067, | |
| "grad_norm": 0.4624660552198793, | |
| "learning_rate": 1.8629252094896903e-05, | |
| "loss": 0.518, | |
| "num_tokens": 362755921.0, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 0.6485748421232292, | |
| "grad_norm": 0.38652134085472206, | |
| "learning_rate": 1.8570932893789443e-05, | |
| "loss": 0.475, | |
| "num_tokens": 363249804.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6494282300733913, | |
| "grad_norm": 0.3890015123260553, | |
| "learning_rate": 1.8512684807343734e-05, | |
| "loss": 0.496, | |
| "num_tokens": 363675076.0, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 0.6502816180235536, | |
| "grad_norm": 0.44177610946738743, | |
| "learning_rate": 1.8454508299470846e-05, | |
| "loss": 0.5083, | |
| "num_tokens": 364170107.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.6511350059737157, | |
| "grad_norm": 0.45898880336276054, | |
| "learning_rate": 1.8396403833511744e-05, | |
| "loss": 0.5024, | |
| "num_tokens": 364647503.0, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 0.6519883939238778, | |
| "grad_norm": 0.4304978813763353, | |
| "learning_rate": 1.8338371872233646e-05, | |
| "loss": 0.4515, | |
| "num_tokens": 365084839.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.65284178187404, | |
| "grad_norm": 0.3995217321863832, | |
| "learning_rate": 1.828041287782628e-05, | |
| "loss": 0.493, | |
| "num_tokens": 365574309.0, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.6536951698242021, | |
| "grad_norm": 0.4109136541235624, | |
| "learning_rate": 1.8222527311898274e-05, | |
| "loss": 0.4883, | |
| "num_tokens": 365997466.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.6545485577743643, | |
| "grad_norm": 0.42717667070360243, | |
| "learning_rate": 1.8164715635473438e-05, | |
| "loss": 0.4634, | |
| "num_tokens": 366423695.0, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 0.6554019457245264, | |
| "grad_norm": 0.4291213100332811, | |
| "learning_rate": 1.8106978308987076e-05, | |
| "loss": 0.4755, | |
| "num_tokens": 366923656.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.6562553336746885, | |
| "grad_norm": 0.367962627625887, | |
| "learning_rate": 1.8049315792282345e-05, | |
| "loss": 0.4893, | |
| "num_tokens": 367455423.0, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 0.6571087216248507, | |
| "grad_norm": 0.39834591702335914, | |
| "learning_rate": 1.799172854460659e-05, | |
| "loss": 0.5092, | |
| "num_tokens": 367958683.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.6579621095750128, | |
| "grad_norm": 0.41069106276000594, | |
| "learning_rate": 1.793421702460769e-05, | |
| "loss": 0.4882, | |
| "num_tokens": 368423286.0, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 0.658815497525175, | |
| "grad_norm": 0.4403877592555042, | |
| "learning_rate": 1.78767816903304e-05, | |
| "loss": 0.5386, | |
| "num_tokens": 368897813.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.6596688854753371, | |
| "grad_norm": 0.4284510833561219, | |
| "learning_rate": 1.7819422999212677e-05, | |
| "loss": 0.4798, | |
| "num_tokens": 369332751.0, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 0.6605222734254992, | |
| "grad_norm": 0.48424881267659203, | |
| "learning_rate": 1.7762141408082096e-05, | |
| "loss": 0.4992, | |
| "num_tokens": 369808844.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.6613756613756614, | |
| "grad_norm": 0.4461364691263493, | |
| "learning_rate": 1.7704937373152147e-05, | |
| "loss": 0.4799, | |
| "num_tokens": 370294598.0, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.6622290493258235, | |
| "grad_norm": 0.4257692064880883, | |
| "learning_rate": 1.7647811350018646e-05, | |
| "loss": 0.4991, | |
| "num_tokens": 370786942.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.6630824372759857, | |
| "grad_norm": 0.405676102821404, | |
| "learning_rate": 1.7590763793656095e-05, | |
| "loss": 0.4657, | |
| "num_tokens": 371218600.0, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 0.6639358252261478, | |
| "grad_norm": 0.39340323676916794, | |
| "learning_rate": 1.753379515841404e-05, | |
| "loss": 0.4644, | |
| "num_tokens": 371657863.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.6647892131763099, | |
| "grad_norm": 0.3973728397154092, | |
| "learning_rate": 1.7476905898013494e-05, | |
| "loss": 0.4771, | |
| "num_tokens": 372124113.0, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 0.6656426011264721, | |
| "grad_norm": 0.37598913083623925, | |
| "learning_rate": 1.7420096465543262e-05, | |
| "loss": 0.4836, | |
| "num_tokens": 372630696.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6664959890766342, | |
| "grad_norm": 0.4173744890381582, | |
| "learning_rate": 1.73633673134564e-05, | |
| "loss": 0.4912, | |
| "num_tokens": 373098703.0, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 0.6673493770267964, | |
| "grad_norm": 0.40906199202708654, | |
| "learning_rate": 1.7306718893566556e-05, | |
| "loss": 0.5083, | |
| "num_tokens": 373650740.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.6682027649769585, | |
| "grad_norm": 0.5037637534050279, | |
| "learning_rate": 1.725015165704441e-05, | |
| "loss": 0.4937, | |
| "num_tokens": 374100579.0, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 0.6690561529271206, | |
| "grad_norm": 0.3863065873789815, | |
| "learning_rate": 1.7193666054414062e-05, | |
| "loss": 0.485, | |
| "num_tokens": 374628223.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.6699095408772828, | |
| "grad_norm": 0.3964952440847914, | |
| "learning_rate": 1.7137262535549424e-05, | |
| "loss": 0.4786, | |
| "num_tokens": 375092247.0, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.6707629288274449, | |
| "grad_norm": 0.4866841362263111, | |
| "learning_rate": 1.7080941549670704e-05, | |
| "loss": 0.4907, | |
| "num_tokens": 375573909.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.6716163167776071, | |
| "grad_norm": 0.41570963934613275, | |
| "learning_rate": 1.7024703545340738e-05, | |
| "loss": 0.4687, | |
| "num_tokens": 376026978.0, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 0.6724697047277692, | |
| "grad_norm": 0.3882848955463959, | |
| "learning_rate": 1.6968548970461497e-05, | |
| "loss": 0.4636, | |
| "num_tokens": 376493989.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.6733230926779313, | |
| "grad_norm": 0.4156306117715765, | |
| "learning_rate": 1.691247827227049e-05, | |
| "loss": 0.4615, | |
| "num_tokens": 376987520.0, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 0.6741764806280935, | |
| "grad_norm": 0.40032593836358465, | |
| "learning_rate": 1.6856491897337152e-05, | |
| "loss": 0.4732, | |
| "num_tokens": 377479804.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.6750298685782556, | |
| "grad_norm": 0.3874861016594395, | |
| "learning_rate": 1.6800590291559395e-05, | |
| "loss": 0.4778, | |
| "num_tokens": 377948154.0, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 0.6758832565284179, | |
| "grad_norm": 0.47209735991541896, | |
| "learning_rate": 1.6744773900159954e-05, | |
| "loss": 0.4919, | |
| "num_tokens": 378392822.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.67673664447858, | |
| "grad_norm": 0.4084721241709745, | |
| "learning_rate": 1.6689043167682884e-05, | |
| "loss": 0.4836, | |
| "num_tokens": 378894419.0, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 0.6775900324287422, | |
| "grad_norm": 0.4176213838313082, | |
| "learning_rate": 1.663339853799005e-05, | |
| "loss": 0.4616, | |
| "num_tokens": 379328234.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.6784434203789043, | |
| "grad_norm": 0.445415171459785, | |
| "learning_rate": 1.657784045425752e-05, | |
| "loss": 0.4551, | |
| "num_tokens": 379824512.0, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.6792968083290664, | |
| "grad_norm": 0.4178058186239751, | |
| "learning_rate": 1.6522369358972107e-05, | |
| "loss": 0.4748, | |
| "num_tokens": 380308538.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.6801501962792286, | |
| "grad_norm": 0.4369794573241186, | |
| "learning_rate": 1.646698569392779e-05, | |
| "loss": 0.5016, | |
| "num_tokens": 380767302.0, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 0.6810035842293907, | |
| "grad_norm": 0.34487732454345854, | |
| "learning_rate": 1.6411689900222233e-05, | |
| "loss": 0.4873, | |
| "num_tokens": 381210797.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.6818569721795529, | |
| "grad_norm": 0.384612291079315, | |
| "learning_rate": 1.6356482418253264e-05, | |
| "loss": 0.5065, | |
| "num_tokens": 381671125.0, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 0.682710360129715, | |
| "grad_norm": 0.44280508590557865, | |
| "learning_rate": 1.630136368771534e-05, | |
| "loss": 0.4747, | |
| "num_tokens": 382166430.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6835637480798771, | |
| "grad_norm": 0.3952894047571598, | |
| "learning_rate": 1.624633414759608e-05, | |
| "loss": 0.5192, | |
| "num_tokens": 382719446.0, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 0.6844171360300393, | |
| "grad_norm": 0.4278951482902631, | |
| "learning_rate": 1.619139423617274e-05, | |
| "loss": 0.4922, | |
| "num_tokens": 383179056.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.6852705239802014, | |
| "grad_norm": 0.41266303103834073, | |
| "learning_rate": 1.6136544391008766e-05, | |
| "loss": 0.4986, | |
| "num_tokens": 383640184.0, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 0.6861239119303636, | |
| "grad_norm": 0.4044179518299468, | |
| "learning_rate": 1.608178504895025e-05, | |
| "loss": 0.4959, | |
| "num_tokens": 384075218.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.6869772998805257, | |
| "grad_norm": 0.42974231156673803, | |
| "learning_rate": 1.6027116646122497e-05, | |
| "loss": 0.4687, | |
| "num_tokens": 384521597.0, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.6878306878306878, | |
| "grad_norm": 0.40711400623953364, | |
| "learning_rate": 1.5972539617926547e-05, | |
| "loss": 0.4643, | |
| "num_tokens": 384999949.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.68868407578085, | |
| "grad_norm": 0.40374582515834256, | |
| "learning_rate": 1.5918054399035656e-05, | |
| "loss": 0.477, | |
| "num_tokens": 385474122.0, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 0.6895374637310121, | |
| "grad_norm": 0.4574916049281237, | |
| "learning_rate": 1.5863661423391924e-05, | |
| "loss": 0.4979, | |
| "num_tokens": 385900887.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6903908516811743, | |
| "grad_norm": 0.4027357480039622, | |
| "learning_rate": 1.580936112420275e-05, | |
| "loss": 0.4799, | |
| "num_tokens": 386375167.0, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 0.6912442396313364, | |
| "grad_norm": 0.3778906634385024, | |
| "learning_rate": 1.5755153933937433e-05, | |
| "loss": 0.5086, | |
| "num_tokens": 386870114.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6920976275814985, | |
| "grad_norm": 0.4374584054340601, | |
| "learning_rate": 1.5701040284323733e-05, | |
| "loss": 0.4906, | |
| "num_tokens": 387360813.0, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 0.6929510155316607, | |
| "grad_norm": 0.43950899378836694, | |
| "learning_rate": 1.5647020606344374e-05, | |
| "loss": 0.4811, | |
| "num_tokens": 387782669.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6938044034818228, | |
| "grad_norm": 0.38903017402374057, | |
| "learning_rate": 1.5593095330233702e-05, | |
| "loss": 0.4751, | |
| "num_tokens": 388226708.0, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 0.694657791431985, | |
| "grad_norm": 0.4563030383700527, | |
| "learning_rate": 1.553926488547417e-05, | |
| "loss": 0.4975, | |
| "num_tokens": 388715961.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6955111793821471, | |
| "grad_norm": 0.4222127101768727, | |
| "learning_rate": 1.5485529700792972e-05, | |
| "loss": 0.472, | |
| "num_tokens": 389192939.0, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.6963645673323092, | |
| "grad_norm": 0.3845073301569914, | |
| "learning_rate": 1.5431890204158623e-05, | |
| "loss": 0.4756, | |
| "num_tokens": 389650899.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6972179552824714, | |
| "grad_norm": 0.4086273938770818, | |
| "learning_rate": 1.5378346822777506e-05, | |
| "loss": 0.4664, | |
| "num_tokens": 390112350.0, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 0.6980713432326335, | |
| "grad_norm": 0.430364549106535, | |
| "learning_rate": 1.5324899983090552e-05, | |
| "loss": 0.4926, | |
| "num_tokens": 390610942.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.6989247311827957, | |
| "grad_norm": 0.4199911271693082, | |
| "learning_rate": 1.5271550110769756e-05, | |
| "loss": 0.5134, | |
| "num_tokens": 391149323.0, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 0.6997781191329578, | |
| "grad_norm": 0.40258677066589266, | |
| "learning_rate": 1.5218297630714829e-05, | |
| "loss": 0.5129, | |
| "num_tokens": 391632321.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.7006315070831199, | |
| "grad_norm": 0.3923111415563789, | |
| "learning_rate": 1.516514296704984e-05, | |
| "loss": 0.4773, | |
| "num_tokens": 392123457.0, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 0.7014848950332822, | |
| "grad_norm": 0.44996415087492175, | |
| "learning_rate": 1.511208654311977e-05, | |
| "loss": 0.4857, | |
| "num_tokens": 392626976.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.7023382829834443, | |
| "grad_norm": 0.4448711354635569, | |
| "learning_rate": 1.5059128781487225e-05, | |
| "loss": 0.4836, | |
| "num_tokens": 393063239.0, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 0.7031916709336065, | |
| "grad_norm": 0.40109087721151776, | |
| "learning_rate": 1.5006270103928976e-05, | |
| "loss": 0.4803, | |
| "num_tokens": 393493604.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.7040450588837686, | |
| "grad_norm": 0.41117543808643187, | |
| "learning_rate": 1.4953510931432685e-05, | |
| "loss": 0.4926, | |
| "num_tokens": 393941943.0, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.7048984468339307, | |
| "grad_norm": 0.4061904637385267, | |
| "learning_rate": 1.4900851684193512e-05, | |
| "loss": 0.5178, | |
| "num_tokens": 394371570.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.7057518347840929, | |
| "grad_norm": 0.40589654176993206, | |
| "learning_rate": 1.4848292781610751e-05, | |
| "loss": 0.4938, | |
| "num_tokens": 394853993.0, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 0.706605222734255, | |
| "grad_norm": 0.4017058772358506, | |
| "learning_rate": 1.4795834642284528e-05, | |
| "loss": 0.4915, | |
| "num_tokens": 395329185.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.7074586106844172, | |
| "grad_norm": 0.41805268188884603, | |
| "learning_rate": 1.4743477684012438e-05, | |
| "loss": 0.4963, | |
| "num_tokens": 395828098.0, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 0.7083119986345793, | |
| "grad_norm": 0.46556118125036533, | |
| "learning_rate": 1.4691222323786253e-05, | |
| "loss": 0.4768, | |
| "num_tokens": 396285000.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.7091653865847414, | |
| "grad_norm": 0.4714302982254491, | |
| "learning_rate": 1.4639068977788542e-05, | |
| "loss": 0.5232, | |
| "num_tokens": 396789442.0, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 0.7100187745349036, | |
| "grad_norm": 0.4059352203704004, | |
| "learning_rate": 1.4587018061389426e-05, | |
| "loss": 0.5213, | |
| "num_tokens": 397224172.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.7108721624850657, | |
| "grad_norm": 0.3890468867463587, | |
| "learning_rate": 1.453506998914323e-05, | |
| "loss": 0.478, | |
| "num_tokens": 397747466.0, | |
| "step": 4165 | |
| }, | |
| { | |
| "epoch": 0.7117255504352279, | |
| "grad_norm": 0.41294040022016687, | |
| "learning_rate": 1.448322517478516e-05, | |
| "loss": 0.4858, | |
| "num_tokens": 398210109.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.71257893838539, | |
| "grad_norm": 0.4251556113450014, | |
| "learning_rate": 1.4431484031228069e-05, | |
| "loss": 0.4631, | |
| "num_tokens": 398681117.0, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.7134323263355521, | |
| "grad_norm": 0.4077128999446609, | |
| "learning_rate": 1.4379846970559113e-05, | |
| "loss": 0.478, | |
| "num_tokens": 399158135.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.4493452698637352, | |
| "learning_rate": 1.4328314404036486e-05, | |
| "loss": 0.4914, | |
| "num_tokens": 399584546.0, | |
| "step": 4185 | |
| }, | |
| { | |
| "epoch": 0.7151391022358764, | |
| "grad_norm": 0.4160866280590416, | |
| "learning_rate": 1.4276886742086175e-05, | |
| "loss": 0.4869, | |
| "num_tokens": 400075133.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.7159924901860386, | |
| "grad_norm": 0.4466019930832434, | |
| "learning_rate": 1.4225564394298641e-05, | |
| "loss": 0.525, | |
| "num_tokens": 400536012.0, | |
| "step": 4195 | |
| }, | |
| { | |
| "epoch": 0.7168458781362007, | |
| "grad_norm": 0.38207214800462763, | |
| "learning_rate": 1.4174347769425594e-05, | |
| "loss": 0.4721, | |
| "num_tokens": 400990040.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7176992660863628, | |
| "grad_norm": 0.39726695937965517, | |
| "learning_rate": 1.412323727537671e-05, | |
| "loss": 0.5049, | |
| "num_tokens": 401444616.0, | |
| "step": 4205 | |
| }, | |
| { | |
| "epoch": 0.718552654036525, | |
| "grad_norm": 0.36254218460771337, | |
| "learning_rate": 1.407223331921641e-05, | |
| "loss": 0.4502, | |
| "num_tokens": 401912436.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.7194060419866871, | |
| "grad_norm": 0.4243373339771448, | |
| "learning_rate": 1.4021336307160612e-05, | |
| "loss": 0.4761, | |
| "num_tokens": 402343317.0, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 0.7202594299368493, | |
| "grad_norm": 0.38955697216237994, | |
| "learning_rate": 1.3970546644573467e-05, | |
| "loss": 0.4637, | |
| "num_tokens": 402861165.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.7211128178870114, | |
| "grad_norm": 0.43105309295474786, | |
| "learning_rate": 1.3919864735964167e-05, | |
| "loss": 0.498, | |
| "num_tokens": 403302153.0, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.7219662058371735, | |
| "grad_norm": 0.4204432629627796, | |
| "learning_rate": 1.3869290984983685e-05, | |
| "loss": 0.4752, | |
| "num_tokens": 403763990.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.7228195937873357, | |
| "grad_norm": 0.40364761697974744, | |
| "learning_rate": 1.3818825794421619e-05, | |
| "loss": 0.4738, | |
| "num_tokens": 404262168.0, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 0.7236729817374978, | |
| "grad_norm": 0.4109694290603725, | |
| "learning_rate": 1.376846956620293e-05, | |
| "loss": 0.4891, | |
| "num_tokens": 404752387.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.72452636968766, | |
| "grad_norm": 0.3659480223615566, | |
| "learning_rate": 1.3718222701384757e-05, | |
| "loss": 0.4728, | |
| "num_tokens": 405251602.0, | |
| "step": 4245 | |
| }, | |
| { | |
| "epoch": 0.7253797576378221, | |
| "grad_norm": 0.41498798932131076, | |
| "learning_rate": 1.3668085600153232e-05, | |
| "loss": 0.4856, | |
| "num_tokens": 405691554.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.7262331455879844, | |
| "grad_norm": 0.4524823500426223, | |
| "learning_rate": 1.3618058661820277e-05, | |
| "loss": 0.4696, | |
| "num_tokens": 406131870.0, | |
| "step": 4255 | |
| }, | |
| { | |
| "epoch": 0.7270865335381465, | |
| "grad_norm": 0.41475346178538414, | |
| "learning_rate": 1.3568142284820442e-05, | |
| "loss": 0.5107, | |
| "num_tokens": 406680793.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.7279399214883086, | |
| "grad_norm": 0.4135280000394242, | |
| "learning_rate": 1.3518336866707723e-05, | |
| "loss": 0.4882, | |
| "num_tokens": 407139264.0, | |
| "step": 4265 | |
| }, | |
| { | |
| "epoch": 0.7287933094384708, | |
| "grad_norm": 0.4324000273399736, | |
| "learning_rate": 1.3468642804152374e-05, | |
| "loss": 0.4876, | |
| "num_tokens": 407683383.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.7296466973886329, | |
| "grad_norm": 0.4006977169294556, | |
| "learning_rate": 1.3419060492937802e-05, | |
| "loss": 0.4661, | |
| "num_tokens": 408175796.0, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.7305000853387951, | |
| "grad_norm": 0.4490369002561085, | |
| "learning_rate": 1.3369590327957348e-05, | |
| "loss": 0.4941, | |
| "num_tokens": 408650161.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.7313534732889572, | |
| "grad_norm": 0.44961429893648996, | |
| "learning_rate": 1.3320232703211214e-05, | |
| "loss": 0.4907, | |
| "num_tokens": 409101561.0, | |
| "step": 4285 | |
| }, | |
| { | |
| "epoch": 0.7322068612391193, | |
| "grad_norm": 0.42766955856470856, | |
| "learning_rate": 1.3270988011803243e-05, | |
| "loss": 0.5028, | |
| "num_tokens": 409549494.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.7330602491892815, | |
| "grad_norm": 0.38960252837741977, | |
| "learning_rate": 1.3221856645937868e-05, | |
| "loss": 0.469, | |
| "num_tokens": 410034712.0, | |
| "step": 4295 | |
| }, | |
| { | |
| "epoch": 0.7339136371394436, | |
| "grad_norm": 0.44934298130705935, | |
| "learning_rate": 1.3172838996916936e-05, | |
| "loss": 0.5063, | |
| "num_tokens": 410510602.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7347670250896058, | |
| "grad_norm": 0.45628929188470024, | |
| "learning_rate": 1.3123935455136599e-05, | |
| "loss": 0.4935, | |
| "num_tokens": 411004501.0, | |
| "step": 4305 | |
| }, | |
| { | |
| "epoch": 0.7356204130397679, | |
| "grad_norm": 0.4392000024405787, | |
| "learning_rate": 1.307514641008424e-05, | |
| "loss": 0.4958, | |
| "num_tokens": 411492905.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.73647380098993, | |
| "grad_norm": 0.42788294784562186, | |
| "learning_rate": 1.302647225033532e-05, | |
| "loss": 0.4871, | |
| "num_tokens": 411924936.0, | |
| "step": 4315 | |
| }, | |
| { | |
| "epoch": 0.7373271889400922, | |
| "grad_norm": 0.3908231650434555, | |
| "learning_rate": 1.2977913363550304e-05, | |
| "loss": 0.4779, | |
| "num_tokens": 412454003.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.7381805768902543, | |
| "grad_norm": 0.4220895452254383, | |
| "learning_rate": 1.2929470136471607e-05, | |
| "loss": 0.5168, | |
| "num_tokens": 412966217.0, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.7390339648404165, | |
| "grad_norm": 0.4558677310822803, | |
| "learning_rate": 1.288114295492045e-05, | |
| "loss": 0.4957, | |
| "num_tokens": 413421190.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.7398873527905786, | |
| "grad_norm": 0.3596149688176871, | |
| "learning_rate": 1.2832932203793848e-05, | |
| "loss": 0.4956, | |
| "num_tokens": 413963161.0, | |
| "step": 4335 | |
| }, | |
| { | |
| "epoch": 0.7407407407407407, | |
| "grad_norm": 0.3841859885217558, | |
| "learning_rate": 1.2784838267061491e-05, | |
| "loss": 0.484, | |
| "num_tokens": 414425737.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.7415941286909029, | |
| "grad_norm": 0.4270328699693807, | |
| "learning_rate": 1.273686152776274e-05, | |
| "loss": 0.5033, | |
| "num_tokens": 414887259.0, | |
| "step": 4345 | |
| }, | |
| { | |
| "epoch": 0.742447516641065, | |
| "grad_norm": 0.3940661042364259, | |
| "learning_rate": 1.2689002368003539e-05, | |
| "loss": 0.4425, | |
| "num_tokens": 415344538.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.7433009045912272, | |
| "grad_norm": 0.3651080461406393, | |
| "learning_rate": 1.2641261168953366e-05, | |
| "loss": 0.517, | |
| "num_tokens": 415920643.0, | |
| "step": 4355 | |
| }, | |
| { | |
| "epoch": 0.7441542925413893, | |
| "grad_norm": 0.38417324300176203, | |
| "learning_rate": 1.2593638310842235e-05, | |
| "loss": 0.4827, | |
| "num_tokens": 416440659.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.7450076804915514, | |
| "grad_norm": 0.4386768288277266, | |
| "learning_rate": 1.2546134172957619e-05, | |
| "loss": 0.5139, | |
| "num_tokens": 416923221.0, | |
| "step": 4365 | |
| }, | |
| { | |
| "epoch": 0.7458610684417136, | |
| "grad_norm": 0.3971991068257288, | |
| "learning_rate": 1.2498749133641489e-05, | |
| "loss": 0.46, | |
| "num_tokens": 417360385.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.7467144563918757, | |
| "grad_norm": 0.3830606629171972, | |
| "learning_rate": 1.245148357028725e-05, | |
| "loss": 0.4861, | |
| "num_tokens": 417855139.0, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.7475678443420379, | |
| "grad_norm": 0.44578901935433984, | |
| "learning_rate": 1.2404337859336743e-05, | |
| "loss": 0.4711, | |
| "num_tokens": 418334906.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.7484212322922, | |
| "grad_norm": 0.47087692205027515, | |
| "learning_rate": 1.2357312376277279e-05, | |
| "loss": 0.4534, | |
| "num_tokens": 418757440.0, | |
| "step": 4385 | |
| }, | |
| { | |
| "epoch": 0.7492746202423621, | |
| "grad_norm": 0.3868325787916382, | |
| "learning_rate": 1.2310407495638599e-05, | |
| "loss": 0.4666, | |
| "num_tokens": 419196059.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.7501280081925243, | |
| "grad_norm": 0.4434335875999656, | |
| "learning_rate": 1.226362359098995e-05, | |
| "loss": 0.4731, | |
| "num_tokens": 419648217.0, | |
| "step": 4395 | |
| }, | |
| { | |
| "epoch": 0.7509813961426864, | |
| "grad_norm": 0.437669638166961, | |
| "learning_rate": 1.2216961034937048e-05, | |
| "loss": 0.4888, | |
| "num_tokens": 420121223.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7518347840928487, | |
| "grad_norm": 0.41251853408818256, | |
| "learning_rate": 1.2170420199119151e-05, | |
| "loss": 0.4806, | |
| "num_tokens": 420554055.0, | |
| "step": 4405 | |
| }, | |
| { | |
| "epoch": 0.7526881720430108, | |
| "grad_norm": 0.4179104412216871, | |
| "learning_rate": 1.2124001454206102e-05, | |
| "loss": 0.4834, | |
| "num_tokens": 421062594.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.7535415599931728, | |
| "grad_norm": 0.4264458281319359, | |
| "learning_rate": 1.2077705169895338e-05, | |
| "loss": 0.4518, | |
| "num_tokens": 421493934.0, | |
| "step": 4415 | |
| }, | |
| { | |
| "epoch": 0.7543949479433351, | |
| "grad_norm": 0.38247330621872, | |
| "learning_rate": 1.2031531714908997e-05, | |
| "loss": 0.4739, | |
| "num_tokens": 421975487.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.7552483358934972, | |
| "grad_norm": 0.3632906503042789, | |
| "learning_rate": 1.1985481456990928e-05, | |
| "loss": 0.4934, | |
| "num_tokens": 422389134.0, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.7561017238436594, | |
| "grad_norm": 0.3987555410928658, | |
| "learning_rate": 1.1939554762903813e-05, | |
| "loss": 0.4705, | |
| "num_tokens": 422820853.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.7569551117938215, | |
| "grad_norm": 0.40029134958970697, | |
| "learning_rate": 1.189375199842622e-05, | |
| "loss": 0.4758, | |
| "num_tokens": 423290058.0, | |
| "step": 4435 | |
| }, | |
| { | |
| "epoch": 0.7578084997439836, | |
| "grad_norm": 0.40190448791462935, | |
| "learning_rate": 1.1848073528349676e-05, | |
| "loss": 0.4901, | |
| "num_tokens": 423755589.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.7586618876941458, | |
| "grad_norm": 0.37335723935656784, | |
| "learning_rate": 1.1802519716475786e-05, | |
| "loss": 0.4816, | |
| "num_tokens": 424257881.0, | |
| "step": 4445 | |
| }, | |
| { | |
| "epoch": 0.7595152756443079, | |
| "grad_norm": 0.38614328795185704, | |
| "learning_rate": 1.1757090925613323e-05, | |
| "loss": 0.4868, | |
| "num_tokens": 424688821.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.7603686635944701, | |
| "grad_norm": 0.3760109402905597, | |
| "learning_rate": 1.171178751757535e-05, | |
| "loss": 0.4442, | |
| "num_tokens": 425164941.0, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 0.7612220515446322, | |
| "grad_norm": 0.42319557908272304, | |
| "learning_rate": 1.1666609853176342e-05, | |
| "loss": 0.4844, | |
| "num_tokens": 425585640.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.7620754394947943, | |
| "grad_norm": 0.40174520701740907, | |
| "learning_rate": 1.1621558292229268e-05, | |
| "loss": 0.4829, | |
| "num_tokens": 426040857.0, | |
| "step": 4465 | |
| }, | |
| { | |
| "epoch": 0.7629288274449565, | |
| "grad_norm": 0.3654902278674818, | |
| "learning_rate": 1.1576633193542797e-05, | |
| "loss": 0.4713, | |
| "num_tokens": 426547476.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.7637822153951186, | |
| "grad_norm": 0.42757814520113596, | |
| "learning_rate": 1.1531834914918365e-05, | |
| "loss": 0.4886, | |
| "num_tokens": 427045299.0, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.7646356033452808, | |
| "grad_norm": 0.35870737158818267, | |
| "learning_rate": 1.14871638131474e-05, | |
| "loss": 0.4499, | |
| "num_tokens": 427476418.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.7654889912954429, | |
| "grad_norm": 0.42106166604792017, | |
| "learning_rate": 1.1442620244008409e-05, | |
| "loss": 0.4782, | |
| "num_tokens": 427900405.0, | |
| "step": 4485 | |
| }, | |
| { | |
| "epoch": 0.766342379245605, | |
| "grad_norm": 0.37713869215413787, | |
| "learning_rate": 1.1398204562264192e-05, | |
| "loss": 0.475, | |
| "num_tokens": 428405822.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.7671957671957672, | |
| "grad_norm": 0.3842103766371807, | |
| "learning_rate": 1.1353917121659017e-05, | |
| "loss": 0.4677, | |
| "num_tokens": 428861765.0, | |
| "step": 4495 | |
| }, | |
| { | |
| "epoch": 0.7680491551459293, | |
| "grad_norm": 0.36951915347773684, | |
| "learning_rate": 1.1309758274915756e-05, | |
| "loss": 0.4778, | |
| "num_tokens": 429400796.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7689025430960915, | |
| "grad_norm": 0.38125534517310483, | |
| "learning_rate": 1.1265728373733138e-05, | |
| "loss": 0.4819, | |
| "num_tokens": 429935003.0, | |
| "step": 4505 | |
| }, | |
| { | |
| "epoch": 0.7697559310462536, | |
| "grad_norm": 0.37259968004369953, | |
| "learning_rate": 1.1221827768782909e-05, | |
| "loss": 0.4686, | |
| "num_tokens": 430392694.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.7706093189964157, | |
| "grad_norm": 0.41781127042026184, | |
| "learning_rate": 1.1178056809707035e-05, | |
| "loss": 0.4701, | |
| "num_tokens": 430883679.0, | |
| "step": 4515 | |
| }, | |
| { | |
| "epoch": 0.7714627069465779, | |
| "grad_norm": 0.3920353933413547, | |
| "learning_rate": 1.1134415845114954e-05, | |
| "loss": 0.4633, | |
| "num_tokens": 431359520.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.77231609489674, | |
| "grad_norm": 0.4002133146400338, | |
| "learning_rate": 1.1090905222580756e-05, | |
| "loss": 0.457, | |
| "num_tokens": 431813181.0, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.7731694828469022, | |
| "grad_norm": 0.4203247969667646, | |
| "learning_rate": 1.104752528864044e-05, | |
| "loss": 0.5121, | |
| "num_tokens": 432294915.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.7740228707970643, | |
| "grad_norm": 0.3909715162480175, | |
| "learning_rate": 1.1004276388789146e-05, | |
| "loss": 0.4758, | |
| "num_tokens": 432809362.0, | |
| "step": 4535 | |
| }, | |
| { | |
| "epoch": 0.7748762587472265, | |
| "grad_norm": 0.37156344183983764, | |
| "learning_rate": 1.096115886747842e-05, | |
| "loss": 0.4749, | |
| "num_tokens": 433284774.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.7757296466973886, | |
| "grad_norm": 0.4165669717129614, | |
| "learning_rate": 1.0918173068113446e-05, | |
| "loss": 0.4952, | |
| "num_tokens": 433767206.0, | |
| "step": 4545 | |
| }, | |
| { | |
| "epoch": 0.7765830346475507, | |
| "grad_norm": 0.38559169918567077, | |
| "learning_rate": 1.0875319333050315e-05, | |
| "loss": 0.4731, | |
| "num_tokens": 434284592.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.777436422597713, | |
| "grad_norm": 0.3747171421477805, | |
| "learning_rate": 1.0832598003593325e-05, | |
| "loss": 0.4587, | |
| "num_tokens": 434700803.0, | |
| "step": 4555 | |
| }, | |
| { | |
| "epoch": 0.778289810547875, | |
| "grad_norm": 0.4229921079832128, | |
| "learning_rate": 1.079000941999222e-05, | |
| "loss": 0.462, | |
| "num_tokens": 435149765.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.7791431984980373, | |
| "grad_norm": 0.38259280049269817, | |
| "learning_rate": 1.0747553921439515e-05, | |
| "loss": 0.4694, | |
| "num_tokens": 435664679.0, | |
| "step": 4565 | |
| }, | |
| { | |
| "epoch": 0.7799965864481994, | |
| "grad_norm": 0.42216552845261135, | |
| "learning_rate": 1.0705231846067792e-05, | |
| "loss": 0.4935, | |
| "num_tokens": 436158148.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.7808499743983615, | |
| "grad_norm": 0.37597921789521377, | |
| "learning_rate": 1.0663043530946979e-05, | |
| "loss": 0.4775, | |
| "num_tokens": 436670023.0, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.7817033623485237, | |
| "grad_norm": 0.399407830927538, | |
| "learning_rate": 1.0620989312081695e-05, | |
| "loss": 0.4673, | |
| "num_tokens": 437135437.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.7825567502986858, | |
| "grad_norm": 0.4162180706347344, | |
| "learning_rate": 1.0579069524408547e-05, | |
| "loss": 0.4724, | |
| "num_tokens": 437583429.0, | |
| "step": 4585 | |
| }, | |
| { | |
| "epoch": 0.783410138248848, | |
| "grad_norm": 0.3749151960200756, | |
| "learning_rate": 1.0537284501793502e-05, | |
| "loss": 0.4481, | |
| "num_tokens": 438061307.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.7842635261990101, | |
| "grad_norm": 0.39956405597255984, | |
| "learning_rate": 1.0495634577029192e-05, | |
| "loss": 0.4931, | |
| "num_tokens": 438550284.0, | |
| "step": 4595 | |
| }, | |
| { | |
| "epoch": 0.7851169141491722, | |
| "grad_norm": 0.36415584380192006, | |
| "learning_rate": 1.045412008183227e-05, | |
| "loss": 0.4817, | |
| "num_tokens": 439079263.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.7859703020993344, | |
| "grad_norm": 0.33725341265572595, | |
| "learning_rate": 1.0412741346840793e-05, | |
| "loss": 0.4875, | |
| "num_tokens": 439664787.0, | |
| "step": 4605 | |
| }, | |
| { | |
| "epoch": 0.7868236900494965, | |
| "grad_norm": 0.40013051392863686, | |
| "learning_rate": 1.037149870161154e-05, | |
| "loss": 0.5129, | |
| "num_tokens": 440154096.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.7876770779996587, | |
| "grad_norm": 0.36531838979013637, | |
| "learning_rate": 1.0330392474617448e-05, | |
| "loss": 0.4778, | |
| "num_tokens": 440696523.0, | |
| "step": 4615 | |
| }, | |
| { | |
| "epoch": 0.7885304659498208, | |
| "grad_norm": 0.4172477340219732, | |
| "learning_rate": 1.0289422993244942e-05, | |
| "loss": 0.4808, | |
| "num_tokens": 441195758.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.7893838538999829, | |
| "grad_norm": 0.37658318732885415, | |
| "learning_rate": 1.0248590583791355e-05, | |
| "loss": 0.4891, | |
| "num_tokens": 441702532.0, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.7902372418501451, | |
| "grad_norm": 0.3725261359820099, | |
| "learning_rate": 1.0207895571462337e-05, | |
| "loss": 0.4876, | |
| "num_tokens": 442194379.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.7910906298003072, | |
| "grad_norm": 0.48693962642223004, | |
| "learning_rate": 1.0167338280369233e-05, | |
| "loss": 0.4932, | |
| "num_tokens": 442629431.0, | |
| "step": 4635 | |
| }, | |
| { | |
| "epoch": 0.7919440177504694, | |
| "grad_norm": 0.4190402335465006, | |
| "learning_rate": 1.0126919033526536e-05, | |
| "loss": 0.4898, | |
| "num_tokens": 443118717.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.7927974057006315, | |
| "grad_norm": 0.4243039182549761, | |
| "learning_rate": 1.0086638152849298e-05, | |
| "loss": 0.4846, | |
| "num_tokens": 443607834.0, | |
| "step": 4645 | |
| }, | |
| { | |
| "epoch": 0.7936507936507936, | |
| "grad_norm": 0.4241329578393716, | |
| "learning_rate": 1.0046495959150554e-05, | |
| "loss": 0.4714, | |
| "num_tokens": 444049001.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.7945041816009558, | |
| "grad_norm": 0.36907176685050724, | |
| "learning_rate": 1.0006492772138798e-05, | |
| "loss": 0.4749, | |
| "num_tokens": 444542442.0, | |
| "step": 4655 | |
| }, | |
| { | |
| "epoch": 0.7953575695511179, | |
| "grad_norm": 0.49174330221812856, | |
| "learning_rate": 9.966628910415413e-06, | |
| "loss": 0.486, | |
| "num_tokens": 444948197.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.7962109575012801, | |
| "grad_norm": 0.3516426361435758, | |
| "learning_rate": 9.926904691472134e-06, | |
| "loss": 0.4834, | |
| "num_tokens": 445471608.0, | |
| "step": 4665 | |
| }, | |
| { | |
| "epoch": 0.7970643454514422, | |
| "grad_norm": 0.388265976099864, | |
| "learning_rate": 9.887320431688521e-06, | |
| "loss": 0.4902, | |
| "num_tokens": 445993006.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.7979177334016043, | |
| "grad_norm": 0.418829317539944, | |
| "learning_rate": 9.847876446329457e-06, | |
| "loss": 0.4644, | |
| "num_tokens": 446478568.0, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.7987711213517665, | |
| "grad_norm": 0.41125748512347277, | |
| "learning_rate": 9.808573049542627e-06, | |
| "loss": 0.4664, | |
| "num_tokens": 446925677.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7996245093019286, | |
| "grad_norm": 0.3961023709296336, | |
| "learning_rate": 9.76941055435599e-06, | |
| "loss": 0.4824, | |
| "num_tokens": 447369647.0, | |
| "step": 4685 | |
| }, | |
| { | |
| "epoch": 0.8004778972520908, | |
| "grad_norm": 0.4247684866677576, | |
| "learning_rate": 9.730389272675331e-06, | |
| "loss": 0.4743, | |
| "num_tokens": 447799196.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.8013312852022529, | |
| "grad_norm": 0.39072870557170725, | |
| "learning_rate": 9.691509515281738e-06, | |
| "loss": 0.5007, | |
| "num_tokens": 448276910.0, | |
| "step": 4695 | |
| }, | |
| { | |
| "epoch": 0.802184673152415, | |
| "grad_norm": 0.38279348075398484, | |
| "learning_rate": 9.652771591829156e-06, | |
| "loss": 0.4744, | |
| "num_tokens": 448739177.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.8030380611025772, | |
| "grad_norm": 0.4116207254254995, | |
| "learning_rate": 9.614175810841896e-06, | |
| "loss": 0.4853, | |
| "num_tokens": 449236834.0, | |
| "step": 4705 | |
| }, | |
| { | |
| "epoch": 0.8038914490527393, | |
| "grad_norm": 0.41207753919520107, | |
| "learning_rate": 9.57572247971219e-06, | |
| "loss": 0.5108, | |
| "num_tokens": 449751958.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.8047448370029016, | |
| "grad_norm": 0.42520927322999524, | |
| "learning_rate": 9.53741190469776e-06, | |
| "loss": 0.5118, | |
| "num_tokens": 450200277.0, | |
| "step": 4715 | |
| }, | |
| { | |
| "epoch": 0.8055982249530637, | |
| "grad_norm": 0.3817947289670553, | |
| "learning_rate": 9.499244390919335e-06, | |
| "loss": 0.4914, | |
| "num_tokens": 450761568.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.8064516129032258, | |
| "grad_norm": 0.36287106751093384, | |
| "learning_rate": 9.461220242358268e-06, | |
| "loss": 0.4816, | |
| "num_tokens": 451273084.0, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.807305000853388, | |
| "grad_norm": 0.46610247831010887, | |
| "learning_rate": 9.42333976185409e-06, | |
| "loss": 0.4947, | |
| "num_tokens": 451740507.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.8081583888035501, | |
| "grad_norm": 0.4309592386303314, | |
| "learning_rate": 9.385603251102084e-06, | |
| "loss": 0.47, | |
| "num_tokens": 452206917.0, | |
| "step": 4735 | |
| }, | |
| { | |
| "epoch": 0.8090117767537123, | |
| "grad_norm": 0.4048285692111887, | |
| "learning_rate": 9.348011010650937e-06, | |
| "loss": 0.4622, | |
| "num_tokens": 452625293.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.8098651647038744, | |
| "grad_norm": 0.3666976161881287, | |
| "learning_rate": 9.310563339900272e-06, | |
| "loss": 0.4688, | |
| "num_tokens": 453114538.0, | |
| "step": 4745 | |
| }, | |
| { | |
| "epoch": 0.8107185526540365, | |
| "grad_norm": 0.46629164494716185, | |
| "learning_rate": 9.273260537098315e-06, | |
| "loss": 0.4844, | |
| "num_tokens": 453642012.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.8115719406041987, | |
| "grad_norm": 0.40865864820059056, | |
| "learning_rate": 9.236102899339519e-06, | |
| "loss": 0.475, | |
| "num_tokens": 454112377.0, | |
| "step": 4755 | |
| }, | |
| { | |
| "epoch": 0.8124253285543608, | |
| "grad_norm": 0.4149592263668008, | |
| "learning_rate": 9.199090722562156e-06, | |
| "loss": 0.4791, | |
| "num_tokens": 454558803.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.813278716504523, | |
| "grad_norm": 0.39367322676329386, | |
| "learning_rate": 9.162224301546025e-06, | |
| "loss": 0.4575, | |
| "num_tokens": 455019230.0, | |
| "step": 4765 | |
| }, | |
| { | |
| "epoch": 0.8141321044546851, | |
| "grad_norm": 0.42263411896685965, | |
| "learning_rate": 9.125503929910035e-06, | |
| "loss": 0.4834, | |
| "num_tokens": 455523392.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.8149854924048472, | |
| "grad_norm": 0.40835939639136054, | |
| "learning_rate": 9.08892990010992e-06, | |
| "loss": 0.4391, | |
| "num_tokens": 455978366.0, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.8158388803550094, | |
| "grad_norm": 0.4062138614954961, | |
| "learning_rate": 9.052502503435873e-06, | |
| "loss": 0.4688, | |
| "num_tokens": 456494011.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.8166922683051715, | |
| "grad_norm": 0.4863203256645935, | |
| "learning_rate": 9.016222030010259e-06, | |
| "loss": 0.4813, | |
| "num_tokens": 456916541.0, | |
| "step": 4785 | |
| }, | |
| { | |
| "epoch": 0.8175456562553337, | |
| "grad_norm": 0.41962218258769507, | |
| "learning_rate": 8.980088768785271e-06, | |
| "loss": 0.4851, | |
| "num_tokens": 457401137.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.8183990442054958, | |
| "grad_norm": 0.3706640025543311, | |
| "learning_rate": 8.94410300754067e-06, | |
| "loss": 0.464, | |
| "num_tokens": 457910346.0, | |
| "step": 4795 | |
| }, | |
| { | |
| "epoch": 0.819252432155658, | |
| "grad_norm": 0.40601220290687806, | |
| "learning_rate": 8.908265032881438e-06, | |
| "loss": 0.5163, | |
| "num_tokens": 458401399.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.8201058201058201, | |
| "grad_norm": 0.3870202244258349, | |
| "learning_rate": 8.872575130235533e-06, | |
| "loss": 0.4856, | |
| "num_tokens": 458905454.0, | |
| "step": 4805 | |
| }, | |
| { | |
| "epoch": 0.8209592080559822, | |
| "grad_norm": 0.3945659119881064, | |
| "learning_rate": 8.837033583851625e-06, | |
| "loss": 0.5122, | |
| "num_tokens": 459365515.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.8218125960061444, | |
| "grad_norm": 0.43390704825554965, | |
| "learning_rate": 8.801640676796811e-06, | |
| "loss": 0.4695, | |
| "num_tokens": 459811468.0, | |
| "step": 4815 | |
| }, | |
| { | |
| "epoch": 0.8226659839563065, | |
| "grad_norm": 0.4040597097207471, | |
| "learning_rate": 8.76639669095434e-06, | |
| "loss": 0.5054, | |
| "num_tokens": 460303546.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.8235193719064687, | |
| "grad_norm": 0.38179350259669326, | |
| "learning_rate": 8.73130190702143e-06, | |
| "loss": 0.4589, | |
| "num_tokens": 460721247.0, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.8243727598566308, | |
| "grad_norm": 0.4422390443417133, | |
| "learning_rate": 8.696356604506964e-06, | |
| "loss": 0.5035, | |
| "num_tokens": 461172152.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.8252261478067929, | |
| "grad_norm": 0.39710527746958324, | |
| "learning_rate": 8.66156106172932e-06, | |
| "loss": 0.4494, | |
| "num_tokens": 461673270.0, | |
| "step": 4835 | |
| }, | |
| { | |
| "epoch": 0.8260795357569551, | |
| "grad_norm": 0.4542632378822877, | |
| "learning_rate": 8.62691555581411e-06, | |
| "loss": 0.4552, | |
| "num_tokens": 462132037.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.8269329237071172, | |
| "grad_norm": 0.40299187015871685, | |
| "learning_rate": 8.592420362691994e-06, | |
| "loss": 0.4694, | |
| "num_tokens": 462668174.0, | |
| "step": 4845 | |
| }, | |
| { | |
| "epoch": 0.8277863116572794, | |
| "grad_norm": 0.4118631263499891, | |
| "learning_rate": 8.558075757096502e-06, | |
| "loss": 0.4891, | |
| "num_tokens": 463170048.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.8286396996074415, | |
| "grad_norm": 0.36294116101336305, | |
| "learning_rate": 8.523882012561792e-06, | |
| "loss": 0.4578, | |
| "num_tokens": 463640265.0, | |
| "step": 4855 | |
| }, | |
| { | |
| "epoch": 0.8294930875576036, | |
| "grad_norm": 0.4268548554777094, | |
| "learning_rate": 8.489839401420538e-06, | |
| "loss": 0.4494, | |
| "num_tokens": 464065062.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.8303464755077659, | |
| "grad_norm": 0.4338519095829496, | |
| "learning_rate": 8.455948194801706e-06, | |
| "loss": 0.4937, | |
| "num_tokens": 464538160.0, | |
| "step": 4865 | |
| }, | |
| { | |
| "epoch": 0.831199863457928, | |
| "grad_norm": 0.359490668074626, | |
| "learning_rate": 8.422208662628415e-06, | |
| "loss": 0.4607, | |
| "num_tokens": 465056306.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.8320532514080902, | |
| "grad_norm": 0.43048228222233315, | |
| "learning_rate": 8.388621073615803e-06, | |
| "loss": 0.5054, | |
| "num_tokens": 465553262.0, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.8329066393582523, | |
| "grad_norm": 0.35177253272904774, | |
| "learning_rate": 8.355185695268858e-06, | |
| "loss": 0.4639, | |
| "num_tokens": 466050672.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.8337600273084144, | |
| "grad_norm": 0.3861546477207878, | |
| "learning_rate": 8.321902793880301e-06, | |
| "loss": 0.4687, | |
| "num_tokens": 466492623.0, | |
| "step": 4885 | |
| }, | |
| { | |
| "epoch": 0.8346134152585766, | |
| "grad_norm": 0.37710521151750065, | |
| "learning_rate": 8.28877263452848e-06, | |
| "loss": 0.5128, | |
| "num_tokens": 467046976.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.8354668032087387, | |
| "grad_norm": 0.3808621303260848, | |
| "learning_rate": 8.255795481075228e-06, | |
| "loss": 0.4662, | |
| "num_tokens": 467556599.0, | |
| "step": 4895 | |
| }, | |
| { | |
| "epoch": 0.8363201911589009, | |
| "grad_norm": 0.46264787926707435, | |
| "learning_rate": 8.222971596163792e-06, | |
| "loss": 0.4495, | |
| "num_tokens": 468039326.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.837173579109063, | |
| "grad_norm": 0.4029161488123674, | |
| "learning_rate": 8.19030124121671e-06, | |
| "loss": 0.4504, | |
| "num_tokens": 468436102.0, | |
| "step": 4905 | |
| }, | |
| { | |
| "epoch": 0.8380269670592251, | |
| "grad_norm": 0.4725586775465415, | |
| "learning_rate": 8.157784676433764e-06, | |
| "loss": 0.4697, | |
| "num_tokens": 468922373.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.8388803550093873, | |
| "grad_norm": 0.3864722955209497, | |
| "learning_rate": 8.125422160789878e-06, | |
| "loss": 0.4718, | |
| "num_tokens": 469390865.0, | |
| "step": 4915 | |
| }, | |
| { | |
| "epoch": 0.8397337429595494, | |
| "grad_norm": 0.35131840074778387, | |
| "learning_rate": 8.093213952033072e-06, | |
| "loss": 0.4822, | |
| "num_tokens": 469868923.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.8405871309097116, | |
| "grad_norm": 0.37703675404513326, | |
| "learning_rate": 8.061160306682406e-06, | |
| "loss": 0.4697, | |
| "num_tokens": 470324067.0, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.8414405188598737, | |
| "grad_norm": 0.4249191051452189, | |
| "learning_rate": 8.029261480025922e-06, | |
| "loss": 0.4618, | |
| "num_tokens": 470752870.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.8422939068100358, | |
| "grad_norm": 0.4093624078176157, | |
| "learning_rate": 7.997517726118644e-06, | |
| "loss": 0.4736, | |
| "num_tokens": 471226811.0, | |
| "step": 4935 | |
| }, | |
| { | |
| "epoch": 0.843147294760198, | |
| "grad_norm": 0.43115891511692345, | |
| "learning_rate": 7.965929297780515e-06, | |
| "loss": 0.476, | |
| "num_tokens": 471694644.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.8440006827103601, | |
| "grad_norm": 0.37692439095375324, | |
| "learning_rate": 7.934496446594417e-06, | |
| "loss": 0.4702, | |
| "num_tokens": 472162188.0, | |
| "step": 4945 | |
| }, | |
| { | |
| "epoch": 0.8448540706605223, | |
| "grad_norm": 0.36710823313675706, | |
| "learning_rate": 7.903219422904158e-06, | |
| "loss": 0.463, | |
| "num_tokens": 472652423.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.8457074586106844, | |
| "grad_norm": 0.3700451821116543, | |
| "learning_rate": 7.87209847581245e-06, | |
| "loss": 0.4909, | |
| "num_tokens": 473220197.0, | |
| "step": 4955 | |
| }, | |
| { | |
| "epoch": 0.8465608465608465, | |
| "grad_norm": 0.359374962956843, | |
| "learning_rate": 7.841133853178975e-06, | |
| "loss": 0.4433, | |
| "num_tokens": 473648633.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.8474142345110087, | |
| "grad_norm": 0.38469060693858964, | |
| "learning_rate": 7.810325801618365e-06, | |
| "loss": 0.5004, | |
| "num_tokens": 474183826.0, | |
| "step": 4965 | |
| }, | |
| { | |
| "epoch": 0.8482676224611708, | |
| "grad_norm": 0.3575222345339292, | |
| "learning_rate": 7.779674566498263e-06, | |
| "loss": 0.473, | |
| "num_tokens": 474686021.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.849121010411333, | |
| "grad_norm": 0.38038821787708194, | |
| "learning_rate": 7.749180391937372e-06, | |
| "loss": 0.4766, | |
| "num_tokens": 475129499.0, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.8499743983614951, | |
| "grad_norm": 0.387099736274751, | |
| "learning_rate": 7.718843520803487e-06, | |
| "loss": 0.473, | |
| "num_tokens": 475597050.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.8508277863116572, | |
| "grad_norm": 0.397713261621825, | |
| "learning_rate": 7.688664194711592e-06, | |
| "loss": 0.4952, | |
| "num_tokens": 476066921.0, | |
| "step": 4985 | |
| }, | |
| { | |
| "epoch": 0.8516811742618194, | |
| "grad_norm": 0.4142209154791408, | |
| "learning_rate": 7.658642654021904e-06, | |
| "loss": 0.4898, | |
| "num_tokens": 476480863.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.8525345622119815, | |
| "grad_norm": 0.4498802507059042, | |
| "learning_rate": 7.628779137837981e-06, | |
| "loss": 0.4787, | |
| "num_tokens": 476992861.0, | |
| "step": 4995 | |
| }, | |
| { | |
| "epoch": 0.8533879501621437, | |
| "grad_norm": 0.3762284583585599, | |
| "learning_rate": 7.5990738840048174e-06, | |
| "loss": 0.4871, | |
| "num_tokens": 477480036.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8542413381123058, | |
| "grad_norm": 0.43376693427528423, | |
| "learning_rate": 7.569527129106931e-06, | |
| "loss": 0.4829, | |
| "num_tokens": 477950574.0, | |
| "step": 5005 | |
| }, | |
| { | |
| "epoch": 0.855094726062468, | |
| "grad_norm": 0.3864568502583971, | |
| "learning_rate": 7.540139108466496e-06, | |
| "loss": 0.4874, | |
| "num_tokens": 478441727.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.8559481140126302, | |
| "grad_norm": 0.39586516364740193, | |
| "learning_rate": 7.510910056141456e-06, | |
| "loss": 0.4731, | |
| "num_tokens": 478924136.0, | |
| "step": 5015 | |
| }, | |
| { | |
| "epoch": 0.8568015019627923, | |
| "grad_norm": 0.34616393211650737, | |
| "learning_rate": 7.481840204923681e-06, | |
| "loss": 0.4835, | |
| "num_tokens": 479450987.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.8576548899129545, | |
| "grad_norm": 0.39752301696936804, | |
| "learning_rate": 7.452929786337096e-06, | |
| "loss": 0.4937, | |
| "num_tokens": 480007196.0, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 0.8585082778631166, | |
| "grad_norm": 0.4396503810914459, | |
| "learning_rate": 7.424179030635831e-06, | |
| "loss": 0.4671, | |
| "num_tokens": 480480087.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.8593616658132787, | |
| "grad_norm": 0.4056180469840527, | |
| "learning_rate": 7.395588166802412e-06, | |
| "loss": 0.4915, | |
| "num_tokens": 480963358.0, | |
| "step": 5035 | |
| }, | |
| { | |
| "epoch": 0.8602150537634409, | |
| "grad_norm": 0.42654141295245285, | |
| "learning_rate": 7.367157422545904e-06, | |
| "loss": 0.4811, | |
| "num_tokens": 481428209.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.861068441713603, | |
| "grad_norm": 0.3793401011602741, | |
| "learning_rate": 7.338887024300134e-06, | |
| "loss": 0.4579, | |
| "num_tokens": 481929273.0, | |
| "step": 5045 | |
| }, | |
| { | |
| "epoch": 0.8619218296637652, | |
| "grad_norm": 0.39003290700480503, | |
| "learning_rate": 7.310777197221854e-06, | |
| "loss": 0.4567, | |
| "num_tokens": 482333846.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.8627752176139273, | |
| "grad_norm": 0.3835772255495926, | |
| "learning_rate": 7.282828165188976e-06, | |
| "loss": 0.4834, | |
| "num_tokens": 482795801.0, | |
| "step": 5055 | |
| }, | |
| { | |
| "epoch": 0.8636286055640894, | |
| "grad_norm": 0.4409396543496292, | |
| "learning_rate": 7.255040150798771e-06, | |
| "loss": 0.484, | |
| "num_tokens": 483299671.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.8644819935142516, | |
| "grad_norm": 0.35641284615004537, | |
| "learning_rate": 7.227413375366089e-06, | |
| "loss": 0.4831, | |
| "num_tokens": 483799061.0, | |
| "step": 5065 | |
| }, | |
| { | |
| "epoch": 0.8653353814644137, | |
| "grad_norm": 0.39766976111454977, | |
| "learning_rate": 7.199948058921629e-06, | |
| "loss": 0.501, | |
| "num_tokens": 484313085.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.8661887694145759, | |
| "grad_norm": 0.4646811042390154, | |
| "learning_rate": 7.1726444202101535e-06, | |
| "loss": 0.5044, | |
| "num_tokens": 484795514.0, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 0.867042157364738, | |
| "grad_norm": 0.39429012460569557, | |
| "learning_rate": 7.145502676688759e-06, | |
| "loss": 0.4768, | |
| "num_tokens": 485251856.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.8678955453149002, | |
| "grad_norm": 0.4464316703419056, | |
| "learning_rate": 7.1185230445251535e-06, | |
| "loss": 0.4902, | |
| "num_tokens": 485734962.0, | |
| "step": 5085 | |
| }, | |
| { | |
| "epoch": 0.8687489332650623, | |
| "grad_norm": 0.3749237455482977, | |
| "learning_rate": 7.091705738595911e-06, | |
| "loss": 0.4536, | |
| "num_tokens": 486246381.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.8696023212152244, | |
| "grad_norm": 0.36241853040964883, | |
| "learning_rate": 7.065050972484788e-06, | |
| "loss": 0.4586, | |
| "num_tokens": 486726337.0, | |
| "step": 5095 | |
| }, | |
| { | |
| "epoch": 0.8704557091653866, | |
| "grad_norm": 0.3855921581581282, | |
| "learning_rate": 7.038558958481001e-06, | |
| "loss": 0.4718, | |
| "num_tokens": 487198446.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.8713090971155487, | |
| "grad_norm": 0.41034470595646566, | |
| "learning_rate": 7.01222990757754e-06, | |
| "loss": 0.4883, | |
| "num_tokens": 487629317.0, | |
| "step": 5105 | |
| }, | |
| { | |
| "epoch": 0.8721624850657109, | |
| "grad_norm": 0.35651502619365183, | |
| "learning_rate": 6.986064029469508e-06, | |
| "loss": 0.4429, | |
| "num_tokens": 488156403.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.873015873015873, | |
| "grad_norm": 0.3546728377192537, | |
| "learning_rate": 6.9600615325524115e-06, | |
| "loss": 0.4614, | |
| "num_tokens": 488635779.0, | |
| "step": 5115 | |
| }, | |
| { | |
| "epoch": 0.8738692609660351, | |
| "grad_norm": 0.3631133122572483, | |
| "learning_rate": 6.934222623920547e-06, | |
| "loss": 0.4644, | |
| "num_tokens": 489140124.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.8747226489161973, | |
| "grad_norm": 0.43958527960566596, | |
| "learning_rate": 6.908547509365305e-06, | |
| "loss": 0.444, | |
| "num_tokens": 489547159.0, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 0.8755760368663594, | |
| "grad_norm": 0.3761310126308099, | |
| "learning_rate": 6.883036393373579e-06, | |
| "loss": 0.4878, | |
| "num_tokens": 490104565.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.8764294248165216, | |
| "grad_norm": 0.4225485979751788, | |
| "learning_rate": 6.857689479126099e-06, | |
| "loss": 0.4579, | |
| "num_tokens": 490529223.0, | |
| "step": 5135 | |
| }, | |
| { | |
| "epoch": 0.8772828127666837, | |
| "grad_norm": 0.37266255259581216, | |
| "learning_rate": 6.8325069684958235e-06, | |
| "loss": 0.4818, | |
| "num_tokens": 491054868.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.8781362007168458, | |
| "grad_norm": 0.3779071334870246, | |
| "learning_rate": 6.8074890620463394e-06, | |
| "loss": 0.4598, | |
| "num_tokens": 491580041.0, | |
| "step": 5145 | |
| }, | |
| { | |
| "epoch": 0.878989588667008, | |
| "grad_norm": 0.3948017220123988, | |
| "learning_rate": 6.782635959030259e-06, | |
| "loss": 0.468, | |
| "num_tokens": 492078546.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.8798429766171701, | |
| "grad_norm": 0.4159358655316634, | |
| "learning_rate": 6.7579478573876366e-06, | |
| "loss": 0.4813, | |
| "num_tokens": 492537624.0, | |
| "step": 5155 | |
| }, | |
| { | |
| "epoch": 0.8806963645673324, | |
| "grad_norm": 0.4517347316794219, | |
| "learning_rate": 6.733424953744391e-06, | |
| "loss": 0.476, | |
| "num_tokens": 492974236.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.8815497525174945, | |
| "grad_norm": 0.3562439340554458, | |
| "learning_rate": 6.709067443410733e-06, | |
| "loss": 0.4941, | |
| "num_tokens": 493521035.0, | |
| "step": 5165 | |
| }, | |
| { | |
| "epoch": 0.8824031404676566, | |
| "grad_norm": 0.3639915359744882, | |
| "learning_rate": 6.684875520379618e-06, | |
| "loss": 0.4491, | |
| "num_tokens": 494019470.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.8832565284178188, | |
| "grad_norm": 0.46460587508644224, | |
| "learning_rate": 6.66084937732519e-06, | |
| "loss": 0.5322, | |
| "num_tokens": 494510308.0, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 0.8841099163679809, | |
| "grad_norm": 0.39413960971710404, | |
| "learning_rate": 6.636989205601276e-06, | |
| "loss": 0.4868, | |
| "num_tokens": 495062198.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.8849633043181431, | |
| "grad_norm": 0.37656954889896244, | |
| "learning_rate": 6.613295195239816e-06, | |
| "loss": 0.4646, | |
| "num_tokens": 495546063.0, | |
| "step": 5185 | |
| }, | |
| { | |
| "epoch": 0.8858166922683052, | |
| "grad_norm": 0.36960663076371775, | |
| "learning_rate": 6.589767534949384e-06, | |
| "loss": 0.4557, | |
| "num_tokens": 496039347.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.8866700802184673, | |
| "grad_norm": 0.3438787406040836, | |
| "learning_rate": 6.5664064121136865e-06, | |
| "loss": 0.4516, | |
| "num_tokens": 496528322.0, | |
| "step": 5195 | |
| }, | |
| { | |
| "epoch": 0.8875234681686295, | |
| "grad_norm": 0.4113654714066015, | |
| "learning_rate": 6.543212012790038e-06, | |
| "loss": 0.4777, | |
| "num_tokens": 496987511.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.8883768561187916, | |
| "grad_norm": 0.4026168446978517, | |
| "learning_rate": 6.520184521707923e-06, | |
| "loss": 0.469, | |
| "num_tokens": 497455898.0, | |
| "step": 5205 | |
| }, | |
| { | |
| "epoch": 0.8892302440689538, | |
| "grad_norm": 0.4689279607693898, | |
| "learning_rate": 6.497324122267478e-06, | |
| "loss": 0.5074, | |
| "num_tokens": 497953501.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.8900836320191159, | |
| "grad_norm": 0.3879759334617288, | |
| "learning_rate": 6.474630996538078e-06, | |
| "loss": 0.4609, | |
| "num_tokens": 498465680.0, | |
| "step": 5215 | |
| }, | |
| { | |
| "epoch": 0.890937019969278, | |
| "grad_norm": 0.3627496108649411, | |
| "learning_rate": 6.452105325256852e-06, | |
| "loss": 0.4666, | |
| "num_tokens": 498971917.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.8917904079194402, | |
| "grad_norm": 0.42834814185964604, | |
| "learning_rate": 6.429747287827254e-06, | |
| "loss": 0.4808, | |
| "num_tokens": 499485913.0, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 0.8926437958696023, | |
| "grad_norm": 0.4137600502462467, | |
| "learning_rate": 6.407557062317632e-06, | |
| "loss": 0.4593, | |
| "num_tokens": 499959465.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.8934971838197645, | |
| "grad_norm": 0.4009391472354262, | |
| "learning_rate": 6.38553482545982e-06, | |
| "loss": 0.4704, | |
| "num_tokens": 500463764.0, | |
| "step": 5235 | |
| }, | |
| { | |
| "epoch": 0.8943505717699266, | |
| "grad_norm": 0.37847313213460887, | |
| "learning_rate": 6.36368075264772e-06, | |
| "loss": 0.4593, | |
| "num_tokens": 500947010.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.8952039597200887, | |
| "grad_norm": 0.374175327312065, | |
| "learning_rate": 6.341995017935916e-06, | |
| "loss": 0.4881, | |
| "num_tokens": 501511817.0, | |
| "step": 5245 | |
| }, | |
| { | |
| "epoch": 0.8960573476702509, | |
| "grad_norm": 0.422644927786315, | |
| "learning_rate": 6.320477794038258e-06, | |
| "loss": 0.4663, | |
| "num_tokens": 501972930.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.896910735620413, | |
| "grad_norm": 0.3750006488424808, | |
| "learning_rate": 6.299129252326541e-06, | |
| "loss": 0.4956, | |
| "num_tokens": 502474218.0, | |
| "step": 5255 | |
| }, | |
| { | |
| "epoch": 0.8977641235705752, | |
| "grad_norm": 0.46267661150363143, | |
| "learning_rate": 6.277949562829075e-06, | |
| "loss": 0.4854, | |
| "num_tokens": 502958987.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.8986175115207373, | |
| "grad_norm": 0.4050323850205571, | |
| "learning_rate": 6.256938894229389e-06, | |
| "loss": 0.4691, | |
| "num_tokens": 503509031.0, | |
| "step": 5265 | |
| }, | |
| { | |
| "epoch": 0.8994708994708994, | |
| "grad_norm": 0.38414160632964717, | |
| "learning_rate": 6.236097413864841e-06, | |
| "loss": 0.4709, | |
| "num_tokens": 503946765.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.9003242874210616, | |
| "grad_norm": 0.4094522813634835, | |
| "learning_rate": 6.215425287725328e-06, | |
| "loss": 0.4639, | |
| "num_tokens": 504415954.0, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 0.9011776753712237, | |
| "grad_norm": 0.3943122979866665, | |
| "learning_rate": 6.194922680451922e-06, | |
| "loss": 0.4657, | |
| "num_tokens": 504893463.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.9020310633213859, | |
| "grad_norm": 0.3472933602583215, | |
| "learning_rate": 6.17458975533559e-06, | |
| "loss": 0.4733, | |
| "num_tokens": 505408687.0, | |
| "step": 5285 | |
| }, | |
| { | |
| "epoch": 0.902884451271548, | |
| "grad_norm": 0.415147741113188, | |
| "learning_rate": 6.1544266743158805e-06, | |
| "loss": 0.4751, | |
| "num_tokens": 505899175.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.9037378392217101, | |
| "grad_norm": 0.42206700403189945, | |
| "learning_rate": 6.134433597979634e-06, | |
| "loss": 0.4795, | |
| "num_tokens": 506361572.0, | |
| "step": 5295 | |
| }, | |
| { | |
| "epoch": 0.9045912271718723, | |
| "grad_norm": 0.35892457909894904, | |
| "learning_rate": 6.114610685559708e-06, | |
| "loss": 0.4533, | |
| "num_tokens": 506861395.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.9054446151220344, | |
| "grad_norm": 0.39488494668114504, | |
| "learning_rate": 6.09495809493371e-06, | |
| "loss": 0.4919, | |
| "num_tokens": 507325945.0, | |
| "step": 5305 | |
| }, | |
| { | |
| "epoch": 0.9062980030721967, | |
| "grad_norm": 0.3769328536810784, | |
| "learning_rate": 6.0754759826227225e-06, | |
| "loss": 0.4597, | |
| "num_tokens": 507817589.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.9071513910223588, | |
| "grad_norm": 0.4035383669423095, | |
| "learning_rate": 6.056164503790092e-06, | |
| "loss": 0.4784, | |
| "num_tokens": 508268448.0, | |
| "step": 5315 | |
| }, | |
| { | |
| "epoch": 0.9080047789725209, | |
| "grad_norm": 0.4631912631589421, | |
| "learning_rate": 6.0370238122401495e-06, | |
| "loss": 0.4604, | |
| "num_tokens": 508690960.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.9088581669226831, | |
| "grad_norm": 0.4242137892677596, | |
| "learning_rate": 6.01805406041702e-06, | |
| "loss": 0.4658, | |
| "num_tokens": 509144003.0, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 0.9097115548728452, | |
| "grad_norm": 0.37573135722468437, | |
| "learning_rate": 5.999255399403401e-06, | |
| "loss": 0.4816, | |
| "num_tokens": 509649233.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.9105649428230074, | |
| "grad_norm": 0.44763057677091095, | |
| "learning_rate": 5.980627978919339e-06, | |
| "loss": 0.4705, | |
| "num_tokens": 510116279.0, | |
| "step": 5335 | |
| }, | |
| { | |
| "epoch": 0.9114183307731695, | |
| "grad_norm": 0.418264602572911, | |
| "learning_rate": 5.962171947321067e-06, | |
| "loss": 0.4718, | |
| "num_tokens": 510627944.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.9122717187233317, | |
| "grad_norm": 0.4055773276324188, | |
| "learning_rate": 5.943887451599798e-06, | |
| "loss": 0.4653, | |
| "num_tokens": 511076362.0, | |
| "step": 5345 | |
| }, | |
| { | |
| "epoch": 0.9131251066734938, | |
| "grad_norm": 0.37352133624457506, | |
| "learning_rate": 5.925774637380573e-06, | |
| "loss": 0.4857, | |
| "num_tokens": 511591407.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.9139784946236559, | |
| "grad_norm": 0.4027603546084621, | |
| "learning_rate": 5.9078336489210895e-06, | |
| "loss": 0.4896, | |
| "num_tokens": 512088847.0, | |
| "step": 5355 | |
| }, | |
| { | |
| "epoch": 0.9148318825738181, | |
| "grad_norm": 0.4088279197259369, | |
| "learning_rate": 5.890064629110552e-06, | |
| "loss": 0.5004, | |
| "num_tokens": 512558436.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.9156852705239802, | |
| "grad_norm": 0.41760753057367384, | |
| "learning_rate": 5.8724677194685435e-06, | |
| "loss": 0.4544, | |
| "num_tokens": 512991186.0, | |
| "step": 5365 | |
| }, | |
| { | |
| "epoch": 0.9165386584741424, | |
| "grad_norm": 0.39053477949305254, | |
| "learning_rate": 5.855043060143887e-06, | |
| "loss": 0.4787, | |
| "num_tokens": 513474763.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.9173920464243045, | |
| "grad_norm": 0.3714314177027866, | |
| "learning_rate": 5.83779078991354e-06, | |
| "loss": 0.4402, | |
| "num_tokens": 513939479.0, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 0.9182454343744666, | |
| "grad_norm": 0.40622174376674514, | |
| "learning_rate": 5.820711046181488e-06, | |
| "loss": 0.4683, | |
| "num_tokens": 514407245.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.9190988223246288, | |
| "grad_norm": 0.391039015286031, | |
| "learning_rate": 5.803803964977634e-06, | |
| "loss": 0.4442, | |
| "num_tokens": 514890031.0, | |
| "step": 5385 | |
| }, | |
| { | |
| "epoch": 0.9199522102747909, | |
| "grad_norm": 0.39149647645658614, | |
| "learning_rate": 5.7870696809567425e-06, | |
| "loss": 0.4643, | |
| "num_tokens": 515337057.0, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.9208055982249531, | |
| "grad_norm": 0.40708128247571407, | |
| "learning_rate": 5.770508327397339e-06, | |
| "loss": 0.4849, | |
| "num_tokens": 515817232.0, | |
| "step": 5395 | |
| }, | |
| { | |
| "epoch": 0.9216589861751152, | |
| "grad_norm": 0.4357094366342168, | |
| "learning_rate": 5.754120036200669e-06, | |
| "loss": 0.5188, | |
| "num_tokens": 516301089.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9225123741252773, | |
| "grad_norm": 0.39705418708488704, | |
| "learning_rate": 5.7379049378896406e-06, | |
| "loss": 0.4541, | |
| "num_tokens": 516744090.0, | |
| "step": 5405 | |
| }, | |
| { | |
| "epoch": 0.9233657620754395, | |
| "grad_norm": 0.4331846671080257, | |
| "learning_rate": 5.721863161607775e-06, | |
| "loss": 0.4939, | |
| "num_tokens": 517258131.0, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.9242191500256016, | |
| "grad_norm": 0.398920329815837, | |
| "learning_rate": 5.705994835118203e-06, | |
| "loss": 0.5021, | |
| "num_tokens": 517712020.0, | |
| "step": 5415 | |
| }, | |
| { | |
| "epoch": 0.9250725379757638, | |
| "grad_norm": 0.3994654563299424, | |
| "learning_rate": 5.6903000848026165e-06, | |
| "loss": 0.4818, | |
| "num_tokens": 518223298.0, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.9259259259259259, | |
| "grad_norm": 0.35899312035446906, | |
| "learning_rate": 5.674779035660291e-06, | |
| "loss": 0.4597, | |
| "num_tokens": 518685499.0, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 0.926779313876088, | |
| "grad_norm": 0.37962422057783574, | |
| "learning_rate": 5.659431811307065e-06, | |
| "loss": 0.4706, | |
| "num_tokens": 519130985.0, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.9276327018262502, | |
| "grad_norm": 0.3794193030615154, | |
| "learning_rate": 5.644258533974374e-06, | |
| "loss": 0.4778, | |
| "num_tokens": 519582640.0, | |
| "step": 5435 | |
| }, | |
| { | |
| "epoch": 0.9284860897764123, | |
| "grad_norm": 0.4030094643266786, | |
| "learning_rate": 5.629259324508267e-06, | |
| "loss": 0.4841, | |
| "num_tokens": 520051962.0, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.9293394777265745, | |
| "grad_norm": 0.4124858277827859, | |
| "learning_rate": 5.614434302368449e-06, | |
| "loss": 0.4561, | |
| "num_tokens": 520494923.0, | |
| "step": 5445 | |
| }, | |
| { | |
| "epoch": 0.9301928656767366, | |
| "grad_norm": 0.41785885327790534, | |
| "learning_rate": 5.599783585627322e-06, | |
| "loss": 0.4529, | |
| "num_tokens": 520934447.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.9310462536268987, | |
| "grad_norm": 0.411135131232383, | |
| "learning_rate": 5.585307290969054e-06, | |
| "loss": 0.4718, | |
| "num_tokens": 521440211.0, | |
| "step": 5455 | |
| }, | |
| { | |
| "epoch": 0.931899641577061, | |
| "grad_norm": 0.4025174206094342, | |
| "learning_rate": 5.571005533688649e-06, | |
| "loss": 0.4587, | |
| "num_tokens": 521910187.0, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.932753029527223, | |
| "grad_norm": 0.3754354548725426, | |
| "learning_rate": 5.556878427691023e-06, | |
| "loss": 0.4649, | |
| "num_tokens": 522379850.0, | |
| "step": 5465 | |
| }, | |
| { | |
| "epoch": 0.9336064174773853, | |
| "grad_norm": 0.3994106758521501, | |
| "learning_rate": 5.542926085490093e-06, | |
| "loss": 0.4449, | |
| "num_tokens": 522876883.0, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.9344598054275474, | |
| "grad_norm": 0.3975760928774059, | |
| "learning_rate": 5.529148618207897e-06, | |
| "loss": 0.4528, | |
| "num_tokens": 523362146.0, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 0.9353131933777095, | |
| "grad_norm": 0.4835964693967314, | |
| "learning_rate": 5.515546135573695e-06, | |
| "loss": 0.5137, | |
| "num_tokens": 523880084.0, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.9361665813278717, | |
| "grad_norm": 0.4241602372599853, | |
| "learning_rate": 5.5021187459230964e-06, | |
| "loss": 0.4682, | |
| "num_tokens": 524348977.0, | |
| "step": 5485 | |
| }, | |
| { | |
| "epoch": 0.9370199692780338, | |
| "grad_norm": 0.41720871328080034, | |
| "learning_rate": 5.4888665561972065e-06, | |
| "loss": 0.4766, | |
| "num_tokens": 524788350.0, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.937873357228196, | |
| "grad_norm": 0.3549601370629738, | |
| "learning_rate": 5.475789671941761e-06, | |
| "loss": 0.5128, | |
| "num_tokens": 525362978.0, | |
| "step": 5495 | |
| }, | |
| { | |
| "epoch": 0.9387267451783581, | |
| "grad_norm": 0.3550437206753368, | |
| "learning_rate": 5.462888197306301e-06, | |
| "loss": 0.4458, | |
| "num_tokens": 525832920.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9395801331285202, | |
| "grad_norm": 0.4085947037645355, | |
| "learning_rate": 5.450162235043325e-06, | |
| "loss": 0.4931, | |
| "num_tokens": 526345441.0, | |
| "step": 5505 | |
| }, | |
| { | |
| "epoch": 0.9404335210786824, | |
| "grad_norm": 0.3805970642957296, | |
| "learning_rate": 5.43761188650749e-06, | |
| "loss": 0.4814, | |
| "num_tokens": 526845340.0, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.9412869090288445, | |
| "grad_norm": 0.353643700022046, | |
| "learning_rate": 5.425237251654792e-06, | |
| "loss": 0.4456, | |
| "num_tokens": 527313723.0, | |
| "step": 5515 | |
| }, | |
| { | |
| "epoch": 0.9421402969790067, | |
| "grad_norm": 0.4067460726300466, | |
| "learning_rate": 5.41303842904177e-06, | |
| "loss": 0.4981, | |
| "num_tokens": 527753736.0, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.9429936849291688, | |
| "grad_norm": 0.3705779237525373, | |
| "learning_rate": 5.401015515824727e-06, | |
| "loss": 0.4654, | |
| "num_tokens": 528283757.0, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 0.9438470728793309, | |
| "grad_norm": 0.4025717732477277, | |
| "learning_rate": 5.389168607758956e-06, | |
| "loss": 0.4842, | |
| "num_tokens": 528798665.0, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.9447004608294931, | |
| "grad_norm": 0.4198788649356861, | |
| "learning_rate": 5.377497799197965e-06, | |
| "loss": 0.4855, | |
| "num_tokens": 529266311.0, | |
| "step": 5535 | |
| }, | |
| { | |
| "epoch": 0.9455538487796552, | |
| "grad_norm": 0.3903166097731886, | |
| "learning_rate": 5.366003183092747e-06, | |
| "loss": 0.4763, | |
| "num_tokens": 529711197.0, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.9464072367298174, | |
| "grad_norm": 0.4482830953651333, | |
| "learning_rate": 5.354684850991019e-06, | |
| "loss": 0.4699, | |
| "num_tokens": 530185801.0, | |
| "step": 5545 | |
| }, | |
| { | |
| "epoch": 0.9472606246799795, | |
| "grad_norm": 0.4441008472526631, | |
| "learning_rate": 5.343542893036508e-06, | |
| "loss": 0.4552, | |
| "num_tokens": 530632318.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.9481140126301416, | |
| "grad_norm": 0.4095198187364235, | |
| "learning_rate": 5.332577397968218e-06, | |
| "loss": 0.4866, | |
| "num_tokens": 531096656.0, | |
| "step": 5555 | |
| }, | |
| { | |
| "epoch": 0.9489674005803038, | |
| "grad_norm": 0.36259571086545295, | |
| "learning_rate": 5.321788453119741e-06, | |
| "loss": 0.4759, | |
| "num_tokens": 531553862.0, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.9498207885304659, | |
| "grad_norm": 0.38362830626868116, | |
| "learning_rate": 5.3111761444185486e-06, | |
| "loss": 0.4717, | |
| "num_tokens": 532021500.0, | |
| "step": 5565 | |
| }, | |
| { | |
| "epoch": 0.9506741764806281, | |
| "grad_norm": 0.381933501436336, | |
| "learning_rate": 5.300740556385312e-06, | |
| "loss": 0.5012, | |
| "num_tokens": 532532990.0, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.9515275644307902, | |
| "grad_norm": 0.415085528646988, | |
| "learning_rate": 5.29048177213323e-06, | |
| "loss": 0.5077, | |
| "num_tokens": 532955029.0, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 0.3955095109944891, | |
| "learning_rate": 5.280399873367359e-06, | |
| "loss": 0.4657, | |
| "num_tokens": 533448215.0, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.9532343403311145, | |
| "grad_norm": 0.4324698158038418, | |
| "learning_rate": 5.270494940383981e-06, | |
| "loss": 0.4697, | |
| "num_tokens": 533912863.0, | |
| "step": 5585 | |
| }, | |
| { | |
| "epoch": 0.9540877282812766, | |
| "grad_norm": 0.38508680134607826, | |
| "learning_rate": 5.260767052069932e-06, | |
| "loss": 0.481, | |
| "num_tokens": 534401740.0, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.9549411162314388, | |
| "grad_norm": 0.41907493817620006, | |
| "learning_rate": 5.251216285902014e-06, | |
| "loss": 0.4638, | |
| "num_tokens": 534880729.0, | |
| "step": 5595 | |
| }, | |
| { | |
| "epoch": 0.955794504181601, | |
| "grad_norm": 0.3628670891747334, | |
| "learning_rate": 5.241842717946349e-06, | |
| "loss": 0.4596, | |
| "num_tokens": 535341346.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.956647892131763, | |
| "grad_norm": 0.4031632125204859, | |
| "learning_rate": 5.232646422857779e-06, | |
| "loss": 0.4396, | |
| "num_tokens": 535841224.0, | |
| "step": 5605 | |
| }, | |
| { | |
| "epoch": 0.9575012800819253, | |
| "grad_norm": 0.3818135373590379, | |
| "learning_rate": 5.2236274738792755e-06, | |
| "loss": 0.4624, | |
| "num_tokens": 536296941.0, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.9583546680320874, | |
| "grad_norm": 0.4028517959227554, | |
| "learning_rate": 5.214785942841354e-06, | |
| "loss": 0.4728, | |
| "num_tokens": 536809649.0, | |
| "step": 5615 | |
| }, | |
| { | |
| "epoch": 0.9592080559822496, | |
| "grad_norm": 0.35150144199814176, | |
| "learning_rate": 5.206121900161511e-06, | |
| "loss": 0.4613, | |
| "num_tokens": 537336227.0, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.9600614439324117, | |
| "grad_norm": 0.3587893017483423, | |
| "learning_rate": 5.197635414843641e-06, | |
| "loss": 0.4597, | |
| "num_tokens": 537837698.0, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 0.9609148318825739, | |
| "grad_norm": 0.38358006159788616, | |
| "learning_rate": 5.189326554477508e-06, | |
| "loss": 0.4951, | |
| "num_tokens": 538331668.0, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.961768219832736, | |
| "grad_norm": 0.425209540606683, | |
| "learning_rate": 5.181195385238204e-06, | |
| "loss": 0.4597, | |
| "num_tokens": 538771273.0, | |
| "step": 5635 | |
| }, | |
| { | |
| "epoch": 0.9626216077828981, | |
| "grad_norm": 0.4002330650672327, | |
| "learning_rate": 5.173241971885606e-06, | |
| "loss": 0.4459, | |
| "num_tokens": 539226192.0, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.9634749957330603, | |
| "grad_norm": 0.37212381959731367, | |
| "learning_rate": 5.1654663777638825e-06, | |
| "loss": 0.4711, | |
| "num_tokens": 539718270.0, | |
| "step": 5645 | |
| }, | |
| { | |
| "epoch": 0.9643283836832224, | |
| "grad_norm": 0.45191748477762655, | |
| "learning_rate": 5.15786866480098e-06, | |
| "loss": 0.4714, | |
| "num_tokens": 540188216.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.9651817716333846, | |
| "grad_norm": 0.4065793108570635, | |
| "learning_rate": 5.150448893508114e-06, | |
| "loss": 0.4605, | |
| "num_tokens": 540588455.0, | |
| "step": 5655 | |
| }, | |
| { | |
| "epoch": 0.9660351595835467, | |
| "grad_norm": 0.41417161369706124, | |
| "learning_rate": 5.143207122979319e-06, | |
| "loss": 0.474, | |
| "num_tokens": 541043880.0, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.9668885475337088, | |
| "grad_norm": 0.4258744756265496, | |
| "learning_rate": 5.136143410890947e-06, | |
| "loss": 0.514, | |
| "num_tokens": 541519837.0, | |
| "step": 5665 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "grad_norm": 0.3676181767341479, | |
| "learning_rate": 5.129257813501227e-06, | |
| "loss": 0.4534, | |
| "num_tokens": 541979505.0, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.9685953234340331, | |
| "grad_norm": 0.4821703308089171, | |
| "learning_rate": 5.122550385649811e-06, | |
| "loss": 0.4595, | |
| "num_tokens": 542421933.0, | |
| "step": 5675 | |
| }, | |
| { | |
| "epoch": 0.9694487113841953, | |
| "grad_norm": 0.41439324767486935, | |
| "learning_rate": 5.116021180757339e-06, | |
| "loss": 0.475, | |
| "num_tokens": 542913487.0, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.9703020993343574, | |
| "grad_norm": 0.3841303425062739, | |
| "learning_rate": 5.1096702508250065e-06, | |
| "loss": 0.4517, | |
| "num_tokens": 543414338.0, | |
| "step": 5685 | |
| }, | |
| { | |
| "epoch": 0.9711554872845195, | |
| "grad_norm": 0.44571730015038735, | |
| "learning_rate": 5.103497646434162e-06, | |
| "loss": 0.5052, | |
| "num_tokens": 543888926.0, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.9720088752346817, | |
| "grad_norm": 0.4112190281935544, | |
| "learning_rate": 5.0975034167458985e-06, | |
| "loss": 0.521, | |
| "num_tokens": 544388851.0, | |
| "step": 5695 | |
| }, | |
| { | |
| "epoch": 0.9728622631848438, | |
| "grad_norm": 0.47941627145277926, | |
| "learning_rate": 5.0916876095006525e-06, | |
| "loss": 0.4728, | |
| "num_tokens": 544851416.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.973715651135006, | |
| "grad_norm": 0.42861708474605725, | |
| "learning_rate": 5.086050271017843e-06, | |
| "loss": 0.471, | |
| "num_tokens": 545315881.0, | |
| "step": 5705 | |
| }, | |
| { | |
| "epoch": 0.9745690390851681, | |
| "grad_norm": 0.4503219008426513, | |
| "learning_rate": 5.080591446195489e-06, | |
| "loss": 0.4813, | |
| "num_tokens": 545757943.0, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.9754224270353302, | |
| "grad_norm": 0.35058627310744045, | |
| "learning_rate": 5.075311178509852e-06, | |
| "loss": 0.4503, | |
| "num_tokens": 546177890.0, | |
| "step": 5715 | |
| }, | |
| { | |
| "epoch": 0.9762758149854924, | |
| "grad_norm": 0.44318533429007384, | |
| "learning_rate": 5.070209510015099e-06, | |
| "loss": 0.5075, | |
| "num_tokens": 546668069.0, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.9771292029356545, | |
| "grad_norm": 0.4091754430000602, | |
| "learning_rate": 5.065286481342953e-06, | |
| "loss": 0.4562, | |
| "num_tokens": 547099467.0, | |
| "step": 5725 | |
| }, | |
| { | |
| "epoch": 0.9779825908858167, | |
| "grad_norm": 0.37194505366086944, | |
| "learning_rate": 5.060542131702389e-06, | |
| "loss": 0.4706, | |
| "num_tokens": 547620964.0, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.9788359788359788, | |
| "grad_norm": 0.408950385695778, | |
| "learning_rate": 5.055976498879303e-06, | |
| "loss": 0.4763, | |
| "num_tokens": 548138400.0, | |
| "step": 5735 | |
| }, | |
| { | |
| "epoch": 0.9796893667861409, | |
| "grad_norm": 0.41137705466294106, | |
| "learning_rate": 5.05158961923622e-06, | |
| "loss": 0.4729, | |
| "num_tokens": 548580998.0, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.9805427547363031, | |
| "grad_norm": 0.41807101281527526, | |
| "learning_rate": 5.047381527712007e-06, | |
| "loss": 0.4828, | |
| "num_tokens": 549066487.0, | |
| "step": 5745 | |
| }, | |
| { | |
| "epoch": 0.9813961426864652, | |
| "grad_norm": 0.42368580278206597, | |
| "learning_rate": 5.0433522578215845e-06, | |
| "loss": 0.4831, | |
| "num_tokens": 549511852.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.9822495306366275, | |
| "grad_norm": 0.34533122926230386, | |
| "learning_rate": 5.039501841655672e-06, | |
| "loss": 0.4582, | |
| "num_tokens": 549998268.0, | |
| "step": 5755 | |
| }, | |
| { | |
| "epoch": 0.9831029185867896, | |
| "grad_norm": 0.4538131961141252, | |
| "learning_rate": 5.035830309880523e-06, | |
| "loss": 0.4511, | |
| "num_tokens": 550468465.0, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.9839563065369517, | |
| "grad_norm": 0.4573902491830718, | |
| "learning_rate": 5.032337691737683e-06, | |
| "loss": 0.4717, | |
| "num_tokens": 550912090.0, | |
| "step": 5765 | |
| }, | |
| { | |
| "epoch": 0.9848096944871139, | |
| "grad_norm": 0.427946617179411, | |
| "learning_rate": 5.0290240150437645e-06, | |
| "loss": 0.4515, | |
| "num_tokens": 551376796.0, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.985663082437276, | |
| "grad_norm": 0.36527196106682885, | |
| "learning_rate": 5.025889306190208e-06, | |
| "loss": 0.4794, | |
| "num_tokens": 551914960.0, | |
| "step": 5775 | |
| }, | |
| { | |
| "epoch": 0.9865164703874382, | |
| "grad_norm": 0.37659410878730065, | |
| "learning_rate": 5.0229335901430926e-06, | |
| "loss": 0.4902, | |
| "num_tokens": 552397091.0, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.9873698583376003, | |
| "grad_norm": 0.35280350877142214, | |
| "learning_rate": 5.020156890442924e-06, | |
| "loss": 0.4492, | |
| "num_tokens": 552867980.0, | |
| "step": 5785 | |
| }, | |
| { | |
| "epoch": 0.9882232462877624, | |
| "grad_norm": 0.3850345054474253, | |
| "learning_rate": 5.017559229204447e-06, | |
| "loss": 0.4631, | |
| "num_tokens": 553378344.0, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.9890766342379246, | |
| "grad_norm": 0.4059801207859906, | |
| "learning_rate": 5.015140627116475e-06, | |
| "loss": 0.4569, | |
| "num_tokens": 553828199.0, | |
| "step": 5795 | |
| }, | |
| { | |
| "epoch": 0.9899300221880867, | |
| "grad_norm": 0.39944971091987225, | |
| "learning_rate": 5.012901103441723e-06, | |
| "loss": 0.4609, | |
| "num_tokens": 554315309.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.9907834101382489, | |
| "grad_norm": 0.48592509059806677, | |
| "learning_rate": 5.010840676016651e-06, | |
| "loss": 0.4566, | |
| "num_tokens": 554755561.0, | |
| "step": 5805 | |
| }, | |
| { | |
| "epoch": 0.991636798088411, | |
| "grad_norm": 0.4111061441262132, | |
| "learning_rate": 5.008959361251331e-06, | |
| "loss": 0.4744, | |
| "num_tokens": 555217376.0, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.9924901860385731, | |
| "grad_norm": 0.4126842024328507, | |
| "learning_rate": 5.007257174129304e-06, | |
| "loss": 0.4811, | |
| "num_tokens": 555688174.0, | |
| "step": 5815 | |
| }, | |
| { | |
| "epoch": 0.9933435739887353, | |
| "grad_norm": 0.37937971044742413, | |
| "learning_rate": 5.00573412820747e-06, | |
| "loss": 0.5025, | |
| "num_tokens": 556147373.0, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.9941969619388974, | |
| "grad_norm": 0.4166951390316682, | |
| "learning_rate": 5.004390235615973e-06, | |
| "loss": 0.4729, | |
| "num_tokens": 556623249.0, | |
| "step": 5825 | |
| }, | |
| { | |
| "epoch": 0.9950503498890596, | |
| "grad_norm": 0.42632843588067976, | |
| "learning_rate": 5.003225507058114e-06, | |
| "loss": 0.4775, | |
| "num_tokens": 557076223.0, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.9959037378392217, | |
| "grad_norm": 0.42479258478371845, | |
| "learning_rate": 5.002239951810257e-06, | |
| "loss": 0.4541, | |
| "num_tokens": 557598145.0, | |
| "step": 5835 | |
| }, | |
| { | |
| "epoch": 0.9967571257893838, | |
| "grad_norm": 0.3528097150366589, | |
| "learning_rate": 5.001433577721758e-06, | |
| "loss": 0.462, | |
| "num_tokens": 558093284.0, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.997610513739546, | |
| "grad_norm": 0.4402206568911672, | |
| "learning_rate": 5.000806391214903e-06, | |
| "loss": 0.4898, | |
| "num_tokens": 558537100.0, | |
| "step": 5845 | |
| }, | |
| { | |
| "epoch": 0.9984639016897081, | |
| "grad_norm": 0.389763661900114, | |
| "learning_rate": 5.000358397284853e-06, | |
| "loss": 0.4455, | |
| "num_tokens": 558997916.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.9993172896398703, | |
| "grad_norm": 0.45734810451631475, | |
| "learning_rate": 5.0000895994996155e-06, | |
| "loss": 0.4783, | |
| "num_tokens": 559505562.0, | |
| "step": 5855 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "num_tokens": 559846292.0, | |
| "step": 5859, | |
| "total_flos": 1146448090169344.0, | |
| "train_loss": 0.5193908028399951, | |
| "train_runtime": 27532.3398, | |
| "train_samples_per_second": 3.404, | |
| "train_steps_per_second": 0.213 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 5859, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1146448090169344.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |