| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 807, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0186219739292365, |
| "grad_norm": 1.3907511234283447, |
| "learning_rate": 2.9268292682926833e-06, |
| "loss": 1.3041, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.037243947858473, |
| "grad_norm": 0.7414476275444031, |
| "learning_rate": 6.585365853658537e-06, |
| "loss": 1.3177, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.055865921787709494, |
| "grad_norm": 0.7080162763595581, |
| "learning_rate": 1.024390243902439e-05, |
| "loss": 1.3286, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.074487895716946, |
| "grad_norm": 0.7129006385803223, |
| "learning_rate": 1.3902439024390245e-05, |
| "loss": 1.2253, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0931098696461825, |
| "grad_norm": 0.5740082263946533, |
| "learning_rate": 1.7560975609756096e-05, |
| "loss": 1.2695, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.11173184357541899, |
| "grad_norm": 0.5253065228462219, |
| "learning_rate": 2.121951219512195e-05, |
| "loss": 1.2724, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1303538175046555, |
| "grad_norm": 0.6284242272377014, |
| "learning_rate": 2.4878048780487805e-05, |
| "loss": 1.1713, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.148975791433892, |
| "grad_norm": 0.5629732012748718, |
| "learning_rate": 2.8536585365853658e-05, |
| "loss": 1.1165, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.16759776536312848, |
| "grad_norm": 0.468484103679657, |
| "learning_rate": 2.999886462193363e-05, |
| "loss": 1.1549, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.186219739292365, |
| "grad_norm": 0.5735260844230652, |
| "learning_rate": 2.999192682284521e-05, |
| "loss": 1.1626, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2048417132216015, |
| "grad_norm": 0.44839179515838623, |
| "learning_rate": 2.997868490407536e-05, |
| "loss": 1.1194, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.22346368715083798, |
| "grad_norm": 0.4823947250843048, |
| "learning_rate": 2.9959144433863682e-05, |
| "loss": 1.1298, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.24208566108007448, |
| "grad_norm": 0.47787025570869446, |
| "learning_rate": 2.9933313628996742e-05, |
| "loss": 1.1535, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.260707635009311, |
| "grad_norm": 0.5934181809425354, |
| "learning_rate": 2.9901203351352888e-05, |
| "loss": 1.0981, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.27932960893854747, |
| "grad_norm": 0.5122277140617371, |
| "learning_rate": 2.9862827103334823e-05, |
| "loss": 1.0592, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.297951582867784, |
| "grad_norm": 0.5349485874176025, |
| "learning_rate": 2.9818201022191834e-05, |
| "loss": 1.0702, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3165735567970205, |
| "grad_norm": 0.5598180294036865, |
| "learning_rate": 2.9767343873234065e-05, |
| "loss": 1.0592, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.33519553072625696, |
| "grad_norm": 0.533814549446106, |
| "learning_rate": 2.9710277041941673e-05, |
| "loss": 1.0389, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3538175046554935, |
| "grad_norm": 0.5535034537315369, |
| "learning_rate": 2.9647024524972232e-05, |
| "loss": 1.0295, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.37243947858473, |
| "grad_norm": 0.6329430937767029, |
| "learning_rate": 2.957761292007011e-05, |
| "loss": 1.0265, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.39106145251396646, |
| "grad_norm": 0.5731159448623657, |
| "learning_rate": 2.9502071414882078e-05, |
| "loss": 1.0276, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.409683426443203, |
| "grad_norm": 0.5810534358024597, |
| "learning_rate": 2.9420431774683884e-05, |
| "loss": 1.0267, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.42830540037243947, |
| "grad_norm": 0.70155930519104, |
| "learning_rate": 2.933272832902294e-05, |
| "loss": 1.0017, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.44692737430167595, |
| "grad_norm": 0.6325356960296631, |
| "learning_rate": 2.9238997957282696e-05, |
| "loss": 0.9631, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4655493482309125, |
| "grad_norm": 0.6613216996192932, |
| "learning_rate": 2.913928007317482e-05, |
| "loss": 0.9426, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.48417132216014896, |
| "grad_norm": 0.692284345626831, |
| "learning_rate": 2.9033616608165715e-05, |
| "loss": 0.9294, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5027932960893855, |
| "grad_norm": 0.7366511225700378, |
| "learning_rate": 2.8922051993844277e-05, |
| "loss": 0.9085, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.521415270018622, |
| "grad_norm": 0.7105802297592163, |
| "learning_rate": 2.880463314323837e-05, |
| "loss": 0.8647, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5400372439478585, |
| "grad_norm": 0.7824246287345886, |
| "learning_rate": 2.8681409431087875e-05, |
| "loss": 0.8991, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.5586592178770949, |
| "grad_norm": 0.7299179434776306, |
| "learning_rate": 2.8552432673082543e-05, |
| "loss": 0.8591, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5772811918063314, |
| "grad_norm": 0.705094039440155, |
| "learning_rate": 2.8417757104073444e-05, |
| "loss": 0.8758, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.595903165735568, |
| "grad_norm": 0.7872571349143982, |
| "learning_rate": 2.8277439355267178e-05, |
| "loss": 0.8594, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6145251396648045, |
| "grad_norm": 0.8229089975357056, |
| "learning_rate": 2.8131538430412386e-05, |
| "loss": 0.9027, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.633147113594041, |
| "grad_norm": 0.8249003291130066, |
| "learning_rate": 2.798011568098862e-05, |
| "loss": 0.8756, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6517690875232774, |
| "grad_norm": 0.8125863671302795, |
| "learning_rate": 2.782323478040799e-05, |
| "loss": 0.8548, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.6703910614525139, |
| "grad_norm": 1.0271687507629395, |
| "learning_rate": 2.7660961697240414e-05, |
| "loss": 0.8183, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6890130353817505, |
| "grad_norm": 0.9223941564559937, |
| "learning_rate": 2.7493364667473816e-05, |
| "loss": 0.822, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.707635009310987, |
| "grad_norm": 0.801182210445404, |
| "learning_rate": 2.7320514165820762e-05, |
| "loss": 0.8089, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7262569832402235, |
| "grad_norm": 0.7897576689720154, |
| "learning_rate": 2.7142482876083833e-05, |
| "loss": 0.8041, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.74487895716946, |
| "grad_norm": 0.8355113863945007, |
| "learning_rate": 2.695934566059199e-05, |
| "loss": 0.7713, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7635009310986964, |
| "grad_norm": 0.9361415505409241, |
| "learning_rate": 2.6771179528720894e-05, |
| "loss": 0.7682, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.7821229050279329, |
| "grad_norm": 0.9785051345825195, |
| "learning_rate": 2.657806360451039e-05, |
| "loss": 0.7463, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8007448789571695, |
| "grad_norm": 0.885306715965271, |
| "learning_rate": 2.6380079093392776e-05, |
| "loss": 0.7197, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.819366852886406, |
| "grad_norm": 0.8461589813232422, |
| "learning_rate": 2.6177309248045835e-05, |
| "loss": 0.7056, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8379888268156425, |
| "grad_norm": 0.9384778141975403, |
| "learning_rate": 2.5969839333385012e-05, |
| "loss": 0.6813, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.8566108007448789, |
| "grad_norm": 0.9349907636642456, |
| "learning_rate": 2.575775659070942e-05, |
| "loss": 0.6834, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8752327746741154, |
| "grad_norm": 0.8804568648338318, |
| "learning_rate": 2.554115020101681e-05, |
| "loss": 0.7315, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.8938547486033519, |
| "grad_norm": 0.9351520538330078, |
| "learning_rate": 2.5320111247502856e-05, |
| "loss": 0.6991, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.9124767225325885, |
| "grad_norm": 1.1232490539550781, |
| "learning_rate": 2.5094732677260595e-05, |
| "loss": 0.6612, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.931098696461825, |
| "grad_norm": 0.8860473036766052, |
| "learning_rate": 2.486510926219609e-05, |
| "loss": 0.6943, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9497206703910615, |
| "grad_norm": 0.8939250707626343, |
| "learning_rate": 2.4631337559176722e-05, |
| "loss": 0.6973, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.9683426443202979, |
| "grad_norm": 1.2053793668746948, |
| "learning_rate": 2.4393515869428983e-05, |
| "loss": 0.6426, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9869646182495344, |
| "grad_norm": 1.0905733108520508, |
| "learning_rate": 2.415174419720267e-05, |
| "loss": 0.6378, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.0037243947858474, |
| "grad_norm": 1.0751861333847046, |
| "learning_rate": 2.3906124207719015e-05, |
| "loss": 0.6175, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.0223463687150838, |
| "grad_norm": 1.0828917026519775, |
| "learning_rate": 2.3656759184420396e-05, |
| "loss": 0.5873, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.0409683426443204, |
| "grad_norm": 1.1315468549728394, |
| "learning_rate": 2.3403753985539493e-05, |
| "loss": 0.6039, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0595903165735567, |
| "grad_norm": 1.1883121728897095, |
| "learning_rate": 2.3147215000006328e-05, |
| "loss": 0.5707, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.0782122905027933, |
| "grad_norm": 0.9794520735740662, |
| "learning_rate": 2.288725010271164e-05, |
| "loss": 0.5567, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.0968342644320297, |
| "grad_norm": 1.0168079137802124, |
| "learning_rate": 2.2623968609145334e-05, |
| "loss": 0.5513, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.1154562383612663, |
| "grad_norm": 1.0619779825210571, |
| "learning_rate": 2.2357481229429274e-05, |
| "loss": 0.55, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1340782122905029, |
| "grad_norm": 1.0750116109848022, |
| "learning_rate": 2.208790002176353e-05, |
| "loss": 0.5336, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.1527001862197392, |
| "grad_norm": 1.2124170064926147, |
| "learning_rate": 2.1815338345305796e-05, |
| "loss": 0.5499, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.1713221601489758, |
| "grad_norm": 1.076454520225525, |
| "learning_rate": 2.1539910812503786e-05, |
| "loss": 0.5901, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.1899441340782122, |
| "grad_norm": 1.0857462882995605, |
| "learning_rate": 2.1261733240900548e-05, |
| "loss": 0.5407, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.2085661080074488, |
| "grad_norm": 1.0985511541366577, |
| "learning_rate": 2.0980922604433114e-05, |
| "loss": 0.4982, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.2271880819366854, |
| "grad_norm": 1.0199052095413208, |
| "learning_rate": 2.069759698424482e-05, |
| "loss": 0.5383, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.2458100558659218, |
| "grad_norm": 1.0805414915084839, |
| "learning_rate": 2.0411875519032095e-05, |
| "loss": 0.5103, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.2644320297951583, |
| "grad_norm": 1.2961382865905762, |
| "learning_rate": 2.012387835494649e-05, |
| "loss": 0.5273, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.2830540037243947, |
| "grad_norm": 1.1369295120239258, |
| "learning_rate": 1.983372659507314e-05, |
| "loss": 0.5441, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.3016759776536313, |
| "grad_norm": 1.0780054330825806, |
| "learning_rate": 1.9541542248506786e-05, |
| "loss": 0.4934, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.3202979515828677, |
| "grad_norm": 1.2800835371017456, |
| "learning_rate": 1.9247448179046823e-05, |
| "loss": 0.5047, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.3389199255121043, |
| "grad_norm": 1.1320581436157227, |
| "learning_rate": 1.895156805353297e-05, |
| "loss": 0.4655, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3575418994413408, |
| "grad_norm": 1.2637457847595215, |
| "learning_rate": 1.865402628984324e-05, |
| "loss": 0.487, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.3761638733705772, |
| "grad_norm": 1.2081173658370972, |
| "learning_rate": 1.8354948004576103e-05, |
| "loss": 0.5081, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3947858472998138, |
| "grad_norm": 1.0850831270217896, |
| "learning_rate": 1.805445896043882e-05, |
| "loss": 0.4788, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.4134078212290504, |
| "grad_norm": 1.2690433263778687, |
| "learning_rate": 1.7752685513364138e-05, |
| "loss": 0.4973, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4320297951582868, |
| "grad_norm": 1.2087140083312988, |
| "learning_rate": 1.7449754559377456e-05, |
| "loss": 0.4749, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.4506517690875234, |
| "grad_norm": 1.000762939453125, |
| "learning_rate": 1.7145793481236956e-05, |
| "loss": 0.4559, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.4692737430167597, |
| "grad_norm": 1.0567432641983032, |
| "learning_rate": 1.6840930094869024e-05, |
| "loss": 0.4233, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.4878957169459963, |
| "grad_norm": 1.0863107442855835, |
| "learning_rate": 1.6535292595621516e-05, |
| "loss": 0.4539, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.5065176908752327, |
| "grad_norm": 1.3777681589126587, |
| "learning_rate": 1.622900950435751e-05, |
| "loss": 0.423, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.5251396648044693, |
| "grad_norm": 1.0469967126846313, |
| "learning_rate": 1.5922209613412132e-05, |
| "loss": 0.4851, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.5437616387337059, |
| "grad_norm": 1.2107429504394531, |
| "learning_rate": 1.5615021932435298e-05, |
| "loss": 0.4103, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.5623836126629422, |
| "grad_norm": 1.1506654024124146, |
| "learning_rate": 1.530757563414298e-05, |
| "loss": 0.4189, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.5810055865921788, |
| "grad_norm": 1.2449917793273926, |
| "learning_rate": 1.5e-05, |
| "loss": 0.4293, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.5996275605214154, |
| "grad_norm": 1.1543105840682983, |
| "learning_rate": 1.469242436585702e-05, |
| "loss": 0.4643, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.6182495344506518, |
| "grad_norm": 1.338456630706787, |
| "learning_rate": 1.4384978067564708e-05, |
| "loss": 0.4162, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.6368715083798882, |
| "grad_norm": 1.1966506242752075, |
| "learning_rate": 1.4077790386587867e-05, |
| "loss": 0.4461, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.6554934823091247, |
| "grad_norm": 1.169392466545105, |
| "learning_rate": 1.3770990495642493e-05, |
| "loss": 0.4363, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.6741154562383613, |
| "grad_norm": 1.2890475988388062, |
| "learning_rate": 1.3464707404378487e-05, |
| "loss": 0.4129, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.6927374301675977, |
| "grad_norm": 1.1962250471115112, |
| "learning_rate": 1.3159069905130979e-05, |
| "loss": 0.4012, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.7113594040968343, |
| "grad_norm": 1.236181378364563, |
| "learning_rate": 1.285420651876305e-05, |
| "loss": 0.4459, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.7299813780260709, |
| "grad_norm": 1.0868475437164307, |
| "learning_rate": 1.2550245440622547e-05, |
| "loss": 0.4048, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.7486033519553073, |
| "grad_norm": 1.1553728580474854, |
| "learning_rate": 1.2247314486635863e-05, |
| "loss": 0.4136, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.7672253258845436, |
| "grad_norm": 1.191113829612732, |
| "learning_rate": 1.1945541039561182e-05, |
| "loss": 0.4326, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.7858472998137802, |
| "grad_norm": 1.1444103717803955, |
| "learning_rate": 1.16450519954239e-05, |
| "loss": 0.3985, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.8044692737430168, |
| "grad_norm": 1.239621639251709, |
| "learning_rate": 1.1345973710156764e-05, |
| "loss": 0.4143, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.8230912476722532, |
| "grad_norm": 1.17149817943573, |
| "learning_rate": 1.1048431946467028e-05, |
| "loss": 0.3927, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.8417132216014898, |
| "grad_norm": 1.1904518604278564, |
| "learning_rate": 1.0752551820953181e-05, |
| "loss": 0.3833, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.8603351955307263, |
| "grad_norm": 1.2947860956192017, |
| "learning_rate": 1.0458457751493217e-05, |
| "loss": 0.3817, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.8789571694599627, |
| "grad_norm": 1.1897753477096558, |
| "learning_rate": 1.016627340492686e-05, |
| "loss": 0.3699, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.8975791433891993, |
| "grad_norm": 1.2094521522521973, |
| "learning_rate": 9.87612164505351e-06, |
| "loss": 0.3868, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.916201117318436, |
| "grad_norm": 1.259769082069397, |
| "learning_rate": 9.588124480967908e-06, |
| "loss": 0.4097, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.9348230912476723, |
| "grad_norm": 1.370043158531189, |
| "learning_rate": 9.30240301575518e-06, |
| "loss": 0.4034, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9534450651769086, |
| "grad_norm": 1.281767725944519, |
| "learning_rate": 9.019077395566892e-06, |
| "loss": 0.3576, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.9720670391061452, |
| "grad_norm": 1.2074289321899414, |
| "learning_rate": 8.738266759099456e-06, |
| "loss": 0.375, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.9906890130353818, |
| "grad_norm": 1.345578908920288, |
| "learning_rate": 8.460089187496217e-06, |
| "loss": 0.3934, |
| "step": 535 |
| }, |
| { |
| "epoch": 2.007448789571695, |
| "grad_norm": 1.2631962299346924, |
| "learning_rate": 8.184661654694204e-06, |
| "loss": 0.3523, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.026070763500931, |
| "grad_norm": 1.3945969343185425, |
| "learning_rate": 7.912099978236474e-06, |
| "loss": 0.3299, |
| "step": 545 |
| }, |
| { |
| "epoch": 2.0446927374301676, |
| "grad_norm": 1.1135674715042114, |
| "learning_rate": 7.642518770570722e-06, |
| "loss": 0.3233, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.063314711359404, |
| "grad_norm": 1.309389352798462, |
| "learning_rate": 7.376031390854668e-06, |
| "loss": 0.3096, |
| "step": 555 |
| }, |
| { |
| "epoch": 2.0819366852886407, |
| "grad_norm": 1.3764303922653198, |
| "learning_rate": 7.112749897288364e-06, |
| "loss": 0.2887, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.100558659217877, |
| "grad_norm": 1.0938595533370972, |
| "learning_rate": 6.852784999993673e-06, |
| "loss": 0.321, |
| "step": 565 |
| }, |
| { |
| "epoch": 2.1191806331471135, |
| "grad_norm": 1.326688528060913, |
| "learning_rate": 6.596246014460515e-06, |
| "loss": 0.2884, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.1378026070763503, |
| "grad_norm": 1.365779995918274, |
| "learning_rate": 6.343240815579605e-06, |
| "loss": 0.2677, |
| "step": 575 |
| }, |
| { |
| "epoch": 2.1564245810055866, |
| "grad_norm": 1.1633373498916626, |
| "learning_rate": 6.093875792280985e-06, |
| "loss": 0.314, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.175046554934823, |
| "grad_norm": 1.224949598312378, |
| "learning_rate": 5.848255802797335e-06, |
| "loss": 0.2781, |
| "step": 585 |
| }, |
| { |
| "epoch": 2.1936685288640594, |
| "grad_norm": 1.1924068927764893, |
| "learning_rate": 5.60648413057102e-06, |
| "loss": 0.3149, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.212290502793296, |
| "grad_norm": 1.1579667329788208, |
| "learning_rate": 5.368662440823281e-06, |
| "loss": 0.281, |
| "step": 595 |
| }, |
| { |
| "epoch": 2.2309124767225326, |
| "grad_norm": 1.1058433055877686, |
| "learning_rate": 5.134890737803913e-06, |
| "loss": 0.3227, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.249534450651769, |
| "grad_norm": 1.33882737159729, |
| "learning_rate": 4.905267322739406e-06, |
| "loss": 0.2766, |
| "step": 605 |
| }, |
| { |
| "epoch": 2.2681564245810057, |
| "grad_norm": 1.2620124816894531, |
| "learning_rate": 4.6798887524971445e-06, |
| "loss": 0.2762, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.286778398510242, |
| "grad_norm": 1.1448348760604858, |
| "learning_rate": 4.458849798983193e-06, |
| "loss": 0.2793, |
| "step": 615 |
| }, |
| { |
| "epoch": 2.3054003724394785, |
| "grad_norm": 1.2718420028686523, |
| "learning_rate": 4.242243409290579e-06, |
| "loss": 0.2583, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.3240223463687153, |
| "grad_norm": 1.3523516654968262, |
| "learning_rate": 4.030160666614992e-06, |
| "loss": 0.3118, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.3426443202979517, |
| "grad_norm": 1.157949447631836, |
| "learning_rate": 3.822690751954167e-06, |
| "loss": 0.2913, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.361266294227188, |
| "grad_norm": 1.3174906969070435, |
| "learning_rate": 3.619920906607224e-06, |
| "loss": 0.2831, |
| "step": 635 |
| }, |
| { |
| "epoch": 2.3798882681564244, |
| "grad_norm": 1.3369598388671875, |
| "learning_rate": 3.4219363954896128e-06, |
| "loss": 0.2635, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.398510242085661, |
| "grad_norm": 1.17523992061615, |
| "learning_rate": 3.2288204712791064e-06, |
| "loss": 0.2807, |
| "step": 645 |
| }, |
| { |
| "epoch": 2.4171322160148976, |
| "grad_norm": 1.3446990251541138, |
| "learning_rate": 3.0406543394080107e-06, |
| "loss": 0.2916, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.435754189944134, |
| "grad_norm": 1.2706341743469238, |
| "learning_rate": 2.85751712391617e-06, |
| "loss": 0.2959, |
| "step": 655 |
| }, |
| { |
| "epoch": 2.4543761638733708, |
| "grad_norm": 1.2393903732299805, |
| "learning_rate": 2.6794858341792415e-06, |
| "loss": 0.3176, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.472998137802607, |
| "grad_norm": 1.2350081205368042, |
| "learning_rate": 2.506635332526186e-06, |
| "loss": 0.283, |
| "step": 665 |
| }, |
| { |
| "epoch": 2.4916201117318435, |
| "grad_norm": 1.3355473279953003, |
| "learning_rate": 2.3390383027595846e-06, |
| "loss": 0.2387, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.51024208566108, |
| "grad_norm": 1.2572468519210815, |
| "learning_rate": 2.1767652195920153e-06, |
| "loss": 0.2688, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.5288640595903167, |
| "grad_norm": 1.3220757246017456, |
| "learning_rate": 2.0198843190113806e-06, |
| "loss": 0.2526, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.547486033519553, |
| "grad_norm": 1.2973809242248535, |
| "learning_rate": 1.8684615695876166e-06, |
| "loss": 0.2916, |
| "step": 685 |
| }, |
| { |
| "epoch": 2.5661080074487894, |
| "grad_norm": 1.1551833152770996, |
| "learning_rate": 1.722560644732824e-06, |
| "loss": 0.2999, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.5847299813780262, |
| "grad_norm": 1.147216796875, |
| "learning_rate": 1.5822428959265578e-06, |
| "loss": 0.2556, |
| "step": 695 |
| }, |
| { |
| "epoch": 2.6033519553072626, |
| "grad_norm": 1.3674964904785156, |
| "learning_rate": 1.44756732691746e-06, |
| "loss": 0.3018, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.621973929236499, |
| "grad_norm": 1.1120444536209106, |
| "learning_rate": 1.3185905689121247e-06, |
| "loss": 0.2711, |
| "step": 705 |
| }, |
| { |
| "epoch": 2.6405959031657353, |
| "grad_norm": 1.2605507373809814, |
| "learning_rate": 1.1953668567616322e-06, |
| "loss": 0.3158, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.659217877094972, |
| "grad_norm": 1.1186015605926514, |
| "learning_rate": 1.0779480061557261e-06, |
| "loss": 0.2482, |
| "step": 715 |
| }, |
| { |
| "epoch": 2.6778398510242085, |
| "grad_norm": 1.233724594116211, |
| "learning_rate": 9.66383391834285e-07, |
| "loss": 0.3035, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.6964618249534453, |
| "grad_norm": 1.151913046836853, |
| "learning_rate": 8.607199268251798e-07, |
| "loss": 0.2766, |
| "step": 725 |
| }, |
| { |
| "epoch": 2.7150837988826817, |
| "grad_norm": 1.5956870317459106, |
| "learning_rate": 7.610020427173048e-07, |
| "loss": 0.2467, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.733705772811918, |
| "grad_norm": 1.070102572441101, |
| "learning_rate": 6.672716709770594e-07, |
| "loss": 0.2215, |
| "step": 735 |
| }, |
| { |
| "epoch": 2.7523277467411544, |
| "grad_norm": 1.215812087059021, |
| "learning_rate": 5.795682253161177e-07, |
| "loss": 0.2713, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.770949720670391, |
| "grad_norm": 1.3350632190704346, |
| "learning_rate": 4.979285851179272e-07, |
| "loss": 0.2848, |
| "step": 745 |
| }, |
| { |
| "epoch": 2.7895716945996276, |
| "grad_norm": 1.2269939184188843, |
| "learning_rate": 4.2238707992989236e-07, |
| "loss": 0.28, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.808193668528864, |
| "grad_norm": 1.1725128889083862, |
| "learning_rate": 3.5297547502776804e-07, |
| "loss": 0.2845, |
| "step": 755 |
| }, |
| { |
| "epoch": 2.826815642458101, |
| "grad_norm": 1.2703720331192017, |
| "learning_rate": 2.897229580583288e-07, |
| "loss": 0.2902, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.845437616387337, |
| "grad_norm": 1.2266652584075928, |
| "learning_rate": 2.3265612676593827e-07, |
| "loss": 0.2969, |
| "step": 765 |
| }, |
| { |
| "epoch": 2.8640595903165735, |
| "grad_norm": 1.2293243408203125, |
| "learning_rate": 1.8179897780816578e-07, |
| "loss": 0.3124, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.88268156424581, |
| "grad_norm": 1.388389229774475, |
| "learning_rate": 1.3717289666517807e-07, |
| "loss": 0.2425, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.9013035381750467, |
| "grad_norm": 1.0714391469955444, |
| "learning_rate": 9.879664864711258e-08, |
| "loss": 0.2665, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.919925512104283, |
| "grad_norm": 1.2166239023208618, |
| "learning_rate": 6.668637100325948e-08, |
| "loss": 0.2968, |
| "step": 785 |
| }, |
| { |
| "epoch": 2.9385474860335195, |
| "grad_norm": 1.2198659181594849, |
| "learning_rate": 4.085556613631969e-08, |
| "loss": 0.3017, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.9571694599627563, |
| "grad_norm": 1.1453981399536133, |
| "learning_rate": 2.1315095924643314e-08, |
| "loss": 0.2665, |
| "step": 795 |
| }, |
| { |
| "epoch": 2.9757914338919926, |
| "grad_norm": 1.0881311893463135, |
| "learning_rate": 8.073177154789834e-09, |
| "loss": 0.2811, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.994413407821229, |
| "grad_norm": 1.3312422037124634, |
| "learning_rate": 1.135378066368653e-09, |
| "loss": 0.2901, |
| "step": 805 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 807, |
| "total_flos": 1.1559174971620065e+18, |
| "train_loss": 0.5622850377320356, |
| "train_runtime": 546.8929, |
| "train_samples_per_second": 47.132, |
| "train_steps_per_second": 1.476 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 807, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.1559174971620065e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |