| { |
| "best_global_step": 11542, |
| "best_metric": 3.3484463691711426, |
| "best_model_checkpoint": "sindhibert_session3/checkpoint-11542", |
| "epoch": 2.0, |
| "eval_steps": 5771, |
| "global_step": 11542, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.017328019407381736, |
| "grad_norm": 9.74232006072998, |
| "learning_rate": 5.147313691507799e-06, |
| "loss": 16.534342041015623, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03465603881476347, |
| "grad_norm": 9.413031578063965, |
| "learning_rate": 1.0346620450606586e-05, |
| "loss": 16.06064208984375, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.05198405822214521, |
| "grad_norm": 9.366157531738281, |
| "learning_rate": 1.554592720970537e-05, |
| "loss": 15.73246337890625, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.06931207762952694, |
| "grad_norm": 8.934579849243164, |
| "learning_rate": 2.074523396880416e-05, |
| "loss": 15.634798583984375, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.08664009703690868, |
| "grad_norm": 9.873139381408691, |
| "learning_rate": 2.594454072790295e-05, |
| "loss": 15.491142578125, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.10396811644429042, |
| "grad_norm": 9.112743377685547, |
| "learning_rate": 2.9999702019626288e-05, |
| "loss": 15.47271728515625, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.12129613585167215, |
| "grad_norm": 8.721996307373047, |
| "learning_rate": 2.999083739047451e-05, |
| "loss": 15.291612548828125, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.1386241552590539, |
| "grad_norm": 8.849467277526855, |
| "learning_rate": 2.9969667845201166e-05, |
| "loss": 15.32687255859375, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.15595217466643563, |
| "grad_norm": 8.970343589782715, |
| "learning_rate": 2.9936210760385845e-05, |
| "loss": 15.221800537109376, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.17328019407381737, |
| "grad_norm": 9.423188209533691, |
| "learning_rate": 2.9890493598578603e-05, |
| "loss": 15.21154541015625, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1906082134811991, |
| "grad_norm": 10.529290199279785, |
| "learning_rate": 2.9832553885757926e-05, |
| "loss": 15.091610107421875, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.20793623288858085, |
| "grad_norm": 8.895530700683594, |
| "learning_rate": 2.97624391805283e-05, |
| "loss": 15.116024169921875, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.22526425229596256, |
| "grad_norm": 9.481012344360352, |
| "learning_rate": 2.968020703508272e-05, |
| "loss": 15.086820068359375, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.2425922717033443, |
| "grad_norm": 8.957283020019531, |
| "learning_rate": 2.9585924947962195e-05, |
| "loss": 15.09182373046875, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.25992029111072606, |
| "grad_norm": 8.475807189941406, |
| "learning_rate": 2.9479670308650942e-05, |
| "loss": 14.974696044921876, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.2772483105181078, |
| "grad_norm": 8.860872268676758, |
| "learning_rate": 2.9361530334052883e-05, |
| "loss": 14.967041015625, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.29457632992548954, |
| "grad_norm": 8.990629196166992, |
| "learning_rate": 2.9231601996901433e-05, |
| "loss": 14.9465673828125, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.31190434933287126, |
| "grad_norm": 9.683910369873047, |
| "learning_rate": 2.9089991946161484e-05, |
| "loss": 14.9761962890625, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.32923236874025297, |
| "grad_norm": 9.044540405273438, |
| "learning_rate": 2.89368164194888e-05, |
| "loss": 14.89200927734375, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.34656038814763473, |
| "grad_norm": 8.935420036315918, |
| "learning_rate": 2.8772201147818787e-05, |
| "loss": 14.9054736328125, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.36388840755501645, |
| "grad_norm": 8.12104320526123, |
| "learning_rate": 2.8596281252162868e-05, |
| "loss": 14.8011767578125, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.3812164269623982, |
| "grad_norm": 9.633867263793945, |
| "learning_rate": 2.840920113269721e-05, |
| "loss": 14.789473876953124, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.3985444463697799, |
| "grad_norm": 9.07466983795166, |
| "learning_rate": 2.8211114350234873e-05, |
| "loss": 14.80165283203125, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.4158724657771617, |
| "grad_norm": 9.412736892700195, |
| "learning_rate": 2.8002183500178594e-05, |
| "loss": 14.746627197265624, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.4332004851845434, |
| "grad_norm": 9.755793571472168, |
| "learning_rate": 2.7782580079057772e-05, |
| "loss": 14.778804931640625, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.4505285045919251, |
| "grad_norm": 9.882634162902832, |
| "learning_rate": 2.7552484343759096e-05, |
| "loss": 14.704544677734376, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.4678565239993069, |
| "grad_norm": 9.305146217346191, |
| "learning_rate": 2.731208516356645e-05, |
| "loss": 14.75770751953125, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.4851845434066886, |
| "grad_norm": 9.269790649414062, |
| "learning_rate": 2.7061579865131508e-05, |
| "loss": 14.68646484375, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.5025125628140703, |
| "grad_norm": 9.310648918151855, |
| "learning_rate": 2.6801174070502248e-05, |
| "loss": 14.635621337890624, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.5198405822214521, |
| "grad_norm": 9.239577293395996, |
| "learning_rate": 2.653108152834241e-05, |
| "loss": 14.71250732421875, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.5371686016288338, |
| "grad_norm": 9.674842834472656, |
| "learning_rate": 2.6251523938480346e-05, |
| "loss": 14.602254638671875, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.5544966210362156, |
| "grad_norm": 10.178524017333984, |
| "learning_rate": 2.5962730769931346e-05, |
| "loss": 14.558492431640625, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.5718246404435973, |
| "grad_norm": 9.312729835510254, |
| "learning_rate": 2.5664939072542787e-05, |
| "loss": 14.588648681640626, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.5891526598509791, |
| "grad_norm": 9.438308715820312, |
| "learning_rate": 2.5358393282416714e-05, |
| "loss": 14.535865478515625, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.6064806792583608, |
| "grad_norm": 8.51146125793457, |
| "learning_rate": 2.5043345021269554e-05, |
| "loss": 14.5489208984375, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.6238086986657425, |
| "grad_norm": 9.856837272644043, |
| "learning_rate": 2.4720052889893698e-05, |
| "loss": 14.565177001953124, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.6411367180731242, |
| "grad_norm": 9.223260879516602, |
| "learning_rate": 2.4388782255890405e-05, |
| "loss": 14.452093505859375, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.6584647374805059, |
| "grad_norm": 9.016181945800781, |
| "learning_rate": 2.404980503584838e-05, |
| "loss": 14.49298828125, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.6757927568878878, |
| "grad_norm": 9.865802764892578, |
| "learning_rate": 2.370339947214669e-05, |
| "loss": 14.474598388671875, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.6931207762952695, |
| "grad_norm": 8.965621948242188, |
| "learning_rate": 2.3349849904565318e-05, |
| "loss": 14.46911376953125, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.7104487957026512, |
| "grad_norm": 8.362798690795898, |
| "learning_rate": 2.2989446536890786e-05, |
| "loss": 14.390712890625, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.7277768151100329, |
| "grad_norm": 10.564478874206543, |
| "learning_rate": 2.2622485198708445e-05, |
| "loss": 14.45989501953125, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.7451048345174146, |
| "grad_norm": 9.188340187072754, |
| "learning_rate": 2.2249267102576903e-05, |
| "loss": 14.422335205078125, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.7624328539247964, |
| "grad_norm": 9.867836952209473, |
| "learning_rate": 2.1870098596784012e-05, |
| "loss": 14.341461181640625, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.7797608733321781, |
| "grad_norm": 9.469503402709961, |
| "learning_rate": 2.148529091388725e-05, |
| "loss": 14.42570556640625, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.7970888927395599, |
| "grad_norm": 9.195992469787598, |
| "learning_rate": 2.1095159915244956e-05, |
| "loss": 14.3226025390625, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.8144169121469416, |
| "grad_norm": 9.930395126342773, |
| "learning_rate": 2.070002583174816e-05, |
| "loss": 14.317152099609375, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.8317449315543234, |
| "grad_norm": 9.45024299621582, |
| "learning_rate": 2.0300213000965707e-05, |
| "loss": 14.355799560546876, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.8490729509617051, |
| "grad_norm": 9.889897346496582, |
| "learning_rate": 1.989604960091854e-05, |
| "loss": 14.314393310546874, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.8664009703690868, |
| "grad_norm": 10.8844575881958, |
| "learning_rate": 1.948786738070162e-05, |
| "loss": 14.279014892578125, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.8837289897764685, |
| "grad_norm": 9.387309074401855, |
| "learning_rate": 1.9076001388174608e-05, |
| "loss": 14.240478515625, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.9010570091838502, |
| "grad_norm": 10.535667419433594, |
| "learning_rate": 1.866078969494479e-05, |
| "loss": 14.26585205078125, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.918385028591232, |
| "grad_norm": 9.147391319274902, |
| "learning_rate": 1.8242573118868094e-05, |
| "loss": 14.309058837890625, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.9357130479986138, |
| "grad_norm": 9.556977272033691, |
| "learning_rate": 1.7821694944295836e-05, |
| "loss": 14.21564453125, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.9530410674059955, |
| "grad_norm": 9.025933265686035, |
| "learning_rate": 1.7398500640296928e-05, |
| "loss": 14.192568359375, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.9703690868133772, |
| "grad_norm": 9.630436897277832, |
| "learning_rate": 1.6973337577086803e-05, |
| "loss": 14.193314208984376, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.987697106220759, |
| "grad_norm": 9.064878463745117, |
| "learning_rate": 1.6546554740895815e-05, |
| "loss": 14.1739111328125, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 3.394857168197632, |
| "eval_runtime": 22.6074, |
| "eval_samples_per_second": 660.048, |
| "eval_steps_per_second": 10.351, |
| "step": 5771 |
| }, |
| { |
| "epoch": 1.0050251256281406, |
| "grad_norm": 10.425875663757324, |
| "learning_rate": 1.611850244751118e-05, |
| "loss": 14.170721435546875, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.0223531450355225, |
| "grad_norm": 8.938867568969727, |
| "learning_rate": 1.5689532054727568e-05, |
| "loss": 14.155902099609374, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.0396811644429043, |
| "grad_norm": 9.677651405334473, |
| "learning_rate": 1.525999567394238e-05, |
| "loss": 14.137279052734375, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.057009183850286, |
| "grad_norm": 9.292587280273438, |
| "learning_rate": 1.4830245881132463e-05, |
| "loss": 14.072491455078126, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.0743372032576677, |
| "grad_norm": 9.849185943603516, |
| "learning_rate": 1.4400635427449486e-05, |
| "loss": 14.121292724609376, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.0916652226650494, |
| "grad_norm": 9.886627197265625, |
| "learning_rate": 1.3971516949671474e-05, |
| "loss": 14.058907470703126, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.108993242072431, |
| "grad_norm": 10.207225799560547, |
| "learning_rate": 1.3543242680748322e-05, |
| "loss": 14.07645263671875, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.1263212614798128, |
| "grad_norm": 10.271860122680664, |
| "learning_rate": 1.311616416067868e-05, |
| "loss": 14.01097412109375, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.1436492808871945, |
| "grad_norm": 9.155773162841797, |
| "learning_rate": 1.2690631947955715e-05, |
| "loss": 14.044959716796875, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.1609773002945762, |
| "grad_norm": 9.51415729522705, |
| "learning_rate": 1.2266995331818446e-05, |
| "loss": 14.045927734375, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.1783053197019582, |
| "grad_norm": 9.813915252685547, |
| "learning_rate": 1.184560204554501e-05, |
| "loss": 14.03373291015625, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.1956333391093399, |
| "grad_norm": 9.063379287719727, |
| "learning_rate": 1.1426797981023001e-05, |
| "loss": 14.052874755859374, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.2129613585167216, |
| "grad_norm": 9.815446853637695, |
| "learning_rate": 1.1010926904831378e-05, |
| "loss": 14.02966552734375, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.2302893779241033, |
| "grad_norm": 9.000835418701172, |
| "learning_rate": 1.0598330176066803e-05, |
| "loss": 14.0574609375, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.247617397331485, |
| "grad_norm": 9.734049797058105, |
| "learning_rate": 1.0189346466146175e-05, |
| "loss": 13.99876953125, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.2649454167388667, |
| "grad_norm": 9.00645923614502, |
| "learning_rate": 9.784311480815246e-06, |
| "loss": 14.043223876953125, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.2822734361462484, |
| "grad_norm": 9.783398628234863, |
| "learning_rate": 9.38355768459158e-06, |
| "loss": 13.970205078125, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.2996014555536302, |
| "grad_norm": 10.050580024719238, |
| "learning_rate": 8.98741402786796e-06, |
| "loss": 13.93072021484375, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.3169294749610119, |
| "grad_norm": 8.815546035766602, |
| "learning_rate": 8.596205676900367e-06, |
| "loss": 14.004686279296875, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.3342574943683938, |
| "grad_norm": 10.487207412719727, |
| "learning_rate": 8.210253746901994e-06, |
| "loss": 13.99391845703125, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.3515855137757753, |
| "grad_norm": 9.005860328674316, |
| "learning_rate": 7.829875038462556e-06, |
| "loss": 13.9050439453125, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.3689135331831572, |
| "grad_norm": 9.685938835144043, |
| "learning_rate": 7.4553817775091135e-06, |
| "loss": 13.942437744140625, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.386241552590539, |
| "grad_norm": 8.673903465270996, |
| "learning_rate": 7.087081359021974e-06, |
| "loss": 13.9566064453125, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.4035695719979207, |
| "grad_norm": 9.947382926940918, |
| "learning_rate": 6.7252760947158586e-06, |
| "loss": 13.9610302734375, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.4208975914053024, |
| "grad_norm": 10.123763084411621, |
| "learning_rate": 6.370262964893738e-06, |
| "loss": 13.928218994140625, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.438225610812684, |
| "grad_norm": 10.419230461120605, |
| "learning_rate": 6.0223333746766456e-06, |
| "loss": 13.940389404296875, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.4555536302200658, |
| "grad_norm": 9.33340835571289, |
| "learning_rate": 5.6817729148099585e-06, |
| "loss": 13.91553955078125, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.4728816496274475, |
| "grad_norm": 9.930194854736328, |
| "learning_rate": 5.3488611272421005e-06, |
| "loss": 13.920137939453125, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.4902096690348294, |
| "grad_norm": 9.439746856689453, |
| "learning_rate": 5.023871275668458e-06, |
| "loss": 13.894053955078125, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.507537688442211, |
| "grad_norm": 10.869328498840332, |
| "learning_rate": 4.707070121228482e-06, |
| "loss": 13.908199462890625, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.5248657078495929, |
| "grad_norm": 9.644942283630371, |
| "learning_rate": 4.398717703540468e-06, |
| "loss": 13.870057373046874, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.5421937272569746, |
| "grad_norm": 9.200098991394043, |
| "learning_rate": 4.099067127253367e-06, |
| "loss": 13.87569580078125, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.5595217466643563, |
| "grad_norm": 9.218724250793457, |
| "learning_rate": 3.8083643542912018e-06, |
| "loss": 13.833634033203126, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.576849766071738, |
| "grad_norm": 9.068521499633789, |
| "learning_rate": 3.526848001960283e-06, |
| "loss": 13.915274658203124, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.5941777854791197, |
| "grad_norm": 9.659659385681152, |
| "learning_rate": 3.2547491470852124e-06, |
| "loss": 13.857677001953125, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.6115058048865016, |
| "grad_norm": 9.85434341430664, |
| "learning_rate": 2.992291136334279e-06, |
| "loss": 13.899166259765625, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.6288338242938831, |
| "grad_norm": 9.265457153320312, |
| "learning_rate": 2.7396894028900064e-06, |
| "loss": 13.8499951171875, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.646161843701265, |
| "grad_norm": 9.825538635253906, |
| "learning_rate": 2.497151289615319e-06, |
| "loss": 13.880186767578126, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.6634898631086465, |
| "grad_norm": 9.972418785095215, |
| "learning_rate": 2.2648758788604805e-06, |
| "loss": 13.867176513671875, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.6808178825160285, |
| "grad_norm": 8.635448455810547, |
| "learning_rate": 2.043053829050502e-06, |
| "loss": 13.825604248046876, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.6981459019234102, |
| "grad_norm": 9.8888578414917, |
| "learning_rate": 1.8318672181871465e-06, |
| "loss": 13.817935791015625, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.715473921330792, |
| "grad_norm": 8.874034881591797, |
| "learning_rate": 1.631489394394005e-06, |
| "loss": 13.843865966796875, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.7328019407381736, |
| "grad_norm": 10.087613105773926, |
| "learning_rate": 1.4420848336272991e-06, |
| "loss": 13.775987548828125, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.7501299601455553, |
| "grad_norm": 9.76197624206543, |
| "learning_rate": 1.2638090046692313e-06, |
| "loss": 13.89355224609375, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.7674579795529373, |
| "grad_norm": 9.227717399597168, |
| "learning_rate": 1.0968082415146735e-06, |
| "loss": 13.792811279296876, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.7847859989603188, |
| "grad_norm": 9.555807113647461, |
| "learning_rate": 9.412196232559611e-07, |
| "loss": 13.825599365234375, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.8021140183677007, |
| "grad_norm": 9.286114692687988, |
| "learning_rate": 7.971708615643874e-07, |
| "loss": 13.7984130859375, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.8194420377750822, |
| "grad_norm": 10.93297290802002, |
| "learning_rate": 6.647801958607236e-07, |
| "loss": 13.85096435546875, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.836770057182464, |
| "grad_norm": 9.38337516784668, |
| "learning_rate": 5.441562962608837e-07, |
| "loss": 13.84087158203125, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.8540980765898458, |
| "grad_norm": 9.067867279052734, |
| "learning_rate": 4.353981743762975e-07, |
| "loss": 13.902230224609376, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.8714260959972275, |
| "grad_norm": 9.294976234436035, |
| "learning_rate": 3.385951020423256e-07, |
| "loss": 13.867879638671875, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.8887541154046092, |
| "grad_norm": 8.976114273071289, |
| "learning_rate": 2.5382653804130686e-07, |
| "loss": 13.81992919921875, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.906082134811991, |
| "grad_norm": 9.376502990722656, |
| "learning_rate": 1.8116206288049885e-07, |
| "loss": 13.830848388671875, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.923410154219373, |
| "grad_norm": 9.49721622467041, |
| "learning_rate": 1.2066132167835253e-07, |
| "loss": 13.855355224609376, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.9407381736267544, |
| "grad_norm": 10.23469352722168, |
| "learning_rate": 7.237397520607147e-08, |
| "loss": 13.8263525390625, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.9580661930341363, |
| "grad_norm": 9.898947715759277, |
| "learning_rate": 3.633965912460069e-08, |
| "loss": 13.86751708984375, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.9753942124415178, |
| "grad_norm": 9.46378231048584, |
| "learning_rate": 1.2587951450517832e-08, |
| "loss": 13.787728271484376, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.9927222318488997, |
| "grad_norm": 9.18976879119873, |
| "learning_rate": 1.1383482775406685e-09, |
| "loss": 13.8797900390625, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 3.3484463691711426, |
| "eval_runtime": 22.6234, |
| "eval_samples_per_second": 659.583, |
| "eval_steps_per_second": 10.343, |
| "step": 11542 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 11542, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 5771, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.776973151621775e+17, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|