| { | |
| "best_metric": 0.9182948490230906, | |
| "best_model_checkpoint": "./results/checkpoint-29910", | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 29910, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 4.954967021942139, | |
| "learning_rate": 4.9832831828819794e-05, | |
| "loss": 4.7403, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 9.918214797973633, | |
| "learning_rate": 4.9665663657639585e-05, | |
| "loss": 4.7281, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 6.391179084777832, | |
| "learning_rate": 4.949849548645938e-05, | |
| "loss": 4.6786, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 6.756315231323242, | |
| "learning_rate": 4.9331327315279175e-05, | |
| "loss": 4.6128, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 8.407713890075684, | |
| "learning_rate": 4.916415914409897e-05, | |
| "loss": 4.4836, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 8.354033470153809, | |
| "learning_rate": 4.899699097291876e-05, | |
| "loss": 4.3776, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 7.996518611907959, | |
| "learning_rate": 4.882982280173855e-05, | |
| "loss": 4.2701, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 14.100532531738281, | |
| "learning_rate": 4.866265463055835e-05, | |
| "loss": 4.1032, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 10.907315254211426, | |
| "learning_rate": 4.849548645937814e-05, | |
| "loss": 3.952, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 9.731605529785156, | |
| "learning_rate": 4.8328318288197924e-05, | |
| "loss": 3.732, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 9.989665985107422, | |
| "learning_rate": 4.816115011701772e-05, | |
| "loss": 3.5489, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 9.542133331298828, | |
| "learning_rate": 4.7993981945837514e-05, | |
| "loss": 3.3949, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 11.988595008850098, | |
| "learning_rate": 4.7826813774657305e-05, | |
| "loss": 3.216, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 13.553967475891113, | |
| "learning_rate": 4.76596456034771e-05, | |
| "loss": 2.9855, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 42.20621109008789, | |
| "learning_rate": 4.7492477432296895e-05, | |
| "loss": 2.7659, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 18.790130615234375, | |
| "learning_rate": 4.732530926111669e-05, | |
| "loss": 2.5604, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 20.554113388061523, | |
| "learning_rate": 4.715814108993648e-05, | |
| "loss": 2.4376, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 18.882707595825195, | |
| "learning_rate": 4.699097291875627e-05, | |
| "loss": 2.3501, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 14.733109474182129, | |
| "learning_rate": 4.682380474757606e-05, | |
| "loss": 2.168, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 17.430740356445312, | |
| "learning_rate": 4.665663657639586e-05, | |
| "loss": 2.0081, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 21.797836303710938, | |
| "learning_rate": 4.6489468405215644e-05, | |
| "loss": 1.9554, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 13.148958206176758, | |
| "learning_rate": 4.632230023403544e-05, | |
| "loss": 1.8524, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 14.161394119262695, | |
| "learning_rate": 4.6155132062855234e-05, | |
| "loss": 1.793, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 20.908519744873047, | |
| "learning_rate": 4.5987963891675026e-05, | |
| "loss": 1.6493, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 15.107952117919922, | |
| "learning_rate": 4.582079572049482e-05, | |
| "loss": 1.5724, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 18.561201095581055, | |
| "learning_rate": 4.5653627549314615e-05, | |
| "loss": 1.4915, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 15.365275382995605, | |
| "learning_rate": 4.548645937813441e-05, | |
| "loss": 1.488, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 16.04875946044922, | |
| "learning_rate": 4.53192912069542e-05, | |
| "loss": 1.4316, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 13.593673706054688, | |
| "learning_rate": 4.515212303577399e-05, | |
| "loss": 1.456, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 16.379798889160156, | |
| "learning_rate": 4.498495486459378e-05, | |
| "loss": 1.3006, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 13.564205169677734, | |
| "learning_rate": 4.481778669341358e-05, | |
| "loss": 1.2661, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 15.44586181640625, | |
| "learning_rate": 4.4650618522233364e-05, | |
| "loss": 1.2917, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 12.80644416809082, | |
| "learning_rate": 4.448345035105316e-05, | |
| "loss": 1.1765, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 19.545106887817383, | |
| "learning_rate": 4.4316282179872954e-05, | |
| "loss": 1.1622, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 14.377379417419434, | |
| "learning_rate": 4.414911400869275e-05, | |
| "loss": 1.1047, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 21.595245361328125, | |
| "learning_rate": 4.398194583751254e-05, | |
| "loss": 1.1384, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 14.641448020935059, | |
| "learning_rate": 4.3814777666332335e-05, | |
| "loss": 1.0872, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 13.082781791687012, | |
| "learning_rate": 4.364760949515213e-05, | |
| "loss": 1.0366, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 18.576641082763672, | |
| "learning_rate": 4.348044132397192e-05, | |
| "loss": 1.0953, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 9.915220260620117, | |
| "learning_rate": 4.331327315279171e-05, | |
| "loss": 1.001, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 12.059024810791016, | |
| "learning_rate": 4.31461049816115e-05, | |
| "loss": 1.0585, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 17.607337951660156, | |
| "learning_rate": 4.29789368104313e-05, | |
| "loss": 1.0179, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 16.324430465698242, | |
| "learning_rate": 4.2811768639251084e-05, | |
| "loss": 0.9491, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 19.5161075592041, | |
| "learning_rate": 4.264460046807088e-05, | |
| "loss": 0.9374, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 20.448488235473633, | |
| "learning_rate": 4.2477432296890674e-05, | |
| "loss": 0.9146, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 10.544804573059082, | |
| "learning_rate": 4.231026412571047e-05, | |
| "loss": 0.9187, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 17.095731735229492, | |
| "learning_rate": 4.214309595453026e-05, | |
| "loss": 0.8732, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 18.1314754486084, | |
| "learning_rate": 4.197592778335005e-05, | |
| "loss": 0.9072, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 8.516233444213867, | |
| "learning_rate": 4.180875961216985e-05, | |
| "loss": 0.8264, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 12.620676040649414, | |
| "learning_rate": 4.164159144098964e-05, | |
| "loss": 0.8425, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 23.544219970703125, | |
| "learning_rate": 4.147442326980943e-05, | |
| "loss": 0.8371, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 15.980536460876465, | |
| "learning_rate": 4.130725509862922e-05, | |
| "loss": 0.8257, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 16.621524810791016, | |
| "learning_rate": 4.114008692744902e-05, | |
| "loss": 0.7705, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 25.2496280670166, | |
| "learning_rate": 4.0972918756268804e-05, | |
| "loss": 0.7741, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 12.541385650634766, | |
| "learning_rate": 4.08057505850886e-05, | |
| "loss": 0.7408, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 30.975236892700195, | |
| "learning_rate": 4.0638582413908394e-05, | |
| "loss": 0.7417, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 16.33625030517578, | |
| "learning_rate": 4.0471414242728186e-05, | |
| "loss": 0.766, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 17.48399543762207, | |
| "learning_rate": 4.030424607154798e-05, | |
| "loss": 0.8336, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 19.421096801757812, | |
| "learning_rate": 4.013707790036777e-05, | |
| "loss": 0.7135, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.7995263469508584, | |
| "eval_f1": 0.7955612032049123, | |
| "eval_loss": 0.7165877223014832, | |
| "eval_precision": 0.805591523931921, | |
| "eval_recall": 0.7995263469508584, | |
| "eval_runtime": 64.1068, | |
| "eval_samples_per_second": 131.733, | |
| "eval_steps_per_second": 8.236, | |
| "step": 5982 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 10.486939430236816, | |
| "learning_rate": 3.996990972918757e-05, | |
| "loss": 0.685, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 19.489837646484375, | |
| "learning_rate": 3.980274155800736e-05, | |
| "loss": 0.6431, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 8.935369491577148, | |
| "learning_rate": 3.963557338682715e-05, | |
| "loss": 0.6402, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 10.298083305358887, | |
| "learning_rate": 3.946840521564694e-05, | |
| "loss": 0.6261, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 18.606569290161133, | |
| "learning_rate": 3.930123704446674e-05, | |
| "loss": 0.5874, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 12.412484169006348, | |
| "learning_rate": 3.913406887328653e-05, | |
| "loss": 0.5923, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 9.3939847946167, | |
| "learning_rate": 3.8966900702106316e-05, | |
| "loss": 0.6091, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 14.168825149536133, | |
| "learning_rate": 3.8799732530926114e-05, | |
| "loss": 0.6259, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 18.846487045288086, | |
| "learning_rate": 3.8632564359745906e-05, | |
| "loss": 0.5543, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 7.268430709838867, | |
| "learning_rate": 3.84653961885657e-05, | |
| "loss": 0.5615, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 6.565930366516113, | |
| "learning_rate": 3.829822801738549e-05, | |
| "loss": 0.5725, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 11.122172355651855, | |
| "learning_rate": 3.813105984620529e-05, | |
| "loss": 0.543, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 15.909794807434082, | |
| "learning_rate": 3.796389167502508e-05, | |
| "loss": 0.5053, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 17.935998916625977, | |
| "learning_rate": 3.779672350384487e-05, | |
| "loss": 0.5866, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 7.46903657913208, | |
| "learning_rate": 3.762955533266466e-05, | |
| "loss": 0.5573, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 10.208723068237305, | |
| "learning_rate": 3.746238716148446e-05, | |
| "loss": 0.511, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 15.062224388122559, | |
| "learning_rate": 3.729521899030425e-05, | |
| "loss": 0.5211, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 11.787239074707031, | |
| "learning_rate": 3.7128050819124036e-05, | |
| "loss": 0.5687, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 20.22210693359375, | |
| "learning_rate": 3.6960882647943834e-05, | |
| "loss": 0.544, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 22.17251205444336, | |
| "learning_rate": 3.6793714476763626e-05, | |
| "loss": 0.5223, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 16.83318519592285, | |
| "learning_rate": 3.662654630558342e-05, | |
| "loss": 0.5043, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 10.143548965454102, | |
| "learning_rate": 3.645937813440321e-05, | |
| "loss": 0.5181, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 20.629831314086914, | |
| "learning_rate": 3.629220996322301e-05, | |
| "loss": 0.4886, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 12.14686107635498, | |
| "learning_rate": 3.61250417920428e-05, | |
| "loss": 0.5667, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 17.1881160736084, | |
| "learning_rate": 3.595787362086259e-05, | |
| "loss": 0.5211, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 7.506267070770264, | |
| "learning_rate": 3.579070544968238e-05, | |
| "loss": 0.5356, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 23.122560501098633, | |
| "learning_rate": 3.562353727850217e-05, | |
| "loss": 0.5044, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 21.808191299438477, | |
| "learning_rate": 3.545636910732197e-05, | |
| "loss": 0.5059, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 12.899435997009277, | |
| "learning_rate": 3.5289200936141756e-05, | |
| "loss": 0.5082, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 11.228046417236328, | |
| "learning_rate": 3.5122032764961554e-05, | |
| "loss": 0.4466, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 15.656624794006348, | |
| "learning_rate": 3.4954864593781346e-05, | |
| "loss": 0.4877, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 14.958187103271484, | |
| "learning_rate": 3.478769642260114e-05, | |
| "loss": 0.4283, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 27.727924346923828, | |
| "learning_rate": 3.462052825142093e-05, | |
| "loss": 0.504, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 21.103147506713867, | |
| "learning_rate": 3.445336008024073e-05, | |
| "loss": 0.5081, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 14.884688377380371, | |
| "learning_rate": 3.428619190906052e-05, | |
| "loss": 0.47, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 26.825908660888672, | |
| "learning_rate": 3.411902373788031e-05, | |
| "loss": 0.4587, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 23.39227867126465, | |
| "learning_rate": 3.39518555667001e-05, | |
| "loss": 0.4621, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 15.503640174865723, | |
| "learning_rate": 3.378468739551989e-05, | |
| "loss": 0.5122, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 13.298539161682129, | |
| "learning_rate": 3.361751922433969e-05, | |
| "loss": 0.4846, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 17.961261749267578, | |
| "learning_rate": 3.3450351053159476e-05, | |
| "loss": 0.4576, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 15.622933387756348, | |
| "learning_rate": 3.3283182881979274e-05, | |
| "loss": 0.4239, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 15.286486625671387, | |
| "learning_rate": 3.3116014710799066e-05, | |
| "loss": 0.4478, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 28.045799255371094, | |
| "learning_rate": 3.294884653961886e-05, | |
| "loss": 0.4457, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 23.578136444091797, | |
| "learning_rate": 3.278167836843865e-05, | |
| "loss": 0.464, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 12.858305931091309, | |
| "learning_rate": 3.261451019725844e-05, | |
| "loss": 0.4507, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 18.197952270507812, | |
| "learning_rate": 3.244734202607824e-05, | |
| "loss": 0.4158, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 5.134513854980469, | |
| "learning_rate": 3.228017385489803e-05, | |
| "loss": 0.4088, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 2.1014363765716553, | |
| "learning_rate": 3.211300568371782e-05, | |
| "loss": 0.4524, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 14.459040641784668, | |
| "learning_rate": 3.194583751253761e-05, | |
| "loss": 0.4637, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 29.922468185424805, | |
| "learning_rate": 3.177866934135741e-05, | |
| "loss": 0.4302, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 23.523460388183594, | |
| "learning_rate": 3.1611501170177196e-05, | |
| "loss": 0.4155, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 11.668371200561523, | |
| "learning_rate": 3.1444332998996994e-05, | |
| "loss": 0.4238, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 15.930005073547363, | |
| "learning_rate": 3.1277164827816786e-05, | |
| "loss": 0.4072, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 18.61160659790039, | |
| "learning_rate": 3.110999665663658e-05, | |
| "loss": 0.4348, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 27.475053787231445, | |
| "learning_rate": 3.094282848545637e-05, | |
| "loss": 0.4648, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 6.477468013763428, | |
| "learning_rate": 3.077566031427616e-05, | |
| "loss": 0.4241, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 26.99014663696289, | |
| "learning_rate": 3.060849214309596e-05, | |
| "loss": 0.4243, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 16.152755737304688, | |
| "learning_rate": 3.0441323971915747e-05, | |
| "loss": 0.4186, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 15.536150932312012, | |
| "learning_rate": 3.0274155800735542e-05, | |
| "loss": 0.3808, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 23.708145141601562, | |
| "learning_rate": 3.0106987629555333e-05, | |
| "loss": 0.4365, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.8680876258140912, | |
| "eval_f1": 0.8628914936078326, | |
| "eval_loss": 0.4633374810218811, | |
| "eval_precision": 0.8684864554322808, | |
| "eval_recall": 0.8680876258140912, | |
| "eval_runtime": 64.0052, | |
| "eval_samples_per_second": 131.942, | |
| "eval_steps_per_second": 8.249, | |
| "step": 11964 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 10.474257469177246, | |
| "learning_rate": 2.9939819458375128e-05, | |
| "loss": 0.3853, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 15.668170928955078, | |
| "learning_rate": 2.977265128719492e-05, | |
| "loss": 0.2858, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 10.29902172088623, | |
| "learning_rate": 2.960548311601471e-05, | |
| "loss": 0.2803, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 33.27579116821289, | |
| "learning_rate": 2.9438314944834506e-05, | |
| "loss": 0.2858, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 13.799466133117676, | |
| "learning_rate": 2.9271146773654294e-05, | |
| "loss": 0.2793, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 4.722692489624023, | |
| "learning_rate": 2.9103978602474092e-05, | |
| "loss": 0.2935, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 8.643231391906738, | |
| "learning_rate": 2.893681043129388e-05, | |
| "loss": 0.2825, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 10.378469467163086, | |
| "learning_rate": 2.876964226011368e-05, | |
| "loss": 0.2845, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 9.1376953125, | |
| "learning_rate": 2.8602474088933467e-05, | |
| "loss": 0.2725, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 10.372312545776367, | |
| "learning_rate": 2.8435305917753262e-05, | |
| "loss": 0.3067, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 23.952699661254883, | |
| "learning_rate": 2.8268137746573053e-05, | |
| "loss": 0.2934, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 2.125562906265259, | |
| "learning_rate": 2.8100969575392848e-05, | |
| "loss": 0.2535, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 8.090828895568848, | |
| "learning_rate": 2.793380140421264e-05, | |
| "loss": 0.295, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 13.274210929870605, | |
| "learning_rate": 2.776663323303243e-05, | |
| "loss": 0.2851, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 5.6807732582092285, | |
| "learning_rate": 2.7599465061852226e-05, | |
| "loss": 0.2662, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 11.885269165039062, | |
| "learning_rate": 2.7432296890672014e-05, | |
| "loss": 0.2969, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 21.52318000793457, | |
| "learning_rate": 2.7265128719491812e-05, | |
| "loss": 0.2706, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 21.661279678344727, | |
| "learning_rate": 2.70979605483116e-05, | |
| "loss": 0.2715, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 27.985078811645508, | |
| "learning_rate": 2.69307923771314e-05, | |
| "loss": 0.3016, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 11.431729316711426, | |
| "learning_rate": 2.6763624205951187e-05, | |
| "loss": 0.2501, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 5.3406901359558105, | |
| "learning_rate": 2.6596456034770982e-05, | |
| "loss": 0.2762, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 11.077746391296387, | |
| "learning_rate": 2.6429287863590773e-05, | |
| "loss": 0.2819, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 17.451330184936523, | |
| "learning_rate": 2.6262119692410565e-05, | |
| "loss": 0.3074, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 7.353370189666748, | |
| "learning_rate": 2.609495152123036e-05, | |
| "loss": 0.3068, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 12.055102348327637, | |
| "learning_rate": 2.592778335005015e-05, | |
| "loss": 0.2779, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 17.555917739868164, | |
| "learning_rate": 2.5760615178869946e-05, | |
| "loss": 0.2421, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 22.887771606445312, | |
| "learning_rate": 2.5593447007689734e-05, | |
| "loss": 0.3016, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.915899395942688, | |
| "learning_rate": 2.5426278836509533e-05, | |
| "loss": 0.2638, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 13.446496963500977, | |
| "learning_rate": 2.525911066532932e-05, | |
| "loss": 0.293, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 12.734638214111328, | |
| "learning_rate": 2.509194249414912e-05, | |
| "loss": 0.2668, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 15.557112693786621, | |
| "learning_rate": 2.4924774322968907e-05, | |
| "loss": 0.2691, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 10.383445739746094, | |
| "learning_rate": 2.4757606151788702e-05, | |
| "loss": 0.2204, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 7.19666862487793, | |
| "learning_rate": 2.4590437980608493e-05, | |
| "loss": 0.2447, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 17.903339385986328, | |
| "learning_rate": 2.442326980942829e-05, | |
| "loss": 0.2504, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 10.492616653442383, | |
| "learning_rate": 2.425610163824808e-05, | |
| "loss": 0.2256, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 11.051074028015137, | |
| "learning_rate": 2.408893346706787e-05, | |
| "loss": 0.259, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 23.400402069091797, | |
| "learning_rate": 2.3921765295887663e-05, | |
| "loss": 0.2487, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 20.601686477661133, | |
| "learning_rate": 2.3754597124707458e-05, | |
| "loss": 0.2338, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 12.519159317016602, | |
| "learning_rate": 2.358742895352725e-05, | |
| "loss": 0.2652, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 21.95683479309082, | |
| "learning_rate": 2.342026078234704e-05, | |
| "loss": 0.2306, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 24.98236656188965, | |
| "learning_rate": 2.3253092611166836e-05, | |
| "loss": 0.2475, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 6.362200736999512, | |
| "learning_rate": 2.3085924439986627e-05, | |
| "loss": 0.2646, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 14.293391227722168, | |
| "learning_rate": 2.2918756268806422e-05, | |
| "loss": 0.2404, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 11.405878067016602, | |
| "learning_rate": 2.2751588097626213e-05, | |
| "loss": 0.2651, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 15.082180976867676, | |
| "learning_rate": 2.258441992644601e-05, | |
| "loss": 0.281, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 27.33397674560547, | |
| "learning_rate": 2.2417251755265796e-05, | |
| "loss": 0.2492, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 10.052102088928223, | |
| "learning_rate": 2.225008358408559e-05, | |
| "loss": 0.2382, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 15.405964851379395, | |
| "learning_rate": 2.2082915412905383e-05, | |
| "loss": 0.2496, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 7.162382125854492, | |
| "learning_rate": 2.1915747241725178e-05, | |
| "loss": 0.2343, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 11.130888938903809, | |
| "learning_rate": 2.174857907054497e-05, | |
| "loss": 0.2474, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 8.277360916137695, | |
| "learning_rate": 2.158141089936476e-05, | |
| "loss": 0.2687, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 31.100744247436523, | |
| "learning_rate": 2.1414242728184556e-05, | |
| "loss": 0.2422, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 12.757442474365234, | |
| "learning_rate": 2.1247074557004347e-05, | |
| "loss": 0.2275, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 4.860738277435303, | |
| "learning_rate": 2.1079906385824142e-05, | |
| "loss": 0.2252, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 10.574835777282715, | |
| "learning_rate": 2.091273821464393e-05, | |
| "loss": 0.2114, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 13.01117992401123, | |
| "learning_rate": 2.0745570043463725e-05, | |
| "loss": 0.2407, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 4.970390319824219, | |
| "learning_rate": 2.0578401872283517e-05, | |
| "loss": 0.2509, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 18.95350456237793, | |
| "learning_rate": 2.041123370110331e-05, | |
| "loss": 0.2814, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.5296308994293213, | |
| "learning_rate": 2.0244065529923103e-05, | |
| "loss": 0.235, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 12.501904487609863, | |
| "learning_rate": 2.0076897358742898e-05, | |
| "loss": 0.2479, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.8965068087625814, | |
| "eval_f1": 0.8930257247589533, | |
| "eval_loss": 0.36622655391693115, | |
| "eval_precision": 0.8950199629292306, | |
| "eval_recall": 0.8965068087625814, | |
| "eval_runtime": 64.0862, | |
| "eval_samples_per_second": 131.776, | |
| "eval_steps_per_second": 8.239, | |
| "step": 17946 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 19.13836097717285, | |
| "learning_rate": 1.990972918756269e-05, | |
| "loss": 0.2272, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 8.622084617614746, | |
| "learning_rate": 1.9742561016382484e-05, | |
| "loss": 0.131, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 32.99411392211914, | |
| "learning_rate": 1.9575392845202276e-05, | |
| "loss": 0.1477, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 5.467390060424805, | |
| "learning_rate": 1.9408224674022067e-05, | |
| "loss": 0.1439, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 2.5153982639312744, | |
| "learning_rate": 1.924105650284186e-05, | |
| "loss": 0.1405, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 20.424579620361328, | |
| "learning_rate": 1.907388833166165e-05, | |
| "loss": 0.1594, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 5.207544803619385, | |
| "learning_rate": 1.8906720160481445e-05, | |
| "loss": 0.1323, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 8.750362396240234, | |
| "learning_rate": 1.8739551989301237e-05, | |
| "loss": 0.1683, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 2.464329481124878, | |
| "learning_rate": 1.857238381812103e-05, | |
| "loss": 0.1388, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 3.784031867980957, | |
| "learning_rate": 1.8405215646940823e-05, | |
| "loss": 0.149, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 2.632542610168457, | |
| "learning_rate": 1.8238047475760618e-05, | |
| "loss": 0.1284, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 11.050533294677734, | |
| "learning_rate": 1.807087930458041e-05, | |
| "loss": 0.1525, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 7.363661766052246, | |
| "learning_rate": 1.7903711133400204e-05, | |
| "loss": 0.1481, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 9.882287979125977, | |
| "learning_rate": 1.7736542962219992e-05, | |
| "loss": 0.1231, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 24.93657684326172, | |
| "learning_rate": 1.7569374791039787e-05, | |
| "loss": 0.1332, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 2.2802133560180664, | |
| "learning_rate": 1.740220661985958e-05, | |
| "loss": 0.1425, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 1.5991661548614502, | |
| "learning_rate": 1.7235038448679374e-05, | |
| "loss": 0.1283, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 8.344457626342773, | |
| "learning_rate": 1.7067870277499165e-05, | |
| "loss": 0.1502, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 12.95904541015625, | |
| "learning_rate": 1.6900702106318957e-05, | |
| "loss": 0.1287, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 20.562625885009766, | |
| "learning_rate": 1.673353393513875e-05, | |
| "loss": 0.1422, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 4.20346736907959, | |
| "learning_rate": 1.6566365763958543e-05, | |
| "loss": 0.1082, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 25.636775970458984, | |
| "learning_rate": 1.6399197592778338e-05, | |
| "loss": 0.1416, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 23.23301887512207, | |
| "learning_rate": 1.6232029421598126e-05, | |
| "loss": 0.1497, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 22.21303939819336, | |
| "learning_rate": 1.606486125041792e-05, | |
| "loss": 0.1568, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 21.14128303527832, | |
| "learning_rate": 1.5897693079237712e-05, | |
| "loss": 0.139, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 22.63404083251953, | |
| "learning_rate": 1.5730524908057507e-05, | |
| "loss": 0.1518, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 13.030010223388672, | |
| "learning_rate": 1.55633567368773e-05, | |
| "loss": 0.1319, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 18.308670043945312, | |
| "learning_rate": 1.5396188565697094e-05, | |
| "loss": 0.1494, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 24.907419204711914, | |
| "learning_rate": 1.5229020394516885e-05, | |
| "loss": 0.1425, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 19.32282066345215, | |
| "learning_rate": 1.5061852223336678e-05, | |
| "loss": 0.1264, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 17.444271087646484, | |
| "learning_rate": 1.4894684052156472e-05, | |
| "loss": 0.14, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 1.832461953163147, | |
| "learning_rate": 1.4727515880976261e-05, | |
| "loss": 0.1438, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 10.410861015319824, | |
| "learning_rate": 1.4560347709796055e-05, | |
| "loss": 0.1393, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 3.6459202766418457, | |
| "learning_rate": 1.4393179538615848e-05, | |
| "loss": 0.1077, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 3.216399669647217, | |
| "learning_rate": 1.4226011367435641e-05, | |
| "loss": 0.1154, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 5.621729373931885, | |
| "learning_rate": 1.4058843196255434e-05, | |
| "loss": 0.1208, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 5.559453010559082, | |
| "learning_rate": 1.3891675025075226e-05, | |
| "loss": 0.1441, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 22.32745933532715, | |
| "learning_rate": 1.3724506853895019e-05, | |
| "loss": 0.1176, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 4.509443759918213, | |
| "learning_rate": 1.3557338682714812e-05, | |
| "loss": 0.1382, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 15.154895782470703, | |
| "learning_rate": 1.3390170511534605e-05, | |
| "loss": 0.1475, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.8804099559783936, | |
| "learning_rate": 1.3223002340354398e-05, | |
| "loss": 0.1325, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 1.9917913675308228, | |
| "learning_rate": 1.3055834169174188e-05, | |
| "loss": 0.1255, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 16.314374923706055, | |
| "learning_rate": 1.2888665997993981e-05, | |
| "loss": 0.1275, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 5.355242729187012, | |
| "learning_rate": 1.2721497826813775e-05, | |
| "loss": 0.1185, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 20.218473434448242, | |
| "learning_rate": 1.2554329655633568e-05, | |
| "loss": 0.1203, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 1.39955735206604, | |
| "learning_rate": 1.2387161484453361e-05, | |
| "loss": 0.1636, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 17.855899810791016, | |
| "learning_rate": 1.2219993313273154e-05, | |
| "loss": 0.1369, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 14.41054630279541, | |
| "learning_rate": 1.2052825142092947e-05, | |
| "loss": 0.1245, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 11.451350212097168, | |
| "learning_rate": 1.1885656970912739e-05, | |
| "loss": 0.1508, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 9.41112995147705, | |
| "learning_rate": 1.171848879973253e-05, | |
| "loss": 0.125, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 29.826963424682617, | |
| "learning_rate": 1.1551320628552324e-05, | |
| "loss": 0.1545, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 11.454690933227539, | |
| "learning_rate": 1.1384152457372117e-05, | |
| "loss": 0.1353, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 12.364923477172852, | |
| "learning_rate": 1.121698428619191e-05, | |
| "loss": 0.1346, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 1.8181456327438354, | |
| "learning_rate": 1.1049816115011702e-05, | |
| "loss": 0.1092, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 30.87436866760254, | |
| "learning_rate": 1.0882647943831495e-05, | |
| "loss": 0.1059, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 16.423452377319336, | |
| "learning_rate": 1.0715479772651288e-05, | |
| "loss": 0.1157, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 27.86665153503418, | |
| "learning_rate": 1.0548311601471081e-05, | |
| "loss": 0.1317, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 24.479764938354492, | |
| "learning_rate": 1.0381143430290873e-05, | |
| "loss": 0.1184, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 1.4079170227050781, | |
| "learning_rate": 1.0213975259110666e-05, | |
| "loss": 0.1303, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 4.259897232055664, | |
| "learning_rate": 1.0046807087930459e-05, | |
| "loss": 0.1322, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.9113084665482534, | |
| "eval_f1": 0.9092055511030135, | |
| "eval_loss": 0.3260073661804199, | |
| "eval_precision": 0.9099757491171729, | |
| "eval_recall": 0.9113084665482534, | |
| "eval_runtime": 64.1166, | |
| "eval_samples_per_second": 131.713, | |
| "eval_steps_per_second": 8.235, | |
| "step": 23928 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 13.925552368164062, | |
| "learning_rate": 9.879638916750252e-06, | |
| "loss": 0.0687, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 0.18495211005210876, | |
| "learning_rate": 9.712470745570044e-06, | |
| "loss": 0.066, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 1.0808857679367065, | |
| "learning_rate": 9.545302574389837e-06, | |
| "loss": 0.0648, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 1.0073552131652832, | |
| "learning_rate": 9.378134403209628e-06, | |
| "loss": 0.071, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 15.166232109069824, | |
| "learning_rate": 9.210966232029422e-06, | |
| "loss": 0.0666, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 18.000640869140625, | |
| "learning_rate": 9.043798060849215e-06, | |
| "loss": 0.0778, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 1.214728593826294, | |
| "learning_rate": 8.876629889669008e-06, | |
| "loss": 0.07, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 1.982407808303833, | |
| "learning_rate": 8.7094617184888e-06, | |
| "loss": 0.0752, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 20.929153442382812, | |
| "learning_rate": 8.542293547308593e-06, | |
| "loss": 0.0785, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 0.8963820934295654, | |
| "learning_rate": 8.375125376128386e-06, | |
| "loss": 0.0524, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 3.5774483680725098, | |
| "learning_rate": 8.207957204948179e-06, | |
| "loss": 0.0692, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 3.7253074645996094, | |
| "learning_rate": 8.04078903376797e-06, | |
| "loss": 0.0641, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 1.2855291366577148, | |
| "learning_rate": 7.873620862587764e-06, | |
| "loss": 0.0699, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 1.9972455501556396, | |
| "learning_rate": 7.706452691407557e-06, | |
| "loss": 0.062, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 1.0809322595596313, | |
| "learning_rate": 7.539284520227349e-06, | |
| "loss": 0.058, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 3.876232862472534, | |
| "learning_rate": 7.3721163490471425e-06, | |
| "loss": 0.0693, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 6.069151878356934, | |
| "learning_rate": 7.204948177866934e-06, | |
| "loss": 0.0617, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 0.895815372467041, | |
| "learning_rate": 7.037780006686727e-06, | |
| "loss": 0.0623, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 0.4176822602748871, | |
| "learning_rate": 6.8706118355065195e-06, | |
| "loss": 0.0833, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 0.6760619878768921, | |
| "learning_rate": 6.703443664326313e-06, | |
| "loss": 0.0567, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 14.889734268188477, | |
| "learning_rate": 6.536275493146106e-06, | |
| "loss": 0.053, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 0.5385121703147888, | |
| "learning_rate": 6.369107321965897e-06, | |
| "loss": 0.0703, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 6.336006164550781, | |
| "learning_rate": 6.201939150785691e-06, | |
| "loss": 0.063, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 0.20758749544620514, | |
| "learning_rate": 6.034770979605484e-06, | |
| "loss": 0.0753, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 11.717066764831543, | |
| "learning_rate": 5.867602808425276e-06, | |
| "loss": 0.0598, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 26.475128173828125, | |
| "learning_rate": 5.7004346372450685e-06, | |
| "loss": 0.064, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 20.872194290161133, | |
| "learning_rate": 5.533266466064862e-06, | |
| "loss": 0.0708, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 1.2749828100204468, | |
| "learning_rate": 5.366098294884654e-06, | |
| "loss": 0.0705, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 6.7912702560424805, | |
| "learning_rate": 5.198930123704447e-06, | |
| "loss": 0.0742, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 10.904654502868652, | |
| "learning_rate": 5.03176195252424e-06, | |
| "loss": 0.0665, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 6.191511154174805, | |
| "learning_rate": 4.864593781344033e-06, | |
| "loss": 0.0549, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 2.479524850845337, | |
| "learning_rate": 4.697425610163825e-06, | |
| "loss": 0.0539, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 0.7285805940628052, | |
| "learning_rate": 4.5302574389836175e-06, | |
| "loss": 0.0662, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 4.313304901123047, | |
| "learning_rate": 4.363089267803411e-06, | |
| "loss": 0.0571, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 17.61699867248535, | |
| "learning_rate": 4.195921096623203e-06, | |
| "loss": 0.0634, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 1.3776081800460815, | |
| "learning_rate": 4.028752925442996e-06, | |
| "loss": 0.0526, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 0.36369597911834717, | |
| "learning_rate": 3.8615847542627886e-06, | |
| "loss": 0.0669, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 4.591643333435059, | |
| "learning_rate": 3.6944165830825813e-06, | |
| "loss": 0.0578, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 0.930225670337677, | |
| "learning_rate": 3.5272484119023737e-06, | |
| "loss": 0.0456, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 1.136043906211853, | |
| "learning_rate": 3.360080240722167e-06, | |
| "loss": 0.0617, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 0.6426201462745667, | |
| "learning_rate": 3.1929120695419596e-06, | |
| "loss": 0.0568, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 2.6884241104125977, | |
| "learning_rate": 3.025743898361752e-06, | |
| "loss": 0.0606, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 0.4525424838066101, | |
| "learning_rate": 2.8585757271815448e-06, | |
| "loss": 0.066, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 1.0276681184768677, | |
| "learning_rate": 2.6914075560013375e-06, | |
| "loss": 0.0444, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 7.886939525604248, | |
| "learning_rate": 2.5242393848211303e-06, | |
| "loss": 0.065, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 0.37203583121299744, | |
| "learning_rate": 2.357071213640923e-06, | |
| "loss": 0.0559, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 6.219501495361328, | |
| "learning_rate": 2.1899030424607154e-06, | |
| "loss": 0.07, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 8.10631275177002, | |
| "learning_rate": 2.022734871280508e-06, | |
| "loss": 0.0623, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 24.999059677124023, | |
| "learning_rate": 1.855566700100301e-06, | |
| "loss": 0.0701, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 3.5445597171783447, | |
| "learning_rate": 1.6883985289200935e-06, | |
| "loss": 0.0561, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 11.693018913269043, | |
| "learning_rate": 1.5212303577398863e-06, | |
| "loss": 0.062, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 17.059640884399414, | |
| "learning_rate": 1.354062186559679e-06, | |
| "loss": 0.0663, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 3.2128794193267822, | |
| "learning_rate": 1.1868940153794718e-06, | |
| "loss": 0.0541, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 1.6803439855575562, | |
| "learning_rate": 1.0197258441992646e-06, | |
| "loss": 0.0619, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 7.980160236358643, | |
| "learning_rate": 8.525576730190572e-07, | |
| "loss": 0.0649, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 0.3919593393802643, | |
| "learning_rate": 6.853895018388499e-07, | |
| "loss": 0.0753, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 2.870180368423462, | |
| "learning_rate": 5.182213306586426e-07, | |
| "loss": 0.0461, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 0.5204899907112122, | |
| "learning_rate": 3.510531594784353e-07, | |
| "loss": 0.0446, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 2.318403482437134, | |
| "learning_rate": 1.8388498829822804e-07, | |
| "loss": 0.0588, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 1.1591626405715942, | |
| "learning_rate": 1.6716817118020728e-08, | |
| "loss": 0.0589, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.9182948490230906, | |
| "eval_f1": 0.9165254517429693, | |
| "eval_loss": 0.3342040479183197, | |
| "eval_precision": 0.9170562701684628, | |
| "eval_recall": 0.9182948490230906, | |
| "eval_runtime": 63.9141, | |
| "eval_samples_per_second": 132.131, | |
| "eval_steps_per_second": 8.261, | |
| "step": 29910 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 29910, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "total_flos": 1.15579279766016e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |