{ "best_global_step": 5771, "best_metric": 3.394857168197632, "best_model_checkpoint": "sindhibert_session3/checkpoint-5771", "epoch": 1.0, "eval_steps": 5771, "global_step": 5771, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017328019407381736, "grad_norm": 9.74232006072998, "learning_rate": 5.147313691507799e-06, "loss": 16.534342041015623, "step": 100 }, { "epoch": 0.03465603881476347, "grad_norm": 9.413031578063965, "learning_rate": 1.0346620450606586e-05, "loss": 16.06064208984375, "step": 200 }, { "epoch": 0.05198405822214521, "grad_norm": 9.366157531738281, "learning_rate": 1.554592720970537e-05, "loss": 15.73246337890625, "step": 300 }, { "epoch": 0.06931207762952694, "grad_norm": 8.934579849243164, "learning_rate": 2.074523396880416e-05, "loss": 15.634798583984375, "step": 400 }, { "epoch": 0.08664009703690868, "grad_norm": 9.873139381408691, "learning_rate": 2.594454072790295e-05, "loss": 15.491142578125, "step": 500 }, { "epoch": 0.10396811644429042, "grad_norm": 9.112743377685547, "learning_rate": 2.9999702019626288e-05, "loss": 15.47271728515625, "step": 600 }, { "epoch": 0.12129613585167215, "grad_norm": 8.721996307373047, "learning_rate": 2.999083739047451e-05, "loss": 15.291612548828125, "step": 700 }, { "epoch": 0.1386241552590539, "grad_norm": 8.849467277526855, "learning_rate": 2.9969667845201166e-05, "loss": 15.32687255859375, "step": 800 }, { "epoch": 0.15595217466643563, "grad_norm": 8.970343589782715, "learning_rate": 2.9936210760385845e-05, "loss": 15.221800537109376, "step": 900 }, { "epoch": 0.17328019407381737, "grad_norm": 9.423188209533691, "learning_rate": 2.9890493598578603e-05, "loss": 15.21154541015625, "step": 1000 }, { "epoch": 0.1906082134811991, "grad_norm": 10.529290199279785, "learning_rate": 2.9832553885757926e-05, "loss": 15.091610107421875, "step": 1100 }, { "epoch": 0.20793623288858085, "grad_norm": 8.895530700683594, "learning_rate": 2.97624391805283e-05, "loss": 15.116024169921875, "step": 1200 }, { "epoch": 0.22526425229596256, "grad_norm": 9.481012344360352, "learning_rate": 2.968020703508272e-05, "loss": 15.086820068359375, "step": 1300 }, { "epoch": 0.2425922717033443, "grad_norm": 8.957283020019531, "learning_rate": 2.9585924947962195e-05, "loss": 15.09182373046875, "step": 1400 }, { "epoch": 0.25992029111072606, "grad_norm": 8.475807189941406, "learning_rate": 2.9479670308650942e-05, "loss": 14.974696044921876, "step": 1500 }, { "epoch": 0.2772483105181078, "grad_norm": 8.860872268676758, "learning_rate": 2.9361530334052883e-05, "loss": 14.967041015625, "step": 1600 }, { "epoch": 0.29457632992548954, "grad_norm": 8.990629196166992, "learning_rate": 2.9231601996901433e-05, "loss": 14.9465673828125, "step": 1700 }, { "epoch": 0.31190434933287126, "grad_norm": 9.683910369873047, "learning_rate": 2.9089991946161484e-05, "loss": 14.9761962890625, "step": 1800 }, { "epoch": 0.32923236874025297, "grad_norm": 9.044540405273438, "learning_rate": 2.89368164194888e-05, "loss": 14.89200927734375, "step": 1900 }, { "epoch": 0.34656038814763473, "grad_norm": 8.935420036315918, "learning_rate": 2.8772201147818787e-05, "loss": 14.9054736328125, "step": 2000 }, { "epoch": 0.36388840755501645, "grad_norm": 8.12104320526123, "learning_rate": 2.8596281252162868e-05, "loss": 14.8011767578125, "step": 2100 }, { "epoch": 0.3812164269623982, "grad_norm": 9.633867263793945, "learning_rate": 2.840920113269721e-05, "loss": 14.789473876953124, "step": 2200 }, { "epoch": 0.3985444463697799, "grad_norm": 9.07466983795166, "learning_rate": 2.8211114350234873e-05, "loss": 14.80165283203125, "step": 2300 }, { "epoch": 0.4158724657771617, "grad_norm": 9.412736892700195, "learning_rate": 2.8002183500178594e-05, "loss": 14.746627197265624, "step": 2400 }, { "epoch": 0.4332004851845434, "grad_norm": 9.755793571472168, "learning_rate": 2.7782580079057772e-05, "loss": 14.778804931640625, "step": 2500 }, { "epoch": 0.4505285045919251, "grad_norm": 9.882634162902832, "learning_rate": 2.7552484343759096e-05, "loss": 14.704544677734376, "step": 2600 }, { "epoch": 0.4678565239993069, "grad_norm": 9.305146217346191, "learning_rate": 2.731208516356645e-05, "loss": 14.75770751953125, "step": 2700 }, { "epoch": 0.4851845434066886, "grad_norm": 9.269790649414062, "learning_rate": 2.7061579865131508e-05, "loss": 14.68646484375, "step": 2800 }, { "epoch": 0.5025125628140703, "grad_norm": 9.310648918151855, "learning_rate": 2.6801174070502248e-05, "loss": 14.635621337890624, "step": 2900 }, { "epoch": 0.5198405822214521, "grad_norm": 9.239577293395996, "learning_rate": 2.653108152834241e-05, "loss": 14.71250732421875, "step": 3000 }, { "epoch": 0.5371686016288338, "grad_norm": 9.674842834472656, "learning_rate": 2.6251523938480346e-05, "loss": 14.602254638671875, "step": 3100 }, { "epoch": 0.5544966210362156, "grad_norm": 10.178524017333984, "learning_rate": 2.5962730769931346e-05, "loss": 14.558492431640625, "step": 3200 }, { "epoch": 0.5718246404435973, "grad_norm": 9.312729835510254, "learning_rate": 2.5664939072542787e-05, "loss": 14.588648681640626, "step": 3300 }, { "epoch": 0.5891526598509791, "grad_norm": 9.438308715820312, "learning_rate": 2.5358393282416714e-05, "loss": 14.535865478515625, "step": 3400 }, { "epoch": 0.6064806792583608, "grad_norm": 8.51146125793457, "learning_rate": 2.5043345021269554e-05, "loss": 14.5489208984375, "step": 3500 }, { "epoch": 0.6238086986657425, "grad_norm": 9.856837272644043, "learning_rate": 2.4720052889893698e-05, "loss": 14.565177001953124, "step": 3600 }, { "epoch": 0.6411367180731242, "grad_norm": 9.223260879516602, "learning_rate": 2.4388782255890405e-05, "loss": 14.452093505859375, "step": 3700 }, { "epoch": 0.6584647374805059, "grad_norm": 9.016181945800781, "learning_rate": 2.404980503584838e-05, "loss": 14.49298828125, "step": 3800 }, { "epoch": 0.6757927568878878, "grad_norm": 9.865802764892578, "learning_rate": 2.370339947214669e-05, "loss": 14.474598388671875, "step": 3900 }, { "epoch": 0.6931207762952695, "grad_norm": 8.965621948242188, "learning_rate": 2.3349849904565318e-05, "loss": 14.46911376953125, "step": 4000 }, { "epoch": 0.7104487957026512, "grad_norm": 8.362798690795898, "learning_rate": 2.2989446536890786e-05, "loss": 14.390712890625, "step": 4100 }, { "epoch": 0.7277768151100329, "grad_norm": 10.564478874206543, "learning_rate": 2.2622485198708445e-05, "loss": 14.45989501953125, "step": 4200 }, { "epoch": 0.7451048345174146, "grad_norm": 9.188340187072754, "learning_rate": 2.2249267102576903e-05, "loss": 14.422335205078125, "step": 4300 }, { "epoch": 0.7624328539247964, "grad_norm": 9.867836952209473, "learning_rate": 2.1870098596784012e-05, "loss": 14.341461181640625, "step": 4400 }, { "epoch": 0.7797608733321781, "grad_norm": 9.469503402709961, "learning_rate": 2.148529091388725e-05, "loss": 14.42570556640625, "step": 4500 }, { "epoch": 0.7970888927395599, "grad_norm": 9.195992469787598, "learning_rate": 2.1095159915244956e-05, "loss": 14.3226025390625, "step": 4600 }, { "epoch": 0.8144169121469416, "grad_norm": 9.930395126342773, "learning_rate": 2.070002583174816e-05, "loss": 14.317152099609375, "step": 4700 }, { "epoch": 0.8317449315543234, "grad_norm": 9.45024299621582, "learning_rate": 2.0300213000965707e-05, "loss": 14.355799560546876, "step": 4800 }, { "epoch": 0.8490729509617051, "grad_norm": 9.889897346496582, "learning_rate": 1.989604960091854e-05, "loss": 14.314393310546874, "step": 4900 }, { "epoch": 0.8664009703690868, "grad_norm": 10.8844575881958, "learning_rate": 1.948786738070162e-05, "loss": 14.279014892578125, "step": 5000 }, { "epoch": 0.8837289897764685, "grad_norm": 9.387309074401855, "learning_rate": 1.9076001388174608e-05, "loss": 14.240478515625, "step": 5100 }, { "epoch": 0.9010570091838502, "grad_norm": 10.535667419433594, "learning_rate": 1.866078969494479e-05, "loss": 14.26585205078125, "step": 5200 }, { "epoch": 0.918385028591232, "grad_norm": 9.147391319274902, "learning_rate": 1.8242573118868094e-05, "loss": 14.309058837890625, "step": 5300 }, { "epoch": 0.9357130479986138, "grad_norm": 9.556977272033691, "learning_rate": 1.7821694944295836e-05, "loss": 14.21564453125, "step": 5400 }, { "epoch": 0.9530410674059955, "grad_norm": 9.025933265686035, "learning_rate": 1.7398500640296928e-05, "loss": 14.192568359375, "step": 5500 }, { "epoch": 0.9703690868133772, "grad_norm": 9.630436897277832, "learning_rate": 1.6973337577086803e-05, "loss": 14.193314208984376, "step": 5600 }, { "epoch": 0.987697106220759, "grad_norm": 9.064878463745117, "learning_rate": 1.6546554740895815e-05, "loss": 14.1739111328125, "step": 5700 }, { "epoch": 1.0, "eval_loss": 3.394857168197632, "eval_runtime": 22.6074, "eval_samples_per_second": 660.048, "eval_steps_per_second": 10.351, "step": 5771 } ], "logging_steps": 100, "max_steps": 11542, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5771, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.888486575810888e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }