{ "best_global_step": 5780, "best_metric": 3.4774351119995117, "best_model_checkpoint": "sindhibert_session6/checkpoint-5780", "epoch": 2.0, "eval_steps": 2890, "global_step": 5780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03460207612456748, "grad_norm": 5.271186828613281, "learning_rate": 8.583815028901734e-07, "loss": 14.7112890625, "step": 100 }, { "epoch": 0.06920415224913495, "grad_norm": 5.328507900238037, "learning_rate": 1.7254335260115608e-06, "loss": 14.65601806640625, "step": 200 }, { "epoch": 0.10380622837370242, "grad_norm": 5.2988433837890625, "learning_rate": 2.592485549132948e-06, "loss": 14.6437841796875, "step": 300 }, { "epoch": 0.1384083044982699, "grad_norm": 5.144472122192383, "learning_rate": 2.9992958916410005e-06, "loss": 14.644112548828126, "step": 400 }, { "epoch": 0.17301038062283736, "grad_norm": 5.255625247955322, "learning_rate": 2.994135629312096e-06, "loss": 14.592845458984375, "step": 500 }, { "epoch": 0.20761245674740483, "grad_norm": 5.365286827087402, "learning_rate": 2.9839827352831524e-06, "loss": 14.59123779296875, "step": 600 }, { "epoch": 0.2422145328719723, "grad_norm": 5.114892959594727, "learning_rate": 2.96887113529664e-06, "loss": 14.588660888671875, "step": 700 }, { "epoch": 0.2768166089965398, "grad_norm": 5.162683486938477, "learning_rate": 2.948851324536296e-06, "loss": 14.5647998046875, "step": 800 }, { "epoch": 0.31141868512110726, "grad_norm": 5.2245635986328125, "learning_rate": 2.9239901988982294e-06, "loss": 14.566002197265625, "step": 900 }, { "epoch": 0.3460207612456747, "grad_norm": 5.22702169418335, "learning_rate": 2.8943708314592917e-06, "loss": 14.56361328125, "step": 1000 }, { "epoch": 0.3806228373702422, "grad_norm": 5.353369235992432, "learning_rate": 2.8600921948896393e-06, "loss": 14.56337890625, "step": 1100 }, { "epoch": 0.41522491349480967, "grad_norm": 5.189014911651611, "learning_rate": 2.821268830737051e-06, "loss": 14.566427001953125, "step": 1200 }, { "epoch": 0.44982698961937717, "grad_norm": 5.108746528625488, "learning_rate": 2.7780304666880683e-06, "loss": 14.5394287109375, "step": 1300 }, { "epoch": 0.4844290657439446, "grad_norm": 5.1513895988464355, "learning_rate": 2.7305215830848867e-06, "loss": 14.531617431640624, "step": 1400 }, { "epoch": 0.5190311418685121, "grad_norm": 5.2011942863464355, "learning_rate": 2.678900930146467e-06, "loss": 14.5282666015625, "step": 1500 }, { "epoch": 0.5536332179930796, "grad_norm": 5.219590663909912, "learning_rate": 2.6233409975070707e-06, "loss": 14.489625244140624, "step": 1600 }, { "epoch": 0.5882352941176471, "grad_norm": 5.411579132080078, "learning_rate": 2.5640274378447445e-06, "loss": 14.49999755859375, "step": 1700 }, { "epoch": 0.6228373702422145, "grad_norm": 5.120180606842041, "learning_rate": 2.5011584465256946e-06, "loss": 14.515745849609376, "step": 1800 }, { "epoch": 0.657439446366782, "grad_norm": 5.3347883224487305, "learning_rate": 2.434944099337454e-06, "loss": 14.52021728515625, "step": 1900 }, { "epoch": 0.6920415224913494, "grad_norm": 4.978041172027588, "learning_rate": 2.365605650523803e-06, "loss": 14.50703857421875, "step": 2000 }, { "epoch": 0.726643598615917, "grad_norm": 5.3864898681640625, "learning_rate": 2.293374793467048e-06, "loss": 14.493427734375, "step": 2100 }, { "epoch": 0.7612456747404844, "grad_norm": 5.149505138397217, "learning_rate": 2.2184928864880712e-06, "loss": 14.453104248046875, "step": 2200 }, { "epoch": 0.7958477508650519, "grad_norm": 5.451559066772461, "learning_rate": 2.1412101463511406e-06, "loss": 14.48880126953125, "step": 2300 }, { "epoch": 0.8304498269896193, "grad_norm": 5.231166839599609, "learning_rate": 2.0617848121683582e-06, "loss": 14.46824951171875, "step": 2400 }, { "epoch": 0.8650519031141869, "grad_norm": 5.132116794586182, "learning_rate": 1.9804822824975567e-06, "loss": 14.45693603515625, "step": 2500 }, { "epoch": 0.8996539792387543, "grad_norm": 5.454078197479248, "learning_rate": 1.8975742285170185e-06, "loss": 14.468741455078124, "step": 2600 }, { "epoch": 0.9342560553633218, "grad_norm": 5.184955596923828, "learning_rate": 1.8133376862403233e-06, "loss": 14.482393798828125, "step": 2700 }, { "epoch": 0.9688581314878892, "grad_norm": 5.030216693878174, "learning_rate": 1.7280541308046812e-06, "loss": 14.4536962890625, "step": 2800 }, { "epoch": 1.0, "eval_loss": 3.484435558319092, "eval_runtime": 11.6834, "eval_samples_per_second": 639.538, "eval_steps_per_second": 10.014, "step": 2890 }, { "epoch": 1.0034602076124568, "grad_norm": 5.57059383392334, "learning_rate": 1.642008535926004e-06, "loss": 14.452493896484375, "step": 2900 }, { "epoch": 1.0380622837370241, "grad_norm": 5.143775939941406, "learning_rate": 1.555488421663523e-06, "loss": 14.467264404296875, "step": 3000 }, { "epoch": 1.0726643598615917, "grad_norm": 5.192799091339111, "learning_rate": 1.4687828936758435e-06, "loss": 14.464970703125, "step": 3100 }, { "epoch": 1.1072664359861593, "grad_norm": 5.265404224395752, "learning_rate": 1.3821816771787413e-06, "loss": 14.4964404296875, "step": 3200 }, { "epoch": 1.1418685121107266, "grad_norm": 5.28303337097168, "learning_rate": 1.295974148832716e-06, "loss": 14.460384521484375, "step": 3300 }, { "epoch": 1.1764705882352942, "grad_norm": 5.456825256347656, "learning_rate": 1.2104483697952423e-06, "loss": 14.472330322265625, "step": 3400 }, { "epoch": 1.2110726643598615, "grad_norm": 5.192461013793945, "learning_rate": 1.1258901231687475e-06, "loss": 14.4242919921875, "step": 3500 }, { "epoch": 1.245674740484429, "grad_norm": 5.02662992477417, "learning_rate": 1.0425819590606824e-06, "loss": 14.4378662109375, "step": 3600 }, { "epoch": 1.2802768166089966, "grad_norm": 5.156518459320068, "learning_rate": 9.608022504465906e-07, "loss": 14.4332080078125, "step": 3700 }, { "epoch": 1.314878892733564, "grad_norm": 5.137354850769043, "learning_rate": 8.808242629910054e-07, "loss": 14.42055419921875, "step": 3800 }, { "epoch": 1.3494809688581315, "grad_norm": 4.931822299957275, "learning_rate": 8.029152419343472e-07, "loss": 14.446268310546875, "step": 3900 }, { "epoch": 1.3840830449826989, "grad_norm": 5.438608169555664, "learning_rate": 7.273355190969783e-07, "loss": 14.436180419921875, "step": 4000 }, { "epoch": 1.4186851211072664, "grad_norm": 5.530498027801514, "learning_rate": 6.54337642984345e-07, "loss": 14.427529296875, "step": 4100 }, { "epoch": 1.453287197231834, "grad_norm": 5.223482608795166, "learning_rate": 5.841655348999535e-07, "loss": 14.415745849609374, "step": 4200 }, { "epoch": 1.4878892733564013, "grad_norm": 5.245316982269287, "learning_rate": 5.170536738860046e-07, "loss": 14.42922119140625, "step": 4300 }, { "epoch": 1.5224913494809689, "grad_norm": 5.100404739379883, "learning_rate": 4.532263132152101e-07, "loss": 14.461243896484374, "step": 4400 }, { "epoch": 1.5570934256055362, "grad_norm": 5.252062797546387, "learning_rate": 3.928967310518557e-07, "loss": 14.419617919921874, "step": 4500 }, { "epoch": 1.5916955017301038, "grad_norm": 5.039636611938477, "learning_rate": 3.362665177860309e-07, "loss": 14.477161865234375, "step": 4600 }, { "epoch": 1.6262975778546713, "grad_norm": 5.289979457855225, "learning_rate": 2.8352490242237476e-07, "loss": 14.38912109375, "step": 4700 }, { "epoch": 1.6608996539792389, "grad_norm": 5.163676738739014, "learning_rate": 2.348481202742086e-07, "loss": 14.403753662109375, "step": 4800 }, { "epoch": 1.6955017301038062, "grad_norm": 5.600104331970215, "learning_rate": 1.9039882407588976e-07, "loss": 14.447159423828126, "step": 4900 }, { "epoch": 1.7301038062283736, "grad_norm": 5.240525722503662, "learning_rate": 1.503255404811511e-07, "loss": 14.401494140625, "step": 5000 }, { "epoch": 1.7647058823529411, "grad_norm": 5.186587810516357, "learning_rate": 1.1476217376352293e-07, "loss": 14.4153125, "step": 5100 }, { "epoch": 1.7993079584775087, "grad_norm": 5.226868152618408, "learning_rate": 8.382755837722234e-08, "loss": 14.4057421875, "step": 5200 }, { "epoch": 1.8339100346020762, "grad_norm": 5.335669994354248, "learning_rate": 5.762506187361205e-08, "loss": 14.435169677734375, "step": 5300 }, { "epoch": 1.8685121107266436, "grad_norm": 5.08687686920166, "learning_rate": 3.624223950008548e-08, "loss": 14.43446533203125, "step": 5400 }, { "epoch": 1.903114186851211, "grad_norm": 5.379976272583008, "learning_rate": 1.975054163552692e-08, "loss": 14.415914306640625, "step": 5500 }, { "epoch": 1.9377162629757785, "grad_norm": 5.047989845275879, "learning_rate": 8.205075039945553e-09, "loss": 14.41682861328125, "step": 5600 }, { "epoch": 1.972318339100346, "grad_norm": 5.379687786102295, "learning_rate": 1.6444187160679635e-09, "loss": 14.4242138671875, "step": 5700 }, { "epoch": 2.0, "eval_loss": 3.4774351119995117, "eval_runtime": 11.6654, "eval_samples_per_second": 640.524, "eval_steps_per_second": 10.03, "step": 5780 } ], "logging_steps": 100, "max_steps": 5780, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2890, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8943456160818586e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }