{ "best_global_step": 5886, "best_metric": 3.5591108798980713, "best_model_checkpoint": "sindhibert_session4/checkpoint-5886", "epoch": 3.0, "eval_steps": 1962, "global_step": 5886, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05098139179199592, "grad_norm": 4.590001106262207, "learning_rate": 5.609065155807366e-06, "loss": 15.86372314453125, "step": 100 }, { "epoch": 0.10196278358399184, "grad_norm": 5.000253677368164, "learning_rate": 1.1274787535410765e-05, "loss": 15.6683056640625, "step": 200 }, { "epoch": 0.15294417537598776, "grad_norm": 5.164661407470703, "learning_rate": 1.6940509915014164e-05, "loss": 15.58547607421875, "step": 300 }, { "epoch": 0.20392556716798368, "grad_norm": 4.895200729370117, "learning_rate": 1.999658933249201e-05, "loss": 15.5261376953125, "step": 400 }, { "epoch": 0.2549069589599796, "grad_norm": 5.010247707366943, "learning_rate": 1.9965659596003744e-05, "loss": 15.493291015625, "step": 500 }, { "epoch": 0.3058883507519755, "grad_norm": 4.85853910446167, "learning_rate": 1.990261043359342e-05, "loss": 15.43971435546875, "step": 600 }, { "epoch": 0.35686974254397147, "grad_norm": 4.788653373718262, "learning_rate": 1.9807645053376055e-05, "loss": 15.409666748046876, "step": 700 }, { "epoch": 0.40785113433596737, "grad_norm": 4.742185592651367, "learning_rate": 1.968106952977309e-05, "loss": 15.346304931640624, "step": 800 }, { "epoch": 0.45883252612796327, "grad_norm": 4.758422374725342, "learning_rate": 1.9523291817031276e-05, "loss": 15.344024658203125, "step": 900 }, { "epoch": 0.5098139179199592, "grad_norm": 4.854381084442139, "learning_rate": 1.933482043438185e-05, "loss": 15.307811279296875, "step": 1000 }, { "epoch": 0.5607953097119551, "grad_norm": 4.7934041023254395, "learning_rate": 1.9116262827077703e-05, "loss": 15.254422607421875, "step": 1100 }, { "epoch": 0.611776701503951, "grad_norm": 4.670731544494629, "learning_rate": 1.88683234085909e-05, "loss": 15.23345703125, "step": 1200 }, { "epoch": 0.6627580932959469, "grad_norm": 4.993561267852783, "learning_rate": 1.8591801290280664e-05, "loss": 15.2450927734375, "step": 1300 }, { "epoch": 0.7137394850879429, "grad_norm": 4.720964431762695, "learning_rate": 1.8287587705849013e-05, "loss": 15.1839599609375, "step": 1400 }, { "epoch": 0.7647208768799388, "grad_norm": 5.050419330596924, "learning_rate": 1.7956663138885173e-05, "loss": 15.164833984375, "step": 1500 }, { "epoch": 0.8157022686719347, "grad_norm": 4.826648712158203, "learning_rate": 1.760009416275661e-05, "loss": 15.130496826171875, "step": 1600 }, { "epoch": 0.8666836604639306, "grad_norm": 4.858438014984131, "learning_rate": 1.721903000303185e-05, "loss": 15.125797119140625, "step": 1700 }, { "epoch": 0.9176650522559265, "grad_norm": 4.9611430168151855, "learning_rate": 1.6814698833514326e-05, "loss": 15.13617431640625, "step": 1800 }, { "epoch": 0.9686464440479226, "grad_norm": 4.663859844207764, "learning_rate": 1.63884038178253e-05, "loss": 15.072591552734375, "step": 1900 }, { "epoch": 1.0, "eval_loss": 3.636704444885254, "eval_runtime": 8.0138, "eval_samples_per_second": 632.91, "eval_steps_per_second": 9.983, "step": 1962 }, { "epoch": 1.0193729288809585, "grad_norm": 4.863068103790283, "learning_rate": 1.5941518909293737e-05, "loss": 14.968798828125, "step": 2000 }, { "epoch": 1.0703543206729544, "grad_norm": 5.036495685577393, "learning_rate": 1.5475484422690282e-05, "loss": 15.0290869140625, "step": 2100 }, { "epoch": 1.1213357124649503, "grad_norm": 5.248174667358398, "learning_rate": 1.4991802392077543e-05, "loss": 15.004036865234376, "step": 2200 }, { "epoch": 1.1723171042569462, "grad_norm": 4.950564384460449, "learning_rate": 1.4492031729738489e-05, "loss": 15.002611083984375, "step": 2300 }, { "epoch": 1.2232984960489421, "grad_norm": 4.509192943572998, "learning_rate": 1.3977783201785732e-05, "loss": 14.96060302734375, "step": 2400 }, { "epoch": 1.274279887840938, "grad_norm": 4.900182723999023, "learning_rate": 1.3450714236645352e-05, "loss": 14.971297607421874, "step": 2500 }, { "epoch": 1.325261279632934, "grad_norm": 5.138764381408691, "learning_rate": 1.2912523583147625e-05, "loss": 14.928385009765625, "step": 2600 }, { "epoch": 1.3762426714249298, "grad_norm": 4.894199848175049, "learning_rate": 1.2364945835441636e-05, "loss": 14.938167724609375, "step": 2700 }, { "epoch": 1.4272240632169257, "grad_norm": 4.8737921714782715, "learning_rate": 1.1809745842380042e-05, "loss": 14.923902587890625, "step": 2800 }, { "epoch": 1.4782054550089216, "grad_norm": 4.8258819580078125, "learning_rate": 1.1248713019392635e-05, "loss": 14.89677001953125, "step": 2900 }, { "epoch": 1.5291868468009175, "grad_norm": 4.769787788391113, "learning_rate": 1.0683655581181524e-05, "loss": 14.87692626953125, "step": 3000 }, { "epoch": 1.5801682385929134, "grad_norm": 4.92316198348999, "learning_rate": 1.0116394713826117e-05, "loss": 14.849693603515625, "step": 3100 }, { "epoch": 1.6311496303849093, "grad_norm": 4.873258590698242, "learning_rate": 9.548758705081177e-06, "loss": 14.833634033203126, "step": 3200 }, { "epoch": 1.6821310221769055, "grad_norm": 4.738825798034668, "learning_rate": 8.98257705178612e-06, "loss": 14.85665283203125, "step": 3300 }, { "epoch": 1.7331124139689014, "grad_norm": 4.907736778259277, "learning_rate": 8.419674563377416e-06, "loss": 14.8664599609375, "step": 3400 }, { "epoch": 1.7840938057608973, "grad_norm": 4.977413177490234, "learning_rate": 7.861865480508541e-06, "loss": 14.83008056640625, "step": 3500 }, { "epoch": 1.8350751975528932, "grad_norm": 4.792273044586182, "learning_rate": 7.310947627733231e-06, "loss": 14.81404541015625, "step": 3600 }, { "epoch": 1.886056589344889, "grad_norm": 4.84648323059082, "learning_rate": 6.768696619097996e-06, "loss": 14.831793212890625, "step": 3700 }, { "epoch": 1.9370379811368852, "grad_norm": 4.854404449462891, "learning_rate": 6.236860135319321e-06, "loss": 14.826976318359375, "step": 3800 }, { "epoch": 1.988019372928881, "grad_norm": 4.615888595581055, "learning_rate": 5.717152290990302e-06, "loss": 14.767562255859374, "step": 3900 }, { "epoch": 2.0, "eval_loss": 3.56946063041687, "eval_runtime": 8.0481, "eval_samples_per_second": 630.208, "eval_steps_per_second": 9.94, "step": 3924 }, { "epoch": 2.038745857761917, "grad_norm": 5.015805721282959, "learning_rate": 5.211248109971254e-06, "loss": 14.695634765625, "step": 4000 }, { "epoch": 2.089727249553913, "grad_norm": 4.800245761871338, "learning_rate": 4.720778126770141e-06, "loss": 14.764068603515625, "step": 4100 }, { "epoch": 2.140708641345909, "grad_norm": 4.756154537200928, "learning_rate": 4.247323131312676e-06, "loss": 14.755054931640625, "step": 4200 }, { "epoch": 2.191690033137905, "grad_norm": 4.989803314208984, "learning_rate": 3.7924090740397178e-06, "loss": 14.760721435546875, "step": 4300 }, { "epoch": 2.2426714249299007, "grad_norm": 4.568801403045654, "learning_rate": 3.3575021477529313e-06, "loss": 14.72455810546875, "step": 4400 }, { "epoch": 2.2936528167218966, "grad_norm": 4.871072769165039, "learning_rate": 2.944004062059924e-06, "loss": 14.743800048828126, "step": 4500 }, { "epoch": 2.3446342085138925, "grad_norm": 4.790256500244141, "learning_rate": 2.5532475256494073e-06, "loss": 14.7241162109375, "step": 4600 }, { "epoch": 2.3956156003058884, "grad_norm": 4.770144462585449, "learning_rate": 2.186491950957048e-06, "loss": 14.711162109375, "step": 4700 }, { "epoch": 2.4465969920978843, "grad_norm": 4.44427490234375, "learning_rate": 1.8449193950659018e-06, "loss": 14.72890625, "step": 4800 }, { "epoch": 2.49757838388988, "grad_norm": 4.664465427398682, "learning_rate": 1.5296307499239903e-06, "loss": 14.713804931640626, "step": 4900 }, { "epoch": 2.548559775681876, "grad_norm": 4.861291408538818, "learning_rate": 1.2416421941579448e-06, "loss": 14.730694580078126, "step": 5000 }, { "epoch": 2.599541167473872, "grad_norm": 4.662012577056885, "learning_rate": 9.818819179185713e-07, "loss": 14.70477294921875, "step": 5100 }, { "epoch": 2.650522559265868, "grad_norm": 4.803001403808594, "learning_rate": 7.511871313142238e-07, "loss": 14.7314208984375, "step": 5200 }, { "epoch": 2.701503951057864, "grad_norm": 4.746646404266357, "learning_rate": 5.503013660737899e-07, "loss": 14.70580810546875, "step": 5300 }, { "epoch": 2.7524853428498597, "grad_norm": 4.867108345031738, "learning_rate": 3.798720791360988e-07, "loss": 14.710306396484375, "step": 5400 }, { "epoch": 2.8034667346418556, "grad_norm": 4.6949992179870605, "learning_rate": 2.404485658893807e-07, "loss": 14.725491943359375, "step": 5500 }, { "epoch": 2.8544481264338515, "grad_norm": 4.641607284545898, "learning_rate": 1.3248018978643695e-07, "loss": 14.7078369140625, "step": 5600 }, { "epoch": 2.905429518225848, "grad_norm": 4.756202220916748, "learning_rate": 5.6314934041501455e-08, "loss": 14.697396240234376, "step": 5700 }, { "epoch": 2.9564109100178433, "grad_norm": 4.691574573516846, "learning_rate": 1.2198280076668455e-08, "loss": 14.694278564453125, "step": 5800 }, { "epoch": 3.0, "eval_loss": 3.5591108798980713, "eval_runtime": 8.0338, "eval_samples_per_second": 631.333, "eval_steps_per_second": 9.958, "step": 5886 } ], "logging_steps": 100, "max_steps": 5886, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1962, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.964983111028869e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }