| { |
| "best_global_step": 5886, |
| "best_metric": 3.5591108798980713, |
| "best_model_checkpoint": "sindhibert_session4/checkpoint-5886", |
| "epoch": 3.0, |
| "eval_steps": 1962, |
| "global_step": 5886, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.05098139179199592, |
| "grad_norm": 4.590001106262207, |
| "learning_rate": 5.609065155807366e-06, |
| "loss": 15.86372314453125, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.10196278358399184, |
| "grad_norm": 5.000253677368164, |
| "learning_rate": 1.1274787535410765e-05, |
| "loss": 15.6683056640625, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.15294417537598776, |
| "grad_norm": 5.164661407470703, |
| "learning_rate": 1.6940509915014164e-05, |
| "loss": 15.58547607421875, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.20392556716798368, |
| "grad_norm": 4.895200729370117, |
| "learning_rate": 1.999658933249201e-05, |
| "loss": 15.5261376953125, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2549069589599796, |
| "grad_norm": 5.010247707366943, |
| "learning_rate": 1.9965659596003744e-05, |
| "loss": 15.493291015625, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3058883507519755, |
| "grad_norm": 4.85853910446167, |
| "learning_rate": 1.990261043359342e-05, |
| "loss": 15.43971435546875, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.35686974254397147, |
| "grad_norm": 4.788653373718262, |
| "learning_rate": 1.9807645053376055e-05, |
| "loss": 15.409666748046876, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.40785113433596737, |
| "grad_norm": 4.742185592651367, |
| "learning_rate": 1.968106952977309e-05, |
| "loss": 15.346304931640624, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.45883252612796327, |
| "grad_norm": 4.758422374725342, |
| "learning_rate": 1.9523291817031276e-05, |
| "loss": 15.344024658203125, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.5098139179199592, |
| "grad_norm": 4.854381084442139, |
| "learning_rate": 1.933482043438185e-05, |
| "loss": 15.307811279296875, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.5607953097119551, |
| "grad_norm": 4.7934041023254395, |
| "learning_rate": 1.9116262827077703e-05, |
| "loss": 15.254422607421875, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.611776701503951, |
| "grad_norm": 4.670731544494629, |
| "learning_rate": 1.88683234085909e-05, |
| "loss": 15.23345703125, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.6627580932959469, |
| "grad_norm": 4.993561267852783, |
| "learning_rate": 1.8591801290280664e-05, |
| "loss": 15.2450927734375, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.7137394850879429, |
| "grad_norm": 4.720964431762695, |
| "learning_rate": 1.8287587705849013e-05, |
| "loss": 15.1839599609375, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.7647208768799388, |
| "grad_norm": 5.050419330596924, |
| "learning_rate": 1.7956663138885173e-05, |
| "loss": 15.164833984375, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8157022686719347, |
| "grad_norm": 4.826648712158203, |
| "learning_rate": 1.760009416275661e-05, |
| "loss": 15.130496826171875, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.8666836604639306, |
| "grad_norm": 4.858438014984131, |
| "learning_rate": 1.721903000303185e-05, |
| "loss": 15.125797119140625, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.9176650522559265, |
| "grad_norm": 4.9611430168151855, |
| "learning_rate": 1.6814698833514326e-05, |
| "loss": 15.13617431640625, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.9686464440479226, |
| "grad_norm": 4.663859844207764, |
| "learning_rate": 1.63884038178253e-05, |
| "loss": 15.072591552734375, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 3.636704444885254, |
| "eval_runtime": 8.0138, |
| "eval_samples_per_second": 632.91, |
| "eval_steps_per_second": 9.983, |
| "step": 1962 |
| }, |
| { |
| "epoch": 1.0193729288809585, |
| "grad_norm": 4.863068103790283, |
| "learning_rate": 1.5941518909293737e-05, |
| "loss": 14.968798828125, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.0703543206729544, |
| "grad_norm": 5.036495685577393, |
| "learning_rate": 1.5475484422690282e-05, |
| "loss": 15.0290869140625, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.1213357124649503, |
| "grad_norm": 5.248174667358398, |
| "learning_rate": 1.4991802392077543e-05, |
| "loss": 15.004036865234376, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.1723171042569462, |
| "grad_norm": 4.950564384460449, |
| "learning_rate": 1.4492031729738489e-05, |
| "loss": 15.002611083984375, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.2232984960489421, |
| "grad_norm": 4.509192943572998, |
| "learning_rate": 1.3977783201785732e-05, |
| "loss": 14.96060302734375, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.274279887840938, |
| "grad_norm": 4.900182723999023, |
| "learning_rate": 1.3450714236645352e-05, |
| "loss": 14.971297607421874, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.325261279632934, |
| "grad_norm": 5.138764381408691, |
| "learning_rate": 1.2912523583147625e-05, |
| "loss": 14.928385009765625, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.3762426714249298, |
| "grad_norm": 4.894199848175049, |
| "learning_rate": 1.2364945835441636e-05, |
| "loss": 14.938167724609375, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.4272240632169257, |
| "grad_norm": 4.8737921714782715, |
| "learning_rate": 1.1809745842380042e-05, |
| "loss": 14.923902587890625, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.4782054550089216, |
| "grad_norm": 4.8258819580078125, |
| "learning_rate": 1.1248713019392635e-05, |
| "loss": 14.89677001953125, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.5291868468009175, |
| "grad_norm": 4.769787788391113, |
| "learning_rate": 1.0683655581181524e-05, |
| "loss": 14.87692626953125, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.5801682385929134, |
| "grad_norm": 4.92316198348999, |
| "learning_rate": 1.0116394713826117e-05, |
| "loss": 14.849693603515625, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.6311496303849093, |
| "grad_norm": 4.873258590698242, |
| "learning_rate": 9.548758705081177e-06, |
| "loss": 14.833634033203126, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.6821310221769055, |
| "grad_norm": 4.738825798034668, |
| "learning_rate": 8.98257705178612e-06, |
| "loss": 14.85665283203125, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.7331124139689014, |
| "grad_norm": 4.907736778259277, |
| "learning_rate": 8.419674563377416e-06, |
| "loss": 14.8664599609375, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.7840938057608973, |
| "grad_norm": 4.977413177490234, |
| "learning_rate": 7.861865480508541e-06, |
| "loss": 14.83008056640625, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.8350751975528932, |
| "grad_norm": 4.792273044586182, |
| "learning_rate": 7.310947627733231e-06, |
| "loss": 14.81404541015625, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.886056589344889, |
| "grad_norm": 4.84648323059082, |
| "learning_rate": 6.768696619097996e-06, |
| "loss": 14.831793212890625, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.9370379811368852, |
| "grad_norm": 4.854404449462891, |
| "learning_rate": 6.236860135319321e-06, |
| "loss": 14.826976318359375, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.988019372928881, |
| "grad_norm": 4.615888595581055, |
| "learning_rate": 5.717152290990302e-06, |
| "loss": 14.767562255859374, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 3.56946063041687, |
| "eval_runtime": 8.0481, |
| "eval_samples_per_second": 630.208, |
| "eval_steps_per_second": 9.94, |
| "step": 3924 |
| }, |
| { |
| "epoch": 2.038745857761917, |
| "grad_norm": 5.015805721282959, |
| "learning_rate": 5.211248109971254e-06, |
| "loss": 14.695634765625, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.089727249553913, |
| "grad_norm": 4.800245761871338, |
| "learning_rate": 4.720778126770141e-06, |
| "loss": 14.764068603515625, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.140708641345909, |
| "grad_norm": 4.756154537200928, |
| "learning_rate": 4.247323131312676e-06, |
| "loss": 14.755054931640625, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.191690033137905, |
| "grad_norm": 4.989803314208984, |
| "learning_rate": 3.7924090740397178e-06, |
| "loss": 14.760721435546875, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.2426714249299007, |
| "grad_norm": 4.568801403045654, |
| "learning_rate": 3.3575021477529313e-06, |
| "loss": 14.72455810546875, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.2936528167218966, |
| "grad_norm": 4.871072769165039, |
| "learning_rate": 2.944004062059924e-06, |
| "loss": 14.743800048828126, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.3446342085138925, |
| "grad_norm": 4.790256500244141, |
| "learning_rate": 2.5532475256494073e-06, |
| "loss": 14.7241162109375, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.3956156003058884, |
| "grad_norm": 4.770144462585449, |
| "learning_rate": 2.186491950957048e-06, |
| "loss": 14.711162109375, |
| "step": 4700 |
| }, |
| { |
| "epoch": 2.4465969920978843, |
| "grad_norm": 4.44427490234375, |
| "learning_rate": 1.8449193950659018e-06, |
| "loss": 14.72890625, |
| "step": 4800 |
| }, |
| { |
| "epoch": 2.49757838388988, |
| "grad_norm": 4.664465427398682, |
| "learning_rate": 1.5296307499239903e-06, |
| "loss": 14.713804931640626, |
| "step": 4900 |
| }, |
| { |
| "epoch": 2.548559775681876, |
| "grad_norm": 4.861291408538818, |
| "learning_rate": 1.2416421941579448e-06, |
| "loss": 14.730694580078126, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.599541167473872, |
| "grad_norm": 4.662012577056885, |
| "learning_rate": 9.818819179185713e-07, |
| "loss": 14.70477294921875, |
| "step": 5100 |
| }, |
| { |
| "epoch": 2.650522559265868, |
| "grad_norm": 4.803001403808594, |
| "learning_rate": 7.511871313142238e-07, |
| "loss": 14.7314208984375, |
| "step": 5200 |
| }, |
| { |
| "epoch": 2.701503951057864, |
| "grad_norm": 4.746646404266357, |
| "learning_rate": 5.503013660737899e-07, |
| "loss": 14.70580810546875, |
| "step": 5300 |
| }, |
| { |
| "epoch": 2.7524853428498597, |
| "grad_norm": 4.867108345031738, |
| "learning_rate": 3.798720791360988e-07, |
| "loss": 14.710306396484375, |
| "step": 5400 |
| }, |
| { |
| "epoch": 2.8034667346418556, |
| "grad_norm": 4.6949992179870605, |
| "learning_rate": 2.404485658893807e-07, |
| "loss": 14.725491943359375, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.8544481264338515, |
| "grad_norm": 4.641607284545898, |
| "learning_rate": 1.3248018978643695e-07, |
| "loss": 14.7078369140625, |
| "step": 5600 |
| }, |
| { |
| "epoch": 2.905429518225848, |
| "grad_norm": 4.756202220916748, |
| "learning_rate": 5.6314934041501455e-08, |
| "loss": 14.697396240234376, |
| "step": 5700 |
| }, |
| { |
| "epoch": 2.9564109100178433, |
| "grad_norm": 4.691574573516846, |
| "learning_rate": 1.2198280076668455e-08, |
| "loss": 14.694278564453125, |
| "step": 5800 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 3.5591108798980713, |
| "eval_runtime": 8.0338, |
| "eval_samples_per_second": 631.333, |
| "eval_steps_per_second": 9.958, |
| "step": 5886 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 5886, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 1962, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.964983111028869e+17, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |