{
  "best_metric": 1.6663275957107544,
  "best_model_checkpoint": "/data/user_data/gonilude/java_and_text_pythia_410m/checkpoint-150",
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 243,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_accuracy": 0.25,
      "eval_loss": 2.6309967041015625,
      "eval_runtime": 0.5335,
      "eval_samples_per_second": 134.957,
      "eval_steps_per_second": 16.87,
      "num_input_tokens_seen": 0,
      "step": 0
    },
    {
      "epoch": 0.012345679012345678,
      "grad_norm": null,
      "learning_rate": 0.0,
      "loss": 3.7346,
      "num_input_tokens_seen": 8192,
      "step": 1
    },
    {
      "epoch": 0.06172839506172839,
      "grad_norm": null,
      "learning_rate": 0.0,
      "loss": 3.0853,
      "num_input_tokens_seen": 40960,
      "step": 5
    },
    {
      "epoch": 0.12345679012345678,
      "grad_norm": 140.6628875732422,
      "learning_rate": 1e-05,
      "loss": 2.951,
      "num_input_tokens_seen": 81920,
      "step": 10
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 121.90998840332031,
      "learning_rate": 1.9999106432103785e-05,
      "loss": 3.6623,
      "num_input_tokens_seen": 122880,
      "step": 15
    },
    {
      "epoch": 0.24691358024691357,
      "grad_norm": 119.89352416992188,
      "learning_rate": 1.9967848320275253e-05,
      "loss": 2.5551,
      "num_input_tokens_seen": 163840,
      "step": 20
    },
    {
      "epoch": 0.30864197530864196,
      "grad_norm": 69.07218170166016,
      "learning_rate": 1.9892071378116378e-05,
      "loss": 2.146,
      "num_input_tokens_seen": 204800,
      "step": 25
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 38.68893051147461,
      "learning_rate": 1.9772114043845968e-05,
      "loss": 2.4626,
      "num_input_tokens_seen": 245760,
      "step": 30
    },
    {
      "epoch": 0.43209876543209874,
      "grad_norm": 12.328490257263184,
      "learning_rate": 1.9608512076038964e-05,
      "loss": 1.976,
      "num_input_tokens_seen": 286720,
      "step": 35
    },
    {
      "epoch": 0.49382716049382713,
      "grad_norm": 28.689950942993164,
      "learning_rate": 1.9401996160798574e-05,
      "loss": 1.8516,
      "num_input_tokens_seen": 327680,
      "step": 40
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 20.923078536987305,
      "learning_rate": 1.915348864833476e-05,
      "loss": 1.669,
      "num_input_tokens_seen": 368640,
      "step": 45
    },
    {
      "epoch": 0.6172839506172839,
      "grad_norm": 60.305381774902344,
      "learning_rate": 1.8864099433524302e-05,
      "loss": 1.9603,
      "num_input_tokens_seen": 409600,
      "step": 50
    },
    {
      "epoch": 0.6172839506172839,
      "eval_accuracy": 0.19444444444444445,
      "eval_loss": 1.8224691152572632,
      "eval_runtime": 0.3688,
      "eval_samples_per_second": 195.251,
      "eval_steps_per_second": 24.406,
      "num_input_tokens_seen": 409600,
      "step": 50
    },
    {
      "epoch": 0.6790123456790124,
      "grad_norm": 37.31528854370117,
      "learning_rate": 1.853512099885085e-05,
      "loss": 1.6856,
      "num_input_tokens_seen": 450560,
      "step": 55
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 34.03938674926758,
      "learning_rate": 1.816802264186438e-05,
      "loss": 1.989,
      "num_input_tokens_seen": 491520,
      "step": 60
    },
    {
      "epoch": 0.8024691358024691,
      "grad_norm": 27.516170501708984,
      "learning_rate": 1.7764443912941675e-05,
      "loss": 1.8994,
      "num_input_tokens_seen": 532480,
      "step": 65
    },
    {
      "epoch": 0.8641975308641975,
      "grad_norm": 23.523881912231445,
      "learning_rate": 1.7326187292656332e-05,
      "loss": 1.7613,
      "num_input_tokens_seen": 573440,
      "step": 70
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 25.84837532043457,
      "learning_rate": 1.6855210141462964e-05,
      "loss": 1.9531,
      "num_input_tokens_seen": 614400,
      "step": 75
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 34.3070068359375,
      "learning_rate": 1.635361595765024e-05,
      "loss": 1.9051,
      "num_input_tokens_seen": 655360,
      "step": 80
    },
    {
      "epoch": 1.0493827160493827,
      "grad_norm": 31.36409568786621,
      "learning_rate": 1.5823644982606905e-05,
      "loss": 1.8392,
      "num_input_tokens_seen": 696320,
      "step": 85
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 27.246265411376953,
      "learning_rate": 1.5267664195359917e-05,
      "loss": 1.6923,
      "num_input_tokens_seen": 737280,
      "step": 90
    },
    {
      "epoch": 1.1728395061728394,
      "grad_norm": 41.13190841674805,
      "learning_rate": 1.4688156741071513e-05,
      "loss": 1.8443,
      "num_input_tokens_seen": 778240,
      "step": 95
    },
    {
      "epoch": 1.2345679012345678,
      "grad_norm": 38.65595626831055,
      "learning_rate": 1.408771084071012e-05,
      "loss": 1.7774,
      "num_input_tokens_seen": 819200,
      "step": 100
    },
    {
      "epoch": 1.2345679012345678,
      "eval_accuracy": 0.18055555555555555,
      "eval_loss": 1.7543673515319824,
      "eval_runtime": 0.371,
      "eval_samples_per_second": 194.075,
      "eval_steps_per_second": 24.259,
      "num_input_tokens_seen": 819200,
      "step": 100
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 14.981557846069336,
      "learning_rate": 1.3469008231427207e-05,
      "loss": 1.8287,
      "num_input_tokens_seen": 860160,
      "step": 105
    },
    {
      "epoch": 1.3580246913580247,
      "grad_norm": 21.5086727142334,
      "learning_rate": 1.283481218926818e-05,
      "loss": 1.5601,
      "num_input_tokens_seen": 901120,
      "step": 110
    },
    {
      "epoch": 1.4197530864197532,
      "grad_norm": 24.150880813598633,
      "learning_rate": 1.2187955187710752e-05,
      "loss": 1.8594,
      "num_input_tokens_seen": 942080,
      "step": 115
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 11.27273941040039,
      "learning_rate": 1.1531326247150802e-05,
      "loss": 1.6694,
      "num_input_tokens_seen": 983040,
      "step": 120
    },
    {
      "epoch": 1.5432098765432098,
      "grad_norm": 24.10595703125,
      "learning_rate": 1.0867858031835975e-05,
      "loss": 1.6684,
      "num_input_tokens_seen": 1024000,
      "step": 125
    },
    {
      "epoch": 1.6049382716049383,
      "grad_norm": 15.398138046264648,
      "learning_rate": 1.0200513751875227e-05,
      "loss": 1.6126,
      "num_input_tokens_seen": 1064960,
      "step": 130
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 25.667972564697266,
      "learning_rate": 9.532273928823151e-06,
      "loss": 1.5059,
      "num_input_tokens_seen": 1105920,
      "step": 135
    },
    {
      "epoch": 1.7283950617283952,
      "grad_norm": 21.25739288330078,
      "learning_rate": 8.866123083947182e-06,
      "loss": 1.6684,
      "num_input_tokens_seen": 1146880,
      "step": 140
    },
    {
      "epoch": 1.7901234567901234,
      "grad_norm": 17.899890899658203,
      "learning_rate": 8.20503640863111e-06,
      "loss": 1.5919,
      "num_input_tokens_seen": 1187840,
      "step": 145
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 25.205127716064453,
      "learning_rate": 7.55196647644814e-06,
      "loss": 1.6193,
      "num_input_tokens_seen": 1228800,
      "step": 150
    },
    {
      "epoch": 1.8518518518518519,
      "eval_accuracy": 0.18055555555555555,
      "eval_loss": 1.6663275957107544,
      "eval_runtime": 0.3724,
      "eval_samples_per_second": 193.328,
      "eval_steps_per_second": 24.166,
      "num_input_tokens_seen": 1228800,
      "step": 150
    },
    {
      "epoch": 1.9135802469135803,
      "grad_norm": 32.01460647583008,
      "learning_rate": 6.909830056250527e-06,
      "loss": 1.649,
      "num_input_tokens_seen": 1269760,
      "step": 155
    },
    {
      "epoch": 1.9753086419753085,
      "grad_norm": 23.55994415283203,
      "learning_rate": 6.2814950851718695e-06,
      "loss": 1.7344,
      "num_input_tokens_seen": 1310720,
      "step": 160
    },
    {
      "epoch": 2.037037037037037,
      "grad_norm": 40.77457046508789,
      "learning_rate": 5.669767859723636e-06,
      "loss": 1.5739,
      "num_input_tokens_seen": 1351680,
      "step": 165
    },
    {
      "epoch": 2.0987654320987654,
      "grad_norm": 23.66048240661621,
      "learning_rate": 5.077380502193725e-06,
      "loss": 1.4769,
      "num_input_tokens_seen": 1392640,
      "step": 170
    },
    {
      "epoch": 2.1604938271604937,
      "grad_norm": 20.150760650634766,
      "learning_rate": 4.5069787583250815e-06,
      "loss": 1.5486,
      "num_input_tokens_seen": 1433600,
      "step": 175
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 11.237411499023438,
      "learning_rate": 3.961110180772955e-06,
      "loss": 1.5313,
      "num_input_tokens_seen": 1474560,
      "step": 180
    },
    {
      "epoch": 2.2839506172839505,
      "grad_norm": 30.582534790039062,
      "learning_rate": 3.442212751116305e-06,
      "loss": 1.6429,
      "num_input_tokens_seen": 1515520,
      "step": 185
    },
    {
      "epoch": 2.3456790123456788,
      "grad_norm": 24.695331573486328,
      "learning_rate": 2.9526039912402504e-06,
      "loss": 1.4812,
      "num_input_tokens_seen": 1556480,
      "step": 190
    },
    {
      "epoch": 2.4074074074074074,
      "grad_norm": 6.564646244049072,
      "learning_rate": 2.4944706127207252e-06,
      "loss": 1.5954,
      "num_input_tokens_seen": 1597440,
      "step": 195
    },
    {
      "epoch": 2.4691358024691357,
      "grad_norm": 31.85268783569336,
      "learning_rate": 2.0698587504397684e-06,
      "loss": 1.7311,
      "num_input_tokens_seen": 1638400,
      "step": 200
    },
    {
      "epoch": 2.4691358024691357,
      "eval_accuracy": 0.19444444444444445,
      "eval_loss": 1.7398681640625,
      "eval_runtime": 0.4238,
      "eval_samples_per_second": 169.875,
      "eval_steps_per_second": 21.234,
      "num_input_tokens_seen": 1638400,
      "step": 200
    },
    {
      "epoch": 2.5308641975308643,
      "grad_norm": 29.221206665039062,
      "learning_rate": 1.680664824050432e-06,
      "loss": 1.4484,
      "num_input_tokens_seen": 1679360,
      "step": 205
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 35.87575149536133,
      "learning_rate": 1.3286270681062275e-06,
      "loss": 1.5614,
      "num_input_tokens_seen": 1720320,
      "step": 210
    },
    {
      "epoch": 2.6543209876543212,
      "grad_norm": 29.08688735961914,
      "learning_rate": 1.015317768683669e-06,
      "loss": 1.5382,
      "num_input_tokens_seen": 1761280,
      "step": 215
    },
    {
      "epoch": 2.7160493827160495,
      "grad_norm": 23.70969581604004,
      "learning_rate": 7.421362411709676e-07,
      "loss": 1.5272,
      "num_input_tokens_seen": 1802240,
      "step": 220
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 35.782222747802734,
      "learning_rate": 5.103025805858197e-07,
      "loss": 1.5842,
      "num_input_tokens_seen": 1843200,
      "step": 225
    },
    {
      "epoch": 2.8395061728395063,
      "grad_norm": 31.512516021728516,
      "learning_rate": 3.2085221233487564e-07,
      "loss": 1.5197,
      "num_input_tokens_seen": 1884160,
      "step": 230
    },
    {
      "epoch": 2.9012345679012346,
      "grad_norm": 24.439847946166992,
      "learning_rate": 1.7463126775252192e-07,
      "loss": 1.5678,
      "num_input_tokens_seen": 1925120,
      "step": 235
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 28.297935485839844,
      "learning_rate": 7.229280507293657e-08,
      "loss": 1.6146,
      "num_input_tokens_seen": 1966080,
      "step": 240
    },
    {
      "epoch": 3.0,
      "num_input_tokens_seen": 1990656,
      "step": 243,
      "total_flos": 3610849453277184.0,
      "train_loss": 1.8222727520475663,
      "train_runtime": 109.7362,
      "train_samples_per_second": 17.633,
      "train_steps_per_second": 2.214
    }
  ],
  "logging_steps": 5,
  "max_steps": 243,
  "num_input_tokens_seen": 1990656,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3610849453277184.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}