Invalid JSON: Unexpected token 'N', ..."al_loss": NaN,
"... is not valid JSON
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 260, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.1927710843373494, | |
| "grad_norm": 95.26730346679688, | |
| "learning_rate": 0.0004180909090909091, | |
| "loss": 22.4465, | |
| "mean_token_accuracy": 0.08969678990542888, | |
| "num_tokens": 1310720.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.3855421686746988, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0005103526968014265, | |
| "loss": 0.8012, | |
| "mean_token_accuracy": 0.13804710581898688, | |
| "num_tokens": 2621440.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.5783132530120482, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0005077286477833616, | |
| "loss": 452.9577, | |
| "mean_token_accuracy": 0.05160275483503938, | |
| "num_tokens": 3932160.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.7710843373493976, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0005031081504278389, | |
| "loss": 470.5136, | |
| "mean_token_accuracy": 0.03822226445190609, | |
| "num_tokens": 5242880.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.963855421686747, | |
| "grad_norm": 26.303752899169922, | |
| "learning_rate": 0.0004965277770447238, | |
| "loss": 167.1384, | |
| "mean_token_accuracy": 0.057517293840646744, | |
| "num_tokens": 6553600.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.13677339731378757, | |
| "eval_num_tokens": 6713344.0, | |
| "eval_runtime": 8.9806, | |
| "eval_samples_per_second": 41.089, | |
| "eval_steps_per_second": 5.233, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.1542168674698796, | |
| "grad_norm": 28.755094528198242, | |
| "learning_rate": 0.00048803961281790017, | |
| "loss": 27.9726, | |
| "mean_token_accuracy": 0.03075966710531259, | |
| "num_tokens": 7761920.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.346987951807229, | |
| "grad_norm": 0.9693858027458191, | |
| "learning_rate": 0.000477710843538941, | |
| "loss": 2.2869, | |
| "mean_token_accuracy": 0.10747051909565926, | |
| "num_tokens": 9072640.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.5397590361445783, | |
| "grad_norm": 0.36548200249671936, | |
| "learning_rate": 0.0004656232238159615, | |
| "loss": 60.0031, | |
| "mean_token_accuracy": 0.10124717205762863, | |
| "num_tokens": 10383360.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.7325301204819277, | |
| "grad_norm": 0.8749092817306519, | |
| "learning_rate": 0.0004518724299669051, | |
| "loss": 0.8994, | |
| "mean_token_accuracy": 0.1551567144691944, | |
| "num_tokens": 11694080.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.9253012048192772, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0004365673027192623, | |
| "loss": 2.2759, | |
| "mean_token_accuracy": 0.13096993789076805, | |
| "num_tokens": 13004800.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.14378934084100925, | |
| "eval_num_tokens": 13426688.0, | |
| "eval_runtime": 8.9689, | |
| "eval_samples_per_second": 41.142, | |
| "eval_steps_per_second": 5.24, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 2.1156626506024097, | |
| "grad_norm": 0.6895984411239624, | |
| "learning_rate": 0.0004198289857104298, | |
| "loss": 0.2064, | |
| "mean_token_accuracy": 0.08719592305678356, | |
| "num_tokens": 14213120.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 2.3084337349397592, | |
| "grad_norm": 0.7038294672966003, | |
| "learning_rate": 0.0004017899666076801, | |
| "loss": 1.3155, | |
| "mean_token_accuracy": 0.1053241491317749, | |
| "num_tokens": 15523840.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.5012048192771084, | |
| "grad_norm": 27.594745635986328, | |
| "learning_rate": 0.0003825930284374996, | |
| "loss": 0.0836, | |
| "mean_token_accuracy": 0.07201291918754578, | |
| "num_tokens": 16834560.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.693975903614458, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00036239011942476655, | |
| "loss": 1.364, | |
| "mean_token_accuracy": 0.15817394778132438, | |
| "num_tokens": 18145280.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.886746987951807, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00034134115028725524, | |
| "loss": 3.5977, | |
| "mean_token_accuracy": 0.10589548945426941, | |
| "num_tokens": 19456000.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.14411297884393245, | |
| "eval_num_tokens": 20140032.0, | |
| "eval_runtime": 8.9831, | |
| "eval_samples_per_second": 41.077, | |
| "eval_steps_per_second": 5.232, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 3.07710843373494, | |
| "grad_norm": 0.1902359277009964, | |
| "learning_rate": 0.0003196127285051592, | |
| "loss": 8.9424, | |
| "mean_token_accuracy": 0.062061098557484304, | |
| "num_tokens": 20664320.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 3.269879518072289, | |
| "grad_norm": 0.3390277326107025, | |
| "learning_rate": 0.00029737683958418377, | |
| "loss": 12.1371, | |
| "mean_token_accuracy": 0.07337962239980697, | |
| "num_tokens": 21975040.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 3.4626506024096386, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00027480948575031854, | |
| "loss": 42.6417, | |
| "mean_token_accuracy": 0.08556168600916862, | |
| "num_tokens": 23285760.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 3.6554216867469878, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0002520892928513346, | |
| "loss": 1.5423, | |
| "mean_token_accuracy": 0.13269576877355577, | |
| "num_tokens": 24596480.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 3.8481927710843373, | |
| "grad_norm": 0.19443857669830322, | |
| "learning_rate": 0.0002293960964917063, | |
| "loss": 0.3356, | |
| "mean_token_accuracy": 0.17251307517290115, | |
| "num_tokens": 25907200.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.14541077867467353, | |
| "eval_num_tokens": 26853376.0, | |
| "eval_runtime": 8.9721, | |
| "eval_samples_per_second": 41.127, | |
| "eval_steps_per_second": 5.238, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 4.03855421686747, | |
| "grad_norm": 0.009991697035729885, | |
| "learning_rate": 0.00020690951859202796, | |
| "loss": 0.4818, | |
| "mean_token_accuracy": 0.16193881593173062, | |
| "num_tokens": 27115520.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 4.231325301204819, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001848075456397883, | |
| "loss": 0.3988, | |
| "mean_token_accuracy": 0.12297056466341019, | |
| "num_tokens": 28426240.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 4.424096385542168, | |
| "grad_norm": 0.007613173220306635, | |
| "learning_rate": 0.00016326511988497662, | |
| "loss": 0.0274, | |
| "mean_token_accuracy": 0.11160993352532386, | |
| "num_tokens": 29736960.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 4.6168674698795185, | |
| "grad_norm": 0.0015245635295286775, | |
| "learning_rate": 0.0001424527546315377, | |
| "loss": 2.8338, | |
| "mean_token_accuracy": 0.06221988797187805, | |
| "num_tokens": 31047680.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 4.809638554216868, | |
| "grad_norm": 0.26247891783714294, | |
| "learning_rate": 0.00012253518458496144, | |
| "loss": 0.2528, | |
| "mean_token_accuracy": 0.09861943274736404, | |
| "num_tokens": 32358400.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.00404323311522603, | |
| "learning_rate": 0.00010367006193882396, | |
| "loss": 0.2094, | |
| "mean_token_accuracy": 0.13716269520264637, | |
| "num_tokens": 33566720.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": NaN, | |
| "eval_mean_token_accuracy": 0.14629162118789998, | |
| "eval_num_tokens": 33566720.0, | |
| "eval_runtime": 9.0068, | |
| "eval_samples_per_second": 40.969, | |
| "eval_steps_per_second": 5.218, | |
| "step": 260 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 364, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 7, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4752094832096707e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |