{
  "best_global_step": 20000,
  "best_metric": 7.266454117854436,
  "best_model_checkpoint": "/scratch/lma/model/checkpoints-new/checkpoint-20000",
  "epoch": 31.78848,
  "eval_steps": 5000,
  "global_step": 25000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.27904,
      "grad_norm": 28670.287109375,
      "learning_rate": 4.995e-05,
      "loss": 1.738,
      "perplexity": 5.685960122920831,
      "step": 1000
    },
    {
      "epoch": 2.55808,
      "grad_norm": 16521.41015625,
      "learning_rate": 9.994999999999999e-05,
      "loss": 1.1317,
      "perplexity": 3.10092359193447,
      "step": 2000
    },
    {
      "epoch": 3.83712,
      "grad_norm": 11799.3828125,
      "learning_rate": 0.00014994999999999999,
      "loss": 0.9731,
      "perplexity": 2.6461347756092684,
      "step": 3000
    },
    {
      "epoch": 5.1152,
      "grad_norm": 10640.298828125,
      "learning_rate": 0.00019994999999999998,
      "loss": 0.8739,
      "perplexity": 2.3962379819312827,
      "step": 4000
    },
    {
      "epoch": 6.39424,
      "grad_norm": 9062.7080078125,
      "learning_rate": 0.00024995,
      "loss": 0.8026,
      "perplexity": 2.231334863758477,
      "step": 5000
    },
    {
      "epoch": 7.27904,
      "grad_norm": 26485.513671875,
      "learning_rate": 0.00024622986747569733,
      "loss": 0.7023,
      "perplexity": 2.018389669159474,
      "step": 6000
    },
    {
      "epoch": 8.55808,
      "grad_norm": 24189.75390625,
      "learning_rate": 0.00022641892733779356,
      "loss": 0.7066,
      "perplexity": 2.0270874315241456,
      "step": 7000
    },
    {
      "epoch": 9.83712,
      "grad_norm": 28136.08984375,
      "learning_rate": 0.00020460836503068182,
      "loss": 0.7021,
      "perplexity": 2.0179860315907443,
      "step": 8000
    },
    {
      "epoch": 11.1152,
      "grad_norm": 19359.896484375,
      "learning_rate": 0.0001813688884330453,
      "loss": 0.6614,
      "perplexity": 1.937502940651837,
      "step": 9000
    },
    {
      "epoch": 12.39424,
      "grad_norm": 27617.408203125,
      "learning_rate": 0.00015730859522728144,
      "loss": 0.605,
      "perplexity": 1.8312522088857732,
      "step": 10000
    },
    {
      "epoch": 12.39424,
      "eval/loss": 5.23094868906339,
      "eval/ppl": 186.97009589435876,
      "eval/ppl_en": 190.0160900550089,
      "eval/ppl_hi": 126.77963486780418,
      "eval/ppl_mi": 206.84769822688918,
      "step": 10000
    },
    {
      "epoch": 13.67328,
      "grad_norm": 29511.85546875,
      "learning_rate": 0.00013305706106130836,
      "loss": 0.5833,
      "perplexity": 1.7919420932571126,
      "step": 11000
    },
    {
      "epoch": 14.95232,
      "grad_norm": 31045.4453125,
      "learning_rate": 0.00010924886570541292,
      "loss": 0.5432,
      "perplexity": 1.721506879478172,
      "step": 12000
    },
    {
      "epoch": 16.2304,
      "grad_norm": 19424.427734375,
      "learning_rate": 8.650698826831828e-05,
      "loss": 0.4869,
      "perplexity": 1.6272638748545094,
      "step": 13000
    },
    {
      "epoch": 17.50944,
      "grad_norm": 29583.099609375,
      "learning_rate": 6.542650596291983e-05,
      "loss": 0.4222,
      "perplexity": 1.5253135568904201,
      "step": 14000
    },
    {
      "epoch": 18.78848,
      "grad_norm": 33708.65234375,
      "learning_rate": 4.65590229692808e-05,
      "loss": 0.3845,
      "perplexity": 1.4688796979516008,
      "step": 15000
    },
    {
      "epoch": 18.78848,
      "eval/loss": 6.0393949168523156,
      "eval/ppl": 419.6390415387314,
      "eval/ppl_en": 433.6206088421918,
      "eval/ppl_hi": 261.7910852218736,
      "eval/ppl_mi": 473.2004975522671,
      "step": 15000
    },
    {
      "epoch": 20.27904,
      "grad_norm": 27125.3125,
      "learning_rate": 0.00014111482083512477,
      "loss": 0.4418,
      "perplexity": 1.5555046084002613,
      "step": 16000
    },
    {
      "epoch": 21.55808,
      "grad_norm": 29882.669921875,
      "learning_rate": 0.0001250325903022969,
      "loss": 0.4051,
      "perplexity": 1.499452437803535,
      "step": 17000
    },
    {
      "epoch": 22.83712,
      "grad_norm": 30726.154296875,
      "learning_rate": 0.00010924107340635552,
      "loss": 0.3712,
      "perplexity": 1.4494729389869156,
      "step": 18000
    },
    {
      "epoch": 24.1152,
      "grad_norm": 23363.291015625,
      "learning_rate": 9.39241422175057e-05,
      "loss": 0.3321,
      "perplexity": 1.3938922307663644,
      "step": 19000
    },
    {
      "epoch": 25.39424,
      "grad_norm": 27908.248046875,
      "learning_rate": 7.92601428607867e-05,
      "loss": 0.2506,
      "perplexity": 1.2847960631085609,
      "step": 20000
    },
    {
      "epoch": 25.39424,
      "eval_loss": 7.266454117854436,
      "eval_loss_en": 7.243548462824586,
      "eval_loss_hi": 6.703572651624223,
      "eval_loss_mi": 7.425478159377612,
      "eval_perplexity": 1431.4656348447334,
      "eval_perplexity_en": 1399.0496488155554,
      "eval_perplexity_hi": 815.3134590470939,
      "eval_perplexity_mi": 1678.2018291410614,
      "step": 20000
    },
    {
      "epoch": 26.67328,
      "grad_norm": 32178.92578125,
      "learning_rate": 6.541981890294837e-05,
      "loss": 0.2113,
      "perplexity": 1.2352828843445263,
      "step": 21000
    },
    {
      "epoch": 27.95232,
      "grad_norm": 35835.1171875,
      "learning_rate": 5.256432326142149e-05,
      "loss": 0.17,
      "perplexity": 1.1853048513203654,
      "step": 22000
    },
    {
      "epoch": 29.2304,
      "grad_norm": 19210.462890625,
      "learning_rate": 4.084334178416445e-05,
      "loss": 0.1396,
      "perplexity": 1.14981378134732,
      "step": 23000
    },
    {
      "epoch": 30.50944,
      "grad_norm": 23385.990234375,
      "learning_rate": 3.0393350348907002e-05,
      "loss": 0.0965,
      "perplexity": 1.1013095811437943,
      "step": 24000
    },
    {
      "epoch": 31.78848,
      "grad_norm": 22185.587890625,
      "learning_rate": 2.1336025775646154e-05,
      "loss": 0.0761,
      "perplexity": 1.0790704758096923,
      "step": 25000
    },
    {
      "epoch": 31.78848,
      "eval_loss": 8.725315799395243,
      "eval_loss_en": 8.752607714958739,
      "eval_loss_hi": 8.09166133420816,
      "eval_loss_mi": 8.888291169885575,
      "eval_perplexity": 6156.820676809646,
      "eval_perplexity_en": 6327.166059430776,
      "eval_perplexity_hi": 3267.110827037123,
      "eval_perplexity_mi": 7246.625345410556,
      "step": 25000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 30000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 39,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": true,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.5653480633296486e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}