{
"best_global_step": 20000,
"best_metric": 7.266454117854436,
"best_model_checkpoint": "/scratch/lma/model/checkpoints-new/checkpoint-20000",
"epoch": 31.78848,
"eval_steps": 5000,
"global_step": 25000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.27904,
"grad_norm": 28670.287109375,
"learning_rate": 4.995e-05,
"loss": 1.738,
"perplexity": 5.685960122920831,
"step": 1000
},
{
"epoch": 2.55808,
"grad_norm": 16521.41015625,
"learning_rate": 9.994999999999999e-05,
"loss": 1.1317,
"perplexity": 3.10092359193447,
"step": 2000
},
{
"epoch": 3.83712,
"grad_norm": 11799.3828125,
"learning_rate": 0.00014994999999999999,
"loss": 0.9731,
"perplexity": 2.6461347756092684,
"step": 3000
},
{
"epoch": 5.1152,
"grad_norm": 10640.298828125,
"learning_rate": 0.00019994999999999998,
"loss": 0.8739,
"perplexity": 2.3962379819312827,
"step": 4000
},
{
"epoch": 6.39424,
"grad_norm": 9062.7080078125,
"learning_rate": 0.00024995,
"loss": 0.8026,
"perplexity": 2.231334863758477,
"step": 5000
},
{
"epoch": 7.27904,
"grad_norm": 26485.513671875,
"learning_rate": 0.00024622986747569733,
"loss": 0.7023,
"perplexity": 2.018389669159474,
"step": 6000
},
{
"epoch": 8.55808,
"grad_norm": 24189.75390625,
"learning_rate": 0.00022641892733779356,
"loss": 0.7066,
"perplexity": 2.0270874315241456,
"step": 7000
},
{
"epoch": 9.83712,
"grad_norm": 28136.08984375,
"learning_rate": 0.00020460836503068182,
"loss": 0.7021,
"perplexity": 2.0179860315907443,
"step": 8000
},
{
"epoch": 11.1152,
"grad_norm": 19359.896484375,
"learning_rate": 0.0001813688884330453,
"loss": 0.6614,
"perplexity": 1.937502940651837,
"step": 9000
},
{
"epoch": 12.39424,
"grad_norm": 27617.408203125,
"learning_rate": 0.00015730859522728144,
"loss": 0.605,
"perplexity": 1.8312522088857732,
"step": 10000
},
{
"epoch": 12.39424,
"eval/loss": 5.23094868906339,
"eval/ppl": 186.97009589435876,
"eval/ppl_en": 190.0160900550089,
"eval/ppl_hi": 126.77963486780418,
"eval/ppl_mi": 206.84769822688918,
"step": 10000
},
{
"epoch": 13.67328,
"grad_norm": 29511.85546875,
"learning_rate": 0.00013305706106130836,
"loss": 0.5833,
"perplexity": 1.7919420932571126,
"step": 11000
},
{
"epoch": 14.95232,
"grad_norm": 31045.4453125,
"learning_rate": 0.00010924886570541292,
"loss": 0.5432,
"perplexity": 1.721506879478172,
"step": 12000
},
{
"epoch": 16.2304,
"grad_norm": 19424.427734375,
"learning_rate": 8.650698826831828e-05,
"loss": 0.4869,
"perplexity": 1.6272638748545094,
"step": 13000
},
{
"epoch": 17.50944,
"grad_norm": 29583.099609375,
"learning_rate": 6.542650596291983e-05,
"loss": 0.4222,
"perplexity": 1.5253135568904201,
"step": 14000
},
{
"epoch": 18.78848,
"grad_norm": 33708.65234375,
"learning_rate": 4.65590229692808e-05,
"loss": 0.3845,
"perplexity": 1.4688796979516008,
"step": 15000
},
{
"epoch": 18.78848,
"eval/loss": 6.0393949168523156,
"eval/ppl": 419.6390415387314,
"eval/ppl_en": 433.6206088421918,
"eval/ppl_hi": 261.7910852218736,
"eval/ppl_mi": 473.2004975522671,
"step": 15000
},
{
"epoch": 20.27904,
"grad_norm": 27125.3125,
"learning_rate": 0.00014111482083512477,
"loss": 0.4418,
"perplexity": 1.5555046084002613,
"step": 16000
},
{
"epoch": 21.55808,
"grad_norm": 29882.669921875,
"learning_rate": 0.0001250325903022969,
"loss": 0.4051,
"perplexity": 1.499452437803535,
"step": 17000
},
{
"epoch": 22.83712,
"grad_norm": 30726.154296875,
"learning_rate": 0.00010924107340635552,
"loss": 0.3712,
"perplexity": 1.4494729389869156,
"step": 18000
},
{
"epoch": 24.1152,
"grad_norm": 23363.291015625,
"learning_rate": 9.39241422175057e-05,
"loss": 0.3321,
"perplexity": 1.3938922307663644,
"step": 19000
},
{
"epoch": 25.39424,
"grad_norm": 27908.248046875,
"learning_rate": 7.92601428607867e-05,
"loss": 0.2506,
"perplexity": 1.2847960631085609,
"step": 20000
},
{
"epoch": 25.39424,
"eval_loss": 7.266454117854436,
"eval_loss_en": 7.243548462824586,
"eval_loss_hi": 6.703572651624223,
"eval_loss_mi": 7.425478159377612,
"eval_perplexity": 1431.4656348447334,
"eval_perplexity_en": 1399.0496488155554,
"eval_perplexity_hi": 815.3134590470939,
"eval_perplexity_mi": 1678.2018291410614,
"step": 20000
},
{
"epoch": 26.67328,
"grad_norm": 32178.92578125,
"learning_rate": 6.541981890294837e-05,
"loss": 0.2113,
"perplexity": 1.2352828843445263,
"step": 21000
},
{
"epoch": 27.95232,
"grad_norm": 35835.1171875,
"learning_rate": 5.256432326142149e-05,
"loss": 0.17,
"perplexity": 1.1853048513203654,
"step": 22000
},
{
"epoch": 29.2304,
"grad_norm": 19210.462890625,
"learning_rate": 4.084334178416445e-05,
"loss": 0.1396,
"perplexity": 1.14981378134732,
"step": 23000
},
{
"epoch": 30.50944,
"grad_norm": 23385.990234375,
"learning_rate": 3.0393350348907002e-05,
"loss": 0.0965,
"perplexity": 1.1013095811437943,
"step": 24000
},
{
"epoch": 31.78848,
"grad_norm": 22185.587890625,
"learning_rate": 2.1336025775646154e-05,
"loss": 0.0761,
"perplexity": 1.0790704758096923,
"step": 25000
},
{
"epoch": 31.78848,
"eval_loss": 8.725315799395243,
"eval_loss_en": 8.752607714958739,
"eval_loss_hi": 8.09166133420816,
"eval_loss_mi": 8.888291169885575,
"eval_perplexity": 6156.820676809646,
"eval_perplexity_en": 6327.166059430776,
"eval_perplexity_hi": 3267.110827037123,
"eval_perplexity_mi": 7246.625345410556,
"step": 25000
}
],
"logging_steps": 1000,
"max_steps": 30000,
"num_input_tokens_seen": 0,
"num_train_epochs": 39,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5653480633296486e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}