{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3770, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13262599469496023, "grad_norm": 5.429837703704834, "learning_rate": 4.967957276368492e-05, "loss": 2.3201, "step": 100 }, { "epoch": 0.26525198938992045, "grad_norm": 6.075057029724121, "learning_rate": 4.901869158878505e-05, "loss": 1.9594, "step": 200 }, { "epoch": 0.3978779840848806, "grad_norm": 3.753885507583618, "learning_rate": 4.835113484646196e-05, "loss": 1.9226, "step": 300 }, { "epoch": 0.5305039787798409, "grad_norm": 5.202290058135986, "learning_rate": 4.768357810413886e-05, "loss": 1.8727, "step": 400 }, { "epoch": 0.6631299734748011, "grad_norm": 4.6010637283325195, "learning_rate": 4.701602136181576e-05, "loss": 1.815, "step": 500 }, { "epoch": 0.7957559681697612, "grad_norm": 3.4173789024353027, "learning_rate": 4.634846461949266e-05, "loss": 1.828, "step": 600 }, { "epoch": 0.9283819628647215, "grad_norm": 8.333471298217773, "learning_rate": 4.568090787716956e-05, "loss": 1.8053, "step": 700 }, { "epoch": 1.0, "eval_gen_len": 128.048, "eval_loss": 1.6762282848358154, "eval_rouge1": 40.7296, "eval_rouge2": 14.8309, "eval_rougeL": 26.5174, "eval_rougeLsum": 37.7044, "eval_runtime": 609.5513, "eval_samples_per_second": 1.641, "eval_steps_per_second": 0.205, "step": 754 }, { "epoch": 1.0610079575596818, "grad_norm": 2.8465678691864014, "learning_rate": 4.5013351134846464e-05, "loss": 1.7625, "step": 800 }, { "epoch": 1.193633952254642, "grad_norm": 3.454566717147827, "learning_rate": 4.4345794392523364e-05, "loss": 1.7285, "step": 900 }, { "epoch": 1.3262599469496021, "grad_norm": 24.434019088745117, "learning_rate": 4.367823765020027e-05, "loss": 1.6906, "step": 1000 }, { "epoch": 1.4588859416445623, "grad_norm": 2.913654327392578, "learning_rate": 4.301068090787717e-05, "loss": 1.7003, "step": 1100 }, { "epoch": 1.5915119363395225, "grad_norm": 3.6399612426757812, "learning_rate": 4.234312416555408e-05, "loss": 1.7144, "step": 1200 }, { "epoch": 1.7241379310344827, "grad_norm": 3.5709495544433594, "learning_rate": 4.167556742323098e-05, "loss": 1.6836, "step": 1300 }, { "epoch": 1.8567639257294428, "grad_norm": 12.881651878356934, "learning_rate": 4.100801068090788e-05, "loss": 1.7071, "step": 1400 }, { "epoch": 1.9893899204244032, "grad_norm": 7.4129133224487305, "learning_rate": 4.0340453938584784e-05, "loss": 1.6916, "step": 1500 }, { "epoch": 2.0, "eval_gen_len": 127.846, "eval_loss": 1.6203532218933105, "eval_rouge1": 41.7668, "eval_rouge2": 15.5459, "eval_rougeL": 26.7721, "eval_rougeLsum": 38.8375, "eval_runtime": 602.6964, "eval_samples_per_second": 1.659, "eval_steps_per_second": 0.207, "step": 1508 }, { "epoch": 2.1220159151193636, "grad_norm": 2.7983736991882324, "learning_rate": 3.9679572763684914e-05, "loss": 1.6306, "step": 1600 }, { "epoch": 2.2546419098143238, "grad_norm": 3.959669589996338, "learning_rate": 3.901201602136182e-05, "loss": 1.6281, "step": 1700 }, { "epoch": 2.387267904509284, "grad_norm": 6.940447807312012, "learning_rate": 3.835113484646195e-05, "loss": 1.6283, "step": 1800 }, { "epoch": 2.519893899204244, "grad_norm": 3.5200612545013428, "learning_rate": 3.768357810413886e-05, "loss": 1.5985, "step": 1900 }, { "epoch": 2.6525198938992043, "grad_norm": 4.220067024230957, "learning_rate": 3.701602136181576e-05, "loss": 1.5914, "step": 2000 }, { "epoch": 2.7851458885941645, "grad_norm": 3.168060302734375, "learning_rate": 3.636181575433912e-05, "loss": 1.6099, "step": 2100 }, { "epoch": 2.9177718832891246, "grad_norm": 4.414212226867676, "learning_rate": 3.5700934579439256e-05, "loss": 1.6376, "step": 2200 }, { "epoch": 3.0, "eval_gen_len": 127.821, "eval_loss": 1.5945690870285034, "eval_rouge1": 42.6661, "eval_rouge2": 16.1955, "eval_rougeL": 27.2585, "eval_rougeLsum": 39.7771, "eval_runtime": 599.573, "eval_samples_per_second": 1.668, "eval_steps_per_second": 0.208, "step": 2262 }, { "epoch": 3.050397877984085, "grad_norm": 3.4827208518981934, "learning_rate": 3.5033377837116156e-05, "loss": 1.6044, "step": 2300 }, { "epoch": 3.183023872679045, "grad_norm": 4.390041351318359, "learning_rate": 3.436582109479306e-05, "loss": 1.5388, "step": 2400 }, { "epoch": 3.315649867374005, "grad_norm": 2.9073328971862793, "learning_rate": 3.369826435246996e-05, "loss": 1.5563, "step": 2500 }, { "epoch": 3.4482758620689653, "grad_norm": 3.0479063987731934, "learning_rate": 3.303070761014686e-05, "loss": 1.5731, "step": 2600 }, { "epoch": 3.5809018567639255, "grad_norm": 2.9282639026641846, "learning_rate": 3.236315086782377e-05, "loss": 1.5433, "step": 2700 }, { "epoch": 3.713527851458886, "grad_norm": 4.056313514709473, "learning_rate": 3.169559412550067e-05, "loss": 1.5509, "step": 2800 }, { "epoch": 3.8461538461538463, "grad_norm": 2.7992489337921143, "learning_rate": 3.102803738317757e-05, "loss": 1.5407, "step": 2900 }, { "epoch": 3.9787798408488064, "grad_norm": 4.021698474884033, "learning_rate": 3.0360480640854473e-05, "loss": 1.5609, "step": 3000 }, { "epoch": 4.0, "eval_gen_len": 127.946, "eval_loss": 1.5801172256469727, "eval_rouge1": 42.7385, "eval_rouge2": 16.2447, "eval_rougeL": 27.2408, "eval_rougeLsum": 39.7737, "eval_runtime": 599.4838, "eval_samples_per_second": 1.668, "eval_steps_per_second": 0.209, "step": 3016 }, { "epoch": 4.111405835543766, "grad_norm": 4.38268518447876, "learning_rate": 2.9692923898531376e-05, "loss": 1.4975, "step": 3100 }, { "epoch": 4.244031830238727, "grad_norm": 3.2881453037261963, "learning_rate": 2.9025367156208276e-05, "loss": 1.5086, "step": 3200 }, { "epoch": 4.376657824933687, "grad_norm": 14.788896560668945, "learning_rate": 2.8357810413885183e-05, "loss": 1.5175, "step": 3300 }, { "epoch": 4.5092838196286475, "grad_norm": 3.1153130531311035, "learning_rate": 2.7690253671562083e-05, "loss": 1.498, "step": 3400 }, { "epoch": 4.641909814323608, "grad_norm": 5.817938804626465, "learning_rate": 2.702269692923899e-05, "loss": 1.4979, "step": 3500 }, { "epoch": 4.774535809018568, "grad_norm": 7.6480584144592285, "learning_rate": 2.635514018691589e-05, "loss": 1.507, "step": 3600 }, { "epoch": 4.907161803713528, "grad_norm": 4.517747402191162, "learning_rate": 2.5687583444592793e-05, "loss": 1.5067, "step": 3700 }, { "epoch": 5.0, "eval_gen_len": 127.887, "eval_loss": 1.569287896156311, "eval_rouge1": 43.3524, "eval_rouge2": 16.6388, "eval_rougeL": 27.57, "eval_rougeLsum": 40.4783, "eval_runtime": 598.5089, "eval_samples_per_second": 1.671, "eval_steps_per_second": 0.209, "step": 3770 } ], "logging_steps": 100, "max_steps": 7540, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.708845744422912e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }