{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996941896024465,
  "eval_steps": 500,
  "global_step": 735,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1,
      "grad_norm": 0.2513757646083832,
      "learning_rate": 1.0135135135135135e-05,
      "loss": 1.5178,
      "step": 25
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.38634929060935974,
      "learning_rate": 2.027027027027027e-05,
      "loss": 1.4603,
      "step": 50
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.5580678582191467,
      "learning_rate": 2.995461422087746e-05,
      "loss": 1.3665,
      "step": 75
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.6025068163871765,
      "learning_rate": 2.881996974281392e-05,
      "loss": 1.2772,
      "step": 100
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.6379350423812866,
      "learning_rate": 2.768532526475038e-05,
      "loss": 1.2555,
      "step": 125
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.666475236415863,
      "learning_rate": 2.655068078668684e-05,
      "loss": 1.2434,
      "step": 150
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.7137765288352966,
      "learning_rate": 2.54160363086233e-05,
      "loss": 1.2304,
      "step": 175
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.6858211159706116,
      "learning_rate": 2.428139183055976e-05,
      "loss": 1.1842,
      "step": 200
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.6926656365394592,
      "learning_rate": 2.314674735249622e-05,
      "loss": 1.1929,
      "step": 225
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.6960567235946655,
      "learning_rate": 2.2012102874432675e-05,
      "loss": 1.1852,
      "step": 250
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.6939908266067505,
      "learning_rate": 2.087745839636914e-05,
      "loss": 1.17,
      "step": 275
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.7124253511428833,
      "learning_rate": 1.97428139183056e-05,
      "loss": 1.1512,
      "step": 300
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.6936700344085693,
      "learning_rate": 1.8608169440242055e-05,
      "loss": 1.1686,
      "step": 325
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.7236443758010864,
      "learning_rate": 1.747352496217852e-05,
      "loss": 1.1702,
      "step": 350
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.7483662366867065,
      "learning_rate": 1.6338880484114978e-05,
      "loss": 1.1655,
      "step": 375
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.8027617931365967,
      "learning_rate": 1.5204236006051437e-05,
      "loss": 1.1616,
      "step": 400
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.8072408437728882,
      "learning_rate": 1.4069591527987896e-05,
      "loss": 1.1828,
      "step": 425
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.7527270913124084,
      "learning_rate": 1.2934947049924356e-05,
      "loss": 1.1429,
      "step": 450
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.7795941829681396,
      "learning_rate": 1.1800302571860818e-05,
      "loss": 1.1738,
      "step": 475
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.8151872158050537,
      "learning_rate": 1.0665658093797276e-05,
      "loss": 1.1376,
      "step": 500
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.7859562635421753,
      "learning_rate": 9.531013615733736e-06,
      "loss": 1.1459,
      "step": 525
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.8044187426567078,
      "learning_rate": 8.396369137670198e-06,
      "loss": 1.1523,
      "step": 550
    },
    {
      "epoch": 2.34,
      "grad_norm": 0.8209382891654968,
      "learning_rate": 7.261724659606657e-06,
      "loss": 1.1165,
      "step": 575
    },
    {
      "epoch": 2.45,
      "grad_norm": 0.8191347122192383,
      "learning_rate": 6.127080181543117e-06,
      "loss": 1.1574,
      "step": 600
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.864337682723999,
      "learning_rate": 4.992435703479576e-06,
      "loss": 1.1237,
      "step": 625
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.8072352409362793,
      "learning_rate": 3.857791225416037e-06,
      "loss": 1.1432,
      "step": 650
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.8192684054374695,
      "learning_rate": 2.7231467473524962e-06,
      "loss": 1.1176,
      "step": 675
    },
    {
      "epoch": 2.85,
      "grad_norm": 0.7951787710189819,
      "learning_rate": 1.5885022692889562e-06,
      "loss": 1.1419,
      "step": 700
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.8341733813285828,
      "learning_rate": 4.5385779122541606e-07,
      "loss": 1.1387,
      "step": 725
    }
  ],
  "logging_steps": 25,
  "max_steps": 735,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 4.82271317563392e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}