| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 660, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.22727272727272727, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 4.99840706414248e-05, | |
| "loss": 0.442, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.45454545454545453, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 4.993630286525634e-05, | |
| "loss": 0.2502, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6818181818181818, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 4.985675754429744e-05, | |
| "loss": 0.2375, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.9745536047023324e-05, | |
| "loss": 0.2422, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.2382592111825943, | |
| "eval_runtime": 55.0686, | |
| "eval_samples_per_second": 14.582, | |
| "eval_steps_per_second": 0.472, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.1363636363636362, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 4.96027801084029e-05, | |
| "loss": 0.227, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 4.942867164927899e-05, | |
| "loss": 0.2144, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.5909090909090908, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 4.922343254453768e-05, | |
| "loss": 0.2207, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 4.898732434036244e-05, | |
| "loss": 0.2146, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.22858716547489166, | |
| "eval_runtime": 68.0042, | |
| "eval_samples_per_second": 11.808, | |
| "eval_steps_per_second": 0.382, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.0454545454545454, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 4.872064792093299e-05, | |
| "loss": 0.2053, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.2727272727272725, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 4.842374312499405e-05, | |
| "loss": 0.1972, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.8096988312782174e-05, | |
| "loss": 0.2116, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.774079988386296e-05, | |
| "loss": 0.1983, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.9545454545454546, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 4.735563174649278e-05, | |
| "loss": 0.1906, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.22559815645217896, | |
| "eval_runtime": 55.1364, | |
| "eval_samples_per_second": 14.564, | |
| "eval_steps_per_second": 0.472, | |
| "step": 660 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 4400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "total_flos": 4.643078518509404e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |