{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.592910848549947,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.34,
      "grad_norm": 0.6707318425178528,
      "learning_rate": 1.97816091954023e-05,
      "loss": 2.2888,
      "step": 20
    },
    {
      "epoch": 0.69,
      "grad_norm": null,
      "learning_rate": 1.9574712643678162e-05,
      "loss": 2.0848,
      "step": 40
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.7721680402755737,
      "learning_rate": 1.9344827586206897e-05,
      "loss": 2.0044,
      "step": 60
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.1140433549880981,
      "learning_rate": 1.9126436781609195e-05,
      "loss": 1.8016,
      "step": 80
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.7205075621604919,
      "learning_rate": 1.8896551724137934e-05,
      "loss": 1.7217,
      "step": 100
    },
    {
      "epoch": 2.06,
      "grad_norm": 0.8933233618736267,
      "learning_rate": 1.866666666666667e-05,
      "loss": 1.5705,
      "step": 120
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.7114273905754089,
      "learning_rate": 1.8436781609195404e-05,
      "loss": 1.4006,
      "step": 140
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.7229479551315308,
      "learning_rate": 1.820689655172414e-05,
      "loss": 1.3137,
      "step": 160
    },
    {
      "epoch": 3.09,
      "grad_norm": 0.9370490908622742,
      "learning_rate": 1.7977011494252874e-05,
      "loss": 1.1898,
      "step": 180
    },
    {
      "epoch": 3.44,
      "grad_norm": 0.6051978468894958,
      "learning_rate": 1.774712643678161e-05,
      "loss": 1.1229,
      "step": 200
    },
    {
      "epoch": 3.78,
      "grad_norm": 0.6857028007507324,
      "learning_rate": 1.7517241379310347e-05,
      "loss": 1.051,
      "step": 220
    },
    {
      "epoch": 4.12,
      "grad_norm": 0.6715748310089111,
      "learning_rate": 1.7287356321839082e-05,
      "loss": 0.9894,
      "step": 240
    },
    {
      "epoch": 4.47,
      "grad_norm": 0.5918118953704834,
      "learning_rate": 1.7057471264367817e-05,
      "loss": 0.9687,
      "step": 260
    },
    {
      "epoch": 4.81,
      "grad_norm": 0.6621690392494202,
      "learning_rate": 1.6827586206896552e-05,
      "loss": 0.9199,
      "step": 280
    },
    {
      "epoch": 5.16,
      "grad_norm": 0.6697206497192383,
      "learning_rate": 1.659770114942529e-05,
      "loss": 0.9303,
      "step": 300
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.8184316158294678,
      "learning_rate": 1.6367816091954025e-05,
      "loss": 0.8898,
      "step": 320
    },
    {
      "epoch": 5.84,
      "grad_norm": 0.6429987549781799,
      "learning_rate": 1.613793103448276e-05,
      "loss": 0.8623,
      "step": 340
    },
    {
      "epoch": 6.19,
      "grad_norm": 0.7518043518066406,
      "learning_rate": 1.5908045977011495e-05,
      "loss": 0.8239,
      "step": 360
    },
    {
      "epoch": 6.53,
      "grad_norm": 0.6667824983596802,
      "learning_rate": 1.567816091954023e-05,
      "loss": 0.8119,
      "step": 380
    },
    {
      "epoch": 6.87,
      "grad_norm": 0.8569457530975342,
      "learning_rate": 1.5448275862068965e-05,
      "loss": 0.8139,
      "step": 400
    },
    {
      "epoch": 7.22,
      "grad_norm": 0.7754850387573242,
      "learning_rate": 1.5218390804597702e-05,
      "loss": 0.7835,
      "step": 420
    },
    {
      "epoch": 7.56,
      "grad_norm": 1.159196138381958,
      "learning_rate": 1.4988505747126439e-05,
      "loss": 0.7546,
      "step": 440
    },
    {
      "epoch": 7.91,
      "grad_norm": 1.119764804840088,
      "learning_rate": 1.4758620689655174e-05,
      "loss": 0.7571,
      "step": 460
    },
    {
      "epoch": 8.25,
      "grad_norm": 1.3600786924362183,
      "learning_rate": 1.452873563218391e-05,
      "loss": 0.7451,
      "step": 480
    },
    {
      "epoch": 8.59,
      "grad_norm": 0.7608994245529175,
      "learning_rate": 1.4298850574712644e-05,
      "loss": 0.7109,
      "step": 500
    }
  ],
  "logging_steps": 20,
  "max_steps": 1740,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 30,
  "save_steps": 500,
  "total_flos": 1.2995638935552e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}