| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 372, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.08064516129032258, |
| "grad_norm": 2.805160321645274, |
| "learning_rate": 5e-06, |
| "loss": 0.9727, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.16129032258064516, |
| "grad_norm": 1.9270382397062333, |
| "learning_rate": 5e-06, |
| "loss": 0.8717, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.24193548387096775, |
| "grad_norm": 1.057918769043547, |
| "learning_rate": 5e-06, |
| "loss": 0.83, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 0.966612939522159, |
| "learning_rate": 5e-06, |
| "loss": 0.8074, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.4032258064516129, |
| "grad_norm": 0.8000273000282917, |
| "learning_rate": 5e-06, |
| "loss": 0.788, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.4838709677419355, |
| "grad_norm": 0.783866628727888, |
| "learning_rate": 5e-06, |
| "loss": 0.7788, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5645161290322581, |
| "grad_norm": 0.7831017216368418, |
| "learning_rate": 5e-06, |
| "loss": 0.7687, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.9560148392425689, |
| "learning_rate": 5e-06, |
| "loss": 0.7603, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.7258064516129032, |
| "grad_norm": 0.7918650824417934, |
| "learning_rate": 5e-06, |
| "loss": 0.7564, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.8064516129032258, |
| "grad_norm": 0.7898271934758637, |
| "learning_rate": 5e-06, |
| "loss": 0.7519, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.8870967741935484, |
| "grad_norm": 0.5867946798125393, |
| "learning_rate": 5e-06, |
| "loss": 0.7457, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 0.8366098203850484, |
| "learning_rate": 5e-06, |
| "loss": 0.7458, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.7471059560775757, |
| "eval_runtime": 11.9985, |
| "eval_samples_per_second": 277.369, |
| "eval_steps_per_second": 1.083, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.0483870967741935, |
| "grad_norm": 1.0650310085574715, |
| "learning_rate": 5e-06, |
| "loss": 0.7149, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.129032258064516, |
| "grad_norm": 0.727121943810538, |
| "learning_rate": 5e-06, |
| "loss": 0.6903, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.2096774193548387, |
| "grad_norm": 0.6339466549156947, |
| "learning_rate": 5e-06, |
| "loss": 0.6944, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 0.8086692624435744, |
| "learning_rate": 5e-06, |
| "loss": 0.6924, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.370967741935484, |
| "grad_norm": 0.5491840043564988, |
| "learning_rate": 5e-06, |
| "loss": 0.69, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.4516129032258065, |
| "grad_norm": 0.8274190963187956, |
| "learning_rate": 5e-06, |
| "loss": 0.6899, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.532258064516129, |
| "grad_norm": 0.5342519121078966, |
| "learning_rate": 5e-06, |
| "loss": 0.6952, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "grad_norm": 0.48455883160986385, |
| "learning_rate": 5e-06, |
| "loss": 0.6905, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.6935483870967742, |
| "grad_norm": 0.5798130179708565, |
| "learning_rate": 5e-06, |
| "loss": 0.6918, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.7741935483870968, |
| "grad_norm": 0.7691729478745076, |
| "learning_rate": 5e-06, |
| "loss": 0.6881, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.8548387096774195, |
| "grad_norm": 0.6172716433272999, |
| "learning_rate": 5e-06, |
| "loss": 0.687, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 0.78742771593668, |
| "learning_rate": 5e-06, |
| "loss": 0.6849, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.7319301962852478, |
| "eval_runtime": 12.0, |
| "eval_samples_per_second": 277.333, |
| "eval_steps_per_second": 1.083, |
| "step": 248 |
| }, |
| { |
| "epoch": 2.0161290322580645, |
| "grad_norm": 1.0092259897740703, |
| "learning_rate": 5e-06, |
| "loss": 0.6754, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.096774193548387, |
| "grad_norm": 0.7032989766617842, |
| "learning_rate": 5e-06, |
| "loss": 0.6395, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.1774193548387095, |
| "grad_norm": 0.632979906199952, |
| "learning_rate": 5e-06, |
| "loss": 0.6373, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.258064516129032, |
| "grad_norm": 0.7615512869416773, |
| "learning_rate": 5e-06, |
| "loss": 0.6389, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.338709677419355, |
| "grad_norm": 0.5250121544786817, |
| "learning_rate": 5e-06, |
| "loss": 0.6405, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.4193548387096775, |
| "grad_norm": 0.6442525531598334, |
| "learning_rate": 5e-06, |
| "loss": 0.637, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.822258862773572, |
| "learning_rate": 5e-06, |
| "loss": 0.6425, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 0.6138611952424787, |
| "learning_rate": 5e-06, |
| "loss": 0.6414, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.661290322580645, |
| "grad_norm": 0.6395065998468912, |
| "learning_rate": 5e-06, |
| "loss": 0.6382, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.741935483870968, |
| "grad_norm": 0.61108863793147, |
| "learning_rate": 5e-06, |
| "loss": 0.6415, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.8225806451612905, |
| "grad_norm": 0.6362018532608607, |
| "learning_rate": 5e-06, |
| "loss": 0.6429, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.903225806451613, |
| "grad_norm": 0.664123527978051, |
| "learning_rate": 5e-06, |
| "loss": 0.638, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.9838709677419355, |
| "grad_norm": 0.6724728395726959, |
| "learning_rate": 5e-06, |
| "loss": 0.6404, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.7328692078590393, |
| "eval_runtime": 11.8309, |
| "eval_samples_per_second": 281.297, |
| "eval_steps_per_second": 1.099, |
| "step": 372 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 372, |
| "total_flos": 623113855303680.0, |
| "train_loss": 0.7086310264884784, |
| "train_runtime": 2594.5447, |
| "train_samples_per_second": 73.106, |
| "train_steps_per_second": 0.143 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 372, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 623113855303680.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|