{ "best_metric": 0.4982830286026001, "best_model_checkpoint": "./results/checkpoint-218", "epoch": 12.0, "eval_steps": 500, "global_step": 327, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.37, "grad_norm": 0.376583456993103, "learning_rate": 0.0002, "loss": 1.4678, "step": 10 }, { "epoch": 0.73, "grad_norm": 0.5780853629112244, "learning_rate": 0.0002, "loss": 1.473, "step": 20 }, { "epoch": 0.99, "eval_loss": 1.2861968278884888, "eval_runtime": 122.192, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 27 }, { "epoch": 1.1, "grad_norm": 0.5245345830917358, "learning_rate": 0.0002, "loss": 1.3456, "step": 30 }, { "epoch": 1.47, "grad_norm": 0.6904464364051819, "learning_rate": 0.0002, "loss": 1.1644, "step": 40 }, { "epoch": 1.83, "grad_norm": 0.6669853925704956, "learning_rate": 0.0002, "loss": 1.0884, "step": 50 }, { "epoch": 1.98, "eval_loss": 1.0433109998703003, "eval_runtime": 122.2126, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 54 }, { "epoch": 2.2, "grad_norm": 0.6909930109977722, "learning_rate": 0.0002, "loss": 0.898, "step": 60 }, { "epoch": 2.57, "grad_norm": 0.733932375907898, "learning_rate": 0.0002, "loss": 0.8057, "step": 70 }, { "epoch": 2.94, "grad_norm": 0.8011370897293091, "learning_rate": 0.0002, "loss": 0.7697, "step": 80 }, { "epoch": 2.97, "eval_loss": 0.8525019884109497, "eval_runtime": 122.2258, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 81 }, { "epoch": 3.3, "grad_norm": 0.7747021913528442, "learning_rate": 0.0002, "loss": 0.5784, "step": 90 }, { "epoch": 3.67, "grad_norm": 0.6826988458633423, "learning_rate": 0.0002, "loss": 0.5246, "step": 100 }, { "epoch": 4.0, "eval_loss": 0.7216689586639404, "eval_runtime": 122.1337, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 109 }, { "epoch": 4.04, "grad_norm": 0.8631507158279419, "learning_rate": 0.0002, "loss": 0.5263, "step": 110 }, { "epoch": 4.4, "grad_norm": 0.7449190020561218, "learning_rate": 0.0002, "loss": 0.3803, "step": 120 }, { "epoch": 4.77, "grad_norm": 0.7547981142997742, "learning_rate": 0.0002, "loss": 0.3664, "step": 130 }, { "epoch": 4.99, "eval_loss": 0.636491596698761, "eval_runtime": 122.2077, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 136 }, { "epoch": 5.14, "grad_norm": 0.8107866644859314, "learning_rate": 0.0002, "loss": 0.3591, "step": 140 }, { "epoch": 5.5, "grad_norm": 0.7747142314910889, "learning_rate": 0.0002, "loss": 0.2611, "step": 150 }, { "epoch": 5.87, "grad_norm": 0.66695237159729, "learning_rate": 0.0002, "loss": 0.2933, "step": 160 }, { "epoch": 5.98, "eval_loss": 0.5818299055099487, "eval_runtime": 122.1387, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 163 }, { "epoch": 6.24, "grad_norm": 0.6931923031806946, "learning_rate": 0.0002, "loss": 0.2187, "step": 170 }, { "epoch": 6.61, "grad_norm": 0.6592116355895996, "learning_rate": 0.0002, "loss": 0.2054, "step": 180 }, { "epoch": 6.97, "grad_norm": 0.5976551175117493, "learning_rate": 0.0002, "loss": 0.2359, "step": 190 }, { "epoch": 6.97, "eval_loss": 0.5233004689216614, "eval_runtime": 122.2093, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 190 }, { "epoch": 7.34, "grad_norm": 0.7745209336280823, "learning_rate": 0.0002, "loss": 0.1512, "step": 200 }, { "epoch": 7.71, "grad_norm": 0.6759055256843567, "learning_rate": 0.0002, "loss": 0.1816, "step": 210 }, { "epoch": 8.0, "eval_loss": 0.4982830286026001, "eval_runtime": 122.1774, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 218 }, { "epoch": 8.07, "grad_norm": 0.6047687530517578, "learning_rate": 0.0002, "loss": 0.1495, "step": 220 }, { "epoch": 8.44, "grad_norm": 0.8196636438369751, "learning_rate": 0.0002, "loss": 0.1141, "step": 230 }, { "epoch": 8.81, "grad_norm": 0.5866973996162415, "learning_rate": 0.0002, "loss": 0.1403, "step": 240 }, { "epoch": 8.99, "eval_loss": 0.5128377676010132, "eval_runtime": 122.2482, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 245 }, { "epoch": 9.17, "grad_norm": 0.537775456905365, "learning_rate": 0.0002, "loss": 0.1013, "step": 250 }, { "epoch": 9.54, "grad_norm": 0.5751469135284424, "learning_rate": 0.0002, "loss": 0.0721, "step": 260 }, { "epoch": 9.91, "grad_norm": 0.7250769138336182, "learning_rate": 0.0002, "loss": 0.1221, "step": 270 }, { "epoch": 9.98, "eval_loss": 0.5187135338783264, "eval_runtime": 122.1638, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 272 }, { "epoch": 10.28, "grad_norm": 0.564387321472168, "learning_rate": 0.0002, "loss": 0.0788, "step": 280 }, { "epoch": 10.64, "grad_norm": 0.6789527535438538, "learning_rate": 0.0002, "loss": 0.0895, "step": 290 }, { "epoch": 10.97, "eval_loss": 0.5100827217102051, "eval_runtime": 122.168, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 299 }, { "epoch": 11.01, "grad_norm": 0.5745537281036377, "learning_rate": 0.0002, "loss": 0.081, "step": 300 }, { "epoch": 11.38, "grad_norm": 0.6635628342628479, "learning_rate": 0.0002, "loss": 0.071, "step": 310 }, { "epoch": 11.74, "grad_norm": 0.5656697154045105, "learning_rate": 0.0002, "loss": 0.076, "step": 320 }, { "epoch": 12.0, "eval_loss": 0.5288968682289124, "eval_runtime": 122.2102, "eval_samples_per_second": 0.892, "eval_steps_per_second": 0.115, "step": 327 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 19, "save_steps": 10, "total_flos": 1.1527346189028557e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }