{ "best_metric": 0.45652173913043476, "best_model_checkpoint": "SW2-DMAE\\checkpoint-49", "epoch": 34.285714285714285, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 7.9393720626831055, "eval_runtime": 0.2251, "eval_samples_per_second": 204.399, "eval_steps_per_second": 13.33, "step": 3 }, { "epoch": 2.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 7.897927761077881, "eval_runtime": 0.2391, "eval_samples_per_second": 192.425, "eval_steps_per_second": 12.549, "step": 7 }, { "epoch": 2.86, "learning_rate": 4.166666666666667e-05, "loss": 7.935, "step": 10 }, { "epoch": 2.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 7.767155647277832, "eval_runtime": 0.2321, "eval_samples_per_second": 198.231, "eval_steps_per_second": 12.928, "step": 10 }, { "epoch": 4.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 7.219719886779785, "eval_runtime": 0.2351, "eval_samples_per_second": 195.7, "eval_steps_per_second": 12.763, "step": 14 }, { "epoch": 4.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 6.566103935241699, "eval_runtime": 0.2292, "eval_samples_per_second": 200.702, "eval_steps_per_second": 13.089, "step": 17 }, { "epoch": 5.71, "learning_rate": 4.62962962962963e-05, "loss": 7.0143, "step": 20 }, { "epoch": 6.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 5.730363368988037, "eval_runtime": 0.2911, "eval_samples_per_second": 158.04, "eval_steps_per_second": 10.307, "step": 21 }, { "epoch": 6.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 5.118360996246338, "eval_runtime": 0.2291, "eval_samples_per_second": 200.828, "eval_steps_per_second": 13.097, "step": 24 }, { "epoch": 8.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 4.352573394775391, "eval_runtime": 0.2536, "eval_samples_per_second": 181.411, "eval_steps_per_second": 11.831, "step": 28 }, { "epoch": 8.57, "learning_rate": 4.166666666666667e-05, "loss": 4.9972, "step": 30 }, { "epoch": 8.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 3.8116581439971924, "eval_runtime": 0.2421, "eval_samples_per_second": 190.04, "eval_steps_per_second": 12.394, "step": 31 }, { "epoch": 10.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 3.151806354522705, "eval_runtime": 0.2396, "eval_samples_per_second": 192.018, "eval_steps_per_second": 12.523, "step": 35 }, { "epoch": 10.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 2.7124626636505127, "eval_runtime": 0.2381, "eval_samples_per_second": 193.229, "eval_steps_per_second": 12.602, "step": 38 }, { "epoch": 11.43, "learning_rate": 3.7037037037037037e-05, "loss": 3.3803, "step": 40 }, { "epoch": 12.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 2.2254273891448975, "eval_runtime": 0.2416, "eval_samples_per_second": 190.423, "eval_steps_per_second": 12.419, "step": 42 }, { "epoch": 12.86, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.9449864625930786, "eval_runtime": 0.2426, "eval_samples_per_second": 189.639, "eval_steps_per_second": 12.368, "step": 45 }, { "epoch": 14.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.6738574504852295, "eval_runtime": 0.2396, "eval_samples_per_second": 192.014, "eval_steps_per_second": 12.523, "step": 49 }, { "epoch": 14.29, "learning_rate": 3.240740740740741e-05, "loss": 2.0759, "step": 50 }, { "epoch": 14.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.5299274921417236, "eval_runtime": 0.2221, "eval_samples_per_second": 207.16, "eval_steps_per_second": 13.51, "step": 52 }, { "epoch": 16.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.3876434564590454, "eval_runtime": 0.2486, "eval_samples_per_second": 185.061, "eval_steps_per_second": 12.069, "step": 56 }, { "epoch": 16.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.305933952331543, "eval_runtime": 0.2481, "eval_samples_per_second": 185.442, "eval_steps_per_second": 12.094, "step": 59 }, { "epoch": 17.14, "learning_rate": 2.777777777777778e-05, "loss": 1.4466, "step": 60 }, { "epoch": 18.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2341125011444092, "eval_runtime": 0.2951, "eval_samples_per_second": 155.897, "eval_steps_per_second": 10.167, "step": 63 }, { "epoch": 18.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2120041847229004, "eval_runtime": 0.2371, "eval_samples_per_second": 194.049, "eval_steps_per_second": 12.655, "step": 66 }, { "epoch": 20.0, "learning_rate": 2.314814814814815e-05, "loss": 1.2349, "step": 70 }, { "epoch": 20.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2095657587051392, "eval_runtime": 0.2456, "eval_samples_per_second": 187.325, "eval_steps_per_second": 12.217, "step": 70 }, { "epoch": 20.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2118462324142456, "eval_runtime": 0.2391, "eval_samples_per_second": 192.425, "eval_steps_per_second": 12.549, "step": 73 }, { "epoch": 22.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2113664150238037, "eval_runtime": 0.2471, "eval_samples_per_second": 186.183, "eval_steps_per_second": 12.142, "step": 77 }, { "epoch": 22.86, "learning_rate": 1.8518518518518518e-05, "loss": 1.1854, "step": 80 }, { "epoch": 22.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2140562534332275, "eval_runtime": 0.2411, "eval_samples_per_second": 190.828, "eval_steps_per_second": 12.445, "step": 80 }, { "epoch": 24.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.211725115776062, "eval_runtime": 0.2571, "eval_samples_per_second": 178.948, "eval_steps_per_second": 11.671, "step": 84 }, { "epoch": 24.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2102160453796387, "eval_runtime": 0.2521, "eval_samples_per_second": 182.498, "eval_steps_per_second": 11.902, "step": 87 }, { "epoch": 25.71, "learning_rate": 1.388888888888889e-05, "loss": 1.1878, "step": 90 }, { "epoch": 26.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2075891494750977, "eval_runtime": 0.2521, "eval_samples_per_second": 182.498, "eval_steps_per_second": 11.902, "step": 91 }, { "epoch": 26.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2082799673080444, "eval_runtime": 0.2431, "eval_samples_per_second": 189.258, "eval_steps_per_second": 12.343, "step": 94 }, { "epoch": 28.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2129710912704468, "eval_runtime": 0.2271, "eval_samples_per_second": 202.585, "eval_steps_per_second": 13.212, "step": 98 }, { "epoch": 28.57, "learning_rate": 9.259259259259259e-06, "loss": 1.1986, "step": 100 }, { "epoch": 28.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2068954706192017, "eval_runtime": 0.2281, "eval_samples_per_second": 201.708, "eval_steps_per_second": 13.155, "step": 101 }, { "epoch": 30.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2058416604995728, "eval_runtime": 0.2551, "eval_samples_per_second": 180.351, "eval_steps_per_second": 11.762, "step": 105 }, { "epoch": 30.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2070415019989014, "eval_runtime": 0.2396, "eval_samples_per_second": 192.016, "eval_steps_per_second": 12.523, "step": 108 }, { "epoch": 31.43, "learning_rate": 4.6296296296296296e-06, "loss": 1.182, "step": 110 }, { "epoch": 32.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2074666023254395, "eval_runtime": 0.2631, "eval_samples_per_second": 174.865, "eval_steps_per_second": 11.404, "step": 112 }, { "epoch": 32.86, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.2073932886123657, "eval_runtime": 0.2401, "eval_samples_per_second": 191.624, "eval_steps_per_second": 12.497, "step": 115 }, { "epoch": 34.0, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.207228422164917, "eval_runtime": 0.2461, "eval_samples_per_second": 186.949, "eval_steps_per_second": 12.192, "step": 119 }, { "epoch": 34.29, "learning_rate": 0.0, "loss": 1.2064, "step": 120 }, { "epoch": 34.29, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.207200050354004, "eval_runtime": 0.2516, "eval_samples_per_second": 182.854, "eval_steps_per_second": 11.925, "step": 120 }, { "epoch": 34.29, "step": 120, "total_flos": 2.3770905934823424e+17, "train_loss": 2.837039653460185, "train_runtime": 179.4861, "train_samples_per_second": 47.469, "train_steps_per_second": 0.669 } ], "logging_steps": 10, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "total_flos": 2.3770905934823424e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }