{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0041928721174003, "eval_steps": 500, "global_step": 239, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.041928721174004195, "grad_norm": 0.011030412279069424, "learning_rate": 0.00025, "loss": 11.9318, "step": 5 }, { "epoch": 0.08385744234800839, "grad_norm": 0.014100322499871254, "learning_rate": 0.00024465811965811965, "loss": 11.9305, "step": 10 }, { "epoch": 0.12578616352201258, "grad_norm": 0.017396269366145134, "learning_rate": 0.00023931623931623932, "loss": 11.9291, "step": 15 }, { "epoch": 0.16771488469601678, "grad_norm": 0.022825436666607857, "learning_rate": 0.000233974358974359, "loss": 11.9293, "step": 20 }, { "epoch": 0.20964360587002095, "grad_norm": 0.030763259157538414, "learning_rate": 0.00022863247863247864, "loss": 11.928, "step": 25 }, { "epoch": 0.25157232704402516, "grad_norm": 0.05623968690633774, "learning_rate": 0.0002232905982905983, "loss": 11.9273, "step": 30 }, { "epoch": 0.29350104821802936, "grad_norm": 0.0468871183693409, "learning_rate": 0.00021794871794871795, "loss": 11.9263, "step": 35 }, { "epoch": 0.33542976939203356, "grad_norm": 0.05555358901619911, "learning_rate": 0.0002126068376068376, "loss": 11.9248, "step": 40 }, { "epoch": 0.37735849056603776, "grad_norm": 0.0784514918923378, "learning_rate": 0.00020726495726495727, "loss": 11.9244, "step": 45 }, { "epoch": 0.4192872117400419, "grad_norm": 0.05951184406876564, "learning_rate": 0.00020192307692307694, "loss": 11.9228, "step": 50 }, { "epoch": 0.4612159329140461, "grad_norm": 0.057042159140110016, "learning_rate": 0.00019658119658119659, "loss": 11.9221, "step": 55 }, { "epoch": 0.5031446540880503, "grad_norm": 0.04163195937871933, "learning_rate": 0.00019123931623931623, "loss": 11.9225, "step": 60 }, { "epoch": 0.5450733752620545, "grad_norm": 0.03262303024530411, "learning_rate": 0.0001858974358974359, "loss": 11.9226, "step": 65 }, { "epoch": 0.5870020964360587, "grad_norm": 0.05241989716887474, "learning_rate": 0.00018055555555555555, "loss": 11.922, "step": 70 }, { "epoch": 0.6289308176100629, "grad_norm": 0.06784799695014954, "learning_rate": 0.00017521367521367522, "loss": 11.9214, "step": 75 }, { "epoch": 0.6708595387840671, "grad_norm": 0.042793747037649155, "learning_rate": 0.0001698717948717949, "loss": 11.9183, "step": 80 }, { "epoch": 0.7127882599580713, "grad_norm": 0.0430237241089344, "learning_rate": 0.00016452991452991454, "loss": 11.9216, "step": 85 }, { "epoch": 0.7547169811320755, "grad_norm": 0.03868071734905243, "learning_rate": 0.00015918803418803418, "loss": 11.9194, "step": 90 }, { "epoch": 0.7966457023060797, "grad_norm": 0.024328265339136124, "learning_rate": 0.00015384615384615385, "loss": 11.9217, "step": 95 }, { "epoch": 0.8385744234800838, "grad_norm": 0.04353172332048416, "learning_rate": 0.0001485042735042735, "loss": 11.9212, "step": 100 }, { "epoch": 0.8805031446540881, "grad_norm": 0.057023949921131134, "learning_rate": 0.00014316239316239317, "loss": 11.92, "step": 105 }, { "epoch": 0.9224318658280922, "grad_norm": 0.039732299745082855, "learning_rate": 0.00013782051282051284, "loss": 11.9183, "step": 110 }, { "epoch": 0.9643605870020965, "grad_norm": 0.0544021911919117, "learning_rate": 0.00013247863247863248, "loss": 11.9203, "step": 115 }, { "epoch": 0.9979035639412998, "eval_loss": 11.919066429138184, "eval_runtime": 0.416, "eval_samples_per_second": 242.779, "eval_steps_per_second": 62.498, "step": 119 }, { "epoch": 1.0062893081761006, "grad_norm": 0.10388734191656113, "learning_rate": 0.00012713675213675213, "loss": 13.7207, "step": 120 }, { "epoch": 1.0482180293501049, "grad_norm": 0.028476731851696968, "learning_rate": 0.0001217948717948718, "loss": 11.9223, "step": 125 }, { "epoch": 1.090146750524109, "grad_norm": 0.0434449277818203, "learning_rate": 0.00011645299145299146, "loss": 11.9223, "step": 130 }, { "epoch": 1.1320754716981132, "grad_norm": 0.09088350832462311, "learning_rate": 0.0001111111111111111, "loss": 11.9366, "step": 135 }, { "epoch": 1.1740041928721174, "grad_norm": 0.07184627652168274, "learning_rate": 0.00010576923076923077, "loss": 11.5661, "step": 140 }, { "epoch": 1.2159329140461215, "grad_norm": 0.04500441253185272, "learning_rate": 0.00010042735042735043, "loss": 12.3022, "step": 145 }, { "epoch": 1.2578616352201257, "grad_norm": 0.029749717563390732, "learning_rate": 9.508547008547008e-05, "loss": 11.9321, "step": 150 }, { "epoch": 1.29979035639413, "grad_norm": 0.046956080943346024, "learning_rate": 8.974358974358975e-05, "loss": 11.8407, "step": 155 }, { "epoch": 1.3417190775681342, "grad_norm": 0.06576091051101685, "learning_rate": 8.440170940170941e-05, "loss": 11.9356, "step": 160 }, { "epoch": 1.3836477987421385, "grad_norm": 0.056520890444517136, "learning_rate": 7.905982905982905e-05, "loss": 11.9497, "step": 165 }, { "epoch": 1.4255765199161425, "grad_norm": 0.05084730684757233, "learning_rate": 7.371794871794872e-05, "loss": 11.5847, "step": 170 }, { "epoch": 1.4675052410901468, "grad_norm": 0.03961843624711037, "learning_rate": 6.837606837606838e-05, "loss": 12.261, "step": 175 }, { "epoch": 1.509433962264151, "grad_norm": 0.03475997969508171, "learning_rate": 6.303418803418804e-05, "loss": 11.8777, "step": 180 }, { "epoch": 1.551362683438155, "grad_norm": 0.028086921200156212, "learning_rate": 5.76923076923077e-05, "loss": 11.9761, "step": 185 }, { "epoch": 1.5932914046121593, "grad_norm": 0.046144578605890274, "learning_rate": 5.2350427350427356e-05, "loss": 11.8616, "step": 190 }, { "epoch": 1.6352201257861636, "grad_norm": 0.05854855850338936, "learning_rate": 4.700854700854701e-05, "loss": 11.9751, "step": 195 }, { "epoch": 1.6771488469601676, "grad_norm": 0.035215962678194046, "learning_rate": 4.1666666666666665e-05, "loss": 11.9493, "step": 200 }, { "epoch": 1.719077568134172, "grad_norm": 0.06034635007381439, "learning_rate": 3.632478632478633e-05, "loss": 11.8182, "step": 205 }, { "epoch": 1.7610062893081762, "grad_norm": 0.027154497802257538, "learning_rate": 3.098290598290598e-05, "loss": 11.9445, "step": 210 }, { "epoch": 1.8029350104821802, "grad_norm": 0.055567361414432526, "learning_rate": 2.564102564102564e-05, "loss": 11.9099, "step": 215 }, { "epoch": 1.8448637316561844, "grad_norm": 0.04124658182263374, "learning_rate": 2.02991452991453e-05, "loss": 11.7515, "step": 220 }, { "epoch": 1.8867924528301887, "grad_norm": 0.047468505799770355, "learning_rate": 1.4957264957264958e-05, "loss": 12.3221, "step": 225 }, { "epoch": 1.9287211740041927, "grad_norm": 0.04315986856818199, "learning_rate": 9.615384615384616e-06, "loss": 11.6918, "step": 230 }, { "epoch": 1.9706498951781972, "grad_norm": 0.08445514738559723, "learning_rate": 4.273504273504274e-06, "loss": 12.0431, "step": 235 }, { "epoch": 1.9958071278825997, "eval_loss": 11.917438507080078, "eval_runtime": 0.4203, "eval_samples_per_second": 240.321, "eval_steps_per_second": 61.865, "step": 238 }, { "epoch": 2.0041928721174003, "eval_loss": 11.917825698852539, "eval_runtime": 0.4191, "eval_samples_per_second": 241.009, "eval_steps_per_second": 62.042, "step": 239 } ], "logging_steps": 5, "max_steps": 239, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 269012385792.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }