{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0041928721174003,
  "eval_steps": 500,
  "global_step": 239,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.041928721174004195,
      "grad_norm": 0.011030412279069424,
      "learning_rate": 0.00025,
      "loss": 11.9318,
      "step": 5
    },
    {
      "epoch": 0.08385744234800839,
      "grad_norm": 0.014100322499871254,
      "learning_rate": 0.00024465811965811965,
      "loss": 11.9305,
      "step": 10
    },
    {
      "epoch": 0.12578616352201258,
      "grad_norm": 0.017396269366145134,
      "learning_rate": 0.00023931623931623932,
      "loss": 11.9291,
      "step": 15
    },
    {
      "epoch": 0.16771488469601678,
      "grad_norm": 0.022825436666607857,
      "learning_rate": 0.000233974358974359,
      "loss": 11.9293,
      "step": 20
    },
    {
      "epoch": 0.20964360587002095,
      "grad_norm": 0.030763259157538414,
      "learning_rate": 0.00022863247863247864,
      "loss": 11.928,
      "step": 25
    },
    {
      "epoch": 0.25157232704402516,
      "grad_norm": 0.05623968690633774,
      "learning_rate": 0.0002232905982905983,
      "loss": 11.9273,
      "step": 30
    },
    {
      "epoch": 0.29350104821802936,
      "grad_norm": 0.0468871183693409,
      "learning_rate": 0.00021794871794871795,
      "loss": 11.9263,
      "step": 35
    },
    {
      "epoch": 0.33542976939203356,
      "grad_norm": 0.05555358901619911,
      "learning_rate": 0.0002126068376068376,
      "loss": 11.9248,
      "step": 40
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 0.0784514918923378,
      "learning_rate": 0.00020726495726495727,
      "loss": 11.9244,
      "step": 45
    },
    {
      "epoch": 0.4192872117400419,
      "grad_norm": 0.05951184406876564,
      "learning_rate": 0.00020192307692307694,
      "loss": 11.9228,
      "step": 50
    },
    {
      "epoch": 0.4612159329140461,
      "grad_norm": 0.057042159140110016,
      "learning_rate": 0.00019658119658119659,
      "loss": 11.9221,
      "step": 55
    },
    {
      "epoch": 0.5031446540880503,
      "grad_norm": 0.04163195937871933,
      "learning_rate": 0.00019123931623931623,
      "loss": 11.9225,
      "step": 60
    },
    {
      "epoch": 0.5450733752620545,
      "grad_norm": 0.03262303024530411,
      "learning_rate": 0.0001858974358974359,
      "loss": 11.9226,
      "step": 65
    },
    {
      "epoch": 0.5870020964360587,
      "grad_norm": 0.05241989716887474,
      "learning_rate": 0.00018055555555555555,
      "loss": 11.922,
      "step": 70
    },
    {
      "epoch": 0.6289308176100629,
      "grad_norm": 0.06784799695014954,
      "learning_rate": 0.00017521367521367522,
      "loss": 11.9214,
      "step": 75
    },
    {
      "epoch": 0.6708595387840671,
      "grad_norm": 0.042793747037649155,
      "learning_rate": 0.0001698717948717949,
      "loss": 11.9183,
      "step": 80
    },
    {
      "epoch": 0.7127882599580713,
      "grad_norm": 0.0430237241089344,
      "learning_rate": 0.00016452991452991454,
      "loss": 11.9216,
      "step": 85
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 0.03868071734905243,
      "learning_rate": 0.00015918803418803418,
      "loss": 11.9194,
      "step": 90
    },
    {
      "epoch": 0.7966457023060797,
      "grad_norm": 0.024328265339136124,
      "learning_rate": 0.00015384615384615385,
      "loss": 11.9217,
      "step": 95
    },
    {
      "epoch": 0.8385744234800838,
      "grad_norm": 0.04353172332048416,
      "learning_rate": 0.0001485042735042735,
      "loss": 11.9212,
      "step": 100
    },
    {
      "epoch": 0.8805031446540881,
      "grad_norm": 0.057023949921131134,
      "learning_rate": 0.00014316239316239317,
      "loss": 11.92,
      "step": 105
    },
    {
      "epoch": 0.9224318658280922,
      "grad_norm": 0.039732299745082855,
      "learning_rate": 0.00013782051282051284,
      "loss": 11.9183,
      "step": 110
    },
    {
      "epoch": 0.9643605870020965,
      "grad_norm": 0.0544021911919117,
      "learning_rate": 0.00013247863247863248,
      "loss": 11.9203,
      "step": 115
    },
    {
      "epoch": 0.9979035639412998,
      "eval_loss": 11.919066429138184,
      "eval_runtime": 0.416,
      "eval_samples_per_second": 242.779,
      "eval_steps_per_second": 62.498,
      "step": 119
    },
    {
      "epoch": 1.0062893081761006,
      "grad_norm": 0.10388734191656113,
      "learning_rate": 0.00012713675213675213,
      "loss": 13.7207,
      "step": 120
    },
    {
      "epoch": 1.0482180293501049,
      "grad_norm": 0.028476731851696968,
      "learning_rate": 0.0001217948717948718,
      "loss": 11.9223,
      "step": 125
    },
    {
      "epoch": 1.090146750524109,
      "grad_norm": 0.0434449277818203,
      "learning_rate": 0.00011645299145299146,
      "loss": 11.9223,
      "step": 130
    },
    {
      "epoch": 1.1320754716981132,
      "grad_norm": 0.09088350832462311,
      "learning_rate": 0.0001111111111111111,
      "loss": 11.9366,
      "step": 135
    },
    {
      "epoch": 1.1740041928721174,
      "grad_norm": 0.07184627652168274,
      "learning_rate": 0.00010576923076923077,
      "loss": 11.5661,
      "step": 140
    },
    {
      "epoch": 1.2159329140461215,
      "grad_norm": 0.04500441253185272,
      "learning_rate": 0.00010042735042735043,
      "loss": 12.3022,
      "step": 145
    },
    {
      "epoch": 1.2578616352201257,
      "grad_norm": 0.029749717563390732,
      "learning_rate": 9.508547008547008e-05,
      "loss": 11.9321,
      "step": 150
    },
    {
      "epoch": 1.29979035639413,
      "grad_norm": 0.046956080943346024,
      "learning_rate": 8.974358974358975e-05,
      "loss": 11.8407,
      "step": 155
    },
    {
      "epoch": 1.3417190775681342,
      "grad_norm": 0.06576091051101685,
      "learning_rate": 8.440170940170941e-05,
      "loss": 11.9356,
      "step": 160
    },
    {
      "epoch": 1.3836477987421385,
      "grad_norm": 0.056520890444517136,
      "learning_rate": 7.905982905982905e-05,
      "loss": 11.9497,
      "step": 165
    },
    {
      "epoch": 1.4255765199161425,
      "grad_norm": 0.05084730684757233,
      "learning_rate": 7.371794871794872e-05,
      "loss": 11.5847,
      "step": 170
    },
    {
      "epoch": 1.4675052410901468,
      "grad_norm": 0.03961843624711037,
      "learning_rate": 6.837606837606838e-05,
      "loss": 12.261,
      "step": 175
    },
    {
      "epoch": 1.509433962264151,
      "grad_norm": 0.03475997969508171,
      "learning_rate": 6.303418803418804e-05,
      "loss": 11.8777,
      "step": 180
    },
    {
      "epoch": 1.551362683438155,
      "grad_norm": 0.028086921200156212,
      "learning_rate": 5.76923076923077e-05,
      "loss": 11.9761,
      "step": 185
    },
    {
      "epoch": 1.5932914046121593,
      "grad_norm": 0.046144578605890274,
      "learning_rate": 5.2350427350427356e-05,
      "loss": 11.8616,
      "step": 190
    },
    {
      "epoch": 1.6352201257861636,
      "grad_norm": 0.05854855850338936,
      "learning_rate": 4.700854700854701e-05,
      "loss": 11.9751,
      "step": 195
    },
    {
      "epoch": 1.6771488469601676,
      "grad_norm": 0.035215962678194046,
      "learning_rate": 4.1666666666666665e-05,
      "loss": 11.9493,
      "step": 200
    },
    {
      "epoch": 1.719077568134172,
      "grad_norm": 0.06034635007381439,
      "learning_rate": 3.632478632478633e-05,
      "loss": 11.8182,
      "step": 205
    },
    {
      "epoch": 1.7610062893081762,
      "grad_norm": 0.027154497802257538,
      "learning_rate": 3.098290598290598e-05,
      "loss": 11.9445,
      "step": 210
    },
    {
      "epoch": 1.8029350104821802,
      "grad_norm": 0.055567361414432526,
      "learning_rate": 2.564102564102564e-05,
      "loss": 11.9099,
      "step": 215
    },
    {
      "epoch": 1.8448637316561844,
      "grad_norm": 0.04124658182263374,
      "learning_rate": 2.02991452991453e-05,
      "loss": 11.7515,
      "step": 220
    },
    {
      "epoch": 1.8867924528301887,
      "grad_norm": 0.047468505799770355,
      "learning_rate": 1.4957264957264958e-05,
      "loss": 12.3221,
      "step": 225
    },
    {
      "epoch": 1.9287211740041927,
      "grad_norm": 0.04315986856818199,
      "learning_rate": 9.615384615384616e-06,
      "loss": 11.6918,
      "step": 230
    },
    {
      "epoch": 1.9706498951781972,
      "grad_norm": 0.08445514738559723,
      "learning_rate": 4.273504273504274e-06,
      "loss": 12.0431,
      "step": 235
    },
    {
      "epoch": 1.9958071278825997,
      "eval_loss": 11.917438507080078,
      "eval_runtime": 0.4203,
      "eval_samples_per_second": 240.321,
      "eval_steps_per_second": 61.865,
      "step": 238
    },
    {
      "epoch": 2.0041928721174003,
      "eval_loss": 11.917825698852539,
      "eval_runtime": 0.4191,
      "eval_samples_per_second": 241.009,
      "eval_steps_per_second": 62.042,
      "step": 239
    }
  ],
  "logging_steps": 5,
  "max_steps": 239,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 269012385792.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}