{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 220,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02280501710376283,
      "grad_norm": 32.61074447631836,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 12.6644,
      "num_input_tokens_seen": 61696,
      "step": 5,
      "train_runtime": 24.3804,
      "train_tokens_per_second": 2530.557
    },
    {
      "epoch": 0.04561003420752566,
      "grad_norm": 18.813081741333008,
      "learning_rate": 6.75e-05,
      "loss": 5.0641,
      "num_input_tokens_seen": 123136,
      "step": 10,
      "train_runtime": 47.2224,
      "train_tokens_per_second": 2607.578
    },
    {
      "epoch": 0.06841505131128849,
      "grad_norm": 6.64304256439209,
      "learning_rate": 0.00010499999999999999,
      "loss": 1.5053,
      "num_input_tokens_seen": 184832,
      "step": 15,
      "train_runtime": 70.2288,
      "train_tokens_per_second": 2631.856
    },
    {
      "epoch": 0.09122006841505131,
      "grad_norm": 1.0877084732055664,
      "learning_rate": 0.0001425,
      "loss": 0.6917,
      "num_input_tokens_seen": 247552,
      "step": 20,
      "train_runtime": 93.5022,
      "train_tokens_per_second": 2647.553
    },
    {
      "epoch": 0.11402508551881414,
      "grad_norm": 2.6032357215881348,
      "learning_rate": 0.00017999999999999998,
      "loss": 0.5721,
      "num_input_tokens_seen": 308992,
      "step": 25,
      "train_runtime": 116.3387,
      "train_tokens_per_second": 2655.97
    },
    {
      "epoch": 0.13683010262257697,
      "grad_norm": 3.105325698852539,
      "learning_rate": 0.00021749999999999997,
      "loss": 0.6254,
      "num_input_tokens_seen": 370176,
      "step": 30,
      "train_runtime": 139.2197,
      "train_tokens_per_second": 2658.935
    },
    {
      "epoch": 0.15963511972633979,
      "grad_norm": 0.9677203297615051,
      "learning_rate": 0.00025499999999999996,
      "loss": 0.6389,
      "num_input_tokens_seen": 430720,
      "step": 35,
      "train_runtime": 161.8821,
      "train_tokens_per_second": 2660.702
    },
    {
      "epoch": 0.18244013683010263,
      "grad_norm": 1.2506942749023438,
      "learning_rate": 0.00029249999999999995,
      "loss": 0.5258,
      "num_input_tokens_seen": 492800,
      "step": 40,
      "train_runtime": 185.2921,
      "train_tokens_per_second": 2659.584
    },
    {
      "epoch": 0.20524515393386544,
      "grad_norm": 6.443251132965088,
      "learning_rate": 0.0002996346075389736,
      "loss": 0.5041,
      "num_input_tokens_seen": 553984,
      "step": 45,
      "train_runtime": 208.2146,
      "train_tokens_per_second": 2660.639
    },
    {
      "epoch": 0.22805017103762829,
      "grad_norm": 0.32593730092048645,
      "learning_rate": 0.00029815325108927063,
      "loss": 0.4831,
      "num_input_tokens_seen": 615040,
      "step": 50,
      "train_runtime": 231.0893,
      "train_tokens_per_second": 2661.482
    },
    {
      "epoch": 0.2508551881413911,
      "grad_norm": 0.46452033519744873,
      "learning_rate": 0.0002955443589413994,
      "loss": 0.4957,
      "num_input_tokens_seen": 676736,
      "step": 55,
      "train_runtime": 254.0788,
      "train_tokens_per_second": 2663.488
    },
    {
      "epoch": 0.27366020524515394,
      "grad_norm": 0.26457586884498596,
      "learning_rate": 0.00029182778633989753,
      "loss": 0.4764,
      "num_input_tokens_seen": 738176,
      "step": 60,
      "train_runtime": 277.0778,
      "train_tokens_per_second": 2664.147
    },
    {
      "epoch": 0.29646522234891676,
      "grad_norm": 0.14807738363742828,
      "learning_rate": 0.0002870318186463901,
      "loss": 0.4829,
      "num_input_tokens_seen": 799488,
      "step": 65,
      "train_runtime": 300.021,
      "train_tokens_per_second": 2664.773
    },
    {
      "epoch": 0.31927023945267957,
      "grad_norm": 0.3857417404651642,
      "learning_rate": 0.00028119295607090933,
      "loss": 0.478,
      "num_input_tokens_seen": 861568,
      "step": 70,
      "train_runtime": 323.133,
      "train_tokens_per_second": 2666.296
    },
    {
      "epoch": 0.34207525655644244,
      "grad_norm": 0.22548428177833557,
      "learning_rate": 0.0002743556358832562,
      "loss": 0.4771,
      "num_input_tokens_seen": 924544,
      "step": 75,
      "train_runtime": 346.7417,
      "train_tokens_per_second": 2666.376
    },
    {
      "epoch": 0.36488027366020526,
      "grad_norm": 0.22088338434696198,
      "learning_rate": 0.0002665718942185456,
      "loss": 0.4657,
      "num_input_tokens_seen": 985472,
      "step": 80,
      "train_runtime": 369.5185,
      "train_tokens_per_second": 2666.908
    },
    {
      "epoch": 0.38768529076396807,
      "grad_norm": 0.3554978668689728,
      "learning_rate": 0.00025790097005079764,
      "loss": 0.4831,
      "num_input_tokens_seen": 1046912,
      "step": 85,
      "train_runtime": 392.5346,
      "train_tokens_per_second": 2667.056
    },
    {
      "epoch": 0.4104903078677309,
      "grad_norm": 0.131392702460289,
      "learning_rate": 0.0002484088543485761,
      "loss": 0.4778,
      "num_input_tokens_seen": 1108992,
      "step": 90,
      "train_runtime": 415.6419,
      "train_tokens_per_second": 2668.143
    },
    {
      "epoch": 0.43329532497149376,
      "grad_norm": 0.30744704604148865,
      "learning_rate": 0.00023816778784387094,
      "loss": 0.455,
      "num_input_tokens_seen": 1170048,
      "step": 95,
      "train_runtime": 438.5167,
      "train_tokens_per_second": 2668.195
    },
    {
      "epoch": 0.45610034207525657,
      "grad_norm": 0.18866093456745148,
      "learning_rate": 0.00022725571123650813,
      "loss": 0.4571,
      "num_input_tokens_seen": 1230464,
      "step": 100,
      "train_runtime": 461.123,
      "train_tokens_per_second": 2668.407
    },
    {
      "epoch": 0.45610034207525657,
      "eval_loss": 0.4646710157394409,
      "eval_runtime": 32.6532,
      "eval_samples_per_second": 95.488,
      "eval_steps_per_second": 5.972,
      "num_input_tokens_seen": 1230464,
      "step": 100
    },
    {
      "epoch": 0.4789053591790194,
      "grad_norm": 0.23965908586978912,
      "learning_rate": 0.0002157556720183616,
      "loss": 0.4667,
      "num_input_tokens_seen": 1292288,
      "step": 105,
      "train_runtime": 516.8377,
      "train_tokens_per_second": 2500.375
    },
    {
      "epoch": 0.5017103762827823,
      "grad_norm": 0.25870010256767273,
      "learning_rate": 0.000203755192431795,
      "loss": 0.4758,
      "num_input_tokens_seen": 1353344,
      "step": 110,
      "train_runtime": 539.6435,
      "train_tokens_per_second": 2507.848
    },
    {
      "epoch": 0.5245153933865451,
      "grad_norm": 0.26426610350608826,
      "learning_rate": 0.00019134560337254986,
      "loss": 0.4783,
      "num_input_tokens_seen": 1415040,
      "step": 115,
      "train_runtime": 562.5569,
      "train_tokens_per_second": 2515.372
    },
    {
      "epoch": 0.5473204104903079,
      "grad_norm": 0.22881975769996643,
      "learning_rate": 0.0001786213493064817,
      "loss": 0.4643,
      "num_input_tokens_seen": 1476480,
      "step": 120,
      "train_runtime": 585.45,
      "train_tokens_per_second": 2521.958
    },
    {
      "epoch": 0.5701254275940707,
      "grad_norm": 0.2016284018754959,
      "learning_rate": 0.000165679269490148,
      "loss": 0.4542,
      "num_input_tokens_seen": 1537664,
      "step": 125,
      "train_runtime": 608.3455,
      "train_tokens_per_second": 2527.616
    },
    {
      "epoch": 0.5929304446978335,
      "grad_norm": 0.1549897938966751,
      "learning_rate": 0.00015261786096559254,
      "loss": 0.4539,
      "num_input_tokens_seen": 1598848,
      "step": 130,
      "train_runtime": 631.1849,
      "train_tokens_per_second": 2533.09
    },
    {
      "epoch": 0.6157354618015963,
      "grad_norm": 0.4272724986076355,
      "learning_rate": 0.00013953652893838119,
      "loss": 0.4563,
      "num_input_tokens_seen": 1660800,
      "step": 135,
      "train_runtime": 654.1659,
      "train_tokens_per_second": 2538.806
    },
    {
      "epoch": 0.6385404789053591,
      "grad_norm": 0.2291804850101471,
      "learning_rate": 0.00012653483024396533,
      "loss": 0.4434,
      "num_input_tokens_seen": 1721600,
      "step": 140,
      "train_runtime": 676.8974,
      "train_tokens_per_second": 2543.369
    },
    {
      "epoch": 0.661345496009122,
      "grad_norm": 0.3113957345485687,
      "learning_rate": 0.00011371171566004985,
      "loss": 0.4484,
      "num_input_tokens_seen": 1783168,
      "step": 145,
      "train_runtime": 699.8482,
      "train_tokens_per_second": 2547.935
    },
    {
      "epoch": 0.6841505131128849,
      "grad_norm": 0.25940707325935364,
      "learning_rate": 0.00010116477683142652,
      "loss": 0.4314,
      "num_input_tokens_seen": 1844992,
      "step": 150,
      "train_runtime": 722.8201,
      "train_tokens_per_second": 2552.491
    },
    {
      "epoch": 0.7069555302166477,
      "grad_norm": 0.28187158703804016,
      "learning_rate": 8.898950353862998e-05,
      "loss": 0.4211,
      "num_input_tokens_seen": 1906048,
      "step": 155,
      "train_runtime": 745.621,
      "train_tokens_per_second": 2556.323
    },
    {
      "epoch": 0.7297605473204105,
      "grad_norm": 0.317065954208374,
      "learning_rate": 7.727855696304944e-05,
      "loss": 0.4324,
      "num_input_tokens_seen": 1967744,
      "step": 160,
      "train_runtime": 768.5694,
      "train_tokens_per_second": 2560.268
    },
    {
      "epoch": 0.7525655644241733,
      "grad_norm": 0.2838670313358307,
      "learning_rate": 6.612106447938799e-05,
      "loss": 0.4093,
      "num_input_tokens_seen": 2028032,
      "step": 165,
      "train_runtime": 791.1533,
      "train_tokens_per_second": 2563.387
    },
    {
      "epoch": 0.7753705815279361,
      "grad_norm": 0.29432976245880127,
      "learning_rate": 5.56019413425244e-05,
      "loss": 0.4113,
      "num_input_tokens_seen": 2088448,
      "step": 170,
      "train_runtime": 813.7398,
      "train_tokens_per_second": 2566.481
    },
    {
      "epoch": 0.798175598631699,
      "grad_norm": 0.2523755133152008,
      "learning_rate": 4.5801244431150394e-05,
      "loss": 0.4142,
      "num_input_tokens_seen": 2150144,
      "step": 175,
      "train_runtime": 836.6712,
      "train_tokens_per_second": 2569.879
    },
    {
      "epoch": 0.8209806157354618,
      "grad_norm": 0.25406619906425476,
      "learning_rate": 3.6793562966584196e-05,
      "loss": 0.407,
      "num_input_tokens_seen": 2211584,
      "step": 180,
      "train_runtime": 859.6283,
      "train_tokens_per_second": 2572.721
    },
    {
      "epoch": 0.8437856328392246,
      "grad_norm": 0.35335877537727356,
      "learning_rate": 2.8647450843757897e-05,
      "loss": 0.3836,
      "num_input_tokens_seen": 2272256,
      "step": 185,
      "train_runtime": 882.269,
      "train_tokens_per_second": 2575.469
    },
    {
      "epoch": 0.8665906499429875,
      "grad_norm": 0.3066641688346863,
      "learning_rate": 2.1424904894683165e-05,
      "loss": 0.3904,
      "num_input_tokens_seen": 2333696,
      "step": 190,
      "train_runtime": 905.2082,
      "train_tokens_per_second": 2578.076
    },
    {
      "epoch": 0.8893956670467503,
      "grad_norm": 0.3389071524143219,
      "learning_rate": 1.5180893055124977e-05,
      "loss": 0.4011,
      "num_input_tokens_seen": 2394880,
      "step": 195,
      "train_runtime": 928.0104,
      "train_tokens_per_second": 2580.661
    },
    {
      "epoch": 0.9122006841505131,
      "grad_norm": 0.3012617826461792,
      "learning_rate": 9.962936025419754e-06,
      "loss": 0.3809,
      "num_input_tokens_seen": 2455680,
      "step": 200,
      "train_runtime": 950.7291,
      "train_tokens_per_second": 2582.944
    },
    {
      "epoch": 0.9122006841505131,
      "eval_loss": 0.3808976411819458,
      "eval_runtime": 32.6975,
      "eval_samples_per_second": 95.359,
      "eval_steps_per_second": 5.964,
      "num_input_tokens_seen": 2455680,
      "step": 200
    },
    {
      "epoch": 0.935005701254276,
      "grad_norm": 0.2728487253189087,
      "learning_rate": 5.810745609252165e-06,
      "loss": 0.3799,
      "num_input_tokens_seen": 2517376,
      "step": 205,
      "train_runtime": 1006.4425,
      "train_tokens_per_second": 2501.262
    },
    {
      "epoch": 0.9578107183580388,
      "grad_norm": 0.29773786664009094,
      "learning_rate": 2.7559224828504035e-06,
      "loss": 0.3944,
      "num_input_tokens_seen": 2578816,
      "step": 210,
      "train_runtime": 1029.2761,
      "train_tokens_per_second": 2505.466
    },
    {
      "epoch": 0.9806157354618016,
      "grad_norm": 0.3076622486114502,
      "learning_rate": 8.217156947590064e-07,
      "loss": 0.3721,
      "num_input_tokens_seen": 2640128,
      "step": 215,
      "train_runtime": 1052.1692,
      "train_tokens_per_second": 2509.224
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5157559514045715,
      "learning_rate": 2.284572654130956e-08,
      "loss": 0.3749,
      "num_input_tokens_seen": 2691984,
      "step": 220,
      "train_runtime": 1071.5462,
      "train_tokens_per_second": 2512.243
    },
    {
      "epoch": 1.0,
      "num_input_tokens_seen": 2691984,
      "step": 220,
      "total_flos": 1.0930399586117222e+17,
      "train_loss": 0.8674792235547846,
      "train_runtime": 1074.777,
      "train_samples_per_second": 26.106,
      "train_steps_per_second": 0.205
    }
  ],
  "logging_steps": 5,
  "max_steps": 220,
  "num_input_tokens_seen": 2691984,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0930399586117222e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}