{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 220, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02280501710376283, "grad_norm": 32.61074447631836, "learning_rate": 2.9999999999999997e-05, "loss": 12.6644, "num_input_tokens_seen": 61696, "step": 5, "train_runtime": 24.3804, "train_tokens_per_second": 2530.557 }, { "epoch": 0.04561003420752566, "grad_norm": 18.813081741333008, "learning_rate": 6.75e-05, "loss": 5.0641, "num_input_tokens_seen": 123136, "step": 10, "train_runtime": 47.2224, "train_tokens_per_second": 2607.578 }, { "epoch": 0.06841505131128849, "grad_norm": 6.64304256439209, "learning_rate": 0.00010499999999999999, "loss": 1.5053, "num_input_tokens_seen": 184832, "step": 15, "train_runtime": 70.2288, "train_tokens_per_second": 2631.856 }, { "epoch": 0.09122006841505131, "grad_norm": 1.0877084732055664, "learning_rate": 0.0001425, "loss": 0.6917, "num_input_tokens_seen": 247552, "step": 20, "train_runtime": 93.5022, "train_tokens_per_second": 2647.553 }, { "epoch": 0.11402508551881414, "grad_norm": 2.6032357215881348, "learning_rate": 0.00017999999999999998, "loss": 0.5721, "num_input_tokens_seen": 308992, "step": 25, "train_runtime": 116.3387, "train_tokens_per_second": 2655.97 }, { "epoch": 0.13683010262257697, "grad_norm": 3.105325698852539, "learning_rate": 0.00021749999999999997, "loss": 0.6254, "num_input_tokens_seen": 370176, "step": 30, "train_runtime": 139.2197, "train_tokens_per_second": 2658.935 }, { "epoch": 0.15963511972633979, "grad_norm": 0.9677203297615051, "learning_rate": 0.00025499999999999996, "loss": 0.6389, "num_input_tokens_seen": 430720, "step": 35, "train_runtime": 161.8821, "train_tokens_per_second": 2660.702 }, { "epoch": 0.18244013683010263, "grad_norm": 1.2506942749023438, "learning_rate": 0.00029249999999999995, "loss": 0.5258, "num_input_tokens_seen": 492800, "step": 40, "train_runtime": 185.2921, "train_tokens_per_second": 2659.584 }, { "epoch": 0.20524515393386544, "grad_norm": 6.443251132965088, "learning_rate": 0.0002996346075389736, "loss": 0.5041, "num_input_tokens_seen": 553984, "step": 45, "train_runtime": 208.2146, "train_tokens_per_second": 2660.639 }, { "epoch": 0.22805017103762829, "grad_norm": 0.32593730092048645, "learning_rate": 0.00029815325108927063, "loss": 0.4831, "num_input_tokens_seen": 615040, "step": 50, "train_runtime": 231.0893, "train_tokens_per_second": 2661.482 }, { "epoch": 0.2508551881413911, "grad_norm": 0.46452033519744873, "learning_rate": 0.0002955443589413994, "loss": 0.4957, "num_input_tokens_seen": 676736, "step": 55, "train_runtime": 254.0788, "train_tokens_per_second": 2663.488 }, { "epoch": 0.27366020524515394, "grad_norm": 0.26457586884498596, "learning_rate": 0.00029182778633989753, "loss": 0.4764, "num_input_tokens_seen": 738176, "step": 60, "train_runtime": 277.0778, "train_tokens_per_second": 2664.147 }, { "epoch": 0.29646522234891676, "grad_norm": 0.14807738363742828, "learning_rate": 0.0002870318186463901, "loss": 0.4829, "num_input_tokens_seen": 799488, "step": 65, "train_runtime": 300.021, "train_tokens_per_second": 2664.773 }, { "epoch": 0.31927023945267957, "grad_norm": 0.3857417404651642, "learning_rate": 0.00028119295607090933, "loss": 0.478, "num_input_tokens_seen": 861568, "step": 70, "train_runtime": 323.133, "train_tokens_per_second": 2666.296 }, { "epoch": 0.34207525655644244, "grad_norm": 0.22548428177833557, "learning_rate": 0.0002743556358832562, "loss": 0.4771, "num_input_tokens_seen": 924544, "step": 75, "train_runtime": 346.7417, "train_tokens_per_second": 2666.376 }, { "epoch": 0.36488027366020526, "grad_norm": 0.22088338434696198, "learning_rate": 0.0002665718942185456, "loss": 0.4657, "num_input_tokens_seen": 985472, "step": 80, "train_runtime": 369.5185, "train_tokens_per_second": 2666.908 }, { "epoch": 0.38768529076396807, "grad_norm": 0.3554978668689728, "learning_rate": 0.00025790097005079764, "loss": 0.4831, "num_input_tokens_seen": 1046912, "step": 85, "train_runtime": 392.5346, "train_tokens_per_second": 2667.056 }, { "epoch": 0.4104903078677309, "grad_norm": 0.131392702460289, "learning_rate": 0.0002484088543485761, "loss": 0.4778, "num_input_tokens_seen": 1108992, "step": 90, "train_runtime": 415.6419, "train_tokens_per_second": 2668.143 }, { "epoch": 0.43329532497149376, "grad_norm": 0.30744704604148865, "learning_rate": 0.00023816778784387094, "loss": 0.455, "num_input_tokens_seen": 1170048, "step": 95, "train_runtime": 438.5167, "train_tokens_per_second": 2668.195 }, { "epoch": 0.45610034207525657, "grad_norm": 0.18866093456745148, "learning_rate": 0.00022725571123650813, "loss": 0.4571, "num_input_tokens_seen": 1230464, "step": 100, "train_runtime": 461.123, "train_tokens_per_second": 2668.407 }, { "epoch": 0.45610034207525657, "eval_loss": 0.4646710157394409, "eval_runtime": 32.6532, "eval_samples_per_second": 95.488, "eval_steps_per_second": 5.972, "num_input_tokens_seen": 1230464, "step": 100 }, { "epoch": 0.4789053591790194, "grad_norm": 0.23965908586978912, "learning_rate": 0.0002157556720183616, "loss": 0.4667, "num_input_tokens_seen": 1292288, "step": 105, "train_runtime": 516.8377, "train_tokens_per_second": 2500.375 }, { "epoch": 0.5017103762827823, "grad_norm": 0.25870010256767273, "learning_rate": 0.000203755192431795, "loss": 0.4758, "num_input_tokens_seen": 1353344, "step": 110, "train_runtime": 539.6435, "train_tokens_per_second": 2507.848 }, { "epoch": 0.5245153933865451, "grad_norm": 0.26426610350608826, "learning_rate": 0.00019134560337254986, "loss": 0.4783, "num_input_tokens_seen": 1415040, "step": 115, "train_runtime": 562.5569, "train_tokens_per_second": 2515.372 }, { "epoch": 0.5473204104903079, "grad_norm": 0.22881975769996643, "learning_rate": 0.0001786213493064817, "loss": 0.4643, "num_input_tokens_seen": 1476480, "step": 120, "train_runtime": 585.45, "train_tokens_per_second": 2521.958 }, { "epoch": 0.5701254275940707, "grad_norm": 0.2016284018754959, "learning_rate": 0.000165679269490148, "loss": 0.4542, "num_input_tokens_seen": 1537664, "step": 125, "train_runtime": 608.3455, "train_tokens_per_second": 2527.616 }, { "epoch": 0.5929304446978335, "grad_norm": 0.1549897938966751, "learning_rate": 0.00015261786096559254, "loss": 0.4539, "num_input_tokens_seen": 1598848, "step": 130, "train_runtime": 631.1849, "train_tokens_per_second": 2533.09 }, { "epoch": 0.6157354618015963, "grad_norm": 0.4272724986076355, "learning_rate": 0.00013953652893838119, "loss": 0.4563, "num_input_tokens_seen": 1660800, "step": 135, "train_runtime": 654.1659, "train_tokens_per_second": 2538.806 }, { "epoch": 0.6385404789053591, "grad_norm": 0.2291804850101471, "learning_rate": 0.00012653483024396533, "loss": 0.4434, "num_input_tokens_seen": 1721600, "step": 140, "train_runtime": 676.8974, "train_tokens_per_second": 2543.369 }, { "epoch": 0.661345496009122, "grad_norm": 0.3113957345485687, "learning_rate": 0.00011371171566004985, "loss": 0.4484, "num_input_tokens_seen": 1783168, "step": 145, "train_runtime": 699.8482, "train_tokens_per_second": 2547.935 }, { "epoch": 0.6841505131128849, "grad_norm": 0.25940707325935364, "learning_rate": 0.00010116477683142652, "loss": 0.4314, "num_input_tokens_seen": 1844992, "step": 150, "train_runtime": 722.8201, "train_tokens_per_second": 2552.491 }, { "epoch": 0.7069555302166477, "grad_norm": 0.28187158703804016, "learning_rate": 8.898950353862998e-05, "loss": 0.4211, "num_input_tokens_seen": 1906048, "step": 155, "train_runtime": 745.621, "train_tokens_per_second": 2556.323 }, { "epoch": 0.7297605473204105, "grad_norm": 0.317065954208374, "learning_rate": 7.727855696304944e-05, "loss": 0.4324, "num_input_tokens_seen": 1967744, "step": 160, "train_runtime": 768.5694, "train_tokens_per_second": 2560.268 }, { "epoch": 0.7525655644241733, "grad_norm": 0.2838670313358307, "learning_rate": 6.612106447938799e-05, "loss": 0.4093, "num_input_tokens_seen": 2028032, "step": 165, "train_runtime": 791.1533, "train_tokens_per_second": 2563.387 }, { "epoch": 0.7753705815279361, "grad_norm": 0.29432976245880127, "learning_rate": 5.56019413425244e-05, "loss": 0.4113, "num_input_tokens_seen": 2088448, "step": 170, "train_runtime": 813.7398, "train_tokens_per_second": 2566.481 }, { "epoch": 0.798175598631699, "grad_norm": 0.2523755133152008, "learning_rate": 4.5801244431150394e-05, "loss": 0.4142, "num_input_tokens_seen": 2150144, "step": 175, "train_runtime": 836.6712, "train_tokens_per_second": 2569.879 }, { "epoch": 0.8209806157354618, "grad_norm": 0.25406619906425476, "learning_rate": 3.6793562966584196e-05, "loss": 0.407, "num_input_tokens_seen": 2211584, "step": 180, "train_runtime": 859.6283, "train_tokens_per_second": 2572.721 }, { "epoch": 0.8437856328392246, "grad_norm": 0.35335877537727356, "learning_rate": 2.8647450843757897e-05, "loss": 0.3836, "num_input_tokens_seen": 2272256, "step": 185, "train_runtime": 882.269, "train_tokens_per_second": 2575.469 }, { "epoch": 0.8665906499429875, "grad_norm": 0.3066641688346863, "learning_rate": 2.1424904894683165e-05, "loss": 0.3904, "num_input_tokens_seen": 2333696, "step": 190, "train_runtime": 905.2082, "train_tokens_per_second": 2578.076 }, { "epoch": 0.8893956670467503, "grad_norm": 0.3389071524143219, "learning_rate": 1.5180893055124977e-05, "loss": 0.4011, "num_input_tokens_seen": 2394880, "step": 195, "train_runtime": 928.0104, "train_tokens_per_second": 2580.661 }, { "epoch": 0.9122006841505131, "grad_norm": 0.3012617826461792, "learning_rate": 9.962936025419754e-06, "loss": 0.3809, "num_input_tokens_seen": 2455680, "step": 200, "train_runtime": 950.7291, "train_tokens_per_second": 2582.944 }, { "epoch": 0.9122006841505131, "eval_loss": 0.3808976411819458, "eval_runtime": 32.6975, "eval_samples_per_second": 95.359, "eval_steps_per_second": 5.964, "num_input_tokens_seen": 2455680, "step": 200 }, { "epoch": 0.935005701254276, "grad_norm": 0.2728487253189087, "learning_rate": 5.810745609252165e-06, "loss": 0.3799, "num_input_tokens_seen": 2517376, "step": 205, "train_runtime": 1006.4425, "train_tokens_per_second": 2501.262 }, { "epoch": 0.9578107183580388, "grad_norm": 0.29773786664009094, "learning_rate": 2.7559224828504035e-06, "loss": 0.3944, "num_input_tokens_seen": 2578816, "step": 210, "train_runtime": 1029.2761, "train_tokens_per_second": 2505.466 }, { "epoch": 0.9806157354618016, "grad_norm": 0.3076622486114502, "learning_rate": 8.217156947590064e-07, "loss": 0.3721, "num_input_tokens_seen": 2640128, "step": 215, "train_runtime": 1052.1692, "train_tokens_per_second": 2509.224 }, { "epoch": 1.0, "grad_norm": 0.5157559514045715, "learning_rate": 2.284572654130956e-08, "loss": 0.3749, "num_input_tokens_seen": 2691984, "step": 220, "train_runtime": 1071.5462, "train_tokens_per_second": 2512.243 }, { "epoch": 1.0, "num_input_tokens_seen": 2691984, "step": 220, "total_flos": 1.0930399586117222e+17, "train_loss": 0.8674792235547846, "train_runtime": 1074.777, "train_samples_per_second": 26.106, "train_steps_per_second": 0.205 } ], "logging_steps": 5, "max_steps": 220, "num_input_tokens_seen": 2691984, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0930399586117222e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }