{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 63, "global_step": 63, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047619047619047616, "grad_norm": 1.0377577543258667, "learning_rate": 0.0, "loss": 2.2598, "step": 1 }, { "epoch": 0.09523809523809523, "grad_norm": 0.950491726398468, "learning_rate": 5e-05, "loss": 2.1162, "step": 2 }, { "epoch": 0.14285714285714285, "grad_norm": 0.8062677383422852, "learning_rate": 0.0001, "loss": 1.8789, "step": 3 }, { "epoch": 0.19047619047619047, "grad_norm": 0.2896675169467926, "learning_rate": 0.00015000000000000001, "loss": 1.415, "step": 4 }, { "epoch": 0.23809523809523808, "grad_norm": 0.18858225643634796, "learning_rate": 0.0002, "loss": 1.2537, "step": 5 }, { "epoch": 0.2857142857142857, "grad_norm": 0.3589648902416229, "learning_rate": 0.0001998724426109086, "loss": 1.157, "step": 6 }, { "epoch": 0.3333333333333333, "grad_norm": 0.2034045308828354, "learning_rate": 0.00019949013201891235, "loss": 1.256, "step": 7 }, { "epoch": 0.38095238095238093, "grad_norm": 0.20806469023227692, "learning_rate": 0.00019885415192492104, "loss": 0.9347, "step": 8 }, { "epoch": 0.42857142857142855, "grad_norm": 0.1329486072063446, "learning_rate": 0.000197966305083608, "loss": 1.0225, "step": 9 }, { "epoch": 0.47619047619047616, "grad_norm": 0.10501191765069962, "learning_rate": 0.00019682910819330646, "loss": 1.0608, "step": 10 }, { "epoch": 0.5238095238095238, "grad_norm": 0.08167339116334915, "learning_rate": 0.0001954457847621543, "loss": 1.0442, "step": 11 }, { "epoch": 0.5714285714285714, "grad_norm": 0.07467832416296005, "learning_rate": 0.00019382025597071049, "loss": 0.9116, "step": 12 }, { "epoch": 0.6190476190476191, "grad_norm": 0.10425141453742981, "learning_rate": 0.0001919571295569422, "loss": 1.0774, "step": 13 }, { "epoch": 0.6666666666666666, "grad_norm": 0.08753609657287598, "learning_rate": 0.00018986168675509145, "loss": 0.9084, "step": 14 }, { "epoch": 0.7142857142857143, "grad_norm": 0.09329564869403839, "learning_rate": 0.00018753986732544255, "loss": 1.0326, "step": 15 }, { "epoch": 0.7619047619047619, "grad_norm": 0.06999287754297256, "learning_rate": 0.00018499825271742635, "loss": 1.066, "step": 16 }, { "epoch": 0.8095238095238095, "grad_norm": 0.07671773433685303, "learning_rate": 0.00018224404741378674, "loss": 0.9196, "step": 17 }, { "epoch": 0.8571428571428571, "grad_norm": 0.07140030711889267, "learning_rate": 0.00017928505850869157, "loss": 1.0462, "step": 18 }, { "epoch": 0.9047619047619048, "grad_norm": 0.10820895433425903, "learning_rate": 0.00017612967357767605, "loss": 0.7579, "step": 19 }, { "epoch": 0.9523809523809523, "grad_norm": 0.11008598655462265, "learning_rate": 0.00017278683690214865, "loss": 0.5615, "step": 20 }, { "epoch": 1.0, "grad_norm": 0.07539282739162445, "learning_rate": 0.0001692660241158535, "loss": 0.8203, "step": 21 }, { "epoch": 1.0476190476190477, "grad_norm": 0.10306309908628464, "learning_rate": 0.0001655772153451573, "loss": 0.885, "step": 22 }, { "epoch": 1.0952380952380953, "grad_norm": 0.09677709639072418, "learning_rate": 0.00016173086691929664, "loss": 0.8434, "step": 23 }, { "epoch": 1.1428571428571428, "grad_norm": 0.07850930839776993, "learning_rate": 0.00015773788173077682, "loss": 0.9404, "step": 24 }, { "epoch": 1.1904761904761905, "grad_norm": 0.0830693244934082, "learning_rate": 0.00015360957832993852, "loss": 0.9149, "step": 25 }, { "epoch": 1.2380952380952381, "grad_norm": 0.0840546041727066, "learning_rate": 0.000149357658841297, "loss": 0.827, "step": 26 }, { "epoch": 1.2857142857142856, "grad_norm": 0.07045507431030273, "learning_rate": 0.0001449941757925989, "loss": 0.7903, "step": 27 }, { "epoch": 1.3333333333333333, "grad_norm": 0.07093216478824615, "learning_rate": 0.00014053149795062274, "loss": 0.962, "step": 28 }, { "epoch": 1.380952380952381, "grad_norm": 0.07280776649713516, "learning_rate": 0.0001359822752605659, "loss": 0.6519, "step": 29 }, { "epoch": 1.4285714285714286, "grad_norm": 0.0633326843380928, "learning_rate": 0.00013135940298840035, "loss": 0.7894, "step": 30 }, { "epoch": 1.4761904761904763, "grad_norm": 0.06777395308017731, "learning_rate": 0.0001266759851678403, "loss": 0.8445, "step": 31 }, { "epoch": 1.5238095238095237, "grad_norm": 0.06890520453453064, "learning_rate": 0.00012194529745553497, "loss": 0.8394, "step": 32 }, { "epoch": 1.5714285714285714, "grad_norm": 0.07028112560510635, "learning_rate": 0.00011718074949977748, "loss": 0.7128, "step": 33 }, { "epoch": 1.619047619047619, "grad_norm": 0.0746479257941246, "learning_rate": 0.00011239584692939975, "loss": 0.8896, "step": 34 }, { "epoch": 1.6666666666666665, "grad_norm": 0.08016974478960037, "learning_rate": 0.00010760415307060027, "loss": 0.7318, "step": 35 }, { "epoch": 1.7142857142857144, "grad_norm": 0.06959321349859238, "learning_rate": 0.00010281925050022251, "loss": 0.8489, "step": 36 }, { "epoch": 1.7619047619047619, "grad_norm": 0.06887049973011017, "learning_rate": 9.805470254446503e-05, "loss": 0.9084, "step": 37 }, { "epoch": 1.8095238095238095, "grad_norm": 0.0744592696428299, "learning_rate": 9.332401483215973e-05, "loss": 0.7318, "step": 38 }, { "epoch": 1.8571428571428572, "grad_norm": 0.0794808566570282, "learning_rate": 8.864059701159965e-05, "loss": 0.9065, "step": 39 }, { "epoch": 1.9047619047619047, "grad_norm": 0.07422224432229996, "learning_rate": 8.401772473943415e-05, "loss": 0.5881, "step": 40 }, { "epoch": 1.9523809523809523, "grad_norm": 0.07543457299470901, "learning_rate": 7.946850204937728e-05, "loss": 0.4109, "step": 41 }, { "epoch": 2.0, "grad_norm": 0.07449755817651749, "learning_rate": 7.500582420740115e-05, "loss": 0.6763, "step": 42 }, { "epoch": 2.0476190476190474, "grad_norm": 0.0908975750207901, "learning_rate": 7.064234115870303e-05, "loss": 0.7145, "step": 43 }, { "epoch": 2.0952380952380953, "grad_norm": 0.09462336450815201, "learning_rate": 6.63904216700615e-05, "loss": 0.6831, "step": 44 }, { "epoch": 2.142857142857143, "grad_norm": 0.07695115357637405, "learning_rate": 6.226211826922319e-05, "loss": 0.8054, "step": 45 }, { "epoch": 2.1904761904761907, "grad_norm": 0.07964155077934265, "learning_rate": 5.8269133080703386e-05, "loss": 0.8001, "step": 46 }, { "epoch": 2.238095238095238, "grad_norm": 0.08799073100090027, "learning_rate": 5.442278465484274e-05, "loss": 0.6975, "step": 47 }, { "epoch": 2.2857142857142856, "grad_norm": 0.07706860452890396, "learning_rate": 5.0733975884146533e-05, "loss": 0.6651, "step": 48 }, { "epoch": 2.3333333333333335, "grad_norm": 0.07903925329446793, "learning_rate": 4.721316309785139e-05, "loss": 0.8473, "step": 49 }, { "epoch": 2.380952380952381, "grad_norm": 0.094353087246418, "learning_rate": 4.387032642232397e-05, "loss": 0.5177, "step": 50 }, { "epoch": 2.4285714285714284, "grad_norm": 0.09457288682460785, "learning_rate": 4.071494149130843e-05, "loss": 0.6799, "step": 51 }, { "epoch": 2.4761904761904763, "grad_norm": 0.08974256366491318, "learning_rate": 3.775595258621329e-05, "loss": 0.7302, "step": 52 }, { "epoch": 2.5238095238095237, "grad_norm": 0.08180077373981476, "learning_rate": 3.500174728257365e-05, "loss": 0.7274, "step": 53 }, { "epoch": 2.571428571428571, "grad_norm": 0.07987571507692337, "learning_rate": 3.246013267455745e-05, "loss": 0.6073, "step": 54 }, { "epoch": 2.619047619047619, "grad_norm": 0.08051332831382751, "learning_rate": 3.0138313244908573e-05, "loss": 0.7881, "step": 55 }, { "epoch": 2.6666666666666665, "grad_norm": 0.08457138389348984, "learning_rate": 2.8042870443057812e-05, "loss": 0.636, "step": 56 }, { "epoch": 2.7142857142857144, "grad_norm": 0.0923929363489151, "learning_rate": 2.6179744029289542e-05, "loss": 0.7524, "step": 57 }, { "epoch": 2.761904761904762, "grad_norm": 0.08431456238031387, "learning_rate": 2.4554215237845687e-05, "loss": 0.8206, "step": 58 }, { "epoch": 2.8095238095238093, "grad_norm": 0.09262314438819885, "learning_rate": 2.3170891806693566e-05, "loss": 0.6298, "step": 59 }, { "epoch": 2.857142857142857, "grad_norm": 0.08955827355384827, "learning_rate": 2.2033694916392002e-05, "loss": 0.8282, "step": 60 }, { "epoch": 2.9047619047619047, "grad_norm": 0.0884852334856987, "learning_rate": 2.1145848075078993e-05, "loss": 0.5063, "step": 61 }, { "epoch": 2.9523809523809526, "grad_norm": 0.07892720401287079, "learning_rate": 2.0509867981087663e-05, "loss": 0.3413, "step": 62 }, { "epoch": 3.0, "grad_norm": 0.0838647335767746, "learning_rate": 2.0127557389091428e-05, "loss": 0.5994, "step": 63 }, { "epoch": 3.0, "eval_loss": 0.6357218623161316, "eval_runtime": 3.9804, "eval_samples_per_second": 5.778, "eval_steps_per_second": 0.754, "step": 63 } ], "logging_steps": 1.0, "max_steps": 63, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2942816179965133e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }