{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 84, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023809523809523808, "grad_norm": 26.742956161499023, "learning_rate": 5.0000000000000004e-08, "loss": 4.1563, "step": 1 }, { "epoch": 0.047619047619047616, "grad_norm": 26.336389541625977, "learning_rate": 1.0000000000000001e-07, "loss": 4.0633, "step": 2 }, { "epoch": 0.07142857142857142, "grad_norm": 28.836008071899414, "learning_rate": 1.5000000000000002e-07, "loss": 4.3575, "step": 3 }, { "epoch": 0.09523809523809523, "grad_norm": 27.459413528442383, "learning_rate": 2.0000000000000002e-07, "loss": 4.1943, "step": 4 }, { "epoch": 0.11904761904761904, "grad_norm": 27.914960861206055, "learning_rate": 2.5000000000000004e-07, "loss": 4.3336, "step": 5 }, { "epoch": 0.14285714285714285, "grad_norm": 27.47397232055664, "learning_rate": 3.0000000000000004e-07, "loss": 4.2354, "step": 6 }, { "epoch": 0.16666666666666666, "grad_norm": 26.212989807128906, "learning_rate": 3.5000000000000004e-07, "loss": 4.1001, "step": 7 }, { "epoch": 0.19047619047619047, "grad_norm": 26.80431365966797, "learning_rate": 4.0000000000000003e-07, "loss": 4.1827, "step": 8 }, { "epoch": 0.21428571428571427, "grad_norm": 27.641605377197266, "learning_rate": 4.5000000000000003e-07, "loss": 4.2596, "step": 9 }, { "epoch": 0.23809523809523808, "grad_norm": 27.783071517944336, "learning_rate": 5.000000000000001e-07, "loss": 4.2694, "step": 10 }, { "epoch": 0.2619047619047619, "grad_norm": 26.553335189819336, "learning_rate": 5.5e-07, "loss": 4.0739, "step": 11 }, { "epoch": 0.2857142857142857, "grad_norm": 26.77140998840332, "learning_rate": 6.000000000000001e-07, "loss": 4.1049, "step": 12 }, { "epoch": 0.30952380952380953, "grad_norm": 25.07087516784668, "learning_rate": 6.5e-07, "loss": 3.8982, "step": 13 }, { "epoch": 0.3333333333333333, "grad_norm": 25.834062576293945, "learning_rate": 7.000000000000001e-07, "loss": 4.0768, "step": 14 }, { "epoch": 0.35714285714285715, "grad_norm": 25.00474739074707, "learning_rate": 7.5e-07, "loss": 4.0104, "step": 15 }, { "epoch": 0.38095238095238093, "grad_norm": 23.692975997924805, "learning_rate": 8.000000000000001e-07, "loss": 3.8469, "step": 16 }, { "epoch": 0.40476190476190477, "grad_norm": 22.197919845581055, "learning_rate": 8.500000000000001e-07, "loss": 3.7738, "step": 17 }, { "epoch": 0.42857142857142855, "grad_norm": 20.92680549621582, "learning_rate": 9.000000000000001e-07, "loss": 3.6514, "step": 18 }, { "epoch": 0.4523809523809524, "grad_norm": 20.251178741455078, "learning_rate": 9.500000000000001e-07, "loss": 3.6739, "step": 19 }, { "epoch": 0.47619047619047616, "grad_norm": 17.55536460876465, "learning_rate": 1.0000000000000002e-06, "loss": 3.2675, "step": 20 }, { "epoch": 0.5, "grad_norm": 17.066797256469727, "learning_rate": 1.0500000000000001e-06, "loss": 3.4232, "step": 21 }, { "epoch": 0.5238095238095238, "grad_norm": 16.1475887298584, "learning_rate": 1.1e-06, "loss": 3.1988, "step": 22 }, { "epoch": 0.5476190476190477, "grad_norm": 15.61026382446289, "learning_rate": 1.1500000000000002e-06, "loss": 3.1338, "step": 23 }, { "epoch": 0.5714285714285714, "grad_norm": 15.409480094909668, "learning_rate": 1.2000000000000002e-06, "loss": 2.9836, "step": 24 }, { "epoch": 0.5952380952380952, "grad_norm": 15.391901969909668, "learning_rate": 1.25e-06, "loss": 2.9064, "step": 25 }, { "epoch": 0.6190476190476191, "grad_norm": 16.92401885986328, "learning_rate": 1.3e-06, "loss": 2.906, "step": 26 }, { "epoch": 0.6428571428571429, "grad_norm": 17.880958557128906, "learning_rate": 1.3500000000000002e-06, "loss": 2.702, "step": 27 }, { "epoch": 0.6666666666666666, "grad_norm": 18.114517211914062, "learning_rate": 1.4000000000000001e-06, "loss": 2.4876, "step": 28 }, { "epoch": 0.6904761904761905, "grad_norm": 17.608840942382812, "learning_rate": 1.45e-06, "loss": 2.2996, "step": 29 }, { "epoch": 0.7142857142857143, "grad_norm": 17.055673599243164, "learning_rate": 1.5e-06, "loss": 2.2709, "step": 30 }, { "epoch": 0.7380952380952381, "grad_norm": 14.92151927947998, "learning_rate": 1.5500000000000002e-06, "loss": 2.0406, "step": 31 }, { "epoch": 0.7619047619047619, "grad_norm": 13.657073020935059, "learning_rate": 1.6000000000000001e-06, "loss": 1.8564, "step": 32 }, { "epoch": 0.7857142857142857, "grad_norm": 13.274576187133789, "learning_rate": 1.6500000000000003e-06, "loss": 1.7382, "step": 33 }, { "epoch": 0.8095238095238095, "grad_norm": 13.728348731994629, "learning_rate": 1.7000000000000002e-06, "loss": 1.6629, "step": 34 }, { "epoch": 0.8333333333333334, "grad_norm": 13.521151542663574, "learning_rate": 1.75e-06, "loss": 1.5301, "step": 35 }, { "epoch": 0.8571428571428571, "grad_norm": 13.709525108337402, "learning_rate": 1.8000000000000001e-06, "loss": 1.4122, "step": 36 }, { "epoch": 0.8809523809523809, "grad_norm": 12.587928771972656, "learning_rate": 1.85e-06, "loss": 1.2014, "step": 37 }, { "epoch": 0.9047619047619048, "grad_norm": 13.494888305664062, "learning_rate": 1.9000000000000002e-06, "loss": 1.1793, "step": 38 }, { "epoch": 0.9285714285714286, "grad_norm": 13.417922019958496, "learning_rate": 1.9500000000000004e-06, "loss": 1.0282, "step": 39 }, { "epoch": 0.9523809523809523, "grad_norm": 12.754359245300293, "learning_rate": 2.0000000000000003e-06, "loss": 0.8525, "step": 40 }, { "epoch": 0.9761904761904762, "grad_norm": 12.766407012939453, "learning_rate": 2.05e-06, "loss": 0.7043, "step": 41 }, { "epoch": 1.0, "grad_norm": 11.440421104431152, "learning_rate": 2.1000000000000002e-06, "loss": 0.6245, "step": 42 }, { "epoch": 1.0238095238095237, "grad_norm": 9.672205924987793, "learning_rate": 2.15e-06, "loss": 0.4768, "step": 43 }, { "epoch": 1.0476190476190477, "grad_norm": 7.8501057624816895, "learning_rate": 2.2e-06, "loss": 0.3679, "step": 44 }, { "epoch": 1.0714285714285714, "grad_norm": 6.751816749572754, "learning_rate": 2.25e-06, "loss": 0.2708, "step": 45 }, { "epoch": 1.0952380952380953, "grad_norm": 5.267884731292725, "learning_rate": 2.3000000000000004e-06, "loss": 0.2143, "step": 46 }, { "epoch": 1.119047619047619, "grad_norm": 3.251101016998291, "learning_rate": 2.35e-06, "loss": 0.1759, "step": 47 }, { "epoch": 1.1428571428571428, "grad_norm": 2.585360050201416, "learning_rate": 2.4000000000000003e-06, "loss": 0.1855, "step": 48 }, { "epoch": 1.1666666666666667, "grad_norm": 2.0107483863830566, "learning_rate": 2.4500000000000003e-06, "loss": 0.1251, "step": 49 }, { "epoch": 1.1904761904761905, "grad_norm": 1.941689133644104, "learning_rate": 2.5e-06, "loss": 0.1441, "step": 50 }, { "epoch": 1.2142857142857142, "grad_norm": 1.4036344289779663, "learning_rate": 2.55e-06, "loss": 0.1097, "step": 51 }, { "epoch": 1.2380952380952381, "grad_norm": 1.2856179475784302, "learning_rate": 2.6e-06, "loss": 0.1303, "step": 52 }, { "epoch": 1.2619047619047619, "grad_norm": 1.3184994459152222, "learning_rate": 2.6500000000000005e-06, "loss": 0.1021, "step": 53 }, { "epoch": 1.2857142857142856, "grad_norm": 1.144294261932373, "learning_rate": 2.7000000000000004e-06, "loss": 0.115, "step": 54 }, { "epoch": 1.3095238095238095, "grad_norm": 0.9276831150054932, "learning_rate": 2.7500000000000004e-06, "loss": 0.1028, "step": 55 }, { "epoch": 1.3333333333333333, "grad_norm": 0.9152742028236389, "learning_rate": 2.8000000000000003e-06, "loss": 0.0979, "step": 56 }, { "epoch": 1.3571428571428572, "grad_norm": 0.8525308966636658, "learning_rate": 2.85e-06, "loss": 0.0908, "step": 57 }, { "epoch": 1.380952380952381, "grad_norm": 0.9806348085403442, "learning_rate": 2.9e-06, "loss": 0.0817, "step": 58 }, { "epoch": 1.4047619047619047, "grad_norm": 0.606792151927948, "learning_rate": 2.95e-06, "loss": 0.0904, "step": 59 }, { "epoch": 1.4285714285714286, "grad_norm": 0.7274054884910583, "learning_rate": 3e-06, "loss": 0.0811, "step": 60 }, { "epoch": 1.4523809523809523, "grad_norm": 1.0523946285247803, "learning_rate": 3.05e-06, "loss": 0.0881, "step": 61 }, { "epoch": 1.4761904761904763, "grad_norm": 0.5840473175048828, "learning_rate": 3.1000000000000004e-06, "loss": 0.0848, "step": 62 }, { "epoch": 1.5, "grad_norm": 0.7410831451416016, "learning_rate": 3.1500000000000003e-06, "loss": 0.0788, "step": 63 }, { "epoch": 1.5238095238095237, "grad_norm": 0.828996479511261, "learning_rate": 3.2000000000000003e-06, "loss": 0.0866, "step": 64 }, { "epoch": 1.5476190476190477, "grad_norm": 0.7505109310150146, "learning_rate": 3.2500000000000002e-06, "loss": 0.076, "step": 65 }, { "epoch": 1.5714285714285714, "grad_norm": 0.9672189354896545, "learning_rate": 3.3000000000000006e-06, "loss": 0.0765, "step": 66 }, { "epoch": 1.5952380952380953, "grad_norm": 0.5925746560096741, "learning_rate": 3.3500000000000005e-06, "loss": 0.0706, "step": 67 }, { "epoch": 1.619047619047619, "grad_norm": 0.6671133637428284, "learning_rate": 3.4000000000000005e-06, "loss": 0.0713, "step": 68 }, { "epoch": 1.6428571428571428, "grad_norm": 0.5542609095573425, "learning_rate": 3.45e-06, "loss": 0.0715, "step": 69 }, { "epoch": 1.6666666666666665, "grad_norm": 0.541200578212738, "learning_rate": 3.5e-06, "loss": 0.0701, "step": 70 }, { "epoch": 1.6904761904761905, "grad_norm": 0.4222320020198822, "learning_rate": 3.5500000000000003e-06, "loss": 0.0669, "step": 71 }, { "epoch": 1.7142857142857144, "grad_norm": 0.717410147190094, "learning_rate": 3.6000000000000003e-06, "loss": 0.0782, "step": 72 }, { "epoch": 1.7380952380952381, "grad_norm": 0.6776471734046936, "learning_rate": 3.65e-06, "loss": 0.0695, "step": 73 }, { "epoch": 1.7619047619047619, "grad_norm": 0.5480474829673767, "learning_rate": 3.7e-06, "loss": 0.0662, "step": 74 }, { "epoch": 1.7857142857142856, "grad_norm": 0.4779343605041504, "learning_rate": 3.7500000000000005e-06, "loss": 0.0743, "step": 75 }, { "epoch": 1.8095238095238095, "grad_norm": 0.43138471245765686, "learning_rate": 3.8000000000000005e-06, "loss": 0.0666, "step": 76 }, { "epoch": 1.8333333333333335, "grad_norm": 0.6058762669563293, "learning_rate": 3.85e-06, "loss": 0.0696, "step": 77 }, { "epoch": 1.8571428571428572, "grad_norm": 1.3352755308151245, "learning_rate": 3.900000000000001e-06, "loss": 0.0891, "step": 78 }, { "epoch": 1.880952380952381, "grad_norm": 0.5319089293479919, "learning_rate": 3.95e-06, "loss": 0.0617, "step": 79 }, { "epoch": 1.9047619047619047, "grad_norm": 0.5629184246063232, "learning_rate": 4.000000000000001e-06, "loss": 0.0622, "step": 80 }, { "epoch": 1.9285714285714286, "grad_norm": 0.37953704595565796, "learning_rate": 4.05e-06, "loss": 0.0676, "step": 81 }, { "epoch": 1.9523809523809523, "grad_norm": 0.37576770782470703, "learning_rate": 4.1e-06, "loss": 0.0719, "step": 82 }, { "epoch": 1.9761904761904763, "grad_norm": 0.4720636010169983, "learning_rate": 4.15e-06, "loss": 0.0662, "step": 83 }, { "epoch": 2.0, "grad_norm": 0.4793304204940796, "learning_rate": 4.2000000000000004e-06, "loss": 0.0664, "step": 84 } ], "logging_steps": 1, "max_steps": 252, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 42, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8817201233670963e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }