{ "best_global_step": 60, "best_metric": 0.4450100064277649, "best_model_checkpoint": "outputs/checkpoint-60", "epoch": 0.08571428571428572, "eval_steps": 5, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014285714285714286, "grad_norm": 4.179355621337891, "learning_rate": 0.0, "loss": 1.3819, "step": 1 }, { "epoch": 0.002857142857142857, "grad_norm": 4.235444068908691, "learning_rate": 4e-05, "loss": 1.5352, "step": 2 }, { "epoch": 0.004285714285714286, "grad_norm": 3.5132126808166504, "learning_rate": 8e-05, "loss": 0.996, "step": 3 }, { "epoch": 0.005714285714285714, "grad_norm": 2.1232669353485107, "learning_rate": 0.00012, "loss": 0.3471, "step": 4 }, { "epoch": 0.007142857142857143, "grad_norm": 3.057875633239746, "learning_rate": 0.00016, "loss": 0.5202, "step": 5 }, { "epoch": 0.007142857142857143, "eval_loss": 0.729008674621582, "eval_runtime": 61.1607, "eval_samples_per_second": 4.578, "eval_steps_per_second": 1.145, "step": 5 }, { "epoch": 0.008571428571428572, "grad_norm": 5.017824649810791, "learning_rate": 0.0002, "loss": 0.6374, "step": 6 }, { "epoch": 0.01, "grad_norm": 3.8987300395965576, "learning_rate": 0.00019555555555555556, "loss": 0.7946, "step": 7 }, { "epoch": 0.011428571428571429, "grad_norm": 2.3041818141937256, "learning_rate": 0.00019111111111111114, "loss": 0.475, "step": 8 }, { "epoch": 0.012857142857142857, "grad_norm": 1.1344828605651855, "learning_rate": 0.0001866666666666667, "loss": 0.499, "step": 9 }, { "epoch": 0.014285714285714285, "grad_norm": 1.3779399394989014, "learning_rate": 0.00018222222222222224, "loss": 0.5737, "step": 10 }, { "epoch": 0.014285714285714285, "eval_loss": 0.4982975423336029, "eval_runtime": 58.9374, "eval_samples_per_second": 4.751, "eval_steps_per_second": 1.188, "step": 10 }, { "epoch": 0.015714285714285715, "grad_norm": 1.039328932762146, "learning_rate": 0.00017777777777777779, "loss": 0.7145, "step": 11 }, { "epoch": 0.017142857142857144, "grad_norm": 1.0184112787246704, "learning_rate": 0.00017333333333333334, "loss": 0.5086, "step": 12 }, { "epoch": 0.018571428571428572, "grad_norm": 3.169090986251831, "learning_rate": 0.00016888888888888889, "loss": 0.3931, "step": 13 }, { "epoch": 0.02, "grad_norm": 0.8889961242675781, "learning_rate": 0.00016444444444444444, "loss": 0.4175, "step": 14 }, { "epoch": 0.02142857142857143, "grad_norm": 1.040206789970398, "learning_rate": 0.00016, "loss": 0.7937, "step": 15 }, { "epoch": 0.02142857142857143, "eval_loss": 0.47482773661613464, "eval_runtime": 58.0691, "eval_samples_per_second": 4.822, "eval_steps_per_second": 1.205, "step": 15 }, { "epoch": 0.022857142857142857, "grad_norm": 1.05618417263031, "learning_rate": 0.00015555555555555556, "loss": 0.4657, "step": 16 }, { "epoch": 0.024285714285714285, "grad_norm": 1.636629343032837, "learning_rate": 0.0001511111111111111, "loss": 0.496, "step": 17 }, { "epoch": 0.025714285714285714, "grad_norm": 0.8520965576171875, "learning_rate": 0.00014666666666666666, "loss": 0.2923, "step": 18 }, { "epoch": 0.027142857142857142, "grad_norm": 1.2350469827651978, "learning_rate": 0.00014222222222222224, "loss": 0.6657, "step": 19 }, { "epoch": 0.02857142857142857, "grad_norm": 0.8397138118743896, "learning_rate": 0.0001377777777777778, "loss": 0.2923, "step": 20 }, { "epoch": 0.02857142857142857, "eval_loss": 0.4646180272102356, "eval_runtime": 58.7341, "eval_samples_per_second": 4.767, "eval_steps_per_second": 1.192, "step": 20 }, { "epoch": 0.03, "grad_norm": 1.4164972305297852, "learning_rate": 0.00013333333333333334, "loss": 0.5831, "step": 21 }, { "epoch": 0.03142857142857143, "grad_norm": 1.0668251514434814, "learning_rate": 0.00012888888888888892, "loss": 0.5977, "step": 22 }, { "epoch": 0.032857142857142856, "grad_norm": 1.0122352838516235, "learning_rate": 0.00012444444444444444, "loss": 0.3958, "step": 23 }, { "epoch": 0.03428571428571429, "grad_norm": 1.1400679349899292, "learning_rate": 0.00012, "loss": 0.3899, "step": 24 }, { "epoch": 0.03571428571428571, "grad_norm": 1.161012887954712, "learning_rate": 0.00011555555555555555, "loss": 0.6196, "step": 25 }, { "epoch": 0.03571428571428571, "eval_loss": 0.4567541480064392, "eval_runtime": 58.108, "eval_samples_per_second": 4.819, "eval_steps_per_second": 1.205, "step": 25 }, { "epoch": 0.037142857142857144, "grad_norm": 1.1181843280792236, "learning_rate": 0.00011111111111111112, "loss": 0.3504, "step": 26 }, { "epoch": 0.03857142857142857, "grad_norm": 1.0887891054153442, "learning_rate": 0.00010666666666666667, "loss": 0.474, "step": 27 }, { "epoch": 0.04, "grad_norm": 0.8779735565185547, "learning_rate": 0.00010222222222222222, "loss": 0.2359, "step": 28 }, { "epoch": 0.041428571428571426, "grad_norm": 1.2299634218215942, "learning_rate": 9.777777777777778e-05, "loss": 0.5917, "step": 29 }, { "epoch": 0.04285714285714286, "grad_norm": 0.6770172715187073, "learning_rate": 9.333333333333334e-05, "loss": 0.2978, "step": 30 }, { "epoch": 0.04285714285714286, "eval_loss": 0.4585675895214081, "eval_runtime": 58.9389, "eval_samples_per_second": 4.751, "eval_steps_per_second": 1.188, "step": 30 }, { "epoch": 0.04428571428571428, "grad_norm": 1.2675914764404297, "learning_rate": 8.888888888888889e-05, "loss": 0.464, "step": 31 }, { "epoch": 0.045714285714285714, "grad_norm": 0.9487901926040649, "learning_rate": 8.444444444444444e-05, "loss": 0.6331, "step": 32 }, { "epoch": 0.047142857142857146, "grad_norm": 1.002474069595337, "learning_rate": 8e-05, "loss": 0.5226, "step": 33 }, { "epoch": 0.04857142857142857, "grad_norm": 2.1608269214630127, "learning_rate": 7.555555555555556e-05, "loss": 0.9663, "step": 34 }, { "epoch": 0.05, "grad_norm": 0.7519217133522034, "learning_rate": 7.111111111111112e-05, "loss": 0.2157, "step": 35 }, { "epoch": 0.05, "eval_loss": 0.4550328850746155, "eval_runtime": 57.9729, "eval_samples_per_second": 4.83, "eval_steps_per_second": 1.207, "step": 35 }, { "epoch": 0.05142857142857143, "grad_norm": 0.7981988787651062, "learning_rate": 6.666666666666667e-05, "loss": 0.3595, "step": 36 }, { "epoch": 0.05285714285714286, "grad_norm": 1.1520148515701294, "learning_rate": 6.222222222222222e-05, "loss": 0.4654, "step": 37 }, { "epoch": 0.054285714285714284, "grad_norm": 0.7494262456893921, "learning_rate": 5.7777777777777776e-05, "loss": 0.2049, "step": 38 }, { "epoch": 0.055714285714285716, "grad_norm": 0.8117587566375732, "learning_rate": 5.333333333333333e-05, "loss": 0.4791, "step": 39 }, { "epoch": 0.05714285714285714, "grad_norm": 0.9275745749473572, "learning_rate": 4.888888888888889e-05, "loss": 0.4328, "step": 40 }, { "epoch": 0.05714285714285714, "eval_loss": 0.44896164536476135, "eval_runtime": 59.566, "eval_samples_per_second": 4.701, "eval_steps_per_second": 1.175, "step": 40 }, { "epoch": 0.05857142857142857, "grad_norm": 0.9272159337997437, "learning_rate": 4.4444444444444447e-05, "loss": 0.3348, "step": 41 }, { "epoch": 0.06, "grad_norm": 1.161618947982788, "learning_rate": 4e-05, "loss": 0.7938, "step": 42 }, { "epoch": 0.06142857142857143, "grad_norm": 0.6889943480491638, "learning_rate": 3.555555555555556e-05, "loss": 0.3177, "step": 43 }, { "epoch": 0.06285714285714286, "grad_norm": 1.577309250831604, "learning_rate": 3.111111111111111e-05, "loss": 0.5701, "step": 44 }, { "epoch": 0.06428571428571428, "grad_norm": 1.2045623064041138, "learning_rate": 2.6666666666666667e-05, "loss": 0.912, "step": 45 }, { "epoch": 0.06428571428571428, "eval_loss": 0.4481067657470703, "eval_runtime": 58.0782, "eval_samples_per_second": 4.821, "eval_steps_per_second": 1.205, "step": 45 }, { "epoch": 0.06571428571428571, "grad_norm": 1.6550114154815674, "learning_rate": 2.2222222222222223e-05, "loss": 0.8362, "step": 46 }, { "epoch": 0.06714285714285714, "grad_norm": 0.6368440985679626, "learning_rate": 1.777777777777778e-05, "loss": 0.291, "step": 47 }, { "epoch": 0.06857142857142857, "grad_norm": 0.8126080632209778, "learning_rate": 1.3333333333333333e-05, "loss": 0.21, "step": 48 }, { "epoch": 0.07, "grad_norm": 1.02597975730896, "learning_rate": 8.88888888888889e-06, "loss": 0.4724, "step": 49 }, { "epoch": 0.07142857142857142, "grad_norm": 0.9014645218849182, "learning_rate": 4.444444444444445e-06, "loss": 0.351, "step": 50 }, { "epoch": 0.07142857142857142, "eval_loss": 0.4469253420829773, "eval_runtime": 59.9788, "eval_samples_per_second": 4.668, "eval_steps_per_second": 1.167, "step": 50 }, { "epoch": 0.07285714285714286, "grad_norm": 0.8374767303466797, "learning_rate": 0.0, "loss": 0.4678, "step": 51 }, { "epoch": 0.07428571428571429, "grad_norm": 1.0087125301361084, "learning_rate": 3.272727272727273e-05, "loss": 0.8477, "step": 52 }, { "epoch": 0.07571428571428572, "grad_norm": 0.7490191459655762, "learning_rate": 2.909090909090909e-05, "loss": 0.2679, "step": 53 }, { "epoch": 0.07714285714285714, "grad_norm": 0.8489861488342285, "learning_rate": 2.5454545454545454e-05, "loss": 0.3833, "step": 54 }, { "epoch": 0.07857142857142857, "grad_norm": 0.897487223148346, "learning_rate": 2.1818181818181818e-05, "loss": 0.5637, "step": 55 }, { "epoch": 0.07857142857142857, "eval_loss": 0.44575706124305725, "eval_runtime": 57.9488, "eval_samples_per_second": 4.832, "eval_steps_per_second": 1.208, "step": 55 }, { "epoch": 0.08, "grad_norm": 0.9447337985038757, "learning_rate": 1.8181818181818182e-05, "loss": 0.5945, "step": 56 }, { "epoch": 0.08142857142857143, "grad_norm": 0.8487027287483215, "learning_rate": 1.4545454545454545e-05, "loss": 0.4971, "step": 57 }, { "epoch": 0.08285714285714285, "grad_norm": 1.3720009326934814, "learning_rate": 1.0909090909090909e-05, "loss": 0.905, "step": 58 }, { "epoch": 0.08428571428571428, "grad_norm": 0.8870661854743958, "learning_rate": 7.272727272727272e-06, "loss": 0.4392, "step": 59 }, { "epoch": 0.08571428571428572, "grad_norm": 0.8729221224784851, "learning_rate": 3.636363636363636e-06, "loss": 0.3077, "step": 60 }, { "epoch": 0.08571428571428572, "eval_loss": 0.4450100064277649, "eval_runtime": 59.8757, "eval_samples_per_second": 4.676, "eval_steps_per_second": 1.169, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3217787051458560.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }