| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 100, | |
| "global_step": 503, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0019880715705765406, | |
| "grad_norm": 127.0667237907462, | |
| "learning_rate": 5.882352941176471e-08, | |
| "loss": 3.918, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.019880715705765408, | |
| "grad_norm": 115.98143785016711, | |
| "learning_rate": 5.882352941176471e-07, | |
| "loss": 3.7431, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.039761431411530816, | |
| "grad_norm": 65.61390946147996, | |
| "learning_rate": 1.1764705882352942e-06, | |
| "loss": 3.0734, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05964214711729622, | |
| "grad_norm": 41.009392563213375, | |
| "learning_rate": 1.7647058823529412e-06, | |
| "loss": 2.4079, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07952286282306163, | |
| "grad_norm": 25.739828109506355, | |
| "learning_rate": 2.3529411764705885e-06, | |
| "loss": 2.1062, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09940357852882704, | |
| "grad_norm": 17.26914793098219, | |
| "learning_rate": 2.941176470588235e-06, | |
| "loss": 2.0659, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.11928429423459244, | |
| "grad_norm": 25.05959744665656, | |
| "learning_rate": 2.9970662200387674e-06, | |
| "loss": 2.0592, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.13916500994035785, | |
| "grad_norm": 15.43081495953451, | |
| "learning_rate": 2.986939491128791e-06, | |
| "loss": 2.0407, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.15904572564612326, | |
| "grad_norm": 12.58490370084457, | |
| "learning_rate": 2.969632483038685e-06, | |
| "loss": 2.0257, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.17892644135188868, | |
| "grad_norm": 20.125125391550814, | |
| "learning_rate": 2.94522876954573e-06, | |
| "loss": 1.9816, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1988071570576541, | |
| "grad_norm": 8.501045071308724, | |
| "learning_rate": 2.9138461936939467e-06, | |
| "loss": 1.9523, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1988071570576541, | |
| "eval_loss": 1.9134721755981445, | |
| "eval_runtime": 63.5356, | |
| "eval_samples_per_second": 26.662, | |
| "eval_steps_per_second": 0.425, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.21868787276341947, | |
| "grad_norm": 22.082329977354508, | |
| "learning_rate": 2.875636298742058e-06, | |
| "loss": 1.9368, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.23856858846918488, | |
| "grad_norm": 15.818359711105115, | |
| "learning_rate": 2.8307835963765403e-06, | |
| "loss": 1.9224, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2584493041749503, | |
| "grad_norm": 13.730610615135147, | |
| "learning_rate": 2.779504675723508e-06, | |
| "loss": 1.9146, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2783300198807157, | |
| "grad_norm": 6.635422891471246, | |
| "learning_rate": 2.722047157461906e-06, | |
| "loss": 1.9164, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2982107355864811, | |
| "grad_norm": 8.086513909738859, | |
| "learning_rate": 2.6586884980885044e-06, | |
| "loss": 1.9076, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.31809145129224653, | |
| "grad_norm": 13.60628995566901, | |
| "learning_rate": 2.5897346501087633e-06, | |
| "loss": 1.9047, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3379721669980119, | |
| "grad_norm": 18.644035876158316, | |
| "learning_rate": 2.5155185846233844e-06, | |
| "loss": 1.8866, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.35785288270377735, | |
| "grad_norm": 17.47074603496702, | |
| "learning_rate": 2.43639868344482e-06, | |
| "loss": 1.8956, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.37773359840954274, | |
| "grad_norm": 7.178728301299398, | |
| "learning_rate": 2.3527570085080407e-06, | |
| "loss": 1.9043, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3976143141153082, | |
| "grad_norm": 12.253857653229236, | |
| "learning_rate": 2.264997456932413e-06, | |
| "loss": 1.9159, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3976143141153082, | |
| "eval_loss": 1.8660345077514648, | |
| "eval_runtime": 63.4283, | |
| "eval_samples_per_second": 26.707, | |
| "eval_steps_per_second": 0.426, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.41749502982107356, | |
| "grad_norm": 11.988930445624767, | |
| "learning_rate": 2.1735438106436967e-06, | |
| "loss": 1.9004, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.43737574552683894, | |
| "grad_norm": 10.753394166683348, | |
| "learning_rate": 2.078837689974332e-06, | |
| "loss": 1.9172, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4572564612326044, | |
| "grad_norm": 9.351072895819225, | |
| "learning_rate": 1.981336421123892e-06, | |
| "loss": 1.9192, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.47713717693836977, | |
| "grad_norm": 7.682035183703906, | |
| "learning_rate": 1.8815108277774976e-06, | |
| "loss": 1.8959, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4970178926441352, | |
| "grad_norm": 11.950308335271014, | |
| "learning_rate": 1.7798429575462477e-06, | |
| "loss": 1.8733, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5168986083499006, | |
| "grad_norm": 10.982955764593422, | |
| "learning_rate": 1.6768237542084645e-06, | |
| "loss": 1.8827, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.536779324055666, | |
| "grad_norm": 11.904679442862472, | |
| "learning_rate": 1.5729506869922447e-06, | |
| "loss": 1.8765, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5566600397614314, | |
| "grad_norm": 9.827244740110832, | |
| "learning_rate": 1.4687253483472872e-06, | |
| "loss": 1.8841, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5765407554671969, | |
| "grad_norm": 7.306293618482458, | |
| "learning_rate": 1.3646510318060986e-06, | |
| "loss": 1.8773, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5964214711729622, | |
| "grad_norm": 9.071120593329336, | |
| "learning_rate": 1.2612303016308466e-06, | |
| "loss": 1.875, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5964214711729622, | |
| "eval_loss": 1.8548645973205566, | |
| "eval_runtime": 63.4556, | |
| "eval_samples_per_second": 26.696, | |
| "eval_steps_per_second": 0.425, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6163021868787276, | |
| "grad_norm": 9.443760924559943, | |
| "learning_rate": 1.1589625659817845e-06, | |
| "loss": 1.8568, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6361829025844931, | |
| "grad_norm": 11.674966838003883, | |
| "learning_rate": 1.0583416653261663e-06, | |
| "loss": 1.877, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6560636182902585, | |
| "grad_norm": 8.948890555802585, | |
| "learning_rate": 9.598534877329919e-07, | |
| "loss": 1.8663, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6759443339960238, | |
| "grad_norm": 7.926486654587874, | |
| "learning_rate": 8.639736225690654e-07, | |
| "loss": 1.8776, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6958250497017893, | |
| "grad_norm": 6.133488811515441, | |
| "learning_rate": 7.711650639264374e-07, | |
| "loss": 1.8669, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7157057654075547, | |
| "grad_norm": 5.995054734148766, | |
| "learning_rate": 6.818759748711476e-07, | |
| "loss": 1.8661, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.73558648111332, | |
| "grad_norm": 6.532247908566974, | |
| "learning_rate": 5.965375233094762e-07, | |
| "loss": 1.8429, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.7554671968190855, | |
| "grad_norm": 8.52788084676906, | |
| "learning_rate": 5.155617999220938e-07, | |
| "loss": 1.883, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.7753479125248509, | |
| "grad_norm": 7.412951150651719, | |
| "learning_rate": 4.3933982822017883e-07, | |
| "loss": 1.8518, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7952286282306164, | |
| "grad_norm": 8.704060015623933, | |
| "learning_rate": 3.6823967633276183e-07, | |
| "loss": 1.841, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7952286282306164, | |
| "eval_loss": 1.834498643875122, | |
| "eval_runtime": 63.4011, | |
| "eval_samples_per_second": 26.719, | |
| "eval_steps_per_second": 0.426, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8151093439363817, | |
| "grad_norm": 10.837619200928202, | |
| "learning_rate": 3.026046796432582e-07, | |
| "loss": 1.8274, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8349900596421471, | |
| "grad_norm": 9.650000403237328, | |
| "learning_rate": 2.4275178285790973e-07, | |
| "loss": 1.8457, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8548707753479126, | |
| "grad_norm": 8.514388424330665, | |
| "learning_rate": 1.889700095121219e-07, | |
| "loss": 1.8333, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.8747514910536779, | |
| "grad_norm": 11.640897943920702, | |
| "learning_rate": 1.4151906630527865e-07, | |
| "loss": 1.8412, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8946322067594433, | |
| "grad_norm": 13.422663396524422, | |
| "learning_rate": 1.00628089003575e-07, | |
| "loss": 1.8505, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9145129224652088, | |
| "grad_norm": 6.751723671685878, | |
| "learning_rate": 6.649453596676663e-08, | |
| "loss": 1.8411, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9343936381709742, | |
| "grad_norm": 7.821816393267081, | |
| "learning_rate": 3.928323464188621e-08, | |
| "loss": 1.8268, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9542743538767395, | |
| "grad_norm": 7.257934599669054, | |
| "learning_rate": 1.9125585628307407e-08, | |
| "loss": 1.8413, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.974155069582505, | |
| "grad_norm": 7.949497125452414, | |
| "learning_rate": 6.118928157650749e-09, | |
| "loss": 1.8531, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.9940357852882704, | |
| "grad_norm": 7.167162679479334, | |
| "learning_rate": 3.260700525591909e-10, | |
| "loss": 1.8309, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9940357852882704, | |
| "eval_loss": 1.8293424844741821, | |
| "eval_runtime": 63.4663, | |
| "eval_samples_per_second": 26.691, | |
| "eval_steps_per_second": 0.425, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 503, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 162675912867840.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |