{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.43873012004143563, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004874779111571507, "grad_norm": 581.0810546875, "learning_rate": 1.6233766233766232e-07, "loss": 42.9948, "step": 50 }, { "epoch": 0.009749558223143015, "grad_norm": 331.4380798339844, "learning_rate": 3.2467532467532465e-07, "loss": 36.4433, "step": 100 }, { "epoch": 0.014624337334714521, "grad_norm": 399.9220886230469, "learning_rate": 4.87012987012987e-07, "loss": 33.9757, "step": 150 }, { "epoch": 0.01949911644628603, "grad_norm": 227.4508514404297, "learning_rate": 6.493506493506493e-07, "loss": 24.5654, "step": 200 }, { "epoch": 0.024373895557857534, "grad_norm": 169.74786376953125, "learning_rate": 8.116883116883116e-07, "loss": 18.221, "step": 250 }, { "epoch": 0.029248674669429042, "grad_norm": 151.637451171875, "learning_rate": 9.74025974025974e-07, "loss": 15.1792, "step": 300 }, { "epoch": 0.03412345378100055, "grad_norm": 140.70602416992188, "learning_rate": 9.99956019474448e-07, "loss": 13.3062, "step": 350 }, { "epoch": 0.03899823289257206, "grad_norm": 163.72286987304688, "learning_rate": 9.997889850109673e-07, "loss": 12.1289, "step": 400 }, { "epoch": 0.04387301200414356, "grad_norm": 205.9079132080078, "learning_rate": 9.994973425669175e-07, "loss": 11.2206, "step": 450 }, { "epoch": 0.04874779111571507, "grad_norm": 190.47525024414062, "learning_rate": 9.990811648549374e-07, "loss": 10.3846, "step": 500 }, { "epoch": 0.05362257022728657, "grad_norm": 63.44021224975586, "learning_rate": 9.98540555636946e-07, "loss": 10.0807, "step": 550 }, { "epoch": 0.058497349338858085, "grad_norm": 134.91310119628906, "learning_rate": 9.978756496982724e-07, "loss": 9.2068, "step": 600 }, { "epoch": 0.0633721284504296, "grad_norm": 125.61437225341797, "learning_rate": 9.97086612814052e-07, "loss": 8.8829, "step": 650 }, { "epoch": 0.0682469075620011, "grad_norm": 148.3848876953125, "learning_rate": 9.961736417078928e-07, "loss": 8.8043, "step": 700 }, { "epoch": 0.0731216866735726, "grad_norm": 360.50970458984375, "learning_rate": 9.951369640028304e-07, "loss": 9.8165, "step": 750 }, { "epoch": 0.07799646578514412, "grad_norm": 89.09446716308594, "learning_rate": 9.939768381645761e-07, "loss": 8.9056, "step": 800 }, { "epoch": 0.08287124489671562, "grad_norm": 163.72415161132812, "learning_rate": 9.92693553437075e-07, "loss": 9.5952, "step": 850 }, { "epoch": 0.08774602400828713, "grad_norm": 147.0449981689453, "learning_rate": 9.912874297703925e-07, "loss": 9.0044, "step": 900 }, { "epoch": 0.09262080311985862, "grad_norm": 171.43394470214844, "learning_rate": 9.897588177409434e-07, "loss": 9.1141, "step": 950 }, { "epoch": 0.09749558223143014, "grad_norm": 206.17628479003906, "learning_rate": 9.88108098464086e-07, "loss": 8.1566, "step": 1000 }, { "epoch": 0.10237036134300165, "grad_norm": 131.0735321044922, "learning_rate": 9.863356834991016e-07, "loss": 8.4912, "step": 1050 }, { "epoch": 0.10724514045457315, "grad_norm": 140.00730895996094, "learning_rate": 9.844420147465848e-07, "loss": 8.1491, "step": 1100 }, { "epoch": 0.11211991956614466, "grad_norm": 163.5810089111328, "learning_rate": 9.824275643382676e-07, "loss": 8.6904, "step": 1150 }, { "epoch": 0.11699469867771617, "grad_norm": 129.7275848388672, "learning_rate": 9.802928345193068e-07, "loss": 8.1686, "step": 1200 }, { "epoch": 0.12186947778928767, "grad_norm": 115.78919219970703, "learning_rate": 9.780383575230648e-07, "loss": 7.6378, "step": 1250 }, { "epoch": 0.1267442569008592, "grad_norm": 74.11811828613281, "learning_rate": 9.756646954384115e-07, "loss": 7.8103, "step": 1300 }, { "epoch": 0.1316190360124307, "grad_norm": 93.70452880859375, "learning_rate": 9.731724400695836e-07, "loss": 8.122, "step": 1350 }, { "epoch": 0.1364938151240022, "grad_norm": 110.20355987548828, "learning_rate": 9.70562212788636e-07, "loss": 7.8767, "step": 1400 }, { "epoch": 0.1413685942355737, "grad_norm": 96.42841339111328, "learning_rate": 9.6783466438052e-07, "loss": 8.0516, "step": 1450 }, { "epoch": 0.1462433733471452, "grad_norm": 98.32221221923828, "learning_rate": 9.649904748808292e-07, "loss": 7.6941, "step": 1500 }, { "epoch": 0.1511181524587167, "grad_norm": 107.42027282714844, "learning_rate": 9.620303534062518e-07, "loss": 8.0057, "step": 1550 }, { "epoch": 0.15599293157028823, "grad_norm": 55.05694580078125, "learning_rate": 9.589550379777732e-07, "loss": 7.4756, "step": 1600 }, { "epoch": 0.16086771068185973, "grad_norm": 117.27649688720703, "learning_rate": 9.557652953366717e-07, "loss": 6.8833, "step": 1650 }, { "epoch": 0.16574248979343123, "grad_norm": 174.34263610839844, "learning_rate": 9.52461920753353e-07, "loss": 7.4795, "step": 1700 }, { "epoch": 0.17061726890500276, "grad_norm": 119.58318328857422, "learning_rate": 9.490457378290737e-07, "loss": 7.7871, "step": 1750 }, { "epoch": 0.17549204801657425, "grad_norm": 142.45582580566406, "learning_rate": 9.455175982905988e-07, "loss": 8.1505, "step": 1800 }, { "epoch": 0.18036682712814575, "grad_norm": 122.0265884399414, "learning_rate": 9.418783817778484e-07, "loss": 7.6914, "step": 1850 }, { "epoch": 0.18524160623971725, "grad_norm": 96.58927917480469, "learning_rate": 9.381289956245861e-07, "loss": 7.5846, "step": 1900 }, { "epoch": 0.19011638535128877, "grad_norm": 238.81964111328125, "learning_rate": 9.342703746321997e-07, "loss": 7.7886, "step": 1950 }, { "epoch": 0.19499116446286027, "grad_norm": 61.6027946472168, "learning_rate": 9.303034808366366e-07, "loss": 7.2491, "step": 2000 }, { "epoch": 0.19986594357443177, "grad_norm": 96.19196319580078, "learning_rate": 9.262293032685475e-07, "loss": 6.8776, "step": 2050 }, { "epoch": 0.2047407226860033, "grad_norm": 72.44068908691406, "learning_rate": 9.220488577066996e-07, "loss": 7.2714, "step": 2100 }, { "epoch": 0.2096155017975748, "grad_norm": 160.9955291748047, "learning_rate": 9.177631864247226e-07, "loss": 7.4344, "step": 2150 }, { "epoch": 0.2144902809091463, "grad_norm": 78.55003356933594, "learning_rate": 9.133733579312468e-07, "loss": 7.2211, "step": 2200 }, { "epoch": 0.21936506002071782, "grad_norm": 141.0493621826172, "learning_rate": 9.088804667035016e-07, "loss": 7.3533, "step": 2250 }, { "epoch": 0.22423983913228931, "grad_norm": 106.55406951904297, "learning_rate": 9.042856329144392e-07, "loss": 7.526, "step": 2300 }, { "epoch": 0.2291146182438608, "grad_norm": 116.47844696044922, "learning_rate": 8.995900021534517e-07, "loss": 6.5839, "step": 2350 }, { "epoch": 0.23398939735543234, "grad_norm": 87.43718719482422, "learning_rate": 8.947947451407512e-07, "loss": 7.2284, "step": 2400 }, { "epoch": 0.23886417646700384, "grad_norm": 135.8431854248047, "learning_rate": 8.89901057435485e-07, "loss": 7.6484, "step": 2450 }, { "epoch": 0.24373895557857533, "grad_norm": 87.54926300048828, "learning_rate": 8.849101591376568e-07, "loss": 7.2991, "step": 2500 }, { "epoch": 0.24861373469014686, "grad_norm": 109.77268981933594, "learning_rate": 8.798232945839304e-07, "loss": 6.9895, "step": 2550 }, { "epoch": 0.2534885138017184, "grad_norm": 118.82627868652344, "learning_rate": 8.746417320373896e-07, "loss": 7.4786, "step": 2600 }, { "epoch": 0.25836329291328985, "grad_norm": 175.50918579101562, "learning_rate": 8.693667633713338e-07, "loss": 6.6877, "step": 2650 }, { "epoch": 0.2632380720248614, "grad_norm": 190.9921875, "learning_rate": 8.639997037471867e-07, "loss": 6.8118, "step": 2700 }, { "epoch": 0.2681128511364329, "grad_norm": 163.4025115966797, "learning_rate": 8.585418912865986e-07, "loss": 6.9759, "step": 2750 }, { "epoch": 0.2729876302480044, "grad_norm": 149.16815185546875, "learning_rate": 8.529946867378241e-07, "loss": 7.2147, "step": 2800 }, { "epoch": 0.2778624093595759, "grad_norm": 97.56202697753906, "learning_rate": 8.473594731364587e-07, "loss": 7.1163, "step": 2850 }, { "epoch": 0.2827371884711474, "grad_norm": 104.49602508544922, "learning_rate": 8.416376554606195e-07, "loss": 7.5656, "step": 2900 }, { "epoch": 0.2876119675827189, "grad_norm": 130.78414916992188, "learning_rate": 8.358306602806534e-07, "loss": 6.901, "step": 2950 }, { "epoch": 0.2924867466942904, "grad_norm": 152.71543884277344, "learning_rate": 8.299399354034633e-07, "loss": 7.0534, "step": 3000 }, { "epoch": 0.29736152580586195, "grad_norm": 128.23233032226562, "learning_rate": 8.239669495115393e-07, "loss": 7.2949, "step": 3050 }, { "epoch": 0.3022363049174334, "grad_norm": 212.286865234375, "learning_rate": 8.179131917967852e-07, "loss": 7.1819, "step": 3100 }, { "epoch": 0.30711108402900494, "grad_norm": 146.38832092285156, "learning_rate": 8.117801715892306e-07, "loss": 7.3945, "step": 3150 }, { "epoch": 0.31198586314057647, "grad_norm": 98.95829010009766, "learning_rate": 8.05569417980724e-07, "loss": 7.0111, "step": 3200 }, { "epoch": 0.31686064225214794, "grad_norm": 147.78128051757812, "learning_rate": 7.992824794436971e-07, "loss": 7.1754, "step": 3250 }, { "epoch": 0.32173542136371946, "grad_norm": 137.8080596923828, "learning_rate": 7.92920923445098e-07, "loss": 7.2801, "step": 3300 }, { "epoch": 0.326610200475291, "grad_norm": 156.37132263183594, "learning_rate": 7.864863360555886e-07, "loss": 7.1625, "step": 3350 }, { "epoch": 0.33148497958686246, "grad_norm": 176.6288299560547, "learning_rate": 7.799803215541036e-07, "loss": 7.5386, "step": 3400 }, { "epoch": 0.336359758698434, "grad_norm": 150.56468200683594, "learning_rate": 7.734045020278694e-07, "loss": 6.9751, "step": 3450 }, { "epoch": 0.3412345378100055, "grad_norm": 138.31723022460938, "learning_rate": 7.667605169679842e-07, "loss": 6.8245, "step": 3500 }, { "epoch": 0.346109316921577, "grad_norm": 100.9895248413086, "learning_rate": 7.600500228606573e-07, "loss": 6.947, "step": 3550 }, { "epoch": 0.3509840960331485, "grad_norm": 142.1031036376953, "learning_rate": 7.532746927742119e-07, "loss": 6.9751, "step": 3600 }, { "epoch": 0.35585887514472, "grad_norm": 182.1207275390625, "learning_rate": 7.464362159419551e-07, "loss": 7.1473, "step": 3650 }, { "epoch": 0.3607336542562915, "grad_norm": 162.8542938232422, "learning_rate": 7.395362973410145e-07, "loss": 7.7815, "step": 3700 }, { "epoch": 0.36560843336786303, "grad_norm": 168.93276977539062, "learning_rate": 7.325766572672528e-07, "loss": 7.7646, "step": 3750 }, { "epoch": 0.3704832124794345, "grad_norm": 159.7053985595703, "learning_rate": 7.255590309063604e-07, "loss": 6.4885, "step": 3800 }, { "epoch": 0.375357991591006, "grad_norm": 96.07608795166016, "learning_rate": 7.184851679012374e-07, "loss": 6.9556, "step": 3850 }, { "epoch": 0.38023277070257755, "grad_norm": 158.5734100341797, "learning_rate": 7.113568319157707e-07, "loss": 6.9754, "step": 3900 }, { "epoch": 0.385107549814149, "grad_norm": 151.5749969482422, "learning_rate": 7.041758001951149e-07, "loss": 6.6478, "step": 3950 }, { "epoch": 0.38998232892572054, "grad_norm": 118.84528350830078, "learning_rate": 6.969438631225877e-07, "loss": 6.3464, "step": 4000 }, { "epoch": 0.39485710803729207, "grad_norm": 130.48410034179688, "learning_rate": 6.896628237732894e-07, "loss": 7.0122, "step": 4050 }, { "epoch": 0.39973188714886354, "grad_norm": 92.12020874023438, "learning_rate": 6.823344974645576e-07, "loss": 7.2089, "step": 4100 }, { "epoch": 0.40460666626043507, "grad_norm": 113.18313598632812, "learning_rate": 6.749607113033709e-07, "loss": 7.2546, "step": 4150 }, { "epoch": 0.4094814453720066, "grad_norm": 106.96129608154297, "learning_rate": 6.675433037308119e-07, "loss": 7.3078, "step": 4200 }, { "epoch": 0.41435622448357806, "grad_norm": 150.419677734375, "learning_rate": 6.600841240637052e-07, "loss": 7.1537, "step": 4250 }, { "epoch": 0.4192310035951496, "grad_norm": 88.62400817871094, "learning_rate": 6.525850320335433e-07, "loss": 7.0714, "step": 4300 }, { "epoch": 0.4241057827067211, "grad_norm": 83.91474151611328, "learning_rate": 6.450478973228162e-07, "loss": 6.9181, "step": 4350 }, { "epoch": 0.4289805618182926, "grad_norm": 135.38629150390625, "learning_rate": 6.374745990988598e-07, "loss": 7.1421, "step": 4400 }, { "epoch": 0.4338553409298641, "grad_norm": 101.2317123413086, "learning_rate": 6.298670255453404e-07, "loss": 6.6926, "step": 4450 }, { "epoch": 0.43873012004143563, "grad_norm": 112.95464324951172, "learning_rate": 6.222270733914895e-07, "loss": 6.7252, "step": 4500 } ], "logging_steps": 50, "max_steps": 10256, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8045681436358345e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }