{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6827233074151337, "eval_steps": 500, "global_step": 5400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012643024211391365, "grad_norm": 0.8759542107582092, "learning_rate": 0.0004987484197218711, "loss": 3.8722, "step": 100 }, { "epoch": 0.02528604842278273, "grad_norm": 0.9236809015274048, "learning_rate": 0.0004974841972187105, "loss": 3.4858, "step": 200 }, { "epoch": 0.037929072634174096, "grad_norm": 0.8505849242210388, "learning_rate": 0.0004962199747155499, "loss": 3.3438, "step": 300 }, { "epoch": 0.05057209684556546, "grad_norm": 0.8044902682304382, "learning_rate": 0.0004949557522123893, "loss": 3.2837, "step": 400 }, { "epoch": 0.06321512105695683, "grad_norm": 0.7873915433883667, "learning_rate": 0.0004936915297092288, "loss": 3.194, "step": 500 }, { "epoch": 0.07585814526834819, "grad_norm": 0.7622674107551575, "learning_rate": 0.0004924273072060682, "loss": 3.126, "step": 600 }, { "epoch": 0.08850116947973956, "grad_norm": 0.8418383002281189, "learning_rate": 0.0004911630847029077, "loss": 3.0518, "step": 700 }, { "epoch": 0.10114419369113092, "grad_norm": 0.7434802055358887, "learning_rate": 0.0004898988621997471, "loss": 3.0434, "step": 800 }, { "epoch": 0.11378721790252229, "grad_norm": 0.8024940490722656, "learning_rate": 0.0004886346396965867, "loss": 2.9942, "step": 900 }, { "epoch": 0.12643024211391365, "grad_norm": 0.8081286549568176, "learning_rate": 0.00048737041719342606, "loss": 2.9878, "step": 1000 }, { "epoch": 0.139073266325305, "grad_norm": 0.7084025144577026, "learning_rate": 0.0004861061946902655, "loss": 2.9314, "step": 1100 }, { "epoch": 0.15171629053669639, "grad_norm": 0.7388598322868347, "learning_rate": 0.000484841972187105, "loss": 2.9152, "step": 1200 }, { "epoch": 0.16435931474808774, "grad_norm": 0.7991167306900024, "learning_rate": 0.0004835777496839444, "loss": 2.917, "step": 1300 }, { "epoch": 0.17700233895947912, "grad_norm": 0.7912219762802124, "learning_rate": 0.0004823135271807838, "loss": 2.8725, "step": 1400 }, { "epoch": 0.18964536317087047, "grad_norm": 0.8445726633071899, "learning_rate": 0.00048104930467762324, "loss": 2.8843, "step": 1500 }, { "epoch": 0.20228838738226185, "grad_norm": 0.7209933400154114, "learning_rate": 0.0004797850821744627, "loss": 2.8298, "step": 1600 }, { "epoch": 0.2149314115936532, "grad_norm": 0.7905689477920532, "learning_rate": 0.00047852085967130215, "loss": 2.862, "step": 1700 }, { "epoch": 0.22757443580504458, "grad_norm": 0.745158314704895, "learning_rate": 0.0004772566371681416, "loss": 2.781, "step": 1800 }, { "epoch": 0.24021746001643593, "grad_norm": 0.7118976712226868, "learning_rate": 0.00047599241466498107, "loss": 2.7783, "step": 1900 }, { "epoch": 0.2528604842278273, "grad_norm": 0.7946869730949402, "learning_rate": 0.0004747281921618205, "loss": 2.7825, "step": 2000 }, { "epoch": 0.26550350843921866, "grad_norm": 0.7247060537338257, "learning_rate": 0.00047346396965865993, "loss": 2.7839, "step": 2100 }, { "epoch": 0.27814653265061, "grad_norm": 0.7256483435630798, "learning_rate": 0.0004721997471554994, "loss": 2.7731, "step": 2200 }, { "epoch": 0.29078955686200136, "grad_norm": 0.7218326926231384, "learning_rate": 0.0004709355246523388, "loss": 2.8133, "step": 2300 }, { "epoch": 0.30343258107339277, "grad_norm": 0.7010550498962402, "learning_rate": 0.00046967130214917825, "loss": 2.7432, "step": 2400 }, { "epoch": 0.3160756052847841, "grad_norm": 0.7964794635772705, "learning_rate": 0.0004684070796460177, "loss": 2.7811, "step": 2500 }, { "epoch": 0.3287186294961755, "grad_norm": 0.8072954416275024, "learning_rate": 0.00046714285714285716, "loss": 2.7089, "step": 2600 }, { "epoch": 0.3413616537075668, "grad_norm": 0.6594070196151733, "learning_rate": 0.0004658786346396966, "loss": 2.7161, "step": 2700 }, { "epoch": 0.35400467791895823, "grad_norm": 0.704298734664917, "learning_rate": 0.000464614412136536, "loss": 2.698, "step": 2800 }, { "epoch": 0.3666477021303496, "grad_norm": 0.7253355383872986, "learning_rate": 0.0004633501896333755, "loss": 2.696, "step": 2900 }, { "epoch": 0.37929072634174094, "grad_norm": 0.7043545246124268, "learning_rate": 0.00046208596713021493, "loss": 2.6807, "step": 3000 }, { "epoch": 0.3919337505531323, "grad_norm": 0.6532794237136841, "learning_rate": 0.0004608217446270544, "loss": 2.6985, "step": 3100 }, { "epoch": 0.4045767747645237, "grad_norm": 0.7272788286209106, "learning_rate": 0.0004595575221238938, "loss": 2.6767, "step": 3200 }, { "epoch": 0.41721979897591505, "grad_norm": 0.695071280002594, "learning_rate": 0.00045829329962073325, "loss": 2.6609, "step": 3300 }, { "epoch": 0.4298628231873064, "grad_norm": 0.7230761051177979, "learning_rate": 0.0004570290771175727, "loss": 2.6488, "step": 3400 }, { "epoch": 0.44250584739869775, "grad_norm": 0.7420136332511902, "learning_rate": 0.00045576485461441217, "loss": 2.6507, "step": 3500 }, { "epoch": 0.45514887161008916, "grad_norm": 0.7115824222564697, "learning_rate": 0.00045450063211125157, "loss": 2.644, "step": 3600 }, { "epoch": 0.4677918958214805, "grad_norm": 0.6667810678482056, "learning_rate": 0.000453236409608091, "loss": 2.6841, "step": 3700 }, { "epoch": 0.48043492003287186, "grad_norm": 0.6836283802986145, "learning_rate": 0.0004519721871049305, "loss": 2.6462, "step": 3800 }, { "epoch": 0.4930779442442632, "grad_norm": 0.7117214202880859, "learning_rate": 0.00045070796460176994, "loss": 2.6201, "step": 3900 }, { "epoch": 0.5057209684556546, "grad_norm": 0.6085230708122253, "learning_rate": 0.0004494437420986094, "loss": 2.6198, "step": 4000 }, { "epoch": 0.5183639926670459, "grad_norm": 0.663446843624115, "learning_rate": 0.0004481795195954488, "loss": 2.5972, "step": 4100 }, { "epoch": 0.5310070168784373, "grad_norm": 0.670093297958374, "learning_rate": 0.00044691529709228826, "loss": 2.6052, "step": 4200 }, { "epoch": 0.5436500410898287, "grad_norm": 0.6052363514900208, "learning_rate": 0.00044565107458912766, "loss": 2.6038, "step": 4300 }, { "epoch": 0.55629306530122, "grad_norm": 0.6686172485351562, "learning_rate": 0.0004443868520859671, "loss": 2.5484, "step": 4400 }, { "epoch": 0.5689360895126114, "grad_norm": 0.6228762865066528, "learning_rate": 0.0004431226295828066, "loss": 2.6119, "step": 4500 }, { "epoch": 0.5815791137240027, "grad_norm": 0.6712014079093933, "learning_rate": 0.00044185840707964603, "loss": 2.581, "step": 4600 }, { "epoch": 0.5942221379353941, "grad_norm": 0.6657222509384155, "learning_rate": 0.0004405941845764855, "loss": 2.5822, "step": 4700 }, { "epoch": 0.6068651621467855, "grad_norm": 0.639202356338501, "learning_rate": 0.00043932996207332494, "loss": 2.5736, "step": 4800 }, { "epoch": 0.6195081863581768, "grad_norm": 0.654742419719696, "learning_rate": 0.0004380657395701644, "loss": 2.5515, "step": 4900 }, { "epoch": 0.6321512105695682, "grad_norm": 0.704134464263916, "learning_rate": 0.0004368015170670038, "loss": 2.5499, "step": 5000 }, { "epoch": 0.6447942347809597, "grad_norm": 0.6817001104354858, "learning_rate": 0.0004355372945638432, "loss": 2.611, "step": 5100 }, { "epoch": 0.657437258992351, "grad_norm": 0.6351118087768555, "learning_rate": 0.00043427307206068266, "loss": 2.566, "step": 5200 }, { "epoch": 0.6700802832037424, "grad_norm": 0.6755563020706177, "learning_rate": 0.0004330088495575221, "loss": 2.5771, "step": 5300 }, { "epoch": 0.6827233074151337, "grad_norm": 0.6010642647743225, "learning_rate": 0.0004317446270543616, "loss": 2.5216, "step": 5400 } ], "logging_steps": 100, "max_steps": 39550, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.2329309184e+16, "train_batch_size": 15, "trial_name": null, "trial_params": null }