{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.001194743130227, "eval_steps": 500, "global_step": 1675, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05973715651135006, "grad_norm": 2.8490114212036133, "learning_rate": 0.00016447368421052634, "loss": 11.6497, "step": 50 }, { "epoch": 0.11947431302270012, "grad_norm": 0.05463433265686035, "learning_rate": 0.0002475359342915811, "loss": 4.6396, "step": 100 }, { "epoch": 0.17921146953405018, "grad_norm": 0.03409096226096153, "learning_rate": 0.00024240246406570843, "loss": 4.4228, "step": 150 }, { "epoch": 0.23894862604540024, "grad_norm": 0.03413880988955498, "learning_rate": 0.00023726899383983574, "loss": 4.3988, "step": 200 }, { "epoch": 0.2986857825567503, "grad_norm": 0.033178623765707016, "learning_rate": 0.00023213552361396305, "loss": 4.3922, "step": 250 }, { "epoch": 0.35842293906810035, "grad_norm": 0.028784427791833878, "learning_rate": 0.00022700205338809036, "loss": 4.4053, "step": 300 }, { "epoch": 0.41816009557945044, "grad_norm": 0.036163728684186935, "learning_rate": 0.00022186858316221766, "loss": 4.3944, "step": 350 }, { "epoch": 0.4778972520908005, "grad_norm": 0.03532182425260544, "learning_rate": 0.00021673511293634497, "loss": 4.3938, "step": 400 }, { "epoch": 0.5376344086021505, "grad_norm": 0.03272629156708717, "learning_rate": 0.00021160164271047228, "loss": 4.3859, "step": 450 }, { "epoch": 0.5973715651135006, "grad_norm": 0.027959033846855164, "learning_rate": 0.0002064681724845996, "loss": 4.3881, "step": 500 }, { "epoch": 0.6571087216248507, "grad_norm": 0.024525364860892296, "learning_rate": 0.0002013347022587269, "loss": 4.3989, "step": 550 }, { "epoch": 0.7168458781362007, "grad_norm": 0.025551579892635345, "learning_rate": 0.00019620123203285423, "loss": 4.3802, "step": 600 }, { "epoch": 0.7765830346475507, "grad_norm": 0.03189048916101456, "learning_rate": 0.00019106776180698152, "loss": 4.4041, "step": 650 }, { "epoch": 0.8363201911589009, "grad_norm": 0.02770661748945713, "learning_rate": 0.00018593429158110883, "loss": 4.3955, "step": 700 }, { "epoch": 0.8960573476702509, "grad_norm": 0.03752126544713974, "learning_rate": 0.00018080082135523616, "loss": 4.3857, "step": 750 }, { "epoch": 0.955794504181601, "grad_norm": 0.0396958664059639, "learning_rate": 0.00017566735112936344, "loss": 4.3847, "step": 800 }, { "epoch": 1.015531660692951, "grad_norm": 0.03522910550236702, "learning_rate": 0.00017053388090349075, "loss": 4.3815, "step": 850 }, { "epoch": 1.075268817204301, "grad_norm": 0.033044200390577316, "learning_rate": 0.00016540041067761806, "loss": 4.3903, "step": 900 }, { "epoch": 1.135005973715651, "grad_norm": 0.03267841041088104, "learning_rate": 0.0001602669404517454, "loss": 4.3836, "step": 950 }, { "epoch": 1.194743130227001, "grad_norm": 0.04201454669237137, "learning_rate": 0.00015513347022587268, "loss": 4.3776, "step": 1000 }, { "epoch": 1.2544802867383513, "grad_norm": 0.047623638063669205, "learning_rate": 0.00015, "loss": 4.3734, "step": 1050 }, { "epoch": 1.3142174432497014, "grad_norm": 0.03200829401612282, "learning_rate": 0.00014486652977412732, "loss": 4.3837, "step": 1100 }, { "epoch": 1.3739545997610514, "grad_norm": 0.04358180612325668, "learning_rate": 0.00013973305954825463, "loss": 4.3815, "step": 1150 }, { "epoch": 1.4336917562724014, "grad_norm": 0.04975922778248787, "learning_rate": 0.0001345995893223819, "loss": 4.3746, "step": 1200 }, { "epoch": 1.4934289127837514, "grad_norm": 0.03673349320888519, "learning_rate": 0.00012946611909650925, "loss": 4.3755, "step": 1250 }, { "epoch": 1.5531660692951017, "grad_norm": 0.03130173310637474, "learning_rate": 0.00012433264887063656, "loss": 4.3902, "step": 1300 }, { "epoch": 1.6129032258064515, "grad_norm": 0.03993390500545502, "learning_rate": 0.00011919917864476385, "loss": 4.3852, "step": 1350 }, { "epoch": 1.6726403823178018, "grad_norm": 0.04937516897916794, "learning_rate": 0.00011406570841889118, "loss": 4.3782, "step": 1400 }, { "epoch": 1.7323775388291516, "grad_norm": 0.04578279331326485, "learning_rate": 0.00010893223819301848, "loss": 4.377, "step": 1450 }, { "epoch": 1.7921146953405018, "grad_norm": 0.048149123787879944, "learning_rate": 0.00010379876796714579, "loss": 4.3835, "step": 1500 }, { "epoch": 1.8518518518518519, "grad_norm": 0.0500078909099102, "learning_rate": 9.86652977412731e-05, "loss": 4.3806, "step": 1550 }, { "epoch": 1.911589008363202, "grad_norm": 0.040174700319767, "learning_rate": 9.353182751540041e-05, "loss": 4.3863, "step": 1600 }, { "epoch": 1.971326164874552, "grad_norm": 0.033409375697374344, "learning_rate": 8.839835728952772e-05, "loss": 4.3754, "step": 1650 } ], "logging_steps": 50, "max_steps": 2511, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0486522326430515e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }