{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5187, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00963948332369385, "grad_norm": 6.363353729248047, "learning_rate": 4.9527665317139e-05, "loss": 5.7207, "step": 50 }, { "epoch": 0.0192789666473877, "grad_norm": 6.101255416870117, "learning_rate": 4.904569115095431e-05, "loss": 4.3487, "step": 100 }, { "epoch": 0.02891844997108155, "grad_norm": 5.563638210296631, "learning_rate": 4.856371698476962e-05, "loss": 4.077, "step": 150 }, { "epoch": 0.0385579332947754, "grad_norm": 5.894318580627441, "learning_rate": 4.8081742818584925e-05, "loss": 3.8687, "step": 200 }, { "epoch": 0.04819741661846925, "grad_norm": 5.281033992767334, "learning_rate": 4.7599768652400236e-05, "loss": 3.698, "step": 250 }, { "epoch": 0.0578368999421631, "grad_norm": 5.9448089599609375, "learning_rate": 4.711779448621554e-05, "loss": 3.5277, "step": 300 }, { "epoch": 0.06747638326585695, "grad_norm": 5.122015953063965, "learning_rate": 4.6635820320030846e-05, "loss": 3.524, "step": 350 }, { "epoch": 0.0771158665895508, "grad_norm": 5.508582592010498, "learning_rate": 4.615384615384616e-05, "loss": 3.3969, "step": 400 }, { "epoch": 0.08675534991324466, "grad_norm": 4.6635661125183105, "learning_rate": 4.567187198766146e-05, "loss": 3.4104, "step": 450 }, { "epoch": 0.0963948332369385, "grad_norm": 4.993603706359863, "learning_rate": 4.5189897821476775e-05, "loss": 3.317, "step": 500 }, { "epoch": 0.10603431656063235, "grad_norm": 4.281131267547607, "learning_rate": 4.470792365529208e-05, "loss": 3.27, "step": 550 }, { "epoch": 0.1156737998843262, "grad_norm": 4.340400218963623, "learning_rate": 4.4225949489107385e-05, "loss": 3.2469, "step": 600 }, { "epoch": 0.12531328320802004, "grad_norm": 4.594860076904297, "learning_rate": 4.374397532292269e-05, "loss": 3.1855, "step": 650 }, { "epoch": 0.1349527665317139, "grad_norm": 4.50878381729126, "learning_rate": 4.3262001156738e-05, "loss": 3.1646, "step": 700 }, { "epoch": 0.14459224985540775, "grad_norm": 4.4101715087890625, "learning_rate": 4.278002699055331e-05, "loss": 3.0478, "step": 750 }, { "epoch": 0.1542317331791016, "grad_norm": 4.285751819610596, "learning_rate": 4.229805282436862e-05, "loss": 3.0654, "step": 800 }, { "epoch": 0.16387121650279546, "grad_norm": 4.346066474914551, "learning_rate": 4.1816078658183924e-05, "loss": 3.1125, "step": 850 }, { "epoch": 0.1735106998264893, "grad_norm": 4.069290637969971, "learning_rate": 4.133410449199923e-05, "loss": 3.0389, "step": 900 }, { "epoch": 0.18315018315018314, "grad_norm": 4.5510382652282715, "learning_rate": 4.0852130325814534e-05, "loss": 2.9808, "step": 950 }, { "epoch": 0.192789666473877, "grad_norm": 4.202208995819092, "learning_rate": 4.0370156159629845e-05, "loss": 3.0209, "step": 1000 }, { "epoch": 0.20242914979757085, "grad_norm": 5.317561149597168, "learning_rate": 3.988818199344515e-05, "loss": 3.0355, "step": 1050 }, { "epoch": 0.2120686331212647, "grad_norm": 4.339507102966309, "learning_rate": 3.940620782726046e-05, "loss": 2.9353, "step": 1100 }, { "epoch": 0.22170811644495855, "grad_norm": 4.036981105804443, "learning_rate": 3.8924233661075774e-05, "loss": 2.9902, "step": 1150 }, { "epoch": 0.2313475997686524, "grad_norm": 4.5993757247924805, "learning_rate": 3.844225949489107e-05, "loss": 2.9957, "step": 1200 }, { "epoch": 0.24098708309234626, "grad_norm": 4.273294448852539, "learning_rate": 3.7960285328706384e-05, "loss": 2.9309, "step": 1250 }, { "epoch": 0.2506265664160401, "grad_norm": 4.0223774909973145, "learning_rate": 3.747831116252169e-05, "loss": 2.8886, "step": 1300 }, { "epoch": 0.26026604973973394, "grad_norm": 4.813283443450928, "learning_rate": 3.6996336996337e-05, "loss": 2.7496, "step": 1350 }, { "epoch": 0.2699055330634278, "grad_norm": 4.476084232330322, "learning_rate": 3.6514362830152306e-05, "loss": 2.8673, "step": 1400 }, { "epoch": 0.27954501638712165, "grad_norm": 4.051555156707764, "learning_rate": 3.603238866396762e-05, "loss": 2.8135, "step": 1450 }, { "epoch": 0.2891844997108155, "grad_norm": 4.186788558959961, "learning_rate": 3.5550414497782916e-05, "loss": 2.9019, "step": 1500 }, { "epoch": 0.29882398303450936, "grad_norm": 4.216615200042725, "learning_rate": 3.506844033159823e-05, "loss": 2.8877, "step": 1550 }, { "epoch": 0.3084634663582032, "grad_norm": 4.653785705566406, "learning_rate": 3.458646616541353e-05, "loss": 2.8091, "step": 1600 }, { "epoch": 0.31810294968189706, "grad_norm": 3.883335828781128, "learning_rate": 3.4104491999228844e-05, "loss": 2.7521, "step": 1650 }, { "epoch": 0.3277424330055909, "grad_norm": 4.467517375946045, "learning_rate": 3.362251783304415e-05, "loss": 2.7753, "step": 1700 }, { "epoch": 0.33738191632928477, "grad_norm": 3.839921474456787, "learning_rate": 3.314054366685946e-05, "loss": 2.7853, "step": 1750 }, { "epoch": 0.3470213996529786, "grad_norm": 3.923483371734619, "learning_rate": 3.2658569500674766e-05, "loss": 2.872, "step": 1800 }, { "epoch": 0.3566608829766725, "grad_norm": 4.523361682891846, "learning_rate": 3.217659533449007e-05, "loss": 2.7785, "step": 1850 }, { "epoch": 0.3663003663003663, "grad_norm": 3.865365743637085, "learning_rate": 3.169462116830538e-05, "loss": 2.8373, "step": 1900 }, { "epoch": 0.37593984962406013, "grad_norm": 3.9936673641204834, "learning_rate": 3.121264700212069e-05, "loss": 2.7323, "step": 1950 }, { "epoch": 0.385579332947754, "grad_norm": 4.1067633628845215, "learning_rate": 3.0730672835936e-05, "loss": 2.7585, "step": 2000 }, { "epoch": 0.39521881627144784, "grad_norm": 3.85208797454834, "learning_rate": 3.02486986697513e-05, "loss": 2.7564, "step": 2050 }, { "epoch": 0.4048582995951417, "grad_norm": 4.24629020690918, "learning_rate": 2.9766724503566613e-05, "loss": 2.7274, "step": 2100 }, { "epoch": 0.41449778291883554, "grad_norm": 3.905611276626587, "learning_rate": 2.9284750337381918e-05, "loss": 2.7094, "step": 2150 }, { "epoch": 0.4241372662425294, "grad_norm": 3.9592058658599854, "learning_rate": 2.8802776171197226e-05, "loss": 2.7046, "step": 2200 }, { "epoch": 0.43377674956622325, "grad_norm": 3.860285520553589, "learning_rate": 2.832080200501253e-05, "loss": 2.7154, "step": 2250 }, { "epoch": 0.4434162328899171, "grad_norm": 3.989696502685547, "learning_rate": 2.783882783882784e-05, "loss": 2.6632, "step": 2300 }, { "epoch": 0.45305571621361096, "grad_norm": 3.987741708755493, "learning_rate": 2.7356853672643145e-05, "loss": 2.7178, "step": 2350 }, { "epoch": 0.4626951995373048, "grad_norm": 3.9146411418914795, "learning_rate": 2.6874879506458457e-05, "loss": 2.7039, "step": 2400 }, { "epoch": 0.47233468286099867, "grad_norm": 4.281154155731201, "learning_rate": 2.639290534027376e-05, "loss": 2.6205, "step": 2450 }, { "epoch": 0.4819741661846925, "grad_norm": 3.6197686195373535, "learning_rate": 2.591093117408907e-05, "loss": 2.723, "step": 2500 }, { "epoch": 0.4916136495083864, "grad_norm": 3.7195041179656982, "learning_rate": 2.5428957007904375e-05, "loss": 2.6071, "step": 2550 }, { "epoch": 0.5012531328320802, "grad_norm": 3.775972604751587, "learning_rate": 2.4946982841719683e-05, "loss": 2.6749, "step": 2600 }, { "epoch": 0.5108926161557741, "grad_norm": 3.9212749004364014, "learning_rate": 2.4465008675534992e-05, "loss": 2.6913, "step": 2650 }, { "epoch": 0.5205320994794679, "grad_norm": 3.9374866485595703, "learning_rate": 2.39830345093503e-05, "loss": 2.6457, "step": 2700 }, { "epoch": 0.5301715828031618, "grad_norm": 4.192444801330566, "learning_rate": 2.3501060343165605e-05, "loss": 2.7204, "step": 2750 }, { "epoch": 0.5398110661268556, "grad_norm": 3.428612232208252, "learning_rate": 2.3019086176980914e-05, "loss": 2.7221, "step": 2800 }, { "epoch": 0.5494505494505495, "grad_norm": 4.013959884643555, "learning_rate": 2.2537112010796222e-05, "loss": 2.6046, "step": 2850 }, { "epoch": 0.5590900327742433, "grad_norm": 4.100067138671875, "learning_rate": 2.205513784461153e-05, "loss": 2.6887, "step": 2900 }, { "epoch": 0.5687295160979372, "grad_norm": 3.5404722690582275, "learning_rate": 2.157316367842684e-05, "loss": 2.5933, "step": 2950 }, { "epoch": 0.578368999421631, "grad_norm": 3.6547091007232666, "learning_rate": 2.1091189512242147e-05, "loss": 2.6171, "step": 3000 }, { "epoch": 0.5880084827453248, "grad_norm": 3.81042742729187, "learning_rate": 2.0609215346057452e-05, "loss": 2.5319, "step": 3050 }, { "epoch": 0.5976479660690187, "grad_norm": 3.987117052078247, "learning_rate": 2.012724117987276e-05, "loss": 2.6596, "step": 3100 }, { "epoch": 0.6072874493927125, "grad_norm": 3.5897133350372314, "learning_rate": 1.964526701368807e-05, "loss": 2.634, "step": 3150 }, { "epoch": 0.6169269327164064, "grad_norm": 4.190171241760254, "learning_rate": 1.9163292847503374e-05, "loss": 2.5889, "step": 3200 }, { "epoch": 0.6265664160401002, "grad_norm": 3.7671003341674805, "learning_rate": 1.8681318681318682e-05, "loss": 2.6186, "step": 3250 }, { "epoch": 0.6362058993637941, "grad_norm": 4.126290798187256, "learning_rate": 1.819934451513399e-05, "loss": 2.5847, "step": 3300 }, { "epoch": 0.6458453826874879, "grad_norm": 4.023561000823975, "learning_rate": 1.7717370348949296e-05, "loss": 2.5474, "step": 3350 }, { "epoch": 0.6554848660111818, "grad_norm": 3.9225897789001465, "learning_rate": 1.7235396182764604e-05, "loss": 2.6056, "step": 3400 }, { "epoch": 0.6651243493348756, "grad_norm": 3.6160168647766113, "learning_rate": 1.6753422016579912e-05, "loss": 2.5559, "step": 3450 }, { "epoch": 0.6747638326585695, "grad_norm": 4.005686283111572, "learning_rate": 1.6271447850395217e-05, "loss": 2.5416, "step": 3500 }, { "epoch": 0.6844033159822633, "grad_norm": 3.8741414546966553, "learning_rate": 1.5789473684210526e-05, "loss": 2.5972, "step": 3550 }, { "epoch": 0.6940427993059572, "grad_norm": 3.710710048675537, "learning_rate": 1.5307499518025834e-05, "loss": 2.5787, "step": 3600 }, { "epoch": 0.703682282629651, "grad_norm": 3.460242748260498, "learning_rate": 1.4825525351841141e-05, "loss": 2.554, "step": 3650 }, { "epoch": 0.713321765953345, "grad_norm": 3.8803932666778564, "learning_rate": 1.4343551185656451e-05, "loss": 2.5613, "step": 3700 }, { "epoch": 0.7229612492770388, "grad_norm": 3.8178253173828125, "learning_rate": 1.3861577019471758e-05, "loss": 2.5626, "step": 3750 }, { "epoch": 0.7326007326007326, "grad_norm": 3.364790201187134, "learning_rate": 1.3379602853287066e-05, "loss": 2.5721, "step": 3800 }, { "epoch": 0.7422402159244265, "grad_norm": 3.5198776721954346, "learning_rate": 1.2897628687102373e-05, "loss": 2.5233, "step": 3850 }, { "epoch": 0.7518796992481203, "grad_norm": 3.782043695449829, "learning_rate": 1.241565452091768e-05, "loss": 2.541, "step": 3900 }, { "epoch": 0.7615191825718142, "grad_norm": 4.032742023468018, "learning_rate": 1.1933680354732988e-05, "loss": 2.493, "step": 3950 }, { "epoch": 0.771158665895508, "grad_norm": 3.8995766639709473, "learning_rate": 1.1451706188548295e-05, "loss": 2.5361, "step": 4000 }, { "epoch": 0.7807981492192019, "grad_norm": 3.67946457862854, "learning_rate": 1.0969732022363601e-05, "loss": 2.5408, "step": 4050 }, { "epoch": 0.7904376325428957, "grad_norm": 3.4958484172821045, "learning_rate": 1.048775785617891e-05, "loss": 2.5261, "step": 4100 }, { "epoch": 0.8000771158665896, "grad_norm": 3.8448803424835205, "learning_rate": 1.0005783689994216e-05, "loss": 2.5389, "step": 4150 }, { "epoch": 0.8097165991902834, "grad_norm": 3.7735507488250732, "learning_rate": 9.523809523809523e-06, "loss": 2.6247, "step": 4200 }, { "epoch": 0.8193560825139773, "grad_norm": 3.4487788677215576, "learning_rate": 9.041835357624833e-06, "loss": 2.5704, "step": 4250 }, { "epoch": 0.8289955658376711, "grad_norm": 3.943000316619873, "learning_rate": 8.55986119144014e-06, "loss": 2.5289, "step": 4300 }, { "epoch": 0.838635049161365, "grad_norm": 3.7842445373535156, "learning_rate": 8.077887025255447e-06, "loss": 2.5765, "step": 4350 }, { "epoch": 0.8482745324850588, "grad_norm": 3.741563320159912, "learning_rate": 7.595912859070754e-06, "loss": 2.4869, "step": 4400 }, { "epoch": 0.8579140158087527, "grad_norm": 3.6693384647369385, "learning_rate": 7.113938692886062e-06, "loss": 2.5513, "step": 4450 }, { "epoch": 0.8675534991324465, "grad_norm": 3.590758800506592, "learning_rate": 6.631964526701369e-06, "loss": 2.566, "step": 4500 }, { "epoch": 0.8771929824561403, "grad_norm": 3.282844305038452, "learning_rate": 6.149990360516677e-06, "loss": 2.5001, "step": 4550 }, { "epoch": 0.8868324657798342, "grad_norm": 4.462714195251465, "learning_rate": 5.668016194331984e-06, "loss": 2.4838, "step": 4600 }, { "epoch": 0.896471949103528, "grad_norm": 4.180957317352295, "learning_rate": 5.186042028147292e-06, "loss": 2.477, "step": 4650 }, { "epoch": 0.9061114324272219, "grad_norm": 4.506474018096924, "learning_rate": 4.7040678619625985e-06, "loss": 2.5097, "step": 4700 }, { "epoch": 0.9157509157509157, "grad_norm": 3.257143974304199, "learning_rate": 4.222093695777907e-06, "loss": 2.6252, "step": 4750 }, { "epoch": 0.9253903990746096, "grad_norm": 3.498189926147461, "learning_rate": 3.740119529593214e-06, "loss": 2.5133, "step": 4800 }, { "epoch": 0.9350298823983034, "grad_norm": 3.404567241668701, "learning_rate": 3.258145363408521e-06, "loss": 2.4482, "step": 4850 }, { "epoch": 0.9446693657219973, "grad_norm": 3.703936815261841, "learning_rate": 2.776171197223829e-06, "loss": 2.5769, "step": 4900 }, { "epoch": 0.9543088490456911, "grad_norm": 4.4313883781433105, "learning_rate": 2.2941970310391366e-06, "loss": 2.5262, "step": 4950 }, { "epoch": 0.963948332369385, "grad_norm": 3.5869264602661133, "learning_rate": 1.8122228648544438e-06, "loss": 2.5166, "step": 5000 }, { "epoch": 0.9735878156930788, "grad_norm": 3.5782413482666016, "learning_rate": 1.3302486986697513e-06, "loss": 2.4681, "step": 5050 }, { "epoch": 0.9832272990167727, "grad_norm": 3.565708637237549, "learning_rate": 8.482745324850588e-07, "loss": 2.4661, "step": 5100 }, { "epoch": 0.9928667823404665, "grad_norm": 3.5679666996002197, "learning_rate": 3.663003663003663e-07, "loss": 2.5452, "step": 5150 } ], "logging_steps": 50, "max_steps": 5187, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1355321769984000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }