{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 561, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05351170568561873, "grad_norm": 1.2401437759399414, "learning_rate": 3.157894736842105e-05, "loss": 3.5023, "step": 10 }, { "epoch": 0.10702341137123746, "grad_norm": 1.08428955078125, "learning_rate": 6.666666666666667e-05, "loss": 3.3947, "step": 20 }, { "epoch": 0.1605351170568562, "grad_norm": 1.0022192001342773, "learning_rate": 0.0001017543859649123, "loss": 3.2935, "step": 30 }, { "epoch": 0.2140468227424749, "grad_norm": 0.9494397044181824, "learning_rate": 0.0001368421052631579, "loss": 3.2952, "step": 40 }, { "epoch": 0.26755852842809363, "grad_norm": 0.8602828979492188, "learning_rate": 0.00017192982456140353, "loss": 3.3231, "step": 50 }, { "epoch": 0.3210702341137124, "grad_norm": 0.9193262457847595, "learning_rate": 0.0001999922292480975, "loss": 3.2551, "step": 60 }, { "epoch": 0.3745819397993311, "grad_norm": 0.8206238150596619, "learning_rate": 0.00019972037971811802, "loss": 3.2773, "step": 70 }, { "epoch": 0.4280936454849498, "grad_norm": 0.8329986929893494, "learning_rate": 0.0001990611994413053, "loss": 3.3447, "step": 80 }, { "epoch": 0.4816053511705686, "grad_norm": 0.9009258151054382, "learning_rate": 0.00019801724878485438, "loss": 3.3449, "step": 90 }, { "epoch": 0.5351170568561873, "grad_norm": 0.7764396667480469, "learning_rate": 0.00019659258262890683, "loss": 3.3038, "step": 100 }, { "epoch": 0.5886287625418061, "grad_norm": 0.8236237168312073, "learning_rate": 0.0001947927346167132, "loss": 3.3162, "step": 110 }, { "epoch": 0.6421404682274248, "grad_norm": 0.7898942232131958, "learning_rate": 0.0001926246956610309, "loss": 3.2868, "step": 120 }, { "epoch": 0.6956521739130435, "grad_norm": 0.7577053308486938, "learning_rate": 0.0001900968867902419, "loss": 3.2781, "step": 130 }, { "epoch": 0.7491638795986622, "grad_norm": 0.7448907494544983, "learning_rate": 0.00018721912643966055, "loss": 3.2776, "step": 140 }, { "epoch": 0.802675585284281, "grad_norm": 0.7627265453338623, "learning_rate": 0.00018400259231507717, "loss": 3.2989, "step": 150 }, { "epoch": 0.8561872909698997, "grad_norm": 0.7306041121482849, "learning_rate": 0.00018045977797666684, "loss": 3.3012, "step": 160 }, { "epoch": 0.9096989966555183, "grad_norm": 0.7832388281822205, "learning_rate": 0.0001766044443118978, "loss": 3.2706, "step": 170 }, { "epoch": 0.9632107023411371, "grad_norm": 0.8290534019470215, "learning_rate": 0.00017245156608592727, "loss": 3.383, "step": 180 }, { "epoch": 1.0160535117056857, "grad_norm": 0.7920368909835815, "learning_rate": 0.00016801727377709194, "loss": 3.1903, "step": 190 }, { "epoch": 1.0695652173913044, "grad_norm": 0.91294926404953, "learning_rate": 0.000163318790923414, "loss": 3.0714, "step": 200 }, { "epoch": 1.123076923076923, "grad_norm": 0.9004078507423401, "learning_rate": 0.000158374367223479, "loss": 3.0892, "step": 210 }, { "epoch": 1.1765886287625418, "grad_norm": 0.8549134135246277, "learning_rate": 0.00015320320765153367, "loss": 3.0669, "step": 220 }, { "epoch": 1.2301003344481605, "grad_norm": 0.8814137578010559, "learning_rate": 0.00014782539786213183, "loss": 3.0504, "step": 230 }, { "epoch": 1.2836120401337792, "grad_norm": 0.8694167733192444, "learning_rate": 0.00014226182617406996, "loss": 3.0227, "step": 240 }, { "epoch": 1.3371237458193979, "grad_norm": 0.8546176552772522, "learning_rate": 0.00013653410243663952, "loss": 3.0313, "step": 250 }, { "epoch": 1.3906354515050168, "grad_norm": 0.9608170390129089, "learning_rate": 0.00013066447409333345, "loss": 3.0832, "step": 260 }, { "epoch": 1.4441471571906355, "grad_norm": 0.938625693321228, "learning_rate": 0.00012467573976902935, "loss": 3.0271, "step": 270 }, { "epoch": 1.4976588628762542, "grad_norm": 0.9460514187812805, "learning_rate": 0.00011859116071629149, "loss": 3.0237, "step": 280 }, { "epoch": 1.551170568561873, "grad_norm": 0.9218224883079529, "learning_rate": 0.00011243437046474853, "loss": 3.0411, "step": 290 }, { "epoch": 1.6046822742474918, "grad_norm": 0.9326902627944946, "learning_rate": 0.00010622928302448523, "loss": 3.0379, "step": 300 }, { "epoch": 1.6581939799331105, "grad_norm": 0.8940379619598389, "learning_rate": 0.0001, "loss": 3.0212, "step": 310 }, { "epoch": 1.7117056856187292, "grad_norm": 0.9038141965866089, "learning_rate": 9.37707169755148e-05, "loss": 3.0407, "step": 320 }, { "epoch": 1.7652173913043478, "grad_norm": 0.9220936894416809, "learning_rate": 8.756562953525152e-05, "loss": 3.0481, "step": 330 }, { "epoch": 1.8187290969899665, "grad_norm": 0.8182596564292908, "learning_rate": 8.140883928370855e-05, "loss": 3.0711, "step": 340 }, { "epoch": 1.8722408026755852, "grad_norm": 0.9355226755142212, "learning_rate": 7.532426023097063e-05, "loss": 3.1391, "step": 350 }, { "epoch": 1.925752508361204, "grad_norm": 0.9329375624656677, "learning_rate": 6.933552590666659e-05, "loss": 3.0399, "step": 360 }, { "epoch": 1.9792642140468226, "grad_norm": 0.9382308721542358, "learning_rate": 6.34658975633605e-05, "loss": 3.015, "step": 370 }, { "epoch": 2.0321070234113714, "grad_norm": 0.982527494430542, "learning_rate": 5.773817382593008e-05, "loss": 2.8628, "step": 380 }, { "epoch": 2.08561872909699, "grad_norm": 1.1158782243728638, "learning_rate": 5.217460213786821e-05, "loss": 2.7862, "step": 390 }, { "epoch": 2.139130434782609, "grad_norm": 1.0975059270858765, "learning_rate": 4.6796792348466356e-05, "loss": 2.8002, "step": 400 }, { "epoch": 2.1926421404682275, "grad_norm": 0.9911369681358337, "learning_rate": 4.1625632776521037e-05, "loss": 2.7731, "step": 410 }, { "epoch": 2.246153846153846, "grad_norm": 1.120684027671814, "learning_rate": 3.668120907658603e-05, "loss": 2.7325, "step": 420 }, { "epoch": 2.299665551839465, "grad_norm": 1.149767518043518, "learning_rate": 3.198272622290804e-05, "loss": 2.7713, "step": 430 }, { "epoch": 2.3531772575250836, "grad_norm": 1.0604385137557983, "learning_rate": 2.7548433914072734e-05, "loss": 2.74, "step": 440 }, { "epoch": 2.4066889632107022, "grad_norm": 1.0821083784103394, "learning_rate": 2.339555568810221e-05, "loss": 2.7514, "step": 450 }, { "epoch": 2.460200668896321, "grad_norm": 1.0802868604660034, "learning_rate": 1.9540222023333166e-05, "loss": 2.7896, "step": 460 }, { "epoch": 2.5137123745819396, "grad_norm": 1.0751402378082275, "learning_rate": 1.5997407684922862e-05, "loss": 2.7561, "step": 470 }, { "epoch": 2.5672240802675583, "grad_norm": 1.1360961198806763, "learning_rate": 1.2780873560339468e-05, "loss": 2.7271, "step": 480 }, { "epoch": 2.620735785953177, "grad_norm": 1.1104183197021484, "learning_rate": 9.903113209758096e-06, "loss": 2.7425, "step": 490 }, { "epoch": 2.6742474916387957, "grad_norm": 1.1601494550704956, "learning_rate": 7.375304338969136e-06, "loss": 2.7524, "step": 500 }, { "epoch": 2.727759197324415, "grad_norm": 1.121878743171692, "learning_rate": 5.20726538328683e-06, "loss": 2.7738, "step": 510 }, { "epoch": 2.7812709030100335, "grad_norm": 1.1342693567276, "learning_rate": 3.40741737109318e-06, "loss": 2.7854, "step": 520 }, { "epoch": 2.8347826086956522, "grad_norm": 1.1444578170776367, "learning_rate": 1.9827512151456173e-06, "loss": 2.7591, "step": 530 }, { "epoch": 2.888294314381271, "grad_norm": 1.1180880069732666, "learning_rate": 9.388005586947191e-07, "loss": 2.7576, "step": 540 }, { "epoch": 2.9418060200668896, "grad_norm": 1.117174506187439, "learning_rate": 2.7962028188198706e-07, "loss": 2.7276, "step": 550 }, { "epoch": 2.9953177257525083, "grad_norm": 1.159109354019165, "learning_rate": 7.770751902513862e-09, "loss": 2.7297, "step": 560 } ], "logging_steps": 10, "max_steps": 561, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9811509803003904.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }