{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9626168224299065, "eval_steps": 500, "global_step": 105, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028037383177570093, "grad_norm": 5.46907120535221, "learning_rate": 9.090909090909091e-07, "loss": 0.6948, "step": 1 }, { "epoch": 0.056074766355140186, "grad_norm": 5.695701169228273, "learning_rate": 1.8181818181818183e-06, "loss": 0.659, "step": 2 }, { "epoch": 0.08411214953271028, "grad_norm": 5.869868275090631, "learning_rate": 2.7272727272727272e-06, "loss": 0.6481, "step": 3 }, { "epoch": 0.11214953271028037, "grad_norm": 5.197705973950515, "learning_rate": 3.6363636363636366e-06, "loss": 0.6641, "step": 4 }, { "epoch": 0.14018691588785046, "grad_norm": 3.3963665075709906, "learning_rate": 4.5454545454545455e-06, "loss": 0.5579, "step": 5 }, { "epoch": 0.16822429906542055, "grad_norm": 2.20924738877731, "learning_rate": 5.4545454545454545e-06, "loss": 0.6036, "step": 6 }, { "epoch": 0.19626168224299065, "grad_norm": 1.984739518286341, "learning_rate": 6.363636363636364e-06, "loss": 0.5288, "step": 7 }, { "epoch": 0.22429906542056074, "grad_norm": 1.9291295994072917, "learning_rate": 7.272727272727273e-06, "loss": 0.5054, "step": 8 }, { "epoch": 0.2523364485981308, "grad_norm": 1.6389948204701459, "learning_rate": 8.181818181818183e-06, "loss": 0.5339, "step": 9 }, { "epoch": 0.2803738317757009, "grad_norm": 1.4300152832592459, "learning_rate": 9.090909090909091e-06, "loss": 0.5433, "step": 10 }, { "epoch": 0.308411214953271, "grad_norm": 1.711886946596263, "learning_rate": 1e-05, "loss": 0.5537, "step": 11 }, { "epoch": 0.3364485981308411, "grad_norm": 1.5442931844274987, "learning_rate": 9.997207818651273e-06, "loss": 0.498, "step": 12 }, { "epoch": 0.3644859813084112, "grad_norm": 1.3068883684134325, "learning_rate": 9.988834393115768e-06, "loss": 0.6412, "step": 13 }, { "epoch": 0.3925233644859813, "grad_norm": 1.2741183296138552, "learning_rate": 9.97488907544252e-06, "loss": 0.5796, "step": 14 }, { "epoch": 0.4205607476635514, "grad_norm": 1.0164680008182152, "learning_rate": 9.955387440773902e-06, "loss": 0.5019, "step": 15 }, { "epoch": 0.4485981308411215, "grad_norm": 1.036540358883728, "learning_rate": 9.930351269950144e-06, "loss": 0.5982, "step": 16 }, { "epoch": 0.4766355140186916, "grad_norm": 1.0480857466954097, "learning_rate": 9.899808525182935e-06, "loss": 0.5377, "step": 17 }, { "epoch": 0.5046728971962616, "grad_norm": 0.8148023572178399, "learning_rate": 9.863793318825186e-06, "loss": 0.4776, "step": 18 }, { "epoch": 0.5327102803738317, "grad_norm": 1.0017091343990974, "learning_rate": 9.822345875271884e-06, "loss": 0.648, "step": 19 }, { "epoch": 0.5607476635514018, "grad_norm": 0.7723478036715665, "learning_rate": 9.775512486034564e-06, "loss": 0.4544, "step": 20 }, { "epoch": 0.5887850467289719, "grad_norm": 0.8710833273679682, "learning_rate": 9.723345458039595e-06, "loss": 0.4936, "step": 21 }, { "epoch": 0.616822429906542, "grad_norm": 0.9055028217774582, "learning_rate": 9.665903055208013e-06, "loss": 0.5819, "step": 22 }, { "epoch": 0.6448598130841121, "grad_norm": 0.8103202901623571, "learning_rate": 9.603249433382145e-06, "loss": 0.5506, "step": 23 }, { "epoch": 0.6728971962616822, "grad_norm": 0.7712121760995281, "learning_rate": 9.535454568671705e-06, "loss": 0.4932, "step": 24 }, { "epoch": 0.7009345794392523, "grad_norm": 0.9091205022340342, "learning_rate": 9.462594179299408e-06, "loss": 0.5776, "step": 25 }, { "epoch": 0.7289719626168224, "grad_norm": 0.7512412627292558, "learning_rate": 9.384749641033358e-06, "loss": 0.4665, "step": 26 }, { "epoch": 0.7570093457943925, "grad_norm": 0.9759396921985853, "learning_rate": 9.302007896300697e-06, "loss": 0.6547, "step": 27 }, { "epoch": 0.7850467289719626, "grad_norm": 0.8218161831230707, "learning_rate": 9.214461357083986e-06, "loss": 0.5403, "step": 28 }, { "epoch": 0.8130841121495327, "grad_norm": 0.7802775736672756, "learning_rate": 9.122207801708802e-06, "loss": 0.4792, "step": 29 }, { "epoch": 0.8411214953271028, "grad_norm": 1.2211854499314727, "learning_rate": 9.025350265637816e-06, "loss": 0.5767, "step": 30 }, { "epoch": 0.8691588785046729, "grad_norm": 0.7291315847510311, "learning_rate": 8.923996926393306e-06, "loss": 0.5339, "step": 31 }, { "epoch": 0.897196261682243, "grad_norm": 0.7685026874314949, "learning_rate": 8.818260982736662e-06, "loss": 0.5102, "step": 32 }, { "epoch": 0.9252336448598131, "grad_norm": 0.7819112577141099, "learning_rate": 8.708260528239788e-06, "loss": 0.5196, "step": 33 }, { "epoch": 0.9532710280373832, "grad_norm": 0.805804762659976, "learning_rate": 8.594118419389648e-06, "loss": 0.5486, "step": 34 }, { "epoch": 0.9813084112149533, "grad_norm": 0.6960763295513189, "learning_rate": 8.475962138373212e-06, "loss": 0.4885, "step": 35 }, { "epoch": 1.0186915887850467, "grad_norm": 1.261779486223386, "learning_rate": 8.353923650696119e-06, "loss": 0.7283, "step": 36 }, { "epoch": 1.0467289719626167, "grad_norm": 2.1904861480940703, "learning_rate": 8.228139257794012e-06, "loss": 0.5858, "step": 37 }, { "epoch": 1.074766355140187, "grad_norm": 0.836565359732725, "learning_rate": 8.098749444801226e-06, "loss": 0.4984, "step": 38 }, { "epoch": 1.102803738317757, "grad_norm": 0.7156392071456283, "learning_rate": 7.965898723646777e-06, "loss": 0.4133, "step": 39 }, { "epoch": 1.1308411214953271, "grad_norm": 0.7811150734148334, "learning_rate": 7.829735471652978e-06, "loss": 0.4737, "step": 40 }, { "epoch": 1.158878504672897, "grad_norm": 0.8005430413673945, "learning_rate": 7.690411765816864e-06, "loss": 0.461, "step": 41 }, { "epoch": 1.1869158878504673, "grad_norm": 0.734823009686673, "learning_rate": 7.548083212959588e-06, "loss": 0.4878, "step": 42 }, { "epoch": 1.2149532710280373, "grad_norm": 0.9106888643783978, "learning_rate": 7.402908775933419e-06, "loss": 0.5398, "step": 43 }, { "epoch": 1.2429906542056075, "grad_norm": 0.6471585113802202, "learning_rate": 7.25505059608051e-06, "loss": 0.4282, "step": 44 }, { "epoch": 1.2710280373831775, "grad_norm": 0.7710047485521259, "learning_rate": 7.104673812141676e-06, "loss": 0.4352, "step": 45 }, { "epoch": 1.2990654205607477, "grad_norm": 0.6543038711142444, "learning_rate": 6.9519463758174745e-06, "loss": 0.3746, "step": 46 }, { "epoch": 1.3271028037383177, "grad_norm": 0.6480235385313834, "learning_rate": 6.797038864187564e-06, "loss": 0.4648, "step": 47 }, { "epoch": 1.355140186915888, "grad_norm": 0.663332249258245, "learning_rate": 6.640124289197845e-06, "loss": 0.412, "step": 48 }, { "epoch": 1.3831775700934579, "grad_norm": 0.735147105832247, "learning_rate": 6.481377904428171e-06, "loss": 0.4639, "step": 49 }, { "epoch": 1.411214953271028, "grad_norm": 0.7169861474373482, "learning_rate": 6.3209770093564315e-06, "loss": 0.4511, "step": 50 }, { "epoch": 1.439252336448598, "grad_norm": 0.7241007166406079, "learning_rate": 6.1591007513376425e-06, "loss": 0.4204, "step": 51 }, { "epoch": 1.4672897196261683, "grad_norm": 0.7785664265979848, "learning_rate": 5.995929925519181e-06, "loss": 0.4717, "step": 52 }, { "epoch": 1.4953271028037383, "grad_norm": 0.643031905575638, "learning_rate": 5.831646772915651e-06, "loss": 0.451, "step": 53 }, { "epoch": 1.5233644859813085, "grad_norm": 0.6622611979477618, "learning_rate": 5.666434776868895e-06, "loss": 0.4598, "step": 54 }, { "epoch": 1.5514018691588785, "grad_norm": 0.6681223638603929, "learning_rate": 5.500478458120493e-06, "loss": 0.4471, "step": 55 }, { "epoch": 1.5794392523364484, "grad_norm": 0.6742446596507364, "learning_rate": 5.3339631687256085e-06, "loss": 0.4323, "step": 56 }, { "epoch": 1.6074766355140186, "grad_norm": 0.6011350150512597, "learning_rate": 5.1670748850383734e-06, "loss": 0.421, "step": 57 }, { "epoch": 1.6355140186915889, "grad_norm": 0.6177912691346672, "learning_rate": 5e-06, "loss": 0.3956, "step": 58 }, { "epoch": 1.6635514018691588, "grad_norm": 0.6868626354051661, "learning_rate": 4.832925114961629e-06, "loss": 0.4661, "step": 59 }, { "epoch": 1.6915887850467288, "grad_norm": 0.6421560332745099, "learning_rate": 4.666036831274392e-06, "loss": 0.4704, "step": 60 }, { "epoch": 1.719626168224299, "grad_norm": 0.7187797397726619, "learning_rate": 4.499521541879508e-06, "loss": 0.465, "step": 61 }, { "epoch": 1.7476635514018692, "grad_norm": 0.5867907128348048, "learning_rate": 4.333565223131107e-06, "loss": 0.3283, "step": 62 }, { "epoch": 1.7757009345794392, "grad_norm": 0.662616949214837, "learning_rate": 4.1683532270843505e-06, "loss": 0.4216, "step": 63 }, { "epoch": 1.8037383177570092, "grad_norm": 0.6719507669829111, "learning_rate": 4.004070074480821e-06, "loss": 0.4443, "step": 64 }, { "epoch": 1.8317757009345794, "grad_norm": 0.6464555389487582, "learning_rate": 3.840899248662358e-06, "loss": 0.4901, "step": 65 }, { "epoch": 1.8598130841121496, "grad_norm": 0.642004590459993, "learning_rate": 3.6790229906435706e-06, "loss": 0.4687, "step": 66 }, { "epoch": 1.8878504672897196, "grad_norm": 0.6811855386223414, "learning_rate": 3.518622095571831e-06, "loss": 0.5103, "step": 67 }, { "epoch": 1.9158878504672896, "grad_norm": 0.6780639408715025, "learning_rate": 3.3598757108021546e-06, "loss": 0.4563, "step": 68 }, { "epoch": 1.9439252336448598, "grad_norm": 0.6096518872897251, "learning_rate": 3.202961135812437e-06, "loss": 0.4273, "step": 69 }, { "epoch": 1.97196261682243, "grad_norm": 0.6148567020386766, "learning_rate": 3.0480536241825263e-06, "loss": 0.3991, "step": 70 }, { "epoch": 2.0093457943925235, "grad_norm": 1.1093908775727706, "learning_rate": 2.8953261878583263e-06, "loss": 0.6552, "step": 71 }, { "epoch": 2.0373831775700935, "grad_norm": 0.5907034426928125, "learning_rate": 2.74494940391949e-06, "loss": 0.4244, "step": 72 }, { "epoch": 2.0654205607476634, "grad_norm": 0.6237639437264948, "learning_rate": 2.5970912240665815e-06, "loss": 0.3433, "step": 73 }, { "epoch": 2.0934579439252334, "grad_norm": 0.6456026273029044, "learning_rate": 2.4519167870404126e-06, "loss": 0.3633, "step": 74 }, { "epoch": 2.121495327102804, "grad_norm": 0.6114145984670124, "learning_rate": 2.309588234183137e-06, "loss": 0.3607, "step": 75 }, { "epoch": 2.149532710280374, "grad_norm": 0.5622408748138118, "learning_rate": 2.1702645283470238e-06, "loss": 0.3255, "step": 76 }, { "epoch": 2.177570093457944, "grad_norm": 0.6289348884143411, "learning_rate": 2.0341012763532243e-06, "loss": 0.3917, "step": 77 }, { "epoch": 2.205607476635514, "grad_norm": 2.11632857452312, "learning_rate": 1.9012505551987764e-06, "loss": 0.5284, "step": 78 }, { "epoch": 2.2336448598130842, "grad_norm": 0.6011421789250062, "learning_rate": 1.771860742205988e-06, "loss": 0.3717, "step": 79 }, { "epoch": 2.2616822429906542, "grad_norm": 0.605196953414431, "learning_rate": 1.646076349303884e-06, "loss": 0.3896, "step": 80 }, { "epoch": 2.289719626168224, "grad_norm": 0.6538497800730606, "learning_rate": 1.5240378616267887e-06, "loss": 0.3952, "step": 81 }, { "epoch": 2.317757009345794, "grad_norm": 0.6004986923958435, "learning_rate": 1.4058815806103542e-06, "loss": 0.4167, "step": 82 }, { "epoch": 2.3457943925233646, "grad_norm": 0.5871010729484297, "learning_rate": 1.2917394717602123e-06, "loss": 0.3395, "step": 83 }, { "epoch": 2.3738317757009346, "grad_norm": 0.6272792477288378, "learning_rate": 1.1817390172633402e-06, "loss": 0.3955, "step": 84 }, { "epoch": 2.4018691588785046, "grad_norm": 0.602988779712659, "learning_rate": 1.0760030736066952e-06, "loss": 0.3712, "step": 85 }, { "epoch": 2.4299065420560746, "grad_norm": 0.6514347127868433, "learning_rate": 9.746497343621857e-07, "loss": 0.439, "step": 86 }, { "epoch": 2.457943925233645, "grad_norm": 0.6171164966676075, "learning_rate": 8.777921982911996e-07, "loss": 0.3293, "step": 87 }, { "epoch": 2.485981308411215, "grad_norm": 0.5985061076335229, "learning_rate": 7.85538642916015e-07, "loss": 0.3554, "step": 88 }, { "epoch": 2.514018691588785, "grad_norm": 0.6023670543052161, "learning_rate": 6.979921036993042e-07, "loss": 0.3759, "step": 89 }, { "epoch": 2.542056074766355, "grad_norm": 0.6350504526148657, "learning_rate": 6.152503589666426e-07, "loss": 0.4048, "step": 90 }, { "epoch": 2.5700934579439254, "grad_norm": 0.6232837840764863, "learning_rate": 5.374058207005945e-07, "loss": 0.4454, "step": 91 }, { "epoch": 2.5981308411214954, "grad_norm": 0.6261955186268706, "learning_rate": 4.6454543132829653e-07, "loss": 0.3954, "step": 92 }, { "epoch": 2.6261682242990654, "grad_norm": 0.6079216763424581, "learning_rate": 3.9675056661785563e-07, "loss": 0.4013, "step": 93 }, { "epoch": 2.6542056074766354, "grad_norm": 0.6437379172509019, "learning_rate": 3.340969447919873e-07, "loss": 0.4624, "step": 94 }, { "epoch": 2.6822429906542054, "grad_norm": 0.6182485342353894, "learning_rate": 2.7665454196040665e-07, "loss": 0.3614, "step": 95 }, { "epoch": 2.710280373831776, "grad_norm": 0.6138213235934379, "learning_rate": 2.2448751396543788e-07, "loss": 0.3844, "step": 96 }, { "epoch": 2.7383177570093458, "grad_norm": 0.5790126523883554, "learning_rate": 1.776541247281177e-07, "loss": 0.3865, "step": 97 }, { "epoch": 2.7663551401869158, "grad_norm": 0.602742444058507, "learning_rate": 1.3620668117481471e-07, "loss": 0.4459, "step": 98 }, { "epoch": 2.794392523364486, "grad_norm": 0.6125089915071265, "learning_rate": 1.0019147481706626e-07, "loss": 0.4199, "step": 99 }, { "epoch": 2.822429906542056, "grad_norm": 0.6081244590976984, "learning_rate": 6.964873004985717e-08, "loss": 0.4009, "step": 100 }, { "epoch": 2.850467289719626, "grad_norm": 0.7145788800241012, "learning_rate": 4.461255922609986e-08, "loss": 0.3987, "step": 101 }, { "epoch": 2.878504672897196, "grad_norm": 0.6581593717216647, "learning_rate": 2.511092455747932e-08, "loss": 0.3866, "step": 102 }, { "epoch": 2.906542056074766, "grad_norm": 0.6290938268876005, "learning_rate": 1.1165606884234182e-08, "loss": 0.4225, "step": 103 }, { "epoch": 2.9345794392523366, "grad_norm": 0.6534972156634564, "learning_rate": 2.792181348726941e-09, "loss": 0.4279, "step": 104 }, { "epoch": 2.9626168224299065, "grad_norm": 0.58156649259114, "learning_rate": 0.0, "loss": 0.3581, "step": 105 }, { "epoch": 2.9626168224299065, "step": 105, "total_flos": 18524510396416.0, "train_loss": 0.47221575805119104, "train_runtime": 641.5687, "train_samples_per_second": 15.913, "train_steps_per_second": 0.164 } ], "logging_steps": 1, "max_steps": 105, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 18524510396416.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }