{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2474, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008085299914093689, "grad_norm": 1.7893829836216715, "learning_rate": 3.830645161290323e-06, "loss": 0.4938, "step": 20 }, { "epoch": 0.016170599828187377, "grad_norm": 1.544145448548272, "learning_rate": 7.862903225806451e-06, "loss": 0.4366, "step": 40 }, { "epoch": 0.024255899742281064, "grad_norm": 1.5679306059971794, "learning_rate": 1.1895161290322582e-05, "loss": 0.4118, "step": 60 }, { "epoch": 0.032341199656374754, "grad_norm": 1.395964355237053, "learning_rate": 1.592741935483871e-05, "loss": 0.4251, "step": 80 }, { "epoch": 0.04042649957046844, "grad_norm": 1.3199581670369915, "learning_rate": 1.995967741935484e-05, "loss": 0.4003, "step": 100 }, { "epoch": 0.04851179948456213, "grad_norm": 1.208480802915807, "learning_rate": 2.3991935483870968e-05, "loss": 0.4006, "step": 120 }, { "epoch": 0.05659709939865582, "grad_norm": 1.1298545768045016, "learning_rate": 2.8024193548387097e-05, "loss": 0.4189, "step": 140 }, { "epoch": 0.06468239931274951, "grad_norm": 1.1666261970423883, "learning_rate": 3.205645161290323e-05, "loss": 0.4503, "step": 160 }, { "epoch": 0.0727676992268432, "grad_norm": 0.9707198547224671, "learning_rate": 3.608870967741936e-05, "loss": 0.4518, "step": 180 }, { "epoch": 0.08085299914093688, "grad_norm": 0.9077078784165401, "learning_rate": 4.0120967741935485e-05, "loss": 0.4568, "step": 200 }, { "epoch": 0.08893829905503058, "grad_norm": 1.1533412194423165, "learning_rate": 4.415322580645162e-05, "loss": 0.4513, "step": 220 }, { "epoch": 0.09702359896912426, "grad_norm": 0.9729124530590224, "learning_rate": 4.818548387096775e-05, "loss": 0.4733, "step": 240 }, { "epoch": 0.10510889888321795, "grad_norm": 0.8740160088683773, "learning_rate": 4.999698743735772e-05, "loss": 0.4758, "step": 260 }, { "epoch": 0.11319419879731164, "grad_norm": 0.9664220859409508, "learning_rate": 4.997607711513815e-05, "loss": 0.4948, "step": 280 }, { "epoch": 0.12127949871140532, "grad_norm": 0.9046563007004271, "learning_rate": 4.99352690044939e-05, "loss": 0.4909, "step": 300 }, { "epoch": 0.12936479862549902, "grad_norm": 1.0102916306305423, "learning_rate": 4.987459561618109e-05, "loss": 0.4813, "step": 320 }, { "epoch": 0.1374500985395927, "grad_norm": 0.7964130200362666, "learning_rate": 4.979410528710377e-05, "loss": 0.4657, "step": 340 }, { "epoch": 0.1455353984536864, "grad_norm": 3.4749268002378666, "learning_rate": 4.969386214180523e-05, "loss": 0.4769, "step": 360 }, { "epoch": 0.15362069836778008, "grad_norm": 0.9409849930897388, "learning_rate": 4.957394604138165e-05, "loss": 0.4611, "step": 380 }, { "epoch": 0.16170599828187376, "grad_norm": 0.7527126720894318, "learning_rate": 4.94344525198588e-05, "loss": 0.4733, "step": 400 }, { "epoch": 0.16979129819596744, "grad_norm": 0.9209239697756653, "learning_rate": 4.9275492708082604e-05, "loss": 0.4829, "step": 420 }, { "epoch": 0.17787659811006115, "grad_norm": 0.9136833504109678, "learning_rate": 4.909719324518412e-05, "loss": 0.4751, "step": 440 }, { "epoch": 0.18596189802415483, "grad_norm": 0.8981636505844758, "learning_rate": 4.8899696177689404e-05, "loss": 0.4874, "step": 460 }, { "epoch": 0.1940471979382485, "grad_norm": 0.8300876947503312, "learning_rate": 4.8683158846354786e-05, "loss": 0.4618, "step": 480 }, { "epoch": 0.20213249785234222, "grad_norm": 0.8242465855991236, "learning_rate": 4.84477537608176e-05, "loss": 0.4893, "step": 500 }, { "epoch": 0.2102177977664359, "grad_norm": 0.800700567006772, "learning_rate": 4.819366846216224e-05, "loss": 0.4726, "step": 520 }, { "epoch": 0.21830309768052958, "grad_norm": 0.779146282903641, "learning_rate": 4.792110537351109e-05, "loss": 0.468, "step": 540 }, { "epoch": 0.2263883975946233, "grad_norm": 0.7743872804155105, "learning_rate": 4.763028163875928e-05, "loss": 0.4624, "step": 560 }, { "epoch": 0.23447369750871697, "grad_norm": 1.288690581905525, "learning_rate": 4.7321428949581885e-05, "loss": 0.442, "step": 580 }, { "epoch": 0.24255899742281065, "grad_norm": 0.7215163472877574, "learning_rate": 4.699479336085113e-05, "loss": 0.4782, "step": 600 }, { "epoch": 0.25064429733690435, "grad_norm": 0.7069830331678881, "learning_rate": 4.665063509461097e-05, "loss": 0.4534, "step": 620 }, { "epoch": 0.25872959725099803, "grad_norm": 0.8227398911837543, "learning_rate": 4.6289228332764924e-05, "loss": 0.4706, "step": 640 }, { "epoch": 0.2668148971650917, "grad_norm": 0.80160540139322, "learning_rate": 4.5910860998642566e-05, "loss": 0.4784, "step": 660 }, { "epoch": 0.2749001970791854, "grad_norm": 0.8392100792211208, "learning_rate": 4.551583452761849e-05, "loss": 0.4676, "step": 680 }, { "epoch": 0.2829854969932791, "grad_norm": 1.3191462108675702, "learning_rate": 4.510446362696664e-05, "loss": 0.4679, "step": 700 }, { "epoch": 0.2910707969073728, "grad_norm": 0.7565884365460607, "learning_rate": 4.467707602514122e-05, "loss": 0.4496, "step": 720 }, { "epoch": 0.2991560968214665, "grad_norm": 0.7086008790878013, "learning_rate": 4.4234012210684026e-05, "loss": 0.4316, "step": 740 }, { "epoch": 0.30724139673556017, "grad_norm": 0.699596566391461, "learning_rate": 4.377562516096607e-05, "loss": 0.4558, "step": 760 }, { "epoch": 0.31532669664965385, "grad_norm": 1.0631183317034627, "learning_rate": 4.330228006097979e-05, "loss": 0.4497, "step": 780 }, { "epoch": 0.32341199656374753, "grad_norm": 0.7860971712287887, "learning_rate": 4.281435401240563e-05, "loss": 0.473, "step": 800 }, { "epoch": 0.3314972964778412, "grad_norm": 0.8091591439036853, "learning_rate": 4.231223573318504e-05, "loss": 0.4435, "step": 820 }, { "epoch": 0.3395825963919349, "grad_norm": 0.6418471132042838, "learning_rate": 4.179632524783899e-05, "loss": 0.4605, "step": 840 }, { "epoch": 0.3476678963060286, "grad_norm": 0.6675200504051455, "learning_rate": 4.126703356877893e-05, "loss": 0.4386, "step": 860 }, { "epoch": 0.3557531962201223, "grad_norm": 0.8350300201126641, "learning_rate": 4.072478236886392e-05, "loss": 0.44, "step": 880 }, { "epoch": 0.363838496134216, "grad_norm": 0.7065809423140008, "learning_rate": 4.017000364546484e-05, "loss": 0.4585, "step": 900 }, { "epoch": 0.37192379604830966, "grad_norm": 0.7805666899810452, "learning_rate": 3.96031393763034e-05, "loss": 0.4333, "step": 920 }, { "epoch": 0.38000909596240334, "grad_norm": 0.8177836753419291, "learning_rate": 3.902464116734003e-05, "loss": 0.4471, "step": 940 }, { "epoch": 0.388094395876497, "grad_norm": 0.7575936626356954, "learning_rate": 3.84349698929912e-05, "loss": 0.4373, "step": 960 }, { "epoch": 0.39617969579059076, "grad_norm": 0.7492067475736178, "learning_rate": 3.7834595328962794e-05, "loss": 0.4497, "step": 980 }, { "epoch": 0.40426499570468444, "grad_norm": 0.7908131728874307, "learning_rate": 3.72239957779921e-05, "loss": 0.449, "step": 1000 }, { "epoch": 0.4123502956187781, "grad_norm": 0.6640701333672742, "learning_rate": 3.6603657688796465e-05, "loss": 0.4345, "step": 1020 }, { "epoch": 0.4204355955328718, "grad_norm": 0.7662918750682771, "learning_rate": 3.597407526853235e-05, "loss": 0.4276, "step": 1040 }, { "epoch": 0.4285208954469655, "grad_norm": 0.7191116615450338, "learning_rate": 3.533575008907338e-05, "loss": 0.4291, "step": 1060 }, { "epoch": 0.43660619536105916, "grad_norm": 0.6538640833264813, "learning_rate": 3.468919068742113e-05, "loss": 0.4271, "step": 1080 }, { "epoch": 0.44469149527515284, "grad_norm": 0.84633760406372, "learning_rate": 3.403491216056695e-05, "loss": 0.441, "step": 1100 }, { "epoch": 0.4527767951892466, "grad_norm": 0.6956037005349539, "learning_rate": 3.337343575512768e-05, "loss": 0.433, "step": 1120 }, { "epoch": 0.46086209510334025, "grad_norm": 0.6833758980525753, "learning_rate": 3.270528845208207e-05, "loss": 0.412, "step": 1140 }, { "epoch": 0.46894739501743393, "grad_norm": 1.4679545429292338, "learning_rate": 3.203100254693878e-05, "loss": 0.4256, "step": 1160 }, { "epoch": 0.4770326949315276, "grad_norm": 0.7006335987685887, "learning_rate": 3.135111522567048e-05, "loss": 0.4396, "step": 1180 }, { "epoch": 0.4851179948456213, "grad_norm": 0.7567851300681356, "learning_rate": 3.0666168136751776e-05, "loss": 0.4204, "step": 1200 }, { "epoch": 0.49320329475971497, "grad_norm": 0.7039630629968385, "learning_rate": 2.997670695964204e-05, "loss": 0.4149, "step": 1220 }, { "epoch": 0.5012885946738087, "grad_norm": 0.6801686276783195, "learning_rate": 2.9283280970056826e-05, "loss": 0.4213, "step": 1240 }, { "epoch": 0.5093738945879024, "grad_norm": 0.5702376498270392, "learning_rate": 2.8586442602374246e-05, "loss": 0.4107, "step": 1260 }, { "epoch": 0.5174591945019961, "grad_norm": 0.5903215734392792, "learning_rate": 2.788674700952495e-05, "loss": 0.426, "step": 1280 }, { "epoch": 0.5255444944160897, "grad_norm": 0.7072289573693732, "learning_rate": 2.7184751620716254e-05, "loss": 0.4166, "step": 1300 }, { "epoch": 0.5336297943301834, "grad_norm": 0.7528995772779531, "learning_rate": 2.648101569734286e-05, "loss": 0.4088, "step": 1320 }, { "epoch": 0.5417150942442771, "grad_norm": 0.7990855433501134, "learning_rate": 2.5776099887437906e-05, "loss": 0.4167, "step": 1340 }, { "epoch": 0.5498003941583708, "grad_norm": 0.6292477887263497, "learning_rate": 2.5070565779019316e-05, "loss": 0.422, "step": 1360 }, { "epoch": 0.5578856940724645, "grad_norm": 0.7554555570744919, "learning_rate": 2.43649754526873e-05, "loss": 0.4155, "step": 1380 }, { "epoch": 0.5659709939865581, "grad_norm": 0.7699820762080785, "learning_rate": 2.365989103382942e-05, "loss": 0.4286, "step": 1400 }, { "epoch": 0.5740562939006518, "grad_norm": 0.7266763971520528, "learning_rate": 2.2955874244789934e-05, "loss": 0.4158, "step": 1420 }, { "epoch": 0.5821415938147456, "grad_norm": 0.7740051062605608, "learning_rate": 2.2253485957360336e-05, "loss": 0.4223, "step": 1440 }, { "epoch": 0.5902268937288393, "grad_norm": 0.6515042328015387, "learning_rate": 2.1553285745947393e-05, "loss": 0.4032, "step": 1460 }, { "epoch": 0.598312193642933, "grad_norm": 0.6585346292648555, "learning_rate": 2.0855831441774858e-05, "loss": 0.4097, "step": 1480 }, { "epoch": 0.6063974935570267, "grad_norm": 0.7026076918363772, "learning_rate": 2.0161678688473902e-05, "loss": 0.4102, "step": 1500 }, { "epoch": 0.6144827934711203, "grad_norm": 0.6988836739982369, "learning_rate": 1.947138049941629e-05, "loss": 0.4144, "step": 1520 }, { "epoch": 0.622568093385214, "grad_norm": 0.7005776385652717, "learning_rate": 1.878548681714317e-05, "loss": 0.412, "step": 1540 }, { "epoch": 0.6306533932993077, "grad_norm": 0.6745885622939958, "learning_rate": 1.8104544075240086e-05, "loss": 0.4038, "step": 1560 }, { "epoch": 0.6387386932134014, "grad_norm": 0.6945027772851639, "learning_rate": 1.742909476300773e-05, "loss": 0.392, "step": 1580 }, { "epoch": 0.6468239931274951, "grad_norm": 0.7034384881922375, "learning_rate": 1.6759676993274804e-05, "loss": 0.4212, "step": 1600 }, { "epoch": 0.6549092930415887, "grad_norm": 0.7064404152869979, "learning_rate": 1.609682407369761e-05, "loss": 0.3931, "step": 1620 }, { "epoch": 0.6629945929556824, "grad_norm": 0.6741314936898479, "learning_rate": 1.5441064081887762e-05, "loss": 0.412, "step": 1640 }, { "epoch": 0.6710798928697761, "grad_norm": 0.6489494957365249, "learning_rate": 1.4792919444706531e-05, "loss": 0.3977, "step": 1660 }, { "epoch": 0.6791651927838698, "grad_norm": 0.6239148906365745, "learning_rate": 1.4152906522061048e-05, "loss": 0.3925, "step": 1680 }, { "epoch": 0.6872504926979636, "grad_norm": 0.6758196156693772, "learning_rate": 1.3521535195533797e-05, "loss": 0.401, "step": 1700 }, { "epoch": 0.6953357926120572, "grad_norm": 0.6670657379068383, "learning_rate": 1.2899308462173293e-05, "loss": 0.3964, "step": 1720 }, { "epoch": 0.7034210925261509, "grad_norm": 0.7080774192825836, "learning_rate": 1.2286722033769493e-05, "loss": 0.3872, "step": 1740 }, { "epoch": 0.7115063924402446, "grad_norm": 0.78706482596111, "learning_rate": 1.1684263941933116e-05, "loss": 0.3999, "step": 1760 }, { "epoch": 0.7195916923543383, "grad_norm": 0.6453268176626914, "learning_rate": 1.109241414929365e-05, "loss": 0.3839, "step": 1780 }, { "epoch": 0.727676992268432, "grad_norm": 0.7743809381350635, "learning_rate": 1.0511644167125611e-05, "loss": 0.403, "step": 1800 }, { "epoch": 0.7357622921825256, "grad_norm": 0.5812286294961497, "learning_rate": 9.94241667970785e-06, "loss": 0.4051, "step": 1820 }, { "epoch": 0.7438475920966193, "grad_norm": 0.6627529924806566, "learning_rate": 9.385185175715067e-06, "loss": 0.3878, "step": 1840 }, { "epoch": 0.751932892010713, "grad_norm": 0.5787935835133774, "learning_rate": 8.840393586935228e-06, "loss": 0.3879, "step": 1860 }, { "epoch": 0.7600181919248067, "grad_norm": 0.5575588303157542, "learning_rate": 8.30847593460069e-06, "loss": 0.3799, "step": 1880 }, { "epoch": 0.7681034918389004, "grad_norm": 0.6937350938930752, "learning_rate": 7.789855983614826e-06, "loss": 0.3981, "step": 1900 }, { "epoch": 0.776188791752994, "grad_norm": 0.6705779864961804, "learning_rate": 7.2849469049496216e-06, "loss": 0.3982, "step": 1920 }, { "epoch": 0.7842740916670877, "grad_norm": 0.6247131470946646, "learning_rate": 6.794150946483091e-06, "loss": 0.3894, "step": 1940 }, { "epoch": 0.7923593915811815, "grad_norm": 0.7527011970876276, "learning_rate": 6.317859112538943e-06, "loss": 0.3933, "step": 1960 }, { "epoch": 0.8004446914952752, "grad_norm": 0.70959890519849, "learning_rate": 5.856450852383627e-06, "loss": 0.4001, "step": 1980 }, { "epoch": 0.8085299914093689, "grad_norm": 0.7637107619681417, "learning_rate": 5.410293757929027e-06, "loss": 0.3825, "step": 2000 }, { "epoch": 0.8166152913234626, "grad_norm": 0.6687923533892048, "learning_rate": 4.979743270881559e-06, "loss": 0.4063, "step": 2020 }, { "epoch": 0.8247005912375562, "grad_norm": 0.6751152707749739, "learning_rate": 4.56514239957109e-06, "loss": 0.4046, "step": 2040 }, { "epoch": 0.8327858911516499, "grad_norm": 0.6861920430003902, "learning_rate": 4.166821445685087e-06, "loss": 0.3964, "step": 2060 }, { "epoch": 0.8408711910657436, "grad_norm": 0.6621186149402191, "learning_rate": 3.7850977411259705e-06, "loss": 0.3753, "step": 2080 }, { "epoch": 0.8489564909798373, "grad_norm": 0.6470208514078205, "learning_rate": 3.420275395201014e-06, "loss": 0.3818, "step": 2100 }, { "epoch": 0.857041790893931, "grad_norm": 0.6610917416320168, "learning_rate": 3.0726450523464208e-06, "loss": 0.3844, "step": 2120 }, { "epoch": 0.8651270908080246, "grad_norm": 0.6756461162410473, "learning_rate": 2.74248366057851e-06, "loss": 0.3757, "step": 2140 }, { "epoch": 0.8732123907221183, "grad_norm": 0.6961821177637604, "learning_rate": 2.430054250856412e-06, "loss": 0.3803, "step": 2160 }, { "epoch": 0.881297690636212, "grad_norm": 0.7418320454835176, "learning_rate": 2.1356057275322417e-06, "loss": 0.3869, "step": 2180 }, { "epoch": 0.8893829905503057, "grad_norm": 0.6584153523981158, "learning_rate": 1.8593726700554342e-06, "loss": 0.3806, "step": 2200 }, { "epoch": 0.8974682904643995, "grad_norm": 0.6627761573934713, "learning_rate": 1.6015751460894762e-06, "loss": 0.399, "step": 2220 }, { "epoch": 0.9055535903784931, "grad_norm": 0.5659307004910596, "learning_rate": 1.3624185361897135e-06, "loss": 0.3806, "step": 2240 }, { "epoch": 0.9136388902925868, "grad_norm": 0.6155894884801492, "learning_rate": 1.1420933701820347e-06, "loss": 0.3897, "step": 2260 }, { "epoch": 0.9217241902066805, "grad_norm": 0.5448653880194827, "learning_rate": 9.407751753727323e-07, "loss": 0.4017, "step": 2280 }, { "epoch": 0.9298094901207742, "grad_norm": 0.6080654098190524, "learning_rate": 7.586243367104894e-07, "loss": 0.3767, "step": 2300 }, { "epoch": 0.9378947900348679, "grad_norm": 0.7569388927738113, "learning_rate": 5.95785969011875e-07, "loss": 0.3919, "step": 2320 }, { "epoch": 0.9459800899489615, "grad_norm": 0.9500614354033146, "learning_rate": 4.52389801352146e-07, "loss": 0.3918, "step": 2340 }, { "epoch": 0.9540653898630552, "grad_norm": 0.6408282025966349, "learning_rate": 3.285500737135072e-07, "loss": 0.3778, "step": 2360 }, { "epoch": 0.9621506897771489, "grad_norm": 0.7007941767466861, "learning_rate": 2.2436544597307652e-07, "loss": 0.3788, "step": 2380 }, { "epoch": 0.9702359896912426, "grad_norm": 0.6363318157541741, "learning_rate": 1.399189193031475e-07, "loss": 0.383, "step": 2400 }, { "epoch": 0.9783212896053363, "grad_norm": 0.6023554877318151, "learning_rate": 7.527777004632253e-08, "loss": 0.3832, "step": 2420 }, { "epoch": 0.9864065895194299, "grad_norm": 0.7881624086353561, "learning_rate": 3.049349611820851e-08, "loss": 0.3751, "step": 2440 }, { "epoch": 0.9944918894335236, "grad_norm": 0.6497592434119226, "learning_rate": 5.601775980371149e-09, "loss": 0.3836, "step": 2460 } ], "logging_steps": 20, "max_steps": 2474, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 40816993337344.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }