{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.098106712564544,
  "eval_steps": 500,
  "global_step": 1800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01721170395869191,
      "grad_norm": 0.29955029487609863,
      "learning_rate": 2.0293089116901574e-06,
      "loss": 0.6322,
      "step": 10
    },
    {
      "epoch": 0.03442340791738382,
      "grad_norm": 0.06169761344790459,
      "learning_rate": 2.6401917645771237e-06,
      "loss": 0.4697,
      "step": 20
    },
    {
      "epoch": 0.05163511187607573,
      "grad_norm": 0.051926977932453156,
      "learning_rate": 2.9975353258495578e-06,
      "loss": 0.5617,
      "step": 30
    },
    {
      "epoch": 0.06884681583476764,
      "grad_norm": 0.07096195966005325,
      "learning_rate": 3.25107461746409e-06,
      "loss": 0.4301,
      "step": 40
    },
    {
      "epoch": 0.08605851979345955,
      "grad_norm": 0.06899057328701019,
      "learning_rate": 3.4477349704933476e-06,
      "loss": 0.4905,
      "step": 50
    },
    {
      "epoch": 0.10327022375215146,
      "grad_norm": 0.08537387102842331,
      "learning_rate": 3.6084181787365237e-06,
      "loss": 0.4551,
      "step": 60
    },
    {
      "epoch": 0.12048192771084337,
      "grad_norm": 0.049780745059251785,
      "learning_rate": 3.7442738955429737e-06,
      "loss": 0.4058,
      "step": 70
    },
    {
      "epoch": 0.13769363166953527,
      "grad_norm": 0.04421038553118706,
      "learning_rate": 3.861957470351056e-06,
      "loss": 0.6748,
      "step": 80
    },
    {
      "epoch": 0.1549053356282272,
      "grad_norm": 1.9084473848342896,
      "learning_rate": 3.965761740008958e-06,
      "loss": 0.8719,
      "step": 90
    },
    {
      "epoch": 0.1721170395869191,
      "grad_norm": 0.08046019077301025,
      "learning_rate": 4.058617823380315e-06,
      "loss": 0.4635,
      "step": 100
    },
    {
      "epoch": 0.18932874354561102,
      "grad_norm": 0.21439455449581146,
      "learning_rate": 4.142616368250685e-06,
      "loss": 0.928,
      "step": 110
    },
    {
      "epoch": 0.20654044750430292,
      "grad_norm": 0.06055545434355736,
      "learning_rate": 4.21930103162349e-06,
      "loss": 0.3721,
      "step": 120
    },
    {
      "epoch": 0.22375215146299485,
      "grad_norm": 0.08670035004615784,
      "learning_rate": 4.289844083644429e-06,
      "loss": 0.7536,
      "step": 130
    },
    {
      "epoch": 0.24096385542168675,
      "grad_norm": 0.06118405610322952,
      "learning_rate": 4.355156748429939e-06,
      "loss": 0.9829,
      "step": 140
    },
    {
      "epoch": 0.25817555938037867,
      "grad_norm": 0.04853704199194908,
      "learning_rate": 4.415961384652748e-06,
      "loss": 0.4444,
      "step": 150
    },
    {
      "epoch": 0.27538726333907054,
      "grad_norm": 0.03537767753005028,
      "learning_rate": 4.472840323238023e-06,
      "loss": 0.5064,
      "step": 160
    },
    {
      "epoch": 0.29259896729776247,
      "grad_norm": 0.06154410541057587,
      "learning_rate": 4.52626987322263e-06,
      "loss": 0.5456,
      "step": 170
    },
    {
      "epoch": 0.3098106712564544,
      "grad_norm": 0.052560485899448395,
      "learning_rate": 4.576644592895925e-06,
      "loss": 0.5106,
      "step": 180
    },
    {
      "epoch": 0.3270223752151463,
      "grad_norm": 0.04913010448217392,
      "learning_rate": 4.6242949899596115e-06,
      "loss": 0.4026,
      "step": 190
    },
    {
      "epoch": 0.3442340791738382,
      "grad_norm": 0.07974158972501755,
      "learning_rate": 4.66950067626728e-06,
      "loss": 0.4828,
      "step": 200
    },
    {
      "epoch": 0.3614457831325301,
      "grad_norm": 0.03538183122873306,
      "learning_rate": 4.712500309702374e-06,
      "loss": 0.3549,
      "step": 210
    },
    {
      "epoch": 0.37865748709122204,
      "grad_norm": 0.21638496220111847,
      "learning_rate": 4.753499221137652e-06,
      "loss": 0.4912,
      "step": 220
    },
    {
      "epoch": 0.3958691910499139,
      "grad_norm": 0.03895362466573715,
      "learning_rate": 4.792675344617211e-06,
      "loss": 0.3846,
      "step": 230
    },
    {
      "epoch": 0.41308089500860584,
      "grad_norm": 0.03565879911184311,
      "learning_rate": 4.830183884510456e-06,
      "loss": 0.8434,
      "step": 240
    },
    {
      "epoch": 0.43029259896729777,
      "grad_norm": 0.03526683151721954,
      "learning_rate": 4.866161029296539e-06,
      "loss": 0.3603,
      "step": 250
    },
    {
      "epoch": 0.4475043029259897,
      "grad_norm": 0.064102903008461,
      "learning_rate": 4.900726936531396e-06,
      "loss": 0.5178,
      "step": 260
    },
    {
      "epoch": 0.46471600688468157,
      "grad_norm": 0.06982860714197159,
      "learning_rate": 4.9339881541683585e-06,
      "loss": 0.3712,
      "step": 270
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.0654272809624672,
      "learning_rate": 4.966039601316906e-06,
      "loss": 0.9119,
      "step": 280
    },
    {
      "epoch": 0.4991394148020654,
      "grad_norm": 0.04955059662461281,
      "learning_rate": 4.9969662012643525e-06,
      "loss": 0.3874,
      "step": 290
    },
    {
      "epoch": 0.5163511187607573,
      "grad_norm": 1.0234352350234985,
      "learning_rate": 4.984697781178272e-06,
      "loss": 0.8952,
      "step": 300
    },
    {
      "epoch": 0.5335628227194492,
      "grad_norm": 0.03769606724381447,
      "learning_rate": 4.96557000765111e-06,
      "loss": 0.3347,
      "step": 310
    },
    {
      "epoch": 0.5507745266781411,
      "grad_norm": 0.11739111691713333,
      "learning_rate": 4.946442234123948e-06,
      "loss": 0.3677,
      "step": 320
    },
    {
      "epoch": 0.5679862306368331,
      "grad_norm": 0.04959660395979881,
      "learning_rate": 4.927314460596787e-06,
      "loss": 1.1762,
      "step": 330
    },
    {
      "epoch": 0.5851979345955249,
      "grad_norm": 0.1042531356215477,
      "learning_rate": 4.908186687069626e-06,
      "loss": 0.4252,
      "step": 340
    },
    {
      "epoch": 0.6024096385542169,
      "grad_norm": 0.05064910277724266,
      "learning_rate": 4.889058913542464e-06,
      "loss": 0.3836,
      "step": 350
    },
    {
      "epoch": 0.6196213425129088,
      "grad_norm": 0.0689607635140419,
      "learning_rate": 4.869931140015303e-06,
      "loss": 0.7539,
      "step": 360
    },
    {
      "epoch": 0.6368330464716007,
      "grad_norm": 0.23462702333927155,
      "learning_rate": 4.850803366488141e-06,
      "loss": 0.8236,
      "step": 370
    },
    {
      "epoch": 0.6540447504302926,
      "grad_norm": 0.11018137633800507,
      "learning_rate": 4.83167559296098e-06,
      "loss": 0.4839,
      "step": 380
    },
    {
      "epoch": 0.6712564543889845,
      "grad_norm": 0.0751522108912468,
      "learning_rate": 4.812547819433818e-06,
      "loss": 0.5791,
      "step": 390
    },
    {
      "epoch": 0.6884681583476764,
      "grad_norm": 0.17227555811405182,
      "learning_rate": 4.793420045906657e-06,
      "loss": 0.7993,
      "step": 400
    },
    {
      "epoch": 0.7056798623063684,
      "grad_norm": 0.0664035975933075,
      "learning_rate": 4.7742922723794954e-06,
      "loss": 0.387,
      "step": 410
    },
    {
      "epoch": 0.7228915662650602,
      "grad_norm": 0.04762504622340202,
      "learning_rate": 4.755164498852334e-06,
      "loss": 0.5436,
      "step": 420
    },
    {
      "epoch": 0.7401032702237521,
      "grad_norm": 0.03658389300107956,
      "learning_rate": 4.736036725325173e-06,
      "loss": 0.6715,
      "step": 430
    },
    {
      "epoch": 0.7573149741824441,
      "grad_norm": 0.03955502808094025,
      "learning_rate": 4.716908951798011e-06,
      "loss": 0.4902,
      "step": 440
    },
    {
      "epoch": 0.774526678141136,
      "grad_norm": 0.05926811322569847,
      "learning_rate": 4.69778117827085e-06,
      "loss": 0.7329,
      "step": 450
    },
    {
      "epoch": 0.7917383820998278,
      "grad_norm": 0.26404136419296265,
      "learning_rate": 4.678653404743688e-06,
      "loss": 0.5748,
      "step": 460
    },
    {
      "epoch": 0.8089500860585198,
      "grad_norm": 0.07195431739091873,
      "learning_rate": 4.6595256312165265e-06,
      "loss": 0.5501,
      "step": 470
    },
    {
      "epoch": 0.8261617900172117,
      "grad_norm": 0.0486939400434494,
      "learning_rate": 4.640397857689365e-06,
      "loss": 0.4527,
      "step": 480
    },
    {
      "epoch": 0.8433734939759037,
      "grad_norm": 0.05488497018814087,
      "learning_rate": 4.621270084162204e-06,
      "loss": 0.8637,
      "step": 490
    },
    {
      "epoch": 0.8605851979345955,
      "grad_norm": 0.045418575406074524,
      "learning_rate": 4.6021423106350425e-06,
      "loss": 0.437,
      "step": 500
    },
    {
      "epoch": 0.8777969018932874,
      "grad_norm": 0.04055708646774292,
      "learning_rate": 4.583014537107881e-06,
      "loss": 0.6466,
      "step": 510
    },
    {
      "epoch": 0.8950086058519794,
      "grad_norm": 0.03856475651264191,
      "learning_rate": 4.563886763580719e-06,
      "loss": 0.669,
      "step": 520
    },
    {
      "epoch": 0.9122203098106713,
      "grad_norm": 0.035741958767175674,
      "learning_rate": 4.5447589900535585e-06,
      "loss": 0.3615,
      "step": 530
    },
    {
      "epoch": 0.9294320137693631,
      "grad_norm": 0.04278489947319031,
      "learning_rate": 4.525631216526396e-06,
      "loss": 0.3849,
      "step": 540
    },
    {
      "epoch": 0.9466437177280551,
      "grad_norm": 0.031775712966918945,
      "learning_rate": 4.506503442999236e-06,
      "loss": 0.6446,
      "step": 550
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 0.19989252090454102,
      "learning_rate": 4.487375669472074e-06,
      "loss": 0.6668,
      "step": 560
    },
    {
      "epoch": 0.9810671256454389,
      "grad_norm": 0.04056662693619728,
      "learning_rate": 4.468247895944912e-06,
      "loss": 0.4243,
      "step": 570
    },
    {
      "epoch": 0.9982788296041308,
      "grad_norm": 0.06392610818147659,
      "learning_rate": 4.449120122417751e-06,
      "loss": 0.3431,
      "step": 580
    },
    {
      "epoch": 1.0154905335628228,
      "grad_norm": 0.03935154527425766,
      "learning_rate": 4.42999234889059e-06,
      "loss": 0.5167,
      "step": 590
    },
    {
      "epoch": 1.0327022375215147,
      "grad_norm": 0.05566889047622681,
      "learning_rate": 4.410864575363428e-06,
      "loss": 0.4372,
      "step": 600
    },
    {
      "epoch": 1.0499139414802066,
      "grad_norm": 0.07127536088228226,
      "learning_rate": 4.391736801836267e-06,
      "loss": 1.4152,
      "step": 610
    },
    {
      "epoch": 1.0671256454388984,
      "grad_norm": 0.04618392139673233,
      "learning_rate": 4.372609028309105e-06,
      "loss": 0.601,
      "step": 620
    },
    {
      "epoch": 1.0843373493975903,
      "grad_norm": 0.04588570445775986,
      "learning_rate": 4.3534812547819434e-06,
      "loss": 0.4723,
      "step": 630
    },
    {
      "epoch": 1.1015490533562822,
      "grad_norm": 0.03991321101784706,
      "learning_rate": 4.334353481254782e-06,
      "loss": 0.4807,
      "step": 640
    },
    {
      "epoch": 1.1187607573149743,
      "grad_norm": 0.2501582205295563,
      "learning_rate": 4.315225707727621e-06,
      "loss": 0.8098,
      "step": 650
    },
    {
      "epoch": 1.1359724612736661,
      "grad_norm": 0.042163778096437454,
      "learning_rate": 4.296097934200459e-06,
      "loss": 0.4158,
      "step": 660
    },
    {
      "epoch": 1.153184165232358,
      "grad_norm": 0.04054609313607216,
      "learning_rate": 4.276970160673298e-06,
      "loss": 0.3728,
      "step": 670
    },
    {
      "epoch": 1.1703958691910499,
      "grad_norm": 0.0925000011920929,
      "learning_rate": 4.257842387146137e-06,
      "loss": 0.4251,
      "step": 680
    },
    {
      "epoch": 1.1876075731497417,
      "grad_norm": 0.06017041206359863,
      "learning_rate": 4.2387146136189745e-06,
      "loss": 0.4782,
      "step": 690
    },
    {
      "epoch": 1.2048192771084336,
      "grad_norm": 0.040517594665288925,
      "learning_rate": 4.219586840091814e-06,
      "loss": 0.4354,
      "step": 700
    },
    {
      "epoch": 1.2220309810671257,
      "grad_norm": 0.04731125384569168,
      "learning_rate": 4.200459066564652e-06,
      "loss": 0.4969,
      "step": 710
    },
    {
      "epoch": 1.2392426850258176,
      "grad_norm": 0.050880610942840576,
      "learning_rate": 4.1813312930374905e-06,
      "loss": 0.492,
      "step": 720
    },
    {
      "epoch": 1.2564543889845095,
      "grad_norm": 0.04548948258161545,
      "learning_rate": 4.162203519510329e-06,
      "loss": 0.3914,
      "step": 730
    },
    {
      "epoch": 1.2736660929432013,
      "grad_norm": 0.03825736418366432,
      "learning_rate": 4.143075745983168e-06,
      "loss": 0.3921,
      "step": 740
    },
    {
      "epoch": 1.2908777969018934,
      "grad_norm": 0.046227287501096725,
      "learning_rate": 4.1239479724560065e-06,
      "loss": 0.4632,
      "step": 750
    },
    {
      "epoch": 1.3080895008605853,
      "grad_norm": 0.04002716392278671,
      "learning_rate": 4.104820198928845e-06,
      "loss": 0.7436,
      "step": 760
    },
    {
      "epoch": 1.3253012048192772,
      "grad_norm": 0.04381329566240311,
      "learning_rate": 4.085692425401683e-06,
      "loss": 0.5388,
      "step": 770
    },
    {
      "epoch": 1.342512908777969,
      "grad_norm": 0.09227538853883743,
      "learning_rate": 4.0665646518745225e-06,
      "loss": 0.7008,
      "step": 780
    },
    {
      "epoch": 1.359724612736661,
      "grad_norm": 0.0453125424683094,
      "learning_rate": 4.04743687834736e-06,
      "loss": 0.4813,
      "step": 790
    },
    {
      "epoch": 1.3769363166953528,
      "grad_norm": 0.20484060049057007,
      "learning_rate": 4.0283091048202e-06,
      "loss": 0.6594,
      "step": 800
    },
    {
      "epoch": 1.3941480206540446,
      "grad_norm": 0.05485668033361435,
      "learning_rate": 4.009181331293038e-06,
      "loss": 0.6538,
      "step": 810
    },
    {
      "epoch": 1.4113597246127367,
      "grad_norm": 0.04452645406126976,
      "learning_rate": 3.990053557765876e-06,
      "loss": 0.3713,
      "step": 820
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.03632510080933571,
      "learning_rate": 3.970925784238715e-06,
      "loss": 0.3395,
      "step": 830
    },
    {
      "epoch": 1.4457831325301205,
      "grad_norm": 0.0884113535284996,
      "learning_rate": 3.951798010711554e-06,
      "loss": 0.3602,
      "step": 840
    },
    {
      "epoch": 1.4629948364888123,
      "grad_norm": 0.1275469958782196,
      "learning_rate": 3.932670237184392e-06,
      "loss": 0.4533,
      "step": 850
    },
    {
      "epoch": 1.4802065404475044,
      "grad_norm": 0.03843805938959122,
      "learning_rate": 3.913542463657231e-06,
      "loss": 0.7519,
      "step": 860
    },
    {
      "epoch": 1.4974182444061963,
      "grad_norm": 0.03635178506374359,
      "learning_rate": 3.89441469013007e-06,
      "loss": 0.388,
      "step": 870
    },
    {
      "epoch": 1.5146299483648882,
      "grad_norm": 0.039031002670526505,
      "learning_rate": 3.875286916602907e-06,
      "loss": 0.4425,
      "step": 880
    },
    {
      "epoch": 1.53184165232358,
      "grad_norm": 0.04110798239707947,
      "learning_rate": 3.856159143075746e-06,
      "loss": 0.4095,
      "step": 890
    },
    {
      "epoch": 1.549053356282272,
      "grad_norm": 0.04002736508846283,
      "learning_rate": 3.837031369548585e-06,
      "loss": 0.6104,
      "step": 900
    },
    {
      "epoch": 1.5662650602409638,
      "grad_norm": 0.03314425051212311,
      "learning_rate": 3.817903596021423e-06,
      "loss": 0.5594,
      "step": 910
    },
    {
      "epoch": 1.5834767641996557,
      "grad_norm": 0.03947990760207176,
      "learning_rate": 3.798775822494262e-06,
      "loss": 0.4931,
      "step": 920
    },
    {
      "epoch": 1.6006884681583475,
      "grad_norm": 0.05939627066254616,
      "learning_rate": 3.7796480489671007e-06,
      "loss": 0.5127,
      "step": 930
    },
    {
      "epoch": 1.6179001721170396,
      "grad_norm": 0.03439631685614586,
      "learning_rate": 3.760520275439939e-06,
      "loss": 0.4139,
      "step": 940
    },
    {
      "epoch": 1.6351118760757315,
      "grad_norm": 0.06566853076219559,
      "learning_rate": 3.7413925019127776e-06,
      "loss": 0.6641,
      "step": 950
    },
    {
      "epoch": 1.6523235800344234,
      "grad_norm": 0.06731946766376495,
      "learning_rate": 3.7222647283856163e-06,
      "loss": 0.6865,
      "step": 960
    },
    {
      "epoch": 1.6695352839931155,
      "grad_norm": 0.03529343381524086,
      "learning_rate": 3.703136954858455e-06,
      "loss": 0.6395,
      "step": 970
    },
    {
      "epoch": 1.6867469879518073,
      "grad_norm": 0.09028229117393494,
      "learning_rate": 3.684009181331293e-06,
      "loss": 0.774,
      "step": 980
    },
    {
      "epoch": 1.7039586919104992,
      "grad_norm": 0.04828124865889549,
      "learning_rate": 3.664881407804132e-06,
      "loss": 0.4953,
      "step": 990
    },
    {
      "epoch": 1.721170395869191,
      "grad_norm": 0.050330750644207,
      "learning_rate": 3.6457536342769705e-06,
      "loss": 0.6435,
      "step": 1000
    },
    {
      "epoch": 1.738382099827883,
      "grad_norm": 0.03781217709183693,
      "learning_rate": 3.6266258607498087e-06,
      "loss": 0.4538,
      "step": 1010
    },
    {
      "epoch": 1.7555938037865748,
      "grad_norm": 0.053586967289447784,
      "learning_rate": 3.607498087222648e-06,
      "loss": 0.384,
      "step": 1020
    },
    {
      "epoch": 1.7728055077452667,
      "grad_norm": 0.04280597344040871,
      "learning_rate": 3.588370313695486e-06,
      "loss": 0.385,
      "step": 1030
    },
    {
      "epoch": 1.7900172117039586,
      "grad_norm": 0.05530484393239021,
      "learning_rate": 3.5692425401683243e-06,
      "loss": 0.732,
      "step": 1040
    },
    {
      "epoch": 1.8072289156626506,
      "grad_norm": 0.05707624554634094,
      "learning_rate": 3.5501147666411634e-06,
      "loss": 0.4075,
      "step": 1050
    },
    {
      "epoch": 1.8244406196213425,
      "grad_norm": 0.07795403897762299,
      "learning_rate": 3.5309869931140016e-06,
      "loss": 1.0486,
      "step": 1060
    },
    {
      "epoch": 1.8416523235800344,
      "grad_norm": 0.08253274112939835,
      "learning_rate": 3.5118592195868407e-06,
      "loss": 0.7014,
      "step": 1070
    },
    {
      "epoch": 1.8588640275387265,
      "grad_norm": 0.037665221840143204,
      "learning_rate": 3.492731446059679e-06,
      "loss": 0.5129,
      "step": 1080
    },
    {
      "epoch": 1.8760757314974184,
      "grad_norm": 0.08074070513248444,
      "learning_rate": 3.473603672532517e-06,
      "loss": 0.6965,
      "step": 1090
    },
    {
      "epoch": 1.8932874354561102,
      "grad_norm": 0.053863946348428726,
      "learning_rate": 3.4544758990053563e-06,
      "loss": 0.3608,
      "step": 1100
    },
    {
      "epoch": 1.910499139414802,
      "grad_norm": 0.03980562463402748,
      "learning_rate": 3.4353481254781945e-06,
      "loss": 0.3408,
      "step": 1110
    },
    {
      "epoch": 1.927710843373494,
      "grad_norm": 0.03091476857662201,
      "learning_rate": 3.4162203519510336e-06,
      "loss": 0.4147,
      "step": 1120
    },
    {
      "epoch": 1.9449225473321858,
      "grad_norm": 0.05423520505428314,
      "learning_rate": 3.399005355776588e-06,
      "loss": 0.501,
      "step": 1130
    },
    {
      "epoch": 1.9621342512908777,
      "grad_norm": 0.056222882121801376,
      "learning_rate": 3.379877582249426e-06,
      "loss": 0.6646,
      "step": 1140
    },
    {
      "epoch": 1.9793459552495696,
      "grad_norm": 0.04780727997422218,
      "learning_rate": 3.360749808722265e-06,
      "loss": 0.4433,
      "step": 1150
    },
    {
      "epoch": 1.9965576592082617,
      "grad_norm": 0.0465485118329525,
      "learning_rate": 3.3416220351951034e-06,
      "loss": 0.4117,
      "step": 1160
    },
    {
      "epoch": 2.0137693631669533,
      "grad_norm": 0.038410015404224396,
      "learning_rate": 3.3224942616679424e-06,
      "loss": 0.9719,
      "step": 1170
    },
    {
      "epoch": 2.0309810671256456,
      "grad_norm": 0.03839205205440521,
      "learning_rate": 3.3033664881407807e-06,
      "loss": 0.5383,
      "step": 1180
    },
    {
      "epoch": 2.0481927710843375,
      "grad_norm": 0.05250284820795059,
      "learning_rate": 3.284238714613619e-06,
      "loss": 0.5573,
      "step": 1190
    },
    {
      "epoch": 2.0654044750430294,
      "grad_norm": 0.05850391089916229,
      "learning_rate": 3.265110941086458e-06,
      "loss": 0.3652,
      "step": 1200
    },
    {
      "epoch": 2.0826161790017212,
      "grad_norm": 0.03551226481795311,
      "learning_rate": 3.2459831675592962e-06,
      "loss": 1.1687,
      "step": 1210
    },
    {
      "epoch": 2.099827882960413,
      "grad_norm": 0.035683631896972656,
      "learning_rate": 3.226855394032135e-06,
      "loss": 0.3377,
      "step": 1220
    },
    {
      "epoch": 2.117039586919105,
      "grad_norm": 0.05406322330236435,
      "learning_rate": 3.2077276205049736e-06,
      "loss": 0.4614,
      "step": 1230
    },
    {
      "epoch": 2.134251290877797,
      "grad_norm": 0.030787965282797813,
      "learning_rate": 3.188599846977812e-06,
      "loss": 0.3771,
      "step": 1240
    },
    {
      "epoch": 2.1514629948364887,
      "grad_norm": 0.04496818408370018,
      "learning_rate": 3.169472073450651e-06,
      "loss": 0.4846,
      "step": 1250
    },
    {
      "epoch": 2.1686746987951806,
      "grad_norm": 0.03633632883429527,
      "learning_rate": 3.150344299923489e-06,
      "loss": 0.3549,
      "step": 1260
    },
    {
      "epoch": 2.1858864027538725,
      "grad_norm": 0.033117033541202545,
      "learning_rate": 3.1312165263963278e-06,
      "loss": 0.4224,
      "step": 1270
    },
    {
      "epoch": 2.2030981067125643,
      "grad_norm": 0.04940853640437126,
      "learning_rate": 3.1120887528691664e-06,
      "loss": 0.6976,
      "step": 1280
    },
    {
      "epoch": 2.2203098106712567,
      "grad_norm": 0.03474991396069527,
      "learning_rate": 3.092960979342005e-06,
      "loss": 0.5837,
      "step": 1290
    },
    {
      "epoch": 2.2375215146299485,
      "grad_norm": 0.08616980165243149,
      "learning_rate": 3.0738332058148433e-06,
      "loss": 0.5885,
      "step": 1300
    },
    {
      "epoch": 2.2547332185886404,
      "grad_norm": 0.04921899363398552,
      "learning_rate": 3.054705432287682e-06,
      "loss": 0.4007,
      "step": 1310
    },
    {
      "epoch": 2.2719449225473323,
      "grad_norm": 0.033128101378679276,
      "learning_rate": 3.0355776587605207e-06,
      "loss": 0.3948,
      "step": 1320
    },
    {
      "epoch": 2.289156626506024,
      "grad_norm": 0.0420563630759716,
      "learning_rate": 3.016449885233359e-06,
      "loss": 0.6675,
      "step": 1330
    },
    {
      "epoch": 2.306368330464716,
      "grad_norm": 0.04620426893234253,
      "learning_rate": 2.997322111706198e-06,
      "loss": 0.3454,
      "step": 1340
    },
    {
      "epoch": 2.323580034423408,
      "grad_norm": 0.031115278601646423,
      "learning_rate": 2.9781943381790362e-06,
      "loss": 0.4697,
      "step": 1350
    },
    {
      "epoch": 2.3407917383820998,
      "grad_norm": 0.03716883435845375,
      "learning_rate": 2.9590665646518745e-06,
      "loss": 0.7016,
      "step": 1360
    },
    {
      "epoch": 2.3580034423407916,
      "grad_norm": 0.2217116802930832,
      "learning_rate": 2.9399387911247135e-06,
      "loss": 0.6504,
      "step": 1370
    },
    {
      "epoch": 2.3752151462994835,
      "grad_norm": 0.08799983561038971,
      "learning_rate": 2.9208110175975518e-06,
      "loss": 0.3518,
      "step": 1380
    },
    {
      "epoch": 2.3924268502581754,
      "grad_norm": 0.03414052352309227,
      "learning_rate": 2.901683244070391e-06,
      "loss": 0.5522,
      "step": 1390
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 0.14305748045444489,
      "learning_rate": 2.882555470543229e-06,
      "loss": 0.7692,
      "step": 1400
    },
    {
      "epoch": 2.4268502581755595,
      "grad_norm": 0.04776856303215027,
      "learning_rate": 2.8634276970160673e-06,
      "loss": 0.4163,
      "step": 1410
    },
    {
      "epoch": 2.4440619621342514,
      "grad_norm": 0.06117096543312073,
      "learning_rate": 2.8442999234889064e-06,
      "loss": 0.3797,
      "step": 1420
    },
    {
      "epoch": 2.4612736660929433,
      "grad_norm": 0.1437849998474121,
      "learning_rate": 2.8251721499617447e-06,
      "loss": 0.3978,
      "step": 1430
    },
    {
      "epoch": 2.478485370051635,
      "grad_norm": 0.03535407409071922,
      "learning_rate": 2.8060443764345833e-06,
      "loss": 0.7543,
      "step": 1440
    },
    {
      "epoch": 2.495697074010327,
      "grad_norm": 0.034573543816804886,
      "learning_rate": 2.786916602907422e-06,
      "loss": 0.4385,
      "step": 1450
    },
    {
      "epoch": 2.512908777969019,
      "grad_norm": 0.05264075845479965,
      "learning_rate": 2.7677888293802602e-06,
      "loss": 0.5788,
      "step": 1460
    },
    {
      "epoch": 2.5301204819277108,
      "grad_norm": 0.047263339161872864,
      "learning_rate": 2.748661055853099e-06,
      "loss": 0.5397,
      "step": 1470
    },
    {
      "epoch": 2.5473321858864026,
      "grad_norm": 0.03852943331003189,
      "learning_rate": 2.7295332823259375e-06,
      "loss": 0.3995,
      "step": 1480
    },
    {
      "epoch": 2.5645438898450945,
      "grad_norm": 0.04756772890686989,
      "learning_rate": 2.710405508798776e-06,
      "loss": 0.5136,
      "step": 1490
    },
    {
      "epoch": 2.581755593803787,
      "grad_norm": 0.07750029861927032,
      "learning_rate": 2.6912777352716144e-06,
      "loss": 0.8293,
      "step": 1500
    },
    {
      "epoch": 2.5989672977624787,
      "grad_norm": 0.047012392431497574,
      "learning_rate": 2.672149961744453e-06,
      "loss": 0.5485,
      "step": 1510
    },
    {
      "epoch": 2.6161790017211706,
      "grad_norm": 0.04318179562687874,
      "learning_rate": 2.6530221882172918e-06,
      "loss": 0.4112,
      "step": 1520
    },
    {
      "epoch": 2.6333907056798624,
      "grad_norm": 0.06012555584311485,
      "learning_rate": 2.63389441469013e-06,
      "loss": 0.7031,
      "step": 1530
    },
    {
      "epoch": 2.6506024096385543,
      "grad_norm": 0.03384987264871597,
      "learning_rate": 2.614766641162969e-06,
      "loss": 0.439,
      "step": 1540
    },
    {
      "epoch": 2.667814113597246,
      "grad_norm": 0.05770883336663246,
      "learning_rate": 2.5956388676358073e-06,
      "loss": 0.3991,
      "step": 1550
    },
    {
      "epoch": 2.685025817555938,
      "grad_norm": 0.05510050430893898,
      "learning_rate": 2.5765110941086456e-06,
      "loss": 0.9784,
      "step": 1560
    },
    {
      "epoch": 2.70223752151463,
      "grad_norm": 0.055017050355672836,
      "learning_rate": 2.5573833205814846e-06,
      "loss": 0.3796,
      "step": 1570
    },
    {
      "epoch": 2.719449225473322,
      "grad_norm": 0.04332127049565315,
      "learning_rate": 2.538255547054323e-06,
      "loss": 0.433,
      "step": 1580
    },
    {
      "epoch": 2.7366609294320137,
      "grad_norm": 0.060054711997509,
      "learning_rate": 2.519127773527162e-06,
      "loss": 0.2799,
      "step": 1590
    },
    {
      "epoch": 2.7538726333907055,
      "grad_norm": 0.0340825691819191,
      "learning_rate": 2.5e-06,
      "loss": 0.6797,
      "step": 1600
    },
    {
      "epoch": 2.7710843373493974,
      "grad_norm": 0.22405555844306946,
      "learning_rate": 2.480872226472839e-06,
      "loss": 0.6071,
      "step": 1610
    },
    {
      "epoch": 2.7882960413080893,
      "grad_norm": 0.04493927210569382,
      "learning_rate": 2.4617444529456775e-06,
      "loss": 0.4004,
      "step": 1620
    },
    {
      "epoch": 2.805507745266781,
      "grad_norm": 0.06454917788505554,
      "learning_rate": 2.4426166794185158e-06,
      "loss": 0.3903,
      "step": 1630
    },
    {
      "epoch": 2.8227194492254735,
      "grad_norm": 0.07336492091417313,
      "learning_rate": 2.4234889058913544e-06,
      "loss": 0.9157,
      "step": 1640
    },
    {
      "epoch": 2.8399311531841653,
      "grad_norm": 0.08775831758975983,
      "learning_rate": 2.404361132364193e-06,
      "loss": 0.4865,
      "step": 1650
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.03372660651803017,
      "learning_rate": 2.3852333588370317e-06,
      "loss": 0.3975,
      "step": 1660
    },
    {
      "epoch": 2.874354561101549,
      "grad_norm": 0.034449730068445206,
      "learning_rate": 2.3661055853098704e-06,
      "loss": 0.3927,
      "step": 1670
    },
    {
      "epoch": 2.891566265060241,
      "grad_norm": 0.02975647896528244,
      "learning_rate": 2.3469778117827086e-06,
      "loss": 0.3664,
      "step": 1680
    },
    {
      "epoch": 2.908777969018933,
      "grad_norm": 0.037901297211647034,
      "learning_rate": 2.3278500382555473e-06,
      "loss": 0.3973,
      "step": 1690
    },
    {
      "epoch": 2.9259896729776247,
      "grad_norm": 0.05662724748253822,
      "learning_rate": 2.308722264728386e-06,
      "loss": 0.4422,
      "step": 1700
    },
    {
      "epoch": 2.9432013769363166,
      "grad_norm": 0.044157788157463074,
      "learning_rate": 2.289594491201224e-06,
      "loss": 0.4324,
      "step": 1710
    },
    {
      "epoch": 2.960413080895009,
      "grad_norm": 0.04280713573098183,
      "learning_rate": 2.270466717674063e-06,
      "loss": 0.5674,
      "step": 1720
    },
    {
      "epoch": 2.9776247848537007,
      "grad_norm": 0.04871043935418129,
      "learning_rate": 2.2513389441469015e-06,
      "loss": 0.3223,
      "step": 1730
    },
    {
      "epoch": 2.9948364888123926,
      "grad_norm": 0.036149609833955765,
      "learning_rate": 2.2322111706197398e-06,
      "loss": 0.6471,
      "step": 1740
    },
    {
      "epoch": 3.0120481927710845,
      "grad_norm": 0.02951321005821228,
      "learning_rate": 2.2130833970925784e-06,
      "loss": 0.3926,
      "step": 1750
    },
    {
      "epoch": 3.0292598967297764,
      "grad_norm": 0.04006199911236763,
      "learning_rate": 2.193955623565417e-06,
      "loss": 0.6222,
      "step": 1760
    },
    {
      "epoch": 3.0464716006884682,
      "grad_norm": 0.03238508850336075,
      "learning_rate": 2.1748278500382557e-06,
      "loss": 0.4144,
      "step": 1770
    },
    {
      "epoch": 3.06368330464716,
      "grad_norm": 0.035425204783678055,
      "learning_rate": 2.1557000765110944e-06,
      "loss": 0.3745,
      "step": 1780
    },
    {
      "epoch": 3.080895008605852,
      "grad_norm": 0.08181657642126083,
      "learning_rate": 2.1365723029839326e-06,
      "loss": 0.4049,
      "step": 1790
    },
    {
      "epoch": 3.098106712564544,
      "grad_norm": 0.03448079526424408,
      "learning_rate": 2.1174445294567713e-06,
      "loss": 0.5435,
      "step": 1800
    }
  ],
  "logging_steps": 10,
  "max_steps": 2905,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}