{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1856,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005387931034482759,
      "grad_norm": 24.06527582915772,
      "learning_rate": 5.376344086021506e-08,
      "loss": 1.3568,
      "step": 1
    },
    {
      "epoch": 0.0026939655172413795,
      "grad_norm": 23.2847675267083,
      "learning_rate": 2.688172043010753e-07,
      "loss": 1.3668,
      "step": 5
    },
    {
      "epoch": 0.005387931034482759,
      "grad_norm": 16.195930738756566,
      "learning_rate": 5.376344086021506e-07,
      "loss": 1.3204,
      "step": 10
    },
    {
      "epoch": 0.008081896551724138,
      "grad_norm": 12.068298869370592,
      "learning_rate": 8.064516129032258e-07,
      "loss": 1.153,
      "step": 15
    },
    {
      "epoch": 0.010775862068965518,
      "grad_norm": 8.564123494535863,
      "learning_rate": 1.0752688172043011e-06,
      "loss": 1.0452,
      "step": 20
    },
    {
      "epoch": 0.013469827586206896,
      "grad_norm": 3.533789309391932,
      "learning_rate": 1.3440860215053765e-06,
      "loss": 0.9515,
      "step": 25
    },
    {
      "epoch": 0.016163793103448277,
      "grad_norm": 3.24461197562523,
      "learning_rate": 1.6129032258064516e-06,
      "loss": 0.9001,
      "step": 30
    },
    {
      "epoch": 0.018857758620689655,
      "grad_norm": 2.990611660406535,
      "learning_rate": 1.881720430107527e-06,
      "loss": 0.8773,
      "step": 35
    },
    {
      "epoch": 0.021551724137931036,
      "grad_norm": 3.0063853939062346,
      "learning_rate": 2.1505376344086023e-06,
      "loss": 0.851,
      "step": 40
    },
    {
      "epoch": 0.024245689655172414,
      "grad_norm": 2.956366561006899,
      "learning_rate": 2.4193548387096776e-06,
      "loss": 0.8574,
      "step": 45
    },
    {
      "epoch": 0.02693965517241379,
      "grad_norm": 2.983398789032246,
      "learning_rate": 2.688172043010753e-06,
      "loss": 0.84,
      "step": 50
    },
    {
      "epoch": 0.029633620689655173,
      "grad_norm": 2.964731632227324,
      "learning_rate": 2.9569892473118283e-06,
      "loss": 0.824,
      "step": 55
    },
    {
      "epoch": 0.032327586206896554,
      "grad_norm": 2.9208803498660623,
      "learning_rate": 3.225806451612903e-06,
      "loss": 0.8138,
      "step": 60
    },
    {
      "epoch": 0.03502155172413793,
      "grad_norm": 3.2063303145455366,
      "learning_rate": 3.494623655913979e-06,
      "loss": 0.8009,
      "step": 65
    },
    {
      "epoch": 0.03771551724137931,
      "grad_norm": 3.242653708652505,
      "learning_rate": 3.763440860215054e-06,
      "loss": 0.792,
      "step": 70
    },
    {
      "epoch": 0.04040948275862069,
      "grad_norm": 3.1462448663803846,
      "learning_rate": 4.032258064516129e-06,
      "loss": 0.7902,
      "step": 75
    },
    {
      "epoch": 0.04310344827586207,
      "grad_norm": 3.0229975986392716,
      "learning_rate": 4.3010752688172045e-06,
      "loss": 0.7699,
      "step": 80
    },
    {
      "epoch": 0.045797413793103446,
      "grad_norm": 3.12423094671722,
      "learning_rate": 4.56989247311828e-06,
      "loss": 0.7644,
      "step": 85
    },
    {
      "epoch": 0.04849137931034483,
      "grad_norm": 3.2796596768473902,
      "learning_rate": 4.838709677419355e-06,
      "loss": 0.7712,
      "step": 90
    },
    {
      "epoch": 0.05118534482758621,
      "grad_norm": 3.0184242042359943,
      "learning_rate": 5.1075268817204305e-06,
      "loss": 0.7546,
      "step": 95
    },
    {
      "epoch": 0.05387931034482758,
      "grad_norm": 3.0881392753326447,
      "learning_rate": 5.376344086021506e-06,
      "loss": 0.7487,
      "step": 100
    },
    {
      "epoch": 0.056573275862068964,
      "grad_norm": 3.4110841994799657,
      "learning_rate": 5.645161290322582e-06,
      "loss": 0.7496,
      "step": 105
    },
    {
      "epoch": 0.059267241379310345,
      "grad_norm": 2.92733810047956,
      "learning_rate": 5.9139784946236566e-06,
      "loss": 0.7368,
      "step": 110
    },
    {
      "epoch": 0.06196120689655173,
      "grad_norm": 3.3139008810992046,
      "learning_rate": 6.182795698924732e-06,
      "loss": 0.7277,
      "step": 115
    },
    {
      "epoch": 0.06465517241379311,
      "grad_norm": 3.1747479144288455,
      "learning_rate": 6.451612903225806e-06,
      "loss": 0.7283,
      "step": 120
    },
    {
      "epoch": 0.06734913793103449,
      "grad_norm": 2.894519107469561,
      "learning_rate": 6.720430107526882e-06,
      "loss": 0.7282,
      "step": 125
    },
    {
      "epoch": 0.07004310344827586,
      "grad_norm": 2.8405180587913987,
      "learning_rate": 6.989247311827958e-06,
      "loss": 0.7123,
      "step": 130
    },
    {
      "epoch": 0.07273706896551724,
      "grad_norm": 2.7948188759602717,
      "learning_rate": 7.258064516129033e-06,
      "loss": 0.7193,
      "step": 135
    },
    {
      "epoch": 0.07543103448275862,
      "grad_norm": 3.154756842274138,
      "learning_rate": 7.526881720430108e-06,
      "loss": 0.7207,
      "step": 140
    },
    {
      "epoch": 0.078125,
      "grad_norm": 2.9457108929499207,
      "learning_rate": 7.795698924731183e-06,
      "loss": 0.7212,
      "step": 145
    },
    {
      "epoch": 0.08081896551724138,
      "grad_norm": 2.8503644648477517,
      "learning_rate": 8.064516129032258e-06,
      "loss": 0.72,
      "step": 150
    },
    {
      "epoch": 0.08351293103448276,
      "grad_norm": 2.949964251276019,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.723,
      "step": 155
    },
    {
      "epoch": 0.08620689655172414,
      "grad_norm": 2.959116036250926,
      "learning_rate": 8.602150537634409e-06,
      "loss": 0.7158,
      "step": 160
    },
    {
      "epoch": 0.08890086206896551,
      "grad_norm": 2.7803395603035517,
      "learning_rate": 8.870967741935484e-06,
      "loss": 0.7067,
      "step": 165
    },
    {
      "epoch": 0.09159482758620689,
      "grad_norm": 2.8799202670097115,
      "learning_rate": 9.13978494623656e-06,
      "loss": 0.71,
      "step": 170
    },
    {
      "epoch": 0.09428879310344827,
      "grad_norm": 2.9537594310040687,
      "learning_rate": 9.408602150537635e-06,
      "loss": 0.7152,
      "step": 175
    },
    {
      "epoch": 0.09698275862068965,
      "grad_norm": 2.8628517050727873,
      "learning_rate": 9.67741935483871e-06,
      "loss": 0.7054,
      "step": 180
    },
    {
      "epoch": 0.09967672413793104,
      "grad_norm": 2.8896943288351586,
      "learning_rate": 9.946236559139786e-06,
      "loss": 0.7235,
      "step": 185
    },
    {
      "epoch": 0.10237068965517242,
      "grad_norm": 2.938518709851193,
      "learning_rate": 9.999858445152838e-06,
      "loss": 0.7122,
      "step": 190
    },
    {
      "epoch": 0.1050646551724138,
      "grad_norm": 2.58690085015114,
      "learning_rate": 9.999283392323047e-06,
      "loss": 0.7061,
      "step": 195
    },
    {
      "epoch": 0.10775862068965517,
      "grad_norm": 2.763129396160507,
      "learning_rate": 9.998266045169356e-06,
      "loss": 0.7063,
      "step": 200
    },
    {
      "epoch": 0.11045258620689655,
      "grad_norm": 2.816275952414151,
      "learning_rate": 9.996806493698038e-06,
      "loss": 0.7087,
      "step": 205
    },
    {
      "epoch": 0.11314655172413793,
      "grad_norm": 2.73738463168911,
      "learning_rate": 9.994904867037867e-06,
      "loss": 0.6986,
      "step": 210
    },
    {
      "epoch": 0.11584051724137931,
      "grad_norm": 2.810575578616004,
      "learning_rate": 9.99256133342869e-06,
      "loss": 0.6929,
      "step": 215
    },
    {
      "epoch": 0.11853448275862069,
      "grad_norm": 2.6652685941669265,
      "learning_rate": 9.989776100206547e-06,
      "loss": 0.6898,
      "step": 220
    },
    {
      "epoch": 0.12122844827586207,
      "grad_norm": 2.7660230194471107,
      "learning_rate": 9.986549413785323e-06,
      "loss": 0.695,
      "step": 225
    },
    {
      "epoch": 0.12392241379310345,
      "grad_norm": 2.5553942202252466,
      "learning_rate": 9.982881559634946e-06,
      "loss": 0.7017,
      "step": 230
    },
    {
      "epoch": 0.12661637931034483,
      "grad_norm": 2.5245345530966192,
      "learning_rate": 9.978772862256145e-06,
      "loss": 0.6916,
      "step": 235
    },
    {
      "epoch": 0.12931034482758622,
      "grad_norm": 2.520167957976126,
      "learning_rate": 9.97422368515172e-06,
      "loss": 0.694,
      "step": 240
    },
    {
      "epoch": 0.1320043103448276,
      "grad_norm": 2.7125840301494706,
      "learning_rate": 9.969234430794395e-06,
      "loss": 0.6887,
      "step": 245
    },
    {
      "epoch": 0.13469827586206898,
      "grad_norm": 2.631424447595556,
      "learning_rate": 9.96380554059121e-06,
      "loss": 0.685,
      "step": 250
    },
    {
      "epoch": 0.13739224137931033,
      "grad_norm": 2.555021040773695,
      "learning_rate": 9.957937494844472e-06,
      "loss": 0.7004,
      "step": 255
    },
    {
      "epoch": 0.1400862068965517,
      "grad_norm": 2.539978410855113,
      "learning_rate": 9.951630812709245e-06,
      "loss": 0.6897,
      "step": 260
    },
    {
      "epoch": 0.1427801724137931,
      "grad_norm": 2.7494174109330842,
      "learning_rate": 9.944886052147445e-06,
      "loss": 0.6928,
      "step": 265
    },
    {
      "epoch": 0.14547413793103448,
      "grad_norm": 2.559956756758314,
      "learning_rate": 9.937703809878455e-06,
      "loss": 0.6813,
      "step": 270
    },
    {
      "epoch": 0.14816810344827586,
      "grad_norm": 2.525562445581053,
      "learning_rate": 9.930084721326342e-06,
      "loss": 0.6944,
      "step": 275
    },
    {
      "epoch": 0.15086206896551724,
      "grad_norm": 2.777619881263396,
      "learning_rate": 9.92202946056364e-06,
      "loss": 0.6745,
      "step": 280
    },
    {
      "epoch": 0.15355603448275862,
      "grad_norm": 2.4859789362282076,
      "learning_rate": 9.913538740251711e-06,
      "loss": 0.6527,
      "step": 285
    },
    {
      "epoch": 0.15625,
      "grad_norm": 2.4614571056065624,
      "learning_rate": 9.904613311577696e-06,
      "loss": 0.6673,
      "step": 290
    },
    {
      "epoch": 0.15894396551724138,
      "grad_norm": 2.503690727361147,
      "learning_rate": 9.895253964188056e-06,
      "loss": 0.6601,
      "step": 295
    },
    {
      "epoch": 0.16163793103448276,
      "grad_norm": 2.61491684131174,
      "learning_rate": 9.885461526118713e-06,
      "loss": 0.6629,
      "step": 300
    },
    {
      "epoch": 0.16433189655172414,
      "grad_norm": 2.563289578189323,
      "learning_rate": 9.875236863721788e-06,
      "loss": 0.6834,
      "step": 305
    },
    {
      "epoch": 0.16702586206896552,
      "grad_norm": 2.542961491155676,
      "learning_rate": 9.864580881588958e-06,
      "loss": 0.6634,
      "step": 310
    },
    {
      "epoch": 0.1697198275862069,
      "grad_norm": 2.5998608415854774,
      "learning_rate": 9.853494522471423e-06,
      "loss": 0.6564,
      "step": 315
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 2.580998138867243,
      "learning_rate": 9.841978767196495e-06,
      "loss": 0.6522,
      "step": 320
    },
    {
      "epoch": 0.17510775862068967,
      "grad_norm": 2.462022076166109,
      "learning_rate": 9.830034634580833e-06,
      "loss": 0.6575,
      "step": 325
    },
    {
      "epoch": 0.17780172413793102,
      "grad_norm": 2.641866987114795,
      "learning_rate": 9.8176631813403e-06,
      "loss": 0.6654,
      "step": 330
    },
    {
      "epoch": 0.1804956896551724,
      "grad_norm": 2.483224928563204,
      "learning_rate": 9.804865501996472e-06,
      "loss": 0.6687,
      "step": 335
    },
    {
      "epoch": 0.18318965517241378,
      "grad_norm": 2.6158710388060755,
      "learning_rate": 9.79164272877981e-06,
      "loss": 0.6606,
      "step": 340
    },
    {
      "epoch": 0.18588362068965517,
      "grad_norm": 2.6690109052148396,
      "learning_rate": 9.777996031529486e-06,
      "loss": 0.6587,
      "step": 345
    },
    {
      "epoch": 0.18857758620689655,
      "grad_norm": 2.5145797557443403,
      "learning_rate": 9.763926617589883e-06,
      "loss": 0.6455,
      "step": 350
    },
    {
      "epoch": 0.19127155172413793,
      "grad_norm": 2.34228188842774,
      "learning_rate": 9.749435731703786e-06,
      "loss": 0.6467,
      "step": 355
    },
    {
      "epoch": 0.1939655172413793,
      "grad_norm": 2.518236951767628,
      "learning_rate": 9.734524655902253e-06,
      "loss": 0.6651,
      "step": 360
    },
    {
      "epoch": 0.1966594827586207,
      "grad_norm": 2.3327366524820423,
      "learning_rate": 9.719194709391191e-06,
      "loss": 0.6527,
      "step": 365
    },
    {
      "epoch": 0.19935344827586207,
      "grad_norm": 2.6721928236725425,
      "learning_rate": 9.70344724843465e-06,
      "loss": 0.6471,
      "step": 370
    },
    {
      "epoch": 0.20204741379310345,
      "grad_norm": 2.512497207087126,
      "learning_rate": 9.687283666234823e-06,
      "loss": 0.6345,
      "step": 375
    },
    {
      "epoch": 0.20474137931034483,
      "grad_norm": 2.5381248307269106,
      "learning_rate": 9.670705392808796e-06,
      "loss": 0.6549,
      "step": 380
    },
    {
      "epoch": 0.20743534482758622,
      "grad_norm": 2.489609282435604,
      "learning_rate": 9.653713894862024e-06,
      "loss": 0.6287,
      "step": 385
    },
    {
      "epoch": 0.2101293103448276,
      "grad_norm": 2.4187969624820767,
      "learning_rate": 9.63631067565858e-06,
      "loss": 0.6372,
      "step": 390
    },
    {
      "epoch": 0.21282327586206898,
      "grad_norm": 2.378128543534024,
      "learning_rate": 9.618497274888147e-06,
      "loss": 0.6344,
      "step": 395
    },
    {
      "epoch": 0.21551724137931033,
      "grad_norm": 2.3554799136699383,
      "learning_rate": 9.600275268529809e-06,
      "loss": 0.632,
      "step": 400
    },
    {
      "epoch": 0.2182112068965517,
      "grad_norm": 2.9669359679831437,
      "learning_rate": 9.58164626871261e-06,
      "loss": 0.6409,
      "step": 405
    },
    {
      "epoch": 0.2209051724137931,
      "grad_norm": 2.510424077340063,
      "learning_rate": 9.562611923572944e-06,
      "loss": 0.6316,
      "step": 410
    },
    {
      "epoch": 0.22359913793103448,
      "grad_norm": 2.5067266793187843,
      "learning_rate": 9.543173917108725e-06,
      "loss": 0.6337,
      "step": 415
    },
    {
      "epoch": 0.22629310344827586,
      "grad_norm": 2.4014165442615627,
      "learning_rate": 9.523333969030413e-06,
      "loss": 0.6285,
      "step": 420
    },
    {
      "epoch": 0.22898706896551724,
      "grad_norm": 2.5503305669006266,
      "learning_rate": 9.503093834608856e-06,
      "loss": 0.6297,
      "step": 425
    },
    {
      "epoch": 0.23168103448275862,
      "grad_norm": 2.683370610867663,
      "learning_rate": 9.482455304520013e-06,
      "loss": 0.6222,
      "step": 430
    },
    {
      "epoch": 0.234375,
      "grad_norm": 2.3415254501274156,
      "learning_rate": 9.46142020468652e-06,
      "loss": 0.6181,
      "step": 435
    },
    {
      "epoch": 0.23706896551724138,
      "grad_norm": 2.4296203317167513,
      "learning_rate": 9.439990396116149e-06,
      "loss": 0.6191,
      "step": 440
    },
    {
      "epoch": 0.23976293103448276,
      "grad_norm": 2.4277540188724833,
      "learning_rate": 9.418167774737173e-06,
      "loss": 0.6218,
      "step": 445
    },
    {
      "epoch": 0.24245689655172414,
      "grad_norm": 2.594904022170311,
      "learning_rate": 9.395954271230606e-06,
      "loss": 0.622,
      "step": 450
    },
    {
      "epoch": 0.24515086206896552,
      "grad_norm": 2.347098862192039,
      "learning_rate": 9.373351850859417e-06,
      "loss": 0.6136,
      "step": 455
    },
    {
      "epoch": 0.2478448275862069,
      "grad_norm": 2.3928008650888204,
      "learning_rate": 9.350362513294652e-06,
      "loss": 0.6272,
      "step": 460
    },
    {
      "epoch": 0.2505387931034483,
      "grad_norm": 2.335542398750826,
      "learning_rate": 9.326988292438514e-06,
      "loss": 0.6245,
      "step": 465
    },
    {
      "epoch": 0.25323275862068967,
      "grad_norm": 2.3458410101982174,
      "learning_rate": 9.30323125624443e-06,
      "loss": 0.6176,
      "step": 470
    },
    {
      "epoch": 0.25592672413793105,
      "grad_norm": 2.5491037378725188,
      "learning_rate": 9.279093506534085e-06,
      "loss": 0.6039,
      "step": 475
    },
    {
      "epoch": 0.25862068965517243,
      "grad_norm": 2.35768113596503,
      "learning_rate": 9.254577178811482e-06,
      "loss": 0.6062,
      "step": 480
    },
    {
      "epoch": 0.2613146551724138,
      "grad_norm": 2.4427975704018072,
      "learning_rate": 9.229684442074005e-06,
      "loss": 0.6038,
      "step": 485
    },
    {
      "epoch": 0.2640086206896552,
      "grad_norm": 2.3518303928123183,
      "learning_rate": 9.204417498620521e-06,
      "loss": 0.6071,
      "step": 490
    },
    {
      "epoch": 0.2667025862068966,
      "grad_norm": 2.3978894249163285,
      "learning_rate": 9.178778583856552e-06,
      "loss": 0.6024,
      "step": 495
    },
    {
      "epoch": 0.26939655172413796,
      "grad_norm": 2.530047013657598,
      "learning_rate": 9.152769966096483e-06,
      "loss": 0.6028,
      "step": 500
    },
    {
      "epoch": 0.27209051724137934,
      "grad_norm": 2.4123317555719708,
      "learning_rate": 9.126393946362906e-06,
      "loss": 0.6083,
      "step": 505
    },
    {
      "epoch": 0.27478448275862066,
      "grad_norm": 2.4793056830777753,
      "learning_rate": 9.099652858183027e-06,
      "loss": 0.6051,
      "step": 510
    },
    {
      "epoch": 0.27747844827586204,
      "grad_norm": 2.372688897527012,
      "learning_rate": 9.072549067382225e-06,
      "loss": 0.6157,
      "step": 515
    },
    {
      "epoch": 0.2801724137931034,
      "grad_norm": 2.380240348074666,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.6073,
      "step": 520
    },
    {
      "epoch": 0.2828663793103448,
      "grad_norm": 2.545807161286919,
      "learning_rate": 9.017263001451518e-06,
      "loss": 0.5884,
      "step": 525
    },
    {
      "epoch": 0.2855603448275862,
      "grad_norm": 2.5935659051260824,
      "learning_rate": 8.989085617565261e-06,
      "loss": 0.5983,
      "step": 530
    },
    {
      "epoch": 0.28825431034482757,
      "grad_norm": 2.2548884783469836,
      "learning_rate": 8.960555313112646e-06,
      "loss": 0.5895,
      "step": 535
    },
    {
      "epoch": 0.29094827586206895,
      "grad_norm": 2.3534621434136533,
      "learning_rate": 8.93167461221378e-06,
      "loss": 0.5914,
      "step": 540
    },
    {
      "epoch": 0.29364224137931033,
      "grad_norm": 2.5336260688373495,
      "learning_rate": 8.902446069988878e-06,
      "loss": 0.5939,
      "step": 545
    },
    {
      "epoch": 0.2963362068965517,
      "grad_norm": 2.624683890197873,
      "learning_rate": 8.87287227233222e-06,
      "loss": 0.5836,
      "step": 550
    },
    {
      "epoch": 0.2990301724137931,
      "grad_norm": 2.3588318708883604,
      "learning_rate": 8.842955835683368e-06,
      "loss": 0.5786,
      "step": 555
    },
    {
      "epoch": 0.3017241379310345,
      "grad_norm": 2.501675897313923,
      "learning_rate": 8.812699406795683e-06,
      "loss": 0.5799,
      "step": 560
    },
    {
      "epoch": 0.30441810344827586,
      "grad_norm": 2.6078839400922424,
      "learning_rate": 8.78210566250216e-06,
      "loss": 0.5801,
      "step": 565
    },
    {
      "epoch": 0.30711206896551724,
      "grad_norm": 2.3496389383543135,
      "learning_rate": 8.751177309478618e-06,
      "loss": 0.5756,
      "step": 570
    },
    {
      "epoch": 0.3098060344827586,
      "grad_norm": 2.3002443057548727,
      "learning_rate": 8.71991708400422e-06,
      "loss": 0.5823,
      "step": 575
    },
    {
      "epoch": 0.3125,
      "grad_norm": 2.368311996486066,
      "learning_rate": 8.688327751719403e-06,
      "loss": 0.57,
      "step": 580
    },
    {
      "epoch": 0.3151939655172414,
      "grad_norm": 2.316476591326147,
      "learning_rate": 8.656412107381187e-06,
      "loss": 0.572,
      "step": 585
    },
    {
      "epoch": 0.31788793103448276,
      "grad_norm": 2.648056237571166,
      "learning_rate": 8.624172974615926e-06,
      "loss": 0.5759,
      "step": 590
    },
    {
      "epoch": 0.32058189655172414,
      "grad_norm": 2.5273275022283035,
      "learning_rate": 8.591613205669494e-06,
      "loss": 0.5751,
      "step": 595
    },
    {
      "epoch": 0.3232758620689655,
      "grad_norm": 2.3674743965920433,
      "learning_rate": 8.558735681154944e-06,
      "loss": 0.5525,
      "step": 600
    },
    {
      "epoch": 0.3259698275862069,
      "grad_norm": 2.334754085556647,
      "learning_rate": 8.525543309797653e-06,
      "loss": 0.5501,
      "step": 605
    },
    {
      "epoch": 0.3286637931034483,
      "grad_norm": 2.511690588702945,
      "learning_rate": 8.492039028177985e-06,
      "loss": 0.5703,
      "step": 610
    },
    {
      "epoch": 0.33135775862068967,
      "grad_norm": 2.41344799771138,
      "learning_rate": 8.458225800471492e-06,
      "loss": 0.5674,
      "step": 615
    },
    {
      "epoch": 0.33405172413793105,
      "grad_norm": 2.274991518802859,
      "learning_rate": 8.424106618186653e-06,
      "loss": 0.568,
      "step": 620
    },
    {
      "epoch": 0.33674568965517243,
      "grad_norm": 2.2914893865907375,
      "learning_rate": 8.389684499900231e-06,
      "loss": 0.5578,
      "step": 625
    },
    {
      "epoch": 0.3394396551724138,
      "grad_norm": 2.2271331744770175,
      "learning_rate": 8.354962490990202e-06,
      "loss": 0.554,
      "step": 630
    },
    {
      "epoch": 0.3421336206896552,
      "grad_norm": 2.346436964348071,
      "learning_rate": 8.319943663366325e-06,
      "loss": 0.5623,
      "step": 635
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 2.2365182629879707,
      "learning_rate": 8.284631115198371e-06,
      "loss": 0.5534,
      "step": 640
    },
    {
      "epoch": 0.34752155172413796,
      "grad_norm": 2.461241222937466,
      "learning_rate": 8.24902797064203e-06,
      "loss": 0.5564,
      "step": 645
    },
    {
      "epoch": 0.35021551724137934,
      "grad_norm": 2.442140982131872,
      "learning_rate": 8.213137379562486e-06,
      "loss": 0.5506,
      "step": 650
    },
    {
      "epoch": 0.35290948275862066,
      "grad_norm": 2.388325267487531,
      "learning_rate": 8.176962517255776e-06,
      "loss": 0.5531,
      "step": 655
    },
    {
      "epoch": 0.35560344827586204,
      "grad_norm": 2.398524248781268,
      "learning_rate": 8.140506584167845e-06,
      "loss": 0.5415,
      "step": 660
    },
    {
      "epoch": 0.3582974137931034,
      "grad_norm": 2.566763693618945,
      "learning_rate": 8.103772805611403e-06,
      "loss": 0.5616,
      "step": 665
    },
    {
      "epoch": 0.3609913793103448,
      "grad_norm": 2.3106768834034805,
      "learning_rate": 8.066764431480584e-06,
      "loss": 0.5328,
      "step": 670
    },
    {
      "epoch": 0.3636853448275862,
      "grad_norm": 2.2940366514378425,
      "learning_rate": 8.029484735963409e-06,
      "loss": 0.5452,
      "step": 675
    },
    {
      "epoch": 0.36637931034482757,
      "grad_norm": 2.4096028111246652,
      "learning_rate": 7.991937017252127e-06,
      "loss": 0.5448,
      "step": 680
    },
    {
      "epoch": 0.36907327586206895,
      "grad_norm": 2.450510234216877,
      "learning_rate": 7.95412459725141e-06,
      "loss": 0.5407,
      "step": 685
    },
    {
      "epoch": 0.37176724137931033,
      "grad_norm": 2.498635611862816,
      "learning_rate": 7.916050821284462e-06,
      "loss": 0.536,
      "step": 690
    },
    {
      "epoch": 0.3744612068965517,
      "grad_norm": 2.3384557737181306,
      "learning_rate": 7.877719057797055e-06,
      "loss": 0.5404,
      "step": 695
    },
    {
      "epoch": 0.3771551724137931,
      "grad_norm": 2.395634299723523,
      "learning_rate": 7.839132698059515e-06,
      "loss": 0.5469,
      "step": 700
    },
    {
      "epoch": 0.3798491379310345,
      "grad_norm": 2.528299315994187,
      "learning_rate": 7.800295155866688e-06,
      "loss": 0.5272,
      "step": 705
    },
    {
      "epoch": 0.38254310344827586,
      "grad_norm": 2.383516192036904,
      "learning_rate": 7.761209867235924e-06,
      "loss": 0.5495,
      "step": 710
    },
    {
      "epoch": 0.38523706896551724,
      "grad_norm": 2.3221638101603954,
      "learning_rate": 7.721880290103082e-06,
      "loss": 0.5517,
      "step": 715
    },
    {
      "epoch": 0.3879310344827586,
      "grad_norm": 2.451275702370551,
      "learning_rate": 7.6823099040166e-06,
      "loss": 0.5195,
      "step": 720
    },
    {
      "epoch": 0.390625,
      "grad_norm": 2.469988525493039,
      "learning_rate": 7.64250220982966e-06,
      "loss": 0.5151,
      "step": 725
    },
    {
      "epoch": 0.3933189655172414,
      "grad_norm": 2.4698654498618016,
      "learning_rate": 7.602460729390455e-06,
      "loss": 0.5296,
      "step": 730
    },
    {
      "epoch": 0.39601293103448276,
      "grad_norm": 2.433689149450146,
      "learning_rate": 7.562189005230609e-06,
      "loss": 0.5122,
      "step": 735
    },
    {
      "epoch": 0.39870689655172414,
      "grad_norm": 2.317764828643439,
      "learning_rate": 7.521690600251765e-06,
      "loss": 0.5389,
      "step": 740
    },
    {
      "epoch": 0.4014008620689655,
      "grad_norm": 2.3785211168925997,
      "learning_rate": 7.480969097410369e-06,
      "loss": 0.5342,
      "step": 745
    },
    {
      "epoch": 0.4040948275862069,
      "grad_norm": 2.352268614869421,
      "learning_rate": 7.4400280994006765e-06,
      "loss": 0.5222,
      "step": 750
    },
    {
      "epoch": 0.4067887931034483,
      "grad_norm": 2.3334817294609844,
      "learning_rate": 7.398871228336022e-06,
      "loss": 0.5148,
      "step": 755
    },
    {
      "epoch": 0.40948275862068967,
      "grad_norm": 2.2180745679186513,
      "learning_rate": 7.357502125428359e-06,
      "loss": 0.5269,
      "step": 760
    },
    {
      "epoch": 0.41217672413793105,
      "grad_norm": 2.4024098190438448,
      "learning_rate": 7.315924450666129e-06,
      "loss": 0.5252,
      "step": 765
    },
    {
      "epoch": 0.41487068965517243,
      "grad_norm": 2.4847050155908326,
      "learning_rate": 7.274141882490435e-06,
      "loss": 0.5215,
      "step": 770
    },
    {
      "epoch": 0.4175646551724138,
      "grad_norm": 2.3489603723016423,
      "learning_rate": 7.23215811746963e-06,
      "loss": 0.5331,
      "step": 775
    },
    {
      "epoch": 0.4202586206896552,
      "grad_norm": 2.3846378852084276,
      "learning_rate": 7.189976869972249e-06,
      "loss": 0.526,
      "step": 780
    },
    {
      "epoch": 0.4229525862068966,
      "grad_norm": 2.2721960920466087,
      "learning_rate": 7.147601871838419e-06,
      "loss": 0.5111,
      "step": 785
    },
    {
      "epoch": 0.42564655172413796,
      "grad_norm": 2.242972711736404,
      "learning_rate": 7.105036872049676e-06,
      "loss": 0.5079,
      "step": 790
    },
    {
      "epoch": 0.42834051724137934,
      "grad_norm": 2.5168627834860944,
      "learning_rate": 7.0622856363973e-06,
      "loss": 0.5037,
      "step": 795
    },
    {
      "epoch": 0.43103448275862066,
      "grad_norm": 2.3034024680284797,
      "learning_rate": 7.019351947149149e-06,
      "loss": 0.5037,
      "step": 800
    },
    {
      "epoch": 0.43372844827586204,
      "grad_norm": 2.3169182311354204,
      "learning_rate": 6.976239602715025e-06,
      "loss": 0.5244,
      "step": 805
    },
    {
      "epoch": 0.4364224137931034,
      "grad_norm": 2.342523099764779,
      "learning_rate": 6.932952417310634e-06,
      "loss": 0.4955,
      "step": 810
    },
    {
      "epoch": 0.4391163793103448,
      "grad_norm": 2.4079674615936213,
      "learning_rate": 6.889494220620135e-06,
      "loss": 0.5039,
      "step": 815
    },
    {
      "epoch": 0.4418103448275862,
      "grad_norm": 2.2705187143965704,
      "learning_rate": 6.8458688574573164e-06,
      "loss": 0.4921,
      "step": 820
    },
    {
      "epoch": 0.44450431034482757,
      "grad_norm": 2.3040634798061053,
      "learning_rate": 6.8020801874254425e-06,
      "loss": 0.4952,
      "step": 825
    },
    {
      "epoch": 0.44719827586206895,
      "grad_norm": 2.283780585980132,
      "learning_rate": 6.758132084575791e-06,
      "loss": 0.5204,
      "step": 830
    },
    {
      "epoch": 0.44989224137931033,
      "grad_norm": 2.2311658006536175,
      "learning_rate": 6.7140284370649015e-06,
      "loss": 0.5062,
      "step": 835
    },
    {
      "epoch": 0.4525862068965517,
      "grad_norm": 2.381000659447914,
      "learning_rate": 6.6697731468105985e-06,
      "loss": 0.5054,
      "step": 840
    },
    {
      "epoch": 0.4552801724137931,
      "grad_norm": 2.5645822620698295,
      "learning_rate": 6.625370129146771e-06,
      "loss": 0.4967,
      "step": 845
    },
    {
      "epoch": 0.4579741379310345,
      "grad_norm": 2.518018472550615,
      "learning_rate": 6.580823312476976e-06,
      "loss": 0.5057,
      "step": 850
    },
    {
      "epoch": 0.46066810344827586,
      "grad_norm": 2.3310109009449937,
      "learning_rate": 6.536136637926898e-06,
      "loss": 0.4923,
      "step": 855
    },
    {
      "epoch": 0.46336206896551724,
      "grad_norm": 2.4572949530360235,
      "learning_rate": 6.491314058995653e-06,
      "loss": 0.4923,
      "step": 860
    },
    {
      "epoch": 0.4660560344827586,
      "grad_norm": 2.333469399501826,
      "learning_rate": 6.446359541206042e-06,
      "loss": 0.4984,
      "step": 865
    },
    {
      "epoch": 0.46875,
      "grad_norm": 2.3170414009513287,
      "learning_rate": 6.401277061753689e-06,
      "loss": 0.4805,
      "step": 870
    },
    {
      "epoch": 0.4714439655172414,
      "grad_norm": 2.3105233267502068,
      "learning_rate": 6.356070609155188e-06,
      "loss": 0.4857,
      "step": 875
    },
    {
      "epoch": 0.47413793103448276,
      "grad_norm": 2.406900488225167,
      "learning_rate": 6.310744182895231e-06,
      "loss": 0.474,
      "step": 880
    },
    {
      "epoch": 0.47683189655172414,
      "grad_norm": 2.3233269304186246,
      "learning_rate": 6.265301793072762e-06,
      "loss": 0.4947,
      "step": 885
    },
    {
      "epoch": 0.4795258620689655,
      "grad_norm": 2.336797328678939,
      "learning_rate": 6.219747460046203e-06,
      "loss": 0.4771,
      "step": 890
    },
    {
      "epoch": 0.4822198275862069,
      "grad_norm": 2.3058756900360566,
      "learning_rate": 6.17408521407776e-06,
      "loss": 0.4791,
      "step": 895
    },
    {
      "epoch": 0.4849137931034483,
      "grad_norm": 2.467884893673803,
      "learning_rate": 6.128319094976869e-06,
      "loss": 0.492,
      "step": 900
    },
    {
      "epoch": 0.48760775862068967,
      "grad_norm": 2.3280199883273047,
      "learning_rate": 6.0824531517427765e-06,
      "loss": 0.4816,
      "step": 905
    },
    {
      "epoch": 0.49030172413793105,
      "grad_norm": 2.2642826853033053,
      "learning_rate": 6.03649144220633e-06,
      "loss": 0.4805,
      "step": 910
    },
    {
      "epoch": 0.49299568965517243,
      "grad_norm": 2.2845546468033007,
      "learning_rate": 5.990438032670968e-06,
      "loss": 0.4804,
      "step": 915
    },
    {
      "epoch": 0.4956896551724138,
      "grad_norm": 2.320099011292584,
      "learning_rate": 5.944296997552968e-06,
      "loss": 0.4807,
      "step": 920
    },
    {
      "epoch": 0.4983836206896552,
      "grad_norm": 2.4032671750639607,
      "learning_rate": 5.898072419020978e-06,
      "loss": 0.479,
      "step": 925
    },
    {
      "epoch": 0.5010775862068966,
      "grad_norm": 2.3454490179654948,
      "learning_rate": 5.851768386634863e-06,
      "loss": 0.4657,
      "step": 930
    },
    {
      "epoch": 0.5037715517241379,
      "grad_norm": 2.2272370976346707,
      "learning_rate": 5.805388996983891e-06,
      "loss": 0.4778,
      "step": 935
    },
    {
      "epoch": 0.5064655172413793,
      "grad_norm": 2.399429478516486,
      "learning_rate": 5.758938353324308e-06,
      "loss": 0.4766,
      "step": 940
    },
    {
      "epoch": 0.5091594827586207,
      "grad_norm": 2.2479225788941726,
      "learning_rate": 5.712420565216305e-06,
      "loss": 0.4689,
      "step": 945
    },
    {
      "epoch": 0.5118534482758621,
      "grad_norm": 2.333910684063406,
      "learning_rate": 5.66583974816045e-06,
      "loss": 0.4689,
      "step": 950
    },
    {
      "epoch": 0.5145474137931034,
      "grad_norm": 2.494414220923278,
      "learning_rate": 5.619200023233582e-06,
      "loss": 0.4654,
      "step": 955
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 2.4303474928270314,
      "learning_rate": 5.572505516724207e-06,
      "loss": 0.4841,
      "step": 960
    },
    {
      "epoch": 0.5199353448275862,
      "grad_norm": 2.3290300558522605,
      "learning_rate": 5.52576035976744e-06,
      "loss": 0.4631,
      "step": 965
    },
    {
      "epoch": 0.5226293103448276,
      "grad_norm": 2.303763077645539,
      "learning_rate": 5.478968687979527e-06,
      "loss": 0.4535,
      "step": 970
    },
    {
      "epoch": 0.525323275862069,
      "grad_norm": 2.3158015015015367,
      "learning_rate": 5.432134641091945e-06,
      "loss": 0.4653,
      "step": 975
    },
    {
      "epoch": 0.5280172413793104,
      "grad_norm": 2.412268625727716,
      "learning_rate": 5.3852623625851655e-06,
      "loss": 0.4553,
      "step": 980
    },
    {
      "epoch": 0.5307112068965517,
      "grad_norm": 2.4152646593142477,
      "learning_rate": 5.338355999322069e-06,
      "loss": 0.459,
      "step": 985
    },
    {
      "epoch": 0.5334051724137931,
      "grad_norm": 2.3009383932051186,
      "learning_rate": 5.291419701181069e-06,
      "loss": 0.4574,
      "step": 990
    },
    {
      "epoch": 0.5360991379310345,
      "grad_norm": 2.3404820672273683,
      "learning_rate": 5.244457620688962e-06,
      "loss": 0.4457,
      "step": 995
    },
    {
      "epoch": 0.5387931034482759,
      "grad_norm": 2.2918401803413277,
      "learning_rate": 5.197473912653549e-06,
      "loss": 0.4625,
      "step": 1000
    },
    {
      "epoch": 0.5414870689655172,
      "grad_norm": 2.330307145203118,
      "learning_rate": 5.150472733796053e-06,
      "loss": 0.4614,
      "step": 1005
    },
    {
      "epoch": 0.5441810344827587,
      "grad_norm": 2.317228108453964,
      "learning_rate": 5.103458242383371e-06,
      "loss": 0.4346,
      "step": 1010
    },
    {
      "epoch": 0.546875,
      "grad_norm": 2.246449210384358,
      "learning_rate": 5.056434597860176e-06,
      "loss": 0.4332,
      "step": 1015
    },
    {
      "epoch": 0.5495689655172413,
      "grad_norm": 2.2315633880832917,
      "learning_rate": 5.009405960480937e-06,
      "loss": 0.4374,
      "step": 1020
    },
    {
      "epoch": 0.5522629310344828,
      "grad_norm": 2.236917389881302,
      "learning_rate": 4.962376490941846e-06,
      "loss": 0.4443,
      "step": 1025
    },
    {
      "epoch": 0.5549568965517241,
      "grad_norm": 2.2257101057521953,
      "learning_rate": 4.915350350012714e-06,
      "loss": 0.4485,
      "step": 1030
    },
    {
      "epoch": 0.5576508620689655,
      "grad_norm": 2.2768475081245696,
      "learning_rate": 4.868331698168875e-06,
      "loss": 0.456,
      "step": 1035
    },
    {
      "epoch": 0.5603448275862069,
      "grad_norm": 2.2588873812858243,
      "learning_rate": 4.82132469522308e-06,
      "loss": 0.4531,
      "step": 1040
    },
    {
      "epoch": 0.5630387931034483,
      "grad_norm": 2.2517674521156414,
      "learning_rate": 4.774333499957488e-06,
      "loss": 0.4439,
      "step": 1045
    },
    {
      "epoch": 0.5657327586206896,
      "grad_norm": 2.3879681903493277,
      "learning_rate": 4.727362269755736e-06,
      "loss": 0.4507,
      "step": 1050
    },
    {
      "epoch": 0.568426724137931,
      "grad_norm": 2.2168932530530654,
      "learning_rate": 4.68041516023511e-06,
      "loss": 0.4436,
      "step": 1055
    },
    {
      "epoch": 0.5711206896551724,
      "grad_norm": 2.328909950607463,
      "learning_rate": 4.633496324878906e-06,
      "loss": 0.4408,
      "step": 1060
    },
    {
      "epoch": 0.5738146551724138,
      "grad_norm": 2.2564887174276183,
      "learning_rate": 4.586609914668963e-06,
      "loss": 0.4516,
      "step": 1065
    },
    {
      "epoch": 0.5765086206896551,
      "grad_norm": 2.2979177074885424,
      "learning_rate": 4.539760077718416e-06,
      "loss": 0.4389,
      "step": 1070
    },
    {
      "epoch": 0.5792025862068966,
      "grad_norm": 2.2933960847054515,
      "learning_rate": 4.492950958904707e-06,
      "loss": 0.4266,
      "step": 1075
    },
    {
      "epoch": 0.5818965517241379,
      "grad_norm": 2.2594325799250594,
      "learning_rate": 4.4461866995028776e-06,
      "loss": 0.427,
      "step": 1080
    },
    {
      "epoch": 0.5845905172413793,
      "grad_norm": 2.349659814217747,
      "learning_rate": 4.399471436819199e-06,
      "loss": 0.4346,
      "step": 1085
    },
    {
      "epoch": 0.5872844827586207,
      "grad_norm": 2.297930957947952,
      "learning_rate": 4.352809303825115e-06,
      "loss": 0.4279,
      "step": 1090
    },
    {
      "epoch": 0.5899784482758621,
      "grad_norm": 2.202712644399629,
      "learning_rate": 4.306204428791609e-06,
      "loss": 0.4291,
      "step": 1095
    },
    {
      "epoch": 0.5926724137931034,
      "grad_norm": 2.2128476870439813,
      "learning_rate": 4.259660934923965e-06,
      "loss": 0.44,
      "step": 1100
    },
    {
      "epoch": 0.5953663793103449,
      "grad_norm": 2.367627389505961,
      "learning_rate": 4.213182939996978e-06,
      "loss": 0.4379,
      "step": 1105
    },
    {
      "epoch": 0.5980603448275862,
      "grad_norm": 2.274117011259563,
      "learning_rate": 4.166774555990654e-06,
      "loss": 0.4344,
      "step": 1110
    },
    {
      "epoch": 0.6007543103448276,
      "grad_norm": 2.2261394360036983,
      "learning_rate": 4.120439888726407e-06,
      "loss": 0.4142,
      "step": 1115
    },
    {
      "epoch": 0.603448275862069,
      "grad_norm": 2.1852891937100436,
      "learning_rate": 4.074183037503827e-06,
      "loss": 0.4266,
      "step": 1120
    },
    {
      "epoch": 0.6061422413793104,
      "grad_norm": 2.3083672939605053,
      "learning_rate": 4.028008094737989e-06,
      "loss": 0.4394,
      "step": 1125
    },
    {
      "epoch": 0.6088362068965517,
      "grad_norm": 2.2610041056896963,
      "learning_rate": 3.981919145597404e-06,
      "loss": 0.4128,
      "step": 1130
    },
    {
      "epoch": 0.6115301724137931,
      "grad_norm": 2.19751146715402,
      "learning_rate": 3.935920267642592e-06,
      "loss": 0.4227,
      "step": 1135
    },
    {
      "epoch": 0.6142241379310345,
      "grad_norm": 2.3415136999781963,
      "learning_rate": 3.890015530465342e-06,
      "loss": 0.4133,
      "step": 1140
    },
    {
      "epoch": 0.6169181034482759,
      "grad_norm": 2.291673599344672,
      "learning_rate": 3.844208995328659e-06,
      "loss": 0.4192,
      "step": 1145
    },
    {
      "epoch": 0.6196120689655172,
      "grad_norm": 2.2459859353779508,
      "learning_rate": 3.7985047148074584e-06,
      "loss": 0.4257,
      "step": 1150
    },
    {
      "epoch": 0.6223060344827587,
      "grad_norm": 2.3753214874892072,
      "learning_rate": 3.75290673243004e-06,
      "loss": 0.421,
      "step": 1155
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.181100394703554,
      "learning_rate": 3.707419082320336e-06,
      "loss": 0.4287,
      "step": 1160
    },
    {
      "epoch": 0.6276939655172413,
      "grad_norm": 2.242465849693457,
      "learning_rate": 3.6620457888410143e-06,
      "loss": 0.4143,
      "step": 1165
    },
    {
      "epoch": 0.6303879310344828,
      "grad_norm": 2.3646959150338813,
      "learning_rate": 3.616790866237433e-06,
      "loss": 0.4045,
      "step": 1170
    },
    {
      "epoch": 0.6330818965517241,
      "grad_norm": 2.312802724452316,
      "learning_rate": 3.5716583182825023e-06,
      "loss": 0.4248,
      "step": 1175
    },
    {
      "epoch": 0.6357758620689655,
      "grad_norm": 2.208443511882899,
      "learning_rate": 3.5266521379224506e-06,
      "loss": 0.4135,
      "step": 1180
    },
    {
      "epoch": 0.6384698275862069,
      "grad_norm": 2.2774985396607046,
      "learning_rate": 3.4817763069235747e-06,
      "loss": 0.4028,
      "step": 1185
    },
    {
      "epoch": 0.6411637931034483,
      "grad_norm": 2.3080269121559898,
      "learning_rate": 3.4370347955199634e-06,
      "loss": 0.4086,
      "step": 1190
    },
    {
      "epoch": 0.6438577586206896,
      "grad_norm": 2.3130128907712355,
      "learning_rate": 3.392431562062238e-06,
      "loss": 0.408,
      "step": 1195
    },
    {
      "epoch": 0.646551724137931,
      "grad_norm": 2.2776700595089676,
      "learning_rate": 3.347970552667361e-06,
      "loss": 0.4159,
      "step": 1200
    },
    {
      "epoch": 0.6492456896551724,
      "grad_norm": 2.1524296489308576,
      "learning_rate": 3.303655700869507e-06,
      "loss": 0.4035,
      "step": 1205
    },
    {
      "epoch": 0.6519396551724138,
      "grad_norm": 2.2146294105038185,
      "learning_rate": 3.259490927272071e-06,
      "loss": 0.4012,
      "step": 1210
    },
    {
      "epoch": 0.6546336206896551,
      "grad_norm": 2.2480654104489752,
      "learning_rate": 3.2154801392007883e-06,
      "loss": 0.4153,
      "step": 1215
    },
    {
      "epoch": 0.6573275862068966,
      "grad_norm": 2.169871400965887,
      "learning_rate": 3.171627230358063e-06,
      "loss": 0.404,
      "step": 1220
    },
    {
      "epoch": 0.6600215517241379,
      "grad_norm": 2.4015866937415056,
      "learning_rate": 3.1279360804784785e-06,
      "loss": 0.4063,
      "step": 1225
    },
    {
      "epoch": 0.6627155172413793,
      "grad_norm": 2.3038799378482557,
      "learning_rate": 3.084410554985553e-06,
      "loss": 0.3898,
      "step": 1230
    },
    {
      "epoch": 0.6654094827586207,
      "grad_norm": 2.198625588166285,
      "learning_rate": 3.0410545046497553e-06,
      "loss": 0.4035,
      "step": 1235
    },
    {
      "epoch": 0.6681034482758621,
      "grad_norm": 2.1950219963512176,
      "learning_rate": 2.9978717652478343e-06,
      "loss": 0.3902,
      "step": 1240
    },
    {
      "epoch": 0.6707974137931034,
      "grad_norm": 2.247458718435766,
      "learning_rate": 2.954866157223445e-06,
      "loss": 0.4082,
      "step": 1245
    },
    {
      "epoch": 0.6734913793103449,
      "grad_norm": 2.2241261994844588,
      "learning_rate": 2.9120414853491574e-06,
      "loss": 0.404,
      "step": 1250
    },
    {
      "epoch": 0.6761853448275862,
      "grad_norm": 2.1606540598223103,
      "learning_rate": 2.86940153838984e-06,
      "loss": 0.3948,
      "step": 1255
    },
    {
      "epoch": 0.6788793103448276,
      "grad_norm": 2.0718054651873437,
      "learning_rate": 2.826950088767469e-06,
      "loss": 0.3927,
      "step": 1260
    },
    {
      "epoch": 0.681573275862069,
      "grad_norm": 2.227847088159035,
      "learning_rate": 2.784690892227363e-06,
      "loss": 0.3903,
      "step": 1265
    },
    {
      "epoch": 0.6842672413793104,
      "grad_norm": 2.207892303296737,
      "learning_rate": 2.7426276875059145e-06,
      "loss": 0.3955,
      "step": 1270
    },
    {
      "epoch": 0.6869612068965517,
      "grad_norm": 2.1465153515114093,
      "learning_rate": 2.700764195999819e-06,
      "loss": 0.3788,
      "step": 1275
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 2.223157201107058,
      "learning_rate": 2.6591041214368383e-06,
      "loss": 0.4053,
      "step": 1280
    },
    {
      "epoch": 0.6923491379310345,
      "grad_norm": 2.392548147708553,
      "learning_rate": 2.6176511495481172e-06,
      "loss": 0.3834,
      "step": 1285
    },
    {
      "epoch": 0.6950431034482759,
      "grad_norm": 2.059476074487736,
      "learning_rate": 2.5764089477421067e-06,
      "loss": 0.3857,
      "step": 1290
    },
    {
      "epoch": 0.6977370689655172,
      "grad_norm": 2.157455657651667,
      "learning_rate": 2.5353811647801107e-06,
      "loss": 0.3884,
      "step": 1295
    },
    {
      "epoch": 0.7004310344827587,
      "grad_norm": 2.307643086382308,
      "learning_rate": 2.4945714304534584e-06,
      "loss": 0.3815,
      "step": 1300
    },
    {
      "epoch": 0.703125,
      "grad_norm": 2.26315069416342,
      "learning_rate": 2.453983355262382e-06,
      "loss": 0.3865,
      "step": 1305
    },
    {
      "epoch": 0.7058189655172413,
      "grad_norm": 2.332313222729813,
      "learning_rate": 2.413620530096592e-06,
      "loss": 0.391,
      "step": 1310
    },
    {
      "epoch": 0.7085129310344828,
      "grad_norm": 2.1418117590999413,
      "learning_rate": 2.373486525917575e-06,
      "loss": 0.3912,
      "step": 1315
    },
    {
      "epoch": 0.7112068965517241,
      "grad_norm": 2.178180423311831,
      "learning_rate": 2.333584893442675e-06,
      "loss": 0.3854,
      "step": 1320
    },
    {
      "epoch": 0.7139008620689655,
      "grad_norm": 2.151591142836586,
      "learning_rate": 2.2939191628309482e-06,
      "loss": 0.3815,
      "step": 1325
    },
    {
      "epoch": 0.7165948275862069,
      "grad_norm": 2.1488408048158916,
      "learning_rate": 2.254492843370857e-06,
      "loss": 0.3741,
      "step": 1330
    },
    {
      "epoch": 0.7192887931034483,
      "grad_norm": 2.3225770656541624,
      "learning_rate": 2.2153094231697807e-06,
      "loss": 0.3865,
      "step": 1335
    },
    {
      "epoch": 0.7219827586206896,
      "grad_norm": 2.225461569667121,
      "learning_rate": 2.1763723688454297e-06,
      "loss": 0.389,
      "step": 1340
    },
    {
      "epoch": 0.724676724137931,
      "grad_norm": 2.310688191216032,
      "learning_rate": 2.1376851252191465e-06,
      "loss": 0.3905,
      "step": 1345
    },
    {
      "epoch": 0.7273706896551724,
      "grad_norm": 2.206817710811153,
      "learning_rate": 2.09925111501113e-06,
      "loss": 0.3705,
      "step": 1350
    },
    {
      "epoch": 0.7300646551724138,
      "grad_norm": 2.194541840528301,
      "learning_rate": 2.061073738537635e-06,
      "loss": 0.38,
      "step": 1355
    },
    {
      "epoch": 0.7327586206896551,
      "grad_norm": 2.1363777762782568,
      "learning_rate": 2.0231563734101245e-06,
      "loss": 0.3826,
      "step": 1360
    },
    {
      "epoch": 0.7354525862068966,
      "grad_norm": 2.043722143372559,
      "learning_rate": 1.9855023742364647e-06,
      "loss": 0.3722,
      "step": 1365
    },
    {
      "epoch": 0.7381465517241379,
      "grad_norm": 2.296022903294665,
      "learning_rate": 1.9481150723241236e-06,
      "loss": 0.3836,
      "step": 1370
    },
    {
      "epoch": 0.7408405172413793,
      "grad_norm": 2.1320085273295333,
      "learning_rate": 1.9109977753854496e-06,
      "loss": 0.367,
      "step": 1375
    },
    {
      "epoch": 0.7435344827586207,
      "grad_norm": 2.126131429150438,
      "learning_rate": 1.8741537672450406e-06,
      "loss": 0.3756,
      "step": 1380
    },
    {
      "epoch": 0.7462284482758621,
      "grad_norm": 2.3054341669665708,
      "learning_rate": 1.8375863075492062e-06,
      "loss": 0.3737,
      "step": 1385
    },
    {
      "epoch": 0.7489224137931034,
      "grad_norm": 2.3340813640902867,
      "learning_rate": 1.8012986314775888e-06,
      "loss": 0.3694,
      "step": 1390
    },
    {
      "epoch": 0.7516163793103449,
      "grad_norm": 2.1335614766566544,
      "learning_rate": 1.7652939494569428e-06,
      "loss": 0.3706,
      "step": 1395
    },
    {
      "epoch": 0.7543103448275862,
      "grad_norm": 2.135867482259856,
      "learning_rate": 1.7295754468771026e-06,
      "loss": 0.3826,
      "step": 1400
    },
    {
      "epoch": 0.7570043103448276,
      "grad_norm": 2.253239028561062,
      "learning_rate": 1.6941462838091643e-06,
      "loss": 0.3879,
      "step": 1405
    },
    {
      "epoch": 0.759698275862069,
      "grad_norm": 2.1899554008641613,
      "learning_rate": 1.6590095947259083e-06,
      "loss": 0.3657,
      "step": 1410
    },
    {
      "epoch": 0.7623922413793104,
      "grad_norm": 1.9335639886365577,
      "learning_rate": 1.6241684882244952e-06,
      "loss": 0.3647,
      "step": 1415
    },
    {
      "epoch": 0.7650862068965517,
      "grad_norm": 2.158271364922754,
      "learning_rate": 1.5896260467514335e-06,
      "loss": 0.3613,
      "step": 1420
    },
    {
      "epoch": 0.7677801724137931,
      "grad_norm": 2.283426548356461,
      "learning_rate": 1.5553853263298741e-06,
      "loss": 0.3804,
      "step": 1425
    },
    {
      "epoch": 0.7704741379310345,
      "grad_norm": 1.973245710047114,
      "learning_rate": 1.521449356289245e-06,
      "loss": 0.3616,
      "step": 1430
    },
    {
      "epoch": 0.7731681034482759,
      "grad_norm": 2.176003470736959,
      "learning_rate": 1.4878211389972369e-06,
      "loss": 0.3594,
      "step": 1435
    },
    {
      "epoch": 0.7758620689655172,
      "grad_norm": 2.350333157030792,
      "learning_rate": 1.454503649594176e-06,
      "loss": 0.3745,
      "step": 1440
    },
    {
      "epoch": 0.7785560344827587,
      "grad_norm": 2.1046600168472254,
      "learning_rate": 1.421499835729812e-06,
      "loss": 0.3614,
      "step": 1445
    },
    {
      "epoch": 0.78125,
      "grad_norm": 2.2403959550973376,
      "learning_rate": 1.3888126173025412e-06,
      "loss": 0.3667,
      "step": 1450
    },
    {
      "epoch": 0.7839439655172413,
      "grad_norm": 2.2036204076799244,
      "learning_rate": 1.3564448862010653e-06,
      "loss": 0.3719,
      "step": 1455
    },
    {
      "epoch": 0.7866379310344828,
      "grad_norm": 2.1004023468667223,
      "learning_rate": 1.3243995060485537e-06,
      "loss": 0.3609,
      "step": 1460
    },
    {
      "epoch": 0.7893318965517241,
      "grad_norm": 2.049485866619644,
      "learning_rate": 1.2926793119492848e-06,
      "loss": 0.3562,
      "step": 1465
    },
    {
      "epoch": 0.7920258620689655,
      "grad_norm": 2.2562907662057015,
      "learning_rate": 1.2612871102378305e-06,
      "loss": 0.3638,
      "step": 1470
    },
    {
      "epoch": 0.7947198275862069,
      "grad_norm": 2.0015131375954045,
      "learning_rate": 1.230225678230766e-06,
      "loss": 0.3523,
      "step": 1475
    },
    {
      "epoch": 0.7974137931034483,
      "grad_norm": 1.9761111123797053,
      "learning_rate": 1.1994977639809575e-06,
      "loss": 0.3605,
      "step": 1480
    },
    {
      "epoch": 0.8001077586206896,
      "grad_norm": 2.1818297029398916,
      "learning_rate": 1.169106086034446e-06,
      "loss": 0.369,
      "step": 1485
    },
    {
      "epoch": 0.802801724137931,
      "grad_norm": 2.2176123875649782,
      "learning_rate": 1.1390533331899235e-06,
      "loss": 0.359,
      "step": 1490
    },
    {
      "epoch": 0.8054956896551724,
      "grad_norm": 2.1415950875401952,
      "learning_rate": 1.109342164260853e-06,
      "loss": 0.365,
      "step": 1495
    },
    {
      "epoch": 0.8081896551724138,
      "grad_norm": 1.9579230862394106,
      "learning_rate": 1.079975207840247e-06,
      "loss": 0.3475,
      "step": 1500
    },
    {
      "epoch": 0.8108836206896551,
      "grad_norm": 1.9891326864430916,
      "learning_rate": 1.050955062068098e-06,
      "loss": 0.3636,
      "step": 1505
    },
    {
      "epoch": 0.8135775862068966,
      "grad_norm": 2.1589113372475826,
      "learning_rate": 1.0222842944015327e-06,
      "loss": 0.3637,
      "step": 1510
    },
    {
      "epoch": 0.8162715517241379,
      "grad_norm": 2.2093770653678817,
      "learning_rate": 9.939654413876493e-07,
      "loss": 0.3704,
      "step": 1515
    },
    {
      "epoch": 0.8189655172413793,
      "grad_norm": 2.117779906161616,
      "learning_rate": 9.660010084391197e-07,
      "loss": 0.3549,
      "step": 1520
    },
    {
      "epoch": 0.8216594827586207,
      "grad_norm": 2.2081164429406623,
      "learning_rate": 9.383934696125213e-07,
      "loss": 0.3637,
      "step": 1525
    },
    {
      "epoch": 0.8243534482758621,
      "grad_norm": 2.0797066327192915,
      "learning_rate": 9.111452673894589e-07,
      "loss": 0.355,
      "step": 1530
    },
    {
      "epoch": 0.8270474137931034,
      "grad_norm": 1.9884207565802496,
      "learning_rate": 8.842588124604695e-07,
      "loss": 0.3598,
      "step": 1535
    },
    {
      "epoch": 0.8297413793103449,
      "grad_norm": 1.9966503677289194,
      "learning_rate": 8.577364835117552e-07,
      "loss": 0.3503,
      "step": 1540
    },
    {
      "epoch": 0.8324353448275862,
      "grad_norm": 2.0974426601893006,
      "learning_rate": 8.315806270147237e-07,
      "loss": 0.3513,
      "step": 1545
    },
    {
      "epoch": 0.8351293103448276,
      "grad_norm": 2.0409953572157264,
      "learning_rate": 8.057935570184e-07,
      "loss": 0.353,
      "step": 1550
    },
    {
      "epoch": 0.837823275862069,
      "grad_norm": 2.05994767546201,
      "learning_rate": 7.803775549447017e-07,
      "loss": 0.3612,
      "step": 1555
    },
    {
      "epoch": 0.8405172413793104,
      "grad_norm": 1.9798689534701572,
      "learning_rate": 7.553348693865897e-07,
      "loss": 0.3433,
      "step": 1560
    },
    {
      "epoch": 0.8432112068965517,
      "grad_norm": 2.0314728151818557,
      "learning_rate": 7.306677159091385e-07,
      "loss": 0.3554,
      "step": 1565
    },
    {
      "epoch": 0.8459051724137931,
      "grad_norm": 2.1770521072409665,
      "learning_rate": 7.06378276853516e-07,
      "loss": 0.3434,
      "step": 1570
    },
    {
      "epoch": 0.8485991379310345,
      "grad_norm": 3.199094357987707,
      "learning_rate": 6.824687011439168e-07,
      "loss": 0.3555,
      "step": 1575
    },
    {
      "epoch": 0.8512931034482759,
      "grad_norm": 2.0350410942770267,
      "learning_rate": 6.589411040974369e-07,
      "loss": 0.3455,
      "step": 1580
    },
    {
      "epoch": 0.8539870689655172,
      "grad_norm": 2.0106939788979994,
      "learning_rate": 6.35797567236926e-07,
      "loss": 0.342,
      "step": 1585
    },
    {
      "epoch": 0.8566810344827587,
      "grad_norm": 2.0462922997663333,
      "learning_rate": 6.130401381068424e-07,
      "loss": 0.3484,
      "step": 1590
    },
    {
      "epoch": 0.859375,
      "grad_norm": 1.9989302742973973,
      "learning_rate": 5.906708300920916e-07,
      "loss": 0.358,
      "step": 1595
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 2.1421705464248997,
      "learning_rate": 5.686916222399069e-07,
      "loss": 0.3479,
      "step": 1600
    },
    {
      "epoch": 0.8647629310344828,
      "grad_norm": 1.8665911668349293,
      "learning_rate": 5.471044590847569e-07,
      "loss": 0.3485,
      "step": 1605
    },
    {
      "epoch": 0.8674568965517241,
      "grad_norm": 2.252328311927183,
      "learning_rate": 5.259112504763115e-07,
      "loss": 0.3537,
      "step": 1610
    },
    {
      "epoch": 0.8701508620689655,
      "grad_norm": 2.242291713625665,
      "learning_rate": 5.051138714104726e-07,
      "loss": 0.3493,
      "step": 1615
    },
    {
      "epoch": 0.8728448275862069,
      "grad_norm": 1.9256177965601142,
      "learning_rate": 4.847141618634899e-07,
      "loss": 0.346,
      "step": 1620
    },
    {
      "epoch": 0.8755387931034483,
      "grad_norm": 2.0978920858884806,
      "learning_rate": 4.647139266291789e-07,
      "loss": 0.3447,
      "step": 1625
    },
    {
      "epoch": 0.8782327586206896,
      "grad_norm": 2.1438665447656424,
      "learning_rate": 4.4511493515924373e-07,
      "loss": 0.3467,
      "step": 1630
    },
    {
      "epoch": 0.880926724137931,
      "grad_norm": 1.943275187391926,
      "learning_rate": 4.2591892140673383e-07,
      "loss": 0.359,
      "step": 1635
    },
    {
      "epoch": 0.8836206896551724,
      "grad_norm": 1.9691693184683765,
      "learning_rate": 4.0712758367263573e-07,
      "loss": 0.3453,
      "step": 1640
    },
    {
      "epoch": 0.8863146551724138,
      "grad_norm": 2.2550989234096432,
      "learning_rate": 3.8874258445562694e-07,
      "loss": 0.354,
      "step": 1645
    },
    {
      "epoch": 0.8890086206896551,
      "grad_norm": 1.9743645882114702,
      "learning_rate": 3.7076555030498505e-07,
      "loss": 0.3545,
      "step": 1650
    },
    {
      "epoch": 0.8917025862068966,
      "grad_norm": 2.069394313953148,
      "learning_rate": 3.531980716766914e-07,
      "loss": 0.3465,
      "step": 1655
    },
    {
      "epoch": 0.8943965517241379,
      "grad_norm": 2.084992853821571,
      "learning_rate": 3.3604170279271375e-07,
      "loss": 0.347,
      "step": 1660
    },
    {
      "epoch": 0.8970905172413793,
      "grad_norm": 2.028486932834069,
      "learning_rate": 3.1929796150351076e-07,
      "loss": 0.3385,
      "step": 1665
    },
    {
      "epoch": 0.8997844827586207,
      "grad_norm": 1.9042104552013777,
      "learning_rate": 3.02968329153735e-07,
      "loss": 0.3456,
      "step": 1670
    },
    {
      "epoch": 0.9024784482758621,
      "grad_norm": 2.138202184025318,
      "learning_rate": 2.870542504511864e-07,
      "loss": 0.3524,
      "step": 1675
    },
    {
      "epoch": 0.9051724137931034,
      "grad_norm": 2.0791032572613615,
      "learning_rate": 2.7155713333898826e-07,
      "loss": 0.3557,
      "step": 1680
    },
    {
      "epoch": 0.9078663793103449,
      "grad_norm": 2.032552582124559,
      "learning_rate": 2.564783488710293e-07,
      "loss": 0.3472,
      "step": 1685
    },
    {
      "epoch": 0.9105603448275862,
      "grad_norm": 2.0702198858374063,
      "learning_rate": 2.4181923109066254e-07,
      "loss": 0.3423,
      "step": 1690
    },
    {
      "epoch": 0.9132543103448276,
      "grad_norm": 2.223955152789369,
      "learning_rate": 2.2758107691268294e-07,
      "loss": 0.353,
      "step": 1695
    },
    {
      "epoch": 0.915948275862069,
      "grad_norm": 2.151000423198189,
| "learning_rate": 2.1376514600858212e-07, | |
| "loss": 0.3446, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9186422413793104, | |
| "grad_norm": 1.9722858881802758, | |
| "learning_rate": 2.003726606951084e-07, | |
| "loss": 0.3423, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.9213362068965517, | |
| "grad_norm": 2.152676598806774, | |
| "learning_rate": 1.874048058261252e-07, | |
| "loss": 0.3566, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9240301724137931, | |
| "grad_norm": 2.14241065854355, | |
| "learning_rate": 1.7486272868778299e-07, | |
| "loss": 0.3451, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.9267241379310345, | |
| "grad_norm": 1.9240645550272026, | |
| "learning_rate": 1.62747538897019e-07, | |
| "loss": 0.3526, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.9294181034482759, | |
| "grad_norm": 1.9864527165081682, | |
| "learning_rate": 1.5106030830338791e-07, | |
| "loss": 0.3414, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.9321120689655172, | |
| "grad_norm": 1.891840587890648, | |
| "learning_rate": 1.3980207089423326e-07, | |
| "loss": 0.3507, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.9348060344827587, | |
| "grad_norm": 2.197241548310695, | |
| "learning_rate": 1.2897382270320947e-07, | |
| "loss": 0.3415, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 2.1206142876832708, | |
| "learning_rate": 1.1857652172215905e-07, | |
| "loss": 0.3453, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.9401939655172413, | |
| "grad_norm": 2.0575425778092375, | |
| "learning_rate": 1.0861108781636099e-07, | |
| "loss": 0.3414, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.9428879310344828, | |
| "grad_norm": 2.067217232750268, | |
| "learning_rate": 9.907840264314572e-08, | |
| "loss": 0.3429, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.9455818965517241, | |
| "grad_norm": 2.08954775323305, | |
| "learning_rate": 8.997930957389433e-08, | |
| "loss": 0.3406, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.9482758620689655, | |
| "grad_norm": 2.0413104358527865, | |
| "learning_rate": 8.13146136194265e-08, | |
| "loss": 0.3544, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.9509698275862069, | |
| "grad_norm": 1.9504574949587095, | |
| "learning_rate": 7.308508135877745e-08, | |
| "loss": 0.3515, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.9536637931034483, | |
| "grad_norm": 2.0325177039467266, | |
| "learning_rate": 6.52914408713784e-08, | |
| "loss": 0.3422, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.9563577586206896, | |
| "grad_norm": 2.080402951454278, | |
| "learning_rate": 5.7934381672640206e-08, | |
| "loss": 0.3302, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.959051724137931, | |
| "grad_norm": 1.9103094146698458, | |
| "learning_rate": 5.101455465295557e-08, | |
| "loss": 0.3388, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.9617456896551724, | |
| "grad_norm": 2.0461617336274665, | |
| "learning_rate": 4.453257202011008e-08, | |
| "loss": 0.3437, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.9644396551724138, | |
| "grad_norm": 1.8955751541723638, | |
| "learning_rate": 3.848900724511828e-08, | |
| "loss": 0.3448, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.9671336206896551, | |
| "grad_norm": 1.8502858059698502, | |
| "learning_rate": 3.28843950114921e-08, | |
| "loss": 0.3318, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.9698275862068966, | |
| "grad_norm": 1.9634830726403167, | |
| "learning_rate": 2.771923116793307e-08, | |
| "loss": 0.3506, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9725215517241379, | |
| "grad_norm": 2.12551984941854, | |
| "learning_rate": 2.299397268446413e-08, | |
| "loss": 0.3425, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.9752155172413793, | |
| "grad_norm": 2.4278727464472136, | |
| "learning_rate": 1.8709037612003044e-08, | |
| "loss": 0.3471, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.9779094827586207, | |
| "grad_norm": 2.191866602098634, | |
| "learning_rate": 1.4864805045373687e-08, | |
| "loss": 0.3384, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.9806034482758621, | |
| "grad_norm": 2.128906961450063, | |
| "learning_rate": 1.1461615089770062e-08, | |
| "loss": 0.349, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.9832974137931034, | |
| "grad_norm": 2.0846719469136916, | |
| "learning_rate": 8.499768830663723e-09, | |
| "loss": 0.3357, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.9859913793103449, | |
| "grad_norm": 2.319036063763146, | |
| "learning_rate": 5.979528307168414e-09, | |
| "loss": 0.3402, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.9886853448275862, | |
| "grad_norm": 2.0237346794749858, | |
| "learning_rate": 3.901116488855827e-09, | |
| "loss": 0.3554, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.9913793103448276, | |
| "grad_norm": 2.007135214089839, | |
| "learning_rate": 2.264717256030835e-09, | |
| "loss": 0.3462, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.994073275862069, | |
| "grad_norm": 1.994084875393067, | |
| "learning_rate": 1.0704753834600567e-09, | |
| "loss": 0.3455, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.9967672413793104, | |
| "grad_norm": 2.1211709513233856, | |
| "learning_rate": 3.184965275676577e-10, | |
| "loss": 0.3438, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.9994612068965517, | |
| "grad_norm": 2.0397937800653443, | |
| "learning_rate": 8.847217084495541e-12, | |
| "loss": 0.3482, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_runtime": 3.3988, | |
| "eval_samples_per_second": 2.942, | |
| "eval_steps_per_second": 0.883, | |
| "step": 1856 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 1856, | |
| "total_flos": 194304320471040.0, | |
| "train_loss": 0.50882549257949, | |
| "train_runtime": 16510.7518, | |
| "train_samples_per_second": 1.799, | |
| "train_steps_per_second": 0.112 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1856, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 194304320471040.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |