diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,29815 +2,8017 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 117.000154, + "epoch": 0.034, "eval_steps": 1000, - "global_step": 382000, + "global_step": 31000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 2.7908170954292e-06, - "grad_norm": 13.12192153930664, - "learning_rate": 0.0, - "loss": 10.9656, + "epoch": 2e-06, + "grad_norm": 13.825847625732422, + "learning_rate": 9.118382907149165e-05, + "loss": 2.5112, "step": 1 }, { - "epoch": 0.00027908170954291995, - "grad_norm": 9.881576538085938, - "learning_rate": 1.3811383928571428e-06, - "loss": 10.7287, - "step": 100 + "epoch": 0.000198, + "loss_gen": 7.338496208190918, + "loss_rtd": 0.3364008367061615, + "loss_sent": 0.6368644833564758, + "loss_sod": 0.48023903369903564, + "loss_total": 1.6714577674865723, + "step": 99 }, { - "epoch": 0.0005581634190858399, - "grad_norm": 2.9879977703094482, - "learning_rate": 2.7762276785714284e-06, - "loss": 9.8264, - "step": 200 + "epoch": 0.000198, + "loss_gen": 7.39376163482666, + "loss_rtd": 0.3249678909778595, + "loss_sent": 0.6036056876182556, + "loss_sod": 0.6825383901596069, + "loss_total": 1.8307068347930908, + "step": 99 }, { - "epoch": 0.0008372451286287599, - "grad_norm": 2.5677335262298584, - "learning_rate": 4.171316964285715e-06, - "loss": 9.2477, - "step": 300 + "epoch": 0.0002, + "grad_norm": 2.552236795425415, + "learning_rate": 9.116600623227749e-05, + "loss": 1.6687, + "step": 100 }, { - "epoch": 0.0011163268381716798, - "grad_norm": 2.4358127117156982, - "learning_rate": 5.56640625e-06, - "loss": 8.6932, - "step": 400 + "epoch": 0.000398, + "loss_gen": 7.468382358551025, + "loss_rtd": 0.34856757521629333, + "loss_sent": 0.24916481971740723, + "loss_sod": 0.5723617672920227, + "loss_total": 1.6159565448760986, + "step": 199 }, { - "epoch": 0.0013954085477146, - "grad_norm": 2.0098073482513428, - "learning_rate": 6.9614955357142865e-06, - "loss": 8.0634, - "step": 500 + "epoch": 0.000398, + "loss_gen": 7.193928241729736, + "loss_rtd": 0.3576851189136505, + "loss_sent": 0.08575951308012009, + "loss_sod": 0.5566340684890747, + "loss_total": 1.4295562505722046, + "step": 199 }, { - "epoch": 0.0016744902572575198, - "grad_norm": 2.2659668922424316, - "learning_rate": 8.356584821428571e-06, - "loss": 7.4637, - "step": 600 + "epoch": 0.0004, + "grad_norm": 4.255260467529297, + "learning_rate": 9.11479868656544e-05, + "loss": 1.6235, + "step": 200 }, { - "epoch": 0.00195357196680044, - "grad_norm": 4.478851318359375, - "learning_rate": 9.751674107142858e-06, - "loss": 6.9922, - "step": 700 + "epoch": 0.000598, + "loss_gen": 6.456632137298584, + "loss_rtd": 0.3662375509738922, + "loss_sent": 0.05986621230840683, + "loss_sod": 0.5555340647697449, + "loss_total": 1.5607976913452148, + "step": 299 }, { - "epoch": 0.0022326536763433596, - "grad_norm": 4.981658458709717, - "learning_rate": 1.1146763392857142e-05, - "loss": 6.6324, - "step": 800 + "epoch": 0.000598, + "loss_gen": 6.634022235870361, + "loss_rtd": 0.3693581521511078, + "loss_sent": 0.4121516942977905, + "loss_sod": 0.30377814173698425, + "loss_total": 1.6803597211837769, + "step": 299 }, { - "epoch": 0.0025117353858862797, - "grad_norm": 7.247958660125732, - "learning_rate": 1.2541852678571428e-05, - "loss": 6.3859, - "step": 900 + "epoch": 0.0006, + "grad_norm": 3.2695043087005615, + "learning_rate": 9.11299509246271e-05, + "loss": 1.5732, + "step": 300 }, { - "epoch": 0.0027908170954292, - "grad_norm": 5.4013261795043945, - "learning_rate": 1.3936941964285715e-05, - "loss": 6.1855, - "step": 1000 + "epoch": 0.000798, + "loss_gen": 6.009239196777344, + "loss_rtd": 0.3889811038970947, + "loss_sent": 0.05189042165875435, + "loss_sod": 0.22444681823253632, + "loss_total": 1.3846241235733032, + "step": 399 }, { - "epoch": 0.0027908170954292, - "eval_loss": 6.096754550933838, - "eval_runtime": 51.5887, - "eval_samples_per_second": 197.601, - "eval_steps_per_second": 1.551, - "step": 1000 + "epoch": 0.000798, + "loss_gen": 6.323911666870117, + "loss_rtd": 0.3717780113220215, + "loss_sent": 0.47785434126853943, + "loss_sod": 0.27205729484558105, + "loss_total": 1.878661870956421, + "step": 399 }, { - "epoch": 0.00306989880497212, - "grad_norm": 7.10856819152832, - "learning_rate": 1.5332031250000002e-05, - "loss": 6.022, - "step": 1100 + "epoch": 0.0008, + "grad_norm": 1.4569134712219238, + "learning_rate": 9.111189841646048e-05, + "loss": 1.6297, + "step": 400 }, { - "epoch": 0.0033489805145150396, - "grad_norm": 5.756898880004883, - "learning_rate": 1.6727120535714284e-05, - "loss": 5.8669, - "step": 1200 + "epoch": 0.000998, + "loss_gen": 5.7959980964660645, + "loss_rtd": 0.41233113408088684, + "loss_sent": 0.06202005594968796, + "loss_sod": 0.3987869620323181, + "loss_total": 1.7407991886138916, + "step": 499 }, { - "epoch": 0.0036280622240579597, - "grad_norm": 5.455397605895996, - "learning_rate": 1.8122209821428573e-05, - "loss": 5.7152, - "step": 1300 + "epoch": 0.000998, + "loss_gen": 5.973730564117432, + "loss_rtd": 0.3940364122390747, + "loss_sent": 0.20922060310840607, + "loss_sod": 0.1169593557715416, + "loss_total": 1.6144838333129883, + "step": 499 }, { - "epoch": 0.00390714393360088, - "grad_norm": 5.5224175453186035, - "learning_rate": 1.951729910714286e-05, - "loss": 5.5815, - "step": 1400 + "epoch": 0.001, + "grad_norm": 1.5367815494537354, + "learning_rate": 9.109382934842612e-05, + "loss": 1.731, + "step": 500 }, { - "epoch": 0.0041862256431437995, - "grad_norm": 5.809144020080566, - "learning_rate": 2.0912388392857144e-05, - "loss": 5.4534, - "step": 1500 + "epoch": 0.001198, + "loss_gen": 5.828492164611816, + "loss_rtd": 0.39731845259666443, + "loss_sent": 0.15445075929164886, + "loss_sod": 0.08247369527816772, + "loss_total": 1.681622862815857, + "step": 599 }, { - "epoch": 0.004465307352686719, - "grad_norm": 4.571648120880127, - "learning_rate": 2.230747767857143e-05, - "loss": 5.3257, - "step": 1600 + "epoch": 0.001198, + "loss_gen": 5.8320417404174805, + "loss_rtd": 0.40642160177230835, + "loss_sent": 0.28684261441230774, + "loss_sod": 0.1827951818704605, + "loss_total": 1.9240772724151611, + "step": 599 }, { - "epoch": 0.00474438906222964, - "grad_norm": 5.078343868255615, - "learning_rate": 2.3702566964285715e-05, - "loss": 5.1922, - "step": 1700 + "epoch": 0.0012, + "grad_norm": 1.504315733909607, + "learning_rate": 9.10757437278022e-05, + "loss": 1.874, + "step": 600 }, { - "epoch": 0.005023470771772559, - "grad_norm": 4.566476821899414, - "learning_rate": 2.509765625e-05, - "loss": 5.0766, - "step": 1800 + "epoch": 0.001398, + "loss_gen": 6.005682945251465, + "loss_rtd": 0.40744271874427795, + "loss_sent": 0.12264258414506912, + "loss_sod": 0.1687479019165039, + "loss_total": 1.9582250118255615, + "step": 699 }, { - "epoch": 0.00530255248131548, - "grad_norm": 4.724873065948486, - "learning_rate": 2.6492745535714286e-05, - "loss": 4.9649, - "step": 1900 + "epoch": 0.001398, + "loss_gen": 5.766935348510742, + "loss_rtd": 0.4119797646999359, + "loss_sent": 0.10859901458024979, + "loss_sod": 0.2178047150373459, + "loss_total": 1.9477099180221558, + "step": 699 }, { - "epoch": 0.0055816341908584, - "grad_norm": 3.901001453399658, - "learning_rate": 2.788783482142857e-05, - "loss": 4.8461, - "step": 2000 + "epoch": 0.0014, + "grad_norm": 1.2932411432266235, + "learning_rate": 9.105764156187362e-05, + "loss": 1.9748, + "step": 700 }, { - "epoch": 0.0055816341908584, - "eval_loss": 4.796176910400391, - "eval_runtime": 51.038, - "eval_samples_per_second": 199.734, - "eval_steps_per_second": 1.567, - "step": 2000 + "epoch": 0.001598, + "loss_gen": 5.579442024230957, + "loss_rtd": 0.4019645154476166, + "loss_sent": 0.15888626873493195, + "loss_sod": 0.09037375450134277, + "loss_total": 1.9886168241500854, + "step": 799 }, { - "epoch": 0.005860715900401319, - "grad_norm": 3.6539864540100098, - "learning_rate": 2.9282924107142857e-05, - "loss": 4.7684, - "step": 2100 + "epoch": 0.001598, + "loss_gen": 5.165345668792725, + "loss_rtd": 0.42980027198791504, + "loss_sent": 0.12720005214214325, + "loss_sod": 0.2558518648147583, + "loss_total": 2.05098557472229, + "step": 799 }, { - "epoch": 0.00613979760994424, - "grad_norm": 3.753115653991699, - "learning_rate": 3.067801339285715e-05, - "loss": 4.6727, - "step": 2200 + "epoch": 0.0016, + "grad_norm": 0.9107739329338074, + "learning_rate": 9.103952285793193e-05, + "loss": 2.1122, + "step": 800 }, { - "epoch": 0.0064188793194871595, - "grad_norm": 4.209320545196533, - "learning_rate": 3.207310267857143e-05, - "loss": 4.5833, - "step": 2300 + "epoch": 0.001798, + "loss_gen": 5.492117404937744, + "loss_rtd": 0.39895787835121155, + "loss_sent": 0.18793721497058868, + "loss_sod": 0.1065969318151474, + "loss_total": 2.174715995788574, + "step": 899 }, { - "epoch": 0.006697961029030079, - "grad_norm": 3.7540743350982666, - "learning_rate": 3.3468191964285714e-05, - "loss": 4.5171, - "step": 2400 + "epoch": 0.001798, + "loss_gen": 5.479669570922852, + "loss_rtd": 0.39764881134033203, + "loss_sent": 0.1811528503894806, + "loss_sod": 0.1471906304359436, + "loss_total": 2.2038590908050537, + "step": 899 }, { - "epoch": 0.006977042738573, - "grad_norm": 3.203145742416382, - "learning_rate": 3.486328125e-05, - "loss": 4.4384, - "step": 2500 + "epoch": 0.0018, + "grad_norm": 1.1011357307434082, + "learning_rate": 9.102138762327534e-05, + "loss": 2.2494, + "step": 900 }, { - "epoch": 0.0072561244481159195, - "grad_norm": 3.7786471843719482, - "learning_rate": 3.6258370535714285e-05, - "loss": 4.3796, - "step": 2600 + "epoch": 0.001998, + "loss_gen": 5.582163333892822, + "loss_rtd": 0.4162678122520447, + "loss_sent": 0.5247966051101685, + "loss_sod": 0.19601702690124512, + "loss_total": 2.810055732727051, + "step": 999 }, { - "epoch": 0.007535206157658839, - "grad_norm": 3.3760221004486084, - "learning_rate": 3.7653459821428574e-05, - "loss": 4.3006, - "step": 2700 + "epoch": 0.001998, + "loss_gen": 5.055418491363525, + "loss_rtd": 0.4222857356071472, + "loss_sent": 0.23062478005886078, + "loss_sod": 0.21447616815567017, + "loss_total": 2.382495403289795, + "step": 999 }, { - "epoch": 0.00781428786720176, - "grad_norm": 3.218376636505127, - "learning_rate": 3.9048549107142856e-05, - "loss": 4.2447, - "step": 2800 + "epoch": 0.002, + "grad_norm": 2.1947152614593506, + "learning_rate": 9.100323586520871e-05, + "loss": 2.3633, + "step": 1000 }, { - "epoch": 0.00809336957674468, - "grad_norm": 3.5393011569976807, - "learning_rate": 4.0443638392857145e-05, - "loss": 4.1994, - "step": 2900 + "epoch": 0.002, + "eval_loss": 2.3820672035217285, + "eval_runtime": 152.1395, + "eval_samples_per_second": 101.506, + "eval_steps_per_second": 0.795, + "step": 1000 }, { - "epoch": 0.008372451286287599, - "grad_norm": 3.0938923358917236, - "learning_rate": 4.1838727678571433e-05, - "loss": 4.1581, - "step": 3000 + "epoch": 0.002198, + "loss_gen": 5.590832233428955, + "loss_rtd": 0.44405338168144226, + "loss_sent": 0.17209087312221527, + "loss_sod": 0.13748066127300262, + "loss_total": 2.4303739070892334, + "step": 1099 }, { - "epoch": 0.008372451286287599, - "eval_loss": 4.10986852645874, - "eval_runtime": 51.1232, - "eval_samples_per_second": 199.401, - "eval_steps_per_second": 1.565, - "step": 3000 + "epoch": 0.002198, + "loss_gen": 5.535982608795166, + "loss_rtd": 0.436458945274353, + "loss_sent": 0.4832025170326233, + "loss_sod": 0.1516181230545044, + "loss_total": 2.731578826904297, + "step": 1099 }, { - "epoch": 0.008651532995830519, - "grad_norm": 3.75787353515625, - "learning_rate": 4.3233816964285716e-05, - "loss": 4.0955, - "step": 3100 + "epoch": 0.0022, + "grad_norm": 1.503093957901001, + "learning_rate": 9.09850675910436e-05, + "loss": 2.4442, + "step": 1100 }, { - "epoch": 0.008930614705373438, - "grad_norm": 3.1666648387908936, - "learning_rate": 4.4628906250000004e-05, - "loss": 4.0524, - "step": 3200 + "epoch": 0.002398, + "loss_gen": 5.088871955871582, + "loss_rtd": 0.4277195334434509, + "loss_sent": 0.1527119278907776, + "loss_sod": 0.07266837358474731, + "loss_total": 2.177920341491699, + "step": 1199 }, { - "epoch": 0.00920969641491636, - "grad_norm": 3.065993070602417, - "learning_rate": 4.6023995535714287e-05, - "loss": 3.9998, - "step": 3300 + "epoch": 0.002398, + "loss_gen": 5.475919246673584, + "loss_rtd": 0.4493253827095032, + "loss_sent": 0.2745739817619324, + "loss_sod": 0.11778494715690613, + "loss_total": 2.4824793338775635, + "step": 1199 }, { - "epoch": 0.00948877812445928, - "grad_norm": 3.1383607387542725, - "learning_rate": 4.7419084821428575e-05, - "loss": 3.9697, - "step": 3400 + "epoch": 0.0024, + "grad_norm": 1.0774881839752197, + "learning_rate": 9.096688280809814e-05, + "loss": 2.3916, + "step": 1200 }, { - "epoch": 0.0097678598340022, - "grad_norm": 3.024055242538452, - "learning_rate": 4.881417410714286e-05, - "loss": 3.9265, - "step": 3500 + "epoch": 0.002598, + "loss_gen": 5.362448215484619, + "loss_rtd": 0.43012556433677673, + "loss_sent": 0.19409601390361786, + "loss_sod": 0.058121513575315475, + "loss_total": 2.286700487136841, + "step": 1299 }, { - "epoch": 0.010046941543545119, - "grad_norm": 2.9224233627319336, - "learning_rate": 4.9999999779409584e-05, - "loss": 3.8761, - "step": 3600 + "epoch": 0.002598, + "loss_gen": 5.386651515960693, + "loss_rtd": 0.4165177345275879, + "loss_sent": 0.30741289258003235, + "loss_sod": 0.15456417202949524, + "loss_total": 2.490093469619751, + "step": 1299 }, { - "epoch": 0.010326023253088039, - "grad_norm": 2.803724765777588, - "learning_rate": 4.9999987034186534e-05, - "loss": 3.8359, - "step": 3700 + "epoch": 0.0026, + "grad_norm": 0.9983309507369995, + "learning_rate": 9.09486815236972e-05, + "loss": 2.3624, + "step": 1300 }, { - "epoch": 0.01060510496263096, - "grad_norm": 3.1602187156677246, - "learning_rate": 4.9999954680937745e-05, - "loss": 3.8042, - "step": 3800 + "epoch": 0.002798, + "loss_gen": 5.062756061553955, + "loss_rtd": 0.4479227364063263, + "loss_sent": 0.1385871171951294, + "loss_sod": 0.06484868377447128, + "loss_total": 2.1628317832946777, + "step": 1399 }, { - "epoch": 0.01088418667217388, - "grad_norm": 2.560638904571533, - "learning_rate": 4.9999902719688585e-05, - "loss": 3.7615, - "step": 3900 + "epoch": 0.002798, + "loss_gen": 5.175561904907227, + "loss_rtd": 0.4320078492164612, + "loss_sent": 0.23488734662532806, + "loss_sod": 0.12735715508460999, + "loss_total": 2.3394033908843994, + "step": 1399 }, { - "epoch": 0.0111632683817168, - "grad_norm": 2.6802313327789307, - "learning_rate": 4.999983115047983e-05, - "loss": 3.7277, - "step": 4000 + "epoch": 0.0028, + "grad_norm": 1.7671256065368652, + "learning_rate": 9.093046374517224e-05, + "loss": 2.325, + "step": 1400 }, { - "epoch": 0.0111632683817168, - "eval_loss": 3.714073419570923, - "eval_runtime": 51.2605, - "eval_samples_per_second": 198.867, - "eval_steps_per_second": 1.561, - "step": 4000 + "epoch": 0.002998, + "loss_gen": 5.061497688293457, + "loss_rtd": 0.41403764486312866, + "loss_sent": 0.0569935068488121, + "loss_sod": 0.05051300302147865, + "loss_total": 2.0285050868988037, + "step": 1499 }, { - "epoch": 0.011442350091259719, - "grad_norm": 2.718958854675293, - "learning_rate": 4.999973997336759e-05, - "loss": 3.6956, - "step": 4100 + "epoch": 0.002998, + "loss_gen": 5.155898571014404, + "loss_rtd": 0.42731085419654846, + "loss_sent": 0.583581268787384, + "loss_sod": 0.1655517816543579, + "loss_total": 2.7115108966827393, + "step": 1499 }, { - "epoch": 0.011721431800802639, - "grad_norm": 2.873365879058838, - "learning_rate": 4.999962918842338e-05, - "loss": 3.6612, - "step": 4200 + "epoch": 0.003, + "grad_norm": 1.6871683597564697, + "learning_rate": 9.091222947986137e-05, + "loss": 2.3133, + "step": 1500 }, { - "epoch": 0.012000513510345558, - "grad_norm": 2.890824317932129, - "learning_rate": 4.9999498795734114e-05, - "loss": 3.6473, - "step": 4300 + "epoch": 0.003198, + "loss_gen": 5.307708263397217, + "loss_rtd": 0.4442152976989746, + "loss_sent": 0.12124097347259521, + "loss_sod": 0.15964221954345703, + "loss_total": 2.3000707626342773, + "step": 1599 }, { - "epoch": 0.01227959521988848, - "grad_norm": 2.821676731109619, - "learning_rate": 4.999934879540203e-05, - "loss": 3.6056, - "step": 4400 + "epoch": 0.003198, + "loss_gen": 5.3126912117004395, + "loss_rtd": 0.42704111337661743, + "loss_sent": 0.34546709060668945, + "loss_sod": 0.13138636946678162, + "loss_total": 2.4803454875946045, + "step": 1599 }, { - "epoch": 0.0125586769294314, - "grad_norm": 2.426208972930908, - "learning_rate": 4.99991791875448e-05, - "loss": 3.5991, - "step": 4500 + "epoch": 0.0032, + "grad_norm": 1.4612807035446167, + "learning_rate": 9.089397873510937e-05, + "loss": 2.2816, + "step": 1600 }, { - "epoch": 0.012837758638974319, - "grad_norm": 2.584897518157959, - "learning_rate": 4.999898997229545e-05, - "loss": 3.5615, - "step": 4600 + "epoch": 0.003398, + "loss_gen": 4.37318229675293, + "loss_rtd": 0.4542223811149597, + "loss_sent": 0.06731607019901276, + "loss_sod": 0.14954794943332672, + "loss_total": 1.963611125946045, + "step": 1699 }, { - "epoch": 0.013116840348517239, - "grad_norm": 2.6380631923675537, - "learning_rate": 4.9998781149802365e-05, - "loss": 3.5295, - "step": 4700 + "epoch": 0.003398, + "loss_gen": 5.005198001861572, + "loss_rtd": 0.4181549549102783, + "loss_sent": 0.19707848131656647, + "loss_sod": 0.04773856699466705, + "loss_total": 2.1422934532165527, + "step": 1699 }, { - "epoch": 0.013395922058060158, - "grad_norm": 2.4087016582489014, - "learning_rate": 4.999855272022935e-05, - "loss": 3.516, - "step": 4800 + "epoch": 0.0034, + "grad_norm": 1.238786220550537, + "learning_rate": 9.087571151826762e-05, + "loss": 2.2543, + "step": 1700 }, { - "epoch": 0.013675003767603078, - "grad_norm": 2.611582040786743, - "learning_rate": 4.999830468375556e-05, - "loss": 3.4883, - "step": 4900 + "epoch": 0.003598, + "loss_gen": 4.819798469543457, + "loss_rtd": 0.4338776171207428, + "loss_sent": 0.1732659637928009, + "loss_sod": 0.07842444628477097, + "loss_total": 2.1035704612731934, + "step": 1799 }, { - "epoch": 0.013954085477146, - "grad_norm": 2.8757617473602295, - "learning_rate": 4.999803704057553e-05, - "loss": 3.4691, - "step": 5000 + "epoch": 0.003598, + "loss_gen": 4.023968696594238, + "loss_rtd": 0.46381649374961853, + "loss_sent": 0.0971527025103569, + "loss_sod": 0.27688997983932495, + "loss_total": 2.021725654602051, + "step": 1799 }, { - "epoch": 0.013954085477146, - "eval_loss": 3.4387500286102295, - "eval_runtime": 51.2725, - "eval_samples_per_second": 198.82, - "eval_steps_per_second": 1.56, - "step": 5000 + "epoch": 0.0036, + "grad_norm": 1.0480766296386719, + "learning_rate": 9.085742783669415e-05, + "loss": 2.2141, + "step": 1800 }, { - "epoch": 0.01423316718668892, - "grad_norm": 2.399738073348999, - "learning_rate": 4.9997749790899184e-05, - "loss": 3.427, - "step": 5100 + "epoch": 0.003798, + "loss_gen": 5.054291725158691, + "loss_rtd": 0.4202824532985687, + "loss_sent": 0.09639989584684372, + "loss_sod": 0.0953429564833641, + "loss_total": 2.0912883281707764, + "step": 1899 }, { - "epoch": 0.014512248896231839, - "grad_norm": 2.7125585079193115, - "learning_rate": 4.999744293495182e-05, - "loss": 3.4346, - "step": 5200 + "epoch": 0.003798, + "loss_gen": 4.769758701324463, + "loss_rtd": 0.43368253111839294, + "loss_sent": 0.12541694939136505, + "loss_sod": 0.14418333768844604, + "loss_total": 2.0992703437805176, + "step": 1899 }, { - "epoch": 0.014791330605774759, - "grad_norm": 2.672027587890625, - "learning_rate": 4.999711647297412e-05, - "loss": 3.4339, - "step": 5300 + "epoch": 0.0038, + "grad_norm": 1.2859623432159424, + "learning_rate": 9.083912769775365e-05, + "loss": 2.1992, + "step": 1900 }, { - "epoch": 0.015070412315317678, - "grad_norm": 2.661180019378662, - "learning_rate": 4.999677040522211e-05, - "loss": 3.3783, - "step": 5400 + "epoch": 0.003998, + "loss_gen": 5.237246990203857, + "loss_rtd": 0.41872021555900574, + "loss_sent": 0.06527635455131531, + "loss_sod": 0.1806328445672989, + "loss_total": 2.188520669937134, + "step": 1999 }, { - "epoch": 0.015349494024860598, - "grad_norm": 2.44797420501709, - "learning_rate": 4.9996404731967234e-05, - "loss": 3.3507, - "step": 5500 + "epoch": 0.003998, + "loss_gen": 4.804346561431885, + "loss_rtd": 0.4427565336227417, + "loss_sent": 0.37472474575042725, + "loss_sod": 0.07424280792474747, + "loss_total": 2.2896533012390137, + "step": 1999 }, { - "epoch": 0.01562857573440352, - "grad_norm": 3.1140902042388916, - "learning_rate": 4.99960194534963e-05, - "loss": 3.3896, - "step": 5600 + "epoch": 0.004, + "grad_norm": 1.116328477859497, + "learning_rate": 9.082081110881737e-05, + "loss": 2.1936, + "step": 2000 }, { - "epoch": 0.015907657443946437, - "grad_norm": 2.3039517402648926, - "learning_rate": 4.999561457011148e-05, - "loss": 3.5312, - "step": 5700 + "epoch": 0.004, + "eval_loss": 2.1267459392547607, + "eval_runtime": 151.0606, + "eval_samples_per_second": 102.23, + "eval_steps_per_second": 0.801, + "step": 2000 }, { - "epoch": 0.01618673915348936, - "grad_norm": 2.9061505794525146, - "learning_rate": 4.999519008213035e-05, - "loss": 3.5034, - "step": 5800 + "epoch": 0.004198, + "loss_gen": 4.939090728759766, + "loss_rtd": 0.406046062707901, + "loss_sent": 0.177232027053833, + "loss_sod": 0.07092248648405075, + "loss_total": 2.082077980041504, + "step": 2099 }, { - "epoch": 0.01646582086303228, - "grad_norm": 2.226949691772461, - "learning_rate": 4.999474598988583e-05, - "loss": 3.4697, - "step": 5900 + "epoch": 0.004198, + "loss_gen": 5.136044025421143, + "loss_rtd": 0.4234750270843506, + "loss_sent": 0.49980977177619934, + "loss_sod": 0.10405763983726501, + "loss_total": 2.512158155441284, + "step": 2099 }, { - "epoch": 0.016744902572575198, - "grad_norm": 2.87302565574646, - "learning_rate": 4.9994282293726226e-05, - "loss": 3.4468, - "step": 6000 + "epoch": 0.0042, + "grad_norm": 1.969171404838562, + "learning_rate": 9.080247807726327e-05, + "loss": 2.1834, + "step": 2100 }, { - "epoch": 0.016744902572575198, - "eval_loss": 3.3136606216430664, - "eval_runtime": 51.5034, - "eval_samples_per_second": 197.929, - "eval_steps_per_second": 1.553, - "step": 6000 + "epoch": 0.004398, + "loss_gen": 5.003271102905273, + "loss_rtd": 0.4190343916416168, + "loss_sent": 0.21502840518951416, + "loss_sod": 0.1662188321352005, + "loss_total": 2.2364859580993652, + "step": 2199 }, { - "epoch": 0.01702398428211812, - "grad_norm": 2.1791470050811768, - "learning_rate": 4.999379899401524e-05, - "loss": 3.4389, - "step": 6100 + "epoch": 0.004398, + "loss_gen": 3.9248173236846924, + "loss_rtd": 0.46104612946510315, + "loss_sent": 0.043037645518779755, + "loss_sod": 0.11977896094322205, + "loss_total": 1.7504937648773193, + "step": 2199 }, { - "epoch": 0.017303065991661037, - "grad_norm": 2.3793013095855713, - "learning_rate": 4.9993296091131926e-05, - "loss": 3.3977, - "step": 6200 + "epoch": 0.0044, + "grad_norm": 0.8276557922363281, + "learning_rate": 9.07841286104759e-05, + "loss": 2.1231, + "step": 2200 }, { - "epoch": 0.01758214770120396, - "grad_norm": 2.6156604290008545, - "learning_rate": 4.999277358547073e-05, - "loss": 3.3978, - "step": 6300 + "epoch": 0.004598, + "loss_gen": 3.965472459793091, + "loss_rtd": 0.467626690864563, + "loss_sent": 0.049993958324193954, + "loss_sod": 0.29348552227020264, + "loss_total": 1.9406394958496094, + "step": 2299 }, { - "epoch": 0.017861229410746877, - "grad_norm": 2.4849677085876465, - "learning_rate": 4.9992231477441454e-05, - "loss": 3.3905, - "step": 6400 + "epoch": 0.004598, + "loss_gen": 4.1308698654174805, + "loss_rtd": 0.44145962595939636, + "loss_sent": 0.0027684078086167574, + "loss_sod": 0.40908950567245483, + "loss_total": 2.0299630165100098, + "step": 2299 }, { - "epoch": 0.018140311120289798, - "grad_norm": 2.5651509761810303, - "learning_rate": 4.9991669767469285e-05, - "loss": 3.3651, - "step": 6500 + "epoch": 0.0046, + "grad_norm": 2.2544002532958984, + "learning_rate": 9.076576271584638e-05, + "loss": 2.151, + "step": 2300 }, { - "epoch": 0.01841939282983272, - "grad_norm": 2.676013469696045, - "learning_rate": 4.99910884559948e-05, - "loss": 3.3486, - "step": 6600 + "epoch": 0.004798, + "loss_gen": 4.784628868103027, + "loss_rtd": 0.43977904319763184, + "loss_sent": 0.47826358675956726, + "loss_sod": 0.14754629135131836, + "loss_total": 2.4170873165130615, + "step": 2399 }, { - "epoch": 0.018698474539375638, - "grad_norm": 2.618835926055908, - "learning_rate": 4.999048754347391e-05, - "loss": 3.3289, - "step": 6700 + "epoch": 0.004798, + "loss_gen": 4.764978885650635, + "loss_rtd": 0.44556763768196106, + "loss_sent": 0.28284481167793274, + "loss_sod": 0.039426274597644806, + "loss_total": 2.1137866973876953, + "step": 2399 }, { - "epoch": 0.01897755624891856, - "grad_norm": 2.771819591522217, - "learning_rate": 4.998986703037793e-05, - "loss": 3.3371, - "step": 6800 + "epoch": 0.0048, + "grad_norm": 3.047312021255493, + "learning_rate": 9.074738040077253e-05, + "loss": 2.1187, + "step": 2400 }, { - "epoch": 0.019256637958461477, - "grad_norm": 2.562443733215332, - "learning_rate": 4.9989226917193564e-05, - "loss": 3.3101, - "step": 6900 + "epoch": 0.004998, + "loss_gen": 4.629762172698975, + "loss_rtd": 0.4105084240436554, + "loss_sent": 0.4007233679294586, + "loss_sod": 0.04058845341205597, + "loss_total": 2.147829532623291, + "step": 2499 }, { - "epoch": 0.0195357196680044, - "grad_norm": 2.585446357727051, - "learning_rate": 4.9988567204422844e-05, - "loss": 3.2926, - "step": 7000 + "epoch": 0.004998, + "loss_gen": 4.5216569900512695, + "loss_rtd": 0.43388718366622925, + "loss_sent": 0.10170675814151764, + "loss_sod": 0.08774755895137787, + "loss_total": 1.8890888690948486, + "step": 2499 }, { - "epoch": 0.0195357196680044, - "eval_loss": 3.241293430328369, - "eval_runtime": 51.6342, - "eval_samples_per_second": 197.427, - "eval_steps_per_second": 1.549, - "step": 7000 + "epoch": 0.005, + "grad_norm": 1.293394684791565, + "learning_rate": 9.07289816726587e-05, + "loss": 2.1094, + "step": 2500 }, { - "epoch": 0.01981480137754732, - "grad_norm": 2.1645052433013916, - "learning_rate": 4.998788789258321e-05, - "loss": 3.2866, - "step": 7100 + "epoch": 0.005198, + "loss_gen": 4.740551948547363, + "loss_rtd": 0.4234953224658966, + "loss_sent": 0.2437671273946762, + "loss_sod": 0.13527914881706238, + "loss_total": 2.116788148880005, + "step": 2599 }, { - "epoch": 0.020093883087090238, - "grad_norm": 2.2520089149475098, - "learning_rate": 4.998718898220744e-05, - "loss": 3.2748, - "step": 7200 + "epoch": 0.005198, + "loss_gen": 4.820707321166992, + "loss_rtd": 0.4365331828594208, + "loss_sent": 0.18041883409023285, + "loss_sod": 0.07415901869535446, + "loss_total": 2.0275795459747314, + "step": 2599 }, { - "epoch": 0.02037296479663316, - "grad_norm": 2.62365460395813, - "learning_rate": 4.998647047384373e-05, - "loss": 3.2634, - "step": 7300 + "epoch": 0.0052, + "grad_norm": 1.3240649700164795, + "learning_rate": 9.071056653891595e-05, + "loss": 2.0402, + "step": 2600 }, { - "epoch": 0.020652046506176077, - "grad_norm": 2.539928913116455, - "learning_rate": 4.99857323680556e-05, - "loss": 3.2459, - "step": 7400 + "epoch": 0.005398, + "loss_gen": 4.781034469604492, + "loss_rtd": 0.426374226808548, + "loss_sent": 0.32308676838874817, + "loss_sod": 0.16155104339122772, + "loss_total": 2.222855806350708, + "step": 2699 }, { - "epoch": 0.020931128215719, - "grad_norm": 2.6277120113372803, - "learning_rate": 4.9984974665421974e-05, - "loss": 3.2469, - "step": 7500 + "epoch": 0.005398, + "loss_gen": 5.135765075683594, + "loss_rtd": 0.3981669843196869, + "loss_sent": 0.21877646446228027, + "loss_sod": 0.18226270377635956, + "loss_total": 2.2083826065063477, + "step": 2699 }, { - "epoch": 0.02121020992526192, - "grad_norm": 2.4832637310028076, - "learning_rate": 4.9984197366537137e-05, - "loss": 3.2147, - "step": 7600 + "epoch": 0.0054, + "grad_norm": 1.8904129266738892, + "learning_rate": 9.069213500696186e-05, + "loss": 2.0394, + "step": 2700 }, { - "epoch": 0.021489291634804838, - "grad_norm": 2.6693716049194336, - "learning_rate": 4.9983400472010736e-05, - "loss": 3.2117, - "step": 7700 + "epoch": 0.005598, + "loss_gen": 3.7506027221679688, + "loss_rtd": 0.4511221647262573, + "loss_sent": 0.028750889003276825, + "loss_sod": 0.32733532786369324, + "loss_total": 1.825059413909912, + "step": 2799 }, { - "epoch": 0.02176837334434776, - "grad_norm": 2.1815237998962402, - "learning_rate": 4.998258398246778e-05, - "loss": 3.2072, - "step": 7800 + "epoch": 0.005598, + "loss_gen": 3.439668655395508, + "loss_rtd": 0.4633672833442688, + "loss_sent": 0.000510736252181232, + "loss_sod": 0.4404051601886749, + "loss_total": 1.8377517461776733, + "step": 2799 }, { - "epoch": 0.022047455053890677, - "grad_norm": 2.225245237350464, - "learning_rate": 4.998174789854869e-05, - "loss": 3.2105, - "step": 7900 + "epoch": 0.0056, + "grad_norm": 0.7309032678604126, + "learning_rate": 9.067368708422066e-05, + "loss": 2.0247, + "step": 2800 }, { - "epoch": 0.0223265367634336, - "grad_norm": 2.040609836578369, - "learning_rate": 4.9980892220909194e-05, - "loss": 3.2022, - "step": 8000 + "epoch": 0.005798, + "loss_gen": 4.358363628387451, + "loss_rtd": 0.435396283864975, + "loss_sent": 0.1706308275461197, + "loss_sod": 0.02801288664340973, + "loss_total": 1.8031005859375, + "step": 2899 }, { - "epoch": 0.0223265367634336, - "eval_loss": 3.160486936569214, - "eval_runtime": 51.5394, - "eval_samples_per_second": 197.79, - "eval_steps_per_second": 1.552, - "step": 8000 + "epoch": 0.005798, + "loss_gen": 4.692056179046631, + "loss_rtd": 0.4350062608718872, + "loss_sent": 0.135645791888237, + "loss_sod": 0.07625914365053177, + "loss_total": 1.9054793119430542, + "step": 2899 }, { - "epoch": 0.022605618472976517, - "grad_norm": 1.8777775764465332, - "learning_rate": 4.9980016950220435e-05, - "loss": 3.4064, - "step": 8100 + "epoch": 0.0058, + "grad_norm": 1.024571180343628, + "learning_rate": 9.065522277812314e-05, + "loss": 1.9753, + "step": 2900 }, { - "epoch": 0.022884700182519438, - "grad_norm": 2.6112220287323, - "learning_rate": 4.9979122087168903e-05, - "loss": 3.351, - "step": 8200 + "epoch": 0.005998, + "loss_gen": 3.3283891677856445, + "loss_rtd": 0.4622822403907776, + "loss_sent": 0.042274124920368195, + "loss_sod": 0.08875666558742523, + "loss_total": 1.4751375913619995, + "step": 2999 }, { - "epoch": 0.02316378189206236, - "grad_norm": 2.346705913543701, - "learning_rate": 4.9978207632456464e-05, - "loss": 3.3498, - "step": 8300 + "epoch": 0.005998, + "loss_gen": 3.772102117538452, + "loss_rtd": 0.471868634223938, + "loss_sent": 0.03946878761053085, + "loss_sod": 0.13368847966194153, + "loss_total": 1.6444077491760254, + "step": 2999 }, { - "epoch": 0.023442863601605277, - "grad_norm": 2.55951189994812, - "learning_rate": 4.9977273586800336e-05, - "loss": 3.3247, - "step": 8400 + "epoch": 0.006, + "grad_norm": 0.7709165215492249, + "learning_rate": 9.063674209610678e-05, + "loss": 1.95, + "step": 3000 }, { - "epoch": 0.0237219453111482, - "grad_norm": 2.5219528675079346, - "learning_rate": 4.99763199509331e-05, - "loss": 3.3002, - "step": 8500 + "epoch": 0.006, + "eval_loss": 1.9189565181732178, + "eval_runtime": 151.1801, + "eval_samples_per_second": 102.15, + "eval_steps_per_second": 0.8, + "step": 3000 }, { - "epoch": 0.024001027020691117, - "grad_norm": 2.726630926132202, - "learning_rate": 4.997534672560274e-05, - "loss": 3.2826, - "step": 8600 + "epoch": 0.006198, + "loss_gen": 4.625843048095703, + "loss_rtd": 0.41452756524086, + "loss_sent": 0.42125964164733887, + "loss_sod": 0.04516763985157013, + "loss_total": 2.090644121170044, + "step": 3099 }, { - "epoch": 0.024280108730234038, - "grad_norm": 2.3680546283721924, - "learning_rate": 4.997435391157256e-05, - "loss": 3.2899, - "step": 8700 + "epoch": 0.006198, + "loss_gen": 4.561415672302246, + "loss_rtd": 0.4227144420146942, + "loss_sent": 0.24549300968647003, + "loss_sod": 0.09064850211143494, + "loss_total": 1.9516971111297607, + "step": 3099 }, { - "epoch": 0.02455919043977696, - "grad_norm": 2.168916702270508, - "learning_rate": 4.9973341509621246e-05, - "loss": 3.2514, - "step": 8800 + "epoch": 0.0062, + "grad_norm": 1.147400975227356, + "learning_rate": 9.061824504561555e-05, + "loss": 1.9453, + "step": 3100 }, { - "epoch": 0.024838272149319877, - "grad_norm": 2.8766160011291504, - "learning_rate": 4.997230952054285e-05, - "loss": 3.2714, - "step": 8900 + "epoch": 0.006398, + "loss_gen": 5.0961737632751465, + "loss_rtd": 0.4299827516078949, + "loss_sent": 0.2821832001209259, + "loss_sod": 0.23717480897903442, + "loss_total": 2.263834238052368, + "step": 3199 }, { - "epoch": 0.0251173538588628, - "grad_norm": 2.4274535179138184, - "learning_rate": 4.997125794514679e-05, - "loss": 3.2574, - "step": 9000 + "epoch": 0.006398, + "loss_gen": 4.924869060516357, + "loss_rtd": 0.4275287985801697, + "loss_sent": 0.1261586993932724, + "loss_sod": 0.17284032702445984, + "loss_total": 1.996835470199585, + "step": 3199 }, { - "epoch": 0.0251173538588628, - "eval_loss": 3.084817886352539, - "eval_runtime": 51.7822, - "eval_samples_per_second": 196.863, - "eval_steps_per_second": 1.545, - "step": 9000 + "epoch": 0.0064, + "grad_norm": 1.0568593740463257, + "learning_rate": 9.059973163410007e-05, + "loss": 1.9416, + "step": 3200 }, { - "epoch": 0.025396435568405717, - "grad_norm": 2.0968375205993652, - "learning_rate": 4.9970186784257824e-05, - "loss": 3.2289, - "step": 9100 + "epoch": 0.006598, + "loss_gen": 4.659168243408203, + "loss_rtd": 0.42212942242622375, + "loss_sent": 0.27326828241348267, + "loss_sod": 0.15088553726673126, + "loss_total": 2.030813455581665, + "step": 3299 }, { - "epoch": 0.025675517277948638, - "grad_norm": 2.216857671737671, - "learning_rate": 4.99690960387161e-05, - "loss": 3.2337, - "step": 9200 + "epoch": 0.006598, + "loss_gen": 4.539935111999512, + "loss_rtd": 0.4461780786514282, + "loss_sent": 0.3518986105918884, + "loss_sod": 0.0318928137421608, + "loss_total": 1.9841861724853516, + "step": 3299 }, { - "epoch": 0.025954598987491556, - "grad_norm": 1.9617056846618652, - "learning_rate": 4.996798570937711e-05, - "loss": 3.2173, - "step": 9300 + "epoch": 0.0066, + "grad_norm": 2.5101399421691895, + "learning_rate": 9.058120186901755e-05, + "loss": 1.8857, + "step": 3300 }, { - "epoch": 0.026233680697034478, - "grad_norm": 2.227616548538208, - "learning_rate": 4.99668557971117e-05, - "loss": 3.1942, - "step": 9400 + "epoch": 0.006798, + "loss_gen": 4.367181777954102, + "loss_rtd": 0.4400678873062134, + "loss_sent": 0.1957220435142517, + "loss_sod": 0.11265156418085098, + "loss_total": 1.8420209884643555, + "step": 3399 }, { - "epoch": 0.0265127624065774, - "grad_norm": 2.8554770946502686, - "learning_rate": 4.996570630280609e-05, - "loss": 3.1955, - "step": 9500 + "epoch": 0.006798, + "loss_gen": 4.619999408721924, + "loss_rtd": 0.4316225051879883, + "loss_sent": 0.1329963207244873, + "loss_sod": 0.11254145950078964, + "loss_total": 1.8340474367141724, + "step": 3399 }, { - "epoch": 0.026791844116120317, - "grad_norm": 2.183323383331299, - "learning_rate": 4.996453722736186e-05, - "loss": 3.1807, - "step": 9600 + "epoch": 0.0068, + "grad_norm": 1.8781721591949463, + "learning_rate": 9.056265575783176e-05, + "loss": 1.8745, + "step": 3400 }, { - "epoch": 0.02707092582566324, - "grad_norm": 2.538712978363037, - "learning_rate": 4.996334857169593e-05, - "loss": 3.1745, - "step": 9700 + "epoch": 0.006998, + "loss_gen": 4.546133518218994, + "loss_rtd": 0.437592476606369, + "loss_sent": 0.38809189200401306, + "loss_sod": 0.14976780116558075, + "loss_total": 2.095884084701538, + "step": 3499 }, { - "epoch": 0.027350007535206156, - "grad_norm": 2.2385454177856445, - "learning_rate": 4.9962140336740594e-05, - "loss": 3.1722, - "step": 9800 + "epoch": 0.006998, + "loss_gen": 4.594987869262695, + "loss_rtd": 0.43721577525138855, + "loss_sent": 0.3743051588535309, + "loss_sod": 0.022005265578627586, + "loss_total": 1.965998888015747, + "step": 3499 }, { - "epoch": 0.027629089244749078, - "grad_norm": 2.5218403339385986, - "learning_rate": 4.9960912523443496e-05, - "loss": 3.1654, - "step": 9900 + "epoch": 0.007, + "grad_norm": 3.105059862136841, + "learning_rate": 9.05440933080131e-05, + "loss": 1.855, + "step": 3500 }, { - "epoch": 0.027908170954292, - "grad_norm": 1.8915151357650757, - "learning_rate": 4.9959665132767644e-05, - "loss": 3.1463, - "step": 10000 + "epoch": 0.007198, + "loss_gen": 5.049171447753906, + "loss_rtd": 0.43322035670280457, + "loss_sent": 0.34750890731811523, + "loss_sod": 0.07064300775527954, + "loss_total": 2.0752432346343994, + "step": 3599 }, { - "epoch": 0.027908170954292, - "eval_loss": 3.02693510055542, - "eval_runtime": 52.2993, - "eval_samples_per_second": 194.916, - "eval_steps_per_second": 1.53, - "step": 10000 + "epoch": 0.007198, + "loss_gen": 3.8683502674102783, + "loss_rtd": 0.4488944113254547, + "loss_sent": 0.08359670639038086, + "loss_sod": 0.15620453655719757, + "loss_total": 1.6263469457626343, + "step": 3599 }, { - "epoch": 0.028187252663834917, - "grad_norm": 2.5629944801330566, - "learning_rate": 4.9958398165691375e-05, - "loss": 3.1482, - "step": 10100 + "epoch": 0.0072, + "grad_norm": 1.1705760955810547, + "learning_rate": 9.052551452703848e-05, + "loss": 1.8166, + "step": 3600 }, { - "epoch": 0.02846633437337784, - "grad_norm": 2.781618356704712, - "learning_rate": 4.995711162320841e-05, - "loss": 3.1428, - "step": 10200 + "epoch": 0.007398, + "loss_gen": 4.654787063598633, + "loss_rtd": 0.4270414113998413, + "loss_sent": 0.2564600110054016, + "loss_sod": 0.2547892928123474, + "loss_total": 2.0471081733703613, + "step": 3699 }, { - "epoch": 0.028745416082920756, - "grad_norm": 1.8569979667663574, - "learning_rate": 4.9955805506327816e-05, - "loss": 3.1133, - "step": 10300 + "epoch": 0.007398, + "loss_gen": 4.532984733581543, + "loss_rtd": 0.43204793334007263, + "loss_sent": 0.31941744685173035, + "loss_sod": 0.10170117765665054, + "loss_total": 1.9329694509506226, + "step": 3699 }, { - "epoch": 0.029024497792463678, - "grad_norm": 2.1309852600097656, - "learning_rate": 4.9954479816073995e-05, - "loss": 3.1153, - "step": 10400 + "epoch": 0.0074, + "grad_norm": 1.5379921197891235, + "learning_rate": 9.050691942239147e-05, + "loss": 1.8172, + "step": 3700 }, { - "epoch": 0.0293035795020066, - "grad_norm": 2.1795830726623535, - "learning_rate": 4.995313455348673e-05, - "loss": 3.1324, - "step": 10500 + "epoch": 0.007598, + "loss_gen": 4.633910655975342, + "loss_rtd": 0.4312874376773834, + "loss_sent": 0.10443449020385742, + "loss_sod": 0.10198817402124405, + "loss_total": 1.7216852903366089, + "step": 3799 }, { - "epoch": 0.029582661211549517, - "grad_norm": 2.7019176483154297, - "learning_rate": 4.995176971962112e-05, - "loss": 3.1096, - "step": 10600 + "epoch": 0.007598, + "loss_gen": 4.459100723266602, + "loss_rtd": 0.4371269941329956, + "loss_sent": 0.1382666677236557, + "loss_sod": 0.11707606166601181, + "loss_total": 1.7355529069900513, + "step": 3799 }, { - "epoch": 0.02986174292109244, - "grad_norm": 1.9261529445648193, - "learning_rate": 4.995038531554765e-05, - "loss": 3.1006, - "step": 10700 + "epoch": 0.0076, + "grad_norm": 1.3472646474838257, + "learning_rate": 9.048830800156217e-05, + "loss": 1.7869, + "step": 3800 }, { - "epoch": 0.030140824630635357, - "grad_norm": 2.2618935108184814, - "learning_rate": 4.9948981342352144e-05, - "loss": 3.1023, - "step": 10800 + "epoch": 0.007798, + "loss_gen": 4.800703525543213, + "loss_rtd": 0.43522247672080994, + "loss_sent": 0.1432768851518631, + "loss_sod": 0.1126156821846962, + "loss_total": 1.7930315732955933, + "step": 3899 }, { - "epoch": 0.030419906340178278, - "grad_norm": 2.6323728561401367, - "learning_rate": 4.994755780113575e-05, - "loss": 3.0989, - "step": 10900 + "epoch": 0.007798, + "loss_gen": 4.192720890045166, + "loss_rtd": 0.443738728761673, + "loss_sent": 0.12932538986206055, + "loss_sod": 0.21474014222621918, + "loss_total": 1.750169038772583, + "step": 3899 }, { - "epoch": 0.030698988049721196, - "grad_norm": 2.481395959854126, - "learning_rate": 4.994611469301499e-05, - "loss": 3.0748, - "step": 11000 + "epoch": 0.0078, + "grad_norm": 0.6541185975074768, + "learning_rate": 9.046968027204723e-05, + "loss": 1.7513, + "step": 3900 }, { - "epoch": 0.030698988049721196, - "eval_loss": 2.99800443649292, - "eval_runtime": 51.5763, - "eval_samples_per_second": 197.649, - "eval_steps_per_second": 1.551, - "step": 11000 + "epoch": 0.007998, + "loss_gen": 4.6910858154296875, + "loss_rtd": 0.42430779337882996, + "loss_sent": 0.18219135701656342, + "loss_sod": 0.18787550926208496, + "loss_total": 1.8500816822052002, + "step": 3999 }, { - "epoch": 0.030978069759264117, - "grad_norm": 2.4254820346832275, - "learning_rate": 4.994465201912172e-05, - "loss": 3.0841, - "step": 11100 + "epoch": 0.007998, + "loss_gen": 3.837690830230713, + "loss_rtd": 0.45955485105514526, + "loss_sent": 0.011964034289121628, + "loss_sod": 0.5141125917434692, + "loss_total": 1.8492859601974487, + "step": 3999 }, { - "epoch": 0.03125715146880704, - "grad_norm": 2.4835855960845947, - "learning_rate": 4.994316978060315e-05, - "loss": 3.0653, - "step": 11200 + "epoch": 0.008, + "grad_norm": 1.1112271547317505, + "learning_rate": 9.045103624134992e-05, + "loss": 1.7631, + "step": 4000 }, { - "epoch": 0.03153623317834996, - "grad_norm": 1.8872084617614746, - "learning_rate": 4.994166797862183e-05, - "loss": 3.0625, - "step": 11300 + "epoch": 0.008, + "eval_loss": 1.6880124807357788, + "eval_runtime": 151.9356, + "eval_samples_per_second": 101.642, + "eval_steps_per_second": 0.796, + "step": 4000 }, { - "epoch": 0.031815314887892875, - "grad_norm": 2.41929030418396, - "learning_rate": 4.9940146614355665e-05, - "loss": 3.0617, - "step": 11400 + "epoch": 0.008198, + "loss_gen": 4.3431878089904785, + "loss_rtd": 0.43319565057754517, + "loss_sent": 0.31508129835128784, + "loss_sod": 0.0758034810423851, + "loss_total": 1.7816097736358643, + "step": 4099 }, { - "epoch": 0.0320943965974358, - "grad_norm": 2.8046977519989014, - "learning_rate": 4.9938605688997874e-05, - "loss": 3.0434, - "step": 11500 + "epoch": 0.008198, + "loss_gen": 4.724102020263672, + "loss_rtd": 0.43733254075050354, + "loss_sent": 0.36949220299720764, + "loss_sod": 0.044229187071323395, + "loss_total": 1.8925622701644897, + "step": 4099 }, { - "epoch": 0.03237347830697872, - "grad_norm": 2.526066780090332, - "learning_rate": 4.993704520375706e-05, - "loss": 3.0408, - "step": 11600 + "epoch": 0.0082, + "grad_norm": 2.3654143810272217, + "learning_rate": 9.043237591698004e-05, + "loss": 1.6969, + "step": 4100 }, { - "epoch": 0.032652560016521635, - "grad_norm": 2.3707504272460938, - "learning_rate": 4.9935465159857134e-05, - "loss": 2.922, - "step": 11700 + "epoch": 0.008398, + "loss_gen": 4.745401859283447, + "loss_rtd": 0.42455577850341797, + "loss_sent": 0.6431677937507629, + "loss_sod": 0.08375408500432968, + "loss_total": 2.1755483150482178, + "step": 4199 }, { - "epoch": 0.03293164172606456, - "grad_norm": 2.107308864593506, - "learning_rate": 4.993386555853735e-05, - "loss": 2.7976, - "step": 11800 + "epoch": 0.008398, + "loss_gen": 4.193449020385742, + "loss_rtd": 0.44740211963653564, + "loss_sent": 0.2250525802373886, + "loss_sod": 0.06576678156852722, + "loss_total": 1.6431792974472046, + "step": 4199 }, { - "epoch": 0.03321072343560748, - "grad_norm": 2.3073184490203857, - "learning_rate": 4.993224640105232e-05, - "loss": 2.7741, - "step": 11900 + "epoch": 0.0084, + "grad_norm": 2.3624160289764404, + "learning_rate": 9.0413699306454e-05, + "loss": 1.6817, + "step": 4200 }, { - "epoch": 0.033489805145150396, - "grad_norm": 2.261526346206665, - "learning_rate": 4.993060768867199e-05, - "loss": 2.7348, - "step": 12000 + "epoch": 0.008598, + "loss_gen": 3.6266229152679443, + "loss_rtd": 0.4430789053440094, + "loss_sent": 0.02863028645515442, + "loss_sod": 0.125107541680336, + "loss_total": 1.3622456789016724, + "step": 4299 }, { - "epoch": 0.033489805145150396, - "eval_loss": 2.938948154449463, - "eval_runtime": 51.6436, - "eval_samples_per_second": 197.391, - "eval_steps_per_second": 1.549, - "step": 12000 + "epoch": 0.008598, + "loss_gen": 4.528926849365234, + "loss_rtd": 0.43553048372268677, + "loss_sent": 0.1775393933057785, + "loss_sod": 0.057973627001047134, + "loss_total": 1.6269111633300781, + "step": 4299 }, { - "epoch": 0.033768886854693314, - "grad_norm": 2.0570199489593506, - "learning_rate": 4.992894942268163e-05, - "loss": 2.7452, - "step": 12100 + "epoch": 0.0086, + "grad_norm": 0.9962396025657654, + "learning_rate": 9.03950064172947e-05, + "loss": 1.6548, + "step": 4300 }, { - "epoch": 0.03404796856423624, - "grad_norm": 2.29695463180542, - "learning_rate": 4.992727160438184e-05, - "loss": 2.7419, - "step": 12200 + "epoch": 0.008798, + "loss_gen": 4.4079437255859375, + "loss_rtd": 0.4349976181983948, + "loss_sent": 0.367504745721817, + "loss_sod": 0.18055737018585205, + "loss_total": 1.892151951789856, + "step": 4399 }, { - "epoch": 0.03432705027377916, - "grad_norm": 2.3399369716644287, - "learning_rate": 4.992557423508859e-05, - "loss": 2.7148, - "step": 12300 + "epoch": 0.008798, + "loss_gen": 4.170320987701416, + "loss_rtd": 0.46673741936683655, + "loss_sent": 0.29705068469047546, + "loss_sod": 0.09038496017456055, + "loss_total": 1.7142581939697266, + "step": 4399 }, { - "epoch": 0.034606131983322075, - "grad_norm": 2.331512928009033, - "learning_rate": 4.992385731613316e-05, - "loss": 2.7038, - "step": 12400 + "epoch": 0.0088, + "grad_norm": 1.3224109411239624, + "learning_rate": 9.037629725703166e-05, + "loss": 1.6651, + "step": 4400 }, { - "epoch": 0.034885213692865, - "grad_norm": 1.9081106185913086, - "learning_rate": 4.9922120848862155e-05, - "loss": 2.6973, - "step": 12500 + "epoch": 0.008998, + "loss_gen": 4.52614164352417, + "loss_rtd": 0.4263684153556824, + "loss_sent": 0.19911253452301025, + "loss_sod": 0.07762130349874496, + "loss_total": 1.6144509315490723, + "step": 4499 }, { - "epoch": 0.03516429540240792, - "grad_norm": 2.4314184188842773, - "learning_rate": 4.9920364834637534e-05, - "loss": 2.7015, - "step": 12600 + "epoch": 0.008998, + "loss_gen": 4.717104911804199, + "loss_rtd": 0.4144288897514343, + "loss_sent": 0.4404892921447754, + "loss_sod": 0.05830947682261467, + "loss_total": 1.8630272150039673, + "step": 4499 }, { - "epoch": 0.035443377111950836, - "grad_norm": 2.290562868118286, - "learning_rate": 4.991858927483657e-05, - "loss": 2.6786, - "step": 12700 + "epoch": 0.009, + "grad_norm": 1.3386483192443848, + "learning_rate": 9.035757183320088e-05, + "loss": 1.622, + "step": 4500 }, { - "epoch": 0.035722458821493754, - "grad_norm": 1.9110828638076782, - "learning_rate": 4.991679417085188e-05, - "loss": 2.666, - "step": 12800 + "epoch": 0.009198, + "loss_gen": 4.1291399002075195, + "loss_rtd": 0.44797226786613464, + "loss_sent": 0.13307587802410126, + "loss_sod": 0.14009913802146912, + "loss_total": 1.5321199893951416, + "step": 4599 }, { - "epoch": 0.03600154053103668, - "grad_norm": 2.187831401824951, - "learning_rate": 4.9914979524091385e-05, - "loss": 2.6589, - "step": 12900 + "epoch": 0.009198, + "loss_gen": 3.3122775554656982, + "loss_rtd": 0.4629718065261841, + "loss_sent": 0.0329078771173954, + "loss_sod": 0.2527591586112976, + "loss_total": 1.3991779088974, + "step": 4599 }, { - "epoch": 0.036280622240579596, - "grad_norm": 1.9408071041107178, - "learning_rate": 4.9913145335978374e-05, - "loss": 2.6557, - "step": 13000 + "epoch": 0.0092, + "grad_norm": 0.636795699596405, + "learning_rate": 9.033883015334501e-05, + "loss": 1.6116, + "step": 4600 }, { - "epoch": 0.036280622240579596, - "eval_loss": 2.9094300270080566, - "eval_runtime": 51.6083, - "eval_samples_per_second": 197.526, - "eval_steps_per_second": 1.55, - "step": 13000 + "epoch": 0.009398, + "loss_gen": 4.3619160652160645, + "loss_rtd": 0.3999948799610138, + "loss_sent": 0.1576240509748459, + "loss_sod": 0.0319533571600914, + "loss_total": 1.4244252443313599, + "step": 4699 }, { - "epoch": 0.036559703950122514, - "grad_norm": 2.288982629776001, - "learning_rate": 4.9911291607951426e-05, - "loss": 2.6532, - "step": 13100 + "epoch": 0.009398, + "loss_gen": 4.300095081329346, + "loss_rtd": 0.45642659068107605, + "loss_sent": 0.17598015069961548, + "loss_sod": 0.09119284152984619, + "loss_total": 1.5466203689575195, + "step": 4699 }, { - "epoch": 0.03683878565966544, - "grad_norm": 2.217569589614868, - "learning_rate": 4.990941834146446e-05, - "loss": 2.6439, - "step": 13200 + "epoch": 0.0094, + "grad_norm": 0.5705883502960205, + "learning_rate": 9.032007222501318e-05, + "loss": 1.6013, + "step": 4700 }, { - "epoch": 0.03711786736920836, - "grad_norm": 1.804709792137146, - "learning_rate": 4.990752553798673e-05, - "loss": 2.6386, - "step": 13300 + "epoch": 0.009598, + "loss_gen": 4.3331685066223145, + "loss_rtd": 0.4457979202270508, + "loss_sent": 0.5673869848251343, + "loss_sod": 0.05349266901612282, + "loss_total": 1.8741161823272705, + "step": 4799 }, { - "epoch": 0.037396949078751275, - "grad_norm": 2.4551687240600586, - "learning_rate": 4.9905613199002796e-05, - "loss": 2.6311, - "step": 13400 + "epoch": 0.009598, + "loss_gen": 4.254533767700195, + "loss_rtd": 0.4539889991283417, + "loss_sent": 0.3765316903591156, + "loss_sod": 0.011228205636143684, + "loss_total": 1.6345348358154297, + "step": 4799 }, { - "epoch": 0.0376760307882942, - "grad_norm": 2.1292805671691895, - "learning_rate": 4.990368132601255e-05, - "loss": 2.6276, - "step": 13500 + "epoch": 0.0096, + "grad_norm": 2.2074193954467773, + "learning_rate": 9.030129805576109e-05, + "loss": 1.5485, + "step": 4800 }, { - "epoch": 0.03795511249783712, - "grad_norm": 1.9907737970352173, - "learning_rate": 4.9901729920531185e-05, - "loss": 2.6229, - "step": 13600 + "epoch": 0.009798, + "loss_gen": 4.519925594329834, + "loss_rtd": 0.42422616481781006, + "loss_sent": 0.1548745185136795, + "loss_sod": 0.11548137664794922, + "loss_total": 1.5137642621994019, + "step": 4899 }, { - "epoch": 0.038234194207380036, - "grad_norm": 2.1702795028686523, - "learning_rate": 4.989975898408925e-05, - "loss": 2.6197, - "step": 13700 + "epoch": 0.009798, + "loss_gen": 4.240198135375977, + "loss_rtd": 0.4464409649372101, + "loss_sent": 0.13806650042533875, + "loss_sod": 0.06082164868712425, + "loss_total": 1.4138140678405762, + "step": 4899 }, { - "epoch": 0.038513275916922954, - "grad_norm": 1.9034184217453003, - "learning_rate": 4.989776851823258e-05, - "loss": 2.6094, - "step": 13800 + "epoch": 0.0098, + "grad_norm": 0.8736671209335327, + "learning_rate": 9.028250765315094e-05, + "loss": 1.5344, + "step": 4900 }, { - "epoch": 0.03879235762646588, - "grad_norm": 2.083097219467163, - "learning_rate": 4.989575852452234e-05, - "loss": 2.6054, - "step": 13900 + "epoch": 0.009998, + "loss_gen": 4.306075096130371, + "loss_rtd": 0.4342154860496521, + "loss_sent": 0.3236868381500244, + "loss_sod": 0.03643370792269707, + "loss_total": 1.5526306629180908, + "step": 4999 }, { - "epoch": 0.0390714393360088, - "grad_norm": 1.9699125289916992, - "learning_rate": 4.989372900453503e-05, - "loss": 2.5887, - "step": 14000 + "epoch": 0.009998, + "loss_gen": 3.5708041191101074, + "loss_rtd": 0.45510080456733704, + "loss_sent": 0.13939236104488373, + "loss_sod": 0.19058340787887573, + "loss_total": 1.4138908386230469, + "step": 4999 }, { - "epoch": 0.0390714393360088, - "eval_loss": 2.882699728012085, - "eval_runtime": 51.7187, - "eval_samples_per_second": 197.105, - "eval_steps_per_second": 1.547, - "step": 14000 + "epoch": 0.01, + "grad_norm": 2.0682897567749023, + "learning_rate": 9.026370102475154e-05, + "loss": 1.5218, + "step": 5000 }, { - "epoch": 0.039350521045551715, - "grad_norm": 2.3234829902648926, - "learning_rate": 4.989167995986242e-05, - "loss": 2.5979, - "step": 14100 + "epoch": 0.01, + "eval_loss": 1.4553194046020508, + "eval_runtime": 151.1516, + "eval_samples_per_second": 102.169, + "eval_steps_per_second": 0.801, + "step": 5000 }, { - "epoch": 0.03962960275509464, - "grad_norm": 2.396756172180176, - "learning_rate": 4.988961139211164e-05, - "loss": 2.5901, - "step": 14200 + "epoch": 0.010198, + "loss_gen": 4.2357563972473145, + "loss_rtd": 0.4142667353153229, + "loss_sent": 0.18892182409763336, + "loss_sod": 0.07149676233530045, + "loss_total": 1.3986937999725342, + "step": 5099 }, { - "epoch": 0.03990868446463756, - "grad_norm": 2.3113210201263428, - "learning_rate": 4.9887523302905104e-05, - "loss": 2.5918, - "step": 14300 + "epoch": 0.010198, + "loss_gen": 4.266040325164795, + "loss_rtd": 0.43612152338027954, + "loss_sent": 0.20986633002758026, + "loss_sod": 0.16188688576221466, + "loss_total": 1.5370596647262573, + "step": 5099 }, { - "epoch": 0.040187766174180475, - "grad_norm": 2.006770372390747, - "learning_rate": 4.988541569388054e-05, - "loss": 2.581, - "step": 14400 + "epoch": 0.0102, + "grad_norm": 3.3480117321014404, + "learning_rate": 9.024487817813818e-05, + "loss": 1.502, + "step": 5100 }, { - "epoch": 0.04046684788372339, - "grad_norm": 2.482415199279785, - "learning_rate": 4.988328856669099e-05, - "loss": 2.5714, - "step": 14500 + "epoch": 0.010398, + "loss_gen": 3.4499049186706543, + "loss_rtd": 0.4549427926540375, + "loss_sent": 0.11068418622016907, + "loss_sod": 0.1226339265704155, + "loss_total": 1.260018229484558, + "step": 5199 }, { - "epoch": 0.04074592959326632, - "grad_norm": 2.2281925678253174, - "learning_rate": 4.988114192300482e-05, - "loss": 2.5735, - "step": 14600 + "epoch": 0.010398, + "loss_gen": 2.777376890182495, + "loss_rtd": 0.4569548964500427, + "loss_sent": 0.0002673329727258533, + "loss_sod": 0.3624283969402313, + "loss_total": 1.2799490690231323, + "step": 5199 }, { - "epoch": 0.041025011302809236, - "grad_norm": 1.868297815322876, - "learning_rate": 4.987897576450567e-05, - "loss": 2.5685, - "step": 14700 + "epoch": 0.0104, + "grad_norm": 0.6756449937820435, + "learning_rate": 9.02260391208927e-05, + "loss": 1.4573, + "step": 5200 }, { - "epoch": 0.041304093012352154, - "grad_norm": 2.096435785293579, - "learning_rate": 4.9876790092892534e-05, - "loss": 2.5773, - "step": 14800 + "epoch": 0.010598, + "loss_gen": 4.136087417602539, + "loss_rtd": 0.4495868980884552, + "loss_sent": 0.09559452533721924, + "loss_sod": 0.09505974501371384, + "loss_total": 1.3041480779647827, + "step": 5299 }, { - "epoch": 0.04158317472189508, - "grad_norm": 2.155036687850952, - "learning_rate": 4.987458490987965e-05, - "loss": 2.5527, - "step": 14900 + "epoch": 0.010598, + "loss_gen": 4.018187046051025, + "loss_rtd": 0.46124017238616943, + "loss_sent": 0.21007047593593597, + "loss_sod": 0.017557775601744652, + "loss_total": 1.3338505029678345, + "step": 5299 }, { - "epoch": 0.041862256431438, - "grad_norm": 2.4677577018737793, - "learning_rate": 4.98723602171966e-05, - "loss": 2.5662, - "step": 15000 + "epoch": 0.0106, + "grad_norm": 0.9290640950202942, + "learning_rate": 9.020718386060347e-05, + "loss": 1.4342, + "step": 5300 }, { - "epoch": 0.041862256431438, - "eval_loss": 2.8656771183013916, - "eval_runtime": 51.7196, - "eval_samples_per_second": 197.101, - "eval_steps_per_second": 1.547, - "step": 15000 + "epoch": 0.010798, + "loss_gen": 4.0267462730407715, + "loss_rtd": 0.4358631372451782, + "loss_sent": 0.24202901124954224, + "loss_sod": 0.007630678825080395, + "loss_total": 1.3108251094818115, + "step": 5399 }, { - "epoch": 0.042141338140980915, - "grad_norm": 2.102792978286743, - "learning_rate": 4.9870116016588265e-05, - "loss": 2.5432, - "step": 15100 + "epoch": 0.010798, + "loss_gen": 3.931260347366333, + "loss_rtd": 0.45226356387138367, + "loss_sent": 0.051588136702775955, + "loss_sod": 0.2738020420074463, + "loss_total": 1.388128399848938, + "step": 5399 }, { - "epoch": 0.04242041985052384, - "grad_norm": 1.8835171461105347, - "learning_rate": 4.986785230981481e-05, - "loss": 2.5391, - "step": 15200 + "epoch": 0.0108, + "grad_norm": 0.7409310340881348, + "learning_rate": 9.018831240486539e-05, + "loss": 1.4041, + "step": 5400 }, { - "epoch": 0.04269950156006676, - "grad_norm": 2.396533727645874, - "learning_rate": 4.986556909865171e-05, - "loss": 2.5458, - "step": 15300 + "epoch": 0.010998, + "loss_gen": 4.556105613708496, + "loss_rtd": 0.44320401549339294, + "loss_sent": 0.22269946336746216, + "loss_sod": 0.029224302619695663, + "loss_total": 1.378782033920288, + "step": 5499 }, { - "epoch": 0.042978583269609676, - "grad_norm": 2.206937074661255, - "learning_rate": 4.9863266384889746e-05, - "loss": 2.5341, - "step": 15400 + "epoch": 0.010998, + "loss_gen": 4.53646183013916, + "loss_rtd": 0.4213176369667053, + "loss_sent": 0.14112795889377594, + "loss_sod": 0.24416619539260864, + "loss_total": 1.4873186349868774, + "step": 5499 }, { - "epoch": 0.043257664979152594, - "grad_norm": 1.9710756540298462, - "learning_rate": 4.986094417033498e-05, - "loss": 2.5342, - "step": 15500 + "epoch": 0.011, + "grad_norm": 1.722217082977295, + "learning_rate": 9.016942476127988e-05, + "loss": 1.3937, + "step": 5500 }, { - "epoch": 0.04353674668869552, - "grad_norm": 2.3568851947784424, - "learning_rate": 4.985860245680877e-05, - "loss": 2.5343, - "step": 15600 + "epoch": 0.011198, + "loss_gen": 4.555620193481445, + "loss_rtd": 0.45380261540412903, + "loss_sent": 0.4497606158256531, + "loss_sod": 0.06671988219022751, + "loss_total": 1.6300160884857178, + "step": 5599 }, { - "epoch": 0.043815828398238436, - "grad_norm": 2.142500638961792, - "learning_rate": 4.985624124614778e-05, - "loss": 2.5315, - "step": 15700 + "epoch": 0.011198, + "loss_gen": 4.417397975921631, + "loss_rtd": 0.4402684271335602, + "loss_sent": 0.6401938199996948, + "loss_sod": 0.12325134873390198, + "loss_total": 1.843429684638977, + "step": 5599 }, { - "epoch": 0.044094910107781354, - "grad_norm": 2.2401161193847656, - "learning_rate": 4.9853860540203954e-05, - "loss": 2.5274, - "step": 15800 + "epoch": 0.0112, + "grad_norm": 2.376171588897705, + "learning_rate": 9.015052093745488e-05, + "loss": 1.382, + "step": 5600 }, { - "epoch": 0.04437399181732428, - "grad_norm": 2.0953330993652344, - "learning_rate": 4.985146034084453e-05, - "loss": 2.5218, - "step": 15900 + "epoch": 0.011398, + "loss_gen": 3.161689519882202, + "loss_rtd": 0.45311564207077026, + "loss_sent": 0.030058998614549637, + "loss_sod": 0.1347379982471466, + "loss_total": 1.059248924255371, + "step": 5699 }, { - "epoch": 0.0446530735268672, - "grad_norm": 2.396833658218384, - "learning_rate": 4.9849040649952036e-05, - "loss": 2.5292, - "step": 16000 + "epoch": 0.011398, + "loss_gen": 4.352455139160156, + "loss_rtd": 0.42320743203163147, + "loss_sent": 0.2760780155658722, + "loss_sod": 0.05662854015827179, + "loss_total": 1.363467812538147, + "step": 5699 }, { - "epoch": 0.0446530735268672, - "eval_loss": 2.829500198364258, - "eval_runtime": 51.7607, - "eval_samples_per_second": 196.945, - "eval_steps_per_second": 1.546, - "step": 16000 + "epoch": 0.0114, + "grad_norm": 1.3683046102523804, + "learning_rate": 9.013160094100485e-05, + "loss": 1.3398, + "step": 5700 }, { - "epoch": 0.044932155236410115, - "grad_norm": 2.0131983757019043, - "learning_rate": 4.984660146942429e-05, - "loss": 2.5153, - "step": 16100 + "epoch": 0.011598, + "loss_gen": 2.8516530990600586, + "loss_rtd": 0.46841028332710266, + "loss_sent": 0.00027514316025190055, + "loss_sod": 0.3374539613723755, + "loss_total": 1.1893240213394165, + "step": 5799 }, { - "epoch": 0.04521123694595303, - "grad_norm": 2.258866786956787, - "learning_rate": 4.9844142801174395e-05, - "loss": 2.5173, - "step": 16200 + "epoch": 0.011598, + "loss_gen": 3.1871232986450195, + "loss_rtd": 0.473550945520401, + "loss_sent": 0.05105242878198624, + "loss_sod": 0.2973681092262268, + "loss_total": 1.2502342462539673, + "step": 5799 }, { - "epoch": 0.04549031865549596, - "grad_norm": 2.1101086139678955, - "learning_rate": 4.984166464713073e-05, - "loss": 2.5179, - "step": 16300 + "epoch": 0.0116, + "grad_norm": 0.8012044429779053, + "learning_rate": 9.011266477955076e-05, + "loss": 1.3203, + "step": 5800 }, { - "epoch": 0.045769400365038876, - "grad_norm": 1.7976003885269165, - "learning_rate": 4.983916700923697e-05, - "loss": 2.4978, - "step": 16400 + "epoch": 0.011798, + "loss_gen": 4.541375160217285, + "loss_rtd": 0.43058615922927856, + "loss_sent": 0.12835851311683655, + "loss_sod": 0.0999765694141388, + "loss_total": 1.2455573081970215, + "step": 5899 }, { - "epoch": 0.046048482074581794, - "grad_norm": 2.168721914291382, - "learning_rate": 4.983664988945206e-05, - "loss": 2.992, - "step": 16500 + "epoch": 0.011798, + "loss_gen": 4.475722312927246, + "loss_rtd": 0.4540126919746399, + "loss_sent": 0.04481251910328865, + "loss_sod": 0.10295294225215912, + "loss_total": 1.1799335479736328, + "step": 5899 }, { - "epoch": 0.04632756378412472, - "grad_norm": 2.422194480895996, - "learning_rate": 4.983411328975024e-05, - "loss": 3.0074, - "step": 16600 + "epoch": 0.0118, + "grad_norm": 0.9984089136123657, + "learning_rate": 9.00937124607201e-05, + "loss": 1.2794, + "step": 5900 }, { - "epoch": 0.04660664549366764, - "grad_norm": 2.764707565307617, - "learning_rate": 4.983155721212102e-05, - "loss": 2.9743, - "step": 16700 + "epoch": 0.011998, + "loss_gen": 4.149008750915527, + "loss_rtd": 0.44907456636428833, + "loss_sent": 0.1539432555437088, + "loss_sod": 0.1032748594880104, + "loss_total": 1.220787763595581, + "step": 5999 }, { - "epoch": 0.046885727203210555, - "grad_norm": 2.046159267425537, - "learning_rate": 4.9828981658569175e-05, - "loss": 2.9528, - "step": 16800 + "epoch": 0.011998, + "loss_gen": 4.26630163192749, + "loss_rtd": 0.43877696990966797, + "loss_sent": 0.11144015192985535, + "loss_sod": 0.14063358306884766, + "loss_total": 1.2198905944824219, + "step": 5999 }, { - "epoch": 0.04716480891275347, - "grad_norm": 1.9275652170181274, - "learning_rate": 4.982638663111477e-05, - "loss": 2.9338, - "step": 16900 + "epoch": 0.012, + "grad_norm": 1.0076442956924438, + "learning_rate": 9.007474399214685e-05, + "loss": 1.2577, + "step": 6000 }, { - "epoch": 0.0474438906222964, - "grad_norm": 2.423919200897217, - "learning_rate": 4.9823772131793155e-05, - "loss": 2.9202, - "step": 17000 + "epoch": 0.012, + "eval_loss": 1.2371487617492676, + "eval_runtime": 151.5278, + "eval_samples_per_second": 101.915, + "eval_steps_per_second": 0.799, + "step": 6000 }, { - "epoch": 0.0474438906222964, - "eval_loss": 2.8014559745788574, - "eval_runtime": 51.8179, - "eval_samples_per_second": 196.727, - "eval_steps_per_second": 1.544, - "step": 17000 + "epoch": 0.012198, + "loss_gen": 4.438620090484619, + "loss_rtd": 0.44645529985427856, + "loss_sent": 0.31584903597831726, + "loss_sod": 0.038453131914138794, + "loss_total": 1.3283517360687256, + "step": 6099 }, { - "epoch": 0.047722972331839315, - "grad_norm": 2.4509029388427734, - "learning_rate": 4.9821138162654924e-05, - "loss": 2.9037, - "step": 17100 + "epoch": 0.012198, + "loss_gen": 3.957967758178711, + "loss_rtd": 0.4457968771457672, + "loss_sent": 0.2645883560180664, + "loss_sod": 0.07057566940784454, + "loss_total": 1.251422643661499, + "step": 6099 }, { - "epoch": 0.04800205404138223, - "grad_norm": 2.6597063541412354, - "learning_rate": 4.981848472576595e-05, - "loss": 2.8937, - "step": 17200 + "epoch": 0.0122, + "grad_norm": 3.596428155899048, + "learning_rate": 9.005575938147153e-05, + "loss": 1.2417, + "step": 6100 }, { - "epoch": 0.04828113575092516, - "grad_norm": 2.3224332332611084, - "learning_rate": 4.981581182320739e-05, - "loss": 2.8945, - "step": 17300 + "epoch": 0.012398, + "loss_gen": 4.318472862243652, + "loss_rtd": 0.4407297372817993, + "loss_sent": 0.1350451558828354, + "loss_sod": 0.1250823438167572, + "loss_total": 1.192137598991394, + "step": 6199 }, { - "epoch": 0.048560217460468076, - "grad_norm": 1.8250792026519775, - "learning_rate": 4.9813119457075666e-05, - "loss": 2.8944, - "step": 17400 + "epoch": 0.012398, + "loss_gen": 3.8575875759124756, + "loss_rtd": 0.4555145800113678, + "loss_sent": 0.1910392791032791, + "loss_sod": 0.038792889565229416, + "loss_total": 1.1241956949234009, + "step": 6199 }, { - "epoch": 0.048839299170010994, - "grad_norm": 1.8885750770568848, - "learning_rate": 4.981040762948245e-05, - "loss": 2.8731, - "step": 17500 + "epoch": 0.0124, + "grad_norm": 1.5775076150894165, + "learning_rate": 9.003675863634109e-05, + "loss": 1.2279, + "step": 6200 }, { - "epoch": 0.04911838087955392, - "grad_norm": 2.1029040813446045, - "learning_rate": 4.9807676342554674e-05, - "loss": 2.8734, - "step": 17600 + "epoch": 0.012598, + "loss_gen": 4.14998722076416, + "loss_rtd": 0.4265194833278656, + "loss_sent": 0.30052652955055237, + "loss_sod": 0.07797949016094208, + "loss_total": 1.2561488151550293, + "step": 6299 }, { - "epoch": 0.04939746258909684, - "grad_norm": 2.3319969177246094, - "learning_rate": 4.9804925598434574e-05, - "loss": 2.8551, - "step": 17700 + "epoch": 0.012598, + "loss_gen": 2.7819416522979736, + "loss_rtd": 0.4592662751674652, + "loss_sent": 0.06214335188269615, + "loss_sod": 0.27429041266441345, + "loss_total": 1.0981101989746094, + "step": 6299 }, { - "epoch": 0.049676544298639755, - "grad_norm": 2.2127387523651123, - "learning_rate": 4.9802155399279594e-05, - "loss": 2.861, - "step": 17800 + "epoch": 0.0126, + "grad_norm": 1.3763465881347656, + "learning_rate": 9.001774176440907e-05, + "loss": 1.2032, + "step": 6300 }, { - "epoch": 0.04995562600818267, - "grad_norm": 2.348116159439087, - "learning_rate": 4.979936574726246e-05, - "loss": 2.8403, - "step": 17900 + "epoch": 0.012798, + "loss_gen": 4.117013931274414, + "loss_rtd": 0.4338371753692627, + "loss_sent": 0.20068877935409546, + "loss_sod": 0.014359413646161556, + "loss_total": 1.0758084058761597, + "step": 6399 }, { - "epoch": 0.0502347077177256, - "grad_norm": 2.1458938121795654, - "learning_rate": 4.979655664457117e-05, - "loss": 2.8348, - "step": 18000 + "epoch": 0.012798, + "loss_gen": 3.95107364654541, + "loss_rtd": 0.43674370646476746, + "loss_sent": 0.11374907940626144, + "loss_sod": 0.005498822778463364, + "loss_total": 0.9657070636749268, + "step": 6399 }, { - "epoch": 0.0502347077177256, - "eval_loss": 2.782224416732788, - "eval_runtime": 51.846, - "eval_samples_per_second": 196.621, - "eval_steps_per_second": 1.543, - "step": 18000 + "epoch": 0.0128, + "grad_norm": 1.0737333297729492, + "learning_rate": 8.999870877333546e-05, + "loss": 1.2041, + "step": 6400 }, { - "epoch": 0.00027908170954291995, - "grad_norm": 2.324357748031616, - "learning_rate": 4.9821138162654924e-05, - "loss": 2.838, - "step": 18100 + "epoch": 0.012998, + "loss_gen": 4.177030563354492, + "loss_rtd": 0.4391840994358063, + "loss_sent": 0.13953599333763123, + "loss_sod": 0.06702539324760437, + "loss_total": 1.058211326599121, + "step": 6499 }, { - "epoch": 0.0005581634190858399, - "grad_norm": 2.52563214302063, - "learning_rate": 4.981848472576595e-05, - "loss": 2.824, - "step": 18200 + "epoch": 0.012998, + "loss_gen": 4.0403361320495605, + "loss_rtd": 0.4460426867008209, + "loss_sent": 0.4360485076904297, + "loss_sod": 0.0397292897105217, + "loss_total": 1.3207882642745972, + "step": 6499 }, { - "epoch": 0.0008372451286287599, - "grad_norm": 2.134593963623047, - "learning_rate": 4.981581182320739e-05, - "loss": 2.8365, - "step": 18300 + "epoch": 0.013, + "grad_norm": 2.090071201324463, + "learning_rate": 8.997965967078675e-05, + "loss": 1.1728, + "step": 6500 }, { - "epoch": 0.0011163268381716798, - "grad_norm": 2.040562868118286, - "learning_rate": 4.9813119457075666e-05, - "loss": 2.808, - "step": 18400 + "epoch": 0.013198, + "loss_gen": 4.261964797973633, + "loss_rtd": 0.45017364621162415, + "loss_sent": 0.680860161781311, + "loss_sod": 0.02003784291446209, + "loss_total": 1.5510892868041992, + "step": 6599 }, { - "epoch": 0.0013954085477146, - "grad_norm": 2.019237995147705, - "learning_rate": 4.981040762948245e-05, - "loss": 2.8112, - "step": 18500 + "epoch": 0.013198, + "loss_gen": 4.514923095703125, + "loss_rtd": 0.4445608854293823, + "loss_sent": 0.15158210694789886, + "loss_sod": 0.28943219780921936, + "loss_total": 1.3093348741531372, + "step": 6599 }, { - "epoch": 0.0016744902572575198, - "grad_norm": 2.032052755355835, - "learning_rate": 4.9807676342554674e-05, - "loss": 2.8119, - "step": 18600 + "epoch": 0.0132, + "grad_norm": 2.357900857925415, + "learning_rate": 8.996059446443587e-05, + "loss": 1.1415, + "step": 6600 }, { - "epoch": 0.00195357196680044, - "grad_norm": 2.2819409370422363, - "learning_rate": 4.9804925598434574e-05, - "loss": 2.8083, - "step": 18700 + "epoch": 0.013398, + "loss_gen": 4.206921100616455, + "loss_rtd": 0.45165202021598816, + "loss_sent": 0.37640759348869324, + "loss_sod": 0.11158914864063263, + "loss_total": 1.3142218589782715, + "step": 6699 }, { - "epoch": 0.0022326536763433596, - "grad_norm": 2.2714831829071045, - "learning_rate": 4.9802155399279594e-05, - "loss": 2.7902, - "step": 18800 + "epoch": 0.013398, + "loss_gen": 4.138385772705078, + "loss_rtd": 0.4333704113960266, + "loss_sent": 0.19938777387142181, + "loss_sod": 0.07604871690273285, + "loss_total": 1.0772777795791626, + "step": 6699 }, { - "epoch": 0.0025117353858862797, - "grad_norm": 2.1569690704345703, - "learning_rate": 4.979936574726246e-05, - "loss": 2.7999, - "step": 18900 + "epoch": 0.0134, + "grad_norm": 2.0228612422943115, + "learning_rate": 8.994151316196236e-05, + "loss": 1.1384, + "step": 6700 }, { - "epoch": 0.0027908170954292, - "grad_norm": 2.416579484939575, - "learning_rate": 4.979655664457117e-05, - "loss": 2.7883, - "step": 19000 + "epoch": 0.013598, + "loss_gen": 4.004166603088379, + "loss_rtd": 0.4367596209049225, + "loss_sent": 0.11661457270383835, + "loss_sod": 0.051644932478666306, + "loss_total": 0.9425358772277832, + "step": 6799 }, { - "epoch": 0.0027908170954292, - "eval_loss": 2.7761881351470947, - "eval_runtime": 52.5964, - "eval_samples_per_second": 193.816, - "eval_steps_per_second": 1.521, - "step": 19000 + "epoch": 0.013598, + "loss_gen": 4.20042610168457, + "loss_rtd": 0.4377318024635315, + "loss_sent": 0.2091219127178192, + "loss_sod": 0.10864714533090591, + "loss_total": 1.1095606088638306, + "step": 6799 }, { - "epoch": 0.00306989880497212, - "grad_norm": 2.2108051776885986, - "learning_rate": 4.979372809340896e-05, - "loss": 2.7821, - "step": 19100 + "epoch": 0.0136, + "grad_norm": 1.7526026964187622, + "learning_rate": 8.992241577105209e-05, + "loss": 1.1267, + "step": 6800 }, { - "epoch": 0.0033489805145150396, - "grad_norm": 2.3721492290496826, - "learning_rate": 4.979088009599432e-05, - "loss": 2.7715, - "step": 19200 + "epoch": 0.013798, + "loss_gen": 4.2462592124938965, + "loss_rtd": 0.46197614073753357, + "loss_sent": 0.19782444834709167, + "loss_sod": 0.12475350499153137, + "loss_total": 1.1226646900177002, + "step": 6899 }, { - "epoch": 0.0036280622240579597, - "grad_norm": 2.696380853652954, - "learning_rate": 4.978801265456099e-05, - "loss": 2.7653, - "step": 19300 + "epoch": 0.013798, + "loss_gen": 3.794999122619629, + "loss_rtd": 0.46073493361473083, + "loss_sent": 0.2632262110710144, + "loss_sod": 0.19318082928657532, + "loss_total": 1.2193207740783691, + "step": 6899 }, { - "epoch": 0.00390714393360088, - "grad_norm": 2.230356216430664, - "learning_rate": 4.9785125771357974e-05, - "loss": 2.7876, - "step": 19400 + "epoch": 0.0138, + "grad_norm": 3.328732490539551, + "learning_rate": 8.990330229939755e-05, + "loss": 1.0854, + "step": 6900 }, { - "epoch": 0.0041862256431437995, - "grad_norm": 1.7768421173095703, - "learning_rate": 4.978221944864951e-05, - "loss": 2.7697, - "step": 19500 + "epoch": 0.013998, + "loss_gen": 4.278293132781982, + "loss_rtd": 0.451276570558548, + "loss_sent": 0.3585604429244995, + "loss_sod": 0.027789060026407242, + "loss_total": 1.1586921215057373, + "step": 6999 }, { - "epoch": 0.004465307352686719, - "grad_norm": 1.6667171716690063, - "learning_rate": 4.9779293688715104e-05, - "loss": 2.7683, - "step": 19600 + "epoch": 0.013998, + "loss_gen": 4.299320697784424, + "loss_rtd": 0.4333668351173401, + "loss_sent": 0.32892492413520813, + "loss_sod": 0.11924998462200165, + "loss_total": 1.2041857242584229, + "step": 6999 }, { - "epoch": 0.00474438906222964, - "grad_norm": 2.404054641723633, - "learning_rate": 4.977634849384947e-05, - "loss": 2.7704, - "step": 19700 + "epoch": 0.014, + "grad_norm": 1.338874101638794, + "learning_rate": 8.988417275469759e-05, + "loss": 1.061, + "step": 7000 }, { - "epoch": 0.005023470771772559, - "grad_norm": 2.229520320892334, - "learning_rate": 4.977338386636261e-05, - "loss": 2.7521, - "step": 19800 + "epoch": 0.014, + "eval_loss": 1.0338186025619507, + "eval_runtime": 151.247, + "eval_samples_per_second": 102.104, + "eval_steps_per_second": 0.8, + "step": 7000 }, { - "epoch": 0.00530255248131548, - "grad_norm": 2.0139310359954834, - "learning_rate": 4.977039980857973e-05, - "loss": 2.7504, - "step": 19900 + "epoch": 0.014198, + "loss_gen": 2.923762083053589, + "loss_rtd": 0.4590458869934082, + "loss_sent": 0.00043936684960499406, + "loss_sod": 0.4023163318634033, + "loss_total": 1.0680921077728271, + "step": 7099 }, { - "epoch": 0.0055816341908584, - "grad_norm": 2.225908041000366, - "learning_rate": 4.9767396322841285e-05, - "loss": 2.7583, - "step": 20000 + "epoch": 0.014198, + "loss_gen": 2.4773359298706055, + "loss_rtd": 0.4419349431991577, + "loss_sent": 0.0001352017861790955, + "loss_sod": 0.2943947911262512, + "loss_total": 0.9112571477890015, + "step": 7099 }, { - "epoch": 0.0055816341908584, - "eval_loss": 2.734092950820923, - "eval_runtime": 51.4077, - "eval_samples_per_second": 198.297, - "eval_steps_per_second": 1.556, - "step": 20000 + "epoch": 0.0142, + "grad_norm": 1.9851511716842651, + "learning_rate": 8.986502714465762e-05, + "loss": 1.0493, + "step": 7100 }, { - "epoch": 0.005860715900401319, - "grad_norm": 2.034360885620117, - "learning_rate": 4.9764373411502985e-05, - "loss": 2.7538, - "step": 20100 + "epoch": 0.014398, + "loss_gen": 4.283316612243652, + "loss_rtd": 0.4622212052345276, + "loss_sent": 0.07737211883068085, + "loss_sod": 0.11309823393821716, + "loss_total": 0.9360949397087097, + "step": 7199 }, { - "epoch": 0.00613979760994424, - "grad_norm": 2.1626219749450684, - "learning_rate": 4.976133107693577e-05, - "loss": 2.7521, - "step": 20200 + "epoch": 0.014398, + "loss_gen": 3.8199846744537354, + "loss_rtd": 0.4555726945400238, + "loss_sent": 0.07644452154636383, + "loss_sod": 0.06696701049804688, + "loss_total": 0.8517314791679382, + "step": 7199 }, { - "epoch": 0.0064188793194871595, - "grad_norm": 2.2981114387512207, - "learning_rate": 4.9758269321525795e-05, - "loss": 2.7491, - "step": 20300 + "epoch": 0.0144, + "grad_norm": 0.9992061853408813, + "learning_rate": 8.98458654769895e-05, + "loss": 1.0395, + "step": 7200 }, { - "epoch": 0.006697961029030079, - "grad_norm": 2.4370241165161133, - "learning_rate": 4.975518814767446e-05, - "loss": 2.7414, - "step": 20400 + "epoch": 0.014598, + "loss_gen": 4.160510540008545, + "loss_rtd": 0.46040648221969604, + "loss_sent": 0.23881760239601135, + "loss_sod": 0.019337791949510574, + "loss_total": 0.9759917259216309, + "step": 7299 }, { - "epoch": 0.006977042738573, - "grad_norm": 2.6030125617980957, - "learning_rate": 4.975208755779841e-05, - "loss": 2.7324, - "step": 20500 + "epoch": 0.014598, + "loss_gen": 4.084108829498291, + "loss_rtd": 0.43639713525772095, + "loss_sent": 0.39973214268684387, + "loss_sod": 0.009163892827928066, + "loss_total": 1.097995638847351, + "step": 7299 }, { - "epoch": 0.0072561244481159195, - "grad_norm": 2.730182409286499, - "learning_rate": 4.974896755432949e-05, - "loss": 2.7294, - "step": 20600 + "epoch": 0.0146, + "grad_norm": 1.4760040044784546, + "learning_rate": 8.98266877594115e-05, + "loss": 1.0213, + "step": 7300 }, { - "epoch": 0.007535206157658839, - "grad_norm": 2.383457660675049, - "learning_rate": 4.974582813971479e-05, - "loss": 2.7281, - "step": 20700 + "epoch": 0.014798, + "loss_gen": 4.018357276916504, + "loss_rtd": 0.4384211301803589, + "loss_sent": 0.30337029695510864, + "loss_sod": 0.1135876476764679, + "loss_total": 1.0872063636779785, + "step": 7399 }, { - "epoch": 0.00781428786720176, - "grad_norm": 2.381148338317871, - "learning_rate": 4.974266931641662e-05, - "loss": 2.721, - "step": 20800 + "epoch": 0.014798, + "loss_gen": 3.9204838275909424, + "loss_rtd": 0.43890058994293213, + "loss_sent": 0.34354591369628906, + "loss_sod": 0.05680248886346817, + "loss_total": 1.0654296875, + "step": 7399 }, { - "epoch": 0.00809336957674468, - "grad_norm": 2.2645883560180664, - "learning_rate": 4.973949108691252e-05, - "loss": 2.7207, - "step": 20900 + "epoch": 0.0148, + "grad_norm": 1.4493780136108398, + "learning_rate": 8.980749399964847e-05, + "loss": 0.9879, + "step": 7400 }, { - "epoch": 0.008372451286287599, - "grad_norm": 2.185866594314575, - "learning_rate": 4.973629345369523e-05, - "loss": 2.7158, - "step": 21000 + "epoch": 0.014998, + "loss_gen": 4.476629257202148, + "loss_rtd": 0.43758487701416016, + "loss_sent": 0.2716887295246124, + "loss_sod": 0.23893794417381287, + "loss_total": 1.1882572174072266, + "step": 7499 }, { - "epoch": 0.008372451286287599, - "eval_loss": 2.7255687713623047, - "eval_runtime": 51.339, - "eval_samples_per_second": 198.562, - "eval_steps_per_second": 1.558, - "step": 21000 + "epoch": 0.014998, + "loss_gen": 3.8585877418518066, + "loss_rtd": 0.45572131872177124, + "loss_sent": 0.3436616063117981, + "loss_sod": 0.14704178273677826, + "loss_total": 1.153329849243164, + "step": 7499 }, { - "epoch": 0.008651532995830519, - "grad_norm": 2.4258711338043213, - "learning_rate": 4.973307641927273e-05, - "loss": 2.7171, - "step": 21100 + "epoch": 0.015, + "grad_norm": 1.700055718421936, + "learning_rate": 8.97882842054316e-05, + "loss": 0.9677, + "step": 7500 }, { - "epoch": 0.008930614705373438, - "grad_norm": 2.1136467456817627, - "learning_rate": 4.972983998616821e-05, - "loss": 2.7141, - "step": 21200 + "epoch": 0.015198, + "loss_gen": 4.266899585723877, + "loss_rtd": 0.4483409821987152, + "loss_sent": 0.18687587976455688, + "loss_sod": 0.04890352860093117, + "loss_total": 0.8960543870925903, + "step": 7599 }, { - "epoch": 0.00920969641491636, - "grad_norm": 2.4467060565948486, - "learning_rate": 4.972658415692007e-05, - "loss": 2.7, - "step": 21300 + "epoch": 0.015198, + "loss_gen": 2.754318952560425, + "loss_rtd": 0.44348207116127014, + "loss_sent": 0.022434303537011147, + "loss_sod": 0.42374366521835327, + "loss_total": 1.0264652967453003, + "step": 7599 }, { - "epoch": 0.00948877812445928, - "grad_norm": 2.4384939670562744, - "learning_rate": 4.972330893408194e-05, - "loss": 2.7041, - "step": 21400 + "epoch": 0.0152, + "grad_norm": 1.109738826751709, + "learning_rate": 8.976905838449861e-05, + "loss": 0.9591, + "step": 7600 }, { - "epoch": 0.0097678598340022, - "grad_norm": 2.105285406112671, - "learning_rate": 4.972001432022263e-05, - "loss": 2.7052, - "step": 21500 + "epoch": 0.015398, + "loss_gen": 3.008835554122925, + "loss_rtd": 0.47243383526802063, + "loss_sent": 0.04867243766784668, + "loss_sod": 0.1572308987379074, + "loss_total": 0.81625896692276, + "step": 7699 }, { - "epoch": 0.010046941543545119, - "grad_norm": 2.446324586868286, - "learning_rate": 4.971670031792618e-05, - "loss": 2.7132, - "step": 21600 + "epoch": 0.015398, + "loss_gen": 4.257413864135742, + "loss_rtd": 0.42728057503700256, + "loss_sent": 0.1736775040626526, + "loss_sod": 0.015286522917449474, + "loss_total": 0.8113998174667358, + "step": 7699 }, { - "epoch": 0.010326023253088039, - "grad_norm": 2.2751736640930176, - "learning_rate": 4.971336692979184e-05, - "loss": 2.6998, - "step": 21700 + "epoch": 0.0154, + "grad_norm": 1.0539206266403198, + "learning_rate": 8.974981654459366e-05, + "loss": 0.9468, + "step": 7700 }, { - "epoch": 0.01060510496263096, - "grad_norm": 2.3609867095947266, - "learning_rate": 4.9710014158434045e-05, - "loss": 2.7069, - "step": 21800 + "epoch": 0.015598, + "loss_gen": 3.177402973175049, + "loss_rtd": 0.45968639850616455, + "loss_sent": 0.009485319256782532, + "loss_sod": 0.3038681149482727, + "loss_total": 0.9069209694862366, + "step": 7799 }, { - "epoch": 0.01088418667217388, - "grad_norm": 2.401102304458618, - "learning_rate": 4.970664200648246e-05, - "loss": 2.6906, - "step": 21900 + "epoch": 0.015598, + "loss_gen": 4.26717472076416, + "loss_rtd": 0.4419127404689789, + "loss_sent": 0.07726690173149109, + "loss_sod": 0.03336562216281891, + "loss_total": 0.732344388961792, + "step": 7799 }, { - "epoch": 0.0111632683817168, - "grad_norm": 1.9187495708465576, - "learning_rate": 4.970325047658193e-05, - "loss": 2.6623, - "step": 22000 + "epoch": 0.0156, + "grad_norm": 0.9270355701446533, + "learning_rate": 8.973055869346735e-05, + "loss": 0.9286, + "step": 7800 }, { - "epoch": 0.0111632683817168, - "eval_loss": 2.7077319622039795, - "eval_runtime": 51.4142, - "eval_samples_per_second": 198.272, - "eval_steps_per_second": 1.556, - "step": 22000 + "epoch": 0.015798, + "loss_gen": 4.384439468383789, + "loss_rtd": 0.44737496972084045, + "loss_sent": 0.1470516175031662, + "loss_sod": 0.1933356523513794, + "loss_total": 0.9568408131599426, + "step": 7899 }, { - "epoch": 0.011442350091259719, - "grad_norm": 1.8887262344360352, - "learning_rate": 4.9699839571392503e-05, - "loss": 2.6982, - "step": 22100 + "epoch": 0.015798, + "loss_gen": 4.084644317626953, + "loss_rtd": 0.4502268433570862, + "loss_sent": 0.14218765497207642, + "loss_sod": 0.06409710645675659, + "loss_total": 0.8140290379524231, + "step": 7899 }, { - "epoch": 0.011721431800802639, - "grad_norm": 1.9740933179855347, - "learning_rate": 4.969640929358943e-05, - "loss": 2.6728, - "step": 22200 + "epoch": 0.0158, + "grad_norm": 1.05494225025177, + "learning_rate": 8.971128483887676e-05, + "loss": 0.9236, + "step": 7900 }, { - "epoch": 0.012000513510345558, - "grad_norm": 2.2788689136505127, - "learning_rate": 4.9692959645863145e-05, - "loss": 2.6663, - "step": 22300 + "epoch": 0.015998, + "loss_gen": 4.135157585144043, + "loss_rtd": 0.43412676453590393, + "loss_sent": 0.28543326258659363, + "loss_sod": 0.09229005873203278, + "loss_total": 0.9571057558059692, + "step": 7999 }, { - "epoch": 0.01227959521988848, - "grad_norm": 1.694128394126892, - "learning_rate": 4.9689490630919286e-05, - "loss": 2.6871, - "step": 22400 + "epoch": 0.015998, + "loss_gen": 3.857150077819824, + "loss_rtd": 0.442083477973938, + "loss_sent": 0.2047467678785324, + "loss_sod": 0.14205977320671082, + "loss_total": 0.9243801236152649, + "step": 7999 }, { - "epoch": 0.0125586769294314, - "grad_norm": 1.3262840509414673, - "learning_rate": 4.968600225147867e-05, - "loss": 2.4514, - "step": 22500 + "epoch": 0.016, + "grad_norm": 0.962242841720581, + "learning_rate": 8.96919949885854e-05, + "loss": 0.901, + "step": 8000 }, { - "epoch": 0.012837758638974319, - "grad_norm": 1.4721438884735107, - "learning_rate": 4.9682494510277314e-05, - "loss": 2.3541, - "step": 22600 + "epoch": 0.016, + "eval_loss": 0.8876739740371704, + "eval_runtime": 152.9353, + "eval_samples_per_second": 100.977, + "eval_steps_per_second": 0.791, + "step": 8000 }, { - "epoch": 0.013116840348517239, - "grad_norm": 1.294517993927002, - "learning_rate": 4.967896741006641e-05, - "loss": 2.3089, - "step": 22700 + "epoch": 0.016198, + "loss_gen": 2.9248859882354736, + "loss_rtd": 0.4561987817287445, + "loss_sent": 0.00010435195144964382, + "loss_sod": 0.36148083209991455, + "loss_total": 0.9108849763870239, + "step": 8099 }, { - "epoch": 0.013395922058060158, - "grad_norm": 1.2886101007461548, - "learning_rate": 4.967542095361234e-05, - "loss": 2.2613, - "step": 22800 + "epoch": 0.016198, + "loss_gen": 2.6596930027008057, + "loss_rtd": 0.4671928286552429, + "loss_sent": 0.00011411526065785438, + "loss_sod": 0.3841462731361389, + "loss_total": 0.9361129403114319, + "step": 8099 }, { - "epoch": 0.013675003767603078, - "grad_norm": 1.0832226276397705, - "learning_rate": 4.967185514369668e-05, - "loss": 2.2194, - "step": 22900 + "epoch": 0.0162, + "grad_norm": 2.1761651039123535, + "learning_rate": 8.967268915036318e-05, + "loss": 0.9123, + "step": 8100 }, { - "epoch": 0.013954085477146, - "grad_norm": 1.356217384338379, - "learning_rate": 4.966826998311614e-05, - "loss": 2.2088, - "step": 23000 + "epoch": 0.016398, + "loss_gen": 4.17715311050415, + "loss_rtd": 0.451453298330307, + "loss_sent": 0.354988157749176, + "loss_sod": 0.1447845697402954, + "loss_total": 1.0710194110870361, + "step": 8199 }, { - "epoch": 0.013954085477146, - "eval_loss": 2.7295706272125244, - "eval_runtime": 51.4928, - "eval_samples_per_second": 197.969, - "eval_steps_per_second": 1.554, - "step": 23000 + "epoch": 0.016398, + "loss_gen": 3.652554512023926, + "loss_rtd": 0.47347867488861084, + "loss_sent": 0.10158122330904007, + "loss_sod": 0.11182337254285812, + "loss_total": 0.7916321158409119, + "step": 8199 }, { - "epoch": 0.01423316718668892, - "grad_norm": 1.1834540367126465, - "learning_rate": 4.966466547468266e-05, - "loss": 2.1729, - "step": 23100 + "epoch": 0.0164, + "grad_norm": 1.7405667304992676, + "learning_rate": 8.965336733198653e-05, + "loss": 0.8714, + "step": 8200 }, { - "epoch": 0.014512248896231839, - "grad_norm": 1.0406333208084106, - "learning_rate": 4.9661041621223325e-05, - "loss": 2.1618, - "step": 23200 + "epoch": 0.016598, + "loss_gen": 4.009151935577393, + "loss_rtd": 0.4614793360233307, + "loss_sent": 0.27255332469940186, + "loss_sod": 0.03202224522829056, + "loss_total": 0.8689844608306885, + "step": 8299 }, { - "epoch": 0.014791330605774759, - "grad_norm": 1.6098556518554688, - "learning_rate": 4.965739842558041e-05, - "loss": 2.1318, - "step": 23300 + "epoch": 0.016598, + "loss_gen": 4.062987804412842, + "loss_rtd": 0.4590751826763153, + "loss_sent": 0.1842603087425232, + "loss_sod": 0.19588200747966766, + "loss_total": 0.9435292482376099, + "step": 8299 }, { - "epoch": 0.015070412315317678, - "grad_norm": 1.3630579710006714, - "learning_rate": 4.9653735890611353e-05, - "loss": 2.1278, - "step": 23400 + "epoch": 0.0166, + "grad_norm": 1.2510912418365479, + "learning_rate": 8.963402954123825e-05, + "loss": 0.872, + "step": 8300 }, { - "epoch": 0.015349494024860598, - "grad_norm": 1.8604717254638672, - "learning_rate": 4.965005401918874e-05, - "loss": 2.108, - "step": 23500 + "epoch": 0.016798, + "loss_gen": 4.116705894470215, + "loss_rtd": 0.44752010703086853, + "loss_sent": 0.06567193567752838, + "loss_sod": 0.1416502296924591, + "loss_total": 0.7487877607345581, + "step": 8399 }, { - "epoch": 0.01562857573440352, - "grad_norm": 1.192683458328247, - "learning_rate": 4.964635281420036e-05, - "loss": 2.1193, - "step": 23600 + "epoch": 0.016798, + "loss_gen": 4.250878810882568, + "loss_rtd": 0.4497721791267395, + "loss_sent": 0.1908683329820633, + "loss_sod": 0.022904343903064728, + "loss_total": 0.760552167892456, + "step": 8399 }, { - "epoch": 0.015907657443946437, - "grad_norm": 1.057704210281372, - "learning_rate": 4.964263227854914e-05, - "loss": 2.0887, - "step": 23700 + "epoch": 0.0168, + "grad_norm": 0.8308121562004089, + "learning_rate": 8.961467578590762e-05, + "loss": 0.8702, + "step": 8400 }, { - "epoch": 0.01618673915348936, - "grad_norm": 1.4525959491729736, - "learning_rate": 4.963889241515317e-05, - "loss": 2.0653, - "step": 23800 + "epoch": 0.016998, + "loss_gen": 3.3889882564544678, + "loss_rtd": 0.4579627513885498, + "loss_sent": 0.20038780570030212, + "loss_sod": 0.14318343997001648, + "loss_total": 0.8697285652160645, + "step": 8499 }, { - "epoch": 0.01646582086303228, - "grad_norm": 1.0620732307434082, - "learning_rate": 4.963513322694572e-05, - "loss": 2.0613, - "step": 23900 + "epoch": 0.016998, + "loss_gen": 3.9100821018218994, + "loss_rtd": 0.45459499955177307, + "loss_sent": 0.11043836176395416, + "loss_sod": 0.07976808398962021, + "loss_total": 0.7234815359115601, + "step": 8499 }, { - "epoch": 0.016744902572575198, - "grad_norm": 1.6661323308944702, - "learning_rate": 4.963135471687519e-05, - "loss": 2.0485, - "step": 24000 + "epoch": 0.017, + "grad_norm": 1.0994970798492432, + "learning_rate": 8.959530607379032e-05, + "loss": 0.8502, + "step": 8500 }, { - "epoch": 0.016744902572575198, - "eval_loss": 2.732348680496216, - "eval_runtime": 51.6262, - "eval_samples_per_second": 197.458, - "eval_steps_per_second": 1.55, - "step": 24000 + "epoch": 0.017198, + "loss_gen": 4.52324914932251, + "loss_rtd": 0.4543832838535309, + "loss_sent": 0.35701337456703186, + "loss_sod": 0.07280991226434708, + "loss_total": 0.9637364149093628, + "step": 8599 }, { - "epoch": 0.01702398428211812, - "grad_norm": 1.2704720497131348, - "learning_rate": 4.962755688790515e-05, - "loss": 2.0539, - "step": 24100 + "epoch": 0.017198, + "loss_gen": 3.273129940032959, + "loss_rtd": 0.4423699676990509, + "loss_sent": 0.15605831146240234, + "loss_sod": 0.10610494017601013, + "loss_total": 0.7620828747749329, + "step": 8599 }, { - "epoch": 0.017303065991661037, - "grad_norm": 1.9598422050476074, - "learning_rate": 4.962373974301432e-05, - "loss": 2.0434, - "step": 24200 + "epoch": 0.0172, + "grad_norm": 1.2175599336624146, + "learning_rate": 8.957592041268846e-05, + "loss": 0.8415, + "step": 8600 }, { - "epoch": 0.01758214770120396, - "grad_norm": 1.8419973850250244, - "learning_rate": 4.9619903285196567e-05, - "loss": 2.0325, - "step": 24300 + "epoch": 0.017398, + "loss_gen": 3.7413508892059326, + "loss_rtd": 0.43950116634368896, + "loss_sent": 0.21325911581516266, + "loss_sod": 0.13741731643676758, + "loss_total": 0.8470605611801147, + "step": 8699 }, { - "epoch": 0.017861229410746877, - "grad_norm": 1.2445131540298462, - "learning_rate": 4.961604751746091e-05, - "loss": 2.0143, - "step": 24400 + "epoch": 0.017398, + "loss_gen": 4.3951568603515625, + "loss_rtd": 0.4412088394165039, + "loss_sent": 0.30719298124313354, + "loss_sod": 0.1277753859758377, + "loss_total": 0.9430004954338074, + "step": 8699 }, { - "epoch": 0.018140311120289798, - "grad_norm": 1.3413686752319336, - "learning_rate": 4.9612172442831504e-05, - "loss": 2.0058, - "step": 24500 + "epoch": 0.0174, + "grad_norm": 1.5301543474197388, + "learning_rate": 8.955651881041059e-05, + "loss": 0.827, + "step": 8700 }, { - "epoch": 0.01841939282983272, - "grad_norm": 1.2016973495483398, - "learning_rate": 4.960827806434766e-05, - "loss": 1.9985, - "step": 24600 + "epoch": 0.017598, + "loss_gen": 4.006596088409424, + "loss_rtd": 0.4507193863391876, + "loss_sent": 0.3619312047958374, + "loss_sod": 0.005532963667064905, + "loss_total": 0.8702272176742554, + "step": 8799 }, { - "epoch": 0.018698474539375638, - "grad_norm": 1.309383749961853, - "learning_rate": 4.960436438506382e-05, - "loss": 1.9984, - "step": 24700 + "epoch": 0.017598, + "loss_gen": 3.811447858810425, + "loss_rtd": 0.4546920955181122, + "loss_sent": 0.0801302045583725, + "loss_sod": 0.25472018122673035, + "loss_total": 0.8390512466430664, + "step": 8799 }, { - "epoch": 0.01897755624891856, - "grad_norm": 1.8526884317398071, - "learning_rate": 4.960043140804956e-05, - "loss": 1.9868, - "step": 24800 + "epoch": 0.0176, + "grad_norm": 1.0982296466827393, + "learning_rate": 8.953710127477168e-05, + "loss": 0.8128, + "step": 8800 }, { - "epoch": 0.019256637958461477, - "grad_norm": 1.0628525018692017, - "learning_rate": 4.9596479136389605e-05, - "loss": 1.9859, - "step": 24900 + "epoch": 0.017798, + "loss_gen": 3.8621981143951416, + "loss_rtd": 0.442514568567276, + "loss_sent": 0.2113874852657318, + "loss_sod": 0.06732678413391113, + "loss_total": 0.7634892463684082, + "step": 8899 }, { - "epoch": 0.0195357196680044, - "grad_norm": 2.349247694015503, - "learning_rate": 4.959250757318381e-05, - "loss": 2.3249, - "step": 25000 + "epoch": 0.017798, + "loss_gen": 4.029360294342041, + "loss_rtd": 0.4374653697013855, + "loss_sent": 0.43539056181907654, + "loss_sod": 0.12874938547611237, + "loss_total": 1.0456947088241577, + "step": 8899 }, { - "epoch": 0.0195357196680044, - "eval_loss": 2.7338526248931885, - "eval_runtime": 51.7043, - "eval_samples_per_second": 197.16, - "eval_steps_per_second": 1.547, - "step": 25000 + "epoch": 0.0178, + "grad_norm": 1.562687635421753, + "learning_rate": 8.951766781359311e-05, + "loss": 0.8058, + "step": 8900 }, { - "epoch": 0.00027908170954291995, - "grad_norm": 1.5308209657669067, - "learning_rate": 4.958851672154715e-05, - "loss": 2.6006, - "step": 25100 + "epoch": 0.017998, + "loss_gen": 3.94120717048645, + "loss_rtd": 0.4648306965827942, + "loss_sent": 0.1459241509437561, + "loss_sod": 0.052158210426568985, + "loss_total": 0.6986362934112549, + "step": 8999 }, { - "epoch": 0.0005581634190858399, - "grad_norm": 2.163060426712036, - "learning_rate": 4.958450658460975e-05, - "loss": 2.5887, - "step": 25200 + "epoch": 0.017998, + "loss_gen": 3.806023359298706, + "loss_rtd": 0.45226743817329407, + "loss_sent": 0.17193950712680817, + "loss_sod": 0.027455970644950867, + "loss_total": 0.6861608028411865, + "step": 8999 }, { - "epoch": 0.0008372451286287599, - "grad_norm": 1.5415065288543701, - "learning_rate": 4.958047716551682e-05, - "loss": 2.5461, - "step": 25300 + "epoch": 0.018, + "grad_norm": 1.8158535957336426, + "learning_rate": 8.949821843470266e-05, + "loss": 0.7964, + "step": 9000 }, { - "epoch": 0.0011163268381716798, - "grad_norm": 2.232367992401123, - "learning_rate": 4.957642846742874e-05, - "loss": 2.5486, - "step": 25400 + "epoch": 0.018, + "eval_loss": 0.7903507947921753, + "eval_runtime": 151.2799, + "eval_samples_per_second": 102.082, + "eval_steps_per_second": 0.8, + "step": 9000 }, { - "epoch": 0.0013954085477146, - "grad_norm": 2.119978666305542, - "learning_rate": 4.957236049352098e-05, - "loss": 2.518, - "step": 25500 + "epoch": 0.018198, + "loss_gen": 2.4747531414031982, + "loss_rtd": 0.44327718019485474, + "loss_sent": 0.00012714380864053965, + "loss_sod": 0.38547617197036743, + "loss_total": 0.8470889925956726, + "step": 9099 }, { - "epoch": 0.0016744902572575198, - "grad_norm": 2.2122764587402344, - "learning_rate": 4.9568273246984146e-05, - "loss": 2.5058, - "step": 25600 + "epoch": 0.018198, + "loss_gen": 3.196993350982666, + "loss_rtd": 0.4125911295413971, + "loss_sent": 0.16173741221427917, + "loss_sod": 0.03193075954914093, + "loss_total": 0.6297818422317505, + "step": 9099 }, { - "epoch": 0.00195357196680044, - "grad_norm": 2.1825742721557617, - "learning_rate": 4.9564166731023954e-05, - "loss": 2.5044, - "step": 25700 + "epoch": 0.0182, + "grad_norm": 0.7714298963546753, + "learning_rate": 8.947875314593455e-05, + "loss": 0.7744, + "step": 9100 }, { - "epoch": 0.0022326536763433596, - "grad_norm": 2.136810541152954, - "learning_rate": 4.956004094886123e-05, - "loss": 2.4867, - "step": 25800 + "epoch": 0.018398, + "loss_gen": 3.0633838176727295, + "loss_rtd": 0.4389207065105438, + "loss_sent": 0.05032424256205559, + "loss_sod": 0.30537647008895874, + "loss_total": 0.8124662041664124, + "step": 9199 }, { - "epoch": 0.0025117353858862797, - "grad_norm": 1.419161319732666, - "learning_rate": 4.955589590373191e-05, - "loss": 2.4694, - "step": 25900 + "epoch": 0.018398, + "loss_gen": 2.1238484382629395, + "loss_rtd": 0.4132879376411438, + "loss_sent": 9.437712287763134e-05, + "loss_sod": 0.3491120934486389, + "loss_total": 0.7748661637306213, + "step": 9199 }, { - "epoch": 0.0027908170954292, - "grad_norm": 1.423763632774353, - "learning_rate": 4.955173159888705e-05, - "loss": 2.4686, - "step": 26000 + "epoch": 0.0184, + "grad_norm": 0.7082574367523193, + "learning_rate": 8.94592719551294e-05, + "loss": 0.7931, + "step": 9200 }, { - "epoch": 0.0027908170954292, - "eval_loss": 2.6800363063812256, - "eval_runtime": 52.7528, - "eval_samples_per_second": 193.241, - "eval_steps_per_second": 1.517, - "step": 26000 + "epoch": 0.018598, + "loss_gen": 2.9134955406188965, + "loss_rtd": 0.4698430299758911, + "loss_sent": 0.039316605776548386, + "loss_sod": 0.17961709201335907, + "loss_total": 0.7017951607704163, + "step": 9299 }, { - "epoch": 0.00306989880497212, - "grad_norm": 2.0498838424682617, - "learning_rate": 4.95475480375928e-05, - "loss": 2.4683, - "step": 26100 + "epoch": 0.018598, + "loss_gen": 3.965733289718628, + "loss_rtd": 0.4487987160682678, + "loss_sent": 0.3107520639896393, + "loss_sod": 0.006400309503078461, + "loss_total": 0.7836712598800659, + "step": 9299 }, { - "epoch": 0.0033489805145150396, - "grad_norm": 2.147402763366699, - "learning_rate": 4.9543345223130407e-05, - "loss": 2.4489, - "step": 26200 + "epoch": 0.0186, + "grad_norm": 1.1956804990768433, + "learning_rate": 8.943977487013423e-05, + "loss": 0.7891, + "step": 9300 }, { - "epoch": 0.0036280622240579597, - "grad_norm": 2.1229827404022217, - "learning_rate": 4.953912315879624e-05, - "loss": 2.476, - "step": 26300 + "epoch": 0.018798, + "loss_gen": 4.273687362670898, + "loss_rtd": 0.44777828454971313, + "loss_sent": 0.21755613386631012, + "loss_sod": 0.032184895128011703, + "loss_total": 0.7115744352340698, + "step": 9399 }, { - "epoch": 0.00390714393360088, - "grad_norm": 2.3337790966033936, - "learning_rate": 4.9534881847901746e-05, - "loss": 2.4518, - "step": 26400 + "epoch": 0.018798, + "loss_gen": 4.014370918273926, + "loss_rtd": 0.4529024064540863, + "loss_sent": 0.25203970074653625, + "loss_sod": 0.05144047737121582, + "loss_total": 0.7695848941802979, + "step": 9399 }, { - "epoch": 0.0041862256431437995, - "grad_norm": 1.2053618431091309, - "learning_rate": 4.953062129377349e-05, - "loss": 2.443, - "step": 26500 + "epoch": 0.0188, + "grad_norm": 0.8677006363868713, + "learning_rate": 8.942026189880244e-05, + "loss": 0.7976, + "step": 9400 }, { - "epoch": 0.004465307352686719, - "grad_norm": 1.3509132862091064, - "learning_rate": 4.9526341499753104e-05, - "loss": 2.436, - "step": 26600 + "epoch": 0.018998, + "loss_gen": 3.9230942726135254, + "loss_rtd": 0.4471578299999237, + "loss_sent": 0.12711574137210846, + "loss_sod": 0.011995519511401653, + "loss_total": 0.5952448844909668, + "step": 9499 }, { - "epoch": 0.00474438906222964, - "grad_norm": 1.974365234375, - "learning_rate": 4.9522042469197326e-05, - "loss": 2.428, - "step": 26700 + "epoch": 0.018998, + "loss_gen": 3.7593722343444824, + "loss_rtd": 0.4566221237182617, + "loss_sent": 0.2904764413833618, + "loss_sod": 0.09695194661617279, + "loss_total": 0.8526517152786255, + "step": 9499 }, { - "epoch": 0.005023470771772559, - "grad_norm": 2.1379354000091553, - "learning_rate": 4.9517724205477976e-05, - "loss": 2.4362, - "step": 26800 + "epoch": 0.019, + "grad_norm": 1.596506953239441, + "learning_rate": 8.940073304899388e-05, + "loss": 0.7528, + "step": 9500 }, { - "epoch": 0.00530255248131548, - "grad_norm": 2.1530823707580566, - "learning_rate": 4.951338671198196e-05, - "loss": 2.4254, - "step": 26900 + "epoch": 0.019198, + "loss_gen": 3.8245770931243896, + "loss_rtd": 0.43909817934036255, + "loss_sent": 0.15130890905857086, + "loss_sod": 0.040884099900722504, + "loss_total": 0.6369021534919739, + "step": 9599 }, { - "epoch": 0.0055816341908584, - "grad_norm": 2.167712926864624, - "learning_rate": 4.9509029992111276e-05, - "loss": 2.4241, - "step": 27000 + "epoch": 0.019198, + "loss_gen": 3.8713860511779785, + "loss_rtd": 0.45773789286613464, + "loss_sent": 0.33891892433166504, + "loss_sod": 0.02650151401758194, + "loss_total": 0.8288379907608032, + "step": 9599 }, { - "epoch": 0.0055816341908584, - "eval_loss": 2.6660702228546143, - "eval_runtime": 52.0355, - "eval_samples_per_second": 195.905, - "eval_steps_per_second": 1.537, - "step": 27000 + "epoch": 0.0192, + "grad_norm": 1.0625760555267334, + "learning_rate": 8.938118832857476e-05, + "loss": 0.7587, + "step": 9600 }, { - "epoch": 0.005860715900401319, - "grad_norm": 1.138418436050415, - "learning_rate": 4.950465404928298e-05, - "loss": 2.3908, - "step": 27100 + "epoch": 0.019398, + "loss_gen": 3.8790111541748047, + "loss_rtd": 0.4518607258796692, + "loss_sent": 0.25272804498672485, + "loss_sod": 0.11352493613958359, + "loss_total": 0.8213223814964294, + "step": 9699 }, { - "epoch": 0.00613979760994424, - "grad_norm": 2.0318148136138916, - "learning_rate": 4.9500258886929225e-05, - "loss": 2.4082, - "step": 27200 + "epoch": 0.019398, + "loss_gen": 4.176455020904541, + "loss_rtd": 0.44528916478157043, + "loss_sent": 0.3503030240535736, + "loss_sod": 0.09298127889633179, + "loss_total": 0.8920282125473022, + "step": 9699 }, { - "epoch": 0.0064188793194871595, - "grad_norm": 2.1356847286224365, - "learning_rate": 4.949584450849723e-05, - "loss": 2.3908, - "step": 27300 + "epoch": 0.0194, + "grad_norm": 2.159792184829712, + "learning_rate": 8.93616277454177e-05, + "loss": 0.7643, + "step": 9700 }, { - "epoch": 0.006697961029030079, - "grad_norm": 2.0764498710632324, - "learning_rate": 4.949141091744929e-05, - "loss": 2.41, - "step": 27400 + "epoch": 0.019598, + "loss_gen": 4.2923502922058105, + "loss_rtd": 0.44386738538742065, + "loss_sent": 0.33425548672676086, + "loss_sod": 0.13649314641952515, + "loss_total": 0.9162001609802246, + "step": 9799 }, { - "epoch": 0.006977042738573, - "grad_norm": 2.0822367668151855, - "learning_rate": 4.948695811726276e-05, - "loss": 2.4014, - "step": 27500 + "epoch": 0.019598, + "loss_gen": 3.989419937133789, + "loss_rtd": 0.45562487840652466, + "loss_sent": 0.09749240428209305, + "loss_sod": 0.017574891448020935, + "loss_total": 0.5721644759178162, + "step": 9799 }, { - "epoch": 0.0072561244481159195, - "grad_norm": 2.1181235313415527, - "learning_rate": 4.948248611143006e-05, - "loss": 2.3955, - "step": 27600 + "epoch": 0.0196, + "grad_norm": 1.0099166631698608, + "learning_rate": 8.934205130740169e-05, + "loss": 0.7612, + "step": 9800 }, { - "epoch": 0.007535206157658839, - "grad_norm": 2.22819447517395, - "learning_rate": 4.9477994903458704e-05, - "loss": 2.3786, - "step": 27700 + "epoch": 0.019798, + "loss_gen": 3.232377290725708, + "loss_rtd": 0.44479018449783325, + "loss_sent": 0.07202599197626114, + "loss_sod": 0.2975013256072998, + "loss_total": 0.8146188259124756, + "step": 9899 }, { - "epoch": 0.00781428786720176, - "grad_norm": 1.9119658470153809, - "learning_rate": 4.947348449687122e-05, - "loss": 2.3923, - "step": 27800 + "epoch": 0.019798, + "loss_gen": 2.5692195892333984, + "loss_rtd": 0.42933931946754456, + "loss_sent": 0.025050949305295944, + "loss_sod": 0.2836762070655823, + "loss_total": 0.7383059859275818, + "step": 9899 }, { - "epoch": 0.00809336957674468, - "grad_norm": 2.070460796356201, - "learning_rate": 4.946895489520522e-05, - "loss": 2.3896, - "step": 27900 + "epoch": 0.0198, + "grad_norm": 1.441141128540039, + "learning_rate": 8.93224590224121e-05, + "loss": 0.7724, + "step": 9900 }, { - "epoch": 0.008372451286287599, - "grad_norm": 2.1365408897399902, - "learning_rate": 4.946440610201337e-05, - "loss": 2.3966, - "step": 28000 + "epoch": 0.019998, + "loss_gen": 4.065705299377441, + "loss_rtd": 0.41851821541786194, + "loss_sent": 0.057108424603939056, + "loss_sod": 0.003937193192541599, + "loss_total": 0.4795638620853424, + "step": 9999 }, { - "epoch": 0.008372451286287599, - "eval_loss": 2.6492345333099365, - "eval_runtime": 51.9795, - "eval_samples_per_second": 196.116, - "eval_steps_per_second": 1.539, - "step": 28000 + "epoch": 0.019998, + "loss_gen": 3.380124568939209, + "loss_rtd": 0.4455118775367737, + "loss_sent": 0.03728582710027695, + "loss_sod": 0.2612026035785675, + "loss_total": 0.7440003156661987, + "step": 9999 }, { - "epoch": 0.008651532995830519, - "grad_norm": 1.5030018091201782, - "learning_rate": 4.9459838120863396e-05, - "loss": 2.3825, - "step": 28100 + "epoch": 0.02, + "grad_norm": 1.000209927558899, + "learning_rate": 8.930285089834074e-05, + "loss": 0.7523, + "step": 10000 }, { - "epoch": 0.008930614705373438, - "grad_norm": 2.0156290531158447, - "learning_rate": 4.945525095533805e-05, - "loss": 2.3712, - "step": 28200 + "epoch": 0.02, + "eval_loss": 0.7508344054222107, + "eval_runtime": 151.4354, + "eval_samples_per_second": 101.977, + "eval_steps_per_second": 0.799, + "step": 10000 }, { - "epoch": 0.00920969641491636, - "grad_norm": 2.3122830390930176, - "learning_rate": 4.945064460903515e-05, - "loss": 2.3646, - "step": 28300 + "epoch": 0.020198, + "loss_gen": 4.481088161468506, + "loss_rtd": 0.44522103667259216, + "loss_sent": 0.23012477159500122, + "loss_sod": 0.10265250504016876, + "loss_total": 0.7779983282089233, + "step": 10099 }, { - "epoch": 0.00948877812445928, - "grad_norm": 1.6442335844039917, - "learning_rate": 4.944601908556755e-05, - "loss": 2.3821, - "step": 28400 + "epoch": 0.020198, + "loss_gen": 3.8373875617980957, + "loss_rtd": 0.43886369466781616, + "loss_sent": 0.5344423651695251, + "loss_sod": 0.03612484782934189, + "loss_total": 1.0094308853149414, + "step": 10099 }, { - "epoch": 0.0097678598340022, - "grad_norm": 2.1097826957702637, - "learning_rate": 4.944137438856316e-05, - "loss": 2.375, - "step": 28500 + "epoch": 0.0202, + "grad_norm": 2.045971155166626, + "learning_rate": 8.928322694308574e-05, + "loss": 0.7515, + "step": 10100 }, { - "epoch": 0.010046941543545119, - "grad_norm": 1.3789490461349487, - "learning_rate": 4.943671052166489e-05, - "loss": 2.3586, - "step": 28600 + "epoch": 0.020398, + "loss_gen": 4.168641090393066, + "loss_rtd": 0.44839605689048767, + "loss_sent": 0.2666820287704468, + "loss_sod": 0.07253801822662354, + "loss_total": 0.7876161336898804, + "step": 10199 }, { - "epoch": 0.010326023253088039, - "grad_norm": 1.3923848867416382, - "learning_rate": 4.943202748853073e-05, - "loss": 2.3569, - "step": 28700 + "epoch": 0.020398, + "loss_gen": 4.216071128845215, + "loss_rtd": 0.4510549008846283, + "loss_sent": 0.2517627477645874, + "loss_sod": 0.1449786126613617, + "loss_total": 0.8477962613105774, + "step": 10199 }, { - "epoch": 0.01060510496263096, - "grad_norm": 2.0596909523010254, - "learning_rate": 4.9427325292833685e-05, - "loss": 2.3775, - "step": 28800 + "epoch": 0.0204, + "grad_norm": 1.3105542659759521, + "learning_rate": 8.92635871645516e-05, + "loss": 0.7545, + "step": 10200 }, { - "epoch": 0.01088418667217388, - "grad_norm": 1.5026627779006958, - "learning_rate": 4.942260393826177e-05, - "loss": 2.3509, - "step": 28900 + "epoch": 0.020598, + "loss_gen": 3.248972177505493, + "loss_rtd": 0.45092254877090454, + "loss_sent": 0.37074267864227295, + "loss_sod": 0.17783309519290924, + "loss_total": 0.9994983077049255, + "step": 10299 }, { - "epoch": 0.0111632683817168, - "grad_norm": 2.089820384979248, - "learning_rate": 4.941786342851806e-05, - "loss": 2.3467, - "step": 29000 + "epoch": 0.020598, + "loss_gen": 2.8247666358947754, + "loss_rtd": 0.45638176798820496, + "loss_sent": 0.0008790385909378529, + "loss_sod": 0.4168325364589691, + "loss_total": 0.8740933537483215, + "step": 10299 }, { - "epoch": 0.0111632683817168, - "eval_loss": 2.6393887996673584, - "eval_runtime": 52.0314, - "eval_samples_per_second": 195.92, - "eval_steps_per_second": 1.538, - "step": 29000 + "epoch": 0.0206, + "grad_norm": 1.7141914367675781, + "learning_rate": 8.924393157064926e-05, + "loss": 0.7535, + "step": 10300 }, { - "epoch": 0.011442350091259719, - "grad_norm": 1.9598230123519897, - "learning_rate": 4.941310376732063e-05, - "loss": 2.3222, - "step": 29100 + "epoch": 0.020798, + "loss_gen": 4.234827518463135, + "loss_rtd": 0.4640049934387207, + "loss_sent": 0.1334572434425354, + "loss_sod": 0.07902702689170837, + "loss_total": 0.6764892935752869, + "step": 10399 }, { - "epoch": 0.011721431800802639, - "grad_norm": 1.3596618175506592, - "learning_rate": 4.94083249584026e-05, - "loss": 2.3524, - "step": 29200 + "epoch": 0.020798, + "loss_gen": 3.7969512939453125, + "loss_rtd": 0.44702625274658203, + "loss_sent": 0.33471742272377014, + "loss_sod": 0.09909866005182266, + "loss_total": 0.8808423280715942, + "step": 10399 }, { - "epoch": 0.012000513510345558, - "grad_norm": 1.5016840696334839, - "learning_rate": 4.9403527005512066e-05, - "loss": 2.3581, - "step": 29300 + "epoch": 0.0208, + "grad_norm": 1.0838115215301514, + "learning_rate": 8.922426016929598e-05, + "loss": 0.7524, + "step": 10400 }, { - "epoch": 0.01227959521988848, - "grad_norm": 1.5459811687469482, - "learning_rate": 4.939870991241219e-05, - "loss": 2.36, - "step": 29400 + "epoch": 0.020998, + "loss_gen": 3.911032199859619, + "loss_rtd": 0.4394637942314148, + "loss_sent": 0.08548645675182343, + "loss_sod": 0.14958754181861877, + "loss_total": 0.6745378375053406, + "step": 10499 }, { - "epoch": 0.0125586769294314, - "grad_norm": 1.747701644897461, - "learning_rate": 4.939387368288111e-05, - "loss": 2.3373, - "step": 29500 + "epoch": 0.020998, + "loss_gen": 4.120211601257324, + "loss_rtd": 0.44440609216690063, + "loss_sent": 0.2759931981563568, + "loss_sod": 0.0870596170425415, + "loss_total": 0.8074588775634766, + "step": 10499 }, { - "epoch": 0.012837758638974319, - "grad_norm": 2.089578151702881, - "learning_rate": 4.938901832071198e-05, - "loss": 2.332, - "step": 29600 + "epoch": 0.021, + "grad_norm": 0.9914191365242004, + "learning_rate": 8.920457296841538e-05, + "loss": 0.7554, + "step": 10500 }, { - "epoch": 0.013116840348517239, - "grad_norm": 2.2954776287078857, - "learning_rate": 4.938414382971298e-05, - "loss": 2.6504, - "step": 29700 + "epoch": 0.021198, + "loss_gen": 3.870936393737793, + "loss_rtd": 0.4242846965789795, + "loss_sent": 0.21312680840492249, + "loss_sod": 0.09429841488599777, + "loss_total": 0.7317099571228027, + "step": 10599 }, { - "epoch": 0.013395922058060158, - "grad_norm": 1.9996869564056396, - "learning_rate": 4.937925021370726e-05, - "loss": 2.667, - "step": 29800 + "epoch": 0.021198, + "loss_gen": 4.131606578826904, + "loss_rtd": 0.43067824840545654, + "loss_sent": 0.28381282091140747, + "loss_sod": 0.0976143404841423, + "loss_total": 0.8121054172515869, + "step": 10599 }, { - "epoch": 0.013675003767603078, - "grad_norm": 2.092458724975586, - "learning_rate": 4.9374337476533e-05, - "loss": 2.6471, - "step": 29900 + "epoch": 0.0212, + "grad_norm": 1.5844788551330566, + "learning_rate": 8.918486997593749e-05, + "loss": 0.7619, + "step": 10600 }, { - "epoch": 0.013954085477146, - "grad_norm": 2.073460102081299, - "learning_rate": 4.936940562204336e-05, - "loss": 2.6341, - "step": 30000 + "epoch": 0.021398, + "loss_gen": 3.798642158508301, + "loss_rtd": 0.43546780943870544, + "loss_sent": 0.2033432424068451, + "loss_sod": 0.13071854412555695, + "loss_total": 0.7695295810699463, + "step": 10699 }, { - "epoch": 0.013954085477146, - "eval_loss": 2.62418794631958, - "eval_runtime": 52.039, - "eval_samples_per_second": 195.891, - "eval_steps_per_second": 1.537, - "step": 30000 + "epoch": 0.021398, + "loss_gen": 3.8915579319000244, + "loss_rtd": 0.4383130967617035, + "loss_sent": 0.19973257184028625, + "loss_sod": 0.0048730941489338875, + "loss_total": 0.6429187655448914, + "step": 10699 }, { - "epoch": 0.01423316718668892, - "grad_norm": 2.089350938796997, - "learning_rate": 4.93644546541065e-05, - "loss": 2.6249, - "step": 30100 + "epoch": 0.0214, + "grad_norm": 1.2073383331298828, + "learning_rate": 8.916515119979866e-05, + "loss": 0.7569, + "step": 10700 }, { - "epoch": 0.014512248896231839, - "grad_norm": 2.0827994346618652, - "learning_rate": 4.935948457660557e-05, - "loss": 2.6251, - "step": 30200 + "epoch": 0.021598, + "loss_gen": 4.278448104858398, + "loss_rtd": 0.4538666605949402, + "loss_sent": 0.4568008482456207, + "loss_sod": 0.08655402809381485, + "loss_total": 0.9972215890884399, + "step": 10799 }, { - "epoch": 0.014791330605774759, - "grad_norm": 2.1806418895721436, - "learning_rate": 4.9354495393438716e-05, - "loss": 2.6239, - "step": 30300 + "epoch": 0.021598, + "loss_gen": 3.7883036136627197, + "loss_rtd": 0.44330617785453796, + "loss_sent": 0.460102915763855, + "loss_sod": 0.04768051952123642, + "loss_total": 0.95108962059021, + "step": 10799 }, { - "epoch": 0.015070412315317678, - "grad_norm": 2.05653977394104, - "learning_rate": 4.9349487108519046e-05, - "loss": 2.6125, - "step": 30400 + "epoch": 0.0216, + "grad_norm": 2.6000397205352783, + "learning_rate": 8.91454166479416e-05, + "loss": 0.7608, + "step": 10800 }, { - "epoch": 0.015349494024860598, - "grad_norm": 2.205066442489624, - "learning_rate": 4.9344459725774675e-05, - "loss": 2.6152, - "step": 30500 + "epoch": 0.021798, + "loss_gen": 3.8277363777160645, + "loss_rtd": 0.46418583393096924, + "loss_sent": 0.3211608827114105, + "loss_sod": 0.10692701488733292, + "loss_total": 0.8922737836837769, + "step": 10899 }, { - "epoch": 0.01562857573440352, - "grad_norm": 2.0800061225891113, - "learning_rate": 4.933941324914869e-05, - "loss": 2.6031, - "step": 30600 + "epoch": 0.021798, + "loss_gen": 4.054731845855713, + "loss_rtd": 0.45032167434692383, + "loss_sent": 0.27219444513320923, + "loss_sod": 0.28465166687965393, + "loss_total": 1.0071678161621094, + "step": 10899 }, { - "epoch": 0.015907657443946437, - "grad_norm": 2.062309980392456, - "learning_rate": 4.933434768259915e-05, - "loss": 2.5932, - "step": 30700 + "epoch": 0.0218, + "grad_norm": 2.0657763481140137, + "learning_rate": 8.912566632831541e-05, + "loss": 0.7661, + "step": 10900 }, { - "epoch": 0.01618673915348936, - "grad_norm": 2.150096893310547, - "learning_rate": 4.932926303009907e-05, - "loss": 2.5954, - "step": 30800 + "epoch": 0.021998, + "loss_gen": 4.123061656951904, + "loss_rtd": 0.42160412669181824, + "loss_sent": 0.17279018461704254, + "loss_sod": 0.02948286198079586, + "loss_total": 0.6238771677017212, + "step": 10999 }, { - "epoch": 0.01646582086303228, - "grad_norm": 2.0282931327819824, - "learning_rate": 4.932415929563647e-05, - "loss": 2.5872, - "step": 30900 + "epoch": 0.021998, + "loss_gen": 4.480509281158447, + "loss_rtd": 0.44309404492378235, + "loss_sent": 0.18066415190696716, + "loss_sod": 0.13343116641044617, + "loss_total": 0.7571893930435181, + "step": 10999 }, { - "epoch": 0.016744902572575198, - "grad_norm": 2.0225014686584473, - "learning_rate": 4.9319036483214324e-05, - "loss": 2.5846, - "step": 31000 + "epoch": 0.022, + "grad_norm": 0.9440023899078369, + "learning_rate": 8.91059002488755e-05, + "loss": 0.764, + "step": 11000 }, { - "epoch": 0.016744902572575198, - "eval_loss": 2.616325616836548, - "eval_runtime": 52.1651, - "eval_samples_per_second": 195.418, - "eval_steps_per_second": 1.534, - "step": 31000 + "epoch": 0.022, + "eval_loss": 0.7398257255554199, + "eval_runtime": 151.466, + "eval_samples_per_second": 101.957, + "eval_steps_per_second": 0.799, + "step": 11000 }, { - "epoch": 0.01702398428211812, - "grad_norm": 2.1831114292144775, - "learning_rate": 4.931389459685055e-05, - "loss": 2.5882, - "step": 31100 + "epoch": 0.022198, + "loss_gen": 3.9331908226013184, + "loss_rtd": 0.45400765538215637, + "loss_sent": 0.2909051179885864, + "loss_sod": 0.15119487047195435, + "loss_total": 0.8961076736450195, + "step": 11099 }, { - "epoch": 0.017303065991661037, - "grad_norm": 2.005201816558838, - "learning_rate": 4.930873364057804e-05, - "loss": 2.5875, - "step": 31200 + "epoch": 0.022198, + "loss_gen": 2.9539029598236084, + "loss_rtd": 0.43858084082603455, + "loss_sent": 0.037397708743810654, + "loss_sod": 0.3715130388736725, + "loss_total": 0.8474915623664856, + "step": 11099 }, { - "epoch": 0.01758214770120396, - "grad_norm": 2.165627956390381, - "learning_rate": 4.9303553618444645e-05, - "loss": 2.5833, - "step": 31300 + "epoch": 0.0222, + "grad_norm": 1.1551802158355713, + "learning_rate": 8.908611841758363e-05, + "loss": 0.7472, + "step": 11100 }, { - "epoch": 0.017861229410746877, - "grad_norm": 2.0518550872802734, - "learning_rate": 4.929835453451317e-05, - "loss": 2.5811, - "step": 31400 + "epoch": 0.022398, + "loss_gen": 3.897245168685913, + "loss_rtd": 0.42984846234321594, + "loss_sent": 0.22936731576919556, + "loss_sod": 0.021883495151996613, + "loss_total": 0.6810992956161499, + "step": 11199 }, { - "epoch": 0.018140311120289798, - "grad_norm": 2.180809497833252, - "learning_rate": 4.929313639286137e-05, - "loss": 2.5783, - "step": 31500 + "epoch": 0.022398, + "loss_gen": 3.969332456588745, + "loss_rtd": 0.44527700543403625, + "loss_sent": 0.3268181383609772, + "loss_sod": 0.09524273872375488, + "loss_total": 0.8673378229141235, + "step": 11199 }, { - "epoch": 0.01841939282983272, - "grad_norm": 2.042470693588257, - "learning_rate": 4.928789919758194e-05, - "loss": 2.5806, - "step": 31600 + "epoch": 0.0224, + "grad_norm": 2.26607608795166, + "learning_rate": 8.906632084240796e-05, + "loss": 0.759, + "step": 11200 }, { - "epoch": 0.018698474539375638, - "grad_norm": 1.963271975517273, - "learning_rate": 4.928264295278252e-05, - "loss": 2.5603, - "step": 31700 + "epoch": 0.022598, + "loss_gen": 4.213681221008301, + "loss_rtd": 0.4063725769519806, + "loss_sent": 0.2478422373533249, + "loss_sod": 0.058716077357530594, + "loss_total": 0.7129309177398682, + "step": 11299 }, { - "epoch": 0.01897755624891856, - "grad_norm": 2.0756092071533203, - "learning_rate": 4.927736766258571e-05, - "loss": 2.5779, - "step": 31800 + "epoch": 0.022598, + "loss_gen": 4.215815544128418, + "loss_rtd": 0.43394604325294495, + "loss_sent": 0.4162815511226654, + "loss_sod": 0.04028809815645218, + "loss_total": 0.8905156850814819, + "step": 11299 }, { - "epoch": 0.019256637958461477, - "grad_norm": 2.152949333190918, - "learning_rate": 4.927207333112902e-05, - "loss": 2.5729, - "step": 31900 + "epoch": 0.0226, + "grad_norm": 1.892191767692566, + "learning_rate": 8.90465075313229e-05, + "loss": 0.7353, + "step": 11300 }, { - "epoch": 0.0195357196680044, - "grad_norm": 1.90077805519104, - "learning_rate": 4.926675996256492e-05, - "loss": 2.5569, - "step": 32000 + "epoch": 0.022798, + "loss_gen": 2.236022710800171, + "loss_rtd": 0.4328562617301941, + "loss_sent": 0.0001368140656268224, + "loss_sod": 0.2996408939361572, + "loss_total": 0.7326339483261108, + "step": 11399 }, { - "epoch": 0.0195357196680044, - "eval_loss": 2.6170480251312256, - "eval_runtime": 52.3031, - "eval_samples_per_second": 194.902, - "eval_steps_per_second": 1.53, - "step": 32000 + "epoch": 0.022798, + "loss_gen": 3.444706916809082, + "loss_rtd": 0.41880279779434204, + "loss_sent": 0.12022778391838074, + "loss_sod": 0.06587596237659454, + "loss_total": 0.6049065589904785, + "step": 11399 }, { - "epoch": 0.01981480137754732, - "grad_norm": 2.1005971431732178, - "learning_rate": 4.926142756106078e-05, - "loss": 2.5584, - "step": 32100 + "epoch": 0.0228, + "grad_norm": 0.6935063600540161, + "learning_rate": 8.902667849230929e-05, + "loss": 0.7465, + "step": 11400 }, { - "epoch": 0.020093883087090238, - "grad_norm": 1.9015015363693237, - "learning_rate": 4.925607613079895e-05, - "loss": 2.5554, - "step": 32200 + "epoch": 0.022998, + "loss_gen": 3.834214448928833, + "loss_rtd": 0.44961848855018616, + "loss_sent": 0.4442434012889862, + "loss_sod": 0.07701162248849869, + "loss_total": 0.9708734750747681, + "step": 11499 }, { - "epoch": 0.02037296479663316, - "grad_norm": 2.0273337364196777, - "learning_rate": 4.925070567597663e-05, - "loss": 2.554, - "step": 32300 + "epoch": 0.022998, + "loss_gen": 3.003070831298828, + "loss_rtd": 0.4377424418926239, + "loss_sent": 0.06250195950269699, + "loss_sod": 0.1500367373228073, + "loss_total": 0.6502811312675476, + "step": 11499 }, { - "epoch": 0.020652046506176077, - "grad_norm": 2.083922863006592, - "learning_rate": 4.9245316200806004e-05, - "loss": 2.5488, - "step": 32400 + "epoch": 0.023, + "grad_norm": 1.296846866607666, + "learning_rate": 8.900683373335425e-05, + "loss": 0.7588, + "step": 11500 }, { - "epoch": 0.020931128215719, - "grad_norm": 2.1102890968322754, - "learning_rate": 4.9239907709514155e-05, - "loss": 2.5645, - "step": 32500 + "epoch": 0.023198, + "loss_gen": 4.146674633026123, + "loss_rtd": 0.44089600443840027, + "loss_sent": 0.20838849246501923, + "loss_sod": 0.08634405583143234, + "loss_total": 0.7356285452842712, + "step": 11599 }, { - "epoch": 0.02121020992526192, - "grad_norm": 1.9879834651947021, - "learning_rate": 4.923448020634308e-05, - "loss": 2.5594, - "step": 32600 + "epoch": 0.023198, + "loss_gen": 4.042768955230713, + "loss_rtd": 0.4361976981163025, + "loss_sent": 0.2685220241546631, + "loss_sod": 0.1542719006538391, + "loss_total": 0.8589916229248047, + "step": 11599 }, { - "epoch": 0.021489291634804838, - "grad_norm": 1.9959540367126465, - "learning_rate": 4.922903369554967e-05, - "loss": 2.5537, - "step": 32700 + "epoch": 0.0232, + "grad_norm": 0.9552421569824219, + "learning_rate": 8.898697326245124e-05, + "loss": 0.7565, + "step": 11600 }, { - "epoch": 0.02176837334434776, - "grad_norm": 2.0482735633850098, - "learning_rate": 4.922356818140576e-05, - "loss": 2.5438, - "step": 32800 + "epoch": 0.023398, + "loss_gen": 3.578911781311035, + "loss_rtd": 0.44924330711364746, + "loss_sent": 0.04782826453447342, + "loss_sod": 0.3338066041469574, + "loss_total": 0.8308781981468201, + "step": 11699 }, { - "epoch": 0.022047455053890677, - "grad_norm": 1.9612691402435303, - "learning_rate": 4.921808366819806e-05, - "loss": 2.5507, - "step": 32900 + "epoch": 0.023398, + "loss_gen": 2.7252180576324463, + "loss_rtd": 0.43577805161476135, + "loss_sent": 0.001131089637055993, + "loss_sod": 0.42657536268234253, + "loss_total": 0.8634845614433289, + "step": 11699 }, { - "epoch": 0.0223265367634336, - "grad_norm": 2.07208251953125, - "learning_rate": 4.92125801602282e-05, - "loss": 2.5468, - "step": 33000 + "epoch": 0.0234, + "grad_norm": 0.6433441638946533, + "learning_rate": 8.896709708760008e-05, + "loss": 0.7481, + "step": 11700 }, { - "epoch": 0.0223265367634336, - "eval_loss": 2.6100504398345947, - "eval_runtime": 52.3552, - "eval_samples_per_second": 194.708, - "eval_steps_per_second": 1.528, - "step": 33000 + "epoch": 0.023598, + "loss_gen": 3.7408361434936523, + "loss_rtd": 0.4493173658847809, + "loss_sent": 0.1397567242383957, + "loss_sod": 0.02804996259510517, + "loss_total": 0.617124080657959, + "step": 11799 }, { - "epoch": 0.022605618472976517, - "grad_norm": 1.9684903621673584, - "learning_rate": 4.9207057661812674e-05, - "loss": 2.5356, - "step": 33100 + "epoch": 0.023598, + "loss_gen": 4.120850563049316, + "loss_rtd": 0.43224528431892395, + "loss_sent": 0.17164933681488037, + "loss_sod": 0.07089000195264816, + "loss_total": 0.6747846007347107, + "step": 11799 }, { - "epoch": 0.022884700182519438, - "grad_norm": 2.1681063175201416, - "learning_rate": 4.920151617728292e-05, - "loss": 2.5392, - "step": 33200 + "epoch": 0.0236, + "grad_norm": 0.7836123704910278, + "learning_rate": 8.894720521680686e-05, + "loss": 0.7493, + "step": 11800 }, { - "epoch": 0.02316378189206236, - "grad_norm": 1.9988055229187012, - "learning_rate": 4.9195955710985244e-05, - "loss": 2.5294, - "step": 33300 + "epoch": 0.023798, + "loss_gen": 4.201231002807617, + "loss_rtd": 0.42488178610801697, + "loss_sent": 0.25421980023384094, + "loss_sod": 0.03894122317433357, + "loss_total": 0.7180428504943848, + "step": 11899 }, { - "epoch": 0.023442863601605277, - "grad_norm": 2.0547211170196533, - "learning_rate": 4.919037626728083e-05, - "loss": 2.5351, - "step": 33400 + "epoch": 0.023798, + "loss_gen": 4.196627140045166, + "loss_rtd": 0.4310776889324188, + "loss_sent": 0.14267674088478088, + "loss_sod": 0.15274421870708466, + "loss_total": 0.7264986038208008, + "step": 11899 }, { - "epoch": 0.0237219453111482, - "grad_norm": 1.9304970502853394, - "learning_rate": 4.9184777850545756e-05, - "loss": 2.5337, - "step": 33500 + "epoch": 0.0238, + "grad_norm": 0.8541756272315979, + "learning_rate": 8.892729765808402e-05, + "loss": 0.743, + "step": 11900 }, { - "epoch": 0.024001027020691117, - "grad_norm": 2.06661057472229, - "learning_rate": 4.917916046517098e-05, - "loss": 2.5394, - "step": 33600 + "epoch": 0.023998, + "loss_gen": 4.17164945602417, + "loss_rtd": 0.4589729905128479, + "loss_sent": 0.12974581122398376, + "loss_sod": 0.11565913259983063, + "loss_total": 0.7043778896331787, + "step": 11999 }, { - "epoch": 0.024280108730234038, - "grad_norm": 2.064086675643921, - "learning_rate": 4.917352411556234e-05, - "loss": 2.5255, - "step": 33700 + "epoch": 0.023998, + "loss_gen": 4.203178882598877, + "loss_rtd": 0.4291312098503113, + "loss_sent": 0.38069620728492737, + "loss_sod": 0.1546536386013031, + "loss_total": 0.964480996131897, + "step": 11999 }, { - "epoch": 0.02455919043977696, - "grad_norm": 2.0449039936065674, - "learning_rate": 4.916786880614055e-05, - "loss": 2.5274, - "step": 33800 + "epoch": 0.024, + "grad_norm": 0.7374743819236755, + "learning_rate": 8.890737441945037e-05, + "loss": 0.7471, + "step": 12000 }, { - "epoch": 0.024838272149319877, - "grad_norm": 1.985840916633606, - "learning_rate": 4.916219454134118e-05, - "loss": 2.5138, - "step": 33900 + "epoch": 0.024, + "eval_loss": 0.7346014380455017, + "eval_runtime": 151.7074, + "eval_samples_per_second": 101.795, + "eval_steps_per_second": 0.798, + "step": 12000 }, { - "epoch": 0.0251173538588628, - "grad_norm": 2.0639407634735107, - "learning_rate": 4.9156501325614676e-05, - "loss": 2.5299, - "step": 34000 + "epoch": 0.024198, + "loss_gen": 4.091280937194824, + "loss_rtd": 0.44482794404029846, + "loss_sent": 0.2618083655834198, + "loss_sod": 0.034425701946020126, + "loss_total": 0.7410620450973511, + "step": 12099 }, { - "epoch": 0.0251173538588628, - "eval_loss": 2.585704803466797, - "eval_runtime": 52.2575, - "eval_samples_per_second": 195.072, - "eval_steps_per_second": 1.531, - "step": 34000 + "epoch": 0.024198, + "loss_gen": 3.968428134918213, + "loss_rtd": 0.43132540583610535, + "loss_sent": 0.18958622217178345, + "loss_sod": 0.08942354470491409, + "loss_total": 0.7103351950645447, + "step": 12099 }, { - "epoch": 0.025396435568405717, - "grad_norm": 1.969303011894226, - "learning_rate": 4.915078916342637e-05, - "loss": 2.522, - "step": 34100 + "epoch": 0.0242, + "grad_norm": 0.7772459983825684, + "learning_rate": 8.888743550893095e-05, + "loss": 0.7556, + "step": 12100 }, { - "epoch": 0.025675517277948638, - "grad_norm": 2.1575772762298584, - "learning_rate": 4.914505805925641e-05, - "loss": 2.527, - "step": 34200 + "epoch": 0.024398, + "loss_gen": 3.7260987758636475, + "loss_rtd": 0.4445788860321045, + "loss_sent": 0.2562042772769928, + "loss_sod": 0.05302653834223747, + "loss_total": 0.7538096904754639, + "step": 12199 }, { - "epoch": 0.025954598987491556, - "grad_norm": 2.1083667278289795, - "learning_rate": 4.913930801759984e-05, - "loss": 2.5296, - "step": 34300 + "epoch": 0.024398, + "loss_gen": 4.109078407287598, + "loss_rtd": 0.4398147463798523, + "loss_sent": 0.1253279596567154, + "loss_sod": 0.013701645657420158, + "loss_total": 0.5788443684577942, + "step": 12199 }, { - "epoch": 0.026233680697034478, - "grad_norm": 1.9617717266082764, - "learning_rate": 4.9133539042966525e-05, - "loss": 2.5132, - "step": 34400 + "epoch": 0.0244, + "grad_norm": 1.5994316339492798, + "learning_rate": 8.886748093455714e-05, + "loss": 0.7373, + "step": 12200 }, { - "epoch": 0.0265127624065774, - "grad_norm": 1.9009830951690674, - "learning_rate": 4.912775113988121e-05, - "loss": 2.5141, - "step": 34500 + "epoch": 0.024598, + "loss_gen": 4.129921913146973, + "loss_rtd": 0.43831920623779297, + "loss_sent": 0.27449101209640503, + "loss_sod": 0.13593009114265442, + "loss_total": 0.84874027967453, + "step": 12299 }, { - "epoch": 0.026791844116120317, - "grad_norm": 2.035285472869873, - "learning_rate": 4.912194431288347e-05, - "loss": 2.5303, - "step": 34600 + "epoch": 0.024598, + "loss_gen": 3.085507869720459, + "loss_rtd": 0.43336358666419983, + "loss_sent": 0.17228688299655914, + "loss_sod": 0.15589821338653564, + "loss_total": 0.761548638343811, + "step": 12299 }, { - "epoch": 0.02707092582566324, - "grad_norm": 2.072775363922119, - "learning_rate": 4.911611856652771e-05, - "loss": 2.5047, - "step": 34700 + "epoch": 0.0246, + "grad_norm": 1.5170425176620483, + "learning_rate": 8.884751070436668e-05, + "loss": 0.7545, + "step": 12300 }, { - "epoch": 0.027350007535206156, - "grad_norm": 2.071577548980713, - "learning_rate": 4.9110273905383206e-05, - "loss": 2.5182, - "step": 34800 + "epoch": 0.024798, + "loss_gen": 3.849769115447998, + "loss_rtd": 0.45669013261795044, + "loss_sent": 0.03489939868450165, + "loss_sod": 0.32191336154937744, + "loss_total": 0.8135029077529907, + "step": 12399 }, { - "epoch": 0.027629089244749078, - "grad_norm": 1.9403921365737915, - "learning_rate": 4.9104410334034034e-05, - "loss": 2.5137, - "step": 34900 + "epoch": 0.024798, + "loss_gen": 3.112346649169922, + "loss_rtd": 0.44622284173965454, + "loss_sent": 0.0024297181516885757, + "loss_sod": 0.3605993986129761, + "loss_total": 0.8092519640922546, + "step": 12399 }, { - "epoch": 0.027908170954292, - "grad_norm": 1.925726294517517, - "learning_rate": 4.9098527857079136e-05, - "loss": 2.5051, - "step": 35000 + "epoch": 0.0248, + "grad_norm": 1.343874454498291, + "learning_rate": 8.882752482640354e-05, + "loss": 0.7642, + "step": 12400 }, { - "epoch": 0.027908170954292, - "eval_loss": 2.5801749229431152, - "eval_runtime": 52.5441, - "eval_samples_per_second": 194.008, - "eval_steps_per_second": 1.523, - "step": 35000 + "epoch": 0.024998, + "loss_gen": 4.160418510437012, + "loss_rtd": 0.4546605050563812, + "loss_sent": 0.5152992606163025, + "loss_sod": 0.23410066962242126, + "loss_total": 1.204060435295105, + "step": 12499 }, { - "epoch": 0.028187252663834917, - "grad_norm": 2.0000064373016357, - "learning_rate": 4.909262647913225e-05, - "loss": 2.5111, - "step": 35100 + "epoch": 0.024998, + "loss_gen": 4.379058361053467, + "loss_rtd": 0.44232314825057983, + "loss_sent": 0.32886990904808044, + "loss_sod": 0.14197906851768494, + "loss_total": 0.9131721258163452, + "step": 12499 }, { - "epoch": 0.02846633437337784, - "grad_norm": 2.0426831245422363, - "learning_rate": 4.908670620482197e-05, - "loss": 2.5006, - "step": 35200 + "epoch": 0.025, + "grad_norm": 3.379681348800659, + "learning_rate": 8.880752330871805e-05, + "loss": 0.752, + "step": 12500 }, { - "epoch": 0.028745416082920756, - "grad_norm": 1.9477788209915161, - "learning_rate": 4.908076703879167e-05, - "loss": 2.5018, - "step": 35300 + "epoch": 0.025198, + "loss_gen": 3.828108310699463, + "loss_rtd": 0.4420686662197113, + "loss_sent": 0.607168436050415, + "loss_sod": 0.07426024228334427, + "loss_total": 1.1234973669052124, + "step": 12599 }, { - "epoch": 0.029024497792463678, - "grad_norm": 1.8843337297439575, - "learning_rate": 4.907480898569959e-05, - "loss": 2.5041, - "step": 35400 + "epoch": 0.025198, + "loss_gen": 4.14678955078125, + "loss_rtd": 0.4491349160671234, + "loss_sent": 0.2777857780456543, + "loss_sod": 0.0520130880177021, + "loss_total": 0.7789337635040283, + "step": 12599 }, { - "epoch": 0.0293035795020066, - "grad_norm": 1.884911298751831, - "learning_rate": 4.906883205021874e-05, - "loss": 2.5115, - "step": 35500 + "epoch": 0.0252, + "grad_norm": 1.5856245756149292, + "learning_rate": 8.87875061593668e-05, + "loss": 0.7549, + "step": 12600 }, { - "epoch": 0.029582661211549517, - "grad_norm": 2.0197665691375732, - "learning_rate": 4.906283623703697e-05, - "loss": 2.5018, - "step": 35600 + "epoch": 0.025398, + "loss_gen": 4.272764205932617, + "loss_rtd": 0.4308563470840454, + "loss_sent": 0.31138402223587036, + "loss_sod": 0.04572285711765289, + "loss_total": 0.7879632115364075, + "step": 12699 }, { - "epoch": 0.02986174292109244, - "grad_norm": 1.9508568048477173, - "learning_rate": 4.905682155085692e-05, - "loss": 2.4827, - "step": 35700 + "epoch": 0.025398, + "loss_gen": 4.319077491760254, + "loss_rtd": 0.45940783619880676, + "loss_sent": 0.1377851366996765, + "loss_sod": 0.08773832768201828, + "loss_total": 0.6849312782287598, + "step": 12699 }, { - "epoch": 0.030140824630635357, - "grad_norm": 2.0333240032196045, - "learning_rate": 4.905078799639603e-05, - "loss": 2.5011, - "step": 35800 + "epoch": 0.0254, + "grad_norm": 1.2356271743774414, + "learning_rate": 8.876747338641271e-05, + "loss": 0.7513, + "step": 12700 }, { - "epoch": 0.030419906340178278, - "grad_norm": 1.9671634435653687, - "learning_rate": 4.904473557838657e-05, - "loss": 2.4871, - "step": 35900 + "epoch": 0.025598, + "loss_gen": 4.071308612823486, + "loss_rtd": 0.43188050389289856, + "loss_sent": 0.2389868199825287, + "loss_sod": 0.055405814200639725, + "loss_total": 0.7262731790542603, + "step": 12799 }, { - "epoch": 0.030698988049721196, - "grad_norm": 2.0202693939208984, - "learning_rate": 4.9038664301575554e-05, - "loss": 2.5087, - "step": 36000 + "epoch": 0.025598, + "loss_gen": 3.7361228466033936, + "loss_rtd": 0.43908101320266724, + "loss_sent": 0.08752284198999405, + "loss_sod": 0.12760449945926666, + "loss_total": 0.6542083621025085, + "step": 12799 }, { - "epoch": 0.030698988049721196, - "eval_loss": 2.574017286300659, - "eval_runtime": 52.2416, - "eval_samples_per_second": 195.132, - "eval_steps_per_second": 1.531, - "step": 36000 + "epoch": 0.0256, + "grad_norm": 0.9795969724655151, + "learning_rate": 8.874742499792499e-05, + "loss": 0.7467, + "step": 12800 }, { - "epoch": 0.030978069759264117, - "grad_norm": 1.998271107673645, - "learning_rate": 4.9032574170724835e-05, - "loss": 2.4942, - "step": 36100 + "epoch": 0.025798, + "loss_gen": 3.2456018924713135, + "loss_rtd": 0.43680670857429504, + "loss_sent": 0.11466845124959946, + "loss_sod": 0.1494515985250473, + "loss_total": 0.7009267210960388, + "step": 12899 }, { - "epoch": 0.03125715146880704, - "grad_norm": 1.8772566318511963, - "learning_rate": 4.9026465190611016e-05, - "loss": 2.4976, - "step": 36200 + "epoch": 0.025798, + "loss_gen": 2.999803066253662, + "loss_rtd": 0.43333643674850464, + "loss_sent": 0.09659186750650406, + "loss_sod": 0.2513529658317566, + "loss_total": 0.7812812924385071, + "step": 12899 }, { - "epoch": 0.03153623317834996, - "grad_norm": 1.927706241607666, - "learning_rate": 4.9020337366025505e-05, - "loss": 2.4949, - "step": 36300 + "epoch": 0.0258, + "grad_norm": 1.0719258785247803, + "learning_rate": 8.87273610019791e-05, + "loss": 0.7372, + "step": 12900 }, { - "epoch": 0.031815314887892875, - "grad_norm": 1.902381181716919, - "learning_rate": 4.9014190701774496e-05, - "loss": 2.4972, - "step": 36400 + "epoch": 0.025998, + "loss_gen": 3.9534850120544434, + "loss_rtd": 0.42895567417144775, + "loss_sent": 0.17686723172664642, + "loss_sod": 0.007649438455700874, + "loss_total": 0.6134723424911499, + "step": 12999 }, { - "epoch": 0.0320943965974358, - "grad_norm": 1.9228150844573975, - "learning_rate": 4.900802520267895e-05, - "loss": 2.4873, - "step": 36500 + "epoch": 0.025998, + "loss_gen": 4.115196228027344, + "loss_rtd": 0.4262949824333191, + "loss_sent": 0.4657926857471466, + "loss_sod": 0.06579498201608658, + "loss_total": 0.9578826427459717, + "step": 12999 }, { - "epoch": 0.03237347830697872, - "grad_norm": 2.13718581199646, - "learning_rate": 4.900184087357459e-05, - "loss": 2.4842, - "step": 36600 + "epoch": 0.026, + "grad_norm": 1.4140347242355347, + "learning_rate": 8.870728140665684e-05, + "loss": 0.7338, + "step": 13000 }, { - "epoch": 0.032652560016521635, - "grad_norm": 1.832413911819458, - "learning_rate": 4.8995637719311916e-05, - "loss": 2.4899, - "step": 36700 + "epoch": 0.026, + "eval_loss": 0.7341647744178772, + "eval_runtime": 151.3683, + "eval_samples_per_second": 102.023, + "eval_steps_per_second": 0.799, + "step": 13000 }, { - "epoch": 0.03293164172606456, - "grad_norm": 1.9981021881103516, - "learning_rate": 4.898941574475621e-05, - "loss": 2.4911, - "step": 36800 + "epoch": 0.026198, + "loss_gen": 4.0114216804504395, + "loss_rtd": 0.4345766007900238, + "loss_sent": 0.3323989510536194, + "loss_sod": 0.07648509740829468, + "loss_total": 0.8434606194496155, + "step": 13099 }, { - "epoch": 0.03321072343560748, - "grad_norm": 1.9768383502960205, - "learning_rate": 4.898317495478748e-05, - "loss": 2.4865, - "step": 36900 + "epoch": 0.026198, + "loss_gen": 3.950547933578491, + "loss_rtd": 0.43731945753097534, + "loss_sent": 0.16473570466041565, + "loss_sod": 0.13462206721305847, + "loss_total": 0.7366771697998047, + "step": 13099 }, { - "epoch": 0.033489805145150396, - "grad_norm": 1.9410667419433594, - "learning_rate": 4.8976915354300536e-05, - "loss": 2.4787, - "step": 37000 + "epoch": 0.0262, + "grad_norm": 1.0907080173492432, + "learning_rate": 8.868718622004626e-05, + "loss": 0.7625, + "step": 13100 }, { - "epoch": 0.033489805145150396, - "eval_loss": 2.566740036010742, - "eval_runtime": 52.377, - "eval_samples_per_second": 194.627, - "eval_steps_per_second": 1.527, - "step": 37000 + "epoch": 0.026398, + "loss_gen": 3.7565226554870605, + "loss_rtd": 0.4329445958137512, + "loss_sent": 0.1427980661392212, + "loss_sod": 0.19305050373077393, + "loss_total": 0.7687931656837463, + "step": 13199 }, { - "epoch": 0.00027908170954291995, - "grad_norm": 1.9920895099639893, - "learning_rate": 4.897063694820489e-05, - "loss": 2.4785, - "step": 37100 + "epoch": 0.026398, + "loss_gen": 3.9972236156463623, + "loss_rtd": 0.45005112886428833, + "loss_sent": 0.22460077702999115, + "loss_sod": 0.003133713733404875, + "loss_total": 0.6777856349945068, + "step": 13199 }, { - "epoch": 0.0005581634190858399, - "grad_norm": 1.919732689857483, - "learning_rate": 4.896433974142485e-05, - "loss": 2.4881, - "step": 37200 + "epoch": 0.0264, + "grad_norm": 1.207594633102417, + "learning_rate": 8.866707545024169e-05, + "loss": 0.7235, + "step": 13200 }, { - "epoch": 0.0008372451286287599, - "grad_norm": 1.887787938117981, - "learning_rate": 4.895802373889944e-05, - "loss": 2.4728, - "step": 37300 + "epoch": 0.026598, + "loss_gen": 4.161458969116211, + "loss_rtd": 0.4533689618110657, + "loss_sent": 0.267709344625473, + "loss_sod": 0.008421978913247585, + "loss_total": 0.7295002937316895, + "step": 13299 }, { - "epoch": 0.0011163268381716798, - "grad_norm": 1.9295459985733032, - "learning_rate": 4.895168894558244e-05, - "loss": 2.4807, - "step": 37400 + "epoch": 0.026598, + "loss_gen": 4.003698348999023, + "loss_rtd": 0.4371702969074249, + "loss_sent": 0.41131556034088135, + "loss_sod": 0.11664707213640213, + "loss_total": 0.9651329517364502, + "step": 13299 }, { - "epoch": 0.0013954085477146, - "grad_norm": 1.921574592590332, - "learning_rate": 4.8945335366442367e-05, - "loss": 2.473, - "step": 37500 + "epoch": 0.0266, + "grad_norm": 1.1547691822052002, + "learning_rate": 8.864694910534375e-05, + "loss": 0.7531, + "step": 13300 }, { - "epoch": 0.0016744902572575198, - "grad_norm": 2.0197806358337402, - "learning_rate": 4.893896300646247e-05, - "loss": 2.4685, - "step": 37600 + "epoch": 0.026798, + "loss_gen": 3.7176673412323, + "loss_rtd": 0.42440980672836304, + "loss_sent": 0.2140468806028366, + "loss_sod": 0.04582613706588745, + "loss_total": 0.6842828392982483, + "step": 13399 }, { - "epoch": 0.00195357196680044, - "grad_norm": 2.001652240753174, - "learning_rate": 4.893257187064072e-05, - "loss": 2.4639, - "step": 37700 + "epoch": 0.026798, + "loss_gen": 4.113990783691406, + "loss_rtd": 0.431838721036911, + "loss_sent": 0.1240159422159195, + "loss_sod": 0.019157452508807182, + "loss_total": 0.5750120878219604, + "step": 13399 }, { - "epoch": 0.0022326536763433596, - "grad_norm": 1.920426845550537, - "learning_rate": 4.8926161963989826e-05, - "loss": 2.4759, - "step": 37800 + "epoch": 0.0268, + "grad_norm": 1.4807409048080444, + "learning_rate": 8.862680719345933e-05, + "loss": 0.7454, + "step": 13400 }, { - "epoch": 0.0025117353858862797, - "grad_norm": 1.9771814346313477, - "learning_rate": 4.8919733291537216e-05, - "loss": 2.4604, - "step": 37900 + "epoch": 0.026998, + "loss_gen": 3.7197763919830322, + "loss_rtd": 0.4261091351509094, + "loss_sent": 0.14779257774353027, + "loss_sod": 0.028960872441530228, + "loss_total": 0.6028625965118408, + "step": 13499 }, { - "epoch": 0.0027908170954292, - "grad_norm": 1.8774875402450562, - "learning_rate": 4.891328585832503e-05, - "loss": 2.4732, - "step": 38000 + "epoch": 0.026998, + "loss_gen": 3.850450277328491, + "loss_rtd": 0.44980937242507935, + "loss_sent": 0.1829732209444046, + "loss_sod": 0.09278829395771027, + "loss_total": 0.7255708575248718, + "step": 13499 }, { - "epoch": 0.0027908170954292, - "eval_loss": 2.5558669567108154, - "eval_runtime": 52.2529, - "eval_samples_per_second": 195.09, - "eval_steps_per_second": 1.531, - "step": 38000 + "epoch": 0.027, + "grad_norm": 0.9217216372489929, + "learning_rate": 8.860664972270161e-05, + "loss": 0.7366, + "step": 13500 }, { - "epoch": 0.00306989880497212, - "grad_norm": 1.8203305006027222, - "learning_rate": 4.890681966941014e-05, - "loss": 2.4663, - "step": 38100 + "epoch": 0.027198, + "loss_gen": 4.276337146759033, + "loss_rtd": 0.43566420674324036, + "loss_sent": 0.19798988103866577, + "loss_sod": 0.07652521133422852, + "loss_total": 0.7101792693138123, + "step": 13599 }, { - "epoch": 0.0033489805145150396, - "grad_norm": 1.8878498077392578, - "learning_rate": 4.89003347298641e-05, - "loss": 2.467, - "step": 38200 + "epoch": 0.027198, + "loss_gen": 4.133307456970215, + "loss_rtd": 0.4287024140357971, + "loss_sent": 0.2429627925157547, + "loss_sod": 0.03901500999927521, + "loss_total": 0.7106801867485046, + "step": 13599 }, { - "epoch": 0.0036280622240579597, - "grad_norm": 1.984248399734497, - "learning_rate": 4.889383104477321e-05, - "loss": 2.474, - "step": 38300 + "epoch": 0.0272, + "grad_norm": 0.744624674320221, + "learning_rate": 8.858647670118998e-05, + "loss": 0.7416, + "step": 13600 }, { - "epoch": 0.00390714393360088, - "grad_norm": 2.0134291648864746, - "learning_rate": 4.888730861923842e-05, - "loss": 2.465, - "step": 38400 + "epoch": 0.027398, + "loss_gen": 2.6930673122406006, + "loss_rtd": 0.4245775640010834, + "loss_sent": 0.1255381852388382, + "loss_sod": 0.31067463755607605, + "loss_total": 0.8607903718948364, + "step": 13699 }, { - "epoch": 0.0041862256431437995, - "grad_norm": 1.943984866142273, - "learning_rate": 4.8880767458375435e-05, - "loss": 2.4601, - "step": 38500 + "epoch": 0.027398, + "loss_gen": 3.989245653152466, + "loss_rtd": 0.4460928440093994, + "loss_sent": 0.3285417854785919, + "loss_sod": 0.04119309410452843, + "loss_total": 0.8158277273178101, + "step": 13699 }, { - "epoch": 0.004465307352686719, - "grad_norm": 1.8883605003356934, - "learning_rate": 4.8874207567314614e-05, - "loss": 2.4641, - "step": 38600 + "epoch": 0.0274, + "grad_norm": 1.0839942693710327, + "learning_rate": 8.856628813705014e-05, + "loss": 0.7379, + "step": 13700 }, { - "epoch": 0.00474438906222964, - "grad_norm": 2.030266523361206, - "learning_rate": 4.886762895120102e-05, - "loss": 2.463, - "step": 38700 + "epoch": 0.027598, + "loss_gen": 3.781327247619629, + "loss_rtd": 0.45150938630104065, + "loss_sent": 0.15958184003829956, + "loss_sod": 0.09167152643203735, + "loss_total": 0.7027627825737, + "step": 13799 }, { - "epoch": 0.005023470771772559, - "grad_norm": 1.917089581489563, - "learning_rate": 4.886103161519441e-05, - "loss": 2.83, - "step": 38800 + "epoch": 0.027598, + "loss_gen": 4.02975606918335, + "loss_rtd": 0.4286169707775116, + "loss_sent": 0.27698010206222534, + "loss_sod": 0.06525172293186188, + "loss_total": 0.77084881067276, + "step": 13799 }, { - "epoch": 0.00530255248131548, - "grad_norm": 1.9824928045272827, - "learning_rate": 4.885441556446921e-05, - "loss": 2.793, - "step": 38900 + "epoch": 0.0276, + "grad_norm": 1.9123098850250244, + "learning_rate": 8.854608403841407e-05, + "loss": 0.7504, + "step": 13800 }, { - "epoch": 0.0055816341908584, - "grad_norm": 1.7435353994369507, - "learning_rate": 4.884778080421453e-05, - "loss": 2.7929, - "step": 39000 + "epoch": 0.027798, + "loss_gen": 4.6888298988342285, + "loss_rtd": 0.425262987613678, + "loss_sent": 0.05898591876029968, + "loss_sod": 0.2515452802181244, + "loss_total": 0.735794186592102, + "step": 13899 }, { - "epoch": 0.0055816341908584, - "eval_loss": 2.5374677181243896, - "eval_runtime": 51.6213, - "eval_samples_per_second": 197.476, - "eval_steps_per_second": 1.55, - "step": 39000 + "epoch": 0.027798, + "loss_gen": 3.9502158164978027, + "loss_rtd": 0.4248698651790619, + "loss_sent": 0.14980857074260712, + "loss_sod": 0.017817402258515358, + "loss_total": 0.592495858669281, + "step": 13899 }, { - "epoch": 0.005860715900401319, - "grad_norm": 1.9794387817382812, - "learning_rate": 4.884112733963416e-05, - "loss": 2.7787, - "step": 39100 + "epoch": 0.0278, + "grad_norm": 1.2942866086959839, + "learning_rate": 8.852586441341996e-05, + "loss": 0.7301, + "step": 13900 }, { - "epoch": 0.00613979760994424, - "grad_norm": 1.9924589395523071, - "learning_rate": 4.8834455175946556e-05, - "loss": 2.7397, - "step": 39200 + "epoch": 0.027998, + "loss_gen": 3.0356056690216064, + "loss_rtd": 0.43091312050819397, + "loss_sent": 0.019389711320400238, + "loss_sod": 0.258874773979187, + "loss_total": 0.709177553653717, + "step": 13999 }, { - "epoch": 0.0064188793194871595, - "grad_norm": 2.0478086471557617, - "learning_rate": 4.8827764318384826e-05, - "loss": 2.7015, - "step": 39300 + "epoch": 0.027998, + "loss_gen": 2.5347843170166016, + "loss_rtd": 0.4192298948764801, + "loss_sent": 0.01333379466086626, + "loss_sod": 0.25479552149772644, + "loss_total": 0.6873592138290405, + "step": 13999 }, { - "epoch": 0.006697961029030079, - "grad_norm": 1.8016703128814697, - "learning_rate": 4.882105477219676e-05, - "loss": 2.7263, - "step": 39400 + "epoch": 0.028, + "grad_norm": 0.7091480493545532, + "learning_rate": 8.850562927021227e-05, + "loss": 0.7375, + "step": 14000 }, { - "epoch": 0.006977042738573, - "grad_norm": 2.0154685974121094, - "learning_rate": 4.881432654264481e-05, - "loss": 2.7214, - "step": 39500 + "epoch": 0.028, + "eval_loss": 0.732259213924408, + "eval_runtime": 151.7404, + "eval_samples_per_second": 101.773, + "eval_steps_per_second": 0.797, + "step": 14000 }, { - "epoch": 0.0072561244481159195, - "grad_norm": 1.7267248630523682, - "learning_rate": 4.880757963500606e-05, - "loss": 2.7212, - "step": 39600 + "epoch": 0.000198, + "loss_gen": 3.0679430961608887, + "loss_rtd": 0.43229687213897705, + "loss_sent": 0.03127670660614967, + "loss_sod": 0.18361902236938477, + "loss_total": 0.6471925973892212, + "step": 14099 }, { - "epoch": 0.007535206157658839, - "grad_norm": 1.7600693702697754, - "learning_rate": 4.880081405457225e-05, - "loss": 2.6696, - "step": 39700 + "epoch": 0.000198, + "loss_gen": 2.065791606903076, + "loss_rtd": 0.3922792375087738, + "loss_sent": 0.023536257445812225, + "loss_sod": 0.3744746744632721, + "loss_total": 0.7902901768684387, + "step": 14099 }, { - "epoch": 0.00781428786720176, - "grad_norm": 1.4433882236480713, - "learning_rate": 4.879402980664978e-05, - "loss": 2.6657, - "step": 39800 + "epoch": 0.0002, + "grad_norm": 1.5569876432418823, + "learning_rate": 8.84853786169417e-05, + "loss": 0.7265, + "step": 14100 }, { - "epoch": 0.00809336957674468, - "grad_norm": 1.9625422954559326, - "learning_rate": 4.878722689655967e-05, - "loss": 2.6537, - "step": 39900 + "epoch": 0.000398, + "loss_gen": 4.120703220367432, + "loss_rtd": 0.44122856855392456, + "loss_sent": 0.284933477640152, + "loss_sod": 0.08465798199176788, + "loss_total": 0.8108199834823608, + "step": 14199 }, { - "epoch": 0.008372451286287599, - "grad_norm": 2.0143187046051025, - "learning_rate": 4.87804053296376e-05, - "loss": 2.6378, - "step": 40000 + "epoch": 0.000398, + "loss_gen": 4.25185489654541, + "loss_rtd": 0.41677260398864746, + "loss_sent": 0.35926559567451477, + "loss_sod": 0.024174978956580162, + "loss_total": 0.8002132177352905, + "step": 14199 }, { - "epoch": 0.008372451286287599, - "eval_loss": 2.5359911918640137, - "eval_runtime": 51.3825, - "eval_samples_per_second": 198.394, - "eval_steps_per_second": 1.557, - "step": 40000 + "epoch": 0.0004, + "grad_norm": 1.4152344465255737, + "learning_rate": 8.846511246176526e-05, + "loss": 0.7529, + "step": 14200 }, { - "epoch": 0.008651532995830519, - "grad_norm": 1.282121181488037, - "learning_rate": 4.8773565111233865e-05, - "loss": 2.6675, - "step": 40100 + "epoch": 0.000598, + "loss_gen": 3.732604742050171, + "loss_rtd": 0.43974068760871887, + "loss_sent": 0.05152856558561325, + "loss_sod": 0.1073233112692833, + "loss_total": 0.5985925793647766, + "step": 14299 }, { - "epoch": 0.008930614705373438, - "grad_norm": 2.094029188156128, - "learning_rate": 4.87667062467134e-05, - "loss": 2.6298, - "step": 40200 + "epoch": 0.000598, + "loss_gen": 3.9523119926452637, + "loss_rtd": 0.41860440373420715, + "loss_sent": 0.2899419665336609, + "loss_sod": 0.018143948167562485, + "loss_total": 0.7266902923583984, + "step": 14299 }, { - "epoch": 0.00920969641491636, - "grad_norm": 2.0000486373901367, - "learning_rate": 4.875982874145575e-05, - "loss": 2.6263, - "step": 40300 + "epoch": 0.0006, + "grad_norm": 1.4069437980651855, + "learning_rate": 8.844483081284609e-05, + "loss": 0.7454, + "step": 14300 }, { - "epoch": 0.00948877812445928, - "grad_norm": 1.591666340827942, - "learning_rate": 4.875293260085509e-05, - "loss": 2.6303, - "step": 40400 + "epoch": 0.000798, + "loss_gen": 4.0675506591796875, + "loss_rtd": 0.4320642650127411, + "loss_sent": 0.1469995379447937, + "loss_sod": 0.118939608335495, + "loss_total": 0.6980034112930298, + "step": 14399 }, { - "epoch": 0.0097678598340022, - "grad_norm": 1.9758822917938232, - "learning_rate": 4.8746017830320225e-05, - "loss": 2.6154, - "step": 40500 + "epoch": 0.000798, + "loss_gen": 3.9876673221588135, + "loss_rtd": 0.461688756942749, + "loss_sent": 0.6275394558906555, + "loss_sod": 0.04515424743294716, + "loss_total": 1.1343824863433838, + "step": 14399 }, { - "epoch": 0.010046941543545119, - "grad_norm": 1.9928988218307495, - "learning_rate": 4.873908443527454e-05, - "loss": 2.6406, - "step": 40600 + "epoch": 0.0008, + "grad_norm": 2.224152088165283, + "learning_rate": 8.842453367835366e-05, + "loss": 0.7377, + "step": 14400 }, { - "epoch": 0.010326023253088039, - "grad_norm": 1.2124347686767578, - "learning_rate": 4.8732132421156065e-05, - "loss": 2.6172, - "step": 40700 + "epoch": 0.000998, + "loss_gen": 3.8818235397338867, + "loss_rtd": 0.4337279200553894, + "loss_sent": 0.14887529611587524, + "loss_sod": 0.2178276777267456, + "loss_total": 0.8004308938980103, + "step": 14499 }, { - "epoch": 0.01060510496263096, - "grad_norm": 2.098443031311035, - "learning_rate": 4.8725161793417394e-05, - "loss": 2.6035, - "step": 40800 + "epoch": 0.000998, + "loss_gen": 4.115235328674316, + "loss_rtd": 0.4256025552749634, + "loss_sent": 0.3175339996814728, + "loss_sod": 0.146746426820755, + "loss_total": 0.8898829221725464, + "step": 14499 }, { - "epoch": 0.01088418667217388, - "grad_norm": 1.9364486932754517, - "learning_rate": 4.871817255752575e-05, - "loss": 2.6072, - "step": 40900 + "epoch": 0.001, + "grad_norm": 1.2925978899002075, + "learning_rate": 8.840422106646368e-05, + "loss": 0.7259, + "step": 14500 }, { - "epoch": 0.0111632683817168, - "grad_norm": 1.9571590423583984, - "learning_rate": 4.8711164718962945e-05, - "loss": 2.6045, - "step": 41000 + "epoch": 0.001198, + "loss_gen": 3.6830289363861084, + "loss_rtd": 0.4349622428417206, + "loss_sent": 0.0974566638469696, + "loss_sod": 0.036854855716228485, + "loss_total": 0.5692737698554993, + "step": 14599 }, { - "epoch": 0.0111632683817168, - "eval_loss": 2.519200563430786, - "eval_runtime": 51.3669, - "eval_samples_per_second": 198.455, - "eval_steps_per_second": 1.557, - "step": 41000 + "epoch": 0.001198, + "loss_gen": 4.088166236877441, + "loss_rtd": 0.4255536198616028, + "loss_sent": 0.19805455207824707, + "loss_sod": 0.03682982921600342, + "loss_total": 0.6604380011558533, + "step": 14599 }, { - "epoch": 0.011442350091259719, - "grad_norm": 2.0674235820770264, - "learning_rate": 4.8704138283225365e-05, - "loss": 2.6071, - "step": 41100 + "epoch": 0.0012, + "grad_norm": 0.7997679114341736, + "learning_rate": 8.838389298535805e-05, + "loss": 0.7445, + "step": 14600 }, { - "epoch": 0.011721431800802639, - "grad_norm": 1.9325226545333862, - "learning_rate": 4.8697093255824e-05, - "loss": 2.6097, - "step": 41200 + "epoch": 0.001398, + "loss_gen": 3.9595894813537598, + "loss_rtd": 0.4168483018875122, + "loss_sent": 0.2763338088989258, + "loss_sod": 0.04373210668563843, + "loss_total": 0.7369142174720764, + "step": 14699 }, { - "epoch": 0.012000513510345558, - "grad_norm": 2.109142303466797, - "learning_rate": 4.869002964228442e-05, - "loss": 2.6, - "step": 41300 + "epoch": 0.001398, + "loss_gen": 3.125018835067749, + "loss_rtd": 0.42329004406929016, + "loss_sent": 0.008385371416807175, + "loss_sod": 0.38495877385139465, + "loss_total": 0.8166341781616211, + "step": 14699 }, { - "epoch": 0.01227959521988848, - "grad_norm": 1.6291286945343018, - "learning_rate": 4.8682947448146765e-05, - "loss": 2.5922, - "step": 41400 + "epoch": 0.0014, + "grad_norm": 1.3052629232406616, + "learning_rate": 8.83635494432249e-05, + "loss": 0.7247, + "step": 14700 }, { - "epoch": 0.0125586769294314, - "grad_norm": 1.5578373670578003, - "learning_rate": 4.8675846678965745e-05, - "loss": 2.5938, - "step": 41500 + "epoch": 0.001598, + "loss_gen": 4.314852714538574, + "loss_rtd": 0.431550532579422, + "loss_sent": 0.19600588083267212, + "loss_sod": 0.1637813299894333, + "loss_total": 0.7913377285003662, + "step": 14799 }, { - "epoch": 0.012837758638974319, - "grad_norm": 1.3479361534118652, - "learning_rate": 4.866872734031066e-05, - "loss": 2.5824, - "step": 41600 + "epoch": 0.001598, + "loss_gen": 3.685572862625122, + "loss_rtd": 0.4151884913444519, + "loss_sent": 0.5205061435699463, + "loss_sod": 0.02497510239481926, + "loss_total": 0.960669755935669, + "step": 14799 }, { - "epoch": 0.013116840348517239, - "grad_norm": 1.1882200241088867, - "learning_rate": 4.866158943776534e-05, - "loss": 2.5633, - "step": 41700 + "epoch": 0.0016, + "grad_norm": 2.231931686401367, + "learning_rate": 8.834319044825862e-05, + "loss": 0.7334, + "step": 14800 }, { - "epoch": 0.013395922058060158, - "grad_norm": 1.9686213731765747, - "learning_rate": 4.865443297692822e-05, - "loss": 2.5868, - "step": 41800 + "epoch": 0.001798, + "loss_gen": 3.1399123668670654, + "loss_rtd": 0.41176992654800415, + "loss_sent": 0.10314608365297318, + "loss_sod": 0.08458153158426285, + "loss_total": 0.5994975566864014, + "step": 14899 }, { - "epoch": 0.013675003767603078, - "grad_norm": 1.294700264930725, - "learning_rate": 4.8647257963412245e-05, - "loss": 2.5785, - "step": 41900 + "epoch": 0.001798, + "loss_gen": 3.982893705368042, + "loss_rtd": 0.42843541502952576, + "loss_sent": 0.23265258967876434, + "loss_sod": 0.11196303367614746, + "loss_total": 0.7730510830879211, + "step": 14899 }, { - "epoch": 0.013954085477146, - "grad_norm": 1.924691081047058, - "learning_rate": 4.864006440284494e-05, - "loss": 2.5577, - "step": 42000 + "epoch": 0.0018, + "grad_norm": 1.1219249963760376, + "learning_rate": 8.832281600865983e-05, + "loss": 0.7279, + "step": 14900 }, { - "epoch": 0.013954085477146, - "eval_loss": 2.5312111377716064, - "eval_runtime": 51.3653, - "eval_samples_per_second": 198.461, - "eval_steps_per_second": 1.557, - "step": 42000 + "epoch": 0.001998, + "loss_gen": 3.748601198196411, + "loss_rtd": 0.42924025654792786, + "loss_sent": 0.2589420676231384, + "loss_sod": 0.12195281684398651, + "loss_total": 0.8101351261138916, + "step": 14999 }, { - "epoch": 0.01423316718668892, - "grad_norm": 1.277314305305481, - "learning_rate": 4.863285230086837e-05, - "loss": 2.5733, - "step": 42100 + "epoch": 0.001998, + "loss_gen": 3.7271108627319336, + "loss_rtd": 0.4306026101112366, + "loss_sent": 0.2542365789413452, + "loss_sod": 0.09196815639734268, + "loss_total": 0.7768073678016663, + "step": 14999 }, { - "epoch": 0.014512248896231839, - "grad_norm": 1.2349932193756104, - "learning_rate": 4.862562166313914e-05, - "loss": 2.5341, - "step": 42200 + "epoch": 0.002, + "grad_norm": 1.8965760469436646, + "learning_rate": 8.830242613263532e-05, + "loss": 0.7431, + "step": 15000 }, { - "epoch": 0.014791330605774759, - "grad_norm": 2.112508773803711, - "learning_rate": 4.861837249532839e-05, - "loss": 2.5531, - "step": 42300 + "epoch": 0.002, + "eval_loss": 0.7258825302124023, + "eval_runtime": 152.5564, + "eval_samples_per_second": 101.228, + "eval_steps_per_second": 0.793, + "step": 15000 }, { - "epoch": 0.015070412315317678, - "grad_norm": 2.134838104248047, - "learning_rate": 4.86111048031218e-05, - "loss": 2.5655, - "step": 42400 + "epoch": 0.002198, + "loss_gen": 4.080875396728516, + "loss_rtd": 0.44319817423820496, + "loss_sent": 0.3066878616809845, + "loss_sod": 0.01762264408171177, + "loss_total": 0.7675086855888367, + "step": 15099 }, { - "epoch": 0.015349494024860598, - "grad_norm": 2.0029845237731934, - "learning_rate": 4.8603818592219585e-05, - "loss": 2.5533, - "step": 42500 + "epoch": 0.002198, + "loss_gen": 3.494755744934082, + "loss_rtd": 0.429073303937912, + "loss_sent": 0.21678392589092255, + "loss_sod": 0.06768958270549774, + "loss_total": 0.7135468125343323, + "step": 15099 }, { - "epoch": 0.01562857573440352, - "grad_norm": 1.4246734380722046, - "learning_rate": 4.8596513868336466e-05, - "loss": 2.5432, - "step": 42600 + "epoch": 0.0022, + "grad_norm": 1.2699775695800781, + "learning_rate": 8.828202082839815e-05, + "loss": 0.7201, + "step": 15100 }, { - "epoch": 0.015907657443946437, - "grad_norm": 1.815590500831604, - "learning_rate": 4.8589190637201695e-05, - "loss": 2.5538, - "step": 42700 + "epoch": 0.002398, + "loss_gen": 4.007122993469238, + "loss_rtd": 0.4222743511199951, + "loss_sent": 0.31278446316719055, + "loss_sod": 0.002229036297649145, + "loss_total": 0.7372878789901733, + "step": 15199 }, { - "epoch": 0.01618673915348936, - "grad_norm": 1.9902212619781494, - "learning_rate": 4.858184890455905e-05, - "loss": 2.5491, - "step": 42800 + "epoch": 0.002398, + "loss_gen": 3.383213996887207, + "loss_rtd": 0.42610523104667664, + "loss_sent": 0.11944480985403061, + "loss_sod": 0.13941094279289246, + "loss_total": 0.6849609613418579, + "step": 15199 }, { - "epoch": 0.01646582086303228, - "grad_norm": 2.026778221130371, - "learning_rate": 4.857448867616679e-05, - "loss": 2.5439, - "step": 42900 + "epoch": 0.0024, + "grad_norm": 1.189070224761963, + "learning_rate": 8.826160010416756e-05, + "loss": 0.7106, + "step": 15200 }, { - "epoch": 0.016744902572575198, - "grad_norm": 1.3105437755584717, - "learning_rate": 4.856710995779772e-05, - "loss": 2.5419, - "step": 43000 + "epoch": 0.002598, + "loss_gen": 2.794424533843994, + "loss_rtd": 0.36382150650024414, + "loss_sent": 0.12546056509017944, + "loss_sod": 0.06349128484725952, + "loss_total": 0.5527733564376831, + "step": 15299 }, { - "epoch": 0.016744902572575198, - "eval_loss": 2.5215396881103516, - "eval_runtime": 51.4226, - "eval_samples_per_second": 198.24, - "eval_steps_per_second": 1.556, - "step": 43000 + "epoch": 0.002598, + "loss_gen": 3.5569169521331787, + "loss_rtd": 0.42518049478530884, + "loss_sent": 0.14433139562606812, + "loss_sod": 0.1257479041814804, + "loss_total": 0.6952598094940186, + "step": 15299 }, { - "epoch": 0.01702398428211812, - "grad_norm": 1.722645878791809, - "learning_rate": 4.8559712755239114e-05, - "loss": 2.5421, - "step": 43100 + "epoch": 0.0026, + "grad_norm": 1.2007133960723877, + "learning_rate": 8.824116396816904e-05, + "loss": 0.7406, + "step": 15300 }, { - "epoch": 0.017303065991661037, - "grad_norm": 1.5124750137329102, - "learning_rate": 4.855229707429276e-05, - "loss": 2.5246, - "step": 43200 + "epoch": 0.002798, + "loss_gen": 3.122385263442993, + "loss_rtd": 0.4065248966217041, + "loss_sent": 0.09375770390033722, + "loss_sod": 0.21171408891677856, + "loss_total": 0.7119966745376587, + "step": 15399 }, { - "epoch": 0.01758214770120396, - "grad_norm": 1.814021110534668, - "learning_rate": 4.854486292077494e-05, - "loss": 2.443, - "step": 43300 + "epoch": 0.002798, + "loss_gen": 4.033022403717041, + "loss_rtd": 0.42629608511924744, + "loss_sent": 0.4818168878555298, + "loss_sod": 0.17908814549446106, + "loss_total": 1.0872011184692383, + "step": 15399 }, { - "epoch": 0.017861229410746877, - "grad_norm": 1.8268548250198364, - "learning_rate": 4.853741030051641e-05, - "loss": 2.3454, - "step": 43400 + "epoch": 0.0028, + "grad_norm": 1.5872125625610352, + "learning_rate": 8.822071242863424e-05, + "loss": 0.736, + "step": 15400 }, { - "epoch": 0.018140311120289798, - "grad_norm": 1.842734456062317, - "learning_rate": 4.852993921936243e-05, - "loss": 2.3159, - "step": 43500 + "epoch": 0.002998, + "loss_gen": 3.652820348739624, + "loss_rtd": 0.4053482413291931, + "loss_sent": 0.255595862865448, + "loss_sod": 0.07709679007530212, + "loss_total": 0.7380409240722656, + "step": 15499 }, { - "epoch": 0.01841939282983272, - "grad_norm": 1.8235430717468262, - "learning_rate": 4.8522449683172725e-05, - "loss": 2.32, - "step": 43600 + "epoch": 0.002998, + "loss_gen": 4.354541301727295, + "loss_rtd": 0.4241352379322052, + "loss_sent": 0.156655415892601, + "loss_sod": 0.1135091483592987, + "loss_total": 0.6942998170852661, + "step": 15499 }, { - "epoch": 0.018698474539375638, - "grad_norm": 1.8531183004379272, - "learning_rate": 4.8514941697821504e-05, - "loss": 2.2961, - "step": 43700 + "epoch": 0.003, + "grad_norm": 1.6814513206481934, + "learning_rate": 8.820024549380103e-05, + "loss": 0.7249, + "step": 15500 }, { - "epoch": 0.01897755624891856, - "grad_norm": 1.7762449979782104, - "learning_rate": 4.850741526919743e-05, - "loss": 2.2962, - "step": 43800 + "epoch": 0.003198, + "loss_gen": 4.114484786987305, + "loss_rtd": 0.412880003452301, + "loss_sent": 0.3902363181114197, + "loss_sod": 0.024768974632024765, + "loss_total": 0.8278852701187134, + "step": 15599 }, { - "epoch": 0.019256637958461477, - "grad_norm": 1.7467913627624512, - "learning_rate": 4.8499870403203645e-05, - "loss": 2.2753, - "step": 43900 + "epoch": 0.003198, + "loss_gen": 2.7664718627929688, + "loss_rtd": 0.41608572006225586, + "loss_sent": 0.0032381522469222546, + "loss_sod": 0.36812782287597656, + "loss_total": 0.7874516844749451, + "step": 15599 }, { - "epoch": 0.0195357196680044, - "grad_norm": 1.7463856935501099, - "learning_rate": 4.849230710575776e-05, - "loss": 2.2773, - "step": 44000 + "epoch": 0.0032, + "grad_norm": 1.080322265625, + "learning_rate": 8.817976317191352e-05, + "loss": 0.7262, + "step": 15600 }, { - "epoch": 0.0195357196680044, - "eval_loss": 2.527839183807373, - "eval_runtime": 51.7307, - "eval_samples_per_second": 197.059, - "eval_steps_per_second": 1.546, - "step": 44000 + "epoch": 0.003398, + "loss_gen": 4.035771369934082, + "loss_rtd": 0.42975643277168274, + "loss_sent": 0.22751572728157043, + "loss_sod": 0.06185830011963844, + "loss_total": 0.7191304564476013, + "step": 15699 }, { - "epoch": 0.01981480137754732, - "grad_norm": 1.73529851436615, - "learning_rate": 4.8484725382791816e-05, - "loss": 2.2767, - "step": 44100 + "epoch": 0.003398, + "loss_gen": 4.123803615570068, + "loss_rtd": 0.40343937277793884, + "loss_sent": 0.14625126123428345, + "loss_sod": 0.06565766036510468, + "loss_total": 0.6153482794761658, + "step": 15699 }, { - "epoch": 0.020093883087090238, - "grad_norm": 1.7689580917358398, - "learning_rate": 4.847712524025233e-05, - "loss": 2.2713, - "step": 44200 + "epoch": 0.0034, + "grad_norm": 1.0878139734268188, + "learning_rate": 8.815926547122197e-05, + "loss": 0.7328, + "step": 15700 }, { - "epoch": 0.02037296479663316, - "grad_norm": 1.852257490158081, - "learning_rate": 4.846950668410026e-05, - "loss": 2.2602, - "step": 44300 + "epoch": 0.003598, + "loss_gen": 4.012997627258301, + "loss_rtd": 0.4305029809474945, + "loss_sent": 0.22090695798397064, + "loss_sod": 0.0283144973218441, + "loss_total": 0.6797244548797607, + "step": 15799 }, { - "epoch": 0.020652046506176077, - "grad_norm": 1.7741947174072266, - "learning_rate": 4.846186972031099e-05, - "loss": 2.2681, - "step": 44400 + "epoch": 0.003598, + "loss_gen": 3.9291770458221436, + "loss_rtd": 0.4147172272205353, + "loss_sent": 0.19529950618743896, + "loss_sod": 0.07241620123386383, + "loss_total": 0.6824329495429993, + "step": 15799 }, { - "epoch": 0.020931128215719, - "grad_norm": 1.6933701038360596, - "learning_rate": 4.845421435487437e-05, - "loss": 2.2527, - "step": 44500 + "epoch": 0.0036, + "grad_norm": 0.7841458916664124, + "learning_rate": 8.813875239998286e-05, + "loss": 0.7196, + "step": 15800 }, { - "epoch": 0.02121020992526192, - "grad_norm": 1.8038115501403809, - "learning_rate": 4.844654059379467e-05, - "loss": 2.2473, - "step": 44600 + "epoch": 0.003798, + "loss_gen": 4.31983757019043, + "loss_rtd": 0.4128066599369049, + "loss_sent": 0.13574691116809845, + "loss_sod": 0.22686126828193665, + "loss_total": 0.7754148244857788, + "step": 15899 }, { - "epoch": 0.021489291634804838, - "grad_norm": 1.754543662071228, - "learning_rate": 4.843884844309056e-05, - "loss": 2.2485, - "step": 44700 + "epoch": 0.003798, + "loss_gen": 4.224883079528809, + "loss_rtd": 0.42368102073669434, + "loss_sent": 0.3318007290363312, + "loss_sod": 0.10107254981994629, + "loss_total": 0.8565542697906494, + "step": 15899 }, { - "epoch": 0.02176837334434776, - "grad_norm": 1.7147287130355835, - "learning_rate": 4.843113790879517e-05, - "loss": 2.2336, - "step": 44800 + "epoch": 0.0038, + "grad_norm": 1.1637392044067383, + "learning_rate": 8.811822396645881e-05, + "loss": 0.7431, + "step": 15900 }, { - "epoch": 0.022047455053890677, - "grad_norm": 1.7862520217895508, - "learning_rate": 4.842340899695604e-05, - "loss": 2.2509, - "step": 44900 + "epoch": 0.003998, + "loss_gen": 4.401248455047607, + "loss_rtd": 0.4079552888870239, + "loss_sent": 0.1784553825855255, + "loss_sod": 0.05878440663218498, + "loss_total": 0.6451950669288635, + "step": 15999 }, { - "epoch": 0.0223265367634336, - "grad_norm": 1.6962544918060303, - "learning_rate": 4.8415661713635136e-05, - "loss": 2.2442, - "step": 45000 + "epoch": 0.003998, + "loss_gen": 4.055051803588867, + "loss_rtd": 0.42362701892852783, + "loss_sent": 0.28670841455459595, + "loss_sod": 0.0232734065502882, + "loss_total": 0.7336088418960571, + "step": 15999 }, { - "epoch": 0.0223265367634336, - "eval_loss": 2.5255188941955566, - "eval_runtime": 51.7869, - "eval_samples_per_second": 196.845, - "eval_steps_per_second": 1.545, - "step": 45000 + "epoch": 0.004, + "grad_norm": 1.3481870889663696, + "learning_rate": 8.809768017891873e-05, + "loss": 0.7344, + "step": 16000 }, { - "epoch": 0.022605618472976517, - "grad_norm": 1.7465505599975586, - "learning_rate": 4.8407896064908796e-05, - "loss": 2.2274, - "step": 45100 + "epoch": 0.004, + "eval_loss": 0.7216879725456238, + "eval_runtime": 152.7099, + "eval_samples_per_second": 101.126, + "eval_steps_per_second": 0.792, + "step": 16000 }, { - "epoch": 0.022884700182519438, - "grad_norm": 1.7054563760757446, - "learning_rate": 4.84001120568678e-05, - "loss": 2.2297, - "step": 45200 + "epoch": 0.004198, + "loss_gen": 2.5740246772766113, + "loss_rtd": 0.40873804688453674, + "loss_sent": 0.018980562686920166, + "loss_sod": 0.21097786724567413, + "loss_total": 0.6386964917182922, + "step": 16099 }, { - "epoch": 0.02316378189206236, - "grad_norm": 1.6769620180130005, - "learning_rate": 4.83923096956173e-05, - "loss": 2.223, - "step": 45300 + "epoch": 0.004198, + "loss_gen": 3.902594804763794, + "loss_rtd": 0.42422205209732056, + "loss_sent": 0.04060244932770729, + "loss_sod": 0.17612992227077484, + "loss_total": 0.6409544348716736, + "step": 16099 }, { - "epoch": 0.023442863601605277, - "grad_norm": 1.8379839658737183, - "learning_rate": 4.8384488987276854e-05, - "loss": 2.2316, - "step": 45400 + "epoch": 0.0042, + "grad_norm": 0.7013354897499084, + "learning_rate": 8.807712104563763e-05, + "loss": 0.7182, + "step": 16100 }, { - "epoch": 0.0237219453111482, - "grad_norm": 1.8331048488616943, - "learning_rate": 4.837664993798043e-05, - "loss": 2.2272, - "step": 45500 + "epoch": 0.004398, + "loss_gen": 2.7281432151794434, + "loss_rtd": 0.39617347717285156, + "loss_sent": 0.07382925599813461, + "loss_sod": 0.17214326560497284, + "loss_total": 0.6421459913253784, + "step": 16199 }, { - "epoch": 0.024001027020691117, - "grad_norm": 1.7977640628814697, - "learning_rate": 4.836879255387634e-05, - "loss": 2.2328, - "step": 45600 + "epoch": 0.004398, + "loss_gen": 3.9564368724823, + "loss_rtd": 0.4245787560939789, + "loss_sent": 0.13513240218162537, + "loss_sod": 0.1007312759757042, + "loss_total": 0.6604424715042114, + "step": 16199 }, { - "epoch": 0.024280108730234038, - "grad_norm": 1.7152631282806396, - "learning_rate": 4.836091684112732e-05, - "loss": 2.2221, - "step": 45700 + "epoch": 0.0044, + "grad_norm": 1.2529441118240356, + "learning_rate": 8.805654657489672e-05, + "loss": 0.7395, + "step": 16200 }, { - "epoch": 0.02455919043977696, - "grad_norm": 1.638720154762268, - "learning_rate": 4.835302280591045e-05, - "loss": 2.2276, - "step": 45800 + "epoch": 0.004598, + "loss_gen": 4.069423198699951, + "loss_rtd": 0.42716529965400696, + "loss_sent": 0.20060759782791138, + "loss_sod": 0.2048812359571457, + "loss_total": 0.8326541185379028, + "step": 16299 }, { - "epoch": 0.024838272149319877, - "grad_norm": 1.716223955154419, - "learning_rate": 4.8345110454417184e-05, - "loss": 2.2162, - "step": 45900 + "epoch": 0.004598, + "loss_gen": 3.879422187805176, + "loss_rtd": 0.40844979882240295, + "loss_sent": 0.09611134231090546, + "loss_sod": 0.04968631640076637, + "loss_total": 0.5542474389076233, + "step": 16299 }, { - "epoch": 0.0251173538588628, - "grad_norm": 1.6937196254730225, - "learning_rate": 4.833717979285336e-05, - "loss": 2.2159, - "step": 46000 + "epoch": 0.0046, + "grad_norm": 1.2504940032958984, + "learning_rate": 8.803595677498341e-05, + "loss": 0.7247, + "step": 16300 }, { - "epoch": 0.0251173538588628, - "eval_loss": 2.5174078941345215, - "eval_runtime": 51.848, - "eval_samples_per_second": 196.613, - "eval_steps_per_second": 1.543, - "step": 46000 + "epoch": 0.004798, + "loss_gen": 3.7565174102783203, + "loss_rtd": 0.42008933424949646, + "loss_sent": 0.13713136315345764, + "loss_sod": 0.007490306627005339, + "loss_total": 0.5647109746932983, + "step": 16399 }, { - "epoch": 0.00027908170954291995, - "grad_norm": 1.7433758974075317, - "learning_rate": 4.832923082743915e-05, - "loss": 2.2103, - "step": 46100 + "epoch": 0.004798, + "loss_gen": 3.987405776977539, + "loss_rtd": 0.4281255900859833, + "loss_sent": 0.23581212759017944, + "loss_sod": 0.06856285035610199, + "loss_total": 0.7325005531311035, + "step": 16399 }, { - "epoch": 0.0005581634190858399, - "grad_norm": 1.6650173664093018, - "learning_rate": 4.832126356440912e-05, - "loss": 2.2208, - "step": 46200 + "epoch": 0.0048, + "grad_norm": 1.352505087852478, + "learning_rate": 8.801535165419124e-05, + "loss": 0.7326, + "step": 16400 }, { - "epoch": 0.0008372451286287599, - "grad_norm": 1.7328202724456787, - "learning_rate": 4.831327801001215e-05, - "loss": 2.2041, - "step": 46300 + "epoch": 0.004998, + "loss_gen": 4.19617223739624, + "loss_rtd": 0.40191999077796936, + "loss_sent": 0.1798119843006134, + "loss_sod": 0.09213796257972717, + "loss_total": 0.6738699674606323, + "step": 16499 }, { - "epoch": 0.0011163268381716798, - "grad_norm": 1.7945832014083862, - "learning_rate": 4.8305274170511495e-05, - "loss": 2.211, - "step": 46400 + "epoch": 0.004998, + "loss_gen": 3.9264073371887207, + "loss_rtd": 0.42804864048957825, + "loss_sent": 0.10941962897777557, + "loss_sod": 0.09465447813272476, + "loss_total": 0.6321227550506592, + "step": 16499 }, { - "epoch": 0.0013954085477146, - "grad_norm": 1.8517906665802002, - "learning_rate": 4.829725205218472e-05, - "loss": 2.2047, - "step": 46500 + "epoch": 0.005, + "grad_norm": 1.0817756652832031, + "learning_rate": 8.799473122081999e-05, + "loss": 0.7278, + "step": 16500 }, { - "epoch": 0.0016744902572575198, - "grad_norm": 1.7758690118789673, - "learning_rate": 4.828921166132376e-05, - "loss": 2.208, - "step": 46600 + "epoch": 0.005198, + "loss_gen": 4.115394115447998, + "loss_rtd": 0.4185012876987457, + "loss_sent": 0.3424651622772217, + "loss_sod": 0.04793938249349594, + "loss_total": 0.808905839920044, + "step": 16599 }, { - "epoch": 0.00195357196680044, - "grad_norm": 1.693491816520691, - "learning_rate": 4.828115300423485e-05, - "loss": 2.2021, - "step": 46700 + "epoch": 0.005198, + "loss_gen": 3.8694446086883545, + "loss_rtd": 0.425730437040329, + "loss_sent": 0.35807597637176514, + "loss_sod": 0.08995547145605087, + "loss_total": 0.8737618923187256, + "step": 16599 }, { - "epoch": 0.0022326536763433596, - "grad_norm": 1.726741075515747, - "learning_rate": 4.8273076087238584e-05, - "loss": 2.2052, - "step": 46800 + "epoch": 0.0052, + "grad_norm": 3.3067049980163574, + "learning_rate": 8.797409548317555e-05, + "loss": 0.7214, + "step": 16600 }, { - "epoch": 0.0025117353858862797, - "grad_norm": 1.670870304107666, - "learning_rate": 4.8264980916669855e-05, - "loss": 2.2057, - "step": 46900 + "epoch": 0.005398, + "loss_gen": 4.149951457977295, + "loss_rtd": 0.41732972860336304, + "loss_sent": 0.12850509583950043, + "loss_sod": 0.16574223339557648, + "loss_total": 0.7115770578384399, + "step": 16699 }, { - "epoch": 0.0027908170954292, - "grad_norm": 1.5561637878417969, - "learning_rate": 4.8256867498877874e-05, - "loss": 2.1926, - "step": 47000 + "epoch": 0.005398, + "loss_gen": 3.211554527282715, + "loss_rtd": 0.4234715402126312, + "loss_sent": 0.120115265250206, + "loss_sod": 0.2308167815208435, + "loss_total": 0.7744035720825195, + "step": 16699 }, { - "epoch": 0.0027908170954292, - "eval_loss": 2.506756544113159, - "eval_runtime": 53.839, - "eval_samples_per_second": 189.342, - "eval_steps_per_second": 1.486, - "step": 47000 + "epoch": 0.0054, + "grad_norm": 1.411970853805542, + "learning_rate": 8.795344444956998e-05, + "loss": 0.719, + "step": 16700 }, { - "epoch": 0.00306989880497212, - "grad_norm": 1.6574453115463257, - "learning_rate": 4.8248735840226163e-05, - "loss": 2.1965, - "step": 47100 + "epoch": 0.005598, + "loss_gen": 2.7611095905303955, + "loss_rtd": 0.40610721707344055, + "loss_sent": 0.04416600242257118, + "loss_sod": 0.39784079790115356, + "loss_total": 0.8481140732765198, + "step": 16799 }, { - "epoch": 0.0033489805145150396, - "grad_norm": 1.6572227478027344, - "learning_rate": 4.824058594709258e-05, - "loss": 2.192, - "step": 47200 + "epoch": 0.005598, + "loss_gen": 3.9168972969055176, + "loss_rtd": 0.3954412043094635, + "loss_sent": 0.38528358936309814, + "loss_sod": 0.04784262180328369, + "loss_total": 0.828567385673523, + "step": 16799 }, { - "epoch": 0.0036280622240579597, - "grad_norm": 1.8155920505523682, - "learning_rate": 4.823241782586925e-05, - "loss": 2.1908, - "step": 47300 + "epoch": 0.0056, + "grad_norm": 3.84426212310791, + "learning_rate": 8.793277812832153e-05, + "loss": 0.717, + "step": 16800 }, { - "epoch": 0.00390714393360088, - "grad_norm": 1.7016464471817017, - "learning_rate": 4.822423148296259e-05, - "loss": 2.1816, - "step": 47400 + "epoch": 0.005798, + "loss_gen": 2.4084560871124268, + "loss_rtd": 0.39277756214141846, + "loss_sent": 0.00010122371168108657, + "loss_sod": 0.40928834676742554, + "loss_total": 0.8021671175956726, + "step": 16899 }, { - "epoch": 0.0041862256431437995, - "grad_norm": 1.7593576908111572, - "learning_rate": 4.8216026924793335e-05, - "loss": 2.1899, - "step": 47500 + "epoch": 0.005798, + "loss_gen": 2.609027624130249, + "loss_rtd": 0.40788689255714417, + "loss_sent": 0.021917378529906273, + "loss_sod": 0.16589200496673584, + "loss_total": 0.5956962704658508, + "step": 16899 }, { - "epoch": 0.004465307352686719, - "grad_norm": 1.7329928874969482, - "learning_rate": 4.82078041577965e-05, - "loss": 2.1865, - "step": 47600 + "epoch": 0.0058, + "grad_norm": 0.5878534913063049, + "learning_rate": 8.791209652775459e-05, + "loss": 0.7236, + "step": 16900 }, { - "epoch": 0.00474438906222964, - "grad_norm": 1.7763489484786987, - "learning_rate": 4.819956318842138e-05, - "loss": 2.1887, - "step": 47700 + "epoch": 0.005998, + "loss_gen": 4.018768787384033, + "loss_rtd": 0.42293936014175415, + "loss_sent": 0.12526118755340576, + "loss_sod": 0.08860775828361511, + "loss_total": 0.6368082761764526, + "step": 16999 }, { - "epoch": 0.005023470771772559, - "grad_norm": 1.6088265180587769, - "learning_rate": 4.819130402313153e-05, - "loss": 2.1819, - "step": 47800 + "epoch": 0.005998, + "loss_gen": 3.610734462738037, + "loss_rtd": 0.42120155692100525, + "loss_sent": 0.18627098202705383, + "loss_sod": 0.08135885000228882, + "loss_total": 0.6888313293457031, + "step": 16999 }, { - "epoch": 0.00530255248131548, - "grad_norm": 1.5476899147033691, - "learning_rate": 4.8183026668404805e-05, - "loss": 2.1779, - "step": 47900 + "epoch": 0.006, + "grad_norm": 0.8335890769958496, + "learning_rate": 8.789139965619968e-05, + "loss": 0.7083, + "step": 17000 }, { - "epoch": 0.0055816341908584, - "grad_norm": 1.824232578277588, - "learning_rate": 4.8174731130733304e-05, - "loss": 2.1795, - "step": 48000 + "epoch": 0.006, + "eval_loss": 0.7154964804649353, + "eval_runtime": 151.3533, + "eval_samples_per_second": 102.033, + "eval_steps_per_second": 0.799, + "step": 17000 }, { - "epoch": 0.0055816341908584, - "eval_loss": 2.5157690048217773, - "eval_runtime": 51.5425, - "eval_samples_per_second": 197.778, - "eval_steps_per_second": 1.552, - "step": 48000 + "epoch": 0.006198, + "loss_gen": 2.9088780879974365, + "loss_rtd": 0.4015282094478607, + "loss_sent": 0.03383695334196091, + "loss_sod": 0.19729620218276978, + "loss_total": 0.6326613426208496, + "step": 17099 }, { - "epoch": 0.005860715900401319, - "grad_norm": 1.7068545818328857, - "learning_rate": 4.8166417416623394e-05, - "loss": 2.1692, - "step": 48100 + "epoch": 0.006198, + "loss_gen": 3.805980920791626, + "loss_rtd": 0.41610702872276306, + "loss_sent": 0.3630307912826538, + "loss_sod": 0.033853236585855484, + "loss_total": 0.8129910230636597, + "step": 17099 }, { - "epoch": 0.00613979760994424, - "grad_norm": 1.5785512924194336, - "learning_rate": 4.81580855325957e-05, - "loss": 2.1854, - "step": 48200 + "epoch": 0.0062, + "grad_norm": 1.3783321380615234, + "learning_rate": 8.787068752199353e-05, + "loss": 0.731, + "step": 17100 }, { - "epoch": 0.0064188793194871595, - "grad_norm": 1.620144248008728, - "learning_rate": 4.814973548518511e-05, - "loss": 2.1706, - "step": 48300 + "epoch": 0.006398, + "loss_gen": 4.188808917999268, + "loss_rtd": 0.43117642402648926, + "loss_sent": 0.0706799104809761, + "loss_sod": 0.08008696883916855, + "loss_total": 0.5819432735443115, + "step": 17199 }, { - "epoch": 0.006697961029030079, - "grad_norm": 1.7049171924591064, - "learning_rate": 4.814136728094072e-05, - "loss": 2.1706, - "step": 48400 + "epoch": 0.006398, + "loss_gen": 3.1088032722473145, + "loss_rtd": 0.42036527395248413, + "loss_sent": 0.04297754913568497, + "loss_sod": 0.12957589328289032, + "loss_total": 0.5929187536239624, + "step": 17199 }, { - "epoch": 0.006977042738573, - "grad_norm": 1.7441779375076294, - "learning_rate": 4.813298092642591e-05, - "loss": 2.1713, - "step": 48500 + "epoch": 0.0064, + "grad_norm": 1.0201504230499268, + "learning_rate": 8.7849960133479e-05, + "loss": 0.719, + "step": 17200 }, { - "epoch": 0.0072561244481159195, - "grad_norm": 1.75383460521698, - "learning_rate": 4.8124576428218277e-05, - "loss": 2.1885, - "step": 48600 + "epoch": 0.006598, + "loss_gen": 3.864455223083496, + "loss_rtd": 0.41520410776138306, + "loss_sent": 0.37994444370269775, + "loss_sod": 0.05261944234371185, + "loss_total": 0.8477680087089539, + "step": 17299 }, { - "epoch": 0.007535206157658839, - "grad_norm": 1.6548856496810913, - "learning_rate": 4.8116153792909646e-05, - "loss": 2.1724, - "step": 48700 + "epoch": 0.006598, + "loss_gen": 3.6532490253448486, + "loss_rtd": 0.4261130690574646, + "loss_sent": 0.1113138422369957, + "loss_sod": 0.01775341108441353, + "loss_total": 0.5551803112030029, + "step": 17299 }, { - "epoch": 0.00781428786720176, - "grad_norm": 1.8267414569854736, - "learning_rate": 4.8107713027106066e-05, - "loss": 2.1808, - "step": 48800 + "epoch": 0.0066, + "grad_norm": 1.0252410173416138, + "learning_rate": 8.782921749900501e-05, + "loss": 0.7198, + "step": 17300 }, { - "epoch": 0.00809336957674468, - "grad_norm": 1.6873103380203247, - "learning_rate": 4.809925413742782e-05, - "loss": 2.1654, - "step": 48900 + "epoch": 0.006798, + "loss_gen": 3.8775410652160645, + "loss_rtd": 0.43246281147003174, + "loss_sent": 0.39332959055900574, + "loss_sod": 0.0777227133512497, + "loss_total": 0.903515100479126, + "step": 17399 }, { - "epoch": 0.008372451286287599, - "grad_norm": 1.7922857999801636, - "learning_rate": 4.809077713050939e-05, - "loss": 2.1694, - "step": 49000 + "epoch": 0.006798, + "loss_gen": 3.4301466941833496, + "loss_rtd": 0.4230908453464508, + "loss_sent": 0.2612267732620239, + "loss_sod": 0.13481563329696655, + "loss_total": 0.8191332817077637, + "step": 17399 }, { - "epoch": 0.008372451286287599, - "eval_loss": 2.5077826976776123, - "eval_runtime": 51.5956, - "eval_samples_per_second": 197.575, - "eval_steps_per_second": 1.551, - "step": 49000 + "epoch": 0.0068, + "grad_norm": 1.6101447343826294, + "learning_rate": 8.780845962692677e-05, + "loss": 0.7277, + "step": 17400 }, { - "epoch": 0.008651532995830519, - "grad_norm": 1.796513319015503, - "learning_rate": 4.808228201299947e-05, - "loss": 2.1731, - "step": 49100 + "epoch": 0.006998, + "loss_gen": 3.934807777404785, + "loss_rtd": 0.42464226484298706, + "loss_sent": 0.15972498059272766, + "loss_sod": 0.1780788153409958, + "loss_total": 0.7624460458755493, + "step": 17499 }, { - "epoch": 0.008930614705373438, - "grad_norm": 1.6628786325454712, - "learning_rate": 4.807376879156097e-05, - "loss": 2.158, - "step": 49200 + "epoch": 0.006998, + "loss_gen": 4.128389358520508, + "loss_rtd": 0.425221711397171, + "loss_sent": 0.13899770379066467, + "loss_sod": 0.045398104935884476, + "loss_total": 0.6096175312995911, + "step": 17499 }, { - "epoch": 0.00920969641491636, - "grad_norm": 1.7021812200546265, - "learning_rate": 4.806523747287099e-05, - "loss": 2.1536, - "step": 49300 + "epoch": 0.007, + "grad_norm": 1.1656321287155151, + "learning_rate": 8.778768652560551e-05, + "loss": 0.7327, + "step": 17500 }, { - "epoch": 0.00948877812445928, - "grad_norm": 1.6801835298538208, - "learning_rate": 4.805668806362083e-05, - "loss": 2.1621, - "step": 49400 + "epoch": 0.007198, + "loss_gen": 4.306458473205566, + "loss_rtd": 0.4018927216529846, + "loss_sent": 0.27341505885124207, + "loss_sod": 0.028834078460931778, + "loss_total": 0.7041418552398682, + "step": 17599 }, { - "epoch": 0.0097678598340022, - "grad_norm": 1.5964114665985107, - "learning_rate": 4.804812057051597e-05, - "loss": 2.1627, - "step": 49500 + "epoch": 0.007198, + "loss_gen": 3.6742680072784424, + "loss_rtd": 0.42321765422821045, + "loss_sent": 0.17562679946422577, + "loss_sod": 0.14957062900066376, + "loss_total": 0.7484151124954224, + "step": 17599 }, { - "epoch": 0.010046941543545119, - "grad_norm": 1.6952677965164185, - "learning_rate": 4.803953500027608e-05, - "loss": 2.1678, - "step": 49600 + "epoch": 0.0072, + "grad_norm": 2.1263222694396973, + "learning_rate": 8.776689820340862e-05, + "loss": 0.7369, + "step": 17600 }, { - "epoch": 0.010326023253088039, - "grad_norm": 1.6913024187088013, - "learning_rate": 4.8030931359635e-05, - "loss": 2.1582, - "step": 49700 + "epoch": 0.007398, + "loss_gen": 3.879549980163574, + "loss_rtd": 0.40929073095321655, + "loss_sent": 0.2122492492198944, + "loss_sod": 0.012041620910167694, + "loss_total": 0.6335816383361816, + "step": 17699 }, { - "epoch": 0.01060510496263096, - "grad_norm": 1.811557412147522, - "learning_rate": 4.802230965534077e-05, - "loss": 2.1526, - "step": 49800 + "epoch": 0.007398, + "loss_gen": 4.083215236663818, + "loss_rtd": 0.4366784691810608, + "loss_sent": 0.16557861864566803, + "loss_sod": 0.016638852655887604, + "loss_total": 0.618895947933197, + "step": 17699 }, { - "epoch": 0.01088418667217388, - "grad_norm": 1.6537189483642578, - "learning_rate": 4.801366989415556e-05, - "loss": 2.1591, - "step": 49900 + "epoch": 0.0074, + "grad_norm": 0.6774116158485413, + "learning_rate": 8.774609466870966e-05, + "loss": 0.7327, + "step": 17700 }, { - "epoch": 0.0111632683817168, - "grad_norm": 1.6723755598068237, - "learning_rate": 4.800501208285572e-05, - "loss": 2.1547, - "step": 50000 + "epoch": 0.007598, + "loss_gen": 4.151714324951172, + "loss_rtd": 0.4287411570549011, + "loss_sent": 0.18538439273834229, + "loss_sod": 0.02048429846763611, + "loss_total": 0.6346098184585571, + "step": 17799 }, { - "epoch": 0.0111632683817168, - "eval_loss": 2.5098140239715576, - "eval_runtime": 51.5299, - "eval_samples_per_second": 197.827, - "eval_steps_per_second": 1.552, - "step": 50000 + "epoch": 0.007598, + "loss_gen": 4.426347732543945, + "loss_rtd": 0.42809152603149414, + "loss_sent": 0.19042737782001495, + "loss_sod": 0.07416960597038269, + "loss_total": 0.692688524723053, + "step": 17799 }, { - "epoch": 0.011442350091259719, - "grad_norm": 1.5575144290924072, - "learning_rate": 4.799633622823176e-05, - "loss": 2.1564, - "step": 50100 + "epoch": 0.0076, + "grad_norm": 2.4827094078063965, + "learning_rate": 8.772527592988829e-05, + "loss": 0.7175, + "step": 17800 }, { - "epoch": 0.011721431800802639, - "grad_norm": 1.7833404541015625, - "learning_rate": 4.798764233708834e-05, - "loss": 2.1546, - "step": 50200 + "epoch": 0.007798, + "loss_gen": 4.004205703735352, + "loss_rtd": 0.41530540585517883, + "loss_sent": 0.16743431985378265, + "loss_sod": 0.18945026397705078, + "loss_total": 0.7721899747848511, + "step": 17899 }, { - "epoch": 0.012000513510345558, - "grad_norm": 1.711753010749817, - "learning_rate": 4.7978930416244267e-05, - "loss": 2.1518, - "step": 50300 + "epoch": 0.007798, + "loss_gen": 3.50095534324646, + "loss_rtd": 0.4054555594921112, + "loss_sent": 0.00267047923989594, + "loss_sod": 0.399612694978714, + "loss_total": 0.807738721370697, + "step": 17899 }, { - "epoch": 0.01227959521988848, - "grad_norm": 1.6810834407806396, - "learning_rate": 4.7970200472532485e-05, - "loss": 2.1462, - "step": 50400 + "epoch": 0.0078, + "grad_norm": 1.0332685708999634, + "learning_rate": 8.770444199533028e-05, + "loss": 0.7162, + "step": 17900 }, { - "epoch": 0.0125586769294314, - "grad_norm": 1.6707885265350342, - "learning_rate": 4.7961452512800075e-05, - "loss": 2.1505, - "step": 50500 + "epoch": 0.007998, + "loss_gen": 4.209635257720947, + "loss_rtd": 0.39917030930519104, + "loss_sent": 0.11306151747703552, + "loss_sod": 0.03569847345352173, + "loss_total": 0.5479303002357483, + "step": 17999 }, { - "epoch": 0.012837758638974319, - "grad_norm": 1.641190767288208, - "learning_rate": 4.795268654390825e-05, - "loss": 2.1549, - "step": 50600 + "epoch": 0.007998, + "loss_gen": 3.763282299041748, + "loss_rtd": 0.4105561375617981, + "loss_sent": 0.1345166563987732, + "loss_sod": 0.04830792546272278, + "loss_total": 0.5933806896209717, + "step": 17999 }, { - "epoch": 0.013116840348517239, - "grad_norm": 1.6623954772949219, - "learning_rate": 4.7943902572732344e-05, - "loss": 2.1477, - "step": 50700 + "epoch": 0.008, + "grad_norm": 1.0159022808074951, + "learning_rate": 8.768359287342754e-05, + "loss": 0.7193, + "step": 18000 }, { - "epoch": 0.013395922058060158, - "grad_norm": 1.7293742895126343, - "learning_rate": 4.793510060616182e-05, - "loss": 2.1338, - "step": 50800 + "epoch": 0.008, + "eval_loss": 0.7110997438430786, + "eval_runtime": 151.5607, + "eval_samples_per_second": 101.893, + "eval_steps_per_second": 0.798, + "step": 18000 }, { - "epoch": 0.013675003767603078, - "grad_norm": 1.7338817119598389, - "learning_rate": 4.7926280651100255e-05, - "loss": 2.1423, - "step": 50900 + "epoch": 0.008198, + "loss_gen": 4.019419193267822, + "loss_rtd": 0.4001118540763855, + "loss_sent": 0.09835583716630936, + "loss_sod": 0.098896324634552, + "loss_total": 0.5973640084266663, + "step": 18099 }, { - "epoch": 0.013954085477146, - "grad_norm": 1.7197438478469849, - "learning_rate": 4.791744271446531e-05, - "loss": 2.1333, - "step": 51000 + "epoch": 0.008198, + "loss_gen": 4.136058807373047, + "loss_rtd": 0.42017704248428345, + "loss_sent": 0.17160357534885406, + "loss_sod": 0.02384444698691368, + "loss_total": 0.6156250834465027, + "step": 18099 }, { - "epoch": 0.013954085477146, - "eval_loss": 2.5122387409210205, - "eval_runtime": 51.4468, - "eval_samples_per_second": 198.147, - "eval_steps_per_second": 1.555, - "step": 51000 + "epoch": 0.0082, + "grad_norm": 0.8614924550056458, + "learning_rate": 8.766272857257808e-05, + "loss": 0.7178, + "step": 18100 }, { - "epoch": 0.01423316718668892, - "grad_norm": 1.1121740341186523, - "learning_rate": 4.790858680318878e-05, - "loss": 2.0646, - "step": 51100 + "epoch": 0.008398, + "loss_gen": 3.8973662853240967, + "loss_rtd": 0.4116802215576172, + "loss_sent": 0.22857199609279633, + "loss_sod": 0.08869585394859314, + "loss_total": 0.7289481163024902, + "step": 18199 }, { - "epoch": 0.014512248896231839, - "grad_norm": 1.1387221813201904, - "learning_rate": 4.7899712924216545e-05, - "loss": 1.9953, - "step": 51200 + "epoch": 0.008398, + "loss_gen": 4.000563621520996, + "loss_rtd": 0.4114419221878052, + "loss_sent": 0.11508604139089584, + "loss_sod": 0.019892679527401924, + "loss_total": 0.5464206337928772, + "step": 18199 }, { - "epoch": 0.014791330605774759, - "grad_norm": 1.2224198579788208, - "learning_rate": 4.789082108450859e-05, - "loss": 1.9696, - "step": 51300 + "epoch": 0.0084, + "grad_norm": 0.9504892826080322, + "learning_rate": 8.764184910118609e-05, + "loss": 0.7141, + "step": 18200 }, { - "epoch": 0.015070412315317678, - "grad_norm": 1.130934476852417, - "learning_rate": 4.788191129103895e-05, - "loss": 1.9529, - "step": 51400 + "epoch": 0.008598, + "loss_gen": 4.2602105140686035, + "loss_rtd": 0.4248410165309906, + "loss_sent": 0.1012086570262909, + "loss_sod": 0.13853920996189117, + "loss_total": 0.6645889282226562, + "step": 18299 }, { - "epoch": 0.015349494024860598, - "grad_norm": 1.211732029914856, - "learning_rate": 4.787298355079578e-05, - "loss": 1.9375, - "step": 51500 + "epoch": 0.008598, + "loss_gen": 2.558227062225342, + "loss_rtd": 0.39907151460647583, + "loss_sent": 0.007993497885763645, + "loss_sod": 0.31753554940223694, + "loss_total": 0.724600613117218, + "step": 18299 }, { - "epoch": 0.01562857573440352, - "grad_norm": 1.1475533246994019, - "learning_rate": 4.78640378707813e-05, - "loss": 1.9268, - "step": 51600 + "epoch": 0.0086, + "grad_norm": 1.2532877922058105, + "learning_rate": 8.762095446766176e-05, + "loss": 0.7124, + "step": 18300 }, { - "epoch": 0.015907657443946437, - "grad_norm": 1.0950398445129395, - "learning_rate": 4.78550742580118e-05, - "loss": 1.9179, - "step": 51700 + "epoch": 0.008798, + "loss_gen": 3.8672940731048584, + "loss_rtd": 0.41844481229782104, + "loss_sent": 0.2934986352920532, + "loss_sod": 0.10334864258766174, + "loss_total": 0.8152921199798584, + "step": 18399 }, { - "epoch": 0.01618673915348936, - "grad_norm": 1.1658161878585815, - "learning_rate": 4.784609271951763e-05, - "loss": 1.9087, - "step": 51800 + "epoch": 0.008798, + "loss_gen": 3.9968318939208984, + "loss_rtd": 0.40939950942993164, + "loss_sent": 0.17195989191532135, + "loss_sod": 0.03714260458946228, + "loss_total": 0.6185020208358765, + "step": 18399 }, { - "epoch": 0.01646582086303228, - "grad_norm": 1.1191028356552124, - "learning_rate": 4.783709326234321e-05, - "loss": 1.9045, - "step": 51900 + "epoch": 0.0088, + "grad_norm": 1.0843919515609741, + "learning_rate": 8.760004468042147e-05, + "loss": 0.7133, + "step": 18400 }, { - "epoch": 0.016744902572575198, - "grad_norm": 1.3949638605117798, - "learning_rate": 4.7828075893547e-05, - "loss": 1.8949, - "step": 52000 + "epoch": 0.008998, + "loss_gen": 3.860063314437866, + "loss_rtd": 0.4077662527561188, + "loss_sent": 0.22858792543411255, + "loss_sod": 0.07041435688734055, + "loss_total": 0.7067685127258301, + "step": 18499 }, { - "epoch": 0.016744902572575198, - "eval_loss": 2.4026219844818115, - "eval_runtime": 51.6144, - "eval_samples_per_second": 197.503, - "eval_steps_per_second": 1.55, - "step": 52000 + "epoch": 0.008998, + "loss_gen": 3.951281785964966, + "loss_rtd": 0.4170241951942444, + "loss_sent": 0.1347436010837555, + "loss_sod": 0.02688794955611229, + "loss_total": 0.5786557197570801, + "step": 18499 }, { - "epoch": 0.01702398428211812, - "grad_norm": 1.1054247617721558, - "learning_rate": 4.781904062020151e-05, - "loss": 1.8887, - "step": 52100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.081113576889038, - "learning_rate": 4.780998744939331e-05, - "loss": 1.8907, - "step": 52200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.1169079542160034, - "learning_rate": 4.780091638822299e-05, - "loss": 1.8776, - "step": 52300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.1226317882537842, - "learning_rate": 4.779182744380518e-05, - "loss": 1.881, - "step": 52400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.3107126951217651, - "learning_rate": 4.7782720623268534e-05, - "loss": 1.879, - "step": 52500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.2411494255065918, - "learning_rate": 4.7773595933755723e-05, - "loss": 1.8714, - "step": 52600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.1438442468643188, - "learning_rate": 4.776445338242344e-05, - "loss": 1.8674, - "step": 52700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.1175483465194702, - "learning_rate": 4.775529297644239e-05, - "loss": 1.8695, - "step": 52800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.1415314674377441, - "learning_rate": 4.774611472299726e-05, - "loss": 1.8605, - "step": 52900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.1231484413146973, - "learning_rate": 4.7736918629286766e-05, - "loss": 1.8568, - "step": 53000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.392717123031616, - "eval_runtime": 51.8009, - "eval_samples_per_second": 196.792, - "eval_steps_per_second": 1.544, - "step": 53000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.127394437789917, - "learning_rate": 4.772770470252359e-05, - "loss": 1.8553, - "step": 53100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.2071884870529175, - "learning_rate": 4.7718472949934424e-05, - "loss": 1.8599, - "step": 53200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.171433448791504, - "learning_rate": 4.7709223378759914e-05, - "loss": 1.8533, - "step": 53300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.095151662826538, - "learning_rate": 4.769995599625471e-05, - "loss": 1.8414, - "step": 53400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.2364022731781006, - "learning_rate": 4.769067080968742e-05, - "loss": 1.8406, - "step": 53500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.2860286235809326, - "learning_rate": 4.768136782634061e-05, - "loss": 1.8435, - "step": 53600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.1113066673278809, - "learning_rate": 4.76720470535108e-05, - "loss": 1.8388, - "step": 53700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.4060981273651123, - "learning_rate": 4.7662708498508484e-05, - "loss": 1.8311, - "step": 53800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.1273114681243896, - "learning_rate": 4.7653352168658086e-05, - "loss": 1.8258, - "step": 53900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.2581157684326172, - "learning_rate": 4.764397807129798e-05, - "loss": 1.8323, - "step": 54000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.3841710090637207, - "eval_runtime": 51.7139, - "eval_samples_per_second": 197.123, - "eval_steps_per_second": 1.547, - "step": 54000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.15262770652771, - "learning_rate": 4.763458621378047e-05, - "loss": 1.8296, - "step": 54100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.1026790142059326, - "learning_rate": 4.76251766034718e-05, - "loss": 1.8262, - "step": 54200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.2467550039291382, - "learning_rate": 4.761574924775211e-05, - "loss": 1.8183, - "step": 54300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.1515969038009644, - "learning_rate": 4.760630415401548e-05, - "loss": 1.8248, - "step": 54400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 1.0883244276046753, - "learning_rate": 4.7596841329669916e-05, - "loss": 1.8169, - "step": 54500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 1.0941274166107178, - "learning_rate": 4.75873607821373e-05, - "loss": 1.8133, - "step": 54600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.1419379711151123, - "learning_rate": 4.757786251885344e-05, - "loss": 1.8129, - "step": 54700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.1331219673156738, - "learning_rate": 4.756834654726802e-05, - "loss": 1.8229, - "step": 54800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 1.1496691703796387, - "learning_rate": 4.755881287484461e-05, - "loss": 1.8102, - "step": 54900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.0749887228012085, - "learning_rate": 4.754926150906069e-05, - "loss": 1.8093, - "step": 55000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.3733201026916504, - "eval_runtime": 51.6128, - "eval_samples_per_second": 197.509, - "eval_steps_per_second": 1.55, - "step": 55000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.1389296054840088, - "learning_rate": 4.753969245740759e-05, - "loss": 1.808, - "step": 55100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.1606720685958862, - "learning_rate": 4.753010572739054e-05, - "loss": 1.8092, - "step": 55200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.1364072561264038, - "learning_rate": 4.75205013265286e-05, - "loss": 1.8067, - "step": 55300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.1075749397277832, - "learning_rate": 4.7510879262354715e-05, - "loss": 1.8011, - "step": 55400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.3133985996246338, - "learning_rate": 4.7501239542415666e-05, - "loss": 1.8057, - "step": 55500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.1680502891540527, - "learning_rate": 4.749158217427211e-05, - "loss": 1.7985, - "step": 55600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.1045200824737549, - "learning_rate": 4.748190716549851e-05, - "loss": 1.7947, - "step": 55700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.7189477682113647, - "learning_rate": 4.74722145236832e-05, - "loss": 2.1046, - "step": 55800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.6057113409042358, - "learning_rate": 4.746250425642831e-05, - "loss": 2.2521, - "step": 55900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.7283493280410767, - "learning_rate": 4.7452776371349824e-05, - "loss": 2.2374, - "step": 56000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.362111806869507, - "eval_runtime": 51.6988, - "eval_samples_per_second": 197.181, - "eval_steps_per_second": 1.547, - "step": 56000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.4895098209381104, - "learning_rate": 4.744303087607752e-05, - "loss": 2.234, - "step": 56100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 1.7794862985610962, - "learning_rate": 4.7433267778255005e-05, - "loss": 2.2276, - "step": 56200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.7328559160232544, - "learning_rate": 4.7423487085539684e-05, - "loss": 2.2177, - "step": 56300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.5167728662490845, - "learning_rate": 4.7413688805602774e-05, - "loss": 2.2112, - "step": 56400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.7203041315078735, - "learning_rate": 4.7403872946129255e-05, - "loss": 2.213, - "step": 56500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 1.7185243368148804, - "learning_rate": 4.739403951481793e-05, - "loss": 2.213, - "step": 56600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 1.630519986152649, - "learning_rate": 4.738418851938138e-05, - "loss": 2.206, - "step": 56700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.704929232597351, - "learning_rate": 4.737431996754593e-05, - "loss": 2.2065, - "step": 56800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.7357486486434937, - "learning_rate": 4.7364433867051704e-05, - "loss": 2.1935, - "step": 56900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 1.681195616722107, - "learning_rate": 4.73545302256526e-05, - "loss": 2.2023, - "step": 57000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.3744301795959473, - "eval_runtime": 51.6827, - "eval_samples_per_second": 197.242, - "eval_steps_per_second": 1.548, - "step": 57000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 1.4501374959945679, - "learning_rate": 4.734460905111622e-05, - "loss": 2.1934, - "step": 57100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 1.6807043552398682, - "learning_rate": 4.7334670351223984e-05, - "loss": 2.2011, - "step": 57200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.7063562870025635, - "learning_rate": 4.732471413377103e-05, - "loss": 2.2039, - "step": 57300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 1.7342766523361206, - "learning_rate": 4.7314740406566205e-05, - "loss": 2.1887, - "step": 57400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 1.7052456140518188, - "learning_rate": 4.730474917743214e-05, - "loss": 2.1974, - "step": 57500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 1.668006181716919, - "learning_rate": 4.7294740454205164e-05, - "loss": 2.1889, - "step": 57600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 1.637566089630127, - "learning_rate": 4.728471424473533e-05, - "loss": 2.1846, - "step": 57700 + "epoch": 0.009, + "grad_norm": 0.7378911375999451, + "learning_rate": 8.75791197478877e-05, + "loss": 0.7178, + "step": 18500 }, { - "epoch": 0.03293164172606456, - "grad_norm": 1.4663089513778687, - "learning_rate": 4.7274670556886413e-05, - "loss": 2.1818, - "step": 57800 + "epoch": 0.009198, + "loss_gen": 4.030672073364258, + "loss_rtd": 0.42401623725891113, + "loss_sent": 0.0964384451508522, + "loss_sod": 0.010999690741300583, + "loss_total": 0.5314543843269348, + "step": 18599 }, { - "epoch": 0.03321072343560748, - "grad_norm": 1.6797181367874146, - "learning_rate": 4.72646093985359e-05, - "loss": 2.188, - "step": 57900 + "epoch": 0.009198, + "loss_gen": 4.139626502990723, + "loss_rtd": 0.4047938287258148, + "loss_sent": 0.2407665252685547, + "loss_sod": 0.09544012695550919, + "loss_total": 0.7410004734992981, + "step": 18599 }, { - "epoch": 0.033489805145150396, - "grad_norm": 1.6873514652252197, - "learning_rate": 4.725453077757496e-05, - "loss": 2.1786, - "step": 58000 + "epoch": 0.0092, + "grad_norm": 1.2465193271636963, + "learning_rate": 8.755817967848897e-05, + "loss": 0.7071, + "step": 18600 }, { - "epoch": 0.033489805145150396, - "eval_loss": 2.3785526752471924, - "eval_runtime": 51.6733, - "eval_samples_per_second": 197.278, - "eval_steps_per_second": 1.548, - "step": 58000 + "epoch": 0.009398, + "loss_gen": 4.339874744415283, + "loss_rtd": 0.425040602684021, + "loss_sent": 0.16886325180530548, + "loss_sod": 0.03933661803603172, + "loss_total": 0.6332404613494873, + "step": 18699 }, { - "epoch": 0.00027908170954291995, - "grad_norm": 1.725947618484497, - "learning_rate": 4.7244434701908466e-05, - "loss": 2.1771, - "step": 58100 + "epoch": 0.009398, + "loss_gen": 4.066360950469971, + "loss_rtd": 0.42420873045921326, + "loss_sent": 0.385574609041214, + "loss_sod": 0.0640186294913292, + "loss_total": 0.8738019466400146, + "step": 18699 }, { - "epoch": 0.0005581634190858399, - "grad_norm": 1.6296665668487549, - "learning_rate": 4.7234321179455e-05, - "loss": 2.184, - "step": 58200 + "epoch": 0.0094, + "grad_norm": 1.517830491065979, + "learning_rate": 8.753722448065996e-05, + "loss": 0.713, + "step": 18700 }, { - "epoch": 0.0008372451286287599, - "grad_norm": 1.6986839771270752, - "learning_rate": 4.722419021814682e-05, - "loss": 2.1744, - "step": 58300 + "epoch": 0.009598, + "loss_gen": 4.080881595611572, + "loss_rtd": 0.4158501625061035, + "loss_sent": 0.04737195372581482, + "loss_sod": 0.004243885166943073, + "loss_total": 0.46746599674224854, + "step": 18799 }, { - "epoch": 0.0011163268381716798, - "grad_norm": 1.3352227210998535, - "learning_rate": 4.7214041825929844e-05, - "loss": 2.1793, - "step": 58400 + "epoch": 0.009598, + "loss_gen": 2.9473581314086914, + "loss_rtd": 0.40688350796699524, + "loss_sent": 0.005198832601308823, + "loss_sod": 0.21468880772590637, + "loss_total": 0.6267711520195007, + "step": 18799 }, { - "epoch": 0.0013954085477146, - "grad_norm": 1.7643837928771973, - "learning_rate": 4.720387601076367e-05, - "loss": 2.1733, - "step": 58500 + "epoch": 0.0096, + "grad_norm": 0.9139093160629272, + "learning_rate": 8.751625416284142e-05, + "loss": 0.7025, + "step": 18800 }, { - "epoch": 0.0016744902572575198, - "grad_norm": 1.6393450498580933, - "learning_rate": 4.719369278062158e-05, - "loss": 2.1745, - "step": 58600 + "epoch": 0.009798, + "loss_gen": 2.641028642654419, + "loss_rtd": 0.3995615839958191, + "loss_sent": 0.00015060137957334518, + "loss_sod": 0.3031446635723114, + "loss_total": 0.7028568387031555, + "step": 18899 }, { - "epoch": 0.00195357196680044, - "grad_norm": 1.721708059310913, - "learning_rate": 4.718349214349049e-05, - "loss": 2.1741, - "step": 58700 + "epoch": 0.009798, + "loss_gen": 3.585927724838257, + "loss_rtd": 0.41310915350914, + "loss_sent": 0.03582323342561722, + "loss_sod": 0.17621608078479767, + "loss_total": 0.6251484751701355, + "step": 18899 }, { - "epoch": 0.0022326536763433596, - "grad_norm": 1.6743454933166504, - "learning_rate": 4.717327410737098e-05, - "loss": 2.1785, - "step": 58800 + "epoch": 0.0098, + "grad_norm": 0.656723141670227, + "learning_rate": 8.74952687334802e-05, + "loss": 0.7155, + "step": 18900 }, { - "epoch": 0.0025117353858862797, - "grad_norm": 1.5261398553848267, - "learning_rate": 4.716303868027727e-05, - "loss": 2.1632, - "step": 58900 + "epoch": 0.009998, + "loss_gen": 4.061804294586182, + "loss_rtd": 0.42302989959716797, + "loss_sent": 0.1835642009973526, + "loss_sod": 0.05181720480322838, + "loss_total": 0.6584113240242004, + "step": 18999 }, { - "epoch": 0.0027908170954292, - "grad_norm": 1.6939880847930908, - "learning_rate": 4.7152785870237224e-05, - "loss": 2.1673, - "step": 59000 + "epoch": 0.009998, + "loss_gen": 3.725508451461792, + "loss_rtd": 0.4165676534175873, + "loss_sent": 0.12800826132297516, + "loss_sod": 0.18614144623279572, + "loss_total": 0.7307173609733582, + "step": 18999 }, { - "epoch": 0.0027908170954292, - "eval_loss": 2.3614025115966797, - "eval_runtime": 51.7206, - "eval_samples_per_second": 197.097, - "eval_steps_per_second": 1.547, - "step": 59000 + "epoch": 0.01, + "grad_norm": 1.5900828838348389, + "learning_rate": 8.74742682010292e-05, + "loss": 0.7148, + "step": 19000 }, { - "epoch": 0.00306989880497212, - "grad_norm": 1.6760599613189697, - "learning_rate": 4.714251568529236e-05, - "loss": 2.1698, - "step": 59100 + "epoch": 0.01, + "eval_loss": 0.7048465609550476, + "eval_runtime": 151.8862, + "eval_samples_per_second": 101.675, + "eval_steps_per_second": 0.797, + "step": 19000 }, { - "epoch": 0.0033489805145150396, - "grad_norm": 1.7031495571136475, - "learning_rate": 4.713222813349778e-05, - "loss": 2.1573, - "step": 59200 + "epoch": 0.010198, + "loss_gen": 3.7057571411132812, + "loss_rtd": 0.4221038520336151, + "loss_sent": 0.16967017948627472, + "loss_sod": 0.1264842003583908, + "loss_total": 0.7182582020759583, + "step": 19099 }, { - "epoch": 0.0036280622240579597, - "grad_norm": 1.59571373462677, - "learning_rate": 4.712192322292225e-05, - "loss": 2.1598, - "step": 59300 + "epoch": 0.010198, + "loss_gen": 4.596755027770996, + "loss_rtd": 0.4268624186515808, + "loss_sent": 0.12956024706363678, + "loss_sod": 0.1693856567144394, + "loss_total": 0.725808322429657, + "step": 19099 }, { - "epoch": 0.00390714393360088, - "grad_norm": 1.496177077293396, - "learning_rate": 4.711160096164812e-05, - "loss": 2.17, - "step": 59400 + "epoch": 0.0102, + "grad_norm": 1.438319444656372, + "learning_rate": 8.745325257394747e-05, + "loss": 0.723, + "step": 19100 }, { - "epoch": 0.0041862256431437995, - "grad_norm": 1.710594892501831, - "learning_rate": 4.710126135777136e-05, - "loss": 2.1584, - "step": 59500 + "epoch": 0.010398, + "loss_gen": 3.9707088470458984, + "loss_rtd": 0.4285086989402771, + "loss_sent": 0.14790266752243042, + "loss_sod": 0.027327165007591248, + "loss_total": 0.60373854637146, + "step": 19199 }, { - "epoch": 0.004465307352686719, - "grad_norm": 1.6488218307495117, - "learning_rate": 4.709090441940155e-05, - "loss": 2.1608, - "step": 59600 + "epoch": 0.010398, + "loss_gen": 3.938842296600342, + "loss_rtd": 0.39713433384895325, + "loss_sent": 0.18230196833610535, + "loss_sod": 0.0404755175113678, + "loss_total": 0.619911789894104, + "step": 19199 }, { - "epoch": 0.00474438906222964, - "grad_norm": 1.4921833276748657, - "learning_rate": 4.708053015466185e-05, - "loss": 2.157, - "step": 59700 + "epoch": 0.0104, + "grad_norm": 0.9801616072654724, + "learning_rate": 8.743222186070006e-05, + "loss": 0.7152, + "step": 19200 }, { - "epoch": 0.005023470771772559, - "grad_norm": 1.6850916147232056, - "learning_rate": 4.707013857168904e-05, - "loss": 2.1594, - "step": 59800 + "epoch": 0.010598, + "loss_gen": 3.9666779041290283, + "loss_rtd": 0.40703949332237244, + "loss_sent": 0.35698920488357544, + "loss_sod": 0.11823119223117828, + "loss_total": 0.8822599053382874, + "step": 19299 }, { - "epoch": 0.00530255248131548, - "grad_norm": 1.6800413131713867, - "learning_rate": 4.705972967863344e-05, - "loss": 2.1591, - "step": 59900 + "epoch": 0.010598, + "loss_gen": 2.5196690559387207, + "loss_rtd": 0.3921911418437958, + "loss_sent": 0.030794350430369377, + "loss_sod": 0.2004784196615219, + "loss_total": 0.6234638690948486, + "step": 19299 }, { - "epoch": 0.0055816341908584, - "grad_norm": 1.7270710468292236, - "learning_rate": 4.704930348365897e-05, - "loss": 2.1586, - "step": 60000 + "epoch": 0.0106, + "grad_norm": 1.678354263305664, + "learning_rate": 8.741117606975817e-05, + "loss": 0.7391, + "step": 19300 }, { - "epoch": 0.0055816341908584, - "eval_loss": 2.3516249656677246, - "eval_runtime": 51.3986, - "eval_samples_per_second": 198.332, - "eval_steps_per_second": 1.556, - "step": 60000 + "epoch": 0.010798, + "loss_gen": 3.5528931617736816, + "loss_rtd": 0.4105856716632843, + "loss_sent": 0.5260259509086609, + "loss_sod": 0.04325145110487938, + "loss_total": 0.9798630475997925, + "step": 19399 }, { - "epoch": 0.005860715900401319, - "grad_norm": 1.5420916080474854, - "learning_rate": 4.703885999494312e-05, - "loss": 2.155, - "step": 60100 + "epoch": 0.010798, + "loss_gen": 4.052915096282959, + "loss_rtd": 0.4151667654514313, + "loss_sent": 0.2871251106262207, + "loss_sod": 0.030132891610264778, + "loss_total": 0.7324247360229492, + "step": 19399 }, { - "epoch": 0.00613979760994424, - "grad_norm": 1.7403862476348877, - "learning_rate": 4.702839922067695e-05, - "loss": 2.1472, - "step": 60200 + "epoch": 0.0108, + "grad_norm": 1.2643232345581055, + "learning_rate": 8.739011520959904e-05, + "loss": 0.713, + "step": 19400 }, { - "epoch": 0.0064188793194871595, - "grad_norm": 1.6129839420318604, - "learning_rate": 4.701792116906506e-05, - "loss": 2.1531, - "step": 60300 + "epoch": 0.010998, + "loss_gen": 4.023740768432617, + "loss_rtd": 0.40644070506095886, + "loss_sent": 0.2646462321281433, + "loss_sod": 0.09764538705348969, + "loss_total": 0.7687323093414307, + "step": 19499 }, { - "epoch": 0.006697961029030079, - "grad_norm": 1.7149808406829834, - "learning_rate": 4.7007425848325616e-05, - "loss": 2.1573, - "step": 60400 + "epoch": 0.010998, + "loss_gen": 4.029313564300537, + "loss_rtd": 0.4271722137928009, + "loss_sent": 0.2032327502965927, + "loss_sod": 0.027567539364099503, + "loss_total": 0.657972514629364, + "step": 19499 }, { - "epoch": 0.006977042738573, - "grad_norm": 1.6517390012741089, - "learning_rate": 4.699691326669032e-05, - "loss": 2.1478, - "step": 60500 + "epoch": 0.011, + "grad_norm": 1.3250114917755127, + "learning_rate": 8.736903928870597e-05, + "loss": 0.7128, + "step": 19500 }, { - "epoch": 0.0072561244481159195, - "grad_norm": 1.6524746417999268, - "learning_rate": 4.698638343240441e-05, - "loss": 2.1538, - "step": 60600 + "epoch": 0.011198, + "loss_gen": 4.008549690246582, + "loss_rtd": 0.4227379560470581, + "loss_sent": 0.20236265659332275, + "loss_sod": 0.052655987441539764, + "loss_total": 0.6777566075325012, + "step": 19599 }, { - "epoch": 0.007535206157658839, - "grad_norm": 1.6119221448898315, - "learning_rate": 4.6975836353726675e-05, - "loss": 2.1432, - "step": 60700 + "epoch": 0.011198, + "loss_gen": 3.985772132873535, + "loss_rtd": 0.43641579151153564, + "loss_sent": 0.27415043115615845, + "loss_sod": 0.19595825672149658, + "loss_total": 0.9065244793891907, + "step": 19599 }, { - "epoch": 0.00781428786720176, - "grad_norm": 1.681077480316162, - "learning_rate": 4.696527203892941e-05, - "loss": 2.1473, - "step": 60800 + "epoch": 0.0112, + "grad_norm": 1.8043322563171387, + "learning_rate": 8.734794831556834e-05, + "loss": 0.7076, + "step": 19600 }, { - "epoch": 0.00809336957674468, - "grad_norm": 1.6262125968933105, - "learning_rate": 4.695469049629843e-05, - "loss": 2.1586, - "step": 60900 + "epoch": 0.011398, + "loss_gen": 4.039675235748291, + "loss_rtd": 0.41741177439689636, + "loss_sent": 0.4203322231769562, + "loss_sod": 0.08601180464029312, + "loss_total": 0.9237557649612427, + "step": 19699 }, { - "epoch": 0.008372451286287599, - "grad_norm": 1.7309602499008179, - "learning_rate": 4.694409173413306e-05, - "loss": 2.1517, - "step": 61000 + "epoch": 0.011398, + "loss_gen": 4.08049201965332, + "loss_rtd": 0.4036373198032379, + "loss_sent": 0.43353450298309326, + "loss_sod": 0.0564105249941349, + "loss_total": 0.8935823440551758, + "step": 19699 }, { - "epoch": 0.008372451286287599, - "eval_loss": 2.3563082218170166, - "eval_runtime": 51.3454, - "eval_samples_per_second": 198.538, - "eval_steps_per_second": 1.558, - "step": 61000 + "epoch": 0.0114, + "grad_norm": 1.7929366827011108, + "learning_rate": 8.73268422986816e-05, + "loss": 0.7175, + "step": 19700 }, { - "epoch": 0.008651532995830519, - "grad_norm": 1.609298586845398, - "learning_rate": 4.693347576074615e-05, - "loss": 2.1506, - "step": 61100 + "epoch": 0.011598, + "loss_gen": 2.2888705730438232, + "loss_rtd": 0.4113491177558899, + "loss_sent": 0.01757943071424961, + "loss_sod": 0.2767329812049866, + "loss_total": 0.7056615352630615, + "step": 19799 }, { - "epoch": 0.008930614705373438, - "grad_norm": 1.6471832990646362, - "learning_rate": 4.6922842584464015e-05, - "loss": 2.1557, - "step": 61200 + "epoch": 0.011598, + "loss_gen": 4.463761329650879, + "loss_rtd": 0.42178013920783997, + "loss_sent": 0.12542349100112915, + "loss_sod": 0.08447619527578354, + "loss_total": 0.6316798329353333, + "step": 19799 }, { - "epoch": 0.00920969641491636, - "grad_norm": 1.6230864524841309, - "learning_rate": 4.6912192213626506e-05, - "loss": 2.1479, - "step": 61300 + "epoch": 0.0116, + "grad_norm": 0.8132031559944153, + "learning_rate": 8.730572124654725e-05, + "loss": 0.7143, + "step": 19800 }, { - "epoch": 0.00948877812445928, - "grad_norm": 1.7009743452072144, - "learning_rate": 4.6901524656586924e-05, - "loss": 2.1392, - "step": 61400 + "epoch": 0.011798, + "loss_gen": 3.2291722297668457, + "loss_rtd": 0.40131255984306335, + "loss_sent": 0.05474870651960373, + "loss_sod": 0.13893182575702667, + "loss_total": 0.5949931144714355, + "step": 19899 }, { - "epoch": 0.0097678598340022, - "grad_norm": 1.7315740585327148, - "learning_rate": 4.689083992171207e-05, - "loss": 2.1408, - "step": 61500 + "epoch": 0.011798, + "loss_gen": 2.4490771293640137, + "loss_rtd": 0.3775515854358673, + "loss_sent": 0.06087493896484375, + "loss_sod": 0.21542909741401672, + "loss_total": 0.6538556218147278, + "step": 19899 }, { - "epoch": 0.010046941543545119, - "grad_norm": 1.758393406867981, - "learning_rate": 4.68801380173822e-05, - "loss": 2.1383, - "step": 61600 + "epoch": 0.0118, + "grad_norm": 0.6356754899024963, + "learning_rate": 8.728458516767284e-05, + "loss": 0.7232, + "step": 19900 }, { - "epoch": 0.010326023253088039, - "grad_norm": 1.673213005065918, - "learning_rate": 4.686941895199106e-05, - "loss": 2.1439, - "step": 61700 + "epoch": 0.011998, + "loss_gen": 4.252781391143799, + "loss_rtd": 0.39302095770835876, + "loss_sent": 0.03697744384407997, + "loss_sod": 0.034853093326091766, + "loss_total": 0.4648514986038208, + "step": 19999 }, { - "epoch": 0.01060510496263096, - "grad_norm": 1.715144395828247, - "learning_rate": 4.685868273394583e-05, - "loss": 2.1441, - "step": 61800 + "epoch": 0.011998, + "loss_gen": 3.7617712020874023, + "loss_rtd": 0.40322282910346985, + "loss_sent": 0.1799575388431549, + "loss_sod": 0.02489660680294037, + "loss_total": 0.6080769300460815, + "step": 19999 }, { - "epoch": 0.01088418667217388, - "grad_norm": 1.5925661325454712, - "learning_rate": 4.684792937166716e-05, - "loss": 2.1315, - "step": 61900 + "epoch": 0.012, + "grad_norm": 0.8684195876121521, + "learning_rate": 8.726343407057197e-05, + "loss": 0.7041, + "step": 20000 }, { - "epoch": 0.0111632683817168, - "grad_norm": 1.5832064151763916, - "learning_rate": 4.683715887358916e-05, - "loss": 2.1467, - "step": 62000 + "epoch": 0.012, + "eval_loss": 0.7028465270996094, + "eval_runtime": 151.5434, + "eval_samples_per_second": 101.905, + "eval_steps_per_second": 0.798, + "step": 20000 }, { - "epoch": 0.0111632683817168, - "eval_loss": 2.3658885955810547, - "eval_runtime": 51.3451, - "eval_samples_per_second": 198.539, - "eval_steps_per_second": 1.558, - "step": 62000 + "epoch": 0.012198, + "loss_gen": 3.716078519821167, + "loss_rtd": 0.4142749011516571, + "loss_sent": 0.1574735790491104, + "loss_sod": 0.011068914085626602, + "loss_total": 0.5828173756599426, + "step": 20099 }, { - "epoch": 0.011442350091259719, - "grad_norm": 1.555420994758606, - "learning_rate": 4.682637124815934e-05, - "loss": 2.1498, - "step": 62100 + "epoch": 0.012198, + "loss_gen": 3.796159505844116, + "loss_rtd": 0.409680038690567, + "loss_sent": 0.032792288810014725, + "loss_sod": 0.1799483299255371, + "loss_total": 0.622420608997345, + "step": 20099 }, { - "epoch": 0.011721431800802639, - "grad_norm": 1.670647144317627, - "learning_rate": 4.681556650383867e-05, - "loss": 2.1335, - "step": 62200 + "epoch": 0.0122, + "grad_norm": 0.7984619140625, + "learning_rate": 8.724226796376433e-05, + "loss": 0.711, + "step": 20100 }, { - "epoch": 0.012000513510345558, - "grad_norm": 1.7037968635559082, - "learning_rate": 4.680474464910155e-05, - "loss": 2.1389, - "step": 62300 + "epoch": 0.012398, + "loss_gen": 3.9472317695617676, + "loss_rtd": 0.40650129318237305, + "loss_sent": 0.08266563713550568, + "loss_sod": 0.045177169144153595, + "loss_total": 0.5343440771102905, + "step": 20199 }, { - "epoch": 0.01227959521988848, - "grad_norm": 1.689375877380371, - "learning_rate": 4.679390569243578e-05, - "loss": 2.137, - "step": 62400 + "epoch": 0.012398, + "loss_gen": 2.4588019847869873, + "loss_rtd": 0.38946664333343506, + "loss_sent": 0.0076849148608744144, + "loss_sod": 0.44422250986099243, + "loss_total": 0.8413740396499634, + "step": 20199 }, { - "epoch": 0.0125586769294314, - "grad_norm": 1.6479361057281494, - "learning_rate": 4.67830496423426e-05, - "loss": 2.14, - "step": 62500 + "epoch": 0.0124, + "grad_norm": 1.0634772777557373, + "learning_rate": 8.72210868557756e-05, + "loss": 0.6964, + "step": 20200 }, { - "epoch": 0.012837758638974319, - "grad_norm": 1.5668725967407227, - "learning_rate": 4.677217650733664e-05, - "loss": 2.1356, - "step": 62600 + "epoch": 0.012598, + "loss_gen": 2.4628233909606934, + "loss_rtd": 0.3977556526660919, + "loss_sent": 9.53001290326938e-05, + "loss_sod": 0.35795658826828003, + "loss_total": 0.7558075785636902, + "step": 20299 }, { - "epoch": 0.013116840348517239, - "grad_norm": 1.6967672109603882, - "learning_rate": 4.676128629594593e-05, - "loss": 2.1288, - "step": 62700 + "epoch": 0.012598, + "loss_gen": 2.3897507190704346, + "loss_rtd": 0.4025641083717346, + "loss_sent": 0.001091538928449154, + "loss_sod": 0.42349517345428467, + "loss_total": 0.827150821685791, + "step": 20299 }, { - "epoch": 0.013395922058060158, - "grad_norm": 1.641052484512329, - "learning_rate": 4.675037901671189e-05, - "loss": 2.1315, - "step": 62800 + "epoch": 0.0126, + "grad_norm": 1.2861807346343994, + "learning_rate": 8.719989075513753e-05, + "loss": 0.711, + "step": 20300 }, { - "epoch": 0.013675003767603078, - "grad_norm": 1.680564284324646, - "learning_rate": 4.673945467818934e-05, - "loss": 2.133, - "step": 62900 + "epoch": 0.012798, + "loss_gen": 4.174057960510254, + "loss_rtd": 0.4290989935398102, + "loss_sent": 0.09670135378837585, + "loss_sod": 0.023639153689146042, + "loss_total": 0.5494394898414612, + "step": 20399 }, { - "epoch": 0.013954085477146, - "grad_norm": 1.6770386695861816, - "learning_rate": 4.672851328894647e-05, - "loss": 2.1307, - "step": 63000 + "epoch": 0.012798, + "loss_gen": 3.941239356994629, + "loss_rtd": 0.3997499644756317, + "loss_sent": 0.12611424922943115, + "loss_sod": 0.07376294583082199, + "loss_total": 0.5996271371841431, + "step": 20399 }, { - "epoch": 0.013954085477146, - "eval_loss": 2.3682708740234375, - "eval_runtime": 51.4984, - "eval_samples_per_second": 197.948, - "eval_steps_per_second": 1.553, - "step": 63000 + "epoch": 0.0128, + "grad_norm": 1.0112621784210205, + "learning_rate": 8.717867967038792e-05, + "loss": 0.718, + "step": 20400 }, { - "epoch": 0.01423316718668892, - "grad_norm": 1.4313061237335205, - "learning_rate": 4.671755485756486e-05, - "loss": 2.1402, - "step": 63100 + "epoch": 0.012998, + "loss_gen": 4.399020195007324, + "loss_rtd": 0.4018489420413971, + "loss_sent": 0.21433569490909576, + "loss_sod": 0.06530171632766724, + "loss_total": 0.6814863681793213, + "step": 20499 }, { - "epoch": 0.014512248896231839, - "grad_norm": 1.6399835348129272, - "learning_rate": 4.6706579392639426e-05, - "loss": 2.1295, - "step": 63200 + "epoch": 0.012998, + "loss_gen": 4.367480754852295, + "loss_rtd": 0.41640469431877136, + "loss_sent": 0.32281652092933655, + "loss_sod": 0.06447701901197433, + "loss_total": 0.8036982417106628, + "step": 20499 }, { - "epoch": 0.014791330605774759, - "grad_norm": 1.748159646987915, - "learning_rate": 4.6695586902778465e-05, - "loss": 2.1326, - "step": 63300 + "epoch": 0.013, + "grad_norm": 1.4023051261901855, + "learning_rate": 8.715745361007059e-05, + "loss": 0.7078, + "step": 20500 }, { - "epoch": 0.015070412315317678, - "grad_norm": 1.7075271606445312, - "learning_rate": 4.6684577396603615e-05, - "loss": 2.1275, - "step": 63400 + "epoch": 0.013198, + "loss_gen": 4.183026313781738, + "loss_rtd": 0.40223562717437744, + "loss_sent": 0.1258079558610916, + "loss_sod": 0.11566033959388733, + "loss_total": 0.6437038779258728, + "step": 20599 }, { - "epoch": 0.015349494024860598, - "grad_norm": 1.4170197248458862, - "learning_rate": 4.6673550882749884e-05, - "loss": 2.1281, - "step": 63500 + "epoch": 0.013198, + "loss_gen": 3.8825387954711914, + "loss_rtd": 0.4105805456638336, + "loss_sent": 0.21834413707256317, + "loss_sod": 0.037197474390268326, + "loss_total": 0.6661221385002136, + "step": 20599 }, { - "epoch": 0.01562857573440352, - "grad_norm": 1.6267995834350586, - "learning_rate": 4.666250736986559e-05, - "loss": 2.1267, - "step": 63600 + "epoch": 0.0132, + "grad_norm": 1.2524880170822144, + "learning_rate": 8.713621258273538e-05, + "loss": 0.7103, + "step": 20600 }, { - "epoch": 0.015907657443946437, - "grad_norm": 1.6507539749145508, - "learning_rate": 4.665144686661241e-05, - "loss": 2.123, - "step": 63700 + "epoch": 0.013398, + "loss_gen": 4.011019229888916, + "loss_rtd": 0.39504221081733704, + "loss_sent": 0.108099065721035, + "loss_sod": 0.01876910775899887, + "loss_total": 0.5219103693962097, + "step": 20699 }, { - "epoch": 0.01618673915348936, - "grad_norm": 1.6222537755966187, - "learning_rate": 4.664036938166532e-05, - "loss": 2.1211, - "step": 63800 + "epoch": 0.013398, + "loss_gen": 4.195362091064453, + "loss_rtd": 0.39853763580322266, + "loss_sent": 0.3297787606716156, + "loss_sod": 0.0631496012210846, + "loss_total": 0.7914659976959229, + "step": 20699 }, { - "epoch": 0.01646582086303228, - "grad_norm": 1.5079693794250488, - "learning_rate": 4.662927492371263e-05, - "loss": 2.1155, - "step": 63900 + "epoch": 0.0134, + "grad_norm": 2.073709726333618, + "learning_rate": 8.711495659693818e-05, + "loss": 0.7065, + "step": 20700 }, { - "epoch": 0.016744902572575198, - "grad_norm": 1.565535306930542, - "learning_rate": 4.661816350145597e-05, - "loss": 2.1179, - "step": 64000 + "epoch": 0.013598, + "loss_gen": 4.199145317077637, + "loss_rtd": 0.4101455807685852, + "loss_sent": 0.1480754017829895, + "loss_sod": 0.08632524311542511, + "loss_total": 0.6445462107658386, + "step": 20799 }, { - "epoch": 0.016744902572575198, - "eval_loss": 2.358680486679077, - "eval_runtime": 51.4395, - "eval_samples_per_second": 198.174, - "eval_steps_per_second": 1.555, - "step": 64000 + "epoch": 0.013598, + "loss_gen": 4.125156879425049, + "loss_rtd": 0.3986564576625824, + "loss_sent": 0.11776266992092133, + "loss_sod": 0.03158608078956604, + "loss_total": 0.548005223274231, + "step": 20799 }, { - "epoch": 0.01702398428211812, - "grad_norm": 1.7206209897994995, - "learning_rate": 4.660703512361027e-05, - "loss": 2.1434, - "step": 64100 + "epoch": 0.0136, + "grad_norm": 1.2252863645553589, + "learning_rate": 8.709368566124091e-05, + "loss": 0.7105, + "step": 20800 }, { - "epoch": 0.017303065991661037, - "grad_norm": 1.9430965185165405, - "learning_rate": 4.659588979890373e-05, - "loss": 2.3221, - "step": 64200 + "epoch": 0.013798, + "loss_gen": 3.4146010875701904, + "loss_rtd": 0.3956667482852936, + "loss_sent": 0.16512219607830048, + "loss_sod": 0.10856941342353821, + "loss_total": 0.6693583726882935, + "step": 20899 }, { - "epoch": 0.01758214770120396, - "grad_norm": 1.7088842391967773, - "learning_rate": 4.658472753607789e-05, - "loss": 2.3117, - "step": 64300 + "epoch": 0.013798, + "loss_gen": 3.9524085521698, + "loss_rtd": 0.39170873165130615, + "loss_sent": 0.3147831857204437, + "loss_sod": 0.01406177319586277, + "loss_total": 0.7205536961555481, + "step": 20899 }, { - "epoch": 0.017861229410746877, - "grad_norm": 1.6779052019119263, - "learning_rate": 4.657354834388755e-05, - "loss": 2.3036, - "step": 64400 + "epoch": 0.0138, + "grad_norm": 1.7617398500442505, + "learning_rate": 8.70723997842115e-05, + "loss": 0.7165, + "step": 20900 }, { - "epoch": 0.018140311120289798, - "grad_norm": 1.80711829662323, - "learning_rate": 4.656235223110079e-05, - "loss": 2.2953, - "step": 64500 + "epoch": 0.013998, + "loss_gen": 2.8314666748046875, + "loss_rtd": 0.3838663399219513, + "loss_sent": 0.09685777127742767, + "loss_sod": 0.12291301041841507, + "loss_total": 0.603637158870697, + "step": 20999 }, { - "epoch": 0.01841939282983272, - "grad_norm": 1.6596226692199707, - "learning_rate": 4.655113920649896e-05, - "loss": 2.2955, - "step": 64600 + "epoch": 0.013998, + "loss_gen": 4.050356864929199, + "loss_rtd": 0.41659289598464966, + "loss_sent": 0.16024036705493927, + "loss_sod": 0.0964965671300888, + "loss_total": 0.6733298301696777, + "step": 20999 }, { - "epoch": 0.018698474539375638, - "grad_norm": 1.8285378217697144, - "learning_rate": 4.6539909278876656e-05, - "loss": 2.294, - "step": 64700 + "epoch": 0.014, + "grad_norm": 0.82136070728302, + "learning_rate": 8.705109897442388e-05, + "loss": 0.6817, + "step": 21000 }, { - "epoch": 0.01897755624891856, - "grad_norm": 1.6968961954116821, - "learning_rate": 4.6528662457041784e-05, - "loss": 2.2886, - "step": 64800 + "epoch": 0.014, + "eval_loss": 0.6961836218833923, + "eval_runtime": 151.3756, + "eval_samples_per_second": 102.018, + "eval_steps_per_second": 0.799, + "step": 21000 }, { - "epoch": 0.019256637958461477, - "grad_norm": 1.7172420024871826, - "learning_rate": 4.651739874981545e-05, - "loss": 2.2961, - "step": 64900 + "epoch": 0.014198, + "loss_gen": 4.205439567565918, + "loss_rtd": 0.390224426984787, + "loss_sent": 0.43894219398498535, + "loss_sod": 0.0434480756521225, + "loss_total": 0.8726146817207336, + "step": 21099 }, { - "epoch": 0.0195357196680044, - "grad_norm": 1.7524107694625854, - "learning_rate": 4.650611816603202e-05, - "loss": 2.2796, - "step": 65000 + "epoch": 0.014198, + "loss_gen": 4.209077835083008, + "loss_rtd": 0.4092998206615448, + "loss_sent": 0.49253588914871216, + "loss_sod": 0.06910588592290878, + "loss_total": 0.9709416031837463, + "step": 21099 }, { - "epoch": 0.0195357196680044, - "eval_loss": 2.3449864387512207, - "eval_runtime": 51.7575, - "eval_samples_per_second": 196.957, - "eval_steps_per_second": 1.546, - "step": 65000 + "epoch": 0.0142, + "grad_norm": 2.717113971710205, + "learning_rate": 8.7029783240458e-05, + "loss": 0.7054, + "step": 21100 }, { - "epoch": 0.01981480137754732, - "grad_norm": 1.6822710037231445, - "learning_rate": 4.64948207145391e-05, - "loss": 2.2906, - "step": 65100 + "epoch": 0.014398, + "loss_gen": 4.044052600860596, + "loss_rtd": 0.4245973527431488, + "loss_sent": 0.14878441393375397, + "loss_sod": 0.052621856331825256, + "loss_total": 0.626003623008728, + "step": 21199 }, { - "epoch": 0.020093883087090238, - "grad_norm": 1.7092459201812744, - "learning_rate": 4.648350640419753e-05, - "loss": 2.2821, - "step": 65200 + "epoch": 0.014398, + "loss_gen": 3.825176954269409, + "loss_rtd": 0.4175969064235687, + "loss_sent": 0.2602907419204712, + "loss_sod": 0.1315835416316986, + "loss_total": 0.8094711899757385, + "step": 21199 }, { - "epoch": 0.02037296479663316, - "grad_norm": 1.6398138999938965, - "learning_rate": 4.6472175243881355e-05, - "loss": 2.2842, - "step": 65300 + "epoch": 0.0144, + "grad_norm": 0.7577776312828064, + "learning_rate": 8.700845259089989e-05, + "loss": 0.7134, + "step": 21200 }, { - "epoch": 0.020652046506176077, - "grad_norm": 1.7621639966964722, - "learning_rate": 4.646082724247785e-05, - "loss": 2.2677, - "step": 65400 + "epoch": 0.014598, + "loss_gen": 4.306161880493164, + "loss_rtd": 0.3955383002758026, + "loss_sent": 0.047602277249097824, + "loss_sod": 0.011172058060765266, + "loss_total": 0.45431262254714966, + "step": 21299 }, { - "epoch": 0.020931128215719, - "grad_norm": 1.7094637155532837, - "learning_rate": 4.64494624088875e-05, - "loss": 2.2709, - "step": 65500 + "epoch": 0.014598, + "loss_gen": 3.892268657684326, + "loss_rtd": 0.4082190692424774, + "loss_sent": 0.1741885244846344, + "loss_sod": 0.15817657113075256, + "loss_total": 0.740584135055542, + "step": 21299 }, { - "epoch": 0.02121020992526192, - "grad_norm": 1.6392730474472046, - "learning_rate": 4.643808075202399e-05, - "loss": 2.2665, - "step": 65600 + "epoch": 0.0146, + "grad_norm": 0.9689379930496216, + "learning_rate": 8.69871070343415e-05, + "loss": 0.6951, + "step": 21300 }, { - "epoch": 0.021489291634804838, - "grad_norm": 1.7624675035476685, - "learning_rate": 4.642668228081418e-05, - "loss": 2.2726, - "step": 65700 + "epoch": 0.014798, + "loss_gen": 4.0580854415893555, + "loss_rtd": 0.3923904299736023, + "loss_sent": 0.26663270592689514, + "loss_sod": 0.011745485477149487, + "loss_total": 0.6707686185836792, + "step": 21399 }, { - "epoch": 0.02176837334434776, - "grad_norm": 1.5905216932296753, - "learning_rate": 4.641526700419816e-05, - "loss": 2.2701, - "step": 65800 + "epoch": 0.014798, + "loss_gen": 4.141996383666992, + "loss_rtd": 0.41980284452438354, + "loss_sent": 0.2592174708843231, + "loss_sod": 0.053839970380067825, + "loss_total": 0.7328603267669678, + "step": 21399 }, { - "epoch": 0.022047455053890677, - "grad_norm": 1.6838109493255615, - "learning_rate": 4.640383493112917e-05, - "loss": 2.2725, - "step": 65900 + "epoch": 0.0148, + "grad_norm": 1.1764981746673584, + "learning_rate": 8.696574657938081e-05, + "loss": 0.7015, + "step": 21400 }, { - "epoch": 0.0223265367634336, - "grad_norm": 1.7110956907272339, - "learning_rate": 4.639238607057361e-05, - "loss": 2.2647, - "step": 66000 + "epoch": 0.014998, + "loss_gen": 4.1991496086120605, + "loss_rtd": 0.40171363949775696, + "loss_sent": 0.17878541350364685, + "loss_sod": 0.10786039382219315, + "loss_total": 0.6883594989776611, + "step": 21499 }, { - "epoch": 0.0223265367634336, - "eval_loss": 2.333463191986084, - "eval_runtime": 51.6799, - "eval_samples_per_second": 197.253, - "eval_steps_per_second": 1.548, - "step": 66000 + "epoch": 0.014998, + "loss_gen": 3.7081212997436523, + "loss_rtd": 0.41223984956741333, + "loss_sent": 0.08660165965557098, + "loss_sod": 0.032381195574998856, + "loss_total": 0.5312227010726929, + "step": 21499 }, { - "epoch": 0.022605618472976517, - "grad_norm": 1.653095006942749, - "learning_rate": 4.6380920431511085e-05, - "loss": 2.2693, - "step": 66100 + "epoch": 0.015, + "grad_norm": 0.9278017282485962, + "learning_rate": 8.694437123462182e-05, + "loss": 0.715, + "step": 21500 }, { - "epoch": 0.022884700182519438, - "grad_norm": 1.8128857612609863, - "learning_rate": 4.636943802293434e-05, - "loss": 2.2558, - "step": 66200 + "epoch": 0.015198, + "loss_gen": 3.3395333290100098, + "loss_rtd": 0.399366557598114, + "loss_sent": 0.10116645693778992, + "loss_sod": 0.21150001883506775, + "loss_total": 0.7120330333709717, + "step": 21599 }, { - "epoch": 0.02316378189206236, - "grad_norm": 1.6910260915756226, - "learning_rate": 4.635793885384927e-05, - "loss": 2.2705, - "step": 66300 + "epoch": 0.015198, + "loss_gen": 2.31325364112854, + "loss_rtd": 0.37933480739593506, + "loss_sent": 0.0013232976198196411, + "loss_sod": 0.27269241213798523, + "loss_total": 0.6533505320549011, + "step": 21599 }, { - "epoch": 0.023442863601605277, - "grad_norm": 1.6153894662857056, - "learning_rate": 4.6346422933274915e-05, - "loss": 2.263, - "step": 66400 + "epoch": 0.0152, + "grad_norm": 0.8672342300415039, + "learning_rate": 8.692298100867453e-05, + "loss": 0.7099, + "step": 21600 }, { - "epoch": 0.0237219453111482, - "grad_norm": 1.5947723388671875, - "learning_rate": 4.633489027024347e-05, - "loss": 2.2638, - "step": 66500 + "epoch": 0.015398, + "loss_gen": 3.8823935985565186, + "loss_rtd": 0.4169360399246216, + "loss_sent": 0.088588185608387, + "loss_sod": 0.053140271455049515, + "loss_total": 0.5586645007133484, + "step": 21699 }, { - "epoch": 0.024001027020691117, - "grad_norm": 1.6512424945831299, - "learning_rate": 4.6323340873800246e-05, - "loss": 2.2603, - "step": 66600 + "epoch": 0.015398, + "loss_gen": 4.013762950897217, + "loss_rtd": 0.43377217650413513, + "loss_sent": 0.27658548951148987, + "loss_sod": 0.09921903163194656, + "loss_total": 0.8095767498016357, + "step": 21699 }, { - "epoch": 0.024280108730234038, - "grad_norm": 1.7382216453552246, - "learning_rate": 4.6311774753003686e-05, - "loss": 2.2527, - "step": 66700 + "epoch": 0.0154, + "grad_norm": 0.9942051768302917, + "learning_rate": 8.690157591015489e-05, + "loss": 0.7113, + "step": 21700 }, { - "epoch": 0.02455919043977696, - "grad_norm": 1.7034409046173096, - "learning_rate": 4.630019191692533e-05, - "loss": 2.2624, - "step": 66800 + "epoch": 0.015598, + "loss_gen": 3.874941110610962, + "loss_rtd": 0.4076584577560425, + "loss_sent": 0.19729302823543549, + "loss_sod": 0.0727071613073349, + "loss_total": 0.6776586771011353, + "step": 21799 }, { - "epoch": 0.024838272149319877, - "grad_norm": 1.5847420692443848, - "learning_rate": 4.628859237464986e-05, - "loss": 2.2461, - "step": 66900 + "epoch": 0.015598, + "loss_gen": 4.1540727615356445, + "loss_rtd": 0.42252597212791443, + "loss_sent": 0.11478852480649948, + "loss_sod": 0.11771350353956223, + "loss_total": 0.6550279855728149, + "step": 21799 }, { - "epoch": 0.0251173538588628, - "grad_norm": 1.6025781631469727, - "learning_rate": 4.627697613527505e-05, - "loss": 2.2576, - "step": 67000 + "epoch": 0.0156, + "grad_norm": 1.6577136516571045, + "learning_rate": 8.688015594768488e-05, + "loss": 0.6966, + "step": 21800 }, { - "epoch": 0.0251173538588628, - "eval_loss": 2.3275885581970215, - "eval_runtime": 51.6271, - "eval_samples_per_second": 197.455, - "eval_steps_per_second": 1.55, - "step": 67000 + "epoch": 0.015798, + "loss_gen": 3.9389901161193848, + "loss_rtd": 0.4259053170681, + "loss_sent": 0.17768579721450806, + "loss_sod": 0.04217896610498428, + "loss_total": 0.6457700729370117, + "step": 21899 }, { - "epoch": 0.025396435568405717, - "grad_norm": 1.8018252849578857, - "learning_rate": 4.626534320791175e-05, - "loss": 2.252, - "step": 67100 + "epoch": 0.015798, + "loss_gen": 4.207218647003174, + "loss_rtd": 0.39804330468177795, + "loss_sent": 0.21022315323352814, + "loss_sod": 0.09626258909702301, + "loss_total": 0.7045290470123291, + "step": 21899 }, { - "epoch": 0.025675517277948638, - "grad_norm": 1.5845881700515747, - "learning_rate": 4.625369360168392e-05, - "loss": 2.2588, - "step": 67200 + "epoch": 0.0158, + "grad_norm": 0.9212267994880676, + "learning_rate": 8.685872112989248e-05, + "loss": 0.7099, + "step": 21900 }, { - "epoch": 0.025954598987491556, - "grad_norm": 1.7306795120239258, - "learning_rate": 4.624202732572861e-05, - "loss": 2.2509, - "step": 67300 + "epoch": 0.015998, + "loss_gen": 4.139245510101318, + "loss_rtd": 0.4145708382129669, + "loss_sent": 0.17489010095596313, + "loss_sod": 0.03991694003343582, + "loss_total": 0.6293778419494629, + "step": 21999 }, { - "epoch": 0.026233680697034478, - "grad_norm": 1.6429874897003174, - "learning_rate": 4.62303443891959e-05, - "loss": 2.2454, - "step": 67400 + "epoch": 0.015998, + "loss_gen": 4.0036516189575195, + "loss_rtd": 0.4070430397987366, + "loss_sent": 0.19366881251335144, + "loss_sod": 0.04858838766813278, + "loss_total": 0.649300217628479, + "step": 21999 }, { - "epoch": 0.0265127624065774, - "grad_norm": 1.67123281955719, - "learning_rate": 4.6218644801249e-05, - "loss": 2.2581, - "step": 67500 + "epoch": 0.016, + "grad_norm": 0.8448497653007507, + "learning_rate": 8.68372714654116e-05, + "loss": 0.7183, + "step": 22000 }, { - "epoch": 0.026791844116120317, - "grad_norm": 1.7947686910629272, - "learning_rate": 4.620692857106412e-05, - "loss": 2.2474, - "step": 67600 + "epoch": 0.016, + "eval_loss": 0.6913487315177917, + "eval_runtime": 151.5751, + "eval_samples_per_second": 101.883, + "eval_steps_per_second": 0.798, + "step": 22000 }, { - "epoch": 0.02707092582566324, - "grad_norm": 1.6741690635681152, - "learning_rate": 4.619519570783057e-05, - "loss": 2.25, - "step": 67700 + "epoch": 0.016198, + "loss_gen": 3.783210277557373, + "loss_rtd": 0.41268104314804077, + "loss_sent": 0.13717064261436462, + "loss_sod": 0.015744447708129883, + "loss_total": 0.5655961036682129, + "step": 22099 }, { - "epoch": 0.027350007535206156, - "grad_norm": 1.6634832620620728, - "learning_rate": 4.618344622075068e-05, - "loss": 2.2439, - "step": 67800 + "epoch": 0.016198, + "loss_gen": 4.48651647567749, + "loss_rtd": 0.4079062342643738, + "loss_sent": 0.03336573764681816, + "loss_sod": 0.272824764251709, + "loss_total": 0.71409672498703, + "step": 22099 }, { - "epoch": 0.027629089244749078, - "grad_norm": 1.7358968257904053, - "learning_rate": 4.617168011903983e-05, - "loss": 2.2435, - "step": 67900 + "epoch": 0.0162, + "grad_norm": 0.8137263655662537, + "learning_rate": 8.681580696288219e-05, + "loss": 0.7106, + "step": 22100 }, { - "epoch": 0.027908170954292, - "grad_norm": 1.6776198148727417, - "learning_rate": 4.615989741192641e-05, - "loss": 2.2447, - "step": 68000 + "epoch": 0.016398, + "loss_gen": 3.734351634979248, + "loss_rtd": 0.39496752619743347, + "loss_sent": 0.3425740599632263, + "loss_sod": 0.032268062233924866, + "loss_total": 0.7698096632957458, + "step": 22199 }, { - "epoch": 0.027908170954292, - "eval_loss": 2.3279337882995605, - "eval_runtime": 51.7439, - "eval_samples_per_second": 197.009, - "eval_steps_per_second": 1.546, - "step": 68000 + "epoch": 0.016398, + "loss_gen": 2.9667320251464844, + "loss_rtd": 0.41361331939697266, + "loss_sent": 0.007793530356138945, + "loss_sod": 0.19353321194648743, + "loss_total": 0.6149400472640991, + "step": 22199 }, { - "epoch": 0.028187252663834917, - "grad_norm": 1.752332091331482, - "learning_rate": 4.614809810865186e-05, - "loss": 2.2424, - "step": 68100 + "epoch": 0.0164, + "grad_norm": 0.9939231276512146, + "learning_rate": 8.679432763095014e-05, + "loss": 0.7061, + "step": 22200 }, { - "epoch": 0.02846633437337784, - "grad_norm": 1.7249300479888916, - "learning_rate": 4.6136282218470625e-05, - "loss": 2.2455, - "step": 68200 + "epoch": 0.016598, + "loss_gen": 3.9864869117736816, + "loss_rtd": 0.41875556111335754, + "loss_sent": 0.29814907908439636, + "loss_sod": 0.05030648410320282, + "loss_total": 0.7672110795974731, + "step": 22299 }, { - "epoch": 0.028745416082920756, - "grad_norm": 1.6715984344482422, - "learning_rate": 4.6124449750650175e-05, - "loss": 2.2472, - "step": 68300 + "epoch": 0.016598, + "loss_gen": 3.6811013221740723, + "loss_rtd": 0.3975054621696472, + "loss_sent": 0.06244634836912155, + "loss_sod": 0.13688090443611145, + "loss_total": 0.5968327522277832, + "step": 22299 }, { - "epoch": 0.029024497792463678, - "grad_norm": 1.7050753831863403, - "learning_rate": 4.6112600714470946e-05, - "loss": 2.2404, - "step": 68400 + "epoch": 0.0166, + "grad_norm": 0.8836981654167175, + "learning_rate": 8.677283347826732e-05, + "loss": 0.7077, + "step": 22300 }, { - "epoch": 0.0293035795020066, - "grad_norm": 1.657313585281372, - "learning_rate": 4.61007351192264e-05, - "loss": 2.2446, - "step": 68500 + "epoch": 0.016798, + "loss_gen": 3.832756519317627, + "loss_rtd": 0.4089447259902954, + "loss_sent": 0.3234836757183075, + "loss_sod": 0.006607224233448505, + "loss_total": 0.7390356063842773, + "step": 22399 }, { - "epoch": 0.029582661211549517, - "grad_norm": 1.6855632066726685, - "learning_rate": 4.608885297422298e-05, - "loss": 2.2476, - "step": 68600 + "epoch": 0.016798, + "loss_gen": 3.8547472953796387, + "loss_rtd": 0.39647242426872253, + "loss_sent": 0.15630517899990082, + "loss_sod": 0.08985313773155212, + "loss_total": 0.6426307559013367, + "step": 22399 }, { - "epoch": 0.02986174292109244, - "grad_norm": 1.743253231048584, - "learning_rate": 4.607695428878011e-05, - "loss": 2.2452, - "step": 68700 + "epoch": 0.0168, + "grad_norm": 0.8587817549705505, + "learning_rate": 8.675132451349157e-05, + "loss": 0.693, + "step": 22400 }, { - "epoch": 0.030140824630635357, - "grad_norm": 1.6922000646591187, - "learning_rate": 4.6065039072230166e-05, - "loss": 2.2361, - "step": 68800 + "epoch": 0.016998, + "loss_gen": 4.252213954925537, + "loss_rtd": 0.40426650643348694, + "loss_sent": 0.31181442737579346, + "loss_sod": 0.09723158180713654, + "loss_total": 0.8133125305175781, + "step": 22499 }, { - "epoch": 0.030419906340178278, - "grad_norm": 1.7949845790863037, - "learning_rate": 4.605310733391852e-05, - "loss": 2.2261, - "step": 68900 + "epoch": 0.016998, + "loss_gen": 4.409897804260254, + "loss_rtd": 0.40113937854766846, + "loss_sent": 0.240126371383667, + "loss_sod": 0.06067778170108795, + "loss_total": 0.7019435167312622, + "step": 22499 }, { - "epoch": 0.030698988049721196, - "grad_norm": 1.686171293258667, - "learning_rate": 4.604115908320351e-05, - "loss": 2.2367, - "step": 69000 + "epoch": 0.017, + "grad_norm": 1.4950565099716187, + "learning_rate": 8.67298007452867e-05, + "loss": 0.7183, + "step": 22500 }, { - "epoch": 0.030698988049721196, - "eval_loss": 2.314418315887451, - "eval_runtime": 52.12, - "eval_samples_per_second": 195.587, - "eval_steps_per_second": 1.535, - "step": 69000 + "epoch": 0.017198, + "loss_gen": 2.8861172199249268, + "loss_rtd": 0.40829846262931824, + "loss_sent": 0.057722508907318115, + "loss_sod": 0.20995157957077026, + "loss_total": 0.6759725213050842, + "step": 22599 }, { - "epoch": 0.030978069759264117, - "grad_norm": 1.704830288887024, - "learning_rate": 4.602919432945637e-05, - "loss": 2.2284, - "step": 69100 + "epoch": 0.017198, + "loss_gen": 4.093786716461182, + "loss_rtd": 0.4014229476451874, + "loss_sent": 0.535052478313446, + "loss_sod": 0.021234866231679916, + "loss_total": 0.9577102661132812, + "step": 22599 }, { - "epoch": 0.03125715146880704, - "grad_norm": 1.7228840589523315, - "learning_rate": 4.601721308206133e-05, - "loss": 2.2318, - "step": 69200 + "epoch": 0.0172, + "grad_norm": 2.5805695056915283, + "learning_rate": 8.670826218232248e-05, + "loss": 0.6747, + "step": 22600 }, { - "epoch": 0.03153623317834996, - "grad_norm": 1.7233203649520874, - "learning_rate": 4.600521535041555e-05, - "loss": 2.2403, - "step": 69300 + "epoch": 0.017398, + "loss_gen": 4.194330215454102, + "loss_rtd": 0.3959496319293976, + "loss_sent": 0.21887068450450897, + "loss_sod": 0.055604949593544006, + "loss_total": 0.670425295829773, + "step": 22699 }, { - "epoch": 0.031815314887892875, - "grad_norm": 1.7140865325927734, - "learning_rate": 4.599320114392909e-05, - "loss": 2.2489, - "step": 69400 + "epoch": 0.017398, + "loss_gen": 4.2793169021606445, + "loss_rtd": 0.39640936255455017, + "loss_sent": 0.27822908759117126, + "loss_sod": 0.029160842299461365, + "loss_total": 0.7037992477416992, + "step": 22699 }, { - "epoch": 0.0320943965974358, - "grad_norm": 1.6340928077697754, - "learning_rate": 4.598117047202495e-05, - "loss": 2.2334, - "step": 69500 + "epoch": 0.0174, + "grad_norm": 1.0775412321090698, + "learning_rate": 8.668670883327466e-05, + "loss": 0.7011, + "step": 22700 }, { - "epoch": 0.03237347830697872, - "grad_norm": 1.6307443380355835, - "learning_rate": 4.5969123344139054e-05, - "loss": 2.239, - "step": 69600 + "epoch": 0.017598, + "loss_gen": 3.990032196044922, + "loss_rtd": 0.40130147337913513, + "loss_sent": 0.03100929781794548, + "loss_sod": 0.04583095759153366, + "loss_total": 0.478141725063324, + "step": 22799 }, { - "epoch": 0.032652560016521635, - "grad_norm": 1.770732045173645, - "learning_rate": 4.595705976972022e-05, - "loss": 2.234, - "step": 69700 + "epoch": 0.017598, + "loss_gen": 3.3408172130584717, + "loss_rtd": 0.4124903976917267, + "loss_sent": 0.001292606582865119, + "loss_sod": 0.3106074631214142, + "loss_total": 0.72439044713974, + "step": 22799 }, { - "epoch": 0.03293164172606456, - "grad_norm": 1.6619263887405396, - "learning_rate": 4.5944979758230166e-05, - "loss": 2.2269, - "step": 69800 + "epoch": 0.0176, + "grad_norm": 1.9182292222976685, + "learning_rate": 8.666514070682489e-05, + "loss": 0.6921, + "step": 22800 }, { - "epoch": 0.03321072343560748, - "grad_norm": 1.7418696880340576, - "learning_rate": 4.5932883319143504e-05, - "loss": 2.236, - "step": 69900 + "epoch": 0.017798, + "loss_gen": 4.052070140838623, + "loss_rtd": 0.41997236013412476, + "loss_sent": 0.4130243957042694, + "loss_sod": 0.05000931769609451, + "loss_total": 0.8830060958862305, + "step": 22899 }, { - "epoch": 0.033489805145150396, - "grad_norm": 1.7320384979248047, - "learning_rate": 4.5920770461947734e-05, - "loss": 2.2229, - "step": 70000 + "epoch": 0.017798, + "loss_gen": 4.178469181060791, + "loss_rtd": 0.39809826016426086, + "loss_sent": 0.18564869463443756, + "loss_sod": 0.11889810860157013, + "loss_total": 0.7026450634002686, + "step": 22899 }, { - "epoch": 0.033489805145150396, - "eval_loss": 2.320129871368408, - "eval_runtime": 51.8106, - "eval_samples_per_second": 196.755, - "eval_steps_per_second": 1.544, - "step": 70000 + "epoch": 0.0178, + "grad_norm": 1.1307878494262695, + "learning_rate": 8.664355781166084e-05, + "loss": 0.6988, + "step": 22900 }, { - "epoch": 0.033768886854693314, - "grad_norm": 1.6818188428878784, - "learning_rate": 4.590864119614322e-05, - "loss": 2.2268, - "step": 70100 + "epoch": 0.017998, + "loss_gen": 3.903327226638794, + "loss_rtd": 0.3989095091819763, + "loss_sent": 0.24173308908939362, + "loss_sod": 0.0987914428114891, + "loss_total": 0.7394340634346008, + "step": 22999 }, { - "epoch": 0.03404796856423624, - "grad_norm": 1.7064534425735474, - "learning_rate": 4.5896495531243225e-05, - "loss": 2.2287, - "step": 70200 + "epoch": 0.017998, + "loss_gen": 3.99613356590271, + "loss_rtd": 0.4007818400859833, + "loss_sent": 0.42962703108787537, + "loss_sod": 0.2058112770318985, + "loss_total": 1.0362201929092407, + "step": 22999 }, { - "epoch": 0.03432705027377916, - "grad_norm": 1.7550357580184937, - "learning_rate": 4.588433347677382e-05, - "loss": 2.2288, - "step": 70300 + "epoch": 0.018, + "grad_norm": 1.4807555675506592, + "learning_rate": 8.662196015647608e-05, + "loss": 0.699, + "step": 23000 }, { - "epoch": 0.034606131983322075, - "grad_norm": 1.8009535074234009, - "learning_rate": 4.5872155042274e-05, - "loss": 2.2178, - "step": 70400 + "epoch": 0.018, + "eval_loss": 0.6816897988319397, + "eval_runtime": 151.3149, + "eval_samples_per_second": 102.059, + "eval_steps_per_second": 0.8, + "step": 23000 }, { - "epoch": 0.034885213692865, - "grad_norm": 1.7630360126495361, - "learning_rate": 4.585996023729556e-05, - "loss": 2.222, - "step": 70500 + "epoch": 0.018198, + "loss_gen": 3.8977420330047607, + "loss_rtd": 0.39817938208580017, + "loss_sent": 0.12031625211238861, + "loss_sod": 0.02363927662372589, + "loss_total": 0.5421349406242371, + "step": 23099 }, { - "epoch": 0.03516429540240792, - "grad_norm": 1.6290085315704346, - "learning_rate": 4.584774907140314e-05, - "loss": 2.232, - "step": 70600 + "epoch": 0.018198, + "loss_gen": 3.8846075534820557, + "loss_rtd": 0.4139736592769623, + "loss_sent": 0.09967810660600662, + "loss_sod": 0.03566112369298935, + "loss_total": 0.5493128895759583, + "step": 23099 }, { - "epoch": 0.035443377111950836, - "grad_norm": 1.5934759378433228, - "learning_rate": 4.583552155417423e-05, - "loss": 2.2278, - "step": 70700 + "epoch": 0.0182, + "grad_norm": 0.8791826963424683, + "learning_rate": 8.660034774997014e-05, + "loss": 0.686, + "step": 23100 }, { - "epoch": 0.035722458821493754, - "grad_norm": 1.6519941091537476, - "learning_rate": 4.5823277695199116e-05, - "loss": 2.2099, - "step": 70800 + "epoch": 0.018398, + "loss_gen": 4.103387832641602, + "loss_rtd": 0.39818358421325684, + "loss_sent": 0.24097369611263275, + "loss_sod": 0.0187580157071352, + "loss_total": 0.6579152941703796, + "step": 23199 }, { - "epoch": 0.03600154053103668, - "grad_norm": 1.7670360803604126, - "learning_rate": 4.581101750408095e-05, - "loss": 2.2171, - "step": 70900 + "epoch": 0.018398, + "loss_gen": 4.216580867767334, + "loss_rtd": 0.39543870091438293, + "loss_sent": 0.4009850323200226, + "loss_sod": 0.06736785918474197, + "loss_total": 0.8637915849685669, + "step": 23199 }, { - "epoch": 0.036280622240579596, - "grad_norm": 1.7788828611373901, - "learning_rate": 4.579874099043563e-05, - "loss": 2.2307, - "step": 71000 + "epoch": 0.0184, + "grad_norm": 1.7484025955200195, + "learning_rate": 8.657872060084852e-05, + "loss": 0.7072, + "step": 23200 }, { - "epoch": 0.036280622240579596, - "eval_loss": 2.3103456497192383, - "eval_runtime": 51.7808, - "eval_samples_per_second": 196.868, - "eval_steps_per_second": 1.545, - "step": 71000 + "epoch": 0.018598, + "loss_gen": 3.958742618560791, + "loss_rtd": 0.4209873080253601, + "loss_sent": 0.22494757175445557, + "loss_sod": 0.033440593630075455, + "loss_total": 0.6793754696846008, + "step": 23299 }, { - "epoch": 0.036559703950122514, - "grad_norm": 1.6188585758209229, - "learning_rate": 4.57864481638919e-05, - "loss": 2.221, - "step": 71100 + "epoch": 0.018598, + "loss_gen": 3.8709557056427, + "loss_rtd": 0.4054337441921234, + "loss_sent": 0.12235992401838303, + "loss_sod": 0.0359024778008461, + "loss_total": 0.5636961460113525, + "step": 23299 }, { - "epoch": 0.03683878565966544, - "grad_norm": 1.7492269277572632, - "learning_rate": 4.57741390340913e-05, - "loss": 2.2296, - "step": 71200 + "epoch": 0.0186, + "grad_norm": 1.2961993217468262, + "learning_rate": 8.65570787178226e-05, + "loss": 0.7079, + "step": 23300 }, { - "epoch": 0.03711786736920836, - "grad_norm": 1.6889007091522217, - "learning_rate": 4.576181361068813e-05, - "loss": 2.2177, - "step": 71300 + "epoch": 0.018798, + "loss_gen": 3.9349918365478516, + "loss_rtd": 0.40120649337768555, + "loss_sent": 0.1885022073984146, + "loss_sod": 0.012976760044693947, + "loss_total": 0.6026854515075684, + "step": 23399 }, { - "epoch": 0.037396949078751275, - "grad_norm": 1.611937165260315, - "learning_rate": 4.574947190334949e-05, - "loss": 2.2185, - "step": 71400 + "epoch": 0.018798, + "loss_gen": 4.091781139373779, + "loss_rtd": 0.40365511178970337, + "loss_sent": 0.2327946424484253, + "loss_sod": 0.06162922829389572, + "loss_total": 0.698078989982605, + "step": 23399 }, { - "epoch": 0.0376760307882942, - "grad_norm": 1.768141269683838, - "learning_rate": 4.573711392175525e-05, - "loss": 2.2223, - "step": 71500 + "epoch": 0.0188, + "grad_norm": 1.784354567527771, + "learning_rate": 8.653542210960975e-05, + "loss": 0.7064, + "step": 23400 }, { - "epoch": 0.03795511249783712, - "grad_norm": 1.7611727714538574, - "learning_rate": 4.5724739675598025e-05, - "loss": 2.2128, - "step": 71600 + "epoch": 0.018998, + "loss_gen": 4.1265716552734375, + "loss_rtd": 0.3895909786224365, + "loss_sent": 0.1206795871257782, + "loss_sod": 0.17931661009788513, + "loss_total": 0.6895872354507446, + "step": 23499 }, { - "epoch": 0.038234194207380036, - "grad_norm": 1.6445674896240234, - "learning_rate": 4.5712349174583214e-05, - "loss": 2.2155, - "step": 71700 + "epoch": 0.018998, + "loss_gen": 3.7460741996765137, + "loss_rtd": 0.4037736654281616, + "loss_sent": 0.25270503759384155, + "loss_sod": 0.07054421305656433, + "loss_total": 0.7270228862762451, + "step": 23499 }, { - "epoch": 0.038513275916922954, - "grad_norm": 1.7434101104736328, - "learning_rate": 4.569994242842895e-05, - "loss": 2.2145, - "step": 71800 + "epoch": 0.019, + "grad_norm": 1.0526052713394165, + "learning_rate": 8.651375078493325e-05, + "loss": 0.6982, + "step": 23500 }, { - "epoch": 0.03879235762646588, - "grad_norm": 1.6908565759658813, - "learning_rate": 4.568751944686611e-05, - "loss": 2.2136, - "step": 71900 + "epoch": 0.019198, + "loss_gen": 3.9294276237487793, + "loss_rtd": 0.4179944694042206, + "loss_sent": 0.4713047742843628, + "loss_sod": 0.07345118373632431, + "loss_total": 0.9627504348754883, + "step": 23599 }, { - "epoch": 0.0390714393360088, - "grad_norm": 1.713879108428955, - "learning_rate": 4.5675080239638304e-05, - "loss": 2.2167, - "step": 72000 + "epoch": 0.019198, + "loss_gen": 3.983640193939209, + "loss_rtd": 0.3858213424682617, + "loss_sent": 0.08541528880596161, + "loss_sod": 0.12690787017345428, + "loss_total": 0.59814453125, + "step": 23599 }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.320613145828247, - "eval_runtime": 51.806, - "eval_samples_per_second": 196.773, - "eval_steps_per_second": 1.544, - "step": 72000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 1.7077282667160034, - "learning_rate": 4.566262481650186e-05, - "loss": 2.2166, - "step": 72100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 1.5940215587615967, - "learning_rate": 4.565015318722585e-05, - "loss": 2.223, - "step": 72200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 1.7113089561462402, - "learning_rate": 4.563766536159203e-05, - "loss": 2.2114, - "step": 72300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 1.6140506267547607, - "learning_rate": 4.562516134939487e-05, - "loss": 2.2103, - "step": 72400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 1.7744345664978027, - "learning_rate": 4.5612641160441535e-05, - "loss": 2.2103, - "step": 72500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 1.7696763277053833, - "learning_rate": 4.560010480455188e-05, - "loss": 2.2053, - "step": 72600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 1.6375198364257812, - "learning_rate": 4.5587552291558436e-05, - "loss": 2.2022, - "step": 72700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 1.7271989583969116, - "learning_rate": 4.557498363130641e-05, - "loss": 2.2099, - "step": 72800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 1.6933537721633911, - "learning_rate": 4.556239883365367e-05, - "loss": 2.2101, - "step": 72900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 1.635075330734253, - "learning_rate": 4.554979790847075e-05, - "loss": 2.2776, - "step": 73000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.318647623062134, - "eval_runtime": 51.7253, - "eval_samples_per_second": 197.08, - "eval_steps_per_second": 1.547, - "step": 73000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.7909903526306152, - "learning_rate": 4.553718086564081e-05, - "loss": 2.2744, - "step": 73100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.6957789659500122, - "learning_rate": 4.552454771505968e-05, - "loss": 2.2831, - "step": 73200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.5448663234710693, - "learning_rate": 4.5511898466635806e-05, - "loss": 2.2834, - "step": 73300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.5787805318832397, - "learning_rate": 4.5499233130290266e-05, - "loss": 2.2748, - "step": 73400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.7719733715057373, - "learning_rate": 4.548655171595675e-05, - "loss": 2.2747, - "step": 73500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.6902601718902588, - "learning_rate": 4.5473854233581566e-05, - "loss": 2.2677, - "step": 73600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.7311447858810425, - "learning_rate": 4.5461140693123625e-05, - "loss": 2.2678, - "step": 73700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.6785364151000977, - "learning_rate": 4.544841110455442e-05, - "loss": 2.2705, - "step": 73800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 1.7150171995162964, - "learning_rate": 4.5435665477858056e-05, - "loss": 2.2652, - "step": 73900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.699508547782898, - "learning_rate": 4.5422903823031195e-05, - "loss": 2.2589, - "step": 74000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.2980968952178955, - "eval_runtime": 53.1514, - "eval_samples_per_second": 191.792, - "eval_steps_per_second": 1.505, - "step": 74000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.552603840827942, - "learning_rate": 4.541012615008307e-05, - "loss": 2.254, - "step": 74100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.587418794631958, - "learning_rate": 4.539733246903549e-05, - "loss": 2.2592, - "step": 74200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.7338536977767944, - "learning_rate": 4.538452278992281e-05, - "loss": 2.2594, - "step": 74300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.7801527976989746, - "learning_rate": 4.537169712279194e-05, - "loss": 2.2622, - "step": 74400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.6883269548416138, - "learning_rate": 4.535885547770231e-05, - "loss": 2.2512, - "step": 74500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.749293327331543, - "learning_rate": 4.534599786472592e-05, - "loss": 2.2477, - "step": 74600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.6291452646255493, - "learning_rate": 4.533312429394726e-05, - "loss": 2.2634, - "step": 74700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.6647717952728271, - "learning_rate": 4.5320234775463345e-05, - "loss": 2.2537, - "step": 74800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.6967101097106934, - "learning_rate": 4.5307329319383705e-05, - "loss": 2.2522, - "step": 74900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.7119522094726562, - "learning_rate": 4.5294407935830376e-05, - "loss": 2.2629, - "step": 75000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.299330472946167, - "eval_runtime": 51.3755, - "eval_samples_per_second": 198.422, - "eval_steps_per_second": 1.557, - "step": 75000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.655090570449829, - "learning_rate": 4.528147063493786e-05, - "loss": 2.2451, - "step": 75100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.610613465309143, - "learning_rate": 4.5268517426853155e-05, - "loss": 2.253, - "step": 75200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.5857921838760376, - "learning_rate": 4.525554832173577e-05, - "loss": 2.2476, - "step": 75300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.6049814224243164, - "learning_rate": 4.5242563329757616e-05, - "loss": 2.2455, - "step": 75400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.558239221572876, - "learning_rate": 4.5229562461103114e-05, - "loss": 2.2445, - "step": 75500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.6403298377990723, - "learning_rate": 4.5216545725969137e-05, - "loss": 2.2477, - "step": 75600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.810544490814209, - "learning_rate": 4.520351313456498e-05, - "loss": 2.2586, - "step": 75700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.4252396821975708, - "learning_rate": 4.519046469711237e-05, - "loss": 2.2445, - "step": 75800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.5683726072311401, - "learning_rate": 4.51774004238455e-05, - "loss": 2.2441, - "step": 75900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.5287100076675415, - "learning_rate": 4.5164320325010945e-05, - "loss": 2.2361, - "step": 76000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.308971405029297, - "eval_runtime": 51.5136, - "eval_samples_per_second": 197.89, - "eval_steps_per_second": 1.553, - "step": 76000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 1.4644399881362915, - "learning_rate": 4.515122441086771e-05, - "loss": 2.2371, - "step": 76100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.7674343585968018, - "learning_rate": 4.5138112691687207e-05, - "loss": 2.2397, - "step": 76200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 1.713692545890808, - "learning_rate": 4.512498517775323e-05, - "loss": 2.2385, - "step": 76300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 1.6533859968185425, - "learning_rate": 4.511184187936197e-05, - "loss": 2.2424, - "step": 76400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.669550895690918, - "learning_rate": 4.5098682806822e-05, - "loss": 2.2404, - "step": 76500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.6391149759292603, - "learning_rate": 4.508550797045427e-05, - "loss": 2.236, - "step": 76600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.7389802932739258, - "learning_rate": 4.5072317380592075e-05, - "loss": 2.2343, - "step": 76700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 1.5248901844024658, - "learning_rate": 4.505911104758108e-05, - "loss": 2.2403, - "step": 76800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.699749231338501, - "learning_rate": 4.5045888981779296e-05, - "loss": 2.2325, - "step": 76900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 1.7699368000030518, - "learning_rate": 4.503265119355708e-05, - "loss": 2.2273, - "step": 77000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.306685209274292, - "eval_runtime": 51.4586, - "eval_samples_per_second": 198.101, - "eval_steps_per_second": 1.555, - "step": 77000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 1.6230815649032593, - "learning_rate": 4.50193976932971e-05, - "loss": 2.2366, - "step": 77100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 1.599594235420227, - "learning_rate": 4.500612849139437e-05, - "loss": 2.236, - "step": 77200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 1.6760362386703491, - "learning_rate": 4.4992843598256204e-05, - "loss": 2.2314, - "step": 77300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.5943468809127808, - "learning_rate": 4.497954302430224e-05, - "loss": 2.2336, - "step": 77400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 1.5486056804656982, - "learning_rate": 4.4966226779964385e-05, - "loss": 2.2357, - "step": 77500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 1.767898678779602, - "learning_rate": 4.495289487568687e-05, - "loss": 2.2379, - "step": 77600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 1.6087191104888916, - "learning_rate": 4.493954732192618e-05, - "loss": 2.2281, - "step": 77700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 1.6370609998703003, - "learning_rate": 4.4926184129151104e-05, - "loss": 2.2236, - "step": 77800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.664980411529541, - "learning_rate": 4.491280530784267e-05, - "loss": 2.2144, - "step": 77900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 1.6448901891708374, - "learning_rate": 4.4899410868494173e-05, - "loss": 2.2187, - "step": 78000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.3003854751586914, - "eval_runtime": 51.4481, - "eval_samples_per_second": 198.141, - "eval_steps_per_second": 1.555, - "step": 78000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 1.503465175628662, - "learning_rate": 4.488600082161116e-05, - "loss": 2.2178, - "step": 78100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 1.4563822746276855, - "learning_rate": 4.487257517771142e-05, - "loss": 2.2301, - "step": 78200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.5082318782806396, - "learning_rate": 4.485913394732498e-05, - "loss": 2.2157, - "step": 78300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.7546818256378174, - "learning_rate": 4.484567714099407e-05, - "loss": 2.2304, - "step": 78400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 1.732563853263855, - "learning_rate": 4.4832204769273166e-05, - "loss": 2.2209, - "step": 78500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.7507960796356201, - "learning_rate": 4.481871684272894e-05, - "loss": 2.227, - "step": 78600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 1.5634974241256714, - "learning_rate": 4.4805213371940236e-05, - "loss": 2.2218, - "step": 78700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.6190506219863892, - "learning_rate": 4.479169436749814e-05, - "loss": 2.2221, - "step": 78800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.6474273204803467, - "learning_rate": 4.477815984000589e-05, - "loss": 2.222, - "step": 78900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.7117503881454468, - "learning_rate": 4.4764609800078915e-05, - "loss": 2.2176, - "step": 79000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.2944180965423584, - "eval_runtime": 51.4998, - "eval_samples_per_second": 197.943, - "eval_steps_per_second": 1.553, - "step": 79000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.5974689722061157, - "learning_rate": 4.475104425834479e-05, - "loss": 2.2146, - "step": 79100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.7292412519454956, - "learning_rate": 4.473746322544326e-05, - "loss": 2.2297, - "step": 79200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.641387939453125, - "learning_rate": 4.472386671202623e-05, - "loss": 2.2097, - "step": 79300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.6808158159255981, - "learning_rate": 4.4710254728757724e-05, - "loss": 2.2202, - "step": 79400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.5556340217590332, - "learning_rate": 4.4696627286313916e-05, - "loss": 2.2251, - "step": 79500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.5181496143341064, - "learning_rate": 4.4682984395383116e-05, - "loss": 2.2087, - "step": 79600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.8447983264923096, - "learning_rate": 4.4669326066665715e-05, - "loss": 2.2081, - "step": 79700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.595247507095337, - "learning_rate": 4.4655652310874246e-05, - "loss": 2.2195, - "step": 79800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.7529734373092651, - "learning_rate": 4.464196313873332e-05, - "loss": 2.2121, - "step": 79900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.6872299909591675, - "learning_rate": 4.462825856097966e-05, - "loss": 2.2105, - "step": 80000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.282845973968506, - "eval_runtime": 51.7632, - "eval_samples_per_second": 196.935, - "eval_steps_per_second": 1.546, - "step": 80000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.7083994150161743, - "learning_rate": 4.461453858836206e-05, - "loss": 2.2091, - "step": 80100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.4984794855117798, - "learning_rate": 4.460080323164136e-05, - "loss": 2.3458, - "step": 80200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.4929925203323364, - "learning_rate": 4.458705250159053e-05, - "loss": 2.3548, - "step": 80300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.528534173965454, - "learning_rate": 4.457328640899455e-05, - "loss": 2.3536, - "step": 80400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.6262974739074707, - "learning_rate": 4.455950496465046e-05, - "loss": 2.3425, - "step": 80500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.5449632406234741, - "learning_rate": 4.454570817936734e-05, - "loss": 2.3339, - "step": 80600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.5070366859436035, - "learning_rate": 4.4531896063966304e-05, - "loss": 2.3321, - "step": 80700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.6697232723236084, - "learning_rate": 4.4518068629280505e-05, - "loss": 2.3288, - "step": 80800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.672261118888855, - "learning_rate": 4.450422588615507e-05, - "loss": 2.3247, - "step": 80900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.497429370880127, - "learning_rate": 4.4490367845447186e-05, - "loss": 2.3227, - "step": 81000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.307387351989746, - "eval_runtime": 51.7303, - "eval_samples_per_second": 197.061, - "eval_steps_per_second": 1.546, - "step": 81000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.451381802558899, - "learning_rate": 4.447649451802599e-05, - "loss": 2.3206, - "step": 81100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.7021763324737549, - "learning_rate": 4.446260591477265e-05, - "loss": 2.3278, - "step": 81200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.693590521812439, - "learning_rate": 4.444870204658027e-05, - "loss": 2.3276, - "step": 81300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.5749512910842896, - "learning_rate": 4.443478292435397e-05, - "loss": 2.3106, - "step": 81400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 1.5621660947799683, - "learning_rate": 4.4420848559010816e-05, - "loss": 2.3088, - "step": 81500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 1.602555513381958, - "learning_rate": 4.440689896147983e-05, - "loss": 2.3207, - "step": 81600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.565977931022644, - "learning_rate": 4.439293414270198e-05, - "loss": 2.3104, - "step": 81700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.6112767457962036, - "learning_rate": 4.437895411363016e-05, - "loss": 2.3042, - "step": 81800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 1.574155330657959, - "learning_rate": 4.436495888522921e-05, - "loss": 2.3086, - "step": 81900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.580681324005127, - "learning_rate": 4.43509484684759e-05, - "loss": 2.3097, - "step": 82000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.311988592147827, - "eval_runtime": 51.7596, - "eval_samples_per_second": 196.949, - "eval_steps_per_second": 1.546, - "step": 82000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.5292607545852661, - "learning_rate": 4.4336922874358887e-05, - "loss": 2.2958, - "step": 82100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.5391873121261597, - "learning_rate": 4.432288211387876e-05, - "loss": 2.2988, - "step": 82200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.6578278541564941, - "learning_rate": 4.430882619804798e-05, - "loss": 2.3013, - "step": 82300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.5353553295135498, - "learning_rate": 4.42947551378909e-05, - "loss": 2.2993, - "step": 82400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.5105656385421753, - "learning_rate": 4.428066894444376e-05, - "loss": 2.2884, - "step": 82500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.4228105545043945, - "learning_rate": 4.4266567628754654e-05, - "loss": 2.3006, - "step": 82600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.6121501922607422, - "learning_rate": 4.425245120188356e-05, - "loss": 2.2925, - "step": 82700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.6145355701446533, - "learning_rate": 4.423831967490228e-05, - "loss": 2.2918, - "step": 82800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.4856330156326294, - "learning_rate": 4.422417305889448e-05, - "loss": 2.2916, - "step": 82900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.5816203355789185, - "learning_rate": 4.421001136495566e-05, - "loss": 2.2816, - "step": 83000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.320969581604004, - "eval_runtime": 51.7264, - "eval_samples_per_second": 197.075, - "eval_steps_per_second": 1.547, - "step": 83000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.5632624626159668, - "learning_rate": 4.419583460419313e-05, - "loss": 2.2814, - "step": 83100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 1.5248332023620605, - "learning_rate": 4.418164278772604e-05, - "loss": 2.283, - "step": 83200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.5358643531799316, - "learning_rate": 4.416743592668532e-05, - "loss": 2.2881, - "step": 83300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.531286358833313, - "learning_rate": 4.415321403221372e-05, - "loss": 2.2796, - "step": 83400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.591470718383789, - "learning_rate": 4.413897711546579e-05, - "loss": 2.2824, - "step": 83500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 1.6353243589401245, - "learning_rate": 4.412472518760783e-05, - "loss": 2.2822, - "step": 83600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 1.535651445388794, - "learning_rate": 4.4110458259817954e-05, - "loss": 2.2792, - "step": 83700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.5406070947647095, - "learning_rate": 4.4096176343286e-05, - "loss": 2.274, - "step": 83800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.5447317361831665, - "learning_rate": 4.408187944921359e-05, - "loss": 2.2733, - "step": 83900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 1.625569224357605, - "learning_rate": 4.406756758881408e-05, - "loss": 2.2741, - "step": 84000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.316434144973755, - "eval_runtime": 51.6963, - "eval_samples_per_second": 197.19, - "eval_steps_per_second": 1.548, - "step": 84000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 1.4764138460159302, - "learning_rate": 4.405324077331257e-05, - "loss": 2.2714, - "step": 84100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 1.6313000917434692, - "learning_rate": 4.40388990139459e-05, - "loss": 2.2733, - "step": 84200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.6612203121185303, - "learning_rate": 4.40245423219626e-05, - "loss": 2.2696, - "step": 84300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 1.5471678972244263, - "learning_rate": 4.4010170708622945e-05, - "loss": 2.2687, - "step": 84400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 1.4957951307296753, - "learning_rate": 4.3995784185198895e-05, - "loss": 2.2729, - "step": 84500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 1.7474133968353271, - "learning_rate": 4.3981382762974105e-05, - "loss": 2.2676, - "step": 84600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 1.6063662767410278, - "learning_rate": 4.396696645324393e-05, - "loss": 2.2737, - "step": 84700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 1.5897927284240723, - "learning_rate": 4.395253526731538e-05, - "loss": 2.2744, - "step": 84800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 1.4931442737579346, - "learning_rate": 4.3938089216507146e-05, - "loss": 2.2638, - "step": 84900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 1.703892469406128, - "learning_rate": 4.3923628312149575e-05, - "loss": 2.2564, - "step": 85000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.3203041553497314, - "eval_runtime": 51.8215, - "eval_samples_per_second": 196.714, - "eval_steps_per_second": 1.544, - "step": 85000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 1.5777971744537354, - "learning_rate": 4.390915256558467e-05, - "loss": 2.265, - "step": 85100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 1.5620445013046265, - "learning_rate": 4.389466198816606e-05, - "loss": 2.2689, - "step": 85200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 1.50547456741333, - "learning_rate": 4.388015659125903e-05, - "loss": 2.2659, - "step": 85300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 1.6230157613754272, - "learning_rate": 4.386563638624046e-05, - "loss": 2.2593, - "step": 85400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 1.553817868232727, - "learning_rate": 4.3851101384498864e-05, - "loss": 2.2565, - "step": 85500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 1.5825836658477783, - "learning_rate": 4.383655159743435e-05, - "loss": 2.2475, - "step": 85600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 1.497200608253479, - "learning_rate": 4.382198703645864e-05, - "loss": 2.259, - "step": 85700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 1.7085014581680298, - "learning_rate": 4.380740771299502e-05, - "loss": 2.2513, - "step": 85800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 1.654452919960022, - "learning_rate": 4.3792813638478366e-05, - "loss": 2.2516, - "step": 85900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 1.5657737255096436, - "learning_rate": 4.377820482435513e-05, - "loss": 2.2623, - "step": 86000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.310570001602173, - "eval_runtime": 51.7573, - "eval_samples_per_second": 196.958, - "eval_steps_per_second": 1.546, - "step": 86000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 1.5554670095443726, - "learning_rate": 4.3763581282083314e-05, - "loss": 2.2464, - "step": 86100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 1.6469368934631348, - "learning_rate": 4.374894302313247e-05, - "loss": 2.2569, - "step": 86200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 1.640326976776123, - "learning_rate": 4.3734290058983714e-05, - "loss": 2.2481, - "step": 86300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 1.5348613262176514, - "learning_rate": 4.3719622401129665e-05, - "loss": 2.251, - "step": 86400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 1.5372469425201416, - "learning_rate": 4.370494006107449e-05, - "loss": 2.246, - "step": 86500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 1.5619806051254272, - "learning_rate": 4.369024305033386e-05, - "loss": 2.2472, - "step": 86600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 1.5712941884994507, - "learning_rate": 4.367553138043495e-05, - "loss": 2.2443, - "step": 86700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 1.4509296417236328, - "learning_rate": 4.3660805062916456e-05, - "loss": 2.2376, - "step": 86800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 1.5161213874816895, - "learning_rate": 4.3646064109328525e-05, - "loss": 2.2363, - "step": 86900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 1.6400469541549683, - "learning_rate": 4.3631308531232805e-05, - "loss": 2.2478, - "step": 87000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.3120627403259277, - "eval_runtime": 51.7529, - "eval_samples_per_second": 196.974, - "eval_steps_per_second": 1.546, - "step": 87000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 1.5647891759872437, - "learning_rate": 4.3616538340202424e-05, - "loss": 2.2447, - "step": 87100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 1.5194950103759766, - "learning_rate": 4.360175354782196e-05, - "loss": 2.252, - "step": 87200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 1.5468082427978516, - "learning_rate": 4.358695416568742e-05, - "loss": 2.2406, - "step": 87300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 1.579084873199463, - "learning_rate": 4.3572140205406295e-05, - "loss": 2.2425, - "step": 87400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 1.608675241470337, - "learning_rate": 4.35573116785975e-05, - "loss": 2.233, - "step": 87500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 1.764980673789978, - "learning_rate": 4.354246859689134e-05, - "loss": 2.2387, - "step": 87600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 1.750517725944519, - "learning_rate": 4.352761097192959e-05, - "loss": 2.2221, - "step": 87700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 1.7584564685821533, - "learning_rate": 4.3512738815365385e-05, - "loss": 2.2177, - "step": 87800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 1.7790557146072388, - "learning_rate": 4.3497852138863284e-05, - "loss": 2.2101, - "step": 87900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 1.7064695358276367, - "learning_rate": 4.348295095409922e-05, - "loss": 2.2024, - "step": 88000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.2995381355285645, - "eval_runtime": 51.8012, - "eval_samples_per_second": 196.791, - "eval_steps_per_second": 1.544, - "step": 88000 - }, - { - "epoch": 0.042141338140980915, - "grad_norm": 1.5459762811660767, - "learning_rate": 4.346803527276053e-05, - "loss": 2.1982, - "step": 88100 - }, - { - "epoch": 0.04242041985052384, - "grad_norm": 1.6607937812805176, - "learning_rate": 4.3453105106545875e-05, - "loss": 2.1937, - "step": 88200 - }, - { - "epoch": 0.04269950156006676, - "grad_norm": 1.6780369281768799, - "learning_rate": 4.3438160467165326e-05, - "loss": 2.1887, - "step": 88300 - }, - { - "epoch": 0.042978583269609676, - "grad_norm": 1.6140103340148926, - "learning_rate": 4.342320136634027e-05, - "loss": 2.1768, - "step": 88400 - }, - { - "epoch": 0.043257664979152594, - "grad_norm": 1.6889429092407227, - "learning_rate": 4.340822781580346e-05, - "loss": 2.1783, - "step": 88500 - }, - { - "epoch": 0.04353674668869552, - "grad_norm": 1.717197299003601, - "learning_rate": 4.3393239827298973e-05, - "loss": 2.1842, - "step": 88600 - }, - { - "epoch": 0.043815828398238436, - "grad_norm": 1.6120058298110962, - "learning_rate": 4.33782374125822e-05, - "loss": 2.1775, - "step": 88700 - }, - { - "epoch": 0.044094910107781354, - "grad_norm": 1.7139967679977417, - "learning_rate": 4.3363220583419875e-05, - "loss": 2.1724, - "step": 88800 - }, - { - "epoch": 0.04437399181732428, - "grad_norm": 1.6353269815444946, - "learning_rate": 4.3348189351589996e-05, - "loss": 2.1773, - "step": 88900 - }, - { - "epoch": 0.0446530735268672, - "grad_norm": 1.6677064895629883, - "learning_rate": 4.333314372888189e-05, - "loss": 2.177, - "step": 89000 - }, - { - "epoch": 0.0446530735268672, - "eval_loss": 2.2914843559265137, - "eval_runtime": 51.8687, - "eval_samples_per_second": 196.535, - "eval_steps_per_second": 1.542, - "step": 89000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.6284213066101074, - "learning_rate": 4.331808372709617e-05, - "loss": 2.1737, - "step": 89100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.5599644184112549, - "learning_rate": 4.33030093580447e-05, - "loss": 2.1666, - "step": 89200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.5871340036392212, - "learning_rate": 4.328792063355065e-05, - "loss": 2.1656, - "step": 89300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.655039668083191, - "learning_rate": 4.327281756544842e-05, - "loss": 2.169, - "step": 89400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.549782633781433, - "learning_rate": 4.325770016558367e-05, - "loss": 2.1748, - "step": 89500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.6504789590835571, - "learning_rate": 4.3242568445813306e-05, - "loss": 2.1637, - "step": 89600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.6154310703277588, - "learning_rate": 4.322742241800545e-05, - "loss": 2.1582, - "step": 89700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.611567497253418, - "learning_rate": 4.321226209403947e-05, - "loss": 2.1519, - "step": 89800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 1.5689648389816284, - "learning_rate": 4.319708748580592e-05, - "loss": 2.1659, - "step": 89900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.630550503730774, - "learning_rate": 4.318189860520658e-05, - "loss": 2.1545, - "step": 90000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.2889482975006104, - "eval_runtime": 51.8104, - "eval_samples_per_second": 196.756, - "eval_steps_per_second": 1.544, - "step": 90000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.5548030138015747, - "learning_rate": 4.316669546415441e-05, - "loss": 2.1585, - "step": 90100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.675545573234558, - "learning_rate": 4.315147807457356e-05, - "loss": 2.1588, - "step": 90200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.6402935981750488, - "learning_rate": 4.313624644839936e-05, - "loss": 2.1548, - "step": 90300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.6139987707138062, - "learning_rate": 4.312100059757829e-05, - "loss": 2.1571, - "step": 90400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.625307321548462, - "learning_rate": 4.310574053406801e-05, - "loss": 2.1595, - "step": 90500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.6252094507217407, - "learning_rate": 4.3090466269837304e-05, - "loss": 2.1593, - "step": 90600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.6178643703460693, - "learning_rate": 4.307517781686611e-05, - "loss": 2.1393, - "step": 90700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.4477986097335815, - "learning_rate": 4.3059875187145495e-05, - "loss": 2.1526, - "step": 90800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.6491955518722534, - "learning_rate": 4.3044558392677627e-05, - "loss": 2.1456, - "step": 90900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.6281492710113525, - "learning_rate": 4.30292274454758e-05, - "loss": 2.1516, - "step": 91000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.270752191543579, - "eval_runtime": 51.3993, - "eval_samples_per_second": 198.329, - "eval_steps_per_second": 1.556, - "step": 91000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.6257696151733398, - "learning_rate": 4.301388235756442e-05, - "loss": 2.1455, - "step": 91100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.6170133352279663, - "learning_rate": 4.299852314097894e-05, - "loss": 2.1379, - "step": 91200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.5804738998413086, - "learning_rate": 4.298314980776594e-05, - "loss": 2.1415, - "step": 91300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.6626943349838257, - "learning_rate": 4.2967762369983065e-05, - "loss": 2.1513, - "step": 91400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.5107390880584717, - "learning_rate": 4.295236083969899e-05, - "loss": 2.1477, - "step": 91500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.6073352098464966, - "learning_rate": 4.293694522899349e-05, - "loss": 2.1468, - "step": 91600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.5642460584640503, - "learning_rate": 4.292151554995734e-05, - "loss": 2.1394, - "step": 91700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.4667211771011353, - "learning_rate": 4.290607181469236e-05, - "loss": 2.1446, - "step": 91800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.5254555940628052, - "learning_rate": 4.2890614035311425e-05, - "loss": 2.1516, - "step": 91900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.5663783550262451, - "learning_rate": 4.2875142223938395e-05, - "loss": 2.1407, - "step": 92000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.2744436264038086, - "eval_runtime": 51.2413, - "eval_samples_per_second": 198.941, - "eval_steps_per_second": 1.561, - "step": 92000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 1.6381663084030151, - "learning_rate": 4.285965639270814e-05, - "loss": 2.1364, - "step": 92100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.5091722011566162, - "learning_rate": 4.284415655376654e-05, - "loss": 2.1387, - "step": 92200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 1.53826105594635, - "learning_rate": 4.282864271927042e-05, - "loss": 2.1422, - "step": 92300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 1.5313266515731812, - "learning_rate": 4.281311490138765e-05, - "loss": 2.1358, - "step": 92400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.5104668140411377, - "learning_rate": 4.279757311229702e-05, - "loss": 2.1324, - "step": 92500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.5808652639389038, - "learning_rate": 4.278201736418828e-05, - "loss": 2.1479, - "step": 92600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.5481529235839844, - "learning_rate": 4.276644766926213e-05, - "loss": 2.1359, - "step": 92700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 1.6417638063430786, - "learning_rate": 4.275086403973023e-05, - "loss": 2.1421, - "step": 92800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.6985423564910889, - "learning_rate": 4.2735266487815156e-05, - "loss": 2.1376, - "step": 92900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 1.7289358377456665, - "learning_rate": 4.271965502575039e-05, - "loss": 2.1446, - "step": 93000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.2753000259399414, - "eval_runtime": 51.3478, - "eval_samples_per_second": 198.529, - "eval_steps_per_second": 1.558, - "step": 93000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 1.5933712720870972, - "learning_rate": 4.2704029665780354e-05, - "loss": 2.1341, - "step": 93100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 1.4210540056228638, - "learning_rate": 4.2688390420160335e-05, - "loss": 2.125, - "step": 93200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 1.7447474002838135, - "learning_rate": 4.267273730115654e-05, - "loss": 2.1259, - "step": 93300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.6314444541931152, - "learning_rate": 4.265707032104603e-05, - "loss": 2.1334, - "step": 93400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 1.6820199489593506, - "learning_rate": 4.264138949211678e-05, - "loss": 2.1263, - "step": 93500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 1.6244908571243286, - "learning_rate": 4.2625694826667576e-05, - "loss": 2.1313, - "step": 93600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 1.6561076641082764, - "learning_rate": 4.260998633700809e-05, - "loss": 2.1274, - "step": 93700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 1.5971171855926514, - "learning_rate": 4.259426403545883e-05, - "loss": 2.1304, - "step": 93800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.5423985719680786, - "learning_rate": 4.257852793435113e-05, - "loss": 2.1356, - "step": 93900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 1.6720298528671265, - "learning_rate": 4.256277804602715e-05, - "loss": 2.1297, - "step": 94000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.2692244052886963, - "eval_runtime": 51.3308, - "eval_samples_per_second": 198.594, - "eval_steps_per_second": 1.559, - "step": 94000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 1.670507550239563, - "learning_rate": 4.254701438283987e-05, - "loss": 2.1326, - "step": 94100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 1.6407012939453125, - "learning_rate": 4.253123695715307e-05, - "loss": 2.1201, - "step": 94200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.6673415899276733, - "learning_rate": 4.2515445781341306e-05, - "loss": 2.1236, - "step": 94300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.6600122451782227, - "learning_rate": 4.2499640867789955e-05, - "loss": 2.1387, - "step": 94400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 1.669821858406067, - "learning_rate": 4.248382222889515e-05, - "loss": 2.1376, - "step": 94500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.6163934469223022, - "learning_rate": 4.246798987706378e-05, - "loss": 2.1219, - "step": 94600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 1.6449049711227417, - "learning_rate": 4.24521438247135e-05, - "loss": 2.1197, - "step": 94700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.6894525289535522, - "learning_rate": 4.2436284084272706e-05, - "loss": 2.1185, - "step": 94800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.6635791063308716, - "learning_rate": 4.242041066818053e-05, - "loss": 2.1202, - "step": 94900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.5680242776870728, - "learning_rate": 4.240452358888685e-05, - "loss": 2.1157, - "step": 95000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.2662878036499023, - "eval_runtime": 51.3865, - "eval_samples_per_second": 198.379, - "eval_steps_per_second": 1.557, - "step": 95000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.6089997291564941, - "learning_rate": 4.2388622858852224e-05, - "loss": 2.1249, - "step": 95100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.5722119808197021, - "learning_rate": 4.237270849054794e-05, - "loss": 2.1169, - "step": 95200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.6253992319107056, - "learning_rate": 4.2356780496455984e-05, - "loss": 2.121, - "step": 95300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.7267462015151978, - "learning_rate": 4.2340838889069014e-05, - "loss": 2.1174, - "step": 95400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.639393925666809, - "learning_rate": 4.232488368089038e-05, - "loss": 2.1191, - "step": 95500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.570025086402893, - "learning_rate": 4.2308914884434096e-05, - "loss": 2.1208, - "step": 95600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.6924635171890259, - "learning_rate": 4.2292932512224835e-05, - "loss": 2.1218, - "step": 95700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.5708248615264893, - "learning_rate": 4.22769365767979e-05, - "loss": 2.1191, - "step": 95800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.6411412954330444, - "learning_rate": 4.226092709069926e-05, - "loss": 2.1017, - "step": 95900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.6997929811477661, - "learning_rate": 4.224490406648548e-05, - "loss": 2.1125, - "step": 96000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.2651915550231934, - "eval_runtime": 51.6612, - "eval_samples_per_second": 197.324, - "eval_steps_per_second": 1.549, - "step": 96000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.678976058959961, - "learning_rate": 4.222886751672379e-05, - "loss": 2.1085, - "step": 96100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.6502765417099, - "learning_rate": 4.221281745399197e-05, - "loss": 2.104, - "step": 96200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.7067300081253052, - "learning_rate": 4.219675389087845e-05, - "loss": 2.0918, - "step": 96300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.5428402423858643, - "learning_rate": 4.218067683998221e-05, - "loss": 2.0912, - "step": 96400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.5912175178527832, - "learning_rate": 4.2164586313912844e-05, - "loss": 2.1018, - "step": 96500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.6541246175765991, - "learning_rate": 4.214848232529048e-05, - "loss": 2.1059, - "step": 96600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.6385318040847778, - "learning_rate": 4.2132364886745834e-05, - "loss": 2.0992, - "step": 96700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.6574422121047974, - "learning_rate": 4.2116234010920153e-05, - "loss": 2.1014, - "step": 96800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.798996090888977, - "learning_rate": 4.210008971046522e-05, - "loss": 2.0837, - "step": 96900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.7654832601547241, - "learning_rate": 4.208393199804337e-05, - "loss": 2.0967, - "step": 97000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.2732906341552734, - "eval_runtime": 51.5925, - "eval_samples_per_second": 197.587, - "eval_steps_per_second": 1.551, - "step": 97000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.659149169921875, - "learning_rate": 4.206776088632744e-05, - "loss": 2.0867, - "step": 97100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.6352757215499878, - "learning_rate": 4.205157638800077e-05, - "loss": 2.0883, - "step": 97200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.6729434728622437, - "learning_rate": 4.203537851575722e-05, - "loss": 2.0826, - "step": 97300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.6211354732513428, - "learning_rate": 4.201916728230112e-05, - "loss": 2.0862, - "step": 97400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 1.6426453590393066, - "learning_rate": 4.20029427003473e-05, - "loss": 2.1016, - "step": 97500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 1.7282731533050537, - "learning_rate": 4.198670478262103e-05, - "loss": 2.0897, - "step": 97600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.6720244884490967, - "learning_rate": 4.1970453541858075e-05, - "loss": 2.0887, - "step": 97700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.6182509660720825, - "learning_rate": 4.195418899080462e-05, - "loss": 2.0941, - "step": 97800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 1.5953019857406616, - "learning_rate": 4.19379111422173e-05, - "loss": 2.0799, - "step": 97900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.6194970607757568, - "learning_rate": 4.1921620008863193e-05, - "loss": 2.0915, - "step": 98000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.269259452819824, - "eval_runtime": 51.5672, - "eval_samples_per_second": 197.684, - "eval_steps_per_second": 1.551, - "step": 98000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.6391347646713257, - "learning_rate": 4.1905315603519765e-05, - "loss": 2.0761, - "step": 98100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.6369253396987915, - "learning_rate": 4.1888997938974935e-05, - "loss": 2.0862, - "step": 98200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.628437876701355, - "learning_rate": 4.187266702802698e-05, - "loss": 2.0835, - "step": 98300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.643228530883789, - "learning_rate": 4.1856322883484584e-05, - "loss": 2.078, - "step": 98400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.6307592391967773, - "learning_rate": 4.183996551816681e-05, - "loss": 2.0703, - "step": 98500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.6314188241958618, - "learning_rate": 4.18235949449031e-05, - "loss": 2.0926, - "step": 98600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.5619598627090454, - "learning_rate": 4.180721117653323e-05, - "loss": 2.0827, - "step": 98700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.5566825866699219, - "learning_rate": 4.179081422590736e-05, - "loss": 2.0754, - "step": 98800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.621233344078064, - "learning_rate": 4.177440410588596e-05, - "loss": 2.0702, - "step": 98900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.6783591508865356, - "learning_rate": 4.1757980829339826e-05, - "loss": 2.0691, - "step": 99000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.2590763568878174, - "eval_runtime": 51.6452, - "eval_samples_per_second": 197.385, - "eval_steps_per_second": 1.549, - "step": 99000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.6113650798797607, - "learning_rate": 4.1741544409150104e-05, - "loss": 2.0677, - "step": 99100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 1.6120631694793701, - "learning_rate": 4.172509485820823e-05, - "loss": 2.0784, - "step": 99200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.603555679321289, - "learning_rate": 4.170863218941593e-05, - "loss": 2.0685, - "step": 99300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.5876059532165527, - "learning_rate": 4.1692156415685234e-05, - "loss": 2.0694, - "step": 99400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.5650913715362549, - "learning_rate": 4.167566754993844e-05, - "loss": 2.0714, - "step": 99500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 1.6612671613693237, - "learning_rate": 4.1659165605108134e-05, - "loss": 2.0754, - "step": 99600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 1.6820577383041382, - "learning_rate": 4.1642650594137116e-05, - "loss": 2.0686, - "step": 99700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.6811972856521606, - "learning_rate": 4.162612252997849e-05, - "loss": 2.0719, - "step": 99800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.6226907968521118, - "learning_rate": 4.160958142559556e-05, - "loss": 2.0654, - "step": 99900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 1.5367672443389893, - "learning_rate": 4.159302729396186e-05, - "loss": 2.077, - "step": 100000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.2724859714508057, - "eval_runtime": 51.6394, - "eval_samples_per_second": 197.407, - "eval_steps_per_second": 1.549, - "step": 100000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 1.6159389019012451, - "learning_rate": 4.157646014806117e-05, - "loss": 2.0694, - "step": 100100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 1.6105570793151855, - "learning_rate": 4.155988000088745e-05, - "loss": 2.0527, - "step": 100200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.6409541368484497, - "learning_rate": 4.1543286865444856e-05, - "loss": 2.0671, - "step": 100300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 1.6089441776275635, - "learning_rate": 4.152668075474775e-05, - "loss": 2.0659, - "step": 100400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 1.6299337148666382, - "learning_rate": 4.151006168182065e-05, - "loss": 2.0634, - "step": 100500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 1.5447492599487305, - "learning_rate": 4.1493429659698266e-05, - "loss": 2.0709, - "step": 100600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 1.6599881649017334, - "learning_rate": 4.147678470142544e-05, - "loss": 2.0663, - "step": 100700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 1.6231052875518799, - "learning_rate": 4.146012682005717e-05, - "loss": 2.0691, - "step": 100800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 1.67854642868042, - "learning_rate": 4.144345602865859e-05, - "loss": 2.0569, - "step": 100900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 1.655781626701355, - "learning_rate": 4.1426772340304964e-05, - "loss": 2.0546, - "step": 101000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.275222063064575, - "eval_runtime": 51.7111, - "eval_samples_per_second": 197.134, - "eval_steps_per_second": 1.547, - "step": 101000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 1.6492592096328735, - "learning_rate": 4.141007576808166e-05, - "loss": 2.0605, - "step": 101100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 1.5914729833602905, - "learning_rate": 4.139336632508415e-05, - "loss": 2.0417, - "step": 101200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 1.6000899076461792, - "learning_rate": 4.1376644024418035e-05, - "loss": 2.0621, - "step": 101300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 1.5423117876052856, - "learning_rate": 4.135990887919894e-05, - "loss": 2.0709, - "step": 101400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 1.5954508781433105, - "learning_rate": 4.134316090255263e-05, - "loss": 2.0585, - "step": 101500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 1.603651523590088, - "learning_rate": 4.1326400107614877e-05, - "loss": 2.0688, - "step": 101600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 1.6470582485198975, - "learning_rate": 4.130962650753154e-05, - "loss": 2.0548, - "step": 101700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 1.6013911962509155, - "learning_rate": 4.129284011545852e-05, - "loss": 2.0502, - "step": 101800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 1.6595914363861084, - "learning_rate": 4.127604094456174e-05, - "loss": 2.0653, - "step": 101900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 1.6441375017166138, - "learning_rate": 4.125922900801715e-05, - "loss": 2.0181, - "step": 102000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.27272891998291, - "eval_runtime": 51.6797, - "eval_samples_per_second": 197.253, - "eval_steps_per_second": 1.548, - "step": 102000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 1.5254052877426147, - "learning_rate": 4.124240431901071e-05, - "loss": 2.0585, - "step": 102100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 1.5843355655670166, - "learning_rate": 4.1225566890738384e-05, - "loss": 2.059, - "step": 102200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 1.64484441280365, - "learning_rate": 4.120871673640613e-05, - "loss": 2.0522, - "step": 102300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 1.7020013332366943, - "learning_rate": 4.119185386922988e-05, - "loss": 2.0519, - "step": 102400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 1.6111907958984375, - "learning_rate": 4.117497830243555e-05, - "loss": 2.0563, - "step": 102500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 1.5560317039489746, - "learning_rate": 4.1158090049259005e-05, - "loss": 2.0476, - "step": 102600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 1.6740190982818604, - "learning_rate": 4.114118912294607e-05, - "loss": 2.0654, - "step": 102700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 1.5790201425552368, - "learning_rate": 4.1124275536752494e-05, - "loss": 2.0557, - "step": 102800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 1.6047033071517944, - "learning_rate": 4.110734930394397e-05, - "loss": 2.0472, - "step": 102900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 1.5579067468643188, - "learning_rate": 4.1090410437796104e-05, - "loss": 2.054, - "step": 103000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.261636257171631, - "eval_runtime": 51.6907, - "eval_samples_per_second": 197.212, - "eval_steps_per_second": 1.548, - "step": 103000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 1.5974478721618652, - "learning_rate": 4.107345895159441e-05, - "loss": 2.0528, - "step": 103100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 1.6119569540023804, - "learning_rate": 4.105649485863431e-05, - "loss": 2.0571, - "step": 103200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 1.606919527053833, - "learning_rate": 4.1039518172221105e-05, - "loss": 2.0516, - "step": 103300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 1.700379490852356, - "learning_rate": 4.1022528905669954e-05, - "loss": 2.0405, - "step": 103400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 1.5789330005645752, - "learning_rate": 4.100552707230593e-05, - "loss": 2.0551, - "step": 103500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 1.5743343830108643, - "learning_rate": 4.098851268546392e-05, - "loss": 2.0558, - "step": 103600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 1.6383607387542725, - "learning_rate": 4.097148575848868e-05, - "loss": 2.0473, - "step": 103700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 1.5813179016113281, - "learning_rate": 4.095444630473478e-05, - "loss": 2.0462, - "step": 103800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 1.5996896028518677, - "learning_rate": 4.093739433756665e-05, - "loss": 2.0445, - "step": 103900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 1.627163290977478, - "learning_rate": 4.09203298703585e-05, - "loss": 2.0512, - "step": 104000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.2715542316436768, - "eval_runtime": 51.5969, - "eval_samples_per_second": 197.57, - "eval_steps_per_second": 1.55, - "step": 104000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.621848225593567, - "learning_rate": 4.090325291649436e-05, - "loss": 2.056, - "step": 104100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.5937111377716064, - "learning_rate": 4.088616348936804e-05, - "loss": 2.0527, - "step": 104200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 3.054250478744507, - "learning_rate": 4.0869061602383166e-05, - "loss": 2.1108, - "step": 104300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.599531888961792, - "learning_rate": 4.0851947268953096e-05, - "loss": 2.148, - "step": 104400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.4354475736618042, - "learning_rate": 4.083482050250098e-05, - "loss": 2.1317, - "step": 104500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.6972846984863281, - "learning_rate": 4.08176813164597e-05, - "loss": 2.1313, - "step": 104600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.3760380744934082, - "learning_rate": 4.0800529724271896e-05, - "loss": 2.1105, - "step": 104700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.5800480842590332, - "learning_rate": 4.0783365739389924e-05, - "loss": 2.1108, - "step": 104800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 1.6185262203216553, - "learning_rate": 4.076618937527585e-05, - "loss": 2.0927, - "step": 104900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.666864037513733, - "learning_rate": 4.07490006454015e-05, - "loss": 2.096, - "step": 105000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.255873441696167, - "eval_runtime": 52.3492, - "eval_samples_per_second": 194.731, - "eval_steps_per_second": 1.528, - "step": 105000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.558935284614563, - "learning_rate": 4.0731799563248334e-05, - "loss": 2.0891, - "step": 105100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.62444269657135, - "learning_rate": 4.0714586142307546e-05, - "loss": 2.0738, - "step": 105200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.6997108459472656, - "learning_rate": 4.069736039607998e-05, - "loss": 2.0742, - "step": 105300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.2430105209350586, - "learning_rate": 4.0680122338076156e-05, - "loss": 2.0892, - "step": 105400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.6265345811843872, - "learning_rate": 4.0662871981816266e-05, - "loss": 2.0759, - "step": 105500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.429768681526184, - "learning_rate": 4.064560934083012e-05, - "loss": 2.0731, - "step": 105600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.5453941822052002, - "learning_rate": 4.062833442865719e-05, - "loss": 2.0334, - "step": 105700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.6710811853408813, - "learning_rate": 4.061104725884654e-05, - "loss": 2.0727, - "step": 105800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.668752908706665, - "learning_rate": 4.0593747844956896e-05, - "loss": 2.0566, - "step": 105900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.6783727407455444, - "learning_rate": 4.057643620055654e-05, - "loss": 2.0607, - "step": 106000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.251800537109375, - "eval_runtime": 51.9592, - "eval_samples_per_second": 196.192, - "eval_steps_per_second": 1.54, - "step": 106000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.4173957109451294, - "learning_rate": 4.055911233922338e-05, - "loss": 2.0537, - "step": 106100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.6333705186843872, - "learning_rate": 4.054177627454487e-05, - "loss": 2.0679, - "step": 106200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.078372836112976, - "learning_rate": 4.0524428020118074e-05, - "loss": 2.0599, - "step": 106300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.7360562086105347, - "learning_rate": 4.0507067589549595e-05, - "loss": 2.0451, - "step": 106400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.0331660509109497, - "learning_rate": 4.048969499645559e-05, - "loss": 2.0569, - "step": 106500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.7799084186553955, - "learning_rate": 4.0472310254461765e-05, - "loss": 2.0452, - "step": 106600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.6667675971984863, - "learning_rate": 4.045491337720333e-05, - "loss": 2.0386, - "step": 106700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.5602868795394897, - "learning_rate": 4.043750437832504e-05, - "loss": 2.0379, - "step": 106800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.670214295387268, - "learning_rate": 4.0420083271481144e-05, - "loss": 2.0362, - "step": 106900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.6006022691726685, - "learning_rate": 4.040265007033538e-05, - "loss": 2.042, - "step": 107000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.252915620803833, - "eval_runtime": 51.9529, - "eval_samples_per_second": 196.216, - "eval_steps_per_second": 1.54, - "step": 107000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 1.5068638324737549, - "learning_rate": 4.0385204788561e-05, - "loss": 2.0379, - "step": 107100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.7357958555221558, - "learning_rate": 4.0367747439840694e-05, - "loss": 2.0289, - "step": 107200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 1.6686527729034424, - "learning_rate": 4.0350278037866654e-05, - "loss": 2.0413, - "step": 107300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 1.226962924003601, - "learning_rate": 4.0332796596340485e-05, - "loss": 2.0204, - "step": 107400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.696763515472412, - "learning_rate": 4.031530312897327e-05, - "loss": 2.0296, - "step": 107500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.6187881231307983, - "learning_rate": 4.0297797649485515e-05, - "loss": 2.0211, - "step": 107600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.082560658454895, - "learning_rate": 4.028028017160712e-05, - "loss": 2.0304, - "step": 107700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 1.551513910293579, - "learning_rate": 4.026275070907744e-05, - "loss": 2.0332, - "step": 107800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.658700704574585, - "learning_rate": 4.024520927564521e-05, - "loss": 2.0134, - "step": 107900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 1.734298586845398, - "learning_rate": 4.022765588506854e-05, - "loss": 2.0259, - "step": 108000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.248534917831421, - "eval_runtime": 51.918, - "eval_samples_per_second": 196.348, - "eval_steps_per_second": 1.541, - "step": 108000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 1.3358790874481201, - "learning_rate": 4.021009055111493e-05, - "loss": 2.0226, - "step": 108100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 1.6433982849121094, - "learning_rate": 4.019251328756125e-05, - "loss": 2.0231, - "step": 108200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 1.6571450233459473, - "learning_rate": 4.0174924108193734e-05, - "loss": 2.0272, - "step": 108300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.5485866069793701, - "learning_rate": 4.015732302680795e-05, - "loss": 2.0154, - "step": 108400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 1.717331886291504, - "learning_rate": 4.0139710057208794e-05, - "loss": 2.0186, - "step": 108500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 1.3007159233093262, - "learning_rate": 4.012208521321049e-05, - "loss": 2.017, - "step": 108600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 1.6857043504714966, - "learning_rate": 4.01044485086366e-05, - "loss": 2.0183, - "step": 108700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 1.5786222219467163, - "learning_rate": 4.0086799957319965e-05, - "loss": 2.0081, - "step": 108800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.708694338798523, - "learning_rate": 4.0069139573102715e-05, - "loss": 2.0052, - "step": 108900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 1.6401610374450684, - "learning_rate": 4.005146736983627e-05, - "loss": 1.998, - "step": 109000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.2400035858154297, - "eval_runtime": 52.1098, - "eval_samples_per_second": 195.625, - "eval_steps_per_second": 1.535, - "step": 109000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 1.3219704627990723, - "learning_rate": 4.0033783361381324e-05, - "loss": 2.0154, - "step": 109100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 1.7542061805725098, - "learning_rate": 4.001608756160781e-05, - "loss": 2.0129, - "step": 109200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.690807819366455, - "learning_rate": 3.999837998439494e-05, - "loss": 2.0031, - "step": 109300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.652106523513794, - "learning_rate": 3.9980660643631137e-05, - "loss": 2.0025, - "step": 109400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 1.5204631090164185, - "learning_rate": 3.996292955321406e-05, - "loss": 2.0024, - "step": 109500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.6795552968978882, - "learning_rate": 3.9945186727050574e-05, - "loss": 2.0078, - "step": 109600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 1.7057318687438965, - "learning_rate": 3.992743217905678e-05, - "loss": 2.0012, - "step": 109700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.5931191444396973, - "learning_rate": 3.990966592315793e-05, - "loss": 2.0042, - "step": 109800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.6618363857269287, - "learning_rate": 3.989188797328851e-05, - "loss": 2.0029, - "step": 109900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.6900277137756348, - "learning_rate": 3.987409834339211e-05, - "loss": 1.9952, - "step": 110000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.2469289302825928, - "eval_runtime": 52.1385, - "eval_samples_per_second": 195.518, - "eval_steps_per_second": 1.534, - "step": 110000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.4866619110107422, - "learning_rate": 3.985629704742153e-05, - "loss": 2.0031, - "step": 110100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.759400725364685, - "learning_rate": 3.9838484099338714e-05, - "loss": 1.9986, - "step": 110200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.6036171913146973, - "learning_rate": 3.9820659513114735e-05, - "loss": 2.0032, - "step": 110300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.650638461112976, - "learning_rate": 3.9802823302729806e-05, - "loss": 1.9968, - "step": 110400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 0.8783386945724487, - "learning_rate": 3.978497548217324e-05, - "loss": 1.927, - "step": 110500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.0009950399398804, - "learning_rate": 3.9767116065443464e-05, - "loss": 1.8641, - "step": 110600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.0648219585418701, - "learning_rate": 3.974924506654801e-05, - "loss": 1.8177, - "step": 110700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 0.8869590759277344, - "learning_rate": 3.9731362499503474e-05, - "loss": 1.8099, - "step": 110800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.1313343048095703, - "learning_rate": 3.971346837833556e-05, - "loss": 1.7797, - "step": 110900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 0.87049400806427, - "learning_rate": 3.969556271707898e-05, - "loss": 1.7763, - "step": 111000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.235299587249756, - "eval_runtime": 52.3725, - "eval_samples_per_second": 194.644, - "eval_steps_per_second": 1.528, - "step": 111000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 0.9816115498542786, - "learning_rate": 3.967764552977754e-05, - "loss": 1.7607, - "step": 111100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.1495822668075562, - "learning_rate": 3.9659716830484085e-05, - "loss": 1.7387, - "step": 111200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.117358684539795, - "learning_rate": 3.9641776633260464e-05, - "loss": 1.7432, - "step": 111300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.1056963205337524, - "learning_rate": 3.962382495217757e-05, - "loss": 1.7173, - "step": 111400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.3292206525802612, - "learning_rate": 3.960586180131528e-05, - "loss": 1.7183, - "step": 111500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.182369589805603, - "learning_rate": 3.9587887194762485e-05, - "loss": 1.7172, - "step": 111600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 0.8990152478218079, - "learning_rate": 3.956990114661705e-05, - "loss": 1.6941, - "step": 111700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.1310220956802368, - "learning_rate": 3.955190367098582e-05, - "loss": 1.6928, - "step": 111800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 0.880366325378418, - "learning_rate": 3.9533894781984606e-05, - "loss": 1.6886, - "step": 111900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 0.8530257940292358, - "learning_rate": 3.951587449373816e-05, - "loss": 1.6671, - "step": 112000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.2474164962768555, - "eval_runtime": 52.3853, - "eval_samples_per_second": 194.597, - "eval_steps_per_second": 1.527, - "step": 112000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.0223313570022583, - "learning_rate": 3.949784282038018e-05, - "loss": 1.6847, - "step": 112100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 0.878471314907074, - "learning_rate": 3.9479799776053306e-05, - "loss": 1.6647, - "step": 112200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.2385903596878052, - "learning_rate": 3.9461745374909066e-05, - "loss": 1.6462, - "step": 112300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.071331262588501, - "learning_rate": 3.9443679631107924e-05, - "loss": 1.641, - "step": 112400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 0.8975522518157959, - "learning_rate": 3.942560255881922e-05, - "loss": 1.6262, - "step": 112500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 0.8427426218986511, - "learning_rate": 3.94075141722212e-05, - "loss": 1.6467, - "step": 112600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 0.8209208250045776, - "learning_rate": 3.938941448550098e-05, - "loss": 1.6257, - "step": 112700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 0.8760822415351868, - "learning_rate": 3.937130351285452e-05, - "loss": 1.6292, - "step": 112800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 0.9947307705879211, - "learning_rate": 3.935318126848664e-05, - "loss": 1.6358, - "step": 112900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.6038973331451416, - "learning_rate": 3.933504776661102e-05, - "loss": 1.8923, - "step": 113000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.267655849456787, - "eval_runtime": 51.8322, - "eval_samples_per_second": 196.673, - "eval_steps_per_second": 1.543, - "step": 113000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.4726253747940063, - "learning_rate": 3.931690302145014e-05, - "loss": 1.9579, - "step": 113100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.4382061958312988, - "learning_rate": 3.9298747047235327e-05, - "loss": 1.9466, - "step": 113200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.5668888092041016, - "learning_rate": 3.928057985820668e-05, - "loss": 1.9479, - "step": 113300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.5239572525024414, - "learning_rate": 3.926240146861314e-05, - "loss": 1.9306, - "step": 113400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.6437016725540161, - "learning_rate": 3.924421189271239e-05, - "loss": 1.9364, - "step": 113500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.632081151008606, - "learning_rate": 3.9226011144770904e-05, - "loss": 1.9297, - "step": 113600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.577589750289917, - "learning_rate": 3.920779923906393e-05, - "loss": 1.9223, - "step": 113700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.5894604921340942, - "learning_rate": 3.918957618987545e-05, - "loss": 1.9263, - "step": 113800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.5186114311218262, - "learning_rate": 3.9171342011498185e-05, - "loss": 1.9261, - "step": 113900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.639272689819336, - "learning_rate": 3.9153096718233604e-05, - "loss": 1.9166, - "step": 114000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.2479372024536133, - "eval_runtime": 51.8659, - "eval_samples_per_second": 196.545, - "eval_steps_per_second": 1.542, - "step": 114000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.5154080390930176, - "learning_rate": 3.913484032439187e-05, - "loss": 1.916, - "step": 114100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 1.5147068500518799, - "learning_rate": 3.911657284429189e-05, - "loss": 1.9153, - "step": 114200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.5977636575698853, - "learning_rate": 3.9098294292261205e-05, - "loss": 1.9027, - "step": 114300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.4557409286499023, - "learning_rate": 3.908000468263609e-05, - "loss": 1.9073, - "step": 114400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.5657786130905151, - "learning_rate": 3.9061704029761495e-05, - "loss": 1.9026, - "step": 114500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 1.5239813327789307, - "learning_rate": 3.904339234799098e-05, - "loss": 1.9092, - "step": 114600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 1.4764059782028198, - "learning_rate": 3.9025069651686816e-05, - "loss": 1.9019, - "step": 114700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.5580713748931885, - "learning_rate": 3.9006735955219874e-05, - "loss": 1.8958, - "step": 114800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.5598928928375244, - "learning_rate": 3.898839127296968e-05, - "loss": 1.8961, - "step": 114900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 1.4871125221252441, - "learning_rate": 3.897003561932434e-05, - "loss": 1.8978, - "step": 115000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.2493135929107666, - "eval_runtime": 51.934, - "eval_samples_per_second": 196.288, - "eval_steps_per_second": 1.54, - "step": 115000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 1.4677025079727173, - "learning_rate": 3.89516690086806e-05, - "loss": 1.8905, - "step": 115100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 1.4988230466842651, - "learning_rate": 3.8933291455443786e-05, - "loss": 1.8935, - "step": 115200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.5289825201034546, - "learning_rate": 3.891490297402781e-05, - "loss": 1.8875, - "step": 115300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 1.480161428451538, - "learning_rate": 3.889650357885514e-05, - "loss": 1.8897, - "step": 115400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 1.6153870820999146, - "learning_rate": 3.887809328435683e-05, - "loss": 1.8878, - "step": 115500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 1.409119725227356, - "learning_rate": 3.8859672104972454e-05, - "loss": 1.8976, - "step": 115600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 1.5816664695739746, - "learning_rate": 3.884124005515015e-05, - "loss": 1.8876, - "step": 115700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 1.5077482461929321, - "learning_rate": 3.882279714934657e-05, - "loss": 1.888, - "step": 115800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 1.4968522787094116, - "learning_rate": 3.880434340202686e-05, - "loss": 1.8841, - "step": 115900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 1.467680811882019, - "learning_rate": 3.878587882766472e-05, - "loss": 1.8832, - "step": 116000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.2356553077697754, - "eval_runtime": 51.966, - "eval_samples_per_second": 196.167, - "eval_steps_per_second": 1.539, - "step": 116000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 1.4993896484375, - "learning_rate": 3.87674034407423e-05, - "loss": 1.8783, - "step": 116100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 1.479200005531311, - "learning_rate": 3.8748917255750225e-05, - "loss": 1.8864, - "step": 116200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 1.4365732669830322, - "learning_rate": 3.873042028718764e-05, - "loss": 1.877, - "step": 116300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 1.4553158283233643, - "learning_rate": 3.871191254956208e-05, - "loss": 1.8873, - "step": 116400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 1.505732774734497, - "learning_rate": 3.8693394057389574e-05, - "loss": 1.8737, - "step": 116500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 1.4430396556854248, - "learning_rate": 3.8674864825194574e-05, - "loss": 1.8743, - "step": 116600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 1.5058810710906982, - "learning_rate": 3.865632486750996e-05, - "loss": 1.8734, - "step": 116700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 1.551901936531067, - "learning_rate": 3.8637774198877e-05, - "loss": 1.8754, - "step": 116800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 1.5766918659210205, - "learning_rate": 3.86192128338454e-05, - "loss": 1.878, - "step": 116900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 1.5308256149291992, - "learning_rate": 3.860064078697323e-05, - "loss": 1.8833, - "step": 117000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.2324624061584473, - "eval_runtime": 51.883, - "eval_samples_per_second": 196.48, - "eval_steps_per_second": 1.542, - "step": 117000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.2410722970962524, - "learning_rate": 3.858205807282694e-05, - "loss": 1.9933, - "step": 117100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.6566898822784424, - "learning_rate": 3.8563464705981354e-05, - "loss": 1.9959, - "step": 117200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.6581664085388184, - "learning_rate": 3.854486070101965e-05, - "loss": 1.9696, - "step": 117300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.6906877756118774, - "learning_rate": 3.8526246072533345e-05, - "loss": 1.9822, - "step": 117400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.5992618799209595, - "learning_rate": 3.850762083512229e-05, - "loss": 1.9615, - "step": 117500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.3387641906738281, - "learning_rate": 3.848898500339466e-05, - "loss": 1.973, - "step": 117600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.6966856718063354, - "learning_rate": 3.847033859196694e-05, - "loss": 1.9695, - "step": 117700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.525177240371704, - "learning_rate": 3.8451681615463915e-05, - "loss": 1.9705, - "step": 117800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 1.657475233078003, - "learning_rate": 3.843301408851864e-05, - "loss": 1.9697, - "step": 117900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.721798300743103, - "learning_rate": 3.8414336025772456e-05, - "loss": 1.9546, - "step": 118000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.221956968307495, - "eval_runtime": 52.1068, - "eval_samples_per_second": 195.637, - "eval_steps_per_second": 1.535, - "step": 118000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.5945574045181274, - "learning_rate": 3.839564744187498e-05, - "loss": 1.9359, - "step": 118100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.7270615100860596, - "learning_rate": 3.837694835148406e-05, - "loss": 1.9548, - "step": 118200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.653337836265564, - "learning_rate": 3.835823876926579e-05, - "loss": 1.9536, - "step": 118300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.7248106002807617, - "learning_rate": 3.833951870989451e-05, - "loss": 1.9573, - "step": 118400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.1816054582595825, - "learning_rate": 3.832078818805275e-05, - "loss": 1.9473, - "step": 118500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.6775774955749512, - "learning_rate": 3.8302047218431266e-05, - "loss": 1.9735, - "step": 118600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.046647071838379, - "learning_rate": 3.8283295815729e-05, - "loss": 1.9687, - "step": 118700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.624248743057251, - "learning_rate": 3.8264533994653087e-05, - "loss": 1.9574, - "step": 118800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.0700417757034302, - "learning_rate": 3.824576176991882e-05, - "loss": 1.9535, - "step": 118900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.6102774143218994, - "learning_rate": 3.8226979156249655e-05, - "loss": 1.9551, - "step": 119000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.2151541709899902, - "eval_runtime": 51.66, - "eval_samples_per_second": 197.329, - "eval_steps_per_second": 1.549, - "step": 119000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.7988970279693604, - "learning_rate": 3.820818616837719e-05, - "loss": 1.9406, - "step": 119100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.6762808561325073, - "learning_rate": 3.818938282104119e-05, - "loss": 1.9413, - "step": 119200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.2559093236923218, - "learning_rate": 3.817056912898951e-05, - "loss": 1.9393, - "step": 119300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.659754991531372, - "learning_rate": 3.815174510697813e-05, - "loss": 1.9473, - "step": 119400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.2323046922683716, - "learning_rate": 3.813291076977114e-05, - "loss": 1.9449, - "step": 119500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.651207685470581, - "learning_rate": 3.811406613214071e-05, - "loss": 1.9452, - "step": 119600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.6981886625289917, - "learning_rate": 3.80952112088671e-05, - "loss": 1.9506, - "step": 119700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.7373100519180298, - "learning_rate": 3.807634601473862e-05, - "loss": 1.9405, - "step": 119800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.8006949424743652, - "learning_rate": 3.805747056455166e-05, - "loss": 1.9428, - "step": 119900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.7427655458450317, - "learning_rate": 3.803858487311063e-05, - "loss": 1.9296, - "step": 120000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.2108328342437744, - "eval_runtime": 51.7291, - "eval_samples_per_second": 197.065, - "eval_steps_per_second": 1.547, - "step": 120000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 2.748073101043701, - "learning_rate": 3.8019688955227974e-05, - "loss": 3.8624, - "step": 120100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 3.3290884494781494, - "learning_rate": 3.800078282572419e-05, - "loss": 3.8983, - "step": 120200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 3.4201247692108154, - "learning_rate": 3.798186649942774e-05, - "loss": 3.8567, - "step": 120300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 3.4081761837005615, - "learning_rate": 3.796293999117511e-05, - "loss": 3.8651, - "step": 120400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 3.281001329421997, - "learning_rate": 3.7944003315810776e-05, - "loss": 3.8587, - "step": 120500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 2.8812153339385986, - "learning_rate": 3.792505648818715e-05, - "loss": 3.8612, - "step": 120600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 3.5344903469085693, - "learning_rate": 3.790609952316467e-05, - "loss": 3.8711, - "step": 120700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 2.995441436767578, - "learning_rate": 3.7887132435611677e-05, - "loss": 3.8788, - "step": 120800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 3.382432699203491, - "learning_rate": 3.786815524040446e-05, - "loss": 3.8611, - "step": 120900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 3.4205808639526367, - "learning_rate": 3.784916795242724e-05, - "loss": 3.855, - "step": 121000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.2230899333953857, - "eval_runtime": 52.6219, - "eval_samples_per_second": 193.722, - "eval_steps_per_second": 1.52, - "step": 121000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 3.3155264854431152, - "learning_rate": 3.783017058657215e-05, - "loss": 3.8192, - "step": 121100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 3.595088243484497, - "learning_rate": 3.7811163157739246e-05, - "loss": 3.8689, - "step": 121200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 3.4232356548309326, - "learning_rate": 3.7792145680836453e-05, - "loss": 3.8547, - "step": 121300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 3.4015252590179443, - "learning_rate": 3.7773118170779584e-05, - "loss": 3.8624, - "step": 121400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 2.724720001220703, - "learning_rate": 3.775408064249233e-05, - "loss": 3.8415, - "step": 121500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 3.4392824172973633, - "learning_rate": 3.773503311090622e-05, - "loss": 3.8783, - "step": 121600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.9879323244094849, - "learning_rate": 3.771597559096066e-05, - "loss": 3.873, - "step": 121700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 3.4553592205047607, - "learning_rate": 3.7696908097602844e-05, - "loss": 3.8727, - "step": 121800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 2.3339455127716064, - "learning_rate": 3.767783064578784e-05, - "loss": 3.8546, - "step": 121900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 3.121299982070923, - "learning_rate": 3.7658743250478495e-05, - "loss": 3.859, - "step": 122000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.219958782196045, - "eval_runtime": 51.8398, - "eval_samples_per_second": 196.644, - "eval_steps_per_second": 1.543, - "step": 122000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 3.5799753665924072, - "learning_rate": 3.763964592664546e-05, - "loss": 3.8475, - "step": 122100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 3.5003957748413086, - "learning_rate": 3.7620538689267186e-05, - "loss": 3.8556, - "step": 122200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 2.886596202850342, - "learning_rate": 3.7601421553329876e-05, - "loss": 3.8463, - "step": 122300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 3.3040432929992676, - "learning_rate": 3.758229453382751e-05, - "loss": 3.868, - "step": 122400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 2.8264660835266113, - "learning_rate": 3.756315764576183e-05, - "loss": 3.8631, - "step": 122500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 3.332125186920166, - "learning_rate": 3.754401090414229e-05, - "loss": 3.8392, - "step": 122600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 3.4241065979003906, - "learning_rate": 3.75248543239861e-05, - "loss": 3.8693, - "step": 122700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 3.448194980621338, - "learning_rate": 3.750568792031819e-05, - "loss": 3.8463, - "step": 122800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 3.5221004486083984, - "learning_rate": 3.748651170817116e-05, - "loss": 3.8652, - "step": 122900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 3.5034878253936768, - "learning_rate": 3.746732570258533e-05, - "loss": 3.8299, - "step": 123000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.2181193828582764, - "eval_runtime": 51.8674, - "eval_samples_per_second": 196.54, - "eval_steps_per_second": 1.542, - "step": 123000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 2.127269744873047, - "learning_rate": 3.7448129918608706e-05, - "loss": 3.8731, - "step": 123100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 3.29091739654541, - "learning_rate": 3.7428924371296935e-05, - "loss": 3.9066, - "step": 123200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 2.9820969104766846, - "learning_rate": 3.740970907571336e-05, - "loss": 3.8654, - "step": 123300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 3.3646039962768555, - "learning_rate": 3.739048404692893e-05, - "loss": 3.8893, - "step": 123400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 2.2552099227905273, - "learning_rate": 3.737124930002226e-05, - "loss": 3.8794, - "step": 123500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 3.449066638946533, - "learning_rate": 3.735200485007957e-05, - "loss": 3.8869, - "step": 123600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 3.5163280963897705, - "learning_rate": 3.733275071219469e-05, - "loss": 3.8858, - "step": 123700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 2.634568452835083, - "learning_rate": 3.731348690146906e-05, - "loss": 3.878, - "step": 123800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 3.3576948642730713, - "learning_rate": 3.72942134330117e-05, - "loss": 3.8731, - "step": 123900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 3.476285219192505, - "learning_rate": 3.7274930321939205e-05, - "loss": 3.887, - "step": 124000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.2249643802642822, - "eval_runtime": 51.7986, - "eval_samples_per_second": 196.801, - "eval_steps_per_second": 1.544, - "step": 124000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 3.3951408863067627, - "learning_rate": 3.7255637583375725e-05, - "loss": 3.8708, - "step": 124100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 3.3581931591033936, - "learning_rate": 3.7236335232452977e-05, - "loss": 3.8622, - "step": 124200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 3.3235626220703125, - "learning_rate": 3.7217023284310196e-05, - "loss": 3.8526, - "step": 124300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 3.508228063583374, - "learning_rate": 3.719770175409417e-05, - "loss": 3.848, - "step": 124400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 3.0591793060302734, - "learning_rate": 3.717837065695918e-05, - "loss": 3.8698, - "step": 124500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 3.3803138732910156, - "learning_rate": 3.715903000806703e-05, - "loss": 3.8825, - "step": 124600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 3.422788143157959, - "learning_rate": 3.7139679822586996e-05, - "loss": 3.856, - "step": 124700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 3.3941519260406494, - "learning_rate": 3.7120320115695857e-05, - "loss": 3.8594, - "step": 124800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 2.929302453994751, - "learning_rate": 3.710095090257782e-05, - "loss": 3.8679, - "step": 124900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 3.4934051036834717, - "learning_rate": 3.708157219842461e-05, - "loss": 3.8595, - "step": 125000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.225058078765869, - "eval_runtime": 51.8264, - "eval_samples_per_second": 196.695, - "eval_steps_per_second": 1.544, - "step": 125000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 2.81558895111084, - "learning_rate": 3.706218401843532e-05, - "loss": 3.8671, - "step": 125100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 3.2801120281219482, - "learning_rate": 3.704278637781655e-05, - "loss": 3.8591, - "step": 125200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 3.1782455444335938, - "learning_rate": 3.702337929178226e-05, - "loss": 3.8703, - "step": 125300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 3.1894006729125977, - "learning_rate": 3.7003962775553866e-05, - "loss": 3.8597, - "step": 125400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 3.327129602432251, - "learning_rate": 3.698453684436014e-05, - "loss": 3.859, - "step": 125500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 3.5265657901763916, - "learning_rate": 3.6965101513437267e-05, - "loss": 3.8468, - "step": 125600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 3.223062038421631, - "learning_rate": 3.6945656798028785e-05, - "loss": 3.8544, - "step": 125700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 3.4239840507507324, - "learning_rate": 3.6926202713385606e-05, - "loss": 3.8502, - "step": 125800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.897346019744873, - "learning_rate": 3.6906739274765986e-05, - "loss": 3.6361, - "step": 125900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.675214171409607, - "learning_rate": 3.6887266497435516e-05, - "loss": 3.3334, - "step": 126000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.207582950592041, - "eval_runtime": 52.021, - "eval_samples_per_second": 195.959, - "eval_steps_per_second": 1.538, - "step": 126000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 2.446707248687744, - "learning_rate": 3.686778439666712e-05, - "loss": 3.2678, - "step": 126100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.7556930780410767, - "learning_rate": 3.6848292987741006e-05, - "loss": 3.2757, - "step": 126200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 2.2318058013916016, - "learning_rate": 3.682879228594472e-05, - "loss": 3.2595, - "step": 126300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 2.536383867263794, - "learning_rate": 3.680928230657308e-05, - "loss": 3.2332, - "step": 126400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 2.480672836303711, - "learning_rate": 3.678976306492819e-05, - "loss": 3.2357, - "step": 126500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 2.3426804542541504, - "learning_rate": 3.677023457631939e-05, - "loss": 3.2118, - "step": 126600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 2.1600797176361084, - "learning_rate": 3.6750696856063304e-05, - "loss": 3.2129, - "step": 126700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.6937828063964844, - "learning_rate": 3.673114991948379e-05, - "loss": 3.2046, - "step": 126800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 2.2766456604003906, - "learning_rate": 3.671159378191191e-05, - "loss": 3.1943, - "step": 126900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.9862797260284424, - "learning_rate": 3.669202845868597e-05, - "loss": 3.1908, - "step": 127000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.2138376235961914, - "eval_runtime": 51.9698, - "eval_samples_per_second": 196.152, - "eval_steps_per_second": 1.539, - "step": 127000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.726231336593628, - "learning_rate": 3.6672453965151485e-05, - "loss": 3.1654, - "step": 127100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 2.486175537109375, - "learning_rate": 3.6652870316661133e-05, - "loss": 3.1584, - "step": 127200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.9903950691223145, - "learning_rate": 3.663327752857481e-05, - "loss": 3.1698, - "step": 127300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.7355810403823853, - "learning_rate": 3.661367561625954e-05, - "loss": 3.119, - "step": 127400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 2.132373809814453, - "learning_rate": 3.6594064595089534e-05, - "loss": 3.1671, - "step": 127500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.8258633613586426, - "learning_rate": 3.657444448044612e-05, - "loss": 3.1271, - "step": 127600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 2.02746844291687, - "learning_rate": 3.65548152877178e-05, - "loss": 3.1182, - "step": 127700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 2.4161229133605957, - "learning_rate": 3.6535177032300144e-05, - "loss": 3.1113, - "step": 127800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.707440733909607, - "learning_rate": 3.651552972959588e-05, - "loss": 3.0544, - "step": 127900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.7457507848739624, - "learning_rate": 3.649587339501479e-05, - "loss": 3.1207, - "step": 128000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.2254478931427, - "eval_runtime": 52.0855, - "eval_samples_per_second": 195.717, - "eval_steps_per_second": 1.536, - "step": 128000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.717077612876892, - "learning_rate": 3.647620804397378e-05, - "loss": 3.0992, - "step": 128100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.6803147792816162, - "learning_rate": 3.6456533691896785e-05, - "loss": 3.095, - "step": 128200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 2.2515881061553955, - "learning_rate": 3.643685035421483e-05, - "loss": 3.1254, - "step": 128300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 2.9855501651763916, - "learning_rate": 3.641715804636598e-05, - "loss": 3.5587, - "step": 128400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 3.2831578254699707, - "learning_rate": 3.6397456783795336e-05, - "loss": 3.7799, - "step": 128500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 3.2432754039764404, - "learning_rate": 3.637774658195501e-05, - "loss": 3.7901, - "step": 128600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 3.28851056098938, - "learning_rate": 3.6358027456304144e-05, - "loss": 3.7778, - "step": 128700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 3.192739963531494, - "learning_rate": 3.633829942230888e-05, - "loss": 3.7389, - "step": 128800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 3.0515987873077393, - "learning_rate": 3.6318562495442315e-05, - "loss": 3.742, - "step": 128900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 2.9723589420318604, - "learning_rate": 3.629881669118456e-05, - "loss": 3.7461, - "step": 129000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.2262234687805176, - "eval_runtime": 52.1567, - "eval_samples_per_second": 195.449, - "eval_steps_per_second": 1.534, - "step": 129000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 3.1359448432922363, - "learning_rate": 3.627906202502267e-05, - "loss": 3.7374, - "step": 129100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 3.2023541927337646, - "learning_rate": 3.6259298512450645e-05, - "loss": 3.7531, - "step": 129200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 2.9798941612243652, - "learning_rate": 3.623952616896945e-05, - "loss": 3.7509, - "step": 129300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 3.104867458343506, - "learning_rate": 3.621974501008695e-05, - "loss": 3.7258, - "step": 129400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 2.8948721885681152, - "learning_rate": 3.6199955051317914e-05, - "loss": 3.7291, - "step": 129500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 3.4071433544158936, - "learning_rate": 3.618015630818406e-05, - "loss": 3.7334, - "step": 129600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 3.0882623195648193, - "learning_rate": 3.6160348796213936e-05, - "loss": 3.7099, - "step": 129700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 3.026120185852051, - "learning_rate": 3.6140532530943025e-05, - "loss": 3.7309, - "step": 129800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 3.150139570236206, - "learning_rate": 3.612070752791363e-05, - "loss": 3.7094, - "step": 129900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 3.0325634479522705, - "learning_rate": 3.610087380267495e-05, - "loss": 3.7265, - "step": 130000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.2089452743530273, - "eval_runtime": 52.1784, - "eval_samples_per_second": 195.368, - "eval_steps_per_second": 1.533, - "step": 130000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 2.9735958576202393, - "learning_rate": 3.6081031370782974e-05, - "loss": 3.7094, - "step": 130100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 2.9261887073516846, - "learning_rate": 3.6061180247800564e-05, - "loss": 3.7091, - "step": 130200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 2.927654266357422, - "learning_rate": 3.604132044929736e-05, - "loss": 3.7146, - "step": 130300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 3.1641175746917725, - "learning_rate": 3.602145199084986e-05, - "loss": 3.706, - "step": 130400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 3.108304023742676, - "learning_rate": 3.600157488804129e-05, - "loss": 3.7051, - "step": 130500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 3.076998710632324, - "learning_rate": 3.598168915646171e-05, - "loss": 3.7033, - "step": 130600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 3.1134989261627197, - "learning_rate": 3.5961794811707915e-05, - "loss": 3.6995, - "step": 130700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 3.0833628177642822, - "learning_rate": 3.5941891869383474e-05, - "loss": 3.7156, - "step": 130800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 2.9580423831939697, - "learning_rate": 3.592198034509868e-05, - "loss": 3.6818, - "step": 130900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 3.1217994689941406, - "learning_rate": 3.590206025447058e-05, - "loss": 3.6902, - "step": 131000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.215585708618164, - "eval_runtime": 52.1786, - "eval_samples_per_second": 195.368, - "eval_steps_per_second": 1.533, - "step": 131000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 3.0639188289642334, - "learning_rate": 3.588213161312291e-05, - "loss": 3.6862, - "step": 131100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 2.920884132385254, - "learning_rate": 3.5862194436686156e-05, - "loss": 3.6815, - "step": 131200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 3.0907158851623535, - "learning_rate": 3.584224874079745e-05, - "loss": 3.6951, - "step": 131300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 3.105325222015381, - "learning_rate": 3.582229454110065e-05, - "loss": 3.7043, - "step": 131400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 3.2944116592407227, - "learning_rate": 3.5802331853246245e-05, - "loss": 3.6847, - "step": 131500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 3.0563266277313232, - "learning_rate": 3.578236069289141e-05, - "loss": 3.692, - "step": 131600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 3.1431596279144287, - "learning_rate": 3.576238107569994e-05, - "loss": 3.6776, - "step": 131700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 2.9235949516296387, - "learning_rate": 3.5742393017342294e-05, - "loss": 3.6924, - "step": 131800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 2.9574053287506104, - "learning_rate": 3.572239653349552e-05, - "loss": 3.6733, - "step": 131900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 3.0737438201904297, - "learning_rate": 3.570239163984331e-05, - "loss": 3.6787, - "step": 132000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.210517644882202, - "eval_runtime": 52.1762, - "eval_samples_per_second": 195.376, - "eval_steps_per_second": 1.533, - "step": 132000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 2.946190595626831, - "learning_rate": 3.568237835207591e-05, - "loss": 3.6731, - "step": 132100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 2.947497844696045, - "learning_rate": 3.566235668589017e-05, - "loss": 3.6757, - "step": 132200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 3.0203778743743896, - "learning_rate": 3.5642326656989525e-05, - "loss": 3.6767, - "step": 132300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 3.1348791122436523, - "learning_rate": 3.562228828108396e-05, - "loss": 3.6904, - "step": 132400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 3.3154730796813965, - "learning_rate": 3.5602241573889984e-05, - "loss": 3.6876, - "step": 132500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 2.9193050861358643, - "learning_rate": 3.558218655113066e-05, - "loss": 3.6869, - "step": 132600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 2.8012895584106445, - "learning_rate": 3.5562123228535594e-05, - "loss": 3.6905, - "step": 132700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 3.0849194526672363, - "learning_rate": 3.554205162184087e-05, - "loss": 3.6691, - "step": 132800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 3.2069287300109863, - "learning_rate": 3.552197174678907e-05, - "loss": 3.6841, - "step": 132900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 2.993010997772217, - "learning_rate": 3.550188361912927e-05, - "loss": 3.6776, - "step": 133000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.211658477783203, - "eval_runtime": 52.2134, - "eval_samples_per_second": 195.237, - "eval_steps_per_second": 1.532, - "step": 133000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 3.175614833831787, - "learning_rate": 3.548178725461704e-05, - "loss": 3.6787, - "step": 133100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 3.2140350341796875, - "learning_rate": 3.546168266901436e-05, - "loss": 3.6835, - "step": 133200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 3.0778603553771973, - "learning_rate": 3.544156987808971e-05, - "loss": 3.662, - "step": 133300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 2.9668691158294678, - "learning_rate": 3.542144889761798e-05, - "loss": 3.67, - "step": 133400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 2.9988768100738525, - "learning_rate": 3.5401319743380477e-05, - "loss": 3.6596, - "step": 133500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 3.003577470779419, - "learning_rate": 3.538118243116494e-05, - "loss": 3.6823, - "step": 133600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 3.0805444717407227, - "learning_rate": 3.536103697676548e-05, - "loss": 3.656, - "step": 133700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 3.2663848400115967, - "learning_rate": 3.5340883395982617e-05, - "loss": 3.6776, - "step": 133800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 3.1434547901153564, - "learning_rate": 3.532072170462324e-05, - "loss": 3.6624, - "step": 133900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 3.08821964263916, - "learning_rate": 3.53005519185006e-05, - "loss": 3.6923, - "step": 134000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.2045140266418457, - "eval_runtime": 51.8625, - "eval_samples_per_second": 196.558, - "eval_steps_per_second": 1.543, - "step": 134000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 2.9615020751953125, - "learning_rate": 3.528037405343427e-05, - "loss": 3.6576, - "step": 134100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 3.0420079231262207, - "learning_rate": 3.52601881252502e-05, - "loss": 3.6607, - "step": 134200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 2.960287094116211, - "learning_rate": 3.5239994149780645e-05, - "loss": 3.6668, - "step": 134300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 3.181061267852783, - "learning_rate": 3.521979214286417e-05, - "loss": 3.6564, - "step": 134400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 2.98903489112854, - "learning_rate": 3.519958212034564e-05, - "loss": 3.6662, - "step": 134500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 3.0481224060058594, - "learning_rate": 3.5179364098076216e-05, - "loss": 3.5675, - "step": 134600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 3.1568892002105713, - "learning_rate": 3.5159138091913325e-05, - "loss": 3.6681, - "step": 134700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 3.0683400630950928, - "learning_rate": 3.5138904117720653e-05, - "loss": 3.6584, - "step": 134800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 3.077857494354248, - "learning_rate": 3.511866219136814e-05, - "loss": 3.6734, - "step": 134900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 3.012407064437866, - "learning_rate": 3.509841232873195e-05, - "loss": 3.6649, - "step": 135000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.210810661315918, - "eval_runtime": 51.4646, - "eval_samples_per_second": 198.078, - "eval_steps_per_second": 1.554, - "step": 135000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 3.164175033569336, - "learning_rate": 3.507815454569451e-05, - "loss": 3.6716, - "step": 135100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 3.1793689727783203, - "learning_rate": 3.5057888858144416e-05, - "loss": 3.643, - "step": 135200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 3.0864334106445312, - "learning_rate": 3.5037615281976495e-05, - "loss": 3.6401, - "step": 135300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 3.1052911281585693, - "learning_rate": 3.501733383309174e-05, - "loss": 3.6583, - "step": 135400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 3.1001381874084473, - "learning_rate": 3.499704452739732e-05, - "loss": 3.6582, - "step": 135500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 3.0288331508636475, - "learning_rate": 3.4976747380806574e-05, - "loss": 3.6652, - "step": 135600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 2.998553991317749, - "learning_rate": 3.4956442409238986e-05, - "loss": 3.6602, - "step": 135700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 3.052278757095337, - "learning_rate": 3.49361296286202e-05, - "loss": 3.6572, - "step": 135800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 2.945789098739624, - "learning_rate": 3.491580905488195e-05, - "loss": 3.643, - "step": 135900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 2.888101577758789, - "learning_rate": 3.48954807039621e-05, - "loss": 3.6616, - "step": 136000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.203134298324585, - "eval_runtime": 51.3238, - "eval_samples_per_second": 198.621, - "eval_steps_per_second": 1.559, - "step": 136000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 3.194450855255127, - "learning_rate": 3.487514459180461e-05, - "loss": 3.6524, - "step": 136100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 2.904616355895996, - "learning_rate": 3.485480073435953e-05, - "loss": 3.6361, - "step": 136200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 3.003732919692993, - "learning_rate": 3.483444914758298e-05, - "loss": 3.6364, - "step": 136300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 3.1838271617889404, - "learning_rate": 3.481408984743716e-05, - "loss": 3.6386, - "step": 136400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 3.0291504859924316, - "learning_rate": 3.479372284989028e-05, - "loss": 3.6795, - "step": 136500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 3.087804079055786, - "learning_rate": 3.477334817091664e-05, - "loss": 3.6999, - "step": 136600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 3.039384365081787, - "learning_rate": 3.475296582649652e-05, - "loss": 3.7043, - "step": 136700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 3.3074495792388916, - "learning_rate": 3.4732575832616235e-05, - "loss": 3.6944, - "step": 136800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 2.9702234268188477, - "learning_rate": 3.471217820526808e-05, - "loss": 3.7179, - "step": 136900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 3.0739312171936035, - "learning_rate": 3.469177296045039e-05, - "loss": 3.706, - "step": 137000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.2038073539733887, - "eval_runtime": 51.4208, - "eval_samples_per_second": 198.247, - "eval_steps_per_second": 1.556, - "step": 137000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 3.1482625007629395, - "learning_rate": 3.4671360114167395e-05, - "loss": 3.6934, - "step": 137100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 3.0761804580688477, - "learning_rate": 3.465093968242935e-05, - "loss": 3.7073, - "step": 137200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 3.0399630069732666, - "learning_rate": 3.463051168125243e-05, - "loss": 3.6919, - "step": 137300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 2.898616075515747, - "learning_rate": 3.4610076126658765e-05, - "loss": 3.4094, - "step": 137400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 3.054553985595703, - "learning_rate": 3.458963303467638e-05, - "loss": 3.709, - "step": 137500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 3.0647337436676025, - "learning_rate": 3.456918242133924e-05, - "loss": 3.6935, - "step": 137600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 2.9094271659851074, - "learning_rate": 3.45487243026872e-05, - "loss": 3.709, - "step": 137700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 3.0624611377716064, - "learning_rate": 3.4528258694766e-05, - "loss": 3.7097, - "step": 137800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 3.080920696258545, - "learning_rate": 3.4507785613627246e-05, - "loss": 3.7166, - "step": 137900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 3.08603835105896, - "learning_rate": 3.4487305075328434e-05, - "loss": 3.6971, - "step": 138000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.199204683303833, - "eval_runtime": 51.4121, - "eval_samples_per_second": 198.28, - "eval_steps_per_second": 1.556, - "step": 138000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 2.937602996826172, - "learning_rate": 3.446681709593288e-05, - "loss": 3.6892, - "step": 138100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 3.0739290714263916, - "learning_rate": 3.444632169150974e-05, - "loss": 3.6923, - "step": 138200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 3.076711654663086, - "learning_rate": 3.4425818878134006e-05, - "loss": 3.6838, - "step": 138300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 3.1818246841430664, - "learning_rate": 3.4405308671886465e-05, - "loss": 3.7162, - "step": 138400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 3.0970702171325684, - "learning_rate": 3.438479108885372e-05, - "loss": 3.6906, - "step": 138500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 2.924048662185669, - "learning_rate": 3.436426614512815e-05, - "loss": 3.688, - "step": 138600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 3.151340961456299, - "learning_rate": 3.434373385680791e-05, - "loss": 3.6952, - "step": 138700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 3.1132941246032715, - "learning_rate": 3.4323194239996906e-05, - "loss": 3.6774, - "step": 138800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 3.077354907989502, - "learning_rate": 3.43026473108048e-05, - "loss": 3.7007, - "step": 138900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 3.079162120819092, - "learning_rate": 3.4282093085347e-05, - "loss": 3.6982, - "step": 139000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.199798107147217, - "eval_runtime": 51.6229, - "eval_samples_per_second": 197.47, - "eval_steps_per_second": 1.55, - "step": 139000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 3.0697269439697266, - "learning_rate": 3.426153157974462e-05, - "loss": 3.6903, - "step": 139100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 2.9596915245056152, - "learning_rate": 3.4240962810124485e-05, - "loss": 3.6961, - "step": 139200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 3.1975579261779785, - "learning_rate": 3.4220386792619134e-05, - "loss": 3.6893, - "step": 139300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 3.0534658432006836, - "learning_rate": 3.419980354336677e-05, - "loss": 3.6867, - "step": 139400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 3.014533519744873, - "learning_rate": 3.4179213078511276e-05, - "loss": 3.6807, - "step": 139500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 3.1853954792022705, - "learning_rate": 3.415861541420219e-05, - "loss": 3.6836, - "step": 139600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 2.994035482406616, - "learning_rate": 3.413801056659471e-05, - "loss": 3.6843, - "step": 139700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 3.01277232170105, - "learning_rate": 3.411739855184966e-05, - "loss": 3.6875, - "step": 139800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 2.956777334213257, - "learning_rate": 3.409677938613348e-05, - "loss": 3.6708, - "step": 139900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 3.136683225631714, - "learning_rate": 3.407615308561822e-05, - "loss": 3.6853, - "step": 140000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.1988320350646973, - "eval_runtime": 51.6562, - "eval_samples_per_second": 197.343, - "eval_steps_per_second": 1.549, - "step": 140000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 3.045262575149536, - "learning_rate": 3.405551966648155e-05, - "loss": 3.6856, - "step": 140100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 3.14791202545166, - "learning_rate": 3.4034879144906674e-05, - "loss": 3.6802, - "step": 140200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 2.9139297008514404, - "learning_rate": 3.401423153708242e-05, - "loss": 3.6717, - "step": 140300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 3.0042569637298584, - "learning_rate": 3.399357685920314e-05, - "loss": 3.6752, - "step": 140400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 3.045513153076172, - "learning_rate": 3.397291512746873e-05, - "loss": 3.6921, - "step": 140500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 3.0931684970855713, - "learning_rate": 3.3952246358084645e-05, - "loss": 3.6733, - "step": 140600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 3.1561226844787598, - "learning_rate": 3.393157056726184e-05, - "loss": 3.6702, - "step": 140700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 3.267413854598999, - "learning_rate": 3.391088777121678e-05, - "loss": 3.6848, - "step": 140800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 2.9542646408081055, - "learning_rate": 3.3890197986171426e-05, - "loss": 3.668, - "step": 140900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 3.1285250186920166, - "learning_rate": 3.386950122835321e-05, - "loss": 3.6633, - "step": 141000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.194960355758667, - "eval_runtime": 51.7975, - "eval_samples_per_second": 196.805, - "eval_steps_per_second": 1.544, - "step": 141000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 3.117668628692627, - "learning_rate": 3.3848797513995054e-05, - "loss": 3.6846, - "step": 141100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 3.2943692207336426, - "learning_rate": 3.3828086859335326e-05, - "loss": 3.6798, - "step": 141200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 3.078214645385742, - "learning_rate": 3.3807369280617834e-05, - "loss": 3.6393, - "step": 141300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 2.948484182357788, - "learning_rate": 3.3786644794091816e-05, - "loss": 3.6748, - "step": 141400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 3.035909652709961, - "learning_rate": 3.3765913416011935e-05, - "loss": 3.6745, - "step": 141500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 2.9795923233032227, - "learning_rate": 3.374517516263824e-05, - "loss": 3.6788, - "step": 141600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 3.133145809173584, - "learning_rate": 3.372443005023622e-05, - "loss": 3.6672, - "step": 141700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 3.067864179611206, - "learning_rate": 3.370367809507668e-05, - "loss": 3.6433, - "step": 141800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 2.9287595748901367, - "learning_rate": 3.3682919313435836e-05, - "loss": 3.6574, - "step": 141900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 2.9526288509368896, - "learning_rate": 3.3662153721595244e-05, - "loss": 3.658, - "step": 142000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.188488483428955, - "eval_runtime": 51.6355, - "eval_samples_per_second": 197.422, - "eval_steps_per_second": 1.549, - "step": 142000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 3.129251718521118, - "learning_rate": 3.36413813358418e-05, - "loss": 3.6714, - "step": 142100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 3.0163700580596924, - "learning_rate": 3.362060217246775e-05, - "loss": 3.662, - "step": 142200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 3.160065174102783, - "learning_rate": 3.359981624777061e-05, - "loss": 3.6398, - "step": 142300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 3.0241780281066895, - "learning_rate": 3.3579023578053245e-05, - "loss": 3.6516, - "step": 142400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 2.963665008544922, - "learning_rate": 3.355822417962378e-05, - "loss": 3.6691, - "step": 142500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 3.1197776794433594, - "learning_rate": 3.3537418068795634e-05, - "loss": 3.6647, - "step": 142600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 3.207745313644409, - "learning_rate": 3.3516605261887494e-05, - "loss": 3.6587, - "step": 142700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 3.1258158683776855, - "learning_rate": 3.3495785775223274e-05, - "loss": 3.6582, - "step": 142800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 3.1733665466308594, - "learning_rate": 3.347495962513215e-05, - "loss": 3.6611, - "step": 142900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 3.0605976581573486, - "learning_rate": 3.345412682794853e-05, - "loss": 3.6533, - "step": 143000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.192350149154663, - "eval_runtime": 51.7936, - "eval_samples_per_second": 196.82, - "eval_steps_per_second": 1.545, - "step": 143000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 3.114699363708496, - "learning_rate": 3.3433287400012e-05, - "loss": 3.637, - "step": 143100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 3.0836739540100098, - "learning_rate": 3.34124413576674e-05, - "loss": 3.6546, - "step": 143200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 3.009408950805664, - "learning_rate": 3.33915887172647e-05, - "loss": 3.6605, - "step": 143300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 3.173821210861206, - "learning_rate": 3.337072949515909e-05, - "loss": 3.6607, - "step": 143400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 3.0830142498016357, - "learning_rate": 3.334986370771089e-05, - "loss": 3.6414, - "step": 143500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 3.06382155418396, - "learning_rate": 3.3328991371285604e-05, - "loss": 3.6384, - "step": 143600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 3.039879083633423, - "learning_rate": 3.3308112502253844e-05, - "loss": 3.6414, - "step": 143700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 3.145960807800293, - "learning_rate": 3.3287227116991346e-05, - "loss": 3.6554, - "step": 143800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 2.9724478721618652, - "learning_rate": 3.326633523187897e-05, - "loss": 3.6537, - "step": 143900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 3.0227105617523193, - "learning_rate": 3.324543686330268e-05, - "loss": 3.6496, - "step": 144000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.199296236038208, - "eval_runtime": 51.7247, - "eval_samples_per_second": 197.082, - "eval_steps_per_second": 1.547, - "step": 144000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 3.1452724933624268, - "learning_rate": 3.3224532027653506e-05, - "loss": 3.6534, - "step": 144100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 3.0349013805389404, - "learning_rate": 3.3203620741327555e-05, - "loss": 3.6355, - "step": 144200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 2.9786078929901123, - "learning_rate": 3.3182703020726e-05, - "loss": 3.6582, - "step": 144300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 3.17039155960083, - "learning_rate": 3.316177888225506e-05, - "loss": 3.6421, - "step": 144400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 2.8847384452819824, - "learning_rate": 3.3140848342325985e-05, - "loss": 3.6547, - "step": 144500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 2.9963126182556152, - "learning_rate": 3.3119911417355045e-05, - "loss": 3.6473, - "step": 144600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 3.042747974395752, - "learning_rate": 3.309896812376353e-05, - "loss": 3.6501, - "step": 144700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 3.0630815029144287, - "learning_rate": 3.307801847797769e-05, - "loss": 3.6571, - "step": 144800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 3.174445390701294, - "learning_rate": 3.30570624964288e-05, - "loss": 3.6574, - "step": 144900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 3.1276638507843018, - "learning_rate": 3.3036100195553074e-05, - "loss": 3.6543, - "step": 145000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.1886751651763916, - "eval_runtime": 51.7134, - "eval_samples_per_second": 197.125, - "eval_steps_per_second": 1.547, - "step": 145000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 2.9898717403411865, - "learning_rate": 3.3015131591791705e-05, - "loss": 3.6664, - "step": 145100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 3.03507137298584, - "learning_rate": 3.2994156701590813e-05, - "loss": 3.6707, - "step": 145200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 2.995556354522705, - "learning_rate": 3.297317554140146e-05, - "loss": 3.6656, - "step": 145300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 3.1159939765930176, - "learning_rate": 3.295218812767961e-05, - "loss": 3.6558, - "step": 145400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 2.9724996089935303, - "learning_rate": 3.293119447688615e-05, - "loss": 3.6455, - "step": 145500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 3.123499631881714, - "learning_rate": 3.291019460548684e-05, - "loss": 3.6437, - "step": 145600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 3.0609242916107178, - "learning_rate": 3.2889188529952334e-05, - "loss": 3.6567, - "step": 145700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 3.1065030097961426, - "learning_rate": 3.286817626675815e-05, - "loss": 3.6503, - "step": 145800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 3.0567867755889893, - "learning_rate": 3.284715783238466e-05, - "loss": 3.6493, - "step": 145900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 2.944715738296509, - "learning_rate": 3.282613324331707e-05, - "loss": 3.663, - "step": 146000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.181875467300415, - "eval_runtime": 51.9227, - "eval_samples_per_second": 196.33, - "eval_steps_per_second": 1.541, - "step": 146000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 3.1367740631103516, - "learning_rate": 3.280510251604541e-05, - "loss": 3.6419, - "step": 146100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 3.077601671218872, - "learning_rate": 3.2784065667064536e-05, - "loss": 3.661, - "step": 146200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 3.0808331966400146, - "learning_rate": 3.2763022712874094e-05, - "loss": 3.6409, - "step": 146300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 2.791093111038208, - "learning_rate": 3.274197366997852e-05, - "loss": 3.6515, - "step": 146400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 3.005890369415283, - "learning_rate": 3.272091855488705e-05, - "loss": 3.6402, - "step": 146500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 3.0411083698272705, - "learning_rate": 3.2699857384113644e-05, - "loss": 3.6484, - "step": 146600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 2.9706947803497314, - "learning_rate": 3.267879017417705e-05, - "loss": 3.6431, - "step": 146700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 2.8929619789123535, - "learning_rate": 3.2657716941600694e-05, - "loss": 3.6325, - "step": 146800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 3.0911691188812256, - "learning_rate": 3.2636637702912805e-05, - "loss": 3.6321, - "step": 146900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 3.0026369094848633, - "learning_rate": 3.261555247464626e-05, - "loss": 3.6279, - "step": 147000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.1829118728637695, - "eval_runtime": 52.5212, - "eval_samples_per_second": 194.093, - "eval_steps_per_second": 1.523, - "step": 147000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 3.0439035892486572, - "learning_rate": 3.259446127333865e-05, - "loss": 3.6467, - "step": 147100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 2.952643871307373, - "learning_rate": 3.2573364115532276e-05, - "loss": 3.6524, - "step": 147200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 3.039597988128662, - "learning_rate": 3.2552261017774075e-05, - "loss": 3.6339, - "step": 147300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 3.1887271404266357, - "learning_rate": 3.253115199661567e-05, - "loss": 3.6367, - "step": 147400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 3.123321056365967, - "learning_rate": 3.2510037068613314e-05, - "loss": 3.6283, - "step": 147500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 3.1954147815704346, - "learning_rate": 3.248891625032789e-05, - "loss": 3.6295, - "step": 147600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 3.2092411518096924, - "learning_rate": 3.246778955832493e-05, - "loss": 3.6417, - "step": 147700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 3.2568812370300293, - "learning_rate": 3.2446657009174523e-05, - "loss": 3.6327, - "step": 147800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 3.068138837814331, - "learning_rate": 3.242551861945141e-05, - "loss": 3.6543, - "step": 147900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 3.317512273788452, - "learning_rate": 3.240437440573485e-05, - "loss": 3.6408, - "step": 148000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.1870715618133545, - "eval_runtime": 52.1184, - "eval_samples_per_second": 195.593, - "eval_steps_per_second": 1.535, - "step": 148000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 3.3598294258117676, - "learning_rate": 3.238322438460874e-05, - "loss": 3.6164, - "step": 148100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 3.139274835586548, - "learning_rate": 3.2362068572661465e-05, - "loss": 3.6436, - "step": 148200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 3.0762457847595215, - "learning_rate": 3.234090698648599e-05, - "loss": 3.6247, - "step": 148300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 3.061337947845459, - "learning_rate": 3.2319739642679806e-05, - "loss": 3.623, - "step": 148400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 2.983355760574341, - "learning_rate": 3.229856655784491e-05, - "loss": 3.6257, - "step": 148500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 3.085252523422241, - "learning_rate": 3.227738774858782e-05, - "loss": 3.6421, - "step": 148600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 3.194308042526245, - "learning_rate": 3.225620323151951e-05, - "loss": 3.6212, - "step": 148700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 2.822134494781494, - "learning_rate": 3.223501302325546e-05, - "loss": 3.6332, - "step": 148800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 3.303119421005249, - "learning_rate": 3.2213817140415606e-05, - "loss": 3.6295, - "step": 148900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 3.2773683071136475, - "learning_rate": 3.219261559962433e-05, - "loss": 3.637, - "step": 149000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.179696798324585, - "eval_runtime": 52.2402, - "eval_samples_per_second": 195.137, - "eval_steps_per_second": 1.531, - "step": 149000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 3.0133464336395264, - "learning_rate": 3.217140841751045e-05, - "loss": 3.6203, - "step": 149100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 2.9925966262817383, - "learning_rate": 3.215019561070723e-05, - "loss": 3.6204, - "step": 149200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 3.0842456817626953, - "learning_rate": 3.2128977195852314e-05, - "loss": 3.6303, - "step": 149300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 3.073462724685669, - "learning_rate": 3.210775318958776e-05, - "loss": 3.6235, - "step": 149400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 3.0209946632385254, - "learning_rate": 3.208652360856002e-05, - "loss": 3.6212, - "step": 149500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 3.250084161758423, - "learning_rate": 3.2065288469419906e-05, - "loss": 3.6139, - "step": 149600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 3.0430448055267334, - "learning_rate": 3.204404778882258e-05, - "loss": 3.6206, - "step": 149700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 3.081878662109375, - "learning_rate": 3.20228015834276e-05, - "loss": 3.6167, - "step": 149800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 3.110133171081543, - "learning_rate": 3.2001549869898774e-05, - "loss": 3.627, - "step": 149900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 3.1069679260253906, - "learning_rate": 3.198029266490431e-05, - "loss": 3.6122, - "step": 150000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.1840949058532715, - "eval_runtime": 52.0857, - "eval_samples_per_second": 195.716, - "eval_steps_per_second": 1.536, - "step": 150000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 3.110675096511841, - "learning_rate": 3.195902998511666e-05, - "loss": 3.6101, - "step": 150100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 3.100144386291504, - "learning_rate": 3.193776184721263e-05, - "loss": 3.6098, - "step": 150200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 3.0613949298858643, - "learning_rate": 3.191648826787326e-05, - "loss": 3.5987, - "step": 150300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 2.962594747543335, - "learning_rate": 3.189520926378388e-05, - "loss": 3.6353, - "step": 150400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 3.2426040172576904, - "learning_rate": 3.187392485163406e-05, - "loss": 3.6268, - "step": 150500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 3.0770397186279297, - "learning_rate": 3.1852635048117634e-05, - "loss": 3.6132, - "step": 150600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 3.1057562828063965, - "learning_rate": 3.183133986993265e-05, - "loss": 3.6077, - "step": 150700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 3.1398537158966064, - "learning_rate": 3.181003933378136e-05, - "loss": 3.5958, - "step": 150800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 3.1277642250061035, - "learning_rate": 3.178873345637023e-05, - "loss": 3.6304, - "step": 150900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 3.247443675994873, - "learning_rate": 3.176742225440994e-05, - "loss": 3.6196, - "step": 151000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.1872923374176025, - "eval_runtime": 52.3067, - "eval_samples_per_second": 194.889, - "eval_steps_per_second": 1.529, - "step": 151000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 3.074709177017212, - "learning_rate": 3.17461057446153e-05, - "loss": 3.6314, - "step": 151100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 3.163147211074829, - "learning_rate": 3.1724783943705304e-05, - "loss": 3.6013, - "step": 151200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 3.062178373336792, - "learning_rate": 3.1703456868403126e-05, - "loss": 3.6219, - "step": 151300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 2.8890295028686523, - "learning_rate": 3.168212453543601e-05, - "loss": 3.6319, - "step": 151400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 2.8499069213867188, - "learning_rate": 3.166078696153539e-05, - "loss": 3.615, - "step": 151500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 3.079871892929077, - "learning_rate": 3.163944416343677e-05, - "loss": 3.5953, - "step": 151600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 3.028519868850708, - "learning_rate": 3.1618096157879776e-05, - "loss": 3.6217, - "step": 151700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 3.1988399028778076, - "learning_rate": 3.159674296160809e-05, - "loss": 3.6, - "step": 151800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 3.1502091884613037, - "learning_rate": 3.157538459136949e-05, - "loss": 3.6181, - "step": 151900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 3.1333131790161133, - "learning_rate": 3.1554021063915806e-05, - "loss": 3.6065, - "step": 152000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.1706299781799316, - "eval_runtime": 52.2255, - "eval_samples_per_second": 195.192, - "eval_steps_per_second": 1.532, - "step": 152000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 3.1799027919769287, - "learning_rate": 3.153265239600291e-05, - "loss": 3.6177, - "step": 152100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 3.09015154838562, - "learning_rate": 3.1511278604390694e-05, - "loss": 3.6111, - "step": 152200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 3.2853939533233643, - "learning_rate": 3.1489899705843094e-05, - "loss": 3.6164, - "step": 152300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 3.114593982696533, - "learning_rate": 3.146851571712804e-05, - "loss": 3.5874, - "step": 152400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 3.1264195442199707, - "learning_rate": 3.1447126655017446e-05, - "loss": 3.6051, - "step": 152500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 3.064248561859131, - "learning_rate": 3.142573253628721e-05, - "loss": 3.5926, - "step": 152600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 3.199820041656494, - "learning_rate": 3.140433337771721e-05, - "loss": 3.6214, - "step": 152700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 3.1903645992279053, - "learning_rate": 3.138292919609125e-05, - "loss": 3.602, - "step": 152800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 3.2537660598754883, - "learning_rate": 3.13615200081971e-05, - "loss": 3.618, - "step": 152900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 3.051445722579956, - "learning_rate": 3.134010583082643e-05, - "loss": 3.5982, - "step": 153000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.1735916137695312, - "eval_runtime": 52.2439, - "eval_samples_per_second": 195.123, - "eval_steps_per_second": 1.531, - "step": 153000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 2.984973192214966, - "learning_rate": 3.131868668077486e-05, - "loss": 3.5892, - "step": 153100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 2.9883575439453125, - "learning_rate": 3.129726257484187e-05, - "loss": 3.6092, - "step": 153200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 3.299248695373535, - "learning_rate": 3.127583352983086e-05, - "loss": 3.5973, - "step": 153300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 3.1858959197998047, - "learning_rate": 3.125439956254907e-05, - "loss": 3.5986, - "step": 153400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 3.2093448638916016, - "learning_rate": 3.123296068980764e-05, - "loss": 3.5987, - "step": 153500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 3.049703598022461, - "learning_rate": 3.1211516928421526e-05, - "loss": 3.5995, - "step": 153600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 3.1410481929779053, - "learning_rate": 3.119006829520953e-05, - "loss": 3.586, - "step": 153700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 2.9701108932495117, - "learning_rate": 3.1168614806994286e-05, - "loss": 3.5826, - "step": 153800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 3.016268253326416, - "learning_rate": 3.114715648060221e-05, - "loss": 3.5746, - "step": 153900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 3.112840175628662, - "learning_rate": 3.1125693332863545e-05, - "loss": 3.5908, - "step": 154000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.1794586181640625, - "eval_runtime": 52.4216, - "eval_samples_per_second": 194.462, - "eval_steps_per_second": 1.526, - "step": 154000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 3.132110595703125, - "learning_rate": 3.110422538061228e-05, - "loss": 3.57, - "step": 154100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 3.2359519004821777, - "learning_rate": 3.108275264068619e-05, - "loss": 3.6035, - "step": 154200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 2.9039528369903564, - "learning_rate": 3.1061275129926816e-05, - "loss": 3.5772, - "step": 154300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 3.10616397857666, - "learning_rate": 3.103979286517943e-05, - "loss": 3.58, - "step": 154400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 3.2507059574127197, - "learning_rate": 3.101830586329302e-05, - "loss": 3.5788, - "step": 154500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 3.145289421081543, - "learning_rate": 3.099681414112032e-05, - "loss": 3.5909, - "step": 154600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 3.2056355476379395, - "learning_rate": 3.097531771551774e-05, - "loss": 3.5776, - "step": 154700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 3.3703291416168213, - "learning_rate": 3.095381660334539e-05, - "loss": 3.5746, - "step": 154800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 3.181138277053833, - "learning_rate": 3.0932310821467036e-05, - "loss": 3.5715, - "step": 154900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 3.000821590423584, - "learning_rate": 3.091080038675015e-05, - "loss": 3.5743, - "step": 155000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.1706509590148926, - "eval_runtime": 52.3403, - "eval_samples_per_second": 194.764, - "eval_steps_per_second": 1.528, - "step": 155000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 2.9740653038024902, - "learning_rate": 3.0889285316065806e-05, - "loss": 3.5711, - "step": 155100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 3.1311419010162354, - "learning_rate": 3.0867765626288755e-05, - "loss": 3.5845, - "step": 155200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 3.0719974040985107, - "learning_rate": 3.084624133429733e-05, - "loss": 3.5731, - "step": 155300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 3.0461819171905518, - "learning_rate": 3.082471245697351e-05, - "loss": 3.5738, - "step": 155400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 2.9734132289886475, - "learning_rate": 3.080317901120285e-05, - "loss": 3.5853, - "step": 155500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 3.117506980895996, - "learning_rate": 3.078164101387449e-05, - "loss": 3.5847, - "step": 155600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 3.141174554824829, - "learning_rate": 3.076009848188114e-05, - "loss": 3.5861, - "step": 155700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 3.1444389820098877, - "learning_rate": 3.0738551432119086e-05, - "loss": 3.5716, - "step": 155800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 3.2603206634521484, - "learning_rate": 3.0716999881488135e-05, - "loss": 3.5878, - "step": 155900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 3.119466781616211, - "learning_rate": 3.069544384689162e-05, - "loss": 3.5913, - "step": 156000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.1707427501678467, - "eval_runtime": 52.4036, - "eval_samples_per_second": 194.529, - "eval_steps_per_second": 1.527, - "step": 156000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 3.3349738121032715, - "learning_rate": 3.06738833452364e-05, - "loss": 3.5642, - "step": 156100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 3.151176691055298, - "learning_rate": 3.065231839343285e-05, - "loss": 3.5908, - "step": 156200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 3.139475107192993, - "learning_rate": 3.0630749008394813e-05, - "loss": 3.5672, - "step": 156300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 3.1396217346191406, - "learning_rate": 3.0609175207039636e-05, - "loss": 3.5787, - "step": 156400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 2.9696104526519775, - "learning_rate": 3.05875970062881e-05, - "loss": 3.573, - "step": 156500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 3.078080415725708, - "learning_rate": 3.056601442306445e-05, - "loss": 3.5583, - "step": 156600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 3.1915009021759033, - "learning_rate": 3.054442747429638e-05, - "loss": 3.5809, - "step": 156700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 3.194831132888794, - "learning_rate": 3.052283617691499e-05, - "loss": 3.5695, - "step": 156800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 3.0851330757141113, - "learning_rate": 3.0501240547854793e-05, - "loss": 3.5686, - "step": 156900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 3.050123929977417, - "learning_rate": 3.047964060405371e-05, - "loss": 3.5643, - "step": 157000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.165821075439453, - "eval_runtime": 52.6341, - "eval_samples_per_second": 193.677, - "eval_steps_per_second": 1.52, - "step": 157000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 3.080662727355957, - "learning_rate": 3.0458036362453036e-05, - "loss": 3.5674, - "step": 157100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 2.9070680141448975, - "learning_rate": 3.0436427839997444e-05, - "loss": 3.5709, - "step": 157200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 3.212815046310425, - "learning_rate": 3.0414815053634966e-05, - "loss": 3.5596, - "step": 157300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 2.9851808547973633, - "learning_rate": 3.039319802031696e-05, - "loss": 3.5877, - "step": 157400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 3.1525375843048096, - "learning_rate": 3.037157675699814e-05, - "loss": 3.5742, - "step": 157500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 3.253023386001587, - "learning_rate": 3.034995128063651e-05, - "loss": 3.5823, - "step": 157600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 3.126237154006958, - "learning_rate": 3.0328321608193427e-05, - "loss": 3.5695, - "step": 157700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 2.9912712574005127, - "learning_rate": 3.030668775663347e-05, - "loss": 3.5762, - "step": 157800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 3.2612810134887695, - "learning_rate": 3.0285049742924564e-05, - "loss": 3.551, - "step": 157900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 3.226860761642456, - "learning_rate": 3.026340758403785e-05, - "loss": 3.5442, - "step": 158000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.1677935123443604, - "eval_runtime": 52.5523, - "eval_samples_per_second": 193.978, - "eval_steps_per_second": 1.522, - "step": 158000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 3.011373281478882, - "learning_rate": 3.024176129694774e-05, - "loss": 3.5603, - "step": 158100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 3.067375898361206, - "learning_rate": 3.022011089863187e-05, - "loss": 3.5734, - "step": 158200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 3.1003239154815674, - "learning_rate": 3.0198456406071134e-05, - "loss": 3.5688, - "step": 158300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 2.9454071521759033, - "learning_rate": 3.017679783624959e-05, - "loss": 3.5617, - "step": 158400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 3.2112362384796143, - "learning_rate": 3.015513520615455e-05, - "loss": 3.5651, - "step": 158500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 3.0805153846740723, - "learning_rate": 3.0133468532776454e-05, - "loss": 3.555, - "step": 158600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 3.0264370441436768, - "learning_rate": 3.011179783310894e-05, - "loss": 3.5537, - "step": 158700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 3.0615999698638916, - "learning_rate": 3.0090123124148807e-05, - "loss": 3.5466, - "step": 158800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 3.1815524101257324, - "learning_rate": 3.0068444422896004e-05, - "loss": 3.5535, - "step": 158900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 3.0562305450439453, - "learning_rate": 3.004676174635358e-05, - "loss": 3.5663, - "step": 159000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.173013925552368, - "eval_runtime": 52.4542, - "eval_samples_per_second": 194.341, - "eval_steps_per_second": 1.525, - "step": 159000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 3.14689302444458, - "learning_rate": 3.002507511152774e-05, - "loss": 3.5568, - "step": 159100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 3.087399959564209, - "learning_rate": 3.0003384535427765e-05, - "loss": 3.557, - "step": 159200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 3.04844331741333, - "learning_rate": 2.9981690035066057e-05, - "loss": 3.5409, - "step": 159300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 3.2028706073760986, - "learning_rate": 2.995999162745805e-05, - "loss": 3.5761, - "step": 159400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 2.9123711585998535, - "learning_rate": 2.99382893296223e-05, - "loss": 3.5473, - "step": 159500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 3.165459156036377, - "learning_rate": 2.9916583158580357e-05, - "loss": 3.5596, - "step": 159600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 3.1565003395080566, - "learning_rate": 2.989487313135686e-05, - "loss": 3.5577, - "step": 159700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 3.1155638694763184, - "learning_rate": 2.9873159264979433e-05, - "loss": 3.5572, - "step": 159800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 3.1283843517303467, - "learning_rate": 2.9851441576478734e-05, - "loss": 3.5478, - "step": 159900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 3.2107434272766113, - "learning_rate": 2.9829720082888406e-05, - "loss": 3.5637, - "step": 160000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.1652143001556396, - "eval_runtime": 52.479, - "eval_samples_per_second": 194.249, - "eval_steps_per_second": 1.524, - "step": 160000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.561998724937439, - "learning_rate": 2.9807994801245094e-05, - "loss": 1.7734, - "step": 160100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.5832147598266602, - "learning_rate": 2.9786265748588383e-05, - "loss": 1.7793, - "step": 160200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.5801053047180176, - "learning_rate": 2.9764532941960848e-05, - "loss": 1.7738, - "step": 160300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.5710026025772095, - "learning_rate": 2.9742796398407996e-05, - "loss": 1.7729, - "step": 160400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.5338846445083618, - "learning_rate": 2.9721056134978263e-05, - "loss": 1.7725, - "step": 160500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.5545586347579956, - "learning_rate": 2.9699312168722998e-05, - "loss": 1.7748, - "step": 160600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.5022891759872437, - "learning_rate": 2.967756451669646e-05, - "loss": 1.7757, - "step": 160700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.5773268938064575, - "learning_rate": 2.9655813195955808e-05, - "loss": 1.7746, - "step": 160800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 1.5689061880111694, - "learning_rate": 2.9634058223561058e-05, - "loss": 1.7767, - "step": 160900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.523888349533081, - "learning_rate": 2.9612299616575108e-05, - "loss": 1.7725, - "step": 161000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.154878616333008, - "eval_runtime": 52.3123, - "eval_samples_per_second": 194.868, - "eval_steps_per_second": 1.529, - "step": 161000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.5541263818740845, - "learning_rate": 2.9590537392063693e-05, - "loss": 1.775, - "step": 161100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.542082667350769, - "learning_rate": 2.9568771567095403e-05, - "loss": 1.775, - "step": 161200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.6016403436660767, - "learning_rate": 2.9547002158741637e-05, - "loss": 1.7809, - "step": 161300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.604466438293457, - "learning_rate": 2.952522918407661e-05, - "loss": 1.7691, - "step": 161400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.507333755493164, - "learning_rate": 2.950345266017732e-05, - "loss": 1.7706, - "step": 161500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.6002788543701172, - "learning_rate": 2.948167260412358e-05, - "loss": 1.7947, - "step": 161600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.5560261011123657, - "learning_rate": 2.9459889032997933e-05, - "loss": 1.7981, - "step": 161700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.665317177772522, - "learning_rate": 2.9438101963885728e-05, - "loss": 1.7923, - "step": 161800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.5762344598770142, - "learning_rate": 2.9416311413875008e-05, - "loss": 1.7968, - "step": 161900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.553109884262085, - "learning_rate": 2.9394517400056583e-05, - "loss": 1.7948, - "step": 162000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.1695141792297363, - "eval_runtime": 51.7758, - "eval_samples_per_second": 196.887, - "eval_steps_per_second": 1.545, - "step": 162000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.566721796989441, - "learning_rate": 2.937271993952395e-05, - "loss": 1.7959, - "step": 162100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.5952585935592651, - "learning_rate": 2.9350919049373343e-05, - "loss": 1.7892, - "step": 162200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.5383343696594238, - "learning_rate": 2.932911474670365e-05, - "loss": 1.7918, - "step": 162300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.5342581272125244, - "learning_rate": 2.9307307048616468e-05, - "loss": 1.7815, - "step": 162400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.533451795578003, - "learning_rate": 2.9285495972216027e-05, - "loss": 1.7834, - "step": 162500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.5433770418167114, - "learning_rate": 2.9263681534609233e-05, - "loss": 1.7886, - "step": 162600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.5695182085037231, - "learning_rate": 2.924186375290562e-05, - "loss": 1.7934, - "step": 162700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.5996042490005493, - "learning_rate": 2.922004264421733e-05, - "loss": 1.7896, - "step": 162800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.4973070621490479, - "learning_rate": 2.919821822565913e-05, - "loss": 1.7862, - "step": 162900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.6223851442337036, - "learning_rate": 2.9176390514348384e-05, - "loss": 1.7797, - "step": 163000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.1567208766937256, - "eval_runtime": 51.8158, - "eval_samples_per_second": 196.735, - "eval_steps_per_second": 1.544, - "step": 163000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 1.5375715494155884, - "learning_rate": 2.915455952740503e-05, - "loss": 1.7847, - "step": 163100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.4768236875534058, - "learning_rate": 2.9132725281951584e-05, - "loss": 1.7804, - "step": 163200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 1.583068609237671, - "learning_rate": 2.9110887795113108e-05, - "loss": 1.785, - "step": 163300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 1.5692275762557983, - "learning_rate": 2.9089047084017206e-05, - "loss": 1.7824, - "step": 163400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.5128995180130005, - "learning_rate": 2.9067203165794028e-05, - "loss": 1.7888, - "step": 163500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.5675851106643677, - "learning_rate": 2.904535605757622e-05, - "loss": 1.7761, - "step": 163600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.5943450927734375, - "learning_rate": 2.902350577649894e-05, - "loss": 1.7837, - "step": 163700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 1.4797823429107666, - "learning_rate": 2.9001652339699818e-05, - "loss": 1.7785, - "step": 163800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.5837212800979614, - "learning_rate": 2.8979795764319007e-05, - "loss": 1.7769, - "step": 163900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 1.5809427499771118, - "learning_rate": 2.8957936067499054e-05, - "loss": 1.7876, - "step": 164000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.157485246658325, - "eval_runtime": 51.8186, - "eval_samples_per_second": 196.725, - "eval_steps_per_second": 1.544, - "step": 164000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 1.4939141273498535, - "learning_rate": 2.8936073266385e-05, - "loss": 1.7851, - "step": 164100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 1.6713926792144775, - "learning_rate": 2.8914207378124304e-05, - "loss": 1.7852, - "step": 164200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 1.526655673980713, - "learning_rate": 2.889233841986686e-05, - "loss": 1.7744, - "step": 164300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.674926996231079, - "learning_rate": 2.8870466408764952e-05, - "loss": 1.7761, - "step": 164400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 1.5404378175735474, - "learning_rate": 2.8848591361973278e-05, - "loss": 1.7889, - "step": 164500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 1.5554423332214355, - "learning_rate": 2.88267132966489e-05, - "loss": 1.7876, - "step": 164600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 1.595197319984436, - "learning_rate": 2.880483222995125e-05, - "loss": 1.7806, - "step": 164700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 1.611559510231018, - "learning_rate": 2.8782948179042114e-05, - "loss": 1.7856, - "step": 164800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.622501015663147, - "learning_rate": 2.876106116108564e-05, - "loss": 1.7838, - "step": 164900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 1.5229750871658325, - "learning_rate": 2.873917119324826e-05, - "loss": 1.7851, - "step": 165000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.1587064266204834, - "eval_runtime": 51.9184, - "eval_samples_per_second": 196.347, - "eval_steps_per_second": 1.541, - "step": 165000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 1.5421415567398071, - "learning_rate": 2.8717278292698767e-05, - "loss": 1.7853, - "step": 165100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 1.598402976989746, - "learning_rate": 2.8695382476608228e-05, - "loss": 1.7886, - "step": 165200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.5251154899597168, - "learning_rate": 2.867348376215e-05, - "loss": 1.7885, - "step": 165300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.554371953010559, - "learning_rate": 2.86515821664997e-05, - "loss": 1.7831, - "step": 165400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 1.606136679649353, - "learning_rate": 2.8629677706835234e-05, - "loss": 1.7672, - "step": 165500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.5520561933517456, - "learning_rate": 2.8607770400336738e-05, - "loss": 1.7775, - "step": 165600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 1.5017564296722412, - "learning_rate": 2.8585860264186582e-05, - "loss": 1.7837, - "step": 165700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.5462771654129028, - "learning_rate": 2.8563947315569346e-05, - "loss": 1.7757, - "step": 165800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.519423484802246, - "learning_rate": 2.8542031571671833e-05, - "loss": 1.7737, - "step": 165900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.552426815032959, - "learning_rate": 2.852011304968304e-05, - "loss": 1.7845, - "step": 166000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.1654627323150635, - "eval_runtime": 51.7515, - "eval_samples_per_second": 196.98, - "eval_steps_per_second": 1.546, - "step": 166000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.6090401411056519, - "learning_rate": 2.849819176679412e-05, - "loss": 1.7792, - "step": 166100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.4991530179977417, - "learning_rate": 2.8476267740198403e-05, - "loss": 1.7757, - "step": 166200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.545792579650879, - "learning_rate": 2.8454340987091382e-05, - "loss": 1.7782, - "step": 166300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.5758668184280396, - "learning_rate": 2.8432411524670675e-05, - "loss": 1.7627, - "step": 166400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.5638821125030518, - "learning_rate": 2.8410479370136035e-05, - "loss": 1.7816, - "step": 166500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.6477131843566895, - "learning_rate": 2.8388544540689314e-05, - "loss": 1.7814, - "step": 166600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.5519869327545166, - "learning_rate": 2.836660705353447e-05, - "loss": 1.7747, - "step": 166700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.5598399639129639, - "learning_rate": 2.8344666925877556e-05, - "loss": 1.7778, - "step": 166800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.5361994504928589, - "learning_rate": 2.8322724174926664e-05, - "loss": 1.7796, - "step": 166900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.5680959224700928, - "learning_rate": 2.8300778817891976e-05, - "loss": 1.7742, - "step": 167000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.1607890129089355, - "eval_runtime": 52.1406, - "eval_samples_per_second": 195.51, - "eval_steps_per_second": 1.534, - "step": 167000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.5712428092956543, - "learning_rate": 2.8278830871985708e-05, - "loss": 1.7747, - "step": 167100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.5386340618133545, - "learning_rate": 2.8256880354422098e-05, - "loss": 1.7738, - "step": 167200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.5471428632736206, - "learning_rate": 2.8234927282417417e-05, - "loss": 1.779, - "step": 167300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.5163718461990356, - "learning_rate": 2.821297167318992e-05, - "loss": 1.7741, - "step": 167400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.5554001331329346, - "learning_rate": 2.819101354395986e-05, - "loss": 1.7825, - "step": 167500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.4839155673980713, - "learning_rate": 2.8169052911949484e-05, - "loss": 1.7729, - "step": 167600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.5696512460708618, - "learning_rate": 2.8147089794382965e-05, - "loss": 1.7754, - "step": 167700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.6730250120162964, - "learning_rate": 2.8125124208486465e-05, - "loss": 1.7736, - "step": 167800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.6201075315475464, - "learning_rate": 2.810315617148806e-05, - "loss": 1.7771, - "step": 167900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.6662862300872803, - "learning_rate": 2.8081185700617746e-05, - "loss": 1.7761, - "step": 168000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.161256790161133, - "eval_runtime": 52.0964, - "eval_samples_per_second": 195.676, - "eval_steps_per_second": 1.536, - "step": 168000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.4697953462600708, - "learning_rate": 2.8059212813107438e-05, - "loss": 1.7894, - "step": 168100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.6400997638702393, - "learning_rate": 2.803723752619094e-05, - "loss": 1.7779, - "step": 168200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.5220052003860474, - "learning_rate": 2.8015259857103942e-05, - "loss": 1.7732, - "step": 168300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.542869210243225, - "learning_rate": 2.7993279823084007e-05, - "loss": 1.7771, - "step": 168400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 1.4953099489212036, - "learning_rate": 2.7971297441370542e-05, - "loss": 1.7774, - "step": 168500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 1.5665849447250366, - "learning_rate": 2.7949312729204803e-05, - "loss": 1.7633, - "step": 168600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.5860687494277954, - "learning_rate": 2.792732570382986e-05, - "loss": 1.7798, - "step": 168700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.602845311164856, - "learning_rate": 2.790533638249062e-05, - "loss": 1.7694, - "step": 168800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 1.5015400648117065, - "learning_rate": 2.7883344782433774e-05, - "loss": 1.7628, - "step": 168900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.5296344757080078, - "learning_rate": 2.7861350920907807e-05, - "loss": 1.7753, - "step": 169000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.1639742851257324, - "eval_runtime": 52.1527, - "eval_samples_per_second": 195.465, - "eval_steps_per_second": 1.534, - "step": 169000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.591369390487671, - "learning_rate": 2.783935481516297e-05, - "loss": 1.7695, - "step": 169100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.5569419860839844, - "learning_rate": 2.7817356482451297e-05, - "loss": 1.7689, - "step": 169200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.6080352067947388, - "learning_rate": 2.779535594002654e-05, - "loss": 1.767, - "step": 169300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.47182035446167, - "learning_rate": 2.77733532051442e-05, - "loss": 1.7717, - "step": 169400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.6706403493881226, - "learning_rate": 2.775134829506148e-05, - "loss": 1.7787, - "step": 169500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.6530786752700806, - "learning_rate": 2.7729341227037313e-05, - "loss": 1.7726, - "step": 169600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.4457296133041382, - "learning_rate": 2.7707332018332323e-05, - "loss": 1.7697, - "step": 169700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.5824190378189087, - "learning_rate": 2.7685320686208793e-05, - "loss": 1.7734, - "step": 169800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.6177047491073608, - "learning_rate": 2.7663307247930686e-05, - "loss": 1.7782, - "step": 169900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.505018949508667, - "learning_rate": 2.7641291720763612e-05, - "loss": 1.7659, - "step": 170000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.1508195400238037, - "eval_runtime": 52.1147, - "eval_samples_per_second": 195.607, - "eval_steps_per_second": 1.535, - "step": 170000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.6319383382797241, - "learning_rate": 2.7619274121974825e-05, - "loss": 1.7709, - "step": 170100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 1.6314260959625244, - "learning_rate": 2.759725446883319e-05, - "loss": 1.7675, - "step": 170200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.471872329711914, - "learning_rate": 2.7575232778609206e-05, - "loss": 1.771, - "step": 170300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.5450881719589233, - "learning_rate": 2.755320906857494e-05, - "loss": 1.836, - "step": 170400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.5527344942092896, - "learning_rate": 2.753118335600408e-05, - "loss": 1.8808, - "step": 170500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 1.8364976644515991, - "learning_rate": 2.7509155658171852e-05, - "loss": 1.8776, - "step": 170600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 1.4847674369812012, - "learning_rate": 2.7487125992355058e-05, - "loss": 1.8724, - "step": 170700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.5595808029174805, - "learning_rate": 2.7465094375832028e-05, - "loss": 1.8799, - "step": 170800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.54868483543396, - "learning_rate": 2.744306082588264e-05, - "loss": 1.8704, - "step": 170900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 1.8504784107208252, - "learning_rate": 2.742102535978827e-05, - "loss": 1.8736, - "step": 171000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.1541635990142822, - "eval_runtime": 52.1803, - "eval_samples_per_second": 195.361, - "eval_steps_per_second": 1.533, - "step": 171000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 1.6168150901794434, - "learning_rate": 2.7398987994831822e-05, - "loss": 1.8737, - "step": 171100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 1.6291587352752686, - "learning_rate": 2.737694874829766e-05, - "loss": 1.8691, - "step": 171200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.5887749195098877, - "learning_rate": 2.735490763747164e-05, - "loss": 1.8725, - "step": 171300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 1.6395853757858276, - "learning_rate": 2.733286467964108e-05, - "loss": 1.8857, - "step": 171400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 1.5826025009155273, - "learning_rate": 2.7310819892094742e-05, - "loss": 1.8546, - "step": 171500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 2.160349130630493, - "learning_rate": 2.7288773292122827e-05, - "loss": 1.8623, - "step": 171600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 1.6130859851837158, - "learning_rate": 2.726672489701696e-05, - "loss": 1.8629, - "step": 171700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 1.619787335395813, - "learning_rate": 2.7244674724070163e-05, - "loss": 1.8646, - "step": 171800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 2.099820375442505, - "learning_rate": 2.722262279057687e-05, - "loss": 1.8679, - "step": 171900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 1.7083640098571777, - "learning_rate": 2.720056911383287e-05, - "loss": 1.8554, - "step": 172000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.1523571014404297, - "eval_runtime": 52.1491, - "eval_samples_per_second": 195.478, - "eval_steps_per_second": 1.534, - "step": 172000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 1.5392628908157349, - "learning_rate": 2.717851371113534e-05, - "loss": 1.8658, - "step": 172100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 2.007720708847046, - "learning_rate": 2.715645659978281e-05, - "loss": 1.861, - "step": 172200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 1.566613793373108, - "learning_rate": 2.7134397797075145e-05, - "loss": 1.8669, - "step": 172300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 1.588408350944519, - "learning_rate": 2.7112337320313524e-05, - "loss": 1.8568, - "step": 172400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 1.6406699419021606, - "learning_rate": 2.7090275186800474e-05, - "loss": 1.8713, - "step": 172500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 1.5397433042526245, - "learning_rate": 2.7068211413839782e-05, - "loss": 1.8629, - "step": 172600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 1.5865190029144287, - "learning_rate": 2.704614601873654e-05, - "loss": 1.8579, - "step": 172700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 1.7077267169952393, - "learning_rate": 2.702407901879712e-05, - "loss": 1.8616, - "step": 172800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 1.727586269378662, - "learning_rate": 2.7002010431329134e-05, - "loss": 1.8574, - "step": 172900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 1.5238264799118042, - "learning_rate": 2.6979940273641453e-05, - "loss": 1.8595, - "step": 173000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.141134738922119, - "eval_runtime": 52.1591, - "eval_samples_per_second": 195.441, - "eval_steps_per_second": 1.534, - "step": 173000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 1.5688259601593018, - "learning_rate": 2.6957868563044176e-05, - "loss": 1.8674, - "step": 173100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 1.5195534229278564, - "learning_rate": 2.6935795316848612e-05, - "loss": 1.8653, - "step": 173200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 1.6201164722442627, - "learning_rate": 2.691372055236728e-05, - "loss": 1.8579, - "step": 173300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 1.8065686225891113, - "learning_rate": 2.6891644286913897e-05, - "loss": 1.8755, - "step": 173400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 1.5661702156066895, - "learning_rate": 2.6869566537803347e-05, - "loss": 1.8552, - "step": 173500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 1.6565943956375122, - "learning_rate": 2.6847487322351694e-05, - "loss": 1.8664, - "step": 173600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 1.49613356590271, - "learning_rate": 2.6825406657876123e-05, - "loss": 1.8524, - "step": 173700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 1.5829864740371704, - "learning_rate": 2.6803324561694988e-05, - "loss": 1.8732, - "step": 173800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 1.6095563173294067, - "learning_rate": 2.6781241051127738e-05, - "loss": 1.8503, - "step": 173900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 1.5767251253128052, - "learning_rate": 2.675915614349495e-05, - "loss": 1.856, - "step": 174000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.1416378021240234, - "eval_runtime": 52.1112, - "eval_samples_per_second": 195.62, - "eval_steps_per_second": 1.535, - "step": 174000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.5513286590576172, - "learning_rate": 2.6737069856118284e-05, - "loss": 1.7542, - "step": 174100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.5664585828781128, - "learning_rate": 2.67149822063205e-05, - "loss": 1.7515, - "step": 174200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.5423948764801025, - "learning_rate": 2.66928932114254e-05, - "loss": 1.7557, - "step": 174300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.5535671710968018, - "learning_rate": 2.667080288875788e-05, - "loss": 1.7569, - "step": 174400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.5592520236968994, - "learning_rate": 2.6648711255643828e-05, - "loss": 1.7506, - "step": 174500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.5440510511398315, - "learning_rate": 2.6626618329410198e-05, - "loss": 1.7618, - "step": 174600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.54314124584198, - "learning_rate": 2.6604524127384937e-05, - "loss": 1.7491, - "step": 174700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.592208743095398, - "learning_rate": 2.658242866689702e-05, - "loss": 1.7458, - "step": 174800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 1.5204849243164062, - "learning_rate": 2.6560331965276363e-05, - "loss": 1.7523, - "step": 174900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.5259612798690796, - "learning_rate": 2.653823403985391e-05, - "loss": 1.7535, - "step": 175000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.1326749324798584, - "eval_runtime": 52.049, - "eval_samples_per_second": 195.854, - "eval_steps_per_second": 1.537, - "step": 175000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.52047598361969, - "learning_rate": 2.651613490796152e-05, - "loss": 1.7447, - "step": 175100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.5134586095809937, - "learning_rate": 2.6494034586932027e-05, - "loss": 1.7452, - "step": 175200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.572095513343811, - "learning_rate": 2.6471933094099177e-05, - "loss": 1.7571, - "step": 175300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.5933750867843628, - "learning_rate": 2.6449830446797653e-05, - "loss": 1.745, - "step": 175400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.6601353883743286, - "learning_rate": 2.6427726662363023e-05, - "loss": 1.7462, - "step": 175500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.5466818809509277, - "learning_rate": 2.640562175813177e-05, - "loss": 1.7573, - "step": 175600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.5273200273513794, - "learning_rate": 2.6383515751441234e-05, - "loss": 1.7578, - "step": 175700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.609778881072998, - "learning_rate": 2.636140865962965e-05, - "loss": 1.7513, - "step": 175800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.6019160747528076, - "learning_rate": 2.633930050003606e-05, - "loss": 1.7557, - "step": 175900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.5547572374343872, - "learning_rate": 2.6317191290000383e-05, - "loss": 1.7645, - "step": 176000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.141494035720825, - "eval_runtime": 51.4645, - "eval_samples_per_second": 198.078, - "eval_steps_per_second": 1.554, - "step": 176000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.6100679636001587, - "learning_rate": 2.629508104686334e-05, - "loss": 1.7566, - "step": 176100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.5966265201568604, - "learning_rate": 2.6272969787966466e-05, - "loss": 1.7511, - "step": 176200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.5519967079162598, - "learning_rate": 2.6250857530652113e-05, - "loss": 1.7534, - "step": 176300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.5537617206573486, - "learning_rate": 2.6228744292263367e-05, - "loss": 1.7448, - "step": 176400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.5397429466247559, - "learning_rate": 2.6206630090144153e-05, - "loss": 1.7456, - "step": 176500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.5131994485855103, - "learning_rate": 2.618451494163908e-05, - "loss": 1.7472, - "step": 176600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.553226113319397, - "learning_rate": 2.6162398864093553e-05, - "loss": 1.7588, - "step": 176700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.5782634019851685, - "learning_rate": 2.6140281874853666e-05, - "loss": 1.7498, - "step": 176800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.5181629657745361, - "learning_rate": 2.6118163991266275e-05, - "loss": 1.7525, - "step": 176900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.622118353843689, - "learning_rate": 2.6096045230678888e-05, - "loss": 1.7472, - "step": 177000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.1567530632019043, - "eval_runtime": 51.4987, - "eval_samples_per_second": 197.947, - "eval_steps_per_second": 1.553, - "step": 177000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 1.5844262838363647, - "learning_rate": 2.6073925610439738e-05, - "loss": 1.7489, - "step": 177100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.4944721460342407, - "learning_rate": 2.6051805147897713e-05, - "loss": 1.7535, - "step": 177200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 1.607365608215332, - "learning_rate": 2.602968386040236e-05, - "loss": 1.7476, - "step": 177300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 1.5790349245071411, - "learning_rate": 2.6007561765303878e-05, - "loss": 1.7465, - "step": 177400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.5833547115325928, - "learning_rate": 2.5985438879953107e-05, - "loss": 1.7581, - "step": 177500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.5244640111923218, - "learning_rate": 2.5963315221701496e-05, - "loss": 1.7489, - "step": 177600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.6332496404647827, - "learning_rate": 2.5941190807901117e-05, - "loss": 1.7593, - "step": 177700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 1.4967930316925049, - "learning_rate": 2.5919065655904606e-05, - "loss": 1.7487, - "step": 177800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.5874158143997192, - "learning_rate": 2.5896939783065198e-05, - "loss": 1.7488, - "step": 177900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 1.6334315538406372, - "learning_rate": 2.587481320673669e-05, - "loss": 1.7558, - "step": 178000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.1407663822174072, - "eval_runtime": 51.564, - "eval_samples_per_second": 197.696, - "eval_steps_per_second": 1.551, - "step": 178000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 1.5070706605911255, - "learning_rate": 2.5852685944273437e-05, - "loss": 1.7515, - "step": 178100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 1.675197958946228, - "learning_rate": 2.583055801303031e-05, - "loss": 1.7517, - "step": 178200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 1.6129719018936157, - "learning_rate": 2.5808429430362734e-05, - "loss": 1.739, - "step": 178300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.6314342021942139, - "learning_rate": 2.5786300213626623e-05, - "loss": 1.7373, - "step": 178400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 1.4758597612380981, - "learning_rate": 2.576417038017841e-05, - "loss": 1.7512, - "step": 178500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 1.6322437524795532, - "learning_rate": 2.574203994737498e-05, - "loss": 1.7529, - "step": 178600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 1.6611186265945435, - "learning_rate": 2.5719908932573716e-05, - "loss": 1.7529, - "step": 178700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 1.6254630088806152, - "learning_rate": 2.5697777353132434e-05, - "loss": 1.7548, - "step": 178800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.6417994499206543, - "learning_rate": 2.567564522640942e-05, - "loss": 1.7501, - "step": 178900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 1.5359156131744385, - "learning_rate": 2.5653512569763377e-05, - "loss": 1.7562, - "step": 179000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.144591808319092, - "eval_runtime": 51.5364, - "eval_samples_per_second": 197.802, - "eval_steps_per_second": 1.552, - "step": 179000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 1.5880595445632935, - "learning_rate": 2.5631379400553416e-05, - "loss": 1.75, - "step": 179100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 1.6134679317474365, - "learning_rate": 2.560924573613906e-05, - "loss": 1.7508, - "step": 179200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.5464352369308472, - "learning_rate": 2.5587111593880205e-05, - "loss": 1.7502, - "step": 179300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.573649525642395, - "learning_rate": 2.556497699113714e-05, - "loss": 1.7435, - "step": 179400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 1.5665711164474487, - "learning_rate": 2.554284194527051e-05, - "loss": 1.7462, - "step": 179500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.606072187423706, - "learning_rate": 2.5520706473641316e-05, - "loss": 1.7516, - "step": 179600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 1.5898959636688232, - "learning_rate": 2.549857059361086e-05, - "loss": 1.7482, - "step": 179700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.6288598775863647, - "learning_rate": 2.547643432254081e-05, - "loss": 1.7365, - "step": 179800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.5765552520751953, - "learning_rate": 2.545429767779311e-05, - "loss": 1.7346, - "step": 179900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.5909677743911743, - "learning_rate": 2.5432160676729994e-05, - "loss": 1.7493, - "step": 180000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.1469063758850098, - "eval_runtime": 52.5101, - "eval_samples_per_second": 194.134, - "eval_steps_per_second": 1.524, - "step": 180000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.6108888387680054, - "learning_rate": 2.5410023336713996e-05, - "loss": 1.749, - "step": 180100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.5427972078323364, - "learning_rate": 2.538788567510791e-05, - "loss": 1.738, - "step": 180200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.5925029516220093, - "learning_rate": 2.5365747709274767e-05, - "loss": 1.7418, - "step": 180300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.5784283876419067, - "learning_rate": 2.5343609456577867e-05, - "loss": 1.7417, - "step": 180400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.623561978340149, - "learning_rate": 2.53214709343807e-05, - "loss": 1.7443, - "step": 180500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.6505674123764038, - "learning_rate": 2.5299332160046985e-05, - "loss": 1.7454, - "step": 180600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.5555040836334229, - "learning_rate": 2.5277193150940638e-05, - "loss": 1.7416, - "step": 180700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.6162723302841187, - "learning_rate": 2.525505392442577e-05, - "loss": 1.7433, - "step": 180800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.5440572500228882, - "learning_rate": 2.523291449786663e-05, - "loss": 1.7438, - "step": 180900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.596146583557129, - "learning_rate": 2.5210774888627664e-05, - "loss": 1.7425, - "step": 181000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.140672206878662, - "eval_runtime": 51.8004, - "eval_samples_per_second": 196.794, - "eval_steps_per_second": 1.544, - "step": 181000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.6086748838424683, - "learning_rate": 2.5188635114073434e-05, - "loss": 1.7488, - "step": 181100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.564663290977478, - "learning_rate": 2.516649519156864e-05, - "loss": 1.7452, - "step": 181200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.5975944995880127, - "learning_rate": 2.51443551384781e-05, - "loss": 1.7419, - "step": 181300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.6056960821151733, - "learning_rate": 2.5122214972166724e-05, - "loss": 1.7536, - "step": 181400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.6348010301589966, - "learning_rate": 2.5100074709999526e-05, - "loss": 1.7505, - "step": 181500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.4651880264282227, - "learning_rate": 2.5077934369341594e-05, - "loss": 1.7474, - "step": 181600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.6000345945358276, - "learning_rate": 2.505579396755806e-05, - "loss": 1.7455, - "step": 181700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.6549137830734253, - "learning_rate": 2.503365352201413e-05, - "loss": 1.7404, - "step": 181800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.6172484159469604, - "learning_rate": 2.5011513050075014e-05, - "loss": 1.7457, - "step": 181900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.6283797025680542, - "learning_rate": 2.4989372569105962e-05, - "loss": 1.7411, - "step": 182000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.1432528495788574, - "eval_runtime": 51.7742, - "eval_samples_per_second": 196.894, - "eval_steps_per_second": 1.545, - "step": 182000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.5319279432296753, - "learning_rate": 2.4967232096472236e-05, - "loss": 1.76, - "step": 182100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.600860595703125, - "learning_rate": 2.4945091649539086e-05, - "loss": 1.7416, - "step": 182200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.5592856407165527, - "learning_rate": 2.4922951245671723e-05, - "loss": 1.7421, - "step": 182300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.5361909866333008, - "learning_rate": 2.4900810902235356e-05, - "loss": 1.7436, - "step": 182400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 1.526672124862671, - "learning_rate": 2.4878670636595117e-05, - "loss": 1.7418, - "step": 182500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 1.5167595148086548, - "learning_rate": 2.4856530466116112e-05, - "loss": 1.7389, - "step": 182600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.6046936511993408, - "learning_rate": 2.4834390408163324e-05, - "loss": 1.7459, - "step": 182700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.572601079940796, - "learning_rate": 2.4812250480101693e-05, - "loss": 1.7464, - "step": 182800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 1.5549017190933228, - "learning_rate": 2.479011069929603e-05, - "loss": 1.7356, - "step": 182900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.5163230895996094, - "learning_rate": 2.476797108311106e-05, - "loss": 1.7427, - "step": 183000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.1313729286193848, - "eval_runtime": 51.744, - "eval_samples_per_second": 197.009, - "eval_steps_per_second": 1.546, - "step": 183000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.5936397314071655, - "learning_rate": 2.474583164891133e-05, - "loss": 1.7446, - "step": 183100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.5533971786499023, - "learning_rate": 2.4723692414061295e-05, - "loss": 1.7452, - "step": 183200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.6152623891830444, - "learning_rate": 2.4701553395925214e-05, - "loss": 1.7425, - "step": 183300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.4908332824707031, - "learning_rate": 2.4679414611867214e-05, - "loss": 1.755, - "step": 183400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.6560674905776978, - "learning_rate": 2.4657276079251194e-05, - "loss": 1.7477, - "step": 183500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.7160277366638184, - "learning_rate": 2.4635137815440894e-05, - "loss": 1.7446, - "step": 183600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.4447243213653564, - "learning_rate": 2.461299983779983e-05, - "loss": 1.7403, - "step": 183700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.605068325996399, - "learning_rate": 2.459086216369129e-05, - "loss": 1.7439, - "step": 183800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.6601132154464722, - "learning_rate": 2.4568724810478325e-05, - "loss": 1.7439, - "step": 183900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.546660304069519, - "learning_rate": 2.4546587795523733e-05, - "loss": 1.7339, - "step": 184000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.1373305320739746, - "eval_runtime": 51.7742, - "eval_samples_per_second": 196.893, - "eval_steps_per_second": 1.545, - "step": 184000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.6656001806259155, - "learning_rate": 2.4524451136190048e-05, - "loss": 1.8435, - "step": 184100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.6392732858657837, - "learning_rate": 2.4502314849839546e-05, - "loss": 1.8453, - "step": 184200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.7409366369247437, - "learning_rate": 2.4480178953834162e-05, - "loss": 1.8407, - "step": 184300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.5873730182647705, - "learning_rate": 2.445804346553557e-05, - "loss": 1.8428, - "step": 184400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.5073753595352173, - "learning_rate": 2.4435908402305108e-05, - "loss": 1.8379, - "step": 184500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 2.3680567741394043, - "learning_rate": 2.4413773781503788e-05, - "loss": 1.83, - "step": 184600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.6823689937591553, - "learning_rate": 2.4391639620492243e-05, - "loss": 1.8411, - "step": 184700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.5574064254760742, - "learning_rate": 2.4369505936630786e-05, - "loss": 1.8351, - "step": 184800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 2.146876096725464, - "learning_rate": 2.4347372747279337e-05, - "loss": 1.833, - "step": 184900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.6746612787246704, - "learning_rate": 2.4325240069797438e-05, - "loss": 1.8284, - "step": 185000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.133864641189575, - "eval_runtime": 52.0009, - "eval_samples_per_second": 196.035, - "eval_steps_per_second": 1.538, - "step": 185000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.6454411745071411, - "learning_rate": 2.430310792154422e-05, - "loss": 1.8312, - "step": 185100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.8907885551452637, - "learning_rate": 2.4280976319878392e-05, - "loss": 1.8384, - "step": 185200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.6488444805145264, - "learning_rate": 2.425884528215825e-05, - "loss": 1.8241, - "step": 185300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.6460552215576172, - "learning_rate": 2.423671482574164e-05, - "loss": 1.8318, - "step": 185400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.6229537725448608, - "learning_rate": 2.4214584967985962e-05, - "loss": 1.8349, - "step": 185500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.5805400609970093, - "learning_rate": 2.419245572624812e-05, - "loss": 1.823, - "step": 185600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.8274881839752197, - "learning_rate": 2.4170327117884562e-05, - "loss": 1.8363, - "step": 185700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.5922763347625732, - "learning_rate": 2.4148199160251238e-05, - "loss": 1.8272, - "step": 185800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.6500530242919922, - "learning_rate": 2.4126071870703574e-05, - "loss": 1.821, - "step": 185900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.6244685649871826, - "learning_rate": 2.410394526659647e-05, - "loss": 1.8287, - "step": 186000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.131998300552368, - "eval_runtime": 51.5187, - "eval_samples_per_second": 197.87, - "eval_steps_per_second": 1.553, - "step": 186000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.608132004737854, - "learning_rate": 2.40818193652843e-05, - "loss": 1.837, - "step": 186100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.5261002779006958, - "learning_rate": 2.4059694184120883e-05, - "loss": 1.827, - "step": 186200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.604973316192627, - "learning_rate": 2.4037569740459486e-05, - "loss": 1.8157, - "step": 186300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.6349529027938843, - "learning_rate": 2.401544605165276e-05, - "loss": 1.8381, - "step": 186400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.5540446043014526, - "learning_rate": 2.3993323135052806e-05, - "loss": 1.8383, - "step": 186500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.6200664043426514, - "learning_rate": 2.3971201008011093e-05, - "loss": 1.828, - "step": 186600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.750746726989746, - "learning_rate": 2.3949079687878492e-05, - "loss": 1.8302, - "step": 186700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.6309112310409546, - "learning_rate": 2.392695919200521e-05, - "loss": 1.8118, - "step": 186800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.5920358896255493, - "learning_rate": 2.3904839537740837e-05, - "loss": 1.8226, - "step": 186900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.7713048458099365, - "learning_rate": 2.3882720742434294e-05, - "loss": 1.8197, - "step": 187000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.121570348739624, - "eval_runtime": 51.4105, - "eval_samples_per_second": 198.286, - "eval_steps_per_second": 1.556, - "step": 187000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 1.674100637435913, - "learning_rate": 2.3860602823433825e-05, - "loss": 1.8338, - "step": 187100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.6260745525360107, - "learning_rate": 2.3838485798086984e-05, - "loss": 1.8209, - "step": 187200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 1.786022663116455, - "learning_rate": 2.3816369683740624e-05, - "loss": 1.8298, - "step": 187300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 1.521037220954895, - "learning_rate": 2.3794254497740898e-05, - "loss": 1.8353, - "step": 187400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.5519471168518066, - "learning_rate": 2.3772140257433223e-05, - "loss": 1.8361, - "step": 187500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.5187164545059204, - "learning_rate": 2.3750026980162256e-05, - "loss": 1.8326, - "step": 187600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.7430784702301025, - "learning_rate": 2.3727914683271922e-05, - "loss": 1.8308, - "step": 187700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 1.6210083961486816, - "learning_rate": 2.3705803384105377e-05, - "loss": 1.8252, - "step": 187800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.6390823125839233, - "learning_rate": 2.3683693100004985e-05, - "loss": 1.8287, - "step": 187900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 2.0330820083618164, - "learning_rate": 2.3661583848312303e-05, - "loss": 1.8347, - "step": 188000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.131164073944092, - "eval_runtime": 51.4325, - "eval_samples_per_second": 198.202, - "eval_steps_per_second": 1.555, - "step": 188000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 1.5582841634750366, - "learning_rate": 2.36394756463681e-05, - "loss": 1.8215, - "step": 188100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 1.5832375288009644, - "learning_rate": 2.361736851151231e-05, - "loss": 1.8316, - "step": 188200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 1.578747272491455, - "learning_rate": 2.359526246108404e-05, - "loss": 1.828, - "step": 188300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.6343365907669067, - "learning_rate": 2.3573157512421535e-05, - "loss": 1.8348, - "step": 188400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 1.5738635063171387, - "learning_rate": 2.3551053682862177e-05, - "loss": 1.8271, - "step": 188500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 1.6531946659088135, - "learning_rate": 2.3528950989742472e-05, - "loss": 1.8168, - "step": 188600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 2.098233699798584, - "learning_rate": 2.350684945039804e-05, - "loss": 1.8323, - "step": 188700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 1.6470394134521484, - "learning_rate": 2.3484749082163605e-05, - "loss": 1.8353, - "step": 188800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.6183503866195679, - "learning_rate": 2.346264990237293e-05, - "loss": 1.8204, - "step": 188900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 1.60996675491333, - "learning_rate": 2.3440551928358894e-05, - "loss": 1.8291, - "step": 189000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.130070924758911, - "eval_runtime": 51.3411, - "eval_samples_per_second": 198.554, - "eval_steps_per_second": 1.558, - "step": 189000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 1.5722655057907104, - "learning_rate": 2.3418455177453416e-05, - "loss": 1.8258, - "step": 189100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 2.121628999710083, - "learning_rate": 2.339635966698745e-05, - "loss": 1.8324, - "step": 189200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.6077678203582764, - "learning_rate": 2.3374265414290962e-05, - "loss": 1.8243, - "step": 189300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.5904488563537598, - "learning_rate": 2.335217243669296e-05, - "loss": 1.825, - "step": 189400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 1.536439061164856, - "learning_rate": 2.333008075152144e-05, - "loss": 1.8242, - "step": 189500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 2.195769786834717, - "learning_rate": 2.3307990376103388e-05, - "loss": 1.8365, - "step": 189600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 1.533521294593811, - "learning_rate": 2.328590132776475e-05, - "loss": 1.8266, - "step": 189700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.5849336385726929, - "learning_rate": 2.326381362383045e-05, - "loss": 1.8206, - "step": 189800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.5556162595748901, - "learning_rate": 2.3241727281624335e-05, - "loss": 1.8272, - "step": 189900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.6486213207244873, - "learning_rate": 2.3219642318469215e-05, - "loss": 1.8333, - "step": 190000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.1350369453430176, - "eval_runtime": 51.4386, - "eval_samples_per_second": 198.178, - "eval_steps_per_second": 1.555, - "step": 190000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.6402443647384644, - "learning_rate": 2.3197558751686776e-05, - "loss": 1.83, - "step": 190100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.5592520236968994, - "learning_rate": 2.3175476598597648e-05, - "loss": 1.8244, - "step": 190200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 2.0347630977630615, - "learning_rate": 2.3153395876521336e-05, - "loss": 1.8385, - "step": 190300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.547045350074768, - "learning_rate": 2.3131316602776232e-05, - "loss": 1.8216, - "step": 190400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.564841628074646, - "learning_rate": 2.3109238794679568e-05, - "loss": 1.8232, - "step": 190500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.8858461380004883, - "learning_rate": 2.3087162469547443e-05, - "loss": 1.8319, - "step": 190600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.7047299146652222, - "learning_rate": 2.30650876446948e-05, - "loss": 1.8391, - "step": 190700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.510563850402832, - "learning_rate": 2.30430143374354e-05, - "loss": 1.8226, - "step": 190800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 2.209728956222534, - "learning_rate": 2.3020942565081798e-05, - "loss": 1.8307, - "step": 190900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.6156638860702515, - "learning_rate": 2.299887234494537e-05, - "loss": 1.8208, - "step": 191000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.121595621109009, - "eval_runtime": 51.6838, - "eval_samples_per_second": 197.238, - "eval_steps_per_second": 1.548, - "step": 191000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.5259544849395752, - "learning_rate": 2.2976803694336256e-05, - "loss": 1.8279, - "step": 191100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.6435580253601074, - "learning_rate": 2.2954736630563375e-05, - "loss": 1.8291, - "step": 191200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.6680907011032104, - "learning_rate": 2.2932671170934405e-05, - "loss": 1.834, - "step": 191300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.6637004613876343, - "learning_rate": 2.2910607332755744e-05, - "loss": 1.8067, - "step": 191400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.5594576597213745, - "learning_rate": 2.288854513333254e-05, - "loss": 1.8132, - "step": 191500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.502920389175415, - "learning_rate": 2.2866484589968654e-05, - "loss": 1.8337, - "step": 191600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.566256046295166, - "learning_rate": 2.2844425719966637e-05, - "loss": 1.8216, - "step": 191700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.882520079612732, - "learning_rate": 2.2822368540627736e-05, - "loss": 1.8178, - "step": 191800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.5686990022659302, - "learning_rate": 2.2800313069251867e-05, - "loss": 1.831, - "step": 191900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.6161882877349854, - "learning_rate": 2.2778259323137607e-05, - "loss": 1.8236, - "step": 192000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.1250271797180176, - "eval_runtime": 51.7856, - "eval_samples_per_second": 196.85, - "eval_steps_per_second": 1.545, - "step": 192000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.9454728364944458, - "learning_rate": 2.27562073195822e-05, - "loss": 1.8262, - "step": 192100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.568524956703186, - "learning_rate": 2.273415707588148e-05, - "loss": 1.8111, - "step": 192200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.6108800172805786, - "learning_rate": 2.2712108609329933e-05, - "loss": 1.8097, - "step": 192300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.5785143375396729, - "learning_rate": 2.2690061937220656e-05, - "loss": 1.8223, - "step": 192400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 2.498911142349243, - "learning_rate": 2.2668017076845323e-05, - "loss": 2.0084, - "step": 192500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 2.186514377593994, - "learning_rate": 2.2645974045494175e-05, - "loss": 2.48, - "step": 192600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 2.3486995697021484, - "learning_rate": 2.2623932860456044e-05, - "loss": 2.4545, - "step": 192700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 2.1500723361968994, - "learning_rate": 2.2601893539018305e-05, - "loss": 2.4442, - "step": 192800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 2.1858279705047607, - "learning_rate": 2.2579856098466882e-05, - "loss": 2.4291, - "step": 192900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 2.4530797004699707, - "learning_rate": 2.2557820556086187e-05, - "loss": 2.4252, - "step": 193000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.1376500129699707, - "eval_runtime": 51.9091, - "eval_samples_per_second": 196.382, - "eval_steps_per_second": 1.541, - "step": 193000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 2.192619562149048, - "learning_rate": 2.253578692915919e-05, - "loss": 2.4244, - "step": 193100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 2.2540953159332275, - "learning_rate": 2.2513755234967317e-05, - "loss": 2.4187, - "step": 193200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 2.1056604385375977, - "learning_rate": 2.2491725490790526e-05, - "loss": 2.4017, - "step": 193300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 2.1589183807373047, - "learning_rate": 2.2469697713907186e-05, - "loss": 2.4083, - "step": 193400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 2.1547043323516846, - "learning_rate": 2.244767192159417e-05, - "loss": 2.4065, - "step": 193500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 2.057020425796509, - "learning_rate": 2.2425648131126777e-05, - "loss": 2.3981, - "step": 193600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 2.380244255065918, - "learning_rate": 2.2403626359778753e-05, - "loss": 2.404, - "step": 193700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 2.1975646018981934, - "learning_rate": 2.2381606624822228e-05, - "loss": 2.3931, - "step": 193800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 2.0740909576416016, - "learning_rate": 2.2359588943527746e-05, - "loss": 2.4027, - "step": 193900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 2.2962470054626465, - "learning_rate": 2.233757333316426e-05, - "loss": 2.3949, - "step": 194000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.148186206817627, - "eval_runtime": 51.9071, - "eval_samples_per_second": 196.389, - "eval_steps_per_second": 1.541, - "step": 194000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 2.1983277797698975, - "learning_rate": 2.2315559810999086e-05, - "loss": 2.3911, - "step": 194100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 2.1726229190826416, - "learning_rate": 2.2293548394297893e-05, - "loss": 2.3763, - "step": 194200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 2.190869092941284, - "learning_rate": 2.2271539100324705e-05, - "loss": 2.3822, - "step": 194300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 2.150756359100342, - "learning_rate": 2.22495319463419e-05, - "loss": 2.383, - "step": 194400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 2.159919500350952, - "learning_rate": 2.222752694961014e-05, - "loss": 2.3799, - "step": 194500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 2.1796655654907227, - "learning_rate": 2.2205524127388438e-05, - "loss": 2.3804, - "step": 194600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 2.313180446624756, - "learning_rate": 2.2183523496934052e-05, - "loss": 2.3574, - "step": 194700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 2.2141001224517822, - "learning_rate": 2.2161525075502565e-05, - "loss": 2.3726, - "step": 194800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 2.145921468734741, - "learning_rate": 2.2139528880347807e-05, - "loss": 2.3633, - "step": 194900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 2.279843330383301, - "learning_rate": 2.2117534928721878e-05, - "loss": 2.3747, - "step": 195000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.156066417694092, - "eval_runtime": 51.6132, - "eval_samples_per_second": 197.508, - "eval_steps_per_second": 1.55, - "step": 195000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 2.107222557067871, - "learning_rate": 2.2095543237875088e-05, - "loss": 2.3612, - "step": 195100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 2.1660873889923096, - "learning_rate": 2.207355382505599e-05, - "loss": 2.3562, - "step": 195200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 2.206403970718384, - "learning_rate": 2.2051566707511362e-05, - "loss": 2.371, - "step": 195300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 2.277531147003174, - "learning_rate": 2.2029581902486176e-05, - "loss": 2.3571, - "step": 195400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 2.041177749633789, - "learning_rate": 2.200759942722357e-05, - "loss": 2.3658, - "step": 195500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 2.2721259593963623, - "learning_rate": 2.1985619298964884e-05, - "loss": 2.3531, - "step": 195600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 2.2664246559143066, - "learning_rate": 2.1963641534949597e-05, - "loss": 2.3522, - "step": 195700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 2.323575258255005, - "learning_rate": 2.1941666152415343e-05, - "loss": 2.342, - "step": 195800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 2.1575205326080322, - "learning_rate": 2.1919693168597887e-05, - "loss": 2.3505, - "step": 195900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 2.265693187713623, - "learning_rate": 2.1897722600731107e-05, - "loss": 2.3428, - "step": 196000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.1471190452575684, - "eval_runtime": 51.714, - "eval_samples_per_second": 197.123, - "eval_steps_per_second": 1.547, - "step": 196000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 2.1313159465789795, - "learning_rate": 2.187575446604699e-05, - "loss": 2.344, - "step": 196100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 2.165553569793701, - "learning_rate": 2.1853788781775626e-05, - "loss": 2.3369, - "step": 196200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 2.348489999771118, - "learning_rate": 2.1831825565145155e-05, - "loss": 2.3325, - "step": 196300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 2.2844085693359375, - "learning_rate": 2.1809864833381816e-05, - "loss": 2.3458, - "step": 196400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 2.1077094078063965, - "learning_rate": 2.1787906603709863e-05, - "loss": 2.3301, - "step": 196500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 2.2360150814056396, - "learning_rate": 2.1765950893351627e-05, - "loss": 2.3357, - "step": 196600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 2.1342897415161133, - "learning_rate": 2.1743997719527423e-05, - "loss": 2.3309, - "step": 196700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 2.3143725395202637, - "learning_rate": 2.17220470994556e-05, - "loss": 2.3497, - "step": 196800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 2.207287549972534, - "learning_rate": 2.170009905035251e-05, - "loss": 2.3268, - "step": 196900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 2.1440131664276123, - "learning_rate": 2.167815358943248e-05, - "loss": 2.3535, - "step": 197000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.1492607593536377, - "eval_runtime": 51.6958, - "eval_samples_per_second": 197.192, - "eval_steps_per_second": 1.548, - "step": 197000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 2.189824342727661, - "learning_rate": 2.165621073390779e-05, - "loss": 2.3368, - "step": 197100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 2.1990151405334473, - "learning_rate": 2.16342705009887e-05, - "loss": 2.3344, - "step": 197200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 2.077488899230957, - "learning_rate": 2.1612332907883405e-05, - "loss": 2.3267, - "step": 197300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 2.2698981761932373, - "learning_rate": 2.1590397971798025e-05, - "loss": 2.3285, - "step": 197400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 2.2117862701416016, - "learning_rate": 2.1568465709936615e-05, - "loss": 2.322, - "step": 197500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 2.194138288497925, - "learning_rate": 2.15465361395011e-05, - "loss": 2.3228, - "step": 197600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 2.151017665863037, - "learning_rate": 2.1524609277691327e-05, - "loss": 2.3376, - "step": 197700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 2.273414373397827, - "learning_rate": 2.1502685141704992e-05, - "loss": 2.3298, - "step": 197800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 2.2569565773010254, - "learning_rate": 2.148076374873768e-05, - "loss": 2.3371, - "step": 197900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 2.109938621520996, - "learning_rate": 2.1458845115982783e-05, - "loss": 2.3074, - "step": 198000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.156459331512451, - "eval_runtime": 51.6968, - "eval_samples_per_second": 197.188, - "eval_steps_per_second": 1.547, - "step": 198000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 2.1745998859405518, - "learning_rate": 2.1436929260631578e-05, - "loss": 2.3337, - "step": 198100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 2.120976448059082, - "learning_rate": 2.141501619987313e-05, - "loss": 2.3231, - "step": 198200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 2.1885461807250977, - "learning_rate": 2.139310595089434e-05, - "loss": 2.3277, - "step": 198300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 2.2620506286621094, - "learning_rate": 2.137119853087986e-05, - "loss": 2.3335, - "step": 198400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 2.1864798069000244, - "learning_rate": 2.1349293957012156e-05, - "loss": 2.3239, - "step": 198500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 2.1792876720428467, - "learning_rate": 2.1327392246471463e-05, - "loss": 2.3166, - "step": 198600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 2.264899730682373, - "learning_rate": 2.1305493416435765e-05, - "loss": 2.3171, - "step": 198700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 1.9806472063064575, - "learning_rate": 2.1283597484080765e-05, - "loss": 2.3247, - "step": 198800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 2.1849722862243652, - "learning_rate": 2.1261704466579928e-05, - "loss": 2.3158, - "step": 198900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 2.025466203689575, - "learning_rate": 2.1239814381104417e-05, - "loss": 2.3061, - "step": 199000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.146428108215332, - "eval_runtime": 51.7966, - "eval_samples_per_second": 196.808, - "eval_steps_per_second": 1.545, - "step": 199000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 2.0780715942382812, - "learning_rate": 2.1217927244823092e-05, - "loss": 2.3137, - "step": 199100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 2.1887388229370117, - "learning_rate": 2.1196043074902503e-05, - "loss": 2.311, - "step": 199200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 2.110805034637451, - "learning_rate": 2.1174161888506867e-05, - "loss": 2.3166, - "step": 199300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 2.1829800605773926, - "learning_rate": 2.1152283702798077e-05, - "loss": 2.3035, - "step": 199400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 2.2523720264434814, - "learning_rate": 2.1130408534935664e-05, - "loss": 2.3104, - "step": 199500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 2.268869400024414, - "learning_rate": 2.1108536402076777e-05, - "loss": 2.3095, - "step": 199600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 2.352266788482666, - "learning_rate": 2.108666732137622e-05, - "loss": 2.3235, - "step": 199700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 2.0702219009399414, - "learning_rate": 2.106480130998636e-05, - "loss": 2.301, - "step": 199800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 2.219024896621704, - "learning_rate": 2.1042938385057202e-05, - "loss": 2.2952, - "step": 199900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 2.2651102542877197, - "learning_rate": 2.102107856373628e-05, - "loss": 2.302, - "step": 200000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.1563775539398193, - "eval_runtime": 52.0565, - "eval_samples_per_second": 195.826, - "eval_steps_per_second": 1.537, - "step": 200000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 2.2001454830169678, - "learning_rate": 2.0999221863168736e-05, - "loss": 2.3131, - "step": 200100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 2.1782033443450928, - "learning_rate": 2.0977368300497246e-05, - "loss": 2.3084, - "step": 200200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 2.282090663909912, - "learning_rate": 2.095551789286204e-05, - "loss": 2.2983, - "step": 200300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 2.1379668712615967, - "learning_rate": 2.0933670657400838e-05, - "loss": 2.2989, - "step": 200400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 2.3254175186157227, - "learning_rate": 2.091182661124891e-05, - "loss": 2.3211, - "step": 200500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 2.112151622772217, - "learning_rate": 2.0889985771539002e-05, - "loss": 2.288, - "step": 200600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 2.2548341751098633, - "learning_rate": 2.0868148155401356e-05, - "loss": 2.3027, - "step": 200700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 2.3280234336853027, - "learning_rate": 2.0846313779963696e-05, - "loss": 2.3049, - "step": 200800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 2.256028175354004, - "learning_rate": 2.0824482662351167e-05, - "loss": 2.3023, - "step": 200900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 2.195711851119995, - "learning_rate": 2.0802654819686398e-05, - "loss": 2.2887, - "step": 201000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.14955997467041, - "eval_runtime": 51.6475, - "eval_samples_per_second": 197.376, - "eval_steps_per_second": 1.549, - "step": 201000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 2.0905580520629883, - "learning_rate": 2.0780830269089423e-05, - "loss": 2.2914, - "step": 201100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 2.1279406547546387, - "learning_rate": 2.0759009027677727e-05, - "loss": 2.3037, - "step": 201200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 2.178835868835449, - "learning_rate": 2.0737191112566146e-05, - "loss": 2.2989, - "step": 201300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 2.2267632484436035, - "learning_rate": 2.071537654086696e-05, - "loss": 2.2928, - "step": 201400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 2.310661792755127, - "learning_rate": 2.0693565329689793e-05, - "loss": 2.3337, - "step": 201500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 2.2507314682006836, - "learning_rate": 2.0671757496141665e-05, - "loss": 2.3269, - "step": 201600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 2.161654472351074, - "learning_rate": 2.0649953057326904e-05, - "loss": 2.3191, - "step": 201700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 2.2663004398345947, - "learning_rate": 2.0628152030347214e-05, - "loss": 2.3153, - "step": 201800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 2.2835566997528076, - "learning_rate": 2.06063544323016e-05, - "loss": 2.3127, - "step": 201900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 2.1445398330688477, - "learning_rate": 2.0584560280286397e-05, - "loss": 2.2974, - "step": 202000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.17156720161438, - "eval_runtime": 51.3282, - "eval_samples_per_second": 198.604, - "eval_steps_per_second": 1.559, - "step": 202000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 2.2205893993377686, - "learning_rate": 2.0562769591395203e-05, - "loss": 2.3078, - "step": 202100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 2.205244541168213, - "learning_rate": 2.054098238271894e-05, - "loss": 2.2938, - "step": 202200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 2.2526943683624268, - "learning_rate": 2.0519198671345784e-05, - "loss": 2.2967, - "step": 202300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 2.3271262645721436, - "learning_rate": 2.049741847436116e-05, - "loss": 2.2701, - "step": 202400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 2.225120782852173, - "learning_rate": 2.047564180884775e-05, - "loss": 2.3035, - "step": 202500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 2.1193225383758545, - "learning_rate": 2.0453868691885446e-05, - "loss": 2.287, - "step": 202600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 2.305154800415039, - "learning_rate": 2.043209914055138e-05, - "loss": 2.2997, - "step": 202700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 2.1183183193206787, - "learning_rate": 2.041033317191989e-05, - "loss": 2.3005, - "step": 202800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 2.175776481628418, - "learning_rate": 2.0388570803062465e-05, - "loss": 2.2992, - "step": 202900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 2.2266392707824707, - "learning_rate": 2.036681205104782e-05, - "loss": 2.2959, - "step": 203000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.167436122894287, - "eval_runtime": 51.341, - "eval_samples_per_second": 198.555, - "eval_steps_per_second": 1.558, - "step": 203000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 2.2803258895874023, - "learning_rate": 2.0345056932941793e-05, - "loss": 2.2866, - "step": 203100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 2.1691677570343018, - "learning_rate": 2.032330546580741e-05, - "loss": 2.2798, - "step": 203200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 2.0682337284088135, - "learning_rate": 2.0301557666704787e-05, - "loss": 2.2847, - "step": 203300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 2.41520357131958, - "learning_rate": 2.0279813552691208e-05, - "loss": 2.2897, - "step": 203400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 2.2019283771514893, - "learning_rate": 2.025807314082104e-05, - "loss": 2.2855, - "step": 203500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 2.154576539993286, - "learning_rate": 2.0236336448145766e-05, - "loss": 2.2726, - "step": 203600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 2.2569046020507812, - "learning_rate": 2.0214603491713928e-05, - "loss": 2.2666, - "step": 203700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 2.306614875793457, - "learning_rate": 2.0192874288571152e-05, - "loss": 2.2826, - "step": 203800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 2.2659449577331543, - "learning_rate": 2.017114885576012e-05, - "loss": 2.288, - "step": 203900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 2.14077091217041, - "learning_rate": 2.0149427210320545e-05, - "loss": 2.2729, - "step": 204000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.164825916290283, - "eval_runtime": 51.3793, - "eval_samples_per_second": 198.407, - "eval_steps_per_second": 1.557, - "step": 204000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 2.265152931213379, - "learning_rate": 2.0127709369289202e-05, - "loss": 2.2654, - "step": 204100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 2.0833661556243896, - "learning_rate": 2.0105995349699832e-05, - "loss": 2.2863, - "step": 204200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 2.2797181606292725, - "learning_rate": 2.008428516858323e-05, - "loss": 2.2702, - "step": 204300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 2.2614681720733643, - "learning_rate": 2.006257884296713e-05, - "loss": 2.2846, - "step": 204400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 2.1245336532592773, - "learning_rate": 2.00408763898763e-05, - "loss": 2.2758, - "step": 204500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 2.14581298828125, - "learning_rate": 2.001917782633241e-05, - "loss": 2.2624, - "step": 204600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 2.240208864212036, - "learning_rate": 1.9997483169354124e-05, - "loss": 2.2563, - "step": 204700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 2.290208578109741, - "learning_rate": 1.9975792435957024e-05, - "loss": 2.2733, - "step": 204800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 2.309551954269409, - "learning_rate": 1.9954105643153624e-05, - "loss": 2.2575, - "step": 204900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 2.183645009994507, - "learning_rate": 1.9932422807953323e-05, - "loss": 2.2796, - "step": 205000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.1678764820098877, - "eval_runtime": 51.2196, - "eval_samples_per_second": 199.025, - "eval_steps_per_second": 1.562, - "step": 205000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 2.1871604919433594, - "learning_rate": 1.9910743947362455e-05, - "loss": 2.2631, - "step": 205100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 2.1617250442504883, - "learning_rate": 1.9889069078384193e-05, - "loss": 2.2609, - "step": 205200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 2.183656692504883, - "learning_rate": 1.9867398218018624e-05, - "loss": 2.2568, - "step": 205300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 2.2372233867645264, - "learning_rate": 1.9845731383262646e-05, - "loss": 2.2663, - "step": 205400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 2.200566053390503, - "learning_rate": 1.9824068591110034e-05, - "loss": 2.2511, - "step": 205500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 2.1325571537017822, - "learning_rate": 1.9802409858551382e-05, - "loss": 2.2628, - "step": 205600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 2.1458706855773926, - "learning_rate": 1.9780755202574098e-05, - "loss": 2.2565, - "step": 205700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 2.397474527359009, - "learning_rate": 1.9759104640162388e-05, - "loss": 2.2582, - "step": 205800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 2.239386558532715, - "learning_rate": 1.9737458188297247e-05, - "loss": 2.2484, - "step": 205900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 2.17461895942688, - "learning_rate": 1.9715815863956462e-05, - "loss": 2.2536, - "step": 206000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.1656434535980225, - "eval_runtime": 51.4827, - "eval_samples_per_second": 198.008, - "eval_steps_per_second": 1.554, - "step": 206000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 2.1970889568328857, - "learning_rate": 1.969417768411458e-05, - "loss": 2.269, - "step": 206100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 2.151305913925171, - "learning_rate": 1.967254366574286e-05, - "loss": 2.2609, - "step": 206200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 2.164149045944214, - "learning_rate": 1.965091382580935e-05, - "loss": 2.2608, - "step": 206300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 2.203151226043701, - "learning_rate": 1.9629288181278795e-05, - "loss": 2.2616, - "step": 206400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 2.1855273246765137, - "learning_rate": 1.960766674911264e-05, - "loss": 2.2614, - "step": 206500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 2.124351978302002, - "learning_rate": 1.958604954626906e-05, - "loss": 2.2448, - "step": 206600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 2.177095890045166, - "learning_rate": 1.9564436589702864e-05, - "loss": 2.2519, - "step": 206700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 2.1898281574249268, - "learning_rate": 1.9542827896365568e-05, - "loss": 2.2608, - "step": 206800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 2.2773730754852295, - "learning_rate": 1.9521223483205342e-05, - "loss": 2.262, - "step": 206900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 2.2109436988830566, - "learning_rate": 1.9499623367166982e-05, - "loss": 2.2448, - "step": 207000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.164100408554077, - "eval_runtime": 51.5664, - "eval_samples_per_second": 197.687, - "eval_steps_per_second": 1.551, - "step": 207000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 2.2141733169555664, - "learning_rate": 1.9478027565191922e-05, - "loss": 2.2537, - "step": 207100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 2.2592718601226807, - "learning_rate": 1.945643609421821e-05, - "loss": 2.2441, - "step": 207200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 2.2082977294921875, - "learning_rate": 1.94348489711805e-05, - "loss": 2.2529, - "step": 207300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 2.2095062732696533, - "learning_rate": 1.941326621301005e-05, - "loss": 2.2597, - "step": 207400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 2.189436674118042, - "learning_rate": 1.939168783663466e-05, - "loss": 2.2455, - "step": 207500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 2.218168258666992, - "learning_rate": 1.9370113858978722e-05, - "loss": 2.2485, - "step": 207600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 2.1648590564727783, - "learning_rate": 1.9348544296963165e-05, - "loss": 2.2456, - "step": 207700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 2.121211051940918, - "learning_rate": 1.9326979167505474e-05, - "loss": 2.2364, - "step": 207800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 2.271167039871216, - "learning_rate": 1.9305418487519617e-05, - "loss": 2.2561, - "step": 207900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 2.3215372562408447, - "learning_rate": 1.9283862273916116e-05, - "loss": 2.2397, - "step": 208000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.164187431335449, - "eval_runtime": 51.5373, - "eval_samples_per_second": 197.799, - "eval_steps_per_second": 1.552, - "step": 208000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 2.174811363220215, - "learning_rate": 1.9262310543601962e-05, - "loss": 2.2412, - "step": 208100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 2.1047627925872803, - "learning_rate": 1.9240763313480655e-05, - "loss": 2.2363, - "step": 208200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 2.2328543663024902, - "learning_rate": 1.9219220600452127e-05, - "loss": 2.2537, - "step": 208300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 2.1852455139160156, - "learning_rate": 1.919768242141281e-05, - "loss": 2.2472, - "step": 208400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 2.23559832572937, - "learning_rate": 1.9176148793255543e-05, - "loss": 2.243, - "step": 208500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 2.195355176925659, - "learning_rate": 1.9154619732869626e-05, - "loss": 2.2463, - "step": 208600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 2.295536994934082, - "learning_rate": 1.913309525714075e-05, - "loss": 2.2413, - "step": 208700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 2.373781681060791, - "learning_rate": 1.9111575382951026e-05, - "loss": 2.2385, - "step": 208800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 2.3178882598876953, - "learning_rate": 1.909006012717896e-05, - "loss": 2.2454, - "step": 208900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 2.2002763748168945, - "learning_rate": 1.9068549506699425e-05, - "loss": 2.236, - "step": 209000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.1654672622680664, - "eval_runtime": 51.5727, - "eval_samples_per_second": 197.663, - "eval_steps_per_second": 1.551, - "step": 209000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 2.2618346214294434, - "learning_rate": 1.9047043538383662e-05, - "loss": 2.2211, - "step": 209100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 2.2079176902770996, - "learning_rate": 1.9025542239099252e-05, - "loss": 2.2456, - "step": 209200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 2.119337797164917, - "learning_rate": 1.9004045625710136e-05, - "loss": 2.2356, - "step": 209300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 2.2664501667022705, - "learning_rate": 1.8982553715076583e-05, - "loss": 2.2403, - "step": 209400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 2.2333970069885254, - "learning_rate": 1.8961066524055128e-05, - "loss": 2.2522, - "step": 209500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 2.1713504791259766, - "learning_rate": 1.8939584069498647e-05, - "loss": 2.2488, - "step": 209600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 2.1721699237823486, - "learning_rate": 1.8918106368256302e-05, - "loss": 2.2418, - "step": 209700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 2.102562189102173, - "learning_rate": 1.88966334371735e-05, - "loss": 2.2346, - "step": 209800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 2.1796703338623047, - "learning_rate": 1.8875165293091936e-05, - "loss": 2.2445, - "step": 209900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 2.25935697555542, - "learning_rate": 1.885370195284952e-05, - "loss": 2.2407, - "step": 210000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.174961566925049, - "eval_runtime": 51.6538, - "eval_samples_per_second": 197.352, - "eval_steps_per_second": 1.549, - "step": 210000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 2.1532399654388428, - "learning_rate": 1.8832243433280412e-05, - "loss": 2.2312, - "step": 210100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 2.322571277618408, - "learning_rate": 1.8810789751215e-05, - "loss": 2.235, - "step": 210200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 2.1225528717041016, - "learning_rate": 1.8789340923479862e-05, - "loss": 2.2175, - "step": 210300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 2.2108681201934814, - "learning_rate": 1.8767896966897768e-05, - "loss": 2.239, - "step": 210400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 2.227198839187622, - "learning_rate": 1.8746457898287673e-05, - "loss": 2.2274, - "step": 210500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 2.250565528869629, - "learning_rate": 1.8725023734464702e-05, - "loss": 2.2318, - "step": 210600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 2.1811561584472656, - "learning_rate": 1.8703594492240138e-05, - "loss": 2.2033, - "step": 210700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 2.1336236000061035, - "learning_rate": 1.8682170188421375e-05, - "loss": 2.1952, - "step": 210800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 2.2047863006591797, - "learning_rate": 1.8660750839811963e-05, - "loss": 2.1909, - "step": 210900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 2.003309965133667, - "learning_rate": 1.8639336463211566e-05, - "loss": 2.1693, - "step": 211000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.171804189682007, - "eval_runtime": 51.5992, - "eval_samples_per_second": 197.561, - "eval_steps_per_second": 1.55, - "step": 211000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 2.105639934539795, - "learning_rate": 1.861792707541593e-05, - "loss": 2.1683, - "step": 211100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 2.2332839965820312, - "learning_rate": 1.8596522693216888e-05, - "loss": 2.1594, - "step": 211200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 2.2061290740966797, - "learning_rate": 1.8575123333402367e-05, - "loss": 2.1593, - "step": 211300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 2.0589332580566406, - "learning_rate": 1.855372901275634e-05, - "loss": 2.1437, - "step": 211400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 2.1569809913635254, - "learning_rate": 1.8532339748058844e-05, - "loss": 2.1533, - "step": 211500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 2.1025686264038086, - "learning_rate": 1.8510955556085915e-05, - "loss": 2.1525, - "step": 211600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 2.19555926322937, - "learning_rate": 1.848957645360965e-05, - "loss": 2.1447, - "step": 211700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 2.095914840698242, - "learning_rate": 1.8468202457398126e-05, - "loss": 2.1421, - "step": 211800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 2.1924917697906494, - "learning_rate": 1.8446833584215444e-05, - "loss": 2.1416, - "step": 211900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 2.123359203338623, - "learning_rate": 1.8425469850821648e-05, - "loss": 2.1465, - "step": 212000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.1811015605926514, - "eval_runtime": 51.5948, - "eval_samples_per_second": 197.578, - "eval_steps_per_second": 1.551, - "step": 212000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 2.065702438354492, - "learning_rate": 1.840411127397278e-05, - "loss": 2.1352, - "step": 212100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 2.0806708335876465, - "learning_rate": 1.838275787042083e-05, - "loss": 2.1432, - "step": 212200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 2.1028740406036377, - "learning_rate": 1.8361409656913744e-05, - "loss": 2.1349, - "step": 212300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 2.1603927612304688, - "learning_rate": 1.8340066650195363e-05, - "loss": 2.1307, - "step": 212400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 2.016268014907837, - "learning_rate": 1.831872886700547e-05, - "loss": 2.129, - "step": 212500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 1.9362486600875854, - "learning_rate": 1.829739632407975e-05, - "loss": 2.1187, - "step": 212600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 2.1569607257843018, - "learning_rate": 1.827606903814977e-05, - "loss": 2.1314, - "step": 212700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 2.0166728496551514, - "learning_rate": 1.825474702594299e-05, - "loss": 2.1274, - "step": 212800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 2.1779658794403076, - "learning_rate": 1.8233430304182704e-05, - "loss": 2.1183, - "step": 212900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 2.1090939044952393, - "learning_rate": 1.821211888958808e-05, - "loss": 2.126, - "step": 213000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.1809489727020264, - "eval_runtime": 51.5547, - "eval_samples_per_second": 197.732, - "eval_steps_per_second": 1.552, - "step": 213000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 2.2175374031066895, - "learning_rate": 1.819081279887411e-05, - "loss": 2.1201, - "step": 213100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 2.0139071941375732, - "learning_rate": 1.8169512048751648e-05, - "loss": 2.1207, - "step": 213200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 2.101840019226074, - "learning_rate": 1.814821665592729e-05, - "loss": 2.1145, - "step": 213300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 2.199965238571167, - "learning_rate": 1.8126926637103484e-05, - "loss": 2.1256, - "step": 213400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 2.042839288711548, - "learning_rate": 1.8105642008978458e-05, - "loss": 2.1096, - "step": 213500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 2.233668804168701, - "learning_rate": 1.808436278824619e-05, - "loss": 2.1099, - "step": 213600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 2.0933728218078613, - "learning_rate": 1.8063088991596437e-05, - "loss": 2.1014, - "step": 213700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 2.1422884464263916, - "learning_rate": 1.8041820635714682e-05, - "loss": 2.1034, - "step": 213800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 2.0475480556488037, - "learning_rate": 1.802055773728216e-05, - "loss": 2.1116, - "step": 213900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 2.0574936866760254, - "learning_rate": 1.799930031297583e-05, - "loss": 2.1181, - "step": 214000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.1702778339385986, - "eval_runtime": 51.7407, - "eval_samples_per_second": 197.021, - "eval_steps_per_second": 1.546, - "step": 214000 - }, - { - "epoch": 0.042141338140980915, - "grad_norm": 2.011029005050659, - "learning_rate": 1.7978048379468322e-05, - "loss": 2.1068, - "step": 214100 - }, - { - "epoch": 0.04242041985052384, - "grad_norm": 2.035914897918701, - "learning_rate": 1.7956801953428e-05, - "loss": 2.1174, - "step": 214200 - }, - { - "epoch": 0.04269950156006676, - "grad_norm": 2.129701852798462, - "learning_rate": 1.7935561051518883e-05, - "loss": 2.1197, - "step": 214300 - }, - { - "epoch": 0.042978583269609676, - "grad_norm": 2.043063163757324, - "learning_rate": 1.791432569040068e-05, - "loss": 2.1106, - "step": 214400 - }, - { - "epoch": 0.043257664979152594, - "grad_norm": 2.03788161277771, - "learning_rate": 1.7893095886728716e-05, - "loss": 2.1055, - "step": 214500 - }, - { - "epoch": 0.04353674668869552, - "grad_norm": 1.9218449592590332, - "learning_rate": 1.7871871657153993e-05, - "loss": 2.1038, - "step": 214600 - }, - { - "epoch": 0.043815828398238436, - "grad_norm": 2.175419807434082, - "learning_rate": 1.7850653018323132e-05, - "loss": 2.1049, - "step": 214700 - }, - { - "epoch": 0.044094910107781354, - "grad_norm": 2.14815616607666, - "learning_rate": 1.7829439986878374e-05, - "loss": 2.1158, - "step": 214800 - }, - { - "epoch": 0.04437399181732428, - "grad_norm": 1.9514108896255493, - "learning_rate": 1.7808232579457534e-05, - "loss": 2.092, - "step": 214900 - }, - { - "epoch": 0.0446530735268672, - "grad_norm": 2.0511226654052734, - "learning_rate": 1.778703081269405e-05, - "loss": 2.0992, - "step": 215000 - }, - { - "epoch": 0.0446530735268672, - "eval_loss": 2.183467388153076, - "eval_runtime": 51.5396, - "eval_samples_per_second": 197.79, - "eval_steps_per_second": 1.552, - "step": 215000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 2.159756660461426, - "learning_rate": 1.776583470321692e-05, - "loss": 2.0955, - "step": 215100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 2.170898675918579, - "learning_rate": 1.7744644267650712e-05, - "loss": 2.1049, - "step": 215200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.9969067573547363, - "learning_rate": 1.7723459522615522e-05, - "loss": 2.092, - "step": 215300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.9468703269958496, - "learning_rate": 1.770228048472701e-05, - "loss": 2.1021, - "step": 215400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 2.082648992538452, - "learning_rate": 1.7681107170596357e-05, - "loss": 2.0915, - "step": 215500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 2.049349546432495, - "learning_rate": 1.7659939596830243e-05, - "loss": 2.0962, - "step": 215600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 2.176790952682495, - "learning_rate": 1.7638777780030844e-05, - "loss": 2.0892, - "step": 215700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 2.1624631881713867, - "learning_rate": 1.7617621736795824e-05, - "loss": 2.0963, - "step": 215800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 2.1935231685638428, - "learning_rate": 1.7596471483718328e-05, - "loss": 2.0814, - "step": 215900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 2.091728925704956, - "learning_rate": 1.757532703738695e-05, - "loss": 2.0956, - "step": 216000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.1795222759246826, - "eval_runtime": 51.863, - "eval_samples_per_second": 196.556, - "eval_steps_per_second": 1.543, - "step": 216000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.9175347089767456, - "learning_rate": 1.7554188414385746e-05, - "loss": 2.083, - "step": 216100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 2.0839240550994873, - "learning_rate": 1.753305563129417e-05, - "loss": 2.0849, - "step": 216200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 2.2987542152404785, - "learning_rate": 1.751192870468713e-05, - "loss": 2.107, - "step": 216300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 2.0684635639190674, - "learning_rate": 1.7490807651134916e-05, - "loss": 2.0833, - "step": 216400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 2.094618558883667, - "learning_rate": 1.7469692487203242e-05, - "loss": 2.1003, - "step": 216500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 2.0774834156036377, - "learning_rate": 1.7448583229453163e-05, - "loss": 2.0854, - "step": 216600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 2.2240655422210693, - "learning_rate": 1.7427479894441135e-05, - "loss": 2.0914, - "step": 216700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 2.094910144805908, - "learning_rate": 1.740638249871895e-05, - "loss": 2.0913, - "step": 216800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 2.0924530029296875, - "learning_rate": 1.738529105883376e-05, - "loss": 2.0825, - "step": 216900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 2.0093395709991455, - "learning_rate": 1.7364205591328018e-05, - "loss": 2.0782, - "step": 217000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.17291259765625, - "eval_runtime": 51.4439, - "eval_samples_per_second": 198.157, - "eval_steps_per_second": 1.555, - "step": 217000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 2.0085370540618896, - "learning_rate": 1.734312611273951e-05, - "loss": 2.0714, - "step": 217100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 2.3136491775512695, - "learning_rate": 1.7322052639601328e-05, - "loss": 2.0794, - "step": 217200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 2.062134265899658, - "learning_rate": 1.7300985188441854e-05, - "loss": 2.0822, - "step": 217300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 2.0435168743133545, - "learning_rate": 1.727992377578473e-05, - "loss": 2.0763, - "step": 217400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 2.1942365169525146, - "learning_rate": 1.7258868418148874e-05, - "loss": 2.0876, - "step": 217500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 2.1672890186309814, - "learning_rate": 1.7237819132048467e-05, - "loss": 2.0832, - "step": 217600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.8856595754623413, - "learning_rate": 1.7216775933992906e-05, - "loss": 2.0706, - "step": 217700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 2.1063289642333984, - "learning_rate": 1.7195738840486825e-05, - "loss": 2.2249, - "step": 217800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 2.09557843208313, - "learning_rate": 1.717470786803006e-05, - "loss": 2.2446, - "step": 217900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 2.1334340572357178, - "learning_rate": 1.715368303311766e-05, - "loss": 2.2297, - "step": 218000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.1775035858154297, - "eval_runtime": 51.4889, - "eval_samples_per_second": 197.984, - "eval_steps_per_second": 1.554, - "step": 218000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 2.201794385910034, - "learning_rate": 1.713266435223986e-05, - "loss": 2.2351, - "step": 218100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 2.2592103481292725, - "learning_rate": 1.711165184188205e-05, - "loss": 2.223, - "step": 218200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 2.382873773574829, - "learning_rate": 1.7090645518524797e-05, - "loss": 2.2283, - "step": 218300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 2.2751810550689697, - "learning_rate": 1.706964539864381e-05, - "loss": 2.2369, - "step": 218400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 2.439268112182617, - "learning_rate": 1.7048651498709944e-05, - "loss": 2.227, - "step": 218500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 2.244767665863037, - "learning_rate": 1.7027663835189145e-05, - "loss": 2.2235, - "step": 218600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 2.1761574745178223, - "learning_rate": 1.7006682424542497e-05, - "loss": 2.2172, - "step": 218700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 2.32922101020813, - "learning_rate": 1.6985707283226172e-05, - "loss": 2.2169, - "step": 218800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 2.1702868938446045, - "learning_rate": 1.6964738427691426e-05, - "loss": 2.2243, - "step": 218900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 2.0979557037353516, - "learning_rate": 1.6943775874384583e-05, - "loss": 2.2045, - "step": 219000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.1724750995635986, - "eval_runtime": 51.344, - "eval_samples_per_second": 198.543, - "eval_steps_per_second": 1.558, - "step": 219000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 2.1244499683380127, - "learning_rate": 1.6922819639747006e-05, - "loss": 2.2174, - "step": 219100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 2.18345046043396, - "learning_rate": 1.690186974021513e-05, - "loss": 2.2265, - "step": 219200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 2.2020881175994873, - "learning_rate": 1.6880926192220413e-05, - "loss": 2.2272, - "step": 219300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 2.2746477127075195, - "learning_rate": 1.6859989012189337e-05, - "loss": 2.2184, - "step": 219400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 2.2917847633361816, - "learning_rate": 1.6839058216543358e-05, - "loss": 2.2267, - "step": 219500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 2.2045438289642334, - "learning_rate": 1.6818133821698965e-05, - "loss": 2.2119, - "step": 219600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 2.218310594558716, - "learning_rate": 1.6797215844067604e-05, - "loss": 2.2216, - "step": 219700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 2.124152898788452, - "learning_rate": 1.67763043000557e-05, - "loss": 2.2065, - "step": 219800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 2.10780930519104, - "learning_rate": 1.675539920606461e-05, - "loss": 2.2149, - "step": 219900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 2.210146903991699, - "learning_rate": 1.673450057849066e-05, - "loss": 2.2149, - "step": 220000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.164307117462158, - "eval_runtime": 51.3547, - "eval_samples_per_second": 198.502, - "eval_steps_per_second": 1.558, - "step": 220000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 2.1689798831939697, - "learning_rate": 1.671360843372508e-05, - "loss": 2.2174, - "step": 220100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 2.2905499935150146, - "learning_rate": 1.669272278815405e-05, - "loss": 2.2041, - "step": 220200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 2.155677080154419, - "learning_rate": 1.6671843658158613e-05, - "loss": 2.2197, - "step": 220300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 2.2219150066375732, - "learning_rate": 1.665097106011471e-05, - "loss": 2.2173, - "step": 220400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 2.145770311355591, - "learning_rate": 1.6630105010393178e-05, - "loss": 2.1991, - "step": 220500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 2.2329516410827637, - "learning_rate": 1.6609245525359717e-05, - "loss": 2.222, - "step": 220600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 2.230044364929199, - "learning_rate": 1.6588392621374846e-05, - "loss": 2.2124, - "step": 220700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 2.2386929988861084, - "learning_rate": 1.6567546314793956e-05, - "loss": 2.1982, - "step": 220800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 2.178781747817993, - "learning_rate": 1.6546706621967255e-05, - "loss": 2.2056, - "step": 220900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 2.2631821632385254, - "learning_rate": 1.6525873559239764e-05, - "loss": 2.1995, - "step": 221000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.167518138885498, - "eval_runtime": 51.2411, - "eval_samples_per_second": 198.942, - "eval_steps_per_second": 1.561, - "step": 221000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 2.186282157897949, - "learning_rate": 1.650504714295129e-05, - "loss": 2.2005, - "step": 221100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 2.2361273765563965, - "learning_rate": 1.648422738943644e-05, - "loss": 2.2034, - "step": 221200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 2.1385703086853027, - "learning_rate": 1.646341431502459e-05, - "loss": 2.2073, - "step": 221300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 2.232243299484253, - "learning_rate": 1.64426079360399e-05, - "loss": 2.2008, - "step": 221400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 2.30553936958313, - "learning_rate": 1.6421808268801235e-05, - "loss": 2.2029, - "step": 221500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 2.1158080101013184, - "learning_rate": 1.6401015329622233e-05, - "loss": 2.1912, - "step": 221600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 2.136540412902832, - "learning_rate": 1.6380229134811232e-05, - "loss": 2.2066, - "step": 221700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 2.0367746353149414, - "learning_rate": 1.6359449700671307e-05, - "loss": 2.2027, - "step": 221800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 2.1502268314361572, - "learning_rate": 1.6338677043500197e-05, - "loss": 2.2027, - "step": 221900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 2.2150540351867676, - "learning_rate": 1.6317911179590346e-05, - "loss": 2.207, - "step": 222000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.16145920753479, - "eval_runtime": 51.444, - "eval_samples_per_second": 198.157, - "eval_steps_per_second": 1.555, - "step": 222000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 2.327277183532715, - "learning_rate": 1.629715212522887e-05, - "loss": 2.2025, - "step": 222100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 2.240081548690796, - "learning_rate": 1.627639989669754e-05, - "loss": 2.2018, - "step": 222200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 2.3731963634490967, - "learning_rate": 1.6255654510272778e-05, - "loss": 2.2009, - "step": 222300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 2.1497604846954346, - "learning_rate": 1.623491598222563e-05, - "loss": 2.1973, - "step": 222400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 2.194458246231079, - "learning_rate": 1.621418432882176e-05, - "loss": 2.2045, - "step": 222500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 2.1718227863311768, - "learning_rate": 1.6193459566321456e-05, - "loss": 2.1977, - "step": 222600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 2.2664620876312256, - "learning_rate": 1.6172741710979606e-05, - "loss": 2.2011, - "step": 222700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 2.388573169708252, - "learning_rate": 1.6152030779045647e-05, - "loss": 2.1984, - "step": 222800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 2.1636369228363037, - "learning_rate": 1.6131326786763616e-05, - "loss": 2.2017, - "step": 222900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 2.3732447624206543, - "learning_rate": 1.6110629750372096e-05, - "loss": 2.1938, - "step": 223000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.170623779296875, - "eval_runtime": 51.4801, - "eval_samples_per_second": 198.018, - "eval_steps_per_second": 1.554, - "step": 223000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 2.167587995529175, - "learning_rate": 1.608993968610423e-05, - "loss": 2.191, - "step": 223100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 2.159860849380493, - "learning_rate": 1.6069256610187656e-05, - "loss": 2.2105, - "step": 223200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 2.154714822769165, - "learning_rate": 1.6048580538844566e-05, - "loss": 2.1955, - "step": 223300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 2.1291658878326416, - "learning_rate": 1.602791148829164e-05, - "loss": 2.2017, - "step": 223400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 2.1027395725250244, - "learning_rate": 1.600724947474008e-05, - "loss": 2.1981, - "step": 223500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 2.206848621368408, - "learning_rate": 1.5986594514395513e-05, - "loss": 2.1952, - "step": 223600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 2.2017011642456055, - "learning_rate": 1.5965946623458084e-05, - "loss": 2.2008, - "step": 223700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 2.31180477142334, - "learning_rate": 1.5945305818122376e-05, - "loss": 2.1875, - "step": 223800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 2.226900577545166, - "learning_rate": 1.5924672114577422e-05, - "loss": 2.1909, - "step": 223900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 2.177281618118286, - "learning_rate": 1.5904045529006657e-05, - "loss": 2.1933, - "step": 224000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.158267021179199, - "eval_runtime": 51.4171, - "eval_samples_per_second": 198.261, - "eval_steps_per_second": 1.556, - "step": 224000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 2.1759471893310547, - "learning_rate": 1.588342607758797e-05, - "loss": 2.1969, - "step": 224100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 2.1845242977142334, - "learning_rate": 1.586281377649364e-05, - "loss": 2.2041, - "step": 224200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 2.3617475032806396, - "learning_rate": 1.5842208641890337e-05, - "loss": 2.1873, - "step": 224300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 2.091614007949829, - "learning_rate": 1.5821610689939105e-05, - "loss": 2.1918, - "step": 224400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 2.2906229496002197, - "learning_rate": 1.580101993679535e-05, - "loss": 2.1975, - "step": 224500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 2.089142084121704, - "learning_rate": 1.5780436398608854e-05, - "loss": 2.2017, - "step": 224600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 2.2736806869506836, - "learning_rate": 1.575986009152373e-05, - "loss": 2.1857, - "step": 224700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 2.1917905807495117, - "learning_rate": 1.5739291031678404e-05, - "loss": 2.1903, - "step": 224800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 2.207611322402954, - "learning_rate": 1.5718729235205642e-05, - "loss": 2.1948, - "step": 224900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 2.3215441703796387, - "learning_rate": 1.5698174718232494e-05, - "loss": 2.192, - "step": 225000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.1532270908355713, - "eval_runtime": 51.4641, - "eval_samples_per_second": 198.08, - "eval_steps_per_second": 1.554, - "step": 225000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 2.1780614852905273, - "learning_rate": 1.567762749688031e-05, - "loss": 2.1826, - "step": 225100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 2.1773393154144287, - "learning_rate": 1.5657087587264724e-05, - "loss": 2.187, - "step": 225200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 2.1740593910217285, - "learning_rate": 1.5636555005495616e-05, - "loss": 2.186, - "step": 225300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 2.338139295578003, - "learning_rate": 1.561602976767713e-05, - "loss": 2.1901, - "step": 225400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 2.3076512813568115, - "learning_rate": 1.5595511889907665e-05, - "loss": 2.1911, - "step": 225500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 2.286112070083618, - "learning_rate": 1.557500138827982e-05, - "loss": 2.1823, - "step": 225600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 2.1310651302337646, - "learning_rate": 1.5554498278880424e-05, - "loss": 2.1904, - "step": 225700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 2.149794578552246, - "learning_rate": 1.5534002577790497e-05, - "loss": 2.1857, - "step": 225800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 2.250833511352539, - "learning_rate": 1.5513514301085266e-05, - "loss": 2.1748, - "step": 225900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 2.2140324115753174, - "learning_rate": 1.5493033464834133e-05, - "loss": 2.1891, - "step": 226000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.149634838104248, - "eval_runtime": 51.5665, - "eval_samples_per_second": 197.687, - "eval_steps_per_second": 1.551, - "step": 226000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 2.228729009628296, - "learning_rate": 1.547256008510064e-05, - "loss": 2.1815, - "step": 226100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 2.263529062271118, - "learning_rate": 1.545209417794251e-05, - "loss": 2.2412, - "step": 226200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 2.239266872406006, - "learning_rate": 1.5431635759411582e-05, - "loss": 2.3094, - "step": 226300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 2.179316997528076, - "learning_rate": 1.541118484555385e-05, - "loss": 2.2971, - "step": 226400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 2.152000665664673, - "learning_rate": 1.539074145240938e-05, - "loss": 2.3019, - "step": 226500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 2.2889840602874756, - "learning_rate": 1.5370305596012376e-05, - "loss": 2.284, - "step": 226600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 2.195444345474243, - "learning_rate": 1.5349877292391122e-05, - "loss": 2.2919, - "step": 226700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 2.3559839725494385, - "learning_rate": 1.5329456557567978e-05, - "loss": 2.2882, - "step": 226800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 2.2163028717041016, - "learning_rate": 1.5309043407559345e-05, - "loss": 2.2731, - "step": 226900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 2.3102822303771973, - "learning_rate": 1.5288637858375714e-05, - "loss": 2.2873, - "step": 227000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.1502978801727295, - "eval_runtime": 51.5237, - "eval_samples_per_second": 197.851, - "eval_steps_per_second": 1.553, - "step": 227000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 2.150144577026367, - "learning_rate": 1.5268239926021576e-05, - "loss": 2.2731, - "step": 227100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 2.355604410171509, - "learning_rate": 1.5247849626495492e-05, - "loss": 2.2814, - "step": 227200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 2.2507338523864746, - "learning_rate": 1.5227466975789987e-05, - "loss": 2.2773, - "step": 227300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 2.3993356227874756, - "learning_rate": 1.5207091989891617e-05, - "loss": 2.275, - "step": 227400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 2.2218728065490723, - "learning_rate": 1.5186724684780929e-05, - "loss": 2.29, - "step": 227500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 2.109447717666626, - "learning_rate": 1.5166365076432432e-05, - "loss": 2.2635, - "step": 227600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 2.2415287494659424, - "learning_rate": 1.51460131808146e-05, - "loss": 2.2773, - "step": 227700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 2.3350560665130615, - "learning_rate": 1.5125669013889861e-05, - "loss": 2.2789, - "step": 227800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 2.2049736976623535, - "learning_rate": 1.5105332591614585e-05, - "loss": 2.2747, - "step": 227900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 2.2645366191864014, - "learning_rate": 1.5085003929939067e-05, - "loss": 2.2662, - "step": 228000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.140353202819824, - "eval_runtime": 51.6063, - "eval_samples_per_second": 197.534, - "eval_steps_per_second": 1.55, - "step": 228000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 2.245758295059204, - "learning_rate": 1.5064683044807504e-05, - "loss": 2.2559, - "step": 228100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 2.1644320487976074, - "learning_rate": 1.5044369952158e-05, - "loss": 2.2621, - "step": 228200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 2.24301815032959, - "learning_rate": 1.5024064667922563e-05, - "loss": 2.2643, - "step": 228300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 2.1599223613739014, - "learning_rate": 1.5003767208027048e-05, - "loss": 2.2675, - "step": 228400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 2.279449701309204, - "learning_rate": 1.4983477588391203e-05, - "loss": 2.2637, - "step": 228500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 2.155567169189453, - "learning_rate": 1.4963195824928595e-05, - "loss": 2.2511, - "step": 228600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 2.1678829193115234, - "learning_rate": 1.4942921933546653e-05, - "loss": 2.2637, - "step": 228700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 2.173006772994995, - "learning_rate": 1.4922655930146628e-05, - "loss": 2.2565, - "step": 228800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 2.268568992614746, - "learning_rate": 1.4902397830623583e-05, - "loss": 2.267, - "step": 228900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 2.140665292739868, - "learning_rate": 1.488214765086637e-05, - "loss": 2.2609, - "step": 229000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.1331050395965576, - "eval_runtime": 51.4755, - "eval_samples_per_second": 198.036, - "eval_steps_per_second": 1.554, - "step": 229000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 2.2197189331054688, - "learning_rate": 1.4861905406757642e-05, - "loss": 2.2665, - "step": 229100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 2.3529903888702393, - "learning_rate": 1.4841671114173825e-05, - "loss": 2.2607, - "step": 229200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 2.280348300933838, - "learning_rate": 1.4821444788985119e-05, - "loss": 2.2596, - "step": 229300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 2.2226016521453857, - "learning_rate": 1.4801226447055449e-05, - "loss": 2.259, - "step": 229400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 2.238063335418701, - "learning_rate": 1.4781016104242502e-05, - "loss": 2.2592, - "step": 229500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 2.200965642929077, - "learning_rate": 1.476081377639768e-05, - "loss": 2.255, - "step": 229600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 2.0392613410949707, - "learning_rate": 1.4740619479366114e-05, - "loss": 2.2506, - "step": 229700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 2.3026771545410156, - "learning_rate": 1.47204332289866e-05, - "loss": 2.2568, - "step": 229800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 2.056729555130005, - "learning_rate": 1.4700255041091663e-05, - "loss": 2.2553, - "step": 229900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 2.2352523803710938, - "learning_rate": 1.4680084931507482e-05, - "loss": 2.2435, - "step": 230000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.144761085510254, - "eval_runtime": 52.5572, - "eval_samples_per_second": 193.96, - "eval_steps_per_second": 1.522, - "step": 230000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 2.1020162105560303, - "learning_rate": 1.4659922916053925e-05, - "loss": 2.2631, - "step": 230100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 2.259777545928955, - "learning_rate": 1.4639769010544466e-05, - "loss": 2.2601, - "step": 230200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 2.2175509929656982, - "learning_rate": 1.4619623230786262e-05, - "loss": 2.249, - "step": 230300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 2.2818946838378906, - "learning_rate": 1.459948559258007e-05, - "loss": 2.2602, - "step": 230400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 2.2155439853668213, - "learning_rate": 1.4579356111720282e-05, - "loss": 2.2534, - "step": 230500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 2.2038073539733887, - "learning_rate": 1.455923480399488e-05, - "loss": 2.2556, - "step": 230600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 2.2248752117156982, - "learning_rate": 1.4539121685185426e-05, - "loss": 2.2457, - "step": 230700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 2.232311487197876, - "learning_rate": 1.4519016771067073e-05, - "loss": 2.2528, - "step": 230800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 2.3795855045318604, - "learning_rate": 1.4498920077408551e-05, - "loss": 2.2463, - "step": 230900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 2.296515941619873, - "learning_rate": 1.4478831619972107e-05, - "loss": 2.2423, - "step": 231000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.142361640930176, - "eval_runtime": 51.7261, - "eval_samples_per_second": 197.077, - "eval_steps_per_second": 1.547, - "step": 231000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 2.3775389194488525, - "learning_rate": 1.445875141451356e-05, - "loss": 2.2486, - "step": 231100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 2.1756324768066406, - "learning_rate": 1.4438679476782241e-05, - "loss": 2.2403, - "step": 231200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 2.409360408782959, - "learning_rate": 1.4418615822521009e-05, - "loss": 2.2332, - "step": 231300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 2.2292256355285645, - "learning_rate": 1.4398560467466218e-05, - "loss": 2.2484, - "step": 231400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 2.317793369293213, - "learning_rate": 1.43785134273477e-05, - "loss": 2.2578, - "step": 231500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 2.13801908493042, - "learning_rate": 1.4358474717888787e-05, - "loss": 2.2562, - "step": 231600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 2.161449432373047, - "learning_rate": 1.4338444354806269e-05, - "loss": 2.2434, - "step": 231700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 2.174821376800537, - "learning_rate": 1.4318422353810395e-05, - "loss": 2.2448, - "step": 231800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 2.315488338470459, - "learning_rate": 1.4298408730604845e-05, - "loss": 2.2507, - "step": 231900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 2.229074478149414, - "learning_rate": 1.4278403500886716e-05, - "loss": 2.2469, - "step": 232000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.145362377166748, - "eval_runtime": 51.5487, - "eval_samples_per_second": 197.755, - "eval_steps_per_second": 1.552, - "step": 232000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 2.3216850757598877, - "learning_rate": 1.4258406680346559e-05, - "loss": 2.2483, - "step": 232100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 2.247835874557495, - "learning_rate": 1.4238418284668309e-05, - "loss": 2.229, - "step": 232200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 2.2360994815826416, - "learning_rate": 1.4218438329529276e-05, - "loss": 2.2504, - "step": 232300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 2.2599055767059326, - "learning_rate": 1.4198466830600183e-05, - "loss": 2.238, - "step": 232400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 2.299938440322876, - "learning_rate": 1.4178503803545096e-05, - "loss": 2.2389, - "step": 232500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 2.140632390975952, - "learning_rate": 1.415854926402146e-05, - "loss": 2.2454, - "step": 232600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 2.2972567081451416, - "learning_rate": 1.4138603227680026e-05, - "loss": 2.2421, - "step": 232700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 2.119060516357422, - "learning_rate": 1.4118665710164908e-05, - "loss": 2.25, - "step": 232800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 2.258012533187866, - "learning_rate": 1.4098736727113529e-05, - "loss": 2.2384, - "step": 232900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 2.2731425762176514, - "learning_rate": 1.4078816294156626e-05, - "loss": 2.2315, - "step": 233000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.1522159576416016, - "eval_runtime": 51.7751, - "eval_samples_per_second": 196.89, - "eval_steps_per_second": 1.545, - "step": 233000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 2.2772982120513916, - "learning_rate": 1.405890442691821e-05, - "loss": 2.2507, - "step": 233100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 2.1224279403686523, - "learning_rate": 1.4039001141015595e-05, - "loss": 2.252, - "step": 233200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 2.3541483879089355, - "learning_rate": 1.4019106452059338e-05, - "loss": 2.2445, - "step": 233300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 2.249394416809082, - "learning_rate": 1.399922037565329e-05, - "loss": 2.2282, - "step": 233400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 2.2116713523864746, - "learning_rate": 1.3979342927394509e-05, - "loss": 2.2295, - "step": 233500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 2.350203275680542, - "learning_rate": 1.3959474122873311e-05, - "loss": 2.2294, - "step": 233600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 2.2494752407073975, - "learning_rate": 1.3939613977673227e-05, - "loss": 2.2258, - "step": 233700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 2.126502513885498, - "learning_rate": 1.3919762507371007e-05, - "loss": 2.2066, - "step": 233800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 2.2300145626068115, - "learning_rate": 1.3899919727536559e-05, - "loss": 2.2102, - "step": 233900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 2.1797988414764404, - "learning_rate": 1.3880085653733014e-05, - "loss": 2.2193, - "step": 234000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.137155532836914, - "eval_runtime": 51.6878, - "eval_samples_per_second": 197.222, - "eval_steps_per_second": 1.548, - "step": 234000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 2.1567306518554688, - "learning_rate": 1.3860260301516659e-05, - "loss": 2.2073, - "step": 234100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 2.185314178466797, - "learning_rate": 1.3840443686436949e-05, - "loss": 2.2035, - "step": 234200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 2.074904203414917, - "learning_rate": 1.3820635824036482e-05, - "loss": 2.2055, - "step": 234300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.9893766641616821, - "learning_rate": 1.3800836729850972e-05, - "loss": 2.2006, - "step": 234400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 2.0517656803131104, - "learning_rate": 1.3781046419409294e-05, - "loss": 2.2047, - "step": 234500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 2.0877463817596436, - "learning_rate": 1.3761264908233395e-05, - "loss": 2.2147, - "step": 234600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 2.173692464828491, - "learning_rate": 1.3741492211838353e-05, - "loss": 2.2037, - "step": 234700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 2.1288397312164307, - "learning_rate": 1.3721728345732299e-05, - "loss": 2.2081, - "step": 234800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 2.2528440952301025, - "learning_rate": 1.370197332541647e-05, - "loss": 2.2004, - "step": 234900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 2.060171127319336, - "learning_rate": 1.3682227166385148e-05, - "loss": 2.1902, - "step": 235000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.1390364170074463, - "eval_runtime": 51.7756, - "eval_samples_per_second": 196.888, - "eval_steps_per_second": 1.545, - "step": 235000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 2.295802593231201, - "learning_rate": 1.3662489884125684e-05, - "loss": 2.199, - "step": 235100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 2.1694698333740234, - "learning_rate": 1.3642761494118426e-05, - "loss": 2.1802, - "step": 235200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 2.199690818786621, - "learning_rate": 1.3623042011836784e-05, - "loss": 2.2079, - "step": 235300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 2.1490964889526367, - "learning_rate": 1.3603331452747176e-05, - "loss": 2.1914, - "step": 235400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 2.0006728172302246, - "learning_rate": 1.358362983230902e-05, - "loss": 2.188, - "step": 235500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 2.1562349796295166, - "learning_rate": 1.35639371659747e-05, - "loss": 2.1896, - "step": 235600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 2.129549980163574, - "learning_rate": 1.354425346918961e-05, - "loss": 2.1935, - "step": 235700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 2.2000486850738525, - "learning_rate": 1.3524578757392103e-05, - "loss": 2.1936, - "step": 235800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 2.061960220336914, - "learning_rate": 1.3504913046013456e-05, - "loss": 2.1902, - "step": 235900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 2.2487034797668457, - "learning_rate": 1.3485256350477931e-05, - "loss": 2.1836, - "step": 236000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.1369080543518066, - "eval_runtime": 51.9173, - "eval_samples_per_second": 196.351, - "eval_steps_per_second": 1.541, - "step": 236000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 2.135773181915283, - "learning_rate": 1.3465608686202672e-05, - "loss": 2.1847, - "step": 236100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 2.2150774002075195, - "learning_rate": 1.3445970068597774e-05, - "loss": 2.193, - "step": 236200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 2.102975368499756, - "learning_rate": 1.342634051306624e-05, - "loss": 2.1916, - "step": 236300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 2.2181150913238525, - "learning_rate": 1.3406720035003928e-05, - "loss": 2.1875, - "step": 236400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 2.1293530464172363, - "learning_rate": 1.3387108649799607e-05, - "loss": 2.1907, - "step": 236500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 2.229583501815796, - "learning_rate": 1.3367506372834915e-05, - "loss": 2.1913, - "step": 236600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 2.1583094596862793, - "learning_rate": 1.3347913219484336e-05, - "loss": 2.1895, - "step": 236700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 2.043151617050171, - "learning_rate": 1.3328329205115191e-05, - "loss": 2.1967, - "step": 236800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 2.052990436553955, - "learning_rate": 1.3308754345087646e-05, - "loss": 2.1919, - "step": 236900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 2.242903232574463, - "learning_rate": 1.3289188654754686e-05, - "loss": 2.1793, - "step": 237000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.135045051574707, - "eval_runtime": 51.9719, - "eval_samples_per_second": 196.144, - "eval_steps_per_second": 1.539, - "step": 237000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 2.035278081893921, - "learning_rate": 1.3269632149462111e-05, - "loss": 2.1832, - "step": 237100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 2.1866884231567383, - "learning_rate": 1.3250084844548488e-05, - "loss": 2.2013, - "step": 237200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 2.1690127849578857, - "learning_rate": 1.3230546755345202e-05, - "loss": 2.1791, - "step": 237300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 2.116481304168701, - "learning_rate": 1.3211017897176384e-05, - "loss": 2.1849, - "step": 237400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 2.1610612869262695, - "learning_rate": 1.3191498285358939e-05, - "loss": 2.18, - "step": 237500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 2.0972769260406494, - "learning_rate": 1.317198793520253e-05, - "loss": 2.1791, - "step": 237600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 2.1171815395355225, - "learning_rate": 1.3152486862009521e-05, - "loss": 2.1865, - "step": 237700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 2.1703948974609375, - "learning_rate": 1.3132995081075038e-05, - "loss": 2.1841, - "step": 237800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 2.2092208862304688, - "learning_rate": 1.3113512607686895e-05, - "loss": 2.1961, - "step": 237900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 2.2685964107513428, - "learning_rate": 1.3094039457125623e-05, - "loss": 2.1812, - "step": 238000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.133305549621582, - "eval_runtime": 51.8888, - "eval_samples_per_second": 196.459, - "eval_steps_per_second": 1.542, - "step": 238000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 2.251767873764038, - "learning_rate": 1.307457564466442e-05, - "loss": 2.1865, - "step": 238100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 2.094947576522827, - "learning_rate": 1.3055121185569171e-05, - "loss": 2.1822, - "step": 238200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 2.155735731124878, - "learning_rate": 1.3035676095098434e-05, - "loss": 2.1836, - "step": 238300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 2.0563013553619385, - "learning_rate": 1.3016240388503415e-05, - "loss": 2.1685, - "step": 238400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 2.171740770339966, - "learning_rate": 1.2996814081027936e-05, - "loss": 2.1751, - "step": 238500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 2.242612361907959, - "learning_rate": 1.2977397187908492e-05, - "loss": 2.182, - "step": 238600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 2.174983263015747, - "learning_rate": 1.2957989724374137e-05, - "loss": 2.1864, - "step": 238700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 2.335228204727173, - "learning_rate": 1.2938591705646591e-05, - "loss": 2.1797, - "step": 238800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 2.131338357925415, - "learning_rate": 1.2919203146940113e-05, - "loss": 2.1832, - "step": 238900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 2.1694304943084717, - "learning_rate": 1.2899824063461574e-05, - "loss": 2.1738, - "step": 239000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.1172709465026855, - "eval_runtime": 52.0193, - "eval_samples_per_second": 195.966, - "eval_steps_per_second": 1.538, - "step": 239000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 2.2490172386169434, - "learning_rate": 1.2880454470410405e-05, - "loss": 2.1752, - "step": 239100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 2.0559821128845215, - "learning_rate": 1.2861094382978603e-05, - "loss": 2.1812, - "step": 239200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 2.3845577239990234, - "learning_rate": 1.284174381635068e-05, - "loss": 2.1728, - "step": 239300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 2.1738946437835693, - "learning_rate": 1.2822402785703708e-05, - "loss": 2.1659, - "step": 239400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 2.2159125804901123, - "learning_rate": 1.2803071306207276e-05, - "loss": 2.1672, - "step": 239500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 2.1570019721984863, - "learning_rate": 1.2783749393023486e-05, - "loss": 2.1808, - "step": 239600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 2.138986110687256, - "learning_rate": 1.2764437061306909e-05, - "loss": 2.1742, - "step": 239700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 2.287302017211914, - "learning_rate": 1.2745134326204638e-05, - "loss": 2.1791, - "step": 239800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 2.1529247760772705, - "learning_rate": 1.2725841202856203e-05, - "loss": 2.1742, - "step": 239900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 2.148305892944336, - "learning_rate": 1.270655770639364e-05, - "loss": 2.1799, - "step": 240000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.1318085193634033, - "eval_runtime": 51.882, - "eval_samples_per_second": 196.484, - "eval_steps_per_second": 1.542, - "step": 240000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 2.213149070739746, - "learning_rate": 1.2687283851941381e-05, - "loss": 2.1723, - "step": 240100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 2.2811832427978516, - "learning_rate": 1.2668019654616337e-05, - "loss": 2.176, - "step": 240200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 2.0813376903533936, - "learning_rate": 1.2648765129527829e-05, - "loss": 2.1701, - "step": 240300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 2.146592617034912, - "learning_rate": 1.2629520291777597e-05, - "loss": 2.1738, - "step": 240400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 2.2072625160217285, - "learning_rate": 1.2610285156459783e-05, - "loss": 2.1762, - "step": 240500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 2.159395694732666, - "learning_rate": 1.2591059738660904e-05, - "loss": 2.1626, - "step": 240600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 2.3325672149658203, - "learning_rate": 1.2571844053459875e-05, - "loss": 2.169, - "step": 240700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 2.2384254932403564, - "learning_rate": 1.2552638115927966e-05, - "loss": 2.1681, - "step": 240800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 2.148493766784668, - "learning_rate": 1.253344194112882e-05, - "loss": 2.1672, - "step": 240900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 2.0037782192230225, - "learning_rate": 1.2514255544118387e-05, - "loss": 2.1678, - "step": 241000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.127737045288086, - "eval_runtime": 51.9867, - "eval_samples_per_second": 196.089, - "eval_steps_per_second": 1.539, - "step": 241000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 2.1538326740264893, - "learning_rate": 1.2495078939944987e-05, - "loss": 2.1763, - "step": 241100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 2.2411739826202393, - "learning_rate": 1.2475912143649224e-05, - "loss": 2.1725, - "step": 241200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 2.161746025085449, - "learning_rate": 1.2456755170264047e-05, - "loss": 2.1658, - "step": 241300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 2.173823595046997, - "learning_rate": 1.2437608034814663e-05, - "loss": 2.1476, - "step": 241400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 2.1510214805603027, - "learning_rate": 1.2418470752318586e-05, - "loss": 2.1408, - "step": 241500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 2.1306464672088623, - "learning_rate": 1.2399343337785602e-05, - "loss": 2.1489, - "step": 241600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 2.0896074771881104, - "learning_rate": 1.2380225806217757e-05, - "loss": 2.1384, - "step": 241700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 2.154553174972534, - "learning_rate": 1.2361118172609326e-05, - "loss": 2.1427, - "step": 241800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 2.235761880874634, - "learning_rate": 1.2342020451946843e-05, - "loss": 2.1545, - "step": 241900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 2.165769100189209, - "learning_rate": 1.2322932659209057e-05, - "loss": 2.1387, - "step": 242000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.122012138366699, - "eval_runtime": 52.0197, - "eval_samples_per_second": 195.964, - "eval_steps_per_second": 1.538, - "step": 242000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 2.312960624694824, - "learning_rate": 1.2303854809366949e-05, - "loss": 2.1314, - "step": 242100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 2.210526943206787, - "learning_rate": 1.2284786917383661e-05, - "loss": 2.1418, - "step": 242200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 2.0691890716552734, - "learning_rate": 1.2265728998214562e-05, - "loss": 2.1397, - "step": 242300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 2.160215377807617, - "learning_rate": 1.2246681066807195e-05, - "loss": 2.136, - "step": 242400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 2.01823353767395, - "learning_rate": 1.2227643138101242e-05, - "loss": 2.136, - "step": 242500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 2.073930501937866, - "learning_rate": 1.2208615227028577e-05, - "loss": 2.1447, - "step": 242600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 2.1365602016448975, - "learning_rate": 1.2189597348513183e-05, - "loss": 2.1365, - "step": 242700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 2.1082065105438232, - "learning_rate": 1.2170589517471193e-05, - "loss": 2.1502, - "step": 242800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 2.0763580799102783, - "learning_rate": 1.215159174881087e-05, - "loss": 2.1318, - "step": 242900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 2.2010183334350586, - "learning_rate": 1.2132604057432551e-05, - "loss": 2.14, - "step": 243000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.1260664463043213, - "eval_runtime": 51.6429, - "eval_samples_per_second": 197.394, - "eval_steps_per_second": 1.549, - "step": 243000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 2.1431691646575928, - "learning_rate": 1.21136264582287e-05, - "loss": 2.1348, - "step": 243100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 2.1176159381866455, - "learning_rate": 1.2094658966083853e-05, - "loss": 2.141, - "step": 243200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 2.1691203117370605, - "learning_rate": 1.207570159587463e-05, - "loss": 2.1408, - "step": 243300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 2.2299644947052, - "learning_rate": 1.2056754362469688e-05, - "loss": 2.1231, - "step": 243400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 2.0889995098114014, - "learning_rate": 1.2037817280729755e-05, - "loss": 2.1263, - "step": 243500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 2.0144128799438477, - "learning_rate": 1.2018890365507587e-05, - "loss": 2.1297, - "step": 243600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 2.1843581199645996, - "learning_rate": 1.1999973631647984e-05, - "loss": 2.1339, - "step": 243700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 2.199553966522217, - "learning_rate": 1.1981067093987724e-05, - "loss": 2.1246, - "step": 243800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 2.118969440460205, - "learning_rate": 1.1962170767355633e-05, - "loss": 2.1357, - "step": 243900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 2.2393405437469482, - "learning_rate": 1.1943284666572479e-05, - "loss": 2.1312, - "step": 244000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.127676486968994, - "eval_runtime": 51.305, - "eval_samples_per_second": 198.694, - "eval_steps_per_second": 1.559, - "step": 244000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 2.1407392024993896, - "learning_rate": 1.192440880645105e-05, - "loss": 2.1227, - "step": 244100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 2.188948154449463, - "learning_rate": 1.1905543201796097e-05, - "loss": 2.1432, - "step": 244200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 2.155181407928467, - "learning_rate": 1.1886687867404295e-05, - "loss": 2.1286, - "step": 244300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 2.2504708766937256, - "learning_rate": 1.1867842818064304e-05, - "loss": 2.132, - "step": 244400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 2.1262989044189453, - "learning_rate": 1.1849008068556692e-05, - "loss": 2.1373, - "step": 244500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 2.098700523376465, - "learning_rate": 1.1830183633653971e-05, - "loss": 2.1201, - "step": 244600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 2.1884560585021973, - "learning_rate": 1.181136952812053e-05, - "loss": 2.1291, - "step": 244700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 2.124889850616455, - "learning_rate": 1.1792565766712684e-05, - "loss": 2.127, - "step": 244800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 2.2935962677001953, - "learning_rate": 1.1773772364178626e-05, - "loss": 2.1384, - "step": 244900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 2.247445821762085, - "learning_rate": 1.1754989335258432e-05, - "loss": 2.1298, - "step": 245000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.123136520385742, - "eval_runtime": 51.1588, - "eval_samples_per_second": 199.262, - "eval_steps_per_second": 1.564, - "step": 245000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 2.1179420948028564, - "learning_rate": 1.1736216694684019e-05, - "loss": 2.1381, - "step": 245100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 2.1974310874938965, - "learning_rate": 1.1717454457179186e-05, - "loss": 2.1324, - "step": 245200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 2.1987204551696777, - "learning_rate": 1.1698702637459543e-05, - "loss": 2.1341, - "step": 245300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 2.1850931644439697, - "learning_rate": 1.167996125023256e-05, - "loss": 2.1222, - "step": 245400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 2.0711605548858643, - "learning_rate": 1.1661230310197494e-05, - "loss": 2.1305, - "step": 245500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 2.1303722858428955, - "learning_rate": 1.1642509832045428e-05, - "loss": 2.1223, - "step": 245600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 2.0326571464538574, - "learning_rate": 1.1623799830459236e-05, - "loss": 2.1458, - "step": 245700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 2.179598331451416, - "learning_rate": 1.1605100320113585e-05, - "loss": 2.1191, - "step": 245800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 2.1782078742980957, - "learning_rate": 1.158641131567488e-05, - "loss": 2.1253, - "step": 245900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 2.117875099182129, - "learning_rate": 1.1567732831801316e-05, - "loss": 2.1177, - "step": 246000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.1340432167053223, - "eval_runtime": 51.3485, - "eval_samples_per_second": 198.526, - "eval_steps_per_second": 1.558, - "step": 246000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 2.203430414199829, - "learning_rate": 1.1549064883142832e-05, - "loss": 2.1243, - "step": 246100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 2.1262495517730713, - "learning_rate": 1.1530407484341108e-05, - "loss": 2.1271, - "step": 246200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 2.0860841274261475, - "learning_rate": 1.151176065002952e-05, - "loss": 2.1139, - "step": 246300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 2.167961835861206, - "learning_rate": 1.1493124394833196e-05, - "loss": 2.1299, - "step": 246400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 2.213210344314575, - "learning_rate": 1.1474498733368957e-05, - "loss": 2.1187, - "step": 246500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 2.1387743949890137, - "learning_rate": 1.1455883680245285e-05, - "loss": 2.1185, - "step": 246600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 2.21766996383667, - "learning_rate": 1.143727925006239e-05, - "loss": 2.1257, - "step": 246700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 2.1220784187316895, - "learning_rate": 1.1418685457412103e-05, - "loss": 2.1358, - "step": 246800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 2.175532341003418, - "learning_rate": 1.1400102316877948e-05, - "loss": 2.1127, - "step": 246900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 2.1164164543151855, - "learning_rate": 1.1381529843035077e-05, - "loss": 2.1382, - "step": 247000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.1271872520446777, - "eval_runtime": 51.251, - "eval_samples_per_second": 198.903, - "eval_steps_per_second": 1.561, - "step": 247000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 2.076045036315918, - "learning_rate": 1.1362968050450287e-05, - "loss": 2.1156, - "step": 247100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 2.122551679611206, - "learning_rate": 1.1344416953681974e-05, - "loss": 2.1248, - "step": 247200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 2.206083059310913, - "learning_rate": 1.132587656728017e-05, - "loss": 2.1265, - "step": 247300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 2.175978660583496, - "learning_rate": 1.1307346905786498e-05, - "loss": 2.1175, - "step": 247400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 2.058006525039673, - "learning_rate": 1.1288827983734173e-05, - "loss": 2.1116, - "step": 247500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 2.055121660232544, - "learning_rate": 1.1270319815647972e-05, - "loss": 2.1198, - "step": 247600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 2.154327392578125, - "learning_rate": 1.1251822416044252e-05, - "loss": 2.131, - "step": 247700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 2.2314298152923584, - "learning_rate": 1.1233335799430933e-05, - "loss": 2.119, - "step": 247800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 2.142615795135498, - "learning_rate": 1.1214859980307448e-05, - "loss": 2.1223, - "step": 247900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 2.1668100357055664, - "learning_rate": 1.1196394973164778e-05, - "loss": 2.1211, - "step": 248000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.1204113960266113, - "eval_runtime": 51.2734, - "eval_samples_per_second": 198.816, - "eval_steps_per_second": 1.56, - "step": 248000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 2.069814443588257, - "learning_rate": 1.1177940792485428e-05, - "loss": 2.1099, - "step": 248100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 2.072711229324341, - "learning_rate": 1.1159497452743409e-05, - "loss": 2.1382, - "step": 248200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 2.114495038986206, - "learning_rate": 1.1141064968404236e-05, - "loss": 2.243, - "step": 248300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.239190936088562, - "learning_rate": 1.112264335392488e-05, - "loss": 2.2247, - "step": 248400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 2.097609519958496, - "learning_rate": 1.1104232623753824e-05, - "loss": 2.214, - "step": 248500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.0227768421173096, - "learning_rate": 1.1085832792330996e-05, - "loss": 2.2084, - "step": 248600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 2.114027738571167, - "learning_rate": 1.1067443874087785e-05, - "loss": 2.1914, - "step": 248700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.0278912782669067, - "learning_rate": 1.1049065883446999e-05, - "loss": 2.2196, - "step": 248800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 2.1365833282470703, - "learning_rate": 1.1030698834822895e-05, - "loss": 2.2016, - "step": 248900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 2.083474636077881, - "learning_rate": 1.1012342742621145e-05, - "loss": 2.2319, - "step": 249000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.1132681369781494, - "eval_runtime": 51.5743, - "eval_samples_per_second": 197.657, - "eval_steps_per_second": 1.551, - "step": 249000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.1456485986709595, - "learning_rate": 1.0993997621238836e-05, - "loss": 2.1891, - "step": 249100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 2.1170260906219482, - "learning_rate": 1.097566348506443e-05, - "loss": 2.1947, - "step": 249200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 2.1078946590423584, - "learning_rate": 1.0957340348477771e-05, - "loss": 2.183, - "step": 249300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 1.6463110446929932, - "learning_rate": 1.09390282258501e-05, - "loss": 2.1768, - "step": 249400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.4292881488800049, - "learning_rate": 1.092072713154402e-05, - "loss": 2.1944, - "step": 249500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 2.064131259918213, - "learning_rate": 1.0902437079913447e-05, - "loss": 2.1829, - "step": 249600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 2.1172714233398438, - "learning_rate": 1.088415808530367e-05, - "loss": 2.1588, - "step": 249700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 2.112825393676758, - "learning_rate": 1.08658901620513e-05, - "loss": 2.1718, - "step": 249800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 2.0806221961975098, - "learning_rate": 1.0847633324484261e-05, - "loss": 2.1993, - "step": 249900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.1664646863937378, - "learning_rate": 1.0829387586921785e-05, - "loss": 2.185, - "step": 250000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.100121021270752, - "eval_runtime": 51.6801, - "eval_samples_per_second": 197.252, - "eval_steps_per_second": 1.548, - "step": 250000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 2.2250313758850098, - "learning_rate": 1.0811152963674384e-05, - "loss": 2.1625, - "step": 250100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 2.1046063899993896, - "learning_rate": 1.079292946904387e-05, - "loss": 2.1639, - "step": 250200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.1374410390853882, - "learning_rate": 1.077471711732333e-05, - "loss": 2.1794, - "step": 250300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.7234727144241333, - "learning_rate": 1.075651592279708e-05, - "loss": 2.1634, - "step": 250400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 2.190262794494629, - "learning_rate": 1.0738325899740733e-05, - "loss": 2.1585, - "step": 250500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 2.0651297569274902, - "learning_rate": 1.072014706242109e-05, - "loss": 2.1685, - "step": 250600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 2.1923389434814453, - "learning_rate": 1.0701979425096212e-05, - "loss": 2.1704, - "step": 250700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.0421335697174072, - "learning_rate": 1.0683823002015378e-05, - "loss": 2.156, - "step": 250800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 0.9804383516311646, - "learning_rate": 1.0665677807419038e-05, - "loss": 2.1738, - "step": 250900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 2.0791897773742676, - "learning_rate": 1.0647543855538871e-05, - "loss": 2.1742, - "step": 251000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.099701404571533, - "eval_runtime": 51.5047, - "eval_samples_per_second": 197.924, - "eval_steps_per_second": 1.553, - "step": 251000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.1516467332839966, - "learning_rate": 1.0629421160597724e-05, - "loss": 2.153, - "step": 251100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 2.0791218280792236, - "learning_rate": 1.0611309736809618e-05, - "loss": 2.1469, - "step": 251200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.9377182722091675, - "learning_rate": 1.0593209598379719e-05, - "loss": 2.1601, - "step": 251300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 2.2188608646392822, - "learning_rate": 1.0575120759504362e-05, - "loss": 2.1604, - "step": 251400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.0223076343536377, - "learning_rate": 1.0557043234371006e-05, - "loss": 2.1496, - "step": 251500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 2.205202579498291, - "learning_rate": 1.0538977037158254e-05, - "loss": 2.1409, - "step": 251600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 2.187514543533325, - "learning_rate": 1.0520922182035798e-05, - "loss": 2.1522, - "step": 251700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 2.1280906200408936, - "learning_rate": 1.0502878683164458e-05, - "loss": 2.1501, - "step": 251800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.2866263389587402, - "learning_rate": 1.0484846554696123e-05, - "loss": 2.1556, - "step": 251900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.111025333404541, - "learning_rate": 1.0466825810773796e-05, - "loss": 2.1422, - "step": 252000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.102551221847534, - "eval_runtime": 51.511, - "eval_samples_per_second": 197.899, - "eval_steps_per_second": 1.553, - "step": 252000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 2.0444447994232178, - "learning_rate": 1.0448816465531513e-05, - "loss": 2.147, - "step": 252100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 2.0270159244537354, - "learning_rate": 1.0430818533094403e-05, - "loss": 2.1492, - "step": 252200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 2.180891275405884, - "learning_rate": 1.0412832027578622e-05, - "loss": 2.152, - "step": 252300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.3767441511154175, - "learning_rate": 1.039485696309139e-05, - "loss": 2.1437, - "step": 252400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 2.1430041790008545, - "learning_rate": 1.0376893353730913e-05, - "loss": 2.1453, - "step": 252500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 1.045134425163269, - "learning_rate": 1.0358941213586443e-05, - "loss": 2.1367, - "step": 252600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 2.374004602432251, - "learning_rate": 1.0341000556738229e-05, - "loss": 2.1499, - "step": 252700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 0.9447265863418579, - "learning_rate": 1.0323071397257514e-05, - "loss": 2.1664, - "step": 252800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 0.9592018127441406, - "learning_rate": 1.0305153749206531e-05, - "loss": 2.2281, - "step": 252900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 0.9208105802536011, - "learning_rate": 1.0287247626638455e-05, - "loss": 2.2157, - "step": 253000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.099860429763794, - "eval_runtime": 51.562, - "eval_samples_per_second": 197.704, - "eval_steps_per_second": 1.552, - "step": 253000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 0.9355520606040955, - "learning_rate": 1.0269353043597463e-05, - "loss": 2.218, - "step": 253100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 0.9669928550720215, - "learning_rate": 1.0251470014118641e-05, - "loss": 2.2214, - "step": 253200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 0.9217200875282288, - "learning_rate": 1.0233598552228049e-05, - "loss": 2.1921, - "step": 253300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 0.9365478754043579, - "learning_rate": 1.021573867194264e-05, - "loss": 2.1832, - "step": 253400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 0.952186107635498, - "learning_rate": 1.0197890387270311e-05, - "loss": 2.1899, - "step": 253500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 0.950163722038269, - "learning_rate": 1.0180053712209855e-05, - "loss": 2.1778, - "step": 253600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 0.9093700051307678, - "learning_rate": 1.0162228660750967e-05, - "loss": 2.1641, - "step": 253700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 0.9941834211349487, - "learning_rate": 1.0144415246874198e-05, - "loss": 2.1762, - "step": 253800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 0.9335500001907349, - "learning_rate": 1.0126613484550997e-05, - "loss": 2.1727, - "step": 253900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 0.963545560836792, - "learning_rate": 1.0108823387743674e-05, - "loss": 2.171, - "step": 254000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.112656354904175, - "eval_runtime": 51.6053, - "eval_samples_per_second": 197.538, - "eval_steps_per_second": 1.55, - "step": 254000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 0.9451214075088501, - "learning_rate": 1.0091044970405386e-05, - "loss": 2.1437, - "step": 254100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 0.9729546904563904, - "learning_rate": 1.0073278246480113e-05, - "loss": 2.1442, - "step": 254200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 0.9474550485610962, - "learning_rate": 1.0055523229902686e-05, - "loss": 2.1518, - "step": 254300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 0.984620213508606, - "learning_rate": 1.0037779934598754e-05, - "loss": 2.1314, - "step": 254400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 0.9437354803085327, - "learning_rate": 1.0020048374484745e-05, - "loss": 2.1189, - "step": 254500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 0.9442464113235474, - "learning_rate": 1.0002328563467917e-05, - "loss": 2.123, - "step": 254600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 0.9733272790908813, - "learning_rate": 9.984620515446283e-06, - "loss": 2.1317, - "step": 254700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 0.9501616358757019, - "learning_rate": 9.966924244308656e-06, - "loss": 2.1229, - "step": 254800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 0.9912092089653015, - "learning_rate": 9.949239763934603e-06, - "loss": 2.1353, - "step": 254900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 0.9838760495185852, - "learning_rate": 9.931567088194429e-06, - "loss": 2.1175, - "step": 255000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.1234488487243652, - "eval_runtime": 52.035, - "eval_samples_per_second": 195.907, - "eval_steps_per_second": 1.537, - "step": 255000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 0.9307495951652527, - "learning_rate": 9.913906230949201e-06, - "loss": 2.1272, - "step": 255100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 0.9529586434364319, - "learning_rate": 9.896257206050705e-06, - "loss": 2.1184, - "step": 255200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.0264242887496948, - "learning_rate": 9.87862002734146e-06, - "loss": 2.1008, - "step": 255300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 0.9989575147628784, - "learning_rate": 9.860994708654663e-06, - "loss": 2.0978, - "step": 255400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 0.9874339699745178, - "learning_rate": 9.843381263814242e-06, - "loss": 2.1105, - "step": 255500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 0.8986032605171204, - "learning_rate": 9.8257797066348e-06, - "loss": 2.1097, - "step": 255600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 0.968845546245575, - "learning_rate": 9.808190050921618e-06, - "loss": 2.0813, - "step": 255700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 0.9542651176452637, - "learning_rate": 9.790612310470637e-06, - "loss": 2.1062, - "step": 255800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 0.9598912596702576, - "learning_rate": 9.773046499068447e-06, - "loss": 2.1088, - "step": 255900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 0.9718886017799377, - "learning_rate": 9.755492630492296e-06, - "loss": 2.1028, - "step": 256000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.1331796646118164, - "eval_runtime": 51.757, - "eval_samples_per_second": 196.959, - "eval_steps_per_second": 1.546, - "step": 256000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 0.9607440829277039, - "learning_rate": 9.73795071851006e-06, - "loss": 2.0957, - "step": 256100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 0.9715221524238586, - "learning_rate": 9.720420776880248e-06, - "loss": 2.0837, - "step": 256200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 0.9565144777297974, - "learning_rate": 9.70290281935195e-06, - "loss": 2.0889, - "step": 256300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 0.921954333782196, - "learning_rate": 9.685396859664883e-06, - "loss": 2.0754, - "step": 256400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 0.9448678493499756, - "learning_rate": 9.667902911549348e-06, - "loss": 2.0773, - "step": 256500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 0.9528698325157166, - "learning_rate": 9.650420988726231e-06, - "loss": 2.0708, - "step": 256600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 0.9941433668136597, - "learning_rate": 9.632951104906962e-06, - "loss": 2.0687, - "step": 256700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 0.9805740118026733, - "learning_rate": 9.615493273793555e-06, - "loss": 2.0667, - "step": 256800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.0303276777267456, - "learning_rate": 9.598047509078562e-06, - "loss": 2.072, - "step": 256900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 0.9494913816452026, - "learning_rate": 9.580613824445076e-06, - "loss": 2.0597, - "step": 257000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.1403818130493164, - "eval_runtime": 51.6861, - "eval_samples_per_second": 197.229, - "eval_steps_per_second": 1.548, - "step": 257000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 0.9716615080833435, - "learning_rate": 9.563192233566701e-06, - "loss": 2.0605, - "step": 257100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 0.9704664349555969, - "learning_rate": 9.54578275010756e-06, - "loss": 2.0707, - "step": 257200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 0.981453537940979, - "learning_rate": 9.528385387722285e-06, - "loss": 2.058, - "step": 257300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 0.9496264457702637, - "learning_rate": 9.511000160056016e-06, - "loss": 2.0587, - "step": 257400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 0.9844343662261963, - "learning_rate": 9.493627080744341e-06, - "loss": 2.0689, - "step": 257500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.0004667043685913, - "learning_rate": 9.476266163413345e-06, - "loss": 2.0594, - "step": 257600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.0279428958892822, - "learning_rate": 9.458917421679568e-06, - "loss": 2.055, - "step": 257700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 0.9908036589622498, - "learning_rate": 9.44158086915001e-06, - "loss": 2.0572, - "step": 257800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 0.9733301401138306, - "learning_rate": 9.42425651942208e-06, - "loss": 2.0424, - "step": 257900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 0.98853999376297, - "learning_rate": 9.406944386083652e-06, - "loss": 2.0598, - "step": 258000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.1505215167999268, - "eval_runtime": 51.7324, - "eval_samples_per_second": 197.053, - "eval_steps_per_second": 1.546, - "step": 258000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 0.9915367960929871, - "learning_rate": 9.389644482712997e-06, - "loss": 2.0376, - "step": 258100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 0.9439958930015564, - "learning_rate": 9.372356822878813e-06, - "loss": 2.0403, - "step": 258200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 0.9918619394302368, - "learning_rate": 9.355081420140164e-06, - "loss": 2.0297, - "step": 258300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 0.9755305647850037, - "learning_rate": 9.337818288046535e-06, - "loss": 2.042, - "step": 258400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 0.9416148662567139, - "learning_rate": 9.32056744013775e-06, - "loss": 2.0301, - "step": 258500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 0.9932397603988647, - "learning_rate": 9.303328889944044e-06, - "loss": 2.0513, - "step": 258600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 0.9738903045654297, - "learning_rate": 9.286102650985957e-06, - "loss": 2.0442, - "step": 258700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 0.9329037070274353, - "learning_rate": 9.268888736774408e-06, - "loss": 2.0121, - "step": 258800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 0.9853471517562866, - "learning_rate": 9.251687160810643e-06, - "loss": 2.0339, - "step": 258900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 0.9692677855491638, - "learning_rate": 9.23449793658622e-06, - "loss": 2.0281, - "step": 259000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.1587061882019043, - "eval_runtime": 51.7564, - "eval_samples_per_second": 196.961, - "eval_steps_per_second": 1.546, - "step": 259000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 0.9744108319282532, - "learning_rate": 9.21732107758303e-06, - "loss": 2.0165, - "step": 259100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 0.9786638617515564, - "learning_rate": 9.200156597273235e-06, - "loss": 2.0186, - "step": 259200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 0.9883900880813599, - "learning_rate": 9.183004509119308e-06, - "loss": 2.0363, - "step": 259300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 0.9344263076782227, - "learning_rate": 9.165864826574003e-06, - "loss": 2.0281, - "step": 259400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 0.9897043704986572, - "learning_rate": 9.148737563080348e-06, - "loss": 2.0178, - "step": 259500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.0104656219482422, - "learning_rate": 9.131622732071607e-06, - "loss": 2.0277, - "step": 259600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 0.9919978976249695, - "learning_rate": 9.114520346971324e-06, - "loss": 2.012, - "step": 259700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 0.9721410274505615, - "learning_rate": 9.097430421193254e-06, - "loss": 2.0232, - "step": 259800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 0.9424166083335876, - "learning_rate": 9.080352968141404e-06, - "loss": 2.0118, - "step": 259900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 0.9954734444618225, - "learning_rate": 9.063288001209969e-06, - "loss": 2.017, - "step": 260000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.156609296798706, - "eval_runtime": 51.7856, - "eval_samples_per_second": 196.85, - "eval_steps_per_second": 1.545, - "step": 260000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 0.9650399684906006, - "learning_rate": 9.046235533783381e-06, - "loss": 2.0188, - "step": 260100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.014950156211853, - "learning_rate": 9.029195579236252e-06, - "loss": 2.0116, - "step": 260200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.0187995433807373, - "learning_rate": 9.012168150933394e-06, - "loss": 2.015, - "step": 260300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 0.959968090057373, - "learning_rate": 8.995153262229769e-06, - "loss": 2.009, - "step": 260400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 0.9517911076545715, - "learning_rate": 8.978150926470524e-06, - "loss": 1.986, - "step": 260500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 0.9722422957420349, - "learning_rate": 8.961161156990958e-06, - "loss": 1.9976, - "step": 260600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 0.9573680758476257, - "learning_rate": 8.944183967116519e-06, - "loss": 2.0034, - "step": 260700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 0.9799074530601501, - "learning_rate": 8.92721937016276e-06, - "loss": 1.992, - "step": 260800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 0.9554262161254883, - "learning_rate": 8.910267379435391e-06, - "loss": 2.009, - "step": 260900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 0.988070547580719, - "learning_rate": 8.893328008230231e-06, - "loss": 1.9862, - "step": 261000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.1644506454467773, - "eval_runtime": 51.9499, - "eval_samples_per_second": 196.227, - "eval_steps_per_second": 1.54, - "step": 261000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 0.9487972259521484, - "learning_rate": 8.876401269833173e-06, - "loss": 1.9909, - "step": 261100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 0.9944525957107544, - "learning_rate": 8.859487177520237e-06, - "loss": 2.0028, - "step": 261200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.0023555755615234, - "learning_rate": 8.842585744557493e-06, - "loss": 1.9972, - "step": 261300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 0.9712923765182495, - "learning_rate": 8.825696984201107e-06, - "loss": 1.996, - "step": 261400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 0.9709236025810242, - "learning_rate": 8.8088209096973e-06, - "loss": 1.9829, - "step": 261500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.0036200284957886, - "learning_rate": 8.791957534282322e-06, - "loss": 1.9891, - "step": 261600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.0096827745437622, - "learning_rate": 8.775106871182492e-06, - "loss": 1.9834, - "step": 261700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.0069483518600464, - "learning_rate": 8.758268933614148e-06, - "loss": 1.9952, - "step": 261800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 0.9731675982475281, - "learning_rate": 8.741443734783646e-06, - "loss": 1.9898, - "step": 261900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 0.9884027242660522, - "learning_rate": 8.724631287887342e-06, - "loss": 1.9831, - "step": 262000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.1643013954162598, - "eval_runtime": 52.068, - "eval_samples_per_second": 195.783, - "eval_steps_per_second": 1.536, - "step": 262000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 0.9630898237228394, - "learning_rate": 8.7078316061116e-06, - "loss": 1.9869, - "step": 262100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 0.9709094762802124, - "learning_rate": 8.691044702632775e-06, - "loss": 1.9807, - "step": 262200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 0.9887747168540955, - "learning_rate": 8.674270590617201e-06, - "loss": 1.9784, - "step": 262300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 0.9830183386802673, - "learning_rate": 8.657509283221157e-06, - "loss": 1.9951, - "step": 262400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 0.9983903765678406, - "learning_rate": 8.640760793590915e-06, - "loss": 1.982, - "step": 262500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 0.9943532347679138, - "learning_rate": 8.624025134862654e-06, - "loss": 1.9767, - "step": 262600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.0101203918457031, - "learning_rate": 8.607302320162522e-06, - "loss": 1.9672, - "step": 262700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 0.9875810146331787, - "learning_rate": 8.590592362606587e-06, - "loss": 1.987, - "step": 262800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 0.997082531452179, - "learning_rate": 8.573895275300811e-06, - "loss": 1.9738, - "step": 262900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 0.9400111436843872, - "learning_rate": 8.557211071341084e-06, - "loss": 1.9804, - "step": 263000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.1664023399353027, - "eval_runtime": 52.0751, - "eval_samples_per_second": 195.756, - "eval_steps_per_second": 1.536, - "step": 263000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.0243192911148071, - "learning_rate": 8.540539763813187e-06, - "loss": 1.9642, - "step": 263100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.0003769397735596, - "learning_rate": 8.523881365792794e-06, - "loss": 1.9634, - "step": 263200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 0.9745362997055054, - "learning_rate": 8.507235890345424e-06, - "loss": 1.9727, - "step": 263300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 0.9894079566001892, - "learning_rate": 8.490603350526489e-06, - "loss": 1.9687, - "step": 263400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.0135899782180786, - "learning_rate": 8.473983759381247e-06, - "loss": 1.9791, - "step": 263500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 0.9646082520484924, - "learning_rate": 8.457377129944805e-06, - "loss": 1.9704, - "step": 263600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 0.9717702269554138, - "learning_rate": 8.440783475242086e-06, - "loss": 1.9584, - "step": 263700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 0.9921123385429382, - "learning_rate": 8.424202808287865e-06, - "loss": 1.9765, - "step": 263800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 0.9913526177406311, - "learning_rate": 8.407635142086698e-06, - "loss": 1.9592, - "step": 263900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 0.9921115040779114, - "learning_rate": 8.391080489632974e-06, - "loss": 1.9656, - "step": 264000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.1658823490142822, - "eval_runtime": 52.007, - "eval_samples_per_second": 196.012, - "eval_steps_per_second": 1.538, - "step": 264000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.003160834312439, - "learning_rate": 8.37453886391085e-06, - "loss": 1.956, - "step": 264100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 0.9790710806846619, - "learning_rate": 8.358010277894282e-06, - "loss": 1.9423, - "step": 264200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.0112133026123047, - "learning_rate": 8.341494744546995e-06, - "loss": 1.9547, - "step": 264300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.0514979362487793, - "learning_rate": 8.324992276822489e-06, - "loss": 1.9609, - "step": 264400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.0433865785598755, - "learning_rate": 8.30850288766398e-06, - "loss": 1.9587, - "step": 264500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 0.986564040184021, - "learning_rate": 8.29202659000446e-06, - "loss": 1.9445, - "step": 264600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 0.9676020741462708, - "learning_rate": 8.275563396766643e-06, - "loss": 1.9563, - "step": 264700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.0368075370788574, - "learning_rate": 8.259113320862971e-06, - "loss": 1.9514, - "step": 264800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.046763300895691, - "learning_rate": 8.24267637519558e-06, - "loss": 1.9756, - "step": 264900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 0.9866767525672913, - "learning_rate": 8.22625257265632e-06, - "loss": 1.9415, - "step": 265000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.180581569671631, - "eval_runtime": 52.0428, - "eval_samples_per_second": 195.877, - "eval_steps_per_second": 1.537, - "step": 265000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 0.9820157289505005, - "learning_rate": 8.209841926126744e-06, - "loss": 1.9674, - "step": 265100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 0.9998511672019958, - "learning_rate": 8.193444448478054e-06, - "loss": 1.9582, - "step": 265200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.0134938955307007, - "learning_rate": 8.177060152571165e-06, - "loss": 1.9455, - "step": 265300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 0.993373692035675, - "learning_rate": 8.16068905125661e-06, - "loss": 1.9548, - "step": 265400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 0.9485350847244263, - "learning_rate": 8.144331157374604e-06, - "loss": 1.936, - "step": 265500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 0.9534901976585388, - "learning_rate": 8.127986483754996e-06, - "loss": 1.9494, - "step": 265600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 0.989976167678833, - "learning_rate": 8.111655043217274e-06, - "loss": 1.9456, - "step": 265700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 0.9938270449638367, - "learning_rate": 8.095336848570512e-06, - "loss": 1.9265, - "step": 265800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 1.0147507190704346, - "learning_rate": 8.079031912613436e-06, - "loss": 1.9714, - "step": 265900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 0.9483819007873535, - "learning_rate": 8.06274024813435e-06, - "loss": 1.9384, - "step": 266000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.1801016330718994, - "eval_runtime": 52.1929, - "eval_samples_per_second": 195.314, - "eval_steps_per_second": 1.533, - "step": 266000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 0.9632075428962708, - "learning_rate": 8.046461867911173e-06, - "loss": 1.9424, - "step": 266100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 0.9533110857009888, - "learning_rate": 8.030196784711364e-06, - "loss": 1.9376, - "step": 266200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 1.0330917835235596, - "learning_rate": 8.013945011291996e-06, - "loss": 1.9395, - "step": 266300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 1.0646517276763916, - "learning_rate": 7.997706560399665e-06, - "loss": 1.931, - "step": 266400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 0.9834697842597961, - "learning_rate": 7.981481444770552e-06, - "loss": 1.9239, - "step": 266500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 0.9982201457023621, - "learning_rate": 7.965269677130349e-06, - "loss": 1.9457, - "step": 266600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 0.9639807343482971, - "learning_rate": 7.949071270194303e-06, - "loss": 1.951, - "step": 266700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 0.959434986114502, - "learning_rate": 7.932886236667163e-06, - "loss": 1.9321, - "step": 266800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 1.0044972896575928, - "learning_rate": 7.916714589243215e-06, - "loss": 1.9204, - "step": 266900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 1.0055923461914062, - "learning_rate": 7.90055634060621e-06, - "loss": 1.9338, - "step": 267000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.1793017387390137, - "eval_runtime": 52.1353, - "eval_samples_per_second": 195.53, - "eval_steps_per_second": 1.534, - "step": 267000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 1.01926851272583, - "learning_rate": 7.884411503429415e-06, - "loss": 1.9398, - "step": 267100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 0.9919995665550232, - "learning_rate": 7.868280090375574e-06, - "loss": 1.9266, - "step": 267200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 1.0640407800674438, - "learning_rate": 7.852162114096905e-06, - "loss": 1.9299, - "step": 267300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 0.9906590580940247, - "learning_rate": 7.836057587235068e-06, - "loss": 1.9352, - "step": 267400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 1.0245193243026733, - "learning_rate": 7.819966522421199e-06, - "loss": 1.9367, - "step": 267500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 1.0126677751541138, - "learning_rate": 7.803888932275872e-06, - "loss": 1.9239, - "step": 267600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 1.0562597513198853, - "learning_rate": 7.787824829409066e-06, - "loss": 1.9371, - "step": 267700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 1.034492015838623, - "learning_rate": 7.771774226420219e-06, - "loss": 1.9432, - "step": 267800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 1.0192219018936157, - "learning_rate": 7.75573713589815e-06, - "loss": 1.9177, - "step": 267900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 0.9676885008811951, - "learning_rate": 7.739713570421098e-06, - "loss": 1.9144, - "step": 268000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.1744418144226074, - "eval_runtime": 52.0494, - "eval_samples_per_second": 195.852, - "eval_steps_per_second": 1.537, - "step": 268000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 1.0004535913467407, - "learning_rate": 7.72370354255669e-06, - "loss": 1.9302, - "step": 268100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 0.9848925471305847, - "learning_rate": 7.707707064861941e-06, - "loss": 1.913, - "step": 268200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 0.9667465090751648, - "learning_rate": 7.691724149883217e-06, - "loss": 1.9257, - "step": 268300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 1.0000005960464478, - "learning_rate": 7.67575481015627e-06, - "loss": 1.905, - "step": 268400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 0.9792052507400513, - "learning_rate": 7.659799058206188e-06, - "loss": 1.9354, - "step": 268500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 0.9806801676750183, - "learning_rate": 7.643856906547425e-06, - "loss": 1.9173, - "step": 268600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 1.023319125175476, - "learning_rate": 7.627928367683735e-06, - "loss": 1.919, - "step": 268700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 1.0140260457992554, - "learning_rate": 7.612013454108219e-06, - "loss": 1.9271, - "step": 268800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 1.0070114135742188, - "learning_rate": 7.596112178303291e-06, - "loss": 1.918, - "step": 268900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 0.9720276594161987, - "learning_rate": 7.58022455274065e-06, - "loss": 1.9272, - "step": 269000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.1881604194641113, - "eval_runtime": 52.0271, - "eval_samples_per_second": 195.936, - "eval_steps_per_second": 1.538, - "step": 269000 - }, - { - "epoch": 0.042141338140980915, - "grad_norm": 1.0044163465499878, - "learning_rate": 7.564350589881317e-06, - "loss": 1.9334, - "step": 269100 - }, - { - "epoch": 0.04242041985052384, - "grad_norm": 0.9554632902145386, - "learning_rate": 7.548490302175565e-06, - "loss": 1.9105, - "step": 269200 - }, - { - "epoch": 0.04269950156006676, - "grad_norm": 0.9780552387237549, - "learning_rate": 7.532643702062963e-06, - "loss": 1.9146, - "step": 269300 - }, - { - "epoch": 0.042978583269609676, - "grad_norm": 0.9981115460395813, - "learning_rate": 7.516810801972348e-06, - "loss": 1.9328, - "step": 269400 - }, - { - "epoch": 0.043257664979152594, - "grad_norm": 1.0094027519226074, - "learning_rate": 7.500991614321792e-06, - "loss": 1.9343, - "step": 269500 - }, - { - "epoch": 0.04353674668869552, - "grad_norm": 0.993590772151947, - "learning_rate": 7.485186151518625e-06, - "loss": 1.9142, - "step": 269600 - }, - { - "epoch": 0.043815828398238436, - "grad_norm": 0.9575207829475403, - "learning_rate": 7.469394425959411e-06, - "loss": 1.9234, - "step": 269700 - }, - { - "epoch": 0.044094910107781354, - "grad_norm": 1.001413106918335, - "learning_rate": 7.453616450029951e-06, - "loss": 1.9087, - "step": 269800 - }, - { - "epoch": 0.04437399181732428, - "grad_norm": 0.9725953340530396, - "learning_rate": 7.437852236105231e-06, - "loss": 1.9153, - "step": 269900 - }, - { - "epoch": 0.0446530735268672, - "grad_norm": 0.9421985149383545, - "learning_rate": 7.422101796549466e-06, - "loss": 1.8918, - "step": 270000 - }, - { - "epoch": 0.0446530735268672, - "eval_loss": 2.1810696125030518, - "eval_runtime": 52.1502, - "eval_samples_per_second": 195.474, - "eval_steps_per_second": 1.534, - "step": 270000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 0.9838644862174988, - "learning_rate": 7.406365143716071e-06, - "loss": 1.9287, - "step": 270100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.0142942667007446, - "learning_rate": 7.390642289947644e-06, - "loss": 1.9146, - "step": 270200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.0048279762268066, - "learning_rate": 7.374933247575938e-06, - "loss": 1.921, - "step": 270300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.061614751815796, - "learning_rate": 7.359238028921914e-06, - "loss": 1.9098, - "step": 270400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.0184416770935059, - "learning_rate": 7.343556646295647e-06, - "loss": 1.9307, - "step": 270500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.0567371845245361, - "learning_rate": 7.327889111996397e-06, - "loss": 1.9093, - "step": 270600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.0020757913589478, - "learning_rate": 7.312235438312537e-06, - "loss": 1.9089, - "step": 270700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 0.9947327375411987, - "learning_rate": 7.296595637521581e-06, - "loss": 1.9175, - "step": 270800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 0.9927939176559448, - "learning_rate": 7.280969721890163e-06, - "loss": 1.9116, - "step": 270900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 0.994209885597229, - "learning_rate": 7.26535770367403e-06, - "loss": 1.9031, - "step": 271000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.1892189979553223, - "eval_runtime": 53.5994, - "eval_samples_per_second": 190.189, - "eval_steps_per_second": 1.493, - "step": 271000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 0.9945200085639954, - "learning_rate": 7.249759595118011e-06, - "loss": 1.9045, - "step": 271100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 0.9387146234512329, - "learning_rate": 7.234175408456037e-06, - "loss": 1.9048, - "step": 271200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 0.9996144771575928, - "learning_rate": 7.218605155911126e-06, - "loss": 1.9089, - "step": 271300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 0.9891520142555237, - "learning_rate": 7.203048849695357e-06, - "loss": 1.9093, - "step": 271400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.0603066682815552, - "learning_rate": 7.187506502009886e-06, - "loss": 1.8988, - "step": 271500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.0593341588974, - "learning_rate": 7.17197812504489e-06, - "loss": 1.9138, - "step": 271600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.0183734893798828, - "learning_rate": 7.156463730979626e-06, - "loss": 1.9011, - "step": 271700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 0.9992024302482605, - "learning_rate": 7.140963331982351e-06, - "loss": 1.9059, - "step": 271800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 0.9801898002624512, - "learning_rate": 7.125476940210371e-06, - "loss": 1.905, - "step": 271900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 0.965479850769043, - "learning_rate": 7.110004567809986e-06, - "loss": 1.9043, - "step": 272000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.1842684745788574, - "eval_runtime": 51.7157, - "eval_samples_per_second": 197.116, - "eval_steps_per_second": 1.547, - "step": 272000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.0330137014389038, - "learning_rate": 7.094546226916513e-06, - "loss": 1.9144, - "step": 272100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 0.9688111543655396, - "learning_rate": 7.079101929654261e-06, - "loss": 1.9102, - "step": 272200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 0.9989941120147705, - "learning_rate": 7.06367168813653e-06, - "loss": 1.9074, - "step": 272300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.0278581380844116, - "learning_rate": 7.048255514465577e-06, - "loss": 1.8924, - "step": 272400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 0.9955400228500366, - "learning_rate": 7.032853420732644e-06, - "loss": 1.8814, - "step": 272500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 0.9963505864143372, - "learning_rate": 7.017465419017921e-06, - "loss": 1.8934, - "step": 272600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.0569164752960205, - "learning_rate": 7.002091521390555e-06, - "loss": 1.8939, - "step": 272700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 0.9949243068695068, - "learning_rate": 6.986731739908611e-06, - "loss": 1.9021, - "step": 272800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.0075616836547852, - "learning_rate": 6.971386086619103e-06, - "loss": 1.8978, - "step": 272900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 0.9863401651382446, - "learning_rate": 6.9560545735579606e-06, - "loss": 1.9168, - "step": 273000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.190558433532715, - "eval_runtime": 51.5702, - "eval_samples_per_second": 197.672, - "eval_steps_per_second": 1.551, - "step": 273000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 0.9959931969642639, - "learning_rate": 6.940737212750012e-06, - "loss": 1.8908, - "step": 273100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.0437434911727905, - "learning_rate": 6.9254340162089846e-06, - "loss": 1.892, - "step": 273200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 0.9680078625679016, - "learning_rate": 6.91014499593751e-06, - "loss": 1.8859, - "step": 273300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 0.9896761775016785, - "learning_rate": 6.894870163927095e-06, - "loss": 1.8885, - "step": 273400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.0668361186981201, - "learning_rate": 6.879609532158124e-06, - "loss": 1.9031, - "step": 273500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 0.9838683605194092, - "learning_rate": 6.864363112599823e-06, - "loss": 1.9065, - "step": 273600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.0146870613098145, - "learning_rate": 6.849130917210295e-06, - "loss": 1.8873, - "step": 273700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 0.959338366985321, - "learning_rate": 6.833912957936478e-06, - "loss": 1.8851, - "step": 273800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.032836675643921, - "learning_rate": 6.818709246714147e-06, - "loss": 1.8971, - "step": 273900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 0.9915603399276733, - "learning_rate": 6.803519795467888e-06, - "loss": 1.8906, - "step": 274000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.1938321590423584, - "eval_runtime": 51.6041, - "eval_samples_per_second": 197.543, - "eval_steps_per_second": 1.55, - "step": 274000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 0.9873210787773132, - "learning_rate": 6.788344616111117e-06, - "loss": 1.879, - "step": 274100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 0.9958903193473816, - "learning_rate": 6.773183720546056e-06, - "loss": 1.8915, - "step": 274200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 0.9812116026878357, - "learning_rate": 6.758037120663727e-06, - "loss": 1.8922, - "step": 274300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.0199834108352661, - "learning_rate": 6.742904828343921e-06, - "loss": 1.8928, - "step": 274400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 0.9892932772636414, - "learning_rate": 6.727786855455218e-06, - "loss": 1.8689, - "step": 274500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 0.9794331789016724, - "learning_rate": 6.712683213854973e-06, - "loss": 1.8766, - "step": 274600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 0.9654126763343811, - "learning_rate": 6.697593915389297e-06, - "loss": 1.8887, - "step": 274700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 0.9861681461334229, - "learning_rate": 6.682518971893053e-06, - "loss": 1.8936, - "step": 274800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.0138262510299683, - "learning_rate": 6.667458395189835e-06, - "loss": 1.8718, - "step": 274900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 0.9910663962364197, - "learning_rate": 6.652412197091979e-06, - "loss": 1.8931, - "step": 275000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.1973860263824463, - "eval_runtime": 51.7316, - "eval_samples_per_second": 197.056, - "eval_steps_per_second": 1.546, - "step": 275000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 0.9887643456459045, - "learning_rate": 6.637380389400538e-06, - "loss": 1.8915, - "step": 275100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 1.0442452430725098, - "learning_rate": 6.622362983905295e-06, - "loss": 1.8866, - "step": 275200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.025341272354126, - "learning_rate": 6.607359992384704e-06, - "loss": 1.8727, - "step": 275300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.0826934576034546, - "learning_rate": 6.592371426605942e-06, - "loss": 1.878, - "step": 275400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 0.9907537698745728, - "learning_rate": 6.5773972983248635e-06, - "loss": 1.8876, - "step": 275500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.0108195543289185, - "learning_rate": 6.562437619286002e-06, - "loss": 1.8791, - "step": 275600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 0.9989004731178284, - "learning_rate": 6.547492401222549e-06, - "loss": 1.8747, - "step": 275700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.0045630931854248, - "learning_rate": 6.532561655856351e-06, - "loss": 1.8863, - "step": 275800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 0.9753278493881226, - "learning_rate": 6.517645394897923e-06, - "loss": 1.8804, - "step": 275900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 0.9882794618606567, - "learning_rate": 6.5027436300464095e-06, - "loss": 1.8751, - "step": 276000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.195430040359497, - "eval_runtime": 51.7195, - "eval_samples_per_second": 197.102, - "eval_steps_per_second": 1.547, - "step": 276000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.0431910753250122, - "learning_rate": 6.487856372989573e-06, - "loss": 1.8739, - "step": 276100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.0198723077774048, - "learning_rate": 6.472983635403818e-06, - "loss": 1.8622, - "step": 276200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 1.0333479642868042, - "learning_rate": 6.458125428954146e-06, - "loss": 1.871, - "step": 276300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 0.9855126738548279, - "learning_rate": 6.443281765294177e-06, - "loss": 1.8632, - "step": 276400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.05318284034729, - "learning_rate": 6.4284526560661005e-06, - "loss": 1.8804, - "step": 276500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.0296765565872192, - "learning_rate": 6.41363811290071e-06, - "loss": 1.8752, - "step": 276600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.0334811210632324, - "learning_rate": 6.398838147417374e-06, - "loss": 1.8768, - "step": 276700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 0.9788868427276611, - "learning_rate": 6.384052771224022e-06, - "loss": 1.867, - "step": 276800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.0330471992492676, - "learning_rate": 6.369281995917134e-06, - "loss": 1.8668, - "step": 276900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 0.9711721539497375, - "learning_rate": 6.354525833081759e-06, - "loss": 1.8703, - "step": 277000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.19480037689209, - "eval_runtime": 51.7555, - "eval_samples_per_second": 196.964, - "eval_steps_per_second": 1.546, - "step": 277000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 1.0206748247146606, - "learning_rate": 6.339784294291454e-06, - "loss": 1.8639, - "step": 277100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.019838571548462, - "learning_rate": 6.325057391108341e-06, - "loss": 1.8703, - "step": 277200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 0.9485549330711365, - "learning_rate": 6.3103451350830316e-06, - "loss": 1.8753, - "step": 277300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 0.9893754124641418, - "learning_rate": 6.295647537754668e-06, - "loss": 1.8808, - "step": 277400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 0.9906275868415833, - "learning_rate": 6.280964610650894e-06, - "loss": 1.875, - "step": 277500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.0166252851486206, - "learning_rate": 6.266296365287844e-06, - "loss": 1.8808, - "step": 277600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.0124883651733398, - "learning_rate": 6.251642813170142e-06, - "loss": 1.8795, - "step": 277700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.0064804553985596, - "learning_rate": 6.237003965790872e-06, - "loss": 1.8692, - "step": 277800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.0103724002838135, - "learning_rate": 6.222379834631598e-06, - "loss": 1.8863, - "step": 277900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.0621378421783447, - "learning_rate": 6.207770431162343e-06, - "loss": 1.8616, - "step": 278000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.2066683769226074, - "eval_runtime": 51.8686, - "eval_samples_per_second": 196.535, - "eval_steps_per_second": 1.542, - "step": 278000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.0321729183197021, - "learning_rate": 6.1931757668415855e-06, - "loss": 1.8622, - "step": 278100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.0194209814071655, - "learning_rate": 6.178595853116212e-06, - "loss": 1.8701, - "step": 278200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.0299484729766846, - "learning_rate": 6.164030701421583e-06, - "loss": 1.8809, - "step": 278300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.0566426515579224, - "learning_rate": 6.149480323181439e-06, - "loss": 1.871, - "step": 278400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 1.0435720682144165, - "learning_rate": 6.134944729807971e-06, - "loss": 1.8587, - "step": 278500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 0.9985933303833008, - "learning_rate": 6.120423932701741e-06, - "loss": 1.8571, - "step": 278600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.0292407274246216, - "learning_rate": 6.1059179432517295e-06, - "loss": 1.8612, - "step": 278700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.0021073818206787, - "learning_rate": 6.091426772835293e-06, - "loss": 1.8695, - "step": 278800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 1.0160890817642212, - "learning_rate": 6.076950432818176e-06, - "loss": 1.8865, - "step": 278900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.002803921699524, - "learning_rate": 6.062488934554469e-06, - "loss": 1.8657, - "step": 279000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.2028017044067383, - "eval_runtime": 51.9876, - "eval_samples_per_second": 196.085, - "eval_steps_per_second": 1.539, - "step": 279000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.0246683359146118, - "learning_rate": 6.048042289386643e-06, - "loss": 1.8605, - "step": 279100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.0214561223983765, - "learning_rate": 6.033610508645507e-06, - "loss": 1.869, - "step": 279200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.0367177724838257, - "learning_rate": 6.019193603650225e-06, - "loss": 1.8564, - "step": 279300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.0507416725158691, - "learning_rate": 6.004791585708272e-06, - "loss": 1.8819, - "step": 279400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.0242899656295776, - "learning_rate": 5.990404466115465e-06, - "loss": 1.8804, - "step": 279500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.0185377597808838, - "learning_rate": 5.976032256155939e-06, - "loss": 1.873, - "step": 279600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.0047358274459839, - "learning_rate": 5.961674967102113e-06, - "loss": 1.8726, - "step": 279700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.0124741792678833, - "learning_rate": 5.9473326102147255e-06, - "loss": 1.8711, - "step": 279800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.0482338666915894, - "learning_rate": 5.933005196742783e-06, - "loss": 1.8599, - "step": 279900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 0.9833710193634033, - "learning_rate": 5.918692737923592e-06, - "loss": 1.8488, - "step": 280000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.2023353576660156, - "eval_runtime": 51.8909, - "eval_samples_per_second": 196.451, - "eval_steps_per_second": 1.542, - "step": 280000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.0626851320266724, - "learning_rate": 5.9043952449827275e-06, - "loss": 1.8484, - "step": 280100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 1.0383021831512451, - "learning_rate": 5.890112729134004e-06, - "loss": 1.8728, - "step": 280200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.0421721935272217, - "learning_rate": 5.875845201579513e-06, - "loss": 1.8676, - "step": 280300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.0611158609390259, - "learning_rate": 5.861592673509581e-06, - "loss": 1.8549, - "step": 280400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.0284640789031982, - "learning_rate": 5.847355156102771e-06, - "loss": 1.8523, - "step": 280500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 0.9916824698448181, - "learning_rate": 5.833132660525883e-06, - "loss": 1.864, - "step": 280600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 1.0592784881591797, - "learning_rate": 5.818925197933911e-06, - "loss": 1.8686, - "step": 280700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.014319896697998, - "learning_rate": 5.804732779470074e-06, - "loss": 1.8572, - "step": 280800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.017314076423645, - "learning_rate": 5.7905554162658025e-06, - "loss": 1.8666, - "step": 280900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 1.0180898904800415, - "learning_rate": 5.77639311944069e-06, - "loss": 1.8735, - "step": 281000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.203965187072754, - "eval_runtime": 52.0422, - "eval_samples_per_second": 195.879, - "eval_steps_per_second": 1.537, - "step": 281000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 0.9797289371490479, - "learning_rate": 5.762245900102545e-06, - "loss": 1.8685, - "step": 281100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 1.0343824625015259, - "learning_rate": 5.748113769347319e-06, - "loss": 1.836, - "step": 281200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.0038707256317139, - "learning_rate": 5.7339967382591534e-06, - "loss": 1.8629, - "step": 281300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 1.0676428079605103, - "learning_rate": 5.7198948179103455e-06, - "loss": 1.8547, - "step": 281400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 1.0127052068710327, - "learning_rate": 5.70580801936132e-06, - "loss": 1.8591, - "step": 281500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 0.9936082363128662, - "learning_rate": 5.6917363536606596e-06, - "loss": 1.827, - "step": 281600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 1.0413488149642944, - "learning_rate": 5.6776798318450755e-06, - "loss": 1.8518, - "step": 281700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 1.0689826011657715, - "learning_rate": 5.663638464939405e-06, - "loss": 1.8469, - "step": 281800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 1.0071433782577515, - "learning_rate": 5.64961226395658e-06, - "loss": 1.8499, - "step": 281900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 1.0158226490020752, - "learning_rate": 5.635601239897659e-06, - "loss": 1.8408, - "step": 282000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.204533338546753, - "eval_runtime": 52.0127, - "eval_samples_per_second": 195.991, - "eval_steps_per_second": 1.538, - "step": 282000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 1.0187814235687256, - "learning_rate": 5.6216054037517865e-06, - "loss": 1.8592, - "step": 282100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 1.0497030019760132, - "learning_rate": 5.607624766496203e-06, - "loss": 1.866, - "step": 282200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 1.0341991186141968, - "learning_rate": 5.5936593390962165e-06, - "loss": 1.8463, - "step": 282300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 1.0514994859695435, - "learning_rate": 5.579709132505203e-06, - "loss": 1.8384, - "step": 282400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 1.0009490251541138, - "learning_rate": 5.565774157664616e-06, - "loss": 1.8544, - "step": 282500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 1.033133625984192, - "learning_rate": 5.551854425503964e-06, - "loss": 1.871, - "step": 282600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 1.0105431079864502, - "learning_rate": 5.537949946940774e-06, - "loss": 1.8499, - "step": 282700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 1.0701624155044556, - "learning_rate": 5.524060732880637e-06, - "loss": 1.8447, - "step": 282800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 1.0628591775894165, - "learning_rate": 5.510186794217157e-06, - "loss": 1.8413, - "step": 282900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 1.0623475313186646, - "learning_rate": 5.4963281418319716e-06, - "loss": 1.8549, - "step": 283000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.2079403400421143, - "eval_runtime": 52.0695, - "eval_samples_per_second": 195.777, - "eval_steps_per_second": 1.536, - "step": 283000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 1.002023696899414, - "learning_rate": 5.4824847865947045e-06, - "loss": 1.8312, - "step": 283100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 1.057966947555542, - "learning_rate": 5.468656739363004e-06, - "loss": 1.8501, - "step": 283200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 1.0365926027297974, - "learning_rate": 5.454844010982504e-06, - "loss": 1.861, - "step": 283300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 0.9884259700775146, - "learning_rate": 5.441046612286827e-06, - "loss": 1.8495, - "step": 283400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 1.0611577033996582, - "learning_rate": 5.427264554097555e-06, - "loss": 1.8521, - "step": 283500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 1.0047646760940552, - "learning_rate": 5.413497847224272e-06, - "loss": 1.8497, - "step": 283600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 1.0214877128601074, - "learning_rate": 5.399746502464479e-06, - "loss": 1.847, - "step": 283700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 1.0316286087036133, - "learning_rate": 5.386010530603663e-06, - "loss": 1.8566, - "step": 283800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 1.046181559562683, - "learning_rate": 5.3722899424152456e-06, - "loss": 1.856, - "step": 283900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 1.1351335048675537, - "learning_rate": 5.358584748660567e-06, - "loss": 1.857, - "step": 284000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.196443796157837, - "eval_runtime": 52.012, - "eval_samples_per_second": 195.993, - "eval_steps_per_second": 1.538, - "step": 284000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 1.0209579467773438, - "learning_rate": 5.344894960088906e-06, - "loss": 1.8367, - "step": 284100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 0.9978814125061035, - "learning_rate": 5.331220587437463e-06, - "loss": 1.8627, - "step": 284200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 1.0793671607971191, - "learning_rate": 5.317561641431349e-06, - "loss": 1.8504, - "step": 284300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 0.9899407029151917, - "learning_rate": 5.303918132783547e-06, - "loss": 1.8294, - "step": 284400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 1.0489463806152344, - "learning_rate": 5.290290072194967e-06, - "loss": 1.8507, - "step": 284500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 1.0309258699417114, - "learning_rate": 5.2766774703543855e-06, - "loss": 1.8558, - "step": 284600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 1.0452024936676025, - "learning_rate": 5.2630803379384665e-06, - "loss": 1.8562, - "step": 284700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 1.0156536102294922, - "learning_rate": 5.24949868561172e-06, - "loss": 1.8487, - "step": 284800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 1.0449084043502808, - "learning_rate": 5.2359325240265375e-06, - "loss": 1.8601, - "step": 284900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 1.0443174839019775, - "learning_rate": 5.222381863823139e-06, - "loss": 1.8411, - "step": 285000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.207988739013672, - "eval_runtime": 51.9411, - "eval_samples_per_second": 196.261, - "eval_steps_per_second": 1.54, - "step": 285000 - }, - { - "epoch": 0.042141338140980915, - "grad_norm": 1.0384715795516968, - "learning_rate": 5.208846715629609e-06, - "loss": 1.8524, - "step": 285100 - }, - { - "epoch": 0.04242041985052384, - "grad_norm": 1.0093611478805542, - "learning_rate": 5.195327090061844e-06, - "loss": 1.848, - "step": 285200 - }, - { - "epoch": 0.04269950156006676, - "grad_norm": 1.0251661539077759, - "learning_rate": 5.181822997723582e-06, - "loss": 1.8428, - "step": 285300 - }, - { - "epoch": 0.042978583269609676, - "grad_norm": 1.056368112564087, - "learning_rate": 5.168334449206372e-06, - "loss": 1.858, - "step": 285400 - }, - { - "epoch": 0.043257664979152594, - "grad_norm": 1.0536158084869385, - "learning_rate": 5.154861455089577e-06, - "loss": 1.8375, - "step": 285500 - }, - { - "epoch": 0.04353674668869552, - "grad_norm": 1.0133367776870728, - "learning_rate": 5.141404025940341e-06, - "loss": 1.8382, - "step": 285600 - }, - { - "epoch": 0.043815828398238436, - "grad_norm": 1.0329740047454834, - "learning_rate": 5.127962172313624e-06, - "loss": 1.8546, - "step": 285700 - }, - { - "epoch": 0.044094910107781354, - "grad_norm": 1.0410008430480957, - "learning_rate": 5.114535904752157e-06, - "loss": 1.8303, - "step": 285800 - }, - { - "epoch": 0.04437399181732428, - "grad_norm": 1.0532476902008057, - "learning_rate": 5.1011252337864605e-06, - "loss": 1.8418, - "step": 285900 - }, - { - "epoch": 0.0446530735268672, - "grad_norm": 1.031091332435608, - "learning_rate": 5.087730169934793e-06, - "loss": 1.8524, - "step": 286000 - }, - { - "epoch": 0.0446530735268672, - "eval_loss": 2.2169249057769775, - "eval_runtime": 52.0382, - "eval_samples_per_second": 195.895, - "eval_steps_per_second": 1.537, - "step": 286000 - }, - { - "epoch": 0.044932155236410115, - "grad_norm": 1.0077662467956543, - "learning_rate": 5.0743507237032e-06, - "loss": 1.8372, - "step": 286100 - }, - { - "epoch": 0.04521123694595303, - "grad_norm": 0.9833052754402161, - "learning_rate": 5.0609869055854714e-06, - "loss": 1.8493, - "step": 286200 - }, - { - "epoch": 0.04549031865549596, - "grad_norm": 1.0318917036056519, - "learning_rate": 5.047638726063128e-06, - "loss": 1.8331, - "step": 286300 - }, - { - "epoch": 0.045769400365038876, - "grad_norm": 1.0132189989089966, - "learning_rate": 5.03430619560544e-06, - "loss": 1.836, - "step": 286400 - }, - { - "epoch": 0.046048482074581794, - "grad_norm": 1.016453742980957, - "learning_rate": 5.0209893246693895e-06, - "loss": 1.8561, - "step": 286500 - }, - { - "epoch": 0.04632756378412472, - "grad_norm": 1.0423955917358398, - "learning_rate": 5.007688123699686e-06, - "loss": 1.8488, - "step": 286600 - }, - { - "epoch": 0.04660664549366764, - "grad_norm": 1.0047492980957031, - "learning_rate": 4.994402603128751e-06, - "loss": 1.8384, - "step": 286700 - }, - { - "epoch": 0.046885727203210555, - "grad_norm": 1.0472545623779297, - "learning_rate": 4.981132773376704e-06, - "loss": 1.8359, - "step": 286800 - }, - { - "epoch": 0.04716480891275347, - "grad_norm": 1.0129274129867554, - "learning_rate": 4.967878644851351e-06, - "loss": 1.8428, - "step": 286900 - }, - { - "epoch": 0.0474438906222964, - "grad_norm": 1.0653069019317627, - "learning_rate": 4.954640227948188e-06, - "loss": 1.8516, - "step": 287000 - }, - { - "epoch": 0.0474438906222964, - "eval_loss": 2.207484483718872, - "eval_runtime": 51.9821, - "eval_samples_per_second": 196.106, - "eval_steps_per_second": 1.539, - "step": 287000 - }, - { - "epoch": 0.047722972331839315, - "grad_norm": 1.0424509048461914, - "learning_rate": 4.941417533050394e-06, - "loss": 1.8518, - "step": 287100 - }, - { - "epoch": 0.04800205404138223, - "grad_norm": 1.0581769943237305, - "learning_rate": 4.9282105705288185e-06, - "loss": 1.8239, - "step": 287200 - }, - { - "epoch": 0.04828113575092516, - "grad_norm": 1.0191422700881958, - "learning_rate": 4.9150193507419505e-06, - "loss": 1.8555, - "step": 287300 - }, - { - "epoch": 0.048560217460468076, - "grad_norm": 1.0733542442321777, - "learning_rate": 4.901843884035953e-06, - "loss": 1.8397, - "step": 287400 - }, - { - "epoch": 0.048839299170010994, - "grad_norm": 1.0520180463790894, - "learning_rate": 4.888684180744635e-06, - "loss": 1.841, - "step": 287500 - }, - { - "epoch": 0.04911838087955392, - "grad_norm": 1.047424077987671, - "learning_rate": 4.8755402511894175e-06, - "loss": 1.8359, - "step": 287600 - }, - { - "epoch": 0.04939746258909684, - "grad_norm": 1.0499184131622314, - "learning_rate": 4.862412105679384e-06, - "loss": 1.8258, - "step": 287700 - }, - { - "epoch": 0.049676544298639755, - "grad_norm": 1.0351871252059937, - "learning_rate": 4.849299754511205e-06, - "loss": 1.8348, - "step": 287800 - }, - { - "epoch": 0.04995562600818267, - "grad_norm": 1.0380860567092896, - "learning_rate": 4.836203207969183e-06, - "loss": 1.8306, - "step": 287900 - }, - { - "epoch": 0.0502347077177256, - "grad_norm": 1.0105301141738892, - "learning_rate": 4.823122476325231e-06, - "loss": 1.8432, - "step": 288000 - }, - { - "epoch": 0.0502347077177256, - "eval_loss": 2.199079751968384, - "eval_runtime": 52.0806, - "eval_samples_per_second": 195.735, - "eval_steps_per_second": 1.536, - "step": 288000 - }, - { - "epoch": 0.00027908170954291995, - "grad_norm": 1.004279375076294, - "learning_rate": 4.8100575698388324e-06, - "loss": 1.8457, - "step": 288100 - }, - { - "epoch": 0.0005581634190858399, - "grad_norm": 1.001739740371704, - "learning_rate": 4.79700849875708e-06, - "loss": 1.8321, - "step": 288200 - }, - { - "epoch": 0.0008372451286287599, - "grad_norm": 1.0556291341781616, - "learning_rate": 4.7839752733146395e-06, - "loss": 1.8446, - "step": 288300 - }, - { - "epoch": 0.0011163268381716798, - "grad_norm": 1.014046311378479, - "learning_rate": 4.7709579037337525e-06, - "loss": 1.8194, - "step": 288400 - }, - { - "epoch": 0.0013954085477146, - "grad_norm": 1.0474220514297485, - "learning_rate": 4.757956400224214e-06, - "loss": 1.8424, - "step": 288500 - }, - { - "epoch": 0.0016744902572575198, - "grad_norm": 1.0006691217422485, - "learning_rate": 4.744970772983387e-06, - "loss": 1.83, - "step": 288600 - }, - { - "epoch": 0.00195357196680044, - "grad_norm": 1.0182647705078125, - "learning_rate": 4.732001032196173e-06, - "loss": 1.8357, - "step": 288700 - }, - { - "epoch": 0.0022326536763433596, - "grad_norm": 1.0402028560638428, - "learning_rate": 4.719047188035028e-06, - "loss": 1.8395, - "step": 288800 - }, - { - "epoch": 0.0025117353858862797, - "grad_norm": 0.9996068477630615, - "learning_rate": 4.706109250659915e-06, - "loss": 1.8601, - "step": 288900 - }, - { - "epoch": 0.0027908170954292, - "grad_norm": 1.0102015733718872, - "learning_rate": 4.693187230218351e-06, - "loss": 1.8282, - "step": 289000 - }, - { - "epoch": 0.0027908170954292, - "eval_loss": 2.2036867141723633, - "eval_runtime": 51.9029, - "eval_samples_per_second": 196.405, - "eval_steps_per_second": 1.541, - "step": 289000 - }, - { - "epoch": 0.00306989880497212, - "grad_norm": 1.0433801412582397, - "learning_rate": 4.680281136845338e-06, - "loss": 1.843, - "step": 289100 - }, - { - "epoch": 0.0033489805145150396, - "grad_norm": 1.04608952999115, - "learning_rate": 4.667390980663416e-06, - "loss": 1.8499, - "step": 289200 - }, - { - "epoch": 0.0036280622240579597, - "grad_norm": 1.0211896896362305, - "learning_rate": 4.654516771782597e-06, - "loss": 1.8431, - "step": 289300 - }, - { - "epoch": 0.00390714393360088, - "grad_norm": 1.0457395315170288, - "learning_rate": 4.641658520300407e-06, - "loss": 1.8281, - "step": 289400 - }, - { - "epoch": 0.0041862256431437995, - "grad_norm": 1.0063202381134033, - "learning_rate": 4.6288162363018475e-06, - "loss": 1.8336, - "step": 289500 - }, - { - "epoch": 0.004465307352686719, - "grad_norm": 1.003279209136963, - "learning_rate": 4.615989929859402e-06, - "loss": 1.8408, - "step": 289600 - }, - { - "epoch": 0.00474438906222964, - "grad_norm": 1.0246212482452393, - "learning_rate": 4.603179611033006e-06, - "loss": 1.8343, - "step": 289700 - }, - { - "epoch": 0.005023470771772559, - "grad_norm": 1.0443739891052246, - "learning_rate": 4.590385289870075e-06, - "loss": 1.8401, - "step": 289800 - }, - { - "epoch": 0.00530255248131548, - "grad_norm": 1.0705519914627075, - "learning_rate": 4.577606976405466e-06, - "loss": 1.8389, - "step": 289900 - }, - { - "epoch": 0.0055816341908584, - "grad_norm": 1.0605077743530273, - "learning_rate": 4.564844680661487e-06, - "loss": 1.8219, - "step": 290000 - }, - { - "epoch": 0.0055816341908584, - "eval_loss": 2.2071588039398193, - "eval_runtime": 51.4776, - "eval_samples_per_second": 198.028, - "eval_steps_per_second": 1.554, - "step": 290000 - }, - { - "epoch": 0.005860715900401319, - "grad_norm": 1.0212249755859375, - "learning_rate": 4.552098412647887e-06, - "loss": 1.8286, - "step": 290100 - }, - { - "epoch": 0.00613979760994424, - "grad_norm": 1.0316197872161865, - "learning_rate": 4.539368182361822e-06, - "loss": 1.831, - "step": 290200 - }, - { - "epoch": 0.0064188793194871595, - "grad_norm": 1.059012532234192, - "learning_rate": 4.526653999787897e-06, - "loss": 1.8454, - "step": 290300 - }, - { - "epoch": 0.006697961029030079, - "grad_norm": 1.0601192712783813, - "learning_rate": 4.51395587489811e-06, - "loss": 1.8344, - "step": 290400 - }, - { - "epoch": 0.006977042738573, - "grad_norm": 1.028264045715332, - "learning_rate": 4.50127381765188e-06, - "loss": 1.8418, - "step": 290500 - }, - { - "epoch": 0.0072561244481159195, - "grad_norm": 1.0473825931549072, - "learning_rate": 4.488607837996006e-06, - "loss": 1.8273, - "step": 290600 - }, - { - "epoch": 0.007535206157658839, - "grad_norm": 1.0223557949066162, - "learning_rate": 4.475957945864692e-06, - "loss": 1.8318, - "step": 290700 - }, - { - "epoch": 0.00781428786720176, - "grad_norm": 1.026455283164978, - "learning_rate": 4.463324151179521e-06, - "loss": 1.8252, - "step": 290800 - }, - { - "epoch": 0.00809336957674468, - "grad_norm": 1.0249643325805664, - "learning_rate": 4.450706463849458e-06, - "loss": 1.8384, - "step": 290900 - }, - { - "epoch": 0.008372451286287599, - "grad_norm": 1.0897847414016724, - "learning_rate": 4.438104893770806e-06, - "loss": 1.8316, - "step": 291000 - }, - { - "epoch": 0.008372451286287599, - "eval_loss": 2.2129878997802734, - "eval_runtime": 51.4364, - "eval_samples_per_second": 198.186, - "eval_steps_per_second": 1.555, - "step": 291000 - }, - { - "epoch": 0.008651532995830519, - "grad_norm": 1.0365105867385864, - "learning_rate": 4.425519450827259e-06, - "loss": 1.8085, - "step": 291100 - }, - { - "epoch": 0.008930614705373438, - "grad_norm": 1.0275731086730957, - "learning_rate": 4.412950144889849e-06, - "loss": 1.8278, - "step": 291200 - }, - { - "epoch": 0.00920969641491636, - "grad_norm": 1.022194504737854, - "learning_rate": 4.400396985816957e-06, - "loss": 1.8147, - "step": 291300 - }, - { - "epoch": 0.00948877812445928, - "grad_norm": 1.0360770225524902, - "learning_rate": 4.387859983454279e-06, - "loss": 1.835, - "step": 291400 - }, - { - "epoch": 0.0097678598340022, - "grad_norm": 1.0490261316299438, - "learning_rate": 4.375339147634866e-06, - "loss": 1.8309, - "step": 291500 - }, - { - "epoch": 0.010046941543545119, - "grad_norm": 1.0081874132156372, - "learning_rate": 4.362834488179085e-06, - "loss": 1.8247, - "step": 291600 - }, - { - "epoch": 0.010326023253088039, - "grad_norm": 1.0340025424957275, - "learning_rate": 4.350346014894596e-06, - "loss": 1.8288, - "step": 291700 - }, - { - "epoch": 0.01060510496263096, - "grad_norm": 1.080769419670105, - "learning_rate": 4.337873737576376e-06, - "loss": 1.8186, - "step": 291800 - }, - { - "epoch": 0.01088418667217388, - "grad_norm": 1.027443528175354, - "learning_rate": 4.3254176660067005e-06, - "loss": 1.8374, - "step": 291900 - }, - { - "epoch": 0.0111632683817168, - "grad_norm": 0.9847263097763062, - "learning_rate": 4.3129778099551376e-06, - "loss": 1.8312, - "step": 292000 - }, - { - "epoch": 0.0111632683817168, - "eval_loss": 2.2189362049102783, - "eval_runtime": 51.4867, - "eval_samples_per_second": 197.993, - "eval_steps_per_second": 1.554, - "step": 292000 - }, - { - "epoch": 0.011442350091259719, - "grad_norm": 1.0336400270462036, - "learning_rate": 4.30055417917854e-06, - "loss": 1.8155, - "step": 292100 - }, - { - "epoch": 0.011721431800802639, - "grad_norm": 1.011435627937317, - "learning_rate": 4.288146783421012e-06, - "loss": 1.8494, - "step": 292200 - }, - { - "epoch": 0.012000513510345558, - "grad_norm": 1.0581125020980835, - "learning_rate": 4.275755632413947e-06, - "loss": 1.8194, - "step": 292300 - }, - { - "epoch": 0.01227959521988848, - "grad_norm": 1.0441781282424927, - "learning_rate": 4.263380735875991e-06, - "loss": 1.8043, - "step": 292400 - }, - { - "epoch": 0.0125586769294314, - "grad_norm": 0.9977090358734131, - "learning_rate": 4.251022103513047e-06, - "loss": 1.8424, - "step": 292500 - }, - { - "epoch": 0.012837758638974319, - "grad_norm": 1.0513739585876465, - "learning_rate": 4.238679745018243e-06, - "loss": 1.8396, - "step": 292600 - }, - { - "epoch": 0.013116840348517239, - "grad_norm": 1.013461947441101, - "learning_rate": 4.226353670071961e-06, - "loss": 1.8254, - "step": 292700 - }, - { - "epoch": 0.013395922058060158, - "grad_norm": 1.0267400741577148, - "learning_rate": 4.214043888341812e-06, - "loss": 1.8194, - "step": 292800 - }, - { - "epoch": 0.013675003767603078, - "grad_norm": 1.0605510473251343, - "learning_rate": 4.201750409482607e-06, - "loss": 1.83, - "step": 292900 - }, - { - "epoch": 0.013954085477146, - "grad_norm": 1.0429390668869019, - "learning_rate": 4.189473243136402e-06, - "loss": 1.8305, - "step": 293000 - }, - { - "epoch": 0.013954085477146, - "eval_loss": 2.212700843811035, - "eval_runtime": 51.5608, - "eval_samples_per_second": 197.708, - "eval_steps_per_second": 1.552, - "step": 293000 - }, - { - "epoch": 0.01423316718668892, - "grad_norm": 0.9996118545532227, - "learning_rate": 4.177212398932428e-06, - "loss": 1.8341, - "step": 293100 - }, - { - "epoch": 0.014512248896231839, - "grad_norm": 1.0341185331344604, - "learning_rate": 4.164967886487131e-06, - "loss": 1.8232, - "step": 293200 - }, - { - "epoch": 0.014791330605774759, - "grad_norm": 1.0189030170440674, - "learning_rate": 4.15273971540415e-06, - "loss": 1.8226, - "step": 293300 - }, - { - "epoch": 0.015070412315317678, - "grad_norm": 1.0681477785110474, - "learning_rate": 4.140527895274301e-06, - "loss": 1.8146, - "step": 293400 - }, - { - "epoch": 0.015349494024860598, - "grad_norm": 1.066925048828125, - "learning_rate": 4.128332435675569e-06, - "loss": 1.8229, - "step": 293500 - }, - { - "epoch": 0.01562857573440352, - "grad_norm": 1.0204412937164307, - "learning_rate": 4.116153346173121e-06, - "loss": 1.8244, - "step": 293600 - }, - { - "epoch": 0.015907657443946437, - "grad_norm": 1.0246905088424683, - "learning_rate": 4.103990636319274e-06, - "loss": 1.8073, - "step": 293700 - }, - { - "epoch": 0.01618673915348936, - "grad_norm": 1.0570878982543945, - "learning_rate": 4.091844315653512e-06, - "loss": 1.8125, - "step": 293800 - }, - { - "epoch": 0.01646582086303228, - "grad_norm": 1.0208039283752441, - "learning_rate": 4.079714393702441e-06, - "loss": 1.8197, - "step": 293900 - }, - { - "epoch": 0.016744902572575198, - "grad_norm": 1.0461581945419312, - "learning_rate": 4.067600879979824e-06, - "loss": 1.8177, - "step": 294000 - }, - { - "epoch": 0.016744902572575198, - "eval_loss": 2.225311756134033, - "eval_runtime": 51.5825, - "eval_samples_per_second": 197.625, - "eval_steps_per_second": 1.551, - "step": 294000 - }, - { - "epoch": 0.01702398428211812, - "grad_norm": 1.0400618314743042, - "learning_rate": 4.055503783986556e-06, - "loss": 1.8126, - "step": 294100 - }, - { - "epoch": 0.017303065991661037, - "grad_norm": 1.0598971843719482, - "learning_rate": 4.043423115210637e-06, - "loss": 1.826, - "step": 294200 - }, - { - "epoch": 0.01758214770120396, - "grad_norm": 0.9947335124015808, - "learning_rate": 4.031358883127207e-06, - "loss": 1.8312, - "step": 294300 - }, - { - "epoch": 0.017861229410746877, - "grad_norm": 1.0881414413452148, - "learning_rate": 4.019311097198489e-06, - "loss": 1.8321, - "step": 294400 - }, - { - "epoch": 0.018140311120289798, - "grad_norm": 1.0416432619094849, - "learning_rate": 4.007279766873828e-06, - "loss": 1.8171, - "step": 294500 - }, - { - "epoch": 0.01841939282983272, - "grad_norm": 1.0456783771514893, - "learning_rate": 3.9952649015896545e-06, - "loss": 1.8077, - "step": 294600 - }, - { - "epoch": 0.018698474539375638, - "grad_norm": 1.0717263221740723, - "learning_rate": 3.983266510769479e-06, - "loss": 1.8269, - "step": 294700 - }, - { - "epoch": 0.01897755624891856, - "grad_norm": 1.0348212718963623, - "learning_rate": 3.971284603823899e-06, - "loss": 1.839, - "step": 294800 - }, - { - "epoch": 0.019256637958461477, - "grad_norm": 1.017639398574829, - "learning_rate": 3.9593191901505846e-06, - "loss": 1.8076, - "step": 294900 - }, - { - "epoch": 0.0195357196680044, - "grad_norm": 1.0568976402282715, - "learning_rate": 3.947370279134269e-06, - "loss": 1.8317, - "step": 295000 - }, - { - "epoch": 0.0195357196680044, - "eval_loss": 2.2106025218963623, - "eval_runtime": 51.5707, - "eval_samples_per_second": 197.67, - "eval_steps_per_second": 1.551, - "step": 295000 - }, - { - "epoch": 0.01981480137754732, - "grad_norm": 0.9975104928016663, - "learning_rate": 3.935437880146728e-06, - "loss": 1.8075, - "step": 295100 - }, - { - "epoch": 0.020093883087090238, - "grad_norm": 1.0724748373031616, - "learning_rate": 3.923522002546804e-06, - "loss": 1.8101, - "step": 295200 - }, - { - "epoch": 0.02037296479663316, - "grad_norm": 1.0251374244689941, - "learning_rate": 3.911622655680375e-06, - "loss": 1.8165, - "step": 295300 - }, - { - "epoch": 0.020652046506176077, - "grad_norm": 0.9875963926315308, - "learning_rate": 3.89973984888036e-06, - "loss": 1.832, - "step": 295400 - }, - { - "epoch": 0.020931128215719, - "grad_norm": 1.022261619567871, - "learning_rate": 3.887873591466687e-06, - "loss": 1.822, - "step": 295500 - }, - { - "epoch": 0.02121020992526192, - "grad_norm": 1.035934329032898, - "learning_rate": 3.8760238927463306e-06, - "loss": 1.8143, - "step": 295600 - }, - { - "epoch": 0.021489291634804838, - "grad_norm": 1.0614137649536133, - "learning_rate": 3.864190762013248e-06, - "loss": 1.8123, - "step": 295700 - }, - { - "epoch": 0.02176837334434776, - "grad_norm": 1.0247828960418701, - "learning_rate": 3.8523742085484235e-06, - "loss": 1.8284, - "step": 295800 - }, - { - "epoch": 0.022047455053890677, - "grad_norm": 1.0341575145721436, - "learning_rate": 3.84057424161984e-06, - "loss": 1.8288, - "step": 295900 - }, - { - "epoch": 0.0223265367634336, - "grad_norm": 1.0558165311813354, - "learning_rate": 3.8287908704824545e-06, - "loss": 1.8145, - "step": 296000 - }, - { - "epoch": 0.0223265367634336, - "eval_loss": 2.2238752841949463, - "eval_runtime": 52.008, - "eval_samples_per_second": 196.008, - "eval_steps_per_second": 1.538, - "step": 296000 - }, - { - "epoch": 0.022605618472976517, - "grad_norm": 1.0519689321517944, - "learning_rate": 3.8170241043782225e-06, - "loss": 1.8309, - "step": 296100 - }, - { - "epoch": 0.022884700182519438, - "grad_norm": 1.0083707571029663, - "learning_rate": 3.8052739525360674e-06, - "loss": 1.8125, - "step": 296200 - }, - { - "epoch": 0.02316378189206236, - "grad_norm": 1.028019905090332, - "learning_rate": 3.793540424171896e-06, - "loss": 1.819, - "step": 296300 - }, - { - "epoch": 0.023442863601605277, - "grad_norm": 1.0253424644470215, - "learning_rate": 3.781823528488554e-06, - "loss": 1.7998, - "step": 296400 - }, - { - "epoch": 0.0237219453111482, - "grad_norm": 1.0056533813476562, - "learning_rate": 3.770123274675855e-06, - "loss": 1.8169, - "step": 296500 - }, - { - "epoch": 0.024001027020691117, - "grad_norm": 0.9970018863677979, - "learning_rate": 3.758439671910563e-06, - "loss": 1.8182, - "step": 296600 - }, - { - "epoch": 0.024280108730234038, - "grad_norm": 1.1070934534072876, - "learning_rate": 3.746772729356382e-06, - "loss": 1.8255, - "step": 296700 - }, - { - "epoch": 0.02455919043977696, - "grad_norm": 1.0187138319015503, - "learning_rate": 3.735122456163936e-06, - "loss": 1.8185, - "step": 296800 - }, - { - "epoch": 0.024838272149319877, - "grad_norm": 1.029852271080017, - "learning_rate": 3.723488861470792e-06, - "loss": 1.8215, - "step": 296900 - }, - { - "epoch": 0.0251173538588628, - "grad_norm": 1.0434601306915283, - "learning_rate": 3.711871954401419e-06, - "loss": 1.8068, - "step": 297000 - }, - { - "epoch": 0.0251173538588628, - "eval_loss": 2.2075464725494385, - "eval_runtime": 51.6992, - "eval_samples_per_second": 197.179, - "eval_steps_per_second": 1.547, - "step": 297000 - }, - { - "epoch": 0.025396435568405717, - "grad_norm": 1.087276816368103, - "learning_rate": 3.7002717440672184e-06, - "loss": 1.8137, - "step": 297100 - }, - { - "epoch": 0.025675517277948638, - "grad_norm": 1.039167046546936, - "learning_rate": 3.688688239566471e-06, - "loss": 1.802, - "step": 297200 - }, - { - "epoch": 0.025954598987491556, - "grad_norm": 1.0578351020812988, - "learning_rate": 3.6771214499843693e-06, - "loss": 1.8276, - "step": 297300 - }, - { - "epoch": 0.026233680697034478, - "grad_norm": 1.0496500730514526, - "learning_rate": 3.6655713843930018e-06, - "loss": 1.806, - "step": 297400 - }, - { - "epoch": 0.0265127624065774, - "grad_norm": 1.0503089427947998, - "learning_rate": 3.654038051851333e-06, - "loss": 1.8041, - "step": 297500 - }, - { - "epoch": 0.026791844116120317, - "grad_norm": 1.0246284008026123, - "learning_rate": 3.6425214614051936e-06, - "loss": 1.7952, - "step": 297600 - }, - { - "epoch": 0.02707092582566324, - "grad_norm": 1.0332282781600952, - "learning_rate": 3.631021622087297e-06, - "loss": 1.8265, - "step": 297700 - }, - { - "epoch": 0.027350007535206156, - "grad_norm": 1.0222516059875488, - "learning_rate": 3.619538542917217e-06, - "loss": 1.8215, - "step": 297800 - }, - { - "epoch": 0.027629089244749078, - "grad_norm": 1.0545893907546997, - "learning_rate": 3.608072232901377e-06, - "loss": 1.8263, - "step": 297900 - }, - { - "epoch": 0.027908170954292, - "grad_norm": 1.091201901435852, - "learning_rate": 3.596622701033048e-06, - "loss": 1.8228, - "step": 298000 - }, - { - "epoch": 0.027908170954292, - "eval_loss": 2.2129366397857666, - "eval_runtime": 51.7579, - "eval_samples_per_second": 196.955, - "eval_steps_per_second": 1.546, - "step": 298000 - }, - { - "epoch": 0.028187252663834917, - "grad_norm": 1.0261002779006958, - "learning_rate": 3.58518995629234e-06, - "loss": 1.8203, - "step": 298100 - }, - { - "epoch": 0.02846633437337784, - "grad_norm": 1.0479872226715088, - "learning_rate": 3.5737740076462106e-06, - "loss": 1.7966, - "step": 298200 - }, - { - "epoch": 0.028745416082920756, - "grad_norm": 1.036954641342163, - "learning_rate": 3.562374864048429e-06, - "loss": 1.8111, - "step": 298300 - }, - { - "epoch": 0.029024497792463678, - "grad_norm": 1.0831959247589111, - "learning_rate": 3.550992534439576e-06, - "loss": 1.7991, - "step": 298400 - }, - { - "epoch": 0.0293035795020066, - "grad_norm": 1.0515846014022827, - "learning_rate": 3.539627027747067e-06, - "loss": 1.815, - "step": 298500 - }, - { - "epoch": 0.029582661211549517, - "grad_norm": 1.0659505128860474, - "learning_rate": 3.5282783528851117e-06, - "loss": 1.8105, - "step": 298600 - }, - { - "epoch": 0.02986174292109244, - "grad_norm": 1.0669214725494385, - "learning_rate": 3.516946518754724e-06, - "loss": 1.7961, - "step": 298700 - }, - { - "epoch": 0.030140824630635357, - "grad_norm": 1.0422730445861816, - "learning_rate": 3.5056315342436945e-06, - "loss": 1.8219, - "step": 298800 - }, - { - "epoch": 0.030419906340178278, - "grad_norm": 1.0351274013519287, - "learning_rate": 3.4943334082266103e-06, - "loss": 1.8183, - "step": 298900 - }, - { - "epoch": 0.030698988049721196, - "grad_norm": 1.0675437450408936, - "learning_rate": 3.483052149564839e-06, - "loss": 1.8024, - "step": 299000 - }, - { - "epoch": 0.030698988049721196, - "eval_loss": 2.2119719982147217, - "eval_runtime": 51.8915, - "eval_samples_per_second": 196.448, - "eval_steps_per_second": 1.542, - "step": 299000 - }, - { - "epoch": 0.030978069759264117, - "grad_norm": 1.033592939376831, - "learning_rate": 3.4717877671065103e-06, - "loss": 1.8264, - "step": 299100 - }, - { - "epoch": 0.03125715146880704, - "grad_norm": 1.0776604413986206, - "learning_rate": 3.460540269686524e-06, - "loss": 1.7936, - "step": 299200 - }, - { - "epoch": 0.03153623317834996, - "grad_norm": 1.0117859840393066, - "learning_rate": 3.4493096661265267e-06, - "loss": 1.7972, - "step": 299300 - }, - { - "epoch": 0.031815314887892875, - "grad_norm": 1.0392084121704102, - "learning_rate": 3.438095965234928e-06, - "loss": 1.8174, - "step": 299400 - }, - { - "epoch": 0.0320943965974358, - "grad_norm": 1.015053153038025, - "learning_rate": 3.4268991758068745e-06, - "loss": 1.8134, - "step": 299500 - }, - { - "epoch": 0.03237347830697872, - "grad_norm": 1.012290596961975, - "learning_rate": 3.415719306624246e-06, - "loss": 1.8254, - "step": 299600 - }, - { - "epoch": 0.032652560016521635, - "grad_norm": 1.0493707656860352, - "learning_rate": 3.404556366455647e-06, - "loss": 1.8037, - "step": 299700 - }, - { - "epoch": 0.03293164172606456, - "grad_norm": 1.0958573818206787, - "learning_rate": 3.3934103640564152e-06, - "loss": 1.8072, - "step": 299800 - }, - { - "epoch": 0.03321072343560748, - "grad_norm": 1.0864017009735107, - "learning_rate": 3.382281308168603e-06, - "loss": 1.8337, - "step": 299900 - }, - { - "epoch": 0.033489805145150396, - "grad_norm": 1.0446292161941528, - "learning_rate": 3.3711692075209687e-06, - "loss": 1.8123, - "step": 300000 - }, - { - "epoch": 0.033489805145150396, - "eval_loss": 2.2072536945343018, - "eval_runtime": 51.8799, - "eval_samples_per_second": 196.492, - "eval_steps_per_second": 1.542, - "step": 300000 - }, - { - "epoch": 0.033768886854693314, - "grad_norm": 1.0435408353805542, - "learning_rate": 3.3600740708289615e-06, - "loss": 1.7973, - "step": 300100 - }, - { - "epoch": 0.03404796856423624, - "grad_norm": 1.0345299243927002, - "learning_rate": 3.348995906794741e-06, - "loss": 1.8213, - "step": 300200 - }, - { - "epoch": 0.03432705027377916, - "grad_norm": 1.037927269935608, - "learning_rate": 3.33793472410715e-06, - "loss": 1.8048, - "step": 300300 - }, - { - "epoch": 0.034606131983322075, - "grad_norm": 1.0403209924697876, - "learning_rate": 3.326890531441712e-06, - "loss": 1.8136, - "step": 300400 - }, - { - "epoch": 0.034885213692865, - "grad_norm": 1.0413801670074463, - "learning_rate": 3.31586333746062e-06, - "loss": 1.7982, - "step": 300500 - }, - { - "epoch": 0.03516429540240792, - "grad_norm": 1.0370949506759644, - "learning_rate": 3.3048531508127366e-06, - "loss": 1.7944, - "step": 300600 - }, - { - "epoch": 0.035443377111950836, - "grad_norm": 1.0159741640090942, - "learning_rate": 3.2938599801335928e-06, - "loss": 1.8001, - "step": 300700 - }, - { - "epoch": 0.035722458821493754, - "grad_norm": 1.0671415328979492, - "learning_rate": 3.282883834045372e-06, - "loss": 1.7925, - "step": 300800 - }, - { - "epoch": 0.03600154053103668, - "grad_norm": 1.0309702157974243, - "learning_rate": 3.2719247211568965e-06, - "loss": 1.8119, - "step": 300900 - }, - { - "epoch": 0.036280622240579596, - "grad_norm": 1.02182137966156, - "learning_rate": 3.2609826500636238e-06, - "loss": 1.8186, - "step": 301000 - }, - { - "epoch": 0.036280622240579596, - "eval_loss": 2.212273597717285, - "eval_runtime": 51.7058, - "eval_samples_per_second": 197.154, - "eval_steps_per_second": 1.547, - "step": 301000 - }, - { - "epoch": 0.036559703950122514, - "grad_norm": 1.0467265844345093, - "learning_rate": 3.2500576293476638e-06, - "loss": 1.8002, - "step": 301100 - }, - { - "epoch": 0.03683878565966544, - "grad_norm": 1.0487096309661865, - "learning_rate": 3.2391496675777484e-06, - "loss": 1.7995, - "step": 301200 - }, - { - "epoch": 0.03711786736920836, - "grad_norm": 1.0497572422027588, - "learning_rate": 3.2282587733092173e-06, - "loss": 1.8021, - "step": 301300 - }, - { - "epoch": 0.037396949078751275, - "grad_norm": 1.0201036930084229, - "learning_rate": 3.217384955084035e-06, - "loss": 1.809, - "step": 301400 - }, - { - "epoch": 0.0376760307882942, - "grad_norm": 1.0497982501983643, - "learning_rate": 3.2065282214307712e-06, - "loss": 1.8115, - "step": 301500 - }, - { - "epoch": 0.03795511249783712, - "grad_norm": 1.0479981899261475, - "learning_rate": 3.1956885808646002e-06, - "loss": 1.805, - "step": 301600 - }, - { - "epoch": 0.038234194207380036, - "grad_norm": 1.0764997005462646, - "learning_rate": 3.1848660418872744e-06, - "loss": 1.8092, - "step": 301700 - }, - { - "epoch": 0.038513275916922954, - "grad_norm": 1.046151876449585, - "learning_rate": 3.174060612987148e-06, - "loss": 1.8185, - "step": 301800 - }, - { - "epoch": 0.03879235762646588, - "grad_norm": 1.0357836484909058, - "learning_rate": 3.1632723026391503e-06, - "loss": 1.8195, - "step": 301900 - }, - { - "epoch": 0.0390714393360088, - "grad_norm": 1.089996099472046, - "learning_rate": 3.1525011193047847e-06, - "loss": 1.7961, - "step": 302000 - }, - { - "epoch": 0.0390714393360088, - "eval_loss": 2.2081243991851807, - "eval_runtime": 51.8009, - "eval_samples_per_second": 196.792, - "eval_steps_per_second": 1.544, - "step": 302000 - }, - { - "epoch": 0.039350521045551715, - "grad_norm": 1.045300006866455, - "learning_rate": 3.1417470714321275e-06, - "loss": 1.8065, - "step": 302100 - }, - { - "epoch": 0.03962960275509464, - "grad_norm": 1.0354883670806885, - "learning_rate": 3.1310101674558e-06, - "loss": 1.795, - "step": 302200 - }, - { - "epoch": 0.03990868446463756, - "grad_norm": 1.07806396484375, - "learning_rate": 3.1202904157969865e-06, - "loss": 1.7949, - "step": 302300 - }, - { - "epoch": 0.040187766174180475, - "grad_norm": 1.0537368059158325, - "learning_rate": 3.1095878248634164e-06, - "loss": 1.8252, - "step": 302400 - }, - { - "epoch": 0.04046684788372339, - "grad_norm": 1.066607117652893, - "learning_rate": 3.0989024030493723e-06, - "loss": 1.7998, - "step": 302500 - }, - { - "epoch": 0.04074592959326632, - "grad_norm": 1.0885719060897827, - "learning_rate": 3.0882341587356476e-06, - "loss": 1.8006, - "step": 302600 - }, - { - "epoch": 0.041025011302809236, - "grad_norm": 1.0701121091842651, - "learning_rate": 3.0775831002895774e-06, - "loss": 1.8307, - "step": 302700 - }, - { - "epoch": 0.041304093012352154, - "grad_norm": 1.045860767364502, - "learning_rate": 3.0669492360650196e-06, - "loss": 1.8094, - "step": 302800 - }, - { - "epoch": 0.04158317472189508, - "grad_norm": 1.0620360374450684, - "learning_rate": 3.056332574402346e-06, - "loss": 1.8162, - "step": 302900 - }, - { - "epoch": 0.041862256431438, - "grad_norm": 1.0758084058761597, - "learning_rate": 3.0457331236284166e-06, - "loss": 1.7981, - "step": 303000 - }, - { - "epoch": 0.041862256431438, - "eval_loss": 2.2137293815612793, - "eval_runtime": 51.9224, - "eval_samples_per_second": 196.331, - "eval_steps_per_second": 1.541, - "step": 303000 - }, - { - "epoch": 0.0002, - "grad_norm": 1.0506842136383057, - "learning_rate": 1.7259637505723265e-05, - "loss": 1.7958, - "step": 303100 - }, - { - "epoch": 0.0004, - "grad_norm": 1.034621000289917, - "learning_rate": 1.7244552087867325e-05, - "loss": 1.8261, - "step": 303200 - }, - { - "epoch": 0.0006, - "grad_norm": 1.125115156173706, - "learning_rate": 1.7229469793904873e-05, - "loss": 1.8172, - "step": 303300 - }, - { - "epoch": 0.0008, - "grad_norm": 1.0532312393188477, - "learning_rate": 1.7214390629911066e-05, - "loss": 1.8165, - "step": 303400 - }, - { - "epoch": 0.001, - "grad_norm": 1.0483386516571045, - "learning_rate": 1.7199314601959778e-05, - "loss": 1.8275, - "step": 303500 - }, - { - "epoch": 0.0012, - "grad_norm": 1.0204639434814453, - "learning_rate": 1.7184241716123635e-05, - "loss": 1.816, - "step": 303600 - }, - { - "epoch": 0.0014, - "grad_norm": 1.069264531135559, - "learning_rate": 1.7169171978473994e-05, - "loss": 1.8174, - "step": 303700 - }, - { - "epoch": 0.0016, - "grad_norm": 1.0591576099395752, - "learning_rate": 1.715410539508095e-05, - "loss": 1.8284, - "step": 303800 - }, - { - "epoch": 0.0018, - "grad_norm": 1.086665391921997, - "learning_rate": 1.7139041972013304e-05, - "loss": 1.8279, - "step": 303900 - }, - { - "epoch": 0.002, - "grad_norm": 1.094480037689209, - "learning_rate": 1.712398171533862e-05, - "loss": 1.8144, - "step": 304000 - }, - { - "epoch": 0.002, - "eval_loss": 2.229548931121826, - "eval_runtime": 52.0986, - "eval_samples_per_second": 195.667, - "eval_steps_per_second": 1.536, - "step": 304000 - }, - { - "epoch": 0.0022, - "grad_norm": 1.0918675661087036, - "learning_rate": 1.710892463112316e-05, - "loss": 1.8137, - "step": 304100 - }, - { - "epoch": 0.0024, - "grad_norm": 1.0117205381393433, - "learning_rate": 1.709387072543191e-05, - "loss": 1.8065, - "step": 304200 - }, - { - "epoch": 0.0026, - "grad_norm": 1.1326615810394287, - "learning_rate": 1.7078820004328587e-05, - "loss": 1.814, - "step": 304300 - }, - { - "epoch": 0.0028, - "grad_norm": 1.048861026763916, - "learning_rate": 1.7063772473875616e-05, - "loss": 1.8061, - "step": 304400 - }, - { - "epoch": 0.003, - "grad_norm": 1.0585246086120605, - "learning_rate": 1.7048728140134152e-05, - "loss": 1.8293, - "step": 304500 - }, - { - "epoch": 0.0032, - "grad_norm": 1.0181670188903809, - "learning_rate": 1.7033687009164033e-05, - "loss": 1.8163, - "step": 304600 - }, - { - "epoch": 0.0034, - "grad_norm": 1.0025696754455566, - "learning_rate": 1.701864908702384e-05, - "loss": 1.8058, - "step": 304700 - }, - { - "epoch": 0.0036, - "grad_norm": 1.0825532674789429, - "learning_rate": 1.700361437977084e-05, - "loss": 1.818, - "step": 304800 - }, - { - "epoch": 0.0038, - "grad_norm": 1.0389013290405273, - "learning_rate": 1.6988582893461008e-05, - "loss": 1.8142, - "step": 304900 - }, - { - "epoch": 0.004, - "grad_norm": 1.0209424495697021, - "learning_rate": 1.697355463414903e-05, - "loss": 1.8103, - "step": 305000 - }, - { - "epoch": 0.004, - "eval_loss": 2.230199098587036, - "eval_runtime": 51.6157, - "eval_samples_per_second": 197.498, - "eval_steps_per_second": 1.55, - "step": 305000 - }, - { - "epoch": 0.0042, - "grad_norm": 1.085379958152771, - "learning_rate": 1.695852960788829e-05, - "loss": 1.8192, - "step": 305100 - }, - { - "epoch": 0.0044, - "grad_norm": 1.0386351346969604, - "learning_rate": 1.6943507820730854e-05, - "loss": 1.8061, - "step": 305200 - }, - { - "epoch": 0.0046, - "grad_norm": 1.0565484762191772, - "learning_rate": 1.692848927872751e-05, - "loss": 1.8081, - "step": 305300 - }, - { - "epoch": 0.0048, - "grad_norm": 1.0819813013076782, - "learning_rate": 1.6913473987927713e-05, - "loss": 1.8158, - "step": 305400 - }, - { - "epoch": 0.005, - "grad_norm": 1.0319418907165527, - "learning_rate": 1.6898461954379636e-05, - "loss": 1.7954, - "step": 305500 - }, - { - "epoch": 0.0052, - "grad_norm": 1.0530176162719727, - "learning_rate": 1.6883453184130116e-05, - "loss": 1.8046, - "step": 305600 - }, - { - "epoch": 0.0054, - "grad_norm": 1.0865267515182495, - "learning_rate": 1.686844768322467e-05, - "loss": 1.7917, - "step": 305700 - }, - { - "epoch": 0.0056, - "grad_norm": 1.027178406715393, - "learning_rate": 1.6853445457707538e-05, - "loss": 1.7988, - "step": 305800 - }, - { - "epoch": 0.0058, - "grad_norm": 1.0627230405807495, - "learning_rate": 1.6838446513621593e-05, - "loss": 1.7954, - "step": 305900 - }, - { - "epoch": 0.006, - "grad_norm": 1.059670329093933, - "learning_rate": 1.6823450857008423e-05, - "loss": 1.7974, - "step": 306000 - }, - { - "epoch": 0.006, - "eval_loss": 2.230013608932495, - "eval_runtime": 51.6205, - "eval_samples_per_second": 197.48, - "eval_steps_per_second": 1.55, - "step": 306000 - }, - { - "epoch": 0.0062, - "grad_norm": 1.0403988361358643, - "learning_rate": 1.6808458493908258e-05, - "loss": 1.7976, - "step": 306100 - }, - { - "epoch": 0.0064, - "grad_norm": 1.0143063068389893, - "learning_rate": 1.6793469430360042e-05, - "loss": 1.7949, - "step": 306200 - }, - { - "epoch": 0.0066, - "grad_norm": 1.1919389963150024, - "learning_rate": 1.6778483672401356e-05, - "loss": 1.8018, - "step": 306300 - }, - { - "epoch": 0.0068, - "grad_norm": 1.06490957736969, - "learning_rate": 1.6763501226068465e-05, - "loss": 1.8087, - "step": 306400 - }, - { - "epoch": 0.007, - "grad_norm": 1.0884573459625244, - "learning_rate": 1.674852209739629e-05, - "loss": 1.8177, - "step": 306500 - }, - { - "epoch": 0.0072, - "grad_norm": 1.0523546934127808, - "learning_rate": 1.6733546292418434e-05, - "loss": 1.7789, - "step": 306600 - }, - { - "epoch": 0.0074, - "grad_norm": 1.0929498672485352, - "learning_rate": 1.6718573817167137e-05, - "loss": 1.8022, - "step": 306700 - }, - { - "epoch": 0.0076, - "grad_norm": 1.0335514545440674, - "learning_rate": 1.6703604677673322e-05, - "loss": 1.7912, - "step": 306800 - }, - { - "epoch": 0.0078, - "grad_norm": 1.0258134603500366, - "learning_rate": 1.6688638879966546e-05, - "loss": 1.7952, - "step": 306900 - }, - { - "epoch": 0.008, - "grad_norm": 1.0420570373535156, - "learning_rate": 1.6673676430075036e-05, - "loss": 1.7981, - "step": 307000 - }, - { - "epoch": 0.008, - "eval_loss": 2.228384256362915, - "eval_runtime": 51.6428, - "eval_samples_per_second": 197.395, - "eval_steps_per_second": 1.549, - "step": 307000 - }, - { - "epoch": 0.0082, - "grad_norm": 1.065299391746521, - "learning_rate": 1.6658717334025664e-05, - "loss": 1.8051, - "step": 307100 - }, - { - "epoch": 0.0084, - "grad_norm": 1.015187382698059, - "learning_rate": 1.6643761597843953e-05, - "loss": 1.8016, - "step": 307200 - }, - { - "epoch": 0.0086, - "grad_norm": 1.047338843345642, - "learning_rate": 1.6628809227554077e-05, - "loss": 1.7974, - "step": 307300 - }, - { - "epoch": 0.0088, - "grad_norm": 1.0116043090820312, - "learning_rate": 1.6613860229178836e-05, - "loss": 1.793, - "step": 307400 - }, - { - "epoch": 0.009, - "grad_norm": 1.0261743068695068, - "learning_rate": 1.6598914608739695e-05, - "loss": 1.789, - "step": 307500 - }, - { - "epoch": 0.0092, - "grad_norm": 1.0221142768859863, - "learning_rate": 1.658397237225674e-05, - "loss": 1.7865, - "step": 307600 - }, - { - "epoch": 0.0094, - "grad_norm": 1.050794005393982, - "learning_rate": 1.6569033525748712e-05, - "loss": 1.7725, - "step": 307700 - }, - { - "epoch": 0.0096, - "grad_norm": 1.1043586730957031, - "learning_rate": 1.6554098075232967e-05, - "loss": 1.7772, - "step": 307800 - }, - { - "epoch": 0.0098, - "grad_norm": 1.0293883085250854, - "learning_rate": 1.6539166026725515e-05, - "loss": 1.8076, - "step": 307900 - }, - { - "epoch": 0.01, - "grad_norm": 1.0498391389846802, - "learning_rate": 1.6524237386240964e-05, - "loss": 1.7978, - "step": 308000 - }, - { - "epoch": 0.01, - "eval_loss": 2.2343056201934814, - "eval_runtime": 51.6864, - "eval_samples_per_second": 197.228, - "eval_steps_per_second": 1.548, - "step": 308000 - }, - { - "epoch": 0.0102, - "grad_norm": 1.0795516967773438, - "learning_rate": 1.6509312159792594e-05, - "loss": 1.8164, - "step": 308100 - }, - { - "epoch": 0.0104, - "grad_norm": 1.0950493812561035, - "learning_rate": 1.6494390353392258e-05, - "loss": 1.7901, - "step": 308200 - }, - { - "epoch": 0.0106, - "grad_norm": 1.0679363012313843, - "learning_rate": 1.6479471973050482e-05, - "loss": 1.8094, - "step": 308300 - }, - { - "epoch": 0.0108, - "grad_norm": 1.0638396739959717, - "learning_rate": 1.6464557024776365e-05, - "loss": 1.7981, - "step": 308400 - }, - { - "epoch": 0.011, - "grad_norm": 1.0541220903396606, - "learning_rate": 1.6449645514577668e-05, - "loss": 1.7955, - "step": 308500 - }, - { - "epoch": 0.0112, - "grad_norm": 1.0506057739257812, - "learning_rate": 1.6434737448460725e-05, - "loss": 1.7793, - "step": 308600 - }, - { - "epoch": 0.0114, - "grad_norm": 1.0873394012451172, - "learning_rate": 1.6419832832430522e-05, - "loss": 1.7941, - "step": 308700 - }, - { - "epoch": 0.0116, - "grad_norm": 1.0632107257843018, - "learning_rate": 1.6404931672490625e-05, - "loss": 1.7861, - "step": 308800 - }, - { - "epoch": 0.0118, - "grad_norm": 1.1098015308380127, - "learning_rate": 1.6390033974643222e-05, - "loss": 1.7709, - "step": 308900 - }, - { - "epoch": 0.012, - "grad_norm": 1.046675682067871, - "learning_rate": 1.6375139744889107e-05, - "loss": 1.7811, - "step": 309000 - }, - { - "epoch": 0.012, - "eval_loss": 2.2420222759246826, - "eval_runtime": 51.584, - "eval_samples_per_second": 197.619, - "eval_steps_per_second": 1.551, - "step": 309000 - }, - { - "epoch": 0.0122, - "grad_norm": 1.047875165939331, - "learning_rate": 1.6360248989227666e-05, - "loss": 1.7818, - "step": 309100 - }, - { - "epoch": 0.0124, - "grad_norm": 1.0503313541412354, - "learning_rate": 1.6345361713656904e-05, - "loss": 1.7718, - "step": 309200 - }, - { - "epoch": 0.0126, - "grad_norm": 1.046026587486267, - "learning_rate": 1.6330477924173403e-05, - "loss": 1.7518, - "step": 309300 - }, - { - "epoch": 0.0128, - "grad_norm": 1.0461571216583252, - "learning_rate": 1.6315597626772365e-05, - "loss": 1.7751, - "step": 309400 - }, - { - "epoch": 0.013, - "grad_norm": 1.0191349983215332, - "learning_rate": 1.6300720827447556e-05, - "loss": 1.7724, - "step": 309500 - }, - { - "epoch": 0.0132, - "grad_norm": 1.0420078039169312, - "learning_rate": 1.6285847532191364e-05, - "loss": 1.7394, - "step": 309600 - }, - { - "epoch": 0.0134, - "grad_norm": 1.0415441989898682, - "learning_rate": 1.627097774699474e-05, - "loss": 1.7405, - "step": 309700 - }, - { - "epoch": 0.0136, - "grad_norm": 1.0861761569976807, - "learning_rate": 1.625611147784724e-05, - "loss": 1.7572, - "step": 309800 - }, - { - "epoch": 0.0138, - "grad_norm": 1.042179822921753, - "learning_rate": 1.6241248730736985e-05, - "loss": 1.7634, - "step": 309900 - }, - { - "epoch": 0.014, - "grad_norm": 1.0887514352798462, - "learning_rate": 1.6226389511650697e-05, - "loss": 1.7487, - "step": 310000 - }, - { - "epoch": 0.014, - "eval_loss": 2.244732618331909, - "eval_runtime": 51.6991, - "eval_samples_per_second": 197.18, - "eval_steps_per_second": 1.547, - "step": 310000 - }, - { - "epoch": 0.0142, - "grad_norm": 1.0510177612304688, - "learning_rate": 1.6211533826573662e-05, - "loss": 1.7426, - "step": 310100 - }, - { - "epoch": 0.0144, - "grad_norm": 0.9902233481407166, - "learning_rate": 1.6196681681489755e-05, - "loss": 1.7452, - "step": 310200 - }, - { - "epoch": 0.0146, - "grad_norm": 1.0358948707580566, - "learning_rate": 1.6181833082381413e-05, - "loss": 1.7292, - "step": 310300 - }, - { - "epoch": 0.0148, - "grad_norm": 1.0080764293670654, - "learning_rate": 1.6166988035229652e-05, - "loss": 1.7368, - "step": 310400 - }, - { - "epoch": 0.015, - "grad_norm": 1.0920326709747314, - "learning_rate": 1.6152146546014053e-05, - "loss": 1.7186, - "step": 310500 - }, - { - "epoch": 0.0152, - "grad_norm": 1.0890278816223145, - "learning_rate": 1.6137308620712765e-05, - "loss": 1.7179, - "step": 310600 - }, - { - "epoch": 0.0154, - "grad_norm": 1.0208715200424194, - "learning_rate": 1.612247426530251e-05, - "loss": 1.744, - "step": 310700 - }, - { - "epoch": 0.0156, - "grad_norm": 0.9500866532325745, - "learning_rate": 1.610764348575856e-05, - "loss": 1.6606, - "step": 310800 - }, - { - "epoch": 0.0158, - "grad_norm": 0.9557023048400879, - "learning_rate": 1.6092816288054746e-05, - "loss": 1.4109, - "step": 310900 - }, - { - "epoch": 0.016, - "grad_norm": 0.9185681343078613, - "learning_rate": 1.6077992678163467e-05, - "loss": 1.3687, - "step": 311000 - }, - { - "epoch": 0.016, - "eval_loss": 2.290562629699707, - "eval_runtime": 51.8115, - "eval_samples_per_second": 196.752, - "eval_steps_per_second": 1.544, - "step": 311000 - }, - { - "epoch": 0.0162, - "grad_norm": 0.9145857691764832, - "learning_rate": 1.6063172662055665e-05, - "loss": 1.3382, - "step": 311100 - }, - { - "epoch": 0.0164, - "grad_norm": 0.9351733922958374, - "learning_rate": 1.6048356245700856e-05, - "loss": 1.3208, - "step": 311200 - }, - { - "epoch": 0.0166, - "grad_norm": 0.9079789519309998, - "learning_rate": 1.603354343506707e-05, - "loss": 1.2985, - "step": 311300 - }, - { - "epoch": 0.0168, - "grad_norm": 1.0671257972717285, - "learning_rate": 1.6018734236120926e-05, - "loss": 1.3041, - "step": 311400 - }, - { - "epoch": 0.017, - "grad_norm": 0.8702555894851685, - "learning_rate": 1.600392865482755e-05, - "loss": 1.278, - "step": 311500 - }, - { - "epoch": 0.0172, - "grad_norm": 0.9119271039962769, - "learning_rate": 1.598912669715064e-05, - "loss": 1.2662, - "step": 311600 - }, - { - "epoch": 0.0174, - "grad_norm": 0.8721778988838196, - "learning_rate": 1.5974328369052415e-05, - "loss": 1.2713, - "step": 311700 - }, - { - "epoch": 0.0176, - "grad_norm": 0.9360621571540833, - "learning_rate": 1.5959533676493647e-05, - "loss": 1.2523, - "step": 311800 - }, - { - "epoch": 0.0178, - "grad_norm": 0.9057286381721497, - "learning_rate": 1.5944742625433633e-05, - "loss": 1.2308, - "step": 311900 - }, - { - "epoch": 0.018, - "grad_norm": 0.874999463558197, - "learning_rate": 1.5929955221830202e-05, - "loss": 1.2274, - "step": 312000 - }, - { - "epoch": 0.018, - "eval_loss": 2.374765634536743, - "eval_runtime": 51.8773, - "eval_samples_per_second": 196.502, - "eval_steps_per_second": 1.542, - "step": 312000 - }, - { - "epoch": 0.0182, - "grad_norm": 0.9309009313583374, - "learning_rate": 1.591517147163973e-05, - "loss": 1.224, - "step": 312100 - }, - { - "epoch": 0.0184, - "grad_norm": 0.8504728674888611, - "learning_rate": 1.59003913808171e-05, - "loss": 1.2055, - "step": 312200 - }, - { - "epoch": 0.0186, - "grad_norm": 0.9231265783309937, - "learning_rate": 1.588561495531573e-05, - "loss": 1.2074, - "step": 312300 - }, - { - "epoch": 0.0188, - "grad_norm": 0.9524025321006775, - "learning_rate": 1.587084220108757e-05, - "loss": 1.1945, - "step": 312400 - }, - { - "epoch": 0.019, - "grad_norm": 0.8538132309913635, - "learning_rate": 1.585607312408308e-05, - "loss": 1.202, - "step": 312500 - }, - { - "epoch": 0.0192, - "grad_norm": 1.1738858222961426, - "learning_rate": 1.5841307730251237e-05, - "loss": 1.1787, - "step": 312600 - }, - { - "epoch": 0.0194, - "grad_norm": 0.9254825711250305, - "learning_rate": 1.5826546025539552e-05, - "loss": 1.1737, - "step": 312700 - }, - { - "epoch": 0.0196, - "grad_norm": 0.8884557485580444, - "learning_rate": 1.5811788015894025e-05, - "loss": 1.1715, - "step": 312800 - }, - { - "epoch": 0.0198, - "grad_norm": 0.8768421411514282, - "learning_rate": 1.579703370725919e-05, - "loss": 1.1701, - "step": 312900 - }, - { - "epoch": 0.02, - "grad_norm": 0.9079160690307617, - "learning_rate": 1.5782283105578076e-05, - "loss": 1.1533, - "step": 313000 - }, - { - "epoch": 0.02, - "eval_loss": 2.40177059173584, - "eval_runtime": 52.039, - "eval_samples_per_second": 195.892, - "eval_steps_per_second": 1.537, - "step": 313000 - }, - { - "epoch": 0.0202, - "grad_norm": 0.8182185888290405, - "learning_rate": 1.5767536216792224e-05, - "loss": 1.1693, - "step": 313100 - }, - { - "epoch": 0.0204, - "grad_norm": 0.8927570581436157, - "learning_rate": 1.575279304684168e-05, - "loss": 1.1373, - "step": 313200 - }, - { - "epoch": 0.0206, - "grad_norm": 0.881147027015686, - "learning_rate": 1.573805360166499e-05, - "loss": 1.1504, - "step": 313300 - }, - { - "epoch": 0.0208, - "grad_norm": 0.8833571672439575, - "learning_rate": 1.572331788719921e-05, - "loss": 1.1405, - "step": 313400 - }, - { - "epoch": 0.021, - "grad_norm": 0.8598502278327942, - "learning_rate": 1.5708585909379864e-05, - "loss": 1.1365, - "step": 313500 - }, - { - "epoch": 0.0212, - "grad_norm": 0.8392401337623596, - "learning_rate": 1.5693857674141012e-05, - "loss": 1.1331, - "step": 313600 - }, - { - "epoch": 0.0214, - "grad_norm": 0.8870404958724976, - "learning_rate": 1.5679133187415168e-05, - "loss": 1.115, - "step": 313700 - }, - { - "epoch": 0.0216, - "grad_norm": 0.9391428232192993, - "learning_rate": 1.566441245513337e-05, - "loss": 1.1178, - "step": 313800 - }, - { - "epoch": 0.0218, - "grad_norm": 0.8332740664482117, - "learning_rate": 1.5649695483225107e-05, - "loss": 1.1335, - "step": 313900 - }, - { - "epoch": 0.022, - "grad_norm": 0.8561016917228699, - "learning_rate": 1.5634982277618392e-05, - "loss": 1.126, - "step": 314000 - }, - { - "epoch": 0.022, - "eval_loss": 2.42391037940979, - "eval_runtime": 51.9237, - "eval_samples_per_second": 196.327, - "eval_steps_per_second": 1.541, - "step": 314000 - }, - { - "epoch": 0.0222, - "grad_norm": 0.8221678137779236, - "learning_rate": 1.5620272844239697e-05, - "loss": 1.1344, - "step": 314100 - }, - { - "epoch": 0.0224, - "grad_norm": 0.865084707736969, - "learning_rate": 1.5605567189013977e-05, - "loss": 1.1195, - "step": 314200 - }, - { - "epoch": 0.0226, - "grad_norm": 0.8354145288467407, - "learning_rate": 1.5590865317864666e-05, - "loss": 1.1236, - "step": 314300 - }, - { - "epoch": 0.0228, - "grad_norm": 0.8688293099403381, - "learning_rate": 1.557616723671369e-05, - "loss": 1.1169, - "step": 314400 - }, - { - "epoch": 0.023, - "grad_norm": 0.8651818037033081, - "learning_rate": 1.5561472951481414e-05, - "loss": 1.1099, - "step": 314500 - }, - { - "epoch": 0.0232, - "grad_norm": 0.8726403713226318, - "learning_rate": 1.5546782468086706e-05, - "loss": 1.1284, - "step": 314600 - }, - { - "epoch": 0.0234, - "grad_norm": 0.8787026405334473, - "learning_rate": 1.5532095792446894e-05, - "loss": 1.1046, - "step": 314700 - }, - { - "epoch": 0.0236, - "grad_norm": 0.8764083981513977, - "learning_rate": 1.5517412930477762e-05, - "loss": 1.0929, - "step": 314800 - }, - { - "epoch": 0.0238, - "grad_norm": 0.8777551651000977, - "learning_rate": 1.5502733888093564e-05, - "loss": 1.1143, - "step": 314900 - }, - { - "epoch": 0.024, - "grad_norm": 0.8219897150993347, - "learning_rate": 1.5488058671207027e-05, - "loss": 1.0936, - "step": 315000 - }, - { - "epoch": 0.024, - "eval_loss": 2.4566423892974854, - "eval_runtime": 52.0959, - "eval_samples_per_second": 195.678, - "eval_steps_per_second": 1.536, - "step": 315000 - }, - { - "epoch": 0.0242, - "grad_norm": 0.8803728818893433, - "learning_rate": 1.5473387285729317e-05, - "loss": 1.1068, - "step": 315100 - }, - { - "epoch": 0.0244, - "grad_norm": 0.9315307140350342, - "learning_rate": 1.5458719737570067e-05, - "loss": 1.0864, - "step": 315200 - }, - { - "epoch": 0.0246, - "grad_norm": 0.9067742824554443, - "learning_rate": 1.544405603263737e-05, - "loss": 1.0905, - "step": 315300 - }, - { - "epoch": 0.0248, - "grad_norm": 0.836283802986145, - "learning_rate": 1.5429396176837756e-05, - "loss": 1.0925, - "step": 315400 - }, - { - "epoch": 0.025, - "grad_norm": 0.8385760188102722, - "learning_rate": 1.541474017607622e-05, - "loss": 1.0998, - "step": 315500 - }, - { - "epoch": 0.0252, - "grad_norm": 0.820689857006073, - "learning_rate": 1.5400088036256187e-05, - "loss": 1.0826, - "step": 315600 - }, - { - "epoch": 0.0254, - "grad_norm": 0.8749442100524902, - "learning_rate": 1.5385439763279556e-05, - "loss": 1.0923, - "step": 315700 - }, - { - "epoch": 0.0256, - "grad_norm": 0.8703187704086304, - "learning_rate": 1.537079536304663e-05, - "loss": 1.0874, - "step": 315800 - }, - { - "epoch": 0.0258, - "grad_norm": 0.8370440006256104, - "learning_rate": 1.535615484145619e-05, - "loss": 1.0905, - "step": 315900 - }, - { - "epoch": 0.026, - "grad_norm": 0.8787978887557983, - "learning_rate": 1.5341518204405416e-05, - "loss": 1.0855, - "step": 316000 - }, - { - "epoch": 0.026, - "eval_loss": 2.462463617324829, - "eval_runtime": 51.7904, - "eval_samples_per_second": 196.832, - "eval_steps_per_second": 1.545, - "step": 316000 - }, - { - "epoch": 0.0262, - "grad_norm": 0.8166690468788147, - "learning_rate": 1.5326885457789964e-05, - "loss": 1.0895, - "step": 316100 - }, - { - "epoch": 0.0264, - "grad_norm": 0.8346712589263916, - "learning_rate": 1.5312256607503884e-05, - "loss": 1.0795, - "step": 316200 - }, - { - "epoch": 0.0266, - "grad_norm": 0.8622503876686096, - "learning_rate": 1.529763165943969e-05, - "loss": 1.0682, - "step": 316300 - }, - { - "epoch": 0.0268, - "grad_norm": 0.8298165798187256, - "learning_rate": 1.5283010619488296e-05, - "loss": 1.077, - "step": 316400 - }, - { - "epoch": 0.027, - "grad_norm": 0.8516880869865417, - "learning_rate": 1.5268393493539073e-05, - "loss": 1.0686, - "step": 316500 - }, - { - "epoch": 0.0272, - "grad_norm": 0.8550381660461426, - "learning_rate": 1.5253780287479785e-05, - "loss": 1.0696, - "step": 316600 - }, - { - "epoch": 0.0274, - "grad_norm": 0.821546733379364, - "learning_rate": 1.5239171007196623e-05, - "loss": 1.0689, - "step": 316700 - }, - { - "epoch": 0.0276, - "grad_norm": 0.8041675686836243, - "learning_rate": 1.522456565857422e-05, - "loss": 1.0649, - "step": 316800 - }, - { - "epoch": 0.0278, - "grad_norm": 0.9088461995124817, - "learning_rate": 1.5209964247495595e-05, - "loss": 1.0751, - "step": 316900 - }, - { - "epoch": 0.028, - "grad_norm": 0.8547507524490356, - "learning_rate": 1.5195366779842207e-05, - "loss": 1.0798, - "step": 317000 - }, - { - "epoch": 0.028, - "eval_loss": 2.4812233448028564, - "eval_runtime": 52.0302, - "eval_samples_per_second": 195.925, - "eval_steps_per_second": 1.538, - "step": 317000 - }, - { - "epoch": 0.0282, - "grad_norm": 0.8872113823890686, - "learning_rate": 1.5180773261493902e-05, - "loss": 1.0652, - "step": 317100 - }, - { - "epoch": 0.0284, - "grad_norm": 0.984126091003418, - "learning_rate": 1.5166183698328957e-05, - "loss": 1.0654, - "step": 317200 - }, - { - "epoch": 0.0286, - "grad_norm": 0.8874821066856384, - "learning_rate": 1.5151598096224037e-05, - "loss": 1.0571, - "step": 317300 - }, - { - "epoch": 0.0288, - "grad_norm": 0.8837223649024963, - "learning_rate": 1.5137016461054233e-05, - "loss": 1.066, - "step": 317400 - }, - { - "epoch": 0.029, - "grad_norm": 0.879486083984375, - "learning_rate": 1.512243879869301e-05, - "loss": 1.0572, - "step": 317500 - }, - { - "epoch": 0.0292, - "grad_norm": 0.8751283884048462, - "learning_rate": 1.5107865115012265e-05, - "loss": 1.0552, - "step": 317600 - }, - { - "epoch": 0.0294, - "grad_norm": 0.8803706765174866, - "learning_rate": 1.5093295415882267e-05, - "loss": 1.0499, - "step": 317700 - }, - { - "epoch": 0.0296, - "grad_norm": 0.8694496750831604, - "learning_rate": 1.507872970717169e-05, - "loss": 1.0608, - "step": 317800 - }, - { - "epoch": 0.0298, - "grad_norm": 0.8200892806053162, - "learning_rate": 1.5064167994747603e-05, - "loss": 1.0415, - "step": 317900 - }, - { - "epoch": 0.03, - "grad_norm": 0.8422415256500244, - "learning_rate": 1.5049610284475458e-05, - "loss": 1.0487, - "step": 318000 - }, - { - "epoch": 0.03, - "eval_loss": 2.492359161376953, - "eval_runtime": 51.9706, - "eval_samples_per_second": 196.149, - "eval_steps_per_second": 1.539, - "step": 318000 - }, - { - "epoch": 0.0002, - "grad_norm": 0.8418950438499451, - "learning_rate": 1.5035056582219098e-05, - "loss": 1.0456, - "step": 318100 - }, - { - "epoch": 0.0004, - "grad_norm": 0.8390074968338013, - "learning_rate": 1.5020506893840758e-05, - "loss": 1.0318, - "step": 318200 - }, - { - "epoch": 0.0006, - "grad_norm": 0.8178459405899048, - "learning_rate": 1.5005961225201048e-05, - "loss": 1.0373, - "step": 318300 - }, - { - "epoch": 0.0008, - "grad_norm": 0.8252522349357605, - "learning_rate": 1.4991419582158959e-05, - "loss": 1.0267, - "step": 318400 - }, - { - "epoch": 0.001, - "grad_norm": 0.8596453070640564, - "learning_rate": 1.4976881970571868e-05, - "loss": 1.045, - "step": 318500 - }, - { - "epoch": 0.0012, - "grad_norm": 0.9191332459449768, - "learning_rate": 1.4962348396295517e-05, - "loss": 1.0201, - "step": 318600 - }, - { - "epoch": 0.0014, - "grad_norm": 0.8910384774208069, - "learning_rate": 1.4947818865184035e-05, - "loss": 1.0176, - "step": 318700 - }, - { - "epoch": 0.0016, - "grad_norm": 0.8146995902061462, - "learning_rate": 1.4933293383089908e-05, - "loss": 1.0263, - "step": 318800 - }, - { - "epoch": 0.0018, - "grad_norm": 0.8134050965309143, - "learning_rate": 1.4918771955864009e-05, - "loss": 1.0085, - "step": 318900 - }, - { - "epoch": 0.002, - "grad_norm": 0.8413226008415222, - "learning_rate": 1.4904254589355555e-05, - "loss": 1.0336, - "step": 319000 - }, - { - "epoch": 0.002, - "eval_loss": 2.506340742111206, - "eval_runtime": 52.1719, - "eval_samples_per_second": 195.393, - "eval_steps_per_second": 1.533, - "step": 319000 - }, - { - "epoch": 0.0022, - "grad_norm": 0.8138185739517212, - "learning_rate": 1.4889741289412145e-05, - "loss": 1.023, - "step": 319100 - }, - { - "epoch": 0.0024, - "grad_norm": 0.8572819232940674, - "learning_rate": 1.4875232061879735e-05, - "loss": 1.0055, - "step": 319200 - }, - { - "epoch": 0.0026, - "grad_norm": 0.8657738566398621, - "learning_rate": 1.4860726912602643e-05, - "loss": 1.009, - "step": 319300 - }, - { - "epoch": 0.0028, - "grad_norm": 0.8982349634170532, - "learning_rate": 1.4846225847423545e-05, - "loss": 1.021, - "step": 319400 - }, - { - "epoch": 0.003, - "grad_norm": 0.8425928354263306, - "learning_rate": 1.4831728872183448e-05, - "loss": 1.0206, - "step": 319500 - }, - { - "epoch": 0.0032, - "grad_norm": 0.8392213582992554, - "learning_rate": 1.481723599272175e-05, - "loss": 1.0088, - "step": 319600 - }, - { - "epoch": 0.0034, - "grad_norm": 0.8505594730377197, - "learning_rate": 1.480274721487618e-05, - "loss": 0.9964, - "step": 319700 - }, - { - "epoch": 0.0036, - "grad_norm": 0.7965133190155029, - "learning_rate": 1.4788262544482805e-05, - "loss": 1.0288, - "step": 319800 - }, - { - "epoch": 0.0038, - "grad_norm": 0.8193480372428894, - "learning_rate": 1.4773781987376061e-05, - "loss": 0.9985, - "step": 319900 - }, - { - "epoch": 0.004, - "grad_norm": 0.8430262207984924, - "learning_rate": 1.4759305549388708e-05, - "loss": 1.0053, - "step": 320000 - }, - { - "epoch": 0.004, - "eval_loss": 2.515505790710449, - "eval_runtime": 51.658, - "eval_samples_per_second": 197.337, - "eval_steps_per_second": 1.549, - "step": 320000 - }, - { - "epoch": 0.0042, - "grad_norm": 0.8491013050079346, - "learning_rate": 1.4744833236351857e-05, - "loss": 1.0021, - "step": 320100 - }, - { - "epoch": 0.0044, - "grad_norm": 0.8557093739509583, - "learning_rate": 1.4730365054094947e-05, - "loss": 0.9974, - "step": 320200 - }, - { - "epoch": 0.0046, - "grad_norm": 0.8552497625350952, - "learning_rate": 1.471590100844577e-05, - "loss": 0.9937, - "step": 320300 - }, - { - "epoch": 0.0048, - "grad_norm": 0.7959555983543396, - "learning_rate": 1.4701441105230435e-05, - "loss": 1.0001, - "step": 320400 - }, - { - "epoch": 0.005, - "grad_norm": 0.8395636081695557, - "learning_rate": 1.4686985350273391e-05, - "loss": 0.9984, - "step": 320500 - }, - { - "epoch": 0.0052, - "grad_norm": 0.8316648602485657, - "learning_rate": 1.4672533749397414e-05, - "loss": 0.988, - "step": 320600 - }, - { - "epoch": 0.0054, - "grad_norm": 0.8290709853172302, - "learning_rate": 1.4658086308423608e-05, - "loss": 0.9984, - "step": 320700 - }, - { - "epoch": 0.0056, - "grad_norm": 0.8538153767585754, - "learning_rate": 1.46436430331714e-05, - "loss": 1.0038, - "step": 320800 - }, - { - "epoch": 0.0058, - "grad_norm": 0.828048586845398, - "learning_rate": 1.462920392945854e-05, - "loss": 0.9952, - "step": 320900 - }, - { - "epoch": 0.006, - "grad_norm": 0.8509120941162109, - "learning_rate": 1.4614769003101097e-05, - "loss": 1.0151, - "step": 321000 - }, - { - "epoch": 0.006, - "eval_loss": 2.529923677444458, - "eval_runtime": 51.641, - "eval_samples_per_second": 197.401, - "eval_steps_per_second": 1.549, - "step": 321000 - }, - { - "epoch": 0.0062, - "grad_norm": 0.8277125358581543, - "learning_rate": 1.460033825991346e-05, - "loss": 1.0018, - "step": 321100 - }, - { - "epoch": 0.0064, - "grad_norm": 0.8201048374176025, - "learning_rate": 1.4585911705708325e-05, - "loss": 1.0042, - "step": 321200 - }, - { - "epoch": 0.0066, - "grad_norm": 0.8629177212715149, - "learning_rate": 1.4571489346296718e-05, - "loss": 1.0076, - "step": 321300 - }, - { - "epoch": 0.0068, - "grad_norm": 0.8436629176139832, - "learning_rate": 1.4557071187487945e-05, - "loss": 1.0137, - "step": 321400 - }, - { - "epoch": 0.007, - "grad_norm": 0.9035348892211914, - "learning_rate": 1.4542657235089649e-05, - "loss": 0.9959, - "step": 321500 - }, - { - "epoch": 0.0072, - "grad_norm": 0.8393178582191467, - "learning_rate": 1.4528247494907768e-05, - "loss": 1.0055, - "step": 321600 - }, - { - "epoch": 0.0074, - "grad_norm": 0.8507473468780518, - "learning_rate": 1.4513841972746555e-05, - "loss": 1.0039, - "step": 321700 - }, - { - "epoch": 0.0076, - "grad_norm": 0.8492685556411743, - "learning_rate": 1.4499440674408529e-05, - "loss": 1.0109, - "step": 321800 - }, - { - "epoch": 0.0078, - "grad_norm": 0.8794492483139038, - "learning_rate": 1.4485043605694545e-05, - "loss": 0.9981, - "step": 321900 - }, - { - "epoch": 0.008, - "grad_norm": 0.9299744963645935, - "learning_rate": 1.447065077240374e-05, - "loss": 0.999, - "step": 322000 - }, - { - "epoch": 0.008, - "eval_loss": 2.534123659133911, - "eval_runtime": 51.7664, - "eval_samples_per_second": 196.923, - "eval_steps_per_second": 1.545, - "step": 322000 - }, - { - "epoch": 0.0082, - "grad_norm": 0.8244746923446655, - "learning_rate": 1.4456262180333552e-05, - "loss": 0.9991, - "step": 322100 - }, - { - "epoch": 0.0084, - "grad_norm": 0.8086799383163452, - "learning_rate": 1.4441877835279691e-05, - "loss": 0.9995, - "step": 322200 - }, - { - "epoch": 0.0086, - "grad_norm": 0.8285476565361023, - "learning_rate": 1.4427497743036172e-05, - "loss": 1.0018, - "step": 322300 - }, - { - "epoch": 0.0088, - "grad_norm": 0.8461373448371887, - "learning_rate": 1.4413121909395299e-05, - "loss": 0.9767, - "step": 322400 - }, - { - "epoch": 0.009, - "grad_norm": 0.864859938621521, - "learning_rate": 1.4398750340147666e-05, - "loss": 1.001, - "step": 322500 - }, - { - "epoch": 0.0092, - "grad_norm": 0.8466659784317017, - "learning_rate": 1.4384383041082117e-05, - "loss": 0.9958, - "step": 322600 - }, - { - "epoch": 0.0094, - "grad_norm": 0.8037152290344238, - "learning_rate": 1.4370020017985807e-05, - "loss": 0.9959, - "step": 322700 - }, - { - "epoch": 0.0096, - "grad_norm": 0.8187578320503235, - "learning_rate": 1.4355661276644178e-05, - "loss": 0.9955, - "step": 322800 - }, - { - "epoch": 0.0098, - "grad_norm": 0.8383049368858337, - "learning_rate": 1.43413068228409e-05, - "loss": 0.9861, - "step": 322900 - }, - { - "epoch": 0.01, - "grad_norm": 0.8338568210601807, - "learning_rate": 1.432695666235796e-05, - "loss": 0.9907, - "step": 323000 - }, - { - "epoch": 0.01, - "eval_loss": 2.5478382110595703, - "eval_runtime": 51.7181, - "eval_samples_per_second": 197.107, - "eval_steps_per_second": 1.547, - "step": 323000 - }, - { - "epoch": 0.0102, - "grad_norm": 0.9476732611656189, - "learning_rate": 1.4312610800975602e-05, - "loss": 0.9817, - "step": 323100 - }, - { - "epoch": 0.0104, - "grad_norm": 0.8296193480491638, - "learning_rate": 1.429826924447234e-05, - "loss": 0.9883, - "step": 323200 - }, - { - "epoch": 0.0106, - "grad_norm": 0.8237991333007812, - "learning_rate": 1.4283931998624938e-05, - "loss": 0.9966, - "step": 323300 - }, - { - "epoch": 0.0108, - "grad_norm": 0.8200727701187134, - "learning_rate": 1.426959906920845e-05, - "loss": 0.9925, - "step": 323400 - }, - { - "epoch": 0.011, - "grad_norm": 0.7869872450828552, - "learning_rate": 1.4255270461996171e-05, - "loss": 0.9913, - "step": 323500 - }, - { - "epoch": 0.0112, - "grad_norm": 0.8540888428688049, - "learning_rate": 1.4240946182759673e-05, - "loss": 0.9851, - "step": 323600 - }, - { - "epoch": 0.0114, - "grad_norm": 0.9450783729553223, - "learning_rate": 1.4226626237268758e-05, - "loss": 0.9841, - "step": 323700 - }, - { - "epoch": 0.0116, - "grad_norm": 0.8994350433349609, - "learning_rate": 1.421231063129151e-05, - "loss": 0.9751, - "step": 323800 - }, - { - "epoch": 0.0118, - "grad_norm": 0.9152923822402954, - "learning_rate": 1.4197999370594246e-05, - "loss": 0.9788, - "step": 323900 - }, - { - "epoch": 0.012, - "grad_norm": 0.8692894577980042, - "learning_rate": 1.418369246094155e-05, - "loss": 0.9692, - "step": 324000 - }, - { - "epoch": 0.012, - "eval_loss": 2.554483652114868, - "eval_runtime": 51.714, - "eval_samples_per_second": 197.123, - "eval_steps_per_second": 1.547, - "step": 324000 - }, - { - "epoch": 0.0122, - "grad_norm": 0.8307340145111084, - "learning_rate": 1.4169389908096232e-05, - "loss": 0.9791, - "step": 324100 - }, - { - "epoch": 0.0124, - "grad_norm": 0.8067870736122131, - "learning_rate": 1.4155091717819363e-05, - "loss": 0.977, - "step": 324200 - }, - { - "epoch": 0.0126, - "grad_norm": 0.904922604560852, - "learning_rate": 1.414079789587025e-05, - "loss": 0.9615, - "step": 324300 - }, - { - "epoch": 0.0128, - "grad_norm": 0.8454153537750244, - "learning_rate": 1.4126508448006459e-05, - "loss": 0.9681, - "step": 324400 - }, - { - "epoch": 0.013, - "grad_norm": 0.8959038257598877, - "learning_rate": 1.4112223379983755e-05, - "loss": 0.9746, - "step": 324500 - }, - { - "epoch": 0.0132, - "grad_norm": 0.9153333306312561, - "learning_rate": 1.4097942697556172e-05, - "loss": 0.9728, - "step": 324600 - }, - { - "epoch": 0.0134, - "grad_norm": 0.809781551361084, - "learning_rate": 1.4083666406475976e-05, - "loss": 0.964, - "step": 324700 - }, - { - "epoch": 0.0136, - "grad_norm": 0.8854051232337952, - "learning_rate": 1.4069394512493634e-05, - "loss": 0.9826, - "step": 324800 - }, - { - "epoch": 0.0138, - "grad_norm": 0.8811824917793274, - "learning_rate": 1.4055127021357877e-05, - "loss": 0.9809, - "step": 324900 - }, - { - "epoch": 0.014, - "grad_norm": 0.8924720883369446, - "learning_rate": 1.4040863938815645e-05, - "loss": 0.9611, - "step": 325000 - }, - { - "epoch": 0.014, - "eval_loss": 2.559173583984375, - "eval_runtime": 51.7882, - "eval_samples_per_second": 196.84, - "eval_steps_per_second": 1.545, - "step": 325000 - }, - { - "epoch": 0.0142, - "grad_norm": 0.8205790519714355, - "learning_rate": 1.402660527061212e-05, - "loss": 0.9903, - "step": 325100 - }, - { - "epoch": 0.0144, - "grad_norm": 0.8341870903968811, - "learning_rate": 1.4012351022490672e-05, - "loss": 0.9615, - "step": 325200 - }, - { - "epoch": 0.0146, - "grad_norm": 0.8305156230926514, - "learning_rate": 1.3998101200192915e-05, - "loss": 0.9627, - "step": 325300 - }, - { - "epoch": 0.0148, - "grad_norm": 0.9122214317321777, - "learning_rate": 1.398385580945868e-05, - "loss": 0.9129, - "step": 325400 - }, - { - "epoch": 0.015, - "grad_norm": 0.868425190448761, - "learning_rate": 1.3969614856026014e-05, - "loss": 0.968, - "step": 325500 - }, - { - "epoch": 0.0152, - "grad_norm": 0.8120792508125305, - "learning_rate": 1.3955378345631159e-05, - "loss": 0.9689, - "step": 325600 - }, - { - "epoch": 0.0154, - "grad_norm": 0.8308644890785217, - "learning_rate": 1.3941146284008582e-05, - "loss": 0.9404, - "step": 325700 - }, - { - "epoch": 0.0156, - "grad_norm": 0.7607423663139343, - "learning_rate": 1.3926918676890965e-05, - "loss": 0.9587, - "step": 325800 - }, - { - "epoch": 0.0158, - "grad_norm": 0.8530341386795044, - "learning_rate": 1.3912695530009184e-05, - "loss": 0.9584, - "step": 325900 - }, - { - "epoch": 0.016, - "grad_norm": 0.8315464854240417, - "learning_rate": 1.3898476849092312e-05, - "loss": 0.9507, - "step": 326000 - }, - { - "epoch": 0.016, - "eval_loss": 2.574967861175537, - "eval_runtime": 52.1092, - "eval_samples_per_second": 195.628, - "eval_steps_per_second": 1.535, - "step": 326000 - }, - { - "epoch": 0.0002, - "grad_norm": 0.87019944190979, - "learning_rate": 1.3884262639867638e-05, - "loss": 0.7316, - "step": 326100 - }, - { - "epoch": 0.0004, - "grad_norm": 0.8352780342102051, - "learning_rate": 1.3870052908060651e-05, - "loss": 0.7268, - "step": 326200 - }, - { - "epoch": 0.0006, - "grad_norm": 0.9428650736808777, - "learning_rate": 1.3855847659395013e-05, - "loss": 0.717, - "step": 326300 - }, - { - "epoch": 0.0008, - "grad_norm": 1.0137333869934082, - "learning_rate": 1.3841646899592603e-05, - "loss": 0.7362, - "step": 326400 - }, - { - "epoch": 0.001, - "grad_norm": 0.9063905477523804, - "learning_rate": 1.382745063437349e-05, - "loss": 0.7192, - "step": 326500 - }, - { - "epoch": 0.0012, - "grad_norm": 0.8576821088790894, - "learning_rate": 1.3813258869455936e-05, - "loss": 0.72, - "step": 326600 - }, - { - "epoch": 0.0014, - "grad_norm": 0.8997663259506226, - "learning_rate": 1.3799071610556358e-05, - "loss": 0.7216, - "step": 326700 - }, - { - "epoch": 0.0016, - "grad_norm": 0.8130722641944885, - "learning_rate": 1.37848888633894e-05, - "loss": 0.7251, - "step": 326800 - }, - { - "epoch": 0.0018, - "grad_norm": 0.9513541460037231, - "learning_rate": 1.3770710633667863e-05, - "loss": 0.7245, - "step": 326900 - }, - { - "epoch": 0.002, - "grad_norm": 0.8725600838661194, - "learning_rate": 1.3756536927102753e-05, - "loss": 0.7186, - "step": 327000 - }, - { - "epoch": 0.002, - "eval_loss": 2.0350961685180664, - "eval_runtime": 51.8928, - "eval_samples_per_second": 196.444, - "eval_steps_per_second": 1.542, - "step": 327000 - }, - { - "epoch": 0.0022, - "grad_norm": 0.9190706610679626, - "learning_rate": 1.3742367749403212e-05, - "loss": 0.7326, - "step": 327100 - }, - { - "epoch": 0.0024, - "grad_norm": 0.8598017692565918, - "learning_rate": 1.3728203106276594e-05, - "loss": 0.7282, - "step": 327200 - }, - { - "epoch": 0.0026, - "grad_norm": 0.833091139793396, - "learning_rate": 1.371404300342842e-05, - "loss": 0.7183, - "step": 327300 - }, - { - "epoch": 0.0028, - "grad_norm": 0.8222286105155945, - "learning_rate": 1.3699887446562382e-05, - "loss": 0.7139, - "step": 327400 - }, - { - "epoch": 0.003, - "grad_norm": 0.8653368353843689, - "learning_rate": 1.368573644138032e-05, - "loss": 0.7237, - "step": 327500 - }, - { - "epoch": 0.0032, - "grad_norm": 0.9050326943397522, - "learning_rate": 1.3671589993582268e-05, - "loss": 0.7282, - "step": 327600 - }, - { - "epoch": 0.0034, - "grad_norm": 0.9215336441993713, - "learning_rate": 1.3657448108866423e-05, - "loss": 0.7107, - "step": 327700 - }, - { - "epoch": 0.0036, - "grad_norm": 0.8540416359901428, - "learning_rate": 1.364331079292911e-05, - "loss": 0.7176, - "step": 327800 - }, - { - "epoch": 0.0038, - "grad_norm": 0.8809969425201416, - "learning_rate": 1.3629178051464858e-05, - "loss": 0.7223, - "step": 327900 - }, - { - "epoch": 0.004, - "grad_norm": 0.8728992342948914, - "learning_rate": 1.3615049890166323e-05, - "loss": 0.7169, - "step": 328000 - }, - { - "epoch": 0.004, - "eval_loss": 2.0252230167388916, - "eval_runtime": 51.5937, - "eval_samples_per_second": 197.582, - "eval_steps_per_second": 1.551, - "step": 328000 - }, - { - "epoch": 0.0042, - "grad_norm": 1.0202641487121582, - "learning_rate": 1.360092631472433e-05, - "loss": 0.7341, - "step": 328100 - }, - { - "epoch": 0.0044, - "grad_norm": 0.8477998375892639, - "learning_rate": 1.3586807330827861e-05, - "loss": 0.7145, - "step": 328200 - }, - { - "epoch": 0.0046, - "grad_norm": 0.8075670599937439, - "learning_rate": 1.3572692944164029e-05, - "loss": 0.7198, - "step": 328300 - }, - { - "epoch": 0.0048, - "grad_norm": 0.8715834021568298, - "learning_rate": 1.3558583160418109e-05, - "loss": 0.7202, - "step": 328400 - }, - { - "epoch": 0.005, - "grad_norm": 0.8973333239555359, - "learning_rate": 1.3544477985273524e-05, - "loss": 0.7165, - "step": 328500 - }, - { - "epoch": 0.0052, - "grad_norm": 0.923931360244751, - "learning_rate": 1.3530377424411849e-05, - "loss": 0.7214, - "step": 328600 - }, - { - "epoch": 0.0054, - "grad_norm": 0.9258859753608704, - "learning_rate": 1.3516281483512765e-05, - "loss": 0.7255, - "step": 328700 - }, - { - "epoch": 0.0056, - "grad_norm": 0.8883686661720276, - "learning_rate": 1.3502190168254125e-05, - "loss": 0.713, - "step": 328800 - }, - { - "epoch": 0.0058, - "grad_norm": 0.8454500436782837, - "learning_rate": 1.348810348431191e-05, - "loss": 0.7117, - "step": 328900 - }, - { - "epoch": 0.006, - "grad_norm": 0.9518053531646729, - "learning_rate": 1.3474021437360245e-05, - "loss": 0.7189, - "step": 329000 - }, - { - "epoch": 0.006, - "eval_loss": 2.032439708709717, - "eval_runtime": 51.733, - "eval_samples_per_second": 197.05, - "eval_steps_per_second": 1.546, - "step": 329000 - }, - { - "epoch": 0.0062, - "grad_norm": 0.878307044506073, - "learning_rate": 1.345994403307136e-05, - "loss": 0.7136, - "step": 329100 - }, - { - "epoch": 0.0064, - "grad_norm": 0.8827186226844788, - "learning_rate": 1.3445871277115635e-05, - "loss": 0.7237, - "step": 329200 - }, - { - "epoch": 0.0066, - "grad_norm": 0.8805004954338074, - "learning_rate": 1.3431803175161586e-05, - "loss": 0.7024, - "step": 329300 - }, - { - "epoch": 0.0068, - "grad_norm": 0.8745920062065125, - "learning_rate": 1.3417739732875829e-05, - "loss": 0.7175, - "step": 329400 - }, - { - "epoch": 0.007, - "grad_norm": 0.8587835431098938, - "learning_rate": 1.340368095592312e-05, - "loss": 0.7054, - "step": 329500 - }, - { - "epoch": 0.0072, - "grad_norm": 0.8374196290969849, - "learning_rate": 1.3389626849966335e-05, - "loss": 0.7107, - "step": 329600 - }, - { - "epoch": 0.0074, - "grad_norm": 0.929682731628418, - "learning_rate": 1.3375577420666477e-05, - "loss": 0.7183, - "step": 329700 - }, - { - "epoch": 0.0076, - "grad_norm": 0.8738675713539124, - "learning_rate": 1.3361532673682633e-05, - "loss": 0.7236, - "step": 329800 - }, - { - "epoch": 0.0078, - "grad_norm": 0.8550043106079102, - "learning_rate": 1.3347492614672039e-05, - "loss": 0.7107, - "step": 329900 - }, - { - "epoch": 0.008, - "grad_norm": 0.9196627736091614, - "learning_rate": 1.3333457249290024e-05, - "loss": 0.716, - "step": 330000 - }, - { - "epoch": 0.008, - "eval_loss": 2.035661220550537, - "eval_runtime": 51.7487, - "eval_samples_per_second": 196.99, - "eval_steps_per_second": 1.546, - "step": 330000 - }, - { - "epoch": 0.0082, - "grad_norm": 0.8340585231781006, - "learning_rate": 1.3319426583190042e-05, - "loss": 0.7279, - "step": 330100 - }, - { - "epoch": 0.0084, - "grad_norm": 0.858969509601593, - "learning_rate": 1.3305400622023628e-05, - "loss": 0.716, - "step": 330200 - }, - { - "epoch": 0.0086, - "grad_norm": 0.9872186183929443, - "learning_rate": 1.3291379371440446e-05, - "loss": 0.7278, - "step": 330300 - }, - { - "epoch": 0.0088, - "grad_norm": 0.8357021808624268, - "learning_rate": 1.3277362837088252e-05, - "loss": 0.7057, - "step": 330400 - }, - { - "epoch": 0.009, - "grad_norm": 0.8592823147773743, - "learning_rate": 1.3263351024612914e-05, - "loss": 0.7107, - "step": 330500 - }, - { - "epoch": 0.0092, - "grad_norm": 0.8655655384063721, - "learning_rate": 1.3249343939658371e-05, - "loss": 0.7093, - "step": 330600 - }, - { - "epoch": 0.0094, - "grad_norm": 0.8590738773345947, - "learning_rate": 1.3235341587866684e-05, - "loss": 0.7073, - "step": 330700 - }, - { - "epoch": 0.0096, - "grad_norm": 0.8633531332015991, - "learning_rate": 1.322134397487801e-05, - "loss": 0.7129, - "step": 330800 - }, - { - "epoch": 0.0098, - "grad_norm": 0.8816627264022827, - "learning_rate": 1.3207351106330559e-05, - "loss": 0.7114, - "step": 330900 - }, - { - "epoch": 0.01, - "grad_norm": 0.9330505132675171, - "learning_rate": 1.3193362987860675e-05, - "loss": 0.7059, - "step": 331000 - }, - { - "epoch": 0.01, - "eval_loss": 2.0230836868286133, - "eval_runtime": 51.7504, - "eval_samples_per_second": 196.984, - "eval_steps_per_second": 1.546, - "step": 331000 - }, - { - "epoch": 0.0102, - "grad_norm": 0.8758464455604553, - "learning_rate": 1.317937962510277e-05, - "loss": 0.7078, - "step": 331100 - }, - { - "epoch": 0.0104, - "grad_norm": 0.9444248676300049, - "learning_rate": 1.3165401023689344e-05, - "loss": 0.7174, - "step": 331200 - }, - { - "epoch": 0.0106, - "grad_norm": 0.8706777095794678, - "learning_rate": 1.3151427189250965e-05, - "loss": 0.7058, - "step": 331300 - }, - { - "epoch": 0.0108, - "grad_norm": 0.8867092132568359, - "learning_rate": 1.3137458127416297e-05, - "loss": 0.7058, - "step": 331400 - }, - { - "epoch": 0.011, - "grad_norm": 0.968101978302002, - "learning_rate": 1.3123493843812074e-05, - "loss": 0.7212, - "step": 331500 - }, - { - "epoch": 0.0112, - "grad_norm": 0.8708505630493164, - "learning_rate": 1.3109534344063118e-05, - "loss": 0.7175, - "step": 331600 - }, - { - "epoch": 0.0114, - "grad_norm": 0.910325288772583, - "learning_rate": 1.30955796337923e-05, - "loss": 0.7078, - "step": 331700 - }, - { - "epoch": 0.0116, - "grad_norm": 0.8591578006744385, - "learning_rate": 1.308162971862058e-05, - "loss": 0.7101, - "step": 331800 - }, - { - "epoch": 0.0118, - "grad_norm": 0.9007583260536194, - "learning_rate": 1.3067684604166988e-05, - "loss": 0.7157, - "step": 331900 - }, - { - "epoch": 0.012, - "grad_norm": 0.9580846428871155, - "learning_rate": 1.3053744296048617e-05, - "loss": 0.7102, - "step": 332000 - }, - { - "epoch": 0.012, - "eval_loss": 2.037156581878662, - "eval_runtime": 51.5881, - "eval_samples_per_second": 197.604, - "eval_steps_per_second": 1.551, - "step": 332000 - }, - { - "epoch": 0.0122, - "grad_norm": 0.8679760098457336, - "learning_rate": 1.3039808799880604e-05, - "loss": 0.7144, - "step": 332100 - }, - { - "epoch": 0.0124, - "grad_norm": 0.8794786334037781, - "learning_rate": 1.302587812127618e-05, - "loss": 0.7089, - "step": 332200 - }, - { - "epoch": 0.0126, - "grad_norm": 0.855987548828125, - "learning_rate": 1.3011952265846626e-05, - "loss": 0.7164, - "step": 332300 - }, - { - "epoch": 0.0128, - "grad_norm": 0.8838660717010498, - "learning_rate": 1.2998031239201252e-05, - "loss": 0.7166, - "step": 332400 - }, - { - "epoch": 0.013, - "grad_norm": 0.8379763960838318, - "learning_rate": 1.2984115046947463e-05, - "loss": 0.7168, - "step": 332500 - }, - { - "epoch": 0.0132, - "grad_norm": 0.8760377764701843, - "learning_rate": 1.2970203694690694e-05, - "loss": 0.7106, - "step": 332600 - }, - { - "epoch": 0.0134, - "grad_norm": 0.8472399711608887, - "learning_rate": 1.295629718803445e-05, - "loss": 0.7118, - "step": 332700 - }, - { - "epoch": 0.0136, - "grad_norm": 0.8849984407424927, - "learning_rate": 1.2942395532580247e-05, - "loss": 0.7207, - "step": 332800 - }, - { - "epoch": 0.0138, - "grad_norm": 0.8308677077293396, - "learning_rate": 1.2928498733927682e-05, - "loss": 0.7004, - "step": 332900 - }, - { - "epoch": 0.014, - "grad_norm": 0.9149287343025208, - "learning_rate": 1.2914606797674384e-05, - "loss": 0.7088, - "step": 333000 - }, - { - "epoch": 0.014, - "eval_loss": 2.029548168182373, - "eval_runtime": 51.6647, - "eval_samples_per_second": 197.311, - "eval_steps_per_second": 1.548, - "step": 333000 - }, - { - "epoch": 0.0142, - "grad_norm": 0.8902376890182495, - "learning_rate": 1.2900719729416033e-05, - "loss": 0.7095, - "step": 333100 - }, - { - "epoch": 0.0144, - "grad_norm": 0.9412351250648499, - "learning_rate": 1.2886837534746316e-05, - "loss": 0.7186, - "step": 333200 - }, - { - "epoch": 0.0146, - "grad_norm": 0.8445390462875366, - "learning_rate": 1.2872960219256992e-05, - "loss": 0.7093, - "step": 333300 - }, - { - "epoch": 0.0148, - "grad_norm": 0.8830252289772034, - "learning_rate": 1.2859087788537844e-05, - "loss": 0.7074, - "step": 333400 - }, - { - "epoch": 0.015, - "grad_norm": 0.8642695546150208, - "learning_rate": 1.284522024817669e-05, - "loss": 0.7146, - "step": 333500 - }, - { - "epoch": 0.0152, - "grad_norm": 0.9142852425575256, - "learning_rate": 1.2831357603759358e-05, - "loss": 0.7126, - "step": 333600 - }, - { - "epoch": 0.0154, - "grad_norm": 0.9412261247634888, - "learning_rate": 1.2817499860869725e-05, - "loss": 0.7105, - "step": 333700 - }, - { - "epoch": 0.0156, - "grad_norm": 0.8529816269874573, - "learning_rate": 1.2803647025089705e-05, - "loss": 0.7086, - "step": 333800 - }, - { - "epoch": 0.0158, - "grad_norm": 0.8930657505989075, - "learning_rate": 1.2789799101999194e-05, - "loss": 0.7148, - "step": 333900 - }, - { - "epoch": 0.016, - "grad_norm": 0.9034160375595093, - "learning_rate": 1.2775956097176142e-05, - "loss": 0.7138, - "step": 334000 - }, - { - "epoch": 0.016, - "eval_loss": 2.034317970275879, - "eval_runtime": 52.1314, - "eval_samples_per_second": 195.544, - "eval_steps_per_second": 1.535, - "step": 334000 - }, - { - "epoch": 0.0162, - "grad_norm": 0.7935868501663208, - "learning_rate": 1.2762118016196514e-05, - "loss": 0.7061, - "step": 334100 - }, - { - "epoch": 0.0164, - "grad_norm": 0.8745686411857605, - "learning_rate": 1.2748284864634296e-05, - "loss": 0.7079, - "step": 334200 - }, - { - "epoch": 0.0166, - "grad_norm": 0.8833600878715515, - "learning_rate": 1.273445664806146e-05, - "loss": 0.7103, - "step": 334300 - }, - { - "epoch": 0.0168, - "grad_norm": 0.9068960547447205, - "learning_rate": 1.272063337204802e-05, - "loss": 0.7001, - "step": 334400 - }, - { - "epoch": 0.017, - "grad_norm": 0.8197974562644958, - "learning_rate": 1.2706815042161984e-05, - "loss": 0.7052, - "step": 334500 - }, - { - "epoch": 0.0172, - "grad_norm": 0.8796073794364929, - "learning_rate": 1.2693001663969395e-05, - "loss": 0.7123, - "step": 334600 - }, - { - "epoch": 0.0174, - "grad_norm": 0.883787989616394, - "learning_rate": 1.2679193243034249e-05, - "loss": 0.7028, - "step": 334700 - }, - { - "epoch": 0.0176, - "grad_norm": 0.885678768157959, - "learning_rate": 1.2665389784918597e-05, - "loss": 0.696, - "step": 334800 - }, - { - "epoch": 0.0178, - "grad_norm": 0.895122766494751, - "learning_rate": 1.2651591295182457e-05, - "loss": 0.7095, - "step": 334900 - }, - { - "epoch": 0.018, - "grad_norm": 0.8656454086303711, - "learning_rate": 1.2637797779383881e-05, - "loss": 0.7098, - "step": 335000 - }, - { - "epoch": 0.018, - "eval_loss": 2.041609764099121, - "eval_runtime": 51.8364, - "eval_samples_per_second": 196.657, - "eval_steps_per_second": 1.543, - "step": 335000 - }, - { - "epoch": 0.0182, - "grad_norm": 0.8860552906990051, - "learning_rate": 1.2624009243078872e-05, - "loss": 0.7323, - "step": 335100 - }, - { - "epoch": 0.0184, - "grad_norm": 0.9041178226470947, - "learning_rate": 1.261022569182146e-05, - "loss": 0.7102, - "step": 335200 - }, - { - "epoch": 0.0186, - "grad_norm": 0.8467496037483215, - "learning_rate": 1.2596447131163657e-05, - "loss": 0.7061, - "step": 335300 - }, - { - "epoch": 0.0188, - "grad_norm": 0.8838053941726685, - "learning_rate": 1.2582673566655474e-05, - "loss": 0.7032, - "step": 335400 - }, - { - "epoch": 0.019, - "grad_norm": 0.8892683982849121, - "learning_rate": 1.2568905003844885e-05, - "loss": 0.7032, - "step": 335500 - }, - { - "epoch": 0.0192, - "grad_norm": 0.8915057182312012, - "learning_rate": 1.2555141448277874e-05, - "loss": 0.7162, - "step": 335600 - }, - { - "epoch": 0.0194, - "grad_norm": 0.8544843196868896, - "learning_rate": 1.2541382905498411e-05, - "loss": 0.6972, - "step": 335700 - }, - { - "epoch": 0.0196, - "grad_norm": 0.9270769953727722, - "learning_rate": 1.2527629381048411e-05, - "loss": 0.6981, - "step": 335800 - }, - { - "epoch": 0.0198, - "grad_norm": 1.0345507860183716, - "learning_rate": 1.2513880880467807e-05, - "loss": 0.6987, - "step": 335900 - }, - { - "epoch": 0.02, - "grad_norm": 0.8447591662406921, - "learning_rate": 1.2500137409294488e-05, - "loss": 0.7021, - "step": 336000 - }, - { - "epoch": 0.02, - "eval_loss": 2.0394201278686523, - "eval_runtime": 52.0024, - "eval_samples_per_second": 196.029, - "eval_steps_per_second": 1.538, - "step": 336000 - }, - { - "epoch": 0.0202, - "grad_norm": 0.871530294418335, - "learning_rate": 1.2486398973064339e-05, - "loss": 0.7097, - "step": 336100 - }, - { - "epoch": 0.0204, - "grad_norm": 0.8515340089797974, - "learning_rate": 1.2472665577311176e-05, - "loss": 0.705, - "step": 336200 - }, - { - "epoch": 0.0206, - "grad_norm": 0.8740963339805603, - "learning_rate": 1.2458937227566819e-05, - "loss": 0.7004, - "step": 336300 - }, - { - "epoch": 0.0208, - "grad_norm": 0.8944967985153198, - "learning_rate": 1.244521392936106e-05, - "loss": 0.6948, - "step": 336400 - }, - { - "epoch": 0.021, - "grad_norm": 0.8867557644844055, - "learning_rate": 1.2431495688221618e-05, - "loss": 0.7037, - "step": 336500 - }, - { - "epoch": 0.0212, - "grad_norm": 0.925564706325531, - "learning_rate": 1.2417782509674216e-05, - "loss": 0.6971, - "step": 336600 - }, - { - "epoch": 0.0214, - "grad_norm": 0.8457061052322388, - "learning_rate": 1.240407439924251e-05, - "loss": 0.7007, - "step": 336700 - }, - { - "epoch": 0.0216, - "grad_norm": 0.8768745064735413, - "learning_rate": 1.2390371362448125e-05, - "loss": 0.7015, - "step": 336800 - }, - { - "epoch": 0.0218, - "grad_norm": 0.8154018521308899, - "learning_rate": 1.237667340481066e-05, - "loss": 0.6984, - "step": 336900 - }, - { - "epoch": 0.022, - "grad_norm": 0.8525890707969666, - "learning_rate": 1.2362980531847626e-05, - "loss": 0.6991, - "step": 337000 - }, - { - "epoch": 0.022, - "eval_loss": 2.052788496017456, - "eval_runtime": 52.066, - "eval_samples_per_second": 195.79, - "eval_steps_per_second": 1.537, - "step": 337000 - }, - { - "epoch": 0.0222, - "grad_norm": 0.8477676510810852, - "learning_rate": 1.2349292749074526e-05, - "loss": 0.6756, - "step": 337100 - }, - { - "epoch": 0.0224, - "grad_norm": 0.8637740612030029, - "learning_rate": 1.233561006200479e-05, - "loss": 0.7043, - "step": 337200 - }, - { - "epoch": 0.0226, - "grad_norm": 0.9340733885765076, - "learning_rate": 1.232193247614982e-05, - "loss": 0.697, - "step": 337300 - }, - { - "epoch": 0.0228, - "grad_norm": 0.8994996547698975, - "learning_rate": 1.230825999701892e-05, - "loss": 0.6975, - "step": 337400 - }, - { - "epoch": 0.023, - "grad_norm": 0.9119468331336975, - "learning_rate": 1.2294592630119375e-05, - "loss": 0.695, - "step": 337500 - }, - { - "epoch": 0.0232, - "grad_norm": 0.8722793459892273, - "learning_rate": 1.2280930380956402e-05, - "loss": 0.694, - "step": 337600 - }, - { - "epoch": 0.0234, - "grad_norm": 0.9214362502098083, - "learning_rate": 1.2267273255033157e-05, - "loss": 0.7004, - "step": 337700 - }, - { - "epoch": 0.0236, - "grad_norm": 0.928554892539978, - "learning_rate": 1.2253621257850714e-05, - "loss": 0.6978, - "step": 337800 - }, - { - "epoch": 0.0238, - "grad_norm": 0.952670693397522, - "learning_rate": 1.2239974394908102e-05, - "loss": 0.7041, - "step": 337900 - }, - { - "epoch": 0.024, - "grad_norm": 0.8799007534980774, - "learning_rate": 1.2226332671702282e-05, - "loss": 0.689, - "step": 338000 - }, - { - "epoch": 0.024, - "eval_loss": 2.0413691997528076, - "eval_runtime": 52.1051, - "eval_samples_per_second": 195.643, - "eval_steps_per_second": 1.535, - "step": 338000 - }, - { - "epoch": 0.0242, - "grad_norm": 1.0080713033676147, - "learning_rate": 1.2212696093728141e-05, - "loss": 0.7069, - "step": 338100 - }, - { - "epoch": 0.0244, - "grad_norm": 0.9208382964134216, - "learning_rate": 1.2199064666478474e-05, - "loss": 0.7086, - "step": 338200 - }, - { - "epoch": 0.0246, - "grad_norm": 0.9424040913581848, - "learning_rate": 1.2185438395444029e-05, - "loss": 0.699, - "step": 338300 - }, - { - "epoch": 0.0248, - "grad_norm": 0.9521956443786621, - "learning_rate": 1.2171817286113476e-05, - "loss": 0.6972, - "step": 338400 - }, - { - "epoch": 0.025, - "grad_norm": 0.9072422385215759, - "learning_rate": 1.2158201343973377e-05, - "loss": 0.686, - "step": 338500 - }, - { - "epoch": 0.0252, - "grad_norm": 0.8915525078773499, - "learning_rate": 1.2144590574508241e-05, - "loss": 0.6992, - "step": 338600 - }, - { - "epoch": 0.0254, - "grad_norm": 0.8471651673316956, - "learning_rate": 1.2130984983200486e-05, - "loss": 0.6933, - "step": 338700 - }, - { - "epoch": 0.0256, - "grad_norm": 0.8865765929222107, - "learning_rate": 1.2117384575530446e-05, - "loss": 0.6899, - "step": 338800 - }, - { - "epoch": 0.0258, - "grad_norm": 0.8501929640769958, - "learning_rate": 1.2103789356976353e-05, - "loss": 0.6942, - "step": 338900 - }, - { - "epoch": 0.026, - "grad_norm": 0.8459005951881409, - "learning_rate": 1.2090199333014363e-05, - "loss": 0.6883, - "step": 339000 - }, - { - "epoch": 0.026, - "eval_loss": 2.0509862899780273, - "eval_runtime": 52.0225, - "eval_samples_per_second": 195.954, - "eval_steps_per_second": 1.538, - "step": 339000 - }, - { - "epoch": 0.0262, - "grad_norm": 0.8724320530891418, - "learning_rate": 1.2076614509118537e-05, - "loss": 0.6903, - "step": 339100 - }, - { - "epoch": 0.0264, - "grad_norm": 0.8801888227462769, - "learning_rate": 1.206303489076085e-05, - "loss": 0.6934, - "step": 339200 - }, - { - "epoch": 0.0266, - "grad_norm": 0.9318116903305054, - "learning_rate": 1.2049460483411154e-05, - "loss": 0.6909, - "step": 339300 - }, - { - "epoch": 0.0268, - "grad_norm": 0.892590343952179, - "learning_rate": 1.2035891292537228e-05, - "loss": 0.6931, - "step": 339400 - }, - { - "epoch": 0.027, - "grad_norm": 0.8987374901771545, - "learning_rate": 1.2022327323604735e-05, - "loss": 0.682, - "step": 339500 - }, - { - "epoch": 0.0272, - "grad_norm": 0.9365059733390808, - "learning_rate": 1.2008768582077257e-05, - "loss": 0.6849, - "step": 339600 - }, - { - "epoch": 0.0274, - "grad_norm": 0.908043384552002, - "learning_rate": 1.199521507341623e-05, - "loss": 0.6959, - "step": 339700 - }, - { - "epoch": 0.0276, - "grad_norm": 0.9550427794456482, - "learning_rate": 1.1981666803081015e-05, - "loss": 0.6928, - "step": 339800 - }, - { - "epoch": 0.0278, - "grad_norm": 0.9074381589889526, - "learning_rate": 1.1968123776528855e-05, - "loss": 0.6907, - "step": 339900 - }, - { - "epoch": 0.028, - "grad_norm": 0.8685894012451172, - "learning_rate": 1.195458599921489e-05, - "loss": 0.6792, - "step": 340000 - }, - { - "epoch": 0.028, - "eval_loss": 2.0385003089904785, - "eval_runtime": 51.8057, - "eval_samples_per_second": 196.774, - "eval_steps_per_second": 1.544, - "step": 340000 - }, - { - "epoch": 0.0282, - "grad_norm": 0.8610774874687195, - "learning_rate": 1.1941053476592115e-05, - "loss": 0.6883, - "step": 340100 - }, - { - "epoch": 0.0284, - "grad_norm": 0.8860583901405334, - "learning_rate": 1.192752621411144e-05, - "loss": 0.6962, - "step": 340200 - }, - { - "epoch": 0.0286, - "grad_norm": 0.872675895690918, - "learning_rate": 1.191400421722165e-05, - "loss": 0.6941, - "step": 340300 - }, - { - "epoch": 0.0288, - "grad_norm": 0.9522199630737305, - "learning_rate": 1.1900487491369386e-05, - "loss": 0.6885, - "step": 340400 - }, - { - "epoch": 0.029, - "grad_norm": 0.8697762489318848, - "learning_rate": 1.1886976041999196e-05, - "loss": 0.688, - "step": 340500 - }, - { - "epoch": 0.0292, - "grad_norm": 0.939105749130249, - "learning_rate": 1.1873469874553486e-05, - "loss": 0.677, - "step": 340600 - }, - { - "epoch": 0.0294, - "grad_norm": 0.8797380328178406, - "learning_rate": 1.1859968994472551e-05, - "loss": 0.681, - "step": 340700 - }, - { - "epoch": 0.0296, - "grad_norm": 0.9089605212211609, - "learning_rate": 1.1846473407194522e-05, - "loss": 0.6916, - "step": 340800 - }, - { - "epoch": 0.0298, - "grad_norm": 0.8749380111694336, - "learning_rate": 1.1832983118155436e-05, - "loss": 0.6855, - "step": 340900 - }, - { - "epoch": 0.03, - "grad_norm": 0.8690641522407532, - "learning_rate": 1.1819498132789173e-05, - "loss": 0.6923, - "step": 341000 - }, - { - "epoch": 0.03, - "eval_loss": 2.083442449569702, - "eval_runtime": 52.125, - "eval_samples_per_second": 195.568, - "eval_steps_per_second": 1.535, - "step": 341000 - }, - { - "epoch": 0.0302, - "grad_norm": 0.8813545107841492, - "learning_rate": 1.1806018456527495e-05, - "loss": 0.679, - "step": 341100 - }, - { - "epoch": 0.0304, - "grad_norm": 0.8487153649330139, - "learning_rate": 1.1792544094799995e-05, - "loss": 0.6851, - "step": 341200 - }, - { - "epoch": 0.0306, - "grad_norm": 0.8273248672485352, - "learning_rate": 1.1779075053034155e-05, - "loss": 0.6807, - "step": 341300 - }, - { - "epoch": 0.0308, - "grad_norm": 0.879425585269928, - "learning_rate": 1.1765611336655305e-05, - "loss": 0.6816, - "step": 341400 - }, - { - "epoch": 0.031, - "grad_norm": 0.8405166268348694, - "learning_rate": 1.1752152951086631e-05, - "loss": 0.6762, - "step": 341500 - }, - { - "epoch": 0.0312, - "grad_norm": 0.8572484254837036, - "learning_rate": 1.1738699901749157e-05, - "loss": 0.692, - "step": 341600 - }, - { - "epoch": 0.0314, - "grad_norm": 0.9151760935783386, - "learning_rate": 1.1725252194061775e-05, - "loss": 0.683, - "step": 341700 - }, - { - "epoch": 0.0316, - "grad_norm": 0.9075136780738831, - "learning_rate": 1.1711809833441235e-05, - "loss": 0.6859, - "step": 341800 - }, - { - "epoch": 0.0318, - "grad_norm": 0.9236798882484436, - "learning_rate": 1.1698372825302093e-05, - "loss": 0.6901, - "step": 341900 - }, - { - "epoch": 0.032, - "grad_norm": 0.8434112071990967, - "learning_rate": 1.1684941175056785e-05, - "loss": 0.6844, - "step": 342000 - }, - { - "epoch": 0.032, - "eval_loss": 2.0505099296569824, - "eval_runtime": 52.0084, - "eval_samples_per_second": 196.007, - "eval_steps_per_second": 1.538, - "step": 342000 - }, - { - "epoch": 0.0322, - "grad_norm": 0.9039320349693298, - "learning_rate": 1.1671514888115582e-05, - "loss": 0.6859, - "step": 342100 - }, - { - "epoch": 0.0324, - "grad_norm": 0.8539577126502991, - "learning_rate": 1.1658093969886596e-05, - "loss": 0.6734, - "step": 342200 - }, - { - "epoch": 0.0326, - "grad_norm": 0.8575844168663025, - "learning_rate": 1.1644678425775755e-05, - "loss": 0.6762, - "step": 342300 - }, - { - "epoch": 0.0328, - "grad_norm": 0.9679238200187683, - "learning_rate": 1.1631268261186845e-05, - "loss": 0.676, - "step": 342400 - }, - { - "epoch": 0.033, - "grad_norm": 0.8782627582550049, - "learning_rate": 1.1617863481521483e-05, - "loss": 0.6758, - "step": 342500 - }, - { - "epoch": 0.0332, - "grad_norm": 0.9136931300163269, - "learning_rate": 1.1604464092179118e-05, - "loss": 0.6818, - "step": 342600 - }, - { - "epoch": 0.0334, - "grad_norm": 0.8847256302833557, - "learning_rate": 1.1591070098557006e-05, - "loss": 0.6728, - "step": 342700 - }, - { - "epoch": 0.0336, - "grad_norm": 0.8676889538764954, - "learning_rate": 1.1577681506050253e-05, - "loss": 0.682, - "step": 342800 - }, - { - "epoch": 0.0338, - "grad_norm": 0.8871778845787048, - "learning_rate": 1.1564298320051787e-05, - "loss": 0.6748, - "step": 342900 - }, - { - "epoch": 0.034, - "grad_norm": 0.9254991412162781, - "learning_rate": 1.155092054595236e-05, - "loss": 0.6791, - "step": 343000 - }, - { - "epoch": 0.034, - "eval_loss": 2.0616466999053955, - "eval_runtime": 52.253, - "eval_samples_per_second": 195.089, - "eval_steps_per_second": 1.531, - "step": 343000 - }, - { - "epoch": 0.0002, - "grad_norm": 0.8649879693984985, - "learning_rate": 1.1537548189140518e-05, - "loss": 0.6746, - "step": 343100 - }, - { - "epoch": 0.0004, - "grad_norm": 0.8530526161193848, - "learning_rate": 1.1524181255002655e-05, - "loss": 0.6714, - "step": 343200 - }, - { - "epoch": 0.0006, - "grad_norm": 0.8391575813293457, - "learning_rate": 1.1510819748922983e-05, - "loss": 0.673, - "step": 343300 - }, - { - "epoch": 0.0008, - "grad_norm": 0.8824005126953125, - "learning_rate": 1.149746367628349e-05, - "loss": 0.6745, - "step": 343400 - }, - { - "epoch": 0.001, - "grad_norm": 0.9381487965583801, - "learning_rate": 1.1484113042464018e-05, - "loss": 0.6775, - "step": 343500 - }, - { - "epoch": 0.0012, - "grad_norm": 0.8851874470710754, - "learning_rate": 1.1470767852842192e-05, - "loss": 0.6714, - "step": 343600 - }, - { - "epoch": 0.0014, - "grad_norm": 0.8769415616989136, - "learning_rate": 1.1457428112793467e-05, - "loss": 0.6649, - "step": 343700 - }, - { - "epoch": 0.0016, - "grad_norm": 0.8536527156829834, - "learning_rate": 1.1444093827691072e-05, - "loss": 0.6689, - "step": 343800 - }, - { - "epoch": 0.0018, - "grad_norm": 0.8344665765762329, - "learning_rate": 1.143076500290606e-05, - "loss": 0.6714, - "step": 343900 - }, - { - "epoch": 0.002, - "grad_norm": 0.857262372970581, - "learning_rate": 1.141744164380728e-05, - "loss": 0.668, - "step": 344000 - }, - { - "epoch": 0.002, - "eval_loss": 2.0636377334594727, - "eval_runtime": 52.1973, - "eval_samples_per_second": 195.297, - "eval_steps_per_second": 1.533, - "step": 344000 - }, - { - "epoch": 0.0022, - "grad_norm": 0.9240826964378357, - "learning_rate": 1.1404123755761394e-05, - "loss": 0.6738, - "step": 344100 - }, - { - "epoch": 0.0024, - "grad_norm": 0.864179790019989, - "learning_rate": 1.1390811344132823e-05, - "loss": 0.6675, - "step": 344200 - }, - { - "epoch": 0.0026, - "grad_norm": 0.9233891367912292, - "learning_rate": 1.1377504414283816e-05, - "loss": 0.6683, - "step": 344300 - }, - { - "epoch": 0.0028, - "grad_norm": 0.8253393769264221, - "learning_rate": 1.13642029715744e-05, - "loss": 0.6724, - "step": 344400 - }, - { - "epoch": 0.003, - "grad_norm": 0.9402153491973877, - "learning_rate": 1.1350907021362409e-05, - "loss": 0.6686, - "step": 344500 - }, - { - "epoch": 0.0032, - "grad_norm": 0.8452779054641724, - "learning_rate": 1.1337616569003425e-05, - "loss": 0.6776, - "step": 344600 - }, - { - "epoch": 0.0034, - "grad_norm": 0.8500985503196716, - "learning_rate": 1.1324331619850856e-05, - "loss": 0.6654, - "step": 344700 - }, - { - "epoch": 0.0036, - "grad_norm": 0.8803905248641968, - "learning_rate": 1.1311052179255871e-05, - "loss": 0.675, - "step": 344800 - }, - { - "epoch": 0.0038, - "grad_norm": 0.9099257588386536, - "learning_rate": 1.1297778252567443e-05, - "loss": 0.6569, - "step": 344900 - }, - { - "epoch": 0.004, - "grad_norm": 0.8804642558097839, - "learning_rate": 1.1284509845132297e-05, - "loss": 0.6655, - "step": 345000 - }, - { - "epoch": 0.004, - "eval_loss": 2.05592942237854, - "eval_runtime": 51.7883, - "eval_samples_per_second": 196.84, - "eval_steps_per_second": 1.545, - "step": 345000 - }, - { - "epoch": 0.0042, - "grad_norm": 0.8482286930084229, - "learning_rate": 1.1271246962294935e-05, - "loss": 0.6641, - "step": 345100 - }, - { - "epoch": 0.0044, - "grad_norm": 0.8636903166770935, - "learning_rate": 1.1257989609397654e-05, - "loss": 0.6632, - "step": 345200 - }, - { - "epoch": 0.0046, - "grad_norm": 0.8937559723854065, - "learning_rate": 1.1244737791780524e-05, - "loss": 0.6634, - "step": 345300 - }, - { - "epoch": 0.0048, - "grad_norm": 0.8914988040924072, - "learning_rate": 1.123149151478136e-05, - "loss": 0.6693, - "step": 345400 - }, - { - "epoch": 0.005, - "grad_norm": 1.0172580480575562, - "learning_rate": 1.1218250783735765e-05, - "loss": 0.6605, - "step": 345500 - }, - { - "epoch": 0.0052, - "grad_norm": 0.9080793857574463, - "learning_rate": 1.1205015603977107e-05, - "loss": 0.6706, - "step": 345600 - }, - { - "epoch": 0.0054, - "grad_norm": 0.8460882306098938, - "learning_rate": 1.1191785980836522e-05, - "loss": 0.6701, - "step": 345700 - }, - { - "epoch": 0.0056, - "grad_norm": 0.8949432373046875, - "learning_rate": 1.1178561919642885e-05, - "loss": 0.6571, - "step": 345800 - }, - { - "epoch": 0.0058, - "grad_norm": 0.8934834599494934, - "learning_rate": 1.1165343425722851e-05, - "loss": 0.6621, - "step": 345900 - }, - { - "epoch": 0.006, - "grad_norm": 0.8950237035751343, - "learning_rate": 1.1152130504400834e-05, - "loss": 0.6678, - "step": 346000 - }, - { - "epoch": 0.006, - "eval_loss": 2.0553648471832275, - "eval_runtime": 51.8108, - "eval_samples_per_second": 196.754, - "eval_steps_per_second": 1.544, - "step": 346000 - }, - { - "epoch": 0.0062, - "grad_norm": 0.9523611068725586, - "learning_rate": 1.1138923160999002e-05, - "loss": 0.673, - "step": 346100 - }, - { - "epoch": 0.0064, - "grad_norm": 0.874225914478302, - "learning_rate": 1.1125721400837255e-05, - "loss": 0.6609, - "step": 346200 - }, - { - "epoch": 0.0066, - "grad_norm": 0.9157487750053406, - "learning_rate": 1.1112525229233268e-05, - "loss": 0.6622, - "step": 346300 - }, - { - "epoch": 0.0068, - "grad_norm": 0.9365401864051819, - "learning_rate": 1.1099334651502466e-05, - "loss": 0.6603, - "step": 346400 - }, - { - "epoch": 0.007, - "grad_norm": 0.9212621450424194, - "learning_rate": 1.1086149672957993e-05, - "loss": 0.6618, - "step": 346500 - }, - { - "epoch": 0.0072, - "grad_norm": 0.9013537168502808, - "learning_rate": 1.107297029891077e-05, - "loss": 0.6665, - "step": 346600 - }, - { - "epoch": 0.0074, - "grad_norm": 0.8723328709602356, - "learning_rate": 1.1059796534669447e-05, - "loss": 0.6548, - "step": 346700 - }, - { - "epoch": 0.0076, - "grad_norm": 0.8133809566497803, - "learning_rate": 1.1046628385540419e-05, - "loss": 0.6352, - "step": 346800 - }, - { - "epoch": 0.0078, - "grad_norm": 0.8866004347801208, - "learning_rate": 1.1033465856827802e-05, - "loss": 0.6679, - "step": 346900 - }, - { - "epoch": 0.008, - "grad_norm": 0.9575750231742859, - "learning_rate": 1.1020308953833467e-05, - "loss": 0.6658, - "step": 347000 - }, - { - "epoch": 0.008, - "eval_loss": 2.0689334869384766, - "eval_runtime": 51.6857, - "eval_samples_per_second": 197.231, - "eval_steps_per_second": 1.548, - "step": 347000 - }, - { - "epoch": 0.0082, - "grad_norm": 0.8472666144371033, - "learning_rate": 1.100715768185701e-05, - "loss": 0.6504, - "step": 347100 - }, - { - "epoch": 0.0084, - "grad_norm": 0.8880901336669922, - "learning_rate": 1.0994012046195779e-05, - "loss": 0.6706, - "step": 347200 - }, - { - "epoch": 0.0086, - "grad_norm": 0.8281514644622803, - "learning_rate": 1.0980872052144809e-05, - "loss": 0.6514, - "step": 347300 - }, - { - "epoch": 0.0088, - "grad_norm": 0.8914335370063782, - "learning_rate": 1.09677377049969e-05, - "loss": 0.6526, - "step": 347400 - }, - { - "epoch": 0.009, - "grad_norm": 0.9571097493171692, - "learning_rate": 1.0954609010042568e-05, - "loss": 0.6623, - "step": 347500 - }, - { - "epoch": 0.0092, - "grad_norm": 0.9575111865997314, - "learning_rate": 1.0941485972570053e-05, - "loss": 0.6526, - "step": 347600 - }, - { - "epoch": 0.0094, - "grad_norm": 0.7946931719779968, - "learning_rate": 1.0928368597865298e-05, - "loss": 0.6621, - "step": 347700 - }, - { - "epoch": 0.0096, - "grad_norm": 0.901408851146698, - "learning_rate": 1.0915256891211992e-05, - "loss": 0.6575, - "step": 347800 - }, - { - "epoch": 0.0098, - "grad_norm": 0.8669435977935791, - "learning_rate": 1.0902150857891532e-05, - "loss": 0.6603, - "step": 347900 - }, - { - "epoch": 0.01, - "grad_norm": 0.8946738243103027, - "learning_rate": 1.0889050503183016e-05, - "loss": 0.6667, - "step": 348000 - }, - { - "epoch": 0.01, - "eval_loss": 2.0592565536499023, - "eval_runtime": 51.912, - "eval_samples_per_second": 196.371, - "eval_steps_per_second": 1.541, - "step": 348000 - }, - { - "epoch": 0.0102, - "grad_norm": 0.8748307228088379, - "learning_rate": 1.0875955832363266e-05, - "loss": 0.6613, - "step": 348100 - }, - { - "epoch": 0.0104, - "grad_norm": 0.846490740776062, - "learning_rate": 1.0862866850706818e-05, - "loss": 0.6577, - "step": 348200 - }, - { - "epoch": 0.0106, - "grad_norm": 0.860930323600769, - "learning_rate": 1.0849783563485921e-05, - "loss": 0.6552, - "step": 348300 - }, - { - "epoch": 0.0108, - "grad_norm": 0.8625341653823853, - "learning_rate": 1.0836705975970504e-05, - "loss": 0.6437, - "step": 348400 - }, - { - "epoch": 0.011, - "grad_norm": 0.8479413986206055, - "learning_rate": 1.0823634093428226e-05, - "loss": 0.664, - "step": 348500 - }, - { - "epoch": 0.0112, - "grad_norm": 0.9355835914611816, - "learning_rate": 1.0810567921124436e-05, - "loss": 0.6606, - "step": 348600 - }, - { - "epoch": 0.0114, - "grad_norm": 0.9027217626571655, - "learning_rate": 1.0797507464322203e-05, - "loss": 0.6509, - "step": 348700 - }, - { - "epoch": 0.0116, - "grad_norm": 0.8765237927436829, - "learning_rate": 1.0784452728282257e-05, - "loss": 0.6564, - "step": 348800 - }, - { - "epoch": 0.0118, - "grad_norm": 0.9060245156288147, - "learning_rate": 1.0771403718263051e-05, - "loss": 0.6555, - "step": 348900 - }, - { - "epoch": 0.012, - "grad_norm": 0.9202615022659302, - "learning_rate": 1.0758360439520727e-05, - "loss": 0.6522, - "step": 349000 - }, - { - "epoch": 0.012, - "eval_loss": 2.057035207748413, - "eval_runtime": 51.8702, - "eval_samples_per_second": 196.529, - "eval_steps_per_second": 1.542, - "step": 349000 - }, - { - "epoch": 0.0122, - "grad_norm": 0.8476743102073669, - "learning_rate": 1.0745322897309124e-05, - "loss": 0.6623, - "step": 349100 - }, - { - "epoch": 0.0124, - "grad_norm": 0.9493403434753418, - "learning_rate": 1.073229109687974e-05, - "loss": 0.6697, - "step": 349200 - }, - { - "epoch": 0.0126, - "grad_norm": 0.8388432860374451, - "learning_rate": 1.07192650434818e-05, - "loss": 0.6494, - "step": 349300 - }, - { - "epoch": 0.0128, - "grad_norm": 0.9042513966560364, - "learning_rate": 1.0706244742362192e-05, - "loss": 0.6473, - "step": 349400 - }, - { - "epoch": 0.013, - "grad_norm": 0.8294413089752197, - "learning_rate": 1.06932301987655e-05, - "loss": 0.6652, - "step": 349500 - }, - { - "epoch": 0.0132, - "grad_norm": 0.9279148578643799, - "learning_rate": 1.0680221417933963e-05, - "loss": 0.6506, - "step": 349600 - }, - { - "epoch": 0.0134, - "grad_norm": 0.8778104782104492, - "learning_rate": 1.066721840510753e-05, - "loss": 0.663, - "step": 349700 - }, - { - "epoch": 0.0136, - "grad_norm": 0.8701128959655762, - "learning_rate": 1.0654221165523817e-05, - "loss": 0.6605, - "step": 349800 - }, - { - "epoch": 0.0138, - "grad_norm": 0.9396702647209167, - "learning_rate": 1.0641229704418093e-05, - "loss": 0.658, - "step": 349900 - }, - { - "epoch": 0.014, - "grad_norm": 0.891123354434967, - "learning_rate": 1.0628244027023329e-05, - "loss": 0.6186, - "step": 350000 - }, - { - "epoch": 0.014, - "eval_loss": 2.059767961502075, - "eval_runtime": 51.9881, - "eval_samples_per_second": 196.083, - "eval_steps_per_second": 1.539, - "step": 350000 - }, - { - "epoch": 0.0142, - "grad_norm": 0.8995864391326904, - "learning_rate": 1.061526413857015e-05, - "loss": 0.6545, - "step": 350100 - }, - { - "epoch": 0.0144, - "grad_norm": 0.8432427048683167, - "learning_rate": 1.0602290044286866e-05, - "loss": 0.6527, - "step": 350200 - }, - { - "epoch": 0.0146, - "grad_norm": 0.8539645671844482, - "learning_rate": 1.058932174939942e-05, - "loss": 0.66, - "step": 350300 - }, - { - "epoch": 0.0148, - "grad_norm": 0.8698434233665466, - "learning_rate": 1.0576359259131452e-05, - "loss": 0.6686, - "step": 350400 - }, - { - "epoch": 0.015, - "grad_norm": 0.8616706728935242, - "learning_rate": 1.0563402578704248e-05, - "loss": 0.6605, - "step": 350500 - }, - { - "epoch": 0.0152, - "grad_norm": 0.891680121421814, - "learning_rate": 1.0550451713336768e-05, - "loss": 0.6471, - "step": 350600 - }, - { - "epoch": 0.0154, - "grad_norm": 0.9290798306465149, - "learning_rate": 1.05375066682456e-05, - "loss": 0.6575, - "step": 350700 - }, - { - "epoch": 0.0156, - "grad_norm": 0.8489027619361877, - "learning_rate": 1.0524567448645018e-05, - "loss": 0.6484, - "step": 350800 - }, - { - "epoch": 0.0158, - "grad_norm": 0.8927240371704102, - "learning_rate": 1.0511634059746935e-05, - "loss": 0.6637, - "step": 350900 - }, - { - "epoch": 0.016, - "grad_norm": 0.8975149393081665, - "learning_rate": 1.0498706506760933e-05, - "loss": 0.6729, - "step": 351000 - }, - { - "epoch": 0.016, - "eval_loss": 2.0625927448272705, - "eval_runtime": 52.1361, - "eval_samples_per_second": 195.527, - "eval_steps_per_second": 1.534, - "step": 351000 - }, - { - "epoch": 0.0162, - "grad_norm": 0.8605362176895142, - "learning_rate": 1.0485784794894205e-05, - "loss": 0.6494, - "step": 351100 - }, - { - "epoch": 0.0164, - "grad_norm": 0.9211152791976929, - "learning_rate": 1.0472868929351622e-05, - "loss": 0.6661, - "step": 351200 - }, - { - "epoch": 0.0166, - "grad_norm": 0.9342173337936401, - "learning_rate": 1.045995891533571e-05, - "loss": 0.6567, - "step": 351300 - }, - { - "epoch": 0.0168, - "grad_norm": 0.9137123227119446, - "learning_rate": 1.0447054758046598e-05, - "loss": 0.6396, - "step": 351400 - }, - { - "epoch": 0.017, - "grad_norm": 0.9604211449623108, - "learning_rate": 1.043415646268209e-05, - "loss": 0.6496, - "step": 351500 - }, - { - "epoch": 0.0172, - "grad_norm": 0.8666329979896545, - "learning_rate": 1.0421264034437616e-05, - "loss": 0.664, - "step": 351600 - }, - { - "epoch": 0.0174, - "grad_norm": 0.86720871925354, - "learning_rate": 1.0408377478506253e-05, - "loss": 0.657, - "step": 351700 - }, - { - "epoch": 0.0176, - "grad_norm": 0.9042288064956665, - "learning_rate": 1.0395496800078692e-05, - "loss": 0.6564, - "step": 351800 - }, - { - "epoch": 0.0178, - "grad_norm": 0.9693347811698914, - "learning_rate": 1.038262200434327e-05, - "loss": 0.644, - "step": 351900 - }, - { - "epoch": 0.018, - "grad_norm": 0.8999383449554443, - "learning_rate": 1.0369753096485957e-05, - "loss": 0.6534, - "step": 352000 - }, - { - "epoch": 0.018, - "eval_loss": 2.0669960975646973, - "eval_runtime": 52.2938, - "eval_samples_per_second": 194.937, - "eval_steps_per_second": 1.53, - "step": 352000 - }, - { - "epoch": 0.0182, - "grad_norm": 0.907943844795227, - "learning_rate": 1.0356890081690356e-05, - "loss": 0.6459, - "step": 352100 - }, - { - "epoch": 0.0184, - "grad_norm": 0.866569995880127, - "learning_rate": 1.034403296513767e-05, - "loss": 0.6519, - "step": 352200 - }, - { - "epoch": 0.0186, - "grad_norm": 0.904236376285553, - "learning_rate": 1.0331181752006755e-05, - "loss": 0.6554, - "step": 352300 - }, - { - "epoch": 0.0188, - "grad_norm": 0.9165827035903931, - "learning_rate": 1.0318336447474075e-05, - "loss": 0.6773, - "step": 352400 - }, - { - "epoch": 0.019, - "grad_norm": 0.8540114164352417, - "learning_rate": 1.0305497056713726e-05, - "loss": 0.6529, - "step": 352500 - }, - { - "epoch": 0.0192, - "grad_norm": 0.9309752583503723, - "learning_rate": 1.0292663584897396e-05, - "loss": 0.6535, - "step": 352600 - }, - { - "epoch": 0.0194, - "grad_norm": 0.8861046433448792, - "learning_rate": 1.0279836037194417e-05, - "loss": 0.6607, - "step": 352700 - }, - { - "epoch": 0.0196, - "grad_norm": 0.9103682637214661, - "learning_rate": 1.026701441877173e-05, - "loss": 0.6708, - "step": 352800 - }, - { - "epoch": 0.0198, - "grad_norm": 0.9763253927230835, - "learning_rate": 1.0254198734793865e-05, - "loss": 0.6319, - "step": 352900 - }, - { - "epoch": 0.02, - "grad_norm": 0.8923797011375427, - "learning_rate": 1.0241388990422986e-05, - "loss": 0.6605, - "step": 353000 - }, - { - "epoch": 0.02, - "eval_loss": 2.066145658493042, - "eval_runtime": 52.3003, - "eval_samples_per_second": 194.913, - "eval_steps_per_second": 1.53, - "step": 353000 - }, - { - "epoch": 0.0202, - "grad_norm": 0.8869938850402832, - "learning_rate": 1.0228585190818857e-05, - "loss": 0.6594, - "step": 353100 - }, - { - "epoch": 0.0204, - "grad_norm": 0.8605444431304932, - "learning_rate": 1.0215787341138854e-05, - "loss": 0.664, - "step": 353200 - }, - { - "epoch": 0.0206, - "grad_norm": 1.001497745513916, - "learning_rate": 1.0202995446537933e-05, - "loss": 0.6574, - "step": 353300 - }, - { - "epoch": 0.0208, - "grad_norm": 0.8902758359909058, - "learning_rate": 1.0190209512168677e-05, - "loss": 0.6536, - "step": 353400 - }, - { - "epoch": 0.021, - "grad_norm": 0.9075655341148376, - "learning_rate": 1.017742954318127e-05, - "loss": 0.6545, - "step": 353500 - }, - { - "epoch": 0.0212, - "grad_norm": 0.9329447746276855, - "learning_rate": 1.016465554472346e-05, - "loss": 0.6589, - "step": 353600 - }, - { - "epoch": 0.0214, - "grad_norm": 0.8853082656860352, - "learning_rate": 1.0151887521940628e-05, - "loss": 0.6532, - "step": 353700 - }, - { - "epoch": 0.0216, - "grad_norm": 0.8958137631416321, - "learning_rate": 1.0139125479975722e-05, - "loss": 0.6563, - "step": 353800 - }, - { - "epoch": 0.0218, - "grad_norm": 0.865190863609314, - "learning_rate": 1.0126369423969293e-05, - "loss": 0.6585, - "step": 353900 - }, - { - "epoch": 0.022, - "grad_norm": 0.9948294162750244, - "learning_rate": 1.0113619359059482e-05, - "loss": 0.65, - "step": 354000 - }, - { - "epoch": 0.022, - "eval_loss": 2.085937976837158, - "eval_runtime": 52.093, - "eval_samples_per_second": 195.689, - "eval_steps_per_second": 1.536, - "step": 354000 - }, - { - "epoch": 0.0222, - "grad_norm": 0.9526733160018921, - "learning_rate": 1.0100875290382022e-05, - "loss": 0.6509, - "step": 354100 - }, - { - "epoch": 0.0224, - "grad_norm": 0.8897534608840942, - "learning_rate": 1.0088137223070205e-05, - "loss": 0.6609, - "step": 354200 - }, - { - "epoch": 0.0226, - "grad_norm": 0.8177494406700134, - "learning_rate": 1.007540516225493e-05, - "loss": 0.6531, - "step": 354300 - }, - { - "epoch": 0.0228, - "grad_norm": 0.9328579306602478, - "learning_rate": 1.006267911306468e-05, - "loss": 0.7497, - "step": 354400 - }, - { - "epoch": 0.023, - "grad_norm": 0.8657885193824768, - "learning_rate": 1.004995908062549e-05, - "loss": 0.7346, - "step": 354500 - }, - { - "epoch": 0.0232, - "grad_norm": 0.8872801661491394, - "learning_rate": 1.0037245070060991e-05, - "loss": 0.7475, - "step": 354600 - }, - { - "epoch": 0.0234, - "grad_norm": 0.8421425223350525, - "learning_rate": 1.002453708649239e-05, - "loss": 0.7338, - "step": 354700 - }, - { - "epoch": 0.0236, - "grad_norm": 0.8456546068191528, - "learning_rate": 1.0011835135038469e-05, - "loss": 0.7163, - "step": 354800 - }, - { - "epoch": 0.0238, - "grad_norm": 0.9232527613639832, - "learning_rate": 9.999139220815554e-06, - "loss": 0.715, - "step": 354900 - }, - { - "epoch": 0.024, - "grad_norm": 0.8569039702415466, - "learning_rate": 9.986449348937568e-06, - "loss": 0.7392, - "step": 355000 - }, - { - "epoch": 0.024, - "eval_loss": 2.056723117828369, - "eval_runtime": 52.2992, - "eval_samples_per_second": 194.917, - "eval_steps_per_second": 1.53, - "step": 355000 - }, - { - "epoch": 0.0242, - "grad_norm": 0.8463347554206848, - "learning_rate": 9.973765524515988e-06, - "loss": 0.719, - "step": 355100 - }, - { - "epoch": 0.0244, - "grad_norm": 0.9859148263931274, - "learning_rate": 9.961087752659866e-06, - "loss": 0.7161, - "step": 355200 - }, - { - "epoch": 0.0246, - "grad_norm": 0.8795856833457947, - "learning_rate": 9.94841603847579e-06, - "loss": 0.7211, - "step": 355300 - }, - { - "epoch": 0.0248, - "grad_norm": 0.8623588681221008, - "learning_rate": 9.935750387067935e-06, - "loss": 0.7134, - "step": 355400 - }, - { - "epoch": 0.025, - "grad_norm": 0.8915929794311523, - "learning_rate": 9.923090803538021e-06, - "loss": 0.718, - "step": 355500 - }, - { - "epoch": 0.0252, - "grad_norm": 0.9230467081069946, - "learning_rate": 9.91043729298534e-06, - "loss": 0.7092, - "step": 355600 - }, - { - "epoch": 0.0254, - "grad_norm": 0.9159933924674988, - "learning_rate": 9.8977898605067e-06, - "loss": 0.7139, - "step": 355700 - }, - { - "epoch": 0.0256, - "grad_norm": 1.0485515594482422, - "learning_rate": 9.885148511196502e-06, - "loss": 0.7071, - "step": 355800 - }, - { - "epoch": 0.0258, - "grad_norm": 0.8589327335357666, - "learning_rate": 9.872513250146681e-06, - "loss": 0.7102, - "step": 355900 - }, - { - "epoch": 0.026, - "grad_norm": 0.9215981960296631, - "learning_rate": 9.859884082446707e-06, - "loss": 0.6789, - "step": 356000 - }, - { - "epoch": 0.026, - "eval_loss": 2.081296920776367, - "eval_runtime": 52.2111, - "eval_samples_per_second": 195.246, - "eval_steps_per_second": 1.532, - "step": 356000 - }, - { - "epoch": 0.0262, - "grad_norm": 0.8868950605392456, - "learning_rate": 9.847261013183615e-06, - "loss": 0.6801, - "step": 356100 - }, - { - "epoch": 0.0264, - "grad_norm": 0.9825394749641418, - "learning_rate": 9.834644047441974e-06, - "loss": 0.6582, - "step": 356200 - }, - { - "epoch": 0.0266, - "grad_norm": 0.8572143316268921, - "learning_rate": 9.822033190303906e-06, - "loss": 0.6731, - "step": 356300 - }, - { - "epoch": 0.0268, - "grad_norm": 0.8867204785346985, - "learning_rate": 9.809428446849044e-06, - "loss": 0.6634, - "step": 356400 - }, - { - "epoch": 0.027, - "grad_norm": 0.8682609796524048, - "learning_rate": 9.796829822154589e-06, - "loss": 0.6678, - "step": 356500 - }, - { - "epoch": 0.0272, - "grad_norm": 0.8932370543479919, - "learning_rate": 9.784237321295262e-06, - "loss": 0.6707, - "step": 356600 - }, - { - "epoch": 0.0274, - "grad_norm": 0.860748291015625, - "learning_rate": 9.771650949343331e-06, - "loss": 0.6604, - "step": 356700 - }, - { - "epoch": 0.0276, - "grad_norm": 0.8779944181442261, - "learning_rate": 9.759070711368568e-06, - "loss": 0.6639, - "step": 356800 - }, - { - "epoch": 0.0278, - "grad_norm": 0.9277738928794861, - "learning_rate": 9.746496612438299e-06, - "loss": 0.6617, - "step": 356900 - }, - { - "epoch": 0.028, - "grad_norm": 0.8405406475067139, - "learning_rate": 9.733928657617373e-06, - "loss": 0.6663, - "step": 357000 - }, - { - "epoch": 0.028, - "eval_loss": 2.0634403228759766, - "eval_runtime": 52.3193, - "eval_samples_per_second": 194.842, - "eval_steps_per_second": 1.529, - "step": 357000 - }, - { - "epoch": 0.0282, - "grad_norm": 0.8827060461044312, - "learning_rate": 9.721366851968165e-06, - "loss": 0.6748, - "step": 357100 - }, - { - "epoch": 0.0284, - "grad_norm": 0.908746063709259, - "learning_rate": 9.708811200550552e-06, - "loss": 0.6614, - "step": 357200 - }, - { - "epoch": 0.0286, - "grad_norm": 0.8800754547119141, - "learning_rate": 9.69626170842196e-06, - "loss": 0.6661, - "step": 357300 - }, - { - "epoch": 0.0288, - "grad_norm": 0.9010385870933533, - "learning_rate": 9.68371838063733e-06, - "loss": 0.6466, - "step": 357400 - }, - { - "epoch": 0.029, - "grad_norm": 0.868073046207428, - "learning_rate": 9.671181222249099e-06, - "loss": 0.6561, - "step": 357500 - }, - { - "epoch": 0.0292, - "grad_norm": 0.982118546962738, - "learning_rate": 9.658650238307235e-06, - "loss": 0.6696, - "step": 357600 - }, - { - "epoch": 0.0294, - "grad_norm": 0.832084059715271, - "learning_rate": 9.646125433859221e-06, - "loss": 0.6513, - "step": 357700 - }, - { - "epoch": 0.0296, - "grad_norm": 0.9348160028457642, - "learning_rate": 9.633606813950055e-06, - "loss": 0.6558, - "step": 357800 - }, - { - "epoch": 0.0298, - "grad_norm": 0.8417104482650757, - "learning_rate": 9.621094383622217e-06, - "loss": 0.6621, - "step": 357900 - }, - { - "epoch": 0.03, - "grad_norm": 0.8583792448043823, - "learning_rate": 9.608588147915726e-06, - "loss": 0.6572, - "step": 358000 - }, - { - "epoch": 0.03, - "eval_loss": 2.086122512817383, - "eval_runtime": 52.2197, - "eval_samples_per_second": 195.214, - "eval_steps_per_second": 1.532, - "step": 358000 - }, - { - "epoch": 0.0002, - "grad_norm": 0.8814049959182739, - "learning_rate": 9.596088111868085e-06, - "loss": 0.653, - "step": 358100 - }, - { - "epoch": 0.0004, - "grad_norm": 0.8665258288383484, - "learning_rate": 9.583594280514318e-06, - "loss": 0.6518, - "step": 358200 - }, - { - "epoch": 0.0006, - "grad_norm": 0.9076094627380371, - "learning_rate": 9.571106658886925e-06, - "loss": 0.6583, - "step": 358300 - }, - { - "epoch": 0.0008, - "grad_norm": 0.9470544457435608, - "learning_rate": 9.558625252015924e-06, - "loss": 0.6539, - "step": 358400 - }, - { - "epoch": 0.001, - "grad_norm": 0.9310306310653687, - "learning_rate": 9.546150064928824e-06, - "loss": 0.661, - "step": 358500 - }, - { - "epoch": 0.0012, - "grad_norm": 0.8882910013198853, - "learning_rate": 9.53368110265064e-06, - "loss": 0.6644, - "step": 358600 - }, - { - "epoch": 0.0014, - "grad_norm": 0.912969172000885, - "learning_rate": 9.52121837020385e-06, - "loss": 0.6477, - "step": 358700 - }, - { - "epoch": 0.0016, - "grad_norm": 0.9159826040267944, - "learning_rate": 9.50876187260845e-06, - "loss": 0.6581, - "step": 358800 - }, - { - "epoch": 0.0018, - "grad_norm": 0.8334347605705261, - "learning_rate": 9.49631161488192e-06, - "loss": 0.6605, - "step": 358900 - }, - { - "epoch": 0.002, - "grad_norm": 0.9216808676719666, - "learning_rate": 9.483867602039212e-06, - "loss": 0.6609, - "step": 359000 - }, - { - "epoch": 0.002, - "eval_loss": 2.071388006210327, - "eval_runtime": 52.0422, - "eval_samples_per_second": 195.879, - "eval_steps_per_second": 1.537, - "step": 359000 - }, - { - "epoch": 0.0022, - "grad_norm": 0.9010413289070129, - "learning_rate": 9.471429839092777e-06, - "loss": 0.6428, - "step": 359100 - }, - { - "epoch": 0.0024, - "grad_norm": 0.8659740686416626, - "learning_rate": 9.458998331052546e-06, - "loss": 0.6462, - "step": 359200 - }, - { - "epoch": 0.0026, - "grad_norm": 0.9039402604103088, - "learning_rate": 9.446573082925938e-06, - "loss": 0.6413, - "step": 359300 - }, - { - "epoch": 0.0028, - "grad_norm": 0.9015378952026367, - "learning_rate": 9.434154099717824e-06, - "loss": 0.6521, - "step": 359400 - }, - { - "epoch": 0.003, - "grad_norm": 0.8885050415992737, - "learning_rate": 9.421741386430575e-06, - "loss": 0.647, - "step": 359500 - }, - { - "epoch": 0.0032, - "grad_norm": 0.8669450879096985, - "learning_rate": 9.409334948064033e-06, - "loss": 0.6564, - "step": 359600 - }, - { - "epoch": 0.0034, - "grad_norm": 0.9445268511772156, - "learning_rate": 9.396934789615519e-06, - "loss": 0.6683, - "step": 359700 - }, - { - "epoch": 0.0036, - "grad_norm": 0.8911668062210083, - "learning_rate": 9.384540916079798e-06, - "loss": 0.6713, - "step": 359800 - }, - { - "epoch": 0.0038, - "grad_norm": 0.8700185418128967, - "learning_rate": 9.372153332449127e-06, - "loss": 0.6621, - "step": 359900 - }, - { - "epoch": 0.004, - "grad_norm": 0.8949635028839111, - "learning_rate": 9.359772043713226e-06, - "loss": 0.6468, - "step": 360000 - }, - { - "epoch": 0.004, - "eval_loss": 2.0606133937835693, - "eval_runtime": 51.5712, - "eval_samples_per_second": 197.668, - "eval_steps_per_second": 1.551, - "step": 360000 - }, - { - "epoch": 0.0042, - "grad_norm": 0.875957190990448, - "learning_rate": 9.347397054859283e-06, - "loss": 0.6823, - "step": 360100 - }, - { - "epoch": 0.0044, - "grad_norm": 0.8829663395881653, - "learning_rate": 9.335028370871925e-06, - "loss": 0.6758, - "step": 360200 - }, - { - "epoch": 0.0046, - "grad_norm": 0.8770716786384583, - "learning_rate": 9.322665996733268e-06, - "loss": 0.6601, - "step": 360300 - }, - { - "epoch": 0.0048, - "grad_norm": 0.9599934220314026, - "learning_rate": 9.310309937422873e-06, - "loss": 0.666, - "step": 360400 - }, - { - "epoch": 0.005, - "grad_norm": 0.8904752135276794, - "learning_rate": 9.297960197917766e-06, - "loss": 0.662, - "step": 360500 - }, - { - "epoch": 0.0052, - "grad_norm": 0.9215303659439087, - "learning_rate": 9.285616783192404e-06, - "loss": 0.6637, - "step": 360600 - }, - { - "epoch": 0.0054, - "grad_norm": 0.9662516117095947, - "learning_rate": 9.273279698218726e-06, - "loss": 0.6735, - "step": 360700 - }, - { - "epoch": 0.0056, - "grad_norm": 0.9039230346679688, - "learning_rate": 9.260948947966111e-06, - "loss": 0.682, - "step": 360800 - }, - { - "epoch": 0.0058, - "grad_norm": 0.914978563785553, - "learning_rate": 9.248624537401368e-06, - "loss": 0.6691, - "step": 360900 - }, - { - "epoch": 0.006, - "grad_norm": 0.8637982606887817, - "learning_rate": 9.236306471488779e-06, - "loss": 0.6775, - "step": 361000 - }, - { - "epoch": 0.006, - "eval_loss": 2.0751538276672363, - "eval_runtime": 51.7366, - "eval_samples_per_second": 197.037, - "eval_steps_per_second": 1.546, - "step": 361000 - }, - { - "epoch": 0.0062, - "grad_norm": 0.8795140981674194, - "learning_rate": 9.223994755190058e-06, - "loss": 0.683, - "step": 361100 + { + "epoch": 0.0192, + "grad_norm": 2.1285250186920166, + "learning_rate": 8.64920647525223e-05, + "loss": 0.6978, + "step": 23600 }, { - "epoch": 0.0064, - "grad_norm": 0.9144249558448792, - "learning_rate": 9.21168939346437e-06, - "loss": 0.7081, - "step": 361200 + "epoch": 0.019398, + "loss_gen": 4.192156791687012, + "loss_rtd": 0.4084530770778656, + "loss_sent": 0.38544365763664246, + "loss_sod": 0.018873078748583794, + "loss_total": 0.8127697706222534, + "step": 23699 }, { - "epoch": 0.0066, - "grad_norm": 0.8885230422019958, - "learning_rate": 9.199390391268301e-06, - "loss": 0.6968, - "step": 361300 + "epoch": 0.019398, + "loss_gen": 4.107691287994385, + "loss_rtd": 0.4033298194408417, + "loss_sent": 0.21068085730075836, + "loss_sod": 0.11531396210193634, + "loss_total": 0.7293246388435364, + "step": 23699 }, { - "epoch": 0.0068, - "grad_norm": 0.8315828442573547, - "learning_rate": 9.18709775355589e-06, + "epoch": 0.0194, + "grad_norm": 1.0886321067810059, + "learning_rate": 8.647036402111202e-05, "loss": 0.6809, - "step": 361400 + "step": 23700 }, { - "epoch": 0.007, - "grad_norm": 0.8375496864318848, - "learning_rate": 9.174811485278614e-06, - "loss": 0.686, - "step": 361500 + "epoch": 0.019598, + "loss_gen": 4.093698024749756, + "loss_rtd": 0.41472548246383667, + "loss_sent": 0.5435208678245544, + "loss_sod": 0.020486906170845032, + "loss_total": 0.978733241558075, + "step": 23799 }, { - "epoch": 0.0072, - "grad_norm": 0.9053453207015991, - "learning_rate": 9.162531591385387e-06, - "loss": 0.6921, - "step": 361600 + "epoch": 0.019598, + "loss_gen": 4.09850549697876, + "loss_rtd": 0.40556395053863525, + "loss_sent": 0.2776758670806885, + "loss_sod": 0.030896909534931183, + "loss_total": 0.7141367197036743, + "step": 23799 }, { - "epoch": 0.0074, - "grad_norm": 0.8914540410041809, - "learning_rate": 9.150258076822535e-06, - "loss": 0.6832, - "step": 361700 + "epoch": 0.0196, + "grad_norm": 1.3980774879455566, + "learning_rate": 8.644864859944348e-05, + "loss": 0.7079, + "step": 23800 }, { - "epoch": 0.0076, - "grad_norm": 0.8982157707214355, - "learning_rate": 9.13799094653383e-06, - "loss": 0.6969, - "step": 361800 + "epoch": 0.019798, + "loss_gen": 2.1838574409484863, + "loss_rtd": 0.38736552000045776, + "loss_sent": 0.0054249693639576435, + "loss_sod": 0.31862956285476685, + "loss_total": 0.7114200592041016, + "step": 23899 }, { - "epoch": 0.0078, - "grad_norm": 1.0123343467712402, - "learning_rate": 9.125730205460478e-06, - "loss": 0.6915, - "step": 361900 + "epoch": 0.019798, + "loss_gen": 4.038309097290039, + "loss_rtd": 0.40513619780540466, + "loss_sent": 0.3140098452568054, + "loss_sod": 0.07896880805492401, + "loss_total": 0.7981148362159729, + "step": 23899 }, { - "epoch": 0.008, - "grad_norm": 0.904523491859436, - "learning_rate": 9.113475858541118e-06, - "loss": 0.6884, - "step": 362000 + "epoch": 0.0198, + "grad_norm": 1.0776424407958984, + "learning_rate": 8.642691849626364e-05, + "loss": 0.703, + "step": 23900 }, { - "epoch": 0.008, - "eval_loss": 2.0824785232543945, - "eval_runtime": 51.6588, - "eval_samples_per_second": 197.333, - "eval_steps_per_second": 1.549, - "step": 362000 + "epoch": 0.019998, + "loss_gen": 2.5359106063842773, + "loss_rtd": 0.36821162700653076, + "loss_sent": 0.031892210245132446, + "loss_sod": 0.18366624414920807, + "loss_total": 0.5837700366973877, + "step": 23999 }, { - "epoch": 0.0082, - "grad_norm": 0.8671389818191528, - "learning_rate": 9.101227910711765e-06, - "loss": 0.706, - "step": 362100 + "epoch": 0.019998, + "loss_gen": 3.8365397453308105, + "loss_rtd": 0.39642348885536194, + "loss_sent": 0.1128220409154892, + "loss_sod": 0.025802068412303925, + "loss_total": 0.5350475907325745, + "step": 23999 }, { - "epoch": 0.0084, - "grad_norm": 0.8754188418388367, - "learning_rate": 9.088986366905908e-06, - "loss": 0.6918, - "step": 362200 + "epoch": 0.02, + "grad_norm": 0.9664748311042786, + "learning_rate": 8.64051737203254e-05, + "loss": 0.7152, + "step": 24000 }, { - "epoch": 0.0086, - "grad_norm": 0.8821722865104675, - "learning_rate": 9.076751232054439e-06, - "loss": 0.6902, - "step": 362300 + "epoch": 0.02, + "eval_loss": 0.676652729511261, + "eval_runtime": 151.6275, + "eval_samples_per_second": 101.848, + "eval_steps_per_second": 0.798, + "step": 24000 }, { - "epoch": 0.0088, - "grad_norm": 0.8519936800003052, - "learning_rate": 9.064522511085677e-06, - "loss": 0.6897, - "step": 362400 + "epoch": 0.020198, + "loss_gen": 2.9289157390594482, + "loss_rtd": 0.39156660437583923, + "loss_sent": 0.08221442252397537, + "loss_sod": 0.12104234844446182, + "loss_total": 0.5948233604431152, + "step": 24099 }, { - "epoch": 0.009, - "grad_norm": 0.9249884486198425, - "learning_rate": 9.052300208925335e-06, - "loss": 0.6762, - "step": 362500 + "epoch": 0.020198, + "loss_gen": 4.0122151374816895, + "loss_rtd": 0.4064323306083679, + "loss_sent": 0.19350147247314453, + "loss_sod": 0.06628310680389404, + "loss_total": 0.6662169098854065, + "step": 24099 }, { - "epoch": 0.0092, - "grad_norm": 0.9254834651947021, - "learning_rate": 9.040084330496562e-06, - "loss": 0.6836, - "step": 362600 + "epoch": 0.0202, + "grad_norm": 1.2565594911575317, + "learning_rate": 8.638341428038752e-05, + "loss": 0.6899, + "step": 24100 }, { - "epoch": 0.0094, - "grad_norm": 0.907455325126648, - "learning_rate": 9.027874880719911e-06, - "loss": 0.6816, - "step": 362700 + "epoch": 0.020398, + "loss_gen": 4.0760273933410645, + "loss_rtd": 0.43153977394104004, + "loss_sent": 0.20869547128677368, + "loss_sod": 0.07006371021270752, + "loss_total": 0.7102989554405212, + "step": 24199 }, { - "epoch": 0.0096, - "grad_norm": 0.8891639709472656, - "learning_rate": 9.015671864513356e-06, - "loss": 0.6493, - "step": 362800 + "epoch": 0.020398, + "loss_gen": 4.087414264678955, + "loss_rtd": 0.40784889459609985, + "loss_sent": 0.14950834214687347, + "loss_sod": 0.05528098717331886, + "loss_total": 0.6126382350921631, + "step": 24199 }, { - "epoch": 0.0098, - "grad_norm": 0.9093591570854187, - "learning_rate": 9.003475286792257e-06, - "loss": 0.659, - "step": 362900 + "epoch": 0.0204, + "grad_norm": 1.1879384517669678, + "learning_rate": 8.636164018521473e-05, + "loss": 0.6932, + "step": 24200 }, { - "epoch": 0.01, - "grad_norm": 0.8426594138145447, - "learning_rate": 8.991285152469395e-06, - "loss": 0.6498, - "step": 363000 + "epoch": 0.020598, + "loss_gen": 4.045533180236816, + "loss_rtd": 0.4118155241012573, + "loss_sent": 0.12074345350265503, + "loss_sod": 0.10650644451379776, + "loss_total": 0.6390654444694519, + "step": 24299 }, { - "epoch": 0.01, - "eval_loss": 2.0885329246520996, - "eval_runtime": 51.6994, - "eval_samples_per_second": 197.178, - "eval_steps_per_second": 1.547, - "step": 363000 + "epoch": 0.020598, + "loss_gen": 3.007084608078003, + "loss_rtd": 0.40772879123687744, + "loss_sent": 0.13143962621688843, + "loss_sod": 0.29404184222221375, + "loss_total": 0.833210289478302, + "step": 24299 }, { - "epoch": 0.0102, - "grad_norm": 0.9149935245513916, - "learning_rate": 8.979101466454962e-06, - "loss": 0.6595, - "step": 363100 + "epoch": 0.0206, + "grad_norm": 0.9798792600631714, + "learning_rate": 8.633985144357762e-05, + "loss": 0.6961, + "step": 24300 }, { - "epoch": 0.0104, - "grad_norm": 0.893366277217865, - "learning_rate": 8.966924233656552e-06, - "loss": 0.6622, - "step": 363200 + "epoch": 0.020798, + "loss_gen": 3.431025981903076, + "loss_rtd": 0.38585808873176575, + "loss_sent": 0.08852383494377136, + "loss_sod": 0.0714193657040596, + "loss_total": 0.5458012819290161, + "step": 24399 }, { - "epoch": 0.0106, - "grad_norm": 0.8946834206581116, - "learning_rate": 8.954753458979132e-06, - "loss": 0.6639, - "step": 363300 + "epoch": 0.020798, + "loss_gen": 3.961956739425659, + "loss_rtd": 0.40572047233581543, + "loss_sent": 0.1551080048084259, + "loss_sod": 0.0599331334233284, + "loss_total": 0.6207616329193115, + "step": 24399 }, { - "epoch": 0.0108, - "grad_norm": 0.8848134279251099, - "learning_rate": 8.9425891473251e-06, - "loss": 0.6623, - "step": 363400 + "epoch": 0.0208, + "grad_norm": 0.948943555355072, + "learning_rate": 8.63180480642527e-05, + "loss": 0.7037, + "step": 24400 }, { - "epoch": 0.011, - "grad_norm": 0.8674115538597107, - "learning_rate": 8.93043130359425e-06, - "loss": 0.6483, - "step": 363500 + "epoch": 0.020998, + "loss_gen": 3.923896074295044, + "loss_rtd": 0.38946792483329773, + "loss_sent": 0.3071112036705017, + "loss_sod": 0.03382628783583641, + "loss_total": 0.7304054498672485, + "step": 24499 }, { - "epoch": 0.0112, - "grad_norm": 0.8136773109436035, - "learning_rate": 8.91827993268374e-06, - "loss": 0.6598, - "step": 363600 + "epoch": 0.020998, + "loss_gen": 3.72622013092041, + "loss_rtd": 0.40381157398223877, + "loss_sent": 0.1826399266719818, + "loss_sod": 0.024815665557980537, + "loss_total": 0.6112672090530396, + "step": 24499 }, { - "epoch": 0.0114, - "grad_norm": 0.9210416674613953, - "learning_rate": 8.906135039488148e-06, - "loss": 0.6427, - "step": 363700 + "epoch": 0.021, + "grad_norm": 0.820321261882782, + "learning_rate": 8.629623005602234e-05, + "loss": 0.6752, + "step": 24500 }, { - "epoch": 0.0116, - "grad_norm": 0.8708541393280029, - "learning_rate": 8.89399662889944e-06, - "loss": 0.6523, - "step": 363800 + "epoch": 0.021198, + "loss_gen": 4.044614791870117, + "loss_rtd": 0.4168679714202881, + "loss_sent": 0.23598124086856842, + "loss_sod": 0.16695289313793182, + "loss_total": 0.8198021054267883, + "step": 24599 }, { - "epoch": 0.0118, - "grad_norm": 0.8490440845489502, - "learning_rate": 8.881864705806971e-06, - "loss": 0.6571, - "step": 363900 + "epoch": 0.021198, + "loss_gen": 2.699082374572754, + "loss_rtd": 0.3844226598739624, + "loss_sent": 0.015604406595230103, + "loss_sod": 0.23557092249393463, + "loss_total": 0.6355979442596436, + "step": 24599 }, { - "epoch": 0.012, - "grad_norm": 0.8714786767959595, - "learning_rate": 8.869739275097464e-06, - "loss": 0.6535, - "step": 364000 + "epoch": 0.0212, + "grad_norm": 1.1755541563034058, + "learning_rate": 8.627439742767488e-05, + "loss": 0.6817, + "step": 24600 }, { - "epoch": 0.012, - "eval_loss": 2.0917515754699707, - "eval_runtime": 51.7459, - "eval_samples_per_second": 197.001, - "eval_steps_per_second": 1.546, - "step": 364000 + "epoch": 0.021398, + "loss_gen": 3.3173317909240723, + "loss_rtd": 0.38064172863960266, + "loss_sent": 0.07833272218704224, + "loss_sod": 0.08912532776594162, + "loss_total": 0.5480997562408447, + "step": 24699 }, { - "epoch": 0.0122, - "grad_norm": 0.8995687961578369, - "learning_rate": 8.857620341655045e-06, - "loss": 0.6561, - "step": 364100 + "epoch": 0.021398, + "loss_gen": 3.951279640197754, + "loss_rtd": 0.40371090173721313, + "loss_sent": 0.1179547980427742, + "loss_sod": 0.03314758092164993, + "loss_total": 0.5548132658004761, + "step": 24699 }, { - "epoch": 0.0124, - "grad_norm": 0.9087790846824646, - "learning_rate": 8.845507910361223e-06, - "loss": 0.6506, - "step": 364200 + "epoch": 0.0214, + "grad_norm": 0.7562770843505859, + "learning_rate": 8.625255018800446e-05, + "loss": 0.6785, + "step": 24700 }, { - "epoch": 0.0126, - "grad_norm": 0.9006063342094421, - "learning_rate": 8.833401986094893e-06, - "loss": 0.6628, - "step": 364300 + "epoch": 0.021598, + "loss_gen": 2.5105037689208984, + "loss_rtd": 0.3946555554866791, + "loss_sent": 0.010327634401619434, + "loss_sod": 0.3795273005962372, + "loss_total": 0.7845104932785034, + "step": 24799 }, { - "epoch": 0.0128, - "grad_norm": 0.9575886726379395, - "learning_rate": 8.821302573732302e-06, - "loss": 0.6563, - "step": 364400 + "epoch": 0.021598, + "loss_gen": 3.9272818565368652, + "loss_rtd": 0.39661532640457153, + "loss_sent": 0.2233789712190628, + "loss_sod": 0.039907559752464294, + "loss_total": 0.6599018573760986, + "step": 24799 }, { - "epoch": 0.013, - "grad_norm": 0.8845739960670471, - "learning_rate": 8.809209678147095e-06, - "loss": 0.649, - "step": 364500 + "epoch": 0.0216, + "grad_norm": 1.2474645376205444, + "learning_rate": 8.623068834581116e-05, + "loss": 0.7006, + "step": 24800 }, { - "epoch": 0.0132, - "grad_norm": 0.8682934641838074, - "learning_rate": 8.797123304210298e-06, - "loss": 0.6513, - "step": 364600 + "epoch": 0.021798, + "loss_gen": 3.11579966545105, + "loss_rtd": 0.40773114562034607, + "loss_sent": 0.07494784891605377, + "loss_sod": 0.16161800920963287, + "loss_total": 0.6442970037460327, + "step": 24899 }, { - "epoch": 0.0134, - "grad_norm": 0.8966580033302307, - "learning_rate": 8.785043456790302e-06, - "loss": 0.6443, - "step": 364700 + "epoch": 0.021798, + "loss_gen": 3.445728063583374, + "loss_rtd": 0.4030747413635254, + "loss_sent": 0.014770938083529472, + "loss_sod": 0.19394944608211517, + "loss_total": 0.6117951273918152, + "step": 24899 }, { - "epoch": 0.0136, - "grad_norm": 0.8867930769920349, - "learning_rate": 8.772970140752854e-06, - "loss": 0.6473, - "step": 364800 + "epoch": 0.0218, + "grad_norm": 0.9890291094779968, + "learning_rate": 8.620881190990095e-05, + "loss": 0.6908, + "step": 24900 }, { - "epoch": 0.0138, - "grad_norm": 0.8712829351425171, - "learning_rate": 8.760903360961096e-06, - "loss": 0.6428, - "step": 364900 + "epoch": 0.021998, + "loss_gen": 3.799651861190796, + "loss_rtd": 0.4000953435897827, + "loss_sent": 0.13671578466892242, + "loss_sod": 0.10781806707382202, + "loss_total": 0.644629180431366, + "step": 24999 }, { - "epoch": 0.014, - "grad_norm": 0.8830559253692627, - "learning_rate": 8.748843122275519e-06, - "loss": 0.657, - "step": 365000 + "epoch": 0.021998, + "loss_gen": 3.714355230331421, + "loss_rtd": 0.3902835547924042, + "loss_sent": 0.342161625623703, + "loss_sod": 0.006307260133326054, + "loss_total": 0.7387524843215942, + "step": 24999 }, { - "epoch": 0.014, - "eval_loss": 2.077829122543335, - "eval_runtime": 51.6249, - "eval_samples_per_second": 197.463, - "eval_steps_per_second": 1.55, - "step": 365000 + "epoch": 0.022, + "grad_norm": 2.0926296710968018, + "learning_rate": 8.618692088908561e-05, + "loss": 0.6998, + "step": 25000 }, { - "epoch": 0.0142, - "grad_norm": 0.9168245792388916, - "learning_rate": 8.736789429553998e-06, - "loss": 0.6542, - "step": 365100 + "epoch": 0.022, + "eval_loss": 0.6693914532661438, + "eval_runtime": 151.6384, + "eval_samples_per_second": 101.841, + "eval_steps_per_second": 0.798, + "step": 25000 }, { - "epoch": 0.0144, - "grad_norm": 0.9041379690170288, - "learning_rate": 8.724742287651741e-06, - "loss": 0.6422, - "step": 365200 + "epoch": 0.022198, + "loss_gen": 4.19512414932251, + "loss_rtd": 0.3900134265422821, + "loss_sent": 0.18684500455856323, + "loss_sod": 0.02522754669189453, + "loss_total": 0.6020859479904175, + "step": 25099 }, { - "epoch": 0.0146, - "grad_norm": 0.8760838508605957, - "learning_rate": 8.712701701421344e-06, - "loss": 0.6532, - "step": 365300 + "epoch": 0.022198, + "loss_gen": 4.123718738555908, + "loss_rtd": 0.38469183444976807, + "loss_sent": 0.25549593567848206, + "loss_sod": 0.048402104526758194, + "loss_total": 0.688589870929718, + "step": 25099 }, { - "epoch": 0.0148, - "grad_norm": 0.8739610910415649, - "learning_rate": 8.700667675712764e-06, - "loss": 0.6485, - "step": 365400 + "epoch": 0.0222, + "grad_norm": 0.8465349078178406, + "learning_rate": 8.616501529218286e-05, + "loss": 0.6847, + "step": 25100 }, { - "epoch": 0.015, - "grad_norm": 0.9175285696983337, - "learning_rate": 8.688640215373287e-06, - "loss": 0.6433, - "step": 365500 + "epoch": 0.022398, + "loss_gen": 4.1009202003479, + "loss_rtd": 0.4004979431629181, + "loss_sent": 0.2241445779800415, + "loss_sod": 0.06554199010133743, + "loss_total": 0.690184473991394, + "step": 25199 }, { - "epoch": 0.0152, - "grad_norm": 0.8679957985877991, - "learning_rate": 8.676619325247578e-06, - "loss": 0.627, - "step": 365600 + "epoch": 0.022398, + "loss_gen": 4.1883864402771, + "loss_rtd": 0.4093237817287445, + "loss_sent": 0.4629150629043579, + "loss_sod": 0.046529121696949005, + "loss_total": 0.9187679290771484, + "step": 25199 }, { - "epoch": 0.0154, - "grad_norm": 0.9219822287559509, - "learning_rate": 8.664605010177653e-06, - "loss": 0.6342, - "step": 365700 + "epoch": 0.0224, + "grad_norm": 0.9161996841430664, + "learning_rate": 8.614309512801628e-05, + "loss": 0.6966, + "step": 25200 }, { - "epoch": 0.0156, - "grad_norm": 0.8707392811775208, - "learning_rate": 8.652597275002888e-06, - "loss": 0.6441, - "step": 365800 + "epoch": 0.022598, + "loss_gen": 4.16085958480835, + "loss_rtd": 0.39612576365470886, + "loss_sent": 0.09838993847370148, + "loss_sod": 0.04299217090010643, + "loss_total": 0.5375078916549683, + "step": 25299 }, { - "epoch": 0.0158, - "grad_norm": 0.8975892663002014, - "learning_rate": 8.640596124559975e-06, - "loss": 0.6119, - "step": 365900 + "epoch": 0.022598, + "loss_gen": 3.9336602687835693, + "loss_rtd": 0.40497827529907227, + "loss_sent": 0.2555369436740875, + "loss_sod": 0.17616719007492065, + "loss_total": 0.8366824388504028, + "step": 25299 }, { - "epoch": 0.016, - "grad_norm": 0.8921619057655334, - "learning_rate": 8.628601563682986e-06, - "loss": 0.6493, - "step": 366000 + "epoch": 0.0226, + "grad_norm": 0.8919493556022644, + "learning_rate": 8.612116040541531e-05, + "loss": 0.6924, + "step": 25300 }, { - "epoch": 0.016, - "eval_loss": 2.0901429653167725, - "eval_runtime": 51.9763, - "eval_samples_per_second": 196.128, - "eval_steps_per_second": 1.539, - "step": 366000 + "epoch": 0.022798, + "loss_gen": 4.016414642333984, + "loss_rtd": 0.42911723256111145, + "loss_sent": 0.15751326084136963, + "loss_sod": 0.0035857190378010273, + "loss_total": 0.5902162194252014, + "step": 25399 }, { - "epoch": 0.0162, - "grad_norm": 0.9101726412773132, - "learning_rate": 8.616613597203333e-06, - "loss": 0.6456, - "step": 366100 + "epoch": 0.022798, + "loss_gen": 3.0825438499450684, + "loss_rtd": 0.38710057735443115, + "loss_sent": 0.023861445486545563, + "loss_sod": 0.08589092642068863, + "loss_total": 0.49685293436050415, + "step": 25399 }, { - "epoch": 0.0164, - "grad_norm": 0.9642266035079956, - "learning_rate": 8.604632229949768e-06, - "loss": 0.6411, - "step": 366200 + "epoch": 0.0228, + "grad_norm": 0.7878300547599792, + "learning_rate": 8.609921113321526e-05, + "loss": 0.6836, + "step": 25400 }, { - "epoch": 0.0166, - "grad_norm": 0.8600582480430603, - "learning_rate": 8.592657466748372e-06, - "loss": 0.635, - "step": 366300 + "epoch": 0.022998, + "loss_gen": 2.6240932941436768, + "loss_rtd": 0.38083410263061523, + "loss_sent": 0.0003823730512522161, + "loss_sod": 0.23906835913658142, + "loss_total": 0.6202848553657532, + "step": 25499 }, { - "epoch": 0.0168, - "grad_norm": 0.9204874038696289, - "learning_rate": 8.580689312422587e-06, - "loss": 0.6456, - "step": 366400 + "epoch": 0.022998, + "loss_gen": 4.091073513031006, + "loss_rtd": 0.40128275752067566, + "loss_sent": 0.13246385753154755, + "loss_sod": 0.09995909035205841, + "loss_total": 0.633705735206604, + "step": 25499 }, { - "epoch": 0.017, - "grad_norm": 0.857318103313446, - "learning_rate": 8.568727771793186e-06, - "loss": 0.6385, - "step": 366500 + "epoch": 0.023, + "grad_norm": 1.3569259643554688, + "learning_rate": 8.607724732025726e-05, + "loss": 0.6916, + "step": 25500 }, { - "epoch": 0.0172, - "grad_norm": 0.9361177682876587, - "learning_rate": 8.55677284967828e-06, - "loss": 0.6299, - "step": 366600 + "epoch": 0.023198, + "loss_gen": 3.147066354751587, + "loss_rtd": 0.3897458612918854, + "loss_sent": 0.14791350066661835, + "loss_sod": 0.16765955090522766, + "loss_total": 0.7053189277648926, + "step": 25599 }, { - "epoch": 1.000196, - "grad_norm": 0.9187692999839783, - "learning_rate": 8.544824550893294e-06, - "loss": 0.6425, - "step": 366700 + "epoch": 0.023198, + "loss_gen": 3.56636381149292, + "loss_rtd": 0.39919835329055786, + "loss_sent": 0.1255677193403244, + "loss_sod": 0.1003207266330719, + "loss_total": 0.625086784362793, + "step": 25599 }, { - "epoch": 1.000396, - "grad_norm": 0.8672967553138733, - "learning_rate": 8.532882880251011e-06, - "loss": 0.6341, - "step": 366800 + "epoch": 0.0232, + "grad_norm": 0.9187750816345215, + "learning_rate": 8.605526897538836e-05, + "loss": 0.6918, + "step": 25600 }, { - "epoch": 1.000596, - "grad_norm": 0.888131320476532, - "learning_rate": 8.520947842561544e-06, - "loss": 0.6451, - "step": 366900 + "epoch": 0.023398, + "loss_gen": 3.95824933052063, + "loss_rtd": 0.4032837748527527, + "loss_sent": 0.3386683762073517, + "loss_sod": 0.08816278725862503, + "loss_total": 0.8301149606704712, + "step": 25699 }, { - "epoch": 1.000796, - "grad_norm": 0.8518761992454529, - "learning_rate": 8.509019442632308e-06, - "loss": 0.637, - "step": 367000 + "epoch": 0.023398, + "loss_gen": 3.9838974475860596, + "loss_rtd": 0.40660005807876587, + "loss_sent": 0.04155031964182854, + "loss_sod": 0.06001908332109451, + "loss_total": 0.5081694722175598, + "step": 25699 }, { - "epoch": 1.000796, - "eval_loss": 2.082726240158081, - "eval_runtime": 51.6098, - "eval_samples_per_second": 197.521, - "eval_steps_per_second": 1.55, - "step": 367000 + "epoch": 0.0234, + "grad_norm": 1.4878605604171753, + "learning_rate": 8.603327610746143e-05, + "loss": 0.6751, + "step": 25700 }, { - "epoch": 1.000996, - "grad_norm": 0.9279243350028992, - "learning_rate": 8.497097685268068e-06, - "loss": 0.6471, - "step": 367100 + "epoch": 0.023598, + "loss_gen": 2.472942590713501, + "loss_rtd": 0.38380375504493713, + "loss_sent": 0.05147818848490715, + "loss_sod": 0.1970677673816681, + "loss_total": 0.6323497295379639, + "step": 25799 }, { - "epoch": 1.001196, - "grad_norm": 0.9042778611183167, - "learning_rate": 8.485182575270905e-06, - "loss": 0.6494, - "step": 367200 + "epoch": 0.023598, + "loss_gen": 4.118562698364258, + "loss_rtd": 0.4042208194732666, + "loss_sent": 0.33144524693489075, + "loss_sod": 0.01570950075984001, + "loss_total": 0.7513755559921265, + "step": 25799 }, { - "epoch": 1.001396, - "grad_norm": 0.9116953611373901, - "learning_rate": 8.473274117440235e-06, - "loss": 0.6333, - "step": 367300 + "epoch": 0.0236, + "grad_norm": 1.1134629249572754, + "learning_rate": 8.60112687253352e-05, + "loss": 0.6792, + "step": 25800 }, { - "epoch": 1.001596, - "grad_norm": 0.9247483611106873, - "learning_rate": 8.461372316572765e-06, - "loss": 0.6432, - "step": 367400 + "epoch": 0.023798, + "loss_gen": 4.10316801071167, + "loss_rtd": 0.41145727038383484, + "loss_sent": 0.11688307672739029, + "loss_sod": 0.063237763941288, + "loss_total": 0.5915781259536743, + "step": 25899 }, { - "epoch": 1.001796, - "grad_norm": 0.8390426635742188, - "learning_rate": 8.44947717746255e-06, - "loss": 0.6492, - "step": 367500 + "epoch": 0.023798, + "loss_gen": 4.139028072357178, + "loss_rtd": 0.38884127140045166, + "loss_sent": 0.15973824262619019, + "loss_sod": 0.020577775314450264, + "loss_total": 0.5691573023796082, + "step": 25899 }, { - "epoch": 1.001996, - "grad_norm": 0.8003919720649719, - "learning_rate": 8.437588704900948e-06, - "loss": 0.6472, - "step": 367600 + "epoch": 0.0238, + "grad_norm": 1.445813536643982, + "learning_rate": 8.598924683787423e-05, + "loss": 0.6812, + "step": 25900 }, { - "epoch": 1.002196, - "grad_norm": 0.8807201981544495, - "learning_rate": 8.425706903676645e-06, - "loss": 0.6338, - "step": 367700 + "epoch": 0.023998, + "loss_gen": 3.7991557121276855, + "loss_rtd": 0.39667585492134094, + "loss_sent": 0.1311122626066208, + "loss_sod": 0.0969759151339531, + "loss_total": 0.6247640252113342, + "step": 25999 }, { - "epoch": 1.002396, - "grad_norm": 0.8409605622291565, - "learning_rate": 8.41383177857561e-06, - "loss": 0.6371, - "step": 367800 + "epoch": 0.023998, + "loss_gen": 2.765401601791382, + "loss_rtd": 0.38930559158325195, + "loss_sent": 0.04718257114291191, + "loss_sod": 0.18341773748397827, + "loss_total": 0.6199058890342712, + "step": 25999 }, { - "epoch": 1.002596, - "grad_norm": 0.8772279024124146, - "learning_rate": 8.401963334381149e-06, - "loss": 0.6305, - "step": 367900 + "epoch": 0.024, + "grad_norm": 1.1262786388397217, + "learning_rate": 8.596721045394893e-05, + "loss": 0.6788, + "step": 26000 }, { - "epoch": 1.002796, - "grad_norm": 0.921270489692688, - "learning_rate": 8.390101575873871e-06, - "loss": 0.6414, - "step": 368000 + "epoch": 0.024, + "eval_loss": 0.6740491390228271, + "eval_runtime": 151.6725, + "eval_samples_per_second": 101.818, + "eval_steps_per_second": 0.798, + "step": 26000 }, { - "epoch": 1.002796, - "eval_loss": 2.0858559608459473, - "eval_runtime": 51.7813, - "eval_samples_per_second": 196.867, - "eval_steps_per_second": 1.545, - "step": 368000 + "epoch": 0.024198, + "loss_gen": 4.190979957580566, + "loss_rtd": 0.40651756525039673, + "loss_sent": 0.06409557163715363, + "loss_sod": 0.05562091991305351, + "loss_total": 0.5262340307235718, + "step": 26099 }, { - "epoch": 0.0002, - "grad_norm": 1.7583304643630981, - "learning_rate": 8.378246507831702e-06, - "loss": 2.0821, - "step": 368100 + "epoch": 0.024198, + "loss_gen": 4.179203510284424, + "loss_rtd": 0.4081471562385559, + "loss_sent": 0.6165047883987427, + "loss_sod": 0.06474655121564865, + "loss_total": 1.0893985033035278, + "step": 26099 }, { - "epoch": 1.000162, - "grad_norm": 1.6928149461746216, - "learning_rate": 8.366398135029847e-06, - "loss": 1.9175, - "step": 368200 + "epoch": 0.0242, + "grad_norm": 1.2236292362213135, + "learning_rate": 8.594515958243557e-05, + "loss": 0.6922, + "step": 26100 }, { - "epoch": 2.000124, - "grad_norm": 1.6941829919815063, - "learning_rate": 8.354556462240829e-06, - "loss": 1.8645, - "step": 368300 + "epoch": 0.024398, + "loss_gen": 4.126471042633057, + "loss_rtd": 0.4005105197429657, + "loss_sent": 0.11121775209903717, + "loss_sod": 0.04821493476629257, + "loss_total": 0.5599431991577148, + "step": 26199 }, { - "epoch": 3.000086, - "grad_norm": 1.6606141328811646, - "learning_rate": 8.342721494234487e-06, - "loss": 1.8296, - "step": 368400 + "epoch": 0.024398, + "loss_gen": 4.1259260177612305, + "loss_rtd": 0.4046095013618469, + "loss_sent": 0.14760328829288483, + "loss_sod": 0.04813776910305023, + "loss_total": 0.600350558757782, + "step": 26199 }, { - "epoch": 4.000048, - "grad_norm": 1.508047342300415, - "learning_rate": 8.330893235777929e-06, - "loss": 1.7982, - "step": 368500 + "epoch": 0.0244, + "grad_norm": 0.9959726929664612, + "learning_rate": 8.592309423221622e-05, + "loss": 0.6985, + "step": 26200 }, { - "epoch": 5.00001, - "grad_norm": 1.6567221879959106, - "learning_rate": 8.31907169163558e-06, - "loss": 1.776, - "step": 368600 + "epoch": 0.024598, + "loss_gen": 3.951544761657715, + "loss_rtd": 0.3880527913570404, + "loss_sent": 0.23553051054477692, + "loss_sod": 0.09306551516056061, + "loss_total": 0.7166488170623779, + "step": 26299 }, { - "epoch": 5.00021, - "grad_norm": 1.5388526916503906, - "learning_rate": 8.30725686656916e-06, - "loss": 1.7492, - "step": 368700 + "epoch": 0.024598, + "loss_gen": 4.2476115226745605, + "loss_rtd": 0.38585108518600464, + "loss_sent": 0.5140498876571655, + "loss_sod": 0.07745873928070068, + "loss_total": 0.9773597121238708, + "step": 26299 }, { - "epoch": 6.000172, - "grad_norm": 1.6148278713226318, - "learning_rate": 8.295448765337685e-06, - "loss": 1.7284, - "step": 368800 + "epoch": 0.0246, + "grad_norm": 1.0858606100082397, + "learning_rate": 8.590101441217881e-05, + "loss": 0.6942, + "step": 26300 }, { - "epoch": 7.000134, - "grad_norm": 1.5249569416046143, - "learning_rate": 8.28364739269744e-06, - "loss": 1.7221, - "step": 368900 + "epoch": 0.024798, + "loss_gen": 4.143537521362305, + "loss_rtd": 0.3949359655380249, + "loss_sent": 0.28887373208999634, + "loss_sod": 0.04987058416008949, + "loss_total": 0.733680248260498, + "step": 26399 }, { - "epoch": 8.000096, - "grad_norm": 1.5550845861434937, - "learning_rate": 8.271852753402028e-06, - "loss": 1.7079, - "step": 369000 + "epoch": 0.024798, + "loss_gen": 3.4738848209381104, + "loss_rtd": 0.39791178703308105, + "loss_sent": 0.020407138392329216, + "loss_sod": 0.20267105102539062, + "loss_total": 0.620989978313446, + "step": 26399 }, { - "epoch": 8.000096, - "eval_loss": 1.9369168281555176, - "eval_runtime": 55.0617, - "eval_samples_per_second": 185.138, - "eval_steps_per_second": 1.453, - "step": 369000 + "epoch": 0.0248, + "grad_norm": 1.310874104499817, + "learning_rate": 8.58789201312171e-05, + "loss": 0.6671, + "step": 26400 }, { - "epoch": 9.000058, - "grad_norm": 2.09401273727417, - "learning_rate": 8.260064852202329e-06, - "loss": 3.9424, - "step": 369100 + "epoch": 0.024998, + "loss_gen": 4.453327655792236, + "loss_rtd": 0.38886758685112, + "loss_sent": 0.2199181467294693, + "loss_sod": 0.10373158007860184, + "loss_total": 0.7125173211097717, + "step": 26499 }, { - "epoch": 10.00002, - "grad_norm": 1.9706476926803589, - "learning_rate": 8.248283693846509e-06, - "loss": 2.7687, - "step": 369200 + "epoch": 0.024998, + "loss_gen": 3.7706122398376465, + "loss_rtd": 0.3938748836517334, + "loss_sent": 0.14808478951454163, + "loss_sod": 0.14805863797664642, + "loss_total": 0.6900182962417603, + "step": 26499 }, { - "epoch": 10.00022, - "grad_norm": 2.0509135723114014, - "learning_rate": 8.23650928308001e-06, - "loss": 2.546, - "step": 369300 + "epoch": 0.025, + "grad_norm": 0.8602674603462219, + "learning_rate": 8.585681139823064e-05, + "loss": 0.7111, + "step": 26500 }, { - "epoch": 11.000182, - "grad_norm": 1.9125868082046509, - "learning_rate": 8.224741624645565e-06, - "loss": 2.4164, - "step": 369400 + "epoch": 0.025198, + "loss_gen": 4.2173871994018555, + "loss_rtd": 0.37780728936195374, + "loss_sent": 0.2071719616651535, + "loss_sod": 0.026507128030061722, + "loss_total": 0.6114863753318787, + "step": 26599 }, { - "epoch": 12.000144, - "grad_norm": 2.175070285797119, - "learning_rate": 8.212980723283186e-06, - "loss": 2.3405, - "step": 369500 + "epoch": 0.025198, + "loss_gen": 3.8598594665527344, + "loss_rtd": 0.38630515336990356, + "loss_sent": 0.25124266743659973, + "loss_sod": 0.12236283719539642, + "loss_total": 0.7599107027053833, + "step": 26599 }, { - "epoch": 13.000106, - "grad_norm": 1.9154648780822754, - "learning_rate": 8.201226583730175e-06, - "loss": 2.2729, - "step": 369600 + "epoch": 0.0252, + "grad_norm": 0.8886291980743408, + "learning_rate": 8.583468822212484e-05, + "loss": 0.6839, + "step": 26600 }, { - "epoch": 14.000068, - "grad_norm": 2.021451711654663, - "learning_rate": 8.189479210721076e-06, - "loss": 2.2268, - "step": 369700 + "epoch": 0.025398, + "loss_gen": 3.7826666831970215, + "loss_rtd": 0.3913278579711914, + "loss_sent": 0.07715705782175064, + "loss_sod": 0.07193191349506378, + "loss_total": 0.5404168367385864, + "step": 26699 }, { - "epoch": 15.00003, - "grad_norm": 2.0009710788726807, - "learning_rate": 8.177738608987745e-06, - "loss": 2.1859, - "step": 369800 + "epoch": 0.025398, + "loss_gen": 4.422722816467285, + "loss_rtd": 0.39607706665992737, + "loss_sent": 0.2463354915380478, + "loss_sod": 0.03971979767084122, + "loss_total": 0.682132363319397, + "step": 26699 }, { - "epoch": 15.00023, - "grad_norm": 1.9311867952346802, - "learning_rate": 8.166004783259295e-06, - "loss": 2.1494, - "step": 369900 + "epoch": 0.0254, + "grad_norm": 0.9952844381332397, + "learning_rate": 8.58125506118109e-05, + "loss": 0.6941, + "step": 26700 }, { - "epoch": 16.000192, - "grad_norm": 1.967115044593811, - "learning_rate": 8.154277738262097e-06, - "loss": 2.1181, - "step": 370000 + "epoch": 0.025598, + "loss_gen": 3.191692352294922, + "loss_rtd": 0.39240193367004395, + "loss_sent": 0.09603174775838852, + "loss_sod": 0.2056460678577423, + "loss_total": 0.6940796971321106, + "step": 26799 }, { - "epoch": 16.000192, - "eval_loss": 2.407406806945801, - "eval_runtime": 54.9275, - "eval_samples_per_second": 185.59, - "eval_steps_per_second": 1.456, - "step": 370000 + "epoch": 0.025598, + "loss_gen": 4.318501949310303, + "loss_rtd": 0.3884369134902954, + "loss_sent": 0.37728720903396606, + "loss_sod": 0.04980402812361717, + "loss_total": 0.815528154373169, + "step": 26799 }, { - "epoch": 17.000154, - "grad_norm": 2.050703525543213, - "learning_rate": 8.142557478719814e-06, - "loss": 2.496, - "step": 370100 + "epoch": 0.0256, + "grad_norm": 1.3234130144119263, + "learning_rate": 8.579039857620587e-05, + "loss": 0.6802, + "step": 26800 }, { - "epoch": 18.000116, - "grad_norm": 2.053346872329712, - "learning_rate": 8.130844009353362e-06, - "loss": 2.3323, - "step": 370200 + "epoch": 0.025798, + "loss_gen": 4.028014183044434, + "loss_rtd": 0.4063258469104767, + "loss_sent": 0.4201416075229645, + "loss_sod": 0.16575351357460022, + "loss_total": 0.9922209978103638, + "step": 26899 }, { - "epoch": 19.000078, - "grad_norm": 1.9913196563720703, - "learning_rate": 8.119137334880933e-06, - "loss": 2.2625, - "step": 370300 + "epoch": 0.025798, + "loss_gen": 4.132768154144287, + "loss_rtd": 0.39652127027511597, + "loss_sent": 0.11059390008449554, + "loss_sod": 0.2088238000869751, + "loss_total": 0.7159389853477478, + "step": 26899 }, { - "epoch": 20.00004, - "grad_norm": 2.018827438354492, - "learning_rate": 8.107437460017958e-06, - "loss": 2.2166, - "step": 370400 + "epoch": 0.0258, + "grad_norm": 4.884906768798828, + "learning_rate": 8.576823212423258e-05, + "loss": 0.684, + "step": 26900 }, { - "epoch": 21.000002, - "grad_norm": 2.2157461643218994, - "learning_rate": 8.095744389477155e-06, - "loss": 2.1759, - "step": 370500 + "epoch": 0.025998, + "loss_gen": 3.7047204971313477, + "loss_rtd": 0.3975735604763031, + "loss_sent": 0.17745862901210785, + "loss_sod": 0.17362171411514282, + "loss_total": 0.7486538887023926, + "step": 26999 }, { - "epoch": 21.000202, - "grad_norm": 1.975722312927246, - "learning_rate": 8.084058127968497e-06, - "loss": 2.1349, - "step": 370600 + "epoch": 0.025998, + "loss_gen": 3.9842333793640137, + "loss_rtd": 0.3972439765930176, + "loss_sent": 0.23744548857212067, + "loss_sod": 0.06872141361236572, + "loss_total": 0.7034108638763428, + "step": 26999 }, { - "epoch": 22.000164, - "grad_norm": 2.118351459503174, - "learning_rate": 8.072378680199197e-06, - "loss": 2.1051, - "step": 370700 + "epoch": 0.026, + "grad_norm": 1.8134444952011108, + "learning_rate": 8.574605126481966e-05, + "loss": 0.6802, + "step": 27000 }, { - "epoch": 23.000126, - "grad_norm": 1.9632095098495483, - "learning_rate": 8.060706050873746e-06, - "loss": 2.0781, - "step": 370800 + "epoch": 0.026, + "eval_loss": 0.6658182144165039, + "eval_runtime": 151.5136, + "eval_samples_per_second": 101.925, + "eval_steps_per_second": 0.799, + "step": 27000 }, { - "epoch": 24.000088, - "grad_norm": 2.0141265392303467, - "learning_rate": 8.049040244693864e-06, - "loss": 2.0583, - "step": 370900 + "epoch": 0.026198, + "loss_gen": 4.2170023918151855, + "loss_rtd": 0.415958046913147, + "loss_sent": 0.21967723965644836, + "loss_sod": 0.021079879254102707, + "loss_total": 0.6567151546478271, + "step": 27099 }, { - "epoch": 25.00005, - "grad_norm": 2.0297467708587646, - "learning_rate": 8.037381266358546e-06, - "loss": 2.0323, - "step": 371000 + "epoch": 0.026198, + "loss_gen": 4.277978897094727, + "loss_rtd": 0.37973552942276, + "loss_sent": 0.14102819561958313, + "loss_sod": 0.03472696989774704, + "loss_total": 0.5554907321929932, + "step": 27099 }, { - "epoch": 25.00005, - "eval_loss": 2.3608412742614746, - "eval_runtime": 55.107, - "eval_samples_per_second": 184.986, - "eval_steps_per_second": 1.452, - "step": 371000 + "epoch": 0.0262, + "grad_norm": 0.7816162109375, + "learning_rate": 8.572385600690156e-05, + "loss": 0.6838, + "step": 27100 }, { - "epoch": 26.000012, - "grad_norm": 2.0017659664154053, - "learning_rate": 8.025729120564025e-06, - "loss": 2.2111, - "step": 371100 + "epoch": 0.026398, + "loss_gen": 4.134129524230957, + "loss_rtd": 0.3900187313556671, + "loss_sent": 0.033807508647441864, + "loss_sod": 0.06719149649143219, + "loss_total": 0.49101775884628296, + "step": 27199 }, { - "epoch": 26.000212, - "grad_norm": 2.087977409362793, - "learning_rate": 8.01408381200379e-06, - "loss": 2.1626, - "step": 371200 + "epoch": 0.026398, + "loss_gen": 3.962702751159668, + "loss_rtd": 0.3858363926410675, + "loss_sent": 0.172145813703537, + "loss_sod": 0.12715598940849304, + "loss_total": 0.6851382255554199, + "step": 27199 }, { - "epoch": 27.000174, - "grad_norm": 1.9115463495254517, - "learning_rate": 8.002445345368556e-06, - "loss": 2.1198, - "step": 371300 + "epoch": 0.0264, + "grad_norm": 0.6766712665557861, + "learning_rate": 8.570164635941853e-05, + "loss": 0.6921, + "step": 27200 }, { - "epoch": 28.000136, - "grad_norm": 2.075347423553467, - "learning_rate": 7.990813725346307e-06, - "loss": 2.0987, - "step": 371400 + "epoch": 0.026598, + "loss_gen": 3.8449392318725586, + "loss_rtd": 0.3860165774822235, + "loss_sent": 0.18103188276290894, + "loss_sod": 0.014358876273036003, + "loss_total": 0.5814073085784912, + "step": 27299 }, { - "epoch": 29.000098, - "grad_norm": 2.004270553588867, - "learning_rate": 7.979188956622263e-06, - "loss": 2.0634, - "step": 371500 + "epoch": 0.026598, + "loss_gen": 3.932614803314209, + "loss_rtd": 0.39760032296180725, + "loss_sent": 0.2425239533185959, + "loss_sod": 0.023994160816073418, + "loss_total": 0.6641184091567993, + "step": 27299 }, { - "epoch": 30.00006, - "grad_norm": 2.0730834007263184, - "learning_rate": 7.967571043878863e-06, - "loss": 2.0421, - "step": 371600 + "epoch": 0.0266, + "grad_norm": 0.8064673542976379, + "learning_rate": 8.567942233131662e-05, + "loss": 0.6864, + "step": 27300 }, { - "epoch": 31.000022, - "grad_norm": 2.0204977989196777, - "learning_rate": 7.955959991795809e-06, - "loss": 2.0191, - "step": 371700 + "epoch": 0.026798, + "loss_gen": 2.5391201972961426, + "loss_rtd": 0.3704368770122528, + "loss_sent": 0.01781933382153511, + "loss_sod": 0.17318029701709747, + "loss_total": 0.5614365339279175, + "step": 27399 }, { - "epoch": 31.000222, - "grad_norm": 1.9809165000915527, - "learning_rate": 7.944355805050032e-06, - "loss": 1.9979, - "step": 371800 + "epoch": 0.026798, + "loss_gen": 3.9527628421783447, + "loss_rtd": 0.3933136761188507, + "loss_sent": 0.05654369294643402, + "loss_sod": 0.0030567459762096405, + "loss_total": 0.45291411876678467, + "step": 27399 }, { - "epoch": 32.000184, - "grad_norm": 1.8896480798721313, - "learning_rate": 7.932758488315705e-06, - "loss": 1.9788, - "step": 371900 + "epoch": 0.0268, + "grad_norm": 1.0445514917373657, + "learning_rate": 8.565718393154767e-05, + "loss": 0.6832, + "step": 27400 }, { - "epoch": 33.000146, - "grad_norm": 1.8905068635940552, - "learning_rate": 7.921168046264213e-06, - "loss": 1.9646, - "step": 372000 + "epoch": 0.026998, + "loss_gen": 3.9675824642181396, + "loss_rtd": 0.4080756604671478, + "loss_sent": 0.3663182556629181, + "loss_sod": 0.03932018578052521, + "loss_total": 0.8137141466140747, + "step": 27499 }, { - "epoch": 33.000146, - "eval_loss": 2.3312835693359375, - "eval_runtime": 55.0336, - "eval_samples_per_second": 185.232, - "eval_steps_per_second": 1.454, - "step": 372000 + "epoch": 0.026998, + "loss_gen": 3.931001663208008, + "loss_rtd": 0.4211898744106293, + "loss_sent": 0.1906636357307434, + "loss_sod": 0.01331951841711998, + "loss_total": 0.625173032283783, + "step": 27499 }, { - "epoch": 34.000108, - "grad_norm": 2.0993173122406006, - "learning_rate": 7.909584483564187e-06, - "loss": 2.0813, - "step": 372100 + "epoch": 0.027, + "grad_norm": 2.134218454360962, + "learning_rate": 8.563493116906929e-05, + "loss": 0.6746, + "step": 27500 }, { - "epoch": 35.00007, - "grad_norm": 2.0958781242370605, - "learning_rate": 7.898007804881485e-06, - "loss": 2.0596, - "step": 372200 + "epoch": 0.027198, + "loss_gen": 3.0001442432403564, + "loss_rtd": 0.40689149498939514, + "loss_sent": 8.177244308171794e-05, + "loss_sod": 0.38727009296417236, + "loss_total": 0.794243335723877, + "step": 27599 }, { - "epoch": 36.000032, - "grad_norm": 1.9180951118469238, - "learning_rate": 7.886438014879205e-06, - "loss": 2.0353, - "step": 372300 + "epoch": 0.027198, + "loss_gen": 2.654207944869995, + "loss_rtd": 0.38280314207077026, + "loss_sent": 0.00010956126061500981, + "loss_sod": 0.34705692529678345, + "loss_total": 0.7299696207046509, + "step": 27599 }, { - "epoch": 36.000232, - "grad_norm": 2.0129170417785645, - "learning_rate": 7.874875118217639e-06, - "loss": 2.007, - "step": 372400 + "epoch": 0.0272, + "grad_norm": 1.3557177782058716, + "learning_rate": 8.561266405284489e-05, + "loss": 0.6848, + "step": 27600 }, { - "epoch": 37.000194, - "grad_norm": 1.9586989879608154, - "learning_rate": 7.863319119554325e-06, - "loss": 1.9911, - "step": 372500 + "epoch": 0.027398, + "loss_gen": 4.053226470947266, + "loss_rtd": 0.4024927318096161, + "loss_sent": 0.08170834183692932, + "loss_sod": 0.09648574143648148, + "loss_total": 0.5806868076324463, + "step": 27699 }, { - "epoch": 38.000156, - "grad_norm": 2.0036728382110596, - "learning_rate": 7.851770023544022e-06, - "loss": 1.97, - "step": 372600 + "epoch": 0.027398, + "loss_gen": 4.063277244567871, + "loss_rtd": 0.40312132239341736, + "loss_sent": 0.08955015242099762, + "loss_sod": 0.036374881863594055, + "loss_total": 0.529046356678009, + "step": 27699 }, { - "epoch": 39.000118, - "grad_norm": 2.0655548572540283, - "learning_rate": 7.840227834838709e-06, - "loss": 1.9609, - "step": 372700 + "epoch": 0.0274, + "grad_norm": 1.108843207359314, + "learning_rate": 8.559038259184369e-05, + "loss": 0.6819, + "step": 27700 }, { - "epoch": 40.00008, - "grad_norm": 1.8536264896392822, - "learning_rate": 7.828692558087566e-06, - "loss": 1.9389, - "step": 372800 + "epoch": 0.027598, + "loss_gen": 2.528923273086548, + "loss_rtd": 0.3807741105556488, + "loss_sent": 0.01023287232965231, + "loss_sod": 0.3588981032371521, + "loss_total": 0.7499050498008728, + "step": 27799 }, { - "epoch": 41.000042, - "grad_norm": 2.0123019218444824, - "learning_rate": 7.817164197937006e-06, - "loss": 1.9311, - "step": 372900 + "epoch": 0.027598, + "loss_gen": 4.049012184143066, + "loss_rtd": 0.4035383462905884, + "loss_sent": 0.2546653747558594, + "loss_sod": 0.01868937723338604, + "loss_total": 0.6768931150436401, + "step": 27799 }, { - "epoch": 42.000004, - "grad_norm": 1.9356095790863037, - "learning_rate": 7.80564275903066e-06, - "loss": 1.9157, - "step": 373000 + "epoch": 0.0276, + "grad_norm": 1.410236120223999, + "learning_rate": 8.556808679504063e-05, + "loss": 0.683, + "step": 27800 }, { - "epoch": 42.000004, - "eval_loss": 2.2908835411071777, - "eval_runtime": 54.8694, - "eval_samples_per_second": 185.787, - "eval_steps_per_second": 1.458, - "step": 373000 + "epoch": 0.027798, + "loss_gen": 4.2119293212890625, + "loss_rtd": 0.3961019515991211, + "loss_sent": 0.2591469883918762, + "loss_sod": 0.1585664451122284, + "loss_total": 0.8138154149055481, + "step": 27899 }, { - "epoch": 42.000204, - "grad_norm": 1.9983534812927246, - "learning_rate": 7.794128246009346e-06, - "loss": 1.9932, - "step": 373100 + "epoch": 0.027798, + "loss_gen": 4.044536590576172, + "loss_rtd": 0.3809899091720581, + "loss_sent": 0.08467037975788116, + "loss_sod": 0.06006523221731186, + "loss_total": 0.5257255434989929, + "step": 27899 }, { - "epoch": 43.000166, - "grad_norm": 2.0036892890930176, - "learning_rate": 7.782620663511117e-06, - "loss": 1.9803, - "step": 373200 + "epoch": 0.0278, + "grad_norm": 1.07132089138031, + "learning_rate": 8.554577667141644e-05, + "loss": 0.6724, + "step": 27900 }, { - "epoch": 44.000128, - "grad_norm": 1.9349839687347412, - "learning_rate": 7.771120016171227e-06, - "loss": 1.9687, - "step": 373300 + "epoch": 0.027998, + "loss_gen": 3.5283443927764893, + "loss_rtd": 0.3904866576194763, + "loss_sent": 0.16283035278320312, + "loss_sod": 0.005995422601699829, + "loss_total": 0.5593124628067017, + "step": 27999 }, { - "epoch": 45.00009, - "grad_norm": 1.8848403692245483, - "learning_rate": 7.759626308622142e-06, - "loss": 1.9474, - "step": 373400 + "epoch": 0.027998, + "loss_gen": 3.9386887550354004, + "loss_rtd": 0.40175795555114746, + "loss_sent": 0.12449917197227478, + "loss_sod": 0.17524287104606628, + "loss_total": 0.7015000581741333, + "step": 27999 }, { - "epoch": 46.000052, - "grad_norm": 1.9943233728408813, - "learning_rate": 7.74813954549351e-06, - "loss": 1.9319, - "step": 373500 + "epoch": 0.028, + "grad_norm": 1.200141429901123, + "learning_rate": 8.552345222995768e-05, + "loss": 0.678, + "step": 28000 }, { - "epoch": 47.000014, - "grad_norm": 1.9002938270568848, - "learning_rate": 7.736659731412204e-06, - "loss": 1.9217, - "step": 373600 + "epoch": 0.028, + "eval_loss": 0.6660157442092896, + "eval_runtime": 151.7199, + "eval_samples_per_second": 101.786, + "eval_steps_per_second": 0.798, + "step": 28000 }, { - "epoch": 47.000214, - "grad_norm": 1.9708117246627808, - "learning_rate": 7.725186871002296e-06, - "loss": 1.9083, - "step": 373700 + "epoch": 0.028198, + "loss_gen": 3.672696590423584, + "loss_rtd": 0.41329845786094666, + "loss_sent": 0.3887781500816345, + "loss_sod": 0.02329905517399311, + "loss_total": 0.8253756761550903, + "step": 28099 }, { - "epoch": 48.000176, - "grad_norm": 1.9721884727478027, - "learning_rate": 7.713720968885057e-06, - "loss": 1.8956, - "step": 373800 + "epoch": 0.028198, + "loss_gen": 3.825963258743286, + "loss_rtd": 0.39754387736320496, + "loss_sent": 0.23071692883968353, + "loss_sod": 0.0226123183965683, + "loss_total": 0.6508731245994568, + "step": 28099 }, { - "epoch": 49.000138, - "grad_norm": 1.9223700761795044, - "learning_rate": 7.702262029678939e-06, - "loss": 1.8808, - "step": 373900 + "epoch": 0.0282, + "grad_norm": 2.2831573486328125, + "learning_rate": 8.550111347965659e-05, + "loss": 0.6707, + "step": 28100 }, { - "epoch": 50.0001, - "grad_norm": 2.03428316116333, - "learning_rate": 7.690810057999607e-06, - "loss": 1.868, - "step": 374000 + "epoch": 0.028398, + "loss_gen": 3.5817105770111084, + "loss_rtd": 0.38901486992836, + "loss_sent": 0.13949772715568542, + "loss_sod": 0.17847463488578796, + "loss_total": 0.7069872617721558, + "step": 28199 }, { - "epoch": 50.0001, - "eval_loss": 2.2805299758911133, - "eval_runtime": 55.0731, - "eval_samples_per_second": 185.099, - "eval_steps_per_second": 1.453, - "step": 374000 + "epoch": 0.028398, + "loss_gen": 3.9351091384887695, + "loss_rtd": 0.3837580680847168, + "loss_sent": 0.34843626618385315, + "loss_sod": 0.036523815244436264, + "loss_total": 0.7687181234359741, + "step": 28199 }, { - "epoch": 51.000062, - "grad_norm": 1.947739601135254, - "learning_rate": 7.67936505845991e-06, - "loss": 1.9356, - "step": 374100 + "epoch": 0.0284, + "grad_norm": 1.053599238395691, + "learning_rate": 8.547876042951127e-05, + "loss": 0.6821, + "step": 28200 }, { - "epoch": 52.000024, - "grad_norm": 1.939833164215088, - "learning_rate": 7.667927035669906e-06, - "loss": 1.9287, - "step": 374200 + "epoch": 0.028598, + "loss_gen": 3.978748083114624, + "loss_rtd": 0.3998650908470154, + "loss_sent": 0.11169924587011337, + "loss_sod": 0.11000090092420578, + "loss_total": 0.6215652227401733, + "step": 28299 }, { - "epoch": 52.000224, - "grad_norm": 2.120412588119507, - "learning_rate": 7.656495994236813e-06, - "loss": 1.9083, - "step": 374300 + "epoch": 0.028598, + "loss_gen": 3.4779841899871826, + "loss_rtd": 0.39652907848358154, + "loss_sent": 0.13573679327964783, + "loss_sod": 0.15448696911334991, + "loss_total": 0.6867527961730957, + "step": 28299 }, { - "epoch": 53.000186, - "grad_norm": 1.9514408111572266, - "learning_rate": 7.645071938765055e-06, - "loss": 1.9005, - "step": 374400 + "epoch": 0.0286, + "grad_norm": 1.2055600881576538, + "learning_rate": 8.545639308852546e-05, + "loss": 0.6904, + "step": 28300 }, { - "epoch": 54.000148, - "grad_norm": 1.9537405967712402, - "learning_rate": 7.633654873856258e-06, - "loss": 1.8885, - "step": 374500 + "epoch": 0.028798, + "loss_gen": 3.9777488708496094, + "loss_rtd": 0.40452948212623596, + "loss_sent": 0.16046662628650665, + "loss_sod": 0.05235912278294563, + "loss_total": 0.617355227470398, + "step": 28399 }, { - "epoch": 55.00011, - "grad_norm": 1.9912673234939575, - "learning_rate": 7.6222448041091884e-06, - "loss": 1.8727, - "step": 374600 + "epoch": 0.028798, + "loss_gen": 3.3435187339782715, + "loss_rtd": 0.39600813388824463, + "loss_sent": 0.18167442083358765, + "loss_sod": 0.09612157940864563, + "loss_total": 0.6738041639328003, + "step": 28399 }, { - "epoch": 56.000072, - "grad_norm": 2.0160086154937744, - "learning_rate": 7.6108417341198366e-06, - "loss": 1.8652, - "step": 374700 + "epoch": 0.0288, + "grad_norm": 1.6021157503128052, + "learning_rate": 8.543401146570876e-05, + "loss": 0.7038, + "step": 28400 }, { - "epoch": 57.000034, - "grad_norm": 1.962786078453064, - "learning_rate": 7.599445668481353e-06, - "loss": 1.8495, - "step": 374800 + "epoch": 0.028998, + "loss_gen": 3.129559278488159, + "loss_rtd": 0.4089212715625763, + "loss_sent": 0.06733108311891556, + "loss_sod": 0.19300401210784912, + "loss_total": 0.669256329536438, + "step": 28499 }, { - "epoch": 57.000234, - "grad_norm": 2.0677285194396973, - "learning_rate": 7.588056611784084e-06, - "loss": 1.8414, - "step": 374900 + "epoch": 0.028998, + "loss_gen": 4.067131519317627, + "loss_rtd": 0.411083459854126, + "loss_sent": 0.19176021218299866, + "loss_sod": 0.019741695374250412, + "loss_total": 0.6225853562355042, + "step": 28499 }, { - "epoch": 58.000196, - "grad_norm": 1.923409104347229, - "learning_rate": 7.576674568615519e-06, - "loss": 1.8278, - "step": 375000 + "epoch": 0.029, + "grad_norm": 1.7359048128128052, + "learning_rate": 8.541161557007649e-05, + "loss": 0.6795, + "step": 28500 }, { - "epoch": 58.000196, - "eval_loss": 2.2644314765930176, - "eval_runtime": 54.7576, - "eval_samples_per_second": 186.166, - "eval_steps_per_second": 1.461, - "step": 375000 + "epoch": 0.029198, + "loss_gen": 4.150300025939941, + "loss_rtd": 0.38514629006385803, + "loss_sent": 0.23437869548797607, + "loss_sod": 0.031097739934921265, + "loss_total": 0.6506227254867554, + "step": 28599 }, { - "epoch": 59.000158, - "grad_norm": 2.0004312992095947, - "learning_rate": 7.565299543560353e-06, - "loss": 1.8848, - "step": 375100 + "epoch": 0.029198, + "loss_gen": 3.720979928970337, + "loss_rtd": 0.3987027108669281, + "loss_sent": 0.01098732091486454, + "loss_sod": 0.17678982019424438, + "loss_total": 0.5864798426628113, + "step": 28599 }, { - "epoch": 60.00012, - "grad_norm": 2.0457980632781982, - "learning_rate": 7.553931541200448e-06, - "loss": 1.8788, - "step": 375200 + "epoch": 0.0292, + "grad_norm": 0.6905227303504944, + "learning_rate": 8.53892054106497e-05, + "loss": 0.6865, + "step": 28600 }, { - "epoch": 61.000082, - "grad_norm": 1.9472349882125854, - "learning_rate": 7.54257056611484e-06, - "loss": 1.8666, - "step": 375300 + "epoch": 0.029398, + "loss_gen": 3.9948174953460693, + "loss_rtd": 0.4022224545478821, + "loss_sent": 0.2017492949962616, + "loss_sod": 0.0130801722407341, + "loss_total": 0.6170519590377808, + "step": 28699 }, { - "epoch": 62.000044, - "grad_norm": 2.019150733947754, - "learning_rate": 7.531216622879711e-06, - "loss": 1.8555, - "step": 375400 + "epoch": 0.029398, + "loss_gen": 4.131485939025879, + "loss_rtd": 0.3997971713542938, + "loss_sent": 0.2441902458667755, + "loss_sod": 0.010508487932384014, + "loss_total": 0.654495894908905, + "step": 28699 }, { - "epoch": 63.000006, - "grad_norm": 1.9674944877624512, - "learning_rate": 7.5198697160684365e-06, - "loss": 1.8495, - "step": 375500 + "epoch": 0.0294, + "grad_norm": 1.1098192930221558, + "learning_rate": 8.536678099645519e-05, + "loss": 0.6859, + "step": 28700 }, { - "epoch": 63.000206, - "grad_norm": 1.959089756011963, - "learning_rate": 7.5085298502515525e-06, - "loss": 1.8353, - "step": 375600 + "epoch": 0.029598, + "loss_gen": 3.984785556793213, + "loss_rtd": 0.39890533685684204, + "loss_sent": 0.1698952615261078, + "loss_sod": 0.016041960567235947, + "loss_total": 0.5848425626754761, + "step": 28799 }, { - "epoch": 64.000168, - "grad_norm": 1.9350240230560303, - "learning_rate": 7.4971970299967605e-06, - "loss": 1.8257, - "step": 375700 + "epoch": 0.029598, + "loss_gen": 3.76471209526062, + "loss_rtd": 0.4008885622024536, + "loss_sent": 0.5502520799636841, + "loss_sod": 0.10835559666156769, + "loss_total": 1.0594961643218994, + "step": 28799 }, { - "epoch": 65.00013, - "grad_norm": 1.9134896993637085, - "learning_rate": 7.4858712598689014e-06, - "loss": 1.8124, - "step": 375800 + "epoch": 0.0296, + "grad_norm": 1.9490429162979126, + "learning_rate": 8.534434233652554e-05, + "loss": 0.6912, + "step": 28800 }, { - "epoch": 66.000092, - "grad_norm": 2.0086705684661865, - "learning_rate": 7.474552544430008e-06, - "loss": 1.8052, - "step": 375900 + "epoch": 0.029798, + "loss_gen": 4.082456588745117, + "loss_rtd": 0.39842408895492554, + "loss_sent": 0.21229511499404907, + "loss_sod": 0.018248165026307106, + "loss_total": 0.6289674043655396, + "step": 28899 }, { - "epoch": 67.000054, - "grad_norm": 1.9945427179336548, - "learning_rate": 7.4632408882392504e-06, - "loss": 1.8005, - "step": 376000 + "epoch": 0.029798, + "loss_gen": 3.9854977130889893, + "loss_rtd": 0.3764669895172119, + "loss_sent": 0.1423240303993225, + "loss_sod": 0.012563586235046387, + "loss_total": 0.5313546061515808, + "step": 28899 }, { - "epoch": 67.000054, - "eval_loss": 2.248349189758301, - "eval_runtime": 54.5876, - "eval_samples_per_second": 186.746, - "eval_steps_per_second": 1.466, - "step": 376000 + "epoch": 0.0298, + "grad_norm": 0.910893440246582, + "learning_rate": 8.532188943989902e-05, + "loss": 0.6787, + "step": 28900 }, { - "epoch": 68.000016, - "grad_norm": 1.9743598699569702, - "learning_rate": 7.451936295852976e-06, - "loss": 1.8454, - "step": 376100 + "epoch": 0.029998, + "loss_gen": 4.123041152954102, + "loss_rtd": 0.3877366781234741, + "loss_sent": 0.15122351050376892, + "loss_sod": 0.06843329221010208, + "loss_total": 0.6073935031890869, + "step": 28999 }, { - "epoch": 68.000216, - "grad_norm": 1.898568034172058, - "learning_rate": 7.440638771824654e-06, - "loss": 1.8431, - "step": 376200 + "epoch": 0.029998, + "loss_gen": 3.9017460346221924, + "loss_rtd": 0.40746182203292847, + "loss_sent": 0.19289076328277588, + "loss_sod": 0.06269969046115875, + "loss_total": 0.6630522608757019, + "step": 28999 }, { - "epoch": 69.000178, - "grad_norm": 2.142463445663452, - "learning_rate": 7.429348320704935e-06, - "loss": 1.8277, - "step": 376300 + "epoch": 0.03, + "grad_norm": 1.3564268350601196, + "learning_rate": 8.529942231561965e-05, + "loss": 0.6793, + "step": 29000 }, { - "epoch": 70.00014, - "grad_norm": 1.9892468452453613, - "learning_rate": 7.41806494704162e-06, - "loss": 1.8119, - "step": 376400 + "epoch": 0.03, + "eval_loss": 0.6600340604782104, + "eval_runtime": 151.6015, + "eval_samples_per_second": 101.866, + "eval_steps_per_second": 0.798, + "step": 29000 }, { - "epoch": 71.000102, - "grad_norm": 2.005885601043701, - "learning_rate": 7.406788655379634e-06, - "loss": 1.8086, - "step": 376500 + "epoch": 0.030198, + "loss_gen": 2.629991054534912, + "loss_rtd": 0.3722720146179199, + "loss_sent": 0.02489466778934002, + "loss_sod": 0.2123338282108307, + "loss_total": 0.6095004677772522, + "step": 29099 }, { - "epoch": 72.000064, - "grad_norm": 1.9385697841644287, - "learning_rate": 7.395519450261074e-06, - "loss": 1.8024, - "step": 376600 + "epoch": 0.030198, + "loss_gen": 2.5899765491485596, + "loss_rtd": 0.37675192952156067, + "loss_sent": 0.03350969776511192, + "loss_sod": 0.09084537625312805, + "loss_total": 0.5011069774627686, + "step": 29099 }, { - "epoch": 73.000026, - "grad_norm": 1.9773157835006714, - "learning_rate": 7.384257336225173e-06, - "loss": 1.7934, - "step": 376700 + "epoch": 0.0302, + "grad_norm": 1.0573091506958008, + "learning_rate": 8.527694097273719e-05, + "loss": 0.6656, + "step": 29100 }, { - "epoch": 73.000226, - "grad_norm": 1.8618143796920776, - "learning_rate": 7.373002317808317e-06, - "loss": 1.7824, - "step": 376800 + "epoch": 0.030398, + "loss_gen": 3.877784490585327, + "loss_rtd": 0.39652708172798157, + "loss_sent": 0.12195165455341339, + "loss_sod": 0.03464795649051666, + "loss_total": 0.5531266927719116, + "step": 29199 }, { - "epoch": 74.000188, - "grad_norm": 1.9531538486480713, - "learning_rate": 7.361754399544013e-06, - "loss": 1.7727, - "step": 376900 + "epoch": 0.030398, + "loss_gen": 3.8500592708587646, + "loss_rtd": 0.393253356218338, + "loss_sent": 0.07029858976602554, + "loss_sod": 0.1901566982269287, + "loss_total": 0.6537086367607117, + "step": 29199 }, { - "epoch": 75.00015, - "grad_norm": 1.931515097618103, - "learning_rate": 7.350513585962926e-06, - "loss": 1.764, - "step": 377000 + "epoch": 0.0304, + "grad_norm": 1.2037585973739624, + "learning_rate": 8.525444542030714e-05, + "loss": 0.6772, + "step": 29200 }, { - "epoch": 75.00015, - "eval_loss": 2.2430500984191895, - "eval_runtime": 54.6415, - "eval_samples_per_second": 186.561, - "eval_steps_per_second": 1.464, - "step": 377000 + "epoch": 0.030598, + "loss_gen": 4.189395904541016, + "loss_rtd": 0.40149644017219543, + "loss_sent": 0.22139061987400055, + "loss_sod": 0.05354627966880798, + "loss_total": 0.6764333248138428, + "step": 29299 }, { - "epoch": 76.000112, - "grad_norm": 1.9521348476409912, - "learning_rate": 7.339279881592859e-06, - "loss": 1.8087, - "step": 377100 + "epoch": 0.030598, + "loss_gen": 4.037633895874023, + "loss_rtd": 0.40255945920944214, + "loss_sent": 0.3815597891807556, + "loss_sod": 0.07546190917491913, + "loss_total": 0.8595811724662781, + "step": 29299 }, { - "epoch": 77.000074, - "grad_norm": 2.0013513565063477, - "learning_rate": 7.32805329095875e-06, - "loss": 1.8023, - "step": 377200 + "epoch": 0.0306, + "grad_norm": 1.5219388008117676, + "learning_rate": 8.523193566739069e-05, + "loss": 0.685, + "step": 29300 }, { - "epoch": 78.000036, - "grad_norm": 1.8955408334732056, - "learning_rate": 7.316833818582652e-06, - "loss": 1.7943, - "step": 377300 + "epoch": 0.030798, + "loss_gen": 3.7690134048461914, + "loss_rtd": 0.3794300854206085, + "loss_sent": 0.16399529576301575, + "loss_sod": 0.032787956297397614, + "loss_total": 0.5762133598327637, + "step": 29399 }, { - "epoch": 78.000236, - "grad_norm": 2.0025761127471924, - "learning_rate": 7.305621468983781e-06, - "loss": 1.7903, - "step": 377400 + "epoch": 0.030798, + "loss_gen": 2.8526697158813477, + "loss_rtd": 0.40557971596717834, + "loss_sent": 0.0011788775445893407, + "loss_sod": 0.24723093211650848, + "loss_total": 0.6539894938468933, + "step": 29399 }, { - "epoch": 79.000198, - "grad_norm": 1.9769165515899658, - "learning_rate": 7.294416246678462e-06, - "loss": 1.7774, - "step": 377500 + "epoch": 0.0308, + "grad_norm": 1.0497325658798218, + "learning_rate": 8.520941172305477e-05, + "loss": 0.6907, + "step": 29400 }, { - "epoch": 80.00016, - "grad_norm": 1.8650860786437988, - "learning_rate": 7.283218156180174e-06, - "loss": 1.7698, - "step": 377600 + "epoch": 0.030998, + "loss_gen": 4.048665523529053, + "loss_rtd": 0.3874911963939667, + "loss_sent": 0.10516659170389175, + "loss_sod": 0.03333348408341408, + "loss_total": 0.5259912610054016, + "step": 29499 }, { - "epoch": 81.000122, - "grad_norm": 1.9133366346359253, - "learning_rate": 7.272027201999484e-06, - "loss": 1.7658, - "step": 377700 + "epoch": 0.030998, + "loss_gen": 4.126290321350098, + "loss_rtd": 0.4164048433303833, + "loss_sent": 0.17039263248443604, + "loss_sod": 0.04183642193675041, + "loss_total": 0.6286338567733765, + "step": 29499 }, { - "epoch": 82.000084, - "grad_norm": 1.9629889726638794, - "learning_rate": 7.260843388644117e-06, - "loss": 1.7552, - "step": 377800 + "epoch": 0.031, + "grad_norm": 0.7477266788482666, + "learning_rate": 8.5186873596372e-05, + "loss": 0.6852, + "step": 29500 }, { - "epoch": 83.000046, - "grad_norm": 1.9844943284988403, - "learning_rate": 7.249666720618919e-06, - "loss": 1.7539, - "step": 377900 + "epoch": 0.031198, + "loss_gen": 4.251503944396973, + "loss_rtd": 0.4096163809299469, + "loss_sent": 0.14379481971263885, + "loss_sod": 0.07312798500061035, + "loss_total": 0.6265391707420349, + "step": 29599 }, { - "epoch": 84.000008, - "grad_norm": 1.9470826387405396, - "learning_rate": 7.238497202425834e-06, - "loss": 1.7404, - "step": 378000 + "epoch": 0.031198, + "loss_gen": 4.066746711730957, + "loss_rtd": 0.41031789779663086, + "loss_sent": 0.2121007889509201, + "loss_sod": 0.01770671084523201, + "loss_total": 0.6401253938674927, + "step": 29599 }, { - "epoch": 84.000008, - "eval_loss": 2.234076499938965, - "eval_runtime": 54.5427, - "eval_samples_per_second": 186.9, - "eval_steps_per_second": 1.467, - "step": 378000 + "epoch": 0.0312, + "grad_norm": 1.068182110786438, + "learning_rate": 8.516432129642076e-05, + "loss": 0.6811, + "step": 29600 }, { - "epoch": 84.000208, - "grad_norm": 2.091539144515991, - "learning_rate": 7.2273348385639535e-06, - "loss": 1.7783, - "step": 378100 + "epoch": 0.031398, + "loss_gen": 2.7509026527404785, + "loss_rtd": 0.38142314553260803, + "loss_sent": 0.010356533341109753, + "loss_sod": 0.17461445927619934, + "loss_total": 0.5663941502571106, + "step": 29699 }, { - "epoch": 85.00017, - "grad_norm": 1.9156265258789062, - "learning_rate": 7.216179633529477e-06, - "loss": 1.7714, - "step": 378200 + "epoch": 0.031398, + "loss_gen": 4.262149810791016, + "loss_rtd": 0.38366538286209106, + "loss_sent": 0.3922903835773468, + "loss_sod": 0.12595689296722412, + "loss_total": 0.9019126892089844, + "step": 29699 }, { - "epoch": 86.000132, - "grad_norm": 2.0570554733276367, - "learning_rate": 7.205031591815723e-06, - "loss": 1.7658, - "step": 378300 + "epoch": 0.0314, + "grad_norm": 1.8687852621078491, + "learning_rate": 8.51417548322851e-05, + "loss": 0.6845, + "step": 29700 }, { - "epoch": 87.000094, - "grad_norm": 2.0413947105407715, - "learning_rate": 7.193890717913107e-06, - "loss": 1.7564, - "step": 378400 + "epoch": 0.031598, + "loss_gen": 4.012904167175293, + "loss_rtd": 0.40136775374412537, + "loss_sent": 0.15064215660095215, + "loss_sod": 0.10415194928646088, + "loss_total": 0.6561618447303772, + "step": 29799 }, { - "epoch": 88.000056, - "grad_norm": 1.91609787940979, - "learning_rate": 7.18275701630918e-06, - "loss": 1.7538, - "step": 378500 + "epoch": 0.031598, + "loss_gen": 3.656109571456909, + "loss_rtd": 0.38143277168273926, + "loss_sent": 0.10486055910587311, + "loss_sod": 0.16847120225429535, + "loss_total": 0.6547645330429077, + "step": 29799 }, { - "epoch": 89.000018, - "grad_norm": 1.8070498704910278, - "learning_rate": 7.171630491488598e-06, - "loss": 1.7439, - "step": 378600 + "epoch": 0.0316, + "grad_norm": 0.9747915863990784, + "learning_rate": 8.511917421305483e-05, + "loss": 0.6796, + "step": 29800 }, { - "epoch": 89.000218, - "grad_norm": 1.9066287279129028, - "learning_rate": 7.16051114793313e-06, - "loss": 1.7382, - "step": 378700 + "epoch": 0.031798, + "loss_gen": 4.163219451904297, + "loss_rtd": 0.38973960280418396, + "loss_sent": 0.17084045708179474, + "loss_sod": 0.040867019444704056, + "loss_total": 0.6014471054077148, + "step": 29899 }, { - "epoch": 90.00018, - "grad_norm": 1.8805670738220215, - "learning_rate": 7.149398990121628e-06, - "loss": 1.7322, - "step": 378800 + "epoch": 0.031798, + "loss_gen": 4.002505302429199, + "loss_rtd": 0.3905410170555115, + "loss_sent": 0.16887712478637695, + "loss_sod": 0.04208110272884369, + "loss_total": 0.6014992594718933, + "step": 29899 }, { - "epoch": 91.000142, - "grad_norm": 1.93112313747406, - "learning_rate": 7.138294022530081e-06, - "loss": 1.7221, - "step": 378900 + "epoch": 0.0318, + "grad_norm": 0.9827485680580139, + "learning_rate": 8.509657944782535e-05, + "loss": 0.6857, + "step": 29900 }, { - "epoch": 92.000104, - "grad_norm": 1.9273699522018433, - "learning_rate": 7.127196249631565e-06, - "loss": 1.717, - "step": 379000 + "epoch": 0.031998, + "loss_gen": 4.085021018981934, + "loss_rtd": 0.40430063009262085, + "loss_sent": 0.10190442204475403, + "loss_sod": 0.18283164501190186, + "loss_total": 0.6890367269515991, + "step": 29999 }, { - "epoch": 92.000104, - "eval_loss": 2.222762107849121, - "eval_runtime": 54.5793, - "eval_samples_per_second": 186.774, - "eval_steps_per_second": 1.466, - "step": 379000 + "epoch": 0.031998, + "loss_gen": 4.095636367797852, + "loss_rtd": 0.41734832525253296, + "loss_sent": 0.20207037031650543, + "loss_sod": 0.024344047531485558, + "loss_total": 0.6437627673149109, + "step": 29999 }, { - "epoch": 93.000066, - "grad_norm": 1.9170584678649902, - "learning_rate": 7.116105675896276e-06, - "loss": 1.7486, - "step": 379100 + "epoch": 0.032, + "grad_norm": 0.8807776570320129, + "learning_rate": 8.507397054569788e-05, + "loss": 0.6884, + "step": 30000 }, { - "epoch": 94.000028, - "grad_norm": 1.886796474456787, - "learning_rate": 7.105022305791467e-06, - "loss": 1.7455, - "step": 379200 + "epoch": 0.032, + "eval_loss": 0.6608233451843262, + "eval_runtime": 151.6967, + "eval_samples_per_second": 101.802, + "eval_steps_per_second": 0.798, + "step": 30000 }, { - "epoch": 94.000228, - "grad_norm": 1.9963804483413696, - "learning_rate": 7.0939461437815354e-06, - "loss": 1.744, - "step": 379300 + "epoch": 0.032198, + "loss_gen": 3.037975549697876, + "loss_rtd": 0.37186214327812195, + "loss_sent": 0.08015337586402893, + "loss_sod": 0.0894375741481781, + "loss_total": 0.5414531230926514, + "step": 30099 }, { - "epoch": 95.00019, - "grad_norm": 1.9092683792114258, - "learning_rate": 7.082877194327953e-06, - "loss": 1.7332, - "step": 379400 + "epoch": 0.032198, + "loss_gen": 4.237514972686768, + "loss_rtd": 0.4175643026828766, + "loss_sent": 0.15542326867580414, + "loss_sod": 0.011203978210687637, + "loss_total": 0.5841915607452393, + "step": 30099 }, { - "epoch": 96.000152, - "grad_norm": 1.9792388677597046, - "learning_rate": 7.071815461889303e-06, - "loss": 1.728, - "step": 379500 + "epoch": 0.0322, + "grad_norm": 0.9229232668876648, + "learning_rate": 8.505134751577925e-05, + "loss": 0.6687, + "step": 30100 }, { - "epoch": 97.000114, - "grad_norm": 1.9630019664764404, - "learning_rate": 7.060760950921233e-06, - "loss": 1.7224, - "step": 379600 + "epoch": 0.032398, + "loss_gen": 2.4504544734954834, + "loss_rtd": 0.39251017570495605, + "loss_sent": 0.00014123741129878908, + "loss_sod": 0.3881913423538208, + "loss_total": 0.7808427810668945, + "step": 30199 }, { - "epoch": 98.000076, - "grad_norm": 1.9032080173492432, - "learning_rate": 7.049713665876509e-06, - "loss": 1.7176, - "step": 379700 + "epoch": 0.032398, + "loss_gen": 2.186908483505249, + "loss_rtd": 0.3671330511569977, + "loss_sent": 0.05180347338318825, + "loss_sod": 0.1732897162437439, + "loss_total": 0.5922262668609619, + "step": 30199 }, { - "epoch": 99.000038, - "grad_norm": 1.9760445356369019, - "learning_rate": 7.038673611204971e-06, - "loss": 1.7142, - "step": 379800 + "epoch": 0.0324, + "grad_norm": 1.0616891384124756, + "learning_rate": 8.502871036718206e-05, + "loss": 0.6948, + "step": 30200 }, { - "epoch": 99.000238, - "grad_norm": 2.5537993907928467, - "learning_rate": 7.027640791353562e-06, - "loss": 1.7043, - "step": 379900 + "epoch": 0.032598, + "loss_gen": 4.021667957305908, + "loss_rtd": 0.39637523889541626, + "loss_sent": 0.1306372582912445, + "loss_sod": 0.004748234525322914, + "loss_total": 0.5317606925964355, + "step": 30299 }, { - "epoch": 100.0002, - "grad_norm": 1.9134443998336792, - "learning_rate": 7.016615210766287e-06, - "loss": 1.6935, - "step": 380000 + "epoch": 0.032598, + "loss_gen": 4.0861287117004395, + "loss_rtd": 0.40060102939605713, + "loss_sent": 0.17788061499595642, + "loss_sod": 0.08653467148542404, + "loss_total": 0.6650162935256958, + "step": 30299 }, { - "epoch": 100.0002, - "eval_loss": 2.2129366397857666, - "eval_runtime": 54.6255, - "eval_samples_per_second": 186.616, - "eval_steps_per_second": 1.465, - "step": 380000 + "epoch": 0.0326, + "grad_norm": 1.5607576370239258, + "learning_rate": 8.500605910902451e-05, + "loss": 0.6983, + "step": 30300 }, { - "epoch": 101.000162, - "grad_norm": 1.8621317148208618, - "learning_rate": 7.005596873884254e-06, - "loss": 1.7287, - "step": 380100 + "epoch": 0.032798, + "loss_gen": 4.1850266456604, + "loss_rtd": 0.39154505729675293, + "loss_sent": 0.2575682997703552, + "loss_sod": 0.06015142798423767, + "loss_total": 0.7092647552490234, + "step": 30399 }, { - "epoch": 102.000124, - "grad_norm": 2.0007071495056152, - "learning_rate": 6.994585785145647e-06, - "loss": 1.7216, - "step": 380200 + "epoch": 0.032798, + "loss_gen": 3.922189474105835, + "loss_rtd": 0.3912644386291504, + "loss_sent": 0.12429532408714294, + "loss_sod": 0.024736901745200157, + "loss_total": 0.5402966737747192, + "step": 30399 }, { - "epoch": 103.000086, - "grad_norm": 1.981418490409851, - "learning_rate": 6.98358194898574e-06, - "loss": 1.7192, - "step": 380300 + "epoch": 0.0328, + "grad_norm": 1.0241069793701172, + "learning_rate": 8.498339375043054e-05, + "loss": 0.6603, + "step": 30400 }, { - "epoch": 104.000048, - "grad_norm": 1.7912635803222656, - "learning_rate": 6.972585369836865e-06, - "loss": 1.7046, - "step": 380400 + "epoch": 0.032998, + "loss_gen": 3.650404691696167, + "loss_rtd": 0.4076765775680542, + "loss_sent": 0.4603433310985565, + "loss_sod": 0.19098269939422607, + "loss_total": 1.0590026378631592, + "step": 30499 }, { - "epoch": 105.00001, - "grad_norm": 1.9558844566345215, - "learning_rate": 6.961596052128444e-06, - "loss": 1.708, - "step": 380500 + "epoch": 0.032998, + "loss_gen": 2.541520118713379, + "loss_rtd": 0.3753470480442047, + "loss_sent": 8.163358870660886e-05, + "loss_sod": 0.20873983204364777, + "loss_total": 0.584168553352356, + "step": 30499 }, { - "epoch": 105.00021, - "grad_norm": 1.9592783451080322, - "learning_rate": 6.9506140002869756e-06, - "loss": 1.699, - "step": 380600 + "epoch": 0.033, + "grad_norm": 1.3215209245681763, + "learning_rate": 8.496071430052975e-05, + "loss": 0.6851, + "step": 30500 }, { - "epoch": 106.000172, - "grad_norm": 1.9580655097961426, - "learning_rate": 6.939639218736041e-06, - "loss": 1.6912, - "step": 380700 + "epoch": 0.033198, + "loss_gen": 4.049553871154785, + "loss_rtd": 0.3839671015739441, + "loss_sent": 0.213576540350914, + "loss_sod": 0.08501975983381271, + "loss_total": 0.6825634241104126, + "step": 30599 }, { - "epoch": 107.000134, - "grad_norm": 1.9187573194503784, - "learning_rate": 6.928671711896259e-06, - "loss": 1.6864, - "step": 380800 + "epoch": 0.033198, + "loss_gen": 4.2370758056640625, + "loss_rtd": 0.39378654956817627, + "loss_sent": 0.1530376374721527, + "loss_sod": 0.15263622999191284, + "loss_total": 0.6994603872299194, + "step": 30599 }, { - "epoch": 108.000096, - "grad_norm": 2.0804340839385986, - "learning_rate": 6.917711484185349e-06, - "loss": 1.6843, - "step": 380900 + "epoch": 0.0332, + "grad_norm": 1.6667479276657104, + "learning_rate": 8.493802076845741e-05, + "loss": 0.6995, + "step": 30600 }, { - "epoch": 109.000058, - "grad_norm": 1.9156286716461182, - "learning_rate": 6.906758540018099e-06, - "loss": 1.6788, - "step": 381000 + "epoch": 0.033398, + "loss_gen": 4.333565711975098, + "loss_rtd": 0.3889717161655426, + "loss_sent": 0.15002062916755676, + "loss_sod": 0.00713471882045269, + "loss_total": 0.5461270809173584, + "step": 30699 }, { - "epoch": 109.000058, - "eval_loss": 2.2096140384674072, - "eval_runtime": 54.6776, - "eval_samples_per_second": 186.438, - "eval_steps_per_second": 1.463, - "step": 381000 + "epoch": 0.033398, + "loss_gen": 3.984656572341919, + "loss_rtd": 0.40549373626708984, + "loss_sent": 0.3065290153026581, + "loss_sod": 0.015845568850636482, + "loss_total": 0.7278683185577393, + "step": 30699 }, { - "epoch": 110.00002, - "grad_norm": 1.8327763080596924, - "learning_rate": 6.895812883806341e-06, - "loss": 1.703, - "step": 381100 + "epoch": 0.0334, + "grad_norm": 1.2295725345611572, + "learning_rate": 8.491531316335451e-05, + "loss": 0.678, + "step": 30700 }, { - "epoch": 110.00022, - "grad_norm": 1.9155895709991455, - "learning_rate": 6.884874519958984e-06, - "loss": 1.6962, - "step": 381200 + "epoch": 0.033598, + "loss_gen": 2.4938693046569824, + "loss_rtd": 0.3633447587490082, + "loss_sent": 0.03735076263546944, + "loss_sod": 0.3045555651187897, + "loss_total": 0.7052510976791382, + "step": 30799 }, { - "epoch": 111.000182, - "grad_norm": 1.8222503662109375, - "learning_rate": 6.873943452882006e-06, - "loss": 1.6917, - "step": 381300 + "epoch": 0.033598, + "loss_gen": 3.8879973888397217, + "loss_rtd": 0.39840835332870483, + "loss_sent": 0.1616201400756836, + "loss_sod": 0.05654360353946686, + "loss_total": 0.6165720820426941, + "step": 30799 }, { - "epoch": 112.000144, - "grad_norm": 1.8987947702407837, - "learning_rate": 6.863019686978445e-06, - "loss": 1.6892, - "step": 381400 + "epoch": 0.0336, + "grad_norm": 1.2731208801269531, + "learning_rate": 8.489259149436762e-05, + "loss": 0.6793, + "step": 30800 }, { - "epoch": 113.000106, - "grad_norm": 1.8653353452682495, - "learning_rate": 6.85210322664838e-06, - "loss": 1.6867, - "step": 381500 + "epoch": 0.033798, + "loss_gen": 4.189641952514648, + "loss_rtd": 0.40135085582733154, + "loss_sent": 0.3362424075603485, + "loss_sod": 0.07014751434326172, + "loss_total": 0.8077408075332642, + "step": 30899 }, { - "epoch": 114.000068, - "grad_norm": 1.8713948726654053, - "learning_rate": 6.841194076288962e-06, - "loss": 1.6777, - "step": 381600 + "epoch": 0.033798, + "loss_gen": 4.071540832519531, + "loss_rtd": 0.3889402449131012, + "loss_sent": 0.2542674243450165, + "loss_sod": 0.02074206806719303, + "loss_total": 0.663949728012085, + "step": 30899 }, { - "epoch": 115.00003, - "grad_norm": 1.9354687929153442, - "learning_rate": 6.830292240294398e-06, - "loss": 1.6756, - "step": 381700 + "epoch": 0.0338, + "grad_norm": 1.0032696723937988, + "learning_rate": 8.486985577064905e-05, + "loss": 0.6825, + "step": 30900 }, { - "epoch": 115.00023, - "grad_norm": 1.8539812564849854, - "learning_rate": 6.8193977230559565e-06, - "loss": 1.669, - "step": 381800 + "epoch": 0.033998, + "loss_gen": 4.200192928314209, + "loss_rtd": 0.3943803608417511, + "loss_sent": 0.20004616677761078, + "loss_sod": 0.10118408501148224, + "loss_total": 0.6956106424331665, + "step": 30999 }, { - "epoch": 116.000192, - "grad_norm": 1.913901448249817, - "learning_rate": 6.808510528961928e-06, - "loss": 1.6632, - "step": 381900 + "epoch": 0.033998, + "loss_gen": 4.053731441497803, + "loss_rtd": 0.3781401216983795, + "loss_sent": 0.2314639687538147, + "loss_sod": 0.0035957596264779568, + "loss_total": 0.6131998300552368, + "step": 30999 }, { - "epoch": 117.000154, - "grad_norm": 1.8366894721984863, - "learning_rate": 6.797630662397683e-06, - "loss": 1.6619, - "step": 382000 + "epoch": 0.034, + "grad_norm": 0.9742544889450073, + "learning_rate": 8.484710600135675e-05, + "loss": 0.6834, + "step": 31000 }, { - "epoch": 117.000154, - "eval_loss": 2.1981077194213867, - "eval_runtime": 54.646, - "eval_samples_per_second": 186.546, - "eval_steps_per_second": 1.464, - "step": 382000 + "epoch": 0.034, + "eval_loss": 0.6606374979019165, + "eval_runtime": 151.7109, + "eval_samples_per_second": 101.792, + "eval_steps_per_second": 0.798, + "step": 31000 } ], "logging_steps": 100, @@ -29830,8 +8032,8 @@ "attributes": {} } }, - "total_flos": 3.333426940465899e+19, - "train_batch_size": 128, + "total_flos": 2.165372989734912e+18, + "train_batch_size": 64, "trial_name": null, "trial_params": null }