{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 26790, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011198208286674132, "grad_norm": 4.2997212409973145, "learning_rate": 4.9983202687569994e-05, "loss": 3.1991, "step": 10 }, { "epoch": 0.0022396416573348264, "grad_norm": 3.1073286533355713, "learning_rate": 4.9964539007092206e-05, "loss": 2.9692, "step": 20 }, { "epoch": 0.0033594624860022394, "grad_norm": 1.520981788635254, "learning_rate": 4.994587532661441e-05, "loss": 3.2009, "step": 30 }, { "epoch": 0.004479283314669653, "grad_norm": 4.404178142547607, "learning_rate": 4.9927211646136616e-05, "loss": 2.8, "step": 40 }, { "epoch": 0.005599104143337066, "grad_norm": 6.26295280456543, "learning_rate": 4.990854796565883e-05, "loss": 3.2157, "step": 50 }, { "epoch": 0.006718924972004479, "grad_norm": 2.1947414875030518, "learning_rate": 4.988988428518104e-05, "loss": 3.358, "step": 60 }, { "epoch": 0.007838745800671893, "grad_norm": 2.184293746948242, "learning_rate": 4.987122060470325e-05, "loss": 3.1155, "step": 70 }, { "epoch": 0.008958566629339306, "grad_norm": 2.162921905517578, "learning_rate": 4.9852556924225456e-05, "loss": 2.113, "step": 80 }, { "epoch": 0.010078387458006719, "grad_norm": 6.238914966583252, "learning_rate": 4.983389324374767e-05, "loss": 2.9041, "step": 90 }, { "epoch": 0.011198208286674132, "grad_norm": 2.3711066246032715, "learning_rate": 4.981522956326988e-05, "loss": 2.6368, "step": 100 }, { "epoch": 0.012318029115341545, "grad_norm": 6.883894443511963, "learning_rate": 4.979656588279209e-05, "loss": 3.1278, "step": 110 }, { "epoch": 0.013437849944008958, "grad_norm": 2.7835214138031006, "learning_rate": 4.9777902202314296e-05, "loss": 3.2703, "step": 120 }, { "epoch": 0.014557670772676373, "grad_norm": 2.707707643508911, "learning_rate": 4.975923852183651e-05, "loss": 3.2012, "step": 130 }, { "epoch": 0.015677491601343786, "grad_norm": 7.556955337524414, "learning_rate": 4.974057484135872e-05, "loss": 3.0857, "step": 140 }, { "epoch": 0.0167973124300112, "grad_norm": 9.100250244140625, "learning_rate": 4.972191116088093e-05, "loss": 2.8656, "step": 150 }, { "epoch": 0.01791713325867861, "grad_norm": 6.301916122436523, "learning_rate": 4.9703247480403136e-05, "loss": 2.6866, "step": 160 }, { "epoch": 0.019036954087346025, "grad_norm": 2.9455721378326416, "learning_rate": 4.968458379992535e-05, "loss": 2.4591, "step": 170 }, { "epoch": 0.020156774916013438, "grad_norm": 2.6669416427612305, "learning_rate": 4.966592011944756e-05, "loss": 2.73, "step": 180 }, { "epoch": 0.02127659574468085, "grad_norm": 7.3631086349487305, "learning_rate": 4.964725643896977e-05, "loss": 2.5342, "step": 190 }, { "epoch": 0.022396416573348264, "grad_norm": 2.861095905303955, "learning_rate": 4.9628592758491976e-05, "loss": 3.152, "step": 200 }, { "epoch": 0.023516237402015677, "grad_norm": 6.908902645111084, "learning_rate": 4.960992907801419e-05, "loss": 2.4202, "step": 210 }, { "epoch": 0.02463605823068309, "grad_norm": 3.1885159015655518, "learning_rate": 4.95912653975364e-05, "loss": 2.6223, "step": 220 }, { "epoch": 0.025755879059350503, "grad_norm": 3.509582042694092, "learning_rate": 4.957260171705861e-05, "loss": 2.2328, "step": 230 }, { "epoch": 0.026875699888017916, "grad_norm": 7.881849765777588, "learning_rate": 4.9553938036580816e-05, "loss": 3.1846, "step": 240 }, { "epoch": 0.027995520716685332, "grad_norm": 6.826298236846924, "learning_rate": 4.953527435610303e-05, "loss": 2.5674, "step": 250 }, { "epoch": 0.029115341545352745, "grad_norm": 2.763533353805542, "learning_rate": 4.951661067562523e-05, "loss": 2.9195, "step": 260 }, { "epoch": 0.030235162374020158, "grad_norm": 2.4903926849365234, "learning_rate": 4.9497946995147444e-05, "loss": 2.8041, "step": 270 }, { "epoch": 0.03135498320268757, "grad_norm": 5.462828636169434, "learning_rate": 4.9479283314669656e-05, "loss": 2.7518, "step": 280 }, { "epoch": 0.032474804031354984, "grad_norm": 2.7415122985839844, "learning_rate": 4.946061963419186e-05, "loss": 2.2559, "step": 290 }, { "epoch": 0.0335946248600224, "grad_norm": 3.042797803878784, "learning_rate": 4.944195595371407e-05, "loss": 2.3064, "step": 300 }, { "epoch": 0.03471444568868981, "grad_norm": 7.439578533172607, "learning_rate": 4.9423292273236284e-05, "loss": 2.2869, "step": 310 }, { "epoch": 0.03583426651735722, "grad_norm": 7.290367603302002, "learning_rate": 4.9404628592758496e-05, "loss": 2.4284, "step": 320 }, { "epoch": 0.036954087346024636, "grad_norm": 3.0886971950531006, "learning_rate": 4.93859649122807e-05, "loss": 2.8264, "step": 330 }, { "epoch": 0.03807390817469205, "grad_norm": 2.817957878112793, "learning_rate": 4.936730123180291e-05, "loss": 2.631, "step": 340 }, { "epoch": 0.03919372900335946, "grad_norm": 2.4056355953216553, "learning_rate": 4.9348637551325124e-05, "loss": 2.538, "step": 350 }, { "epoch": 0.040313549832026875, "grad_norm": 5.376706600189209, "learning_rate": 4.9329973870847336e-05, "loss": 2.8889, "step": 360 }, { "epoch": 0.04143337066069429, "grad_norm": 2.869654893875122, "learning_rate": 4.931131019036954e-05, "loss": 2.4983, "step": 370 }, { "epoch": 0.0425531914893617, "grad_norm": 4.797430515289307, "learning_rate": 4.929264650989175e-05, "loss": 3.0132, "step": 380 }, { "epoch": 0.043673012318029114, "grad_norm": 7.970874786376953, "learning_rate": 4.9273982829413964e-05, "loss": 2.8745, "step": 390 }, { "epoch": 0.04479283314669653, "grad_norm": 5.253184795379639, "learning_rate": 4.9255319148936176e-05, "loss": 2.6653, "step": 400 }, { "epoch": 0.04591265397536394, "grad_norm": 3.0611302852630615, "learning_rate": 4.923665546845838e-05, "loss": 2.5853, "step": 410 }, { "epoch": 0.04703247480403135, "grad_norm": 5.2951884269714355, "learning_rate": 4.921799178798059e-05, "loss": 2.5193, "step": 420 }, { "epoch": 0.048152295632698766, "grad_norm": 2.9463164806365967, "learning_rate": 4.9199328107502804e-05, "loss": 2.5797, "step": 430 }, { "epoch": 0.04927211646136618, "grad_norm": 3.1109468936920166, "learning_rate": 4.9180664427025016e-05, "loss": 2.7147, "step": 440 }, { "epoch": 0.05039193729003359, "grad_norm": 4.468992710113525, "learning_rate": 4.916200074654722e-05, "loss": 2.5695, "step": 450 }, { "epoch": 0.051511758118701005, "grad_norm": 8.419249534606934, "learning_rate": 4.914333706606943e-05, "loss": 2.7006, "step": 460 }, { "epoch": 0.05263157894736842, "grad_norm": 6.9784722328186035, "learning_rate": 4.912467338559164e-05, "loss": 2.6604, "step": 470 }, { "epoch": 0.05375139977603583, "grad_norm": 3.6676979064941406, "learning_rate": 4.9106009705113856e-05, "loss": 2.5849, "step": 480 }, { "epoch": 0.054871220604703244, "grad_norm": 2.428481101989746, "learning_rate": 4.908734602463606e-05, "loss": 2.7081, "step": 490 }, { "epoch": 0.055991041433370664, "grad_norm": 4.069552898406982, "learning_rate": 4.9068682344158266e-05, "loss": 2.7358, "step": 500 }, { "epoch": 0.05711086226203808, "grad_norm": 4.768444538116455, "learning_rate": 4.905001866368048e-05, "loss": 2.3977, "step": 510 }, { "epoch": 0.05823068309070549, "grad_norm": 7.9206342697143555, "learning_rate": 4.903135498320269e-05, "loss": 2.615, "step": 520 }, { "epoch": 0.0593505039193729, "grad_norm": 4.9245476722717285, "learning_rate": 4.90126913027249e-05, "loss": 2.1996, "step": 530 }, { "epoch": 0.060470324748040316, "grad_norm": 3.498934745788574, "learning_rate": 4.8994027622247106e-05, "loss": 2.4096, "step": 540 }, { "epoch": 0.06159014557670773, "grad_norm": 2.8137447834014893, "learning_rate": 4.897536394176932e-05, "loss": 2.3293, "step": 550 }, { "epoch": 0.06270996640537514, "grad_norm": 2.2534000873565674, "learning_rate": 4.895670026129153e-05, "loss": 2.5449, "step": 560 }, { "epoch": 0.06382978723404255, "grad_norm": 5.295638561248779, "learning_rate": 4.893803658081374e-05, "loss": 2.4056, "step": 570 }, { "epoch": 0.06494960806270997, "grad_norm": 2.56434965133667, "learning_rate": 4.8919372900335946e-05, "loss": 2.4592, "step": 580 }, { "epoch": 0.06606942889137737, "grad_norm": 2.1598501205444336, "learning_rate": 4.890070921985816e-05, "loss": 2.3764, "step": 590 }, { "epoch": 0.0671892497200448, "grad_norm": 2.71020245552063, "learning_rate": 4.888204553938037e-05, "loss": 2.8345, "step": 600 }, { "epoch": 0.0683090705487122, "grad_norm": 6.304147243499756, "learning_rate": 4.886338185890258e-05, "loss": 2.6056, "step": 610 }, { "epoch": 0.06942889137737962, "grad_norm": 6.749722003936768, "learning_rate": 4.8844718178424786e-05, "loss": 2.1934, "step": 620 }, { "epoch": 0.07054871220604703, "grad_norm": 7.15731954574585, "learning_rate": 4.8826054497947e-05, "loss": 2.3531, "step": 630 }, { "epoch": 0.07166853303471445, "grad_norm": 2.1473021507263184, "learning_rate": 4.880739081746921e-05, "loss": 2.6166, "step": 640 }, { "epoch": 0.07278835386338185, "grad_norm": 2.7124907970428467, "learning_rate": 4.878872713699142e-05, "loss": 2.3902, "step": 650 }, { "epoch": 0.07390817469204927, "grad_norm": 11.411026954650879, "learning_rate": 4.8770063456513626e-05, "loss": 2.4434, "step": 660 }, { "epoch": 0.07502799552071669, "grad_norm": 2.7030553817749023, "learning_rate": 4.875139977603584e-05, "loss": 2.3203, "step": 670 }, { "epoch": 0.0761478163493841, "grad_norm": 8.861196517944336, "learning_rate": 4.873273609555804e-05, "loss": 2.5645, "step": 680 }, { "epoch": 0.07726763717805152, "grad_norm": 4.628374099731445, "learning_rate": 4.871407241508026e-05, "loss": 2.8101, "step": 690 }, { "epoch": 0.07838745800671892, "grad_norm": 9.408851623535156, "learning_rate": 4.8695408734602466e-05, "loss": 2.2799, "step": 700 }, { "epoch": 0.07950727883538634, "grad_norm": 9.125829696655273, "learning_rate": 4.867674505412468e-05, "loss": 2.6383, "step": 710 }, { "epoch": 0.08062709966405375, "grad_norm": 3.9296653270721436, "learning_rate": 4.865808137364688e-05, "loss": 2.0646, "step": 720 }, { "epoch": 0.08174692049272117, "grad_norm": 5.768338203430176, "learning_rate": 4.8639417693169094e-05, "loss": 2.3089, "step": 730 }, { "epoch": 0.08286674132138858, "grad_norm": 2.457167148590088, "learning_rate": 4.8620754012691306e-05, "loss": 2.2855, "step": 740 }, { "epoch": 0.083986562150056, "grad_norm": 8.373284339904785, "learning_rate": 4.860209033221351e-05, "loss": 2.2884, "step": 750 }, { "epoch": 0.0851063829787234, "grad_norm": 2.7077553272247314, "learning_rate": 4.858342665173572e-05, "loss": 2.5055, "step": 760 }, { "epoch": 0.08622620380739082, "grad_norm": 7.299142360687256, "learning_rate": 4.8564762971257934e-05, "loss": 2.2387, "step": 770 }, { "epoch": 0.08734602463605823, "grad_norm": 2.6765339374542236, "learning_rate": 4.8546099290780146e-05, "loss": 2.6105, "step": 780 }, { "epoch": 0.08846584546472565, "grad_norm": 3.2736477851867676, "learning_rate": 4.852743561030235e-05, "loss": 2.3101, "step": 790 }, { "epoch": 0.08958566629339305, "grad_norm": 8.751072883605957, "learning_rate": 4.850877192982456e-05, "loss": 2.7699, "step": 800 }, { "epoch": 0.09070548712206047, "grad_norm": 2.8005926609039307, "learning_rate": 4.8490108249346774e-05, "loss": 2.5564, "step": 810 }, { "epoch": 0.09182530795072788, "grad_norm": 5.777060031890869, "learning_rate": 4.8471444568868986e-05, "loss": 2.5826, "step": 820 }, { "epoch": 0.0929451287793953, "grad_norm": 5.9840803146362305, "learning_rate": 4.845278088839119e-05, "loss": 2.7461, "step": 830 }, { "epoch": 0.0940649496080627, "grad_norm": 5.613245010375977, "learning_rate": 4.84341172079134e-05, "loss": 2.2355, "step": 840 }, { "epoch": 0.09518477043673013, "grad_norm": 2.8910045623779297, "learning_rate": 4.8415453527435614e-05, "loss": 2.5613, "step": 850 }, { "epoch": 0.09630459126539753, "grad_norm": 2.2605295181274414, "learning_rate": 4.8396789846957826e-05, "loss": 2.3108, "step": 860 }, { "epoch": 0.09742441209406495, "grad_norm": 2.1678943634033203, "learning_rate": 4.837812616648003e-05, "loss": 2.5849, "step": 870 }, { "epoch": 0.09854423292273236, "grad_norm": 3.4123549461364746, "learning_rate": 4.835946248600224e-05, "loss": 2.5897, "step": 880 }, { "epoch": 0.09966405375139978, "grad_norm": 2.3803961277008057, "learning_rate": 4.834079880552445e-05, "loss": 2.3515, "step": 890 }, { "epoch": 0.10078387458006718, "grad_norm": 2.3979332447052, "learning_rate": 4.8322135125046666e-05, "loss": 2.5587, "step": 900 }, { "epoch": 0.1019036954087346, "grad_norm": 4.127903461456299, "learning_rate": 4.830347144456887e-05, "loss": 1.8338, "step": 910 }, { "epoch": 0.10302351623740201, "grad_norm": 3.460048198699951, "learning_rate": 4.828480776409108e-05, "loss": 3.0448, "step": 920 }, { "epoch": 0.10414333706606943, "grad_norm": 2.7010154724121094, "learning_rate": 4.826614408361329e-05, "loss": 2.5111, "step": 930 }, { "epoch": 0.10526315789473684, "grad_norm": 2.2253668308258057, "learning_rate": 4.8247480403135506e-05, "loss": 2.1311, "step": 940 }, { "epoch": 0.10638297872340426, "grad_norm": 2.3751561641693115, "learning_rate": 4.822881672265771e-05, "loss": 2.3021, "step": 950 }, { "epoch": 0.10750279955207166, "grad_norm": 4.968678951263428, "learning_rate": 4.8210153042179916e-05, "loss": 2.7204, "step": 960 }, { "epoch": 0.10862262038073908, "grad_norm": 2.429736375808716, "learning_rate": 4.819148936170213e-05, "loss": 2.4528, "step": 970 }, { "epoch": 0.10974244120940649, "grad_norm": 8.118196487426758, "learning_rate": 4.817282568122434e-05, "loss": 2.8347, "step": 980 }, { "epoch": 0.11086226203807391, "grad_norm": 9.025050163269043, "learning_rate": 4.815416200074655e-05, "loss": 2.2329, "step": 990 }, { "epoch": 0.11198208286674133, "grad_norm": 2.610757827758789, "learning_rate": 4.8135498320268756e-05, "loss": 2.0086, "step": 1000 }, { "epoch": 0.11310190369540873, "grad_norm": 3.6983273029327393, "learning_rate": 4.811683463979097e-05, "loss": 2.6589, "step": 1010 }, { "epoch": 0.11422172452407615, "grad_norm": 6.618379592895508, "learning_rate": 4.809817095931318e-05, "loss": 2.2701, "step": 1020 }, { "epoch": 0.11534154535274356, "grad_norm": 2.155717372894287, "learning_rate": 4.807950727883539e-05, "loss": 2.3469, "step": 1030 }, { "epoch": 0.11646136618141098, "grad_norm": 2.49660325050354, "learning_rate": 4.8060843598357596e-05, "loss": 2.1965, "step": 1040 }, { "epoch": 0.11758118701007839, "grad_norm": 9.351076126098633, "learning_rate": 4.804217991787981e-05, "loss": 3.0117, "step": 1050 }, { "epoch": 0.1187010078387458, "grad_norm": 4.395270824432373, "learning_rate": 4.802351623740202e-05, "loss": 2.091, "step": 1060 }, { "epoch": 0.11982082866741321, "grad_norm": 2.891835927963257, "learning_rate": 4.800485255692423e-05, "loss": 2.4642, "step": 1070 }, { "epoch": 0.12094064949608063, "grad_norm": 3.1243512630462646, "learning_rate": 4.7986188876446436e-05, "loss": 2.3218, "step": 1080 }, { "epoch": 0.12206047032474804, "grad_norm": 4.109086513519287, "learning_rate": 4.796752519596865e-05, "loss": 2.2233, "step": 1090 }, { "epoch": 0.12318029115341546, "grad_norm": 8.871736526489258, "learning_rate": 4.794886151549086e-05, "loss": 2.6833, "step": 1100 }, { "epoch": 0.12430011198208286, "grad_norm": 2.5556600093841553, "learning_rate": 4.793019783501307e-05, "loss": 2.3963, "step": 1110 }, { "epoch": 0.12541993281075028, "grad_norm": 2.431551694869995, "learning_rate": 4.7911534154535276e-05, "loss": 2.2135, "step": 1120 }, { "epoch": 0.1265397536394177, "grad_norm": 6.504064559936523, "learning_rate": 4.789287047405749e-05, "loss": 2.0713, "step": 1130 }, { "epoch": 0.1276595744680851, "grad_norm": 8.992396354675293, "learning_rate": 4.787420679357969e-05, "loss": 2.3186, "step": 1140 }, { "epoch": 0.12877939529675253, "grad_norm": 7.831729888916016, "learning_rate": 4.785554311310191e-05, "loss": 2.5546, "step": 1150 }, { "epoch": 0.12989921612541994, "grad_norm": 2.7570407390594482, "learning_rate": 4.7836879432624116e-05, "loss": 2.5008, "step": 1160 }, { "epoch": 0.13101903695408734, "grad_norm": 10.529077529907227, "learning_rate": 4.781821575214633e-05, "loss": 2.3034, "step": 1170 }, { "epoch": 0.13213885778275475, "grad_norm": 7.510254383087158, "learning_rate": 4.779955207166853e-05, "loss": 2.5347, "step": 1180 }, { "epoch": 0.13325867861142218, "grad_norm": 6.021450519561768, "learning_rate": 4.7780888391190744e-05, "loss": 2.3394, "step": 1190 }, { "epoch": 0.1343784994400896, "grad_norm": 2.8167929649353027, "learning_rate": 4.7762224710712956e-05, "loss": 2.4706, "step": 1200 }, { "epoch": 0.135498320268757, "grad_norm": 2.655770778656006, "learning_rate": 4.774356103023516e-05, "loss": 2.5604, "step": 1210 }, { "epoch": 0.1366181410974244, "grad_norm": 5.053645610809326, "learning_rate": 4.772489734975737e-05, "loss": 2.468, "step": 1220 }, { "epoch": 0.13773796192609183, "grad_norm": 7.558941841125488, "learning_rate": 4.7706233669279584e-05, "loss": 2.5013, "step": 1230 }, { "epoch": 0.13885778275475924, "grad_norm": 7.665897369384766, "learning_rate": 4.7687569988801796e-05, "loss": 2.1974, "step": 1240 }, { "epoch": 0.13997760358342665, "grad_norm": 9.41537094116211, "learning_rate": 4.7668906308324e-05, "loss": 2.2467, "step": 1250 }, { "epoch": 0.14109742441209405, "grad_norm": 9.68034839630127, "learning_rate": 4.765024262784621e-05, "loss": 2.5209, "step": 1260 }, { "epoch": 0.1422172452407615, "grad_norm": 6.756275177001953, "learning_rate": 4.7631578947368424e-05, "loss": 2.4288, "step": 1270 }, { "epoch": 0.1433370660694289, "grad_norm": 7.971835613250732, "learning_rate": 4.7612915266890636e-05, "loss": 2.059, "step": 1280 }, { "epoch": 0.1444568868980963, "grad_norm": 7.094338893890381, "learning_rate": 4.759425158641284e-05, "loss": 2.7495, "step": 1290 }, { "epoch": 0.1455767077267637, "grad_norm": 6.793420791625977, "learning_rate": 4.757558790593505e-05, "loss": 1.8302, "step": 1300 }, { "epoch": 0.14669652855543114, "grad_norm": 6.490263938903809, "learning_rate": 4.7556924225457264e-05, "loss": 2.4643, "step": 1310 }, { "epoch": 0.14781634938409854, "grad_norm": 2.3416640758514404, "learning_rate": 4.7538260544979476e-05, "loss": 2.4583, "step": 1320 }, { "epoch": 0.14893617021276595, "grad_norm": 6.409727096557617, "learning_rate": 4.751959686450168e-05, "loss": 1.7198, "step": 1330 }, { "epoch": 0.15005599104143338, "grad_norm": 3.995352029800415, "learning_rate": 4.750093318402389e-05, "loss": 2.1065, "step": 1340 }, { "epoch": 0.1511758118701008, "grad_norm": 4.906558036804199, "learning_rate": 4.74822695035461e-05, "loss": 2.3096, "step": 1350 }, { "epoch": 0.1522956326987682, "grad_norm": 6.388749122619629, "learning_rate": 4.7463605823068316e-05, "loss": 1.9702, "step": 1360 }, { "epoch": 0.1534154535274356, "grad_norm": 2.246985673904419, "learning_rate": 4.744494214259052e-05, "loss": 2.1969, "step": 1370 }, { "epoch": 0.15453527435610304, "grad_norm": 6.625758647918701, "learning_rate": 4.742627846211273e-05, "loss": 2.4131, "step": 1380 }, { "epoch": 0.15565509518477044, "grad_norm": 2.558464527130127, "learning_rate": 4.740761478163494e-05, "loss": 2.5493, "step": 1390 }, { "epoch": 0.15677491601343785, "grad_norm": 4.546473979949951, "learning_rate": 4.7388951101157156e-05, "loss": 2.8893, "step": 1400 }, { "epoch": 0.15789473684210525, "grad_norm": 2.1784298419952393, "learning_rate": 4.737028742067936e-05, "loss": 2.3148, "step": 1410 }, { "epoch": 0.1590145576707727, "grad_norm": 2.0193071365356445, "learning_rate": 4.735162374020157e-05, "loss": 2.6046, "step": 1420 }, { "epoch": 0.1601343784994401, "grad_norm": 2.74428653717041, "learning_rate": 4.733296005972378e-05, "loss": 2.6196, "step": 1430 }, { "epoch": 0.1612541993281075, "grad_norm": 10.364500045776367, "learning_rate": 4.731429637924599e-05, "loss": 2.473, "step": 1440 }, { "epoch": 0.1623740201567749, "grad_norm": 7.268424034118652, "learning_rate": 4.72956326987682e-05, "loss": 2.9277, "step": 1450 }, { "epoch": 0.16349384098544234, "grad_norm": 7.980413913726807, "learning_rate": 4.7276969018290406e-05, "loss": 2.4812, "step": 1460 }, { "epoch": 0.16461366181410975, "grad_norm": 4.977534770965576, "learning_rate": 4.725830533781262e-05, "loss": 2.3111, "step": 1470 }, { "epoch": 0.16573348264277715, "grad_norm": 2.0615103244781494, "learning_rate": 4.723964165733483e-05, "loss": 2.4883, "step": 1480 }, { "epoch": 0.16685330347144456, "grad_norm": 2.4058101177215576, "learning_rate": 4.722097797685704e-05, "loss": 2.2982, "step": 1490 }, { "epoch": 0.167973124300112, "grad_norm": 5.251309871673584, "learning_rate": 4.7202314296379246e-05, "loss": 2.186, "step": 1500 }, { "epoch": 0.1690929451287794, "grad_norm": 4.544527053833008, "learning_rate": 4.718365061590146e-05, "loss": 1.8374, "step": 1510 }, { "epoch": 0.1702127659574468, "grad_norm": 8.125224113464355, "learning_rate": 4.716498693542367e-05, "loss": 2.3848, "step": 1520 }, { "epoch": 0.1713325867861142, "grad_norm": 2.274805784225464, "learning_rate": 4.714632325494588e-05, "loss": 2.1904, "step": 1530 }, { "epoch": 0.17245240761478164, "grad_norm": 7.999364376068115, "learning_rate": 4.7127659574468086e-05, "loss": 2.4646, "step": 1540 }, { "epoch": 0.17357222844344905, "grad_norm": 4.198975086212158, "learning_rate": 4.71089958939903e-05, "loss": 2.2459, "step": 1550 }, { "epoch": 0.17469204927211646, "grad_norm": 8.396190643310547, "learning_rate": 4.70903322135125e-05, "loss": 2.75, "step": 1560 }, { "epoch": 0.17581187010078386, "grad_norm": 2.833841562271118, "learning_rate": 4.707166853303472e-05, "loss": 2.5594, "step": 1570 }, { "epoch": 0.1769316909294513, "grad_norm": 2.6558115482330322, "learning_rate": 4.7053004852556926e-05, "loss": 2.2972, "step": 1580 }, { "epoch": 0.1780515117581187, "grad_norm": 7.599963188171387, "learning_rate": 4.703434117207914e-05, "loss": 2.5468, "step": 1590 }, { "epoch": 0.1791713325867861, "grad_norm": 2.6800622940063477, "learning_rate": 4.701567749160134e-05, "loss": 2.4275, "step": 1600 }, { "epoch": 0.18029115341545351, "grad_norm": 9.46832275390625, "learning_rate": 4.699701381112356e-05, "loss": 2.2797, "step": 1610 }, { "epoch": 0.18141097424412095, "grad_norm": 2.8210015296936035, "learning_rate": 4.6978350130645766e-05, "loss": 2.3262, "step": 1620 }, { "epoch": 0.18253079507278835, "grad_norm": 6.384908676147461, "learning_rate": 4.695968645016798e-05, "loss": 1.852, "step": 1630 }, { "epoch": 0.18365061590145576, "grad_norm": 11.738371849060059, "learning_rate": 4.694102276969018e-05, "loss": 2.4613, "step": 1640 }, { "epoch": 0.18477043673012317, "grad_norm": 2.614558696746826, "learning_rate": 4.6922359089212394e-05, "loss": 2.3428, "step": 1650 }, { "epoch": 0.1858902575587906, "grad_norm": 3.371556282043457, "learning_rate": 4.6903695408734606e-05, "loss": 2.8777, "step": 1660 }, { "epoch": 0.187010078387458, "grad_norm": 2.5485849380493164, "learning_rate": 4.688503172825681e-05, "loss": 2.1301, "step": 1670 }, { "epoch": 0.1881298992161254, "grad_norm": 4.2750935554504395, "learning_rate": 4.686636804777902e-05, "loss": 2.5627, "step": 1680 }, { "epoch": 0.18924972004479285, "grad_norm": 2.555360794067383, "learning_rate": 4.6847704367301234e-05, "loss": 2.19, "step": 1690 }, { "epoch": 0.19036954087346025, "grad_norm": 6.980922698974609, "learning_rate": 4.6829040686823446e-05, "loss": 2.2301, "step": 1700 }, { "epoch": 0.19148936170212766, "grad_norm": 4.802427768707275, "learning_rate": 4.681037700634565e-05, "loss": 2.1102, "step": 1710 }, { "epoch": 0.19260918253079506, "grad_norm": 6.685520172119141, "learning_rate": 4.679171332586786e-05, "loss": 2.3725, "step": 1720 }, { "epoch": 0.1937290033594625, "grad_norm": 2.2345573902130127, "learning_rate": 4.6773049645390074e-05, "loss": 2.1639, "step": 1730 }, { "epoch": 0.1948488241881299, "grad_norm": 2.531062364578247, "learning_rate": 4.6754385964912286e-05, "loss": 1.8379, "step": 1740 }, { "epoch": 0.1959686450167973, "grad_norm": 7.543485164642334, "learning_rate": 4.673572228443449e-05, "loss": 2.5206, "step": 1750 }, { "epoch": 0.19708846584546472, "grad_norm": 4.684238910675049, "learning_rate": 4.67170586039567e-05, "loss": 2.1743, "step": 1760 }, { "epoch": 0.19820828667413215, "grad_norm": 8.91139030456543, "learning_rate": 4.669839492347891e-05, "loss": 2.3441, "step": 1770 }, { "epoch": 0.19932810750279956, "grad_norm": 2.0204806327819824, "learning_rate": 4.6679731243001126e-05, "loss": 2.3745, "step": 1780 }, { "epoch": 0.20044792833146696, "grad_norm": 10.537651062011719, "learning_rate": 4.666106756252333e-05, "loss": 2.587, "step": 1790 }, { "epoch": 0.20156774916013437, "grad_norm": 3.3336009979248047, "learning_rate": 4.664240388204554e-05, "loss": 2.7144, "step": 1800 }, { "epoch": 0.2026875699888018, "grad_norm": 3.2457361221313477, "learning_rate": 4.662374020156775e-05, "loss": 2.1114, "step": 1810 }, { "epoch": 0.2038073908174692, "grad_norm": 6.266234874725342, "learning_rate": 4.6605076521089966e-05, "loss": 2.3444, "step": 1820 }, { "epoch": 0.20492721164613661, "grad_norm": 5.921943187713623, "learning_rate": 4.658641284061217e-05, "loss": 2.2345, "step": 1830 }, { "epoch": 0.20604703247480402, "grad_norm": 2.481746196746826, "learning_rate": 4.656774916013438e-05, "loss": 2.3925, "step": 1840 }, { "epoch": 0.20716685330347145, "grad_norm": 6.096205711364746, "learning_rate": 4.654908547965659e-05, "loss": 2.4366, "step": 1850 }, { "epoch": 0.20828667413213886, "grad_norm": 7.671387672424316, "learning_rate": 4.65304217991788e-05, "loss": 2.5033, "step": 1860 }, { "epoch": 0.20940649496080627, "grad_norm": 4.001086711883545, "learning_rate": 4.651175811870101e-05, "loss": 2.0047, "step": 1870 }, { "epoch": 0.21052631578947367, "grad_norm": 7.602363586425781, "learning_rate": 4.649309443822322e-05, "loss": 2.3823, "step": 1880 }, { "epoch": 0.2116461366181411, "grad_norm": 5.483312129974365, "learning_rate": 4.647443075774543e-05, "loss": 2.4785, "step": 1890 }, { "epoch": 0.2127659574468085, "grad_norm": 2.5652925968170166, "learning_rate": 4.645576707726764e-05, "loss": 2.1959, "step": 1900 }, { "epoch": 0.21388577827547592, "grad_norm": 8.491823196411133, "learning_rate": 4.643710339678985e-05, "loss": 2.9472, "step": 1910 }, { "epoch": 0.21500559910414332, "grad_norm": 5.945290565490723, "learning_rate": 4.6418439716312056e-05, "loss": 2.2415, "step": 1920 }, { "epoch": 0.21612541993281076, "grad_norm": 4.045243263244629, "learning_rate": 4.639977603583427e-05, "loss": 2.4932, "step": 1930 }, { "epoch": 0.21724524076147816, "grad_norm": 2.715601921081543, "learning_rate": 4.638111235535648e-05, "loss": 2.4262, "step": 1940 }, { "epoch": 0.21836506159014557, "grad_norm": 3.0143299102783203, "learning_rate": 4.636244867487869e-05, "loss": 2.6019, "step": 1950 }, { "epoch": 0.21948488241881298, "grad_norm": 9.742323875427246, "learning_rate": 4.6343784994400896e-05, "loss": 2.9155, "step": 1960 }, { "epoch": 0.2206047032474804, "grad_norm": 5.9390788078308105, "learning_rate": 4.632512131392311e-05, "loss": 2.1963, "step": 1970 }, { "epoch": 0.22172452407614782, "grad_norm": 5.941153049468994, "learning_rate": 4.630645763344531e-05, "loss": 2.2618, "step": 1980 }, { "epoch": 0.22284434490481522, "grad_norm": 4.004471778869629, "learning_rate": 4.628779395296753e-05, "loss": 2.6587, "step": 1990 }, { "epoch": 0.22396416573348266, "grad_norm": 8.82131576538086, "learning_rate": 4.6269130272489736e-05, "loss": 2.1719, "step": 2000 }, { "epoch": 0.22508398656215006, "grad_norm": 2.8698363304138184, "learning_rate": 4.625046659201195e-05, "loss": 2.4046, "step": 2010 }, { "epoch": 0.22620380739081747, "grad_norm": 6.006710529327393, "learning_rate": 4.623180291153415e-05, "loss": 2.3737, "step": 2020 }, { "epoch": 0.22732362821948487, "grad_norm": 2.5947604179382324, "learning_rate": 4.621313923105637e-05, "loss": 2.4821, "step": 2030 }, { "epoch": 0.2284434490481523, "grad_norm": 2.4432547092437744, "learning_rate": 4.6194475550578576e-05, "loss": 2.5101, "step": 2040 }, { "epoch": 0.22956326987681971, "grad_norm": 12.777518272399902, "learning_rate": 4.617581187010079e-05, "loss": 2.89, "step": 2050 }, { "epoch": 0.23068309070548712, "grad_norm": 8.881490707397461, "learning_rate": 4.615714818962299e-05, "loss": 3.0737, "step": 2060 }, { "epoch": 0.23180291153415453, "grad_norm": 11.968159675598145, "learning_rate": 4.613848450914521e-05, "loss": 2.4398, "step": 2070 }, { "epoch": 0.23292273236282196, "grad_norm": 2.413706064224243, "learning_rate": 4.6119820828667416e-05, "loss": 2.1109, "step": 2080 }, { "epoch": 0.23404255319148937, "grad_norm": 8.401453971862793, "learning_rate": 4.610115714818963e-05, "loss": 2.5007, "step": 2090 }, { "epoch": 0.23516237402015677, "grad_norm": 2.3912086486816406, "learning_rate": 4.608249346771183e-05, "loss": 2.4239, "step": 2100 }, { "epoch": 0.23628219484882418, "grad_norm": 8.813179016113281, "learning_rate": 4.6063829787234044e-05, "loss": 2.4587, "step": 2110 }, { "epoch": 0.2374020156774916, "grad_norm": 10.839656829833984, "learning_rate": 4.6045166106756256e-05, "loss": 2.4691, "step": 2120 }, { "epoch": 0.23852183650615902, "grad_norm": 4.4540252685546875, "learning_rate": 4.602650242627846e-05, "loss": 2.4054, "step": 2130 }, { "epoch": 0.23964165733482642, "grad_norm": 2.7125473022460938, "learning_rate": 4.600783874580067e-05, "loss": 2.7286, "step": 2140 }, { "epoch": 0.24076147816349383, "grad_norm": 2.332322359085083, "learning_rate": 4.5989175065322884e-05, "loss": 2.5039, "step": 2150 }, { "epoch": 0.24188129899216126, "grad_norm": 2.539842367172241, "learning_rate": 4.5970511384845096e-05, "loss": 2.2403, "step": 2160 }, { "epoch": 0.24300111982082867, "grad_norm": 6.839804649353027, "learning_rate": 4.59518477043673e-05, "loss": 2.0901, "step": 2170 }, { "epoch": 0.24412094064949608, "grad_norm": 2.5890653133392334, "learning_rate": 4.593318402388951e-05, "loss": 2.2586, "step": 2180 }, { "epoch": 0.24524076147816348, "grad_norm": 2.5026495456695557, "learning_rate": 4.591452034341172e-05, "loss": 2.1888, "step": 2190 }, { "epoch": 0.24636058230683092, "grad_norm": 3.8693251609802246, "learning_rate": 4.5895856662933936e-05, "loss": 2.6531, "step": 2200 }, { "epoch": 0.24748040313549832, "grad_norm": 8.573837280273438, "learning_rate": 4.587719298245614e-05, "loss": 2.5078, "step": 2210 }, { "epoch": 0.24860022396416573, "grad_norm": 3.1866371631622314, "learning_rate": 4.585852930197835e-05, "loss": 2.2594, "step": 2220 }, { "epoch": 0.24972004479283313, "grad_norm": 7.868608474731445, "learning_rate": 4.583986562150056e-05, "loss": 2.2759, "step": 2230 }, { "epoch": 0.25083986562150057, "grad_norm": 3.183617353439331, "learning_rate": 4.5821201941022776e-05, "loss": 2.5888, "step": 2240 }, { "epoch": 0.251959686450168, "grad_norm": 2.5060982704162598, "learning_rate": 4.580253826054498e-05, "loss": 2.1064, "step": 2250 }, { "epoch": 0.2530795072788354, "grad_norm": 2.9019861221313477, "learning_rate": 4.578387458006719e-05, "loss": 2.1935, "step": 2260 }, { "epoch": 0.2541993281075028, "grad_norm": 13.326761245727539, "learning_rate": 4.57652108995894e-05, "loss": 2.5056, "step": 2270 }, { "epoch": 0.2553191489361702, "grad_norm": 7.620180130004883, "learning_rate": 4.5746547219111616e-05, "loss": 2.6445, "step": 2280 }, { "epoch": 0.2564389697648376, "grad_norm": 6.347967147827148, "learning_rate": 4.572788353863382e-05, "loss": 2.1387, "step": 2290 }, { "epoch": 0.25755879059350506, "grad_norm": 7.242101192474365, "learning_rate": 4.570921985815603e-05, "loss": 2.4346, "step": 2300 }, { "epoch": 0.25867861142217247, "grad_norm": 7.027688503265381, "learning_rate": 4.569055617767824e-05, "loss": 2.2526, "step": 2310 }, { "epoch": 0.2597984322508399, "grad_norm": 6.494021892547607, "learning_rate": 4.567189249720045e-05, "loss": 1.9165, "step": 2320 }, { "epoch": 0.2609182530795073, "grad_norm": 8.93453311920166, "learning_rate": 4.565322881672266e-05, "loss": 2.1397, "step": 2330 }, { "epoch": 0.2620380739081747, "grad_norm": 2.471494197845459, "learning_rate": 4.563456513624487e-05, "loss": 2.2892, "step": 2340 }, { "epoch": 0.2631578947368421, "grad_norm": 10.424552917480469, "learning_rate": 4.561590145576708e-05, "loss": 2.514, "step": 2350 }, { "epoch": 0.2642777155655095, "grad_norm": 7.312840938568115, "learning_rate": 4.559723777528929e-05, "loss": 2.2164, "step": 2360 }, { "epoch": 0.26539753639417696, "grad_norm": 11.861546516418457, "learning_rate": 4.55785740948115e-05, "loss": 2.2371, "step": 2370 }, { "epoch": 0.26651735722284436, "grad_norm": 9.549253463745117, "learning_rate": 4.5559910414333706e-05, "loss": 2.6811, "step": 2380 }, { "epoch": 0.26763717805151177, "grad_norm": 2.9422247409820557, "learning_rate": 4.554124673385592e-05, "loss": 2.2888, "step": 2390 }, { "epoch": 0.2687569988801792, "grad_norm": 7.779324054718018, "learning_rate": 4.552258305337813e-05, "loss": 2.3416, "step": 2400 }, { "epoch": 0.2698768197088466, "grad_norm": 2.1986162662506104, "learning_rate": 4.550391937290034e-05, "loss": 2.4494, "step": 2410 }, { "epoch": 0.270996640537514, "grad_norm": 2.420370578765869, "learning_rate": 4.5485255692422546e-05, "loss": 2.6193, "step": 2420 }, { "epoch": 0.2721164613661814, "grad_norm": 2.281414747238159, "learning_rate": 4.546659201194476e-05, "loss": 2.7233, "step": 2430 }, { "epoch": 0.2732362821948488, "grad_norm": 2.4500784873962402, "learning_rate": 4.544792833146696e-05, "loss": 2.658, "step": 2440 }, { "epoch": 0.27435610302351626, "grad_norm": 8.000895500183105, "learning_rate": 4.542926465098918e-05, "loss": 2.7245, "step": 2450 }, { "epoch": 0.27547592385218367, "grad_norm": 2.6147563457489014, "learning_rate": 4.5410600970511386e-05, "loss": 2.4904, "step": 2460 }, { "epoch": 0.2765957446808511, "grad_norm": 12.0834321975708, "learning_rate": 4.53919372900336e-05, "loss": 2.6572, "step": 2470 }, { "epoch": 0.2777155655095185, "grad_norm": 5.857783317565918, "learning_rate": 4.53732736095558e-05, "loss": 1.98, "step": 2480 }, { "epoch": 0.2788353863381859, "grad_norm": 5.242463111877441, "learning_rate": 4.535460992907802e-05, "loss": 2.4122, "step": 2490 }, { "epoch": 0.2799552071668533, "grad_norm": 9.532788276672363, "learning_rate": 4.5335946248600226e-05, "loss": 2.6636, "step": 2500 }, { "epoch": 0.2810750279955207, "grad_norm": 8.554610252380371, "learning_rate": 4.531728256812244e-05, "loss": 2.4047, "step": 2510 }, { "epoch": 0.2821948488241881, "grad_norm": 7.8059234619140625, "learning_rate": 4.529861888764464e-05, "loss": 2.4136, "step": 2520 }, { "epoch": 0.28331466965285557, "grad_norm": 4.768645286560059, "learning_rate": 4.5279955207166854e-05, "loss": 2.6977, "step": 2530 }, { "epoch": 0.284434490481523, "grad_norm": 6.272426128387451, "learning_rate": 4.5261291526689066e-05, "loss": 2.7218, "step": 2540 }, { "epoch": 0.2855543113101904, "grad_norm": 2.5695507526397705, "learning_rate": 4.524262784621128e-05, "loss": 2.8431, "step": 2550 }, { "epoch": 0.2866741321388578, "grad_norm": 2.400848865509033, "learning_rate": 4.522396416573348e-05, "loss": 2.3621, "step": 2560 }, { "epoch": 0.2877939529675252, "grad_norm": 10.295741081237793, "learning_rate": 4.5205300485255694e-05, "loss": 1.9837, "step": 2570 }, { "epoch": 0.2889137737961926, "grad_norm": 2.625807762145996, "learning_rate": 4.5186636804777906e-05, "loss": 2.3817, "step": 2580 }, { "epoch": 0.29003359462486, "grad_norm": 2.6179468631744385, "learning_rate": 4.516797312430011e-05, "loss": 2.6555, "step": 2590 }, { "epoch": 0.2911534154535274, "grad_norm": 2.512031316757202, "learning_rate": 4.514930944382232e-05, "loss": 2.5653, "step": 2600 }, { "epoch": 0.29227323628219487, "grad_norm": 2.6077969074249268, "learning_rate": 4.5130645763344534e-05, "loss": 2.222, "step": 2610 }, { "epoch": 0.2933930571108623, "grad_norm": 2.072172164916992, "learning_rate": 4.5111982082866746e-05, "loss": 2.1437, "step": 2620 }, { "epoch": 0.2945128779395297, "grad_norm": 4.034156799316406, "learning_rate": 4.509331840238895e-05, "loss": 2.4836, "step": 2630 }, { "epoch": 0.2956326987681971, "grad_norm": 6.953413963317871, "learning_rate": 4.507465472191116e-05, "loss": 2.1186, "step": 2640 }, { "epoch": 0.2967525195968645, "grad_norm": 7.338948726654053, "learning_rate": 4.505599104143337e-05, "loss": 2.1858, "step": 2650 }, { "epoch": 0.2978723404255319, "grad_norm": 6.163172245025635, "learning_rate": 4.5037327360955586e-05, "loss": 2.2643, "step": 2660 }, { "epoch": 0.2989921612541993, "grad_norm": 7.6946563720703125, "learning_rate": 4.501866368047779e-05, "loss": 2.294, "step": 2670 }, { "epoch": 0.30011198208286677, "grad_norm": 12.317503929138184, "learning_rate": 4.5e-05, "loss": 2.5286, "step": 2680 }, { "epoch": 0.3012318029115342, "grad_norm": 7.581274509429932, "learning_rate": 4.498133631952221e-05, "loss": 2.146, "step": 2690 }, { "epoch": 0.3023516237402016, "grad_norm": 6.5440778732299805, "learning_rate": 4.4962672639044426e-05, "loss": 2.4251, "step": 2700 }, { "epoch": 0.303471444568869, "grad_norm": 4.657285213470459, "learning_rate": 4.494400895856663e-05, "loss": 1.8409, "step": 2710 }, { "epoch": 0.3045912653975364, "grad_norm": 1.9951245784759521, "learning_rate": 4.492534527808884e-05, "loss": 2.2119, "step": 2720 }, { "epoch": 0.3057110862262038, "grad_norm": 6.937575340270996, "learning_rate": 4.490668159761105e-05, "loss": 1.8932, "step": 2730 }, { "epoch": 0.3068309070548712, "grad_norm": 12.604211807250977, "learning_rate": 4.488801791713326e-05, "loss": 2.1911, "step": 2740 }, { "epoch": 0.3079507278835386, "grad_norm": 2.306835412979126, "learning_rate": 4.486935423665547e-05, "loss": 2.225, "step": 2750 }, { "epoch": 0.3090705487122061, "grad_norm": 7.850268840789795, "learning_rate": 4.485069055617768e-05, "loss": 2.2062, "step": 2760 }, { "epoch": 0.3101903695408735, "grad_norm": 8.962443351745605, "learning_rate": 4.483202687569989e-05, "loss": 2.0101, "step": 2770 }, { "epoch": 0.3113101903695409, "grad_norm": 2.3884646892547607, "learning_rate": 4.48133631952221e-05, "loss": 2.4539, "step": 2780 }, { "epoch": 0.3124300111982083, "grad_norm": 4.534022808074951, "learning_rate": 4.479469951474431e-05, "loss": 2.0423, "step": 2790 }, { "epoch": 0.3135498320268757, "grad_norm": 2.491356372833252, "learning_rate": 4.477603583426652e-05, "loss": 2.3449, "step": 2800 }, { "epoch": 0.3146696528555431, "grad_norm": 5.900778293609619, "learning_rate": 4.475737215378873e-05, "loss": 2.3496, "step": 2810 }, { "epoch": 0.3157894736842105, "grad_norm": 3.9138317108154297, "learning_rate": 4.473870847331094e-05, "loss": 2.2142, "step": 2820 }, { "epoch": 0.3169092945128779, "grad_norm": 9.516107559204102, "learning_rate": 4.472004479283315e-05, "loss": 2.0819, "step": 2830 }, { "epoch": 0.3180291153415454, "grad_norm": 2.504873275756836, "learning_rate": 4.4701381112355356e-05, "loss": 2.0613, "step": 2840 }, { "epoch": 0.3191489361702128, "grad_norm": 8.265789031982422, "learning_rate": 4.468271743187757e-05, "loss": 2.2209, "step": 2850 }, { "epoch": 0.3202687569988802, "grad_norm": 3.2764816284179688, "learning_rate": 4.466405375139977e-05, "loss": 2.7096, "step": 2860 }, { "epoch": 0.3213885778275476, "grad_norm": 10.983661651611328, "learning_rate": 4.464539007092199e-05, "loss": 2.638, "step": 2870 }, { "epoch": 0.322508398656215, "grad_norm": 2.8227787017822266, "learning_rate": 4.4626726390444196e-05, "loss": 2.6929, "step": 2880 }, { "epoch": 0.3236282194848824, "grad_norm": 2.553760528564453, "learning_rate": 4.460806270996641e-05, "loss": 1.8541, "step": 2890 }, { "epoch": 0.3247480403135498, "grad_norm": 9.215750694274902, "learning_rate": 4.458939902948861e-05, "loss": 2.3142, "step": 2900 }, { "epoch": 0.3258678611422172, "grad_norm": 3.168344020843506, "learning_rate": 4.457073534901083e-05, "loss": 2.4242, "step": 2910 }, { "epoch": 0.3269876819708847, "grad_norm": 3.0249898433685303, "learning_rate": 4.4552071668533036e-05, "loss": 2.0356, "step": 2920 }, { "epoch": 0.3281075027995521, "grad_norm": 7.524886608123779, "learning_rate": 4.453340798805525e-05, "loss": 2.7437, "step": 2930 }, { "epoch": 0.3292273236282195, "grad_norm": 8.902599334716797, "learning_rate": 4.451474430757745e-05, "loss": 2.0116, "step": 2940 }, { "epoch": 0.3303471444568869, "grad_norm": 3.2997946739196777, "learning_rate": 4.4496080627099664e-05, "loss": 2.359, "step": 2950 }, { "epoch": 0.3314669652855543, "grad_norm": 3.123281717300415, "learning_rate": 4.4477416946621876e-05, "loss": 2.2666, "step": 2960 }, { "epoch": 0.3325867861142217, "grad_norm": 10.098536491394043, "learning_rate": 4.445875326614409e-05, "loss": 2.501, "step": 2970 }, { "epoch": 0.3337066069428891, "grad_norm": 11.130685806274414, "learning_rate": 4.444008958566629e-05, "loss": 2.4014, "step": 2980 }, { "epoch": 0.3348264277715566, "grad_norm": 8.4888334274292, "learning_rate": 4.4421425905188505e-05, "loss": 2.3178, "step": 2990 }, { "epoch": 0.335946248600224, "grad_norm": 8.757832527160645, "learning_rate": 4.4402762224710716e-05, "loss": 2.1205, "step": 3000 }, { "epoch": 0.3370660694288914, "grad_norm": 8.70385456085205, "learning_rate": 4.438409854423293e-05, "loss": 2.4269, "step": 3010 }, { "epoch": 0.3381858902575588, "grad_norm": 8.281830787658691, "learning_rate": 4.436543486375513e-05, "loss": 2.5491, "step": 3020 }, { "epoch": 0.3393057110862262, "grad_norm": 9.058775901794434, "learning_rate": 4.4346771183277345e-05, "loss": 2.3167, "step": 3030 }, { "epoch": 0.3404255319148936, "grad_norm": 5.364592552185059, "learning_rate": 4.4328107502799556e-05, "loss": 1.9588, "step": 3040 }, { "epoch": 0.341545352743561, "grad_norm": 2.446974277496338, "learning_rate": 4.430944382232177e-05, "loss": 2.3064, "step": 3050 }, { "epoch": 0.3426651735722284, "grad_norm": 2.6895692348480225, "learning_rate": 4.429078014184397e-05, "loss": 2.3751, "step": 3060 }, { "epoch": 0.3437849944008959, "grad_norm": 7.783231735229492, "learning_rate": 4.427211646136618e-05, "loss": 1.6835, "step": 3070 }, { "epoch": 0.3449048152295633, "grad_norm": 3.170950412750244, "learning_rate": 4.4253452780888396e-05, "loss": 2.5539, "step": 3080 }, { "epoch": 0.3460246360582307, "grad_norm": 7.711115837097168, "learning_rate": 4.42347891004106e-05, "loss": 2.78, "step": 3090 }, { "epoch": 0.3471444568868981, "grad_norm": 8.71380615234375, "learning_rate": 4.421612541993281e-05, "loss": 1.7815, "step": 3100 }, { "epoch": 0.3482642777155655, "grad_norm": 2.3626303672790527, "learning_rate": 4.419746173945502e-05, "loss": 2.3035, "step": 3110 }, { "epoch": 0.3493840985442329, "grad_norm": 2.5161445140838623, "learning_rate": 4.4178798058977236e-05, "loss": 2.3065, "step": 3120 }, { "epoch": 0.3505039193729003, "grad_norm": 2.395263433456421, "learning_rate": 4.416013437849944e-05, "loss": 2.3419, "step": 3130 }, { "epoch": 0.3516237402015677, "grad_norm": 6.902035713195801, "learning_rate": 4.414147069802165e-05, "loss": 2.3343, "step": 3140 }, { "epoch": 0.3527435610302352, "grad_norm": 5.079914093017578, "learning_rate": 4.412280701754386e-05, "loss": 2.4017, "step": 3150 }, { "epoch": 0.3538633818589026, "grad_norm": 3.483292579650879, "learning_rate": 4.410414333706607e-05, "loss": 2.3309, "step": 3160 }, { "epoch": 0.35498320268757, "grad_norm": 7.4583940505981445, "learning_rate": 4.408547965658828e-05, "loss": 2.1556, "step": 3170 }, { "epoch": 0.3561030235162374, "grad_norm": 16.233184814453125, "learning_rate": 4.406681597611049e-05, "loss": 2.2442, "step": 3180 }, { "epoch": 0.3572228443449048, "grad_norm": 9.553163528442383, "learning_rate": 4.40481522956327e-05, "loss": 2.1811, "step": 3190 }, { "epoch": 0.3583426651735722, "grad_norm": 5.221775531768799, "learning_rate": 4.402948861515491e-05, "loss": 2.8077, "step": 3200 }, { "epoch": 0.3594624860022396, "grad_norm": 2.419001579284668, "learning_rate": 4.401082493467712e-05, "loss": 1.9981, "step": 3210 }, { "epoch": 0.36058230683090703, "grad_norm": 2.5910959243774414, "learning_rate": 4.399216125419933e-05, "loss": 1.9731, "step": 3220 }, { "epoch": 0.3617021276595745, "grad_norm": 3.1020877361297607, "learning_rate": 4.397349757372154e-05, "loss": 2.0817, "step": 3230 }, { "epoch": 0.3628219484882419, "grad_norm": 3.0343470573425293, "learning_rate": 4.395483389324375e-05, "loss": 2.3142, "step": 3240 }, { "epoch": 0.3639417693169093, "grad_norm": 12.035741806030273, "learning_rate": 4.393617021276596e-05, "loss": 2.4769, "step": 3250 }, { "epoch": 0.3650615901455767, "grad_norm": 3.116953134536743, "learning_rate": 4.391750653228817e-05, "loss": 2.347, "step": 3260 }, { "epoch": 0.3661814109742441, "grad_norm": 2.565833330154419, "learning_rate": 4.389884285181038e-05, "loss": 1.9123, "step": 3270 }, { "epoch": 0.3673012318029115, "grad_norm": 2.983285427093506, "learning_rate": 4.388017917133259e-05, "loss": 2.4622, "step": 3280 }, { "epoch": 0.3684210526315789, "grad_norm": 11.630106925964355, "learning_rate": 4.38615154908548e-05, "loss": 2.4459, "step": 3290 }, { "epoch": 0.36954087346024633, "grad_norm": 6.8500285148620605, "learning_rate": 4.3842851810377006e-05, "loss": 2.143, "step": 3300 }, { "epoch": 0.3706606942889138, "grad_norm": 2.3746914863586426, "learning_rate": 4.382418812989922e-05, "loss": 2.143, "step": 3310 }, { "epoch": 0.3717805151175812, "grad_norm": 2.91323184967041, "learning_rate": 4.380552444942142e-05, "loss": 2.2025, "step": 3320 }, { "epoch": 0.3729003359462486, "grad_norm": 2.4903807640075684, "learning_rate": 4.378686076894364e-05, "loss": 2.3756, "step": 3330 }, { "epoch": 0.374020156774916, "grad_norm": 4.964207172393799, "learning_rate": 4.3768197088465846e-05, "loss": 1.7364, "step": 3340 }, { "epoch": 0.3751399776035834, "grad_norm": 7.413595199584961, "learning_rate": 4.374953340798806e-05, "loss": 2.3942, "step": 3350 }, { "epoch": 0.3762597984322508, "grad_norm": 2.7675399780273438, "learning_rate": 4.373086972751026e-05, "loss": 2.5504, "step": 3360 }, { "epoch": 0.37737961926091823, "grad_norm": 1.9106221199035645, "learning_rate": 4.3712206047032475e-05, "loss": 2.2394, "step": 3370 }, { "epoch": 0.3784994400895857, "grad_norm": 5.255868911743164, "learning_rate": 4.3693542366554686e-05, "loss": 1.7869, "step": 3380 }, { "epoch": 0.3796192609182531, "grad_norm": 4.734898567199707, "learning_rate": 4.36748786860769e-05, "loss": 2.1864, "step": 3390 }, { "epoch": 0.3807390817469205, "grad_norm": 11.226783752441406, "learning_rate": 4.36562150055991e-05, "loss": 2.4792, "step": 3400 }, { "epoch": 0.3818589025755879, "grad_norm": 8.230179786682129, "learning_rate": 4.3637551325121315e-05, "loss": 1.9946, "step": 3410 }, { "epoch": 0.3829787234042553, "grad_norm": 2.981816291809082, "learning_rate": 4.3618887644643526e-05, "loss": 1.9636, "step": 3420 }, { "epoch": 0.3840985442329227, "grad_norm": 7.890393257141113, "learning_rate": 4.360022396416574e-05, "loss": 2.0774, "step": 3430 }, { "epoch": 0.38521836506159013, "grad_norm": 2.7089128494262695, "learning_rate": 4.358156028368794e-05, "loss": 2.1393, "step": 3440 }, { "epoch": 0.38633818589025753, "grad_norm": 7.063770771026611, "learning_rate": 4.3562896603210155e-05, "loss": 1.9429, "step": 3450 }, { "epoch": 0.387458006718925, "grad_norm": 2.608469247817993, "learning_rate": 4.3544232922732366e-05, "loss": 2.8053, "step": 3460 }, { "epoch": 0.3885778275475924, "grad_norm": 2.1650965213775635, "learning_rate": 4.352556924225458e-05, "loss": 2.4611, "step": 3470 }, { "epoch": 0.3896976483762598, "grad_norm": 7.017950057983398, "learning_rate": 4.350690556177678e-05, "loss": 2.0722, "step": 3480 }, { "epoch": 0.3908174692049272, "grad_norm": 2.769286870956421, "learning_rate": 4.3488241881298995e-05, "loss": 2.4542, "step": 3490 }, { "epoch": 0.3919372900335946, "grad_norm": 9.565979957580566, "learning_rate": 4.3469578200821206e-05, "loss": 2.6218, "step": 3500 }, { "epoch": 0.393057110862262, "grad_norm": 12.220897674560547, "learning_rate": 4.345091452034342e-05, "loss": 2.248, "step": 3510 }, { "epoch": 0.39417693169092943, "grad_norm": 12.827961921691895, "learning_rate": 4.343225083986562e-05, "loss": 1.9856, "step": 3520 }, { "epoch": 0.39529675251959684, "grad_norm": 2.4457015991210938, "learning_rate": 4.341358715938783e-05, "loss": 2.6235, "step": 3530 }, { "epoch": 0.3964165733482643, "grad_norm": 5.266937255859375, "learning_rate": 4.3394923478910046e-05, "loss": 2.4887, "step": 3540 }, { "epoch": 0.3975363941769317, "grad_norm": 8.347966194152832, "learning_rate": 4.337625979843225e-05, "loss": 2.212, "step": 3550 }, { "epoch": 0.3986562150055991, "grad_norm": 7.743762969970703, "learning_rate": 4.335759611795446e-05, "loss": 2.3995, "step": 3560 }, { "epoch": 0.3997760358342665, "grad_norm": 3.587676763534546, "learning_rate": 4.333893243747667e-05, "loss": 2.355, "step": 3570 }, { "epoch": 0.4008958566629339, "grad_norm": 3.1175928115844727, "learning_rate": 4.3320268756998886e-05, "loss": 2.5916, "step": 3580 }, { "epoch": 0.40201567749160133, "grad_norm": 8.9489107131958, "learning_rate": 4.330160507652109e-05, "loss": 2.6132, "step": 3590 }, { "epoch": 0.40313549832026874, "grad_norm": 12.342984199523926, "learning_rate": 4.32829413960433e-05, "loss": 2.3637, "step": 3600 }, { "epoch": 0.40425531914893614, "grad_norm": 2.721482276916504, "learning_rate": 4.326427771556551e-05, "loss": 2.3011, "step": 3610 }, { "epoch": 0.4053751399776036, "grad_norm": 2.5782060623168945, "learning_rate": 4.324561403508772e-05, "loss": 2.1841, "step": 3620 }, { "epoch": 0.406494960806271, "grad_norm": 2.5713908672332764, "learning_rate": 4.322695035460993e-05, "loss": 2.3976, "step": 3630 }, { "epoch": 0.4076147816349384, "grad_norm": 7.063972473144531, "learning_rate": 4.320828667413214e-05, "loss": 2.4057, "step": 3640 }, { "epoch": 0.4087346024636058, "grad_norm": 8.767318725585938, "learning_rate": 4.318962299365435e-05, "loss": 2.3432, "step": 3650 }, { "epoch": 0.40985442329227323, "grad_norm": 9.010395050048828, "learning_rate": 4.317095931317656e-05, "loss": 2.3386, "step": 3660 }, { "epoch": 0.41097424412094063, "grad_norm": 5.226011276245117, "learning_rate": 4.315229563269877e-05, "loss": 1.9751, "step": 3670 }, { "epoch": 0.41209406494960804, "grad_norm": 2.9475603103637695, "learning_rate": 4.313363195222098e-05, "loss": 2.0421, "step": 3680 }, { "epoch": 0.4132138857782755, "grad_norm": 2.4759316444396973, "learning_rate": 4.311496827174319e-05, "loss": 2.0569, "step": 3690 }, { "epoch": 0.4143337066069429, "grad_norm": 2.6276895999908447, "learning_rate": 4.30963045912654e-05, "loss": 1.8686, "step": 3700 }, { "epoch": 0.4154535274356103, "grad_norm": 5.415910243988037, "learning_rate": 4.307764091078761e-05, "loss": 2.2398, "step": 3710 }, { "epoch": 0.4165733482642777, "grad_norm": 9.693281173706055, "learning_rate": 4.305897723030982e-05, "loss": 2.4323, "step": 3720 }, { "epoch": 0.4176931690929451, "grad_norm": 6.599532127380371, "learning_rate": 4.304031354983203e-05, "loss": 2.0143, "step": 3730 }, { "epoch": 0.41881298992161253, "grad_norm": 4.097227096557617, "learning_rate": 4.302164986935424e-05, "loss": 2.3824, "step": 3740 }, { "epoch": 0.41993281075027994, "grad_norm": 3.5678653717041016, "learning_rate": 4.300298618887645e-05, "loss": 2.0583, "step": 3750 }, { "epoch": 0.42105263157894735, "grad_norm": 13.594582557678223, "learning_rate": 4.2984322508398656e-05, "loss": 2.3702, "step": 3760 }, { "epoch": 0.4221724524076148, "grad_norm": 10.508759498596191, "learning_rate": 4.296565882792087e-05, "loss": 1.9474, "step": 3770 }, { "epoch": 0.4232922732362822, "grad_norm": 2.452303647994995, "learning_rate": 4.294699514744307e-05, "loss": 2.3192, "step": 3780 }, { "epoch": 0.4244120940649496, "grad_norm": 7.144927978515625, "learning_rate": 4.292833146696529e-05, "loss": 2.1651, "step": 3790 }, { "epoch": 0.425531914893617, "grad_norm": 8.945828437805176, "learning_rate": 4.2909667786487496e-05, "loss": 2.3486, "step": 3800 }, { "epoch": 0.42665173572228443, "grad_norm": 2.609912633895874, "learning_rate": 4.289100410600971e-05, "loss": 1.8995, "step": 3810 }, { "epoch": 0.42777155655095184, "grad_norm": 7.373888969421387, "learning_rate": 4.287234042553191e-05, "loss": 2.0976, "step": 3820 }, { "epoch": 0.42889137737961924, "grad_norm": 2.694624662399292, "learning_rate": 4.2853676745054125e-05, "loss": 1.9521, "step": 3830 }, { "epoch": 0.43001119820828665, "grad_norm": 2.8247783184051514, "learning_rate": 4.2835013064576336e-05, "loss": 2.7777, "step": 3840 }, { "epoch": 0.4311310190369541, "grad_norm": 9.070876121520996, "learning_rate": 4.281634938409855e-05, "loss": 2.4438, "step": 3850 }, { "epoch": 0.4322508398656215, "grad_norm": 5.014525890350342, "learning_rate": 4.279768570362075e-05, "loss": 2.1444, "step": 3860 }, { "epoch": 0.4333706606942889, "grad_norm": 3.661271333694458, "learning_rate": 4.2779022023142965e-05, "loss": 2.052, "step": 3870 }, { "epoch": 0.43449048152295633, "grad_norm": 6.962841033935547, "learning_rate": 4.2760358342665176e-05, "loss": 2.1674, "step": 3880 }, { "epoch": 0.43561030235162373, "grad_norm": 2.9479000568389893, "learning_rate": 4.274169466218739e-05, "loss": 2.0745, "step": 3890 }, { "epoch": 0.43673012318029114, "grad_norm": 2.4860355854034424, "learning_rate": 4.272303098170959e-05, "loss": 2.2103, "step": 3900 }, { "epoch": 0.43784994400895855, "grad_norm": 3.2063636779785156, "learning_rate": 4.2704367301231805e-05, "loss": 2.6216, "step": 3910 }, { "epoch": 0.43896976483762595, "grad_norm": 8.925811767578125, "learning_rate": 4.2685703620754016e-05, "loss": 1.8027, "step": 3920 }, { "epoch": 0.4400895856662934, "grad_norm": 2.438516616821289, "learning_rate": 4.266703994027623e-05, "loss": 2.2463, "step": 3930 }, { "epoch": 0.4412094064949608, "grad_norm": 3.3323545455932617, "learning_rate": 4.264837625979843e-05, "loss": 2.0737, "step": 3940 }, { "epoch": 0.4423292273236282, "grad_norm": 8.51876163482666, "learning_rate": 4.2629712579320645e-05, "loss": 2.366, "step": 3950 }, { "epoch": 0.44344904815229563, "grad_norm": 15.145380973815918, "learning_rate": 4.2611048898842856e-05, "loss": 2.3532, "step": 3960 }, { "epoch": 0.44456886898096304, "grad_norm": 7.819403171539307, "learning_rate": 4.259238521836507e-05, "loss": 2.0457, "step": 3970 }, { "epoch": 0.44568868980963045, "grad_norm": 2.681534767150879, "learning_rate": 4.257372153788727e-05, "loss": 2.159, "step": 3980 }, { "epoch": 0.44680851063829785, "grad_norm": 2.585684061050415, "learning_rate": 4.255505785740948e-05, "loss": 2.5404, "step": 3990 }, { "epoch": 0.4479283314669653, "grad_norm": 6.74754524230957, "learning_rate": 4.2536394176931696e-05, "loss": 2.3685, "step": 4000 }, { "epoch": 0.4490481522956327, "grad_norm": 2.9003231525421143, "learning_rate": 4.25177304964539e-05, "loss": 2.1618, "step": 4010 }, { "epoch": 0.4501679731243001, "grad_norm": 13.633071899414062, "learning_rate": 4.249906681597611e-05, "loss": 2.375, "step": 4020 }, { "epoch": 0.45128779395296753, "grad_norm": 9.91258430480957, "learning_rate": 4.248040313549832e-05, "loss": 2.0909, "step": 4030 }, { "epoch": 0.45240761478163494, "grad_norm": 4.096499443054199, "learning_rate": 4.246173945502053e-05, "loss": 2.1711, "step": 4040 }, { "epoch": 0.45352743561030234, "grad_norm": 10.29516887664795, "learning_rate": 4.244307577454274e-05, "loss": 2.4019, "step": 4050 }, { "epoch": 0.45464725643896975, "grad_norm": 9.679535865783691, "learning_rate": 4.242441209406495e-05, "loss": 2.1732, "step": 4060 }, { "epoch": 0.45576707726763716, "grad_norm": 2.7053027153015137, "learning_rate": 4.240574841358716e-05, "loss": 2.0555, "step": 4070 }, { "epoch": 0.4568868980963046, "grad_norm": 7.90255069732666, "learning_rate": 4.238708473310937e-05, "loss": 2.3904, "step": 4080 }, { "epoch": 0.458006718924972, "grad_norm": 3.999415397644043, "learning_rate": 4.236842105263158e-05, "loss": 2.2267, "step": 4090 }, { "epoch": 0.45912653975363943, "grad_norm": 4.677366256713867, "learning_rate": 4.234975737215379e-05, "loss": 1.7037, "step": 4100 }, { "epoch": 0.46024636058230683, "grad_norm": 10.746310234069824, "learning_rate": 4.2331093691676e-05, "loss": 2.3778, "step": 4110 }, { "epoch": 0.46136618141097424, "grad_norm": 2.9237968921661377, "learning_rate": 4.231243001119821e-05, "loss": 2.2335, "step": 4120 }, { "epoch": 0.46248600223964165, "grad_norm": 3.3537890911102295, "learning_rate": 4.229376633072042e-05, "loss": 2.0904, "step": 4130 }, { "epoch": 0.46360582306830905, "grad_norm": 4.9723358154296875, "learning_rate": 4.227510265024263e-05, "loss": 2.0794, "step": 4140 }, { "epoch": 0.46472564389697646, "grad_norm": 3.5420267581939697, "learning_rate": 4.225643896976484e-05, "loss": 1.95, "step": 4150 }, { "epoch": 0.4658454647256439, "grad_norm": 5.858832359313965, "learning_rate": 4.223777528928705e-05, "loss": 2.3874, "step": 4160 }, { "epoch": 0.4669652855543113, "grad_norm": 3.2437384128570557, "learning_rate": 4.221911160880926e-05, "loss": 1.9892, "step": 4170 }, { "epoch": 0.46808510638297873, "grad_norm": 8.610901832580566, "learning_rate": 4.220044792833147e-05, "loss": 2.8553, "step": 4180 }, { "epoch": 0.46920492721164614, "grad_norm": 8.59118938446045, "learning_rate": 4.218178424785368e-05, "loss": 2.4644, "step": 4190 }, { "epoch": 0.47032474804031354, "grad_norm": 6.852227210998535, "learning_rate": 4.216312056737589e-05, "loss": 2.1337, "step": 4200 }, { "epoch": 0.47144456886898095, "grad_norm": 6.020224571228027, "learning_rate": 4.21444568868981e-05, "loss": 1.9491, "step": 4210 }, { "epoch": 0.47256438969764836, "grad_norm": 4.580352783203125, "learning_rate": 4.2125793206420306e-05, "loss": 2.6378, "step": 4220 }, { "epoch": 0.47368421052631576, "grad_norm": 10.987154006958008, "learning_rate": 4.210712952594252e-05, "loss": 2.3521, "step": 4230 }, { "epoch": 0.4748040313549832, "grad_norm": 2.8252968788146973, "learning_rate": 4.208846584546472e-05, "loss": 2.3216, "step": 4240 }, { "epoch": 0.47592385218365063, "grad_norm": 5.822597503662109, "learning_rate": 4.2069802164986935e-05, "loss": 1.9804, "step": 4250 }, { "epoch": 0.47704367301231804, "grad_norm": 3.373899221420288, "learning_rate": 4.2051138484509146e-05, "loss": 1.9312, "step": 4260 }, { "epoch": 0.47816349384098544, "grad_norm": 8.621574401855469, "learning_rate": 4.203247480403136e-05, "loss": 2.1056, "step": 4270 }, { "epoch": 0.47928331466965285, "grad_norm": 3.1795461177825928, "learning_rate": 4.201381112355356e-05, "loss": 1.9162, "step": 4280 }, { "epoch": 0.48040313549832026, "grad_norm": 3.5701396465301514, "learning_rate": 4.1995147443075775e-05, "loss": 2.3904, "step": 4290 }, { "epoch": 0.48152295632698766, "grad_norm": 2.4584646224975586, "learning_rate": 4.1976483762597986e-05, "loss": 1.9739, "step": 4300 }, { "epoch": 0.4826427771556551, "grad_norm": 9.168150901794434, "learning_rate": 4.19578200821202e-05, "loss": 2.242, "step": 4310 }, { "epoch": 0.48376259798432253, "grad_norm": 6.235483169555664, "learning_rate": 4.19391564016424e-05, "loss": 2.0577, "step": 4320 }, { "epoch": 0.48488241881298993, "grad_norm": 4.428210735321045, "learning_rate": 4.1920492721164615e-05, "loss": 2.3219, "step": 4330 }, { "epoch": 0.48600223964165734, "grad_norm": 10.322796821594238, "learning_rate": 4.1901829040686826e-05, "loss": 2.4196, "step": 4340 }, { "epoch": 0.48712206047032475, "grad_norm": 11.971220016479492, "learning_rate": 4.188316536020904e-05, "loss": 2.221, "step": 4350 }, { "epoch": 0.48824188129899215, "grad_norm": 2.4789071083068848, "learning_rate": 4.186450167973124e-05, "loss": 1.9658, "step": 4360 }, { "epoch": 0.48936170212765956, "grad_norm": 3.5437817573547363, "learning_rate": 4.1845837999253455e-05, "loss": 2.3086, "step": 4370 }, { "epoch": 0.49048152295632697, "grad_norm": 2.637206554412842, "learning_rate": 4.1827174318775666e-05, "loss": 2.0656, "step": 4380 }, { "epoch": 0.4916013437849944, "grad_norm": 2.3072986602783203, "learning_rate": 4.180851063829788e-05, "loss": 1.8875, "step": 4390 }, { "epoch": 0.49272116461366183, "grad_norm": 11.45031452178955, "learning_rate": 4.178984695782008e-05, "loss": 1.8249, "step": 4400 }, { "epoch": 0.49384098544232924, "grad_norm": 8.976868629455566, "learning_rate": 4.1771183277342295e-05, "loss": 2.4993, "step": 4410 }, { "epoch": 0.49496080627099664, "grad_norm": 2.619194507598877, "learning_rate": 4.1752519596864506e-05, "loss": 2.221, "step": 4420 }, { "epoch": 0.49608062709966405, "grad_norm": 10.117256164550781, "learning_rate": 4.173385591638672e-05, "loss": 1.8587, "step": 4430 }, { "epoch": 0.49720044792833146, "grad_norm": 2.874436140060425, "learning_rate": 4.171519223590892e-05, "loss": 1.5831, "step": 4440 }, { "epoch": 0.49832026875699886, "grad_norm": 10.60855484008789, "learning_rate": 4.1696528555431135e-05, "loss": 2.483, "step": 4450 }, { "epoch": 0.49944008958566627, "grad_norm": 5.419251441955566, "learning_rate": 4.167786487495334e-05, "loss": 1.905, "step": 4460 }, { "epoch": 0.5005599104143337, "grad_norm": 4.474939823150635, "learning_rate": 4.165920119447555e-05, "loss": 2.1842, "step": 4470 }, { "epoch": 0.5016797312430011, "grad_norm": 2.611745595932007, "learning_rate": 4.164053751399776e-05, "loss": 2.2377, "step": 4480 }, { "epoch": 0.5027995520716685, "grad_norm": 3.1102752685546875, "learning_rate": 4.162187383351997e-05, "loss": 1.7457, "step": 4490 }, { "epoch": 0.503919372900336, "grad_norm": 3.362260341644287, "learning_rate": 4.160321015304218e-05, "loss": 2.2162, "step": 4500 }, { "epoch": 0.5050391937290034, "grad_norm": 5.862063407897949, "learning_rate": 4.158454647256439e-05, "loss": 1.9277, "step": 4510 }, { "epoch": 0.5061590145576708, "grad_norm": 2.268481969833374, "learning_rate": 4.15658827920866e-05, "loss": 2.0642, "step": 4520 }, { "epoch": 0.5072788353863382, "grad_norm": 3.2130661010742188, "learning_rate": 4.154721911160881e-05, "loss": 2.2511, "step": 4530 }, { "epoch": 0.5083986562150056, "grad_norm": 9.958855628967285, "learning_rate": 4.152855543113102e-05, "loss": 2.8674, "step": 4540 }, { "epoch": 0.509518477043673, "grad_norm": 3.7821731567382812, "learning_rate": 4.150989175065323e-05, "loss": 2.2098, "step": 4550 }, { "epoch": 0.5106382978723404, "grad_norm": 3.210670232772827, "learning_rate": 4.149122807017544e-05, "loss": 2.1041, "step": 4560 }, { "epoch": 0.5117581187010078, "grad_norm": 12.174056053161621, "learning_rate": 4.147256438969765e-05, "loss": 2.1001, "step": 4570 }, { "epoch": 0.5128779395296752, "grad_norm": 10.776714324951172, "learning_rate": 4.145390070921986e-05, "loss": 2.2956, "step": 4580 }, { "epoch": 0.5139977603583427, "grad_norm": 7.005626201629639, "learning_rate": 4.143523702874207e-05, "loss": 2.2044, "step": 4590 }, { "epoch": 0.5151175811870101, "grad_norm": 11.280997276306152, "learning_rate": 4.141657334826428e-05, "loss": 2.3835, "step": 4600 }, { "epoch": 0.5162374020156775, "grad_norm": 8.00539779663086, "learning_rate": 4.139790966778649e-05, "loss": 2.15, "step": 4610 }, { "epoch": 0.5173572228443449, "grad_norm": 11.379185676574707, "learning_rate": 4.13792459873087e-05, "loss": 2.2607, "step": 4620 }, { "epoch": 0.5184770436730123, "grad_norm": 3.4828784465789795, "learning_rate": 4.136058230683091e-05, "loss": 2.6439, "step": 4630 }, { "epoch": 0.5195968645016797, "grad_norm": 8.438654899597168, "learning_rate": 4.134191862635312e-05, "loss": 1.7996, "step": 4640 }, { "epoch": 0.5207166853303471, "grad_norm": 3.8132407665252686, "learning_rate": 4.132325494587533e-05, "loss": 2.1626, "step": 4650 }, { "epoch": 0.5218365061590146, "grad_norm": 11.904292106628418, "learning_rate": 4.130459126539754e-05, "loss": 2.5126, "step": 4660 }, { "epoch": 0.522956326987682, "grad_norm": 8.020877838134766, "learning_rate": 4.1285927584919745e-05, "loss": 2.3528, "step": 4670 }, { "epoch": 0.5240761478163494, "grad_norm": 2.708252191543579, "learning_rate": 4.126726390444196e-05, "loss": 2.0487, "step": 4680 }, { "epoch": 0.5251959686450168, "grad_norm": 3.0927486419677734, "learning_rate": 4.124860022396417e-05, "loss": 1.6742, "step": 4690 }, { "epoch": 0.5263157894736842, "grad_norm": 12.992857933044434, "learning_rate": 4.122993654348637e-05, "loss": 1.9681, "step": 4700 }, { "epoch": 0.5274356103023516, "grad_norm": 6.5138325691223145, "learning_rate": 4.1211272863008585e-05, "loss": 1.923, "step": 4710 }, { "epoch": 0.528555431131019, "grad_norm": 3.025493621826172, "learning_rate": 4.1192609182530796e-05, "loss": 1.6204, "step": 4720 }, { "epoch": 0.5296752519596865, "grad_norm": 3.9649546146392822, "learning_rate": 4.117394550205301e-05, "loss": 2.3652, "step": 4730 }, { "epoch": 0.5307950727883539, "grad_norm": 3.212306499481201, "learning_rate": 4.115528182157521e-05, "loss": 2.0342, "step": 4740 }, { "epoch": 0.5319148936170213, "grad_norm": 9.27729320526123, "learning_rate": 4.1136618141097425e-05, "loss": 1.9462, "step": 4750 }, { "epoch": 0.5330347144456887, "grad_norm": 8.476268768310547, "learning_rate": 4.1117954460619636e-05, "loss": 2.3341, "step": 4760 }, { "epoch": 0.5341545352743561, "grad_norm": 3.2790377140045166, "learning_rate": 4.109929078014185e-05, "loss": 2.1268, "step": 4770 }, { "epoch": 0.5352743561030235, "grad_norm": 2.6565237045288086, "learning_rate": 4.108062709966405e-05, "loss": 1.9036, "step": 4780 }, { "epoch": 0.5363941769316909, "grad_norm": 6.67348051071167, "learning_rate": 4.1061963419186265e-05, "loss": 1.9455, "step": 4790 }, { "epoch": 0.5375139977603584, "grad_norm": 8.719578742980957, "learning_rate": 4.1043299738708476e-05, "loss": 2.1615, "step": 4800 }, { "epoch": 0.5386338185890257, "grad_norm": 5.975245475769043, "learning_rate": 4.102463605823069e-05, "loss": 2.1647, "step": 4810 }, { "epoch": 0.5397536394176932, "grad_norm": 2.856062412261963, "learning_rate": 4.100597237775289e-05, "loss": 2.0456, "step": 4820 }, { "epoch": 0.5408734602463606, "grad_norm": 2.8386130332946777, "learning_rate": 4.0987308697275105e-05, "loss": 2.1015, "step": 4830 }, { "epoch": 0.541993281075028, "grad_norm": 3.556990623474121, "learning_rate": 4.0968645016797316e-05, "loss": 2.2417, "step": 4840 }, { "epoch": 0.5431131019036954, "grad_norm": 7.924489498138428, "learning_rate": 4.094998133631953e-05, "loss": 1.9925, "step": 4850 }, { "epoch": 0.5442329227323628, "grad_norm": 11.762128829956055, "learning_rate": 4.093131765584173e-05, "loss": 2.5949, "step": 4860 }, { "epoch": 0.5453527435610303, "grad_norm": 7.913935661315918, "learning_rate": 4.0912653975363945e-05, "loss": 2.0106, "step": 4870 }, { "epoch": 0.5464725643896976, "grad_norm": 8.176780700683594, "learning_rate": 4.0893990294886156e-05, "loss": 2.1813, "step": 4880 }, { "epoch": 0.5475923852183651, "grad_norm": 2.9236576557159424, "learning_rate": 4.087532661440837e-05, "loss": 2.1256, "step": 4890 }, { "epoch": 0.5487122060470325, "grad_norm": 12.421939849853516, "learning_rate": 4.085666293393057e-05, "loss": 2.2168, "step": 4900 }, { "epoch": 0.5498320268756999, "grad_norm": 2.374150514602661, "learning_rate": 4.0837999253452785e-05, "loss": 2.1841, "step": 4910 }, { "epoch": 0.5509518477043673, "grad_norm": 5.778265953063965, "learning_rate": 4.081933557297499e-05, "loss": 1.866, "step": 4920 }, { "epoch": 0.5520716685330347, "grad_norm": 2.9031143188476562, "learning_rate": 4.08006718924972e-05, "loss": 2.1778, "step": 4930 }, { "epoch": 0.5531914893617021, "grad_norm": 2.8752217292785645, "learning_rate": 4.078200821201941e-05, "loss": 2.1301, "step": 4940 }, { "epoch": 0.5543113101903695, "grad_norm": 6.815023899078369, "learning_rate": 4.076334453154162e-05, "loss": 2.1037, "step": 4950 }, { "epoch": 0.555431131019037, "grad_norm": 3.5605039596557617, "learning_rate": 4.074468085106383e-05, "loss": 2.181, "step": 4960 }, { "epoch": 0.5565509518477044, "grad_norm": 3.3536183834075928, "learning_rate": 4.072601717058604e-05, "loss": 2.1846, "step": 4970 }, { "epoch": 0.5576707726763718, "grad_norm": 14.845537185668945, "learning_rate": 4.070735349010825e-05, "loss": 2.332, "step": 4980 }, { "epoch": 0.5587905935050392, "grad_norm": 6.083976745605469, "learning_rate": 4.068868980963046e-05, "loss": 2.1927, "step": 4990 }, { "epoch": 0.5599104143337066, "grad_norm": 3.194537401199341, "learning_rate": 4.067002612915267e-05, "loss": 2.3715, "step": 5000 }, { "epoch": 0.561030235162374, "grad_norm": 2.682272434234619, "learning_rate": 4.065136244867488e-05, "loss": 1.95, "step": 5010 }, { "epoch": 0.5621500559910414, "grad_norm": 3.429429054260254, "learning_rate": 4.063269876819709e-05, "loss": 2.2317, "step": 5020 }, { "epoch": 0.5632698768197089, "grad_norm": 6.22359037399292, "learning_rate": 4.06140350877193e-05, "loss": 2.2734, "step": 5030 }, { "epoch": 0.5643896976483762, "grad_norm": 12.685219764709473, "learning_rate": 4.059537140724151e-05, "loss": 2.3362, "step": 5040 }, { "epoch": 0.5655095184770437, "grad_norm": 3.122385025024414, "learning_rate": 4.057670772676372e-05, "loss": 2.2147, "step": 5050 }, { "epoch": 0.5666293393057111, "grad_norm": 3.515317678451538, "learning_rate": 4.055804404628593e-05, "loss": 2.3661, "step": 5060 }, { "epoch": 0.5677491601343785, "grad_norm": 5.837533473968506, "learning_rate": 4.053938036580814e-05, "loss": 2.057, "step": 5070 }, { "epoch": 0.568868980963046, "grad_norm": 2.728402614593506, "learning_rate": 4.052071668533035e-05, "loss": 1.9779, "step": 5080 }, { "epoch": 0.5699888017917133, "grad_norm": 5.042017459869385, "learning_rate": 4.050205300485256e-05, "loss": 1.8092, "step": 5090 }, { "epoch": 0.5711086226203808, "grad_norm": 9.7918701171875, "learning_rate": 4.048338932437477e-05, "loss": 2.4933, "step": 5100 }, { "epoch": 0.5722284434490481, "grad_norm": 3.005107879638672, "learning_rate": 4.046472564389698e-05, "loss": 2.2101, "step": 5110 }, { "epoch": 0.5733482642777156, "grad_norm": 4.867323875427246, "learning_rate": 4.044606196341919e-05, "loss": 2.0726, "step": 5120 }, { "epoch": 0.574468085106383, "grad_norm": 4.434531211853027, "learning_rate": 4.0427398282941395e-05, "loss": 2.5219, "step": 5130 }, { "epoch": 0.5755879059350504, "grad_norm": 9.07414722442627, "learning_rate": 4.040873460246361e-05, "loss": 2.2031, "step": 5140 }, { "epoch": 0.5767077267637178, "grad_norm": 2.6722495555877686, "learning_rate": 4.039007092198582e-05, "loss": 2.2475, "step": 5150 }, { "epoch": 0.5778275475923852, "grad_norm": 6.318906784057617, "learning_rate": 4.037140724150802e-05, "loss": 2.1646, "step": 5160 }, { "epoch": 0.5789473684210527, "grad_norm": 2.754269599914551, "learning_rate": 4.0352743561030235e-05, "loss": 2.6936, "step": 5170 }, { "epoch": 0.58006718924972, "grad_norm": 8.491806983947754, "learning_rate": 4.0334079880552446e-05, "loss": 2.308, "step": 5180 }, { "epoch": 0.5811870100783875, "grad_norm": 4.256706714630127, "learning_rate": 4.031541620007466e-05, "loss": 1.9769, "step": 5190 }, { "epoch": 0.5823068309070548, "grad_norm": 8.133840560913086, "learning_rate": 4.029675251959686e-05, "loss": 1.8873, "step": 5200 }, { "epoch": 0.5834266517357223, "grad_norm": 3.6523616313934326, "learning_rate": 4.0278088839119075e-05, "loss": 1.9991, "step": 5210 }, { "epoch": 0.5845464725643897, "grad_norm": 2.976468324661255, "learning_rate": 4.0259425158641286e-05, "loss": 2.6573, "step": 5220 }, { "epoch": 0.5856662933930571, "grad_norm": 8.252400398254395, "learning_rate": 4.02407614781635e-05, "loss": 2.221, "step": 5230 }, { "epoch": 0.5867861142217246, "grad_norm": 3.0009639263153076, "learning_rate": 4.02220977976857e-05, "loss": 2.2523, "step": 5240 }, { "epoch": 0.5879059350503919, "grad_norm": 2.764678955078125, "learning_rate": 4.0203434117207915e-05, "loss": 2.0534, "step": 5250 }, { "epoch": 0.5890257558790594, "grad_norm": 2.818638563156128, "learning_rate": 4.0184770436730126e-05, "loss": 1.9171, "step": 5260 }, { "epoch": 0.5901455767077267, "grad_norm": 3.9487977027893066, "learning_rate": 4.016610675625234e-05, "loss": 2.1506, "step": 5270 }, { "epoch": 0.5912653975363942, "grad_norm": 4.936847686767578, "learning_rate": 4.014744307577454e-05, "loss": 2.1909, "step": 5280 }, { "epoch": 0.5923852183650616, "grad_norm": 14.60064697265625, "learning_rate": 4.0128779395296755e-05, "loss": 2.0533, "step": 5290 }, { "epoch": 0.593505039193729, "grad_norm": 9.342129707336426, "learning_rate": 4.0110115714818966e-05, "loss": 2.0769, "step": 5300 }, { "epoch": 0.5946248600223965, "grad_norm": 16.89434242248535, "learning_rate": 4.009145203434118e-05, "loss": 2.2267, "step": 5310 }, { "epoch": 0.5957446808510638, "grad_norm": 6.977470397949219, "learning_rate": 4.007278835386338e-05, "loss": 2.954, "step": 5320 }, { "epoch": 0.5968645016797313, "grad_norm": 2.928067922592163, "learning_rate": 4.0054124673385595e-05, "loss": 1.9651, "step": 5330 }, { "epoch": 0.5979843225083986, "grad_norm": 2.7120723724365234, "learning_rate": 4.00354609929078e-05, "loss": 2.5949, "step": 5340 }, { "epoch": 0.5991041433370661, "grad_norm": 2.3959896564483643, "learning_rate": 4.001679731243002e-05, "loss": 2.5269, "step": 5350 }, { "epoch": 0.6002239641657335, "grad_norm": 4.766486644744873, "learning_rate": 3.999813363195222e-05, "loss": 2.1063, "step": 5360 }, { "epoch": 0.6013437849944009, "grad_norm": 2.9101717472076416, "learning_rate": 3.9979469951474435e-05, "loss": 2.4363, "step": 5370 }, { "epoch": 0.6024636058230683, "grad_norm": 2.3980298042297363, "learning_rate": 3.996080627099664e-05, "loss": 2.166, "step": 5380 }, { "epoch": 0.6035834266517357, "grad_norm": 3.109349012374878, "learning_rate": 3.994214259051885e-05, "loss": 2.3349, "step": 5390 }, { "epoch": 0.6047032474804032, "grad_norm": 3.364403486251831, "learning_rate": 3.992347891004106e-05, "loss": 2.036, "step": 5400 }, { "epoch": 0.6058230683090705, "grad_norm": 6.84296989440918, "learning_rate": 3.990481522956327e-05, "loss": 2.1545, "step": 5410 }, { "epoch": 0.606942889137738, "grad_norm": 3.494910717010498, "learning_rate": 3.988615154908548e-05, "loss": 2.0747, "step": 5420 }, { "epoch": 0.6080627099664053, "grad_norm": 11.233692169189453, "learning_rate": 3.986748786860769e-05, "loss": 2.1933, "step": 5430 }, { "epoch": 0.6091825307950728, "grad_norm": 2.6794285774230957, "learning_rate": 3.98488241881299e-05, "loss": 1.9889, "step": 5440 }, { "epoch": 0.6103023516237402, "grad_norm": 6.740621089935303, "learning_rate": 3.983016050765211e-05, "loss": 2.4853, "step": 5450 }, { "epoch": 0.6114221724524076, "grad_norm": 3.250119686126709, "learning_rate": 3.981149682717432e-05, "loss": 2.2843, "step": 5460 }, { "epoch": 0.6125419932810751, "grad_norm": 5.2820940017700195, "learning_rate": 3.979283314669653e-05, "loss": 2.0738, "step": 5470 }, { "epoch": 0.6136618141097424, "grad_norm": 5.155092716217041, "learning_rate": 3.977416946621874e-05, "loss": 2.0983, "step": 5480 }, { "epoch": 0.6147816349384099, "grad_norm": 10.836530685424805, "learning_rate": 3.975550578574095e-05, "loss": 1.8994, "step": 5490 }, { "epoch": 0.6159014557670772, "grad_norm": 4.8996968269348145, "learning_rate": 3.973684210526316e-05, "loss": 2.0153, "step": 5500 }, { "epoch": 0.6170212765957447, "grad_norm": 12.028742790222168, "learning_rate": 3.971817842478537e-05, "loss": 2.4317, "step": 5510 }, { "epoch": 0.6181410974244121, "grad_norm": 10.310006141662598, "learning_rate": 3.969951474430758e-05, "loss": 2.5903, "step": 5520 }, { "epoch": 0.6192609182530795, "grad_norm": 3.7316179275512695, "learning_rate": 3.968085106382979e-05, "loss": 2.148, "step": 5530 }, { "epoch": 0.620380739081747, "grad_norm": 4.745426177978516, "learning_rate": 3.9662187383352e-05, "loss": 2.3905, "step": 5540 }, { "epoch": 0.6215005599104143, "grad_norm": 12.419564247131348, "learning_rate": 3.9643523702874205e-05, "loss": 2.6076, "step": 5550 }, { "epoch": 0.6226203807390818, "grad_norm": 3.6548733711242676, "learning_rate": 3.962486002239642e-05, "loss": 2.2913, "step": 5560 }, { "epoch": 0.6237402015677491, "grad_norm": 2.8344454765319824, "learning_rate": 3.960619634191863e-05, "loss": 2.3774, "step": 5570 }, { "epoch": 0.6248600223964166, "grad_norm": 4.7610321044921875, "learning_rate": 3.958753266144084e-05, "loss": 2.2121, "step": 5580 }, { "epoch": 0.6259798432250839, "grad_norm": 3.1101725101470947, "learning_rate": 3.9568868980963045e-05, "loss": 2.3034, "step": 5590 }, { "epoch": 0.6270996640537514, "grad_norm": 2.6766905784606934, "learning_rate": 3.955020530048526e-05, "loss": 1.7205, "step": 5600 }, { "epoch": 0.6282194848824189, "grad_norm": 3.271083116531372, "learning_rate": 3.953154162000747e-05, "loss": 2.4918, "step": 5610 }, { "epoch": 0.6293393057110862, "grad_norm": 7.914976119995117, "learning_rate": 3.951287793952967e-05, "loss": 2.1463, "step": 5620 }, { "epoch": 0.6304591265397537, "grad_norm": 3.1537246704101562, "learning_rate": 3.9494214259051885e-05, "loss": 2.0338, "step": 5630 }, { "epoch": 0.631578947368421, "grad_norm": 10.18811321258545, "learning_rate": 3.9475550578574096e-05, "loss": 2.1563, "step": 5640 }, { "epoch": 0.6326987681970885, "grad_norm": 11.715261459350586, "learning_rate": 3.945688689809631e-05, "loss": 2.3083, "step": 5650 }, { "epoch": 0.6338185890257558, "grad_norm": 2.7163820266723633, "learning_rate": 3.943822321761851e-05, "loss": 2.3445, "step": 5660 }, { "epoch": 0.6349384098544233, "grad_norm": 6.3636627197265625, "learning_rate": 3.9419559537140725e-05, "loss": 2.2905, "step": 5670 }, { "epoch": 0.6360582306830908, "grad_norm": 3.9701311588287354, "learning_rate": 3.9400895856662936e-05, "loss": 1.8656, "step": 5680 }, { "epoch": 0.6371780515117581, "grad_norm": 5.779101848602295, "learning_rate": 3.938223217618515e-05, "loss": 2.1488, "step": 5690 }, { "epoch": 0.6382978723404256, "grad_norm": 3.038818359375, "learning_rate": 3.936356849570735e-05, "loss": 2.2644, "step": 5700 }, { "epoch": 0.6394176931690929, "grad_norm": 2.684335470199585, "learning_rate": 3.9344904815229565e-05, "loss": 2.1819, "step": 5710 }, { "epoch": 0.6405375139977604, "grad_norm": 11.84406566619873, "learning_rate": 3.9326241134751776e-05, "loss": 2.2329, "step": 5720 }, { "epoch": 0.6416573348264277, "grad_norm": 2.978997230529785, "learning_rate": 3.930757745427399e-05, "loss": 2.2307, "step": 5730 }, { "epoch": 0.6427771556550952, "grad_norm": 3.1249163150787354, "learning_rate": 3.928891377379619e-05, "loss": 1.8239, "step": 5740 }, { "epoch": 0.6438969764837627, "grad_norm": 8.90749454498291, "learning_rate": 3.9270250093318405e-05, "loss": 2.3477, "step": 5750 }, { "epoch": 0.64501679731243, "grad_norm": 8.59403133392334, "learning_rate": 3.925158641284061e-05, "loss": 1.6853, "step": 5760 }, { "epoch": 0.6461366181410975, "grad_norm": 3.567573070526123, "learning_rate": 3.923292273236283e-05, "loss": 1.7603, "step": 5770 }, { "epoch": 0.6472564389697648, "grad_norm": 2.913238286972046, "learning_rate": 3.921425905188503e-05, "loss": 1.9284, "step": 5780 }, { "epoch": 0.6483762597984323, "grad_norm": 5.024287700653076, "learning_rate": 3.9195595371407245e-05, "loss": 1.9221, "step": 5790 }, { "epoch": 0.6494960806270996, "grad_norm": 13.80717945098877, "learning_rate": 3.917693169092945e-05, "loss": 2.1171, "step": 5800 }, { "epoch": 0.6506159014557671, "grad_norm": 2.929304599761963, "learning_rate": 3.915826801045167e-05, "loss": 1.929, "step": 5810 }, { "epoch": 0.6517357222844344, "grad_norm": 2.820366859436035, "learning_rate": 3.913960432997387e-05, "loss": 2.1108, "step": 5820 }, { "epoch": 0.6528555431131019, "grad_norm": 3.1681950092315674, "learning_rate": 3.9120940649496085e-05, "loss": 2.4005, "step": 5830 }, { "epoch": 0.6539753639417694, "grad_norm": 10.097253799438477, "learning_rate": 3.910227696901829e-05, "loss": 1.8262, "step": 5840 }, { "epoch": 0.6550951847704367, "grad_norm": 3.0544557571411133, "learning_rate": 3.90836132885405e-05, "loss": 2.2202, "step": 5850 }, { "epoch": 0.6562150055991042, "grad_norm": 7.928321838378906, "learning_rate": 3.906494960806271e-05, "loss": 1.8896, "step": 5860 }, { "epoch": 0.6573348264277715, "grad_norm": 12.526985168457031, "learning_rate": 3.904628592758492e-05, "loss": 2.263, "step": 5870 }, { "epoch": 0.658454647256439, "grad_norm": 12.4088716506958, "learning_rate": 3.902762224710713e-05, "loss": 2.3226, "step": 5880 }, { "epoch": 0.6595744680851063, "grad_norm": 8.499159812927246, "learning_rate": 3.900895856662934e-05, "loss": 1.9171, "step": 5890 }, { "epoch": 0.6606942889137738, "grad_norm": 6.148478031158447, "learning_rate": 3.899029488615155e-05, "loss": 1.7816, "step": 5900 }, { "epoch": 0.6618141097424413, "grad_norm": 2.6093831062316895, "learning_rate": 3.897163120567376e-05, "loss": 1.874, "step": 5910 }, { "epoch": 0.6629339305711086, "grad_norm": 2.9577527046203613, "learning_rate": 3.895296752519597e-05, "loss": 2.4124, "step": 5920 }, { "epoch": 0.6640537513997761, "grad_norm": 2.769073724746704, "learning_rate": 3.893430384471818e-05, "loss": 2.3388, "step": 5930 }, { "epoch": 0.6651735722284434, "grad_norm": 3.398643970489502, "learning_rate": 3.891564016424039e-05, "loss": 2.2894, "step": 5940 }, { "epoch": 0.6662933930571109, "grad_norm": 3.1375699043273926, "learning_rate": 3.88969764837626e-05, "loss": 2.4933, "step": 5950 }, { "epoch": 0.6674132138857782, "grad_norm": 8.086012840270996, "learning_rate": 3.887831280328481e-05, "loss": 2.1847, "step": 5960 }, { "epoch": 0.6685330347144457, "grad_norm": 7.541558742523193, "learning_rate": 3.8859649122807015e-05, "loss": 2.0018, "step": 5970 }, { "epoch": 0.6696528555431132, "grad_norm": 8.947002410888672, "learning_rate": 3.884098544232923e-05, "loss": 2.3184, "step": 5980 }, { "epoch": 0.6707726763717805, "grad_norm": 9.351658821105957, "learning_rate": 3.882232176185144e-05, "loss": 2.3077, "step": 5990 }, { "epoch": 0.671892497200448, "grad_norm": 6.452417850494385, "learning_rate": 3.880365808137365e-05, "loss": 1.9899, "step": 6000 }, { "epoch": 0.6730123180291153, "grad_norm": 7.518797874450684, "learning_rate": 3.8784994400895855e-05, "loss": 2.3504, "step": 6010 }, { "epoch": 0.6741321388577828, "grad_norm": 11.191749572753906, "learning_rate": 3.876633072041807e-05, "loss": 2.2785, "step": 6020 }, { "epoch": 0.6752519596864501, "grad_norm": 3.318284273147583, "learning_rate": 3.874766703994028e-05, "loss": 2.1379, "step": 6030 }, { "epoch": 0.6763717805151176, "grad_norm": 11.762707710266113, "learning_rate": 3.872900335946249e-05, "loss": 2.1742, "step": 6040 }, { "epoch": 0.6774916013437849, "grad_norm": 2.576070785522461, "learning_rate": 3.8710339678984695e-05, "loss": 2.033, "step": 6050 }, { "epoch": 0.6786114221724524, "grad_norm": 3.21813702583313, "learning_rate": 3.869167599850691e-05, "loss": 2.1262, "step": 6060 }, { "epoch": 0.6797312430011199, "grad_norm": 5.945693016052246, "learning_rate": 3.867301231802912e-05, "loss": 2.3859, "step": 6070 }, { "epoch": 0.6808510638297872, "grad_norm": 4.385049819946289, "learning_rate": 3.865434863755133e-05, "loss": 1.8462, "step": 6080 }, { "epoch": 0.6819708846584547, "grad_norm": 3.1201934814453125, "learning_rate": 3.8635684957073535e-05, "loss": 2.2583, "step": 6090 }, { "epoch": 0.683090705487122, "grad_norm": 3.0420446395874023, "learning_rate": 3.8617021276595746e-05, "loss": 2.3901, "step": 6100 }, { "epoch": 0.6842105263157895, "grad_norm": 2.6930365562438965, "learning_rate": 3.859835759611796e-05, "loss": 2.2662, "step": 6110 }, { "epoch": 0.6853303471444568, "grad_norm": 4.4401984214782715, "learning_rate": 3.857969391564016e-05, "loss": 2.472, "step": 6120 }, { "epoch": 0.6864501679731243, "grad_norm": 2.8523037433624268, "learning_rate": 3.8561030235162375e-05, "loss": 1.9654, "step": 6130 }, { "epoch": 0.6875699888017918, "grad_norm": 13.0241060256958, "learning_rate": 3.8542366554684587e-05, "loss": 2.3012, "step": 6140 }, { "epoch": 0.6886898096304591, "grad_norm": 3.5604312419891357, "learning_rate": 3.85237028742068e-05, "loss": 2.0856, "step": 6150 }, { "epoch": 0.6898096304591266, "grad_norm": 2.850850820541382, "learning_rate": 3.8505039193729e-05, "loss": 2.0274, "step": 6160 }, { "epoch": 0.6909294512877939, "grad_norm": 8.735082626342773, "learning_rate": 3.8486375513251215e-05, "loss": 1.6773, "step": 6170 }, { "epoch": 0.6920492721164614, "grad_norm": 3.8772952556610107, "learning_rate": 3.8467711832773427e-05, "loss": 2.5705, "step": 6180 }, { "epoch": 0.6931690929451287, "grad_norm": 8.324105262756348, "learning_rate": 3.844904815229564e-05, "loss": 2.331, "step": 6190 }, { "epoch": 0.6942889137737962, "grad_norm": 3.392038583755493, "learning_rate": 3.843038447181784e-05, "loss": 2.4718, "step": 6200 }, { "epoch": 0.6954087346024636, "grad_norm": 3.956043004989624, "learning_rate": 3.8411720791340055e-05, "loss": 2.0328, "step": 6210 }, { "epoch": 0.696528555431131, "grad_norm": 3.05572247505188, "learning_rate": 3.839305711086226e-05, "loss": 1.9317, "step": 6220 }, { "epoch": 0.6976483762597985, "grad_norm": 3.7026829719543457, "learning_rate": 3.837439343038448e-05, "loss": 2.1722, "step": 6230 }, { "epoch": 0.6987681970884658, "grad_norm": 7.879319190979004, "learning_rate": 3.835572974990668e-05, "loss": 1.8841, "step": 6240 }, { "epoch": 0.6998880179171333, "grad_norm": 11.358241081237793, "learning_rate": 3.8337066069428895e-05, "loss": 2.5414, "step": 6250 }, { "epoch": 0.7010078387458006, "grad_norm": 2.9824588298797607, "learning_rate": 3.83184023889511e-05, "loss": 2.1727, "step": 6260 }, { "epoch": 0.7021276595744681, "grad_norm": 11.302024841308594, "learning_rate": 3.829973870847332e-05, "loss": 2.2552, "step": 6270 }, { "epoch": 0.7032474804031354, "grad_norm": 13.782984733581543, "learning_rate": 3.828107502799552e-05, "loss": 2.3492, "step": 6280 }, { "epoch": 0.7043673012318029, "grad_norm": 2.993431329727173, "learning_rate": 3.8262411347517735e-05, "loss": 1.8148, "step": 6290 }, { "epoch": 0.7054871220604704, "grad_norm": 7.389340400695801, "learning_rate": 3.824374766703994e-05, "loss": 1.9859, "step": 6300 }, { "epoch": 0.7066069428891377, "grad_norm": 9.87246036529541, "learning_rate": 3.822508398656215e-05, "loss": 1.997, "step": 6310 }, { "epoch": 0.7077267637178052, "grad_norm": 3.7313337326049805, "learning_rate": 3.820642030608436e-05, "loss": 2.1025, "step": 6320 }, { "epoch": 0.7088465845464725, "grad_norm": 12.712873458862305, "learning_rate": 3.818775662560657e-05, "loss": 2.1316, "step": 6330 }, { "epoch": 0.70996640537514, "grad_norm": 3.423029661178589, "learning_rate": 3.816909294512878e-05, "loss": 1.8786, "step": 6340 }, { "epoch": 0.7110862262038073, "grad_norm": 2.487156391143799, "learning_rate": 3.815042926465099e-05, "loss": 2.2605, "step": 6350 }, { "epoch": 0.7122060470324748, "grad_norm": 3.7588279247283936, "learning_rate": 3.81317655841732e-05, "loss": 2.1991, "step": 6360 }, { "epoch": 0.7133258678611423, "grad_norm": 3.227130174636841, "learning_rate": 3.811310190369541e-05, "loss": 2.4743, "step": 6370 }, { "epoch": 0.7144456886898096, "grad_norm": 7.917088985443115, "learning_rate": 3.809443822321762e-05, "loss": 2.1551, "step": 6380 }, { "epoch": 0.7155655095184771, "grad_norm": 2.7183947563171387, "learning_rate": 3.807577454273983e-05, "loss": 2.188, "step": 6390 }, { "epoch": 0.7166853303471444, "grad_norm": 3.0164196491241455, "learning_rate": 3.805711086226204e-05, "loss": 2.1092, "step": 6400 }, { "epoch": 0.7178051511758119, "grad_norm": 3.0733413696289062, "learning_rate": 3.803844718178425e-05, "loss": 2.0739, "step": 6410 }, { "epoch": 0.7189249720044792, "grad_norm": 3.4838147163391113, "learning_rate": 3.801978350130646e-05, "loss": 2.5713, "step": 6420 }, { "epoch": 0.7200447928331467, "grad_norm": 7.422530174255371, "learning_rate": 3.8001119820828665e-05, "loss": 2.1976, "step": 6430 }, { "epoch": 0.7211646136618141, "grad_norm": 3.010465383529663, "learning_rate": 3.798245614035088e-05, "loss": 2.1626, "step": 6440 }, { "epoch": 0.7222844344904815, "grad_norm": 2.360297918319702, "learning_rate": 3.796379245987309e-05, "loss": 1.8999, "step": 6450 }, { "epoch": 0.723404255319149, "grad_norm": 4.043231964111328, "learning_rate": 3.79451287793953e-05, "loss": 2.4137, "step": 6460 }, { "epoch": 0.7245240761478163, "grad_norm": 13.390426635742188, "learning_rate": 3.7926465098917505e-05, "loss": 2.3208, "step": 6470 }, { "epoch": 0.7256438969764838, "grad_norm": 3.9308841228485107, "learning_rate": 3.790780141843972e-05, "loss": 2.2158, "step": 6480 }, { "epoch": 0.7267637178051511, "grad_norm": 7.398802280426025, "learning_rate": 3.788913773796193e-05, "loss": 2.3551, "step": 6490 }, { "epoch": 0.7278835386338186, "grad_norm": 9.143538475036621, "learning_rate": 3.787047405748414e-05, "loss": 2.2071, "step": 6500 }, { "epoch": 0.729003359462486, "grad_norm": 3.0637400150299072, "learning_rate": 3.7851810377006345e-05, "loss": 2.0304, "step": 6510 }, { "epoch": 0.7301231802911534, "grad_norm": 3.206883668899536, "learning_rate": 3.7833146696528557e-05, "loss": 2.2099, "step": 6520 }, { "epoch": 0.7312430011198209, "grad_norm": 10.21704387664795, "learning_rate": 3.781448301605077e-05, "loss": 2.4963, "step": 6530 }, { "epoch": 0.7323628219484882, "grad_norm": 11.84740161895752, "learning_rate": 3.779581933557298e-05, "loss": 2.342, "step": 6540 }, { "epoch": 0.7334826427771557, "grad_norm": 9.463152885437012, "learning_rate": 3.7777155655095185e-05, "loss": 2.2016, "step": 6550 }, { "epoch": 0.734602463605823, "grad_norm": 3.3957138061523438, "learning_rate": 3.7758491974617397e-05, "loss": 2.285, "step": 6560 }, { "epoch": 0.7357222844344905, "grad_norm": 3.3638765811920166, "learning_rate": 3.773982829413961e-05, "loss": 2.2848, "step": 6570 }, { "epoch": 0.7368421052631579, "grad_norm": 8.35213565826416, "learning_rate": 3.772116461366181e-05, "loss": 2.1612, "step": 6580 }, { "epoch": 0.7379619260918253, "grad_norm": 7.612375259399414, "learning_rate": 3.7702500933184025e-05, "loss": 2.2899, "step": 6590 }, { "epoch": 0.7390817469204927, "grad_norm": 7.650971412658691, "learning_rate": 3.7683837252706237e-05, "loss": 2.0612, "step": 6600 }, { "epoch": 0.7402015677491601, "grad_norm": 3.540432929992676, "learning_rate": 3.766517357222845e-05, "loss": 2.0103, "step": 6610 }, { "epoch": 0.7413213885778276, "grad_norm": 8.57761001586914, "learning_rate": 3.764650989175065e-05, "loss": 2.3906, "step": 6620 }, { "epoch": 0.7424412094064949, "grad_norm": 11.053874969482422, "learning_rate": 3.7627846211272865e-05, "loss": 2.2103, "step": 6630 }, { "epoch": 0.7435610302351624, "grad_norm": 8.416523933410645, "learning_rate": 3.760918253079507e-05, "loss": 2.3171, "step": 6640 }, { "epoch": 0.7446808510638298, "grad_norm": 3.172659397125244, "learning_rate": 3.759051885031729e-05, "loss": 2.0747, "step": 6650 }, { "epoch": 0.7458006718924972, "grad_norm": 3.0577268600463867, "learning_rate": 3.757185516983949e-05, "loss": 1.7781, "step": 6660 }, { "epoch": 0.7469204927211646, "grad_norm": 2.7276387214660645, "learning_rate": 3.7553191489361705e-05, "loss": 2.3419, "step": 6670 }, { "epoch": 0.748040313549832, "grad_norm": 6.0162529945373535, "learning_rate": 3.753452780888391e-05, "loss": 2.047, "step": 6680 }, { "epoch": 0.7491601343784995, "grad_norm": 8.213367462158203, "learning_rate": 3.751586412840613e-05, "loss": 2.1848, "step": 6690 }, { "epoch": 0.7502799552071668, "grad_norm": 2.8683581352233887, "learning_rate": 3.749720044792833e-05, "loss": 2.3152, "step": 6700 }, { "epoch": 0.7513997760358343, "grad_norm": 7.736207485198975, "learning_rate": 3.7478536767450545e-05, "loss": 2.2199, "step": 6710 }, { "epoch": 0.7525195968645016, "grad_norm": 13.607484817504883, "learning_rate": 3.745987308697275e-05, "loss": 2.0759, "step": 6720 }, { "epoch": 0.7536394176931691, "grad_norm": 12.661739349365234, "learning_rate": 3.744120940649496e-05, "loss": 2.1309, "step": 6730 }, { "epoch": 0.7547592385218365, "grad_norm": 7.0321364402771, "learning_rate": 3.742254572601717e-05, "loss": 2.2709, "step": 6740 }, { "epoch": 0.7558790593505039, "grad_norm": 7.621607780456543, "learning_rate": 3.7403882045539385e-05, "loss": 1.9236, "step": 6750 }, { "epoch": 0.7569988801791714, "grad_norm": 9.952698707580566, "learning_rate": 3.738521836506159e-05, "loss": 2.2815, "step": 6760 }, { "epoch": 0.7581187010078387, "grad_norm": 3.3789877891540527, "learning_rate": 3.73665546845838e-05, "loss": 1.7131, "step": 6770 }, { "epoch": 0.7592385218365062, "grad_norm": 5.496334075927734, "learning_rate": 3.734789100410601e-05, "loss": 2.07, "step": 6780 }, { "epoch": 0.7603583426651735, "grad_norm": 6.971884250640869, "learning_rate": 3.732922732362822e-05, "loss": 2.0964, "step": 6790 }, { "epoch": 0.761478163493841, "grad_norm": 8.502189636230469, "learning_rate": 3.731056364315043e-05, "loss": 2.1573, "step": 6800 }, { "epoch": 0.7625979843225084, "grad_norm": 12.738436698913574, "learning_rate": 3.729189996267264e-05, "loss": 2.1589, "step": 6810 }, { "epoch": 0.7637178051511758, "grad_norm": 4.9455790519714355, "learning_rate": 3.727323628219485e-05, "loss": 2.1109, "step": 6820 }, { "epoch": 0.7648376259798432, "grad_norm": 4.243088245391846, "learning_rate": 3.725457260171706e-05, "loss": 1.96, "step": 6830 }, { "epoch": 0.7659574468085106, "grad_norm": 3.8965704441070557, "learning_rate": 3.723590892123927e-05, "loss": 2.0415, "step": 6840 }, { "epoch": 0.7670772676371781, "grad_norm": 3.0561602115631104, "learning_rate": 3.7217245240761475e-05, "loss": 1.8753, "step": 6850 }, { "epoch": 0.7681970884658454, "grad_norm": 3.344120979309082, "learning_rate": 3.719858156028369e-05, "loss": 2.3332, "step": 6860 }, { "epoch": 0.7693169092945129, "grad_norm": 15.658031463623047, "learning_rate": 3.71799178798059e-05, "loss": 1.8573, "step": 6870 }, { "epoch": 0.7704367301231803, "grad_norm": 13.487674713134766, "learning_rate": 3.716125419932811e-05, "loss": 2.1022, "step": 6880 }, { "epoch": 0.7715565509518477, "grad_norm": 4.366361141204834, "learning_rate": 3.7142590518850315e-05, "loss": 2.1056, "step": 6890 }, { "epoch": 0.7726763717805151, "grad_norm": 6.940586566925049, "learning_rate": 3.712392683837253e-05, "loss": 1.7214, "step": 6900 }, { "epoch": 0.7737961926091825, "grad_norm": 3.119396448135376, "learning_rate": 3.710526315789474e-05, "loss": 2.4966, "step": 6910 }, { "epoch": 0.77491601343785, "grad_norm": 2.804882287979126, "learning_rate": 3.708659947741695e-05, "loss": 1.9811, "step": 6920 }, { "epoch": 0.7760358342665173, "grad_norm": 6.220757484436035, "learning_rate": 3.7067935796939155e-05, "loss": 2.0228, "step": 6930 }, { "epoch": 0.7771556550951848, "grad_norm": 7.664346218109131, "learning_rate": 3.7049272116461367e-05, "loss": 2.375, "step": 6940 }, { "epoch": 0.7782754759238522, "grad_norm": 3.2381927967071533, "learning_rate": 3.703060843598358e-05, "loss": 2.1603, "step": 6950 }, { "epoch": 0.7793952967525196, "grad_norm": 2.67271089553833, "learning_rate": 3.701194475550579e-05, "loss": 1.975, "step": 6960 }, { "epoch": 0.780515117581187, "grad_norm": 8.897006034851074, "learning_rate": 3.6993281075027995e-05, "loss": 2.1017, "step": 6970 }, { "epoch": 0.7816349384098544, "grad_norm": 4.098658084869385, "learning_rate": 3.6974617394550207e-05, "loss": 2.0624, "step": 6980 }, { "epoch": 0.7827547592385219, "grad_norm": 9.428001403808594, "learning_rate": 3.695595371407242e-05, "loss": 1.9006, "step": 6990 }, { "epoch": 0.7838745800671892, "grad_norm": 3.8001720905303955, "learning_rate": 3.693729003359463e-05, "loss": 2.2506, "step": 7000 }, { "epoch": 0.7849944008958567, "grad_norm": 14.967480659484863, "learning_rate": 3.6918626353116835e-05, "loss": 2.4808, "step": 7010 }, { "epoch": 0.786114221724524, "grad_norm": 5.58108377456665, "learning_rate": 3.6899962672639047e-05, "loss": 2.5445, "step": 7020 }, { "epoch": 0.7872340425531915, "grad_norm": 4.169144153594971, "learning_rate": 3.688129899216126e-05, "loss": 2.2701, "step": 7030 }, { "epoch": 0.7883538633818589, "grad_norm": 3.629635810852051, "learning_rate": 3.686263531168346e-05, "loss": 2.0769, "step": 7040 }, { "epoch": 0.7894736842105263, "grad_norm": 3.2318367958068848, "learning_rate": 3.6843971631205675e-05, "loss": 1.9304, "step": 7050 }, { "epoch": 0.7905935050391937, "grad_norm": 3.569641590118408, "learning_rate": 3.682530795072788e-05, "loss": 1.9362, "step": 7060 }, { "epoch": 0.7917133258678611, "grad_norm": 3.6073529720306396, "learning_rate": 3.68066442702501e-05, "loss": 2.1551, "step": 7070 }, { "epoch": 0.7928331466965286, "grad_norm": 2.949209690093994, "learning_rate": 3.67879805897723e-05, "loss": 1.7599, "step": 7080 }, { "epoch": 0.793952967525196, "grad_norm": 7.541772842407227, "learning_rate": 3.6769316909294515e-05, "loss": 2.2318, "step": 7090 }, { "epoch": 0.7950727883538634, "grad_norm": 3.108989953994751, "learning_rate": 3.675065322881672e-05, "loss": 2.1161, "step": 7100 }, { "epoch": 0.7961926091825308, "grad_norm": 2.859032392501831, "learning_rate": 3.673198954833894e-05, "loss": 2.2659, "step": 7110 }, { "epoch": 0.7973124300111982, "grad_norm": 4.491294860839844, "learning_rate": 3.671332586786114e-05, "loss": 1.4341, "step": 7120 }, { "epoch": 0.7984322508398656, "grad_norm": 8.079992294311523, "learning_rate": 3.6694662187383355e-05, "loss": 2.1608, "step": 7130 }, { "epoch": 0.799552071668533, "grad_norm": 3.3629186153411865, "learning_rate": 3.667599850690556e-05, "loss": 2.0803, "step": 7140 }, { "epoch": 0.8006718924972005, "grad_norm": 7.034578800201416, "learning_rate": 3.665733482642777e-05, "loss": 2.2975, "step": 7150 }, { "epoch": 0.8017917133258678, "grad_norm": 3.3165249824523926, "learning_rate": 3.663867114594998e-05, "loss": 2.0453, "step": 7160 }, { "epoch": 0.8029115341545353, "grad_norm": 11.479082107543945, "learning_rate": 3.6620007465472195e-05, "loss": 1.9926, "step": 7170 }, { "epoch": 0.8040313549832027, "grad_norm": 2.8620989322662354, "learning_rate": 3.66013437849944e-05, "loss": 2.0147, "step": 7180 }, { "epoch": 0.8051511758118701, "grad_norm": 9.503447532653809, "learning_rate": 3.658268010451661e-05, "loss": 2.1835, "step": 7190 }, { "epoch": 0.8062709966405375, "grad_norm": 2.9380719661712646, "learning_rate": 3.656401642403882e-05, "loss": 1.9345, "step": 7200 }, { "epoch": 0.8073908174692049, "grad_norm": 9.880309104919434, "learning_rate": 3.6545352743561035e-05, "loss": 2.1885, "step": 7210 }, { "epoch": 0.8085106382978723, "grad_norm": 8.49301528930664, "learning_rate": 3.652668906308324e-05, "loss": 1.7974, "step": 7220 }, { "epoch": 0.8096304591265397, "grad_norm": 7.494529724121094, "learning_rate": 3.650802538260545e-05, "loss": 2.1994, "step": 7230 }, { "epoch": 0.8107502799552072, "grad_norm": 2.999682664871216, "learning_rate": 3.648936170212766e-05, "loss": 2.2821, "step": 7240 }, { "epoch": 0.8118701007838746, "grad_norm": 2.5797007083892822, "learning_rate": 3.647069802164987e-05, "loss": 2.1872, "step": 7250 }, { "epoch": 0.812989921612542, "grad_norm": 9.615920066833496, "learning_rate": 3.645203434117208e-05, "loss": 2.3596, "step": 7260 }, { "epoch": 0.8141097424412094, "grad_norm": 8.524604797363281, "learning_rate": 3.6433370660694285e-05, "loss": 2.6097, "step": 7270 }, { "epoch": 0.8152295632698768, "grad_norm": 7.994124889373779, "learning_rate": 3.64147069802165e-05, "loss": 1.7589, "step": 7280 }, { "epoch": 0.8163493840985442, "grad_norm": 2.902440071105957, "learning_rate": 3.639604329973871e-05, "loss": 2.1298, "step": 7290 }, { "epoch": 0.8174692049272116, "grad_norm": 2.9455184936523438, "learning_rate": 3.637737961926092e-05, "loss": 2.2527, "step": 7300 }, { "epoch": 0.8185890257558791, "grad_norm": 7.609272003173828, "learning_rate": 3.6358715938783125e-05, "loss": 2.2256, "step": 7310 }, { "epoch": 0.8197088465845465, "grad_norm": 3.6720242500305176, "learning_rate": 3.634005225830534e-05, "loss": 2.034, "step": 7320 }, { "epoch": 0.8208286674132139, "grad_norm": 6.270810604095459, "learning_rate": 3.632138857782755e-05, "loss": 1.9424, "step": 7330 }, { "epoch": 0.8219484882418813, "grad_norm": 9.397404670715332, "learning_rate": 3.630272489734976e-05, "loss": 2.0945, "step": 7340 }, { "epoch": 0.8230683090705487, "grad_norm": 3.0468692779541016, "learning_rate": 3.6284061216871965e-05, "loss": 2.382, "step": 7350 }, { "epoch": 0.8241881298992161, "grad_norm": 5.612720966339111, "learning_rate": 3.626539753639418e-05, "loss": 1.9622, "step": 7360 }, { "epoch": 0.8253079507278835, "grad_norm": 4.7055983543396, "learning_rate": 3.624673385591639e-05, "loss": 2.2322, "step": 7370 }, { "epoch": 0.826427771556551, "grad_norm": 4.574550628662109, "learning_rate": 3.62280701754386e-05, "loss": 2.0364, "step": 7380 }, { "epoch": 0.8275475923852184, "grad_norm": 9.704349517822266, "learning_rate": 3.6209406494960805e-05, "loss": 2.6025, "step": 7390 }, { "epoch": 0.8286674132138858, "grad_norm": 3.6313247680664062, "learning_rate": 3.6190742814483017e-05, "loss": 2.0449, "step": 7400 }, { "epoch": 0.8297872340425532, "grad_norm": 5.157100200653076, "learning_rate": 3.617207913400523e-05, "loss": 2.0293, "step": 7410 }, { "epoch": 0.8309070548712206, "grad_norm": 11.102890968322754, "learning_rate": 3.615341545352744e-05, "loss": 2.3059, "step": 7420 }, { "epoch": 0.832026875699888, "grad_norm": 6.320305347442627, "learning_rate": 3.6134751773049645e-05, "loss": 2.1117, "step": 7430 }, { "epoch": 0.8331466965285554, "grad_norm": 9.381714820861816, "learning_rate": 3.611608809257186e-05, "loss": 2.5279, "step": 7440 }, { "epoch": 0.8342665173572228, "grad_norm": 3.505153179168701, "learning_rate": 3.609742441209407e-05, "loss": 2.176, "step": 7450 }, { "epoch": 0.8353863381858903, "grad_norm": 6.633389472961426, "learning_rate": 3.607876073161628e-05, "loss": 2.1151, "step": 7460 }, { "epoch": 0.8365061590145577, "grad_norm": 2.6333770751953125, "learning_rate": 3.6060097051138485e-05, "loss": 2.4062, "step": 7470 }, { "epoch": 0.8376259798432251, "grad_norm": 3.540119171142578, "learning_rate": 3.60414333706607e-05, "loss": 2.1977, "step": 7480 }, { "epoch": 0.8387458006718925, "grad_norm": 2.524616003036499, "learning_rate": 3.602276969018291e-05, "loss": 2.1205, "step": 7490 }, { "epoch": 0.8398656215005599, "grad_norm": 6.62229061126709, "learning_rate": 3.600410600970511e-05, "loss": 2.2605, "step": 7500 }, { "epoch": 0.8409854423292273, "grad_norm": 3.3816375732421875, "learning_rate": 3.5985442329227325e-05, "loss": 2.0532, "step": 7510 }, { "epoch": 0.8421052631578947, "grad_norm": 2.873293161392212, "learning_rate": 3.596677864874953e-05, "loss": 2.251, "step": 7520 }, { "epoch": 0.8432250839865622, "grad_norm": 8.764281272888184, "learning_rate": 3.594811496827175e-05, "loss": 2.2559, "step": 7530 }, { "epoch": 0.8443449048152296, "grad_norm": 11.748472213745117, "learning_rate": 3.592945128779395e-05, "loss": 1.9883, "step": 7540 }, { "epoch": 0.845464725643897, "grad_norm": 8.871268272399902, "learning_rate": 3.5910787607316165e-05, "loss": 2.2952, "step": 7550 }, { "epoch": 0.8465845464725644, "grad_norm": 3.4777164459228516, "learning_rate": 3.589212392683837e-05, "loss": 2.1871, "step": 7560 }, { "epoch": 0.8477043673012318, "grad_norm": 13.322107315063477, "learning_rate": 3.587346024636059e-05, "loss": 2.2841, "step": 7570 }, { "epoch": 0.8488241881298992, "grad_norm": 3.384903907775879, "learning_rate": 3.585479656588279e-05, "loss": 1.7451, "step": 7580 }, { "epoch": 0.8499440089585666, "grad_norm": 3.1881563663482666, "learning_rate": 3.5836132885405005e-05, "loss": 2.3855, "step": 7590 }, { "epoch": 0.851063829787234, "grad_norm": 8.006708145141602, "learning_rate": 3.581746920492721e-05, "loss": 2.0241, "step": 7600 }, { "epoch": 0.8521836506159015, "grad_norm": 3.468590259552002, "learning_rate": 3.579880552444942e-05, "loss": 2.3348, "step": 7610 }, { "epoch": 0.8533034714445689, "grad_norm": 8.839496612548828, "learning_rate": 3.578014184397163e-05, "loss": 2.0181, "step": 7620 }, { "epoch": 0.8544232922732363, "grad_norm": 2.8391735553741455, "learning_rate": 3.5761478163493845e-05, "loss": 2.2046, "step": 7630 }, { "epoch": 0.8555431131019037, "grad_norm": 2.341062068939209, "learning_rate": 3.574281448301605e-05, "loss": 2.509, "step": 7640 }, { "epoch": 0.8566629339305711, "grad_norm": 4.910477161407471, "learning_rate": 3.572415080253826e-05, "loss": 2.4332, "step": 7650 }, { "epoch": 0.8577827547592385, "grad_norm": 9.427245140075684, "learning_rate": 3.570548712206047e-05, "loss": 2.2668, "step": 7660 }, { "epoch": 0.858902575587906, "grad_norm": 2.9346938133239746, "learning_rate": 3.5686823441582685e-05, "loss": 1.8698, "step": 7670 }, { "epoch": 0.8600223964165733, "grad_norm": 3.292447328567505, "learning_rate": 3.566815976110489e-05, "loss": 2.1736, "step": 7680 }, { "epoch": 0.8611422172452408, "grad_norm": 3.89292573928833, "learning_rate": 3.56494960806271e-05, "loss": 2.4429, "step": 7690 }, { "epoch": 0.8622620380739082, "grad_norm": 8.658332824707031, "learning_rate": 3.563083240014931e-05, "loss": 2.1728, "step": 7700 }, { "epoch": 0.8633818589025756, "grad_norm": 8.725335121154785, "learning_rate": 3.5612168719671525e-05, "loss": 2.0126, "step": 7710 }, { "epoch": 0.864501679731243, "grad_norm": 2.872495412826538, "learning_rate": 3.559350503919373e-05, "loss": 2.4384, "step": 7720 }, { "epoch": 0.8656215005599104, "grad_norm": 17.420711517333984, "learning_rate": 3.5574841358715935e-05, "loss": 1.8937, "step": 7730 }, { "epoch": 0.8667413213885778, "grad_norm": 7.1470489501953125, "learning_rate": 3.555617767823815e-05, "loss": 1.6198, "step": 7740 }, { "epoch": 0.8678611422172452, "grad_norm": 9.926697731018066, "learning_rate": 3.553751399776036e-05, "loss": 1.9396, "step": 7750 }, { "epoch": 0.8689809630459127, "grad_norm": 3.2263200283050537, "learning_rate": 3.551885031728257e-05, "loss": 2.3201, "step": 7760 }, { "epoch": 0.8701007838745801, "grad_norm": 7.058889865875244, "learning_rate": 3.5500186636804775e-05, "loss": 2.1745, "step": 7770 }, { "epoch": 0.8712206047032475, "grad_norm": 12.746253967285156, "learning_rate": 3.548152295632699e-05, "loss": 2.4713, "step": 7780 }, { "epoch": 0.8723404255319149, "grad_norm": 10.826375961303711, "learning_rate": 3.54628592758492e-05, "loss": 1.9633, "step": 7790 }, { "epoch": 0.8734602463605823, "grad_norm": 3.0302278995513916, "learning_rate": 3.544419559537141e-05, "loss": 2.3689, "step": 7800 }, { "epoch": 0.8745800671892497, "grad_norm": 2.6338868141174316, "learning_rate": 3.5425531914893615e-05, "loss": 1.6076, "step": 7810 }, { "epoch": 0.8756998880179171, "grad_norm": 2.724898099899292, "learning_rate": 3.540686823441583e-05, "loss": 1.6231, "step": 7820 }, { "epoch": 0.8768197088465846, "grad_norm": 5.752689361572266, "learning_rate": 3.538820455393804e-05, "loss": 1.8717, "step": 7830 }, { "epoch": 0.8779395296752519, "grad_norm": 3.7347118854522705, "learning_rate": 3.536954087346025e-05, "loss": 2.1681, "step": 7840 }, { "epoch": 0.8790593505039194, "grad_norm": 6.816357612609863, "learning_rate": 3.5350877192982455e-05, "loss": 1.6885, "step": 7850 }, { "epoch": 0.8801791713325868, "grad_norm": 10.730504989624023, "learning_rate": 3.533221351250467e-05, "loss": 2.3201, "step": 7860 }, { "epoch": 0.8812989921612542, "grad_norm": 10.225847244262695, "learning_rate": 3.531354983202688e-05, "loss": 2.307, "step": 7870 }, { "epoch": 0.8824188129899216, "grad_norm": 3.7461752891540527, "learning_rate": 3.529488615154909e-05, "loss": 2.4028, "step": 7880 }, { "epoch": 0.883538633818589, "grad_norm": 3.2298667430877686, "learning_rate": 3.5276222471071295e-05, "loss": 2.2726, "step": 7890 }, { "epoch": 0.8846584546472565, "grad_norm": 4.180459976196289, "learning_rate": 3.525755879059351e-05, "loss": 2.3693, "step": 7900 }, { "epoch": 0.8857782754759238, "grad_norm": 3.8476152420043945, "learning_rate": 3.523889511011572e-05, "loss": 1.9672, "step": 7910 }, { "epoch": 0.8868980963045913, "grad_norm": 2.8095366954803467, "learning_rate": 3.522023142963793e-05, "loss": 1.5762, "step": 7920 }, { "epoch": 0.8880179171332587, "grad_norm": 11.424389839172363, "learning_rate": 3.5201567749160135e-05, "loss": 1.9222, "step": 7930 }, { "epoch": 0.8891377379619261, "grad_norm": 10.916204452514648, "learning_rate": 3.518290406868235e-05, "loss": 2.2247, "step": 7940 }, { "epoch": 0.8902575587905935, "grad_norm": 8.619460105895996, "learning_rate": 3.516424038820456e-05, "loss": 2.5038, "step": 7950 }, { "epoch": 0.8913773796192609, "grad_norm": 3.383333444595337, "learning_rate": 3.514557670772676e-05, "loss": 2.0325, "step": 7960 }, { "epoch": 0.8924972004479284, "grad_norm": 2.8794310092926025, "learning_rate": 3.5126913027248975e-05, "loss": 2.2037, "step": 7970 }, { "epoch": 0.8936170212765957, "grad_norm": 3.270104169845581, "learning_rate": 3.510824934677118e-05, "loss": 2.5766, "step": 7980 }, { "epoch": 0.8947368421052632, "grad_norm": 5.58250617980957, "learning_rate": 3.50895856662934e-05, "loss": 2.0883, "step": 7990 }, { "epoch": 0.8958566629339306, "grad_norm": 4.2217488288879395, "learning_rate": 3.50709219858156e-05, "loss": 2.1016, "step": 8000 }, { "epoch": 0.896976483762598, "grad_norm": 5.107589244842529, "learning_rate": 3.5052258305337815e-05, "loss": 1.9658, "step": 8010 }, { "epoch": 0.8980963045912654, "grad_norm": 3.2384800910949707, "learning_rate": 3.503359462486002e-05, "loss": 1.8828, "step": 8020 }, { "epoch": 0.8992161254199328, "grad_norm": 3.6768581867218018, "learning_rate": 3.501493094438223e-05, "loss": 1.6965, "step": 8030 }, { "epoch": 0.9003359462486002, "grad_norm": 3.0174429416656494, "learning_rate": 3.499626726390444e-05, "loss": 2.16, "step": 8040 }, { "epoch": 0.9014557670772676, "grad_norm": 13.087141036987305, "learning_rate": 3.4977603583426655e-05, "loss": 1.7648, "step": 8050 }, { "epoch": 0.9025755879059351, "grad_norm": 11.361166000366211, "learning_rate": 3.495893990294886e-05, "loss": 1.94, "step": 8060 }, { "epoch": 0.9036954087346024, "grad_norm": 3.1637301445007324, "learning_rate": 3.494027622247107e-05, "loss": 2.0952, "step": 8070 }, { "epoch": 0.9048152295632699, "grad_norm": 3.610626459121704, "learning_rate": 3.492161254199328e-05, "loss": 2.4445, "step": 8080 }, { "epoch": 0.9059350503919373, "grad_norm": 3.5841760635375977, "learning_rate": 3.4902948861515495e-05, "loss": 2.3581, "step": 8090 }, { "epoch": 0.9070548712206047, "grad_norm": 8.359783172607422, "learning_rate": 3.48842851810377e-05, "loss": 2.0259, "step": 8100 }, { "epoch": 0.9081746920492721, "grad_norm": 3.30151629447937, "learning_rate": 3.486562150055991e-05, "loss": 2.3721, "step": 8110 }, { "epoch": 0.9092945128779395, "grad_norm": 3.3997480869293213, "learning_rate": 3.484695782008212e-05, "loss": 2.0008, "step": 8120 }, { "epoch": 0.910414333706607, "grad_norm": 4.009676456451416, "learning_rate": 3.4828294139604335e-05, "loss": 1.8577, "step": 8130 }, { "epoch": 0.9115341545352743, "grad_norm": 11.38424015045166, "learning_rate": 3.480963045912654e-05, "loss": 2.1525, "step": 8140 }, { "epoch": 0.9126539753639418, "grad_norm": 3.4985761642456055, "learning_rate": 3.479096677864875e-05, "loss": 2.0185, "step": 8150 }, { "epoch": 0.9137737961926092, "grad_norm": 2.8695907592773438, "learning_rate": 3.4772303098170963e-05, "loss": 2.2721, "step": 8160 }, { "epoch": 0.9148936170212766, "grad_norm": 9.850143432617188, "learning_rate": 3.4753639417693175e-05, "loss": 2.0199, "step": 8170 }, { "epoch": 0.916013437849944, "grad_norm": 6.895715713500977, "learning_rate": 3.473497573721538e-05, "loss": 2.0351, "step": 8180 }, { "epoch": 0.9171332586786114, "grad_norm": 3.088392972946167, "learning_rate": 3.4716312056737585e-05, "loss": 2.1426, "step": 8190 }, { "epoch": 0.9182530795072789, "grad_norm": 2.6038413047790527, "learning_rate": 3.4697648376259803e-05, "loss": 1.9374, "step": 8200 }, { "epoch": 0.9193729003359462, "grad_norm": 8.73396110534668, "learning_rate": 3.467898469578201e-05, "loss": 2.3542, "step": 8210 }, { "epoch": 0.9204927211646137, "grad_norm": 7.529842853546143, "learning_rate": 3.466032101530422e-05, "loss": 2.5576, "step": 8220 }, { "epoch": 0.921612541993281, "grad_norm": 11.439668655395508, "learning_rate": 3.4641657334826425e-05, "loss": 2.3506, "step": 8230 }, { "epoch": 0.9227323628219485, "grad_norm": 11.114765167236328, "learning_rate": 3.462299365434864e-05, "loss": 2.1594, "step": 8240 }, { "epoch": 0.9238521836506159, "grad_norm": 3.631915807723999, "learning_rate": 3.460432997387085e-05, "loss": 1.9301, "step": 8250 }, { "epoch": 0.9249720044792833, "grad_norm": 4.813271522521973, "learning_rate": 3.458566629339306e-05, "loss": 1.7937, "step": 8260 }, { "epoch": 0.9260918253079508, "grad_norm": 9.251919746398926, "learning_rate": 3.4567002612915265e-05, "loss": 2.4907, "step": 8270 }, { "epoch": 0.9272116461366181, "grad_norm": 10.042062759399414, "learning_rate": 3.454833893243748e-05, "loss": 2.3766, "step": 8280 }, { "epoch": 0.9283314669652856, "grad_norm": 5.098442554473877, "learning_rate": 3.452967525195969e-05, "loss": 1.9331, "step": 8290 }, { "epoch": 0.9294512877939529, "grad_norm": 3.054330348968506, "learning_rate": 3.45110115714819e-05, "loss": 1.9556, "step": 8300 }, { "epoch": 0.9305711086226204, "grad_norm": 5.500843524932861, "learning_rate": 3.4492347891004105e-05, "loss": 1.8301, "step": 8310 }, { "epoch": 0.9316909294512878, "grad_norm": 11.334184646606445, "learning_rate": 3.447368421052632e-05, "loss": 2.3866, "step": 8320 }, { "epoch": 0.9328107502799552, "grad_norm": 9.781439781188965, "learning_rate": 3.445502053004853e-05, "loss": 2.3349, "step": 8330 }, { "epoch": 0.9339305711086227, "grad_norm": 5.9633049964904785, "learning_rate": 3.443635684957074e-05, "loss": 1.5215, "step": 8340 }, { "epoch": 0.93505039193729, "grad_norm": 9.052412033081055, "learning_rate": 3.4417693169092945e-05, "loss": 2.2278, "step": 8350 }, { "epoch": 0.9361702127659575, "grad_norm": 15.582505226135254, "learning_rate": 3.439902948861516e-05, "loss": 2.2908, "step": 8360 }, { "epoch": 0.9372900335946248, "grad_norm": 9.649676322937012, "learning_rate": 3.438036580813737e-05, "loss": 2.0235, "step": 8370 }, { "epoch": 0.9384098544232923, "grad_norm": 2.680288314819336, "learning_rate": 3.436170212765958e-05, "loss": 1.9282, "step": 8380 }, { "epoch": 0.9395296752519597, "grad_norm": 3.08258318901062, "learning_rate": 3.4343038447181785e-05, "loss": 2.2415, "step": 8390 }, { "epoch": 0.9406494960806271, "grad_norm": 4.9708380699157715, "learning_rate": 3.4324374766704e-05, "loss": 2.3105, "step": 8400 }, { "epoch": 0.9417693169092946, "grad_norm": 2.7266993522644043, "learning_rate": 3.430571108622621e-05, "loss": 2.2027, "step": 8410 }, { "epoch": 0.9428891377379619, "grad_norm": 10.68362808227539, "learning_rate": 3.428704740574841e-05, "loss": 2.0632, "step": 8420 }, { "epoch": 0.9440089585666294, "grad_norm": 3.0944361686706543, "learning_rate": 3.4268383725270625e-05, "loss": 2.2351, "step": 8430 }, { "epoch": 0.9451287793952967, "grad_norm": 3.2292227745056152, "learning_rate": 3.424972004479283e-05, "loss": 1.9755, "step": 8440 }, { "epoch": 0.9462486002239642, "grad_norm": 16.302453994750977, "learning_rate": 3.423105636431504e-05, "loss": 2.0759, "step": 8450 }, { "epoch": 0.9473684210526315, "grad_norm": 4.625180244445801, "learning_rate": 3.421239268383725e-05, "loss": 2.5148, "step": 8460 }, { "epoch": 0.948488241881299, "grad_norm": 8.57646656036377, "learning_rate": 3.4193729003359465e-05, "loss": 1.9348, "step": 8470 }, { "epoch": 0.9496080627099664, "grad_norm": 3.611316442489624, "learning_rate": 3.417506532288167e-05, "loss": 2.3202, "step": 8480 }, { "epoch": 0.9507278835386338, "grad_norm": 12.828388214111328, "learning_rate": 3.415640164240388e-05, "loss": 2.0078, "step": 8490 }, { "epoch": 0.9518477043673013, "grad_norm": 7.542992115020752, "learning_rate": 3.413773796192609e-05, "loss": 1.8984, "step": 8500 }, { "epoch": 0.9529675251959686, "grad_norm": 10.747339248657227, "learning_rate": 3.4119074281448305e-05, "loss": 2.2511, "step": 8510 }, { "epoch": 0.9540873460246361, "grad_norm": 6.7283453941345215, "learning_rate": 3.410041060097051e-05, "loss": 1.9405, "step": 8520 }, { "epoch": 0.9552071668533034, "grad_norm": 2.935981512069702, "learning_rate": 3.408174692049272e-05, "loss": 2.1033, "step": 8530 }, { "epoch": 0.9563269876819709, "grad_norm": 3.4737389087677, "learning_rate": 3.4063083240014933e-05, "loss": 2.0328, "step": 8540 }, { "epoch": 0.9574468085106383, "grad_norm": 8.525548934936523, "learning_rate": 3.4044419559537145e-05, "loss": 1.8537, "step": 8550 }, { "epoch": 0.9585666293393057, "grad_norm": 6.467761516571045, "learning_rate": 3.402575587905935e-05, "loss": 2.0973, "step": 8560 }, { "epoch": 0.9596864501679732, "grad_norm": 10.39410400390625, "learning_rate": 3.400709219858156e-05, "loss": 2.0941, "step": 8570 }, { "epoch": 0.9608062709966405, "grad_norm": 5.414796829223633, "learning_rate": 3.3988428518103773e-05, "loss": 1.6318, "step": 8580 }, { "epoch": 0.961926091825308, "grad_norm": 2.808164119720459, "learning_rate": 3.3969764837625985e-05, "loss": 1.8583, "step": 8590 }, { "epoch": 0.9630459126539753, "grad_norm": 2.665485382080078, "learning_rate": 3.395110115714819e-05, "loss": 2.1733, "step": 8600 }, { "epoch": 0.9641657334826428, "grad_norm": 3.183068037033081, "learning_rate": 3.39324374766704e-05, "loss": 1.7059, "step": 8610 }, { "epoch": 0.9652855543113102, "grad_norm": 2.7500557899475098, "learning_rate": 3.3913773796192613e-05, "loss": 2.2814, "step": 8620 }, { "epoch": 0.9664053751399776, "grad_norm": 6.916834831237793, "learning_rate": 3.3895110115714825e-05, "loss": 2.0092, "step": 8630 }, { "epoch": 0.9675251959686451, "grad_norm": 11.789180755615234, "learning_rate": 3.387644643523703e-05, "loss": 1.744, "step": 8640 }, { "epoch": 0.9686450167973124, "grad_norm": 12.441953659057617, "learning_rate": 3.3857782754759235e-05, "loss": 2.2952, "step": 8650 }, { "epoch": 0.9697648376259799, "grad_norm": 4.348373889923096, "learning_rate": 3.3839119074281453e-05, "loss": 1.8666, "step": 8660 }, { "epoch": 0.9708846584546472, "grad_norm": 3.6366405487060547, "learning_rate": 3.382045539380366e-05, "loss": 2.3619, "step": 8670 }, { "epoch": 0.9720044792833147, "grad_norm": 5.705763816833496, "learning_rate": 3.380179171332587e-05, "loss": 1.8297, "step": 8680 }, { "epoch": 0.973124300111982, "grad_norm": 8.548418998718262, "learning_rate": 3.3783128032848075e-05, "loss": 1.6297, "step": 8690 }, { "epoch": 0.9742441209406495, "grad_norm": 6.786285877227783, "learning_rate": 3.376446435237029e-05, "loss": 1.6739, "step": 8700 }, { "epoch": 0.975363941769317, "grad_norm": 2.983182430267334, "learning_rate": 3.37458006718925e-05, "loss": 1.8939, "step": 8710 }, { "epoch": 0.9764837625979843, "grad_norm": 7.78575325012207, "learning_rate": 3.372713699141471e-05, "loss": 2.2775, "step": 8720 }, { "epoch": 0.9776035834266518, "grad_norm": 2.9739723205566406, "learning_rate": 3.3708473310936915e-05, "loss": 2.3235, "step": 8730 }, { "epoch": 0.9787234042553191, "grad_norm": 13.118427276611328, "learning_rate": 3.368980963045913e-05, "loss": 2.2344, "step": 8740 }, { "epoch": 0.9798432250839866, "grad_norm": 3.2696194648742676, "learning_rate": 3.367114594998134e-05, "loss": 2.3788, "step": 8750 }, { "epoch": 0.9809630459126539, "grad_norm": 2.9010257720947266, "learning_rate": 3.365248226950355e-05, "loss": 2.0307, "step": 8760 }, { "epoch": 0.9820828667413214, "grad_norm": 3.2224440574645996, "learning_rate": 3.3633818589025755e-05, "loss": 2.0968, "step": 8770 }, { "epoch": 0.9832026875699889, "grad_norm": 9.395108222961426, "learning_rate": 3.361515490854797e-05, "loss": 2.2748, "step": 8780 }, { "epoch": 0.9843225083986562, "grad_norm": 3.0687882900238037, "learning_rate": 3.359649122807018e-05, "loss": 1.7044, "step": 8790 }, { "epoch": 0.9854423292273237, "grad_norm": 3.7267823219299316, "learning_rate": 3.357782754759239e-05, "loss": 1.6726, "step": 8800 }, { "epoch": 0.986562150055991, "grad_norm": 3.8064417839050293, "learning_rate": 3.3559163867114595e-05, "loss": 1.8408, "step": 8810 }, { "epoch": 0.9876819708846585, "grad_norm": 8.669193267822266, "learning_rate": 3.354050018663681e-05, "loss": 1.8474, "step": 8820 }, { "epoch": 0.9888017917133258, "grad_norm": 14.256889343261719, "learning_rate": 3.352183650615902e-05, "loss": 2.0988, "step": 8830 }, { "epoch": 0.9899216125419933, "grad_norm": 10.54806137084961, "learning_rate": 3.350317282568123e-05, "loss": 2.1008, "step": 8840 }, { "epoch": 0.9910414333706606, "grad_norm": 3.6541545391082764, "learning_rate": 3.3484509145203435e-05, "loss": 1.7903, "step": 8850 }, { "epoch": 0.9921612541993281, "grad_norm": 3.3884453773498535, "learning_rate": 3.346584546472565e-05, "loss": 2.0664, "step": 8860 }, { "epoch": 0.9932810750279956, "grad_norm": 3.597472906112671, "learning_rate": 3.344718178424786e-05, "loss": 2.2501, "step": 8870 }, { "epoch": 0.9944008958566629, "grad_norm": 3.326669931411743, "learning_rate": 3.3428518103770063e-05, "loss": 2.2494, "step": 8880 }, { "epoch": 0.9955207166853304, "grad_norm": 3.445563316345215, "learning_rate": 3.3409854423292275e-05, "loss": 2.4157, "step": 8890 }, { "epoch": 0.9966405375139977, "grad_norm": 3.6265370845794678, "learning_rate": 3.339119074281448e-05, "loss": 1.9543, "step": 8900 }, { "epoch": 0.9977603583426652, "grad_norm": 6.9715471267700195, "learning_rate": 3.337252706233669e-05, "loss": 1.9576, "step": 8910 }, { "epoch": 0.9988801791713325, "grad_norm": 2.990663528442383, "learning_rate": 3.3353863381858903e-05, "loss": 1.7998, "step": 8920 }, { "epoch": 1.0, "grad_norm": 16.68695640563965, "learning_rate": 3.3335199701381115e-05, "loss": 2.4764, "step": 8930 }, { "epoch": 1.0011198208286674, "grad_norm": 3.5761334896087646, "learning_rate": 3.331653602090332e-05, "loss": 2.0049, "step": 8940 }, { "epoch": 1.002239641657335, "grad_norm": 7.336765289306641, "learning_rate": 3.329787234042553e-05, "loss": 1.9596, "step": 8950 }, { "epoch": 1.0033594624860023, "grad_norm": 9.628479957580566, "learning_rate": 3.3279208659947743e-05, "loss": 2.1628, "step": 8960 }, { "epoch": 1.0044792833146696, "grad_norm": 6.432254791259766, "learning_rate": 3.3260544979469955e-05, "loss": 1.9689, "step": 8970 }, { "epoch": 1.005599104143337, "grad_norm": 3.5775656700134277, "learning_rate": 3.324188129899216e-05, "loss": 2.0591, "step": 8980 }, { "epoch": 1.0067189249720045, "grad_norm": 7.444267272949219, "learning_rate": 3.322321761851437e-05, "loss": 2.2116, "step": 8990 }, { "epoch": 1.007838745800672, "grad_norm": 9.2912015914917, "learning_rate": 3.3204553938036583e-05, "loss": 1.8121, "step": 9000 }, { "epoch": 1.0089585666293392, "grad_norm": 12.298483848571777, "learning_rate": 3.3185890257558795e-05, "loss": 2.2219, "step": 9010 }, { "epoch": 1.0100783874580068, "grad_norm": 11.347268104553223, "learning_rate": 3.3167226577081e-05, "loss": 1.9998, "step": 9020 }, { "epoch": 1.0111982082866742, "grad_norm": 4.02382755279541, "learning_rate": 3.314856289660321e-05, "loss": 2.1333, "step": 9030 }, { "epoch": 1.0123180291153415, "grad_norm": 8.781681060791016, "learning_rate": 3.3129899216125423e-05, "loss": 2.0904, "step": 9040 }, { "epoch": 1.0134378499440089, "grad_norm": 7.836172580718994, "learning_rate": 3.3111235535647635e-05, "loss": 1.8313, "step": 9050 }, { "epoch": 1.0145576707726764, "grad_norm": 7.975405693054199, "learning_rate": 3.309257185516984e-05, "loss": 2.1517, "step": 9060 }, { "epoch": 1.0156774916013438, "grad_norm": 9.539911270141602, "learning_rate": 3.307390817469205e-05, "loss": 2.228, "step": 9070 }, { "epoch": 1.0167973124300111, "grad_norm": 4.019872665405273, "learning_rate": 3.3055244494214263e-05, "loss": 1.97, "step": 9080 }, { "epoch": 1.0179171332586787, "grad_norm": 3.3895974159240723, "learning_rate": 3.3036580813736475e-05, "loss": 1.6972, "step": 9090 }, { "epoch": 1.019036954087346, "grad_norm": 12.689729690551758, "learning_rate": 3.301791713325868e-05, "loss": 1.7648, "step": 9100 }, { "epoch": 1.0201567749160134, "grad_norm": 15.403214454650879, "learning_rate": 3.2999253452780885e-05, "loss": 2.3582, "step": 9110 }, { "epoch": 1.0212765957446808, "grad_norm": 3.3380067348480225, "learning_rate": 3.29805897723031e-05, "loss": 1.9642, "step": 9120 }, { "epoch": 1.0223964165733483, "grad_norm": 3.8533127307891846, "learning_rate": 3.296192609182531e-05, "loss": 1.9215, "step": 9130 }, { "epoch": 1.0235162374020157, "grad_norm": 3.532688617706299, "learning_rate": 3.294326241134752e-05, "loss": 1.6916, "step": 9140 }, { "epoch": 1.024636058230683, "grad_norm": 2.9172744750976562, "learning_rate": 3.2924598730869725e-05, "loss": 2.0278, "step": 9150 }, { "epoch": 1.0257558790593504, "grad_norm": 4.141864776611328, "learning_rate": 3.290593505039194e-05, "loss": 2.57, "step": 9160 }, { "epoch": 1.026875699888018, "grad_norm": 15.59328842163086, "learning_rate": 3.288727136991415e-05, "loss": 2.3305, "step": 9170 }, { "epoch": 1.0279955207166853, "grad_norm": 8.242568016052246, "learning_rate": 3.286860768943636e-05, "loss": 2.2541, "step": 9180 }, { "epoch": 1.0291153415453527, "grad_norm": 7.377376079559326, "learning_rate": 3.2849944008958565e-05, "loss": 2.2119, "step": 9190 }, { "epoch": 1.0302351623740202, "grad_norm": 3.7288410663604736, "learning_rate": 3.283128032848078e-05, "loss": 1.9549, "step": 9200 }, { "epoch": 1.0313549832026876, "grad_norm": 7.731942653656006, "learning_rate": 3.281261664800299e-05, "loss": 2.0964, "step": 9210 }, { "epoch": 1.032474804031355, "grad_norm": 8.507601737976074, "learning_rate": 3.27939529675252e-05, "loss": 1.4743, "step": 9220 }, { "epoch": 1.0335946248600223, "grad_norm": 10.14968204498291, "learning_rate": 3.2775289287047405e-05, "loss": 2.1759, "step": 9230 }, { "epoch": 1.0347144456886899, "grad_norm": 4.713762283325195, "learning_rate": 3.275662560656962e-05, "loss": 2.4896, "step": 9240 }, { "epoch": 1.0358342665173572, "grad_norm": 4.729640483856201, "learning_rate": 3.273796192609183e-05, "loss": 2.0363, "step": 9250 }, { "epoch": 1.0369540873460246, "grad_norm": 3.9254088401794434, "learning_rate": 3.271929824561404e-05, "loss": 1.82, "step": 9260 }, { "epoch": 1.0380739081746921, "grad_norm": 6.3994622230529785, "learning_rate": 3.2700634565136245e-05, "loss": 2.0063, "step": 9270 }, { "epoch": 1.0391937290033595, "grad_norm": 4.113112449645996, "learning_rate": 3.268197088465846e-05, "loss": 1.8885, "step": 9280 }, { "epoch": 1.0403135498320268, "grad_norm": 9.683294296264648, "learning_rate": 3.266330720418067e-05, "loss": 2.087, "step": 9290 }, { "epoch": 1.0414333706606942, "grad_norm": 3.7569706439971924, "learning_rate": 3.264464352370288e-05, "loss": 2.0629, "step": 9300 }, { "epoch": 1.0425531914893618, "grad_norm": 6.442532062530518, "learning_rate": 3.2625979843225085e-05, "loss": 2.1135, "step": 9310 }, { "epoch": 1.0436730123180291, "grad_norm": 7.427597522735596, "learning_rate": 3.26073161627473e-05, "loss": 1.4265, "step": 9320 }, { "epoch": 1.0447928331466965, "grad_norm": 14.338908195495605, "learning_rate": 3.25886524822695e-05, "loss": 1.9124, "step": 9330 }, { "epoch": 1.045912653975364, "grad_norm": 9.877331733703613, "learning_rate": 3.256998880179172e-05, "loss": 2.5536, "step": 9340 }, { "epoch": 1.0470324748040314, "grad_norm": 5.434894561767578, "learning_rate": 3.2551325121313925e-05, "loss": 2.0743, "step": 9350 }, { "epoch": 1.0481522956326987, "grad_norm": 5.651406764984131, "learning_rate": 3.253266144083613e-05, "loss": 1.6426, "step": 9360 }, { "epoch": 1.049272116461366, "grad_norm": 5.694229602813721, "learning_rate": 3.251399776035834e-05, "loss": 1.709, "step": 9370 }, { "epoch": 1.0503919372900337, "grad_norm": 8.92438793182373, "learning_rate": 3.2495334079880553e-05, "loss": 1.8702, "step": 9380 }, { "epoch": 1.051511758118701, "grad_norm": 6.862886428833008, "learning_rate": 3.2476670399402765e-05, "loss": 2.3834, "step": 9390 }, { "epoch": 1.0526315789473684, "grad_norm": 7.55111026763916, "learning_rate": 3.245800671892497e-05, "loss": 1.8377, "step": 9400 }, { "epoch": 1.053751399776036, "grad_norm": 4.6407341957092285, "learning_rate": 3.243934303844718e-05, "loss": 2.1082, "step": 9410 }, { "epoch": 1.0548712206047033, "grad_norm": 6.718739032745361, "learning_rate": 3.2420679357969393e-05, "loss": 2.4031, "step": 9420 }, { "epoch": 1.0559910414333706, "grad_norm": 2.9721930027008057, "learning_rate": 3.2402015677491605e-05, "loss": 1.9876, "step": 9430 }, { "epoch": 1.057110862262038, "grad_norm": 2.9498345851898193, "learning_rate": 3.238335199701381e-05, "loss": 1.8549, "step": 9440 }, { "epoch": 1.0582306830907056, "grad_norm": 13.339334487915039, "learning_rate": 3.236468831653602e-05, "loss": 1.8697, "step": 9450 }, { "epoch": 1.059350503919373, "grad_norm": 4.650289058685303, "learning_rate": 3.2346024636058234e-05, "loss": 2.2676, "step": 9460 }, { "epoch": 1.0604703247480403, "grad_norm": 14.234888076782227, "learning_rate": 3.2327360955580445e-05, "loss": 2.3395, "step": 9470 }, { "epoch": 1.0615901455767078, "grad_norm": 3.4030983448028564, "learning_rate": 3.230869727510265e-05, "loss": 2.2116, "step": 9480 }, { "epoch": 1.0627099664053752, "grad_norm": 4.666158199310303, "learning_rate": 3.229003359462486e-05, "loss": 2.3914, "step": 9490 }, { "epoch": 1.0638297872340425, "grad_norm": 9.740036010742188, "learning_rate": 3.2271369914147074e-05, "loss": 1.7909, "step": 9500 }, { "epoch": 1.0649496080627099, "grad_norm": 3.0109105110168457, "learning_rate": 3.2252706233669285e-05, "loss": 2.4118, "step": 9510 }, { "epoch": 1.0660694288913775, "grad_norm": 3.5755748748779297, "learning_rate": 3.223404255319149e-05, "loss": 2.0648, "step": 9520 }, { "epoch": 1.0671892497200448, "grad_norm": 3.826801061630249, "learning_rate": 3.22153788727137e-05, "loss": 1.5787, "step": 9530 }, { "epoch": 1.0683090705487122, "grad_norm": 10.07292366027832, "learning_rate": 3.219671519223591e-05, "loss": 2.2986, "step": 9540 }, { "epoch": 1.0694288913773797, "grad_norm": 13.892361640930176, "learning_rate": 3.2178051511758125e-05, "loss": 1.9787, "step": 9550 }, { "epoch": 1.070548712206047, "grad_norm": 36.16096496582031, "learning_rate": 3.215938783128033e-05, "loss": 1.885, "step": 9560 }, { "epoch": 1.0716685330347144, "grad_norm": 2.89412260055542, "learning_rate": 3.214072415080254e-05, "loss": 1.745, "step": 9570 }, { "epoch": 1.0727883538633818, "grad_norm": 11.991975784301758, "learning_rate": 3.212206047032475e-05, "loss": 1.4454, "step": 9580 }, { "epoch": 1.0739081746920494, "grad_norm": 14.4371337890625, "learning_rate": 3.210339678984696e-05, "loss": 1.8037, "step": 9590 }, { "epoch": 1.0750279955207167, "grad_norm": 3.9582924842834473, "learning_rate": 3.208473310936917e-05, "loss": 2.414, "step": 9600 }, { "epoch": 1.076147816349384, "grad_norm": 3.2256970405578613, "learning_rate": 3.2066069428891375e-05, "loss": 1.905, "step": 9610 }, { "epoch": 1.0772676371780516, "grad_norm": 10.48331356048584, "learning_rate": 3.204740574841359e-05, "loss": 1.9094, "step": 9620 }, { "epoch": 1.078387458006719, "grad_norm": 5.042234420776367, "learning_rate": 3.20287420679358e-05, "loss": 2.2765, "step": 9630 }, { "epoch": 1.0795072788353863, "grad_norm": 3.3927605152130127, "learning_rate": 3.201007838745801e-05, "loss": 1.9585, "step": 9640 }, { "epoch": 1.0806270996640537, "grad_norm": 3.9636199474334717, "learning_rate": 3.1991414706980215e-05, "loss": 2.0755, "step": 9650 }, { "epoch": 1.0817469204927213, "grad_norm": 3.8078179359436035, "learning_rate": 3.197275102650243e-05, "loss": 1.7727, "step": 9660 }, { "epoch": 1.0828667413213886, "grad_norm": 11.449139595031738, "learning_rate": 3.195408734602464e-05, "loss": 2.0101, "step": 9670 }, { "epoch": 1.083986562150056, "grad_norm": 13.973347663879395, "learning_rate": 3.193542366554685e-05, "loss": 2.2601, "step": 9680 }, { "epoch": 1.0851063829787233, "grad_norm": 11.239791870117188, "learning_rate": 3.1916759985069055e-05, "loss": 2.1264, "step": 9690 }, { "epoch": 1.0862262038073909, "grad_norm": 11.495058059692383, "learning_rate": 3.189809630459127e-05, "loss": 1.9445, "step": 9700 }, { "epoch": 1.0873460246360582, "grad_norm": 4.135149002075195, "learning_rate": 3.187943262411348e-05, "loss": 1.7415, "step": 9710 }, { "epoch": 1.0884658454647256, "grad_norm": 10.35810375213623, "learning_rate": 3.186076894363569e-05, "loss": 2.1196, "step": 9720 }, { "epoch": 1.0895856662933932, "grad_norm": 3.5504679679870605, "learning_rate": 3.1842105263157895e-05, "loss": 2.1255, "step": 9730 }, { "epoch": 1.0907054871220605, "grad_norm": 7.433374404907227, "learning_rate": 3.182344158268011e-05, "loss": 2.0092, "step": 9740 }, { "epoch": 1.0918253079507279, "grad_norm": 7.075191974639893, "learning_rate": 3.180477790220231e-05, "loss": 1.935, "step": 9750 }, { "epoch": 1.0929451287793952, "grad_norm": 5.563907623291016, "learning_rate": 3.178611422172453e-05, "loss": 2.0882, "step": 9760 }, { "epoch": 1.0940649496080628, "grad_norm": 3.0820200443267822, "learning_rate": 3.1767450541246735e-05, "loss": 1.7695, "step": 9770 }, { "epoch": 1.0951847704367301, "grad_norm": 5.051406383514404, "learning_rate": 3.174878686076895e-05, "loss": 2.3345, "step": 9780 }, { "epoch": 1.0963045912653975, "grad_norm": 9.736443519592285, "learning_rate": 3.173012318029115e-05, "loss": 1.9699, "step": 9790 }, { "epoch": 1.097424412094065, "grad_norm": 3.930483818054199, "learning_rate": 3.171145949981337e-05, "loss": 2.1495, "step": 9800 }, { "epoch": 1.0985442329227324, "grad_norm": 9.857285499572754, "learning_rate": 3.1692795819335575e-05, "loss": 2.1356, "step": 9810 }, { "epoch": 1.0996640537513998, "grad_norm": 3.3318631649017334, "learning_rate": 3.167413213885778e-05, "loss": 1.9198, "step": 9820 }, { "epoch": 1.100783874580067, "grad_norm": 3.945836067199707, "learning_rate": 3.165546845837999e-05, "loss": 2.0346, "step": 9830 }, { "epoch": 1.1019036954087347, "grad_norm": 9.270819664001465, "learning_rate": 3.1636804777902204e-05, "loss": 2.1785, "step": 9840 }, { "epoch": 1.103023516237402, "grad_norm": 14.800010681152344, "learning_rate": 3.1618141097424415e-05, "loss": 1.9342, "step": 9850 }, { "epoch": 1.1041433370660694, "grad_norm": 8.554313659667969, "learning_rate": 3.159947741694662e-05, "loss": 2.0811, "step": 9860 }, { "epoch": 1.1052631578947367, "grad_norm": 3.686922073364258, "learning_rate": 3.158081373646883e-05, "loss": 2.1673, "step": 9870 }, { "epoch": 1.1063829787234043, "grad_norm": 2.909205436706543, "learning_rate": 3.1562150055991044e-05, "loss": 2.0985, "step": 9880 }, { "epoch": 1.1075027995520716, "grad_norm": 3.2634377479553223, "learning_rate": 3.1543486375513255e-05, "loss": 1.7348, "step": 9890 }, { "epoch": 1.108622620380739, "grad_norm": 3.7922704219818115, "learning_rate": 3.152482269503546e-05, "loss": 2.0642, "step": 9900 }, { "epoch": 1.1097424412094066, "grad_norm": 3.7772440910339355, "learning_rate": 3.150615901455767e-05, "loss": 1.6834, "step": 9910 }, { "epoch": 1.110862262038074, "grad_norm": 6.348939895629883, "learning_rate": 3.1487495334079884e-05, "loss": 1.8862, "step": 9920 }, { "epoch": 1.1119820828667413, "grad_norm": 4.5603790283203125, "learning_rate": 3.1468831653602095e-05, "loss": 2.0568, "step": 9930 }, { "epoch": 1.1131019036954086, "grad_norm": 11.244080543518066, "learning_rate": 3.14501679731243e-05, "loss": 2.015, "step": 9940 }, { "epoch": 1.1142217245240762, "grad_norm": 8.52851390838623, "learning_rate": 3.143150429264651e-05, "loss": 2.0168, "step": 9950 }, { "epoch": 1.1153415453527435, "grad_norm": 3.2907376289367676, "learning_rate": 3.1412840612168724e-05, "loss": 2.0653, "step": 9960 }, { "epoch": 1.116461366181411, "grad_norm": 3.665787696838379, "learning_rate": 3.1394176931690935e-05, "loss": 2.5296, "step": 9970 }, { "epoch": 1.1175811870100785, "grad_norm": 3.52567982673645, "learning_rate": 3.137551325121314e-05, "loss": 2.0133, "step": 9980 }, { "epoch": 1.1187010078387458, "grad_norm": 3.8598620891571045, "learning_rate": 3.135684957073535e-05, "loss": 2.0675, "step": 9990 }, { "epoch": 1.1198208286674132, "grad_norm": 10.049873352050781, "learning_rate": 3.133818589025756e-05, "loss": 2.1145, "step": 10000 }, { "epoch": 1.1209406494960805, "grad_norm": 3.7973287105560303, "learning_rate": 3.1319522209779775e-05, "loss": 2.3125, "step": 10010 }, { "epoch": 1.122060470324748, "grad_norm": 9.45921516418457, "learning_rate": 3.130085852930198e-05, "loss": 2.2593, "step": 10020 }, { "epoch": 1.1231802911534154, "grad_norm": 3.3235390186309814, "learning_rate": 3.128219484882419e-05, "loss": 2.0233, "step": 10030 }, { "epoch": 1.1243001119820828, "grad_norm": 8.556841850280762, "learning_rate": 3.12635311683464e-05, "loss": 2.0527, "step": 10040 }, { "epoch": 1.1254199328107504, "grad_norm": 3.7315330505371094, "learning_rate": 3.124486748786861e-05, "loss": 1.9582, "step": 10050 }, { "epoch": 1.1265397536394177, "grad_norm": 9.111562728881836, "learning_rate": 3.122620380739082e-05, "loss": 1.931, "step": 10060 }, { "epoch": 1.127659574468085, "grad_norm": 13.934300422668457, "learning_rate": 3.1207540126913025e-05, "loss": 2.2748, "step": 10070 }, { "epoch": 1.1287793952967524, "grad_norm": 2.9079439640045166, "learning_rate": 3.118887644643524e-05, "loss": 2.0733, "step": 10080 }, { "epoch": 1.12989921612542, "grad_norm": 14.349089622497559, "learning_rate": 3.117021276595745e-05, "loss": 1.9089, "step": 10090 }, { "epoch": 1.1310190369540873, "grad_norm": 4.357903003692627, "learning_rate": 3.115154908547966e-05, "loss": 2.1913, "step": 10100 }, { "epoch": 1.1321388577827547, "grad_norm": 18.6312255859375, "learning_rate": 3.1132885405001865e-05, "loss": 2.0145, "step": 10110 }, { "epoch": 1.1332586786114223, "grad_norm": 10.29723834991455, "learning_rate": 3.111422172452408e-05, "loss": 2.1836, "step": 10120 }, { "epoch": 1.1343784994400896, "grad_norm": 8.374372482299805, "learning_rate": 3.109555804404629e-05, "loss": 1.8299, "step": 10130 }, { "epoch": 1.135498320268757, "grad_norm": 3.970510244369507, "learning_rate": 3.10768943635685e-05, "loss": 1.2505, "step": 10140 }, { "epoch": 1.1366181410974243, "grad_norm": 2.9629228115081787, "learning_rate": 3.1058230683090705e-05, "loss": 1.8105, "step": 10150 }, { "epoch": 1.137737961926092, "grad_norm": 11.693266868591309, "learning_rate": 3.103956700261292e-05, "loss": 2.1849, "step": 10160 }, { "epoch": 1.1388577827547592, "grad_norm": 5.791164398193359, "learning_rate": 3.102090332213513e-05, "loss": 2.1892, "step": 10170 }, { "epoch": 1.1399776035834266, "grad_norm": 13.902082443237305, "learning_rate": 3.100223964165734e-05, "loss": 2.0516, "step": 10180 }, { "epoch": 1.1410974244120942, "grad_norm": 8.851689338684082, "learning_rate": 3.0983575961179545e-05, "loss": 1.8785, "step": 10190 }, { "epoch": 1.1422172452407615, "grad_norm": 7.506970405578613, "learning_rate": 3.096491228070176e-05, "loss": 1.77, "step": 10200 }, { "epoch": 1.1433370660694289, "grad_norm": 5.175302028656006, "learning_rate": 3.094624860022396e-05, "loss": 1.5816, "step": 10210 }, { "epoch": 1.1444568868980962, "grad_norm": 11.070626258850098, "learning_rate": 3.092758491974618e-05, "loss": 2.2113, "step": 10220 }, { "epoch": 1.1455767077267638, "grad_norm": 14.317008018493652, "learning_rate": 3.0908921239268385e-05, "loss": 2.4373, "step": 10230 }, { "epoch": 1.1466965285554311, "grad_norm": 3.7413330078125, "learning_rate": 3.08902575587906e-05, "loss": 2.0543, "step": 10240 }, { "epoch": 1.1478163493840985, "grad_norm": 8.698836326599121, "learning_rate": 3.08715938783128e-05, "loss": 1.8267, "step": 10250 }, { "epoch": 1.148936170212766, "grad_norm": 9.122303009033203, "learning_rate": 3.085293019783502e-05, "loss": 2.1165, "step": 10260 }, { "epoch": 1.1500559910414334, "grad_norm": 10.478148460388184, "learning_rate": 3.0834266517357225e-05, "loss": 1.7777, "step": 10270 }, { "epoch": 1.1511758118701008, "grad_norm": 7.338038444519043, "learning_rate": 3.081560283687943e-05, "loss": 1.5512, "step": 10280 }, { "epoch": 1.1522956326987681, "grad_norm": 5.104288578033447, "learning_rate": 3.079693915640164e-05, "loss": 1.5521, "step": 10290 }, { "epoch": 1.1534154535274357, "grad_norm": 6.295915126800537, "learning_rate": 3.0778275475923854e-05, "loss": 1.8437, "step": 10300 }, { "epoch": 1.154535274356103, "grad_norm": 3.8285670280456543, "learning_rate": 3.0759611795446065e-05, "loss": 2.3621, "step": 10310 }, { "epoch": 1.1556550951847704, "grad_norm": 9.0399751663208, "learning_rate": 3.074094811496827e-05, "loss": 1.9887, "step": 10320 }, { "epoch": 1.156774916013438, "grad_norm": 14.121618270874023, "learning_rate": 3.072228443449048e-05, "loss": 2.091, "step": 10330 }, { "epoch": 1.1578947368421053, "grad_norm": 3.0184133052825928, "learning_rate": 3.0703620754012694e-05, "loss": 2.018, "step": 10340 }, { "epoch": 1.1590145576707727, "grad_norm": 7.774500370025635, "learning_rate": 3.0684957073534905e-05, "loss": 1.6259, "step": 10350 }, { "epoch": 1.16013437849944, "grad_norm": 3.3404550552368164, "learning_rate": 3.066629339305711e-05, "loss": 1.6987, "step": 10360 }, { "epoch": 1.1612541993281076, "grad_norm": 5.787201881408691, "learning_rate": 3.064762971257932e-05, "loss": 1.7056, "step": 10370 }, { "epoch": 1.162374020156775, "grad_norm": 4.292003631591797, "learning_rate": 3.0628966032101534e-05, "loss": 2.0171, "step": 10380 }, { "epoch": 1.1634938409854423, "grad_norm": 8.185914993286133, "learning_rate": 3.0610302351623745e-05, "loss": 1.75, "step": 10390 }, { "epoch": 1.1646136618141099, "grad_norm": 12.01564884185791, "learning_rate": 3.059163867114595e-05, "loss": 1.8298, "step": 10400 }, { "epoch": 1.1657334826427772, "grad_norm": 4.348435878753662, "learning_rate": 3.057297499066816e-05, "loss": 1.829, "step": 10410 }, { "epoch": 1.1668533034714446, "grad_norm": 12.32309341430664, "learning_rate": 3.055431131019037e-05, "loss": 1.6587, "step": 10420 }, { "epoch": 1.167973124300112, "grad_norm": 8.916961669921875, "learning_rate": 3.0535647629712585e-05, "loss": 1.814, "step": 10430 }, { "epoch": 1.1690929451287795, "grad_norm": 6.272485256195068, "learning_rate": 3.051698394923479e-05, "loss": 1.8092, "step": 10440 }, { "epoch": 1.1702127659574468, "grad_norm": 12.134109497070312, "learning_rate": 3.0498320268757002e-05, "loss": 2.1558, "step": 10450 }, { "epoch": 1.1713325867861142, "grad_norm": 5.216768741607666, "learning_rate": 3.047965658827921e-05, "loss": 1.9535, "step": 10460 }, { "epoch": 1.1724524076147818, "grad_norm": 9.889372825622559, "learning_rate": 3.0460992907801422e-05, "loss": 1.6845, "step": 10470 }, { "epoch": 1.173572228443449, "grad_norm": 9.93710708618164, "learning_rate": 3.044232922732363e-05, "loss": 1.7271, "step": 10480 }, { "epoch": 1.1746920492721165, "grad_norm": 11.036845207214355, "learning_rate": 3.0423665546845842e-05, "loss": 1.8613, "step": 10490 }, { "epoch": 1.1758118701007838, "grad_norm": 4.051137924194336, "learning_rate": 3.040500186636805e-05, "loss": 1.6322, "step": 10500 }, { "epoch": 1.1769316909294514, "grad_norm": 12.119963645935059, "learning_rate": 3.0386338185890255e-05, "loss": 1.9307, "step": 10510 }, { "epoch": 1.1780515117581187, "grad_norm": 3.4073593616485596, "learning_rate": 3.036767450541247e-05, "loss": 1.8893, "step": 10520 }, { "epoch": 1.179171332586786, "grad_norm": 4.570372104644775, "learning_rate": 3.0349010824934675e-05, "loss": 2.0297, "step": 10530 }, { "epoch": 1.1802911534154534, "grad_norm": 4.415748119354248, "learning_rate": 3.033034714445689e-05, "loss": 1.818, "step": 10540 }, { "epoch": 1.181410974244121, "grad_norm": 4.645330429077148, "learning_rate": 3.0311683463979095e-05, "loss": 2.1483, "step": 10550 }, { "epoch": 1.1825307950727884, "grad_norm": 5.267803192138672, "learning_rate": 3.0293019783501307e-05, "loss": 2.009, "step": 10560 }, { "epoch": 1.1836506159014557, "grad_norm": 4.232731819152832, "learning_rate": 3.0274356103023515e-05, "loss": 2.0684, "step": 10570 }, { "epoch": 1.184770436730123, "grad_norm": 4.118504047393799, "learning_rate": 3.0255692422545727e-05, "loss": 2.1593, "step": 10580 }, { "epoch": 1.1858902575587906, "grad_norm": 3.479337692260742, "learning_rate": 3.0237028742067935e-05, "loss": 2.428, "step": 10590 }, { "epoch": 1.187010078387458, "grad_norm": 9.307233810424805, "learning_rate": 3.0218365061590147e-05, "loss": 2.113, "step": 10600 }, { "epoch": 1.1881298992161253, "grad_norm": 7.910929203033447, "learning_rate": 3.0199701381112355e-05, "loss": 1.8977, "step": 10610 }, { "epoch": 1.189249720044793, "grad_norm": 14.38522720336914, "learning_rate": 3.0181037700634567e-05, "loss": 2.3208, "step": 10620 }, { "epoch": 1.1903695408734603, "grad_norm": 7.7208147048950195, "learning_rate": 3.0162374020156775e-05, "loss": 1.9594, "step": 10630 }, { "epoch": 1.1914893617021276, "grad_norm": 3.620098114013672, "learning_rate": 3.0143710339678987e-05, "loss": 1.999, "step": 10640 }, { "epoch": 1.192609182530795, "grad_norm": 8.450474739074707, "learning_rate": 3.0125046659201195e-05, "loss": 2.0836, "step": 10650 }, { "epoch": 1.1937290033594625, "grad_norm": 5.103601932525635, "learning_rate": 3.0106382978723407e-05, "loss": 2.0798, "step": 10660 }, { "epoch": 1.1948488241881299, "grad_norm": 3.6850714683532715, "learning_rate": 3.0087719298245615e-05, "loss": 2.0787, "step": 10670 }, { "epoch": 1.1959686450167972, "grad_norm": 10.51152229309082, "learning_rate": 3.0069055617767827e-05, "loss": 2.4453, "step": 10680 }, { "epoch": 1.1970884658454648, "grad_norm": 9.488080978393555, "learning_rate": 3.0050391937290035e-05, "loss": 2.3139, "step": 10690 }, { "epoch": 1.1982082866741322, "grad_norm": 4.7864274978637695, "learning_rate": 3.0031728256812247e-05, "loss": 1.9159, "step": 10700 }, { "epoch": 1.1993281075027995, "grad_norm": 4.440032482147217, "learning_rate": 3.0013064576334455e-05, "loss": 2.0064, "step": 10710 }, { "epoch": 1.2004479283314669, "grad_norm": 5.0448503494262695, "learning_rate": 2.9994400895856667e-05, "loss": 2.42, "step": 10720 }, { "epoch": 1.2015677491601344, "grad_norm": 3.938079357147217, "learning_rate": 2.9975737215378875e-05, "loss": 1.8528, "step": 10730 }, { "epoch": 1.2026875699888018, "grad_norm": 8.685888290405273, "learning_rate": 2.9957073534901087e-05, "loss": 1.8489, "step": 10740 }, { "epoch": 1.2038073908174691, "grad_norm": 4.123725891113281, "learning_rate": 2.9938409854423295e-05, "loss": 2.4495, "step": 10750 }, { "epoch": 1.2049272116461367, "grad_norm": 7.6613593101501465, "learning_rate": 2.99197461739455e-05, "loss": 1.8993, "step": 10760 }, { "epoch": 1.206047032474804, "grad_norm": 4.251744747161865, "learning_rate": 2.9901082493467715e-05, "loss": 2.1757, "step": 10770 }, { "epoch": 1.2071668533034714, "grad_norm": 8.871329307556152, "learning_rate": 2.988241881298992e-05, "loss": 1.6899, "step": 10780 }, { "epoch": 1.2082866741321387, "grad_norm": 3.407541275024414, "learning_rate": 2.9863755132512132e-05, "loss": 2.0731, "step": 10790 }, { "epoch": 1.2094064949608063, "grad_norm": 4.200164794921875, "learning_rate": 2.984509145203434e-05, "loss": 2.1467, "step": 10800 }, { "epoch": 1.2105263157894737, "grad_norm": 5.967247009277344, "learning_rate": 2.9826427771556552e-05, "loss": 2.3341, "step": 10810 }, { "epoch": 1.211646136618141, "grad_norm": 6.902993202209473, "learning_rate": 2.980776409107876e-05, "loss": 2.1097, "step": 10820 }, { "epoch": 1.2127659574468086, "grad_norm": 3.4361205101013184, "learning_rate": 2.9789100410600972e-05, "loss": 2.2153, "step": 10830 }, { "epoch": 1.213885778275476, "grad_norm": 3.588088274002075, "learning_rate": 2.977043673012318e-05, "loss": 2.4579, "step": 10840 }, { "epoch": 1.2150055991041433, "grad_norm": 5.1023173332214355, "learning_rate": 2.9751773049645392e-05, "loss": 1.8074, "step": 10850 }, { "epoch": 1.2161254199328106, "grad_norm": 5.618678092956543, "learning_rate": 2.97331093691676e-05, "loss": 1.7553, "step": 10860 }, { "epoch": 1.2172452407614782, "grad_norm": 3.7290029525756836, "learning_rate": 2.9714445688689812e-05, "loss": 2.2483, "step": 10870 }, { "epoch": 1.2183650615901456, "grad_norm": 9.374983787536621, "learning_rate": 2.969578200821202e-05, "loss": 2.295, "step": 10880 }, { "epoch": 1.219484882418813, "grad_norm": 12.954818725585938, "learning_rate": 2.9677118327734232e-05, "loss": 2.1509, "step": 10890 }, { "epoch": 1.2206047032474805, "grad_norm": 5.120643615722656, "learning_rate": 2.965845464725644e-05, "loss": 1.6961, "step": 10900 }, { "epoch": 1.2217245240761478, "grad_norm": 6.945920944213867, "learning_rate": 2.9639790966778652e-05, "loss": 1.92, "step": 10910 }, { "epoch": 1.2228443449048152, "grad_norm": 4.189951419830322, "learning_rate": 2.962112728630086e-05, "loss": 1.8421, "step": 10920 }, { "epoch": 1.2239641657334825, "grad_norm": 13.6853666305542, "learning_rate": 2.9602463605823072e-05, "loss": 1.6631, "step": 10930 }, { "epoch": 1.2250839865621501, "grad_norm": 13.50125789642334, "learning_rate": 2.958379992534528e-05, "loss": 1.9662, "step": 10940 }, { "epoch": 1.2262038073908175, "grad_norm": 11.182577133178711, "learning_rate": 2.9565136244867492e-05, "loss": 2.474, "step": 10950 }, { "epoch": 1.2273236282194848, "grad_norm": 8.855241775512695, "learning_rate": 2.95464725643897e-05, "loss": 1.9798, "step": 10960 }, { "epoch": 1.2284434490481524, "grad_norm": 2.290292263031006, "learning_rate": 2.9527808883911912e-05, "loss": 1.7049, "step": 10970 }, { "epoch": 1.2295632698768197, "grad_norm": 3.8447682857513428, "learning_rate": 2.950914520343412e-05, "loss": 2.0167, "step": 10980 }, { "epoch": 1.230683090705487, "grad_norm": 3.326638698577881, "learning_rate": 2.9490481522956325e-05, "loss": 1.8316, "step": 10990 }, { "epoch": 1.2318029115341544, "grad_norm": 9.62152099609375, "learning_rate": 2.9471817842478537e-05, "loss": 1.6587, "step": 11000 }, { "epoch": 1.232922732362822, "grad_norm": 4.14316463470459, "learning_rate": 2.9453154162000745e-05, "loss": 1.8924, "step": 11010 }, { "epoch": 1.2340425531914894, "grad_norm": 12.48459529876709, "learning_rate": 2.9434490481522957e-05, "loss": 1.8707, "step": 11020 }, { "epoch": 1.2351623740201567, "grad_norm": 8.288812637329102, "learning_rate": 2.9415826801045165e-05, "loss": 1.8107, "step": 11030 }, { "epoch": 1.2362821948488243, "grad_norm": 14.502120018005371, "learning_rate": 2.9397163120567377e-05, "loss": 2.3128, "step": 11040 }, { "epoch": 1.2374020156774916, "grad_norm": 3.264012336730957, "learning_rate": 2.9378499440089585e-05, "loss": 2.1651, "step": 11050 }, { "epoch": 1.238521836506159, "grad_norm": 7.62103271484375, "learning_rate": 2.9359835759611797e-05, "loss": 2.0078, "step": 11060 }, { "epoch": 1.2396416573348263, "grad_norm": 14.445006370544434, "learning_rate": 2.9341172079134005e-05, "loss": 1.9942, "step": 11070 }, { "epoch": 1.240761478163494, "grad_norm": 4.4992899894714355, "learning_rate": 2.9322508398656217e-05, "loss": 2.1584, "step": 11080 }, { "epoch": 1.2418812989921613, "grad_norm": 7.469330310821533, "learning_rate": 2.9303844718178425e-05, "loss": 1.9318, "step": 11090 }, { "epoch": 1.2430011198208286, "grad_norm": 4.640170574188232, "learning_rate": 2.9285181037700637e-05, "loss": 2.0114, "step": 11100 }, { "epoch": 1.2441209406494962, "grad_norm": 5.856334686279297, "learning_rate": 2.9266517357222845e-05, "loss": 1.7062, "step": 11110 }, { "epoch": 1.2452407614781635, "grad_norm": 5.0445404052734375, "learning_rate": 2.9247853676745057e-05, "loss": 1.8474, "step": 11120 }, { "epoch": 1.2463605823068309, "grad_norm": 11.517007827758789, "learning_rate": 2.9229189996267265e-05, "loss": 2.4335, "step": 11130 }, { "epoch": 1.2474804031354982, "grad_norm": 15.464090347290039, "learning_rate": 2.9210526315789477e-05, "loss": 2.2518, "step": 11140 }, { "epoch": 1.2486002239641658, "grad_norm": 4.1234025955200195, "learning_rate": 2.9191862635311685e-05, "loss": 2.2545, "step": 11150 }, { "epoch": 1.2497200447928332, "grad_norm": 12.045602798461914, "learning_rate": 2.9173198954833897e-05, "loss": 1.9101, "step": 11160 }, { "epoch": 1.2508398656215005, "grad_norm": 9.400586128234863, "learning_rate": 2.9154535274356105e-05, "loss": 2.2002, "step": 11170 }, { "epoch": 1.251959686450168, "grad_norm": 4.317978382110596, "learning_rate": 2.9135871593878317e-05, "loss": 2.008, "step": 11180 }, { "epoch": 1.2530795072788354, "grad_norm": 3.613831043243408, "learning_rate": 2.9117207913400525e-05, "loss": 2.2759, "step": 11190 }, { "epoch": 1.2541993281075028, "grad_norm": 9.903818130493164, "learning_rate": 2.9098544232922737e-05, "loss": 1.8002, "step": 11200 }, { "epoch": 1.2553191489361701, "grad_norm": 4.823100566864014, "learning_rate": 2.9079880552444942e-05, "loss": 2.041, "step": 11210 }, { "epoch": 1.2564389697648375, "grad_norm": 8.871933937072754, "learning_rate": 2.906121687196715e-05, "loss": 1.8554, "step": 11220 }, { "epoch": 1.257558790593505, "grad_norm": 3.7882330417633057, "learning_rate": 2.9042553191489362e-05, "loss": 2.0572, "step": 11230 }, { "epoch": 1.2586786114221724, "grad_norm": 3.4256579875946045, "learning_rate": 2.902388951101157e-05, "loss": 2.0062, "step": 11240 }, { "epoch": 1.25979843225084, "grad_norm": 7.357487678527832, "learning_rate": 2.9005225830533782e-05, "loss": 2.0538, "step": 11250 }, { "epoch": 1.2609182530795073, "grad_norm": 8.090987205505371, "learning_rate": 2.898656215005599e-05, "loss": 1.7325, "step": 11260 }, { "epoch": 1.2620380739081747, "grad_norm": 7.4141669273376465, "learning_rate": 2.8967898469578202e-05, "loss": 2.2278, "step": 11270 }, { "epoch": 1.263157894736842, "grad_norm": 9.293551445007324, "learning_rate": 2.894923478910041e-05, "loss": 1.9437, "step": 11280 }, { "epoch": 1.2642777155655094, "grad_norm": 7.823407173156738, "learning_rate": 2.8930571108622622e-05, "loss": 1.7969, "step": 11290 }, { "epoch": 1.265397536394177, "grad_norm": 7.021416664123535, "learning_rate": 2.891190742814483e-05, "loss": 1.7032, "step": 11300 }, { "epoch": 1.2665173572228443, "grad_norm": 5.103081703186035, "learning_rate": 2.8893243747667042e-05, "loss": 1.9367, "step": 11310 }, { "epoch": 1.2676371780515119, "grad_norm": 11.817680358886719, "learning_rate": 2.887458006718925e-05, "loss": 1.9144, "step": 11320 }, { "epoch": 1.2687569988801792, "grad_norm": 4.182484149932861, "learning_rate": 2.8855916386711462e-05, "loss": 2.2476, "step": 11330 }, { "epoch": 1.2698768197088466, "grad_norm": 12.477089881896973, "learning_rate": 2.883725270623367e-05, "loss": 2.1192, "step": 11340 }, { "epoch": 1.270996640537514, "grad_norm": 4.275122165679932, "learning_rate": 2.8818589025755882e-05, "loss": 2.4108, "step": 11350 }, { "epoch": 1.2721164613661813, "grad_norm": 15.332164764404297, "learning_rate": 2.879992534527809e-05, "loss": 2.3805, "step": 11360 }, { "epoch": 1.2732362821948489, "grad_norm": 10.845608711242676, "learning_rate": 2.8781261664800302e-05, "loss": 2.3275, "step": 11370 }, { "epoch": 1.2743561030235162, "grad_norm": 8.792692184448242, "learning_rate": 2.876259798432251e-05, "loss": 1.6722, "step": 11380 }, { "epoch": 1.2754759238521838, "grad_norm": 9.559557914733887, "learning_rate": 2.8743934303844722e-05, "loss": 2.2257, "step": 11390 }, { "epoch": 1.2765957446808511, "grad_norm": 6.456275463104248, "learning_rate": 2.872527062336693e-05, "loss": 1.7045, "step": 11400 }, { "epoch": 1.2777155655095185, "grad_norm": 8.79680347442627, "learning_rate": 2.8706606942889142e-05, "loss": 2.2707, "step": 11410 }, { "epoch": 1.2788353863381858, "grad_norm": 4.077367782592773, "learning_rate": 2.8687943262411347e-05, "loss": 2.0724, "step": 11420 }, { "epoch": 1.2799552071668532, "grad_norm": 15.875419616699219, "learning_rate": 2.8669279581933562e-05, "loss": 2.4802, "step": 11430 }, { "epoch": 1.2810750279955208, "grad_norm": 9.360994338989258, "learning_rate": 2.8650615901455767e-05, "loss": 2.561, "step": 11440 }, { "epoch": 1.282194848824188, "grad_norm": 3.356452226638794, "learning_rate": 2.8631952220977975e-05, "loss": 1.6649, "step": 11450 }, { "epoch": 1.2833146696528557, "grad_norm": 4.318080425262451, "learning_rate": 2.8613288540500187e-05, "loss": 2.1682, "step": 11460 }, { "epoch": 1.284434490481523, "grad_norm": 14.16100025177002, "learning_rate": 2.8594624860022395e-05, "loss": 2.245, "step": 11470 }, { "epoch": 1.2855543113101904, "grad_norm": 4.342535495758057, "learning_rate": 2.8575961179544607e-05, "loss": 1.9388, "step": 11480 }, { "epoch": 1.2866741321388577, "grad_norm": 4.287493705749512, "learning_rate": 2.8557297499066815e-05, "loss": 2.0392, "step": 11490 }, { "epoch": 1.287793952967525, "grad_norm": 4.413599967956543, "learning_rate": 2.8538633818589027e-05, "loss": 1.7355, "step": 11500 }, { "epoch": 1.2889137737961927, "grad_norm": 6.577482223510742, "learning_rate": 2.8519970138111235e-05, "loss": 2.0267, "step": 11510 }, { "epoch": 1.29003359462486, "grad_norm": 16.563228607177734, "learning_rate": 2.8501306457633447e-05, "loss": 1.7356, "step": 11520 }, { "epoch": 1.2911534154535274, "grad_norm": 4.8970255851745605, "learning_rate": 2.8482642777155655e-05, "loss": 2.0905, "step": 11530 }, { "epoch": 1.292273236282195, "grad_norm": 3.6036787033081055, "learning_rate": 2.8463979096677867e-05, "loss": 1.8632, "step": 11540 }, { "epoch": 1.2933930571108623, "grad_norm": 3.7850587368011475, "learning_rate": 2.8445315416200075e-05, "loss": 1.8984, "step": 11550 }, { "epoch": 1.2945128779395296, "grad_norm": 3.808590888977051, "learning_rate": 2.8426651735722287e-05, "loss": 2.0321, "step": 11560 }, { "epoch": 1.295632698768197, "grad_norm": 6.799190044403076, "learning_rate": 2.8407988055244495e-05, "loss": 2.3606, "step": 11570 }, { "epoch": 1.2967525195968646, "grad_norm": 13.739738464355469, "learning_rate": 2.8389324374766707e-05, "loss": 2.4276, "step": 11580 }, { "epoch": 1.297872340425532, "grad_norm": 10.142343521118164, "learning_rate": 2.8370660694288915e-05, "loss": 2.1635, "step": 11590 }, { "epoch": 1.2989921612541993, "grad_norm": 7.874263286590576, "learning_rate": 2.8351997013811127e-05, "loss": 1.9213, "step": 11600 }, { "epoch": 1.3001119820828668, "grad_norm": 9.147733688354492, "learning_rate": 2.8333333333333335e-05, "loss": 1.9669, "step": 11610 }, { "epoch": 1.3012318029115342, "grad_norm": 4.748540878295898, "learning_rate": 2.8314669652855547e-05, "loss": 2.1498, "step": 11620 }, { "epoch": 1.3023516237402015, "grad_norm": 3.711635112762451, "learning_rate": 2.8296005972377755e-05, "loss": 2.0037, "step": 11630 }, { "epoch": 1.3034714445688689, "grad_norm": 9.331302642822266, "learning_rate": 2.8277342291899967e-05, "loss": 2.2264, "step": 11640 }, { "epoch": 1.3045912653975364, "grad_norm": 3.66086483001709, "learning_rate": 2.8258678611422172e-05, "loss": 1.9577, "step": 11650 }, { "epoch": 1.3057110862262038, "grad_norm": 3.9760518074035645, "learning_rate": 2.8240014930944387e-05, "loss": 2.4175, "step": 11660 }, { "epoch": 1.3068309070548711, "grad_norm": 11.729776382446289, "learning_rate": 2.8221351250466592e-05, "loss": 1.8006, "step": 11670 }, { "epoch": 1.3079507278835387, "grad_norm": 5.228203296661377, "learning_rate": 2.82026875699888e-05, "loss": 2.1344, "step": 11680 }, { "epoch": 1.309070548712206, "grad_norm": 4.904013633728027, "learning_rate": 2.8184023889511012e-05, "loss": 1.3898, "step": 11690 }, { "epoch": 1.3101903695408734, "grad_norm": 10.309314727783203, "learning_rate": 2.816536020903322e-05, "loss": 1.6145, "step": 11700 }, { "epoch": 1.3113101903695408, "grad_norm": 6.570261478424072, "learning_rate": 2.8146696528555432e-05, "loss": 2.0292, "step": 11710 }, { "epoch": 1.3124300111982083, "grad_norm": 4.200310707092285, "learning_rate": 2.812803284807764e-05, "loss": 2.086, "step": 11720 }, { "epoch": 1.3135498320268757, "grad_norm": 10.7840576171875, "learning_rate": 2.8109369167599852e-05, "loss": 2.0459, "step": 11730 }, { "epoch": 1.314669652855543, "grad_norm": 3.7457571029663086, "learning_rate": 2.809070548712206e-05, "loss": 1.6617, "step": 11740 }, { "epoch": 1.3157894736842106, "grad_norm": 4.625324726104736, "learning_rate": 2.8072041806644272e-05, "loss": 1.5736, "step": 11750 }, { "epoch": 1.316909294512878, "grad_norm": 9.565194129943848, "learning_rate": 2.805337812616648e-05, "loss": 2.4976, "step": 11760 }, { "epoch": 1.3180291153415453, "grad_norm": 10.894997596740723, "learning_rate": 2.8034714445688692e-05, "loss": 1.9974, "step": 11770 }, { "epoch": 1.3191489361702127, "grad_norm": 4.6816725730896, "learning_rate": 2.80160507652109e-05, "loss": 2.1758, "step": 11780 }, { "epoch": 1.3202687569988802, "grad_norm": 3.4345543384552, "learning_rate": 2.7997387084733112e-05, "loss": 2.0768, "step": 11790 }, { "epoch": 1.3213885778275476, "grad_norm": 11.61649227142334, "learning_rate": 2.797872340425532e-05, "loss": 2.135, "step": 11800 }, { "epoch": 1.322508398656215, "grad_norm": 3.6992645263671875, "learning_rate": 2.7960059723777532e-05, "loss": 2.0889, "step": 11810 }, { "epoch": 1.3236282194848825, "grad_norm": 3.465416193008423, "learning_rate": 2.794139604329974e-05, "loss": 2.0867, "step": 11820 }, { "epoch": 1.3247480403135499, "grad_norm": 7.6769795417785645, "learning_rate": 2.7922732362821952e-05, "loss": 1.9802, "step": 11830 }, { "epoch": 1.3258678611422172, "grad_norm": 4.010658264160156, "learning_rate": 2.790406868234416e-05, "loss": 2.2954, "step": 11840 }, { "epoch": 1.3269876819708846, "grad_norm": 5.182217597961426, "learning_rate": 2.7885405001866372e-05, "loss": 2.3104, "step": 11850 }, { "epoch": 1.3281075027995521, "grad_norm": 9.429098129272461, "learning_rate": 2.7866741321388577e-05, "loss": 1.7616, "step": 11860 }, { "epoch": 1.3292273236282195, "grad_norm": 3.6495516300201416, "learning_rate": 2.7848077640910792e-05, "loss": 1.7191, "step": 11870 }, { "epoch": 1.3303471444568868, "grad_norm": 3.693429470062256, "learning_rate": 2.7829413960432997e-05, "loss": 2.2813, "step": 11880 }, { "epoch": 1.3314669652855544, "grad_norm": 3.9754602909088135, "learning_rate": 2.7810750279955212e-05, "loss": 1.9402, "step": 11890 }, { "epoch": 1.3325867861142218, "grad_norm": 8.377337455749512, "learning_rate": 2.7792086599477417e-05, "loss": 2.0752, "step": 11900 }, { "epoch": 1.3337066069428891, "grad_norm": 7.4295220375061035, "learning_rate": 2.7773422918999625e-05, "loss": 1.6284, "step": 11910 }, { "epoch": 1.3348264277715565, "grad_norm": 3.935297966003418, "learning_rate": 2.7754759238521837e-05, "loss": 2.2115, "step": 11920 }, { "epoch": 1.335946248600224, "grad_norm": 9.240707397460938, "learning_rate": 2.7736095558044045e-05, "loss": 1.8889, "step": 11930 }, { "epoch": 1.3370660694288914, "grad_norm": 11.241867065429688, "learning_rate": 2.7717431877566257e-05, "loss": 1.7422, "step": 11940 }, { "epoch": 1.3381858902575587, "grad_norm": 5.747684478759766, "learning_rate": 2.7698768197088465e-05, "loss": 1.8776, "step": 11950 }, { "epoch": 1.3393057110862263, "grad_norm": 8.002111434936523, "learning_rate": 2.7680104516610677e-05, "loss": 2.0645, "step": 11960 }, { "epoch": 1.3404255319148937, "grad_norm": 6.661399841308594, "learning_rate": 2.7661440836132885e-05, "loss": 1.914, "step": 11970 }, { "epoch": 1.341545352743561, "grad_norm": 4.897961616516113, "learning_rate": 2.7642777155655097e-05, "loss": 2.0699, "step": 11980 }, { "epoch": 1.3426651735722284, "grad_norm": 8.352503776550293, "learning_rate": 2.7624113475177305e-05, "loss": 2.3203, "step": 11990 }, { "epoch": 1.343784994400896, "grad_norm": 5.313516616821289, "learning_rate": 2.7605449794699517e-05, "loss": 1.4091, "step": 12000 }, { "epoch": 1.3449048152295633, "grad_norm": 8.284523010253906, "learning_rate": 2.7586786114221725e-05, "loss": 2.2698, "step": 12010 }, { "epoch": 1.3460246360582306, "grad_norm": 4.834831237792969, "learning_rate": 2.7568122433743937e-05, "loss": 1.9009, "step": 12020 }, { "epoch": 1.3471444568868982, "grad_norm": 14.059358596801758, "learning_rate": 2.7549458753266145e-05, "loss": 2.1775, "step": 12030 }, { "epoch": 1.3482642777155656, "grad_norm": 8.378131866455078, "learning_rate": 2.7530795072788357e-05, "loss": 2.1616, "step": 12040 }, { "epoch": 1.349384098544233, "grad_norm": 3.4480719566345215, "learning_rate": 2.7512131392310565e-05, "loss": 2.2743, "step": 12050 }, { "epoch": 1.3505039193729003, "grad_norm": 4.051682472229004, "learning_rate": 2.7493467711832777e-05, "loss": 1.7351, "step": 12060 }, { "epoch": 1.3516237402015676, "grad_norm": 15.0495023727417, "learning_rate": 2.7474804031354982e-05, "loss": 2.3378, "step": 12070 }, { "epoch": 1.3527435610302352, "grad_norm": 12.420659065246582, "learning_rate": 2.7456140350877197e-05, "loss": 2.0799, "step": 12080 }, { "epoch": 1.3538633818589025, "grad_norm": 3.576589345932007, "learning_rate": 2.7437476670399402e-05, "loss": 1.7943, "step": 12090 }, { "epoch": 1.35498320268757, "grad_norm": 9.293567657470703, "learning_rate": 2.7418812989921617e-05, "loss": 2.0307, "step": 12100 }, { "epoch": 1.3561030235162375, "grad_norm": 4.058133125305176, "learning_rate": 2.7400149309443822e-05, "loss": 1.6863, "step": 12110 }, { "epoch": 1.3572228443449048, "grad_norm": 3.99945330619812, "learning_rate": 2.7381485628966037e-05, "loss": 2.2194, "step": 12120 }, { "epoch": 1.3583426651735722, "grad_norm": 10.465315818786621, "learning_rate": 2.7362821948488242e-05, "loss": 1.7545, "step": 12130 }, { "epoch": 1.3594624860022395, "grad_norm": 5.384920120239258, "learning_rate": 2.734415826801045e-05, "loss": 1.6758, "step": 12140 }, { "epoch": 1.360582306830907, "grad_norm": 3.6617019176483154, "learning_rate": 2.7325494587532662e-05, "loss": 1.8267, "step": 12150 }, { "epoch": 1.3617021276595744, "grad_norm": 5.872734069824219, "learning_rate": 2.730683090705487e-05, "loss": 1.7598, "step": 12160 }, { "epoch": 1.362821948488242, "grad_norm": 3.7369675636291504, "learning_rate": 2.7288167226577082e-05, "loss": 1.8391, "step": 12170 }, { "epoch": 1.3639417693169094, "grad_norm": 4.193478107452393, "learning_rate": 2.726950354609929e-05, "loss": 1.9595, "step": 12180 }, { "epoch": 1.3650615901455767, "grad_norm": 11.2186918258667, "learning_rate": 2.7250839865621502e-05, "loss": 2.3907, "step": 12190 }, { "epoch": 1.366181410974244, "grad_norm": 10.962636947631836, "learning_rate": 2.723217618514371e-05, "loss": 1.8998, "step": 12200 }, { "epoch": 1.3673012318029114, "grad_norm": 6.624661922454834, "learning_rate": 2.7213512504665922e-05, "loss": 2.1747, "step": 12210 }, { "epoch": 1.368421052631579, "grad_norm": 3.9043078422546387, "learning_rate": 2.719484882418813e-05, "loss": 1.9947, "step": 12220 }, { "epoch": 1.3695408734602463, "grad_norm": 13.550288200378418, "learning_rate": 2.7176185143710342e-05, "loss": 1.7701, "step": 12230 }, { "epoch": 1.370660694288914, "grad_norm": 3.8633484840393066, "learning_rate": 2.715752146323255e-05, "loss": 1.701, "step": 12240 }, { "epoch": 1.3717805151175813, "grad_norm": 15.98534870147705, "learning_rate": 2.7138857782754762e-05, "loss": 2.2531, "step": 12250 }, { "epoch": 1.3729003359462486, "grad_norm": 3.9303388595581055, "learning_rate": 2.712019410227697e-05, "loss": 2.0902, "step": 12260 }, { "epoch": 1.374020156774916, "grad_norm": 5.423203945159912, "learning_rate": 2.7101530421799182e-05, "loss": 2.3482, "step": 12270 }, { "epoch": 1.3751399776035833, "grad_norm": 12.370367050170898, "learning_rate": 2.708286674132139e-05, "loss": 1.9292, "step": 12280 }, { "epoch": 1.3762597984322509, "grad_norm": 5.228443622589111, "learning_rate": 2.7064203060843602e-05, "loss": 2.1979, "step": 12290 }, { "epoch": 1.3773796192609182, "grad_norm": 5.5691423416137695, "learning_rate": 2.7045539380365807e-05, "loss": 2.3052, "step": 12300 }, { "epoch": 1.3784994400895858, "grad_norm": 4.302522659301758, "learning_rate": 2.7026875699888022e-05, "loss": 2.2908, "step": 12310 }, { "epoch": 1.3796192609182532, "grad_norm": 5.334700584411621, "learning_rate": 2.7008212019410227e-05, "loss": 1.704, "step": 12320 }, { "epoch": 1.3807390817469205, "grad_norm": 6.528292655944824, "learning_rate": 2.6989548338932442e-05, "loss": 2.0202, "step": 12330 }, { "epoch": 1.3818589025755879, "grad_norm": 8.879626274108887, "learning_rate": 2.6970884658454647e-05, "loss": 1.9973, "step": 12340 }, { "epoch": 1.3829787234042552, "grad_norm": 12.133624076843262, "learning_rate": 2.6952220977976862e-05, "loss": 2.0525, "step": 12350 }, { "epoch": 1.3840985442329228, "grad_norm": 3.8778038024902344, "learning_rate": 2.6933557297499067e-05, "loss": 2.7127, "step": 12360 }, { "epoch": 1.3852183650615901, "grad_norm": 12.094010353088379, "learning_rate": 2.6914893617021282e-05, "loss": 2.2501, "step": 12370 }, { "epoch": 1.3863381858902575, "grad_norm": 14.439510345458984, "learning_rate": 2.6896229936543487e-05, "loss": 2.4553, "step": 12380 }, { "epoch": 1.387458006718925, "grad_norm": 13.809215545654297, "learning_rate": 2.6877566256065695e-05, "loss": 1.7811, "step": 12390 }, { "epoch": 1.3885778275475924, "grad_norm": 3.6743390560150146, "learning_rate": 2.6858902575587907e-05, "loss": 1.5438, "step": 12400 }, { "epoch": 1.3896976483762598, "grad_norm": 4.393309116363525, "learning_rate": 2.6840238895110115e-05, "loss": 1.4149, "step": 12410 }, { "epoch": 1.390817469204927, "grad_norm": 8.418529510498047, "learning_rate": 2.6821575214632327e-05, "loss": 1.8214, "step": 12420 }, { "epoch": 1.3919372900335947, "grad_norm": 11.139238357543945, "learning_rate": 2.6802911534154535e-05, "loss": 2.2626, "step": 12430 }, { "epoch": 1.393057110862262, "grad_norm": 7.604578495025635, "learning_rate": 2.6784247853676747e-05, "loss": 2.5468, "step": 12440 }, { "epoch": 1.3941769316909294, "grad_norm": 4.145791053771973, "learning_rate": 2.6765584173198955e-05, "loss": 1.6144, "step": 12450 }, { "epoch": 1.395296752519597, "grad_norm": 6.010091781616211, "learning_rate": 2.6746920492721167e-05, "loss": 2.0923, "step": 12460 }, { "epoch": 1.3964165733482643, "grad_norm": 10.133779525756836, "learning_rate": 2.6728256812243375e-05, "loss": 1.9483, "step": 12470 }, { "epoch": 1.3975363941769317, "grad_norm": 3.6997570991516113, "learning_rate": 2.6709593131765587e-05, "loss": 1.701, "step": 12480 }, { "epoch": 1.398656215005599, "grad_norm": 10.962636947631836, "learning_rate": 2.6690929451287795e-05, "loss": 2.384, "step": 12490 }, { "epoch": 1.3997760358342666, "grad_norm": 3.6743199825286865, "learning_rate": 2.6672265770810007e-05, "loss": 1.9178, "step": 12500 }, { "epoch": 1.400895856662934, "grad_norm": 12.958653450012207, "learning_rate": 2.6653602090332212e-05, "loss": 2.594, "step": 12510 }, { "epoch": 1.4020156774916013, "grad_norm": 11.249685287475586, "learning_rate": 2.6634938409854427e-05, "loss": 1.6907, "step": 12520 }, { "epoch": 1.4031354983202688, "grad_norm": 4.222360134124756, "learning_rate": 2.6616274729376632e-05, "loss": 2.1065, "step": 12530 }, { "epoch": 1.4042553191489362, "grad_norm": 11.4049654006958, "learning_rate": 2.6597611048898847e-05, "loss": 2.0968, "step": 12540 }, { "epoch": 1.4053751399776035, "grad_norm": 4.947079181671143, "learning_rate": 2.6578947368421052e-05, "loss": 1.942, "step": 12550 }, { "epoch": 1.406494960806271, "grad_norm": 11.592204093933105, "learning_rate": 2.6560283687943267e-05, "loss": 1.9633, "step": 12560 }, { "epoch": 1.4076147816349385, "grad_norm": 5.250865459442139, "learning_rate": 2.6541620007465472e-05, "loss": 1.9282, "step": 12570 }, { "epoch": 1.4087346024636058, "grad_norm": 12.925929069519043, "learning_rate": 2.6522956326987687e-05, "loss": 2.3242, "step": 12580 }, { "epoch": 1.4098544232922732, "grad_norm": 13.145225524902344, "learning_rate": 2.6504292646509892e-05, "loss": 2.0153, "step": 12590 }, { "epoch": 1.4109742441209407, "grad_norm": 5.17841100692749, "learning_rate": 2.6485628966032107e-05, "loss": 1.7325, "step": 12600 }, { "epoch": 1.412094064949608, "grad_norm": 4.1737213134765625, "learning_rate": 2.6466965285554312e-05, "loss": 1.8959, "step": 12610 }, { "epoch": 1.4132138857782754, "grad_norm": 14.844003677368164, "learning_rate": 2.644830160507652e-05, "loss": 2.1662, "step": 12620 }, { "epoch": 1.4143337066069428, "grad_norm": 3.8433620929718018, "learning_rate": 2.6429637924598732e-05, "loss": 1.7181, "step": 12630 }, { "epoch": 1.4154535274356104, "grad_norm": 7.718703746795654, "learning_rate": 2.641097424412094e-05, "loss": 1.9043, "step": 12640 }, { "epoch": 1.4165733482642777, "grad_norm": 9.38231086730957, "learning_rate": 2.6392310563643152e-05, "loss": 1.8313, "step": 12650 }, { "epoch": 1.417693169092945, "grad_norm": 13.019353866577148, "learning_rate": 2.637364688316536e-05, "loss": 2.2505, "step": 12660 }, { "epoch": 1.4188129899216126, "grad_norm": 14.78768253326416, "learning_rate": 2.6354983202687572e-05, "loss": 1.8465, "step": 12670 }, { "epoch": 1.41993281075028, "grad_norm": 12.229498863220215, "learning_rate": 2.633631952220978e-05, "loss": 2.4691, "step": 12680 }, { "epoch": 1.4210526315789473, "grad_norm": 4.5396294593811035, "learning_rate": 2.6317655841731992e-05, "loss": 2.0822, "step": 12690 }, { "epoch": 1.4221724524076147, "grad_norm": 11.260706901550293, "learning_rate": 2.62989921612542e-05, "loss": 1.8299, "step": 12700 }, { "epoch": 1.4232922732362823, "grad_norm": 7.562645435333252, "learning_rate": 2.6280328480776412e-05, "loss": 1.9251, "step": 12710 }, { "epoch": 1.4244120940649496, "grad_norm": 12.989692687988281, "learning_rate": 2.6261664800298617e-05, "loss": 1.8278, "step": 12720 }, { "epoch": 1.425531914893617, "grad_norm": 15.355886459350586, "learning_rate": 2.6243001119820832e-05, "loss": 2.2122, "step": 12730 }, { "epoch": 1.4266517357222845, "grad_norm": 4.491844654083252, "learning_rate": 2.6224337439343037e-05, "loss": 1.8286, "step": 12740 }, { "epoch": 1.427771556550952, "grad_norm": 11.244644165039062, "learning_rate": 2.6205673758865252e-05, "loss": 2.2198, "step": 12750 }, { "epoch": 1.4288913773796192, "grad_norm": 4.543248176574707, "learning_rate": 2.6187010078387457e-05, "loss": 2.1489, "step": 12760 }, { "epoch": 1.4300111982082866, "grad_norm": 5.585264205932617, "learning_rate": 2.6168346397909672e-05, "loss": 1.9485, "step": 12770 }, { "epoch": 1.4311310190369542, "grad_norm": 16.626436233520508, "learning_rate": 2.6149682717431877e-05, "loss": 2.0467, "step": 12780 }, { "epoch": 1.4322508398656215, "grad_norm": 5.619150161743164, "learning_rate": 2.6131019036954092e-05, "loss": 2.0983, "step": 12790 }, { "epoch": 1.4333706606942889, "grad_norm": 3.807325839996338, "learning_rate": 2.6112355356476297e-05, "loss": 1.736, "step": 12800 }, { "epoch": 1.4344904815229564, "grad_norm": 16.317922592163086, "learning_rate": 2.6093691675998512e-05, "loss": 2.26, "step": 12810 }, { "epoch": 1.4356103023516238, "grad_norm": 4.438934326171875, "learning_rate": 2.6075027995520717e-05, "loss": 2.2066, "step": 12820 }, { "epoch": 1.4367301231802911, "grad_norm": 15.27676010131836, "learning_rate": 2.605636431504293e-05, "loss": 2.0826, "step": 12830 }, { "epoch": 1.4378499440089585, "grad_norm": 7.73093843460083, "learning_rate": 2.6037700634565137e-05, "loss": 1.9085, "step": 12840 }, { "epoch": 1.4389697648376258, "grad_norm": 12.442554473876953, "learning_rate": 2.6019036954087345e-05, "loss": 2.1945, "step": 12850 }, { "epoch": 1.4400895856662934, "grad_norm": 21.156641006469727, "learning_rate": 2.6000373273609557e-05, "loss": 2.3811, "step": 12860 }, { "epoch": 1.4412094064949608, "grad_norm": 4.013643741607666, "learning_rate": 2.5981709593131765e-05, "loss": 2.0067, "step": 12870 }, { "epoch": 1.4423292273236283, "grad_norm": 4.505249977111816, "learning_rate": 2.5963045912653977e-05, "loss": 2.1332, "step": 12880 }, { "epoch": 1.4434490481522957, "grad_norm": 4.283412456512451, "learning_rate": 2.5944382232176185e-05, "loss": 2.2716, "step": 12890 }, { "epoch": 1.444568868980963, "grad_norm": 9.626873016357422, "learning_rate": 2.5925718551698397e-05, "loss": 2.2156, "step": 12900 }, { "epoch": 1.4456886898096304, "grad_norm": 10.732905387878418, "learning_rate": 2.5907054871220605e-05, "loss": 2.1807, "step": 12910 }, { "epoch": 1.4468085106382977, "grad_norm": 7.605788707733154, "learning_rate": 2.5888391190742817e-05, "loss": 1.9077, "step": 12920 }, { "epoch": 1.4479283314669653, "grad_norm": 3.839841604232788, "learning_rate": 2.5869727510265025e-05, "loss": 1.8521, "step": 12930 }, { "epoch": 1.4490481522956327, "grad_norm": 3.6968777179718018, "learning_rate": 2.5851063829787237e-05, "loss": 1.7962, "step": 12940 }, { "epoch": 1.4501679731243002, "grad_norm": 8.658880233764648, "learning_rate": 2.5832400149309442e-05, "loss": 1.8974, "step": 12950 }, { "epoch": 1.4512877939529676, "grad_norm": 3.764810085296631, "learning_rate": 2.5813736468831657e-05, "loss": 2.2889, "step": 12960 }, { "epoch": 1.452407614781635, "grad_norm": 7.589803218841553, "learning_rate": 2.5795072788353862e-05, "loss": 1.701, "step": 12970 }, { "epoch": 1.4535274356103023, "grad_norm": 15.206584930419922, "learning_rate": 2.5776409107876077e-05, "loss": 2.196, "step": 12980 }, { "epoch": 1.4546472564389696, "grad_norm": 13.450560569763184, "learning_rate": 2.5757745427398282e-05, "loss": 1.7454, "step": 12990 }, { "epoch": 1.4557670772676372, "grad_norm": 9.42431926727295, "learning_rate": 2.5739081746920497e-05, "loss": 2.2713, "step": 13000 }, { "epoch": 1.4568868980963046, "grad_norm": 6.517592906951904, "learning_rate": 2.5720418066442702e-05, "loss": 1.9129, "step": 13010 }, { "epoch": 1.4580067189249721, "grad_norm": 3.8636746406555176, "learning_rate": 2.5701754385964917e-05, "loss": 1.8176, "step": 13020 }, { "epoch": 1.4591265397536395, "grad_norm": 4.5231523513793945, "learning_rate": 2.5683090705487122e-05, "loss": 2.1806, "step": 13030 }, { "epoch": 1.4602463605823068, "grad_norm": 4.154500484466553, "learning_rate": 2.5664427025009334e-05, "loss": 2.2852, "step": 13040 }, { "epoch": 1.4613661814109742, "grad_norm": 4.697875022888184, "learning_rate": 2.5645763344531542e-05, "loss": 2.0672, "step": 13050 }, { "epoch": 1.4624860022396415, "grad_norm": 5.5377702713012695, "learning_rate": 2.5627099664053754e-05, "loss": 1.9248, "step": 13060 }, { "epoch": 1.463605823068309, "grad_norm": 3.9264962673187256, "learning_rate": 2.5608435983575962e-05, "loss": 2.226, "step": 13070 }, { "epoch": 1.4647256438969765, "grad_norm": 4.375810146331787, "learning_rate": 2.558977230309817e-05, "loss": 1.8459, "step": 13080 }, { "epoch": 1.465845464725644, "grad_norm": 9.115229606628418, "learning_rate": 2.5571108622620382e-05, "loss": 2.0442, "step": 13090 }, { "epoch": 1.4669652855543114, "grad_norm": 11.976322174072266, "learning_rate": 2.555244494214259e-05, "loss": 1.6853, "step": 13100 }, { "epoch": 1.4680851063829787, "grad_norm": 4.857337951660156, "learning_rate": 2.5533781261664802e-05, "loss": 2.2852, "step": 13110 }, { "epoch": 1.469204927211646, "grad_norm": 15.375901222229004, "learning_rate": 2.551511758118701e-05, "loss": 2.0554, "step": 13120 }, { "epoch": 1.4703247480403134, "grad_norm": 10.337723731994629, "learning_rate": 2.5496453900709222e-05, "loss": 2.0575, "step": 13130 }, { "epoch": 1.471444568868981, "grad_norm": 9.989140510559082, "learning_rate": 2.547779022023143e-05, "loss": 2.2119, "step": 13140 }, { "epoch": 1.4725643896976484, "grad_norm": 8.458128929138184, "learning_rate": 2.5459126539753642e-05, "loss": 2.3751, "step": 13150 }, { "epoch": 1.4736842105263157, "grad_norm": 10.176783561706543, "learning_rate": 2.5440462859275847e-05, "loss": 1.8748, "step": 13160 }, { "epoch": 1.4748040313549833, "grad_norm": 8.326006889343262, "learning_rate": 2.5421799178798062e-05, "loss": 2.2096, "step": 13170 }, { "epoch": 1.4759238521836506, "grad_norm": 4.975625514984131, "learning_rate": 2.5403135498320267e-05, "loss": 2.3334, "step": 13180 }, { "epoch": 1.477043673012318, "grad_norm": 3.7235939502716064, "learning_rate": 2.5384471817842482e-05, "loss": 1.8743, "step": 13190 }, { "epoch": 1.4781634938409853, "grad_norm": 4.65376615524292, "learning_rate": 2.5365808137364687e-05, "loss": 1.8204, "step": 13200 }, { "epoch": 1.479283314669653, "grad_norm": 5.088535308837891, "learning_rate": 2.5347144456886902e-05, "loss": 1.9513, "step": 13210 }, { "epoch": 1.4804031354983203, "grad_norm": 13.776253700256348, "learning_rate": 2.5328480776409107e-05, "loss": 1.7841, "step": 13220 }, { "epoch": 1.4815229563269876, "grad_norm": 13.486886024475098, "learning_rate": 2.5309817095931322e-05, "loss": 2.1629, "step": 13230 }, { "epoch": 1.4826427771556552, "grad_norm": 3.3172719478607178, "learning_rate": 2.5291153415453527e-05, "loss": 1.7235, "step": 13240 }, { "epoch": 1.4837625979843225, "grad_norm": 3.367083787918091, "learning_rate": 2.5272489734975742e-05, "loss": 2.4903, "step": 13250 }, { "epoch": 1.4848824188129899, "grad_norm": 13.519879341125488, "learning_rate": 2.5253826054497947e-05, "loss": 1.8464, "step": 13260 }, { "epoch": 1.4860022396416572, "grad_norm": 6.741097450256348, "learning_rate": 2.523516237402016e-05, "loss": 1.8458, "step": 13270 }, { "epoch": 1.4871220604703248, "grad_norm": 2.4831910133361816, "learning_rate": 2.5216498693542367e-05, "loss": 1.9829, "step": 13280 }, { "epoch": 1.4882418812989922, "grad_norm": 14.098200798034668, "learning_rate": 2.519783501306458e-05, "loss": 2.2151, "step": 13290 }, { "epoch": 1.4893617021276595, "grad_norm": 9.473960876464844, "learning_rate": 2.5179171332586787e-05, "loss": 2.3018, "step": 13300 }, { "epoch": 1.490481522956327, "grad_norm": 10.849329948425293, "learning_rate": 2.5160507652108995e-05, "loss": 2.3845, "step": 13310 }, { "epoch": 1.4916013437849944, "grad_norm": 5.084878921508789, "learning_rate": 2.5141843971631207e-05, "loss": 1.9048, "step": 13320 }, { "epoch": 1.4927211646136618, "grad_norm": 3.708717107772827, "learning_rate": 2.5123180291153416e-05, "loss": 2.0126, "step": 13330 }, { "epoch": 1.4938409854423291, "grad_norm": 4.616156578063965, "learning_rate": 2.5104516610675627e-05, "loss": 2.2239, "step": 13340 }, { "epoch": 1.4949608062709967, "grad_norm": 4.087489604949951, "learning_rate": 2.5085852930197836e-05, "loss": 2.3669, "step": 13350 }, { "epoch": 1.496080627099664, "grad_norm": 5.101741790771484, "learning_rate": 2.5067189249720047e-05, "loss": 2.1289, "step": 13360 }, { "epoch": 1.4972004479283314, "grad_norm": 4.274835109710693, "learning_rate": 2.5048525569242252e-05, "loss": 2.194, "step": 13370 }, { "epoch": 1.498320268756999, "grad_norm": 8.17119026184082, "learning_rate": 2.5029861888764467e-05, "loss": 1.924, "step": 13380 }, { "epoch": 1.4994400895856663, "grad_norm": 7.936726093292236, "learning_rate": 2.5011198208286672e-05, "loss": 2.0178, "step": 13390 }, { "epoch": 1.5005599104143337, "grad_norm": 4.710428714752197, "learning_rate": 2.4992534527808887e-05, "loss": 2.1567, "step": 13400 }, { "epoch": 1.501679731243001, "grad_norm": 17.4936580657959, "learning_rate": 2.4973870847331096e-05, "loss": 2.1754, "step": 13410 }, { "epoch": 1.5027995520716684, "grad_norm": 3.8005337715148926, "learning_rate": 2.4955207166853304e-05, "loss": 1.3665, "step": 13420 }, { "epoch": 1.503919372900336, "grad_norm": 5.918506145477295, "learning_rate": 2.4936543486375512e-05, "loss": 1.8023, "step": 13430 }, { "epoch": 1.5050391937290035, "grad_norm": 3.53682541847229, "learning_rate": 2.4917879805897724e-05, "loss": 2.1256, "step": 13440 }, { "epoch": 1.5061590145576709, "grad_norm": 7.532880783081055, "learning_rate": 2.4899216125419932e-05, "loss": 1.6287, "step": 13450 }, { "epoch": 1.5072788353863382, "grad_norm": 5.1376800537109375, "learning_rate": 2.4880552444942144e-05, "loss": 2.2407, "step": 13460 }, { "epoch": 1.5083986562150056, "grad_norm": 3.9931938648223877, "learning_rate": 2.4861888764464352e-05, "loss": 2.172, "step": 13470 }, { "epoch": 1.509518477043673, "grad_norm": 6.361204147338867, "learning_rate": 2.4843225083986564e-05, "loss": 1.7437, "step": 13480 }, { "epoch": 1.5106382978723403, "grad_norm": 7.249978542327881, "learning_rate": 2.4824561403508772e-05, "loss": 1.8938, "step": 13490 }, { "epoch": 1.5117581187010078, "grad_norm": 1.6755268573760986, "learning_rate": 2.4805897723030984e-05, "loss": 1.9075, "step": 13500 }, { "epoch": 1.5128779395296752, "grad_norm": 3.8444862365722656, "learning_rate": 2.4787234042553192e-05, "loss": 2.2699, "step": 13510 }, { "epoch": 1.5139977603583428, "grad_norm": 5.975175380706787, "learning_rate": 2.4768570362075404e-05, "loss": 1.9212, "step": 13520 }, { "epoch": 1.5151175811870101, "grad_norm": 8.798042297363281, "learning_rate": 2.4749906681597612e-05, "loss": 2.1584, "step": 13530 }, { "epoch": 1.5162374020156775, "grad_norm": 12.09782600402832, "learning_rate": 2.473124300111982e-05, "loss": 2.3702, "step": 13540 }, { "epoch": 1.5173572228443448, "grad_norm": 6.850448131561279, "learning_rate": 2.4712579320642032e-05, "loss": 1.9944, "step": 13550 }, { "epoch": 1.5184770436730122, "grad_norm": 9.481196403503418, "learning_rate": 2.469391564016424e-05, "loss": 1.9876, "step": 13560 }, { "epoch": 1.5195968645016797, "grad_norm": 3.648925304412842, "learning_rate": 2.4675251959686452e-05, "loss": 2.1083, "step": 13570 }, { "epoch": 1.520716685330347, "grad_norm": 4.310310363769531, "learning_rate": 2.465658827920866e-05, "loss": 1.7789, "step": 13580 }, { "epoch": 1.5218365061590147, "grad_norm": 6.024844646453857, "learning_rate": 2.4637924598730872e-05, "loss": 2.0254, "step": 13590 }, { "epoch": 1.522956326987682, "grad_norm": 9.881074905395508, "learning_rate": 2.461926091825308e-05, "loss": 2.0021, "step": 13600 }, { "epoch": 1.5240761478163494, "grad_norm": 17.45454216003418, "learning_rate": 2.4600597237775292e-05, "loss": 1.8438, "step": 13610 }, { "epoch": 1.5251959686450167, "grad_norm": 11.487627029418945, "learning_rate": 2.45819335572975e-05, "loss": 2.1005, "step": 13620 }, { "epoch": 1.526315789473684, "grad_norm": 4.326568126678467, "learning_rate": 2.4563269876819712e-05, "loss": 2.3081, "step": 13630 }, { "epoch": 1.5274356103023516, "grad_norm": 4.558961868286133, "learning_rate": 2.454460619634192e-05, "loss": 1.6342, "step": 13640 }, { "epoch": 1.528555431131019, "grad_norm": 5.035207748413086, "learning_rate": 2.4525942515864132e-05, "loss": 1.785, "step": 13650 }, { "epoch": 1.5296752519596866, "grad_norm": 13.78886604309082, "learning_rate": 2.4507278835386337e-05, "loss": 2.0897, "step": 13660 }, { "epoch": 1.530795072788354, "grad_norm": 9.688088417053223, "learning_rate": 2.448861515490855e-05, "loss": 1.8179, "step": 13670 }, { "epoch": 1.5319148936170213, "grad_norm": 5.47663688659668, "learning_rate": 2.4469951474430757e-05, "loss": 2.2184, "step": 13680 }, { "epoch": 1.5330347144456886, "grad_norm": 16.983068466186523, "learning_rate": 2.445128779395297e-05, "loss": 2.1049, "step": 13690 }, { "epoch": 1.534154535274356, "grad_norm": 4.000819683074951, "learning_rate": 2.4432624113475177e-05, "loss": 1.8282, "step": 13700 }, { "epoch": 1.5352743561030235, "grad_norm": 8.065638542175293, "learning_rate": 2.441396043299739e-05, "loss": 1.8497, "step": 13710 }, { "epoch": 1.536394176931691, "grad_norm": 7.793942451477051, "learning_rate": 2.4395296752519597e-05, "loss": 1.6021, "step": 13720 }, { "epoch": 1.5375139977603585, "grad_norm": 4.013867378234863, "learning_rate": 2.437663307204181e-05, "loss": 1.585, "step": 13730 }, { "epoch": 1.5386338185890258, "grad_norm": 5.563870429992676, "learning_rate": 2.4357969391564017e-05, "loss": 1.7468, "step": 13740 }, { "epoch": 1.5397536394176932, "grad_norm": 3.9095380306243896, "learning_rate": 2.4339305711086226e-05, "loss": 1.8195, "step": 13750 }, { "epoch": 1.5408734602463605, "grad_norm": 3.52274489402771, "learning_rate": 2.4320642030608437e-05, "loss": 2.2316, "step": 13760 }, { "epoch": 1.5419932810750279, "grad_norm": 8.85901927947998, "learning_rate": 2.4301978350130646e-05, "loss": 1.933, "step": 13770 }, { "epoch": 1.5431131019036954, "grad_norm": 6.702213764190674, "learning_rate": 2.4283314669652857e-05, "loss": 1.9826, "step": 13780 }, { "epoch": 1.5442329227323628, "grad_norm": 16.482467651367188, "learning_rate": 2.4264650989175066e-05, "loss": 1.6521, "step": 13790 }, { "epoch": 1.5453527435610304, "grad_norm": 9.306038856506348, "learning_rate": 2.4245987308697277e-05, "loss": 1.6685, "step": 13800 }, { "epoch": 1.5464725643896977, "grad_norm": 15.01627254486084, "learning_rate": 2.4227323628219486e-05, "loss": 1.9639, "step": 13810 }, { "epoch": 1.547592385218365, "grad_norm": 6.116465091705322, "learning_rate": 2.4208659947741697e-05, "loss": 1.8788, "step": 13820 }, { "epoch": 1.5487122060470324, "grad_norm": 8.37788200378418, "learning_rate": 2.4189996267263906e-05, "loss": 2.2028, "step": 13830 }, { "epoch": 1.5498320268756998, "grad_norm": 11.314355850219727, "learning_rate": 2.4171332586786117e-05, "loss": 2.0903, "step": 13840 }, { "epoch": 1.5509518477043673, "grad_norm": 4.621204853057861, "learning_rate": 2.4152668906308326e-05, "loss": 1.9054, "step": 13850 }, { "epoch": 1.5520716685330347, "grad_norm": 5.006560325622559, "learning_rate": 2.4134005225830537e-05, "loss": 1.8487, "step": 13860 }, { "epoch": 1.5531914893617023, "grad_norm": 6.626319885253906, "learning_rate": 2.4115341545352746e-05, "loss": 1.9409, "step": 13870 }, { "epoch": 1.5543113101903696, "grad_norm": 2.9766428470611572, "learning_rate": 2.4096677864874957e-05, "loss": 1.5523, "step": 13880 }, { "epoch": 1.555431131019037, "grad_norm": 15.116016387939453, "learning_rate": 2.4078014184397162e-05, "loss": 2.1559, "step": 13890 }, { "epoch": 1.5565509518477043, "grad_norm": 4.559194564819336, "learning_rate": 2.4059350503919374e-05, "loss": 1.7988, "step": 13900 }, { "epoch": 1.5576707726763717, "grad_norm": 8.903999328613281, "learning_rate": 2.4040686823441582e-05, "loss": 2.2537, "step": 13910 }, { "epoch": 1.5587905935050392, "grad_norm": 17.303340911865234, "learning_rate": 2.4022023142963794e-05, "loss": 1.9898, "step": 13920 }, { "epoch": 1.5599104143337066, "grad_norm": 5.961864948272705, "learning_rate": 2.4003359462486002e-05, "loss": 2.0574, "step": 13930 }, { "epoch": 1.5610302351623742, "grad_norm": 14.988414764404297, "learning_rate": 2.3984695782008214e-05, "loss": 2.1711, "step": 13940 }, { "epoch": 1.5621500559910415, "grad_norm": 4.600130081176758, "learning_rate": 2.3966032101530422e-05, "loss": 2.644, "step": 13950 }, { "epoch": 1.5632698768197089, "grad_norm": 4.028290271759033, "learning_rate": 2.394736842105263e-05, "loss": 2.007, "step": 13960 }, { "epoch": 1.5643896976483762, "grad_norm": 16.717845916748047, "learning_rate": 2.3928704740574842e-05, "loss": 2.1179, "step": 13970 }, { "epoch": 1.5655095184770436, "grad_norm": 13.442608833312988, "learning_rate": 2.391004106009705e-05, "loss": 2.061, "step": 13980 }, { "epoch": 1.5666293393057111, "grad_norm": 4.753323078155518, "learning_rate": 2.3891377379619262e-05, "loss": 2.1708, "step": 13990 }, { "epoch": 1.5677491601343785, "grad_norm": 4.6569600105285645, "learning_rate": 2.387271369914147e-05, "loss": 1.9684, "step": 14000 }, { "epoch": 1.568868980963046, "grad_norm": 3.8971803188323975, "learning_rate": 2.3854050018663682e-05, "loss": 2.4244, "step": 14010 }, { "epoch": 1.5699888017917134, "grad_norm": 4.851494789123535, "learning_rate": 2.383538633818589e-05, "loss": 2.0636, "step": 14020 }, { "epoch": 1.5711086226203808, "grad_norm": 3.54333758354187, "learning_rate": 2.3816722657708102e-05, "loss": 1.9378, "step": 14030 }, { "epoch": 1.572228443449048, "grad_norm": 6.895486831665039, "learning_rate": 2.379805897723031e-05, "loss": 1.7275, "step": 14040 }, { "epoch": 1.5733482642777155, "grad_norm": 4.1524739265441895, "learning_rate": 2.3779395296752522e-05, "loss": 2.2593, "step": 14050 }, { "epoch": 1.574468085106383, "grad_norm": 4.263177394866943, "learning_rate": 2.376073161627473e-05, "loss": 1.8631, "step": 14060 }, { "epoch": 1.5755879059350504, "grad_norm": 14.865904808044434, "learning_rate": 2.3742067935796942e-05, "loss": 1.6639, "step": 14070 }, { "epoch": 1.576707726763718, "grad_norm": 4.426602840423584, "learning_rate": 2.372340425531915e-05, "loss": 1.987, "step": 14080 }, { "epoch": 1.5778275475923853, "grad_norm": 4.001722812652588, "learning_rate": 2.3704740574841362e-05, "loss": 2.0681, "step": 14090 }, { "epoch": 1.5789473684210527, "grad_norm": 3.0097544193267822, "learning_rate": 2.368607689436357e-05, "loss": 2.3647, "step": 14100 }, { "epoch": 1.58006718924972, "grad_norm": 3.5997202396392822, "learning_rate": 2.3667413213885782e-05, "loss": 2.1803, "step": 14110 }, { "epoch": 1.5811870100783874, "grad_norm": 4.4173784255981445, "learning_rate": 2.3648749533407987e-05, "loss": 1.9483, "step": 14120 }, { "epoch": 1.5823068309070547, "grad_norm": 10.679607391357422, "learning_rate": 2.36300858529302e-05, "loss": 2.2298, "step": 14130 }, { "epoch": 1.5834266517357223, "grad_norm": 11.639561653137207, "learning_rate": 2.3611422172452407e-05, "loss": 1.6313, "step": 14140 }, { "epoch": 1.5845464725643899, "grad_norm": 7.233905792236328, "learning_rate": 2.359275849197462e-05, "loss": 2.2515, "step": 14150 }, { "epoch": 1.5856662933930572, "grad_norm": 9.425190925598145, "learning_rate": 2.3574094811496827e-05, "loss": 1.7644, "step": 14160 }, { "epoch": 1.5867861142217246, "grad_norm": 13.162445068359375, "learning_rate": 2.355543113101904e-05, "loss": 1.937, "step": 14170 }, { "epoch": 1.587905935050392, "grad_norm": 5.631926536560059, "learning_rate": 2.3536767450541247e-05, "loss": 1.7986, "step": 14180 }, { "epoch": 1.5890257558790593, "grad_norm": 14.373632431030273, "learning_rate": 2.3518103770063456e-05, "loss": 1.9261, "step": 14190 }, { "epoch": 1.5901455767077266, "grad_norm": 10.449909210205078, "learning_rate": 2.3499440089585667e-05, "loss": 2.1721, "step": 14200 }, { "epoch": 1.5912653975363942, "grad_norm": 3.89623761177063, "learning_rate": 2.3480776409107876e-05, "loss": 2.0745, "step": 14210 }, { "epoch": 1.5923852183650617, "grad_norm": 4.33968448638916, "learning_rate": 2.3462112728630087e-05, "loss": 1.9008, "step": 14220 }, { "epoch": 1.593505039193729, "grad_norm": 9.331836700439453, "learning_rate": 2.3443449048152296e-05, "loss": 2.0979, "step": 14230 }, { "epoch": 1.5946248600223965, "grad_norm": 10.781360626220703, "learning_rate": 2.3424785367674507e-05, "loss": 2.2364, "step": 14240 }, { "epoch": 1.5957446808510638, "grad_norm": 5.107909679412842, "learning_rate": 2.3406121687196716e-05, "loss": 2.0156, "step": 14250 }, { "epoch": 1.5968645016797312, "grad_norm": 3.5812559127807617, "learning_rate": 2.3387458006718927e-05, "loss": 2.2581, "step": 14260 }, { "epoch": 1.5979843225083985, "grad_norm": 13.384634017944336, "learning_rate": 2.3368794326241136e-05, "loss": 2.1169, "step": 14270 }, { "epoch": 1.599104143337066, "grad_norm": 5.330173969268799, "learning_rate": 2.3350130645763347e-05, "loss": 1.7373, "step": 14280 }, { "epoch": 1.6002239641657336, "grad_norm": 7.670846462249756, "learning_rate": 2.3331466965285556e-05, "loss": 1.9934, "step": 14290 }, { "epoch": 1.601343784994401, "grad_norm": 4.610497951507568, "learning_rate": 2.3312803284807767e-05, "loss": 1.5182, "step": 14300 }, { "epoch": 1.6024636058230683, "grad_norm": 7.437980651855469, "learning_rate": 2.3294139604329976e-05, "loss": 1.7472, "step": 14310 }, { "epoch": 1.6035834266517357, "grad_norm": 6.804001808166504, "learning_rate": 2.3275475923852187e-05, "loss": 1.5694, "step": 14320 }, { "epoch": 1.604703247480403, "grad_norm": 11.887704849243164, "learning_rate": 2.3256812243374396e-05, "loss": 1.7139, "step": 14330 }, { "epoch": 1.6058230683090704, "grad_norm": 3.467298746109009, "learning_rate": 2.3238148562896604e-05, "loss": 2.1763, "step": 14340 }, { "epoch": 1.606942889137738, "grad_norm": 8.796690940856934, "learning_rate": 2.3219484882418816e-05, "loss": 1.9841, "step": 14350 }, { "epoch": 1.6080627099664053, "grad_norm": 4.704202651977539, "learning_rate": 2.3200821201941024e-05, "loss": 2.0089, "step": 14360 }, { "epoch": 1.609182530795073, "grad_norm": 7.0840163230896, "learning_rate": 2.3182157521463232e-05, "loss": 1.9829, "step": 14370 }, { "epoch": 1.6103023516237402, "grad_norm": 4.396951198577881, "learning_rate": 2.3163493840985444e-05, "loss": 1.8063, "step": 14380 }, { "epoch": 1.6114221724524076, "grad_norm": 15.083405494689941, "learning_rate": 2.3144830160507652e-05, "loss": 1.6894, "step": 14390 }, { "epoch": 1.612541993281075, "grad_norm": 4.376276969909668, "learning_rate": 2.312616648002986e-05, "loss": 1.8177, "step": 14400 }, { "epoch": 1.6136618141097423, "grad_norm": 5.626823902130127, "learning_rate": 2.3107502799552072e-05, "loss": 2.002, "step": 14410 }, { "epoch": 1.6147816349384099, "grad_norm": 6.14142370223999, "learning_rate": 2.308883911907428e-05, "loss": 1.798, "step": 14420 }, { "epoch": 1.6159014557670772, "grad_norm": 4.125568866729736, "learning_rate": 2.3070175438596492e-05, "loss": 1.9081, "step": 14430 }, { "epoch": 1.6170212765957448, "grad_norm": 7.834494590759277, "learning_rate": 2.30515117581187e-05, "loss": 2.014, "step": 14440 }, { "epoch": 1.6181410974244121, "grad_norm": 14.719797134399414, "learning_rate": 2.3032848077640912e-05, "loss": 2.1659, "step": 14450 }, { "epoch": 1.6192609182530795, "grad_norm": 10.502878189086914, "learning_rate": 2.301418439716312e-05, "loss": 2.0006, "step": 14460 }, { "epoch": 1.6203807390817468, "grad_norm": 4.505667686462402, "learning_rate": 2.2995520716685332e-05, "loss": 2.0919, "step": 14470 }, { "epoch": 1.6215005599104142, "grad_norm": 8.213534355163574, "learning_rate": 2.297685703620754e-05, "loss": 1.8744, "step": 14480 }, { "epoch": 1.6226203807390818, "grad_norm": 3.4308199882507324, "learning_rate": 2.2958193355729752e-05, "loss": 2.0321, "step": 14490 }, { "epoch": 1.6237402015677491, "grad_norm": 8.835275650024414, "learning_rate": 2.293952967525196e-05, "loss": 2.0342, "step": 14500 }, { "epoch": 1.6248600223964167, "grad_norm": 13.789974212646484, "learning_rate": 2.2920865994774172e-05, "loss": 2.1533, "step": 14510 }, { "epoch": 1.625979843225084, "grad_norm": 4.554170608520508, "learning_rate": 2.290220231429638e-05, "loss": 1.8698, "step": 14520 }, { "epoch": 1.6270996640537514, "grad_norm": 4.612897872924805, "learning_rate": 2.2883538633818592e-05, "loss": 1.8866, "step": 14530 }, { "epoch": 1.6282194848824187, "grad_norm": 3.9619531631469727, "learning_rate": 2.28648749533408e-05, "loss": 1.795, "step": 14540 }, { "epoch": 1.629339305711086, "grad_norm": 13.929571151733398, "learning_rate": 2.2846211272863012e-05, "loss": 1.7292, "step": 14550 }, { "epoch": 1.6304591265397537, "grad_norm": 3.8777716159820557, "learning_rate": 2.282754759238522e-05, "loss": 2.1625, "step": 14560 }, { "epoch": 1.631578947368421, "grad_norm": 10.221095085144043, "learning_rate": 2.280888391190743e-05, "loss": 1.9096, "step": 14570 }, { "epoch": 1.6326987681970886, "grad_norm": 11.300918579101562, "learning_rate": 2.279022023142964e-05, "loss": 2.0733, "step": 14580 }, { "epoch": 1.633818589025756, "grad_norm": 3.6058292388916016, "learning_rate": 2.277155655095185e-05, "loss": 1.7012, "step": 14590 }, { "epoch": 1.6349384098544233, "grad_norm": 8.574664115905762, "learning_rate": 2.2752892870474057e-05, "loss": 1.7154, "step": 14600 }, { "epoch": 1.6360582306830906, "grad_norm": 7.9404096603393555, "learning_rate": 2.2734229189996266e-05, "loss": 2.0302, "step": 14610 }, { "epoch": 1.637178051511758, "grad_norm": 4.232813358306885, "learning_rate": 2.2715565509518477e-05, "loss": 2.2532, "step": 14620 }, { "epoch": 1.6382978723404256, "grad_norm": 4.7303643226623535, "learning_rate": 2.2696901829040686e-05, "loss": 1.6629, "step": 14630 }, { "epoch": 1.639417693169093, "grad_norm": 10.848748207092285, "learning_rate": 2.2678238148562897e-05, "loss": 2.2253, "step": 14640 }, { "epoch": 1.6405375139977605, "grad_norm": 13.379515647888184, "learning_rate": 2.2659574468085106e-05, "loss": 1.9358, "step": 14650 }, { "epoch": 1.6416573348264278, "grad_norm": 3.6262221336364746, "learning_rate": 2.2640910787607317e-05, "loss": 1.698, "step": 14660 }, { "epoch": 1.6427771556550952, "grad_norm": 4.654207229614258, "learning_rate": 2.2622247107129526e-05, "loss": 1.6241, "step": 14670 }, { "epoch": 1.6438969764837625, "grad_norm": 18.657798767089844, "learning_rate": 2.2603583426651737e-05, "loss": 2.1006, "step": 14680 }, { "epoch": 1.64501679731243, "grad_norm": 11.848627090454102, "learning_rate": 2.2584919746173946e-05, "loss": 2.1113, "step": 14690 }, { "epoch": 1.6461366181410975, "grad_norm": 9.06197738647461, "learning_rate": 2.2566256065696157e-05, "loss": 1.6563, "step": 14700 }, { "epoch": 1.6472564389697648, "grad_norm": 13.184223175048828, "learning_rate": 2.2547592385218366e-05, "loss": 2.3761, "step": 14710 }, { "epoch": 1.6483762597984324, "grad_norm": 11.905593872070312, "learning_rate": 2.2528928704740577e-05, "loss": 2.0665, "step": 14720 }, { "epoch": 1.6494960806270997, "grad_norm": 3.726668357849121, "learning_rate": 2.2510265024262786e-05, "loss": 1.7046, "step": 14730 }, { "epoch": 1.650615901455767, "grad_norm": 8.543424606323242, "learning_rate": 2.2491601343784997e-05, "loss": 1.8803, "step": 14740 }, { "epoch": 1.6517357222844344, "grad_norm": 7.099303245544434, "learning_rate": 2.2472937663307206e-05, "loss": 1.9246, "step": 14750 }, { "epoch": 1.6528555431131018, "grad_norm": 11.42622184753418, "learning_rate": 2.2454273982829417e-05, "loss": 1.7691, "step": 14760 }, { "epoch": 1.6539753639417694, "grad_norm": 11.167017936706543, "learning_rate": 2.2435610302351626e-05, "loss": 2.0076, "step": 14770 }, { "epoch": 1.6550951847704367, "grad_norm": 3.956162214279175, "learning_rate": 2.2416946621873834e-05, "loss": 2.3021, "step": 14780 }, { "epoch": 1.6562150055991043, "grad_norm": 7.405086994171143, "learning_rate": 2.2398282941396046e-05, "loss": 2.2505, "step": 14790 }, { "epoch": 1.6573348264277716, "grad_norm": 6.949024200439453, "learning_rate": 2.2379619260918254e-05, "loss": 1.6625, "step": 14800 }, { "epoch": 1.658454647256439, "grad_norm": 5.116047382354736, "learning_rate": 2.2360955580440466e-05, "loss": 2.0589, "step": 14810 }, { "epoch": 1.6595744680851063, "grad_norm": 4.537695407867432, "learning_rate": 2.2342291899962674e-05, "loss": 1.7413, "step": 14820 }, { "epoch": 1.6606942889137737, "grad_norm": 4.4663004875183105, "learning_rate": 2.2323628219484882e-05, "loss": 1.8535, "step": 14830 }, { "epoch": 1.6618141097424413, "grad_norm": 4.609202861785889, "learning_rate": 2.230496453900709e-05, "loss": 2.2902, "step": 14840 }, { "epoch": 1.6629339305711086, "grad_norm": 4.477583885192871, "learning_rate": 2.2286300858529302e-05, "loss": 2.2012, "step": 14850 }, { "epoch": 1.6640537513997762, "grad_norm": 19.314029693603516, "learning_rate": 2.226763717805151e-05, "loss": 2.3641, "step": 14860 }, { "epoch": 1.6651735722284435, "grad_norm": 7.962518692016602, "learning_rate": 2.2248973497573722e-05, "loss": 2.1222, "step": 14870 }, { "epoch": 1.6662933930571109, "grad_norm": 4.907433986663818, "learning_rate": 2.223030981709593e-05, "loss": 1.8145, "step": 14880 }, { "epoch": 1.6674132138857782, "grad_norm": 8.89202880859375, "learning_rate": 2.2211646136618142e-05, "loss": 2.2891, "step": 14890 }, { "epoch": 1.6685330347144456, "grad_norm": 9.831536293029785, "learning_rate": 2.219298245614035e-05, "loss": 2.0534, "step": 14900 }, { "epoch": 1.6696528555431132, "grad_norm": 3.9551281929016113, "learning_rate": 2.2174318775662562e-05, "loss": 1.6471, "step": 14910 }, { "epoch": 1.6707726763717805, "grad_norm": 4.45933723449707, "learning_rate": 2.215565509518477e-05, "loss": 2.2105, "step": 14920 }, { "epoch": 1.671892497200448, "grad_norm": 4.2659783363342285, "learning_rate": 2.2136991414706982e-05, "loss": 1.9115, "step": 14930 }, { "epoch": 1.6730123180291154, "grad_norm": 5.429946422576904, "learning_rate": 2.211832773422919e-05, "loss": 1.9213, "step": 14940 }, { "epoch": 1.6741321388577828, "grad_norm": 12.490592956542969, "learning_rate": 2.2099664053751402e-05, "loss": 2.1818, "step": 14950 }, { "epoch": 1.6752519596864501, "grad_norm": 5.016933917999268, "learning_rate": 2.208100037327361e-05, "loss": 2.3414, "step": 14960 }, { "epoch": 1.6763717805151175, "grad_norm": 11.83879566192627, "learning_rate": 2.2062336692795822e-05, "loss": 1.5455, "step": 14970 }, { "epoch": 1.6774916013437848, "grad_norm": 5.847216606140137, "learning_rate": 2.204367301231803e-05, "loss": 1.9266, "step": 14980 }, { "epoch": 1.6786114221724524, "grad_norm": 5.979493141174316, "learning_rate": 2.202500933184024e-05, "loss": 1.5915, "step": 14990 }, { "epoch": 1.67973124300112, "grad_norm": 10.372356414794922, "learning_rate": 2.200634565136245e-05, "loss": 1.5493, "step": 15000 }, { "epoch": 1.6808510638297873, "grad_norm": 5.963084697723389, "learning_rate": 2.198768197088466e-05, "loss": 1.9619, "step": 15010 }, { "epoch": 1.6819708846584547, "grad_norm": 10.619939804077148, "learning_rate": 2.196901829040687e-05, "loss": 2.351, "step": 15020 }, { "epoch": 1.683090705487122, "grad_norm": 4.406311511993408, "learning_rate": 2.195035460992908e-05, "loss": 1.7913, "step": 15030 }, { "epoch": 1.6842105263157894, "grad_norm": 4.74340295791626, "learning_rate": 2.193169092945129e-05, "loss": 1.6961, "step": 15040 }, { "epoch": 1.6853303471444567, "grad_norm": 10.785073280334473, "learning_rate": 2.1913027248973496e-05, "loss": 1.6512, "step": 15050 }, { "epoch": 1.6864501679731243, "grad_norm": 7.105363368988037, "learning_rate": 2.1894363568495707e-05, "loss": 1.8099, "step": 15060 }, { "epoch": 1.6875699888017919, "grad_norm": 4.944157123565674, "learning_rate": 2.1875699888017916e-05, "loss": 1.9102, "step": 15070 }, { "epoch": 1.6886898096304592, "grad_norm": 4.357661724090576, "learning_rate": 2.1857036207540127e-05, "loss": 2.1368, "step": 15080 }, { "epoch": 1.6898096304591266, "grad_norm": 4.5606207847595215, "learning_rate": 2.1838372527062336e-05, "loss": 1.9584, "step": 15090 }, { "epoch": 1.690929451287794, "grad_norm": 9.327258110046387, "learning_rate": 2.1819708846584547e-05, "loss": 1.9882, "step": 15100 }, { "epoch": 1.6920492721164613, "grad_norm": 4.126927375793457, "learning_rate": 2.1801045166106756e-05, "loss": 2.4206, "step": 15110 }, { "epoch": 1.6931690929451286, "grad_norm": 16.299972534179688, "learning_rate": 2.1782381485628967e-05, "loss": 2.2734, "step": 15120 }, { "epoch": 1.6942889137737962, "grad_norm": 7.43623685836792, "learning_rate": 2.1763717805151176e-05, "loss": 1.7911, "step": 15130 }, { "epoch": 1.6954087346024636, "grad_norm": 4.106508255004883, "learning_rate": 2.1745054124673387e-05, "loss": 2.1128, "step": 15140 }, { "epoch": 1.6965285554311311, "grad_norm": 13.86871337890625, "learning_rate": 2.1726390444195596e-05, "loss": 1.7943, "step": 15150 }, { "epoch": 1.6976483762597985, "grad_norm": 6.735291957855225, "learning_rate": 2.1707726763717807e-05, "loss": 1.5294, "step": 15160 }, { "epoch": 1.6987681970884658, "grad_norm": 5.739629745483398, "learning_rate": 2.1689063083240016e-05, "loss": 2.0369, "step": 15170 }, { "epoch": 1.6998880179171332, "grad_norm": 5.946849822998047, "learning_rate": 2.1670399402762227e-05, "loss": 2.0375, "step": 15180 }, { "epoch": 1.7010078387458005, "grad_norm": 4.547854423522949, "learning_rate": 2.1651735722284436e-05, "loss": 2.0474, "step": 15190 }, { "epoch": 1.702127659574468, "grad_norm": 6.0930070877075195, "learning_rate": 2.1633072041806644e-05, "loss": 1.7421, "step": 15200 }, { "epoch": 1.7032474804031354, "grad_norm": 4.635743141174316, "learning_rate": 2.1614408361328856e-05, "loss": 2.3968, "step": 15210 }, { "epoch": 1.704367301231803, "grad_norm": 2.2271034717559814, "learning_rate": 2.1595744680851064e-05, "loss": 1.8041, "step": 15220 }, { "epoch": 1.7054871220604704, "grad_norm": 6.688762664794922, "learning_rate": 2.1577081000373276e-05, "loss": 1.8699, "step": 15230 }, { "epoch": 1.7066069428891377, "grad_norm": 4.520251274108887, "learning_rate": 2.1558417319895484e-05, "loss": 1.7934, "step": 15240 }, { "epoch": 1.707726763717805, "grad_norm": 5.595422744750977, "learning_rate": 2.1539753639417696e-05, "loss": 1.8382, "step": 15250 }, { "epoch": 1.7088465845464724, "grad_norm": 10.029720306396484, "learning_rate": 2.1521089958939904e-05, "loss": 1.9193, "step": 15260 }, { "epoch": 1.70996640537514, "grad_norm": 5.297349452972412, "learning_rate": 2.1502426278462116e-05, "loss": 2.0587, "step": 15270 }, { "epoch": 1.7110862262038073, "grad_norm": 16.516834259033203, "learning_rate": 2.1483762597984324e-05, "loss": 2.0523, "step": 15280 }, { "epoch": 1.712206047032475, "grad_norm": 3.686732292175293, "learning_rate": 2.1465098917506532e-05, "loss": 1.8393, "step": 15290 }, { "epoch": 1.7133258678611423, "grad_norm": 8.316386222839355, "learning_rate": 2.144643523702874e-05, "loss": 2.1096, "step": 15300 }, { "epoch": 1.7144456886898096, "grad_norm": 14.509235382080078, "learning_rate": 2.1427771556550952e-05, "loss": 2.2452, "step": 15310 }, { "epoch": 1.715565509518477, "grad_norm": 12.271526336669922, "learning_rate": 2.140910787607316e-05, "loss": 1.8003, "step": 15320 }, { "epoch": 1.7166853303471443, "grad_norm": 16.485271453857422, "learning_rate": 2.1390444195595372e-05, "loss": 1.8001, "step": 15330 }, { "epoch": 1.717805151175812, "grad_norm": 4.867336273193359, "learning_rate": 2.137178051511758e-05, "loss": 1.8425, "step": 15340 }, { "epoch": 1.7189249720044792, "grad_norm": 3.5718979835510254, "learning_rate": 2.1353116834639792e-05, "loss": 1.6484, "step": 15350 }, { "epoch": 1.7200447928331468, "grad_norm": 26.389127731323242, "learning_rate": 2.1334453154162e-05, "loss": 1.7185, "step": 15360 }, { "epoch": 1.7211646136618142, "grad_norm": 4.237075328826904, "learning_rate": 2.1315789473684212e-05, "loss": 1.8523, "step": 15370 }, { "epoch": 1.7222844344904815, "grad_norm": 11.237632751464844, "learning_rate": 2.129712579320642e-05, "loss": 2.3631, "step": 15380 }, { "epoch": 1.7234042553191489, "grad_norm": 4.580799579620361, "learning_rate": 2.1278462112728632e-05, "loss": 2.2933, "step": 15390 }, { "epoch": 1.7245240761478162, "grad_norm": 5.851457118988037, "learning_rate": 2.125979843225084e-05, "loss": 1.9909, "step": 15400 }, { "epoch": 1.7256438969764838, "grad_norm": 4.036518573760986, "learning_rate": 2.1241134751773052e-05, "loss": 2.1115, "step": 15410 }, { "epoch": 1.7267637178051511, "grad_norm": 10.545909881591797, "learning_rate": 2.122247107129526e-05, "loss": 1.7327, "step": 15420 }, { "epoch": 1.7278835386338187, "grad_norm": 1.6649363040924072, "learning_rate": 2.120380739081747e-05, "loss": 1.7729, "step": 15430 }, { "epoch": 1.729003359462486, "grad_norm": 10.285140991210938, "learning_rate": 2.118514371033968e-05, "loss": 1.9684, "step": 15440 }, { "epoch": 1.7301231802911534, "grad_norm": 10.789081573486328, "learning_rate": 2.116648002986189e-05, "loss": 1.9455, "step": 15450 }, { "epoch": 1.7312430011198208, "grad_norm": 12.79870891571045, "learning_rate": 2.11478163493841e-05, "loss": 2.0917, "step": 15460 }, { "epoch": 1.7323628219484881, "grad_norm": 7.7222065925598145, "learning_rate": 2.112915266890631e-05, "loss": 2.064, "step": 15470 }, { "epoch": 1.7334826427771557, "grad_norm": 4.771847724914551, "learning_rate": 2.111048898842852e-05, "loss": 1.9891, "step": 15480 }, { "epoch": 1.734602463605823, "grad_norm": 3.90159273147583, "learning_rate": 2.109182530795073e-05, "loss": 2.0179, "step": 15490 }, { "epoch": 1.7357222844344906, "grad_norm": 7.232120037078857, "learning_rate": 2.107316162747294e-05, "loss": 2.3234, "step": 15500 }, { "epoch": 1.736842105263158, "grad_norm": 5.076690196990967, "learning_rate": 2.105449794699515e-05, "loss": 2.2486, "step": 15510 }, { "epoch": 1.7379619260918253, "grad_norm": 12.581092834472656, "learning_rate": 2.1035834266517357e-05, "loss": 2.1017, "step": 15520 }, { "epoch": 1.7390817469204927, "grad_norm": 7.462939262390137, "learning_rate": 2.1017170586039566e-05, "loss": 1.8854, "step": 15530 }, { "epoch": 1.74020156774916, "grad_norm": 5.599474906921387, "learning_rate": 2.0998506905561777e-05, "loss": 1.987, "step": 15540 }, { "epoch": 1.7413213885778276, "grad_norm": 2.1986734867095947, "learning_rate": 2.0979843225083986e-05, "loss": 1.3435, "step": 15550 }, { "epoch": 1.742441209406495, "grad_norm": 10.124311447143555, "learning_rate": 2.0961179544606197e-05, "loss": 2.2367, "step": 15560 }, { "epoch": 1.7435610302351625, "grad_norm": 11.939183235168457, "learning_rate": 2.0942515864128406e-05, "loss": 2.5315, "step": 15570 }, { "epoch": 1.7446808510638299, "grad_norm": 10.486783027648926, "learning_rate": 2.0923852183650617e-05, "loss": 2.1949, "step": 15580 }, { "epoch": 1.7458006718924972, "grad_norm": 10.252884864807129, "learning_rate": 2.0905188503172826e-05, "loss": 1.7758, "step": 15590 }, { "epoch": 1.7469204927211646, "grad_norm": 3.4020817279815674, "learning_rate": 2.0886524822695037e-05, "loss": 1.7837, "step": 15600 }, { "epoch": 1.748040313549832, "grad_norm": 7.330861568450928, "learning_rate": 2.0867861142217246e-05, "loss": 1.4382, "step": 15610 }, { "epoch": 1.7491601343784995, "grad_norm": 8.330341339111328, "learning_rate": 2.0849197461739457e-05, "loss": 2.185, "step": 15620 }, { "epoch": 1.7502799552071668, "grad_norm": 4.57420015335083, "learning_rate": 2.0830533781261666e-05, "loss": 2.1382, "step": 15630 }, { "epoch": 1.7513997760358344, "grad_norm": 14.915903091430664, "learning_rate": 2.0811870100783874e-05, "loss": 1.8498, "step": 15640 }, { "epoch": 1.7525195968645018, "grad_norm": 4.439641952514648, "learning_rate": 2.0793206420306086e-05, "loss": 1.9074, "step": 15650 }, { "epoch": 1.7536394176931691, "grad_norm": 4.21160364151001, "learning_rate": 2.0774542739828294e-05, "loss": 2.0153, "step": 15660 }, { "epoch": 1.7547592385218365, "grad_norm": 3.9740211963653564, "learning_rate": 2.0755879059350506e-05, "loss": 2.1055, "step": 15670 }, { "epoch": 1.7558790593505038, "grad_norm": 8.004166603088379, "learning_rate": 2.0737215378872714e-05, "loss": 1.9654, "step": 15680 }, { "epoch": 1.7569988801791714, "grad_norm": 12.65368938446045, "learning_rate": 2.0718551698394926e-05, "loss": 2.3388, "step": 15690 }, { "epoch": 1.7581187010078387, "grad_norm": 7.4648566246032715, "learning_rate": 2.0699888017917134e-05, "loss": 1.9942, "step": 15700 }, { "epoch": 1.7592385218365063, "grad_norm": 3.306600570678711, "learning_rate": 2.0681224337439346e-05, "loss": 1.7987, "step": 15710 }, { "epoch": 1.7603583426651737, "grad_norm": 4.179432392120361, "learning_rate": 2.0662560656961554e-05, "loss": 2.2372, "step": 15720 }, { "epoch": 1.761478163493841, "grad_norm": 13.356534004211426, "learning_rate": 2.0643896976483766e-05, "loss": 1.5609, "step": 15730 }, { "epoch": 1.7625979843225084, "grad_norm": 9.077022552490234, "learning_rate": 2.0625233296005974e-05, "loss": 1.9327, "step": 15740 }, { "epoch": 1.7637178051511757, "grad_norm": 3.596141815185547, "learning_rate": 2.0606569615528182e-05, "loss": 1.971, "step": 15750 }, { "epoch": 1.764837625979843, "grad_norm": 3.860454559326172, "learning_rate": 2.058790593505039e-05, "loss": 2.3288, "step": 15760 }, { "epoch": 1.7659574468085106, "grad_norm": 12.444572448730469, "learning_rate": 2.0569242254572602e-05, "loss": 1.8277, "step": 15770 }, { "epoch": 1.7670772676371782, "grad_norm": 10.383987426757812, "learning_rate": 2.055057857409481e-05, "loss": 2.2227, "step": 15780 }, { "epoch": 1.7681970884658456, "grad_norm": 9.582054138183594, "learning_rate": 2.0531914893617022e-05, "loss": 1.7383, "step": 15790 }, { "epoch": 1.769316909294513, "grad_norm": 12.529754638671875, "learning_rate": 2.051325121313923e-05, "loss": 1.9381, "step": 15800 }, { "epoch": 1.7704367301231803, "grad_norm": 8.915084838867188, "learning_rate": 2.0494587532661442e-05, "loss": 1.668, "step": 15810 }, { "epoch": 1.7715565509518476, "grad_norm": 13.440780639648438, "learning_rate": 2.047592385218365e-05, "loss": 2.1798, "step": 15820 }, { "epoch": 1.772676371780515, "grad_norm": 7.045945167541504, "learning_rate": 2.0457260171705862e-05, "loss": 1.9319, "step": 15830 }, { "epoch": 1.7737961926091825, "grad_norm": 2.6684410572052, "learning_rate": 2.043859649122807e-05, "loss": 1.7845, "step": 15840 }, { "epoch": 1.77491601343785, "grad_norm": 11.95478343963623, "learning_rate": 2.041993281075028e-05, "loss": 2.3651, "step": 15850 }, { "epoch": 1.7760358342665175, "grad_norm": 4.880320072174072, "learning_rate": 2.040126913027249e-05, "loss": 1.9852, "step": 15860 }, { "epoch": 1.7771556550951848, "grad_norm": 12.231099128723145, "learning_rate": 2.03826054497947e-05, "loss": 2.0981, "step": 15870 }, { "epoch": 1.7782754759238522, "grad_norm": 7.029375076293945, "learning_rate": 2.036394176931691e-05, "loss": 2.1145, "step": 15880 }, { "epoch": 1.7793952967525195, "grad_norm": 6.398838043212891, "learning_rate": 2.034527808883912e-05, "loss": 2.0601, "step": 15890 }, { "epoch": 1.7805151175811869, "grad_norm": 4.79886531829834, "learning_rate": 2.032661440836133e-05, "loss": 1.7714, "step": 15900 }, { "epoch": 1.7816349384098544, "grad_norm": 5.065080642700195, "learning_rate": 2.030795072788354e-05, "loss": 1.5375, "step": 15910 }, { "epoch": 1.782754759238522, "grad_norm": 4.638917446136475, "learning_rate": 2.028928704740575e-05, "loss": 2.0297, "step": 15920 }, { "epoch": 1.7838745800671894, "grad_norm": 8.476948738098145, "learning_rate": 2.027062336692796e-05, "loss": 2.2116, "step": 15930 }, { "epoch": 1.7849944008958567, "grad_norm": 14.950053215026855, "learning_rate": 2.025195968645017e-05, "loss": 2.2304, "step": 15940 }, { "epoch": 1.786114221724524, "grad_norm": 4.521371841430664, "learning_rate": 2.023329600597238e-05, "loss": 2.205, "step": 15950 }, { "epoch": 1.7872340425531914, "grad_norm": 4.019099712371826, "learning_rate": 2.021463232549459e-05, "loss": 1.9879, "step": 15960 }, { "epoch": 1.7883538633818588, "grad_norm": 10.061615943908691, "learning_rate": 2.01959686450168e-05, "loss": 1.6731, "step": 15970 }, { "epoch": 1.7894736842105263, "grad_norm": 8.204621315002441, "learning_rate": 2.017730496453901e-05, "loss": 1.9748, "step": 15980 }, { "epoch": 1.7905935050391937, "grad_norm": 5.247344493865967, "learning_rate": 2.0158641284061216e-05, "loss": 1.6542, "step": 15990 }, { "epoch": 1.7917133258678613, "grad_norm": 4.7308735847473145, "learning_rate": 2.0139977603583427e-05, "loss": 1.4543, "step": 16000 }, { "epoch": 1.7928331466965286, "grad_norm": 3.514563798904419, "learning_rate": 2.0121313923105636e-05, "loss": 2.0833, "step": 16010 }, { "epoch": 1.793952967525196, "grad_norm": 4.816470623016357, "learning_rate": 2.0102650242627847e-05, "loss": 2.283, "step": 16020 }, { "epoch": 1.7950727883538633, "grad_norm": 11.659377098083496, "learning_rate": 2.0083986562150056e-05, "loss": 1.8348, "step": 16030 }, { "epoch": 1.7961926091825307, "grad_norm": 5.277092933654785, "learning_rate": 2.0065322881672267e-05, "loss": 1.8455, "step": 16040 }, { "epoch": 1.7973124300111982, "grad_norm": 10.653385162353516, "learning_rate": 2.0046659201194476e-05, "loss": 1.8004, "step": 16050 }, { "epoch": 1.7984322508398656, "grad_norm": 5.165909767150879, "learning_rate": 2.0027995520716687e-05, "loss": 1.9219, "step": 16060 }, { "epoch": 1.7995520716685331, "grad_norm": 12.408156394958496, "learning_rate": 2.0009331840238896e-05, "loss": 2.0149, "step": 16070 }, { "epoch": 1.8006718924972005, "grad_norm": 3.793848752975464, "learning_rate": 1.9990668159761104e-05, "loss": 2.1401, "step": 16080 }, { "epoch": 1.8017917133258678, "grad_norm": 4.723913192749023, "learning_rate": 1.9972004479283316e-05, "loss": 1.5437, "step": 16090 }, { "epoch": 1.8029115341545352, "grad_norm": 5.787063121795654, "learning_rate": 1.9953340798805524e-05, "loss": 1.93, "step": 16100 }, { "epoch": 1.8040313549832026, "grad_norm": 6.674378395080566, "learning_rate": 1.9934677118327736e-05, "loss": 2.3135, "step": 16110 }, { "epoch": 1.8051511758118701, "grad_norm": 14.730244636535645, "learning_rate": 1.9916013437849944e-05, "loss": 2.1795, "step": 16120 }, { "epoch": 1.8062709966405375, "grad_norm": 4.993513584136963, "learning_rate": 1.9897349757372156e-05, "loss": 1.7507, "step": 16130 }, { "epoch": 1.807390817469205, "grad_norm": 14.580843925476074, "learning_rate": 1.9878686076894364e-05, "loss": 1.7252, "step": 16140 }, { "epoch": 1.8085106382978724, "grad_norm": 14.65086841583252, "learning_rate": 1.9860022396416576e-05, "loss": 1.6178, "step": 16150 }, { "epoch": 1.8096304591265397, "grad_norm": 15.6979398727417, "learning_rate": 1.9841358715938784e-05, "loss": 2.3194, "step": 16160 }, { "epoch": 1.810750279955207, "grad_norm": 5.819782733917236, "learning_rate": 1.9822695035460996e-05, "loss": 1.572, "step": 16170 }, { "epoch": 1.8118701007838744, "grad_norm": 4.418210983276367, "learning_rate": 1.9804031354983204e-05, "loss": 2.247, "step": 16180 }, { "epoch": 1.812989921612542, "grad_norm": 5.038919925689697, "learning_rate": 1.9785367674505416e-05, "loss": 1.8143, "step": 16190 }, { "epoch": 1.8141097424412094, "grad_norm": 5.138890743255615, "learning_rate": 1.9766703994027624e-05, "loss": 1.8542, "step": 16200 }, { "epoch": 1.815229563269877, "grad_norm": 12.318073272705078, "learning_rate": 1.9748040313549836e-05, "loss": 2.1731, "step": 16210 }, { "epoch": 1.8163493840985443, "grad_norm": 4.275629043579102, "learning_rate": 1.972937663307204e-05, "loss": 2.4228, "step": 16220 }, { "epoch": 1.8174692049272116, "grad_norm": 4.5237016677856445, "learning_rate": 1.9710712952594252e-05, "loss": 1.9218, "step": 16230 }, { "epoch": 1.818589025755879, "grad_norm": 7.575822353363037, "learning_rate": 1.969204927211646e-05, "loss": 1.7327, "step": 16240 }, { "epoch": 1.8197088465845463, "grad_norm": 12.654701232910156, "learning_rate": 1.9673385591638672e-05, "loss": 1.993, "step": 16250 }, { "epoch": 1.820828667413214, "grad_norm": 8.574930191040039, "learning_rate": 1.965472191116088e-05, "loss": 1.8782, "step": 16260 }, { "epoch": 1.8219484882418813, "grad_norm": 4.255867958068848, "learning_rate": 1.9636058230683092e-05, "loss": 1.8501, "step": 16270 }, { "epoch": 1.8230683090705488, "grad_norm": 6.834265232086182, "learning_rate": 1.96173945502053e-05, "loss": 2.3436, "step": 16280 }, { "epoch": 1.8241881298992162, "grad_norm": 3.866483688354492, "learning_rate": 1.959873086972751e-05, "loss": 2.3195, "step": 16290 }, { "epoch": 1.8253079507278835, "grad_norm": 10.645752906799316, "learning_rate": 1.958006718924972e-05, "loss": 1.7716, "step": 16300 }, { "epoch": 1.826427771556551, "grad_norm": 15.50953197479248, "learning_rate": 1.956140350877193e-05, "loss": 2.5357, "step": 16310 }, { "epoch": 1.8275475923852182, "grad_norm": 8.97745418548584, "learning_rate": 1.954273982829414e-05, "loss": 1.8631, "step": 16320 }, { "epoch": 1.8286674132138858, "grad_norm": 10.974065780639648, "learning_rate": 1.952407614781635e-05, "loss": 1.6453, "step": 16330 }, { "epoch": 1.8297872340425532, "grad_norm": 14.380806922912598, "learning_rate": 1.950541246733856e-05, "loss": 2.1817, "step": 16340 }, { "epoch": 1.8309070548712207, "grad_norm": 3.8893136978149414, "learning_rate": 1.948674878686077e-05, "loss": 2.1023, "step": 16350 }, { "epoch": 1.832026875699888, "grad_norm": 3.2880914211273193, "learning_rate": 1.946808510638298e-05, "loss": 1.466, "step": 16360 }, { "epoch": 1.8331466965285554, "grad_norm": 9.581578254699707, "learning_rate": 1.944942142590519e-05, "loss": 1.8245, "step": 16370 }, { "epoch": 1.8342665173572228, "grad_norm": 15.423023223876953, "learning_rate": 1.94307577454274e-05, "loss": 2.1899, "step": 16380 }, { "epoch": 1.8353863381858901, "grad_norm": 5.308213233947754, "learning_rate": 1.941209406494961e-05, "loss": 1.6693, "step": 16390 }, { "epoch": 1.8365061590145577, "grad_norm": 13.718766212463379, "learning_rate": 1.939343038447182e-05, "loss": 1.6922, "step": 16400 }, { "epoch": 1.837625979843225, "grad_norm": 5.901851177215576, "learning_rate": 1.937476670399403e-05, "loss": 1.9234, "step": 16410 }, { "epoch": 1.8387458006718926, "grad_norm": 4.218606948852539, "learning_rate": 1.935610302351624e-05, "loss": 1.7346, "step": 16420 }, { "epoch": 1.83986562150056, "grad_norm": 3.545685291290283, "learning_rate": 1.933743934303845e-05, "loss": 1.5436, "step": 16430 }, { "epoch": 1.8409854423292273, "grad_norm": 3.544178009033203, "learning_rate": 1.931877566256066e-05, "loss": 2.0027, "step": 16440 }, { "epoch": 1.8421052631578947, "grad_norm": 16.046741485595703, "learning_rate": 1.9300111982082866e-05, "loss": 1.8394, "step": 16450 }, { "epoch": 1.843225083986562, "grad_norm": 3.808443546295166, "learning_rate": 1.9281448301605077e-05, "loss": 1.5258, "step": 16460 }, { "epoch": 1.8443449048152296, "grad_norm": 16.202293395996094, "learning_rate": 1.9262784621127286e-05, "loss": 1.8592, "step": 16470 }, { "epoch": 1.845464725643897, "grad_norm": 12.9262056350708, "learning_rate": 1.9244120940649498e-05, "loss": 2.35, "step": 16480 }, { "epoch": 1.8465845464725645, "grad_norm": 9.115686416625977, "learning_rate": 1.9225457260171706e-05, "loss": 2.0023, "step": 16490 }, { "epoch": 1.8477043673012319, "grad_norm": 4.362748622894287, "learning_rate": 1.9206793579693914e-05, "loss": 2.2523, "step": 16500 }, { "epoch": 1.8488241881298992, "grad_norm": 6.081763744354248, "learning_rate": 1.9188129899216126e-05, "loss": 1.9744, "step": 16510 }, { "epoch": 1.8499440089585666, "grad_norm": 13.35545539855957, "learning_rate": 1.9169466218738334e-05, "loss": 2.0795, "step": 16520 }, { "epoch": 1.851063829787234, "grad_norm": 4.248141765594482, "learning_rate": 1.9150802538260546e-05, "loss": 2.4488, "step": 16530 }, { "epoch": 1.8521836506159015, "grad_norm": 10.578146934509277, "learning_rate": 1.9132138857782754e-05, "loss": 1.8938, "step": 16540 }, { "epoch": 1.8533034714445689, "grad_norm": 8.211806297302246, "learning_rate": 1.9113475177304966e-05, "loss": 1.939, "step": 16550 }, { "epoch": 1.8544232922732364, "grad_norm": 11.0032320022583, "learning_rate": 1.9094811496827174e-05, "loss": 2.0445, "step": 16560 }, { "epoch": 1.8555431131019038, "grad_norm": 15.884440422058105, "learning_rate": 1.9076147816349386e-05, "loss": 2.1312, "step": 16570 }, { "epoch": 1.8566629339305711, "grad_norm": 17.182661056518555, "learning_rate": 1.9057484135871594e-05, "loss": 1.8968, "step": 16580 }, { "epoch": 1.8577827547592385, "grad_norm": 12.066494941711426, "learning_rate": 1.9038820455393806e-05, "loss": 2.1683, "step": 16590 }, { "epoch": 1.8589025755879058, "grad_norm": 5.713685989379883, "learning_rate": 1.9020156774916014e-05, "loss": 1.8316, "step": 16600 }, { "epoch": 1.8600223964165732, "grad_norm": 3.7835745811462402, "learning_rate": 1.9001493094438226e-05, "loss": 1.7352, "step": 16610 }, { "epoch": 1.8611422172452408, "grad_norm": 5.586095809936523, "learning_rate": 1.8982829413960434e-05, "loss": 1.9586, "step": 16620 }, { "epoch": 1.8622620380739083, "grad_norm": 10.472651481628418, "learning_rate": 1.8964165733482646e-05, "loss": 1.8188, "step": 16630 }, { "epoch": 1.8633818589025757, "grad_norm": 8.586959838867188, "learning_rate": 1.8945502053004854e-05, "loss": 1.7482, "step": 16640 }, { "epoch": 1.864501679731243, "grad_norm": 14.469319343566895, "learning_rate": 1.8926838372527066e-05, "loss": 1.9509, "step": 16650 }, { "epoch": 1.8656215005599104, "grad_norm": 12.987029075622559, "learning_rate": 1.8908174692049274e-05, "loss": 1.505, "step": 16660 }, { "epoch": 1.8667413213885777, "grad_norm": 4.787947654724121, "learning_rate": 1.8889511011571483e-05, "loss": 1.7582, "step": 16670 }, { "epoch": 1.867861142217245, "grad_norm": 4.350035667419434, "learning_rate": 1.8870847331093694e-05, "loss": 2.4025, "step": 16680 }, { "epoch": 1.8689809630459127, "grad_norm": 4.656111717224121, "learning_rate": 1.8852183650615903e-05, "loss": 1.558, "step": 16690 }, { "epoch": 1.8701007838745802, "grad_norm": 5.183754920959473, "learning_rate": 1.883351997013811e-05, "loss": 1.6411, "step": 16700 }, { "epoch": 1.8712206047032476, "grad_norm": 13.324991226196289, "learning_rate": 1.8814856289660323e-05, "loss": 2.254, "step": 16710 }, { "epoch": 1.872340425531915, "grad_norm": 15.952241897583008, "learning_rate": 1.879619260918253e-05, "loss": 1.7153, "step": 16720 }, { "epoch": 1.8734602463605823, "grad_norm": 8.312430381774902, "learning_rate": 1.877752892870474e-05, "loss": 2.2136, "step": 16730 }, { "epoch": 1.8745800671892496, "grad_norm": 7.2421393394470215, "learning_rate": 1.875886524822695e-05, "loss": 1.8395, "step": 16740 }, { "epoch": 1.875699888017917, "grad_norm": 9.180643081665039, "learning_rate": 1.874020156774916e-05, "loss": 2.1119, "step": 16750 }, { "epoch": 1.8768197088465846, "grad_norm": 15.752584457397461, "learning_rate": 1.872153788727137e-05, "loss": 2.3431, "step": 16760 }, { "epoch": 1.877939529675252, "grad_norm": 15.961100578308105, "learning_rate": 1.870287420679358e-05, "loss": 2.2094, "step": 16770 }, { "epoch": 1.8790593505039195, "grad_norm": 4.183115482330322, "learning_rate": 1.868421052631579e-05, "loss": 1.9611, "step": 16780 }, { "epoch": 1.8801791713325868, "grad_norm": 15.471096992492676, "learning_rate": 1.8665546845838e-05, "loss": 2.2645, "step": 16790 }, { "epoch": 1.8812989921612542, "grad_norm": 15.710405349731445, "learning_rate": 1.864688316536021e-05, "loss": 2.1026, "step": 16800 }, { "epoch": 1.8824188129899215, "grad_norm": 7.765809535980225, "learning_rate": 1.862821948488242e-05, "loss": 1.8401, "step": 16810 }, { "epoch": 1.8835386338185889, "grad_norm": 6.538113117218018, "learning_rate": 1.860955580440463e-05, "loss": 1.7096, "step": 16820 }, { "epoch": 1.8846584546472565, "grad_norm": 16.50730323791504, "learning_rate": 1.859089212392684e-05, "loss": 2.0152, "step": 16830 }, { "epoch": 1.8857782754759238, "grad_norm": 3.642190933227539, "learning_rate": 1.857222844344905e-05, "loss": 2.0935, "step": 16840 }, { "epoch": 1.8868980963045914, "grad_norm": 5.2225518226623535, "learning_rate": 1.855356476297126e-05, "loss": 1.7445, "step": 16850 }, { "epoch": 1.8880179171332587, "grad_norm": 3.3426289558410645, "learning_rate": 1.853490108249347e-05, "loss": 2.3258, "step": 16860 }, { "epoch": 1.889137737961926, "grad_norm": 8.263337135314941, "learning_rate": 1.851623740201568e-05, "loss": 1.6674, "step": 16870 }, { "epoch": 1.8902575587905934, "grad_norm": 4.517258167266846, "learning_rate": 1.8497573721537888e-05, "loss": 2.0328, "step": 16880 }, { "epoch": 1.8913773796192608, "grad_norm": 5.429361820220947, "learning_rate": 1.84789100410601e-05, "loss": 2.3338, "step": 16890 }, { "epoch": 1.8924972004479284, "grad_norm": 11.747203826904297, "learning_rate": 1.8460246360582308e-05, "loss": 1.9876, "step": 16900 }, { "epoch": 1.8936170212765957, "grad_norm": 14.812180519104004, "learning_rate": 1.844158268010452e-05, "loss": 1.8193, "step": 16910 }, { "epoch": 1.8947368421052633, "grad_norm": 4.837928771972656, "learning_rate": 1.8422918999626728e-05, "loss": 1.8644, "step": 16920 }, { "epoch": 1.8958566629339306, "grad_norm": 9.403674125671387, "learning_rate": 1.8404255319148936e-05, "loss": 1.7544, "step": 16930 }, { "epoch": 1.896976483762598, "grad_norm": 5.102954387664795, "learning_rate": 1.8385591638671144e-05, "loss": 1.8039, "step": 16940 }, { "epoch": 1.8980963045912653, "grad_norm": 13.829090118408203, "learning_rate": 1.8366927958193356e-05, "loss": 2.3956, "step": 16950 }, { "epoch": 1.8992161254199327, "grad_norm": 14.055281639099121, "learning_rate": 1.8348264277715564e-05, "loss": 1.952, "step": 16960 }, { "epoch": 1.9003359462486002, "grad_norm": 4.856631278991699, "learning_rate": 1.8329600597237776e-05, "loss": 1.8025, "step": 16970 }, { "epoch": 1.9014557670772676, "grad_norm": 5.615917205810547, "learning_rate": 1.8310936916759984e-05, "loss": 2.4083, "step": 16980 }, { "epoch": 1.9025755879059352, "grad_norm": 4.638927459716797, "learning_rate": 1.8292273236282196e-05, "loss": 1.7921, "step": 16990 }, { "epoch": 1.9036954087346025, "grad_norm": 3.5314502716064453, "learning_rate": 1.8273609555804404e-05, "loss": 1.8788, "step": 17000 }, { "epoch": 1.9048152295632699, "grad_norm": 6.3414506912231445, "learning_rate": 1.8254945875326616e-05, "loss": 1.8657, "step": 17010 }, { "epoch": 1.9059350503919372, "grad_norm": 8.888124465942383, "learning_rate": 1.8236282194848824e-05, "loss": 2.4341, "step": 17020 }, { "epoch": 1.9070548712206046, "grad_norm": 5.071857929229736, "learning_rate": 1.8217618514371036e-05, "loss": 2.0112, "step": 17030 }, { "epoch": 1.9081746920492721, "grad_norm": 3.5548458099365234, "learning_rate": 1.8198954833893244e-05, "loss": 2.0355, "step": 17040 }, { "epoch": 1.9092945128779395, "grad_norm": 13.80466079711914, "learning_rate": 1.8180291153415456e-05, "loss": 2.532, "step": 17050 }, { "epoch": 1.910414333706607, "grad_norm": 4.249703407287598, "learning_rate": 1.8161627472937664e-05, "loss": 1.9727, "step": 17060 }, { "epoch": 1.9115341545352744, "grad_norm": 4.494642734527588, "learning_rate": 1.8142963792459876e-05, "loss": 2.0125, "step": 17070 }, { "epoch": 1.9126539753639418, "grad_norm": 5.063194274902344, "learning_rate": 1.8124300111982084e-05, "loss": 1.647, "step": 17080 }, { "epoch": 1.9137737961926091, "grad_norm": 9.803994178771973, "learning_rate": 1.8105636431504293e-05, "loss": 2.1401, "step": 17090 }, { "epoch": 1.9148936170212765, "grad_norm": 7.283653736114502, "learning_rate": 1.8086972751026504e-05, "loss": 1.9052, "step": 17100 }, { "epoch": 1.916013437849944, "grad_norm": 11.359768867492676, "learning_rate": 1.8068309070548713e-05, "loss": 2.0145, "step": 17110 }, { "epoch": 1.9171332586786114, "grad_norm": 10.177249908447266, "learning_rate": 1.8049645390070924e-05, "loss": 1.7666, "step": 17120 }, { "epoch": 1.918253079507279, "grad_norm": 5.568352699279785, "learning_rate": 1.8030981709593133e-05, "loss": 1.7245, "step": 17130 }, { "epoch": 1.9193729003359463, "grad_norm": 19.79357147216797, "learning_rate": 1.8012318029115344e-05, "loss": 2.4217, "step": 17140 }, { "epoch": 1.9204927211646137, "grad_norm": 10.292594909667969, "learning_rate": 1.799365434863755e-05, "loss": 1.4904, "step": 17150 }, { "epoch": 1.921612541993281, "grad_norm": 3.906355381011963, "learning_rate": 1.797499066815976e-05, "loss": 2.4061, "step": 17160 }, { "epoch": 1.9227323628219484, "grad_norm": 10.06027889251709, "learning_rate": 1.795632698768197e-05, "loss": 1.826, "step": 17170 }, { "epoch": 1.923852183650616, "grad_norm": 3.928687572479248, "learning_rate": 1.793766330720418e-05, "loss": 1.9778, "step": 17180 }, { "epoch": 1.9249720044792833, "grad_norm": 11.147214889526367, "learning_rate": 1.791899962672639e-05, "loss": 1.6609, "step": 17190 }, { "epoch": 1.9260918253079509, "grad_norm": 5.292778968811035, "learning_rate": 1.79003359462486e-05, "loss": 1.581, "step": 17200 }, { "epoch": 1.9272116461366182, "grad_norm": 5.773550987243652, "learning_rate": 1.788167226577081e-05, "loss": 1.5817, "step": 17210 }, { "epoch": 1.9283314669652856, "grad_norm": 14.817527770996094, "learning_rate": 1.786300858529302e-05, "loss": 1.9826, "step": 17220 }, { "epoch": 1.929451287793953, "grad_norm": 6.223337173461914, "learning_rate": 1.784434490481523e-05, "loss": 2.1578, "step": 17230 }, { "epoch": 1.9305711086226203, "grad_norm": 4.402294158935547, "learning_rate": 1.782568122433744e-05, "loss": 2.2318, "step": 17240 }, { "epoch": 1.9316909294512878, "grad_norm": 7.321905136108398, "learning_rate": 1.780701754385965e-05, "loss": 1.6848, "step": 17250 }, { "epoch": 1.9328107502799552, "grad_norm": 14.152067184448242, "learning_rate": 1.778835386338186e-05, "loss": 2.5409, "step": 17260 }, { "epoch": 1.9339305711086228, "grad_norm": 12.283940315246582, "learning_rate": 1.776969018290407e-05, "loss": 1.9965, "step": 17270 }, { "epoch": 1.9350503919372901, "grad_norm": 8.56460189819336, "learning_rate": 1.775102650242628e-05, "loss": 1.6021, "step": 17280 }, { "epoch": 1.9361702127659575, "grad_norm": 4.710309982299805, "learning_rate": 1.773236282194849e-05, "loss": 2.2814, "step": 17290 }, { "epoch": 1.9372900335946248, "grad_norm": 12.966391563415527, "learning_rate": 1.77136991414707e-05, "loss": 2.172, "step": 17300 }, { "epoch": 1.9384098544232922, "grad_norm": 16.88652229309082, "learning_rate": 1.769503546099291e-05, "loss": 2.029, "step": 17310 }, { "epoch": 1.9395296752519597, "grad_norm": 8.579212188720703, "learning_rate": 1.7676371780515118e-05, "loss": 1.47, "step": 17320 }, { "epoch": 1.940649496080627, "grad_norm": 14.837044715881348, "learning_rate": 1.765770810003733e-05, "loss": 1.9022, "step": 17330 }, { "epoch": 1.9417693169092947, "grad_norm": 14.231435775756836, "learning_rate": 1.7639044419559538e-05, "loss": 2.4319, "step": 17340 }, { "epoch": 1.942889137737962, "grad_norm": 6.342057704925537, "learning_rate": 1.762038073908175e-05, "loss": 1.8036, "step": 17350 }, { "epoch": 1.9440089585666294, "grad_norm": 4.24338436126709, "learning_rate": 1.7601717058603958e-05, "loss": 1.9694, "step": 17360 }, { "epoch": 1.9451287793952967, "grad_norm": 5.161984920501709, "learning_rate": 1.758305337812617e-05, "loss": 1.7682, "step": 17370 }, { "epoch": 1.946248600223964, "grad_norm": 7.184517860412598, "learning_rate": 1.7564389697648374e-05, "loss": 1.5724, "step": 17380 }, { "epoch": 1.9473684210526314, "grad_norm": 7.037195682525635, "learning_rate": 1.7545726017170586e-05, "loss": 1.6202, "step": 17390 }, { "epoch": 1.948488241881299, "grad_norm": 7.23237419128418, "learning_rate": 1.7527062336692794e-05, "loss": 1.7879, "step": 17400 }, { "epoch": 1.9496080627099666, "grad_norm": 4.513615131378174, "learning_rate": 1.7508398656215006e-05, "loss": 2.1071, "step": 17410 }, { "epoch": 1.950727883538634, "grad_norm": 14.149372100830078, "learning_rate": 1.7489734975737214e-05, "loss": 1.8393, "step": 17420 }, { "epoch": 1.9518477043673013, "grad_norm": 3.9815926551818848, "learning_rate": 1.7471071295259426e-05, "loss": 1.9888, "step": 17430 }, { "epoch": 1.9529675251959686, "grad_norm": 15.270926475524902, "learning_rate": 1.7452407614781634e-05, "loss": 2.2696, "step": 17440 }, { "epoch": 1.954087346024636, "grad_norm": 7.519197940826416, "learning_rate": 1.7433743934303846e-05, "loss": 2.0431, "step": 17450 }, { "epoch": 1.9552071668533033, "grad_norm": 4.564593315124512, "learning_rate": 1.7415080253826054e-05, "loss": 1.7656, "step": 17460 }, { "epoch": 1.9563269876819709, "grad_norm": 9.020241737365723, "learning_rate": 1.7396416573348266e-05, "loss": 1.9072, "step": 17470 }, { "epoch": 1.9574468085106385, "grad_norm": 10.36052131652832, "learning_rate": 1.7377752892870474e-05, "loss": 1.7954, "step": 17480 }, { "epoch": 1.9585666293393058, "grad_norm": 5.499046802520752, "learning_rate": 1.7359089212392686e-05, "loss": 2.2032, "step": 17490 }, { "epoch": 1.9596864501679732, "grad_norm": 10.584261894226074, "learning_rate": 1.7340425531914894e-05, "loss": 2.2121, "step": 17500 }, { "epoch": 1.9608062709966405, "grad_norm": 4.816810131072998, "learning_rate": 1.7321761851437106e-05, "loss": 1.9277, "step": 17510 }, { "epoch": 1.9619260918253079, "grad_norm": 5.484105110168457, "learning_rate": 1.7303098170959314e-05, "loss": 1.7758, "step": 17520 }, { "epoch": 1.9630459126539752, "grad_norm": 12.183406829833984, "learning_rate": 1.7284434490481523e-05, "loss": 2.0495, "step": 17530 }, { "epoch": 1.9641657334826428, "grad_norm": 5.112043380737305, "learning_rate": 1.7265770810003734e-05, "loss": 1.9576, "step": 17540 }, { "epoch": 1.9652855543113104, "grad_norm": 4.796443939208984, "learning_rate": 1.7247107129525943e-05, "loss": 2.0039, "step": 17550 }, { "epoch": 1.9664053751399777, "grad_norm": 4.247778415679932, "learning_rate": 1.7228443449048154e-05, "loss": 1.7593, "step": 17560 }, { "epoch": 1.967525195968645, "grad_norm": 4.498353481292725, "learning_rate": 1.7209779768570363e-05, "loss": 1.904, "step": 17570 }, { "epoch": 1.9686450167973124, "grad_norm": 12.006962776184082, "learning_rate": 1.7191116088092574e-05, "loss": 1.6231, "step": 17580 }, { "epoch": 1.9697648376259798, "grad_norm": 4.842081069946289, "learning_rate": 1.7172452407614783e-05, "loss": 2.0985, "step": 17590 }, { "epoch": 1.970884658454647, "grad_norm": 15.521842956542969, "learning_rate": 1.7153788727136994e-05, "loss": 2.1864, "step": 17600 }, { "epoch": 1.9720044792833147, "grad_norm": 9.48452091217041, "learning_rate": 1.7135125046659203e-05, "loss": 1.8124, "step": 17610 }, { "epoch": 1.973124300111982, "grad_norm": 4.017796993255615, "learning_rate": 1.711646136618141e-05, "loss": 2.0427, "step": 17620 }, { "epoch": 1.9742441209406496, "grad_norm": 3.9394009113311768, "learning_rate": 1.709779768570362e-05, "loss": 2.0485, "step": 17630 }, { "epoch": 1.975363941769317, "grad_norm": 14.145578384399414, "learning_rate": 1.707913400522583e-05, "loss": 1.5402, "step": 17640 }, { "epoch": 1.9764837625979843, "grad_norm": 4.282801628112793, "learning_rate": 1.706047032474804e-05, "loss": 1.8805, "step": 17650 }, { "epoch": 1.9776035834266517, "grad_norm": 4.898009300231934, "learning_rate": 1.704180664427025e-05, "loss": 2.1731, "step": 17660 }, { "epoch": 1.978723404255319, "grad_norm": 4.910828590393066, "learning_rate": 1.702314296379246e-05, "loss": 2.0025, "step": 17670 }, { "epoch": 1.9798432250839866, "grad_norm": 15.03659725189209, "learning_rate": 1.700447928331467e-05, "loss": 2.3384, "step": 17680 }, { "epoch": 1.980963045912654, "grad_norm": 10.689837455749512, "learning_rate": 1.698581560283688e-05, "loss": 1.7127, "step": 17690 }, { "epoch": 1.9820828667413215, "grad_norm": 10.339581489562988, "learning_rate": 1.696715192235909e-05, "loss": 2.0183, "step": 17700 }, { "epoch": 1.9832026875699889, "grad_norm": 7.037674903869629, "learning_rate": 1.69484882418813e-05, "loss": 1.9685, "step": 17710 }, { "epoch": 1.9843225083986562, "grad_norm": 14.190945625305176, "learning_rate": 1.692982456140351e-05, "loss": 1.9507, "step": 17720 }, { "epoch": 1.9854423292273236, "grad_norm": 4.3056416511535645, "learning_rate": 1.691116088092572e-05, "loss": 1.9554, "step": 17730 }, { "epoch": 1.986562150055991, "grad_norm": 14.68007755279541, "learning_rate": 1.6892497200447928e-05, "loss": 2.2227, "step": 17740 }, { "epoch": 1.9876819708846585, "grad_norm": 4.058879852294922, "learning_rate": 1.687383351997014e-05, "loss": 1.9667, "step": 17750 }, { "epoch": 1.9888017917133258, "grad_norm": 8.660399436950684, "learning_rate": 1.6855169839492348e-05, "loss": 2.0649, "step": 17760 }, { "epoch": 1.9899216125419934, "grad_norm": 11.349140167236328, "learning_rate": 1.683650615901456e-05, "loss": 1.438, "step": 17770 }, { "epoch": 1.9910414333706608, "grad_norm": 4.842729568481445, "learning_rate": 1.6817842478536768e-05, "loss": 1.9431, "step": 17780 }, { "epoch": 1.992161254199328, "grad_norm": 4.284554958343506, "learning_rate": 1.679917879805898e-05, "loss": 1.8363, "step": 17790 }, { "epoch": 1.9932810750279955, "grad_norm": 6.62599515914917, "learning_rate": 1.6780515117581188e-05, "loss": 1.6479, "step": 17800 }, { "epoch": 1.9944008958566628, "grad_norm": 12.138463973999023, "learning_rate": 1.67618514371034e-05, "loss": 1.9355, "step": 17810 }, { "epoch": 1.9955207166853304, "grad_norm": 9.465065002441406, "learning_rate": 1.6743187756625608e-05, "loss": 2.1428, "step": 17820 }, { "epoch": 1.9966405375139977, "grad_norm": 12.322503089904785, "learning_rate": 1.672452407614782e-05, "loss": 2.1444, "step": 17830 }, { "epoch": 1.9977603583426653, "grad_norm": 9.275611877441406, "learning_rate": 1.6705860395670028e-05, "loss": 1.6226, "step": 17840 }, { "epoch": 1.9988801791713326, "grad_norm": 4.5713982582092285, "learning_rate": 1.6687196715192236e-05, "loss": 1.9358, "step": 17850 }, { "epoch": 2.0, "grad_norm": 4.424788951873779, "learning_rate": 1.6668533034714444e-05, "loss": 1.604, "step": 17860 }, { "epoch": 2.0011198208286674, "grad_norm": 15.465615272521973, "learning_rate": 1.6649869354236656e-05, "loss": 1.7584, "step": 17870 }, { "epoch": 2.0022396416573347, "grad_norm": 5.3801116943359375, "learning_rate": 1.6631205673758864e-05, "loss": 2.4112, "step": 17880 }, { "epoch": 2.003359462486002, "grad_norm": 9.771553993225098, "learning_rate": 1.6612541993281076e-05, "loss": 2.3079, "step": 17890 }, { "epoch": 2.00447928331467, "grad_norm": 6.370817184448242, "learning_rate": 1.6593878312803284e-05, "loss": 1.4866, "step": 17900 }, { "epoch": 2.005599104143337, "grad_norm": 5.146578311920166, "learning_rate": 1.6575214632325496e-05, "loss": 1.9364, "step": 17910 }, { "epoch": 2.0067189249720045, "grad_norm": 6.970976829528809, "learning_rate": 1.6556550951847704e-05, "loss": 1.6884, "step": 17920 }, { "epoch": 2.007838745800672, "grad_norm": 9.017516136169434, "learning_rate": 1.6537887271369916e-05, "loss": 1.4814, "step": 17930 }, { "epoch": 2.0089585666293392, "grad_norm": 5.169244289398193, "learning_rate": 1.6519223590892124e-05, "loss": 1.9879, "step": 17940 }, { "epoch": 2.0100783874580066, "grad_norm": 3.8840739727020264, "learning_rate": 1.6500559910414336e-05, "loss": 1.4718, "step": 17950 }, { "epoch": 2.011198208286674, "grad_norm": 9.54129409790039, "learning_rate": 1.6481896229936544e-05, "loss": 1.709, "step": 17960 }, { "epoch": 2.0123180291153417, "grad_norm": 4.5542192459106445, "learning_rate": 1.6463232549458753e-05, "loss": 2.0193, "step": 17970 }, { "epoch": 2.013437849944009, "grad_norm": 4.2427144050598145, "learning_rate": 1.6444568868980964e-05, "loss": 2.2357, "step": 17980 }, { "epoch": 2.0145576707726764, "grad_norm": 8.179814338684082, "learning_rate": 1.6425905188503173e-05, "loss": 1.8587, "step": 17990 }, { "epoch": 2.015677491601344, "grad_norm": 14.023432731628418, "learning_rate": 1.6407241508025384e-05, "loss": 1.9654, "step": 18000 }, { "epoch": 2.016797312430011, "grad_norm": 4.784511566162109, "learning_rate": 1.6388577827547593e-05, "loss": 1.9038, "step": 18010 }, { "epoch": 2.0179171332586785, "grad_norm": 9.668645858764648, "learning_rate": 1.6369914147069804e-05, "loss": 2.2853, "step": 18020 }, { "epoch": 2.019036954087346, "grad_norm": 5.6623005867004395, "learning_rate": 1.6351250466592013e-05, "loss": 2.1568, "step": 18030 }, { "epoch": 2.0201567749160136, "grad_norm": 12.06014347076416, "learning_rate": 1.6332586786114224e-05, "loss": 1.9534, "step": 18040 }, { "epoch": 2.021276595744681, "grad_norm": 12.4910249710083, "learning_rate": 1.6313923105636433e-05, "loss": 1.6823, "step": 18050 }, { "epoch": 2.0223964165733483, "grad_norm": 12.419768333435059, "learning_rate": 1.6295259425158644e-05, "loss": 2.3436, "step": 18060 }, { "epoch": 2.0235162374020157, "grad_norm": 4.12880802154541, "learning_rate": 1.6276595744680853e-05, "loss": 2.1389, "step": 18070 }, { "epoch": 2.024636058230683, "grad_norm": 6.19962739944458, "learning_rate": 1.625793206420306e-05, "loss": 2.0253, "step": 18080 }, { "epoch": 2.0257558790593504, "grad_norm": 4.155970573425293, "learning_rate": 1.623926838372527e-05, "loss": 1.5464, "step": 18090 }, { "epoch": 2.0268756998880177, "grad_norm": 2.5858302116394043, "learning_rate": 1.622060470324748e-05, "loss": 1.8217, "step": 18100 }, { "epoch": 2.0279955207166855, "grad_norm": 3.9286646842956543, "learning_rate": 1.620194102276969e-05, "loss": 1.3943, "step": 18110 }, { "epoch": 2.029115341545353, "grad_norm": 12.073657035827637, "learning_rate": 1.61832773422919e-05, "loss": 2.0707, "step": 18120 }, { "epoch": 2.0302351623740202, "grad_norm": 6.261038780212402, "learning_rate": 1.616461366181411e-05, "loss": 1.9036, "step": 18130 }, { "epoch": 2.0313549832026876, "grad_norm": 3.7651288509368896, "learning_rate": 1.614594998133632e-05, "loss": 1.3601, "step": 18140 }, { "epoch": 2.032474804031355, "grad_norm": 5.616112232208252, "learning_rate": 1.612728630085853e-05, "loss": 1.9249, "step": 18150 }, { "epoch": 2.0335946248600223, "grad_norm": 14.19587230682373, "learning_rate": 1.610862262038074e-05, "loss": 1.9037, "step": 18160 }, { "epoch": 2.0347144456886896, "grad_norm": 14.49325942993164, "learning_rate": 1.608995893990295e-05, "loss": 1.9606, "step": 18170 }, { "epoch": 2.0358342665173574, "grad_norm": 5.4950270652771, "learning_rate": 1.6071295259425158e-05, "loss": 2.1976, "step": 18180 }, { "epoch": 2.036954087346025, "grad_norm": 4.107669830322266, "learning_rate": 1.605263157894737e-05, "loss": 1.6963, "step": 18190 }, { "epoch": 2.038073908174692, "grad_norm": 5.567134857177734, "learning_rate": 1.6033967898469578e-05, "loss": 1.639, "step": 18200 }, { "epoch": 2.0391937290033595, "grad_norm": 18.579816818237305, "learning_rate": 1.601530421799179e-05, "loss": 1.9576, "step": 18210 }, { "epoch": 2.040313549832027, "grad_norm": 11.057695388793945, "learning_rate": 1.5996640537513998e-05, "loss": 1.8883, "step": 18220 }, { "epoch": 2.041433370660694, "grad_norm": 6.482846260070801, "learning_rate": 1.597797685703621e-05, "loss": 1.812, "step": 18230 }, { "epoch": 2.0425531914893615, "grad_norm": 12.868412017822266, "learning_rate": 1.5959313176558418e-05, "loss": 2.1451, "step": 18240 }, { "epoch": 2.0436730123180293, "grad_norm": 4.0791401863098145, "learning_rate": 1.594064949608063e-05, "loss": 1.974, "step": 18250 }, { "epoch": 2.0447928331466967, "grad_norm": 6.537319660186768, "learning_rate": 1.5921985815602838e-05, "loss": 1.8334, "step": 18260 }, { "epoch": 2.045912653975364, "grad_norm": 8.384710311889648, "learning_rate": 1.590332213512505e-05, "loss": 2.1852, "step": 18270 }, { "epoch": 2.0470324748040314, "grad_norm": 11.995549201965332, "learning_rate": 1.5884658454647258e-05, "loss": 1.9999, "step": 18280 }, { "epoch": 2.0481522956326987, "grad_norm": 11.57607650756836, "learning_rate": 1.586599477416947e-05, "loss": 2.0097, "step": 18290 }, { "epoch": 2.049272116461366, "grad_norm": 21.388427734375, "learning_rate": 1.5847331093691678e-05, "loss": 2.0354, "step": 18300 }, { "epoch": 2.0503919372900334, "grad_norm": 4.375351428985596, "learning_rate": 1.582866741321389e-05, "loss": 1.8486, "step": 18310 }, { "epoch": 2.051511758118701, "grad_norm": 7.059999942779541, "learning_rate": 1.5810003732736094e-05, "loss": 1.7516, "step": 18320 }, { "epoch": 2.0526315789473686, "grad_norm": 4.776148796081543, "learning_rate": 1.5791340052258306e-05, "loss": 1.8479, "step": 18330 }, { "epoch": 2.053751399776036, "grad_norm": 13.596695899963379, "learning_rate": 1.5772676371780514e-05, "loss": 1.9489, "step": 18340 }, { "epoch": 2.0548712206047033, "grad_norm": 15.971503257751465, "learning_rate": 1.5754012691302726e-05, "loss": 2.1302, "step": 18350 }, { "epoch": 2.0559910414333706, "grad_norm": 5.559121131896973, "learning_rate": 1.5735349010824934e-05, "loss": 1.8906, "step": 18360 }, { "epoch": 2.057110862262038, "grad_norm": 11.740134239196777, "learning_rate": 1.5716685330347146e-05, "loss": 1.831, "step": 18370 }, { "epoch": 2.0582306830907053, "grad_norm": 5.161749362945557, "learning_rate": 1.5698021649869354e-05, "loss": 2.0162, "step": 18380 }, { "epoch": 2.0593505039193727, "grad_norm": 13.416109085083008, "learning_rate": 1.5679357969391563e-05, "loss": 2.0551, "step": 18390 }, { "epoch": 2.0604703247480405, "grad_norm": 5.6357269287109375, "learning_rate": 1.5660694288913774e-05, "loss": 1.6993, "step": 18400 }, { "epoch": 2.061590145576708, "grad_norm": 10.636037826538086, "learning_rate": 1.5642030608435983e-05, "loss": 1.8842, "step": 18410 }, { "epoch": 2.062709966405375, "grad_norm": 14.341257095336914, "learning_rate": 1.5623366927958194e-05, "loss": 1.7778, "step": 18420 }, { "epoch": 2.0638297872340425, "grad_norm": 7.4988322257995605, "learning_rate": 1.5604703247480403e-05, "loss": 1.7679, "step": 18430 }, { "epoch": 2.06494960806271, "grad_norm": 12.500404357910156, "learning_rate": 1.5586039567002614e-05, "loss": 2.2651, "step": 18440 }, { "epoch": 2.0660694288913772, "grad_norm": 5.027773380279541, "learning_rate": 1.5567375886524823e-05, "loss": 2.0421, "step": 18450 }, { "epoch": 2.0671892497200446, "grad_norm": 10.962523460388184, "learning_rate": 1.5548712206047034e-05, "loss": 2.0656, "step": 18460 }, { "epoch": 2.0683090705487124, "grad_norm": 2.7904582023620605, "learning_rate": 1.5530048525569243e-05, "loss": 1.9249, "step": 18470 }, { "epoch": 2.0694288913773797, "grad_norm": 7.586933135986328, "learning_rate": 1.5511384845091454e-05, "loss": 2.0239, "step": 18480 }, { "epoch": 2.070548712206047, "grad_norm": 4.160824775695801, "learning_rate": 1.5492721164613663e-05, "loss": 1.9334, "step": 18490 }, { "epoch": 2.0716685330347144, "grad_norm": 7.0400471687316895, "learning_rate": 1.5474057484135874e-05, "loss": 1.8884, "step": 18500 }, { "epoch": 2.072788353863382, "grad_norm": 7.399810314178467, "learning_rate": 1.5455393803658083e-05, "loss": 1.6476, "step": 18510 }, { "epoch": 2.073908174692049, "grad_norm": 6.349668979644775, "learning_rate": 1.5436730123180294e-05, "loss": 1.8177, "step": 18520 }, { "epoch": 2.0750279955207165, "grad_norm": 4.084234714508057, "learning_rate": 1.5418066442702503e-05, "loss": 1.4257, "step": 18530 }, { "epoch": 2.0761478163493843, "grad_norm": 14.362452507019043, "learning_rate": 1.5399402762224714e-05, "loss": 2.0043, "step": 18540 }, { "epoch": 2.0772676371780516, "grad_norm": 8.189460754394531, "learning_rate": 1.538073908174692e-05, "loss": 1.5747, "step": 18550 }, { "epoch": 2.078387458006719, "grad_norm": 9.600176811218262, "learning_rate": 1.536207540126913e-05, "loss": 1.9165, "step": 18560 }, { "epoch": 2.0795072788353863, "grad_norm": 8.519039154052734, "learning_rate": 1.534341172079134e-05, "loss": 1.8993, "step": 18570 }, { "epoch": 2.0806270996640537, "grad_norm": 14.394335746765137, "learning_rate": 1.532474804031355e-05, "loss": 2.0525, "step": 18580 }, { "epoch": 2.081746920492721, "grad_norm": 4.982779502868652, "learning_rate": 1.530608435983576e-05, "loss": 2.0925, "step": 18590 }, { "epoch": 2.0828667413213884, "grad_norm": 15.897424697875977, "learning_rate": 1.528742067935797e-05, "loss": 1.7299, "step": 18600 }, { "epoch": 2.083986562150056, "grad_norm": 12.037178993225098, "learning_rate": 1.526875699888018e-05, "loss": 1.7714, "step": 18610 }, { "epoch": 2.0851063829787235, "grad_norm": 4.796445846557617, "learning_rate": 1.525009331840239e-05, "loss": 2.0635, "step": 18620 }, { "epoch": 2.086226203807391, "grad_norm": 5.05470085144043, "learning_rate": 1.52314296379246e-05, "loss": 2.302, "step": 18630 }, { "epoch": 2.0873460246360582, "grad_norm": 6.144739627838135, "learning_rate": 1.521276595744681e-05, "loss": 1.8878, "step": 18640 }, { "epoch": 2.0884658454647256, "grad_norm": 5.468743801116943, "learning_rate": 1.519410227696902e-05, "loss": 2.1304, "step": 18650 }, { "epoch": 2.089585666293393, "grad_norm": 6.490882873535156, "learning_rate": 1.517543859649123e-05, "loss": 1.913, "step": 18660 }, { "epoch": 2.0907054871220603, "grad_norm": 5.277439117431641, "learning_rate": 1.515677491601344e-05, "loss": 1.9014, "step": 18670 }, { "epoch": 2.091825307950728, "grad_norm": 5.904261112213135, "learning_rate": 1.513811123553565e-05, "loss": 1.963, "step": 18680 }, { "epoch": 2.0929451287793954, "grad_norm": 10.203513145446777, "learning_rate": 1.5119447555057858e-05, "loss": 1.9816, "step": 18690 }, { "epoch": 2.0940649496080628, "grad_norm": 13.511499404907227, "learning_rate": 1.5100783874580068e-05, "loss": 1.9453, "step": 18700 }, { "epoch": 2.09518477043673, "grad_norm": 4.540700435638428, "learning_rate": 1.5082120194102278e-05, "loss": 1.7912, "step": 18710 }, { "epoch": 2.0963045912653975, "grad_norm": 4.4151201248168945, "learning_rate": 1.5063456513624488e-05, "loss": 2.1123, "step": 18720 }, { "epoch": 2.097424412094065, "grad_norm": 10.17835807800293, "learning_rate": 1.5044792833146698e-05, "loss": 2.0146, "step": 18730 }, { "epoch": 2.098544232922732, "grad_norm": 5.04526424407959, "learning_rate": 1.5026129152668908e-05, "loss": 1.6177, "step": 18740 }, { "epoch": 2.0996640537514, "grad_norm": 10.27774429321289, "learning_rate": 1.5007465472191118e-05, "loss": 2.0393, "step": 18750 }, { "epoch": 2.1007838745800673, "grad_norm": 5.038769721984863, "learning_rate": 1.4988801791713328e-05, "loss": 1.741, "step": 18760 }, { "epoch": 2.1019036954087347, "grad_norm": 9.592277526855469, "learning_rate": 1.4970138111235538e-05, "loss": 1.7934, "step": 18770 }, { "epoch": 2.103023516237402, "grad_norm": 9.235641479492188, "learning_rate": 1.4951474430757744e-05, "loss": 1.7898, "step": 18780 }, { "epoch": 2.1041433370660694, "grad_norm": 4.7292327880859375, "learning_rate": 1.4932810750279954e-05, "loss": 1.5251, "step": 18790 }, { "epoch": 2.1052631578947367, "grad_norm": 9.268404006958008, "learning_rate": 1.4914147069802164e-05, "loss": 1.7425, "step": 18800 }, { "epoch": 2.106382978723404, "grad_norm": 4.396312236785889, "learning_rate": 1.4895483389324374e-05, "loss": 1.9284, "step": 18810 }, { "epoch": 2.107502799552072, "grad_norm": 5.53659725189209, "learning_rate": 1.4876819708846584e-05, "loss": 1.5367, "step": 18820 }, { "epoch": 2.108622620380739, "grad_norm": 6.703355312347412, "learning_rate": 1.4858156028368794e-05, "loss": 1.808, "step": 18830 }, { "epoch": 2.1097424412094066, "grad_norm": 13.882152557373047, "learning_rate": 1.4839492347891004e-05, "loss": 2.1221, "step": 18840 }, { "epoch": 2.110862262038074, "grad_norm": 4.497895240783691, "learning_rate": 1.4820828667413214e-05, "loss": 1.5897, "step": 18850 }, { "epoch": 2.1119820828667413, "grad_norm": 9.912936210632324, "learning_rate": 1.4802164986935424e-05, "loss": 1.9576, "step": 18860 }, { "epoch": 2.1131019036954086, "grad_norm": 14.399587631225586, "learning_rate": 1.4783501306457634e-05, "loss": 1.9239, "step": 18870 }, { "epoch": 2.114221724524076, "grad_norm": 17.03645133972168, "learning_rate": 1.4764837625979844e-05, "loss": 1.9142, "step": 18880 }, { "epoch": 2.1153415453527438, "grad_norm": 12.911978721618652, "learning_rate": 1.4746173945502054e-05, "loss": 2.1077, "step": 18890 }, { "epoch": 2.116461366181411, "grad_norm": 6.310555458068848, "learning_rate": 1.4727510265024263e-05, "loss": 1.6921, "step": 18900 }, { "epoch": 2.1175811870100785, "grad_norm": 5.524637699127197, "learning_rate": 1.4708846584546473e-05, "loss": 1.7968, "step": 18910 }, { "epoch": 2.118701007838746, "grad_norm": 12.02706527709961, "learning_rate": 1.4690182904068683e-05, "loss": 1.982, "step": 18920 }, { "epoch": 2.119820828667413, "grad_norm": 16.045839309692383, "learning_rate": 1.4671519223590893e-05, "loss": 1.5663, "step": 18930 }, { "epoch": 2.1209406494960805, "grad_norm": 5.100281715393066, "learning_rate": 1.4652855543113103e-05, "loss": 1.9206, "step": 18940 }, { "epoch": 2.122060470324748, "grad_norm": 8.30729866027832, "learning_rate": 1.4634191862635313e-05, "loss": 1.9549, "step": 18950 }, { "epoch": 2.1231802911534157, "grad_norm": 9.724970817565918, "learning_rate": 1.4615528182157523e-05, "loss": 1.8996, "step": 18960 }, { "epoch": 2.124300111982083, "grad_norm": 9.640581130981445, "learning_rate": 1.4596864501679733e-05, "loss": 1.6712, "step": 18970 }, { "epoch": 2.1254199328107504, "grad_norm": 7.71252965927124, "learning_rate": 1.4578200821201943e-05, "loss": 1.692, "step": 18980 }, { "epoch": 2.1265397536394177, "grad_norm": 6.05610466003418, "learning_rate": 1.4559537140724153e-05, "loss": 1.7808, "step": 18990 }, { "epoch": 2.127659574468085, "grad_norm": 12.274239540100098, "learning_rate": 1.4540873460246363e-05, "loss": 1.8396, "step": 19000 }, { "epoch": 2.1287793952967524, "grad_norm": 5.31697416305542, "learning_rate": 1.4522209779768573e-05, "loss": 1.7804, "step": 19010 }, { "epoch": 2.1298992161254198, "grad_norm": 19.778165817260742, "learning_rate": 1.450354609929078e-05, "loss": 1.6239, "step": 19020 }, { "epoch": 2.131019036954087, "grad_norm": 4.198515892028809, "learning_rate": 1.448488241881299e-05, "loss": 1.7749, "step": 19030 }, { "epoch": 2.132138857782755, "grad_norm": 5.769347667694092, "learning_rate": 1.44662187383352e-05, "loss": 1.7057, "step": 19040 }, { "epoch": 2.1332586786114223, "grad_norm": 4.867179870605469, "learning_rate": 1.444755505785741e-05, "loss": 1.5719, "step": 19050 }, { "epoch": 2.1343784994400896, "grad_norm": 4.64288854598999, "learning_rate": 1.442889137737962e-05, "loss": 1.8134, "step": 19060 }, { "epoch": 2.135498320268757, "grad_norm": 5.441596031188965, "learning_rate": 1.441022769690183e-05, "loss": 2.0065, "step": 19070 }, { "epoch": 2.1366181410974243, "grad_norm": 15.793349266052246, "learning_rate": 1.439156401642404e-05, "loss": 1.5416, "step": 19080 }, { "epoch": 2.1377379619260917, "grad_norm": 9.388581275939941, "learning_rate": 1.437290033594625e-05, "loss": 1.9277, "step": 19090 }, { "epoch": 2.1388577827547595, "grad_norm": 3.7332279682159424, "learning_rate": 1.435423665546846e-05, "loss": 1.7587, "step": 19100 }, { "epoch": 2.139977603583427, "grad_norm": 4.11780309677124, "learning_rate": 1.433557297499067e-05, "loss": 2.2728, "step": 19110 }, { "epoch": 2.141097424412094, "grad_norm": 4.731024742126465, "learning_rate": 1.4316909294512878e-05, "loss": 1.7996, "step": 19120 }, { "epoch": 2.1422172452407615, "grad_norm": 20.14070701599121, "learning_rate": 1.4298245614035088e-05, "loss": 1.6665, "step": 19130 }, { "epoch": 2.143337066069429, "grad_norm": 5.043517589569092, "learning_rate": 1.4279581933557298e-05, "loss": 1.8023, "step": 19140 }, { "epoch": 2.144456886898096, "grad_norm": 15.140097618103027, "learning_rate": 1.4260918253079508e-05, "loss": 1.9381, "step": 19150 }, { "epoch": 2.1455767077267636, "grad_norm": 5.910915374755859, "learning_rate": 1.4242254572601718e-05, "loss": 1.5582, "step": 19160 }, { "epoch": 2.146696528555431, "grad_norm": 4.935706615447998, "learning_rate": 1.4223590892123928e-05, "loss": 1.6278, "step": 19170 }, { "epoch": 2.1478163493840987, "grad_norm": 7.664555549621582, "learning_rate": 1.4204927211646138e-05, "loss": 2.1267, "step": 19180 }, { "epoch": 2.148936170212766, "grad_norm": 5.317629337310791, "learning_rate": 1.4186263531168348e-05, "loss": 1.6991, "step": 19190 }, { "epoch": 2.1500559910414334, "grad_norm": 8.390786170959473, "learning_rate": 1.4167599850690558e-05, "loss": 1.666, "step": 19200 }, { "epoch": 2.1511758118701008, "grad_norm": 4.987608432769775, "learning_rate": 1.4148936170212768e-05, "loss": 1.9875, "step": 19210 }, { "epoch": 2.152295632698768, "grad_norm": 13.22926139831543, "learning_rate": 1.4130272489734978e-05, "loss": 2.5654, "step": 19220 }, { "epoch": 2.1534154535274355, "grad_norm": 5.794547080993652, "learning_rate": 1.4111608809257188e-05, "loss": 1.5586, "step": 19230 }, { "epoch": 2.1545352743561033, "grad_norm": 18.272071838378906, "learning_rate": 1.4092945128779398e-05, "loss": 1.9604, "step": 19240 }, { "epoch": 2.1556550951847706, "grad_norm": 8.715819358825684, "learning_rate": 1.4074281448301604e-05, "loss": 2.167, "step": 19250 }, { "epoch": 2.156774916013438, "grad_norm": 5.621458530426025, "learning_rate": 1.4055617767823814e-05, "loss": 1.7484, "step": 19260 }, { "epoch": 2.1578947368421053, "grad_norm": 5.077019214630127, "learning_rate": 1.4036954087346024e-05, "loss": 1.7654, "step": 19270 }, { "epoch": 2.1590145576707727, "grad_norm": 4.050748825073242, "learning_rate": 1.4018290406868234e-05, "loss": 1.594, "step": 19280 }, { "epoch": 2.16013437849944, "grad_norm": 8.234112739562988, "learning_rate": 1.3999626726390444e-05, "loss": 2.0443, "step": 19290 }, { "epoch": 2.1612541993281074, "grad_norm": 9.419720649719238, "learning_rate": 1.3980963045912654e-05, "loss": 2.0494, "step": 19300 }, { "epoch": 2.1623740201567747, "grad_norm": 5.222434997558594, "learning_rate": 1.3962299365434864e-05, "loss": 2.1207, "step": 19310 }, { "epoch": 2.1634938409854425, "grad_norm": 4.949707508087158, "learning_rate": 1.3943635684957074e-05, "loss": 1.9951, "step": 19320 }, { "epoch": 2.16461366181411, "grad_norm": 5.496902942657471, "learning_rate": 1.3924972004479284e-05, "loss": 2.0267, "step": 19330 }, { "epoch": 2.165733482642777, "grad_norm": 21.034757614135742, "learning_rate": 1.3906308324001493e-05, "loss": 1.8192, "step": 19340 }, { "epoch": 2.1668533034714446, "grad_norm": 16.238187789916992, "learning_rate": 1.3887644643523703e-05, "loss": 1.9281, "step": 19350 }, { "epoch": 2.167973124300112, "grad_norm": 5.808258056640625, "learning_rate": 1.3868980963045913e-05, "loss": 1.7075, "step": 19360 }, { "epoch": 2.1690929451287793, "grad_norm": 4.748766899108887, "learning_rate": 1.3850317282568123e-05, "loss": 2.0612, "step": 19370 }, { "epoch": 2.1702127659574466, "grad_norm": 6.410683631896973, "learning_rate": 1.3831653602090333e-05, "loss": 1.8451, "step": 19380 }, { "epoch": 2.1713325867861144, "grad_norm": 9.00479507446289, "learning_rate": 1.3812989921612543e-05, "loss": 1.8231, "step": 19390 }, { "epoch": 2.1724524076147818, "grad_norm": 10.912854194641113, "learning_rate": 1.3794326241134753e-05, "loss": 1.75, "step": 19400 }, { "epoch": 2.173572228443449, "grad_norm": 4.568667888641357, "learning_rate": 1.3775662560656963e-05, "loss": 1.8237, "step": 19410 }, { "epoch": 2.1746920492721165, "grad_norm": 6.0548930168151855, "learning_rate": 1.3756998880179173e-05, "loss": 2.0656, "step": 19420 }, { "epoch": 2.175811870100784, "grad_norm": 6.4514031410217285, "learning_rate": 1.3738335199701383e-05, "loss": 1.8537, "step": 19430 }, { "epoch": 2.176931690929451, "grad_norm": 7.510464668273926, "learning_rate": 1.3719671519223593e-05, "loss": 2.1201, "step": 19440 }, { "epoch": 2.1780515117581185, "grad_norm": 6.162042617797852, "learning_rate": 1.3701007838745803e-05, "loss": 1.9171, "step": 19450 }, { "epoch": 2.1791713325867863, "grad_norm": 4.513441562652588, "learning_rate": 1.3682344158268013e-05, "loss": 1.6582, "step": 19460 }, { "epoch": 2.1802911534154537, "grad_norm": 5.428256988525391, "learning_rate": 1.3663680477790223e-05, "loss": 1.807, "step": 19470 }, { "epoch": 2.181410974244121, "grad_norm": 4.469424247741699, "learning_rate": 1.364501679731243e-05, "loss": 1.7969, "step": 19480 }, { "epoch": 2.1825307950727884, "grad_norm": 18.87086296081543, "learning_rate": 1.362635311683464e-05, "loss": 2.0421, "step": 19490 }, { "epoch": 2.1836506159014557, "grad_norm": 14.870500564575195, "learning_rate": 1.360768943635685e-05, "loss": 2.0896, "step": 19500 }, { "epoch": 2.184770436730123, "grad_norm": 4.639315605163574, "learning_rate": 1.358902575587906e-05, "loss": 1.6858, "step": 19510 }, { "epoch": 2.1858902575587904, "grad_norm": 5.512360572814941, "learning_rate": 1.357036207540127e-05, "loss": 1.7873, "step": 19520 }, { "epoch": 2.187010078387458, "grad_norm": 4.680398464202881, "learning_rate": 1.355169839492348e-05, "loss": 1.7172, "step": 19530 }, { "epoch": 2.1881298992161256, "grad_norm": 6.576661586761475, "learning_rate": 1.353303471444569e-05, "loss": 1.4995, "step": 19540 }, { "epoch": 2.189249720044793, "grad_norm": 5.627395153045654, "learning_rate": 1.3514371033967898e-05, "loss": 2.0323, "step": 19550 }, { "epoch": 2.1903695408734603, "grad_norm": 9.551543235778809, "learning_rate": 1.3495707353490108e-05, "loss": 1.9394, "step": 19560 }, { "epoch": 2.1914893617021276, "grad_norm": 3.9927797317504883, "learning_rate": 1.3477043673012318e-05, "loss": 1.7925, "step": 19570 }, { "epoch": 2.192609182530795, "grad_norm": 5.432565212249756, "learning_rate": 1.3458379992534528e-05, "loss": 1.9283, "step": 19580 }, { "epoch": 2.1937290033594623, "grad_norm": 16.03640365600586, "learning_rate": 1.3439716312056738e-05, "loss": 1.554, "step": 19590 }, { "epoch": 2.19484882418813, "grad_norm": 9.58271598815918, "learning_rate": 1.3421052631578948e-05, "loss": 2.2815, "step": 19600 }, { "epoch": 2.1959686450167974, "grad_norm": 5.797165393829346, "learning_rate": 1.3402388951101158e-05, "loss": 2.107, "step": 19610 }, { "epoch": 2.197088465845465, "grad_norm": 11.725902557373047, "learning_rate": 1.3383725270623368e-05, "loss": 1.8659, "step": 19620 }, { "epoch": 2.198208286674132, "grad_norm": 16.76238441467285, "learning_rate": 1.3365061590145578e-05, "loss": 1.9411, "step": 19630 }, { "epoch": 2.1993281075027995, "grad_norm": 4.064399242401123, "learning_rate": 1.3346397909667788e-05, "loss": 2.0996, "step": 19640 }, { "epoch": 2.200447928331467, "grad_norm": 12.260157585144043, "learning_rate": 1.3327734229189998e-05, "loss": 1.8016, "step": 19650 }, { "epoch": 2.201567749160134, "grad_norm": 4.968259811401367, "learning_rate": 1.3309070548712208e-05, "loss": 2.5253, "step": 19660 }, { "epoch": 2.202687569988802, "grad_norm": 15.491079330444336, "learning_rate": 1.3290406868234418e-05, "loss": 1.9006, "step": 19670 }, { "epoch": 2.2038073908174693, "grad_norm": 16.073698043823242, "learning_rate": 1.3271743187756628e-05, "loss": 2.1384, "step": 19680 }, { "epoch": 2.2049272116461367, "grad_norm": 4.668467998504639, "learning_rate": 1.3253079507278838e-05, "loss": 1.7092, "step": 19690 }, { "epoch": 2.206047032474804, "grad_norm": 16.72428321838379, "learning_rate": 1.3234415826801048e-05, "loss": 2.2848, "step": 19700 }, { "epoch": 2.2071668533034714, "grad_norm": 16.04388999938965, "learning_rate": 1.3215752146323254e-05, "loss": 1.7384, "step": 19710 }, { "epoch": 2.2082866741321387, "grad_norm": 7.498695373535156, "learning_rate": 1.3197088465845464e-05, "loss": 1.7484, "step": 19720 }, { "epoch": 2.209406494960806, "grad_norm": 4.43148136138916, "learning_rate": 1.3178424785367674e-05, "loss": 1.845, "step": 19730 }, { "epoch": 2.2105263157894735, "grad_norm": 3.5520262718200684, "learning_rate": 1.3159761104889884e-05, "loss": 2.0745, "step": 19740 }, { "epoch": 2.2116461366181412, "grad_norm": 8.417689323425293, "learning_rate": 1.3141097424412094e-05, "loss": 2.0317, "step": 19750 }, { "epoch": 2.2127659574468086, "grad_norm": 6.288638114929199, "learning_rate": 1.3122433743934304e-05, "loss": 1.9475, "step": 19760 }, { "epoch": 2.213885778275476, "grad_norm": 9.5358304977417, "learning_rate": 1.3103770063456513e-05, "loss": 1.5572, "step": 19770 }, { "epoch": 2.2150055991041433, "grad_norm": 6.784647464752197, "learning_rate": 1.3085106382978723e-05, "loss": 1.6474, "step": 19780 }, { "epoch": 2.2161254199328106, "grad_norm": 6.584368705749512, "learning_rate": 1.3066442702500933e-05, "loss": 1.625, "step": 19790 }, { "epoch": 2.217245240761478, "grad_norm": 10.06530475616455, "learning_rate": 1.3047779022023143e-05, "loss": 2.4165, "step": 19800 }, { "epoch": 2.218365061590146, "grad_norm": 8.728497505187988, "learning_rate": 1.3029115341545353e-05, "loss": 2.192, "step": 19810 }, { "epoch": 2.219484882418813, "grad_norm": 4.747127532958984, "learning_rate": 1.3010451661067563e-05, "loss": 1.8832, "step": 19820 }, { "epoch": 2.2206047032474805, "grad_norm": 4.508890628814697, "learning_rate": 1.2991787980589773e-05, "loss": 1.9551, "step": 19830 }, { "epoch": 2.221724524076148, "grad_norm": 9.029202461242676, "learning_rate": 1.2973124300111983e-05, "loss": 1.9141, "step": 19840 }, { "epoch": 2.222844344904815, "grad_norm": 4.136125087738037, "learning_rate": 1.2954460619634193e-05, "loss": 1.9544, "step": 19850 }, { "epoch": 2.2239641657334825, "grad_norm": 4.724370002746582, "learning_rate": 1.2935796939156403e-05, "loss": 1.5162, "step": 19860 }, { "epoch": 2.22508398656215, "grad_norm": 5.846231937408447, "learning_rate": 1.2917133258678613e-05, "loss": 2.1023, "step": 19870 }, { "epoch": 2.2262038073908172, "grad_norm": 5.567933082580566, "learning_rate": 1.2898469578200823e-05, "loss": 1.5832, "step": 19880 }, { "epoch": 2.227323628219485, "grad_norm": 13.980506896972656, "learning_rate": 1.2879805897723033e-05, "loss": 1.5861, "step": 19890 }, { "epoch": 2.2284434490481524, "grad_norm": 14.191877365112305, "learning_rate": 1.2861142217245243e-05, "loss": 1.6714, "step": 19900 }, { "epoch": 2.2295632698768197, "grad_norm": 10.855998992919922, "learning_rate": 1.2842478536767453e-05, "loss": 1.9644, "step": 19910 }, { "epoch": 2.230683090705487, "grad_norm": 5.852384090423584, "learning_rate": 1.2823814856289663e-05, "loss": 1.8953, "step": 19920 }, { "epoch": 2.2318029115341544, "grad_norm": 5.915239334106445, "learning_rate": 1.2805151175811871e-05, "loss": 1.5698, "step": 19930 }, { "epoch": 2.232922732362822, "grad_norm": 5.294043064117432, "learning_rate": 1.2786487495334081e-05, "loss": 2.0655, "step": 19940 }, { "epoch": 2.2340425531914896, "grad_norm": 8.937568664550781, "learning_rate": 1.276782381485629e-05, "loss": 1.6406, "step": 19950 }, { "epoch": 2.235162374020157, "grad_norm": 3.592744827270508, "learning_rate": 1.27491601343785e-05, "loss": 1.5429, "step": 19960 }, { "epoch": 2.2362821948488243, "grad_norm": 5.134018898010254, "learning_rate": 1.273049645390071e-05, "loss": 2.1943, "step": 19970 }, { "epoch": 2.2374020156774916, "grad_norm": 4.749664306640625, "learning_rate": 1.271183277342292e-05, "loss": 1.8952, "step": 19980 }, { "epoch": 2.238521836506159, "grad_norm": 14.125395774841309, "learning_rate": 1.2693169092945128e-05, "loss": 1.9117, "step": 19990 }, { "epoch": 2.2396416573348263, "grad_norm": 5.6524858474731445, "learning_rate": 1.2674505412467338e-05, "loss": 1.8199, "step": 20000 }, { "epoch": 2.2407614781634937, "grad_norm": 9.836930274963379, "learning_rate": 1.2655841731989548e-05, "loss": 1.6642, "step": 20010 }, { "epoch": 2.241881298992161, "grad_norm": 19.449764251708984, "learning_rate": 1.2637178051511758e-05, "loss": 1.9992, "step": 20020 }, { "epoch": 2.243001119820829, "grad_norm": 6.832662105560303, "learning_rate": 1.2618514371033968e-05, "loss": 1.5874, "step": 20030 }, { "epoch": 2.244120940649496, "grad_norm": 17.8643856048584, "learning_rate": 1.2599850690556178e-05, "loss": 2.1804, "step": 20040 }, { "epoch": 2.2452407614781635, "grad_norm": 5.4305620193481445, "learning_rate": 1.2581187010078388e-05, "loss": 1.7595, "step": 20050 }, { "epoch": 2.246360582306831, "grad_norm": 5.813434600830078, "learning_rate": 1.2562523329600598e-05, "loss": 1.8339, "step": 20060 }, { "epoch": 2.2474804031354982, "grad_norm": 22.452621459960938, "learning_rate": 1.2543859649122808e-05, "loss": 2.0927, "step": 20070 }, { "epoch": 2.2486002239641656, "grad_norm": 5.384066104888916, "learning_rate": 1.2525195968645018e-05, "loss": 1.8251, "step": 20080 }, { "epoch": 2.249720044792833, "grad_norm": 16.19381332397461, "learning_rate": 1.2506532288167228e-05, "loss": 1.9519, "step": 20090 }, { "epoch": 2.2508398656215007, "grad_norm": 5.359135627746582, "learning_rate": 1.2487868607689438e-05, "loss": 1.7556, "step": 20100 }, { "epoch": 2.251959686450168, "grad_norm": 8.93488597869873, "learning_rate": 1.2469204927211648e-05, "loss": 2.0605, "step": 20110 }, { "epoch": 2.2530795072788354, "grad_norm": 7.26114559173584, "learning_rate": 1.2450541246733856e-05, "loss": 1.9192, "step": 20120 }, { "epoch": 2.254199328107503, "grad_norm": 10.906415939331055, "learning_rate": 1.2431877566256066e-05, "loss": 1.653, "step": 20130 }, { "epoch": 2.25531914893617, "grad_norm": 5.915148735046387, "learning_rate": 1.2413213885778276e-05, "loss": 2.0476, "step": 20140 }, { "epoch": 2.2564389697648375, "grad_norm": 10.197397232055664, "learning_rate": 1.2394550205300486e-05, "loss": 1.9027, "step": 20150 }, { "epoch": 2.257558790593505, "grad_norm": 14.30677318572998, "learning_rate": 1.2375886524822696e-05, "loss": 1.4843, "step": 20160 }, { "epoch": 2.2586786114221726, "grad_norm": 4.197308540344238, "learning_rate": 1.2357222844344905e-05, "loss": 1.9333, "step": 20170 }, { "epoch": 2.25979843225084, "grad_norm": 6.416319847106934, "learning_rate": 1.2338559163867115e-05, "loss": 2.047, "step": 20180 }, { "epoch": 2.2609182530795073, "grad_norm": 3.727569818496704, "learning_rate": 1.2319895483389325e-05, "loss": 2.0547, "step": 20190 }, { "epoch": 2.2620380739081747, "grad_norm": 4.975082874298096, "learning_rate": 1.2301231802911535e-05, "loss": 1.9228, "step": 20200 }, { "epoch": 2.263157894736842, "grad_norm": 10.25108528137207, "learning_rate": 1.2282568122433745e-05, "loss": 1.7067, "step": 20210 }, { "epoch": 2.2642777155655094, "grad_norm": 6.914962291717529, "learning_rate": 1.2263904441955955e-05, "loss": 1.9841, "step": 20220 }, { "epoch": 2.265397536394177, "grad_norm": 5.214871883392334, "learning_rate": 1.2245240761478165e-05, "loss": 1.6495, "step": 20230 }, { "epoch": 2.2665173572228445, "grad_norm": 11.081496238708496, "learning_rate": 1.2226577081000373e-05, "loss": 1.9833, "step": 20240 }, { "epoch": 2.267637178051512, "grad_norm": 7.861496448516846, "learning_rate": 1.2207913400522583e-05, "loss": 2.2553, "step": 20250 }, { "epoch": 2.2687569988801792, "grad_norm": 4.18981409072876, "learning_rate": 1.2189249720044793e-05, "loss": 1.9001, "step": 20260 }, { "epoch": 2.2698768197088466, "grad_norm": 15.897494316101074, "learning_rate": 1.2170586039567003e-05, "loss": 1.943, "step": 20270 }, { "epoch": 2.270996640537514, "grad_norm": 14.775899887084961, "learning_rate": 1.2151922359089213e-05, "loss": 2.1118, "step": 20280 }, { "epoch": 2.2721164613661813, "grad_norm": 10.079516410827637, "learning_rate": 1.2133258678611423e-05, "loss": 1.9167, "step": 20290 }, { "epoch": 2.2732362821948486, "grad_norm": 4.894615173339844, "learning_rate": 1.2114594998133633e-05, "loss": 2.1881, "step": 20300 }, { "epoch": 2.2743561030235164, "grad_norm": 16.707927703857422, "learning_rate": 1.2095931317655843e-05, "loss": 2.2987, "step": 20310 }, { "epoch": 2.275475923852184, "grad_norm": 7.284656524658203, "learning_rate": 1.2077267637178053e-05, "loss": 1.7505, "step": 20320 }, { "epoch": 2.276595744680851, "grad_norm": 5.649649143218994, "learning_rate": 1.2058603956700263e-05, "loss": 2.2449, "step": 20330 }, { "epoch": 2.2777155655095185, "grad_norm": 5.512756824493408, "learning_rate": 1.2039940276222473e-05, "loss": 1.8765, "step": 20340 }, { "epoch": 2.278835386338186, "grad_norm": 15.581269264221191, "learning_rate": 1.2021276595744681e-05, "loss": 2.1552, "step": 20350 }, { "epoch": 2.279955207166853, "grad_norm": 13.498668670654297, "learning_rate": 1.2002612915266891e-05, "loss": 2.213, "step": 20360 }, { "epoch": 2.2810750279955205, "grad_norm": 9.995055198669434, "learning_rate": 1.1983949234789101e-05, "loss": 2.079, "step": 20370 }, { "epoch": 2.2821948488241883, "grad_norm": 3.790062189102173, "learning_rate": 1.1965285554311311e-05, "loss": 1.6197, "step": 20380 }, { "epoch": 2.2833146696528557, "grad_norm": 4.261139392852783, "learning_rate": 1.194662187383352e-05, "loss": 2.0012, "step": 20390 }, { "epoch": 2.284434490481523, "grad_norm": 3.16943621635437, "learning_rate": 1.192795819335573e-05, "loss": 1.9648, "step": 20400 }, { "epoch": 2.2855543113101904, "grad_norm": 5.687836647033691, "learning_rate": 1.190929451287794e-05, "loss": 1.6828, "step": 20410 }, { "epoch": 2.2866741321388577, "grad_norm": 5.451729774475098, "learning_rate": 1.189063083240015e-05, "loss": 2.2748, "step": 20420 }, { "epoch": 2.287793952967525, "grad_norm": 10.777783393859863, "learning_rate": 1.187196715192236e-05, "loss": 2.0554, "step": 20430 }, { "epoch": 2.2889137737961924, "grad_norm": 5.5777459144592285, "learning_rate": 1.185330347144457e-05, "loss": 1.9674, "step": 20440 }, { "epoch": 2.29003359462486, "grad_norm": 11.55379581451416, "learning_rate": 1.183463979096678e-05, "loss": 1.8068, "step": 20450 }, { "epoch": 2.2911534154535276, "grad_norm": 4.206369876861572, "learning_rate": 1.181597611048899e-05, "loss": 2.1127, "step": 20460 }, { "epoch": 2.292273236282195, "grad_norm": 6.190250873565674, "learning_rate": 1.1797312430011198e-05, "loss": 2.1131, "step": 20470 }, { "epoch": 2.2933930571108623, "grad_norm": 4.842706203460693, "learning_rate": 1.1778648749533408e-05, "loss": 1.7894, "step": 20480 }, { "epoch": 2.2945128779395296, "grad_norm": 9.449223518371582, "learning_rate": 1.1759985069055618e-05, "loss": 1.7278, "step": 20490 }, { "epoch": 2.295632698768197, "grad_norm": 11.598458290100098, "learning_rate": 1.1741321388577828e-05, "loss": 1.9876, "step": 20500 }, { "epoch": 2.2967525195968643, "grad_norm": 4.6162238121032715, "learning_rate": 1.1722657708100038e-05, "loss": 1.4452, "step": 20510 }, { "epoch": 2.297872340425532, "grad_norm": 17.614953994750977, "learning_rate": 1.1703994027622248e-05, "loss": 2.0197, "step": 20520 }, { "epoch": 2.2989921612541995, "grad_norm": 13.69670295715332, "learning_rate": 1.1685330347144458e-05, "loss": 1.661, "step": 20530 }, { "epoch": 2.300111982082867, "grad_norm": 3.7186923027038574, "learning_rate": 1.1666666666666668e-05, "loss": 2.0852, "step": 20540 }, { "epoch": 2.301231802911534, "grad_norm": 7.386007308959961, "learning_rate": 1.1648002986188878e-05, "loss": 1.8018, "step": 20550 }, { "epoch": 2.3023516237402015, "grad_norm": 4.978532791137695, "learning_rate": 1.1629339305711088e-05, "loss": 1.8555, "step": 20560 }, { "epoch": 2.303471444568869, "grad_norm": 11.135411262512207, "learning_rate": 1.1610675625233298e-05, "loss": 2.0469, "step": 20570 }, { "epoch": 2.3045912653975362, "grad_norm": 4.908180236816406, "learning_rate": 1.1592011944755506e-05, "loss": 1.5782, "step": 20580 }, { "epoch": 2.3057110862262036, "grad_norm": 9.538016319274902, "learning_rate": 1.1573348264277716e-05, "loss": 2.1164, "step": 20590 }, { "epoch": 2.3068309070548714, "grad_norm": 10.626128196716309, "learning_rate": 1.1554684583799926e-05, "loss": 2.0624, "step": 20600 }, { "epoch": 2.3079507278835387, "grad_norm": 6.66682243347168, "learning_rate": 1.1536020903322135e-05, "loss": 1.7405, "step": 20610 }, { "epoch": 2.309070548712206, "grad_norm": 5.158255100250244, "learning_rate": 1.1517357222844345e-05, "loss": 1.9986, "step": 20620 }, { "epoch": 2.3101903695408734, "grad_norm": 12.696172714233398, "learning_rate": 1.1498693542366555e-05, "loss": 1.9899, "step": 20630 }, { "epoch": 2.3113101903695408, "grad_norm": 10.744148254394531, "learning_rate": 1.1480029861888765e-05, "loss": 2.0481, "step": 20640 }, { "epoch": 2.312430011198208, "grad_norm": 10.337085723876953, "learning_rate": 1.1461366181410975e-05, "loss": 2.08, "step": 20650 }, { "epoch": 2.313549832026876, "grad_norm": 9.263678550720215, "learning_rate": 1.1442702500933185e-05, "loss": 1.7892, "step": 20660 }, { "epoch": 2.3146696528555433, "grad_norm": 12.324502944946289, "learning_rate": 1.1424038820455395e-05, "loss": 1.9377, "step": 20670 }, { "epoch": 2.3157894736842106, "grad_norm": 6.701484203338623, "learning_rate": 1.1405375139977605e-05, "loss": 1.8667, "step": 20680 }, { "epoch": 2.316909294512878, "grad_norm": 7.743568420410156, "learning_rate": 1.1386711459499815e-05, "loss": 1.5235, "step": 20690 }, { "epoch": 2.3180291153415453, "grad_norm": 4.805120468139648, "learning_rate": 1.1368047779022023e-05, "loss": 1.8663, "step": 20700 }, { "epoch": 2.3191489361702127, "grad_norm": 6.219134330749512, "learning_rate": 1.1349384098544233e-05, "loss": 1.5116, "step": 20710 }, { "epoch": 2.32026875699888, "grad_norm": 12.677251815795898, "learning_rate": 1.1330720418066443e-05, "loss": 1.8188, "step": 20720 }, { "epoch": 2.3213885778275474, "grad_norm": 17.74958038330078, "learning_rate": 1.1312056737588653e-05, "loss": 2.178, "step": 20730 }, { "epoch": 2.322508398656215, "grad_norm": 11.15487289428711, "learning_rate": 1.1293393057110863e-05, "loss": 2.0488, "step": 20740 }, { "epoch": 2.3236282194848825, "grad_norm": 15.36052417755127, "learning_rate": 1.1274729376633073e-05, "loss": 1.7865, "step": 20750 }, { "epoch": 2.32474804031355, "grad_norm": 14.987112998962402, "learning_rate": 1.1256065696155283e-05, "loss": 1.6118, "step": 20760 }, { "epoch": 2.325867861142217, "grad_norm": 4.916079998016357, "learning_rate": 1.1237402015677493e-05, "loss": 2.0262, "step": 20770 }, { "epoch": 2.3269876819708846, "grad_norm": 20.49549102783203, "learning_rate": 1.1218738335199703e-05, "loss": 2.1032, "step": 20780 }, { "epoch": 2.328107502799552, "grad_norm": 16.82073402404785, "learning_rate": 1.1200074654721911e-05, "loss": 2.0666, "step": 20790 }, { "epoch": 2.3292273236282197, "grad_norm": 4.178780555725098, "learning_rate": 1.1181410974244121e-05, "loss": 1.8655, "step": 20800 }, { "epoch": 2.330347144456887, "grad_norm": 16.733497619628906, "learning_rate": 1.1162747293766331e-05, "loss": 1.7731, "step": 20810 }, { "epoch": 2.3314669652855544, "grad_norm": 5.161161422729492, "learning_rate": 1.114408361328854e-05, "loss": 1.6687, "step": 20820 }, { "epoch": 2.3325867861142218, "grad_norm": 4.8293304443359375, "learning_rate": 1.112541993281075e-05, "loss": 2.1226, "step": 20830 }, { "epoch": 2.333706606942889, "grad_norm": 9.590071678161621, "learning_rate": 1.110675625233296e-05, "loss": 1.528, "step": 20840 }, { "epoch": 2.3348264277715565, "grad_norm": 6.294408321380615, "learning_rate": 1.108809257185517e-05, "loss": 1.73, "step": 20850 }, { "epoch": 2.335946248600224, "grad_norm": 10.485013008117676, "learning_rate": 1.106942889137738e-05, "loss": 2.159, "step": 20860 }, { "epoch": 2.337066069428891, "grad_norm": 4.454178333282471, "learning_rate": 1.105076521089959e-05, "loss": 1.6548, "step": 20870 }, { "epoch": 2.338185890257559, "grad_norm": 5.956608772277832, "learning_rate": 1.10321015304218e-05, "loss": 1.5492, "step": 20880 }, { "epoch": 2.3393057110862263, "grad_norm": 7.09451150894165, "learning_rate": 1.101343784994401e-05, "loss": 1.6076, "step": 20890 }, { "epoch": 2.3404255319148937, "grad_norm": 13.640632629394531, "learning_rate": 1.099477416946622e-05, "loss": 2.6322, "step": 20900 }, { "epoch": 2.341545352743561, "grad_norm": 13.958121299743652, "learning_rate": 1.097611048898843e-05, "loss": 2.269, "step": 20910 }, { "epoch": 2.3426651735722284, "grad_norm": 4.459420680999756, "learning_rate": 1.095744680851064e-05, "loss": 2.1053, "step": 20920 }, { "epoch": 2.3437849944008957, "grad_norm": 6.627596855163574, "learning_rate": 1.093878312803285e-05, "loss": 1.8687, "step": 20930 }, { "epoch": 2.3449048152295635, "grad_norm": 11.237924575805664, "learning_rate": 1.0920119447555058e-05, "loss": 1.8006, "step": 20940 }, { "epoch": 2.346024636058231, "grad_norm": 6.611232757568359, "learning_rate": 1.0901455767077268e-05, "loss": 1.7491, "step": 20950 }, { "epoch": 2.347144456886898, "grad_norm": 4.241340160369873, "learning_rate": 1.0882792086599478e-05, "loss": 1.9278, "step": 20960 }, { "epoch": 2.3482642777155656, "grad_norm": 5.34893274307251, "learning_rate": 1.0864128406121688e-05, "loss": 1.9011, "step": 20970 }, { "epoch": 2.349384098544233, "grad_norm": 15.109663009643555, "learning_rate": 1.0845464725643898e-05, "loss": 1.8797, "step": 20980 }, { "epoch": 2.3505039193729003, "grad_norm": 6.220200061798096, "learning_rate": 1.0826801045166108e-05, "loss": 2.1459, "step": 20990 }, { "epoch": 2.3516237402015676, "grad_norm": 4.657541751861572, "learning_rate": 1.0808137364688318e-05, "loss": 1.7789, "step": 21000 }, { "epoch": 2.352743561030235, "grad_norm": 3.8326404094696045, "learning_rate": 1.0789473684210526e-05, "loss": 2.0583, "step": 21010 }, { "epoch": 2.3538633818589028, "grad_norm": 6.379500865936279, "learning_rate": 1.0770810003732736e-05, "loss": 1.9574, "step": 21020 }, { "epoch": 2.35498320268757, "grad_norm": 8.482017517089844, "learning_rate": 1.0752146323254946e-05, "loss": 1.7585, "step": 21030 }, { "epoch": 2.3561030235162375, "grad_norm": 10.17912769317627, "learning_rate": 1.0733482642777156e-05, "loss": 2.1129, "step": 21040 }, { "epoch": 2.357222844344905, "grad_norm": 5.909788131713867, "learning_rate": 1.0714818962299365e-05, "loss": 1.8167, "step": 21050 }, { "epoch": 2.358342665173572, "grad_norm": 4.550489902496338, "learning_rate": 1.0696155281821575e-05, "loss": 1.4302, "step": 21060 }, { "epoch": 2.3594624860022395, "grad_norm": 9.305154800415039, "learning_rate": 1.0677491601343785e-05, "loss": 1.5549, "step": 21070 }, { "epoch": 2.360582306830907, "grad_norm": 6.617506504058838, "learning_rate": 1.0658827920865995e-05, "loss": 1.8111, "step": 21080 }, { "epoch": 2.3617021276595747, "grad_norm": 4.230848789215088, "learning_rate": 1.0640164240388205e-05, "loss": 1.604, "step": 21090 }, { "epoch": 2.362821948488242, "grad_norm": 5.817328929901123, "learning_rate": 1.0621500559910415e-05, "loss": 2.2684, "step": 21100 }, { "epoch": 2.3639417693169094, "grad_norm": 11.449498176574707, "learning_rate": 1.0602836879432625e-05, "loss": 1.7141, "step": 21110 }, { "epoch": 2.3650615901455767, "grad_norm": 11.167801856994629, "learning_rate": 1.0584173198954835e-05, "loss": 2.0209, "step": 21120 }, { "epoch": 2.366181410974244, "grad_norm": 10.6549711227417, "learning_rate": 1.0565509518477045e-05, "loss": 1.5182, "step": 21130 }, { "epoch": 2.3673012318029114, "grad_norm": 7.9820122718811035, "learning_rate": 1.0546845837999255e-05, "loss": 1.5622, "step": 21140 }, { "epoch": 2.3684210526315788, "grad_norm": 3.4148659706115723, "learning_rate": 1.0528182157521465e-05, "loss": 2.0534, "step": 21150 }, { "epoch": 2.369540873460246, "grad_norm": 11.469594955444336, "learning_rate": 1.0509518477043675e-05, "loss": 2.0005, "step": 21160 }, { "epoch": 2.370660694288914, "grad_norm": 6.6395697593688965, "learning_rate": 1.0490854796565883e-05, "loss": 2.0595, "step": 21170 }, { "epoch": 2.3717805151175813, "grad_norm": 6.698371410369873, "learning_rate": 1.0472191116088093e-05, "loss": 2.0216, "step": 21180 }, { "epoch": 2.3729003359462486, "grad_norm": 12.63893985748291, "learning_rate": 1.0453527435610303e-05, "loss": 2.1768, "step": 21190 }, { "epoch": 2.374020156774916, "grad_norm": 18.96299171447754, "learning_rate": 1.0434863755132513e-05, "loss": 1.8743, "step": 21200 }, { "epoch": 2.3751399776035833, "grad_norm": 5.689150810241699, "learning_rate": 1.0416200074654723e-05, "loss": 1.7398, "step": 21210 }, { "epoch": 2.3762597984322507, "grad_norm": 7.150450229644775, "learning_rate": 1.0397536394176933e-05, "loss": 1.4524, "step": 21220 }, { "epoch": 2.3773796192609185, "grad_norm": 11.126862525939941, "learning_rate": 1.0378872713699141e-05, "loss": 2.1519, "step": 21230 }, { "epoch": 2.378499440089586, "grad_norm": 5.954022407531738, "learning_rate": 1.0360209033221351e-05, "loss": 2.1018, "step": 21240 }, { "epoch": 2.379619260918253, "grad_norm": 13.803711891174316, "learning_rate": 1.0341545352743561e-05, "loss": 1.5974, "step": 21250 }, { "epoch": 2.3807390817469205, "grad_norm": 8.766247749328613, "learning_rate": 1.0322881672265771e-05, "loss": 2.0202, "step": 21260 }, { "epoch": 2.381858902575588, "grad_norm": 10.347888946533203, "learning_rate": 1.0304217991787981e-05, "loss": 1.5816, "step": 21270 }, { "epoch": 2.382978723404255, "grad_norm": 14.29037094116211, "learning_rate": 1.0285554311310191e-05, "loss": 1.8299, "step": 21280 }, { "epoch": 2.3840985442329226, "grad_norm": 11.084178924560547, "learning_rate": 1.02668906308324e-05, "loss": 2.0029, "step": 21290 }, { "epoch": 2.38521836506159, "grad_norm": 4.837276935577393, "learning_rate": 1.024822695035461e-05, "loss": 1.931, "step": 21300 }, { "epoch": 2.3863381858902577, "grad_norm": 13.397496223449707, "learning_rate": 1.022956326987682e-05, "loss": 1.6302, "step": 21310 }, { "epoch": 2.387458006718925, "grad_norm": 14.555484771728516, "learning_rate": 1.021089958939903e-05, "loss": 1.988, "step": 21320 }, { "epoch": 2.3885778275475924, "grad_norm": 15.178104400634766, "learning_rate": 1.019223590892124e-05, "loss": 2.0671, "step": 21330 }, { "epoch": 2.3896976483762598, "grad_norm": 19.128128051757812, "learning_rate": 1.017357222844345e-05, "loss": 1.8196, "step": 21340 }, { "epoch": 2.390817469204927, "grad_norm": 5.763383865356445, "learning_rate": 1.015490854796566e-05, "loss": 1.6087, "step": 21350 }, { "epoch": 2.3919372900335945, "grad_norm": 8.197660446166992, "learning_rate": 1.013624486748787e-05, "loss": 1.7122, "step": 21360 }, { "epoch": 2.3930571108622622, "grad_norm": 14.964632987976074, "learning_rate": 1.011758118701008e-05, "loss": 1.8517, "step": 21370 }, { "epoch": 2.3941769316909296, "grad_norm": 4.898643970489502, "learning_rate": 1.009891750653229e-05, "loss": 1.7062, "step": 21380 }, { "epoch": 2.395296752519597, "grad_norm": 6.609580039978027, "learning_rate": 1.00802538260545e-05, "loss": 1.8959, "step": 21390 }, { "epoch": 2.3964165733482643, "grad_norm": 5.1216912269592285, "learning_rate": 1.0061590145576708e-05, "loss": 2.305, "step": 21400 }, { "epoch": 2.3975363941769317, "grad_norm": 12.660892486572266, "learning_rate": 1.0042926465098918e-05, "loss": 1.8053, "step": 21410 }, { "epoch": 2.398656215005599, "grad_norm": 14.176844596862793, "learning_rate": 1.0024262784621128e-05, "loss": 1.9157, "step": 21420 }, { "epoch": 2.3997760358342664, "grad_norm": 3.574338674545288, "learning_rate": 1.0005599104143338e-05, "loss": 2.1178, "step": 21430 }, { "epoch": 2.4008958566629337, "grad_norm": 7.545993804931641, "learning_rate": 9.986935423665546e-06, "loss": 1.9478, "step": 21440 }, { "epoch": 2.4020156774916015, "grad_norm": 5.757676601409912, "learning_rate": 9.968271743187756e-06, "loss": 1.6439, "step": 21450 }, { "epoch": 2.403135498320269, "grad_norm": 8.382559776306152, "learning_rate": 9.949608062709966e-06, "loss": 2.1037, "step": 21460 }, { "epoch": 2.404255319148936, "grad_norm": 20.870803833007812, "learning_rate": 9.930944382232176e-06, "loss": 2.1286, "step": 21470 }, { "epoch": 2.4053751399776035, "grad_norm": 10.343074798583984, "learning_rate": 9.912280701754386e-06, "loss": 1.7321, "step": 21480 }, { "epoch": 2.406494960806271, "grad_norm": 11.975387573242188, "learning_rate": 9.893617021276596e-06, "loss": 1.8359, "step": 21490 }, { "epoch": 2.4076147816349383, "grad_norm": 9.322503089904785, "learning_rate": 9.874953340798806e-06, "loss": 1.7441, "step": 21500 }, { "epoch": 2.408734602463606, "grad_norm": 5.631967544555664, "learning_rate": 9.856289660321016e-06, "loss": 1.8915, "step": 21510 }, { "epoch": 2.4098544232922734, "grad_norm": 11.873062133789062, "learning_rate": 9.837625979843225e-06, "loss": 1.924, "step": 21520 }, { "epoch": 2.4109742441209407, "grad_norm": 5.398472785949707, "learning_rate": 9.818962299365435e-06, "loss": 1.8141, "step": 21530 }, { "epoch": 2.412094064949608, "grad_norm": 5.431132793426514, "learning_rate": 9.800298618887645e-06, "loss": 2.0006, "step": 21540 }, { "epoch": 2.4132138857782754, "grad_norm": 6.424263000488281, "learning_rate": 9.781634938409855e-06, "loss": 2.2687, "step": 21550 }, { "epoch": 2.414333706606943, "grad_norm": 15.934911727905273, "learning_rate": 9.762971257932065e-06, "loss": 1.5339, "step": 21560 }, { "epoch": 2.41545352743561, "grad_norm": 8.640837669372559, "learning_rate": 9.744307577454275e-06, "loss": 1.9506, "step": 21570 }, { "epoch": 2.4165733482642775, "grad_norm": 10.464945793151855, "learning_rate": 9.725643896976485e-06, "loss": 2.1268, "step": 21580 }, { "epoch": 2.4176931690929453, "grad_norm": 10.642047882080078, "learning_rate": 9.706980216498695e-06, "loss": 1.7478, "step": 21590 }, { "epoch": 2.4188129899216126, "grad_norm": 10.365204811096191, "learning_rate": 9.688316536020905e-06, "loss": 2.1777, "step": 21600 }, { "epoch": 2.41993281075028, "grad_norm": 13.834842681884766, "learning_rate": 9.669652855543115e-06, "loss": 2.0466, "step": 21610 }, { "epoch": 2.4210526315789473, "grad_norm": 11.793619155883789, "learning_rate": 9.650989175065325e-06, "loss": 1.707, "step": 21620 }, { "epoch": 2.4221724524076147, "grad_norm": 12.311315536499023, "learning_rate": 9.632325494587533e-06, "loss": 1.8893, "step": 21630 }, { "epoch": 2.423292273236282, "grad_norm": 11.055502891540527, "learning_rate": 9.613661814109743e-06, "loss": 1.9288, "step": 21640 }, { "epoch": 2.42441209406495, "grad_norm": 4.9164934158325195, "learning_rate": 9.594998133631953e-06, "loss": 2.414, "step": 21650 }, { "epoch": 2.425531914893617, "grad_norm": 15.42969036102295, "learning_rate": 9.576334453154161e-06, "loss": 1.9903, "step": 21660 }, { "epoch": 2.4266517357222845, "grad_norm": 10.449507713317871, "learning_rate": 9.557670772676371e-06, "loss": 1.6622, "step": 21670 }, { "epoch": 2.427771556550952, "grad_norm": 14.963556289672852, "learning_rate": 9.539007092198581e-06, "loss": 1.6082, "step": 21680 }, { "epoch": 2.4288913773796192, "grad_norm": 13.123733520507812, "learning_rate": 9.520343411720791e-06, "loss": 1.9141, "step": 21690 }, { "epoch": 2.4300111982082866, "grad_norm": 6.848084449768066, "learning_rate": 9.501679731243001e-06, "loss": 1.83, "step": 21700 }, { "epoch": 2.431131019036954, "grad_norm": 5.545853137969971, "learning_rate": 9.483016050765211e-06, "loss": 2.2289, "step": 21710 }, { "epoch": 2.4322508398656213, "grad_norm": 11.716573715209961, "learning_rate": 9.464352370287421e-06, "loss": 1.954, "step": 21720 }, { "epoch": 2.433370660694289, "grad_norm": 7.332011699676514, "learning_rate": 9.445688689809631e-06, "loss": 1.6223, "step": 21730 }, { "epoch": 2.4344904815229564, "grad_norm": 8.037787437438965, "learning_rate": 9.427025009331841e-06, "loss": 1.8996, "step": 21740 }, { "epoch": 2.435610302351624, "grad_norm": 5.458945274353027, "learning_rate": 9.40836132885405e-06, "loss": 1.9408, "step": 21750 }, { "epoch": 2.436730123180291, "grad_norm": 16.545921325683594, "learning_rate": 9.38969764837626e-06, "loss": 1.7479, "step": 21760 }, { "epoch": 2.4378499440089585, "grad_norm": 12.776642799377441, "learning_rate": 9.37103396789847e-06, "loss": 1.8733, "step": 21770 }, { "epoch": 2.438969764837626, "grad_norm": 3.9929423332214355, "learning_rate": 9.35237028742068e-06, "loss": 1.4235, "step": 21780 }, { "epoch": 2.4400895856662936, "grad_norm": 12.087785720825195, "learning_rate": 9.33370660694289e-06, "loss": 1.8805, "step": 21790 }, { "epoch": 2.441209406494961, "grad_norm": 4.026576519012451, "learning_rate": 9.3150429264651e-06, "loss": 2.21, "step": 21800 }, { "epoch": 2.4423292273236283, "grad_norm": 5.352035999298096, "learning_rate": 9.29637924598731e-06, "loss": 1.8218, "step": 21810 }, { "epoch": 2.4434490481522957, "grad_norm": 9.776129722595215, "learning_rate": 9.27771556550952e-06, "loss": 2.0464, "step": 21820 }, { "epoch": 2.444568868980963, "grad_norm": 14.493003845214844, "learning_rate": 9.25905188503173e-06, "loss": 1.6871, "step": 21830 }, { "epoch": 2.4456886898096304, "grad_norm": 17.30157470703125, "learning_rate": 9.24038820455394e-06, "loss": 2.5444, "step": 21840 }, { "epoch": 2.4468085106382977, "grad_norm": 3.3516924381256104, "learning_rate": 9.221724524076148e-06, "loss": 1.6969, "step": 21850 }, { "epoch": 2.447928331466965, "grad_norm": 12.714744567871094, "learning_rate": 9.203060843598358e-06, "loss": 2.1882, "step": 21860 }, { "epoch": 2.449048152295633, "grad_norm": 5.864091396331787, "learning_rate": 9.184397163120568e-06, "loss": 1.9062, "step": 21870 }, { "epoch": 2.4501679731243002, "grad_norm": 14.587536811828613, "learning_rate": 9.165733482642776e-06, "loss": 1.6267, "step": 21880 }, { "epoch": 2.4512877939529676, "grad_norm": 9.073355674743652, "learning_rate": 9.147069802164986e-06, "loss": 1.619, "step": 21890 }, { "epoch": 2.452407614781635, "grad_norm": 13.970871925354004, "learning_rate": 9.128406121687196e-06, "loss": 1.7811, "step": 21900 }, { "epoch": 2.4535274356103023, "grad_norm": 13.218069076538086, "learning_rate": 9.109742441209406e-06, "loss": 2.1777, "step": 21910 }, { "epoch": 2.4546472564389696, "grad_norm": 7.0698981285095215, "learning_rate": 9.091078760731616e-06, "loss": 1.672, "step": 21920 }, { "epoch": 2.455767077267637, "grad_norm": 6.958558082580566, "learning_rate": 9.072415080253826e-06, "loss": 1.8189, "step": 21930 }, { "epoch": 2.456886898096305, "grad_norm": 11.154871940612793, "learning_rate": 9.053751399776036e-06, "loss": 1.9092, "step": 21940 }, { "epoch": 2.458006718924972, "grad_norm": 8.34592056274414, "learning_rate": 9.035087719298246e-06, "loss": 2.0183, "step": 21950 }, { "epoch": 2.4591265397536395, "grad_norm": 17.590225219726562, "learning_rate": 9.016424038820456e-06, "loss": 2.0918, "step": 21960 }, { "epoch": 2.460246360582307, "grad_norm": 16.22553825378418, "learning_rate": 8.997760358342666e-06, "loss": 1.9656, "step": 21970 }, { "epoch": 2.461366181410974, "grad_norm": 18.10919952392578, "learning_rate": 8.979096677864876e-06, "loss": 1.6443, "step": 21980 }, { "epoch": 2.4624860022396415, "grad_norm": 5.232290267944336, "learning_rate": 8.960432997387085e-06, "loss": 1.6849, "step": 21990 }, { "epoch": 2.463605823068309, "grad_norm": 9.676012992858887, "learning_rate": 8.941769316909295e-06, "loss": 1.4555, "step": 22000 }, { "epoch": 2.4647256438969762, "grad_norm": 6.006662845611572, "learning_rate": 8.923105636431505e-06, "loss": 1.5401, "step": 22010 }, { "epoch": 2.465845464725644, "grad_norm": 17.258005142211914, "learning_rate": 8.904441955953715e-06, "loss": 2.4357, "step": 22020 }, { "epoch": 2.4669652855543114, "grad_norm": 6.448751926422119, "learning_rate": 8.885778275475925e-06, "loss": 2.1087, "step": 22030 }, { "epoch": 2.4680851063829787, "grad_norm": 5.431372165679932, "learning_rate": 8.867114594998135e-06, "loss": 1.9074, "step": 22040 }, { "epoch": 2.469204927211646, "grad_norm": 12.926970481872559, "learning_rate": 8.848450914520345e-06, "loss": 1.9923, "step": 22050 }, { "epoch": 2.4703247480403134, "grad_norm": 5.959377288818359, "learning_rate": 8.829787234042553e-06, "loss": 1.672, "step": 22060 }, { "epoch": 2.471444568868981, "grad_norm": 16.465486526489258, "learning_rate": 8.811123553564763e-06, "loss": 1.8695, "step": 22070 }, { "epoch": 2.4725643896976486, "grad_norm": 5.742716312408447, "learning_rate": 8.792459873086973e-06, "loss": 1.8781, "step": 22080 }, { "epoch": 2.473684210526316, "grad_norm": 10.750136375427246, "learning_rate": 8.773796192609183e-06, "loss": 1.9953, "step": 22090 }, { "epoch": 2.4748040313549833, "grad_norm": 5.631348609924316, "learning_rate": 8.755132512131391e-06, "loss": 1.6181, "step": 22100 }, { "epoch": 2.4759238521836506, "grad_norm": 9.884969711303711, "learning_rate": 8.736468831653601e-06, "loss": 1.5844, "step": 22110 }, { "epoch": 2.477043673012318, "grad_norm": 4.6717376708984375, "learning_rate": 8.717805151175811e-06, "loss": 1.8376, "step": 22120 }, { "epoch": 2.4781634938409853, "grad_norm": 4.163270473480225, "learning_rate": 8.699141470698021e-06, "loss": 2.1804, "step": 22130 }, { "epoch": 2.4792833146696527, "grad_norm": 7.534999847412109, "learning_rate": 8.680477790220231e-06, "loss": 1.8526, "step": 22140 }, { "epoch": 2.48040313549832, "grad_norm": 9.90027904510498, "learning_rate": 8.661814109742441e-06, "loss": 1.8137, "step": 22150 }, { "epoch": 2.481522956326988, "grad_norm": 19.237462997436523, "learning_rate": 8.643150429264651e-06, "loss": 2.6685, "step": 22160 }, { "epoch": 2.482642777155655, "grad_norm": 9.682941436767578, "learning_rate": 8.624486748786861e-06, "loss": 1.826, "step": 22170 }, { "epoch": 2.4837625979843225, "grad_norm": 3.538151264190674, "learning_rate": 8.605823068309071e-06, "loss": 1.801, "step": 22180 }, { "epoch": 2.48488241881299, "grad_norm": 5.478328704833984, "learning_rate": 8.587159387831281e-06, "loss": 1.8762, "step": 22190 }, { "epoch": 2.4860022396416572, "grad_norm": 7.476826190948486, "learning_rate": 8.568495707353491e-06, "loss": 2.1028, "step": 22200 }, { "epoch": 2.4871220604703246, "grad_norm": 5.20843505859375, "learning_rate": 8.549832026875701e-06, "loss": 1.6034, "step": 22210 }, { "epoch": 2.4882418812989924, "grad_norm": 5.28059720993042, "learning_rate": 8.53116834639791e-06, "loss": 1.7654, "step": 22220 }, { "epoch": 2.4893617021276597, "grad_norm": 6.444010257720947, "learning_rate": 8.51250466592012e-06, "loss": 1.8655, "step": 22230 }, { "epoch": 2.490481522956327, "grad_norm": 6.940224647521973, "learning_rate": 8.49384098544233e-06, "loss": 1.7356, "step": 22240 }, { "epoch": 2.4916013437849944, "grad_norm": 23.465511322021484, "learning_rate": 8.47517730496454e-06, "loss": 1.846, "step": 22250 }, { "epoch": 2.4927211646136618, "grad_norm": 8.295751571655273, "learning_rate": 8.45651362448675e-06, "loss": 1.9118, "step": 22260 }, { "epoch": 2.493840985442329, "grad_norm": 12.125283241271973, "learning_rate": 8.43784994400896e-06, "loss": 2.0026, "step": 22270 }, { "epoch": 2.4949608062709965, "grad_norm": 5.106156349182129, "learning_rate": 8.419186263531168e-06, "loss": 1.8797, "step": 22280 }, { "epoch": 2.496080627099664, "grad_norm": 5.151834011077881, "learning_rate": 8.400522583053378e-06, "loss": 2.063, "step": 22290 }, { "epoch": 2.4972004479283316, "grad_norm": 13.763957023620605, "learning_rate": 8.381858902575588e-06, "loss": 1.6182, "step": 22300 }, { "epoch": 2.498320268756999, "grad_norm": 3.533198356628418, "learning_rate": 8.363195222097798e-06, "loss": 1.6865, "step": 22310 }, { "epoch": 2.4994400895856663, "grad_norm": 5.562738418579102, "learning_rate": 8.344531541620008e-06, "loss": 1.6556, "step": 22320 }, { "epoch": 2.5005599104143337, "grad_norm": 4.3271613121032715, "learning_rate": 8.325867861142216e-06, "loss": 2.088, "step": 22330 }, { "epoch": 2.501679731243001, "grad_norm": 12.457752227783203, "learning_rate": 8.307204180664426e-06, "loss": 2.2114, "step": 22340 }, { "epoch": 2.5027995520716684, "grad_norm": 5.431798934936523, "learning_rate": 8.288540500186636e-06, "loss": 1.5092, "step": 22350 }, { "epoch": 2.503919372900336, "grad_norm": 13.543461799621582, "learning_rate": 8.269876819708846e-06, "loss": 1.9257, "step": 22360 }, { "epoch": 2.5050391937290035, "grad_norm": 8.819217681884766, "learning_rate": 8.251213139231056e-06, "loss": 1.817, "step": 22370 }, { "epoch": 2.506159014557671, "grad_norm": 7.270272254943848, "learning_rate": 8.232549458753266e-06, "loss": 1.7905, "step": 22380 }, { "epoch": 2.5072788353863382, "grad_norm": 7.694066524505615, "learning_rate": 8.213885778275476e-06, "loss": 2.2374, "step": 22390 }, { "epoch": 2.5083986562150056, "grad_norm": 7.0074286460876465, "learning_rate": 8.195222097797686e-06, "loss": 1.8355, "step": 22400 }, { "epoch": 2.509518477043673, "grad_norm": 9.260007858276367, "learning_rate": 8.176558417319896e-06, "loss": 1.9881, "step": 22410 }, { "epoch": 2.5106382978723403, "grad_norm": 11.184020042419434, "learning_rate": 8.157894736842106e-06, "loss": 2.0734, "step": 22420 }, { "epoch": 2.5117581187010076, "grad_norm": 4.612696647644043, "learning_rate": 8.139231056364316e-06, "loss": 2.1045, "step": 22430 }, { "epoch": 2.512877939529675, "grad_norm": 4.042170524597168, "learning_rate": 8.120567375886526e-06, "loss": 2.1674, "step": 22440 }, { "epoch": 2.5139977603583428, "grad_norm": 6.870834827423096, "learning_rate": 8.101903695408735e-06, "loss": 1.5094, "step": 22450 }, { "epoch": 2.51511758118701, "grad_norm": 11.569465637207031, "learning_rate": 8.083240014930945e-06, "loss": 1.7942, "step": 22460 }, { "epoch": 2.5162374020156775, "grad_norm": 9.036044120788574, "learning_rate": 8.064576334453155e-06, "loss": 1.8934, "step": 22470 }, { "epoch": 2.517357222844345, "grad_norm": 16.669771194458008, "learning_rate": 8.045912653975365e-06, "loss": 2.087, "step": 22480 }, { "epoch": 2.518477043673012, "grad_norm": 11.910612106323242, "learning_rate": 8.027248973497575e-06, "loss": 2.067, "step": 22490 }, { "epoch": 2.51959686450168, "grad_norm": 8.747892379760742, "learning_rate": 8.008585293019783e-06, "loss": 2.064, "step": 22500 }, { "epoch": 2.5207166853303473, "grad_norm": 8.751014709472656, "learning_rate": 7.989921612541993e-06, "loss": 1.9492, "step": 22510 }, { "epoch": 2.5218365061590147, "grad_norm": 16.253923416137695, "learning_rate": 7.971257932064203e-06, "loss": 2.2059, "step": 22520 }, { "epoch": 2.522956326987682, "grad_norm": 5.416139125823975, "learning_rate": 7.952594251586413e-06, "loss": 2.3393, "step": 22530 }, { "epoch": 2.5240761478163494, "grad_norm": 5.805497169494629, "learning_rate": 7.933930571108623e-06, "loss": 2.3075, "step": 22540 }, { "epoch": 2.5251959686450167, "grad_norm": 14.180325508117676, "learning_rate": 7.915266890630833e-06, "loss": 1.3238, "step": 22550 }, { "epoch": 2.526315789473684, "grad_norm": 18.959636688232422, "learning_rate": 7.896603210153043e-06, "loss": 2.4995, "step": 22560 }, { "epoch": 2.5274356103023514, "grad_norm": 15.236656188964844, "learning_rate": 7.877939529675251e-06, "loss": 1.979, "step": 22570 }, { "epoch": 2.5285554311310188, "grad_norm": 11.582307815551758, "learning_rate": 7.859275849197461e-06, "loss": 1.8858, "step": 22580 }, { "epoch": 2.5296752519596866, "grad_norm": 4.920597553253174, "learning_rate": 7.840612168719671e-06, "loss": 1.9993, "step": 22590 }, { "epoch": 2.530795072788354, "grad_norm": 16.651355743408203, "learning_rate": 7.821948488241881e-06, "loss": 1.8391, "step": 22600 }, { "epoch": 2.5319148936170213, "grad_norm": 4.262025356292725, "learning_rate": 7.803284807764091e-06, "loss": 1.8714, "step": 22610 }, { "epoch": 2.5330347144456886, "grad_norm": 16.481779098510742, "learning_rate": 7.784621127286301e-06, "loss": 1.8893, "step": 22620 }, { "epoch": 2.534154535274356, "grad_norm": 6.779279708862305, "learning_rate": 7.765957446808511e-06, "loss": 1.7062, "step": 22630 }, { "epoch": 2.5352743561030238, "grad_norm": 11.93194580078125, "learning_rate": 7.747293766330721e-06, "loss": 1.9667, "step": 22640 }, { "epoch": 2.536394176931691, "grad_norm": 8.4479341506958, "learning_rate": 7.728630085852931e-06, "loss": 1.6635, "step": 22650 }, { "epoch": 2.5375139977603585, "grad_norm": 9.052682876586914, "learning_rate": 7.709966405375141e-06, "loss": 1.7176, "step": 22660 }, { "epoch": 2.538633818589026, "grad_norm": 17.69319725036621, "learning_rate": 7.691302724897351e-06, "loss": 2.0191, "step": 22670 }, { "epoch": 2.539753639417693, "grad_norm": 8.785430908203125, "learning_rate": 7.67263904441956e-06, "loss": 1.7666, "step": 22680 }, { "epoch": 2.5408734602463605, "grad_norm": 8.51176929473877, "learning_rate": 7.65397536394177e-06, "loss": 1.8087, "step": 22690 }, { "epoch": 2.541993281075028, "grad_norm": 5.72242546081543, "learning_rate": 7.63531168346398e-06, "loss": 2.0391, "step": 22700 }, { "epoch": 2.543113101903695, "grad_norm": 4.739030838012695, "learning_rate": 7.616648002986189e-06, "loss": 2.0501, "step": 22710 }, { "epoch": 2.5442329227323626, "grad_norm": 7.8822736740112305, "learning_rate": 7.597984322508399e-06, "loss": 1.7549, "step": 22720 }, { "epoch": 2.5453527435610304, "grad_norm": 14.290916442871094, "learning_rate": 7.579320642030609e-06, "loss": 1.606, "step": 22730 }, { "epoch": 2.5464725643896977, "grad_norm": 13.635068893432617, "learning_rate": 7.560656961552819e-06, "loss": 1.795, "step": 22740 }, { "epoch": 2.547592385218365, "grad_norm": 5.094437599182129, "learning_rate": 7.541993281075028e-06, "loss": 1.9598, "step": 22750 }, { "epoch": 2.5487122060470324, "grad_norm": 5.03619384765625, "learning_rate": 7.523329600597238e-06, "loss": 2.3806, "step": 22760 }, { "epoch": 2.5498320268756998, "grad_norm": 9.20356273651123, "learning_rate": 7.504665920119448e-06, "loss": 1.8016, "step": 22770 }, { "epoch": 2.5509518477043676, "grad_norm": 7.040286540985107, "learning_rate": 7.486002239641658e-06, "loss": 1.924, "step": 22780 }, { "epoch": 2.552071668533035, "grad_norm": 6.913671970367432, "learning_rate": 7.467338559163868e-06, "loss": 1.9914, "step": 22790 }, { "epoch": 2.5531914893617023, "grad_norm": 6.331127643585205, "learning_rate": 7.4486748786860764e-06, "loss": 1.7911, "step": 22800 }, { "epoch": 2.5543113101903696, "grad_norm": 6.453745365142822, "learning_rate": 7.4300111982082864e-06, "loss": 1.985, "step": 22810 }, { "epoch": 2.555431131019037, "grad_norm": 15.195472717285156, "learning_rate": 7.4113475177304964e-06, "loss": 2.2203, "step": 22820 }, { "epoch": 2.5565509518477043, "grad_norm": 13.667654037475586, "learning_rate": 7.3926838372527064e-06, "loss": 1.6591, "step": 22830 }, { "epoch": 2.5576707726763717, "grad_norm": 4.465586185455322, "learning_rate": 7.3740201567749165e-06, "loss": 2.0768, "step": 22840 }, { "epoch": 2.558790593505039, "grad_norm": 15.293925285339355, "learning_rate": 7.3553564762971265e-06, "loss": 1.9697, "step": 22850 }, { "epoch": 2.5599104143337064, "grad_norm": 4.840792655944824, "learning_rate": 7.336692795819336e-06, "loss": 1.5074, "step": 22860 }, { "epoch": 2.561030235162374, "grad_norm": 8.356353759765625, "learning_rate": 7.318029115341546e-06, "loss": 1.9765, "step": 22870 }, { "epoch": 2.5621500559910415, "grad_norm": 6.289432525634766, "learning_rate": 7.299365434863756e-06, "loss": 1.7277, "step": 22880 }, { "epoch": 2.563269876819709, "grad_norm": 14.32654094696045, "learning_rate": 7.280701754385966e-06, "loss": 2.1023, "step": 22890 }, { "epoch": 2.564389697648376, "grad_norm": 7.886980056762695, "learning_rate": 7.262038073908176e-06, "loss": 1.8874, "step": 22900 }, { "epoch": 2.5655095184770436, "grad_norm": 11.411437034606934, "learning_rate": 7.243374393430386e-06, "loss": 2.1147, "step": 22910 }, { "epoch": 2.5666293393057114, "grad_norm": 7.815008640289307, "learning_rate": 7.224710712952594e-06, "loss": 2.302, "step": 22920 }, { "epoch": 2.5677491601343787, "grad_norm": 13.516090393066406, "learning_rate": 7.206047032474804e-06, "loss": 1.6051, "step": 22930 }, { "epoch": 2.568868980963046, "grad_norm": 5.94198751449585, "learning_rate": 7.187383351997014e-06, "loss": 1.8294, "step": 22940 }, { "epoch": 2.5699888017917134, "grad_norm": 10.252525329589844, "learning_rate": 7.168719671519224e-06, "loss": 1.4627, "step": 22950 }, { "epoch": 2.5711086226203808, "grad_norm": 8.05044174194336, "learning_rate": 7.150055991041434e-06, "loss": 1.7721, "step": 22960 }, { "epoch": 2.572228443449048, "grad_norm": 5.872049331665039, "learning_rate": 7.131392310563643e-06, "loss": 2.0512, "step": 22970 }, { "epoch": 2.5733482642777155, "grad_norm": 12.562164306640625, "learning_rate": 7.112728630085853e-06, "loss": 2.3197, "step": 22980 }, { "epoch": 2.574468085106383, "grad_norm": 7.482940196990967, "learning_rate": 7.094064949608063e-06, "loss": 2.022, "step": 22990 }, { "epoch": 2.57558790593505, "grad_norm": 11.764483451843262, "learning_rate": 7.075401269130273e-06, "loss": 1.9567, "step": 23000 }, { "epoch": 2.576707726763718, "grad_norm": 10.071126937866211, "learning_rate": 7.056737588652483e-06, "loss": 1.8271, "step": 23010 }, { "epoch": 2.5778275475923853, "grad_norm": 10.216212272644043, "learning_rate": 7.038073908174693e-06, "loss": 1.789, "step": 23020 }, { "epoch": 2.5789473684210527, "grad_norm": 8.066825866699219, "learning_rate": 7.0194102276969015e-06, "loss": 1.6432, "step": 23030 }, { "epoch": 2.58006718924972, "grad_norm": 16.877470016479492, "learning_rate": 7.0007465472191115e-06, "loss": 1.7376, "step": 23040 }, { "epoch": 2.5811870100783874, "grad_norm": 5.387278079986572, "learning_rate": 6.9820828667413215e-06, "loss": 1.5875, "step": 23050 }, { "epoch": 2.5823068309070547, "grad_norm": 15.250850677490234, "learning_rate": 6.9634191862635315e-06, "loss": 1.6844, "step": 23060 }, { "epoch": 2.5834266517357225, "grad_norm": 7.900568962097168, "learning_rate": 6.9447555057857415e-06, "loss": 1.8616, "step": 23070 }, { "epoch": 2.58454647256439, "grad_norm": 6.394863128662109, "learning_rate": 6.926091825307951e-06, "loss": 1.4066, "step": 23080 }, { "epoch": 2.585666293393057, "grad_norm": 5.246687412261963, "learning_rate": 6.907428144830161e-06, "loss": 2.0123, "step": 23090 }, { "epoch": 2.5867861142217246, "grad_norm": 13.717708587646484, "learning_rate": 6.888764464352371e-06, "loss": 1.8026, "step": 23100 }, { "epoch": 2.587905935050392, "grad_norm": 5.991678237915039, "learning_rate": 6.870100783874581e-06, "loss": 1.9593, "step": 23110 }, { "epoch": 2.5890257558790593, "grad_norm": 6.467216491699219, "learning_rate": 6.851437103396791e-06, "loss": 1.8865, "step": 23120 }, { "epoch": 2.5901455767077266, "grad_norm": 11.342000961303711, "learning_rate": 6.832773422919001e-06, "loss": 2.0429, "step": 23130 }, { "epoch": 2.591265397536394, "grad_norm": 6.355552673339844, "learning_rate": 6.814109742441211e-06, "loss": 1.5285, "step": 23140 }, { "epoch": 2.5923852183650617, "grad_norm": 4.930696964263916, "learning_rate": 6.795446061963419e-06, "loss": 2.0815, "step": 23150 }, { "epoch": 2.593505039193729, "grad_norm": 6.06488037109375, "learning_rate": 6.776782381485629e-06, "loss": 1.6929, "step": 23160 }, { "epoch": 2.5946248600223965, "grad_norm": 17.244834899902344, "learning_rate": 6.758118701007839e-06, "loss": 2.1941, "step": 23170 }, { "epoch": 2.595744680851064, "grad_norm": 6.777196407318115, "learning_rate": 6.739455020530049e-06, "loss": 1.6116, "step": 23180 }, { "epoch": 2.596864501679731, "grad_norm": 10.06576156616211, "learning_rate": 6.720791340052258e-06, "loss": 1.6001, "step": 23190 }, { "epoch": 2.5979843225083985, "grad_norm": 17.110258102416992, "learning_rate": 6.702127659574468e-06, "loss": 1.9066, "step": 23200 }, { "epoch": 2.5991041433370663, "grad_norm": 10.628520011901855, "learning_rate": 6.683463979096678e-06, "loss": 1.9434, "step": 23210 }, { "epoch": 2.6002239641657336, "grad_norm": 6.576961040496826, "learning_rate": 6.664800298618888e-06, "loss": 1.2507, "step": 23220 }, { "epoch": 2.601343784994401, "grad_norm": 5.920810699462891, "learning_rate": 6.646136618141098e-06, "loss": 1.9261, "step": 23230 }, { "epoch": 2.6024636058230683, "grad_norm": 4.780271530151367, "learning_rate": 6.627472937663308e-06, "loss": 1.9078, "step": 23240 }, { "epoch": 2.6035834266517357, "grad_norm": 3.30251145362854, "learning_rate": 6.608809257185518e-06, "loss": 2.2392, "step": 23250 }, { "epoch": 2.604703247480403, "grad_norm": 6.361575126647949, "learning_rate": 6.590145576707728e-06, "loss": 1.5298, "step": 23260 }, { "epoch": 2.6058230683090704, "grad_norm": 5.133968830108643, "learning_rate": 6.5714818962299365e-06, "loss": 2.0936, "step": 23270 }, { "epoch": 2.6069428891377378, "grad_norm": 11.35536003112793, "learning_rate": 6.5528182157521465e-06, "loss": 2.1213, "step": 23280 }, { "epoch": 2.608062709966405, "grad_norm": 14.275880813598633, "learning_rate": 6.534154535274356e-06, "loss": 1.5543, "step": 23290 }, { "epoch": 2.609182530795073, "grad_norm": 11.91288948059082, "learning_rate": 6.515490854796566e-06, "loss": 1.8959, "step": 23300 }, { "epoch": 2.6103023516237402, "grad_norm": 7.54849910736084, "learning_rate": 6.496827174318776e-06, "loss": 1.9089, "step": 23310 }, { "epoch": 2.6114221724524076, "grad_norm": 4.366549968719482, "learning_rate": 6.478163493840986e-06, "loss": 1.6129, "step": 23320 }, { "epoch": 2.612541993281075, "grad_norm": 4.882798671722412, "learning_rate": 6.459499813363196e-06, "loss": 1.857, "step": 23330 }, { "epoch": 2.6136618141097423, "grad_norm": 18.530513763427734, "learning_rate": 6.440836132885406e-06, "loss": 2.0854, "step": 23340 }, { "epoch": 2.61478163493841, "grad_norm": 7.46307897567749, "learning_rate": 6.422172452407616e-06, "loss": 1.5743, "step": 23350 }, { "epoch": 2.6159014557670774, "grad_norm": 16.533849716186523, "learning_rate": 6.403508771929826e-06, "loss": 1.9256, "step": 23360 }, { "epoch": 2.617021276595745, "grad_norm": 10.886700630187988, "learning_rate": 6.384845091452035e-06, "loss": 1.8017, "step": 23370 }, { "epoch": 2.618141097424412, "grad_norm": 12.989828109741211, "learning_rate": 6.366181410974244e-06, "loss": 1.6722, "step": 23380 }, { "epoch": 2.6192609182530795, "grad_norm": 6.532835960388184, "learning_rate": 6.347517730496454e-06, "loss": 1.4352, "step": 23390 }, { "epoch": 2.620380739081747, "grad_norm": 4.241251468658447, "learning_rate": 6.328854050018663e-06, "loss": 1.3917, "step": 23400 }, { "epoch": 2.621500559910414, "grad_norm": 14.074196815490723, "learning_rate": 6.310190369540873e-06, "loss": 1.6827, "step": 23410 }, { "epoch": 2.6226203807390815, "grad_norm": 12.457422256469727, "learning_rate": 6.291526689063083e-06, "loss": 1.9047, "step": 23420 }, { "epoch": 2.623740201567749, "grad_norm": 5.373779296875, "learning_rate": 6.272863008585293e-06, "loss": 1.9629, "step": 23430 }, { "epoch": 2.6248600223964167, "grad_norm": 9.897968292236328, "learning_rate": 6.254199328107503e-06, "loss": 2.0395, "step": 23440 }, { "epoch": 2.625979843225084, "grad_norm": 8.989608764648438, "learning_rate": 6.235535647629713e-06, "loss": 2.2198, "step": 23450 }, { "epoch": 2.6270996640537514, "grad_norm": 4.570735931396484, "learning_rate": 6.216871967151923e-06, "loss": 1.5039, "step": 23460 }, { "epoch": 2.6282194848824187, "grad_norm": 5.221905708312988, "learning_rate": 6.198208286674132e-06, "loss": 1.9416, "step": 23470 }, { "epoch": 2.629339305711086, "grad_norm": 12.316040992736816, "learning_rate": 6.179544606196342e-06, "loss": 1.5188, "step": 23480 }, { "epoch": 2.630459126539754, "grad_norm": 7.457785606384277, "learning_rate": 6.160880925718552e-06, "loss": 1.8277, "step": 23490 }, { "epoch": 2.6315789473684212, "grad_norm": 2.610050916671753, "learning_rate": 6.1422172452407615e-06, "loss": 1.8489, "step": 23500 }, { "epoch": 2.6326987681970886, "grad_norm": 18.59623908996582, "learning_rate": 6.1235535647629715e-06, "loss": 2.0608, "step": 23510 }, { "epoch": 2.633818589025756, "grad_norm": 12.009276390075684, "learning_rate": 6.1048898842851815e-06, "loss": 1.4504, "step": 23520 }, { "epoch": 2.6349384098544233, "grad_norm": 5.837096214294434, "learning_rate": 6.086226203807391e-06, "loss": 1.832, "step": 23530 }, { "epoch": 2.6360582306830906, "grad_norm": 7.26812744140625, "learning_rate": 6.067562523329601e-06, "loss": 1.9916, "step": 23540 }, { "epoch": 2.637178051511758, "grad_norm": 12.086437225341797, "learning_rate": 6.048898842851811e-06, "loss": 2.0041, "step": 23550 }, { "epoch": 2.6382978723404253, "grad_norm": 13.357325553894043, "learning_rate": 6.030235162374021e-06, "loss": 1.9994, "step": 23560 }, { "epoch": 2.6394176931690927, "grad_norm": 5.371031284332275, "learning_rate": 6.011571481896231e-06, "loss": 2.0408, "step": 23570 }, { "epoch": 2.6405375139977605, "grad_norm": 5.823469638824463, "learning_rate": 5.992907801418441e-06, "loss": 1.8143, "step": 23580 }, { "epoch": 2.641657334826428, "grad_norm": 14.469114303588867, "learning_rate": 5.97424412094065e-06, "loss": 1.7717, "step": 23590 }, { "epoch": 2.642777155655095, "grad_norm": 5.268034934997559, "learning_rate": 5.955580440462859e-06, "loss": 1.4735, "step": 23600 }, { "epoch": 2.6438969764837625, "grad_norm": 9.155729293823242, "learning_rate": 5.936916759985069e-06, "loss": 1.6729, "step": 23610 }, { "epoch": 2.64501679731243, "grad_norm": 6.211864471435547, "learning_rate": 5.918253079507279e-06, "loss": 1.8493, "step": 23620 }, { "epoch": 2.6461366181410977, "grad_norm": 6.068511962890625, "learning_rate": 5.899589399029489e-06, "loss": 1.5881, "step": 23630 }, { "epoch": 2.647256438969765, "grad_norm": 9.574570655822754, "learning_rate": 5.880925718551699e-06, "loss": 2.2456, "step": 23640 }, { "epoch": 2.6483762597984324, "grad_norm": 17.160005569458008, "learning_rate": 5.862262038073908e-06, "loss": 2.0387, "step": 23650 }, { "epoch": 2.6494960806270997, "grad_norm": 9.527430534362793, "learning_rate": 5.843598357596118e-06, "loss": 1.9547, "step": 23660 }, { "epoch": 2.650615901455767, "grad_norm": 4.618621349334717, "learning_rate": 5.824934677118328e-06, "loss": 2.0931, "step": 23670 }, { "epoch": 2.6517357222844344, "grad_norm": 8.613204002380371, "learning_rate": 5.806270996640538e-06, "loss": 1.9036, "step": 23680 }, { "epoch": 2.652855543113102, "grad_norm": 6.867215156555176, "learning_rate": 5.787607316162748e-06, "loss": 1.6778, "step": 23690 }, { "epoch": 2.653975363941769, "grad_norm": 7.841908931732178, "learning_rate": 5.768943635684957e-06, "loss": 1.8852, "step": 23700 }, { "epoch": 2.6550951847704365, "grad_norm": 16.442686080932617, "learning_rate": 5.7502799552071665e-06, "loss": 2.0865, "step": 23710 }, { "epoch": 2.6562150055991043, "grad_norm": 6.369174957275391, "learning_rate": 5.7316162747293765e-06, "loss": 2.0048, "step": 23720 }, { "epoch": 2.6573348264277716, "grad_norm": 9.311386108398438, "learning_rate": 5.7129525942515865e-06, "loss": 2.0094, "step": 23730 }, { "epoch": 2.658454647256439, "grad_norm": 20.149980545043945, "learning_rate": 5.6942889137737965e-06, "loss": 2.0632, "step": 23740 }, { "epoch": 2.6595744680851063, "grad_norm": 6.561110496520996, "learning_rate": 5.6756252332960065e-06, "loss": 1.6642, "step": 23750 }, { "epoch": 2.6606942889137737, "grad_norm": 12.71684455871582, "learning_rate": 5.656961552818216e-06, "loss": 1.8664, "step": 23760 }, { "epoch": 2.6618141097424415, "grad_norm": 11.952552795410156, "learning_rate": 5.638297872340426e-06, "loss": 2.1605, "step": 23770 }, { "epoch": 2.662933930571109, "grad_norm": 6.831346035003662, "learning_rate": 5.619634191862636e-06, "loss": 2.0142, "step": 23780 }, { "epoch": 2.664053751399776, "grad_norm": 6.235137939453125, "learning_rate": 5.600970511384846e-06, "loss": 1.8274, "step": 23790 }, { "epoch": 2.6651735722284435, "grad_norm": 6.040173530578613, "learning_rate": 5.582306830907056e-06, "loss": 1.8131, "step": 23800 }, { "epoch": 2.666293393057111, "grad_norm": 5.556991100311279, "learning_rate": 5.563643150429265e-06, "loss": 1.8855, "step": 23810 }, { "epoch": 2.6674132138857782, "grad_norm": 12.537089347839355, "learning_rate": 5.544979469951474e-06, "loss": 2.0187, "step": 23820 }, { "epoch": 2.6685330347144456, "grad_norm": 6.158973693847656, "learning_rate": 5.526315789473684e-06, "loss": 1.9405, "step": 23830 }, { "epoch": 2.669652855543113, "grad_norm": 7.678884029388428, "learning_rate": 5.507652108995894e-06, "loss": 1.6122, "step": 23840 }, { "epoch": 2.6707726763717803, "grad_norm": 7.427279472351074, "learning_rate": 5.488988428518104e-06, "loss": 2.3574, "step": 23850 }, { "epoch": 2.671892497200448, "grad_norm": 7.1111369132995605, "learning_rate": 5.470324748040314e-06, "loss": 2.1574, "step": 23860 }, { "epoch": 2.6730123180291154, "grad_norm": 6.275892734527588, "learning_rate": 5.451661067562524e-06, "loss": 2.0668, "step": 23870 }, { "epoch": 2.674132138857783, "grad_norm": 6.145971775054932, "learning_rate": 5.432997387084733e-06, "loss": 1.7284, "step": 23880 }, { "epoch": 2.67525195968645, "grad_norm": 15.736746788024902, "learning_rate": 5.414333706606943e-06, "loss": 1.9066, "step": 23890 }, { "epoch": 2.6763717805151175, "grad_norm": 15.421698570251465, "learning_rate": 5.395670026129153e-06, "loss": 1.8727, "step": 23900 }, { "epoch": 2.677491601343785, "grad_norm": 8.590100288391113, "learning_rate": 5.377006345651362e-06, "loss": 2.0051, "step": 23910 }, { "epoch": 2.6786114221724526, "grad_norm": 5.847489356994629, "learning_rate": 5.358342665173572e-06, "loss": 1.8494, "step": 23920 }, { "epoch": 2.67973124300112, "grad_norm": 7.782309055328369, "learning_rate": 5.339678984695782e-06, "loss": 1.5807, "step": 23930 }, { "epoch": 2.6808510638297873, "grad_norm": 12.278881072998047, "learning_rate": 5.3210153042179915e-06, "loss": 1.7016, "step": 23940 }, { "epoch": 2.6819708846584547, "grad_norm": 8.289608001708984, "learning_rate": 5.3023516237402016e-06, "loss": 1.9277, "step": 23950 }, { "epoch": 2.683090705487122, "grad_norm": 10.736886024475098, "learning_rate": 5.2836879432624116e-06, "loss": 1.6973, "step": 23960 }, { "epoch": 2.6842105263157894, "grad_norm": 10.661380767822266, "learning_rate": 5.2650242627846216e-06, "loss": 1.8526, "step": 23970 }, { "epoch": 2.6853303471444567, "grad_norm": 13.474076271057129, "learning_rate": 5.2463605823068316e-06, "loss": 1.901, "step": 23980 }, { "epoch": 2.686450167973124, "grad_norm": 18.428157806396484, "learning_rate": 5.227696901829041e-06, "loss": 1.6495, "step": 23990 }, { "epoch": 2.687569988801792, "grad_norm": 9.425440788269043, "learning_rate": 5.209033221351251e-06, "loss": 1.563, "step": 24000 }, { "epoch": 2.6886898096304592, "grad_norm": 6.908605575561523, "learning_rate": 5.190369540873461e-06, "loss": 1.6201, "step": 24010 }, { "epoch": 2.6898096304591266, "grad_norm": 14.704828262329102, "learning_rate": 5.17170586039567e-06, "loss": 1.9403, "step": 24020 }, { "epoch": 2.690929451287794, "grad_norm": 6.244283676147461, "learning_rate": 5.15304217991788e-06, "loss": 1.8803, "step": 24030 }, { "epoch": 2.6920492721164613, "grad_norm": 5.735403537750244, "learning_rate": 5.13437849944009e-06, "loss": 1.7747, "step": 24040 }, { "epoch": 2.6931690929451286, "grad_norm": 10.276829719543457, "learning_rate": 5.115714818962299e-06, "loss": 1.4198, "step": 24050 }, { "epoch": 2.6942889137737964, "grad_norm": 12.467009544372559, "learning_rate": 5.097051138484509e-06, "loss": 1.7507, "step": 24060 }, { "epoch": 2.6954087346024638, "grad_norm": 5.5821685791015625, "learning_rate": 5.078387458006719e-06, "loss": 1.9415, "step": 24070 }, { "epoch": 2.696528555431131, "grad_norm": 5.4152092933654785, "learning_rate": 5.059723777528929e-06, "loss": 1.8896, "step": 24080 }, { "epoch": 2.6976483762597985, "grad_norm": 4.517449855804443, "learning_rate": 5.041060097051139e-06, "loss": 1.4663, "step": 24090 }, { "epoch": 2.698768197088466, "grad_norm": 4.125208377838135, "learning_rate": 5.022396416573349e-06, "loss": 1.7891, "step": 24100 }, { "epoch": 2.699888017917133, "grad_norm": 9.702006340026855, "learning_rate": 5.003732736095558e-06, "loss": 1.8673, "step": 24110 }, { "epoch": 2.7010078387458005, "grad_norm": 6.5520524978637695, "learning_rate": 4.985069055617768e-06, "loss": 1.216, "step": 24120 }, { "epoch": 2.702127659574468, "grad_norm": 6.257602214813232, "learning_rate": 4.966405375139977e-06, "loss": 1.8753, "step": 24130 }, { "epoch": 2.7032474804031352, "grad_norm": 14.424186706542969, "learning_rate": 4.947741694662187e-06, "loss": 1.7425, "step": 24140 }, { "epoch": 2.704367301231803, "grad_norm": 8.422646522521973, "learning_rate": 4.929078014184397e-06, "loss": 1.8206, "step": 24150 }, { "epoch": 2.7054871220604704, "grad_norm": 12.657279968261719, "learning_rate": 4.910414333706607e-06, "loss": 1.8706, "step": 24160 }, { "epoch": 2.7066069428891377, "grad_norm": 11.817652702331543, "learning_rate": 4.8917506532288166e-06, "loss": 1.5599, "step": 24170 }, { "epoch": 2.707726763717805, "grad_norm": 10.890949249267578, "learning_rate": 4.873086972751027e-06, "loss": 1.8615, "step": 24180 }, { "epoch": 2.7088465845464724, "grad_norm": 23.385997772216797, "learning_rate": 4.854423292273237e-06, "loss": 2.2412, "step": 24190 }, { "epoch": 2.70996640537514, "grad_norm": 5.702988147735596, "learning_rate": 4.835759611795447e-06, "loss": 2.1498, "step": 24200 }, { "epoch": 2.7110862262038076, "grad_norm": 14.075940132141113, "learning_rate": 4.817095931317657e-06, "loss": 2.0775, "step": 24210 }, { "epoch": 2.712206047032475, "grad_norm": 7.462947845458984, "learning_rate": 4.798432250839866e-06, "loss": 1.7159, "step": 24220 }, { "epoch": 2.7133258678611423, "grad_norm": 7.062658786773682, "learning_rate": 4.779768570362076e-06, "loss": 2.0002, "step": 24230 }, { "epoch": 2.7144456886898096, "grad_norm": 4.55973482131958, "learning_rate": 4.761104889884285e-06, "loss": 1.6569, "step": 24240 }, { "epoch": 2.715565509518477, "grad_norm": 5.43080997467041, "learning_rate": 4.742441209406495e-06, "loss": 1.8489, "step": 24250 }, { "epoch": 2.7166853303471443, "grad_norm": 2.3447320461273193, "learning_rate": 4.723777528928705e-06, "loss": 1.8083, "step": 24260 }, { "epoch": 2.7178051511758117, "grad_norm": 5.452301979064941, "learning_rate": 4.705113848450915e-06, "loss": 1.2504, "step": 24270 }, { "epoch": 2.718924972004479, "grad_norm": 5.787363052368164, "learning_rate": 4.686450167973125e-06, "loss": 1.8861, "step": 24280 }, { "epoch": 2.720044792833147, "grad_norm": 10.703194618225098, "learning_rate": 4.667786487495334e-06, "loss": 1.8052, "step": 24290 }, { "epoch": 2.721164613661814, "grad_norm": 4.210628032684326, "learning_rate": 4.649122807017544e-06, "loss": 1.3951, "step": 24300 }, { "epoch": 2.7222844344904815, "grad_norm": 10.037901878356934, "learning_rate": 4.630459126539754e-06, "loss": 1.4718, "step": 24310 }, { "epoch": 2.723404255319149, "grad_norm": 8.047080993652344, "learning_rate": 4.611795446061964e-06, "loss": 1.9238, "step": 24320 }, { "epoch": 2.724524076147816, "grad_norm": 7.7797980308532715, "learning_rate": 4.593131765584173e-06, "loss": 1.3415, "step": 24330 }, { "epoch": 2.725643896976484, "grad_norm": 10.15149211883545, "learning_rate": 4.574468085106383e-06, "loss": 1.6493, "step": 24340 }, { "epoch": 2.7267637178051514, "grad_norm": 9.764996528625488, "learning_rate": 4.555804404628592e-06, "loss": 1.9219, "step": 24350 }, { "epoch": 2.7278835386338187, "grad_norm": 5.193393230438232, "learning_rate": 4.5371407241508024e-06, "loss": 1.9921, "step": 24360 }, { "epoch": 2.729003359462486, "grad_norm": 6.264823913574219, "learning_rate": 4.5184770436730124e-06, "loss": 1.8523, "step": 24370 }, { "epoch": 2.7301231802911534, "grad_norm": 7.00139045715332, "learning_rate": 4.4998133631952224e-06, "loss": 1.6758, "step": 24380 }, { "epoch": 2.7312430011198208, "grad_norm": 12.917388916015625, "learning_rate": 4.4811496827174324e-06, "loss": 1.4985, "step": 24390 }, { "epoch": 2.732362821948488, "grad_norm": 8.675110816955566, "learning_rate": 4.462486002239642e-06, "loss": 1.711, "step": 24400 }, { "epoch": 2.7334826427771555, "grad_norm": 19.992246627807617, "learning_rate": 4.443822321761852e-06, "loss": 1.9475, "step": 24410 }, { "epoch": 2.734602463605823, "grad_norm": 11.201859474182129, "learning_rate": 4.425158641284062e-06, "loss": 1.5635, "step": 24420 }, { "epoch": 2.7357222844344906, "grad_norm": 16.30078887939453, "learning_rate": 4.406494960806272e-06, "loss": 1.5639, "step": 24430 }, { "epoch": 2.736842105263158, "grad_norm": 6.638637542724609, "learning_rate": 4.387831280328481e-06, "loss": 1.5362, "step": 24440 }, { "epoch": 2.7379619260918253, "grad_norm": 4.872713088989258, "learning_rate": 4.369167599850691e-06, "loss": 1.8645, "step": 24450 }, { "epoch": 2.7390817469204927, "grad_norm": 7.324185848236084, "learning_rate": 4.3505039193729e-06, "loss": 1.8918, "step": 24460 }, { "epoch": 2.74020156774916, "grad_norm": 4.039488792419434, "learning_rate": 4.33184023889511e-06, "loss": 1.6727, "step": 24470 }, { "epoch": 2.741321388577828, "grad_norm": 11.2632417678833, "learning_rate": 4.31317655841732e-06, "loss": 1.7408, "step": 24480 }, { "epoch": 2.742441209406495, "grad_norm": 5.795408725738525, "learning_rate": 4.29451287793953e-06, "loss": 1.7775, "step": 24490 }, { "epoch": 2.7435610302351625, "grad_norm": 11.392952919006348, "learning_rate": 4.27584919746174e-06, "loss": 2.0474, "step": 24500 }, { "epoch": 2.74468085106383, "grad_norm": 13.791424751281738, "learning_rate": 4.25718551698395e-06, "loss": 1.8472, "step": 24510 }, { "epoch": 2.745800671892497, "grad_norm": 5.293121337890625, "learning_rate": 4.238521836506159e-06, "loss": 1.7227, "step": 24520 }, { "epoch": 2.7469204927211646, "grad_norm": 12.450265884399414, "learning_rate": 4.219858156028369e-06, "loss": 1.8579, "step": 24530 }, { "epoch": 2.748040313549832, "grad_norm": 15.723026275634766, "learning_rate": 4.201194475550579e-06, "loss": 1.8304, "step": 24540 }, { "epoch": 2.7491601343784993, "grad_norm": 9.901751518249512, "learning_rate": 4.182530795072788e-06, "loss": 1.7486, "step": 24550 }, { "epoch": 2.7502799552071666, "grad_norm": 4.100079536437988, "learning_rate": 4.163867114594998e-06, "loss": 1.9, "step": 24560 }, { "epoch": 2.7513997760358344, "grad_norm": 12.591753005981445, "learning_rate": 4.145203434117208e-06, "loss": 1.8743, "step": 24570 }, { "epoch": 2.7525195968645018, "grad_norm": 15.76544189453125, "learning_rate": 4.1265397536394174e-06, "loss": 1.8585, "step": 24580 }, { "epoch": 2.753639417693169, "grad_norm": 16.583786010742188, "learning_rate": 4.1078760731616274e-06, "loss": 2.0335, "step": 24590 }, { "epoch": 2.7547592385218365, "grad_norm": 4.9236741065979, "learning_rate": 4.0892123926838375e-06, "loss": 1.7419, "step": 24600 }, { "epoch": 2.755879059350504, "grad_norm": 11.055110931396484, "learning_rate": 4.0705487122060475e-06, "loss": 1.8933, "step": 24610 }, { "epoch": 2.7569988801791716, "grad_norm": 5.0752153396606445, "learning_rate": 4.0518850317282575e-06, "loss": 2.1183, "step": 24620 }, { "epoch": 2.758118701007839, "grad_norm": 16.526071548461914, "learning_rate": 4.0332213512504675e-06, "loss": 2.0595, "step": 24630 }, { "epoch": 2.7592385218365063, "grad_norm": 9.256998062133789, "learning_rate": 4.014557670772677e-06, "loss": 2.0778, "step": 24640 }, { "epoch": 2.7603583426651737, "grad_norm": 5.081698894500732, "learning_rate": 3.995893990294887e-06, "loss": 2.3567, "step": 24650 }, { "epoch": 2.761478163493841, "grad_norm": 6.34022855758667, "learning_rate": 3.977230309817096e-06, "loss": 1.7605, "step": 24660 }, { "epoch": 2.7625979843225084, "grad_norm": 13.629969596862793, "learning_rate": 3.958566629339306e-06, "loss": 1.7667, "step": 24670 }, { "epoch": 2.7637178051511757, "grad_norm": 2.7139463424682617, "learning_rate": 3.939902948861516e-06, "loss": 1.9269, "step": 24680 }, { "epoch": 2.764837625979843, "grad_norm": 8.121241569519043, "learning_rate": 3.921239268383725e-06, "loss": 2.0188, "step": 24690 }, { "epoch": 2.7659574468085104, "grad_norm": 8.049278259277344, "learning_rate": 3.902575587905935e-06, "loss": 1.6694, "step": 24700 }, { "epoch": 2.767077267637178, "grad_norm": 11.107040405273438, "learning_rate": 3.883911907428145e-06, "loss": 2.3154, "step": 24710 }, { "epoch": 2.7681970884658456, "grad_norm": 6.505083084106445, "learning_rate": 3.865248226950355e-06, "loss": 1.8864, "step": 24720 }, { "epoch": 2.769316909294513, "grad_norm": 10.971221923828125, "learning_rate": 3.846584546472565e-06, "loss": 1.9134, "step": 24730 }, { "epoch": 2.7704367301231803, "grad_norm": 4.81821870803833, "learning_rate": 3.827920865994775e-06, "loss": 1.8286, "step": 24740 }, { "epoch": 2.7715565509518476, "grad_norm": 11.385892868041992, "learning_rate": 3.8092571855169837e-06, "loss": 2.0084, "step": 24750 }, { "epoch": 2.772676371780515, "grad_norm": 5.368199825286865, "learning_rate": 3.7905935050391937e-06, "loss": 1.6912, "step": 24760 }, { "epoch": 2.7737961926091828, "grad_norm": 8.81826400756836, "learning_rate": 3.7719298245614037e-06, "loss": 1.9957, "step": 24770 }, { "epoch": 2.77491601343785, "grad_norm": 11.901360511779785, "learning_rate": 3.7532661440836137e-06, "loss": 2.0144, "step": 24780 }, { "epoch": 2.7760358342665175, "grad_norm": 4.292434215545654, "learning_rate": 3.7346024636058233e-06, "loss": 1.7567, "step": 24790 }, { "epoch": 2.777155655095185, "grad_norm": 11.836398124694824, "learning_rate": 3.7159387831280333e-06, "loss": 2.2476, "step": 24800 }, { "epoch": 2.778275475923852, "grad_norm": 5.869718551635742, "learning_rate": 3.6972751026502425e-06, "loss": 1.7245, "step": 24810 }, { "epoch": 2.7793952967525195, "grad_norm": 17.10307502746582, "learning_rate": 3.6786114221724525e-06, "loss": 1.8758, "step": 24820 }, { "epoch": 2.780515117581187, "grad_norm": 12.18902587890625, "learning_rate": 3.6599477416946625e-06, "loss": 1.7381, "step": 24830 }, { "epoch": 2.781634938409854, "grad_norm": 6.835455894470215, "learning_rate": 3.641284061216872e-06, "loss": 1.7884, "step": 24840 }, { "epoch": 2.782754759238522, "grad_norm": 4.9272541999816895, "learning_rate": 3.622620380739082e-06, "loss": 1.7644, "step": 24850 }, { "epoch": 2.7838745800671894, "grad_norm": 8.834504127502441, "learning_rate": 3.603956700261292e-06, "loss": 1.7862, "step": 24860 }, { "epoch": 2.7849944008958567, "grad_norm": 4.071360111236572, "learning_rate": 3.5852930197835012e-06, "loss": 1.8609, "step": 24870 }, { "epoch": 2.786114221724524, "grad_norm": 10.676229476928711, "learning_rate": 3.5666293393057112e-06, "loss": 2.1334, "step": 24880 }, { "epoch": 2.7872340425531914, "grad_norm": 15.198076248168945, "learning_rate": 3.547965658827921e-06, "loss": 2.2799, "step": 24890 }, { "epoch": 2.7883538633818588, "grad_norm": 6.402156829833984, "learning_rate": 3.529301978350131e-06, "loss": 2.0125, "step": 24900 }, { "epoch": 2.7894736842105265, "grad_norm": 6.411627292633057, "learning_rate": 3.510638297872341e-06, "loss": 1.7944, "step": 24910 }, { "epoch": 2.790593505039194, "grad_norm": 8.75882625579834, "learning_rate": 3.491974617394551e-06, "loss": 1.9815, "step": 24920 }, { "epoch": 2.7917133258678613, "grad_norm": 7.310870170593262, "learning_rate": 3.47331093691676e-06, "loss": 1.8766, "step": 24930 }, { "epoch": 2.7928331466965286, "grad_norm": 10.259556770324707, "learning_rate": 3.45464725643897e-06, "loss": 1.7763, "step": 24940 }, { "epoch": 2.793952967525196, "grad_norm": 16.690889358520508, "learning_rate": 3.4359835759611796e-06, "loss": 1.8072, "step": 24950 }, { "epoch": 2.7950727883538633, "grad_norm": 3.8095781803131104, "learning_rate": 3.4173198954833896e-06, "loss": 1.7118, "step": 24960 }, { "epoch": 2.7961926091825307, "grad_norm": 14.5068941116333, "learning_rate": 3.3986562150055996e-06, "loss": 1.7467, "step": 24970 }, { "epoch": 2.797312430011198, "grad_norm": 6.723850250244141, "learning_rate": 3.3799925345278087e-06, "loss": 1.9952, "step": 24980 }, { "epoch": 2.7984322508398654, "grad_norm": 11.487224578857422, "learning_rate": 3.3613288540500187e-06, "loss": 2.0564, "step": 24990 }, { "epoch": 2.799552071668533, "grad_norm": 8.214585304260254, "learning_rate": 3.3426651735722283e-06, "loss": 1.7448, "step": 25000 }, { "epoch": 2.8006718924972005, "grad_norm": 17.024662017822266, "learning_rate": 3.3240014930944383e-06, "loss": 1.5265, "step": 25010 }, { "epoch": 2.801791713325868, "grad_norm": 8.459342002868652, "learning_rate": 3.3053378126166483e-06, "loss": 1.9152, "step": 25020 }, { "epoch": 2.802911534154535, "grad_norm": 8.775456428527832, "learning_rate": 3.2866741321388583e-06, "loss": 2.038, "step": 25030 }, { "epoch": 2.8040313549832026, "grad_norm": 5.435427665710449, "learning_rate": 3.2680104516610675e-06, "loss": 1.9425, "step": 25040 }, { "epoch": 2.8051511758118703, "grad_norm": 11.797080993652344, "learning_rate": 3.2493467711832775e-06, "loss": 2.1394, "step": 25050 }, { "epoch": 2.8062709966405377, "grad_norm": 15.878313064575195, "learning_rate": 3.230683090705487e-06, "loss": 2.2649, "step": 25060 }, { "epoch": 2.807390817469205, "grad_norm": 11.612027168273926, "learning_rate": 3.212019410227697e-06, "loss": 1.4949, "step": 25070 }, { "epoch": 2.8085106382978724, "grad_norm": 14.034370422363281, "learning_rate": 3.193355729749907e-06, "loss": 1.2324, "step": 25080 }, { "epoch": 2.8096304591265397, "grad_norm": 19.293161392211914, "learning_rate": 3.174692049272117e-06, "loss": 2.2584, "step": 25090 }, { "epoch": 2.810750279955207, "grad_norm": 6.628214359283447, "learning_rate": 3.1560283687943263e-06, "loss": 1.5751, "step": 25100 }, { "epoch": 2.8118701007838744, "grad_norm": 16.124217987060547, "learning_rate": 3.137364688316536e-06, "loss": 2.0243, "step": 25110 }, { "epoch": 2.812989921612542, "grad_norm": 6.106673717498779, "learning_rate": 3.118701007838746e-06, "loss": 1.9384, "step": 25120 }, { "epoch": 2.814109742441209, "grad_norm": 8.48365306854248, "learning_rate": 3.100037327360956e-06, "loss": 1.7521, "step": 25130 }, { "epoch": 2.815229563269877, "grad_norm": 6.083116054534912, "learning_rate": 3.0813736468831654e-06, "loss": 1.5675, "step": 25140 }, { "epoch": 2.8163493840985443, "grad_norm": 16.414230346679688, "learning_rate": 3.0627099664053754e-06, "loss": 1.8823, "step": 25150 }, { "epoch": 2.8174692049272116, "grad_norm": 9.585153579711914, "learning_rate": 3.044046285927585e-06, "loss": 2.0315, "step": 25160 }, { "epoch": 2.818589025755879, "grad_norm": 10.281465530395508, "learning_rate": 3.0253826054497946e-06, "loss": 1.8152, "step": 25170 }, { "epoch": 2.8197088465845463, "grad_norm": 4.486020565032959, "learning_rate": 3.0067189249720046e-06, "loss": 1.5388, "step": 25180 }, { "epoch": 2.820828667413214, "grad_norm": 9.82872200012207, "learning_rate": 2.9880552444942146e-06, "loss": 2.1895, "step": 25190 }, { "epoch": 2.8219484882418815, "grad_norm": 13.074673652648926, "learning_rate": 2.969391564016424e-06, "loss": 2.0497, "step": 25200 }, { "epoch": 2.823068309070549, "grad_norm": 14.259294509887695, "learning_rate": 2.9507278835386338e-06, "loss": 2.1442, "step": 25210 }, { "epoch": 2.824188129899216, "grad_norm": 4.93138313293457, "learning_rate": 2.9320642030608438e-06, "loss": 1.7481, "step": 25220 }, { "epoch": 2.8253079507278835, "grad_norm": 6.078362941741943, "learning_rate": 2.9134005225830533e-06, "loss": 1.7641, "step": 25230 }, { "epoch": 2.826427771556551, "grad_norm": 12.861568450927734, "learning_rate": 2.8947368421052634e-06, "loss": 1.8018, "step": 25240 }, { "epoch": 2.8275475923852182, "grad_norm": 8.707850456237793, "learning_rate": 2.8760731616274734e-06, "loss": 1.8521, "step": 25250 }, { "epoch": 2.8286674132138856, "grad_norm": 12.562678337097168, "learning_rate": 2.8574094811496825e-06, "loss": 1.7405, "step": 25260 }, { "epoch": 2.829787234042553, "grad_norm": 18.912614822387695, "learning_rate": 2.8387458006718925e-06, "loss": 1.7982, "step": 25270 }, { "epoch": 2.8309070548712207, "grad_norm": 6.430877208709717, "learning_rate": 2.8200821201941025e-06, "loss": 1.924, "step": 25280 }, { "epoch": 2.832026875699888, "grad_norm": 5.357717990875244, "learning_rate": 2.801418439716312e-06, "loss": 1.9311, "step": 25290 }, { "epoch": 2.8331466965285554, "grad_norm": 13.666546821594238, "learning_rate": 2.782754759238522e-06, "loss": 1.6254, "step": 25300 }, { "epoch": 2.834266517357223, "grad_norm": 3.9082486629486084, "learning_rate": 2.7640910787607317e-06, "loss": 1.9293, "step": 25310 }, { "epoch": 2.83538633818589, "grad_norm": 5.5125732421875, "learning_rate": 2.7454273982829413e-06, "loss": 2.0829, "step": 25320 }, { "epoch": 2.836506159014558, "grad_norm": 5.76453971862793, "learning_rate": 2.7267637178051513e-06, "loss": 1.9197, "step": 25330 }, { "epoch": 2.8376259798432253, "grad_norm": 7.775246620178223, "learning_rate": 2.7081000373273613e-06, "loss": 1.8856, "step": 25340 }, { "epoch": 2.8387458006718926, "grad_norm": 8.506511688232422, "learning_rate": 2.689436356849571e-06, "loss": 1.5753, "step": 25350 }, { "epoch": 2.83986562150056, "grad_norm": 15.149114608764648, "learning_rate": 2.670772676371781e-06, "loss": 1.8317, "step": 25360 }, { "epoch": 2.8409854423292273, "grad_norm": 11.696993827819824, "learning_rate": 2.6521089958939904e-06, "loss": 1.6111, "step": 25370 }, { "epoch": 2.8421052631578947, "grad_norm": 4.9371209144592285, "learning_rate": 2.6334453154162e-06, "loss": 1.7954, "step": 25380 }, { "epoch": 2.843225083986562, "grad_norm": 18.963680267333984, "learning_rate": 2.61478163493841e-06, "loss": 2.4235, "step": 25390 }, { "epoch": 2.8443449048152294, "grad_norm": 11.938720703125, "learning_rate": 2.59611795446062e-06, "loss": 1.667, "step": 25400 }, { "epoch": 2.8454647256438967, "grad_norm": 8.081610679626465, "learning_rate": 2.5774542739828296e-06, "loss": 1.9183, "step": 25410 }, { "epoch": 2.8465845464725645, "grad_norm": 6.90621280670166, "learning_rate": 2.558790593505039e-06, "loss": 1.9125, "step": 25420 }, { "epoch": 2.847704367301232, "grad_norm": 11.308591842651367, "learning_rate": 2.540126913027249e-06, "loss": 2.2834, "step": 25430 }, { "epoch": 2.8488241881298992, "grad_norm": 12.649473190307617, "learning_rate": 2.5214632325494588e-06, "loss": 2.0221, "step": 25440 }, { "epoch": 2.8499440089585666, "grad_norm": 2.6453304290771484, "learning_rate": 2.502799552071669e-06, "loss": 1.7468, "step": 25450 }, { "epoch": 2.851063829787234, "grad_norm": 10.066814422607422, "learning_rate": 2.484135871593879e-06, "loss": 1.706, "step": 25460 }, { "epoch": 2.8521836506159017, "grad_norm": 11.059213638305664, "learning_rate": 2.465472191116088e-06, "loss": 2.4896, "step": 25470 }, { "epoch": 2.853303471444569, "grad_norm": 5.4998016357421875, "learning_rate": 2.446808510638298e-06, "loss": 1.8076, "step": 25480 }, { "epoch": 2.8544232922732364, "grad_norm": 5.691712379455566, "learning_rate": 2.4281448301605075e-06, "loss": 1.9473, "step": 25490 }, { "epoch": 2.855543113101904, "grad_norm": 15.464776992797852, "learning_rate": 2.4094811496827175e-06, "loss": 2.2398, "step": 25500 }, { "epoch": 2.856662933930571, "grad_norm": 3.6245803833007812, "learning_rate": 2.3908174692049275e-06, "loss": 1.9342, "step": 25510 }, { "epoch": 2.8577827547592385, "grad_norm": 8.159955978393555, "learning_rate": 2.372153788727137e-06, "loss": 1.6683, "step": 25520 }, { "epoch": 2.858902575587906, "grad_norm": 11.253711700439453, "learning_rate": 2.3534901082493467e-06, "loss": 1.6143, "step": 25530 }, { "epoch": 2.860022396416573, "grad_norm": 14.729676246643066, "learning_rate": 2.3348264277715567e-06, "loss": 1.8251, "step": 25540 }, { "epoch": 2.8611422172452405, "grad_norm": 8.878775596618652, "learning_rate": 2.3161627472937663e-06, "loss": 1.8646, "step": 25550 }, { "epoch": 2.8622620380739083, "grad_norm": 14.987347602844238, "learning_rate": 2.2974990668159763e-06, "loss": 1.9119, "step": 25560 }, { "epoch": 2.8633818589025757, "grad_norm": 5.109477519989014, "learning_rate": 2.2788353863381863e-06, "loss": 2.2621, "step": 25570 }, { "epoch": 2.864501679731243, "grad_norm": 15.59926700592041, "learning_rate": 2.2601717058603955e-06, "loss": 1.9845, "step": 25580 }, { "epoch": 2.8656215005599104, "grad_norm": 4.6907057762146, "learning_rate": 2.2415080253826055e-06, "loss": 2.0773, "step": 25590 }, { "epoch": 2.8667413213885777, "grad_norm": 6.02996826171875, "learning_rate": 2.2228443449048155e-06, "loss": 2.0783, "step": 25600 }, { "epoch": 2.867861142217245, "grad_norm": 14.389623641967773, "learning_rate": 2.204180664427025e-06, "loss": 1.9496, "step": 25610 }, { "epoch": 2.868980963045913, "grad_norm": 5.188795566558838, "learning_rate": 2.185516983949235e-06, "loss": 1.5794, "step": 25620 }, { "epoch": 2.8701007838745802, "grad_norm": 11.492018699645996, "learning_rate": 2.1668533034714446e-06, "loss": 1.5701, "step": 25630 }, { "epoch": 2.8712206047032476, "grad_norm": 7.7545366287231445, "learning_rate": 2.1481896229936542e-06, "loss": 1.9405, "step": 25640 }, { "epoch": 2.872340425531915, "grad_norm": 6.0428314208984375, "learning_rate": 2.1295259425158642e-06, "loss": 1.6884, "step": 25650 }, { "epoch": 2.8734602463605823, "grad_norm": 15.78642749786377, "learning_rate": 2.1108622620380742e-06, "loss": 1.9008, "step": 25660 }, { "epoch": 2.8745800671892496, "grad_norm": 19.360076904296875, "learning_rate": 2.092198581560284e-06, "loss": 2.1805, "step": 25670 }, { "epoch": 2.875699888017917, "grad_norm": 10.965484619140625, "learning_rate": 2.0735349010824934e-06, "loss": 1.8339, "step": 25680 }, { "epoch": 2.8768197088465843, "grad_norm": 13.197468757629395, "learning_rate": 2.0548712206047034e-06, "loss": 1.7847, "step": 25690 }, { "epoch": 2.8779395296752517, "grad_norm": 8.53710651397705, "learning_rate": 2.036207540126913e-06, "loss": 1.6177, "step": 25700 }, { "epoch": 2.8790593505039195, "grad_norm": 5.556687355041504, "learning_rate": 2.017543859649123e-06, "loss": 1.8142, "step": 25710 }, { "epoch": 2.880179171332587, "grad_norm": 8.486677169799805, "learning_rate": 1.998880179171333e-06, "loss": 2.013, "step": 25720 }, { "epoch": 2.881298992161254, "grad_norm": 12.919852256774902, "learning_rate": 1.9802164986935426e-06, "loss": 1.7, "step": 25730 }, { "epoch": 2.8824188129899215, "grad_norm": 9.491839408874512, "learning_rate": 1.961552818215752e-06, "loss": 1.881, "step": 25740 }, { "epoch": 2.883538633818589, "grad_norm": 15.983736991882324, "learning_rate": 1.942889137737962e-06, "loss": 2.1679, "step": 25750 }, { "epoch": 2.8846584546472567, "grad_norm": 5.595208644866943, "learning_rate": 1.9242254572601717e-06, "loss": 1.8901, "step": 25760 }, { "epoch": 2.885778275475924, "grad_norm": 10.111395835876465, "learning_rate": 1.9055617767823815e-06, "loss": 2.3119, "step": 25770 }, { "epoch": 2.8868980963045914, "grad_norm": 18.772340774536133, "learning_rate": 1.8868980963045915e-06, "loss": 1.9794, "step": 25780 }, { "epoch": 2.8880179171332587, "grad_norm": 4.584385395050049, "learning_rate": 1.8682344158268011e-06, "loss": 1.7576, "step": 25790 }, { "epoch": 2.889137737961926, "grad_norm": 17.171106338500977, "learning_rate": 1.849570735349011e-06, "loss": 2.2186, "step": 25800 }, { "epoch": 2.8902575587905934, "grad_norm": 18.548585891723633, "learning_rate": 1.8309070548712205e-06, "loss": 2.2678, "step": 25810 }, { "epoch": 2.891377379619261, "grad_norm": 4.9082417488098145, "learning_rate": 1.8122433743934305e-06, "loss": 1.9042, "step": 25820 }, { "epoch": 2.892497200447928, "grad_norm": 15.952796936035156, "learning_rate": 1.7935796939156403e-06, "loss": 1.7798, "step": 25830 }, { "epoch": 2.8936170212765955, "grad_norm": 12.888788223266602, "learning_rate": 1.7749160134378499e-06, "loss": 1.648, "step": 25840 }, { "epoch": 2.8947368421052633, "grad_norm": 5.010805606842041, "learning_rate": 1.7562523329600599e-06, "loss": 1.8376, "step": 25850 }, { "epoch": 2.8958566629339306, "grad_norm": 9.813081741333008, "learning_rate": 1.7375886524822697e-06, "loss": 1.846, "step": 25860 }, { "epoch": 2.896976483762598, "grad_norm": 11.167447090148926, "learning_rate": 1.7189249720044792e-06, "loss": 1.8414, "step": 25870 }, { "epoch": 2.8980963045912653, "grad_norm": 5.086580276489258, "learning_rate": 1.700261291526689e-06, "loss": 1.4907, "step": 25880 }, { "epoch": 2.8992161254199327, "grad_norm": 12.3839693069458, "learning_rate": 1.681597611048899e-06, "loss": 1.8576, "step": 25890 }, { "epoch": 2.9003359462486005, "grad_norm": 6.351990222930908, "learning_rate": 1.6629339305711086e-06, "loss": 1.8742, "step": 25900 }, { "epoch": 2.901455767077268, "grad_norm": 5.9348978996276855, "learning_rate": 1.6442702500933184e-06, "loss": 2.1135, "step": 25910 }, { "epoch": 2.902575587905935, "grad_norm": 6.191033363342285, "learning_rate": 1.6256065696155284e-06, "loss": 1.6505, "step": 25920 }, { "epoch": 2.9036954087346025, "grad_norm": 6.510402202606201, "learning_rate": 1.606942889137738e-06, "loss": 1.8007, "step": 25930 }, { "epoch": 2.90481522956327, "grad_norm": 7.332479953765869, "learning_rate": 1.5882792086599478e-06, "loss": 1.9839, "step": 25940 }, { "epoch": 2.9059350503919372, "grad_norm": 3.8477463722229004, "learning_rate": 1.5696155281821578e-06, "loss": 2.1467, "step": 25950 }, { "epoch": 2.9070548712206046, "grad_norm": 11.45783805847168, "learning_rate": 1.5509518477043674e-06, "loss": 1.9373, "step": 25960 }, { "epoch": 2.908174692049272, "grad_norm": 18.16033363342285, "learning_rate": 1.5322881672265772e-06, "loss": 1.9102, "step": 25970 }, { "epoch": 2.9092945128779393, "grad_norm": 14.75328254699707, "learning_rate": 1.513624486748787e-06, "loss": 1.5849, "step": 25980 }, { "epoch": 2.910414333706607, "grad_norm": 7.447336196899414, "learning_rate": 1.4949608062709968e-06, "loss": 1.187, "step": 25990 }, { "epoch": 2.9115341545352744, "grad_norm": 6.396903038024902, "learning_rate": 1.4762971257932065e-06, "loss": 1.6073, "step": 26000 }, { "epoch": 2.9126539753639418, "grad_norm": 13.001298904418945, "learning_rate": 1.4576334453154161e-06, "loss": 2.1491, "step": 26010 }, { "epoch": 2.913773796192609, "grad_norm": 12.818767547607422, "learning_rate": 1.4389697648376261e-06, "loss": 1.6316, "step": 26020 }, { "epoch": 2.9148936170212765, "grad_norm": 11.24215030670166, "learning_rate": 1.420306084359836e-06, "loss": 2.058, "step": 26030 }, { "epoch": 2.9160134378499443, "grad_norm": 5.04683780670166, "learning_rate": 1.4016424038820455e-06, "loss": 1.6477, "step": 26040 }, { "epoch": 2.9171332586786116, "grad_norm": 8.69908618927002, "learning_rate": 1.3829787234042553e-06, "loss": 1.7106, "step": 26050 }, { "epoch": 2.918253079507279, "grad_norm": 10.91391372680664, "learning_rate": 1.364315042926465e-06, "loss": 1.878, "step": 26060 }, { "epoch": 2.9193729003359463, "grad_norm": 15.811108589172363, "learning_rate": 1.3456513624486749e-06, "loss": 2.1953, "step": 26070 }, { "epoch": 2.9204927211646137, "grad_norm": 12.007214546203613, "learning_rate": 1.3269876819708847e-06, "loss": 2.0814, "step": 26080 }, { "epoch": 2.921612541993281, "grad_norm": 11.011650085449219, "learning_rate": 1.3083240014930945e-06, "loss": 1.5842, "step": 26090 }, { "epoch": 2.9227323628219484, "grad_norm": 8.247806549072266, "learning_rate": 1.2896603210153043e-06, "loss": 1.6496, "step": 26100 }, { "epoch": 2.9238521836506157, "grad_norm": 8.519516944885254, "learning_rate": 1.270996640537514e-06, "loss": 2.2153, "step": 26110 }, { "epoch": 2.924972004479283, "grad_norm": 8.969647407531738, "learning_rate": 1.2523329600597239e-06, "loss": 2.1363, "step": 26120 }, { "epoch": 2.926091825307951, "grad_norm": 11.865436553955078, "learning_rate": 1.2336692795819336e-06, "loss": 2.0425, "step": 26130 }, { "epoch": 2.927211646136618, "grad_norm": 20.383333206176758, "learning_rate": 1.2150055991041434e-06, "loss": 2.0145, "step": 26140 }, { "epoch": 2.9283314669652856, "grad_norm": 5.632237434387207, "learning_rate": 1.1963419186263532e-06, "loss": 1.9828, "step": 26150 }, { "epoch": 2.929451287793953, "grad_norm": 7.68386697769165, "learning_rate": 1.177678238148563e-06, "loss": 2.4198, "step": 26160 }, { "epoch": 2.9305711086226203, "grad_norm": 5.8563151359558105, "learning_rate": 1.1590145576707726e-06, "loss": 1.6861, "step": 26170 }, { "epoch": 2.931690929451288, "grad_norm": 20.610515594482422, "learning_rate": 1.1403508771929826e-06, "loss": 2.0284, "step": 26180 }, { "epoch": 2.9328107502799554, "grad_norm": 5.223932266235352, "learning_rate": 1.1216871967151924e-06, "loss": 2.0917, "step": 26190 }, { "epoch": 2.9339305711086228, "grad_norm": 5.224584579467773, "learning_rate": 1.103023516237402e-06, "loss": 1.6706, "step": 26200 }, { "epoch": 2.93505039193729, "grad_norm": 6.829593181610107, "learning_rate": 1.084359835759612e-06, "loss": 2.0043, "step": 26210 }, { "epoch": 2.9361702127659575, "grad_norm": 5.992856979370117, "learning_rate": 1.0656961552818216e-06, "loss": 1.4484, "step": 26220 }, { "epoch": 2.937290033594625, "grad_norm": 9.6854829788208, "learning_rate": 1.0470324748040314e-06, "loss": 2.1745, "step": 26230 }, { "epoch": 2.938409854423292, "grad_norm": 5.901656627655029, "learning_rate": 1.0283687943262412e-06, "loss": 2.2163, "step": 26240 }, { "epoch": 2.9395296752519595, "grad_norm": 11.751072883605957, "learning_rate": 1.009705113848451e-06, "loss": 2.2047, "step": 26250 }, { "epoch": 2.940649496080627, "grad_norm": 6.053452491760254, "learning_rate": 9.910414333706607e-07, "loss": 1.5099, "step": 26260 }, { "epoch": 2.9417693169092947, "grad_norm": 9.445327758789062, "learning_rate": 9.723777528928705e-07, "loss": 2.0437, "step": 26270 }, { "epoch": 2.942889137737962, "grad_norm": 12.053526878356934, "learning_rate": 9.537140724150803e-07, "loss": 2.1465, "step": 26280 }, { "epoch": 2.9440089585666294, "grad_norm": 10.680662155151367, "learning_rate": 9.350503919372901e-07, "loss": 1.8593, "step": 26290 }, { "epoch": 2.9451287793952967, "grad_norm": 13.565116882324219, "learning_rate": 9.163867114594998e-07, "loss": 2.0493, "step": 26300 }, { "epoch": 2.946248600223964, "grad_norm": 6.383924961090088, "learning_rate": 8.977230309817097e-07, "loss": 2.2825, "step": 26310 }, { "epoch": 2.9473684210526314, "grad_norm": 19.03361701965332, "learning_rate": 8.790593505039194e-07, "loss": 1.7299, "step": 26320 }, { "epoch": 2.948488241881299, "grad_norm": 10.854887962341309, "learning_rate": 8.603956700261292e-07, "loss": 1.7987, "step": 26330 }, { "epoch": 2.9496080627099666, "grad_norm": 11.263670921325684, "learning_rate": 8.417319895483391e-07, "loss": 1.9531, "step": 26340 }, { "epoch": 2.950727883538634, "grad_norm": 6.675166606903076, "learning_rate": 8.230683090705488e-07, "loss": 2.1583, "step": 26350 }, { "epoch": 2.9518477043673013, "grad_norm": 10.624512672424316, "learning_rate": 8.044046285927585e-07, "loss": 1.9506, "step": 26360 }, { "epoch": 2.9529675251959686, "grad_norm": 8.63731575012207, "learning_rate": 7.857409481149684e-07, "loss": 1.802, "step": 26370 }, { "epoch": 2.954087346024636, "grad_norm": 8.122550964355469, "learning_rate": 7.670772676371781e-07, "loss": 1.5223, "step": 26380 }, { "epoch": 2.9552071668533033, "grad_norm": 16.51909065246582, "learning_rate": 7.484135871593878e-07, "loss": 1.8571, "step": 26390 }, { "epoch": 2.9563269876819707, "grad_norm": 7.450239658355713, "learning_rate": 7.297499066815976e-07, "loss": 1.5009, "step": 26400 }, { "epoch": 2.9574468085106385, "grad_norm": 14.5098237991333, "learning_rate": 7.110862262038074e-07, "loss": 1.9467, "step": 26410 }, { "epoch": 2.958566629339306, "grad_norm": 6.5906782150268555, "learning_rate": 6.924225457260172e-07, "loss": 1.8331, "step": 26420 }, { "epoch": 2.959686450167973, "grad_norm": 5.737934112548828, "learning_rate": 6.73758865248227e-07, "loss": 2.0944, "step": 26430 }, { "epoch": 2.9608062709966405, "grad_norm": 11.81939697265625, "learning_rate": 6.550951847704367e-07, "loss": 1.6853, "step": 26440 }, { "epoch": 2.961926091825308, "grad_norm": 7.474795341491699, "learning_rate": 6.364315042926465e-07, "loss": 1.4531, "step": 26450 }, { "epoch": 2.963045912653975, "grad_norm": 10.875497817993164, "learning_rate": 6.177678238148564e-07, "loss": 2.3634, "step": 26460 }, { "epoch": 2.964165733482643, "grad_norm": 7.887001991271973, "learning_rate": 5.991041433370661e-07, "loss": 1.9128, "step": 26470 }, { "epoch": 2.9652855543113104, "grad_norm": 6.30941915512085, "learning_rate": 5.804404628592759e-07, "loss": 2.0812, "step": 26480 }, { "epoch": 2.9664053751399777, "grad_norm": 6.238934516906738, "learning_rate": 5.617767823814857e-07, "loss": 1.7906, "step": 26490 }, { "epoch": 2.967525195968645, "grad_norm": 7.524406909942627, "learning_rate": 5.431131019036955e-07, "loss": 1.9674, "step": 26500 }, { "epoch": 2.9686450167973124, "grad_norm": 7.257613658905029, "learning_rate": 5.244494214259052e-07, "loss": 1.9621, "step": 26510 }, { "epoch": 2.9697648376259798, "grad_norm": 16.3148136138916, "learning_rate": 5.057857409481149e-07, "loss": 1.6907, "step": 26520 }, { "epoch": 2.970884658454647, "grad_norm": 5.2545952796936035, "learning_rate": 4.871220604703247e-07, "loss": 1.7305, "step": 26530 }, { "epoch": 2.9720044792833145, "grad_norm": 16.13141632080078, "learning_rate": 4.6845837999253457e-07, "loss": 1.9891, "step": 26540 }, { "epoch": 2.973124300111982, "grad_norm": 9.418984413146973, "learning_rate": 4.497946995147443e-07, "loss": 1.7309, "step": 26550 }, { "epoch": 2.9742441209406496, "grad_norm": 3.6016457080841064, "learning_rate": 4.311310190369541e-07, "loss": 1.6292, "step": 26560 }, { "epoch": 2.975363941769317, "grad_norm": 7.472093105316162, "learning_rate": 4.1246733855916395e-07, "loss": 1.8515, "step": 26570 }, { "epoch": 2.9764837625979843, "grad_norm": 6.936214923858643, "learning_rate": 3.9380365808137364e-07, "loss": 1.7409, "step": 26580 }, { "epoch": 2.9776035834266517, "grad_norm": 6.333293437957764, "learning_rate": 3.751399776035835e-07, "loss": 1.8553, "step": 26590 }, { "epoch": 2.978723404255319, "grad_norm": 5.685304164886475, "learning_rate": 3.564762971257932e-07, "loss": 1.7563, "step": 26600 }, { "epoch": 2.979843225083987, "grad_norm": 7.443325996398926, "learning_rate": 3.37812616648003e-07, "loss": 2.0484, "step": 26610 }, { "epoch": 2.980963045912654, "grad_norm": 6.894618034362793, "learning_rate": 3.1914893617021275e-07, "loss": 1.6829, "step": 26620 }, { "epoch": 2.9820828667413215, "grad_norm": 18.16074562072754, "learning_rate": 3.004852556924226e-07, "loss": 2.1276, "step": 26630 }, { "epoch": 2.983202687569989, "grad_norm": 13.232165336608887, "learning_rate": 2.8182157521463234e-07, "loss": 1.8867, "step": 26640 }, { "epoch": 2.984322508398656, "grad_norm": 3.8035480976104736, "learning_rate": 2.6315789473684213e-07, "loss": 1.9104, "step": 26650 }, { "epoch": 2.9854423292273236, "grad_norm": 6.058941841125488, "learning_rate": 2.4449421425905187e-07, "loss": 1.6504, "step": 26660 }, { "epoch": 2.986562150055991, "grad_norm": 8.38434886932373, "learning_rate": 2.258305337812617e-07, "loss": 1.4973, "step": 26670 }, { "epoch": 2.9876819708846583, "grad_norm": 16.152746200561523, "learning_rate": 2.0716685330347146e-07, "loss": 1.6986, "step": 26680 }, { "epoch": 2.9888017917133256, "grad_norm": 5.802217960357666, "learning_rate": 1.8850317282568122e-07, "loss": 1.6346, "step": 26690 }, { "epoch": 2.9899216125419934, "grad_norm": 18.607810974121094, "learning_rate": 1.6983949234789102e-07, "loss": 1.9916, "step": 26700 }, { "epoch": 2.9910414333706608, "grad_norm": 5.028476715087891, "learning_rate": 1.5117581187010078e-07, "loss": 1.7811, "step": 26710 }, { "epoch": 2.992161254199328, "grad_norm": 6.0024824142456055, "learning_rate": 1.3251213139231058e-07, "loss": 2.2338, "step": 26720 }, { "epoch": 2.9932810750279955, "grad_norm": 5.444699287414551, "learning_rate": 1.1384845091452034e-07, "loss": 2.1604, "step": 26730 }, { "epoch": 2.994400895856663, "grad_norm": 12.872075080871582, "learning_rate": 9.518477043673014e-08, "loss": 1.2952, "step": 26740 }, { "epoch": 2.9955207166853306, "grad_norm": 10.56808853149414, "learning_rate": 7.652108995893992e-08, "loss": 1.8126, "step": 26750 }, { "epoch": 2.996640537513998, "grad_norm": 5.550817966461182, "learning_rate": 5.785740948114969e-08, "loss": 2.0958, "step": 26760 }, { "epoch": 2.9977603583426653, "grad_norm": 6.287519454956055, "learning_rate": 3.919372900335946e-08, "loss": 1.8855, "step": 26770 }, { "epoch": 2.9988801791713326, "grad_norm": 8.283456802368164, "learning_rate": 2.0530048525569244e-08, "loss": 1.916, "step": 26780 }, { "epoch": 3.0, "grad_norm": 7.1067328453063965, "learning_rate": 1.866368047779022e-09, "loss": 1.6709, "step": 26790 } ], "logging_steps": 10, "max_steps": 26790, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.457535008768e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }